From ea29e81042c25252c09dea58c40ae758cf580f27 Mon Sep 17 00:00:00 2001 From: Tony Date: Sun, 25 Oct 2020 19:43:11 -0400 Subject: [PATCH] Merge linux-tkg packages into a single package, add Void Linux and Clang/LLVM support (#63) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add Clang/LLVM and Void Linux support * merge packages, update .gitignore * Stage 2 of merge * Stage 3 of merge * Stage 4 of merge * Stage 5 of merge, almost done * Complete merge of linux-tkg, sync with master (undead PDS 5.8) * Complete merge of linux-tkg, sync with master (undead PDS 5.8) * Forgot to add all the patches * Fix prompt, add config_hardened files * Fix some stuff for Void * Merge linux-tkg README, add Void Linux info * typo * Fix broken MuQSS on Void at least, Fix CPU opts patch apply * update to 5.8.8, 5.4.64, block LLVM build with kernel 5.7 because it seems to segfault at linking vmlinux, fix RC building on Void Linux * update to 5.8.8, 5.4.64. seems making a confdir variable wasn't necessary, revert it. sync with master. * remove variables for messages in favor of defined functions in the Void template * resync Linux58-tkg back to master * Clear patches is the same file for every version * glitched base is the same file for 5.8 and 5.9, fix wrong version in kernel selection for 5.8, also just use * since it's the same .-. * merge some patches that are the same together, fix building 5.4 BMQ * BMQ and PDS ondemand are the same file, fix missing space on kernel version selection * add Clang makedepends to PKGBUILD, add missing compiler option in customization.cfg, make GCC recommended * Add kernel selection to install.sh * Somehow this wasn't edited * Add optional clang deps to install.sh * Update gitignore to just ignore linux-5.x instead of commenting it out * Missing fi * forgot to sync back install.sh... * generalize desktop profile configs, uncomment fsync and bcachefs in customization.cfg, add Project C /PDS and Undead PDS to README, credit plug, add kernel version selection to customization.cfg, fallback for compiler selection * Fix compiler selection erroring out on Void (for some reason xbps-src always runs the else statement) also avoid conflict with _compiler, rename it to _compiler_opt for the actual options in the make command * oops * Fix Void extra config not appearing, missing indent for _configfile, missed PKGBUILD in other commit... * remove globbing in install.sh * Move loading of external configuration to the beginning of _tkg_initscript * Change MuQSS note in README * 5.4.68, 5.8.12, 5.9-rc7 * Update Project C patchset to v5.8-r3 * 5.8 bcachefs * LLVM=1 was required to be used on make commands otherwise it would reset config, we don't need to know if gcc was used to build a kernel (since llvm is appended to llvm builds), remove the compiler_name if not using llvm, make kernel versions variables in prepare * _compileropt does not need to be llvm specific * add fix for AMD GPU DCN3.0 build with clang in glitched base 5.9 https://lore.kernel.org/amd-gfx/4b5927dd-9f2d-40f9-ef63-007cf7b65f37@amd.com/T/#t * Revert "add fix for AMD GPU DCN3.0 build with clang in glitched base 5.9" This reverts commit 276e219f9fe89397332c91e601f34a37b3a0503f. 
merged upstream * Sync with linux-tkg master * Just stick every patch into it's own kernel version folder * update 5.4.72, 5.8.16 * check for sum check fail on Void, fix fsync patch * Update README.md * README.md formatting * forgot to move that * linux59-tkg: Import 5.9 version of the bcachefs patchset - https://gthub.com/koverstreet/bcachefs Co-authored-by: Lukáš Horáček --- .gitignore | 2 +- linux-tkg/PKGBUILD | 468 + linux-tkg/README.md | 73 + linux-tkg/customization.cfg | 196 + linux-tkg/install.sh | 299 + .../linux-tkg-config/5.4/90-cleanup.hook | 14 + linux-tkg/linux-tkg-config/5.4/cleanup | 10 + linux-tkg/linux-tkg-config/5.4/config.x86_64 | 10598 +++ .../5.4/config_hardened.x86_64 | 10527 +++ .../linux-tkg-config/5.7/90-cleanup.hook | 14 + linux-tkg/linux-tkg-config/5.7/cleanup | 10 + linux-tkg/linux-tkg-config/5.7/config.x86_64 | 10864 +++ .../5.7/config_hardened.x86_64 | 10839 +++ .../linux-tkg-config/5.8/90-cleanup.hook | 14 + linux-tkg/linux-tkg-config/5.8/cleanup | 10 + linux-tkg/linux-tkg-config/5.8/config.x86_64 | 11019 +++ .../linux-tkg-config/5.9/90-cleanup.hook | 14 + linux-tkg/linux-tkg-config/5.9/cleanup | 10 + linux-tkg/linux-tkg-config/5.9/config.x86_64 | 11049 +++ .../generic-desktop-profile.cfg | 35 + linux-tkg/linux-tkg-config/prepare | 1260 + .../ryzen-desktop-profile.cfg | 38 + ...sallow-unprivileged-CLONE_NEWUSER-by.patch | 156 + .../5.4/0002-clear-patches.patch | 354 + .../5.4/0003-glitched-base.patch | 4612 + .../5.4/0003-glitched-cfs.patch | 72 + .../linux-tkg-patches/5.4/0004-5.4-ck1.patch | 17684 ++++ .../5.4/0004-glitched-muqss.patch | 78 + .../5.4/0004-glitched-ondemand-muqss.patch | 18 + .../5.4/0005-glitched-ondemand-pds.patch | 18 + .../5.4/0005-glitched-pds.patch | 213 + .../5.4/0005-v5.4_undead-pds099o.patch | 8387 ++ .../5.4/0006-add-acs-overrides_iommu.patch | 193 + .../5.4/0007-v5.4-fsync.patch | 419 + .../5.4/0009-bmq_v5.4-r2.patch | 7601 ++ .../5.4/0009-glitched-bmq.patch | 108 + .../linux-tkg-patches/5.4/0011-ZFS-fix.patch | 43 + .../5.4/0012-linux-hardened.patch | 2806 + ...sallow-unprivileged-CLONE_NEWUSER-by.patch | 156 + .../5.7/0002-clear-patches.patch | 354 + .../5.7/0003-glitched-base.patch | 545 + .../5.7/0003-glitched-cfs.patch | 72 + .../linux-tkg-patches/5.7/0004-5.7-ck1.patch | 13147 +++ .../5.7/0004-glitched-muqss.patch | 78 + .../5.7/0004-glitched-ondemand-muqss.patch | 18 + .../5.7/0005-glitched-ondemand-pds.patch | 18 + .../5.7/0005-glitched-pds.patch | 166 + .../5.7/0005-v5.7_undead-pds099o.patch | 8400 ++ .../5.7/0006-add-acs-overrides_iommu.patch | 193 + .../5.7/0007-v5.7-fsync.patch | 908 + .../5.7/0008-5.7-bcachefs.patch | 71085 ++++++++++++++++ .../5.7/0009-glitched-bmq.patch | 90 + .../5.7/0009-glitched-ondemand-bmq.patch | 18 + .../5.7/0009-prjc_v5.7-r3.patch | 7817 ++ .../5.7/0010-5.7-glitched-cachy.patch | 3936 + .../linux-tkg-patches/5.7/0011-ZFS-fix.patch | 43 + .../5.7/0012-linux-hardened.patch | 2916 + .../5.7/0012-misc-additions.patch | 55 + ...sallow-unprivileged-CLONE_NEWUSER-by.patch | 156 + .../5.8/0002-clear-patches.patch | 360 + .../5.8/0003-glitched-base.patch | 708 + .../5.8/0003-glitched-cfs.patch | 72 + .../5.8/0005-glitched-pds.patch | 90 + .../0005-undead-glitched-ondemand-pds.patch | 18 + .../5.8/0005-undead-glitched-pds.patch | 166 + .../5.8/0005-v5.8_undead-pds099o.patch | 8530 ++ .../5.8/0006-add-acs-overrides_iommu.patch | 193 + .../5.8/0007-v5.8-fsync.patch | 908 + .../5.8/0008-5.8-bcachefs.patch | 70598 +++++++++++++++ .../5.8/0009-glitched-bmq.patch | 90 + 
.../5.8/0009-glitched-ondemand-bmq.patch | 18 + .../5.8/0009-prjc_v5.8-r3.patch | 8582 ++ .../linux-tkg-patches/5.8/0011-ZFS-fix.patch | 43 + .../5.8/0012-misc-additions.patch | 54 + ...sallow-unprivileged-CLONE_NEWUSER-by.patch | 156 + .../5.9/0002-clear-patches.patch | 360 + .../5.9/0003-glitched-base.patch | 708 + .../5.9/0003-glitched-cfs.patch | 72 + .../linux-tkg-patches/5.9/0004-5.9-ck1.patch | 13384 +++ .../5.9/0004-glitched-muqss.patch | 78 + .../5.9/0004-glitched-ondemand-muqss.patch | 18 + .../5.9/0005-glitched-pds.patch | 90 + .../5.9/0006-add-acs-overrides_iommu.patch | 193 + .../5.9/0007-v5.9-fsync.patch | 597 + .../5.9/0008-5.9-bcachefs.patch | 70821 +++++++++++++++ .../5.9/0009-glitched-bmq.patch | 90 + .../5.9/0009-glitched-ondemand-bmq.patch | 18 + .../5.9/0009-prjc_v5.9-r0.patch | 8809 ++ .../linux-tkg-patches/5.9/0011-ZFS-fix.patch | 43 + .../5.9/0012-misc-additions.patch | 54 + .../5.9/0013-remove-debian-deps-cross.patch | 25 + 91 files changed, 406241 insertions(+), 1 deletion(-) create mode 100644 linux-tkg/PKGBUILD create mode 100644 linux-tkg/README.md create mode 100644 linux-tkg/customization.cfg create mode 100755 linux-tkg/install.sh create mode 100644 linux-tkg/linux-tkg-config/5.4/90-cleanup.hook create mode 100755 linux-tkg/linux-tkg-config/5.4/cleanup create mode 100644 linux-tkg/linux-tkg-config/5.4/config.x86_64 create mode 100644 linux-tkg/linux-tkg-config/5.4/config_hardened.x86_64 create mode 100644 linux-tkg/linux-tkg-config/5.7/90-cleanup.hook create mode 100755 linux-tkg/linux-tkg-config/5.7/cleanup create mode 100644 linux-tkg/linux-tkg-config/5.7/config.x86_64 create mode 100644 linux-tkg/linux-tkg-config/5.7/config_hardened.x86_64 create mode 100644 linux-tkg/linux-tkg-config/5.8/90-cleanup.hook create mode 100755 linux-tkg/linux-tkg-config/5.8/cleanup create mode 100644 linux-tkg/linux-tkg-config/5.8/config.x86_64 create mode 100644 linux-tkg/linux-tkg-config/5.9/90-cleanup.hook create mode 100755 linux-tkg/linux-tkg-config/5.9/cleanup create mode 100644 linux-tkg/linux-tkg-config/5.9/config.x86_64 create mode 100644 linux-tkg/linux-tkg-config/generic-desktop-profile.cfg create mode 100644 linux-tkg/linux-tkg-config/prepare create mode 100644 linux-tkg/linux-tkg-config/ryzen-desktop-profile.cfg create mode 100644 linux-tkg/linux-tkg-patches/5.4/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0002-clear-patches.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0003-glitched-base.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0003-glitched-cfs.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0004-5.4-ck1.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0004-glitched-muqss.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0004-glitched-ondemand-muqss.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0005-glitched-ondemand-pds.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0005-glitched-pds.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0005-v5.4_undead-pds099o.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0006-add-acs-overrides_iommu.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0007-v5.4-fsync.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0009-bmq_v5.4-r2.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0009-glitched-bmq.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0011-ZFS-fix.patch create mode 100644 linux-tkg/linux-tkg-patches/5.4/0012-linux-hardened.patch 
create mode 100644 linux-tkg/linux-tkg-patches/5.7/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0002-clear-patches.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0003-glitched-base.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0003-glitched-cfs.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0004-5.7-ck1.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0004-glitched-muqss.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0004-glitched-ondemand-muqss.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0005-glitched-ondemand-pds.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0005-glitched-pds.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0005-v5.7_undead-pds099o.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0006-add-acs-overrides_iommu.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0007-v5.7-fsync.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0008-5.7-bcachefs.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0009-glitched-bmq.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0009-glitched-ondemand-bmq.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0009-prjc_v5.7-r3.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0010-5.7-glitched-cachy.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0011-ZFS-fix.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0012-linux-hardened.patch create mode 100644 linux-tkg/linux-tkg-patches/5.7/0012-misc-additions.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0002-clear-patches.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0003-glitched-base.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0003-glitched-cfs.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0005-glitched-pds.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-ondemand-pds.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-pds.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0005-v5.8_undead-pds099o.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0006-add-acs-overrides_iommu.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0007-v5.8-fsync.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0008-5.8-bcachefs.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0009-glitched-bmq.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0009-glitched-ondemand-bmq.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0009-prjc_v5.8-r3.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0011-ZFS-fix.patch create mode 100644 linux-tkg/linux-tkg-patches/5.8/0012-misc-additions.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0002-clear-patches.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0003-glitched-base.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0003-glitched-cfs.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0004-5.9-ck1.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0004-glitched-muqss.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0004-glitched-ondemand-muqss.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0005-glitched-pds.patch create mode 100644 
linux-tkg/linux-tkg-patches/5.9/0006-add-acs-overrides_iommu.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0007-v5.9-fsync.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0009-glitched-bmq.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0009-glitched-ondemand-bmq.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0009-prjc_v5.9-r0.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0011-ZFS-fix.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0012-misc-additions.patch create mode 100644 linux-tkg/linux-tkg-patches/5.9/0013-remove-debian-deps-cross.patch diff --git a/.gitignore b/.gitignore index 5189f7c..f9fd432 100644 --- a/.gitignore +++ b/.gitignore @@ -15,4 +15,4 @@ *.files */src/ */pkg/ -*/linux-*/ +*/linux-5*/ diff --git a/linux-tkg/PKGBUILD b/linux-tkg/PKGBUILD new file mode 100644 index 0000000..c9dc81c --- /dev/null +++ b/linux-tkg/PKGBUILD @@ -0,0 +1,468 @@ +# Based on the file created for Arch Linux by: +# Tobias Powalowski +# Thomas Baechler + +# Contributor: Tk-Glitch +# Contributor: Hyper-KVM + +plain ' .---.` `.---.' +plain ' `/syhhhyso- -osyhhhys/`' +plain ' .syNMdhNNhss/``.---.``/sshNNhdMNys.' +plain ' +sdMh.`+MNsssssssssssssssNM+`.hMds+' +plain ' :syNNdhNNhssssssssssssssshNNhdNNys:' +plain ' /ssyhhhysssssssssssssssssyhhhyss/' +plain ' .ossssssssssssssssssssssssssssso.' +plain ' :sssssssssssssssssssssssssssssssss:' +plain ' /sssssssssssssssssssssssssssssssssss/' +plain ' :sssssssssssssoosssssssoosssssssssssss:' +plain ' osssssssssssssoosssssssoossssssssssssso' +plain ' osssssssssssyyyyhhhhhhhyyyyssssssssssso' +plain ' /yyyyyyhhdmmmmNNNNNNNNNNNmmmmdhhyyyyyy/' +plain ' smmmNNNNNNNNNNNNNNNNNNNNNNNNNNNNNmmms' +plain ' /dNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNd/' +plain ' `:sdNNNNNNNNNNNNNNNNNNNNNNNNNds:`' +plain ' `-+shdNNNNNNNNNNNNNNNdhs+-`' +plain ' `.-:///////:-.`' + +_where="$PWD" # track basedir as different Arch based distros are moving srcdir around + +source "$_where"/customization.cfg # load default configuration from file +source "$_where"/linux-tkg-config/prepare + +_distro="Arch" + +_tkg_initscript + +if [[ "$_sub" = rc* ]]; then + _srcpath="linux-${_basekernel}-${_sub}" + kernel_site="https://git.kernel.org/torvalds/t/linux-${_basekernel}-${_sub}.tar.gz" +else + _srcpath="linux-${_basekernel}" + kernel_site="https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-${_basekernel}.tar.xz" + patch_site="https://www.kernel.org/pub/linux/kernel/v5.x/patch-${_basekernel}.${_sub}.xz" +fi + +if [ -n "$_custom_pkgbase" ]; then + pkgbase="${_custom_pkgbase}" +else + pkgbase=linux"${_basever}"-tkg-"${_cpusched}"${_compiler_name} +fi +pkgname=("${pkgbase}" "${pkgbase}-headers") +pkgver="${_basekernel}"."${_sub}" +pkgrel=4 +pkgdesc='Linux-tkg' +arch=('x86_64') # no i686 in here +url="http://www.kernel.org/" +license=('GPL2') +makedepends=('xmlto' 'docbook-xsl' 'kmod' 'inetutils' 'bc' 'libelf' 'pahole' 'patchutils' 'flex' 'python-sphinx' 'python-sphinx_rtd_theme' 'graphviz' 'imagemagick' 'git') +if [ "$_compiler_name" = "-llvm" ]; then + makedepends+=( 'lld' 'clang' 'llvm') +fi +optdepends=('schedtool') +options=('!strip' 'docs') + +case $_basever in + 54) + opt_ver="4.19-v5.4" + source=("$kernel_site" + "$patch_site" + "https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v4.19-v5.4.patch" + 'config.x86_64' # stock Arch config + 'config_hardened.x86_64' # hardened Arch config + 90-cleanup.hook + 
cleanup + # ARCH Patches + 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch + # TkG + 0002-clear-patches.patch + 0003-glitched-base.patch + 0003-glitched-cfs.patch + 0004-glitched-ondemand-muqss.patch + 0004-glitched-muqss.patch + 0004-5.4-ck1.patch + 0005-glitched-ondemand-pds.patch + 0005-glitched-pds.patch + 0005-v5.4_undead-pds099o.patch + 0006-add-acs-overrides_iommu.patch + 0007-v5.4-fsync.patch + #0008-5.4-bcachefs.patch + 0009-glitched-bmq.patch + 0009-bmq_v5.4-r2.patch + 0011-ZFS-fix.patch + 0012-linux-hardened.patch + ) + sha256sums=('bf338980b1670bca287f9994b7441c2361907635879169c64ae78364efc5f491' + 'bce941bcb6c8148ac19cd2fa4f1e19c6c75f699a3bcdfd452df7484cff2a2353' + '27b7fc535ade94b636c3ec4e809e141831e9465a0ef55215a9852b87048629e2' + '55dd5117c1da17c9ec38d7bc995958958bcc8b7ebcfd81de1d4c7650b85537ab' + '1f4a20d6eaaa0d969af93152a65191492400c6aa838fc1c290b0dd29bb6019d8' + '1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898' + '66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997' + '31dc68e84aecfb7d069efb1305049122c65694676be8b955634abcf0675922a2' + 'd02bf5ca08fd610394b9d3a0c3b176d74af206f897dee826e5cbaec97bb4a4aa' + '156a2c75fd228920e3c3da5e04a110afa403951bdfbb85772c2fd4b82fd24d61' + '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' + 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' + 'bc69d6e5ee8172b0242c8fa72d13cfe2b8d2b6601468836908a7dfe8b78a3bbb' + '815974c65f47301d2a5d1577bf95e8a4b54cad7d77f226e0065f83e763837c48' + '62496f9ca788996181ef145f96ad26291282fcc3fb95cdc04080dcf84365be33' + 'eac7e5d6201528e64f4bdf5e286c842511e1afc52e1518dc8e7d11932bbe0a99' + 'db03fbd179ec78941eefe1c0edde4c19071bc603511d0b5c06c04e412994b62e' + '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' + '2d9260b80b43bbd605cf420d6bd53aa7262103dfd77196ba590ece5600b6dc0d' + '3832f828a9f402b153fc9a6829c5a4eaf6091804bcda3a0423c8e1b57e26420d' + '6a6a736cf1b3513d108bfd36f60baf50bb36b33aec21ab0d0ffad13602b7ff75' + '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' + 'aeb31404c26ee898d007b1f66cb9572c9884ad8eca14edc4587d68f6cba6de46') + ;; + 57) + opt_ver="5.7%2B" + source=("$kernel_site" + "$patch_site" + "https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.7%2B.patch" + 'config.x86_64' # stock Arch config + 'config_hardened.x86_64' # hardened Arch config + 90-cleanup.hook + cleanup + # ARCH Patches + 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch + # TkG + 0002-clear-patches.patch + 0003-glitched-base.patch + 0003-glitched-cfs.patch + 0004-glitched-ondemand-muqss.patch + 0004-glitched-muqss.patch + 0004-5.7-ck1.patch + 0005-glitched-ondemand-pds.patch + 0005-glitched-pds.patch + 0005-v5.7_undead-pds099o.patch + 0006-add-acs-overrides_iommu.patch + 0007-v5.7-fsync.patch + 0008-5.7-bcachefs.patch + 0009-glitched-ondemand-bmq.patch + 0009-glitched-bmq.patch + 0009-prjc_v5.7-r3.patch + 0011-ZFS-fix.patch + 0012-linux-hardened.patch + 0012-misc-additions.patch + ) + sha256sums=('de8163bb62f822d84f7a3983574ec460060bf013a78ff79cd7c979ff1ec1d7e0' + '66a0173a13cd58015f5bf1b14f67bfa15dc1db5d8e7225fcd95ac2e9a5341653' + '1f56a2466bd9b4477925682d8f944fabb38727140e246733214fe50aa326fc47' + '6313ccad7f8e4d8ce09dd5bdb51b8dfa124d0034d7097ba47008380a14a84f09' + '15ce09447b7e9b28425c1df5961c955378f2829e4115037337eef347b1db3d9d' + '1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898' + 
'66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997' + '31dc68e84aecfb7d069efb1305049122c65694676be8b955634abcf0675922a2' + 'd02bf5ca08fd610394b9d3a0c3b176d74af206f897dee826e5cbaec97bb4a4aa' + 'bbf332201423888257c9687bee06916a5dbbac2194f9df5b4126100c40e48d16' + '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' + 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' + 'bc69d6e5ee8172b0242c8fa72d13cfe2b8d2b6601468836908a7dfe8b78a3bbb' + '8d8aec86e34dbec6cc3a47f2cd55dc9212e95d36b6cd34d6e637c66731e7d838' + '62496f9ca788996181ef145f96ad26291282fcc3fb95cdc04080dcf84365be33' + '7fd8e776209dac98627453fda754bdf9aff4a09f27cb0b3766d7983612eb3c74' + '55be5e4c6254da0a9d34bbfac807a70d8b58b3f7b2ec852026195c4db5e263e2' + '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' + 'cd225e86d72eaf6c31ef3d7b20df397f4cc44ddd04389850691292cdf292b204' + 'd2214504c43f9d297a8ef68dffc198143bfebf85614b71637a71978d7a86bd78' + '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' + '965a517a283f265a012545fbb5cc9e516efc9f6166d2aa1baf7293a32a1086b7' + 'b2a2ae866fc3f1093f67e69ba59738827e336b8f800fb0487599127f7f3ef881' + '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' + '6821f92bd2bde3a3938d17b070d70f18a2f33cae81647567b5a4d94c9cd75f3d' + 'bdc60c83cd5fbf9912f9201d6e4fe3c84fe5f634e6823bd8e78264ad606b3a9e') + ;; + 58) + opt_ver="5.8%2B" + source=("$kernel_site" + "$patch_site" + "https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.8%2B.patch" + 'config.x86_64' # stock Arch config + #'config_hardened.x86_64' # hardened Arch config + 90-cleanup.hook + cleanup + # ARCH Patches + 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch + # TkG + 0002-clear-patches.patch + 0003-glitched-base.patch + 0003-glitched-cfs.patch + #0004-glitched-ondemand-muqss.patch + #0004-glitched-muqss.patch + #0004-5.8-ck1.patch + 0005-undead-glitched-ondemand-pds.patch + 0005-undead-glitched-pds.patch + 0005-v5.8_undead-pds099o.patch + 0005-glitched-pds.patch + 0006-add-acs-overrides_iommu.patch + 0007-v5.8-fsync.patch + 0008-5.8-bcachefs.patch + 0009-glitched-ondemand-bmq.patch + 0009-glitched-bmq.patch + 0009-prjc_v5.8-r3.patch + 0011-ZFS-fix.patch + #0012-linux-hardened.patch + 0012-misc-additions.patch + ) + sha256sums=('e7f75186aa0642114af8f19d99559937300ca27acaf7451b36d4f9b0f85cf1f5' + '2ea49982bd10e4c880d49051535bd820e276dd3235c3c913b255aaaadc707e1d' + '5ab29eb64e57df83b395a29a6a4f89030d142feffbfbf73b3afc6d97a2a7fd12' + 'ac66686b0e1ed057ea5f099cd00366decc00f999aa1cb19ba8d3ccf9f92d60e2' + '1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898' + '66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997' + 'f6383abef027fd9a430fd33415355e0df492cdc3c90e9938bf2d98f4f63b32e6' + '35a7cde86fb94939c0f25a62b8c47f3de0dbd3c65f876f460b263181b3e92fc0' + 'b9ebe0ae69bc2b2091d6bfcf6c7875a87ea7969fcfa4e306c48d47a60f9ef4d6' + '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' + '62496f9ca788996181ef145f96ad26291282fcc3fb95cdc04080dcf84365be33' + '7fd8e776209dac98627453fda754bdf9aff4a09f27cb0b3766d7983612eb3c74' + '31b172eb6a0c635a8d64cc1c2e8181d9f928ee991bd44f6e556d1713b815f8d9' + '87bca363416655bc865fcb2cc0d1532cb010a61d9b9f625e3c15cd12eeee3a59' + '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' + 'cd225e86d72eaf6c31ef3d7b20df397f4cc44ddd04389850691292cdf292b204' + '86414a20225deec084e0e48b35552b3a4eef67f76755b32a10febb7b6308dcb7' + 
'9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' + '965a517a283f265a012545fbb5cc9e516efc9f6166d2aa1baf7293a32a1086b7' + 'f5dbff4833a2e3ca94c202e5197894d5f1006c689ff149355353e77d2e17c943' + '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' + '98311deeb474b39e821cd1e64198793d5c4d797155b3b8bbcb1938b7f11e8d74') + ;; + 59) + opt_ver="5.8%2B" + source=("$kernel_site" + $patch_site + "https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v5.8%2B.patch" + "config.x86_64" # stock Arch config + #$hardened_config_file # hardened Arch config + 90-cleanup.hook + cleanup + # ARCH Patches + 0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch + # TkG + 0002-clear-patches.patch + 0003-glitched-base.patch + 0003-glitched-cfs.patch + 0004-glitched-ondemand-muqss.patch + 0004-glitched-muqss.patch + 0004-5.9-ck1.patch + #0005-undead-glitched-ondemand-pds.patch + #0005-undead-glitched-pds.patch + #0005-v5.8_undead-pds099o.patch + 0005-glitched-pds.patch + 0006-add-acs-overrides_iommu.patch + 0007-v5.9-fsync.patch + 0008-5.9-bcachefs.patch + 0009-glitched-ondemand-bmq.patch + 0009-glitched-bmq.patch + 0009-prjc_v5.9-r0.patch + 0011-ZFS-fix.patch + #0012-linux-hardened.patch + 0012-misc-additions.patch + ) + sha256sums=('3239a4ee1250bf2048be988cc8cb46c487b2c8a0de5b1b032d38394d5c6b1a06' + '7edb7b9d06b02f9b88d868c74ab618baf899c94edb19a73291f640dbea55c312' + '5ab29eb64e57df83b395a29a6a4f89030d142feffbfbf73b3afc6d97a2a7fd12' + '20da98426048a222adeaf6606c9695d7a36974f4110a5adbe77c482898b59348' + '1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898' + '66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997' + 'f6383abef027fd9a430fd33415355e0df492cdc3c90e9938bf2d98f4f63b32e6' + '35a7cde86fb94939c0f25a62b8c47f3de0dbd3c65f876f460b263181b3e92fc0' + 'b9ebe0ae69bc2b2091d6bfcf6c7875a87ea7969fcfa4e306c48d47a60f9ef4d6' + '7058e57fd68367b029adc77f2a82928f1433daaf02c8c279cb2d13556c8804d7' + 'c605f638d74c61861ebdc36ebd4cb8b6475eae2f6273e1ccb2bbb3e10a2ec3fe' + 'bc69d6e5ee8172b0242c8fa72d13cfe2b8d2b6601468836908a7dfe8b78a3bbb' + '45a9ab99215ab3313be6e66e073d29154aac55bc58975a4df2dad116c918d27c' + 'fca63d15ca4502aebd73e76d7499b243d2c03db71ff5ab0bf5cf268b2e576320' + '19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a' + 'b302ba6c5bbe8ed19b20207505d513208fae1e678cf4d8e7ac0b154e5fe3f456' + '3956c324798f25bcf8e6c5f6d160551245304c5cfa3a2cba73e5b1e350c364ce' + '9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177' + 'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911' + '88c7e308e474c845e0cc09e09bd223fc39876eca757abf6d6c3b8321f49ce1f1' + '49262ce4a8089fa70275aad742fc914baa28d9c384f710c9a62f64796d13e104' + '433b919e6a0be26784fb4304c43b1811a28f12ad3de9e26c0af827f64c0c316e') + ;; +esac + +export KBUILD_BUILD_HOST=archlinux +export KBUILD_BUILD_USER=$pkgbase +export KBUILD_BUILD_TIMESTAMP="$(date -Ru${SOURCE_DATE_EPOCH:+d @$SOURCE_DATE_EPOCH})" + +prepare() { + rm -rf $pkgdir # Nuke the entire pkg folder so it'll get regenerated clean on next build + + ln -s "${_where}/customization.cfg" "${srcdir}" # workaround + + cd "${srcdir}/${_srcpath}" + + _tkg_srcprep +} + +build() { + cd "${srcdir}/${_srcpath}" + + # Use custom compiler paths if defined + if [ -n "${CUSTOM_GCC_PATH}" ]; then + PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} + fi + + if [ "$_force_all_threads" = "true" ]; then + _force_all_threads="-j$((`nproc`*2))" + 
else + _force_all_threads="${MAKEFLAGS}" + fi + + # ccache + if [ "$_noccache" != "true" ] && pacman -Qq ccache &> /dev/null; then + export PATH="/usr/lib/ccache/bin/:$PATH" + export CCACHE_SLOPPINESS="file_macro,locale,time_macros" + export CCACHE_NOHASHDIR="true" + msg2 'ccache was found and will be used' + fi + + # document the TkG variables, excluding "_", "_EXT_CONFIG_PATH", and "_where". + declare -p | cut -d ' ' -f 3 | grep -P '^_(?!=|EXT_CONFIG_PATH|where)' > "${srcdir}/customization-full.cfg" + + # remove -O2 flag and place user optimization flag + CFLAGS=${CFLAGS/-O2/} + CFLAGS+=" ${_compileropt}" + + # build! + _runtime=$( time ( schedtool -B -n 1 -e ionice -n 1 make ${_force_all_threads} ${llvm_opt} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) || _runtime=$( time ( make ${_force_all_threads} ${llvm_opt} LOCALVERSION= bzImage modules 2>&1 ) 3>&1 1>&2 2>&3 ) +} + +hackbase() { + pkgdesc="The $pkgdesc kernel and modules" + depends=('coreutils' 'kmod' 'initramfs') + optdepends=('linux-docs: Kernel hackers manual - HTML documentation that comes with the Linux kernel.' + 'crda: to set the correct wireless channels of your country.' + 'linux-firmware: Firmware files for Linux' + 'modprobed-db: Keeps track of EVERY kernel module that has ever been probed. Useful for make localmodconfig.' + 'nvidia-tkg: NVIDIA drivers for all installed kernels - non-dkms version.' + 'nvidia-dkms-tkg: NVIDIA drivers for all installed kernels - dkms version.' + 'update-grub: Simple wrapper around grub-mkconfig.') + provides=("linux=${pkgver}" "${pkgbase}" VIRTUALBOX-GUEST-MODULES WIREGUARD-MODULE) + replaces=(virtualbox-guest-modules-arch wireguard-arch) + + cd "${srcdir}/${_srcpath}" + + # get kernel version + local _kernver="$(\033[1;0m \033[1;1m$1\033[1;0m" >&2 +} + +error() { + echo -e " \033[1;31m==> ERROR: $1\033[1;0m" >&2 +} + +warning() { + echo -e " \033[1;33m==> WARNING: $1\033[1;0m" >&2 +} + +plain() { + echo "$1" >&2 +} + +# Stop the script at any encountered error +set -e + +_where=`pwd` +srcdir="$_where" + +source linux-tkg-config/prepare + +# Run init script that is also run in PKGBUILD; it will define some env vars that we will use +_tkg_initscript + +case "$_basever" in + "54") + opt_ver="4.19-v5.4" + ;; + "57") + opt_ver="5.7%2B" + ;; + "58") + opt_ver="5.8%2B" + ;; + "59") + opt_ver="5.8%2B" + ;; +esac + +_cpu_opt_patch_link="https://raw.githubusercontent.com/graysky2/kernel_gcc_patch/master/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v${opt_ver}.patch" + +source customization.cfg + +if [ "$1" != "install" ] && [ "$1" != "config" ] && [ "$1" != "uninstall-help" ]; then + msg2 "Argument not recognised, options are: + - config : shallow clones the linux ${_basekernel}.x git tree into the folder linux-${_basekernel}, then applies on it the extra patches and prepares the .config file + by copying the one from the current linux system in /boot/config-`uname -r` and updates it. + - install : [RPM and DEB based distros only], does the config step, proceeds to compile, then prompts to install + - uninstall-help : [RPM and DEB based distros only], lists the installed kernels in this system, then gives a hint on how to uninstall them manually." + exit 0 +fi + +# Load external configuration file if present. Available variable values will overwrite customization.cfg ones. +if [ -e "$_EXT_CONFIG_PATH" ]; then + msg2 "External configuration file $_EXT_CONFIG_PATH will be used and will override customization.cfg values." 
+ source "$_EXT_CONFIG_PATH" +fi + +_misc_adds="false" # We currently don't want this enabled on non-Arch + +if [ "$1" = "install" ] || [ "$1" = "config" ]; then + + if [ -z $_distro ] && [ "$1" = "install" ]; then + while true; do + echo "Which linux distribution are you running?" + echo "If it's not on the list, choose the closest one to it: Fedora/Suse for RPM, Ubuntu/Debian for DEB" + echo " 1) Debian" + echo " 2) Fedora" + echo " 3) Suse" + echo " 4) Ubuntu" + read -p "[1-4]: " _distro_index + + if [ "$_distro_index" = "1" ]; then + _distro="Debian" + break + elif [ "$_distro_index" = "2" ]; then + _distro="Fedora" + break + elif [ "$_distro_index" = "3" ]; then + _distro="Suse" + break + elif [ "$_distro_index" = "4" ]; then + _distro="Ubuntu" + break + else + echo "Wrong index." + fi + done + fi + + if [[ $1 = "install" && "$_distro" != "Ubuntu" && "$_distro" != "Debian" && "$_distro" != "Fedora" && "$_distro" != "Suse" ]]; then + msg2 "Variable \"_distro\" in \"customization.cfg\" hasn't been set to \"Ubuntu\", \"Debian\", \"Fedora\" or \"Suse\"" + msg2 "This script can only install custom kernels for RPM and DEB based distros, and only those keywords are permitted. Exiting..." + exit 0 + fi + + if [ "$_compiler_name" = "llvm" ]; then + clang_deps="llvm clang lld" + fi + if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then + msg2 "Installing dependencies" + sudo apt install git build-essential kernel-package fakeroot libncurses5-dev libssl-dev ccache bison flex qtbase5-dev ${clang_deps} -y + elif [ "$_distro" = "Fedora" ]; then + msg2 "Installing dependencies" + sudo dnf install fedpkg fedora-packager rpmdevtools ncurses-devel pesign grubby qt5-devel libXi-devel gcc-c++ git ccache flex bison elfutils-libelf-devel openssl-devel dwarves rpm-build ${clang_deps} -y + elif [ "$_distro" = "Suse" ]; then + msg2 "Installing dependencies" + sudo zypper install -y rpmdevtools ncurses-devel pesign libXi-devel gcc-c++ git ccache flex bison elfutils libelf-devel openssl-devel dwarves make patch bc rpm-build libqt5-qtbase-common-devel libqt5-qtbase-devel lz4 ${clang_deps} + fi + + # Force prepare script to avoid Arch specific commands if the user is using `config` + if [ "$1" = "config" ]; then + _distro="" + fi + + if [ -d linux-${_basekernel}.orig ]; then + rm -rf linux-${_basekernel}.orig + fi + + if [ -d linux-${_basekernel} ]; then + msg2 "Resetting files in linux-$_basekernel to their original state and getting latest updates" + cd "$_where"/linux-${_basekernel} + git checkout --force linux-$_basekernel.y + git clean -f -d -x + git pull + msg2 "Done" + cd "$_where" + else + msg2 "Shallow git cloning linux $_basekernel" + git clone --branch linux-$_basekernel.y --single-branch --depth=1 https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git linux-${_basekernel} + msg2 "Done" + fi + + # Define current kernel subversion + if [ -z $_kernel_subver ]; then + cd "$_where"/linux-${_basekernel} + _kernelverstr=`git describe` + _kernel_subver=${_kernelverstr:5} + cd "$_where" + fi + + cd "$_where" + msg2 "Downloading Graysky2's CPU optimisations patch" + wget "$_cpu_opt_patch_link" + + # Follow Ubuntu install instructions in https://wiki.ubuntu.com/KernelTeam/GitKernelBuild + + # cd in linux folder, copy Ubuntu's current config file, update with new params + cd "$_where"/linux-${_basekernel} + + msg2 "Copying current kernel's config and running make oldconfig..." + cp /boot/config-`uname -r` .config + if [ "$_distro" = "Debian" ]; then # Help Debian cert problem. 
+ sed -i -e 's#CONFIG_SYSTEM_TRUSTED_KEYS="debian/certs/test-signing-certs.pem"#CONFIG_SYSTEM_TRUSTED_KEYS=""#g' .config + fi + yes '' | make oldconfig + msg2 "Done" + + # apply linux-tkg patching script + _tkg_srcprep + + msg2 "Configuration done." +fi + +if [ "$1" = "install" ]; then + + # Use custom compiler paths if defined + if [ -n "${CUSTOM_GCC_PATH}" ]; then + PATH=${CUSTOM_GCC_PATH}/bin:${CUSTOM_GCC_PATH}/lib:${CUSTOM_GCC_PATH}/include:${PATH} + fi + + if [ "$_force_all_threads" = "true" ]; then + _thread_num=`nproc` + else + _thread_num=`expr \`nproc\` / 4` + if [ "$_thread_num" = "0" ]; then + _thread_num=1 + fi + fi + + # ccache + if [ "$_noccache" != "true" ]; then + + if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then + export PATH="/usr/lib/ccache/bin/:$PATH" + elif [ "$_distro" = "Fedora" ] || [ "$_distro" = "Suse" ]; then + export PATH="/usr/lib64/ccache/:$PATH" + fi + + export CCACHE_SLOPPINESS="file_macro,locale,time_macros" + export CCACHE_NOHASHDIR="true" + msg2 'ccache was found and will be used' + + fi + + if [ -z $_kernel_localversion ]; then + _kernel_flavor="tkg-${_cpusched}" + else + _kernel_flavor="tkg-${_kernel_localversion}" + fi + + if [ "$_distro" = "Ubuntu" ] || [ "$_distro" = "Debian" ]; then + + if make -j ${_thread_num} deb-pkg LOCALVERSION=-${_kernel_flavor}; then + msg2 "Building successfully finished!" + + cd "$_where" + + # Create DEBS folder if it doesn't exist + mkdir -p DEBS + + # Move deb files to DEBS folder inside the linux-tkg folder + mv "$_where"/*.deb "$_where"/DEBS/ + + read -p "Do you want to install the new Kernel ? y/[n]: " _install + if [[ $_install =~ [yY] ]] || [ $_install = "yes" ] || [ $_install = "Yes" ]; then + cd "$_where" + _kernelname=$_basekernel.$_kernel_subver-$_kernel_flavor + _headers_deb="linux-headers-${_kernelname}*.deb" + _image_deb="linux-image-${_kernelname}_*.deb" + _kernel_devel_deb="linux-libc-dev_${_kernelname}*.deb" + + cd DEBS + sudo dpkg -i $_headers_deb $_image_deb $_kernel_devel_deb + fi + fi + + elif [[ "$_distro" = "Fedora" || "$_distro" = "Suse" ]]; then + + # Replace dashes with underscores, it seems that it's being done by binrpm-pkg + # So we can actually refer properly to the rpm files. + _kernel_flavor=${_kernel_flavor//-/_} + + if make -j ${_thread_num} rpm-pkg EXTRAVERSION="_${_kernel_flavor}"; then + msg2 "Building successfully finished!" + + cd "$_where" + + # Create RPMS folder if it doesn't exist + mkdir -p RPMS + + # Move rpm files to RPMS folder inside the linux-tkg folder + mv ~/rpmbuild/RPMS/x86_64/* "$_where"/RPMS/ + + # Clean up the original folder, unneeded and takes a lot of space + rm -rf ~/rpmbuild/ + + read -p "Do you want to install the new Kernel ? 
y/[n]: " _install + if [ "$_install" = "y" ] || [ "$_install" = "Y" ] || [ "$_install" = "yes" ] || [ "$_install" = "Yes" ]; then + + _kernelname=$_basekernel.${_kernel_subver}_$_kernel_flavor + _headers_rpm="kernel-headers-${_kernelname}*.rpm" + _kernel_rpm="kernel-${_kernelname}*.rpm" + _kernel_devel_rpm="kernel-devel-${_kernelname}*.rpm" + + cd RPMS + if [ "$_distro" = "Fedora" ]; then + sudo dnf install $_headers_rpm $_kernel_rpm $_kernel_devel_rpm + elif [ "$_distro" = "Suse" ]; then + msg2 "Some files from 'linux-glibc-devel' will be replaced by files from the custom kernel-headers package" + msg2 "To revert back to the original kernel headers do 'sudo zypper install -f linux-glibc-devel'" + sudo zypper install --replacefiles --allow-unsigned-rpm $_headers_rpm $_kernel_rpm $_kernel_devel_rpm + fi + + msg2 "Install successful" + fi + fi + fi +fi + +if [ "$1" = "uninstall-help" ]; then + + cd "$_where" + msg2 "List of installed custom tkg kernels: " + + if [ "$_distro" = "Ubuntu" ]; then + dpkg -l "*tkg*" | grep "linux.*tkg" + dpkg -l "*linux-libc-dev*" | grep "linux.*tkg" + msg2 "To uninstall a version, you should remove the linux-image, linux-headers and linux-libc-dev associated to it (if installed), with: " + msg2 " sudo apt remove linux-image-VERSION linux-headers-VERSION linux-libc-dev-VERSION" + msg2 " where VERSION is displayed in the lists above, uninstall only versions that have \"tkg\" in their name" + elif [ "$_distro" = "Fedora" ]; then + dnf list --installed kernel* + msg2 "To uninstall a version, you should remove the kernel, kernel-headers and kernel-devel associated to it (if installed), with: " + msg2 " sudo dnf remove --noautoremove kernel-VERSION kernel-devel-VERSION kernel-headers-VERSION" + msg2 " where VERSION is displayed in the second column" + elif [ "$_distro" = "Suse" ]; then + zypper packages --installed-only | grep "kernel.*tkg" + msg2 "To uninstall a version, you should remove the kernel, kernel-headers and kernel-devel associated to it (if installed), with: " + msg2 " sudo zypper remove --no-clean-deps kernel-VERSION kernel-devel-VERSION kernel-headers-VERSION" + msg2 " where VERSION is displayed in the second to last column" + fi + +fi diff --git a/linux-tkg/linux-tkg-config/5.4/90-cleanup.hook b/linux-tkg/linux-tkg-config/5.4/90-cleanup.hook new file mode 100644 index 0000000..99f5221 --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.4/90-cleanup.hook @@ -0,0 +1,14 @@ +[Trigger] +Type = File +Operation = Install +Operation = Upgrade +Operation = Remove +Target = usr/lib/modules/*/ +Target = !usr/lib/modules/*/?* + +[Action] +Description = Cleaning up... +When = PostTransaction +Exec = /usr/share/libalpm/scripts/cleanup +NeedsTargets + diff --git a/linux-tkg/linux-tkg-config/5.4/cleanup b/linux-tkg/linux-tkg-config/5.4/cleanup new file mode 100755 index 0000000..c00c08d --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.4/cleanup @@ -0,0 +1,10 @@ +#!/bin/bash + +for _f in /usr/lib/modules/*tkg*; do + if [[ ! -e ${_f}/vmlinuz ]]; then + rm -rf "$_f" + fi +done + +# vim:set ft=sh sw=2 et: + diff --git a/linux-tkg/linux-tkg-config/5.4/config.x86_64 b/linux-tkg/linux-tkg-config/5.4/config.x86_64 new file mode 100644 index 0000000..8216172 --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.4/config.x86_64 @@ -0,0 +1,10598 @@ +# +# Automatically generated file; DO NOT EDIT. 
+# Linux/x86 5.4.57 Kernel Configuration +# + +# +# Compiler: gcc (GCC) 10.1.0 +# +CONFIG_CC_IS_GCC=y +CONFIG_GCC_VERSION=100100 +CONFIG_CLANG_VERSION=0 +CONFIG_CC_CAN_LINK=y +CONFIG_CC_HAS_ASM_GOTO=y +CONFIG_CC_HAS_ASM_INLINE=y +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_EXTABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_INIT_ENV_ARG_LIMIT=32 +# CONFIG_COMPILE_TEST is not set +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_BUILD_SALT="" +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_XZ=y +CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set +CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set +CONFIG_DEFAULT_HOSTNAME="archlinux" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +CONFIG_CROSS_MEMORY_ATTACH=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_HAVE_ARCH_AUDITSYSCALL=y +CONFIG_AUDITSYSCALL=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_GENERIC_IRQ_MIGRATION=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_SIM=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y +CONFIG_GENERIC_IRQ_RESERVATION_MODE=y +CONFIG_IRQ_FORCED_THREADING=y +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +# end of IRQ subsystem + +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_ARCH_CLOCKSOURCE_DATA=y +CONFIG_ARCH_CLOCKSOURCE_INIT=y +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y +CONFIG_GENERIC_CMOS_UPDATE=y + +# +# Timers subsystem +# +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ_COMMON=y +# CONFIG_HZ_PERIODIC is not set +CONFIG_NO_HZ_IDLE=y +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +# end of Timers subsystem + +CONFIG_PREEMPT_NONE=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT is not set + +# +# CPU/Task time and stats accounting +# +CONFIG_TICK_CPU_ACCOUNTING=y +# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_SCHED_AVG_IRQ=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_PSI=y +# CONFIG_PSI_DEFAULT_DISABLED is not set +# end of CPU/Task time and stats accounting + +CONFIG_CPU_ISOLATION=y + +# +# RCU Subsystem +# +CONFIG_TREE_RCU=y +CONFIG_RCU_EXPERT=y +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +CONFIG_RCU_FANOUT=64 +CONFIG_RCU_FANOUT_LEAF=16 +# CONFIG_RCU_FAST_NO_HZ is not set +# CONFIG_RCU_NOCB_CPU is not set +# end of RCU Subsystem + +CONFIG_BUILD_BIN2C=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_IKHEADERS is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y + +# +# Scheduler features +# +CONFIG_UCLAMP_TASK=y +CONFIG_UCLAMP_BUCKETS_COUNT=5 +# end of Scheduler features + +CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y +CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_NUMA_BALANCING=y +CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y 
+CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_SWAP_ENABLED=y +CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +# CONFIG_RT_GROUP_SCHED is not set +CONFIG_UCLAMP_TASK_GROUP=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_USER_NS_UNPRIVILEGED=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_AUTOGROUP=y +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_RD_XZ=y +CONFIG_RD_LZO=y +CONFIG_RD_LZ4=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_SYSCTL=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_HAVE_PCSPKR_PLATFORM=y +CONFIG_BPF=y +CONFIG_EXPERT=y +# CONFIG_UID16 is not set +CONFIG_MULTIUSER=y +CONFIG_SGETMASK_SYSCALL=y +# CONFIG_SYSFS_SYSCALL is not set +# CONFIG_SYSCTL_SYSCALL is not set +CONFIG_FHANDLE=y +CONFIG_POSIX_TIMERS=y +CONFIG_PRINTK=y +CONFIG_PRINTK_NMI=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_IO_URING=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_MEMBARRIER=y +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_USERFAULTFD=y +CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_RSEQ=y +# CONFIG_DEBUG_RSEQ is not set +# CONFIG_EMBEDDED is not set +CONFIG_HAVE_PERF_EVENTS=y +# CONFIG_PC104 is not set + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +# end of Kernel Performance Events And Counters + +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +CONFIG_SLAB_MERGE_DEFAULT=y +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_SLUB_CPU_PARTIAL=y +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +# end of General setup + +CONFIG_64BIT=y +CONFIG_X86_64=y +CONFIG_X86=y +CONFIG_INSTRUCTION_DECODER=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" +CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_MMU=y +CONFIG_ARCH_MMAP_RND_BITS_MIN=28 +CONFIG_ARCH_MMAP_RND_BITS_MAX=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_ARCH_HAS_FILTER_PGPROT=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_WANT_GENERAL_HUGETLB=y 
+CONFIG_ZONE_DMA32=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y +CONFIG_X86_64_SMP=y +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_DYNAMIC_PHYSICAL_MASK=y +CONFIG_PGTABLE_LEVELS=5 +CONFIG_CC_HAS_SANE_STACKPROTECTOR=y + +# +# Processor type and features +# +CONFIG_ZONE_DMA=y +CONFIG_SMP=y +CONFIG_X86_FEATURE_NAMES=y +CONFIG_X86_X2APIC=y +CONFIG_X86_MPPARSE=y +# CONFIG_GOLDFISH is not set +CONFIG_RETPOLINE=y +CONFIG_X86_CPU_RESCTRL=y +# CONFIG_X86_EXTENDED_PLATFORM is not set +CONFIG_X86_INTEL_LPSS=y +CONFIG_X86_AMD_PLATFORM_DEVICE=y +CONFIG_IOSF_MBI=y +# CONFIG_IOSF_MBI_DEBUG is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_PARAVIRT_XXL=y +# CONFIG_PARAVIRT_DEBUG is not set +CONFIG_PARAVIRT_SPINLOCKS=y +CONFIG_X86_HV_CALLBACK_VECTOR=y +CONFIG_XEN=y +CONFIG_XEN_PV=y +CONFIG_XEN_PV_SMP=y +CONFIG_XEN_DOM0=y +CONFIG_XEN_PVHVM=y +CONFIG_XEN_PVHVM_SMP=y +CONFIG_XEN_512GB=y +CONFIG_XEN_SAVE_RESTORE=y +# CONFIG_XEN_DEBUG_FS is not set +CONFIG_XEN_PVH=y +CONFIG_KVM_GUEST=y +CONFIG_ARCH_CPUIDLE_HALTPOLL=y +CONFIG_PVH=y +# CONFIG_KVM_DEBUG_FS is not set +CONFIG_PARAVIRT_TIME_ACCOUNTING=y +CONFIG_PARAVIRT_CLOCK=y +CONFIG_JAILHOUSE_GUEST=y +CONFIG_ACRN_GUEST=y +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +# CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_PROCESSOR_SELECT=y +CONFIG_CPU_SUP_INTEL=y +CONFIG_CPU_SUP_AMD=y +CONFIG_CPU_SUP_HYGON=y +CONFIG_CPU_SUP_CENTAUR=y +CONFIG_CPU_SUP_ZHAOXIN=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y +CONFIG_GART_IOMMU=y +CONFIG_CALGARY_IOMMU=y +CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y +# CONFIG_MAXSMP is not set +CONFIG_NR_CPUS_RANGE_BEGIN=2 +CONFIG_NR_CPUS_RANGE_END=512 +CONFIG_NR_CPUS_DEFAULT=64 +CONFIG_NR_CPUS=320 +CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y +CONFIG_SCHED_MC_PRIO=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y +CONFIG_X86_MCE_INJECT=m +CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# +CONFIG_PERF_EVENTS_INTEL_UNCORE=m +CONFIG_PERF_EVENTS_INTEL_RAPL=m +CONFIG_PERF_EVENTS_INTEL_CSTATE=m +CONFIG_PERF_EVENTS_AMD_POWER=m +# end of Performance monitoring + +CONFIG_X86_16BIT=y +CONFIG_X86_ESPFIX64=y +CONFIG_X86_VSYSCALL_EMULATION=y +CONFIG_I8K=m +CONFIG_MICROCODE=y +CONFIG_MICROCODE_INTEL=y +CONFIG_MICROCODE_AMD=y +CONFIG_MICROCODE_OLD_INTERFACE=y +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +CONFIG_X86_5LEVEL=y +CONFIG_X86_DIRECT_GBPAGES=y +# CONFIG_X86_CPA_STATISTICS is not set +CONFIG_AMD_MEM_ENCRYPT=y +# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set +CONFIG_NUMA=y +CONFIG_AMD_NUMA=y +CONFIG_X86_64_ACPI_NUMA=y +CONFIG_NODES_SPAN_OTHER_NODES=y +# CONFIG_NUMA_EMU is not set +CONFIG_NODES_SHIFT=5 +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_MEMORY_PROBE=y +CONFIG_ARCH_PROC_KCORE_TEXT=y +CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_X86_PMEM_LEGACY_DEVICE=y +CONFIG_X86_PMEM_LEGACY=m +CONFIG_X86_CHECK_BIOS_CORRUPTION=y +CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y +CONFIG_X86_RESERVE_LOW=64 +CONFIG_MTRR=y 
+CONFIG_MTRR_SANITIZER=y +CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 +CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 +CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y +CONFIG_ARCH_RANDOM=y +CONFIG_X86_SMAP=y +CONFIG_X86_INTEL_UMIP=y +# CONFIG_X86_INTEL_MPX is not set +CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +# CONFIG_X86_INTEL_TSX_MODE_OFF is not set +# CONFIG_X86_INTEL_TSX_MODE_ON is not set +CONFIG_X86_INTEL_TSX_MODE_AUTO=y +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_SECCOMP=y +CONFIG_HZ_100=y +# CONFIG_HZ_250 is not set +# CONFIG_HZ_300 is not set +# CONFIG_HZ_1000 is not set +CONFIG_HZ=100 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_ARCH_HAS_KEXEC_PURGATORY=y +# CONFIG_KEXEC_SIG is not set +CONFIG_CRASH_DUMP=y +CONFIG_KEXEC_JUMP=y +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_DYNAMIC_MEMORY_LAYOUT=y +CONFIG_RANDOMIZE_MEMORY=y +CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 +CONFIG_HOTPLUG_CPU=y +# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set +# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_COMPAT_VDSO is not set +# CONFIG_LEGACY_VSYSCALL_EMULATE is not set +CONFIG_LEGACY_VSYSCALL_XONLY=y +# CONFIG_LEGACY_VSYSCALL_NONE is not set +# CONFIG_CMDLINE_BOOL is not set +CONFIG_MODIFY_LDT_SYSCALL=y +CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set +# end of Processor type and features + +CONFIG_ARCH_HAS_ADD_PAGES=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_USE_PERCPU_NUMA_NODE_ID=y +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y +CONFIG_ARCH_ENABLE_THP_MIGRATION=y + +# +# Power management and ACPI options +# +CONFIG_ARCH_HIBERNATION_HEADER=y +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +# CONFIG_SUSPEND_SKIP_SYNC is not set +CONFIG_HIBERNATE_CALLBACKS=y +CONFIG_HIBERNATION=y +CONFIG_PM_STD_PARTITION="" +CONFIG_PM_SLEEP=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=100 +CONFIG_PM_WAKELOCKS_GC=y +CONFIG_PM=y +CONFIG_PM_DEBUG=y +CONFIG_PM_ADVANCED_DEBUG=y +# CONFIG_PM_TEST_SUSPEND is not set +CONFIG_PM_SLEEP_DEBUG=y +# CONFIG_DPM_WATCHDOG is not set +CONFIG_PM_TRACE=y +CONFIG_PM_TRACE_RTC=y +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y +CONFIG_PM_GENERIC_DOMAINS_SLEEP=y +CONFIG_PM_GENERIC_DOMAINS_OF=y +CONFIG_ENERGY_MODEL=y +CONFIG_ARCH_SUPPORTS_ACPI=y +CONFIG_ACPI=y +CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y +CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y +CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y +# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SPCR_TABLE=y +CONFIG_ACPI_LPIT=y +CONFIG_ACPI_SLEEP=y +# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y +CONFIG_ACPI_EC_DEBUGFS=y +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=y +CONFIG_ACPI_VIDEO=y +CONFIG_ACPI_FAN=y +CONFIG_ACPI_TAD=m +CONFIG_ACPI_DOCK=y +CONFIG_ACPI_CPU_FREQ_PSS=y +CONFIG_ACPI_PROCESSOR_CSTATE=y +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_CPPC_LIB=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_IPMI=m +CONFIG_ACPI_HOTPLUG_CPU=y +CONFIG_ACPI_PROCESSOR_AGGREGATOR=y +CONFIG_ACPI_THERMAL=y +CONFIG_ACPI_NUMA=y +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_DEBUG=y +CONFIG_ACPI_PCI_SLOT=y +CONFIG_ACPI_CONTAINER=y +CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_HOTPLUG_IOAPIC=y +CONFIG_ACPI_SBS=m +CONFIG_ACPI_HED=y +CONFIG_ACPI_CUSTOM_METHOD=m +CONFIG_ACPI_BGRT=y +# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not 
set +CONFIG_ACPI_NFIT=m +# CONFIG_NFIT_SECURITY_DEBUG is not set +CONFIG_ACPI_HMAT=y +CONFIG_HAVE_ACPI_APEI=y +CONFIG_HAVE_ACPI_APEI_NMI=y +CONFIG_ACPI_APEI=y +CONFIG_ACPI_APEI_GHES=y +CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_ACPI_APEI_MEMORY_FAILURE=y +CONFIG_ACPI_APEI_EINJ=m +CONFIG_ACPI_APEI_ERST_DEBUG=m +CONFIG_DPTF_POWER=m +CONFIG_ACPI_WATCHDOG=y +CONFIG_ACPI_EXTLOG=m +CONFIG_ACPI_ADXL=y +CONFIG_PMIC_OPREGION=y +CONFIG_CRC_PMIC_OPREGION=y +CONFIG_XPOWER_PMIC_OPREGION=y +CONFIG_BXT_WC_PMIC_OPREGION=y +CONFIG_CHT_WC_PMIC_OPREGION=y +CONFIG_CHT_DC_TI_PMIC_OPREGION=y +CONFIG_ACPI_CONFIGFS=m +CONFIG_TPS68470_PMIC_OPREGION=y +CONFIG_X86_PM_TIMER=y +CONFIG_SFI=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=m +CONFIG_CPU_FREQ_GOV_USERSPACE=m +CONFIG_CPU_FREQ_GOV_ONDEMAND=m +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + +# +# CPU frequency scaling drivers +# +CONFIG_CPUFREQ_DT=m +CONFIG_CPUFREQ_DT_PLATDEV=y +CONFIG_X86_INTEL_PSTATE=y +CONFIG_X86_PCC_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ_CPB=y +CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_AMD_FREQ_SENSITIVITY=m +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +CONFIG_X86_P4_CLOCKMOD=m + +# +# shared options +# +CONFIG_X86_SPEEDSTEP_LIB=m +# end of CPU Frequency scaling + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +CONFIG_CPU_IDLE_GOV_TEO=y +CONFIG_CPU_IDLE_GOV_HALTPOLL=y +CONFIG_HALTPOLL_CPUIDLE=m +# end of CPU Idle + +CONFIG_INTEL_IDLE=y +# end of Power management and ACPI options + +# +# Bus options (PCI etc.) +# +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_XEN=y +CONFIG_MMCONF_FAM10H=y +# CONFIG_PCI_CNB20LE_QUIRK is not set +# CONFIG_ISA_BUS is not set +CONFIG_ISA_DMA_API=y +CONFIG_AMD_NB=y +# CONFIG_X86_SYSFB is not set +# end of Bus options (PCI etc.) 
+ +# +# Binary Emulations +# +CONFIG_IA32_EMULATION=y +# CONFIG_X86_X32 is not set +CONFIG_COMPAT_32=y +CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y +CONFIG_SYSVIPC_COMPAT=y +# end of Binary Emulations + +CONFIG_X86_DEV_DMA_OPS=y + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DMIID=y +CONFIG_DMI_SYSFS=m +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=m +CONFIG_FW_CFG_SYSFS=m +# CONFIG_FW_CFG_SYSFS_CMDLINE is not set +CONFIG_GOOGLE_FIRMWARE=y +# CONFIG_GOOGLE_SMI is not set +CONFIG_GOOGLE_COREBOOT_TABLE=m +CONFIG_GOOGLE_MEMCONSOLE=m +# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set +CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m +CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m +CONFIG_GOOGLE_VPD=m + +# +# EFI (Extensible Firmware Interface) Support +# +# CONFIG_EFI_VARS is not set +CONFIG_EFI_ESRT=y +CONFIG_EFI_RUNTIME_MAP=y +# CONFIG_EFI_FAKE_MEMMAP is not set +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_CAPSULE_LOADER=m +# CONFIG_EFI_TEST is not set +CONFIG_APPLE_PROPERTIES=y +# CONFIG_RESET_ATTACK_MITIGATION is not set +CONFIG_EFI_RCI2_TABLE=y +# end of EFI (Extensible Firmware Interface) Support + +CONFIG_UEFI_CPER=y +CONFIG_UEFI_CPER_X86=y +CONFIG_EFI_DEV_PATH_PARSER=y +CONFIG_EFI_EARLYCON=y + +# +# Tegra firmware driver +# +# end of Tegra firmware driver +# end of Firmware Drivers + +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_IRQFD=y +CONFIG_HAVE_KVM_IRQ_ROUTING=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_MMIO=y +CONFIG_KVM_ASYNC_PF=y +CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y +CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y +CONFIG_KVM_COMPAT=y +CONFIG_HAVE_KVM_IRQ_BYPASS=y +CONFIG_HAVE_KVM_NO_POLL=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +CONFIG_KVM_AMD_SEV=y +CONFIG_KVM_MMU_AUDIT=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_SCSI=m +CONFIG_VHOST_VSOCK=m +CONFIG_VHOST=m +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set + +# +# General architecture-dependent options +# +CONFIG_CRASH_CORE=y +CONFIG_KEXEC_CORE=y +CONFIG_HOTPLUG_SMT=y +CONFIG_OPROFILE=m +# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_OPROFILE_NMI_TIMER=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_OPTPROBES=y +CONFIG_KPROBES_ON_FTRACE=y +CONFIG_UPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y +CONFIG_KRETPROBES=y +CONFIG_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_OPTPROBES=y +CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y +CONFIG_HAVE_NMI=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_SET_DIRECT_MAP=y +CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y +CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y +CONFIG_HAVE_ASM_MODVERSIONS=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_RSEQ=y +CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_PERF_EVENTS_NMI=y +CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y +CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y +CONFIG_HAVE_RCU_TABLE_FREE=y +CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y 
+CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y +CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP_FILTER=y +CONFIG_HAVE_ARCH_STACKLEAK=y +CONFIG_HAVE_STACKPROTECTOR=y +CONFIG_CC_HAS_STACKPROTECTOR_NONE=y +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y +CONFIG_HAVE_CONTEXT_TRACKING=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_MOVE_PMD=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_HAVE_ARCH_SOFT_DIRTY=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_HAVE_EXIT_THREAD=y +CONFIG_ARCH_MMAP_RND_BITS=28 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8 +CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_COPY_THREAD_TLS=y +CONFIG_HAVE_STACK_VALIDATION=y +CONFIG_HAVE_RELIABLE_STACKTRACE=y +CONFIG_ISA_BUS_API=y +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +CONFIG_64BIT_TIME=y +CONFIG_COMPAT_32BIT_TIME=y +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_ARCH_HAS_REFCOUNT=y +# CONFIG_REFCOUNT_FULL is not set +CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y +CONFIG_ARCH_USE_MEMREMAP_PROT=y +CONFIG_LOCK_EVENT_COUNTS=y +CONFIG_ARCH_HAS_MEM_ENCRYPT=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +# end of GCOV-based kernel profiling + +CONFIG_PLUGIN_HOSTCC="g++" +CONFIG_HAVE_GCC_PLUGINS=y +CONFIG_GCC_PLUGINS=y +# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set +# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set +# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set +# end of General architecture-dependent options + +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULE_SIG_FORMAT=y +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +# CONFIG_MODVERSIONS is not set +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +# CONFIG_MODULE_SIG_FORCE is not set +CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set +# CONFIG_MODULE_SIG_SHA224 is not set +# CONFIG_MODULE_SIG_SHA256 is not set +# CONFIG_MODULE_SIG_SHA384 is not set +CONFIG_MODULE_SIG_SHA512=y +CONFIG_MODULE_SIG_HASH="sha512" +CONFIG_MODULE_COMPRESS=y +# CONFIG_MODULE_COMPRESS_GZIP is not set +CONFIG_MODULE_COMPRESS_XZ=y +CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS=y +CONFIG_UNUSED_SYMBOLS=y +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLK_RQ_ALLOC_TIME=y +CONFIG_BLK_SCSI_REQUEST=y +CONFIG_BLK_DEV_BSG=y +CONFIG_BLK_DEV_BSGLIB=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_DEV_THROTTLING_LOW=y +# CONFIG_BLK_CMDLINE_PARSER is not set +CONFIG_BLK_WBT=y +CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_WBT_MQ=y +CONFIG_BLK_DEBUG_FS=y +CONFIG_BLK_DEBUG_FS_ZONED=y +CONFIG_BLK_SED_OPAL=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_AIX_PARTITION=y +# CONFIG_OSF_PARTITION is not set +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y 
+CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +# CONFIG_UNIXWARE_DISKLABEL is not set +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +# CONFIG_SGI_PARTITION is not set +# CONFIG_ULTRIX_PARTITION is not set +# CONFIG_SUN_PARTITION is not set +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set +# CONFIG_CMDLINE_PARTITION is not set +# end of Partition Types + +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_MQ_VIRTIO=y +CONFIG_BLK_MQ_RDMA=y +CONFIG_BLK_PM=y + +# +# IO Schedulers +# +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +# CONFIG_BFQ_CGROUP_DEBUG is not set +# end of IO Schedulers + +CONFIG_PREEMPT_NOTIFIERS=y +CONFIG_PADATA=y +CONFIG_ASN1=y +CONFIG_INLINE_SPIN_UNLOCK_IRQ=y +CONFIG_INLINE_READ_UNLOCK=y +CONFIG_INLINE_READ_UNLOCK_IRQ=y +CONFIG_INLINE_WRITE_UNLOCK=y +CONFIG_INLINE_WRITE_UNLOCK_IRQ=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y +CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_FREEZER=y + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +CONFIG_BINFMT_MISC=y +CONFIG_COREDUMP=y +# end of Executable file formats + +# +# Memory Management options +# +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_HAVE_MEMBLOCK_NODE_MAP=y +CONFIG_HAVE_FAST_GUP=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_HAVE_BOOTMEM_INFO_NODE=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTPLUG_SPARSE=y +CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MEMORY_BALLOON=y +CONFIG_BALLOON_COMPACTION=y +CONFIG_COMPACTION=y +CONFIG_MIGRATION=y +CONFIG_CONTIG_ALLOC=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y +CONFIG_HWPOISON_INJECT=m +CONFIG_TRANSPARENT_HUGEPAGE=y +# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set +CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y +CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_THP_SWAP=y +CONFIG_TRANSPARENT_HUGE_PAGECACHE=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +# CONFIG_CMA is not set +CONFIG_MEM_SOFT_DIRTY=y +CONFIG_ZSWAP=y +CONFIG_ZPOOL=y +CONFIG_ZBUD=y +CONFIG_Z3FOLD=y +CONFIG_ZSMALLOC=y +# CONFIG_PGTABLE_MAPPING is not set +# CONFIG_ZSMALLOC_STAT is not set +CONFIG_GENERIC_EARLY_IOREMAP=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set +CONFIG_ARCH_HAS_PTE_DEVMAP=y +CONFIG_ZONE_DEVICE=y +CONFIG_DEV_PAGEMAP_OPS=y +CONFIG_HMM_MIRROR=y +CONFIG_DEVICE_PRIVATE=y +CONFIG_FRAME_VECTOR=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y +CONFIG_ARCH_HAS_PKEYS=y +# CONFIG_PERCPU_STATS is not set +# CONFIG_GUP_BENCHMARK is not set +CONFIG_READ_ONLY_THP_FOR_FS=y +CONFIG_ARCH_HAS_PTE_SPECIAL=y +# end of Memory Management options + +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_INGRESS=y +CONFIG_NET_EGRESS=y +CONFIG_NET_REDIRECT=y +CONFIG_SKB_EXTENSIONS=y + +# +# Networking options +# +CONFIG_PACKET=y 
+CONFIG_PACKET_DIAG=y +CONFIG_UNIX=y +CONFIG_UNIX_SCM=y +CONFIG_UNIX_DIAG=y +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y +CONFIG_XFRM_ALGO=m +CONFIG_XFRM_USER=m +CONFIG_XFRM_INTERFACE=m +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_XFRM_IPCOMP=m +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +# CONFIG_IP_FIB_TRIE_STATS is not set +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_CLASSID=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IP_TUNNEL=m +CONFIG_NET_IPGRE=m +# CONFIG_NET_IPGRE_BROADCAST is not set +CONFIG_IP_MROUTE_COMMON=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESP_OFFLOAD=m +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_DIAG=m +CONFIG_INET_TCP_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_INET_RAW_DIAG=m +CONFIG_INET_DIAG_DESTROY=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=y +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_NV=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m +CONFIG_TCP_CONG_CDG=m +CONFIG_TCP_CONG_BBR=m +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_RENO is not set +CONFIG_DEFAULT_TCP_CONG="cubic" +CONFIG_TCP_MD5SIG=y +CONFIG_IPV6=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESP_OFFLOAD=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_ILA=m +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y +CONFIG_IPV6_NDISC_NODETYPE=y +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IPV6_SEG6_HMAC=y +CONFIG_IPV6_SEG6_BPF=y +CONFIG_NETLABEL=y +CONFIG_NETWORK_SECMARK=y +CONFIG_NET_PTP_CLASSIFY=y +CONFIG_NETWORK_PHY_TIMESTAMPING=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_FAMILY_BRIDGE=y +CONFIG_NETFILTER_FAMILY_ARP=y +CONFIG_NETFILTER_NETLINK_ACCT=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NETFILTER_NETLINK_OSF=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_LOG_COMMON=m +CONFIG_NF_LOG_NETDEV=m +CONFIG_NETFILTER_CONNCOUNT=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_GRE=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m 
+CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y +CONFIG_NF_NAT=m +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_SET=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_CONNLIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_OBJREF=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NF_DUP_NETDEV=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES=m + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +CONFIG_NETFILTER_XT_CONNMARK=m +CONFIG_NETFILTER_XT_SET=m + +# +# Xtables targets +# +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LED=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_RATEEST=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ECN=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m 
+CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +# end of Core Netfilter Configuration + +CONFIG_IP_SET=m +CONFIG_IP_SET_MAX=256 +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPMARK=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_IPMAC=m +CONFIG_IP_SET_HASH_MAC=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=15 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_AH_ESP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_FO=m +CONFIG_IP_VS_OVF=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_MH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS MH scheduler +# +CONFIG_IP_VS_MH_TAB_INDEX=12 + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PE_SIP=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +CONFIG_NF_SOCKET_IPV4=m +CONFIG_NF_TPROXY_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_REJECT_IPV4=m +CONFIG_NFT_DUP_IPV4=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_FLOW_TABLE_IPV4=m +CONFIG_NF_DUP_IPV4=m +CONFIG_NF_LOG_ARP=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_REJECT_IPV4=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_SYNPROXY=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +# end of IP: Netfilter Configuration + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_SOCKET_IPV6=m +CONFIG_NF_TPROXY_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_REJECT_IPV6=m +CONFIG_NFT_DUP_IPV6=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NF_FLOW_TABLE_IPV6=m +CONFIG_NF_DUP_IPV6=m +CONFIG_NF_REJECT_IPV6=m +CONFIG_NF_LOG_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m 
+CONFIG_IP6_NF_MATCH_SRH=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_TARGET_SYNPROXY=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_IP6_NF_TARGET_NPT=m +# end of IPv6: Netfilter Configuration + +CONFIG_NF_DEFRAG_IPV6=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_BRIDGE_REJECT=m +CONFIG_NF_LOG_BRIDGE=m +CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +# CONFIG_BPFILTER is not set +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m + +# +# DCCP CCIDs Configuration +# +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=y +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_TFRC_LIB=y +# end of DCCP CCIDs Configuration + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +# end of DCCP Kernel Hacking + +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_OBJCNT is not set +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set +CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set +CONFIG_SCTP_COOKIE_HMAC_MD5=y +CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +# CONFIG_RDS_DEBUG is not set +CONFIG_TIPC=m +CONFIG_TIPC_MEDIA_IB=y +CONFIG_TIPC_MEDIA_UDP=y +CONFIG_TIPC_DIAG=m +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_L2TP=m +# CONFIG_L2TP_DEBUGFS is not set +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_STP=m +CONFIG_GARP=m +CONFIG_MRP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_HAVE_NET_DSA=y +CONFIG_NET_DSA=m +CONFIG_NET_DSA_TAG_8021Q=m +CONFIG_NET_DSA_TAG_BRCM_COMMON=m +CONFIG_NET_DSA_TAG_BRCM=m +CONFIG_NET_DSA_TAG_BRCM_PREPEND=m +CONFIG_NET_DSA_TAG_GSWIP=m +CONFIG_NET_DSA_TAG_DSA=m +CONFIG_NET_DSA_TAG_EDSA=m +CONFIG_NET_DSA_TAG_MTK=m +CONFIG_NET_DSA_TAG_KSZ=m +CONFIG_NET_DSA_TAG_QCA=m +CONFIG_NET_DSA_TAG_LAN9303=m +CONFIG_NET_DSA_TAG_SJA1105=m +CONFIG_NET_DSA_TAG_TRAILER=m +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_VLAN_8021Q_MVRP=y +# CONFIG_DECNET is not set +CONFIG_LLC=m +CONFIG_LLC2=m +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=m +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +CONFIG_PHONET=m +CONFIG_6LOWPAN=m +# CONFIG_6LOWPAN_DEBUGFS is not set +CONFIG_6LOWPAN_NHC=m +CONFIG_6LOWPAN_NHC_DEST=m +CONFIG_6LOWPAN_NHC_FRAGMENT=m +CONFIG_6LOWPAN_NHC_HOP=m +CONFIG_6LOWPAN_NHC_IPV6=m +CONFIG_6LOWPAN_NHC_MOBILITY=m +CONFIG_6LOWPAN_NHC_ROUTING=m +CONFIG_6LOWPAN_NHC_UDP=m +CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m +CONFIG_6LOWPAN_GHC_UDP=m +CONFIG_6LOWPAN_GHC_ICMPV6=m +CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m +CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m +CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y +CONFIG_IEEE802154_SOCKET=m 
+CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_CBS=m +CONFIG_NET_SCH_ETF=m +CONFIG_NET_SCH_TAPRIO=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_SKBPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=y +CONFIG_NET_SCH_CAKE=m +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_HHF=m +CONFIG_NET_SCH_PIE=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_DEFAULT=y +# CONFIG_DEFAULT_FQ is not set +# CONFIG_DEFAULT_CODEL is not set +CONFIG_DEFAULT_FQ_CODEL=y +# CONFIG_DEFAULT_SFQ is not set +# CONFIG_DEFAULT_PFIFO_FAST is not set +CONFIG_DEFAULT_NET_SCH="fq_codel" + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=m +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_CANID=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_IPT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_MPLS=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_CTINFO=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +CONFIG_NET_TC_SKB_EXT=y +CONFIG_NET_SCH_FIFO=y +CONFIG_DCB=y +CONFIG_DNS_RESOLVER=m +CONFIG_BATMAN_ADV=m +CONFIG_BATMAN_ADV_BATMAN_V=y +CONFIG_BATMAN_ADV_BLA=y +CONFIG_BATMAN_ADV_DAT=y +CONFIG_BATMAN_ADV_NC=y +CONFIG_BATMAN_ADV_MCAST=y +CONFIG_BATMAN_ADV_DEBUGFS=y +# CONFIG_BATMAN_ADV_DEBUG is not set +CONFIG_BATMAN_ADV_SYSFS=y +# CONFIG_BATMAN_ADV_TRACING is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m +CONFIG_VSOCKETS=m +CONFIG_VSOCKETS_DIAG=m +CONFIG_VMWARE_VMCI_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS_COMMON=m +CONFIG_HYPERV_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m +CONFIG_HSR=m +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_NCSI=y +CONFIG_NCSI_OEM_CMD_GET_MAC=y +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_XPS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_STREAM_PARSER=y +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +CONFIG_NET_PKTGEN=m +CONFIG_NET_DROP_MONITOR=y +# end of Network testing +# end of Networking options + +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y 
+CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_MKISS=m +CONFIG_6PACK=m +CONFIG_BPQETHER=m +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m +CONFIG_BAYCOM_PAR=m +CONFIG_YAM=m +# end of AX.25 network device drivers + +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_GW=m +CONFIG_CAN_J1939=m + +# +# CAN Device Drivers +# +CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_CALC_BITTIMING=y +CONFIG_CAN_FLEXCAN=m +CONFIG_CAN_GRCAN=m +CONFIG_CAN_JANZ_ICAN3=m +CONFIG_CAN_KVASER_PCIEFD=m +CONFIG_CAN_C_CAN=m +CONFIG_CAN_C_CAN_PLATFORM=m +CONFIG_CAN_C_CAN_PCI=m +CONFIG_CAN_CC770=m +# CONFIG_CAN_CC770_ISA is not set +CONFIG_CAN_CC770_PLATFORM=m +CONFIG_CAN_IFI_CANFD=m +CONFIG_CAN_M_CAN=m +CONFIG_CAN_M_CAN_PLATFORM=m +CONFIG_CAN_M_CAN_TCAN4X5X=m +CONFIG_CAN_PEAK_PCIEFD=m +CONFIG_CAN_SJA1000=m +CONFIG_CAN_EMS_PCI=m +# CONFIG_CAN_EMS_PCMCIA is not set +CONFIG_CAN_F81601=m +CONFIG_CAN_KVASER_PCI=m +CONFIG_CAN_PEAK_PCI=m +CONFIG_CAN_PEAK_PCIEC=y +CONFIG_CAN_PEAK_PCMCIA=m +CONFIG_CAN_PLX_PCI=m +# CONFIG_CAN_SJA1000_ISA is not set +CONFIG_CAN_SJA1000_PLATFORM=m +CONFIG_CAN_SOFTING=m +CONFIG_CAN_SOFTING_CS=m + +# +# CAN SPI interfaces +# +CONFIG_CAN_HI311X=m +CONFIG_CAN_MCP251X=m +# end of CAN SPI interfaces + +# +# CAN USB interfaces +# +CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_EMS_USB=m +CONFIG_CAN_ESD_USB2=m +CONFIG_CAN_GS_USB=m +CONFIG_CAN_KVASER_USB=m +CONFIG_CAN_MCBA_USB=m +CONFIG_CAN_PEAK_USB=m +CONFIG_CAN_UCAN=m +# end of CAN USB interfaces + +# CONFIG_CAN_DEBUG_DEVICES is not set +# end of CAN Device Drivers + +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_BT_6LOWPAN=m +CONFIG_BT_LEDS=y +# CONFIG_BT_SELFTEST is not set +CONFIG_BT_DEBUGFS=y + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_QCA=m +CONFIG_BT_HCIBTUSB=m +CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_MTK=y +CONFIG_BT_HCIBTUSB_RTL=y +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_3WIRE=y +CONFIG_BT_HCIUART_INTEL=y +CONFIG_BT_HCIUART_BCM=y +CONFIG_BT_HCIUART_RTL=y +CONFIG_BT_HCIUART_QCA=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIUART_MRVL=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_BT_ATH3K=m +CONFIG_BT_WILINK=m +CONFIG_BT_MTKSDIO=m +CONFIG_BT_MTKUART=m +CONFIG_BT_HCIRSI=m +# end of Bluetooth device drivers + +CONFIG_AF_RXRPC=m +CONFIG_AF_RXRPC_IPV6=y +# CONFIG_AF_RXRPC_INJECT_LOSS is not set +CONFIG_AF_RXRPC_DEBUG=y +CONFIG_RXKAD=y +CONFIG_AF_KCM=m +CONFIG_STREAM_PARSER=y +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y +CONFIG_CFG80211=m +# CONFIG_NL80211_TESTMODE is not set +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +# CONFIG_CFG80211_CERTIFICATION_ONUS is not set +CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y +CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y +CONFIG_CFG80211_DEFAULT_PS=y +CONFIG_CFG80211_DEBUGFS=y +CONFIG_CFG80211_CRDA_SUPPORT=y +CONFIG_CFG80211_WEXT=y 
+CONFIG_CFG80211_WEXT_EXPORT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m +CONFIG_LIB80211_CRYPT_TKIP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +CONFIG_MAC80211_MESH=y +CONFIG_MAC80211_LEDS=y +CONFIG_MAC80211_DEBUGFS=y +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_WIMAX=m +CONFIG_WIMAX_DEBUG_LEVEL=8 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=m +CONFIG_NET_9P=m +CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_XEN=m +CONFIG_NET_9P_RDMA=m +# CONFIG_NET_9P_DEBUG is not set +CONFIG_CAIF=m +# CONFIG_CAIF_DEBUG is not set +CONFIG_CAIF_NETDEV=m +CONFIG_CAIF_USB=m +CONFIG_CEPH_LIB=m +CONFIG_CEPH_LIB_PRETTYDEBUG=y +CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y +CONFIG_NFC=m +CONFIG_NFC_DIGITAL=m +CONFIG_NFC_NCI=m +CONFIG_NFC_NCI_SPI=m +CONFIG_NFC_NCI_UART=m +CONFIG_NFC_HCI=m +CONFIG_NFC_SHDLC=y + +# +# Near Field Communication (NFC) devices +# +CONFIG_NFC_TRF7970A=m +CONFIG_NFC_MEI_PHY=m +CONFIG_NFC_SIM=m +CONFIG_NFC_PORT100=m +CONFIG_NFC_FDP=m +CONFIG_NFC_FDP_I2C=m +CONFIG_NFC_PN544=m +CONFIG_NFC_PN544_I2C=m +CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m +CONFIG_NFC_MICROREAD=m +CONFIG_NFC_MICROREAD_I2C=m +CONFIG_NFC_MICROREAD_MEI=m +CONFIG_NFC_MRVL=m +CONFIG_NFC_MRVL_USB=m +CONFIG_NFC_MRVL_UART=m +CONFIG_NFC_MRVL_I2C=m +CONFIG_NFC_MRVL_SPI=m +CONFIG_NFC_ST21NFCA=m +CONFIG_NFC_ST21NFCA_I2C=m +CONFIG_NFC_ST_NCI=m +CONFIG_NFC_ST_NCI_I2C=m +CONFIG_NFC_ST_NCI_SPI=m +CONFIG_NFC_NXP_NCI=m +CONFIG_NFC_NXP_NCI_I2C=m +CONFIG_NFC_S3FWRN5=m +CONFIG_NFC_S3FWRN5_I2C=m +CONFIG_NFC_ST95HF=m +# end of Near Field Communication (NFC) devices + +CONFIG_PSAMPLE=m +CONFIG_NET_IFE=m +CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y +CONFIG_SOCK_VALIDATE_XMIT=y +CONFIG_NET_SOCK_MSG=y +CONFIG_NET_DEVLINK=y +CONFIG_PAGE_POOL=y +CONFIG_FAILOVER=m +CONFIG_HAVE_EBPF_JIT=y + +# +# Device Drivers +# +CONFIG_HAVE_EISA=y +# CONFIG_EISA is not set +CONFIG_HAVE_PCI=y +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI_PCIE=y +CONFIG_PCIEAER=y +# CONFIG_PCIEAER_INJECT is not set +CONFIG_PCIE_ECRC=y +CONFIG_PCIEASPM=y +# CONFIG_PCIEASPM_DEBUG is not set +CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_POWERSAVE is not set +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_PTM=y +# CONFIG_PCIE_BW is not set +CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +CONFIG_PCI_QUIRKS=y +# CONFIG_PCI_DEBUG is not set +CONFIG_PCI_REALLOC_ENABLE_AUTO=y +CONFIG_PCI_STUB=y +CONFIG_PCI_PF_STUB=m +CONFIG_XEN_PCIDEV_FRONTEND=m +CONFIG_PCI_ATS=y +CONFIG_PCI_ECAM=y +CONFIG_PCI_LOCKLESS_CONFIG=y +CONFIG_PCI_IOV=y +CONFIG_PCI_PRI=y +CONFIG_PCI_PASID=y +CONFIG_PCI_P2PDMA=y +CONFIG_PCI_LABEL=y +CONFIG_PCI_HYPERV=m +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +CONFIG_HOTPLUG_PCI_ACPI_IBM=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m +CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +CONFIG_HOTPLUG_PCI_SHPC=y + +# +# PCI controller drivers +# + +# +# Cadence PCIe controllers support +# +CONFIG_PCIE_CADENCE=y +CONFIG_PCIE_CADENCE_HOST=y +CONFIG_PCIE_CADENCE_EP=y +# end of Cadence PCIe controllers support + +CONFIG_PCI_FTPCI100=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y 
+CONFIG_PCIE_XILINX=y +CONFIG_VMD=m +CONFIG_PCI_HYPERV_INTERFACE=m + +# +# DesignWare PCI Core Support +# +CONFIG_PCIE_DW=y +CONFIG_PCIE_DW_HOST=y +CONFIG_PCIE_DW_EP=y +CONFIG_PCIE_DW_PLAT=y +CONFIG_PCIE_DW_PLAT_HOST=y +CONFIG_PCIE_DW_PLAT_EP=y +CONFIG_PCI_MESON=y +# end of DesignWare PCI Core Support +# end of PCI controller drivers + +# +# PCI Endpoint +# +CONFIG_PCI_ENDPOINT=y +CONFIG_PCI_ENDPOINT_CONFIGFS=y +# CONFIG_PCI_EPF_TEST is not set +# end of PCI Endpoint + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m +# end of PCI switch controller drivers + +CONFIG_PCCARD=m +CONFIG_PCMCIA=m +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=m +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +CONFIG_PD6729=m +CONFIG_I82092=m +CONFIG_PCCARD_NONSTATIC=y +CONFIG_RAPIDIO=m +CONFIG_RAPIDIO_TSI721=m +CONFIG_RAPIDIO_DISC_TIMEOUT=30 +CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y +CONFIG_RAPIDIO_DMA_ENGINE=y +# CONFIG_RAPIDIO_DEBUG is not set +CONFIG_RAPIDIO_ENUM_BASIC=m +CONFIG_RAPIDIO_CHMAN=m +CONFIG_RAPIDIO_MPORT_CDEV=m + +# +# RapidIO Switch drivers +# +CONFIG_RAPIDIO_TSI57X=m +CONFIG_RAPIDIO_CPS_XX=m +CONFIG_RAPIDIO_TSI568=m +CONFIG_RAPIDIO_CPS_GEN2=m +CONFIG_RAPIDIO_RXS_GEN3=m +# end of RapidIO Switch drivers + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y + +# +# Firmware loader +# +CONFIG_FW_LOADER=y +CONFIG_FW_LOADER_PAGED_BUF=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_FW_LOADER_USER_HELPER is not set +CONFIG_FW_LOADER_COMPRESS=y +# end of Firmware loader + +CONFIG_WANT_DEV_COREDUMP=y +CONFIG_ALLOW_DEV_COREDUMP=y +CONFIG_DEV_COREDUMP=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +CONFIG_HMEM_REPORTING=y +# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set +CONFIG_SYS_HYPERVISOR=y +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=y +CONFIG_REGMAP_SLIMBUS=m +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_SPMI=m +CONFIG_REGMAP_W1=m +CONFIG_REGMAP_MMIO=y +CONFIG_REGMAP_IRQ=y +CONFIG_REGMAP_SCCB=m +CONFIG_REGMAP_I3C=m +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +# end of Generic Driver Options + +# +# Bus devices +# +CONFIG_MOXTET=m +CONFIG_SIMPLE_PM_BUS=y +# end of Bus devices + +CONFIG_CONNECTOR=y +CONFIG_PROC_EVENTS=y +CONFIG_GNSS=m +CONFIG_GNSS_SERIAL=m +CONFIG_GNSS_MTK_SERIAL=m +CONFIG_GNSS_SIRF_SERIAL=m +CONFIG_GNSS_UBX_SERIAL=m +CONFIG_MTD=m +CONFIG_MTD_TESTS=m + +# +# Partition parsers +# +CONFIG_MTD_AR7_PARTS=m +CONFIG_MTD_CMDLINE_PARTS=m +CONFIG_MTD_OF_PARTS=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 +# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set +# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set +# end of Partition parsers + +# +# User Modules And Translation Layers +# +CONFIG_MTD_BLKDEVS=m +CONFIG_MTD_BLOCK=m +CONFIG_MTD_BLOCK_RO=m +CONFIG_FTL=m +CONFIG_NFTL=m +CONFIG_NFTL_RW=y +CONFIG_INFTL=m +CONFIG_RFD_FTL=m +CONFIG_SSFDC=m +CONFIG_SM_FTL=m +CONFIG_MTD_OOPS=m +CONFIG_MTD_SWAP=m +CONFIG_MTD_PARTITIONED_MASTER=y + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +CONFIG_MTD_CFI_INTELEXT=m 
+CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +CONFIG_MTD_CFI_UTIL=m +CONFIG_MTD_RAM=m +CONFIG_MTD_ROM=m +CONFIG_MTD_ABSENT=m +# end of RAM/ROM/Flash chip drivers + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +# CONFIG_MTD_PHYSMAP_COMPAT is not set +CONFIG_MTD_PHYSMAP_OF=y +CONFIG_MTD_PHYSMAP_VERSATILE=y +CONFIG_MTD_PHYSMAP_GEMINI=y +CONFIG_MTD_PHYSMAP_GPIO_ADDR=y +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICHXROM=m +CONFIG_MTD_ESB2ROM=m +CONFIG_MTD_CK804XROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m +CONFIG_MTD_PCMCIA=m +# CONFIG_MTD_PCMCIA_ANONYMOUS is not set +CONFIG_MTD_INTEL_VR_NOR=m +CONFIG_MTD_PLATRAM=m +# end of Mapping drivers for chip access + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m +# CONFIG_MTD_PMC551_BUGFIX is not set +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_DATAFLASH=m +# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set +CONFIG_MTD_DATAFLASH_OTP=y +CONFIG_MTD_MCHP23K256=m +CONFIG_MTD_SST25L=m +CONFIG_MTD_SLRAM=m +CONFIG_MTD_PHRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLOCK2MTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOCG3=m +CONFIG_BCH_CONST_M=14 +CONFIG_BCH_CONST_T=4 +# end of Self-contained MTD device drivers + +CONFIG_MTD_NAND_CORE=m +CONFIG_MTD_ONENAND=m +# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set +CONFIG_MTD_ONENAND_GENERIC=m +CONFIG_MTD_ONENAND_OTP=y +CONFIG_MTD_ONENAND_2X_PROGRAM=y +CONFIG_MTD_NAND_ECC_SW_HAMMING=m +CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y +CONFIG_MTD_RAW_NAND=m +CONFIG_MTD_NAND_ECC_SW_BCH=y + +# +# Raw/parallel NAND flash controllers +# +CONFIG_MTD_NAND_DENALI=m +CONFIG_MTD_NAND_DENALI_PCI=m +CONFIG_MTD_NAND_DENALI_DT=m +CONFIG_MTD_NAND_CAFE=m +CONFIG_MTD_NAND_MXIC=m +CONFIG_MTD_NAND_GPIO=m +CONFIG_MTD_NAND_PLATFORM=m + +# +# Misc +# +CONFIG_MTD_SM_COMMON=m +CONFIG_MTD_NAND_NANDSIM=m +CONFIG_MTD_NAND_RICOH=m +CONFIG_MTD_NAND_DISKONCHIP=m +# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 +CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y +CONFIG_MTD_SPI_NAND=m + +# +# LPDDR & LPDDR2 PCM memory drivers +# +CONFIG_MTD_LPDDR=m +CONFIG_MTD_QINFO_PROBE=m +# end of LPDDR & LPDDR2 PCM memory drivers + +CONFIG_MTD_SPI_NOR=m +CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y +CONFIG_SPI_MTK_QUADSPI=m +CONFIG_SPI_INTEL_SPI=m +CONFIG_SPI_INTEL_SPI_PCI=m +CONFIG_SPI_INTEL_SPI_PLATFORM=m +CONFIG_MTD_UBI=m +CONFIG_MTD_UBI_WL_THRESHOLD=4096 +CONFIG_MTD_UBI_BEB_LIMIT=20 +CONFIG_MTD_UBI_FASTMAP=y +CONFIG_MTD_UBI_GLUEBI=m +CONFIG_MTD_UBI_BLOCK=y +CONFIG_MTD_HYPERBUS=m +CONFIG_DTC=y +CONFIG_OF=y +# CONFIG_OF_UNITTEST is not set +CONFIG_OF_FLATTREE=y +CONFIG_OF_KOBJ=y +CONFIG_OF_DYNAMIC=y +CONFIG_OF_ADDRESS=y +CONFIG_OF_IRQ=y +CONFIG_OF_NET=y +CONFIG_OF_MDIO=m +CONFIG_OF_RESOLVE=y +CONFIG_OF_OVERLAY=y +CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_PC_PCMCIA=m +CONFIG_PARPORT_AX88796=m +CONFIG_PARPORT_1284=y +CONFIG_PARPORT_NOT_PC=y +CONFIG_PNP=y +CONFIG_PNP_DEBUG_MESSAGES=y + +# +# Protocols +# +CONFIG_PNPACPI=y +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_BLK_DEV_FD=m +CONFIG_CDROM=m +# CONFIG_PARIDE is not set +CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m +CONFIG_ZRAM=m +CONFIG_ZRAM_WRITEBACK=y +# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=m 
+CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m +# CONFIG_DRBD_FAULT_INJECTION is not set +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_SKD=m +CONFIG_BLK_DEV_SX8=m +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_CDROM_PKTCDVD=m +CONFIG_CDROM_PKTCDVD_BUFFERS=8 +# CONFIG_CDROM_PKTCDVD_WCACHE is not set +CONFIG_ATA_OVER_ETH=m +CONFIG_XEN_BLKDEV_FRONTEND=m +CONFIG_XEN_BLKDEV_BACKEND=m +CONFIG_VIRTIO_BLK=m +# CONFIG_VIRTIO_BLK_SCSI is not set +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_RSXX=m + +# +# NVME Support +# +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y +CONFIG_NVME_MULTIPATH=y +CONFIG_NVME_FABRICS=m +CONFIG_NVME_RDMA=m +CONFIG_NVME_FC=m +CONFIG_NVME_TCP=m +CONFIG_NVME_TARGET=m +CONFIG_NVME_TARGET_LOOP=m +CONFIG_NVME_TARGET_RDMA=m +CONFIG_NVME_TARGET_FC=m +CONFIG_NVME_TARGET_FCLOOP=m +CONFIG_NVME_TARGET_TCP=m +# end of NVME Support + +# +# Misc devices +# +CONFIG_SENSORS_LIS3LV02D=m +CONFIG_AD525X_DPOT=m +CONFIG_AD525X_DPOT_I2C=m +CONFIG_AD525X_DPOT_SPI=m +# CONFIG_DUMMY_IRQ is not set +CONFIG_IBM_ASM=m +CONFIG_PHANTOM=m +CONFIG_TIFM_CORE=m +CONFIG_TIFM_7XX1=m +CONFIG_ICS932S401=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_HP_ILO=m +CONFIG_APDS9802ALS=m +CONFIG_ISL29003=m +CONFIG_ISL29020=m +CONFIG_SENSORS_TSL2550=m +CONFIG_SENSORS_BH1770=m +CONFIG_SENSORS_APDS990X=m +CONFIG_HMC6352=m +CONFIG_DS1682=m +CONFIG_VMWARE_BALLOON=m +CONFIG_LATTICE_ECP3_CONFIG=m +# CONFIG_SRAM is not set +CONFIG_PCI_ENDPOINT_TEST=m +CONFIG_XILINX_SDFEC=m +CONFIG_MISC_RTSX=m +CONFIG_PVPANIC=m +CONFIG_C2PORT=m +CONFIG_C2PORT_DURAMAR_2150=m + +# +# EEPROM support +# +CONFIG_EEPROM_AT24=m +# CONFIG_EEPROM_AT25 is not set +CONFIG_EEPROM_LEGACY=m +CONFIG_EEPROM_MAX6875=m +CONFIG_EEPROM_93CX6=m +# CONFIG_EEPROM_93XX46 is not set +CONFIG_EEPROM_IDT_89HPESX=m +CONFIG_EEPROM_EE1004=m +# end of EEPROM support + +CONFIG_CB710_CORE=m +# CONFIG_CB710_DEBUG is not set +CONFIG_CB710_DEBUG_ASSUMPTIONS=y + +# +# Texas Instruments shared transport line discipline +# +CONFIG_TI_ST=m +# end of Texas Instruments shared transport line discipline + +CONFIG_SENSORS_LIS3_I2C=m +CONFIG_ALTERA_STAPL=m +CONFIG_INTEL_MEI=m +CONFIG_INTEL_MEI_ME=m +CONFIG_INTEL_MEI_TXE=m +CONFIG_INTEL_MEI_HDCP=m +CONFIG_VMWARE_VMCI=m + +# +# Intel MIC & related support +# + +# +# Intel MIC Bus Driver +# +CONFIG_INTEL_MIC_BUS=m + +# +# SCIF Bus Driver +# +CONFIG_SCIF_BUS=m + +# +# VOP Bus Driver +# +CONFIG_VOP_BUS=m + +# +# Intel MIC Host Driver +# +CONFIG_INTEL_MIC_HOST=m + +# +# Intel MIC Card Driver +# +CONFIG_INTEL_MIC_CARD=m + +# +# SCIF Driver +# +CONFIG_SCIF=m + +# +# Intel MIC Coprocessor State Management (COSM) Drivers +# +CONFIG_MIC_COSM=m + +# +# VOP Driver +# +CONFIG_VOP=m +CONFIG_VHOST_RING=m +# end of Intel MIC & related support + +CONFIG_GENWQE=m +CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 +CONFIG_ECHO=m +CONFIG_MISC_ALCOR_PCI=m +CONFIG_MISC_RTSX_PCI=m +CONFIG_MISC_RTSX_USB=m +CONFIG_HABANA_AI=m +# end of Misc devices + +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +CONFIG_SCSI_MOD=m +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=m +CONFIG_SCSI_DMA=y +CONFIG_SCSI_NETLINK=y +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=m +CONFIG_CHR_DEV_ST=m +CONFIG_BLK_DEV_SR=m +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y + +# +# SCSI Transports +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m 
+CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SAS_ATA=y +CONFIG_SCSI_SAS_HOST_SMP=y +CONFIG_SCSI_SRP_ATTRS=m +# end of SCSI Transports + +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=m +CONFIG_ISCSI_BOOT_SYSFS=m +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_SCSI_BNX2X_FCOE=m +CONFIG_BE2ISCSI=m +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m +CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +CONFIG_AIC7XXX_DEBUG_ENABLE=y +CONFIG_AIC7XXX_DEBUG_MASK=0 +CONFIG_AIC7XXX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +CONFIG_AIC79XX_DEBUG_ENABLE=y +CONFIG_AIC79XX_DEBUG_MASK=0 +CONFIG_AIC79XX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC94XX=m +CONFIG_AIC94XX_DEBUG=y +CONFIG_SCSI_MVSAS=m +CONFIG_SCSI_MVSAS_DEBUG=y +CONFIG_SCSI_MVSAS_TASKLET=y +CONFIG_SCSI_MVUMI=m +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_ARCMSR=m +CONFIG_SCSI_ESAS2R=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT3SAS=m +CONFIG_SCSI_MPT2SAS_MAX_SGE=128 +CONFIG_SCSI_MPT3SAS_MAX_SGE=128 +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_SMARTPQI=m +CONFIG_SCSI_UFSHCD=m +CONFIG_SCSI_UFSHCD_PCI=m +# CONFIG_SCSI_UFS_DWC_TC_PCI is not set +CONFIG_SCSI_UFSHCD_PLATFORM=m +CONFIG_SCSI_UFS_CDNS_PLATFORM=m +# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set +CONFIG_SCSI_UFS_BSG=y +CONFIG_SCSI_HPTIOP=m +CONFIG_SCSI_BUSLOGIC=m +CONFIG_SCSI_FLASHPOINT=y +CONFIG_SCSI_MYRB=m +CONFIG_SCSI_MYRS=m +CONFIG_VMWARE_PVSCSI=m +CONFIG_XEN_SCSI_FRONTEND=m +CONFIG_HYPERV_STORAGE=m +CONFIG_LIBFC=m +CONFIG_LIBFCOE=m +CONFIG_FCOE=m +CONFIG_FCOE_FNIC=m +CONFIG_SCSI_SNIC=m +# CONFIG_SCSI_SNIC_DEBUG_FS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_FDOMAIN=m +CONFIG_SCSI_FDOMAIN_PCI=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_ISCI=m +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_STEX=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +CONFIG_SCSI_SYM53C8XX_MMIO=y +CONFIG_SCSI_IPR=m +CONFIG_SCSI_IPR_TRACE=y +CONFIG_SCSI_IPR_DUMP=y +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA_FC=m +CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_QEDI=m +CONFIG_QEDF=m +CONFIG_SCSI_LPFC=m +# CONFIG_SCSI_LPFC_DEBUG_FS is not set +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_WD719X=m +CONFIG_SCSI_DEBUG=m +CONFIG_SCSI_PMCRAID=m +CONFIG_SCSI_PM8001=m +CONFIG_SCSI_BFA_FC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_CHELSIO_FCOE=m +CONFIG_SCSI_LOWLEVEL_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_QLOGIC=m +CONFIG_PCMCIA_SYM53C500=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +# end of SCSI device support + +CONFIG_ATA=m +CONFIG_ATA_VERBOSE_ERROR=y +CONFIG_ATA_ACPI=y +CONFIG_SATA_ZPODD=y +CONFIG_SATA_PMP=y + +# +# Controllers with non-SFF native interface +# +CONFIG_SATA_AHCI=m +CONFIG_SATA_MOBILE_LPM_POLICY=3 +CONFIG_SATA_AHCI_PLATFORM=m +CONFIG_AHCI_CEVA=m +CONFIG_AHCI_QORIQ=m +CONFIG_SATA_INIC162X=m +CONFIG_SATA_ACARD_AHCI=m +CONFIG_SATA_SIL24=m +CONFIG_ATA_SFF=y + +# +# SFF controllers with custom DMA 
interface +# +CONFIG_PDC_ADMA=m +CONFIG_SATA_QSTOR=m +CONFIG_SATA_SX4=m +CONFIG_ATA_BMDMA=y + +# +# SATA SFF controllers with BMDMA +# +CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set +CONFIG_SATA_MV=m +CONFIG_SATA_NV=m +CONFIG_SATA_PROMISE=m +CONFIG_SATA_SIL=m +CONFIG_SATA_SIS=m +CONFIG_SATA_SVW=m +CONFIG_SATA_ULI=m +CONFIG_SATA_VIA=m +CONFIG_SATA_VITESSE=m + +# +# PATA SFF controllers with BMDMA +# +CONFIG_PATA_ALI=m +CONFIG_PATA_AMD=m +CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATIIXP=m +CONFIG_PATA_ATP867X=m +CONFIG_PATA_CMD64X=m +CONFIG_PATA_CYPRESS=m +CONFIG_PATA_EFAR=m +CONFIG_PATA_HPT366=m +CONFIG_PATA_HPT37X=m +CONFIG_PATA_HPT3X2N=m +CONFIG_PATA_HPT3X3=m +CONFIG_PATA_HPT3X3_DMA=y +CONFIG_PATA_IT8213=m +CONFIG_PATA_IT821X=m +CONFIG_PATA_JMICRON=m +CONFIG_PATA_MARVELL=m +CONFIG_PATA_NETCELL=m +CONFIG_PATA_NINJA32=m +CONFIG_PATA_NS87415=m +CONFIG_PATA_OLDPIIX=m +CONFIG_PATA_OPTIDMA=m +CONFIG_PATA_PDC2027X=m +CONFIG_PATA_PDC_OLD=m +CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m +CONFIG_PATA_SCH=m +CONFIG_PATA_SERVERWORKS=m +CONFIG_PATA_SIL680=m +CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m +CONFIG_PATA_TRIFLEX=m +CONFIG_PATA_VIA=m +CONFIG_PATA_WINBOND=m + +# +# PIO-only SFF controllers +# +CONFIG_PATA_CMD640_PCI=m +CONFIG_PATA_MPIIX=m +CONFIG_PATA_NS87410=m +CONFIG_PATA_OPTI=m +CONFIG_PATA_PCMCIA=m +# CONFIG_PATA_PLATFORM is not set +CONFIG_PATA_RZ1000=m + +# +# Generic fallback / legacy drivers +# +CONFIG_PATA_ACPI=m +CONFIG_ATA_GENERIC=m +CONFIG_PATA_LEGACY=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +# CONFIG_BCACHE_DEBUG is not set +# CONFIG_BCACHE_CLOSURES_DEBUG is not set +CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +CONFIG_DM_DEBUG=y +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m +CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_CACHE=m +CONFIG_DM_CACHE_SMQ=m +CONFIG_DM_WRITECACHE=m +CONFIG_DM_ERA=m +CONFIG_DM_CLONE=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_DELAY=m +CONFIG_DM_DUST=m +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DM_SWITCH=m +CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_TARGET_CORE=m +CONFIG_TCM_IBLOCK=m +CONFIG_TCM_FILEIO=m +CONFIG_TCM_PSCSI=m +CONFIG_TCM_USER2=m +CONFIG_LOOPBACK_TARGET=m +CONFIG_TCM_FC=m +CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m +CONFIG_SBP_TARGET=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m +CONFIG_FUSION_MAX_SGE=128 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set + +# +# IEEE 1394 (FireWire) support +# +CONFIG_FIREWIRE=m +CONFIG_FIREWIRE_OHCI=m +CONFIG_FIREWIRE_SBP2=m +CONFIG_FIREWIRE_NET=m +CONFIG_FIREWIRE_NOSY=m +# end of IEEE 1394 (FireWire) support + +CONFIG_MACINTOSH_DRIVERS=y +CONFIG_MAC_EMUMOUSEBTN=m +CONFIG_NETDEVICES=y +CONFIG_MII=m +CONFIG_NET_CORE=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_EQUALIZER=m +CONFIG_NET_FC=y +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m 
+CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m +CONFIG_NET_TEAM_MODE_LOADBALANCE=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_IPVLAN_L3S=y +CONFIG_IPVLAN=m +CONFIG_IPVTAP=m +CONFIG_VXLAN=m +CONFIG_GENEVE=m +CONFIG_GTP=m +CONFIG_MACSEC=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +CONFIG_NTB_NETDEV=m +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 +CONFIG_TUN=m +CONFIG_TAP=m +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m +CONFIG_SUNGEM_PHY=m +# CONFIG_ARCNET is not set +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m +# CONFIG_ATM_NICSTAR_USE_SUNI is not set +# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E=m +CONFIG_ATM_FORE200E_USE_TASKLET=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y +CONFIG_ATM_SOLOS=m + +# +# CAIF transport drivers +# +CONFIG_CAIF_TTY=m +CONFIG_CAIF_SPI_SLAVE=m +CONFIG_CAIF_SPI_SYNC=y +CONFIG_CAIF_HSI=m +CONFIG_CAIF_VIRTIO=m + +# +# Distributed Switch Architecture drivers +# +CONFIG_B53=m +# CONFIG_B53_SPI_DRIVER is not set +CONFIG_B53_MDIO_DRIVER=m +CONFIG_B53_MMAP_DRIVER=m +CONFIG_B53_SRAB_DRIVER=m +CONFIG_B53_SERDES=m +CONFIG_NET_DSA_BCM_SF2=m +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_LANTIQ_GSWIP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_MV88E6060=m +CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m +CONFIG_NET_DSA_MV88E6XXX=m +CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y +CONFIG_NET_DSA_MV88E6XXX_PTP=y +CONFIG_NET_DSA_SJA1105=m +CONFIG_NET_DSA_SJA1105_PTP=y +CONFIG_NET_DSA_SJA1105_TAS=y +CONFIG_NET_DSA_QCA8K=m +CONFIG_NET_DSA_REALTEK_SMI=m +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m +CONFIG_NET_DSA_VITESSE_VSC73XX=m +CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m +CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m +# end of Distributed Switch Architecture drivers + +CONFIG_ETHERNET=y +CONFIG_MDIO=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_3C589=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_NET_VENDOR_ADAPTEC=y +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_NET_VENDOR_AGERE=y +CONFIG_ET131X=m +CONFIG_NET_VENDOR_ALACRITECH=y +CONFIG_SLICOSS=m +CONFIG_NET_VENDOR_ALTEON=y +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_ALTERA_TSE=m +CONFIG_NET_VENDOR_AMAZON=y +CONFIG_ENA_ETHERNET=m +CONFIG_NET_VENDOR_AMD=y +CONFIG_AMD8111_ETH=m +CONFIG_PCNET32=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_AMD_XGBE=m +CONFIG_AMD_XGBE_DCB=y +CONFIG_AMD_XGBE_HAVE_ECC=y +CONFIG_NET_VENDOR_AQUANTIA=y +CONFIG_AQTION=m +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ATHEROS=y +CONFIG_ATL2=m +CONFIG_ATL1=m +CONFIG_ATL1E=m +CONFIG_ATL1C=m +CONFIG_ALX=m +CONFIG_NET_VENDOR_AURORA=y +CONFIG_AURORA_NB8800=m 
+CONFIG_NET_VENDOR_BROADCOM=y +CONFIG_B44=m +CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y +CONFIG_BCMGENET=m +CONFIG_BNX2=m +CONFIG_CNIC=m +CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y +CONFIG_BNX2X=m +CONFIG_BNX2X_SRIOV=y +CONFIG_SYSTEMPORT=m +CONFIG_BNXT=m +CONFIG_BNXT_SRIOV=y +CONFIG_BNXT_FLOWER_OFFLOAD=y +CONFIG_BNXT_DCB=y +CONFIG_BNXT_HWMON=y +CONFIG_NET_VENDOR_BROCADE=y +CONFIG_BNA=m +CONFIG_NET_VENDOR_CADENCE=y +CONFIG_MACB=m +CONFIG_MACB_USE_HWSTAMP=y +CONFIG_MACB_PCI=m +CONFIG_NET_VENDOR_CAVIUM=y +CONFIG_THUNDER_NIC_PF=m +CONFIG_THUNDER_NIC_VF=m +CONFIG_THUNDER_NIC_BGX=m +CONFIG_THUNDER_NIC_RGX=m +CONFIG_CAVIUM_PTP=m +CONFIG_LIQUIDIO=m +CONFIG_LIQUIDIO_VF=m +CONFIG_NET_VENDOR_CHELSIO=y +CONFIG_CHELSIO_T1=m +CONFIG_CHELSIO_T1_1G=y +CONFIG_CHELSIO_T3=m +CONFIG_CHELSIO_T4=m +CONFIG_CHELSIO_T4_DCB=y +CONFIG_CHELSIO_T4_FCOE=y +CONFIG_CHELSIO_T4VF=m +CONFIG_CHELSIO_LIB=m +CONFIG_NET_VENDOR_CISCO=y +CONFIG_ENIC=m +CONFIG_NET_VENDOR_CORTINA=y +CONFIG_GEMINI_ETHERNET=m +CONFIG_CX_ECAT=m +CONFIG_DNET=m +CONFIG_NET_VENDOR_DEC=y +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_DE2104X_DSL=0 +CONFIG_TULIP=m +CONFIG_TULIP_MWI=y +CONFIG_TULIP_MMIO=y +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_ULI526X=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_NET_VENDOR_DLINK=y +CONFIG_DL2K=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_NET_VENDOR_EMULEX=y +CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y +CONFIG_BE2NET_BE2=y +CONFIG_BE2NET_BE3=y +CONFIG_BE2NET_LANCER=y +CONFIG_BE2NET_SKYHAWK=y +CONFIG_NET_VENDOR_EZCHIP=y +CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_FUJITSU=y +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_NET_VENDOR_GOOGLE=y +CONFIG_GVE=m +CONFIG_NET_VENDOR_HP=y +CONFIG_HP100=m +CONFIG_NET_VENDOR_HUAWEI=y +CONFIG_HINIC=m +CONFIG_NET_VENDOR_I825XX=y +CONFIG_NET_VENDOR_INTEL=y +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_E1000E_HWTS=y +CONFIG_IGB=m +CONFIG_IGB_HWMON=y +CONFIG_IGB_DCA=y +CONFIG_IGBVF=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_IXGBE_HWMON=y +CONFIG_IXGBE_DCA=y +CONFIG_IXGBE_DCB=y +# CONFIG_IXGBE_IPSEC is not set +CONFIG_IXGBEVF=m +CONFIG_IXGBEVF_IPSEC=y +CONFIG_I40E=m +CONFIG_I40E_DCB=y +CONFIG_IAVF=m +CONFIG_I40EVF=m +CONFIG_ICE=m +CONFIG_FM10K=m +CONFIG_IGC=m +CONFIG_JME=m +CONFIG_NET_VENDOR_MARVELL=y +CONFIG_MVMDIO=m +CONFIG_SKGE=m +# CONFIG_SKGE_DEBUG is not set +CONFIG_SKGE_GENESIS=y +CONFIG_SKY2=m +# CONFIG_SKY2_DEBUG is not set +CONFIG_NET_VENDOR_MELLANOX=y +CONFIG_MLX4_EN=m +CONFIG_MLX4_EN_DCB=y +CONFIG_MLX4_CORE=m +CONFIG_MLX4_DEBUG=y +CONFIG_MLX4_CORE_GEN2=y +CONFIG_MLX5_CORE=m +CONFIG_MLX5_ACCEL=y +CONFIG_MLX5_FPGA=y +CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_EN_ARFS=y +CONFIG_MLX5_EN_RXNFC=y +CONFIG_MLX5_MPFS=y +CONFIG_MLX5_ESWITCH=y +CONFIG_MLX5_CORE_EN_DCB=y +CONFIG_MLX5_CORE_IPOIB=y +CONFIG_MLX5_FPGA_IPSEC=y +CONFIG_MLX5_EN_IPSEC=y +CONFIG_MLX5_FPGA_TLS=y +CONFIG_MLX5_TLS=y +CONFIG_MLX5_EN_TLS=y +CONFIG_MLX5_SW_STEERING=y +CONFIG_MLXSW_CORE=m +CONFIG_MLXSW_CORE_HWMON=y +CONFIG_MLXSW_CORE_THERMAL=y +CONFIG_MLXSW_PCI=m +CONFIG_MLXSW_I2C=m +CONFIG_MLXSW_SWITCHIB=m +CONFIG_MLXSW_SWITCHX2=m +CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y +CONFIG_MLXSW_MINIMAL=m +CONFIG_MLXFW=m +CONFIG_NET_VENDOR_MICREL=y +CONFIG_KS8842=m +CONFIG_KS8851=m +CONFIG_KS8851_MLL=m +CONFIG_KSZ884X_PCI=m +CONFIG_NET_VENDOR_MICROCHIP=y +CONFIG_ENC28J60=m +# CONFIG_ENC28J60_WRITEVERIFY is not set +CONFIG_ENCX24J600=m +CONFIG_LAN743X=m +CONFIG_NET_VENDOR_MICROSEMI=y +CONFIG_MSCC_OCELOT_SWITCH=m 
+CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m +CONFIG_NET_VENDOR_MYRI=y +CONFIG_MYRI10GE=m +CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m +CONFIG_NET_VENDOR_NATSEMI=y +CONFIG_NATSEMI=m +CONFIG_NS83820=m +CONFIG_NET_VENDOR_NETERION=y +CONFIG_S2IO=m +CONFIG_VXGE=m +# CONFIG_VXGE_DEBUG_TRACE_ALL is not set +CONFIG_NET_VENDOR_NETRONOME=y +CONFIG_NFP=m +CONFIG_NFP_APP_FLOWER=y +CONFIG_NFP_APP_ABM_NIC=y +# CONFIG_NFP_DEBUG is not set +CONFIG_NET_VENDOR_NI=y +CONFIG_NI_XGE_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_8390=y +CONFIG_PCMCIA_AXNET=m +CONFIG_NE2K_PCI=m +CONFIG_PCMCIA_PCNET=m +CONFIG_NET_VENDOR_NVIDIA=y +CONFIG_FORCEDETH=m +CONFIG_NET_VENDOR_OKI=y +CONFIG_ETHOC=m +CONFIG_NET_VENDOR_PACKET_ENGINES=y +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_NET_VENDOR_PENSANDO=y +CONFIG_IONIC=m +CONFIG_NET_VENDOR_QLOGIC=y +CONFIG_QLA3XXX=m +CONFIG_QLCNIC=m +CONFIG_QLCNIC_SRIOV=y +CONFIG_QLCNIC_DCB=y +CONFIG_QLCNIC_HWMON=y +CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QED_LL2=y +CONFIG_QED_SRIOV=y +CONFIG_QEDE=m +CONFIG_QED_RDMA=y +CONFIG_QED_ISCSI=y +CONFIG_QED_FCOE=y +CONFIG_QED_OOO=y +CONFIG_NET_VENDOR_QUALCOMM=y +CONFIG_QCA7000=m +CONFIG_QCA7000_SPI=m +CONFIG_QCA7000_UART=m +CONFIG_QCOM_EMAC=m +CONFIG_RMNET=m +CONFIG_NET_VENDOR_RDC=y +CONFIG_R6040=m +CONFIG_NET_VENDOR_REALTEK=y +CONFIG_ATP=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_TUNE_TWISTER=y +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_R8169=m +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_ROCKER=m +CONFIG_NET_VENDOR_SAMSUNG=y +CONFIG_SXGBE_ETH=m +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SOLARFLARE=y +CONFIG_SFC=m +CONFIG_SFC_MTD=y +CONFIG_SFC_MCDI_MON=y +CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y +CONFIG_SFC_FALCON=m +CONFIG_SFC_FALCON_MTD=y +CONFIG_NET_VENDOR_SILAN=y +CONFIG_SC92031=m +CONFIG_NET_VENDOR_SIS=y +CONFIG_SIS900=m +CONFIG_SIS190=m +CONFIG_NET_VENDOR_SMSC=y +CONFIG_PCMCIA_SMC91C92=m +CONFIG_EPIC100=m +CONFIG_SMSC911X=m +CONFIG_SMSC9420=m +CONFIG_NET_VENDOR_SOCIONEXT=y +CONFIG_NET_VENDOR_STMICRO=y +CONFIG_STMMAC_ETH=m +# CONFIG_STMMAC_SELFTESTS is not set +CONFIG_STMMAC_PLATFORM=m +CONFIG_DWMAC_DWC_QOS_ETH=m +CONFIG_DWMAC_GENERIC=m +CONFIG_STMMAC_PCI=m +CONFIG_NET_VENDOR_SUN=y +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_CASSINI=m +CONFIG_NIU=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m +CONFIG_NET_VENDOR_TEHUTI=y +CONFIG_TEHUTI=m +CONFIG_NET_VENDOR_TI=y +# CONFIG_TI_CPSW_PHY_SEL is not set +CONFIG_TLAN=m +CONFIG_NET_VENDOR_VIA=y +CONFIG_VIA_RHINE=m +CONFIG_VIA_RHINE_MMIO=y +CONFIG_VIA_VELOCITY=m +CONFIG_NET_VENDOR_WIZNET=y +CONFIG_WIZNET_W5100=m +CONFIG_WIZNET_W5300=m +# CONFIG_WIZNET_BUS_DIRECT is not set +# CONFIG_WIZNET_BUS_INDIRECT is not set +CONFIG_WIZNET_BUS_ANY=y +CONFIG_WIZNET_W5100_SPI=m +CONFIG_NET_VENDOR_XILINX=y +CONFIG_XILINX_AXI_EMAC=m +CONFIG_XILINX_LL_TEMAC=m +CONFIG_NET_VENDOR_XIRCOM=y +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_FDDI=m +CONFIG_DEFXX=m +CONFIG_DEFXX_MMIO=y +CONFIG_SKFP=m +# CONFIG_HIPPI is not set +CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +CONFIG_MDIO_BCM_UNIMAC=m +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_BUS_MUX=m +CONFIG_MDIO_BUS_MUX_GPIO=m +CONFIG_MDIO_BUS_MUX_MMIOREG=m +CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m +CONFIG_MDIO_CAVIUM=m +CONFIG_MDIO_GPIO=m +CONFIG_MDIO_HISI_FEMAC=m +CONFIG_MDIO_I2C=m +CONFIG_MDIO_MSCC_MIIM=m +CONFIG_MDIO_OCTEON=m +CONFIG_MDIO_THUNDER=m +CONFIG_PHYLINK=m +CONFIG_PHYLIB=m +CONFIG_SWPHY=y +CONFIG_LED_TRIGGER_PHY=y + +# +# MII PHY device drivers +# +CONFIG_SFP=m 
+CONFIG_ADIN_PHY=m +CONFIG_AMD_PHY=m +CONFIG_AQUANTIA_PHY=m +CONFIG_AX88796B_PHY=m +CONFIG_AT803X_PHY=m +CONFIG_BCM7XXX_PHY=m +CONFIG_BCM87XX_PHY=m +CONFIG_BCM_NET_PHYLIB=m +CONFIG_BROADCOM_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_CORTINA_PHY=m +CONFIG_DAVICOM_PHY=m +CONFIG_DP83822_PHY=m +CONFIG_DP83TC811_PHY=m +CONFIG_DP83848_PHY=m +CONFIG_DP83867_PHY=m +CONFIG_FIXED_PHY=m +CONFIG_ICPLUS_PHY=m +CONFIG_INTEL_XWAY_PHY=m +CONFIG_LSI_ET1011C_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_MARVELL_10G_PHY=m +CONFIG_MICREL_PHY=m +CONFIG_MICROCHIP_PHY=m +CONFIG_MICROCHIP_T1_PHY=m +CONFIG_MICROSEMI_PHY=m +CONFIG_NATIONAL_PHY=m +CONFIG_NXP_TJA11XX_PHY=m +CONFIG_QSEMI_PHY=m +CONFIG_REALTEK_PHY=m +CONFIG_RENESAS_PHY=m +CONFIG_ROCKCHIP_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_STE10XP=m +CONFIG_TERANETICS_PHY=m +CONFIG_VITESSE_PHY=m +CONFIG_XILINX_GMII2RGMII=m +CONFIG_MICREL_KS8995MA=m +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOATM=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_SLIP=m +CONFIG_SLHC=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y +CONFIG_USB_NET_DRIVERS=m +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=m +CONFIG_USB_USBNET=m +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_CDC_EEM=m +CONFIG_USB_NET_CDC_NCM=m +CONFIG_USB_NET_HUAWEI_CDC_NCM=m +CONFIG_USB_NET_CDC_MBIM=m +CONFIG_USB_NET_DM9601=m +CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=m +CONFIG_USB_NET_GL620A=m +CONFIG_USB_NET_NET1080=m +CONFIG_USB_NET_PLUSB=m +CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +CONFIG_USB_NET_CDC_SUBSET_ENABLE=m +CONFIG_USB_NET_CDC_SUBSET=m +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_KC2190=y +CONFIG_USB_NET_ZAURUS=m +CONFIG_USB_NET_CX82310_ETH=m +CONFIG_USB_NET_KALMIA=m +CONFIG_USB_NET_QMI_WWAN=m +CONFIG_USB_HSO=m +CONFIG_USB_NET_INT51X1=m +CONFIG_USB_CDC_PHONET=m +CONFIG_USB_IPHETH=m +CONFIG_USB_SIERRA_NET=m +CONFIG_USB_VL600=m +CONFIG_USB_NET_CH9200=m +CONFIG_USB_NET_AQC111=m +CONFIG_WLAN=y +# CONFIG_WIRELESS_WDS is not set +CONFIG_WLAN_VENDOR_ADMTEK=y +CONFIG_ADM8211=m +CONFIG_ATH_COMMON=m +CONFIG_WLAN_VENDOR_ATH=y +# CONFIG_ATH_DEBUG is not set +CONFIG_ATH5K=m +CONFIG_ATH5K_DEBUG=y +CONFIG_ATH5K_TRACER=y +CONFIG_ATH5K_PCI=y +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_COMMON_DEBUG=y +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +CONFIG_ATH9K_DEBUGFS=y +CONFIG_ATH9K_STATION_STATISTICS=y +CONFIG_ATH9K_DYNACK=y +CONFIG_ATH9K_WOW=y +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +CONFIG_ATH9K_PCI_NO_EEPROM=m +CONFIG_ATH9K_HTC=m +CONFIG_ATH9K_HTC_DEBUGFS=y +CONFIG_ATH9K_HWRNG=y +CONFIG_ATH9K_COMMON_SPECTRAL=y +CONFIG_CARL9170=m +CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_DEBUGFS=y +CONFIG_CARL9170_WPC=y +# CONFIG_CARL9170_HWRNG is not set +CONFIG_ATH6KL=m +CONFIG_ATH6KL_SDIO=m +CONFIG_ATH6KL_USB=m +CONFIG_ATH6KL_DEBUG=y +CONFIG_ATH6KL_TRACING=y +CONFIG_AR5523=m +CONFIG_WIL6210=m +CONFIG_WIL6210_ISR_COR=y +CONFIG_WIL6210_TRACING=y +CONFIG_WIL6210_DEBUGFS=y +CONFIG_ATH10K=m +CONFIG_ATH10K_CE=y +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_AHB=y +CONFIG_ATH10K_SDIO=m +CONFIG_ATH10K_USB=m 
+CONFIG_ATH10K_DEBUG=y +CONFIG_ATH10K_DEBUGFS=y +CONFIG_ATH10K_SPECTRAL=y +CONFIG_ATH10K_TRACING=y +CONFIG_WCN36XX=m +CONFIG_WCN36XX_DEBUGFS=y +CONFIG_WLAN_VENDOR_ATMEL=y +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_AT76C50X_USB=m +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +CONFIG_B43_SDIO=y +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +CONFIG_B43LEGACY=m +CONFIG_B43LEGACY_PCI_AUTOSELECT=y +CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y +CONFIG_B43LEGACY_LEDS=y +CONFIG_B43LEGACY_HWRNG=y +CONFIG_B43LEGACY_DEBUG=y +CONFIG_B43LEGACY_DMA=y +CONFIG_B43LEGACY_PIO=y +CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y +# CONFIG_B43LEGACY_DMA_MODE is not set +# CONFIG_B43LEGACY_PIO_MODE is not set +CONFIG_BRCMUTIL=m +CONFIG_BRCMSMAC=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_PROTO_MSGBUF=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +CONFIG_BRCMFMAC_PCIE=y +CONFIG_BRCM_TRACING=y +CONFIG_BRCMDBG=y +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_AIRO=m +CONFIG_AIRO_CS=m +CONFIG_WLAN_VENDOR_INTEL=y +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLEGACY=m +CONFIG_IWL4965=m +CONFIG_IWL3945=m + +# +# iwl3945 / iwl4965 Debugging Options +# +CONFIG_IWLEGACY_DEBUG=y +CONFIG_IWLEGACY_DEBUGFS=y +# end of iwl3945 / iwl4965 Debugging Options + +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_LEDS=y +CONFIG_IWLDVM=m +CONFIG_IWLMVM=m +CONFIG_IWLWIFI_OPMODE_MODULAR=y +# CONFIG_IWLWIFI_BCAST_FILTERING is not set + +# +# Debugging Options +# +CONFIG_IWLWIFI_DEBUG=y +CONFIG_IWLWIFI_DEBUGFS=y +CONFIG_IWLWIFI_DEVICE_TRACING=y +# end of Debugging Options + +CONFIG_WLAN_VENDOR_INTERSIL=y +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +CONFIG_HOSTAP_FIRMWARE_NVRAM=y +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_HERMES=m +CONFIG_HERMES_PRISM=y +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m +CONFIG_ORINOCO_USB=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_SPI=m +# CONFIG_P54_SPI_DEFAULT_EEPROM is not set +CONFIG_P54_LEDS=y +CONFIG_PRISM54=m +CONFIG_WLAN_VENDOR_MARVELL=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +CONFIG_LIBERTAS_SDIO=m +CONFIG_LIBERTAS_SPI=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_LIBERTAS_MESH=y +CONFIG_LIBERTAS_THINFIRM=m +# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set +CONFIG_LIBERTAS_THINFIRM_USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_MWIFIEX_PCIE=m +CONFIG_MWIFIEX_USB=m +CONFIG_MWL8K=m +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_MT76_CORE=m +CONFIG_MT76_LEDS=y +CONFIG_MT76_USB=m +CONFIG_MT76x02_LIB=m +CONFIG_MT76x02_USB=m +CONFIG_MT76x0_COMMON=m +CONFIG_MT76x0U=m +CONFIG_MT76x0E=m +CONFIG_MT76x2_COMMON=m +CONFIG_MT76x2E=m +CONFIG_MT76x2U=m +CONFIG_MT7603E=m +CONFIG_MT7615E=m +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +CONFIG_RT2400PCI=m +CONFIG_RT2500PCI=m +CONFIG_RT61PCI=m +CONFIG_RT2800PCI=m 
+CONFIG_RT2800PCI_RT33XX=y +CONFIG_RT2800PCI_RT35XX=y +CONFIG_RT2800PCI_RT53XX=y +CONFIG_RT2800PCI_RT3290=y +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2800_LIB_MMIO=m +CONFIG_RT2X00_LIB_MMIO=m +CONFIG_RT2X00_LIB_PCI=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m +CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +CONFIG_RT2X00_LIB_DEBUGFS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +CONFIG_RTL8180=m +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +CONFIG_RTL8192CE=m +CONFIG_RTL8192SE=m +CONFIG_RTL8192DE=m +CONFIG_RTL8723AE=m +CONFIG_RTL8723BE=m +CONFIG_RTL8188EE=m +CONFIG_RTL8192EE=m +CONFIG_RTL8821AE=m +CONFIG_RTL8192CU=m +CONFIG_RTLWIFI=m +CONFIG_RTLWIFI_PCI=m +CONFIG_RTLWIFI_USB=m +CONFIG_RTLWIFI_DEBUG=y +CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8723_COMMON=m +CONFIG_RTLBTCOEXIST=m +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_RTW88=m +CONFIG_RTW88_CORE=m +CONFIG_RTW88_PCI=m +CONFIG_RTW88_8822BE=y +CONFIG_RTW88_8822CE=y +CONFIG_RTW88_DEBUG=y +CONFIG_RTW88_DEBUGFS=y +CONFIG_WLAN_VENDOR_RSI=y +CONFIG_RSI_91X=m +CONFIG_RSI_DEBUGFS=y +CONFIG_RSI_SDIO=m +CONFIG_RSI_USB=m +CONFIG_RSI_COEX=y +CONFIG_WLAN_VENDOR_ST=y +CONFIG_CW1200=m +CONFIG_CW1200_WLAN_SDIO=m +CONFIG_CW1200_WLAN_SPI=m +CONFIG_WLAN_VENDOR_TI=y +CONFIG_WL1251=m +CONFIG_WL1251_SPI=m +CONFIG_WL1251_SDIO=m +CONFIG_WL12XX=m +CONFIG_WL18XX=m +CONFIG_WLCORE=m +CONFIG_WLCORE_SPI=m +CONFIG_WLCORE_SDIO=m +CONFIG_WILINK_PLATFORM_DATA=y +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_WLAN_VENDOR_QUANTENNA=y +CONFIG_QTNFMAC=m +CONFIG_QTNFMAC_PCIE=m +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_WL3501=m +CONFIG_MAC80211_HWSIM=m +CONFIG_USB_NET_RNDIS_WLAN=m +CONFIG_VIRT_WIFI=m + +# +# WiMAX Wireless Broadband devices +# +CONFIG_WIMAX_I2400M=m +CONFIG_WIMAX_I2400M_USB=m +CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 +# end of WiMAX Wireless Broadband devices + +# CONFIG_WAN is not set +CONFIG_IEEE802154_DRIVERS=m +CONFIG_IEEE802154_FAKELB=m +CONFIG_IEEE802154_AT86RF230=m +# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set +CONFIG_IEEE802154_MRF24J40=m +CONFIG_IEEE802154_CC2520=m +CONFIG_IEEE802154_ATUSB=m +CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set +CONFIG_IEEE802154_MCR20A=m +CONFIG_IEEE802154_HWSIM=m +CONFIG_XEN_NETDEV_FRONTEND=m +CONFIG_XEN_NETDEV_BACKEND=m +CONFIG_VMXNET3=m +CONFIG_FUJITSU_ES=m +CONFIG_THUNDERBOLT_NET=m +CONFIG_HYPERV_NET=m +CONFIG_NETDEVSIM=m +CONFIG_NET_FAILOVER=m +CONFIG_ISDN=y +CONFIG_ISDN_CAPI=m +CONFIG_CAPI_TRACE=y +CONFIG_ISDN_CAPI_CAPI20=m +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_MISDN=m +CONFIG_MISDN_DSP=m +CONFIG_MISDN_L1OIP=m + +# +# mISDN hardware drivers +# +CONFIG_MISDN_HFCPCI=m +CONFIG_MISDN_HFCMULTI=m +CONFIG_MISDN_HFCUSB=m +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_HDLC=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_NVM=y +CONFIG_NVM_PBLK=m +# CONFIG_NVM_PBLK_DEBUG is not set + +# +# Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=m +CONFIG_INPUT_FF_MEMLESS=m +CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m +CONFIG_INPUT_MATRIXKMAP=m + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=m +CONFIG_INPUT_MOUSEDEV_PSAUX=y 
+CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADC=m +CONFIG_KEYBOARD_ADP5520=m +CONFIG_KEYBOARD_ADP5588=m +CONFIG_KEYBOARD_ADP5589=m +CONFIG_KEYBOARD_APPLESPI=m +CONFIG_KEYBOARD_ATKBD=m +CONFIG_KEYBOARD_QT1050=m +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_QT2160=m +CONFIG_KEYBOARD_DLINK_DIR685=m +CONFIG_KEYBOARD_LKKBD=m +CONFIG_KEYBOARD_GPIO=m +CONFIG_KEYBOARD_GPIO_POLLED=m +CONFIG_KEYBOARD_TCA6416=m +CONFIG_KEYBOARD_TCA8418=m +CONFIG_KEYBOARD_MATRIX=m +CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_LM8333=m +CONFIG_KEYBOARD_MAX7359=m +CONFIG_KEYBOARD_MCS=m +CONFIG_KEYBOARD_MPR121=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_KEYBOARD_OPENCORES=m +CONFIG_KEYBOARD_SAMSUNG=m +CONFIG_KEYBOARD_STOWAWAY=m +CONFIG_KEYBOARD_SUNKBD=m +CONFIG_KEYBOARD_STMPE=m +CONFIG_KEYBOARD_OMAP4=m +CONFIG_KEYBOARD_TC3589X=m +CONFIG_KEYBOARD_TM2_TOUCHKEY=m +CONFIG_KEYBOARD_TWL4030=m +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_CROS_EC=m +CONFIG_KEYBOARD_CAP11XX=m +CONFIG_KEYBOARD_BCM=m +CONFIG_KEYBOARD_MTK_PMIC=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_BYD=y +CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y +CONFIG_MOUSE_PS2_CYPRESS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y +CONFIG_MOUSE_PS2_SENTELIC=y +CONFIG_MOUSE_PS2_TOUCHKIT=y +CONFIG_MOUSE_PS2_FOCALTECH=y +CONFIG_MOUSE_PS2_VMMOUSE=y +CONFIG_MOUSE_PS2_SMBUS=y +CONFIG_MOUSE_SERIAL=m +CONFIG_MOUSE_APPLETOUCH=m +CONFIG_MOUSE_BCM5974=m +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=m +CONFIG_MOUSE_ELAN_I2C_I2C=y +CONFIG_MOUSE_ELAN_I2C_SMBUS=y +CONFIG_MOUSE_VSXXXAA=m +CONFIG_MOUSE_GPIO=m +CONFIG_MOUSE_SYNAPTICS_I2C=m +CONFIG_MOUSE_SYNAPTICS_USB=m +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=m +CONFIG_JOYSTICK_IFORCE_232=m +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m +CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDJOY=m +CONFIG_JOYSTICK_ZHENHUA=m +CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +CONFIG_JOYSTICK_AS5011=m +CONFIG_JOYSTICK_JOYDUMP=m +CONFIG_JOYSTICK_XPAD=m +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +CONFIG_JOYSTICK_PXRC=m +CONFIG_JOYSTICK_FSIA6B=m +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=m +CONFIG_TABLET_USB_AIPTEK=m +CONFIG_TABLET_USB_GTCO=m +CONFIG_TABLET_USB_HANWANG=m +CONFIG_TABLET_USB_KBTAB=m +CONFIG_TABLET_USB_PEGASUS=m +CONFIG_TABLET_SERIAL_WACOM4=m +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_PROPERTIES=y +CONFIG_TOUCHSCREEN_88PM860X=m +CONFIG_TOUCHSCREEN_ADS7846=m +CONFIG_TOUCHSCREEN_AD7877=m +CONFIG_TOUCHSCREEN_AD7879=m +CONFIG_TOUCHSCREEN_AD7879_I2C=m +CONFIG_TOUCHSCREEN_AD7879_SPI=m +CONFIG_TOUCHSCREEN_ADC=m +CONFIG_TOUCHSCREEN_AR1021_I2C=m +CONFIG_TOUCHSCREEN_ATMEL_MXT=m +CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y +CONFIG_TOUCHSCREEN_AUO_PIXCIR=m +CONFIG_TOUCHSCREEN_BU21013=m 
+CONFIG_TOUCHSCREEN_BU21029=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m +CONFIG_TOUCHSCREEN_CY8CTMG110=m +CONFIG_TOUCHSCREEN_CYTTSP_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP_SPI=m +CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m +CONFIG_TOUCHSCREEN_DA9034=m +CONFIG_TOUCHSCREEN_DA9052=m +CONFIG_TOUCHSCREEN_DYNAPRO=m +CONFIG_TOUCHSCREEN_HAMPSHIRE=m +CONFIG_TOUCHSCREEN_EETI=m +CONFIG_TOUCHSCREEN_EGALAX=m +CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m +CONFIG_TOUCHSCREEN_EXC3000=m +CONFIG_TOUCHSCREEN_FUJITSU=m +CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_HIDEEP=m +CONFIG_TOUCHSCREEN_ILI210X=m +CONFIG_TOUCHSCREEN_S6SY761=m +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_TOUCHSCREEN_EKTF2127=m +CONFIG_TOUCHSCREEN_ELAN=m +CONFIG_TOUCHSCREEN_ELO=m +CONFIG_TOUCHSCREEN_WACOM_W8001=m +CONFIG_TOUCHSCREEN_WACOM_I2C=m +CONFIG_TOUCHSCREEN_MAX11801=m +CONFIG_TOUCHSCREEN_MCS5000=m +CONFIG_TOUCHSCREEN_MMS114=m +CONFIG_TOUCHSCREEN_MELFAS_MIP4=m +CONFIG_TOUCHSCREEN_MTOUCH=m +CONFIG_TOUCHSCREEN_IMX6UL_TSC=m +CONFIG_TOUCHSCREEN_INEXIO=m +CONFIG_TOUCHSCREEN_MK712=m +CONFIG_TOUCHSCREEN_PENMOUNT=m +CONFIG_TOUCHSCREEN_EDT_FT5X06=m +CONFIG_TOUCHSCREEN_TOUCHRIGHT=m +CONFIG_TOUCHSCREEN_TOUCHWIN=m +CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m +CONFIG_TOUCHSCREEN_UCB1400=m +CONFIG_TOUCHSCREEN_PIXCIR=m +CONFIG_TOUCHSCREEN_WDT87XX_I2C=m +CONFIG_TOUCHSCREEN_WM831X=m +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_TOUCHSCREEN_WM9705=y +CONFIG_TOUCHSCREEN_WM9712=y +CONFIG_TOUCHSCREEN_WM9713=y +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_MC13783=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y +CONFIG_TOUCHSCREEN_USB_PANJIT=y +CONFIG_TOUCHSCREEN_USB_3M=y +CONFIG_TOUCHSCREEN_USB_ITM=y +CONFIG_TOUCHSCREEN_USB_ETURBO=y +CONFIG_TOUCHSCREEN_USB_GUNZE=y +CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y +CONFIG_TOUCHSCREEN_USB_IRTOUCH=y +CONFIG_TOUCHSCREEN_USB_IDEALTEK=y +CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y +CONFIG_TOUCHSCREEN_USB_GOTOP=y +CONFIG_TOUCHSCREEN_USB_JASTEC=y +CONFIG_TOUCHSCREEN_USB_ELO=y +CONFIG_TOUCHSCREEN_USB_E2I=y +CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y +CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y +CONFIG_TOUCHSCREEN_USB_NEXIO=y +CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y +CONFIG_TOUCHSCREEN_TOUCHIT213=m +CONFIG_TOUCHSCREEN_TSC_SERIO=m +CONFIG_TOUCHSCREEN_TSC200X_CORE=m +CONFIG_TOUCHSCREEN_TSC2004=m +CONFIG_TOUCHSCREEN_TSC2005=m +CONFIG_TOUCHSCREEN_TSC2007=m +CONFIG_TOUCHSCREEN_TSC2007_IIO=y +CONFIG_TOUCHSCREEN_PCAP=m +CONFIG_TOUCHSCREEN_RM_TS=m +CONFIG_TOUCHSCREEN_SILEAD=m +CONFIG_TOUCHSCREEN_SIS_I2C=m +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMFTS=m +CONFIG_TOUCHSCREEN_STMPE=m +CONFIG_TOUCHSCREEN_SUR40=m +CONFIG_TOUCHSCREEN_SURFACE3_SPI=m +CONFIG_TOUCHSCREEN_SX8654=m +CONFIG_TOUCHSCREEN_TPS6507X=m +CONFIG_TOUCHSCREEN_ZET6223=m +CONFIG_TOUCHSCREEN_ZFORCE=m +CONFIG_TOUCHSCREEN_COLIBRI_VF50=m +CONFIG_TOUCHSCREEN_ROHM_BU21023=m +CONFIG_TOUCHSCREEN_IQS5XX=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_88PM860X_ONKEY=m +CONFIG_INPUT_88PM80X_ONKEY=m +CONFIG_INPUT_AD714X=m +CONFIG_INPUT_AD714X_I2C=m +CONFIG_INPUT_AD714X_SPI=m +CONFIG_INPUT_ARIZONA_HAPTICS=m +CONFIG_INPUT_ATMEL_CAPTOUCH=m +CONFIG_INPUT_BMA150=m +CONFIG_INPUT_E3X0_BUTTON=m +CONFIG_INPUT_MSM_VIBRATOR=m +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_MAX77650_ONKEY=m +CONFIG_INPUT_MAX77693_HAPTIC=m +CONFIG_INPUT_MAX8925_ONKEY=m +CONFIG_INPUT_MAX8997_HAPTIC=m +CONFIG_INPUT_MC13783_PWRBUTTON=m +CONFIG_INPUT_MMA8450=m +CONFIG_INPUT_APANEL=m +CONFIG_INPUT_GP2A=m +CONFIG_INPUT_GPIO_BEEPER=m +CONFIG_INPUT_GPIO_DECODER=m 
+CONFIG_INPUT_GPIO_VIBRA=m +CONFIG_INPUT_CPCAP_PWRBUTTON=m +CONFIG_INPUT_ATLAS_BTNS=m +CONFIG_INPUT_ATI_REMOTE2=m +CONFIG_INPUT_KEYSPAN_REMOTE=m +CONFIG_INPUT_KXTJ9=m +# CONFIG_INPUT_KXTJ9_POLLED_MODE is not set +CONFIG_INPUT_POWERMATE=m +CONFIG_INPUT_YEALINK=m +CONFIG_INPUT_CM109=m +CONFIG_INPUT_REGULATOR_HAPTIC=m +CONFIG_INPUT_RETU_PWRBUTTON=m +CONFIG_INPUT_TPS65218_PWRBUTTON=m +CONFIG_INPUT_AXP20X_PEK=m +CONFIG_INPUT_TWL4030_PWRBUTTON=m +CONFIG_INPUT_TWL4030_VIBRA=m +CONFIG_INPUT_TWL6040_VIBRA=m +CONFIG_INPUT_UINPUT=m +CONFIG_INPUT_PALMAS_PWRBUTTON=m +CONFIG_INPUT_PCF50633_PMU=m +CONFIG_INPUT_PCF8574=m +CONFIG_INPUT_PWM_BEEPER=m +CONFIG_INPUT_PWM_VIBRA=m +CONFIG_INPUT_RK805_PWRKEY=m +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +CONFIG_INPUT_DA9052_ONKEY=m +CONFIG_INPUT_DA9055_ONKEY=m +CONFIG_INPUT_DA9063_ONKEY=m +CONFIG_INPUT_WM831X_ON=m +CONFIG_INPUT_PCAP=m +CONFIG_INPUT_ADXL34X=m +CONFIG_INPUT_ADXL34X_I2C=m +CONFIG_INPUT_ADXL34X_SPI=m +CONFIG_INPUT_IMS_PCU=m +CONFIG_INPUT_CMA3000=m +CONFIG_INPUT_CMA3000_I2C=m +CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m +CONFIG_INPUT_IDEAPAD_SLIDEBAR=m +CONFIG_INPUT_SOC_BUTTON_ARRAY=m +CONFIG_INPUT_DRV260X_HAPTICS=m +CONFIG_INPUT_DRV2665_HAPTICS=m +CONFIG_INPUT_DRV2667_HAPTICS=m +CONFIG_INPUT_RAVE_SP_PWRBUTTON=m +CONFIG_INPUT_STPMIC1_ONKEY=m +CONFIG_RMI4_CORE=m +CONFIG_RMI4_I2C=m +CONFIG_RMI4_SPI=m +CONFIG_RMI4_SMB=m +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=m +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +CONFIG_RMI4_F34=y +# CONFIG_RMI4_F54 is not set +CONFIG_RMI4_F55=y + +# +# Hardware I/O ports +# +CONFIG_SERIO=m +CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y +CONFIG_SERIO_I8042=m +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m +CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_LIBPS2=m +CONFIG_SERIO_RAW=m +CONFIG_SERIO_ALTERA_PS2=m +CONFIG_SERIO_PS2MULT=m +CONFIG_SERIO_ARC_PS2=m +# CONFIG_SERIO_APBPS2 is not set +CONFIG_HYPERV_KEYBOARD=m +CONFIG_SERIO_GPIO_PS2=m +CONFIG_USERIO=m +CONFIG_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_FM801=m +# end of Hardware I/O ports +# end of Input device support + +# +# Character devices +# +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y +CONFIG_VT_CONSOLE_SLEEP=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +# CONFIG_LEGACY_PTYS is not set +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +CONFIG_CYZ_INTR=y +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_SYNCLINK_GT=m +CONFIG_NOZOMI=m +CONFIG_ISI=m +CONFIG_N_HDLC=m +CONFIG_N_GSM=m +CONFIG_TRACE_ROUTER=m +CONFIG_TRACE_SINK=m +CONFIG_NULL_TTY=m +CONFIG_LDISC_AUTOLOAD=y +CONFIG_DEVMEM=y +# CONFIG_DEVKMEM is not set + +# +# Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_PNP=y +CONFIG_SERIAL_8250_FINTEK=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DMA=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=m +CONFIG_SERIAL_8250_CS=m +CONFIG_SERIAL_8250_MEN_MCB=m +CONFIG_SERIAL_8250_NR_UARTS=32 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_ASPEED_VUART=m +CONFIG_SERIAL_8250_SHARE_IRQ=y +# CONFIG_SERIAL_8250_DETECT_IRQ is not set +CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_8250_DWLIB=y +CONFIG_SERIAL_8250_DW=m +CONFIG_SERIAL_8250_RT288X=y +CONFIG_SERIAL_8250_LPSS=y +CONFIG_SERIAL_8250_MID=y 
+CONFIG_SERIAL_OF_PLATFORM=m + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_MAX3100=m +CONFIG_SERIAL_MAX310X=m +CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_SERIAL_JSM=m +CONFIG_SERIAL_SIFIVE=m +CONFIG_SERIAL_SCCNXP=m +CONFIG_SERIAL_SC16IS7XX_CORE=m +CONFIG_SERIAL_SC16IS7XX=m +CONFIG_SERIAL_SC16IS7XX_I2C=y +CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_ALTERA_JTAGUART=m +CONFIG_SERIAL_ALTERA_UART=m +CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 +CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 +CONFIG_SERIAL_IFX6X60=m +CONFIG_SERIAL_XILINX_PS_UART=m +CONFIG_SERIAL_ARC=m +CONFIG_SERIAL_ARC_NR_PORTS=1 +CONFIG_SERIAL_RP2=m +CONFIG_SERIAL_RP2_NR_UARTS=32 +CONFIG_SERIAL_FSL_LPUART=m +CONFIG_SERIAL_FSL_LINFLEXUART=m +CONFIG_SERIAL_CONEXANT_DIGICOLOR=m +CONFIG_SERIAL_MEN_Z135=m +# end of Serial drivers + +CONFIG_SERIAL_MCTRL_GPIO=y +CONFIG_SERIAL_DEV_BUS=y +CONFIG_SERIAL_DEV_CTRL_TTYPORT=y +# CONFIG_TTY_PRINTK is not set +CONFIG_PRINTER=m +# CONFIG_LP_CONSOLE is not set +CONFIG_PPDEV=m +CONFIG_HVC_DRIVER=y +CONFIG_HVC_IRQ=y +CONFIG_HVC_XEN=y +CONFIG_HVC_XEN_FRONTEND=y +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DMI_DECODE=y +CONFIG_IPMI_PLAT_DATA=y +# CONFIG_IPMI_PANIC_EVENT is not set +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m +CONFIG_IPMI_SSIF=m +CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m +CONFIG_IPMB_DEVICE_INTERFACE=m +CONFIG_HW_RANDOM=m +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_HW_RANDOM_INTEL=m +CONFIG_HW_RANDOM_AMD=m +CONFIG_HW_RANDOM_VIA=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_NVRAM=m +CONFIG_APPLICOM=m + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m +CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m +CONFIG_SCR24X=m +CONFIG_IPWIRELESS=m +# end of PCMCIA character devices + +CONFIG_MWAVE=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +CONFIG_HPET=y +CONFIG_HPET_MMAP=y +CONFIG_HPET_MMAP_DEFAULT=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TCG_TPM=m +CONFIG_HW_RANDOM_TPM=y +CONFIG_TCG_TIS_CORE=m +CONFIG_TCG_TIS=m +CONFIG_TCG_TIS_SPI=m +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m +CONFIG_TCG_INFINEON=m +CONFIG_TCG_XEN=m +CONFIG_TCG_CRB=m +CONFIG_TCG_VTPM_PROXY=m +CONFIG_TCG_TIS_ST33ZP24=m +CONFIG_TCG_TIS_ST33ZP24_I2C=m +CONFIG_TCG_TIS_ST33ZP24_SPI=m +CONFIG_TELCLOCK=m +CONFIG_DEVPORT=y +CONFIG_XILLYBUS=m +CONFIG_XILLYBUS_PCIE=m +CONFIG_XILLYBUS_OF=m +# end of Character devices + +# CONFIG_RANDOM_TRUST_CPU is not set +# CONFIG_RANDOM_TRUST_BOOTLOADER is not set + +# +# I2C support +# +CONFIG_I2C=y +CONFIG_ACPI_I2C_OPREGION=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_COMPAT=y +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m + +# +# Multiplexer I2C Chip support +# +CONFIG_I2C_ARB_GPIO_CHALLENGE=m +CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_GPMUX=m +CONFIG_I2C_MUX_LTC4306=m +CONFIG_I2C_MUX_PCA9541=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_MUX_PINCTRL=m +CONFIG_I2C_MUX_REG=m +CONFIG_I2C_DEMUX_PINCTRL=m +CONFIG_I2C_MUX_MLXCPLD=m +# end of Multiplexer I2C Chip support + +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_SMBUS=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCA=m + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI1563=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m +CONFIG_I2C_AMD756_S4882=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_AMD_MP2=m +CONFIG_I2C_I801=m +CONFIG_I2C_ISCH=m +CONFIG_I2C_ISMT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_CHT_WC=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_NFORCE2_S4985=m 
+CONFIG_I2C_NVIDIA_GPU=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m + +# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CBUS_GPIO=m +CONFIG_I2C_DESIGNWARE_CORE=y +CONFIG_I2C_DESIGNWARE_PLATFORM=y +CONFIG_I2C_DESIGNWARE_SLAVE=y +CONFIG_I2C_DESIGNWARE_PCI=m +CONFIG_I2C_DESIGNWARE_BAYTRAIL=y +CONFIG_I2C_EMEV2=m +CONFIG_I2C_GPIO=m +# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +CONFIG_I2C_KEMPLD=m +CONFIG_I2C_OCORES=m +CONFIG_I2C_PCA_PLATFORM=m +CONFIG_I2C_RK3X=m +CONFIG_I2C_SIMTEC=m +CONFIG_I2C_XILINX=m + +# +# External I2C/SMBus adapter drivers +# +CONFIG_I2C_DIOLAN_U2C=m +CONFIG_I2C_DLN2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_PARPORT_LIGHT=m +CONFIG_I2C_ROBOTFUZZ_OSIF=m +CONFIG_I2C_TAOS_EVM=m +CONFIG_I2C_TINY_USB=m +CONFIG_I2C_VIPERBOARD=m + +# +# Other I2C/SMBus bus drivers +# +CONFIG_I2C_MLXCPLD=m +CONFIG_I2C_CROS_EC_TUNNEL=m +CONFIG_I2C_FSI=m +# end of I2C Hardware Bus support + +CONFIG_I2C_STUB=m +CONFIG_I2C_SLAVE=y +CONFIG_I2C_SLAVE_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# end of I2C support + +CONFIG_I3C=m +CONFIG_CDNS_I3C_MASTER=m +CONFIG_DW_I3C_MASTER=m +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y +CONFIG_SPI_MEM=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_ALTERA=m +CONFIG_SPI_AXI_SPI_ENGINE=m +CONFIG_SPI_BITBANG=m +CONFIG_SPI_BUTTERFLY=m +CONFIG_SPI_CADENCE=m +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_PCI=m +CONFIG_SPI_DW_MID_DMA=y +CONFIG_SPI_DW_MMIO=m +CONFIG_SPI_DLN2=m +CONFIG_SPI_NXP_FLEXSPI=m +CONFIG_SPI_GPIO=m +CONFIG_SPI_LM70_LLP=m +CONFIG_SPI_FSL_LIB=m +CONFIG_SPI_FSL_SPI=m +CONFIG_SPI_OC_TINY=m +CONFIG_SPI_PXA2XX=m +CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_SC18IS602=m +CONFIG_SPI_SIFIVE=m +CONFIG_SPI_MXIC=m +CONFIG_SPI_XCOMM=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_ZYNQMP_GQSPI=m + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=m +CONFIG_SPI_LOOPBACK_TEST=m +CONFIG_SPI_TLE62X0=m +CONFIG_SPI_SLAVE=y +CONFIG_SPI_SLAVE_TIME=m +CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m +CONFIG_SPMI=m +CONFIG_HSI=m +CONFIG_HSI_BOARDINFO=y + +# +# HSI controllers +# + +# +# HSI clients +# +CONFIG_HSI_CHAR=m +CONFIG_PPS=y +# CONFIG_PPS_DEBUG is not set + +# +# PPS clients support +# +CONFIG_PPS_CLIENT_KTIMER=m +CONFIG_PPS_CLIENT_LDISC=m +CONFIG_PPS_CLIENT_PARPORT=m +CONFIG_PPS_CLIENT_GPIO=m + +# +# PPS generators support +# + +# +# PTP clock support +# +CONFIG_PTP_1588_CLOCK=y +CONFIG_DP83640_PHY=m +CONFIG_PTP_1588_CLOCK_KVM=m +# end of PTP clock support + +CONFIG_PINCTRL=y +CONFIG_GENERIC_PINCTRL_GROUPS=y +CONFIG_PINMUX=y +CONFIG_GENERIC_PINMUX_FUNCTIONS=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +CONFIG_PINCTRL_AS3722=m +CONFIG_PINCTRL_AXP209=m +CONFIG_PINCTRL_AMD=m +CONFIG_PINCTRL_MCP23S08=m +CONFIG_PINCTRL_SINGLE=m +CONFIG_PINCTRL_SX150X=y +CONFIG_PINCTRL_STMFX=m +CONFIG_PINCTRL_MAX77620=m +CONFIG_PINCTRL_PALMAS=m +CONFIG_PINCTRL_RK805=m +CONFIG_PINCTRL_OCELOT=y +CONFIG_PINCTRL_BAYTRAIL=y +CONFIG_PINCTRL_CHERRYVIEW=y +CONFIG_PINCTRL_INTEL=y +CONFIG_PINCTRL_BROXTON=y +CONFIG_PINCTRL_CANNONLAKE=y +CONFIG_PINCTRL_CEDARFORK=y +CONFIG_PINCTRL_DENVERTON=y +CONFIG_PINCTRL_GEMINILAKE=y +CONFIG_PINCTRL_ICELAKE=y +CONFIG_PINCTRL_LEWISBURG=y +CONFIG_PINCTRL_SUNRISEPOINT=y +CONFIG_PINCTRL_LOCHNAGAR=m +CONFIG_PINCTRL_MADERA=m +CONFIG_PINCTRL_CS47L15=y +CONFIG_PINCTRL_CS47L35=y +CONFIG_PINCTRL_CS47L85=y +CONFIG_PINCTRL_CS47L90=y 
+CONFIG_PINCTRL_CS47L92=y +CONFIG_GPIOLIB=y +CONFIG_GPIOLIB_FASTPATH_LIMIT=512 +CONFIG_OF_GPIO=y +CONFIG_GPIO_ACPI=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC=y +CONFIG_GPIO_MAX730X=m + +# +# Memory mapped GPIO drivers +# +CONFIG_GPIO_74XX_MMIO=m +CONFIG_GPIO_ALTERA=m +CONFIG_GPIO_AMDPT=m +CONFIG_GPIO_CADENCE=m +CONFIG_GPIO_DWAPB=m +CONFIG_GPIO_EXAR=m +CONFIG_GPIO_FTGPIO010=y +CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_GRGPIO=m +CONFIG_GPIO_HLWD=m +CONFIG_GPIO_ICH=m +CONFIG_GPIO_LYNXPOINT=m +CONFIG_GPIO_MB86S7X=m +CONFIG_GPIO_MENZ127=m +CONFIG_GPIO_SAMA5D2_PIOBU=m +CONFIG_GPIO_SIOX=m +CONFIG_GPIO_SYSCON=m +CONFIG_GPIO_VX855=m +CONFIG_GPIO_XILINX=m +CONFIG_GPIO_AMD_FCH=m +# end of Memory mapped GPIO drivers + +# +# Port-mapped I/O GPIO drivers +# +CONFIG_GPIO_F7188X=m +CONFIG_GPIO_IT87=m +CONFIG_GPIO_SCH=m +CONFIG_GPIO_SCH311X=m +CONFIG_GPIO_WINBOND=m +CONFIG_GPIO_WS16C48=m +# end of Port-mapped I/O GPIO drivers + +# +# I2C GPIO expanders +# +CONFIG_GPIO_ADP5588=m +CONFIG_GPIO_ADNP=m +CONFIG_GPIO_GW_PLD=m +CONFIG_GPIO_MAX7300=m +CONFIG_GPIO_MAX732X=m +CONFIG_GPIO_PCA953X=m +CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_TPIC2810=m +# end of I2C GPIO expanders + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ADP5520=m +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_BD70528=m +CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_CRYSTAL_COVE=m +CONFIG_GPIO_DA9052=m +CONFIG_GPIO_DA9055=m +CONFIG_GPIO_DLN2=m +CONFIG_GPIO_JANZ_TTL=m +CONFIG_GPIO_KEMPLD=m +CONFIG_GPIO_LP3943=m +CONFIG_GPIO_LP873X=m +CONFIG_GPIO_LP87565=m +CONFIG_GPIO_MADERA=m +CONFIG_GPIO_MAX77620=m +CONFIG_GPIO_MAX77650=m +CONFIG_GPIO_PALMAS=y +CONFIG_GPIO_RC5T583=y +CONFIG_GPIO_STMPE=y +CONFIG_GPIO_TC3589X=y +CONFIG_GPIO_TPS65086=m +CONFIG_GPIO_TPS65218=m +CONFIG_GPIO_TPS6586X=y +CONFIG_GPIO_TPS65910=y +CONFIG_GPIO_TPS65912=m +CONFIG_GPIO_TPS68470=y +CONFIG_GPIO_TQMX86=m +CONFIG_GPIO_TWL4030=m +CONFIG_GPIO_TWL6040=m +CONFIG_GPIO_UCB1400=m +CONFIG_GPIO_WHISKEY_COVE=m +CONFIG_GPIO_WM831X=m +CONFIG_GPIO_WM8350=m +CONFIG_GPIO_WM8994=m +# end of MFD GPIO expanders + +# +# PCI GPIO expanders +# +CONFIG_GPIO_AMD8111=m +CONFIG_GPIO_ML_IOH=m +CONFIG_GPIO_PCI_IDIO_16=m +CONFIG_GPIO_PCIE_IDIO_24=m +CONFIG_GPIO_RDC321X=m +CONFIG_GPIO_SODAVILLE=y +# end of PCI GPIO expanders + +# +# SPI GPIO expanders +# +CONFIG_GPIO_74X164=m +CONFIG_GPIO_MAX3191X=m +CONFIG_GPIO_MAX7301=m +CONFIG_GPIO_MC33880=m +CONFIG_GPIO_PISOSR=m +CONFIG_GPIO_XRA1403=m +CONFIG_GPIO_MOXTET=m +# end of SPI GPIO expanders + +# +# USB GPIO expanders +# +CONFIG_GPIO_VIPERBOARD=m +# end of USB GPIO expanders + +CONFIG_GPIO_MOCKUP=m +CONFIG_W1=m +CONFIG_W1_CON=y + +# +# 1-wire Bus Masters +# +CONFIG_W1_MASTER_MATROX=m +CONFIG_W1_MASTER_DS2490=m +CONFIG_W1_MASTER_DS2482=m +CONFIG_W1_MASTER_DS1WM=m +CONFIG_W1_MASTER_GPIO=m +CONFIG_W1_MASTER_SGI=m +# end of 1-wire Bus Masters + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +CONFIG_W1_SLAVE_SMEM=m +CONFIG_W1_SLAVE_DS2405=m +CONFIG_W1_SLAVE_DS2408=m +# CONFIG_W1_SLAVE_DS2408_READBACK is not set +CONFIG_W1_SLAVE_DS2413=m +CONFIG_W1_SLAVE_DS2406=m +CONFIG_W1_SLAVE_DS2423=m +CONFIG_W1_SLAVE_DS2805=m +CONFIG_W1_SLAVE_DS2431=m +CONFIG_W1_SLAVE_DS2433=m +# CONFIG_W1_SLAVE_DS2433_CRC is not set +CONFIG_W1_SLAVE_DS2438=m +CONFIG_W1_SLAVE_DS250X=m +CONFIG_W1_SLAVE_DS2780=m +CONFIG_W1_SLAVE_DS2781=m +CONFIG_W1_SLAVE_DS28E04=m +CONFIG_W1_SLAVE_DS28E17=m +# end of 1-wire Slaves + +CONFIG_POWER_AVS=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_AS3722=y +CONFIG_POWER_RESET_GPIO=y +CONFIG_POWER_RESET_GPIO_RESTART=y 
+CONFIG_POWER_RESET_LTC2952=y +CONFIG_POWER_RESET_RESTART=y +CONFIG_POWER_RESET_SYSCON=y +CONFIG_POWER_RESET_SYSCON_POWEROFF=y +CONFIG_REBOOT_MODE=m +CONFIG_SYSCON_REBOOT_MODE=m +CONFIG_NVMEM_REBOOT_MODE=m +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_POWER_SUPPLY_HWMON=y +CONFIG_PDA_POWER=m +CONFIG_GENERIC_ADC_BATTERY=m +CONFIG_MAX8925_POWER=m +CONFIG_WM831X_BACKUP=m +CONFIG_WM831X_POWER=m +CONFIG_WM8350_POWER=m +CONFIG_TEST_POWER=m +CONFIG_BATTERY_88PM860X=m +CONFIG_CHARGER_ADP5061=m +CONFIG_BATTERY_ACT8945A=m +CONFIG_BATTERY_CPCAP=m +CONFIG_BATTERY_DS2760=m +CONFIG_BATTERY_DS2780=m +CONFIG_BATTERY_DS2781=m +CONFIG_BATTERY_DS2782=m +CONFIG_BATTERY_LEGO_EV3=m +CONFIG_BATTERY_SBS=m +CONFIG_CHARGER_SBS=m +CONFIG_MANAGER_SBS=m +CONFIG_BATTERY_BQ27XXX=m +CONFIG_BATTERY_BQ27XXX_I2C=m +CONFIG_BATTERY_BQ27XXX_HDQ=m +# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set +CONFIG_BATTERY_DA9030=m +CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_DA9150=m +CONFIG_BATTERY_DA9150=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m +CONFIG_AXP288_CHARGER=m +CONFIG_AXP288_FUEL_GAUGE=m +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_BATTERY_MAX1721X=m +CONFIG_BATTERY_TWL4030_MADC=m +CONFIG_CHARGER_88PM860X=m +CONFIG_CHARGER_PCF50633=m +CONFIG_BATTERY_RX51=m +CONFIG_CHARGER_ISP1704=m +CONFIG_CHARGER_MAX8903=m +CONFIG_CHARGER_TWL4030=m +CONFIG_CHARGER_LP8727=m +CONFIG_CHARGER_LP8788=m +CONFIG_CHARGER_GPIO=m +CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_LT3651=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_DETECTOR_MAX14656=m +CONFIG_CHARGER_MAX77650=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_MAX8997=m +CONFIG_CHARGER_MAX8998=m +CONFIG_CHARGER_BQ2415X=m +CONFIG_CHARGER_BQ24190=m +CONFIG_CHARGER_BQ24257=m +CONFIG_CHARGER_BQ24735=m +CONFIG_CHARGER_BQ25890=m +CONFIG_CHARGER_SMB347=m +CONFIG_CHARGER_TPS65090=m +CONFIG_CHARGER_TPS65217=m +CONFIG_BATTERY_GAUGE_LTC2941=m +CONFIG_BATTERY_RT5033=m +CONFIG_CHARGER_RT9455=m +CONFIG_CHARGER_CROS_USBPD=m +CONFIG_CHARGER_UCS1002=m +CONFIG_CHARGER_BD70528=m +CONFIG_CHARGER_WILCO=m +CONFIG_HWMON=y +CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +CONFIG_SENSORS_ABITUGURU=m +CONFIG_SENSORS_ABITUGURU3=m +CONFIG_SENSORS_AD7314=m +CONFIG_SENSORS_AD7414=m +CONFIG_SENSORS_AD7418=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM1026=m +CONFIG_SENSORS_ADM1029=m +CONFIG_SENSORS_ADM1031=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_ADT7X10=m +CONFIG_SENSORS_ADT7310=m +CONFIG_SENSORS_ADT7410=m +CONFIG_SENSORS_ADT7411=m +CONFIG_SENSORS_ADT7462=m +CONFIG_SENSORS_ADT7470=m +CONFIG_SENSORS_ADT7475=m +CONFIG_SENSORS_AS370=m +CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m +CONFIG_SENSORS_FAM15H_POWER=m +CONFIG_SENSORS_APPLESMC=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m +CONFIG_SENSORS_ATXP1=m +CONFIG_SENSORS_DS620=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_DELL_SMM=m +CONFIG_SENSORS_DA9052_ADC=m +CONFIG_SENSORS_DA9055=m +CONFIG_SENSORS_I5K_AMB=m +CONFIG_SENSORS_F71805F=m +CONFIG_SENSORS_F71882FG=m +CONFIG_SENSORS_F75375S=m +CONFIG_SENSORS_MC13783_ADC=m +CONFIG_SENSORS_FSCHMD=m +CONFIG_SENSORS_FTSTEUTATES=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_G760A=m +CONFIG_SENSORS_G762=m +CONFIG_SENSORS_GPIO_FAN=m +CONFIG_SENSORS_HIH6130=m +CONFIG_SENSORS_IBMAEM=m +CONFIG_SENSORS_IBMPEX=m +CONFIG_SENSORS_IIO_HWMON=m +CONFIG_SENSORS_I5500=m +CONFIG_SENSORS_CORETEMP=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_JC42=m 
+CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LINEAGE=m +CONFIG_SENSORS_LOCHNAGAR=m +CONFIG_SENSORS_LTC2945=m +CONFIG_SENSORS_LTC2990=m +CONFIG_SENSORS_LTC4151=m +CONFIG_SENSORS_LTC4215=m +CONFIG_SENSORS_LTC4222=m +CONFIG_SENSORS_LTC4245=m +CONFIG_SENSORS_LTC4260=m +CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_MAX1111=m +CONFIG_SENSORS_MAX16065=m +CONFIG_SENSORS_MAX1619=m +CONFIG_SENSORS_MAX1668=m +CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m +CONFIG_SENSORS_MAX6621=m +CONFIG_SENSORS_MAX6639=m +CONFIG_SENSORS_MAX6642=m +CONFIG_SENSORS_MAX6650=m +CONFIG_SENSORS_MAX6697=m +CONFIG_SENSORS_MAX31790=m +CONFIG_SENSORS_MCP3021=m +CONFIG_SENSORS_MLXREG_FAN=m +CONFIG_SENSORS_TC654=m +CONFIG_SENSORS_MENF21BMC_HWMON=m +CONFIG_SENSORS_ADCXX=m +CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM70=m +CONFIG_SENSORS_LM73=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM77=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_LM93=m +CONFIG_SENSORS_LM95234=m +CONFIG_SENSORS_LM95241=m +CONFIG_SENSORS_LM95245=m +CONFIG_SENSORS_PC87360=m +CONFIG_SENSORS_PC87427=m +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_SENSORS_NCT6683=m +CONFIG_SENSORS_NCT6775=m +CONFIG_SENSORS_NCT7802=m +CONFIG_SENSORS_NCT7904=m +CONFIG_SENSORS_NPCM7XX=m +CONFIG_SENSORS_PCF8591=m +CONFIG_PMBUS=m +CONFIG_SENSORS_PMBUS=m +CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_IBM_CFFPS=m +CONFIG_SENSORS_INSPUR_IPSPS=m +CONFIG_SENSORS_IR35221=m +CONFIG_SENSORS_IR38064=m +CONFIG_SENSORS_IRPS5401=m +CONFIG_SENSORS_ISL68137=m +CONFIG_SENSORS_LM25066=m +CONFIG_SENSORS_LTC2978=m +# CONFIG_SENSORS_LTC2978_REGULATOR is not set +CONFIG_SENSORS_LTC3815=m +CONFIG_SENSORS_MAX16064=m +CONFIG_SENSORS_MAX20751=m +CONFIG_SENSORS_MAX31785=m +CONFIG_SENSORS_MAX34440=m +CONFIG_SENSORS_MAX8688=m +CONFIG_SENSORS_PXE1610=m +CONFIG_SENSORS_TPS40422=m +CONFIG_SENSORS_TPS53679=m +CONFIG_SENSORS_UCD9000=m +CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_PWM_FAN=m +CONFIG_SENSORS_SHT15=m +CONFIG_SENSORS_SHT21=m +CONFIG_SENSORS_SHT3x=m +CONFIG_SENSORS_SHTC1=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_DME1737=m +CONFIG_SENSORS_EMC1403=m +CONFIG_SENSORS_EMC2103=m +CONFIG_SENSORS_EMC6W201=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_SMSC47M192=m +CONFIG_SENSORS_SMSC47B397=m +CONFIG_SENSORS_SCH56XX_COMMON=m +CONFIG_SENSORS_SCH5627=m +CONFIG_SENSORS_SCH5636=m +CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SMM665=m +CONFIG_SENSORS_ADC128D818=m +CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_ADS7871=m +CONFIG_SENSORS_AMC6821=m +CONFIG_SENSORS_INA209=m +CONFIG_SENSORS_INA2XX=m +CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_TC74=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_TMP102=m +CONFIG_SENSORS_TMP103=m +CONFIG_SENSORS_TMP108=m +CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_VIA_CPUTEMP=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83773G=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83791D=m +CONFIG_SENSORS_W83792D=m +CONFIG_SENSORS_W83793=m +CONFIG_SENSORS_W83795=m +# CONFIG_SENSORS_W83795_FANCTRL is not set +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83L786NG=m +CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM831X=m +CONFIG_SENSORS_WM8350=m +CONFIG_SENSORS_XGENE=m + +# +# ACPI drivers +# +CONFIG_SENSORS_ACPI_POWER=m +CONFIG_SENSORS_ATK0110=m +CONFIG_THERMAL=y +# CONFIG_THERMAL_STATISTICS is not set +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 +CONFIG_THERMAL_HWMON=y +CONFIG_THERMAL_OF=y 
+CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set +CONFIG_THERMAL_GOV_FAIR_SHARE=y +CONFIG_THERMAL_GOV_STEP_WISE=y +CONFIG_THERMAL_GOV_BANG_BANG=y +CONFIG_THERMAL_GOV_USER_SPACE=y +CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y +CONFIG_CPU_THERMAL=y +CONFIG_CLOCK_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +# CONFIG_THERMAL_EMULATION is not set +CONFIG_THERMAL_MMIO=m +CONFIG_MAX77620_THERMAL=m +CONFIG_QORIQ_THERMAL=m +CONFIG_DA9062_THERMAL=m + +# +# Intel thermal drivers +# +CONFIG_INTEL_POWERCLAMP=m +CONFIG_X86_PKG_TEMP_THERMAL=m +CONFIG_INTEL_SOC_DTS_IOSF_CORE=m +CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# +CONFIG_INT340X_THERMAL=m +CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m +CONFIG_PROC_THERMAL_MMIO_RAPL=y +# end of ACPI INT340X thermal drivers + +CONFIG_INTEL_BXT_PMIC_THERMAL=m +CONFIG_INTEL_PCH_THERMAL=m +# end of Intel thermal drivers + +CONFIG_GENERIC_ADC_THERMAL=m +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +CONFIG_WATCHDOG_OPEN_TIMEOUT=0 +CONFIG_WATCHDOG_SYSFS=y + +# +# Watchdog Pretimeout Governors +# +CONFIG_WATCHDOG_PRETIMEOUT_GOV=y +CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y +# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set +CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set +CONFIG_BD70528_WATCHDOG=m +CONFIG_DA9052_WATCHDOG=m +CONFIG_DA9055_WATCHDOG=m +CONFIG_DA9063_WATCHDOG=m +CONFIG_DA9062_WATCHDOG=m +CONFIG_GPIO_WATCHDOG=m +CONFIG_MENF21BMC_WATCHDOG=m +CONFIG_MENZ069_WATCHDOG=m +CONFIG_WDAT_WDT=m +CONFIG_WM831X_WATCHDOG=m +CONFIG_WM8350_WATCHDOG=m +CONFIG_XILINX_WATCHDOG=m +CONFIG_ZIIRAVE_WATCHDOG=m +CONFIG_RAVE_SP_WATCHDOG=m +CONFIG_MLX_WDT=m +CONFIG_CADENCE_WATCHDOG=m +CONFIG_DW_WATCHDOG=m +CONFIG_RN5T618_WATCHDOG=m +CONFIG_TWL4030_WATCHDOG=m +CONFIG_MAX63XX_WATCHDOG=m +CONFIG_MAX77620_WATCHDOG=m +CONFIG_RETU_WATCHDOG=m +CONFIG_STPMIC1_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_EBC_C384_WDT=m +CONFIG_F71808E_WDT=m +CONFIG_SP5100_TCO=m +CONFIG_SBC_FITPC2_WATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_IBMASR=m +CONFIG_WAFER_WDT=m +CONFIG_I6300ESB_WDT=m +CONFIG_IE6XX_WDT=m +CONFIG_ITCO_WDT=m +CONFIG_ITCO_VENDOR_SUPPORT=y +CONFIG_IT8712F_WDT=m +CONFIG_IT87_WDT=m +CONFIG_HP_WATCHDOG=m +CONFIG_HPWDT_NMI_DECODING=y +CONFIG_KEMPLD_WDT=m +CONFIG_SC1200_WDT=m +CONFIG_PC87413_WDT=m +CONFIG_NV_TCO=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_SMSC_SCH311X_WDT=m +CONFIG_SMSC37B787_WDT=m +CONFIG_TQMX86_WDT=m +CONFIG_VIA_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_W83977F_WDT=m +CONFIG_MACHZ_WDT=m +CONFIG_SBC_EPX_C3_WATCHDOG=m +CONFIG_INTEL_MEI_WDT=m +CONFIG_NI903X_WDT=m +CONFIG_NIC7018_WDT=m +CONFIG_MEN_A21_WDT=m +CONFIG_XEN_WDT=m + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_PCMCIAHOST_POSSIBLE=y +CONFIG_SSB_PCMCIAHOST=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y +CONFIG_SSB_SDIOHOST=y 
+CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +CONFIG_SSB_DRIVER_GPIO=y +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +# CONFIG_BCMA_HOST_SOC is not set +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +CONFIG_BCMA_DRIVER_GPIO=y +# CONFIG_BCMA_DEBUG is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +CONFIG_MFD_ACT8945A=m +CONFIG_MFD_AS3711=y +CONFIG_MFD_AS3722=m +CONFIG_PMIC_ADP5520=y +CONFIG_MFD_AAT2870_CORE=y +CONFIG_MFD_ATMEL_FLEXCOM=m +CONFIG_MFD_ATMEL_HLCDC=m +CONFIG_MFD_BCM590XX=m +CONFIG_MFD_BD9571MWV=m +CONFIG_MFD_AXP20X=m +CONFIG_MFD_AXP20X_I2C=m +CONFIG_MFD_CROS_EC_DEV=m +CONFIG_MFD_MADERA=m +CONFIG_MFD_MADERA_I2C=m +CONFIG_MFD_MADERA_SPI=m +CONFIG_MFD_CS47L15=y +CONFIG_MFD_CS47L35=y +CONFIG_MFD_CS47L85=y +CONFIG_MFD_CS47L90=y +CONFIG_MFD_CS47L92=y +CONFIG_PMIC_DA903X=y +CONFIG_PMIC_DA9052=y +CONFIG_MFD_DA9052_SPI=y +CONFIG_MFD_DA9052_I2C=y +CONFIG_MFD_DA9055=y +CONFIG_MFD_DA9062=m +CONFIG_MFD_DA9063=m +CONFIG_MFD_DA9150=m +CONFIG_MFD_DLN2=m +CONFIG_MFD_MC13XXX=m +CONFIG_MFD_MC13XXX_SPI=m +CONFIG_MFD_MC13XXX_I2C=m +CONFIG_MFD_HI6421_PMIC=m +CONFIG_HTC_PASIC3=m +CONFIG_HTC_I2CPLD=y +CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m +CONFIG_LPC_ICH=m +CONFIG_LPC_SCH=m +CONFIG_INTEL_SOC_PMIC=y +CONFIG_INTEL_SOC_PMIC_BXTWC=m +CONFIG_INTEL_SOC_PMIC_CHTWC=y +CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m +CONFIG_MFD_INTEL_LPSS=m +CONFIG_MFD_INTEL_LPSS_ACPI=m +CONFIG_MFD_INTEL_LPSS_PCI=m +CONFIG_MFD_JANZ_CMODIO=m +CONFIG_MFD_KEMPLD=m +CONFIG_MFD_88PM800=m +CONFIG_MFD_88PM805=m +CONFIG_MFD_88PM860X=y +CONFIG_MFD_MAX14577=m +CONFIG_MFD_MAX77620=y +CONFIG_MFD_MAX77650=m +CONFIG_MFD_MAX77686=m +CONFIG_MFD_MAX77693=m +CONFIG_MFD_MAX77843=y +CONFIG_MFD_MAX8907=m +CONFIG_MFD_MAX8925=y +CONFIG_MFD_MAX8997=y +CONFIG_MFD_MAX8998=y +CONFIG_MFD_MT6397=m +CONFIG_MFD_MENF21BMC=m +CONFIG_EZX_PCAP=y +CONFIG_MFD_CPCAP=m +CONFIG_MFD_VIPERBOARD=m +CONFIG_MFD_RETU=m +CONFIG_MFD_PCF50633=m +CONFIG_PCF50633_ADC=m +CONFIG_PCF50633_GPIO=m +CONFIG_UCB1400_CORE=m +CONFIG_MFD_RDC321X=m +CONFIG_MFD_RT5033=m +CONFIG_MFD_RC5T583=y +CONFIG_MFD_RK808=m +CONFIG_MFD_RN5T618=m +CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SI476X_CORE=m +CONFIG_MFD_SM501=m +CONFIG_MFD_SM501_GPIO=y +CONFIG_MFD_SKY81452=m +CONFIG_MFD_SMSC=y +CONFIG_ABX500_CORE=y +CONFIG_AB3100_CORE=y +CONFIG_AB3100_OTP=y +CONFIG_MFD_STMPE=y + +# +# STMicroelectronics STMPE Interface Drivers +# +CONFIG_STMPE_I2C=y +CONFIG_STMPE_SPI=y +# end of STMicroelectronics STMPE Interface Drivers + +CONFIG_MFD_SYSCON=y +CONFIG_MFD_TI_AM335X_TSCADC=m +CONFIG_MFD_LP3943=m +CONFIG_MFD_LP8788=y +CONFIG_MFD_TI_LMU=m +CONFIG_MFD_PALMAS=y +CONFIG_TPS6105X=m +CONFIG_TPS65010=m +CONFIG_TPS6507X=m +CONFIG_MFD_TPS65086=m +CONFIG_MFD_TPS65090=y +CONFIG_MFD_TPS65217=m +CONFIG_MFD_TPS68470=y +CONFIG_MFD_TI_LP873X=m +CONFIG_MFD_TI_LP87565=m +CONFIG_MFD_TPS65218=m +CONFIG_MFD_TPS6586X=y +CONFIG_MFD_TPS65910=y +CONFIG_MFD_TPS65912=m +CONFIG_MFD_TPS65912_I2C=m +CONFIG_MFD_TPS65912_SPI=m +CONFIG_MFD_TPS80031=y +CONFIG_TWL4030_CORE=y +CONFIG_MFD_TWL4030_AUDIO=y +CONFIG_TWL6040_CORE=y +CONFIG_MFD_WL1273_CORE=m +CONFIG_MFD_LM3533=m +CONFIG_MFD_TC3589X=y +CONFIG_MFD_TQMX86=m +CONFIG_MFD_VX855=m +CONFIG_MFD_LOCHNAGAR=y +CONFIG_MFD_ARIZONA=y +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +CONFIG_MFD_CS47L24=y +CONFIG_MFD_WM5102=y +CONFIG_MFD_WM5110=y +CONFIG_MFD_WM8997=y +CONFIG_MFD_WM8998=y +CONFIG_MFD_WM8400=y +CONFIG_MFD_WM831X=y +CONFIG_MFD_WM831X_I2C=y +CONFIG_MFD_WM831X_SPI=y +CONFIG_MFD_WM8350=y 
+CONFIG_MFD_WM8350_I2C=y +CONFIG_MFD_WM8994=m +CONFIG_MFD_ROHM_BD718XX=m +CONFIG_MFD_ROHM_BD70528=m +CONFIG_MFD_STPMIC1=m +CONFIG_MFD_STMFX=m +CONFIG_RAVE_SP_CORE=m +# end of Multifunction device drivers + +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set +CONFIG_REGULATOR_FIXED_VOLTAGE=m +CONFIG_REGULATOR_VIRTUAL_CONSUMER=m +CONFIG_REGULATOR_USERSPACE_CONSUMER=m +CONFIG_REGULATOR_88PG86X=m +CONFIG_REGULATOR_88PM800=m +CONFIG_REGULATOR_88PM8607=m +CONFIG_REGULATOR_ACT8865=m +CONFIG_REGULATOR_ACT8945A=m +CONFIG_REGULATOR_AD5398=m +CONFIG_REGULATOR_ANATOP=m +CONFIG_REGULATOR_AAT2870=m +CONFIG_REGULATOR_AB3100=m +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +CONFIG_REGULATOR_AS3711=m +CONFIG_REGULATOR_AS3722=m +CONFIG_REGULATOR_AXP20X=m +CONFIG_REGULATOR_BCM590XX=m +CONFIG_REGULATOR_BD70528=m +CONFIG_REGULATOR_BD718XX=m +CONFIG_REGULATOR_BD9571MWV=m +CONFIG_REGULATOR_CPCAP=m +CONFIG_REGULATOR_DA903X=m +CONFIG_REGULATOR_DA9052=m +CONFIG_REGULATOR_DA9055=m +CONFIG_REGULATOR_DA9062=m +CONFIG_REGULATOR_DA9063=m +CONFIG_REGULATOR_DA9210=m +CONFIG_REGULATOR_DA9211=m +CONFIG_REGULATOR_FAN53555=m +CONFIG_REGULATOR_GPIO=m +CONFIG_REGULATOR_HI6421=m +CONFIG_REGULATOR_HI6421V530=m +CONFIG_REGULATOR_ISL9305=m +CONFIG_REGULATOR_ISL6271A=m +CONFIG_REGULATOR_LM363X=m +CONFIG_REGULATOR_LOCHNAGAR=m +CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_LP3972=m +CONFIG_REGULATOR_LP872X=m +CONFIG_REGULATOR_LP873X=m +CONFIG_REGULATOR_LP8755=m +CONFIG_REGULATOR_LP87565=m +CONFIG_REGULATOR_LP8788=m +CONFIG_REGULATOR_LTC3589=m +CONFIG_REGULATOR_LTC3676=m +CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX77620=m +CONFIG_REGULATOR_MAX77650=m +CONFIG_REGULATOR_MAX8649=m +CONFIG_REGULATOR_MAX8660=m +CONFIG_REGULATOR_MAX8907=m +CONFIG_REGULATOR_MAX8925=m +CONFIG_REGULATOR_MAX8952=m +CONFIG_REGULATOR_MAX8973=m +CONFIG_REGULATOR_MAX8997=m +CONFIG_REGULATOR_MAX8998=m +CONFIG_REGULATOR_MAX77686=m +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MAX77802=m +CONFIG_REGULATOR_MC13XXX_CORE=m +CONFIG_REGULATOR_MC13783=m +CONFIG_REGULATOR_MC13892=m +CONFIG_REGULATOR_MCP16502=m +CONFIG_REGULATOR_MT6311=m +CONFIG_REGULATOR_MT6323=m +CONFIG_REGULATOR_MT6397=m +CONFIG_REGULATOR_PALMAS=m +CONFIG_REGULATOR_PCAP=m +CONFIG_REGULATOR_PCF50633=m +CONFIG_REGULATOR_PFUZE100=m +CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m +CONFIG_REGULATOR_PV88090=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_SPMI=m +CONFIG_REGULATOR_RC5T583=m +CONFIG_REGULATOR_RK808=m +CONFIG_REGULATOR_RN5T618=m +CONFIG_REGULATOR_RT5033=m +CONFIG_REGULATOR_S2MPA01=m +CONFIG_REGULATOR_S2MPS11=m +CONFIG_REGULATOR_S5M8767=m +CONFIG_REGULATOR_SKY81452=m +CONFIG_REGULATOR_SLG51000=m +CONFIG_REGULATOR_STPMIC1=m +CONFIG_REGULATOR_SY8106A=m +CONFIG_REGULATOR_SY8824X=m +CONFIG_REGULATOR_TPS51632=m +CONFIG_REGULATOR_TPS6105X=m +CONFIG_REGULATOR_TPS62360=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m +CONFIG_REGULATOR_TPS65086=m +CONFIG_REGULATOR_TPS65090=m +CONFIG_REGULATOR_TPS65132=m +CONFIG_REGULATOR_TPS65217=m +CONFIG_REGULATOR_TPS65218=m +CONFIG_REGULATOR_TPS6524X=m +CONFIG_REGULATOR_TPS6586X=m +CONFIG_REGULATOR_TPS65910=m +CONFIG_REGULATOR_TPS65912=m +CONFIG_REGULATOR_TPS80031=m +CONFIG_REGULATOR_TWL4030=m +CONFIG_REGULATOR_VCTRL=m +CONFIG_REGULATOR_WM831X=m +CONFIG_REGULATOR_WM8350=m +CONFIG_REGULATOR_WM8400=m +CONFIG_REGULATOR_WM8994=m +CONFIG_CEC_CORE=y +CONFIG_CEC_NOTIFIER=y +CONFIG_RC_CORE=m +CONFIG_RC_MAP=m +CONFIG_LIRC=y +CONFIG_RC_DECODERS=y +CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m 
+CONFIG_IR_RC6_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_SANYO_DECODER=m +CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_IR_IMON_DECODER=m +CONFIG_IR_RCMM_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_RC_ATI_REMOTE=m +CONFIG_IR_ENE=m +CONFIG_IR_HIX5HD2=m +CONFIG_IR_IMON=m +CONFIG_IR_IMON_RAW=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_ITE_CIR=m +CONFIG_IR_FINTEK=m +CONFIG_IR_NUVOTON=m +CONFIG_IR_REDRAT3=m +CONFIG_IR_SPI=m +CONFIG_IR_STREAMZAP=m +CONFIG_IR_WINBOND_CIR=m +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_LOOPBACK=m +CONFIG_IR_GPIO_CIR=m +CONFIG_IR_GPIO_TX=m +CONFIG_IR_PWM_TX=m +CONFIG_IR_SERIAL=m +CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m +CONFIG_RC_XBOX_DVD=m +CONFIG_MEDIA_SUPPORT=m + +# +# Multimedia core support +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_CEC_SUPPORT=y +CONFIG_MEDIA_CONTROLLER=y +CONFIG_MEDIA_CONTROLLER_DVB=y +# CONFIG_MEDIA_CONTROLLER_REQUEST_API is not set +CONFIG_VIDEO_DEV=m +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_VIDEO_V4L2=m +CONFIG_VIDEO_V4L2_I2C=y +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_MEM2MEM_DEV=m +CONFIG_V4L2_FLASH_LED_CLASS=m +CONFIG_V4L2_FWNODE=m +CONFIG_VIDEOBUF_GEN=m +CONFIG_VIDEOBUF_DMA_SG=m +CONFIG_VIDEOBUF_VMALLOC=m +CONFIG_DVB_CORE=m +CONFIG_DVB_MMAP=y +CONFIG_DVB_NET=y +CONFIG_TTPCI_EEPROM=m +CONFIG_DVB_MAX_ADAPTERS=16 +# CONFIG_DVB_DYNAMIC_MINORS is not set +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set +# CONFIG_DVB_ULE_DEBUG is not set + +# +# Media drivers +# +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +CONFIG_USB_VIDEO_CLASS=m +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y +CONFIG_USB_GSPCA=m +CONFIG_USB_M5602=m +CONFIG_USB_STV06XX=m +CONFIG_USB_GL860=m +CONFIG_USB_GSPCA_BENQ=m +CONFIG_USB_GSPCA_CONEX=m +CONFIG_USB_GSPCA_CPIA1=m +CONFIG_USB_GSPCA_DTCS033=m +CONFIG_USB_GSPCA_ETOMS=m +CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m +CONFIG_USB_GSPCA_JL2005BCD=m +CONFIG_USB_GSPCA_KINECT=m +CONFIG_USB_GSPCA_KONICA=m +CONFIG_USB_GSPCA_MARS=m +CONFIG_USB_GSPCA_MR97310A=m +CONFIG_USB_GSPCA_NW80X=m +CONFIG_USB_GSPCA_OV519=m +CONFIG_USB_GSPCA_OV534=m +CONFIG_USB_GSPCA_OV534_9=m +CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m +CONFIG_USB_GSPCA_PAC7311=m +CONFIG_USB_GSPCA_SE401=m +CONFIG_USB_GSPCA_SN9C2028=m +CONFIG_USB_GSPCA_SN9C20X=m +CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SPCA1528=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_SQ930X=m +CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_STK1135=m +CONFIG_USB_GSPCA_STV0680=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TOPRO=m +CONFIG_USB_GSPCA_TOUPTEK=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_VICAM=m +CONFIG_USB_GSPCA_XIRLINK_CIT=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_USB_PWC=m +# CONFIG_USB_PWC_DEBUG is not set +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_VIDEO_CPIA2=m +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m +CONFIG_VIDEO_USBTV=m + +# +# Analog TV USB devices +# +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set 
+CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_USBVISION=m +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m +CONFIG_VIDEO_GO7007=m +CONFIG_VIDEO_GO7007_USB=m +CONFIG_VIDEO_GO7007_LOADER=m +CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_TM6000=m +CONFIG_VIDEO_TM6000_ALSA=m +CONFIG_VIDEO_TM6000_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_CXUSB_ANALOG=y +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_V2=m +CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_AF9035=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_USB_DRV=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_AS102=m + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m +CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +CONFIG_USB_AIRSPY=m +CONFIG_USB_HACKRF=m +CONFIG_USB_MSI2500=m + +# +# USB HDMI CEC adapters +# +CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m +CONFIG_MEDIA_PCI_SUPPORT=y + +# +# Media capture support +# +CONFIG_VIDEO_MEYE=m +CONFIG_VIDEO_SOLO6X10=m +CONFIG_VIDEO_TW5864=m +CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m + +# +# Media capture/analog TV support +# +CONFIG_VIDEO_IVTV=m +# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set +CONFIG_VIDEO_IVTV_ALSA=m +CONFIG_VIDEO_FB_IVTV=m +# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set +CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DT3155=m + +# +# Media capture/analog/hybrid TV support +# +CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_CX18_ALSA=m +CONFIG_VIDEO_CX23885=m +CONFIG_MEDIA_ALTERA_CI=m +CONFIG_VIDEO_CX25821=m +CONFIG_VIDEO_CX25821_ALSA=m +CONFIG_VIDEO_CX88=m +CONFIG_VIDEO_CX88_ALSA=m +CONFIG_VIDEO_CX88_BLACKBIRD=m +CONFIG_VIDEO_CX88_DVB=m +CONFIG_VIDEO_CX88_ENABLE_VP3054=y +CONFIG_VIDEO_CX88_VP3054=m +CONFIG_VIDEO_CX88_MPEG=m +CONFIG_VIDEO_BT848=m +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m +CONFIG_VIDEO_SAA7134_RC=y +CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_GO7007=m +CONFIG_VIDEO_SAA7164=m + +# +# Media digital TV PCI Adapters +# +CONFIG_DVB_AV7110_IR=y +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m 
+CONFIG_DVB_B2C2_FLEXCOP_PCI=m +# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set +CONFIG_DVB_PLUTO2=m +CONFIG_DVB_DM1105=m +CONFIG_DVB_PT1=m +CONFIG_DVB_PT3=m +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m +CONFIG_DVB_NGENE=m +CONFIG_DVB_DDBRIDGE=m +# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set +CONFIG_DVB_SMIPCIE=m +CONFIG_DVB_NETUP_UNIDVB=m +CONFIG_VIDEO_IPU3_CIO2=m +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CAFE_CCIC=m +CONFIG_VIDEO_CADENCE=y +CONFIG_VIDEO_CADENCE_CSI2RX=m +CONFIG_VIDEO_CADENCE_CSI2TX=m +CONFIG_VIDEO_ASPEED=m +CONFIG_VIDEO_MUX=m +CONFIG_VIDEO_XILINX=m +CONFIG_VIDEO_XILINX_TPG=m +CONFIG_VIDEO_XILINX_VTC=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m +CONFIG_VIDEO_SH_VEU=m +CONFIG_V4L_TEST_DRIVERS=y +CONFIG_VIDEO_VIMC=m +CONFIG_VIDEO_VIVID=m +CONFIG_VIDEO_VIVID_CEC=y +CONFIG_VIDEO_VIVID_MAX_DEVS=64 +CONFIG_VIDEO_VIM2M=m +CONFIG_VIDEO_VICODEC=m +CONFIG_DVB_PLATFORM_DRIVERS=y +CONFIG_CEC_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CROS_EC_CEC=m +CONFIG_VIDEO_SECO_CEC=m +CONFIG_VIDEO_SECO_RC=y +CONFIG_SDR_PLATFORM_DRIVERS=y + +# +# Supported MMC/SDIO adapters +# +CONFIG_SMS_SDIO_DRV=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_TEA575X=m +CONFIG_RADIO_SI470X=m +CONFIG_USB_SI470X=m +CONFIG_I2C_SI470X=m +CONFIG_RADIO_SI4713=m +CONFIG_USB_SI4713=m +CONFIG_PLATFORM_SI4713=m +CONFIG_I2C_SI4713=m +CONFIG_RADIO_SI476X=m +CONFIG_USB_MR800=m +CONFIG_USB_DSBR=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_SHARK=m +CONFIG_RADIO_SHARK2=m +CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m +CONFIG_USB_MA901=m +CONFIG_RADIO_TEA5764=m +CONFIG_RADIO_SAA7706H=m +CONFIG_RADIO_TEF6862=m +CONFIG_RADIO_WL1273=m + +# +# Texas Instruments WL128x FM driver (ST based) +# +CONFIG_RADIO_WL128X=m +# end of Texas Instruments WL128x FM driver (ST based) + +# +# Supported FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_INPUT=y +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_V4L2=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +CONFIG_VIDEOBUF2_DMA_SG=m +CONFIG_VIDEOBUF2_DVB=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set +CONFIG_VIDEO_V4L2_TPG=m + +# +# Media ancillary drivers (tuners, sensors, i2c, spi, frontends) +# +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y +CONFIG_MEDIA_ATTACH=y +CONFIG_VIDEO_IR_I2C=m + +# +# I2C Encoders, decoders, sensors and other helper chips +# + +# +# Audio decoders, processors and mixers +# +CONFIG_VIDEO_TVAUDIO=m +CONFIG_VIDEO_TDA7432=m +CONFIG_VIDEO_TDA9840=m +CONFIG_VIDEO_TDA1997X=m +CONFIG_VIDEO_TEA6415C=m +CONFIG_VIDEO_TEA6420=m +CONFIG_VIDEO_MSP3400=m +CONFIG_VIDEO_CS3308=m +CONFIG_VIDEO_CS5345=m +CONFIG_VIDEO_CS53L32A=m +CONFIG_VIDEO_TLV320AIC23B=m +CONFIG_VIDEO_UDA1342=m +CONFIG_VIDEO_WM8775=m +CONFIG_VIDEO_WM8739=m +CONFIG_VIDEO_VP27SMPX=m +CONFIG_VIDEO_SONY_BTF_MPX=m + +# +# RDS decoders +# +CONFIG_VIDEO_SAA6588=m + +# +# Video decoders +# +CONFIG_VIDEO_ADV7180=m +CONFIG_VIDEO_ADV7183=m +CONFIG_VIDEO_ADV748X=m +CONFIG_VIDEO_ADV7604=m +CONFIG_VIDEO_ADV7604_CEC=y +CONFIG_VIDEO_ADV7842=m +CONFIG_VIDEO_ADV7842_CEC=y +CONFIG_VIDEO_BT819=m +CONFIG_VIDEO_BT856=m +CONFIG_VIDEO_BT866=m +CONFIG_VIDEO_KS0127=m +CONFIG_VIDEO_ML86V7667=m +CONFIG_VIDEO_SAA7110=m +CONFIG_VIDEO_SAA711X=m +CONFIG_VIDEO_TC358743=m +CONFIG_VIDEO_TC358743_CEC=y +CONFIG_VIDEO_TVP514X=m 
+CONFIG_VIDEO_TVP5150=m +CONFIG_VIDEO_TVP7002=m +CONFIG_VIDEO_TW2804=m +CONFIG_VIDEO_TW9903=m +CONFIG_VIDEO_TW9906=m +CONFIG_VIDEO_TW9910=m +CONFIG_VIDEO_VPX3220=m + +# +# Video and audio decoders +# +CONFIG_VIDEO_SAA717X=m +CONFIG_VIDEO_CX25840=m + +# +# Video encoders +# +CONFIG_VIDEO_SAA7127=m +CONFIG_VIDEO_SAA7185=m +CONFIG_VIDEO_ADV7170=m +CONFIG_VIDEO_ADV7175=m +CONFIG_VIDEO_ADV7343=m +CONFIG_VIDEO_ADV7393=m +CONFIG_VIDEO_AD9389B=m +CONFIG_VIDEO_AK881X=m +CONFIG_VIDEO_THS8200=m + +# +# Camera sensor devices +# +CONFIG_VIDEO_APTINA_PLL=m +CONFIG_VIDEO_SMIAPP_PLL=m +CONFIG_VIDEO_IMX214=m +CONFIG_VIDEO_IMX258=m +CONFIG_VIDEO_IMX274=m +CONFIG_VIDEO_IMX319=m +CONFIG_VIDEO_IMX355=m +CONFIG_VIDEO_OV2640=m +CONFIG_VIDEO_OV2659=m +CONFIG_VIDEO_OV2680=m +CONFIG_VIDEO_OV2685=m +CONFIG_VIDEO_OV5640=m +CONFIG_VIDEO_OV5645=m +CONFIG_VIDEO_OV5647=m +CONFIG_VIDEO_OV6650=m +CONFIG_VIDEO_OV5670=m +CONFIG_VIDEO_OV5675=m +CONFIG_VIDEO_OV5695=m +CONFIG_VIDEO_OV7251=m +CONFIG_VIDEO_OV772X=m +CONFIG_VIDEO_OV7640=m +CONFIG_VIDEO_OV7670=m +CONFIG_VIDEO_OV7740=m +CONFIG_VIDEO_OV8856=m +CONFIG_VIDEO_OV9640=m +CONFIG_VIDEO_OV9650=m +CONFIG_VIDEO_OV13858=m +CONFIG_VIDEO_VS6624=m +CONFIG_VIDEO_MT9M001=m +CONFIG_VIDEO_MT9M032=m +CONFIG_VIDEO_MT9M111=m +CONFIG_VIDEO_MT9P031=m +CONFIG_VIDEO_MT9T001=m +CONFIG_VIDEO_MT9T112=m +CONFIG_VIDEO_MT9V011=m +CONFIG_VIDEO_MT9V032=m +CONFIG_VIDEO_MT9V111=m +CONFIG_VIDEO_SR030PC30=m +CONFIG_VIDEO_NOON010PC30=m +CONFIG_VIDEO_M5MOLS=m +CONFIG_VIDEO_RJ54N1=m +CONFIG_VIDEO_S5K6AA=m +CONFIG_VIDEO_S5K6A3=m +CONFIG_VIDEO_S5K4ECGX=m +CONFIG_VIDEO_S5K5BAF=m +CONFIG_VIDEO_SMIAPP=m +CONFIG_VIDEO_ET8EK8=m +CONFIG_VIDEO_S5C73M3=m + +# +# Lens drivers +# +CONFIG_VIDEO_AD5820=m +CONFIG_VIDEO_AK7375=m +CONFIG_VIDEO_DW9714=m +CONFIG_VIDEO_DW9807_VCM=m + +# +# Flash devices +# +CONFIG_VIDEO_ADP1653=m +CONFIG_VIDEO_LM3560=m +CONFIG_VIDEO_LM3646=m + +# +# Video improvement chips +# +CONFIG_VIDEO_UPD64031A=m +CONFIG_VIDEO_UPD64083=m + +# +# Audio/Video compression chips +# +CONFIG_VIDEO_SAA6752HS=m + +# +# SDR tuner chips +# +CONFIG_SDR_MAX2175=m + +# +# Miscellaneous helper chips +# +CONFIG_VIDEO_THS7303=m +CONFIG_VIDEO_M52790=m +CONFIG_VIDEO_I2C=m +CONFIG_VIDEO_ST_MIPID02=m +# end of I2C Encoders, decoders, sensors and other helper chips + +# +# SPI helper chips +# +CONFIG_VIDEO_GS1662=m +# end of SPI helper chips + +# +# Media SPI Adapters +# +CONFIG_CXD2880_SPI_DRV=m +# end of Media SPI Adapters + +CONFIG_MEDIA_TUNER=m + +# +# Customize TV tuners +# +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA18250=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_MSI001=m +CONFIG_MEDIA_TUNER_MT20XX=m +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m +CONFIG_MEDIA_TUNER_MT2266=m +CONFIG_MEDIA_TUNER_MT2131=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_XC2028=m +CONFIG_MEDIA_TUNER_XC5000=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88RS6000T=m +CONFIG_MEDIA_TUNER_TUA9001=m +CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_IT913X=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_MXL301RF=m 
+CONFIG_MEDIA_TUNER_QM1D1C0042=m +CONFIG_MEDIA_TUNER_QM1D1B0004=m +# end of Customize TV tuners + +# +# Customise DVB Frontends +# + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV0910=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_STV6111=m +CONFIG_DVB_MXL5XX=m +CONFIG_DVB_M88DS3103=m + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_TDA18271C2DD=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m + +# +# DVB-S (satellite) frontends +# +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m +CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_CX24117=m +CONFIG_DVB_CX24120=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_TS2020=m +CONFIG_DVB_DS3000=m +CONFIG_DVB_MB86A16=m +CONFIG_DVB_TDA10071=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m +CONFIG_DVB_S5H1432=m +CONFIG_DVB_DRXD=m +CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_DIB9000=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m +CONFIG_DVB_STV0367=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +CONFIG_DVB_SI2168=m +CONFIG_DVB_AS102_FE=m +CONFIG_DVB_ZD1301_DEMOD=m +CONFIG_DVB_GP8PSK_FE=m +CONFIG_DVB_CXD2880=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_S921=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +CONFIG_DVB_TC90522=m +CONFIG_DVB_MN88443X=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_DRX39XYJ=m +CONFIG_DVB_LNBH25=m +CONFIG_DVB_LNBH29=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_LNBP22=m +CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_A8293=m +CONFIG_DVB_LGS8GL5=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DVB_ATBM8830=m +CONFIG_DVB_TDA665x=m +CONFIG_DVB_IX2505V=m +CONFIG_DVB_M88RS2000=m +CONFIG_DVB_AF9033=m +CONFIG_DVB_HORUS3A=m +CONFIG_DVB_ASCOT2E=m +CONFIG_DVB_HELENE=m + +# +# Common Interface (EN50221) controller drivers +# +CONFIG_DVB_CXD2099=m +CONFIG_DVB_SP2=m + +# +# Tools to develop new frontends +# +CONFIG_DVB_DUMMY_FE=m +# end of Customise DVB Frontends + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +CONFIG_INTEL_GTT=m +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=10 
+CONFIG_VGA_SWITCHEROO=y +CONFIG_DRM=m +CONFIG_DRM_MIPI_DBI=m +CONFIG_DRM_MIPI_DSI=y +CONFIG_DRM_DP_AUX_CHARDEV=y +# CONFIG_DRM_DEBUG_SELFTEST is not set +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_KMS_FB_HELPER=y +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set +CONFIG_DRM_LOAD_EDID_FIRMWARE=y +CONFIG_DRM_DP_CEC=y +CONFIG_DRM_TTM=m +CONFIG_DRM_VRAM_HELPER=m +CONFIG_DRM_GEM_CMA_HELPER=y +CONFIG_DRM_KMS_CMA_HELPER=y +CONFIG_DRM_GEM_SHMEM_HELPER=y +CONFIG_DRM_SCHED=m + +# +# I2C encoder or helper chips +# +CONFIG_DRM_I2C_CH7006=m +CONFIG_DRM_I2C_SIL164=m +CONFIG_DRM_I2C_NXP_TDA998X=m +CONFIG_DRM_I2C_NXP_TDA9950=m +# end of I2C encoder or helper chips + +# +# ARM devices +# +CONFIG_DRM_KOMEDA=m +# end of ARM devices + +CONFIG_DRM_RADEON=m +CONFIG_DRM_RADEON_USERPTR=y +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_AMDGPU_USERPTR=y +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set + +# +# ACP (Audio CoProcessor) Configuration +# +CONFIG_DRM_AMD_ACP=y +# end of ACP (Audio CoProcessor) Configuration + +# +# Display Engine Configuration +# +CONFIG_DRM_AMD_DC=y +CONFIG_DRM_AMD_DC_DCN1_0=y +CONFIG_DRM_AMD_DC_DCN2_0=y +CONFIG_DRM_AMD_DC_DCN2_1=y +CONFIG_DRM_AMD_DC_DSC_SUPPORT=y +# CONFIG_DEBUG_KERNEL_DC is not set +# end of Display Engine Configuration + +CONFIG_HSA_AMD=y +CONFIG_DRM_NOUVEAU=m +# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +# CONFIG_NOUVEAU_DEBUG_MMU is not set +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_NOUVEAU_SVM=y +CONFIG_DRM_I915=m +CONFIG_DRM_I915_ALPHA_SUPPORT=y +CONFIG_DRM_I915_FORCE_PROBE="*" +CONFIG_DRM_I915_CAPTURE_ERROR=y +CONFIG_DRM_I915_COMPRESS_ERROR=y +CONFIG_DRM_I915_USERPTR=y +CONFIG_DRM_I915_GVT=y +CONFIG_DRM_I915_GVT_KVMGT=m + +# +# drm/i915 Debugging +# +# CONFIG_DRM_I915_WERROR is not set +# CONFIG_DRM_I915_DEBUG is not set +# CONFIG_DRM_I915_DEBUG_MMIO is not set +# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set +# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set +# CONFIG_DRM_I915_DEBUG_GUC is not set +# CONFIG_DRM_I915_SELFTEST is not set +# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set +# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set +# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set +# end of drm/i915 Debugging + +# +# drm/i915 Profile Guided Optimisation +# +CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 +CONFIG_DRM_I915_SPIN_REQUEST=5 +# end of drm/i915 Profile Guided Optimisation + +CONFIG_DRM_VGEM=m +CONFIG_DRM_VKMS=m +CONFIG_DRM_VMWGFX=m +CONFIG_DRM_VMWGFX_FBCON=y +CONFIG_DRM_GMA500=m +CONFIG_DRM_GMA600=y +CONFIG_DRM_GMA3600=y +CONFIG_DRM_UDL=m +CONFIG_DRM_AST=m +CONFIG_DRM_MGAG200=m +CONFIG_DRM_CIRRUS_QEMU=m +CONFIG_DRM_RCAR_DW_HDMI=m +CONFIG_DRM_RCAR_LVDS=m +CONFIG_DRM_QXL=m +CONFIG_DRM_BOCHS=m +CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +CONFIG_DRM_PANEL_ARM_VERSATILE=m +CONFIG_DRM_PANEL_LVDS=m +CONFIG_DRM_PANEL_SIMPLE=m +CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m +CONFIG_DRM_PANEL_ILITEK_IL9322=m +CONFIG_DRM_PANEL_ILITEK_ILI9881C=m +CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m +CONFIG_DRM_PANEL_JDI_LT070ME05000=m +CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m +CONFIG_DRM_PANEL_SAMSUNG_LD9040=m +CONFIG_DRM_PANEL_LG_LB035Q02=m +CONFIG_DRM_PANEL_LG_LG4573=m +CONFIG_DRM_PANEL_NEC_NL8048HL11=m +CONFIG_DRM_PANEL_NOVATEK_NT39016=m +CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m +CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m +CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m +CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m 
+CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m +CONFIG_DRM_PANEL_RAYDIUM_RM67191=m +CONFIG_DRM_PANEL_RAYDIUM_RM68200=m +CONFIG_DRM_PANEL_ROCKTECH_JH057N00900=m +CONFIG_DRM_PANEL_RONBO_RB070D30=m +CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m +CONFIG_DRM_PANEL_SEIKO_43WVF1G=m +CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m +CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m +CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m +CONFIG_DRM_PANEL_SITRONIX_ST7701=m +CONFIG_DRM_PANEL_SITRONIX_ST7789V=m +CONFIG_DRM_PANEL_SONY_ACX565AKM=m +CONFIG_DRM_PANEL_TPO_TD028TTEC1=m +CONFIG_DRM_PANEL_TPO_TD043MTEA1=m +CONFIG_DRM_PANEL_TPO_TPG110=m +CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m +# end of Display Panels + +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# Display Interface Bridges +# +CONFIG_DRM_ANALOGIX_ANX78XX=m +CONFIG_DRM_CDNS_DSI=m +CONFIG_DRM_DUMB_VGA_DAC=m +CONFIG_DRM_LVDS_ENCODER=m +CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m +CONFIG_DRM_NXP_PTN3460=m +CONFIG_DRM_PARADE_PS8622=m +CONFIG_DRM_SIL_SII8620=m +CONFIG_DRM_SII902X=m +CONFIG_DRM_SII9234=m +CONFIG_DRM_THINE_THC63LVD1024=m +CONFIG_DRM_TOSHIBA_TC358764=m +CONFIG_DRM_TOSHIBA_TC358767=m +CONFIG_DRM_TI_TFP410=m +CONFIG_DRM_TI_SN65DSI86=m +CONFIG_DRM_I2C_ADV7511=m +CONFIG_DRM_I2C_ADV7511_AUDIO=y +CONFIG_DRM_I2C_ADV7533=y +CONFIG_DRM_I2C_ADV7511_CEC=y +CONFIG_DRM_DW_HDMI=m +CONFIG_DRM_DW_HDMI_AHB_AUDIO=m +CONFIG_DRM_DW_HDMI_I2S_AUDIO=m +CONFIG_DRM_DW_HDMI_CEC=m +# end of Display Interface Bridges + +# CONFIG_DRM_ETNAVIV is not set +CONFIG_DRM_ARCPGU=m +CONFIG_DRM_MXS=y +CONFIG_DRM_MXSFB=m +CONFIG_DRM_GM12U320=m +CONFIG_TINYDRM_HX8357D=m +CONFIG_TINYDRM_ILI9225=m +CONFIG_TINYDRM_ILI9341=m +CONFIG_TINYDRM_MI0283QT=m +CONFIG_TINYDRM_REPAPER=m +CONFIG_TINYDRM_ST7586=m +CONFIG_TINYDRM_ST7735R=m +CONFIG_DRM_XEN=y +CONFIG_DRM_XEN_FRONTEND=m +CONFIG_DRM_VBOXVIDEO=m +# CONFIG_DRM_LEGACY is not set +CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y + +# +# Frame buffer Devices +# +CONFIG_FB_CMDLINE=y +CONFIG_FB_NOTIFY=y +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_BOOT_VESA_SUPPORT=y +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +# CONFIG_FB_FOREIGN_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +CONFIG_FB_BACKLIGHT=m +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +CONFIG_FB_VESA=y +CONFIG_FB_EFI=y +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_INTEL is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_VIA is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# 
CONFIG_FB_CARMINE is not set +# CONFIG_FB_SM501 is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +CONFIG_XEN_FBDEV_FRONTEND=m +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +CONFIG_FB_HYPERV=m +CONFIG_FB_SIMPLE=y +# CONFIG_FB_SSD1307 is not set +# CONFIG_FB_SM712 is not set +# end of Frame buffer Devices + +# +# Backlight & LCD device support +# +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_L4F00242T03=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI922X=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m +CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m +CONFIG_LCD_AMS369FG06=m +CONFIG_LCD_LMS501KF03=m +CONFIG_LCD_HX8357=m +CONFIG_LCD_OTM3225A=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_BACKLIGHT_GENERIC=m +CONFIG_BACKLIGHT_LM3533=m +CONFIG_BACKLIGHT_PWM=m +CONFIG_BACKLIGHT_DA903X=m +CONFIG_BACKLIGHT_DA9052=m +CONFIG_BACKLIGHT_MAX8925=m +CONFIG_BACKLIGHT_APPLE=m +CONFIG_BACKLIGHT_PM8941_WLED=m +CONFIG_BACKLIGHT_SAHARA=m +CONFIG_BACKLIGHT_WM831X=m +CONFIG_BACKLIGHT_ADP5520=m +CONFIG_BACKLIGHT_ADP8860=m +CONFIG_BACKLIGHT_ADP8870=m +CONFIG_BACKLIGHT_88PM860X=m +CONFIG_BACKLIGHT_PCF50633=m +CONFIG_BACKLIGHT_AAT2870=m +CONFIG_BACKLIGHT_LM3630A=m +CONFIG_BACKLIGHT_LM3639=m +CONFIG_BACKLIGHT_LP855X=m +CONFIG_BACKLIGHT_LP8788=m +CONFIG_BACKLIGHT_PANDORA=m +CONFIG_BACKLIGHT_SKY81452=m +CONFIG_BACKLIGHT_TPS65217=m +CONFIG_BACKLIGHT_AS3711=m +CONFIG_BACKLIGHT_GPIO=m +CONFIG_BACKLIGHT_LV5207LP=m +CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m +CONFIG_BACKLIGHT_RAVE_SP=m +# end of Backlight & LCD device support + +CONFIG_VIDEOMODE_HELPERS=y +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 +# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y +# end of Console display driver support + +# CONFIG_LOGO is not set +# end of Graphics support + +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_SEQ_DEVICE=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=m +CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_PCM_TIMER=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +# CONFIG_SND_SUPPORT_OLD_API is not set +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +CONFIG_SND_VERBOSE_PRINTK=y +CONFIG_SND_DEBUG=y +# CONFIG_SND_DEBUG_VERBOSE is not set +# CONFIG_SND_PCM_XRUN_DEBUG is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_SEQ_MIDI_EVENT=m +CONFIG_SND_SEQ_MIDI=m +CONFIG_SND_SEQ_MIDI_EMUL=m +CONFIG_SND_SEQ_VIRMIDI=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL3_LIB_SEQ=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +# CONFIG_SND_PCSP is not set +CONFIG_SND_DUMMY=m +CONFIG_SND_ALOOP=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m 
+CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ASIHPI=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +# CONFIG_SND_BT87X_OVERCLOCK is not set +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m +CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m +CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_ES1968_INPUT=y +CONFIG_SND_ES1968_RADIO=y +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X_BOOL=y +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LOLA=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MAESTRO3_INPUT=y +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m + +# +# HD-Audio +# +CONFIG_SND_HDA=m +CONFIG_SND_HDA_INTEL=m +# CONFIG_SND_HDA_INTEL_DETECT_DMIC is not set +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_RECONFIG=y +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_BEEP_MODE=1 +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_ANALOG=m +CONFIG_SND_HDA_CODEC_SIGMATEL=m +CONFIG_SND_HDA_CODEC_VIA=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_HDA_CODEC_CIRRUS=m +CONFIG_SND_HDA_CODEC_CONEXANT=m +CONFIG_SND_HDA_CODEC_CA0110=m +CONFIG_SND_HDA_CODEC_CA0132=m +CONFIG_SND_HDA_CODEC_CA0132_DSP=y +CONFIG_SND_HDA_CODEC_CMEDIA=m +CONFIG_SND_HDA_CODEC_SI3054=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +# end of HD-Audio + +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_COMPONENT=y +CONFIG_SND_HDA_I915=y +CONFIG_SND_HDA_EXT_CORE=m +CONFIG_SND_HDA_PREALLOC_SIZE=4096 +CONFIG_SND_INTEL_NHLT=m +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y +CONFIG_SND_USB_US122L=m +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_FIREWIRE=y +CONFIG_SND_FIREWIRE_LIB=m +CONFIG_SND_DICE=m +CONFIG_SND_OXFW=m +CONFIG_SND_ISIGHT=m +CONFIG_SND_FIREWORKS=m +CONFIG_SND_BEBOB=m +CONFIG_SND_FIREWIRE_DIGI00X=m +CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_AC97_BUS=y 
+CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +CONFIG_SND_SOC_TOPOLOGY=y +CONFIG_SND_SOC_ACPI=m +CONFIG_SND_SOC_AMD_ACP=m +CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m +CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m +CONFIG_SND_SOC_AMD_ACP3x=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_SOC_MIKROE_PROTO=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +# CONFIG_SND_SOC_FSL_ASRC is not set +# CONFIG_SND_SOC_FSL_SAI is not set +# CONFIG_SND_SOC_FSL_AUDMIX is not set +# CONFIG_SND_SOC_FSL_SSI is not set +# CONFIG_SND_SOC_FSL_SPDIF is not set +# CONFIG_SND_SOC_FSL_ESAI is not set +# CONFIG_SND_SOC_FSL_MICFIL is not set +# CONFIG_SND_SOC_IMX_AUDMUX is not set +# end of SoC Audio for Freescale CPUs + +CONFIG_SND_I2S_HI6210_I2S=m +CONFIG_SND_SOC_IMG=y +CONFIG_SND_SOC_IMG_I2S_IN=m +CONFIG_SND_SOC_IMG_I2S_OUT=m +CONFIG_SND_SOC_IMG_PARALLEL_OUT=m +CONFIG_SND_SOC_IMG_SPDIF_IN=m +CONFIG_SND_SOC_IMG_SPDIF_OUT=m +CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m +CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y +CONFIG_SND_SST_IPC=m +CONFIG_SND_SST_IPC_PCI=m +CONFIG_SND_SST_IPC_ACPI=m +CONFIG_SND_SOC_INTEL_SST_ACPI=m +CONFIG_SND_SOC_INTEL_SST=m +CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m +CONFIG_SND_SOC_INTEL_HASWELL=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m +# CONFIG_SND_SOC_INTEL_SKYLAKE is not set +CONFIG_SND_SOC_INTEL_SKL=m +CONFIG_SND_SOC_INTEL_APL=m +CONFIG_SND_SOC_INTEL_KBL=m +CONFIG_SND_SOC_INTEL_GLK=m +# CONFIG_SND_SOC_INTEL_CNL is not set +# CONFIG_SND_SOC_INTEL_CFL is not set +# CONFIG_SND_SOC_INTEL_CML_H is not set +# CONFIG_SND_SOC_INTEL_CML_LP is not set +CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m +CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m +# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set +CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m +CONFIG_SND_SOC_ACPI_INTEL_MATCH=m +CONFIG_SND_SOC_INTEL_MACH=y +CONFIG_SND_SOC_INTEL_HASWELL_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m +CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m +# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set +CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m +CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m +CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m +CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_MTK_BTCVSD=m +CONFIG_SND_SOC_SOF_TOPLEVEL=y +CONFIG_SND_SOC_SOF_PCI=m +CONFIG_SND_SOC_SOF_ACPI=m +CONFIG_SND_SOC_SOF_OF=m +CONFIG_SND_SOC_SOF_OPTIONS=m +# CONFIG_SND_SOC_SOF_NOCODEC_SUPPORT is not set +# CONFIG_SND_SOC_SOF_STRICT_ABI_CHECKS is not set +# 
CONFIG_SND_SOC_SOF_DEBUG is not set +CONFIG_SND_SOC_SOF=m +CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y +CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y +CONFIG_SND_SOC_SOF_INTEL_ACPI=m +CONFIG_SND_SOC_SOF_INTEL_PCI=m +CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m +CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m +CONFIG_SND_SOC_SOF_INTEL_COMMON=m +# CONFIG_SND_SOC_SOF_BAYTRAIL_SUPPORT is not set +CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y +CONFIG_SND_SOC_SOF_MERRIFIELD=m +CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_APOLLOLAKE=m +CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_GEMINILAKE=m +CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_CANNONLAKE=m +CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_COFFEELAKE=m +CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ICELAKE=m +CONFIG_SND_SOC_SOF_COMETLAKE_LP=m +CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y +CONFIG_SND_SOC_SOF_COMETLAKE_H=m +CONFIG_SND_SOC_SOF_COMETLAKE_H_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE=m +CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ELKHARTLAKE=m +CONFIG_SND_SOC_SOF_HDA_COMMON=m +CONFIG_SND_SOC_SOF_HDA_LINK=y +CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y +# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set +CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m +CONFIG_SND_SOC_SOF_HDA=m +CONFIG_SND_SOC_SOF_XTENSA=m + +# +# STMicroelectronics STM32 SOC audio support +# +# end of STMicroelectronics STM32 SOC audio support + +CONFIG_SND_SOC_XILINX_I2S=m +CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m +CONFIG_SND_SOC_XILINX_SPDIF=m +CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m +CONFIG_SND_SOC_I2C_AND_SPI=m + +# +# CODEC drivers +# +CONFIG_SND_SOC_AC97_CODEC=m +CONFIG_SND_SOC_ADAU_UTILS=m +CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m +CONFIG_SND_SOC_ADAU7002=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4118=m +CONFIG_SND_SOC_AK4458=m +CONFIG_SND_SOC_AK4554=m +CONFIG_SND_SOC_AK4613=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK5386=m +CONFIG_SND_SOC_AK5558=m +CONFIG_SND_SOC_ALC5623=m +CONFIG_SND_SOC_BD28623=m +# CONFIG_SND_SOC_BT_SCO is not set +CONFIG_SND_SOC_CPCAP=m +CONFIG_SND_SOC_CROS_EC_CODEC=m +CONFIG_SND_SOC_CS35L32=m +CONFIG_SND_SOC_CS35L33=m +CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m +CONFIG_SND_SOC_CS35L36=m +CONFIG_SND_SOC_CS42L42=m +CONFIG_SND_SOC_CS42L51=m +CONFIG_SND_SOC_CS42L51_I2C=m +CONFIG_SND_SOC_CS42L52=m +CONFIG_SND_SOC_CS42L56=m +CONFIG_SND_SOC_CS42L73=m +CONFIG_SND_SOC_CS4265=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +CONFIG_SND_SOC_CS4271_SPI=m +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +CONFIG_SND_SOC_CS43130=m +CONFIG_SND_SOC_CS4341=m +CONFIG_SND_SOC_CS4349=m +CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_CX2072X=m +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m +CONFIG_SND_SOC_ES7241=m +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8328=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_GTM601=m +CONFIG_SND_SOC_HDAC_HDMI=m +CONFIG_SND_SOC_HDAC_HDA=m +CONFIG_SND_SOC_INNO_RK3036=m +CONFIG_SND_SOC_LOCHNAGAR_SC=m +CONFIG_SND_SOC_MAX98088=m +CONFIG_SND_SOC_MAX98090=m +CONFIG_SND_SOC_MAX98357A=m +CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX9867=m +CONFIG_SND_SOC_MAX98927=m +CONFIG_SND_SOC_MAX98373=m +CONFIG_SND_SOC_MAX9860=m +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m 
+CONFIG_SND_SOC_PCM1681=m +CONFIG_SND_SOC_PCM1789=m +CONFIG_SND_SOC_PCM1789_I2C=m +CONFIG_SND_SOC_PCM179X=m +CONFIG_SND_SOC_PCM179X_I2C=m +CONFIG_SND_SOC_PCM179X_SPI=m +CONFIG_SND_SOC_PCM186X=m +CONFIG_SND_SOC_PCM186X_I2C=m +CONFIG_SND_SOC_PCM186X_SPI=m +CONFIG_SND_SOC_PCM3060=m +CONFIG_SND_SOC_PCM3060_I2C=m +CONFIG_SND_SOC_PCM3060_SPI=m +CONFIG_SND_SOC_PCM3168A=m +CONFIG_SND_SOC_PCM3168A_I2C=m +CONFIG_SND_SOC_PCM3168A_SPI=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_RK3328=m +CONFIG_SND_SOC_RL6231=m +CONFIG_SND_SOC_RL6347A=m +CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m +CONFIG_SND_SOC_RT5514=m +CONFIG_SND_SOC_RT5514_SPI=m +CONFIG_SND_SOC_RT5616=m +CONFIG_SND_SOC_RT5631=m +CONFIG_SND_SOC_RT5640=m +CONFIG_SND_SOC_RT5645=m +CONFIG_SND_SOC_RT5651=m +CONFIG_SND_SOC_RT5660=m +CONFIG_SND_SOC_RT5663=m +CONFIG_SND_SOC_RT5670=m +CONFIG_SND_SOC_RT5677=m +CONFIG_SND_SOC_RT5677_SPI=m +CONFIG_SND_SOC_RT5682=m +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SI476X=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m +CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m +CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2305=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_SSM2602_SPI=m +CONFIG_SND_SOC_SSM2602_I2C=m +CONFIG_SND_SOC_SSM4567=m +CONFIG_SND_SOC_STA32X=m +CONFIG_SND_SOC_STA350=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SOC_TAS2552=m +CONFIG_SND_SOC_TAS5086=m +CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m +CONFIG_SND_SOC_TAS6424=m +CONFIG_SND_SOC_TDA7419=m +CONFIG_SND_SOC_TFA9879=m +CONFIG_SND_SOC_TLV320AIC23=m +CONFIG_SND_SOC_TLV320AIC23_I2C=m +CONFIG_SND_SOC_TLV320AIC23_SPI=m +CONFIG_SND_SOC_TLV320AIC31XX=m +CONFIG_SND_SOC_TLV320AIC32X4=m +CONFIG_SND_SOC_TLV320AIC32X4_I2C=m +CONFIG_SND_SOC_TLV320AIC32X4_SPI=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TS3A227E=m +CONFIG_SND_SOC_TSCS42XX=m +CONFIG_SND_SOC_TSCS454=m +CONFIG_SND_SOC_UDA1334=m +CONFIG_SND_SOC_WCD9335=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8524=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8737=m +CONFIG_SND_SOC_WM8741=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8770=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8782=m +CONFIG_SND_SOC_WM8804=m +CONFIG_SND_SOC_WM8804_I2C=m +CONFIG_SND_SOC_WM8804_SPI=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8904=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8962=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_WM8985=m +CONFIG_SND_SOC_ZX_AUD96P22=m +CONFIG_SND_SOC_MAX9759=m +CONFIG_SND_SOC_MT6351=m +CONFIG_SND_SOC_MT6358=m +CONFIG_SND_SOC_NAU8540=m +CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8822=m +CONFIG_SND_SOC_NAU8824=m +CONFIG_SND_SOC_NAU8825=m +CONFIG_SND_SOC_TPA6130A2=m +# end of CODEC drivers + +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_AUDIO_GRAPH_CARD=m +CONFIG_SND_X86=y +CONFIG_HDMI_LPE_AUDIO=m +CONFIG_SND_SYNTH_EMUX=m +CONFIG_SND_XEN_FRONTEND=m +CONFIG_AC97_BUS=m + +# +# HID support +# +CONFIG_HID=m +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=m +CONFIG_HID_GENERIC=m + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m +CONFIG_HID_ACRUX=m +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=m +CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m +CONFIG_HID_AUREAL=m +CONFIG_HID_BELKIN=m +CONFIG_HID_BETOP_FF=m +CONFIG_HID_BIGBEN_FF=m +CONFIG_HID_CHERRY=m +CONFIG_HID_CHICONY=m 
+CONFIG_HID_CORSAIR=m +CONFIG_HID_COUGAR=m +CONFIG_HID_MACALLY=m +CONFIG_HID_PRODIKEYS=m +CONFIG_HID_CMEDIA=m +CONFIG_HID_CP2112=m +CONFIG_HID_CREATIVE_SB0540=m +CONFIG_HID_CYPRESS=m +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=m +CONFIG_HID_ELAN=m +CONFIG_HID_ELECOM=m +CONFIG_HID_ELO=m +CONFIG_HID_EZKEY=m +CONFIG_HID_GEMBIRD=m +CONFIG_HID_GFRM=m +CONFIG_HID_HOLTEK=m +CONFIG_HOLTEK_FF=y +CONFIG_HID_GOOGLE_HAMMER=m +CONFIG_HID_GT683R=m +CONFIG_HID_KEYTOUCH=m +CONFIG_HID_KYE=m +CONFIG_HID_UCLOGIC=m +CONFIG_HID_WALTOP=m +CONFIG_HID_VIEWSONIC=m +CONFIG_HID_GYRATION=m +CONFIG_HID_ICADE=m +CONFIG_HID_ITE=m +CONFIG_HID_JABRA=m +CONFIG_HID_TWINHAN=m +CONFIG_HID_KENSINGTON=m +CONFIG_HID_LCPOWER=m +CONFIG_HID_LED=m +CONFIG_HID_LENOVO=m +CONFIG_HID_LOGITECH=m +CONFIG_HID_LOGITECH_DJ=m +CONFIG_HID_LOGITECH_HIDPP=m +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +CONFIG_HID_MAGICMOUSE=m +CONFIG_HID_MALTRON=m +CONFIG_HID_MAYFLASH=m +CONFIG_HID_REDRAGON=m +CONFIG_HID_MICROSOFT=m +CONFIG_HID_MONTEREY=m +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m +CONFIG_HID_NTRIG=m +CONFIG_HID_ORTEK=m +CONFIG_HID_PANTHERLORD=m +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=m +CONFIG_HID_PETALYNX=m +CONFIG_HID_PICOLCD=m +CONFIG_HID_PICOLCD_FB=y +CONFIG_HID_PICOLCD_BACKLIGHT=y +CONFIG_HID_PICOLCD_LCD=y +CONFIG_HID_PICOLCD_LEDS=y +CONFIG_HID_PICOLCD_CIR=y +CONFIG_HID_PLANTRONICS=m +CONFIG_HID_PRIMAX=m +CONFIG_HID_RETRODE=m +CONFIG_HID_ROCCAT=m +CONFIG_HID_SAITEK=m +CONFIG_HID_SAMSUNG=m +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y +CONFIG_HID_SPEEDLINK=m +CONFIG_HID_STEAM=m +CONFIG_HID_STEELSERIES=m +CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m +CONFIG_HID_GREENASIA=m +CONFIG_GREENASIA_FF=y +CONFIG_HID_HYPERV_MOUSE=m +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=m +CONFIG_HID_TOPSEED=m +CONFIG_HID_THINGM=m +CONFIG_HID_THRUSTMASTER=m +CONFIG_THRUSTMASTER_FF=y +CONFIG_HID_UDRAW_PS3=m +CONFIG_HID_U2FZERO=m +CONFIG_HID_WACOM=m +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=m +CONFIG_HID_ZEROPLUS=m +CONFIG_ZEROPLUS_FF=y +CONFIG_HID_ZYDACRON=m +CONFIG_HID_SENSOR_HUB=m +# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set +CONFIG_HID_ALPS=m +# end of Special HID drivers + +# +# USB HID support +# +CONFIG_USB_HID=m +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y + +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +# end of USB HID Boot Protocol drivers +# end of USB HID support + +# +# I2C HID support +# +CONFIG_I2C_HID=m +# end of I2C HID support + +# +# Intel ISH HID support +# +CONFIG_INTEL_ISH_HID=m +CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m +# end of Intel ISH HID support +# end of HID support + +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +CONFIG_USB_LED_TRIG=y +CONFIG_USB_ULPI_BUS=m +CONFIG_USB_CONN_GPIO=m +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=y +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +CONFIG_USB_DYNAMIC_MINORS=y +# CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_LEDS_TRIGGER_USBPORT=m +CONFIG_USB_AUTOSUSPEND_DELAY=2 +CONFIG_USB_MON=m + +# +# USB Host Controller Drivers +# +CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +# CONFIG_USB_XHCI_DBGCAP is not set +CONFIG_USB_XHCI_PCI=m +CONFIG_USB_XHCI_PLATFORM=m +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_TT_NEWSCHED=y +CONFIG_USB_EHCI_PCI=m +CONFIG_USB_EHCI_FSL=m 
+CONFIG_USB_EHCI_HCD_PLATFORM=m +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_FOTG210_HCD=m +CONFIG_USB_MAX3421_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PCI=m +# CONFIG_USB_OHCI_HCD_SSB is not set +CONFIG_USB_OHCI_HCD_PLATFORM=m +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +# CONFIG_USB_SL811_HCD_ISO is not set +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_HCD_BCMA=m +CONFIG_USB_HCD_SSB=m +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_REALTEK=m +CONFIG_REALTEK_AUTOPM=y +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m +CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m +CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_STORAGE_ENE_UB6250=m +CONFIG_USB_UAS=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USBIP_CORE=m +CONFIG_USBIP_VHCI_HCD=m +CONFIG_USBIP_VHCI_HC_PORTS=8 +CONFIG_USBIP_VHCI_NR_HCS=1 +CONFIG_USBIP_HOST=m +CONFIG_USBIP_VUDC=m +# CONFIG_USBIP_DEBUG is not set +CONFIG_USB_CDNS3=m +CONFIG_USB_CDNS3_GADGET=y +CONFIG_USB_CDNS3_HOST=y +CONFIG_USB_CDNS3_PCI_WRAP=m +CONFIG_USB_MUSB_HDRC=m +# CONFIG_USB_MUSB_HOST is not set +# CONFIG_USB_MUSB_GADGET is not set +CONFIG_USB_MUSB_DUAL_ROLE=y + +# +# Platform Glue Layer +# + +# +# MUSB DMA mode +# +# CONFIG_MUSB_PIO_ONLY is not set +CONFIG_USB_DWC3=m +CONFIG_USB_DWC3_ULPI=y +# CONFIG_USB_DWC3_HOST is not set +# CONFIG_USB_DWC3_GADGET is not set +CONFIG_USB_DWC3_DUAL_ROLE=y + +# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_PCI=m +CONFIG_USB_DWC3_HAPS=m +CONFIG_USB_DWC3_OF_SIMPLE=m +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_HOST is not set + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PERIPHERAL is not set +CONFIG_USB_DWC2_DUAL_ROLE=y +CONFIG_USB_DWC2_PCI=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +CONFIG_USB_CHIPIDEA=m +CONFIG_USB_CHIPIDEA_OF=m +CONFIG_USB_CHIPIDEA_PCI=m +CONFIG_USB_CHIPIDEA_UDC=y +CONFIG_USB_CHIPIDEA_HOST=y +CONFIG_USB_ISP1760=m +CONFIG_USB_ISP1760_HCD=y +CONFIG_USB_ISP1761_UDC=y +# CONFIG_USB_ISP1760_HOST_ROLE is not set +# CONFIG_USB_ISP1760_GADGET_ROLE is not set +CONFIG_USB_ISP1760_DUAL_ROLE=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=y +CONFIG_USB_SERIAL_CONSOLE=y +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_SIMPLE=m +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_F81232=m +CONFIG_USB_SERIAL_F8153X=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m 
+CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_METRO=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7715_PARPORT=y +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MXUPORT=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QCAUX=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_SAFE=m +# CONFIG_USB_SERIAL_SAFE_PADDED is not set +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_WWAN=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_XSENS_MT=m +CONFIG_USB_SERIAL_WISHBONE=m +CONFIG_USB_SERIAL_SSU100=m +CONFIG_USB_SERIAL_QT2=m +CONFIG_USB_SERIAL_UPD78F0730=m +CONFIG_USB_SERIAL_DEBUG=m + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m +CONFIG_USB_SEVSEG=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m +CONFIG_USB_APPLEDISPLAY=m +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +CONFIG_USB_TRANCEVIBRATOR=m +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_EHSET_TEST_FIXTURE=m +CONFIG_USB_ISIGHTFW=m +CONFIG_USB_YUREX=m +CONFIG_USB_EZUSB_FX2=m +CONFIG_USB_HUB_USB251XB=m +CONFIG_USB_HSIC_USB3503=m +CONFIG_USB_HSIC_USB4604=m +CONFIG_USB_LINK_LAYER_TEST=m +CONFIG_USB_CHAOSKEY=m +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y +CONFIG_NOP_USB_XCEIV=m +CONFIG_USB_GPIO_VBUS=m +CONFIG_TAHVO_USB=m +# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set +CONFIG_USB_ISP1301=m +# end of USB Physical Layer drivers + +CONFIG_USB_GADGET=m +# CONFIG_USB_GADGET_DEBUG is not set +# CONFIG_USB_GADGET_DEBUG_FILES is not set +# CONFIG_USB_GADGET_DEBUG_FS is not set +CONFIG_USB_GADGET_VBUS_DRAW=2 +CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 +CONFIG_U_SERIAL_CONSOLE=y + +# +# USB Peripheral Controller +# +CONFIG_USB_FOTG210_UDC=m +CONFIG_USB_GR_UDC=m +CONFIG_USB_R8A66597=m +CONFIG_USB_PXA27X=m +CONFIG_USB_MV_UDC=m +CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m +CONFIG_USB_SNP_UDC_PLAT=m +CONFIG_USB_M66592=m +CONFIG_USB_BDC_UDC=m + +# +# Platform Support +# +CONFIG_USB_BDC_PCI=m +CONFIG_USB_AMD5536UDC=m +CONFIG_USB_NET2272=m +CONFIG_USB_NET2272_DMA=y +CONFIG_USB_NET2280=m +CONFIG_USB_GOKU=m +CONFIG_USB_EG20T=m +CONFIG_USB_GADGET_XILINX=m +CONFIG_USB_DUMMY_HCD=m +# end of USB Peripheral Controller + +CONFIG_USB_LIBCOMPOSITE=m +CONFIG_USB_F_ACM=m +CONFIG_USB_F_SS_LB=m +CONFIG_USB_U_SERIAL=m +CONFIG_USB_U_ETHER=m +CONFIG_USB_U_AUDIO=m +CONFIG_USB_F_SERIAL=m +CONFIG_USB_F_OBEX=m +CONFIG_USB_F_NCM=m +CONFIG_USB_F_ECM=m +CONFIG_USB_F_PHONET=m +CONFIG_USB_F_EEM=m +CONFIG_USB_F_SUBSET=m +CONFIG_USB_F_RNDIS=m +CONFIG_USB_F_MASS_STORAGE=m +CONFIG_USB_F_FS=m +CONFIG_USB_F_UAC1=m +CONFIG_USB_F_UAC1_LEGACY=m +CONFIG_USB_F_UAC2=m +CONFIG_USB_F_UVC=m +CONFIG_USB_F_MIDI=m +CONFIG_USB_F_HID=m +CONFIG_USB_F_PRINTER=m +CONFIG_USB_F_TCM=m +CONFIG_USB_CONFIGFS=m +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_OBEX=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_ECM_SUBSET=y +CONFIG_USB_CONFIGFS_RNDIS=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_PHONET=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_LB_SS=y +CONFIG_USB_CONFIGFS_F_FS=y 
+CONFIG_USB_CONFIGFS_F_UAC1=y +CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_USB_CONFIGFS_F_PRINTER=y +CONFIG_USB_CONFIGFS_F_TCM=y +CONFIG_USB_ZERO=m +CONFIG_USB_AUDIO=m +# CONFIG_GADGET_UAC1 is not set +CONFIG_USB_ETH=m +CONFIG_USB_ETH_RNDIS=y +CONFIG_USB_ETH_EEM=y +CONFIG_USB_G_NCM=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FUNCTIONFS=m +CONFIG_USB_FUNCTIONFS_ETH=y +CONFIG_USB_FUNCTIONFS_RNDIS=y +CONFIG_USB_FUNCTIONFS_GENERIC=y +CONFIG_USB_MASS_STORAGE=m +CONFIG_USB_GADGET_TARGET=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_MIDI_GADGET=m +CONFIG_USB_G_PRINTER=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_USB_G_NOKIA=m +CONFIG_USB_G_ACM_MS=m +CONFIG_USB_G_MULTI=m +CONFIG_USB_G_MULTI_RNDIS=y +CONFIG_USB_G_MULTI_CDC=y +CONFIG_USB_G_HID=m +CONFIG_USB_G_DBGP=m +# CONFIG_USB_G_DBGP_PRINTK is not set +CONFIG_USB_G_DBGP_SERIAL=y +CONFIG_USB_G_WEBCAM=m +CONFIG_TYPEC=m +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_RT1711H=m +CONFIG_TYPEC_FUSB302=m +CONFIG_TYPEC_WCOVE=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_CCG=m +CONFIG_UCSI_ACPI=m +CONFIG_TYPEC_TPS6598X=m + +# +# USB Type-C Multiplexer/DeMultiplexer Switch support +# +CONFIG_TYPEC_MUX_PI3USB30532=m +# end of USB Type-C Multiplexer/DeMultiplexer Switch support + +# +# USB Type-C Alternate Mode drivers +# +CONFIG_TYPEC_DP_ALTMODE=m +CONFIG_TYPEC_NVIDIA_ALTMODE=m +# end of USB Type-C Alternate Mode drivers + +CONFIG_USB_ROLE_SWITCH=m +CONFIG_USB_ROLES_INTEL_XHCI=m +CONFIG_MMC=m +CONFIG_PWRSEQ_EMMC=m +CONFIG_PWRSEQ_SD8787=m +CONFIG_PWRSEQ_SIMPLE=m +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_MINORS=8 +CONFIG_SDIO_UART=m +CONFIG_MMC_TEST=m + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_IO_ACCESSORS=y +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=y +CONFIG_MMC_SDHCI_ACPI=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_SDHCI_OF_ARASAN=m +CONFIG_MMC_SDHCI_OF_ASPEED=m +CONFIG_MMC_SDHCI_OF_AT91=m +CONFIG_MMC_SDHCI_OF_DWCMSHC=m +CONFIG_MMC_SDHCI_CADENCE=m +CONFIG_MMC_SDHCI_F_SDH30=m +CONFIG_MMC_WBSD=m +CONFIG_MMC_ALCOR=m +CONFIG_MMC_TIFM_SD=m +CONFIG_MMC_SPI=m +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MMC_VUB300=m +CONFIG_MMC_USHC=m +CONFIG_MMC_USDHI6ROL0=m +CONFIG_MMC_REALTEK_PCI=m +CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_CQHCI=m +CONFIG_MMC_TOSHIBA_PCI=m +CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m +CONFIG_MMC_SDHCI_OMAP=m +CONFIG_MMC_SDHCI_AM654=m +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m +CONFIG_MS_BLOCK=m + +# +# MemoryStick Host Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_MEMSTICK_R592=m +CONFIG_MEMSTICK_REALTEK_PCI=m +CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_CLASS_FLASH=m +CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y + +# +# LED drivers +# +CONFIG_LEDS_88PM860X=m +CONFIG_LEDS_AAT1290=m +CONFIG_LEDS_AN30259A=m +CONFIG_LEDS_APU=m +CONFIG_LEDS_AS3645A=m +CONFIG_LEDS_BCM6328=m +CONFIG_LEDS_BCM6358=m +CONFIG_LEDS_CPCAP=m +CONFIG_LEDS_CR0014114=m +CONFIG_LEDS_LM3530=m +CONFIG_LEDS_LM3532=m +CONFIG_LEDS_LM3533=m +CONFIG_LEDS_LM3642=m +CONFIG_LEDS_LM3692X=m +CONFIG_LEDS_LM3601X=m +CONFIG_LEDS_MT6323=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_PCA9532_GPIO=y +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_LP3952=m +# CONFIG_LEDS_LP5521 is not set +# CONFIG_LEDS_LP5523 is not set +# 
CONFIG_LEDS_LP5562 is not set +# CONFIG_LEDS_LP8501 is not set +CONFIG_LEDS_LP8788=m +CONFIG_LEDS_LP8860=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +CONFIG_LEDS_PCA955X_GPIO=y +CONFIG_LEDS_PCA963X=m +CONFIG_LEDS_WM831X_STATUS=m +CONFIG_LEDS_WM8350=m +CONFIG_LEDS_DA903X=m +CONFIG_LEDS_DA9052=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_PWM=m +CONFIG_LEDS_REGULATOR=m +CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m +CONFIG_LEDS_ADP5520=m +CONFIG_LEDS_MC13783=m +CONFIG_LEDS_TCA6507=m +CONFIG_LEDS_TLC591XX=m +CONFIG_LEDS_MAX77650=m +CONFIG_LEDS_MAX77693=m +CONFIG_LEDS_MAX8997=m +CONFIG_LEDS_LM355x=m +CONFIG_LEDS_MENF21BMC=m +CONFIG_LEDS_KTD2692=m +CONFIG_LEDS_IS31FL319X=m +CONFIG_LEDS_IS31FL32XX=m + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +CONFIG_LEDS_BLINKM=m +CONFIG_LEDS_SYSCON=y +CONFIG_LEDS_MLXCPLD=m +CONFIG_LEDS_MLXREG=m +CONFIG_LEDS_USER=m +CONFIG_LEDS_NIC78BX=m +CONFIG_LEDS_SPI_BYTE=m +CONFIG_LEDS_TI_LMU_COMMON=m +CONFIG_LEDS_LM3697=m +CONFIG_LEDS_LM36274=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_ONESHOT=m +CONFIG_LEDS_TRIGGER_DISK=y +CONFIG_LEDS_TRIGGER_MTD=y +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_ACTIVITY=m +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=m +CONFIG_LEDS_TRIGGER_CAMERA=m +CONFIG_LEDS_TRIGGER_PANIC=y +CONFIG_LEDS_TRIGGER_NETDEV=m +CONFIG_LEDS_TRIGGER_PATTERN=m +CONFIG_LEDS_TRIGGER_AUDIO=m +CONFIG_ACCESSIBILITY=y +CONFIG_A11Y_BRAILLE_CONSOLE=y +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_QIB=m +CONFIG_INFINIBAND_QIB_DCA=y +CONFIG_INFINIBAND_CXGB3=m +CONFIG_INFINIBAND_CXGB4=m +CONFIG_INFINIBAND_EFA=m +CONFIG_INFINIBAND_I40IW=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_INFINIBAND_OCRDMA=m +CONFIG_INFINIBAND_VMWARE_PVRDMA=m +CONFIG_INFINIBAND_USNIC=m +CONFIG_INFINIBAND_BNXT_RE=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +# CONFIG_SDMA_VERBOSITY is not set +CONFIG_INFINIBAND_QEDR=m +CONFIG_INFINIBAND_RDMAVT=m +CONFIG_RDMA_RXE=m +CONFIG_RDMA_SIW=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_SRPT=m +CONFIG_INFINIBAND_ISER=m +CONFIG_INFINIBAND_ISERT=m +CONFIG_INFINIBAND_OPA_VNIC=m +CONFIG_EDAC_ATOMIC_SCRUB=y +CONFIG_EDAC_SUPPORT=y +CONFIG_EDAC=y +CONFIG_EDAC_LEGACY_SYSFS=y +# CONFIG_EDAC_DEBUG is not set +CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_GHES=y +CONFIG_EDAC_AMD64=m +# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set +CONFIG_EDAC_E752X=m +CONFIG_EDAC_I82975X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m +CONFIG_EDAC_X38=m +CONFIG_EDAC_I5400=m +CONFIG_EDAC_I7CORE=m +CONFIG_EDAC_I5000=m +CONFIG_EDAC_I5100=m +CONFIG_EDAC_I7300=m +CONFIG_EDAC_SBRIDGE=m +CONFIG_EDAC_SKX=m +CONFIG_EDAC_I10NM=m +CONFIG_EDAC_PND2=m +CONFIG_RTC_LIB=y +CONFIG_RTC_MC146818_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y 
+CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_88PM860X=m +CONFIG_RTC_DRV_88PM80X=m +CONFIG_RTC_DRV_ABB5ZES3=m +CONFIG_RTC_DRV_ABEOZ9=m +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_AS3722=m +CONFIG_RTC_DRV_DS1307=m +CONFIG_RTC_DRV_DS1307_CENTURY=y +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1374_WDT=y +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_HYM8563=m +CONFIG_RTC_DRV_LP8788=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_MAX8907=m +CONFIG_RTC_DRV_MAX8925=m +CONFIG_RTC_DRV_MAX8998=m +CONFIG_RTC_DRV_MAX8997=m +CONFIG_RTC_DRV_MAX77686=m +CONFIG_RTC_DRV_RK808=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_ISL12026=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8523=m +CONFIG_RTC_DRV_PCF85063=m +CONFIG_RTC_DRV_PCF85363=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BD70528=m +CONFIG_RTC_DRV_BQ32K=m +CONFIG_RTC_DRV_TWL4030=m +CONFIG_RTC_DRV_PALMAS=m +CONFIG_RTC_DRV_TPS6586X=m +CONFIG_RTC_DRV_TPS65910=m +CONFIG_RTC_DRV_TPS80031=m +CONFIG_RTC_DRV_RC5T583=m +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_RX8025=m +CONFIG_RTC_DRV_EM3027=m +CONFIG_RTC_DRV_RV3028=m +CONFIG_RTC_DRV_RV8803=m +CONFIG_RTC_DRV_S5M=m +CONFIG_RTC_DRV_SD3078=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T93=m +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m +CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1343=m +CONFIG_RTC_DRV_DS1347=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6916=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RX4581=m +CONFIG_RTC_DRV_RX6110=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_PCF2123=m +CONFIG_RTC_DRV_MCP795=m +CONFIG_RTC_I2C_AND_SPI=y + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=y +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1685_FAMILY=m +CONFIG_RTC_DRV_DS1685=y +# CONFIG_RTC_DRV_DS1689 is not set +# CONFIG_RTC_DRV_DS17285 is not set +# CONFIG_RTC_DRV_DS17485 is not set +# CONFIG_RTC_DRV_DS17885 is not set +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_DS2404=m +CONFIG_RTC_DRV_DA9052=m +CONFIG_RTC_DRV_DA9055=m +CONFIG_RTC_DRV_DA9063=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_WM831X=m +CONFIG_RTC_DRV_WM8350=m +CONFIG_RTC_DRV_PCF50633=m +CONFIG_RTC_DRV_AB3100=m +CONFIG_RTC_DRV_ZYNQMP=m +CONFIG_RTC_DRV_CROS_EC=m + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_CADENCE=m +CONFIG_RTC_DRV_FTRTC010=m +CONFIG_RTC_DRV_PCAP=m +CONFIG_RTC_DRV_MC13XXX=m +CONFIG_RTC_DRV_SNVS=m +CONFIG_RTC_DRV_MT6397=m +CONFIG_RTC_DRV_R7301=m +CONFIG_RTC_DRV_CPCAP=m + +# +# HID Sensor RTC drivers +# +CONFIG_RTC_DRV_HID_SENSOR_TIME=m +CONFIG_RTC_DRV_WILCO_EC=m +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_ACPI=y +CONFIG_DMA_OF=y +CONFIG_ALTERA_MSGDMA=m +CONFIG_DW_AXI_DMAC=m +CONFIG_FSL_EDMA=m 
+CONFIG_INTEL_IDMA64=m +CONFIG_INTEL_IOATDMA=m +CONFIG_INTEL_MIC_X100_DMA=m +CONFIG_QCOM_HIDMA_MGMT=m +CONFIG_QCOM_HIDMA=m +CONFIG_DW_DMAC_CORE=y +CONFIG_DW_DMAC=y +CONFIG_DW_DMAC_PCI=y +CONFIG_DW_EDMA=m +CONFIG_DW_EDMA_PCIE=m +CONFIG_HSU_DMA=y + +# +# DMA Clients +# +CONFIG_ASYNC_TX_DMA=y +# CONFIG_DMATEST is not set +CONFIG_DMA_ENGINE_RAID=y + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# CONFIG_SW_SYNC is not set +CONFIG_UDMABUF=y +# CONFIG_DMABUF_SELFTESTS is not set +# end of DMABUF options + +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_HD44780=m +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_IMG_ASCII_LCD=m +CONFIG_HT16K33=m +CONFIG_PARPORT_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set +# CONFIG_CHARLCD_BL_OFF is not set +# CONFIG_CHARLCD_BL_ON is not set +CONFIG_CHARLCD_BL_FLASH=y +CONFIG_PANEL=m +CONFIG_CHARLCD=m +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_UIO_DMEM_GENIRQ=m +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +CONFIG_UIO_PCI_GENERIC=m +CONFIG_UIO_NETX=m +CONFIG_UIO_PRUSS=m +CONFIG_UIO_MF624=m +CONFIG_UIO_HV_GENERIC=m +CONFIG_VFIO_IOMMU_TYPE1=m +CONFIG_VFIO_VIRQFD=m +CONFIG_VFIO=m +# CONFIG_VFIO_NOIOMMU is not set +CONFIG_VFIO_PCI=m +CONFIG_VFIO_PCI_VGA=y +CONFIG_VFIO_PCI_MMAP=y +CONFIG_VFIO_PCI_INTX=y +CONFIG_VFIO_PCI_IGD=y +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VBOXGUEST=m +CONFIG_VIRTIO=y +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_PMEM=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=m +CONFIG_VIRTIO_MMIO=m +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y + +# +# Microsoft Hyper-V guest support +# +CONFIG_HYPERV=m +CONFIG_HYPERV_TIMER=y +CONFIG_HYPERV_UTILS=m +CONFIG_HYPERV_BALLOON=m +# end of Microsoft Hyper-V guest support + +# +# Xen driver support +# +CONFIG_XEN_BALLOON=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 +CONFIG_XEN_SCRUB_PAGES_DEFAULT=y +CONFIG_XEN_DEV_EVTCHN=m +CONFIG_XEN_BACKEND=y +CONFIG_XENFS=m +CONFIG_XEN_COMPAT_XENFS=y +CONFIG_XEN_SYS_HYPERVISOR=y +CONFIG_XEN_XENBUS_FRONTEND=y +CONFIG_XEN_GNTDEV=m +CONFIG_XEN_GNTDEV_DMABUF=y +CONFIG_XEN_GRANT_DEV_ALLOC=m +CONFIG_XEN_GRANT_DMA_ALLOC=y +CONFIG_SWIOTLB_XEN=y +CONFIG_XEN_PCIDEV_BACKEND=m +CONFIG_XEN_PVCALLS_FRONTEND=m +CONFIG_XEN_PVCALLS_BACKEND=y +CONFIG_XEN_SCSI_BACKEND=m +CONFIG_XEN_PRIVCMD=m +CONFIG_XEN_ACPI_PROCESSOR=m +CONFIG_XEN_MCE_LOG=y +CONFIG_XEN_HAVE_PVMMU=y +CONFIG_XEN_EFI=y +CONFIG_XEN_AUTO_XLATE=y +CONFIG_XEN_ACPI=y +CONFIG_XEN_SYMS=y +CONFIG_XEN_HAVE_VPMU=y +CONFIG_XEN_FRONT_PGDIR_SHBUF=m +# end of Xen driver support + +# CONFIG_GREYBUS is not set +CONFIG_STAGING=y +CONFIG_PRISM2_USB=m +CONFIG_COMEDI=m +# CONFIG_COMEDI_DEBUG is not set +CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 +CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 +CONFIG_COMEDI_MISC_DRIVERS=y +CONFIG_COMEDI_BOND=m +CONFIG_COMEDI_TEST=m +CONFIG_COMEDI_PARPORT=m +# CONFIG_COMEDI_ISA_DRIVERS is not set +CONFIG_COMEDI_PCI_DRIVERS=m +CONFIG_COMEDI_8255_PCI=m +CONFIG_COMEDI_ADDI_WATCHDOG=m +CONFIG_COMEDI_ADDI_APCI_1032=m +CONFIG_COMEDI_ADDI_APCI_1500=m +CONFIG_COMEDI_ADDI_APCI_1516=m +CONFIG_COMEDI_ADDI_APCI_1564=m +CONFIG_COMEDI_ADDI_APCI_16XX=m +CONFIG_COMEDI_ADDI_APCI_2032=m +CONFIG_COMEDI_ADDI_APCI_2200=m +CONFIG_COMEDI_ADDI_APCI_3120=m +CONFIG_COMEDI_ADDI_APCI_3501=m +CONFIG_COMEDI_ADDI_APCI_3XXX=m +CONFIG_COMEDI_ADL_PCI6208=m +CONFIG_COMEDI_ADL_PCI7X3X=m 
+CONFIG_COMEDI_ADL_PCI8164=m +CONFIG_COMEDI_ADL_PCI9111=m +CONFIG_COMEDI_ADL_PCI9118=m +CONFIG_COMEDI_ADV_PCI1710=m +CONFIG_COMEDI_ADV_PCI1720=m +CONFIG_COMEDI_ADV_PCI1723=m +CONFIG_COMEDI_ADV_PCI1724=m +CONFIG_COMEDI_ADV_PCI1760=m +CONFIG_COMEDI_ADV_PCI_DIO=m +CONFIG_COMEDI_AMPLC_DIO200_PCI=m +CONFIG_COMEDI_AMPLC_PC236_PCI=m +CONFIG_COMEDI_AMPLC_PC263_PCI=m +CONFIG_COMEDI_AMPLC_PCI224=m +CONFIG_COMEDI_AMPLC_PCI230=m +CONFIG_COMEDI_CONTEC_PCI_DIO=m +CONFIG_COMEDI_DAS08_PCI=m +CONFIG_COMEDI_DT3000=m +CONFIG_COMEDI_DYNA_PCI10XX=m +CONFIG_COMEDI_GSC_HPDI=m +CONFIG_COMEDI_MF6X4=m +CONFIG_COMEDI_ICP_MULTI=m +CONFIG_COMEDI_DAQBOARD2000=m +CONFIG_COMEDI_JR3_PCI=m +CONFIG_COMEDI_KE_COUNTER=m +CONFIG_COMEDI_CB_PCIDAS64=m +CONFIG_COMEDI_CB_PCIDAS=m +CONFIG_COMEDI_CB_PCIDDA=m +CONFIG_COMEDI_CB_PCIMDAS=m +CONFIG_COMEDI_CB_PCIMDDA=m +CONFIG_COMEDI_ME4000=m +CONFIG_COMEDI_ME_DAQ=m +CONFIG_COMEDI_NI_6527=m +CONFIG_COMEDI_NI_65XX=m +CONFIG_COMEDI_NI_660X=m +CONFIG_COMEDI_NI_670X=m +CONFIG_COMEDI_NI_LABPC_PCI=m +CONFIG_COMEDI_NI_PCIDIO=m +CONFIG_COMEDI_NI_PCIMIO=m +CONFIG_COMEDI_RTD520=m +CONFIG_COMEDI_S626=m +CONFIG_COMEDI_MITE=m +CONFIG_COMEDI_NI_TIOCMD=m +CONFIG_COMEDI_PCMCIA_DRIVERS=m +CONFIG_COMEDI_CB_DAS16_CS=m +CONFIG_COMEDI_DAS08_CS=m +CONFIG_COMEDI_NI_DAQ_700_CS=m +CONFIG_COMEDI_NI_DAQ_DIO24_CS=m +CONFIG_COMEDI_NI_LABPC_CS=m +CONFIG_COMEDI_NI_MIO_CS=m +CONFIG_COMEDI_QUATECH_DAQP_CS=m +CONFIG_COMEDI_USB_DRIVERS=m +CONFIG_COMEDI_DT9812=m +CONFIG_COMEDI_NI_USB6501=m +CONFIG_COMEDI_USBDUX=m +CONFIG_COMEDI_USBDUXFAST=m +CONFIG_COMEDI_USBDUXSIGMA=m +CONFIG_COMEDI_VMK80XX=m +CONFIG_COMEDI_8254=m +CONFIG_COMEDI_8255=m +CONFIG_COMEDI_8255_SA=m +CONFIG_COMEDI_KCOMEDILIB=m +CONFIG_COMEDI_AMPLC_DIO200=m +CONFIG_COMEDI_AMPLC_PC236=m +CONFIG_COMEDI_DAS08=m +CONFIG_COMEDI_NI_LABPC=m +CONFIG_COMEDI_NI_TIO=m +CONFIG_COMEDI_NI_ROUTING=m +CONFIG_RTL8192U=m +CONFIG_RTLLIB=m +CONFIG_RTLLIB_CRYPTO_CCMP=m +CONFIG_RTLLIB_CRYPTO_TKIP=m +CONFIG_RTLLIB_CRYPTO_WEP=m +CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +CONFIG_88EU_AP_MODE=y +CONFIG_RTS5208=m +CONFIG_VT6655=m +CONFIG_VT6656=m + +# +# IIO staging drivers +# + +# +# Accelerometers +# +CONFIG_ADIS16203=m +CONFIG_ADIS16240=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD7816=m +CONFIG_AD7192=m +CONFIG_AD7280=m +# end of Analog to digital converters + +# +# Analog digital bi-direction converters +# +CONFIG_ADT7316=m +CONFIG_ADT7316_SPI=m +CONFIG_ADT7316_I2C=m +# end of Analog digital bi-direction converters + +# +# Capacitance to digital converters +# +CONFIG_AD7150=m +CONFIG_AD7746=m +# end of Capacitance to digital converters + +# +# Direct Digital Synthesis +# +CONFIG_AD9832=m +CONFIG_AD9834=m +# end of Direct Digital Synthesis + +# +# Network Analyzer, Impedance Converters +# +CONFIG_AD5933=m +# end of Network Analyzer, Impedance Converters + +# +# Active energy metering IC +# +CONFIG_ADE7854=m +CONFIG_ADE7854_I2C=m +CONFIG_ADE7854_SPI=m +# end of Active energy metering IC + +# +# Resolver to digital converters +# +CONFIG_AD2S1210=m +# end of Resolver to digital converters +# end of IIO staging drivers + +# CONFIG_FB_SM750 is not set + +# +# Speakup console speech +# +CONFIG_SPEAKUP=m +CONFIG_SPEAKUP_SYNTH_ACNTSA=m +CONFIG_SPEAKUP_SYNTH_APOLLO=m +CONFIG_SPEAKUP_SYNTH_AUDPTR=m +CONFIG_SPEAKUP_SYNTH_BNS=m +CONFIG_SPEAKUP_SYNTH_DECTLK=m +CONFIG_SPEAKUP_SYNTH_DECEXT=m +CONFIG_SPEAKUP_SYNTH_LTLK=m +CONFIG_SPEAKUP_SYNTH_SOFT=m +CONFIG_SPEAKUP_SYNTH_SPKOUT=m +CONFIG_SPEAKUP_SYNTH_TXPRT=m +CONFIG_SPEAKUP_SYNTH_DUMMY=m 
+# end of Speakup console speech + +CONFIG_STAGING_MEDIA=y +CONFIG_VIDEO_IPU3_IMGU=m + +# +# soc_camera sensor drivers +# + +# +# Android +# +# end of Android + +CONFIG_STAGING_BOARD=y +CONFIG_LTE_GDM724X=m +CONFIG_FIREWIRE_SERIAL=m +CONFIG_FWTTY_MAX_TOTAL_PORTS=64 +CONFIG_FWTTY_MAX_CARD_PORTS=32 +CONFIG_GS_FPGABOOT=m +CONFIG_UNISYSSPAR=y +CONFIG_UNISYS_VISORNIC=m +CONFIG_UNISYS_VISORINPUT=m +CONFIG_UNISYS_VISORHBA=m +CONFIG_COMMON_CLK_XLNX_CLKWZRD=m +# CONFIG_FB_TFT is not set +CONFIG_WILC1000=m +CONFIG_WILC1000_SDIO=m +CONFIG_WILC1000_SPI=m +# CONFIG_WILC1000_HW_OOB_INTR is not set +CONFIG_MOST=m +CONFIG_MOST_CDEV=m +CONFIG_MOST_NET=m +CONFIG_MOST_SOUND=m +CONFIG_MOST_VIDEO=m +CONFIG_MOST_DIM2=m +CONFIG_MOST_I2C=m +CONFIG_MOST_USB=m +CONFIG_KS7010=m +CONFIG_PI433=m + +# +# Gasket devices +# +CONFIG_STAGING_GASKET_FRAMEWORK=m +CONFIG_STAGING_APEX_DRIVER=m +# end of Gasket devices + +CONFIG_XIL_AXIS_FIFO=m +CONFIG_FIELDBUS_DEV=m +CONFIG_HMS_ANYBUSS_BUS=m +CONFIG_ARCX_ANYBUS_CONTROLLER=m +CONFIG_HMS_PROFINET=m +CONFIG_KPC2000=y +CONFIG_KPC2000_CORE=m +CONFIG_KPC2000_SPI=m +CONFIG_KPC2000_I2C=m +CONFIG_KPC2000_DMA=m + +# +# ISDN CAPI drivers +# +CONFIG_CAPI_AVM=y +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m +CONFIG_ISDN_DRV_GIGASET=m +CONFIG_GIGASET_CAPI=y +CONFIG_GIGASET_BASE=m +CONFIG_GIGASET_M105=m +CONFIG_GIGASET_M101=m +# CONFIG_GIGASET_DEBUG is not set +CONFIG_HYSDN=m +CONFIG_HYSDN_CAPI=y +# end of ISDN CAPI drivers + +CONFIG_USB_WUSB=m +CONFIG_USB_WUSB_CBAF=m +# CONFIG_USB_WUSB_CBAF_DEBUG is not set +CONFIG_USB_WHCI_HCD=m +CONFIG_USB_HWA_HCD=m +CONFIG_UWB=m +CONFIG_UWB_HWA=m +CONFIG_UWB_WHCI=m +CONFIG_UWB_I1480U=m +CONFIG_EXFAT_FS=m +CONFIG_EXFAT_DONT_MOUNT_VFAT=y +CONFIG_EXFAT_DISCARD=y +# CONFIG_EXFAT_DELAYED_SYNC is not set +# CONFIG_EXFAT_KERNEL_DEBUG is not set +# CONFIG_EXFAT_DEBUG_MSG is not set +CONFIG_EXFAT_DEFAULT_CODEPAGE=437 +CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" +CONFIG_QLGE=m +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACER_WMI=m +CONFIG_ACER_WIRELESS=m +CONFIG_ACERHDF=m +CONFIG_ALIENWARE_WMI=m +CONFIG_ASUS_LAPTOP=m +CONFIG_DCDBAS=m +CONFIG_DELL_SMBIOS=m +CONFIG_DELL_SMBIOS_WMI=y +CONFIG_DELL_SMBIOS_SMM=y +CONFIG_DELL_LAPTOP=m +CONFIG_DELL_WMI=m +CONFIG_DELL_WMI_DESCRIPTOR=m +CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m +CONFIG_DELL_SMO8800=m +CONFIG_DELL_RBTN=m +# CONFIG_DELL_RBU is not set +CONFIG_FUJITSU_LAPTOP=m +CONFIG_FUJITSU_TABLET=m +CONFIG_AMILO_RFKILL=m +CONFIG_GPD_POCKET_FAN=m +CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m +CONFIG_HP_WMI=m +CONFIG_LG_LAPTOP=m +CONFIG_MSI_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_SONY_LAPTOP=m +CONFIG_SONYPI_COMPAT=y +CONFIG_IDEAPAD_LAPTOP=m +CONFIG_SURFACE3_WMI=m +CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +CONFIG_SENSORS_HDAPS=m +CONFIG_INTEL_MENLOW=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_ASUS_WMI=m +CONFIG_ASUS_NB_WMI=m +CONFIG_EEEPC_WMI=m +CONFIG_ASUS_WIRELESS=m +CONFIG_ACPI_WMI=m +CONFIG_WMI_BMOF=m +CONFIG_INTEL_WMI_THUNDERBOLT=m +CONFIG_XIAOMI_WMI=m +CONFIG_MSI_WMI=m +CONFIG_PEAQ_WMI=m +CONFIG_TOPSTAR_LAPTOP=m +CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_TOSHIBA_HAPS=m +CONFIG_TOSHIBA_WMI=m +CONFIG_ACPI_CMPC=m +CONFIG_INTEL_CHT_INT33FE=m 
+CONFIG_INTEL_INT0002_VGPIO=m +CONFIG_INTEL_HID_EVENT=m +CONFIG_INTEL_VBTN=m +CONFIG_INTEL_IPS=m +CONFIG_INTEL_PMC_CORE=y +CONFIG_IBM_RTL=m +CONFIG_SAMSUNG_LAPTOP=m +CONFIG_MXM_WMI=m +CONFIG_INTEL_OAKTRAIL=m +CONFIG_SAMSUNG_Q10=m +CONFIG_APPLE_GMUX=m +CONFIG_INTEL_RST=m +CONFIG_INTEL_SMARTCONNECT=m +CONFIG_INTEL_PMC_IPC=m +CONFIG_INTEL_BXTWC_PMIC_TMU=m +CONFIG_SURFACE_PRO3_BUTTON=m +CONFIG_SURFACE_3_BUTTON=m +CONFIG_INTEL_PUNIT_IPC=m +CONFIG_INTEL_TELEMETRY=m +CONFIG_MLX_PLATFORM=m +CONFIG_INTEL_TURBO_MAX_3=y +CONFIG_TOUCHSCREEN_DMI=y +CONFIG_INTEL_CHTDC_TI_PWRBTN=m +CONFIG_I2C_MULTI_INSTANTIATE=m +CONFIG_INTEL_ATOMISP2_PM=m +CONFIG_HUAWEI_WMI=m +CONFIG_PCENGINES_APU2=m + +# +# Intel Speed Select Technology interface support +# +CONFIG_INTEL_SPEED_SELECT_INTERFACE=m +# end of Intel Speed Select Technology interface support + +CONFIG_PMC_ATOM=y +CONFIG_MFD_CROS_EC=m +CONFIG_CHROME_PLATFORMS=y +CONFIG_CHROMEOS_LAPTOP=m +CONFIG_CHROMEOS_PSTORE=m +CONFIG_CHROMEOS_TBMC=m +CONFIG_CROS_EC=m +CONFIG_CROS_EC_I2C=m +CONFIG_CROS_EC_RPMSG=m +CONFIG_CROS_EC_ISHTP=m +CONFIG_CROS_EC_SPI=m +CONFIG_CROS_EC_LPC=m +CONFIG_CROS_EC_PROTO=y +CONFIG_CROS_KBD_LED_BACKLIGHT=m +CONFIG_CROS_EC_CHARDEV=m +CONFIG_CROS_EC_LIGHTBAR=m +CONFIG_CROS_EC_VBC=m +CONFIG_CROS_EC_DEBUGFS=m +CONFIG_CROS_EC_SYSFS=m +CONFIG_CROS_USBPD_LOGGER=m +CONFIG_WILCO_EC=m +CONFIG_WILCO_EC_DEBUGFS=m +CONFIG_WILCO_EC_EVENTS=m +CONFIG_WILCO_EC_TELEMETRY=m +CONFIG_MELLANOX_PLATFORM=y +CONFIG_MLXREG_HOTPLUG=m +CONFIG_MLXREG_IO=m +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y + +# +# Common Clock Framework +# +CONFIG_COMMON_CLK_WM831X=m +CONFIG_CLK_HSDK=y +CONFIG_COMMON_CLK_MAX77686=m +CONFIG_COMMON_CLK_MAX9485=m +CONFIG_COMMON_CLK_RK808=m +CONFIG_COMMON_CLK_SI5341=m +CONFIG_COMMON_CLK_SI5351=m +CONFIG_COMMON_CLK_SI514=m +CONFIG_COMMON_CLK_SI544=m +CONFIG_COMMON_CLK_SI570=m +CONFIG_COMMON_CLK_CDCE706=m +CONFIG_COMMON_CLK_CDCE925=m +CONFIG_COMMON_CLK_CS2000_CP=m +CONFIG_COMMON_CLK_S2MPS11=m +CONFIG_CLK_TWL6040=m +CONFIG_COMMON_CLK_LOCHNAGAR=m +CONFIG_COMMON_CLK_PALMAS=m +CONFIG_COMMON_CLK_PWM=m +CONFIG_COMMON_CLK_VC5=m +CONFIG_COMMON_CLK_BD718XX=m +CONFIG_COMMON_CLK_FIXED_MMIO=y +# end of Common Clock Framework + +CONFIG_HWSPINLOCK=y + +# +# Clock Source drivers +# +CONFIG_CLKEVT_I8253=y +CONFIG_I8253_LOCK=y +CONFIG_CLKBLD_I8253=y +# end of Clock Source drivers + +CONFIG_MAILBOX=y +CONFIG_PLATFORM_MHU=m +CONFIG_PCC=y +CONFIG_ALTERA_MBOX=m +CONFIG_MAILBOX_TEST=m +CONFIG_IOMMU_IOVA=y +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +# end of Generic IOMMU Pagetable Support + +# CONFIG_IOMMU_DEBUGFS is not set +# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +CONFIG_OF_IOMMU=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_V2=y +CONFIG_DMAR_TABLE=y +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU_FLOPPY_WA=y +CONFIG_IRQ_REMAP=y +CONFIG_HYPERV_IOMMU=y + +# +# Remoteproc drivers +# +CONFIG_REMOTEPROC=y +# end of Remoteproc drivers + +# +# Rpmsg drivers +# +CONFIG_RPMSG=m +CONFIG_RPMSG_CHAR=m +CONFIG_RPMSG_QCOM_GLINK_NATIVE=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m +CONFIG_RPMSG_VIRTIO=m +# end of Rpmsg drivers + +CONFIG_SOUNDWIRE=m + +# +# SoundWire Devices +# +CONFIG_SOUNDWIRE_CADENCE=m +CONFIG_SOUNDWIRE_INTEL=m + +# +# SOC (System On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# +# end of Amlogic SoC drivers + +# +# Aspeed SoC drivers +# +# end of Aspeed SoC drivers + +# +# Broadcom SoC drivers +# +# end of Broadcom SoC drivers + +# +# 
NXP/Freescale QorIQ SoC drivers +# +# end of NXP/Freescale QorIQ SoC drivers + +# +# i.MX SoC drivers +# +# end of i.MX SoC drivers + +# +# Qualcomm SoC drivers +# +# end of Qualcomm SoC drivers + +CONFIG_SOC_TI=y + +# +# Xilinx SoC drivers +# +CONFIG_XILINX_VCU=m +# end of Xilinx SoC drivers +# end of SOC (System On Chip) specific Drivers + +CONFIG_PM_DEVFREQ=y + +# +# DEVFREQ Governors +# +CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m +CONFIG_DEVFREQ_GOV_PERFORMANCE=m +CONFIG_DEVFREQ_GOV_POWERSAVE=m +CONFIG_DEVFREQ_GOV_USERSPACE=m +CONFIG_DEVFREQ_GOV_PASSIVE=m + +# +# DEVFREQ Drivers +# +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +CONFIG_EXTCON_ADC_JACK=m +CONFIG_EXTCON_ARIZONA=m +CONFIG_EXTCON_AXP288=m +CONFIG_EXTCON_FSA9480=m +CONFIG_EXTCON_GPIO=m +CONFIG_EXTCON_INTEL_INT3496=m +CONFIG_EXTCON_INTEL_CHT_WC=m +CONFIG_EXTCON_MAX14577=m +CONFIG_EXTCON_MAX3355=m +CONFIG_EXTCON_MAX77693=m +CONFIG_EXTCON_MAX77843=m +CONFIG_EXTCON_MAX8997=m +CONFIG_EXTCON_PALMAS=m +CONFIG_EXTCON_PTN5150=m +CONFIG_EXTCON_RT8973A=m +CONFIG_EXTCON_SM5502=m +CONFIG_EXTCON_USB_GPIO=m +CONFIG_EXTCON_USBC_CROS_EC=m +CONFIG_MEMORY=y +CONFIG_IIO=m +CONFIG_IIO_BUFFER=y +CONFIG_IIO_BUFFER_CB=m +CONFIG_IIO_BUFFER_HW_CONSUMER=m +CONFIG_IIO_KFIFO_BUF=m +CONFIG_IIO_TRIGGERED_BUFFER=m +CONFIG_IIO_CONFIGFS=m +CONFIG_IIO_TRIGGER=y +CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 +CONFIG_IIO_SW_DEVICE=m +CONFIG_IIO_SW_TRIGGER=m +CONFIG_IIO_TRIGGERED_EVENT=m + +# +# Accelerometers +# +CONFIG_ADIS16201=m +CONFIG_ADIS16209=m +CONFIG_ADXL372=m +CONFIG_ADXL372_SPI=m +CONFIG_ADXL372_I2C=m +CONFIG_BMA180=m +CONFIG_BMA220=m +CONFIG_BMC150_ACCEL=m +CONFIG_BMC150_ACCEL_I2C=m +CONFIG_BMC150_ACCEL_SPI=m +CONFIG_DA280=m +CONFIG_DA311=m +CONFIG_DMARD06=m +CONFIG_DMARD09=m +CONFIG_DMARD10=m +CONFIG_HID_SENSOR_ACCEL_3D=m +CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m +CONFIG_IIO_ST_ACCEL_3AXIS=m +CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m +CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m +CONFIG_KXSD9=m +CONFIG_KXSD9_SPI=m +CONFIG_KXSD9_I2C=m +CONFIG_KXCJK1013=m +CONFIG_MC3230=m +CONFIG_MMA7455=m +CONFIG_MMA7455_I2C=m +CONFIG_MMA7455_SPI=m +CONFIG_MMA7660=m +CONFIG_MMA8452=m +CONFIG_MMA9551_CORE=m +CONFIG_MMA9551=m +CONFIG_MMA9553=m +CONFIG_MXC4005=m +CONFIG_MXC6255=m +CONFIG_SCA3000=m +CONFIG_STK8312=m +CONFIG_STK8BA50=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD_SIGMA_DELTA=m +CONFIG_AD7124=m +CONFIG_AD7266=m +CONFIG_AD7291=m +CONFIG_AD7298=m +CONFIG_AD7476=m +CONFIG_AD7606=m +CONFIG_AD7606_IFACE_PARALLEL=m +CONFIG_AD7606_IFACE_SPI=m +CONFIG_AD7766=m +CONFIG_AD7768_1=m +CONFIG_AD7780=m +CONFIG_AD7791=m +CONFIG_AD7793=m +CONFIG_AD7887=m +CONFIG_AD7923=m +CONFIG_AD7949=m +CONFIG_AD799X=m +CONFIG_AXP20X_ADC=m +CONFIG_AXP288_ADC=m +CONFIG_CC10001_ADC=m +CONFIG_CPCAP_ADC=m +CONFIG_DA9150_GPADC=m +CONFIG_DLN2_ADC=m +CONFIG_ENVELOPE_DETECTOR=m +CONFIG_HI8435=m +CONFIG_HX711=m +CONFIG_INA2XX_ADC=m +CONFIG_LP8788_ADC=m +CONFIG_LTC2471=m +CONFIG_LTC2485=m +CONFIG_LTC2497=m +CONFIG_MAX1027=m +CONFIG_MAX11100=m +CONFIG_MAX1118=m +CONFIG_MAX1363=m +CONFIG_MAX9611=m +CONFIG_MCP320X=m +CONFIG_MCP3422=m +CONFIG_MCP3911=m +CONFIG_MEN_Z188_ADC=m +CONFIG_NAU7802=m +CONFIG_PALMAS_GPADC=m +CONFIG_QCOM_VADC_COMMON=m +CONFIG_QCOM_SPMI_IADC=m +CONFIG_QCOM_SPMI_VADC=m +CONFIG_QCOM_SPMI_ADC5=m +CONFIG_SD_ADC_MODULATOR=m +CONFIG_STMPE_ADC=m +CONFIG_TI_ADC081C=m +CONFIG_TI_ADC0832=m +CONFIG_TI_ADC084S021=m +CONFIG_TI_ADC12138=m +CONFIG_TI_ADC108S102=m +CONFIG_TI_ADC128S052=m +CONFIG_TI_ADC161S626=m +CONFIG_TI_ADS1015=m +CONFIG_TI_ADS7950=m +CONFIG_TI_ADS8344=m 
+CONFIG_TI_ADS8688=m +CONFIG_TI_ADS124S08=m +CONFIG_TI_AM335X_ADC=m +CONFIG_TI_TLC4541=m +CONFIG_TWL4030_MADC=m +CONFIG_TWL6030_GPADC=m +CONFIG_VF610_ADC=m +CONFIG_VIPERBOARD_ADC=m +CONFIG_XILINX_XADC=m +# end of Analog to digital converters + +# +# Analog Front Ends +# +CONFIG_IIO_RESCALE=m +# end of Analog Front Ends + +# +# Amplifiers +# +CONFIG_AD8366=m +# end of Amplifiers + +# +# Chemical Sensors +# +CONFIG_ATLAS_PH_SENSOR=m +CONFIG_BME680=m +CONFIG_BME680_I2C=m +CONFIG_BME680_SPI=m +CONFIG_CCS811=m +CONFIG_IAQCORE=m +CONFIG_PMS7003=m +CONFIG_SENSIRION_SGP30=m +CONFIG_SPS30=m +CONFIG_VZ89X=m +# end of Chemical Sensors + +CONFIG_IIO_CROS_EC_SENSORS_CORE=m +CONFIG_IIO_CROS_EC_SENSORS=m +CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m + +# +# Hid Sensor IIO Common +# +CONFIG_HID_SENSOR_IIO_COMMON=m +CONFIG_HID_SENSOR_IIO_TRIGGER=m +# end of Hid Sensor IIO Common + +CONFIG_IIO_MS_SENSORS_I2C=m + +# +# SSP Sensor Common +# +CONFIG_IIO_SSP_SENSORS_COMMONS=m +CONFIG_IIO_SSP_SENSORHUB=m +# end of SSP Sensor Common + +CONFIG_IIO_ST_SENSORS_I2C=m +CONFIG_IIO_ST_SENSORS_SPI=m +CONFIG_IIO_ST_SENSORS_CORE=m + +# +# Digital to analog converters +# +CONFIG_AD5064=m +CONFIG_AD5360=m +CONFIG_AD5380=m +CONFIG_AD5421=m +CONFIG_AD5446=m +CONFIG_AD5449=m +CONFIG_AD5592R_BASE=m +CONFIG_AD5592R=m +CONFIG_AD5593R=m +CONFIG_AD5504=m +CONFIG_AD5624R_SPI=m +CONFIG_LTC1660=m +CONFIG_LTC2632=m +CONFIG_AD5686=m +CONFIG_AD5686_SPI=m +CONFIG_AD5696_I2C=m +CONFIG_AD5755=m +CONFIG_AD5758=m +CONFIG_AD5761=m +CONFIG_AD5764=m +CONFIG_AD5791=m +CONFIG_AD7303=m +CONFIG_AD8801=m +CONFIG_DPOT_DAC=m +CONFIG_DS4424=m +CONFIG_M62332=m +CONFIG_MAX517=m +CONFIG_MAX5821=m +CONFIG_MCP4725=m +CONFIG_MCP4922=m +CONFIG_TI_DAC082S085=m +CONFIG_TI_DAC5571=m +CONFIG_TI_DAC7311=m +CONFIG_TI_DAC7612=m +CONFIG_VF610_DAC=m +# end of Digital to analog converters + +# +# IIO dummy driver +# +# CONFIG_IIO_SIMPLE_DUMMY is not set +# end of IIO dummy driver + +# +# Frequency Synthesizers DDS/PLL +# + +# +# Clock Generator/Distribution +# +CONFIG_AD9523=m +# end of Clock Generator/Distribution + +# +# Phase-Locked Loop (PLL) frequency synthesizers +# +CONFIG_ADF4350=m +CONFIG_ADF4371=m +# end of Phase-Locked Loop (PLL) frequency synthesizers +# end of Frequency Synthesizers DDS/PLL + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16080=m +CONFIG_ADIS16130=m +CONFIG_ADIS16136=m +CONFIG_ADIS16260=m +CONFIG_ADXRS450=m +CONFIG_BMG160=m +CONFIG_BMG160_I2C=m +CONFIG_BMG160_SPI=m +CONFIG_FXAS21002C=m +CONFIG_FXAS21002C_I2C=m +CONFIG_FXAS21002C_SPI=m +CONFIG_HID_SENSOR_GYRO_3D=m +CONFIG_MPU3050=m +CONFIG_MPU3050_I2C=m +CONFIG_IIO_ST_GYRO_3AXIS=m +CONFIG_IIO_ST_GYRO_I2C_3AXIS=m +CONFIG_IIO_ST_GYRO_SPI_3AXIS=m +CONFIG_ITG3200=m +# end of Digital gyroscope sensors + +# +# Health Sensors +# + +# +# Heart Rate Monitors +# +CONFIG_AFE4403=m +CONFIG_AFE4404=m +CONFIG_MAX30100=m +CONFIG_MAX30102=m +# end of Heart Rate Monitors +# end of Health Sensors + +# +# Humidity sensors +# +CONFIG_AM2315=m +CONFIG_DHT11=m +CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m +CONFIG_HTS221=m +CONFIG_HTS221_I2C=m +CONFIG_HTS221_SPI=m +CONFIG_HTU21=m +CONFIG_SI7005=m +CONFIG_SI7020=m +# end of Humidity sensors + +# +# Inertial measurement units +# +CONFIG_ADIS16400=m +CONFIG_ADIS16460=m +CONFIG_ADIS16480=m +CONFIG_BMI160=m +CONFIG_BMI160_I2C=m +CONFIG_BMI160_SPI=m +CONFIG_KMX61=m +CONFIG_INV_MPU6050_IIO=m +CONFIG_INV_MPU6050_I2C=m +CONFIG_INV_MPU6050_SPI=m +CONFIG_IIO_ST_LSM6DSX=m +CONFIG_IIO_ST_LSM6DSX_I2C=m +CONFIG_IIO_ST_LSM6DSX_SPI=m +CONFIG_IIO_ST_LSM6DSX_I3C=m +# end of Inertial 
measurement units + +CONFIG_IIO_ADIS_LIB=m +CONFIG_IIO_ADIS_LIB_BUFFER=y + +# +# Light sensors +# +CONFIG_ACPI_ALS=m +CONFIG_ADJD_S311=m +CONFIG_AL3320A=m +CONFIG_APDS9300=m +CONFIG_APDS9960=m +CONFIG_BH1750=m +CONFIG_BH1780=m +CONFIG_CM32181=m +CONFIG_CM3232=m +CONFIG_CM3323=m +CONFIG_CM3605=m +CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m +CONFIG_GP2AP020A00F=m +CONFIG_SENSORS_ISL29018=m +CONFIG_SENSORS_ISL29028=m +CONFIG_ISL29125=m +CONFIG_HID_SENSOR_ALS=m +CONFIG_HID_SENSOR_PROX=m +CONFIG_JSA1212=m +CONFIG_RPR0521=m +CONFIG_SENSORS_LM3533=m +CONFIG_LTR501=m +CONFIG_LV0104CS=m +CONFIG_MAX44000=m +CONFIG_MAX44009=m +CONFIG_NOA1305=m +CONFIG_OPT3001=m +CONFIG_PA12203001=m +CONFIG_SI1133=m +CONFIG_SI1145=m +CONFIG_STK3310=m +CONFIG_ST_UVIS25=m +CONFIG_ST_UVIS25_I2C=m +CONFIG_ST_UVIS25_SPI=m +CONFIG_TCS3414=m +CONFIG_TCS3472=m +CONFIG_SENSORS_TSL2563=m +CONFIG_TSL2583=m +CONFIG_TSL2772=m +CONFIG_TSL4531=m +CONFIG_US5182D=m +CONFIG_VCNL4000=m +CONFIG_VCNL4035=m +CONFIG_VEML6070=m +CONFIG_VL6180=m +CONFIG_ZOPT2201=m +# end of Light sensors + +# +# Magnetometer sensors +# +CONFIG_AK8974=m +CONFIG_AK8975=m +CONFIG_AK09911=m +CONFIG_BMC150_MAGN=m +CONFIG_BMC150_MAGN_I2C=m +CONFIG_BMC150_MAGN_SPI=m +CONFIG_MAG3110=m +CONFIG_HID_SENSOR_MAGNETOMETER_3D=m +CONFIG_MMC35240=m +CONFIG_IIO_ST_MAGN_3AXIS=m +CONFIG_IIO_ST_MAGN_I2C_3AXIS=m +CONFIG_IIO_ST_MAGN_SPI_3AXIS=m +CONFIG_SENSORS_HMC5843=m +CONFIG_SENSORS_HMC5843_I2C=m +CONFIG_SENSORS_HMC5843_SPI=m +CONFIG_SENSORS_RM3100=m +CONFIG_SENSORS_RM3100_I2C=m +CONFIG_SENSORS_RM3100_SPI=m +# end of Magnetometer sensors + +# +# Multiplexers +# +CONFIG_IIO_MUX=m +# end of Multiplexers + +# +# Inclinometer sensors +# +CONFIG_HID_SENSOR_INCLINOMETER_3D=m +CONFIG_HID_SENSOR_DEVICE_ROTATION=m +# end of Inclinometer sensors + +# +# Triggers - standalone +# +CONFIG_IIO_HRTIMER_TRIGGER=m +CONFIG_IIO_INTERRUPT_TRIGGER=m +CONFIG_IIO_TIGHTLOOP_TRIGGER=m +CONFIG_IIO_SYSFS_TRIGGER=m +# end of Triggers - standalone + +# +# Digital potentiometers +# +CONFIG_AD5272=m +CONFIG_DS1803=m +CONFIG_MAX5432=m +CONFIG_MAX5481=m +CONFIG_MAX5487=m +CONFIG_MCP4018=m +CONFIG_MCP4131=m +CONFIG_MCP4531=m +CONFIG_MCP41010=m +CONFIG_TPL0102=m +# end of Digital potentiometers + +# +# Digital potentiostats +# +CONFIG_LMP91000=m +# end of Digital potentiostats + +# +# Pressure sensors +# +CONFIG_ABP060MG=m +CONFIG_BMP280=m +CONFIG_BMP280_I2C=m +CONFIG_BMP280_SPI=m +CONFIG_IIO_CROS_EC_BARO=m +CONFIG_DPS310=m +CONFIG_HID_SENSOR_PRESS=m +CONFIG_HP03=m +CONFIG_MPL115=m +CONFIG_MPL115_I2C=m +CONFIG_MPL115_SPI=m +CONFIG_MPL3115=m +CONFIG_MS5611=m +CONFIG_MS5611_I2C=m +CONFIG_MS5611_SPI=m +CONFIG_MS5637=m +CONFIG_IIO_ST_PRESS=m +CONFIG_IIO_ST_PRESS_I2C=m +CONFIG_IIO_ST_PRESS_SPI=m +CONFIG_T5403=m +CONFIG_HP206C=m +CONFIG_ZPA2326=m +CONFIG_ZPA2326_I2C=m +CONFIG_ZPA2326_SPI=m +# end of Pressure sensors + +# +# Lightning sensors +# +CONFIG_AS3935=m +# end of Lightning sensors + +# +# Proximity and distance sensors +# +CONFIG_ISL29501=m +CONFIG_LIDAR_LITE_V2=m +CONFIG_MB1232=m +CONFIG_RFD77402=m +CONFIG_SRF04=m +CONFIG_SX9500=m +CONFIG_SRF08=m +CONFIG_VL53L0X_I2C=m +# end of Proximity and distance sensors + +# +# Resolver to digital converters +# +CONFIG_AD2S90=m +CONFIG_AD2S1200=m +# end of Resolver to digital converters + +# +# Temperature sensors +# +CONFIG_MAXIM_THERMOCOUPLE=m +CONFIG_HID_SENSOR_TEMP=m +CONFIG_MLX90614=m +CONFIG_MLX90632=m +CONFIG_TMP006=m +CONFIG_TMP007=m +CONFIG_TSYS01=m +CONFIG_TSYS02D=m +CONFIG_MAX31856=m +# end of Temperature sensors + +CONFIG_NTB=m +CONFIG_NTB_MSI=y 
+CONFIG_NTB_AMD=m +CONFIG_NTB_IDT=m +CONFIG_NTB_INTEL=m +CONFIG_NTB_SWITCHTEC=m +# CONFIG_NTB_PINGPONG is not set +# CONFIG_NTB_TOOL is not set +# CONFIG_NTB_PERF is not set +# CONFIG_NTB_MSI_TEST is not set +CONFIG_NTB_TRANSPORT=m +CONFIG_VME_BUS=y + +# +# VME Bridge Drivers +# +CONFIG_VME_CA91CX42=m +CONFIG_VME_TSI148=m +# CONFIG_VME_FAKE is not set + +# +# VME Board Drivers +# +CONFIG_VMIVME_7805=m + +# +# VME Device Drivers +# +CONFIG_VME_USER=m +CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +CONFIG_PWM_ATMEL_HLCDC_PWM=m +CONFIG_PWM_CRC=y +CONFIG_PWM_CROS_EC=m +CONFIG_PWM_FSL_FTM=m +CONFIG_PWM_LP3943=m +CONFIG_PWM_LPSS=m +CONFIG_PWM_LPSS_PCI=m +CONFIG_PWM_LPSS_PLATFORM=m +CONFIG_PWM_PCA9685=m +CONFIG_PWM_STMPE=y +CONFIG_PWM_TWL=m +CONFIG_PWM_TWL_LED=m + +# +# IRQ chip support +# +CONFIG_IRQCHIP=y +CONFIG_AL_FIC=y +CONFIG_MADERA_IRQ=m +# end of IRQ chip support + +CONFIG_IPACK_BUS=m +CONFIG_BOARD_TPCI200=m +CONFIG_SERIAL_IPOCTAL=m +CONFIG_RESET_CONTROLLER=y +CONFIG_RESET_TI_SYSCON=m + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_GENERIC_PHY_MIPI_DPHY=y +CONFIG_BCM_KONA_USB2_PHY=m +CONFIG_PHY_CADENCE_DP=m +CONFIG_PHY_CADENCE_DPHY=m +CONFIG_PHY_CADENCE_SIERRA=m +CONFIG_PHY_FSL_IMX8MQ_USB=m +CONFIG_PHY_MIXEL_MIPI_DPHY=m +CONFIG_PHY_PXA_28NM_HSIC=m +CONFIG_PHY_PXA_28NM_USB2=m +CONFIG_PHY_CPCAP_USB=m +CONFIG_PHY_MAPPHONE_MDM6600=m +CONFIG_PHY_OCELOT_SERDES=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_PHY_QCOM_USB_HSIC=m +CONFIG_PHY_SAMSUNG_USB2=m +CONFIG_PHY_TUSB1210=m +# end of PHY Subsystem + +CONFIG_POWERCAP=y +CONFIG_INTEL_RAPL_CORE=m +CONFIG_INTEL_RAPL=m +CONFIG_IDLE_INJECT=y +CONFIG_MCB=m +CONFIG_MCB_PCI=m +CONFIG_MCB_LPC=m + +# +# Performance monitor support +# +# end of Performance monitor support + +CONFIG_RAS=y +CONFIG_RAS_CEC=y +# CONFIG_RAS_CEC_DEBUG is not set +CONFIG_THUNDERBOLT=m + +# +# Android +# +# CONFIG_ANDROID is not set +# end of Android + +CONFIG_LIBNVDIMM=y +CONFIG_BLK_DEV_PMEM=m +CONFIG_ND_BLK=m +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=m +CONFIG_BTT=y +CONFIG_ND_PFN=m +CONFIG_NVDIMM_PFN=y +CONFIG_NVDIMM_DAX=y +CONFIG_OF_PMEM=m +CONFIG_DAX_DRIVER=y +CONFIG_DAX=y +CONFIG_DEV_DAX=m +CONFIG_DEV_DAX_PMEM=m +CONFIG_DEV_DAX_KMEM=m +CONFIG_DEV_DAX_PMEM_COMPAT=m +CONFIG_NVMEM=y +CONFIG_NVMEM_SYSFS=y +CONFIG_RAVE_SP_EEPROM=m + +# +# HW tracing support +# +CONFIG_STM=m +CONFIG_STM_PROTO_BASIC=m +CONFIG_STM_PROTO_SYS_T=m +# CONFIG_STM_DUMMY is not set +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +CONFIG_STM_SOURCE_FTRACE=m +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_ACPI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m +# CONFIG_INTEL_TH_DEBUG is not set +# end of HW tracing support + +CONFIG_FPGA=m +CONFIG_ALTERA_PR_IP_CORE=m +CONFIG_ALTERA_PR_IP_CORE_PLAT=m +CONFIG_FPGA_MGR_ALTERA_PS_SPI=m +CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_FPGA_MGR_ICE40_SPI=m +CONFIG_FPGA_MGR_MACHXO2_SPI=m +CONFIG_FPGA_BRIDGE=m +CONFIG_ALTERA_FREEZE_BRIDGE=m +CONFIG_XILINX_PR_DECOUPLER=m +CONFIG_FPGA_REGION=m +CONFIG_OF_FPGA_REGION=m +CONFIG_FPGA_DFL=m +CONFIG_FPGA_DFL_FME=m +CONFIG_FPGA_DFL_FME_MGR=m +CONFIG_FPGA_DFL_FME_BRIDGE=m +CONFIG_FPGA_DFL_FME_REGION=m +CONFIG_FPGA_DFL_AFU=m +CONFIG_FPGA_DFL_PCI=m +CONFIG_FSI=m +CONFIG_FSI_NEW_DEV_NODE=y +CONFIG_FSI_MASTER_GPIO=m +CONFIG_FSI_MASTER_HUB=m +CONFIG_FSI_SCOM=m +CONFIG_FSI_SBEFIFO=m +CONFIG_FSI_OCC=m +CONFIG_MULTIPLEXER=m + +# +# Multiplexer drivers +# +CONFIG_MUX_ADG792A=m +CONFIG_MUX_ADGS1408=m +CONFIG_MUX_GPIO=m +CONFIG_MUX_MMIO=m +# end of Multiplexer drivers + 
+CONFIG_PM_OPP=y +CONFIG_UNISYS_VISORBUS=m +CONFIG_SIOX=m +CONFIG_SIOX_BUS_GPIO=m +CONFIG_SLIMBUS=m +CONFIG_SLIM_QCOM_CTRL=m +CONFIG_INTERCONNECT=m +CONFIG_COUNTER=m +CONFIG_FTM_QUADDEC=m +# end of Device Drivers + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_VALIDATE_FS_PARSER=y +CONFIG_FS_IOMAP=y +# CONFIG_EXT2_FS is not set +# CONFIG_EXT3_FS is not set +CONFIG_EXT4_FS=m +CONFIG_EXT4_USE_FOR_EXT2=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_XFS_ONLINE_SCRUB=y +CONFIG_XFS_ONLINE_REPAIR=y +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_OCFS2_FS_O2CB=m +CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m +CONFIG_OCFS2_FS_STATS=y +CONFIG_OCFS2_DEBUG_MASKLOG=y +# CONFIG_OCFS2_DEBUG_FS is not set +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +# CONFIG_BTRFS_FS_REF_VERIFY is not set +CONFIG_NILFS2_FS=m +CONFIG_F2FS_FS=m +CONFIG_F2FS_STAT_FS=y +CONFIG_F2FS_FS_XATTR=y +CONFIG_F2FS_FS_POSIX_ACL=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y +# CONFIG_F2FS_IO_TRACE is not set +# CONFIG_F2FS_FAULT_INJECTION is not set +CONFIG_FS_DAX=y +CONFIG_FS_DAX_PMD=y +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FILE_LOCKING=y +# CONFIG_MANDATORY_FILE_LOCKING is not set +CONFIG_FS_ENCRYPTION=y +CONFIG_FS_VERITY=y +# CONFIG_FS_VERITY_DEBUG is not set +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QUOTA_DEBUG is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_QUOTACTL_COMPAT=y +CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS_FS=y +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m +CONFIG_OVERLAY_FS=m +CONFIG_OVERLAY_FS_REDIRECT_DIR=y +# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set +CONFIG_OVERLAY_FS_INDEX=y +CONFIG_OVERLAY_FS_XINO_AUTO=y +CONFIG_OVERLAY_FS_METACOPY=y + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +CONFIG_FSCACHE_HISTOGRAM=y +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set +# end of Caches + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +# end of CD-ROM/DVD Filesystems + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" +CONFIG_FAT_DEFAULT_UTF8=y +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y +# end of DOS/FAT/NT Filesystems + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_PROC_VMCORE=y +CONFIG_PROC_VMCORE_DEVICE_DUMP=y 
+CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_PROC_CHILDREN=y +CONFIG_PROC_PID_ARCH_STATUS=y +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_MEMFD_CREATE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=y +CONFIG_EFIVAR_FS=y +# end of Pseudo filesystems + +CONFIG_MISC_FILESYSTEMS=y +CONFIG_ORANGEFS_FS=m +# CONFIG_ADFS_FS is not set +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=m +# CONFIG_ECRYPT_FS_MESSAGING is not set +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_FS_WBUF_VERIFY is not set +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_RTIME=y +CONFIG_UBIFS_FS=m +# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +CONFIG_UBIFS_FS_ZSTD=y +CONFIG_UBIFS_ATIME_SUPPORT=y +CONFIG_UBIFS_FS_XATTR=y +CONFIG_UBIFS_FS_SECURITY=y +CONFIG_UBIFS_FS_AUTHENTICATION=y +CONFIG_CRAMFS=m +CONFIG_CRAMFS_BLOCKDEV=y +CONFIG_CRAMFS_MTD=y +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +CONFIG_SQUASHFS_DECOMP_MULTI=y +# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set +# CONFIG_SQUASHFS_EMBEDDED is not set +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +# CONFIG_VXFS_FS is not set +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX6FS_FS is not set +CONFIG_ROMFS_FS=m +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_PSTORE=y +CONFIG_PSTORE_DEFLATE_COMPRESS=m +CONFIG_PSTORE_LZO_COMPRESS=m +CONFIG_PSTORE_LZ4_COMPRESS=m +CONFIG_PSTORE_LZ4HC_COMPRESS=m +# CONFIG_PSTORE_842_COMPRESS is not set +CONFIG_PSTORE_ZSTD_COMPRESS=y +CONFIG_PSTORE_COMPRESS=y +# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y +CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" +# CONFIG_PSTORE_CONSOLE is not set +# CONFIG_PSTORE_PMSG is not set +# CONFIG_PSTORE_FTRACE is not set +CONFIG_PSTORE_RAM=y +# CONFIG_SYSV_FS is not set +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EROFS_FS=m +# CONFIG_EROFS_FS_DEBUG is not set +CONFIG_EROFS_FS_XATTR=y +CONFIG_EROFS_FS_POSIX_ACL=y +CONFIG_EROFS_FS_SECURITY=y +CONFIG_EROFS_FS_ZIP=y +CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V2=m +CONFIG_NFS_V3=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_NFS_V4_SECURITY_LABEL=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +CONFIG_NFS_DEBUG=y +CONFIG_NFSD=m 
+CONFIG_NFSD_V2_ACL=y +CONFIG_NFSD_V3=y +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_PNFS=y +CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_NFSD_SCSILAYOUT=y +# CONFIG_NFSD_FLEXFILELAYOUT is not set +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_GRACE_PERIOD=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y +CONFIG_SUNRPC_DEBUG=y +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y +CONFIG_CEPH_FS_SECURITY_LABEL=y +CONFIG_CIFS=m +# CONFIG_CIFS_STATS2 is not set +CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y +# CONFIG_CIFS_WEAK_PW_HASH is not set +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_CIFS_DEBUG=y +# CONFIG_CIFS_DEBUG2 is not set +# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set +CONFIG_CIFS_DFS_UPCALL=y +# CONFIG_CIFS_SMB_DIRECT is not set +CONFIG_CIFS_FSCACHE=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +# CONFIG_AFS_DEBUG is not set +CONFIG_AFS_FSCACHE=y +# CONFIG_AFS_DEBUG_CURSOR is not set +CONFIG_9P_FS=m +CONFIG_9P_FSCACHE=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_MAC_ROMAN=m +CONFIG_NLS_MAC_CELTIC=m +CONFIG_NLS_MAC_CENTEURO=m +CONFIG_NLS_MAC_CROATIAN=m +CONFIG_NLS_MAC_CYRILLIC=m +CONFIG_NLS_MAC_GAELIC=m +CONFIG_NLS_MAC_GREEK=m +CONFIG_NLS_MAC_ICELAND=m +CONFIG_NLS_MAC_INUIT=m +CONFIG_NLS_MAC_ROMANIAN=m +CONFIG_NLS_MAC_TURKISH=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set +CONFIG_UNICODE=y +# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set +# end of File systems + +# +# Security options +# +CONFIG_KEYS=y +CONFIG_KEYS_COMPAT=y +CONFIG_KEYS_REQUEST_CACHE=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y +CONFIG_TRUSTED_KEYS=m +CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_DH_OPERATIONS=y +# CONFIG_SECURITY_DMESG_RESTRICT is not set +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_PAGE_TABLE_ISOLATION=y +CONFIG_SECURITY_INFINIBAND=y +CONFIG_SECURITY_NETWORK_XFRM=y +CONFIG_SECURITY_PATH=y +# CONFIG_INTEL_TXT is not set +CONFIG_LSM_MMAP_MIN_ADDR=65536 +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_HARDENED_USERCOPY_FALLBACK=y +# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set +CONFIG_FORTIFY_SOURCE=y +# CONFIG_STATIC_USERMODEHELPER is not set +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +# CONFIG_SECURITY_SELINUX_DISABLE is not set +CONFIG_SECURITY_SELINUX_DEVELOP=y 
+CONFIG_SECURITY_SELINUX_AVC_STATS=y +CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 +CONFIG_SECURITY_SMACK=y +CONFIG_SECURITY_SMACK_BRINGUP=y +CONFIG_SECURITY_SMACK_NETFILTER=y +CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y +CONFIG_SECURITY_TOMOYO=y +CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 +CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 +# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set +CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" +CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" +# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set +CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_APPARMOR_HASH=y +CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y +# CONFIG_SECURITY_APPARMOR_DEBUG is not set +# CONFIG_SECURITY_LOADPIN is not set +CONFIG_SECURITY_YAMA=y +CONFIG_SECURITY_SAFESETID=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set +CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y +# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set +# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set +# CONFIG_INTEGRITY is not set +# CONFIG_DEFAULT_SECURITY_SELINUX is not set +# CONFIG_DEFAULT_SECURITY_SMACK is not set +# CONFIG_DEFAULT_SECURITY_TOMOYO is not set +# CONFIG_DEFAULT_SECURITY_APPARMOR is not set +CONFIG_DEFAULT_SECURITY_DAC=y +CONFIG_LSM="lockdown,yama" + +# +# Kernel hardening options +# +CONFIG_GCC_PLUGIN_STRUCTLEAK=y + +# +# Memory initialization +# +# CONFIG_INIT_STACK_NONE is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set +CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y +# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set +# CONFIG_GCC_PLUGIN_STACKLEAK is not set +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set +# end of Memory initialization +# end of Kernel hardening options +# end of Security options + +CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_BLKCIPHER=y +CONFIG_CRYPTO_BLKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=y +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=y +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y +CONFIG_CRYPTO_GF128MUL=y +CONFIG_CRYPTO_NULL=y +CONFIG_CRYPTO_NULL2=y +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_SIMD=m +CONFIG_CRYPTO_GLUE_HELPER_X86=m +CONFIG_CRYPTO_ENGINE=m + +# +# Public-key cryptography +# +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=y +CONFIG_CRYPTO_ECC=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_ECHAINIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=y +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTR=y +CONFIG_CRYPTO_CTS=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_OFB=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=y +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_NHPOLY1305=m +CONFIG_CRYPTO_NHPOLY1305_SSE2=m +CONFIG_CRYPTO_NHPOLY1305_AVX2=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ESSIV=m + +# 
+# Hash modes +# +CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_CRC32_PCLMUL=m +CONFIG_CRYPTO_XXHASH=m +CONFIG_CRYPTO_CRCT10DIF=y +CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m +CONFIG_CRYPTO_GHASH=y +CONFIG_CRYPTO_POLY1305=m +CONFIG_CRYPTO_POLY1305_X86_64=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA1_SSSE3=m +CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA512_SSSE3=m +CONFIG_CRYPTO_LIB_SHA256=y +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m +CONFIG_CRYPTO_STREEBOG=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m + +# +# Ciphers +# +CONFIG_CRYPTO_LIB_AES=y +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES_NI_INTEL=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_LIB_ARC4=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_BLOWFISH_COMMON=m +CONFIG_CRYPTO_BLOWFISH_X86_64=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAMELLIA_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m +CONFIG_CRYPTO_CAST_COMMON=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST5_AVX_X86_64=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_CAST6_AVX_X86_64=m +CONFIG_CRYPTO_LIB_DES=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_DES3_EDE_X86_64=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_CHACHA20=m +CONFIG_CRYPTO_CHACHA20_X86_64=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m +CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +CONFIG_CRYPTO_TWOFISH_X86_64=m +CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m +CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_842=m +CONFIG_CRYPTO_LZ4=m +CONFIG_CRYPTO_LZ4HC=m +CONFIG_CRYPTO_ZSTD=y + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_DRBG_MENU=y +CONFIG_CRYPTO_DRBG_HMAC=y +CONFIG_CRYPTO_DRBG_HASH=y +CONFIG_CRYPTO_DRBG_CTR=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_JITTERENTROPY=y +CONFIG_CRYPTO_USER_API=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +# CONFIG_CRYPTO_STATS is not set +CONFIG_CRYPTO_HASH_INFO=y +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +CONFIG_CRYPTO_DEV_ATMEL_I2C=m +CONFIG_CRYPTO_DEV_ATMEL_ECC=m +CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m +CONFIG_CRYPTO_DEV_CCP=y +CONFIG_CRYPTO_DEV_CCP_DD=m +CONFIG_CRYPTO_DEV_SP_CCP=y +CONFIG_CRYPTO_DEV_CCP_CRYPTO=m +CONFIG_CRYPTO_DEV_SP_PSP=y +CONFIG_CRYPTO_DEV_CCP_DEBUGFS=y +CONFIG_CRYPTO_DEV_QAT=m +CONFIG_CRYPTO_DEV_QAT_DH895xCC=m +CONFIG_CRYPTO_DEV_QAT_C3XXX=m +CONFIG_CRYPTO_DEV_QAT_C62X=m +CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m +CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m +CONFIG_CRYPTO_DEV_QAT_C62XVF=m +CONFIG_CRYPTO_DEV_NITROX=m +CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m +CONFIG_CRYPTO_DEV_CHELSIO=m +CONFIG_CHELSIO_IPSEC_INLINE=y +CONFIG_CRYPTO_DEV_CHELSIO_TLS=m +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_CRYPTO_DEV_SAFEXCEL=m +CONFIG_CRYPTO_DEV_CCREE=m +CONFIG_ASYMMETRIC_KEY_TYPE=y 
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS8_PRIVATE_KEY_PARSER=m +CONFIG_TPM_KEY_PARSER=m +CONFIG_PKCS7_MESSAGE_PARSER=y +# CONFIG_PKCS7_TEST_KEY is not set +CONFIG_SIGNED_PE_FILE_VERIFICATION=y + +# +# Certificates for signature checking +# +CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" +# end of Certificates for signature checking + +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_RAID6_PQ_BENCHMARK=y +CONFIG_PACKING=y +CONFIG_BITREVERSE=y +CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_CORDIC=m +CONFIG_RATIONAL=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_GENERIC_IOMAP=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_CRC_CCITT=y +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=y +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set +CONFIG_CRC64=m +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_CRC8=m +CONFIG_XXHASH=y +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_842_COMPRESS=m +CONFIG_842_DECOMPRESS=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=y +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=m +CONFIG_LZ4HC_COMPRESS=m +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y +CONFIG_XZ_DEC=y +CONFIG_XZ_DEC_X86=y +CONFIG_XZ_DEC_POWERPC=y +CONFIG_XZ_DEC_IA64=y +CONFIG_XZ_DEC_ARM=y +CONFIG_XZ_DEC_ARMTHUMB=y +CONFIG_XZ_DEC_SPARC=y +CONFIG_XZ_DEC_BCJ=y +# CONFIG_XZ_DEC_TEST is not set +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZO=y +CONFIG_DECOMPRESS_LZ4=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=y +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_BCH=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_BTREE=y +CONFIG_INTERVAL_TREE=y +CONFIG_XARRAY_MULTI=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y +CONFIG_DMA_VIRT_OPS=y +CONFIG_SWIOTLB=y +# CONFIG_DMA_API_DEBUG is not set +CONFIG_SGL_ALLOC=y +CONFIG_IOMMU_HELPER=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_LRU_CACHE=m +CONFIG_CLZ_TAB=y +CONFIG_IRQ_POLL=y +CONFIG_MPILIB=y +CONFIG_DIMLIB=y +CONFIG_LIBFDT=y +CONFIG_OID_REGISTRY=y +CONFIG_UCS2_STRING=y +CONFIG_HAVE_GENERIC_VDSO=y +CONFIG_GENERIC_GETTIMEOFDAY=y +CONFIG_FONT_SUPPORT=y +CONFIG_FONTS=y +# CONFIG_FONT_8x8 is not set +CONFIG_FONT_8x16=y +# CONFIG_FONT_6x11 is not set +# CONFIG_FONT_7x14 is not set +# CONFIG_FONT_PEARL_8x8 is not set +# CONFIG_FONT_ACORN_8x8 is not set +# CONFIG_FONT_MINI_4x6 is not set +# CONFIG_FONT_6x10 is not set +# CONFIG_FONT_10x18 is not set +# CONFIG_FONT_SUN8x16 is not set +# CONFIG_FONT_SUN12x22 is not set +# CONFIG_FONT_TER16x32 is not set +CONFIG_FONT_AUTOSELECT=y +CONFIG_SG_POOL=y 
+CONFIG_ARCH_HAS_PMEM_API=y +CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y +CONFIG_ARCH_HAS_UACCESS_MCSAFE=y +CONFIG_ARCH_STACKWALK=y +CONFIG_SBITMAP=y +CONFIG_PARMAN=m +CONFIG_OBJAGG=m +# CONFIG_STRING_SELFTEST is not set +# end of Library routines + +# +# Kernel hacking +# + +# +# printk and dmesg options +# +CONFIG_PRINTK_TIME=y +# CONFIG_PRINTK_CALLER is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 +CONFIG_CONSOLE_LOGLEVEL_QUIET=1 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 +# CONFIG_BOOT_PRINTK_DELAY is not set +CONFIG_DYNAMIC_DEBUG=y +# end of printk and dmesg options + +# +# Compile-time checks and compiler options +# +# CONFIG_DEBUG_INFO is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 +CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +CONFIG_DEBUG_FS=y +# CONFIG_HEADERS_INSTALL is not set +CONFIG_OPTIMIZE_INLINING=y +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +CONFIG_STACK_VALIDATION=y +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +# end of Compile-time checks and compiler options + +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_MISC=y + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_OWNER is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_OBJECTS is not set +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_VM is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +CONFIG_DEBUG_MEMORY_INIT=y +# CONFIG_DEBUG_PER_CPU_MAPS is not set +CONFIG_HAVE_ARCH_KASAN=y +CONFIG_CC_HAS_KASAN_GENERIC=y +# CONFIG_KASAN is not set +CONFIG_KASAN_STACK=1 +# end of Memory Debugging + +CONFIG_ARCH_HAS_KCOV=y +CONFIG_CC_HAS_SANCOV_TRACE_PC=y +# CONFIG_KCOV is not set +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Lockups and Hangs +# +CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y +CONFIG_HARDLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 +# CONFIG_WQ_WATCHDOG is not set +# end of Debug Lockups and Hangs + +# CONFIG_PANIC_ON_OOPS is not set +CONFIG_PANIC_ON_OOPS_VALUE=0 +CONFIG_PANIC_TIMEOUT=0 +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_INFO=y +CONFIG_SCHEDSTATS=y +CONFIG_SCHED_STACK_END_CHECK=y +# CONFIG_DEBUG_TIMEKEEPING is not set + +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +CONFIG_LOCK_DEBUGGING_SUPPORT=y +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +# end of Lock Debugging (spinlocks, mutexes, etc...) 
+ +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set +CONFIG_DEBUG_BUGVERBOSE=y +# CONFIG_DEBUG_LIST is not set +# CONFIG_DEBUG_PLIST is not set +# CONFIG_DEBUG_SG is not set +# CONFIG_DEBUG_NOTIFIERS is not set +# CONFIG_DEBUG_CREDENTIALS is not set + +# +# RCU Debugging +# +# CONFIG_RCU_PERF_TEST is not set +# CONFIG_RCU_TORTURE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# end of RCU Debugging + +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +CONFIG_FUNCTION_ERROR_INJECTION=y +# CONFIG_FAULT_INJECTION is not set +CONFIG_LATENCYTOP=y +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_FENTRY=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACER_MAX_TRACE=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y +CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_RING_BUFFER_ALLOW_SWAP=y +CONFIG_TRACING=y +CONFIG_GENERIC_TRACER=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y +# CONFIG_PREEMPTIRQ_EVENTS is not set +# CONFIG_IRQSOFF_TRACER is not set +CONFIG_SCHED_TRACER=y +CONFIG_HWLAT_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_TRACER_SNAPSHOT=y +# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +CONFIG_STACK_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KPROBE_EVENTS=y +# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set +CONFIG_UPROBE_EVENTS=y +CONFIG_BPF_EVENTS=y +CONFIG_DYNAMIC_EVENTS=y +CONFIG_PROBE_EVENTS=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_BPF_KPROBE_OVERRIDE=y +CONFIG_FTRACE_MCOUNT_RECORD=y +# CONFIG_FTRACE_STARTUP_TEST is not set +CONFIG_MMIOTRACE=y +CONFIG_TRACING_MAP=y +CONFIG_HIST_TRIGGERS=y +# CONFIG_MMIOTRACE_TEST is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_PREEMPTIRQ_DELAY_TEST is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_LKDTM=m +# CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_SORT is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_RBTREE_TEST is not set +# CONFIG_REED_SOLOMON_TEST is not set +# CONFIG_INTERVAL_TREE_TEST is not set +# CONFIG_PERCPU_TEST is not set +# CONFIG_ATOMIC64_SELFTEST is not set +# CONFIG_ASYNC_RAID6_TEST is not set +# CONFIG_TEST_HEXDUMP is not set +# CONFIG_TEST_STRING_HELPERS is not set +# CONFIG_TEST_STRSCPY is not set +# CONFIG_TEST_KSTRTOX is not set +# CONFIG_TEST_PRINTF is not set +# CONFIG_TEST_BITMAP is not set +# CONFIG_TEST_BITFIELD is not set +# CONFIG_TEST_UUID is not set +# CONFIG_TEST_XARRAY is not set +# CONFIG_TEST_OVERFLOW is not set +# CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set +# CONFIG_TEST_IDA is not set +# CONFIG_TEST_PARMAN is not set +# CONFIG_TEST_LKM is not set +# CONFIG_TEST_VMALLOC is not set +# CONFIG_TEST_USER_COPY is not set +# CONFIG_TEST_BPF is not set +# CONFIG_TEST_BLACKHOLE_DEV is not set +# 
CONFIG_FIND_BIT_BENCHMARK is not set +# CONFIG_TEST_FIRMWARE is not set +# CONFIG_TEST_SYSCTL is not set +# CONFIG_TEST_UDELAY is not set +# CONFIG_TEST_STATIC_KEYS is not set +# CONFIG_TEST_KMOD is not set +# CONFIG_TEST_MEMCAT_P is not set +# CONFIG_TEST_OBJAGG is not set +# CONFIG_TEST_STACKINIT is not set +# CONFIG_TEST_MEMINIT is not set +# CONFIG_MEMTEST is not set +# CONFIG_BUG_ON_DATA_CORRUPTION is not set +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# CONFIG_UBSAN is not set +CONFIG_UBSAN_ALIGNMENT=y +CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y +CONFIG_STRICT_DEVMEM=y +CONFIG_IO_STRICT_DEVMEM=y +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +# CONFIG_X86_VERBOSE_BOOTUP is not set +CONFIG_EARLY_PRINTK=y +# CONFIG_EARLY_PRINTK_DBGP is not set +# CONFIG_EARLY_PRINTK_USB_XDBC is not set +CONFIG_X86_PTDUMP_CORE=y +# CONFIG_X86_PTDUMP is not set +# CONFIG_EFI_PGT_DUMP is not set +CONFIG_DEBUG_WX=y +CONFIG_DOUBLEFAULT=y +# CONFIG_DEBUG_TLBFLUSH is not set +# CONFIG_IOMMU_DEBUG is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEBUG_BOOT_PARAMS=y +# CONFIG_CPA_DEBUG is not set +# CONFIG_DEBUG_ENTRY is not set +# CONFIG_DEBUG_NMI_SELFTEST is not set +# CONFIG_X86_DEBUG_FPU is not set +# CONFIG_PUNIT_ATOM_DEBUG is not set +CONFIG_UNWINDER_ORC=y +# CONFIG_UNWINDER_FRAME_POINTER is not set +# CONFIG_UNWINDER_GUESS is not set +# end of Kernel hacking diff --git a/linux-tkg/linux-tkg-config/5.4/config_hardened.x86_64 b/linux-tkg/linux-tkg-config/5.4/config_hardened.x86_64 new file mode 100644 index 0000000..ac94c74 --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.4/config_hardened.x86_64 @@ -0,0 +1,10527 @@ +# +# Automatically generated file; DO NOT EDIT. 
+# Linux/x86 5.4.0-rc8 Kernel Configuration +# + +# +# Compiler: gcc (GCC) 9.2.0 +# +CONFIG_CC_IS_GCC=y +CONFIG_GCC_VERSION=90200 +CONFIG_CLANG_VERSION=0 +CONFIG_CC_CAN_LINK=y +CONFIG_CC_HAS_ASM_GOTO=y +CONFIG_CC_HAS_ASM_INLINE=y +CONFIG_CC_HAS_WARN_MAYBE_UNINITIALIZED=y +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_EXTABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_INIT_ENV_ARG_LIMIT=32 +# CONFIG_COMPILE_TEST is not set +# CONFIG_HEADER_TEST is not set +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_BUILD_SALT="" +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_XZ=y +CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set +CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set +CONFIG_DEFAULT_HOSTNAME="archlinux" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +CONFIG_CROSS_MEMORY_ATTACH=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_HAVE_ARCH_AUDITSYSCALL=y +CONFIG_AUDITSYSCALL=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_GENERIC_IRQ_MIGRATION=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_SIM=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y +CONFIG_GENERIC_IRQ_RESERVATION_MODE=y +CONFIG_IRQ_FORCED_THREADING=y +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +# end of IRQ subsystem + +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_ARCH_CLOCKSOURCE_DATA=y +CONFIG_ARCH_CLOCKSOURCE_INIT=y +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y +CONFIG_GENERIC_CMOS_UPDATE=y + +# +# Timers subsystem +# +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ_COMMON=y +# CONFIG_HZ_PERIODIC is not set +CONFIG_NO_HZ_IDLE=y +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +# end of Timers subsystem + +# CONFIG_PREEMPT_NONE is not set +# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_PREEMPT=y +CONFIG_PREEMPT_COUNT=y +CONFIG_PREEMPTION=y + +# +# CPU/Task time and stats accounting +# +CONFIG_TICK_CPU_ACCOUNTING=y +# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_SCHED_AVG_IRQ=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_PSI=y +# CONFIG_PSI_DEFAULT_DISABLED is not set +# end of CPU/Task time and stats accounting + +CONFIG_CPU_ISOLATION=y + +# +# RCU Subsystem +# +CONFIG_PREEMPT_RCU=y +CONFIG_RCU_EXPERT=y +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_TASKS_RCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +CONFIG_RCU_FANOUT=64 +CONFIG_RCU_FANOUT_LEAF=16 +CONFIG_RCU_FAST_NO_HZ=y +CONFIG_RCU_BOOST=y +CONFIG_RCU_BOOST_DELAY=500 +# CONFIG_RCU_NOCB_CPU is not set +# end of RCU Subsystem + +CONFIG_BUILD_BIN2C=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_IKHEADERS is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y + +# +# Scheduler features +# +CONFIG_UCLAMP_TASK=y +CONFIG_UCLAMP_BUCKETS_COUNT=5 +# end of Scheduler features + 
+CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y +CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_NUMA_BALANCING=y +CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y +CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_SWAP_ENABLED=y +CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +# CONFIG_RT_GROUP_SCHED is not set +# CONFIG_UCLAMP_TASK_GROUP is not set +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +# CONFIG_USER_NS_UNPRIVILEGED is not set +CONFIG_PID_NS=y +CONFIG_NET_NS=y +# CONFIG_CHECKPOINT_RESTORE is not set +CONFIG_SCHED_AUTOGROUP=y +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_RD_XZ=y +CONFIG_RD_LZO=y +CONFIG_RD_LZ4=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_SYSCTL=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_HAVE_PCSPKR_PLATFORM=y +CONFIG_BPF=y +CONFIG_EXPERT=y +# CONFIG_UID16 is not set +CONFIG_MULTIUSER=y +CONFIG_SGETMASK_SYSCALL=y +# CONFIG_SYSFS_SYSCALL is not set +# CONFIG_SYSCTL_SYSCALL is not set +CONFIG_FHANDLE=y +CONFIG_POSIX_TIMERS=y +CONFIG_PRINTK=y +CONFIG_PRINTK_NMI=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_IO_URING=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_MEMBARRIER=y +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT_ALWAYS_ON=y +# CONFIG_USERFAULTFD is not set +CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_RSEQ=y +# CONFIG_DEBUG_RSEQ is not set +# CONFIG_EMBEDDED is not set +CONFIG_HAVE_PERF_EVENTS=y +# CONFIG_PC104 is not set + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +# end of Kernel Performance Events And Counters + +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +# CONFIG_SLAB_MERGE_DEFAULT is not set +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SLAB_CANARY=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_SLUB_CPU_PARTIAL=y +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +# end of General setup + +CONFIG_64BIT=y +CONFIG_X86_64=y +CONFIG_X86=y +CONFIG_INSTRUCTION_DECODER=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" +CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_MMU=y +CONFIG_ARCH_MMAP_RND_BITS_MIN=28 +CONFIG_ARCH_MMAP_RND_BITS_MAX=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_HAS_CPU_RELAX=y 
+CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_ARCH_HAS_FILTER_PGPROT=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_WANT_GENERAL_HUGETLB=y +CONFIG_ZONE_DMA32=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y +CONFIG_X86_64_SMP=y +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_DYNAMIC_PHYSICAL_MASK=y +CONFIG_PGTABLE_LEVELS=4 +CONFIG_CC_HAS_SANE_STACKPROTECTOR=y + +# +# Processor type and features +# +CONFIG_ZONE_DMA=y +CONFIG_SMP=y +CONFIG_X86_FEATURE_NAMES=y +CONFIG_X86_X2APIC=y +CONFIG_X86_MPPARSE=y +# CONFIG_GOLDFISH is not set +CONFIG_RETPOLINE=y +CONFIG_X86_CPU_RESCTRL=y +# CONFIG_X86_EXTENDED_PLATFORM is not set +CONFIG_X86_INTEL_LPSS=y +CONFIG_X86_AMD_PLATFORM_DEVICE=y +CONFIG_IOSF_MBI=y +# CONFIG_IOSF_MBI_DEBUG is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_PARAVIRT_XXL=y +# CONFIG_PARAVIRT_DEBUG is not set +CONFIG_PARAVIRT_SPINLOCKS=y +CONFIG_X86_HV_CALLBACK_VECTOR=y +CONFIG_XEN=y +CONFIG_XEN_PV=y +CONFIG_XEN_PV_SMP=y +CONFIG_XEN_DOM0=y +CONFIG_XEN_PVHVM=y +CONFIG_XEN_PVHVM_SMP=y +CONFIG_XEN_512GB=y +CONFIG_XEN_SAVE_RESTORE=y +# CONFIG_XEN_DEBUG_FS is not set +CONFIG_XEN_PVH=y +CONFIG_KVM_GUEST=y +CONFIG_ARCH_CPUIDLE_HALTPOLL=y +CONFIG_PVH=y +# CONFIG_KVM_DEBUG_FS is not set +CONFIG_PARAVIRT_TIME_ACCOUNTING=y +CONFIG_PARAVIRT_CLOCK=y +CONFIG_JAILHOUSE_GUEST=y +CONFIG_ACRN_GUEST=y +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +# CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_PROCESSOR_SELECT=y +CONFIG_CPU_SUP_INTEL=y +CONFIG_CPU_SUP_AMD=y +CONFIG_CPU_SUP_HYGON=y +CONFIG_CPU_SUP_CENTAUR=y +CONFIG_CPU_SUP_ZHAOXIN=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y +CONFIG_GART_IOMMU=y +CONFIG_CALGARY_IOMMU=y +CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y +# CONFIG_MAXSMP is not set +CONFIG_NR_CPUS_RANGE_BEGIN=2 +CONFIG_NR_CPUS_RANGE_END=512 +CONFIG_NR_CPUS_DEFAULT=64 +CONFIG_NR_CPUS=320 +CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y +CONFIG_SCHED_MC_PRIO=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y +CONFIG_X86_MCE_INJECT=m +CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# +CONFIG_PERF_EVENTS_INTEL_UNCORE=m +CONFIG_PERF_EVENTS_INTEL_RAPL=m +CONFIG_PERF_EVENTS_INTEL_CSTATE=m +CONFIG_PERF_EVENTS_AMD_POWER=m +# end of Performance monitoring + +CONFIG_X86_VSYSCALL_EMULATION=y +CONFIG_I8K=m +CONFIG_MICROCODE=y +CONFIG_MICROCODE_INTEL=y +CONFIG_MICROCODE_AMD=y +CONFIG_MICROCODE_OLD_INTERFACE=y +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +# CONFIG_X86_5LEVEL is not set +CONFIG_X86_DIRECT_GBPAGES=y +# CONFIG_X86_CPA_STATISTICS is not set +CONFIG_AMD_MEM_ENCRYPT=y +CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y +CONFIG_NUMA=y +CONFIG_AMD_NUMA=y +CONFIG_X86_64_ACPI_NUMA=y +CONFIG_NODES_SPAN_OTHER_NODES=y +# CONFIG_NUMA_EMU is not set +CONFIG_NODES_SHIFT=5 +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_MEMORY_PROBE=y 
+CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_X86_PMEM_LEGACY_DEVICE=y +CONFIG_X86_PMEM_LEGACY=m +CONFIG_X86_CHECK_BIOS_CORRUPTION=y +CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y +CONFIG_X86_RESERVE_LOW=64 +CONFIG_MTRR=y +CONFIG_MTRR_SANITIZER=y +CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 +CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 +CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y +CONFIG_ARCH_RANDOM=y +CONFIG_X86_SMAP=y +CONFIG_X86_INTEL_UMIP=y +# CONFIG_X86_INTEL_MPX is not set +CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +# CONFIG_X86_INTEL_TSX_MODE_OFF is not set +# CONFIG_X86_INTEL_TSX_MODE_ON is not set +CONFIG_X86_INTEL_TSX_MODE_AUTO=y +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_SECCOMP=y +# CONFIG_HZ_100 is not set +# CONFIG_HZ_250 is not set +CONFIG_HZ_300=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=300 +CONFIG_SCHED_HRTICK=y +# CONFIG_KEXEC is not set +# CONFIG_KEXEC_FILE is not set +CONFIG_CRASH_DUMP=y +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x1000000 +CONFIG_DYNAMIC_MEMORY_LAYOUT=y +CONFIG_RANDOMIZE_MEMORY=y +CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 +CONFIG_HOTPLUG_CPU=y +# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set +# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_COMPAT_VDSO is not set +# CONFIG_LEGACY_VSYSCALL_EMULATE is not set +# CONFIG_LEGACY_VSYSCALL_XONLY is not set +CONFIG_LEGACY_VSYSCALL_NONE=y +CONFIG_CMDLINE_BOOL=y +CONFIG_CMDLINE="pti=on page_alloc.shuffle=1" +# CONFIG_CMDLINE_OVERRIDE is not set +# CONFIG_MODIFY_LDT_SYSCALL is not set +CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set +# end of Processor type and features + +CONFIG_ARCH_HAS_ADD_PAGES=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_USE_PERCPU_NUMA_NODE_ID=y +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y +CONFIG_ARCH_ENABLE_THP_MIGRATION=y + +# +# Power management and ACPI options +# +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +# CONFIG_SUSPEND_SKIP_SYNC is not set +CONFIG_HIBERNATE_CALLBACKS=y +# CONFIG_HIBERNATION is not set +CONFIG_PM_SLEEP=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=100 +CONFIG_PM_WAKELOCKS_GC=y +CONFIG_PM=y +CONFIG_PM_DEBUG=y +CONFIG_PM_ADVANCED_DEBUG=y +# CONFIG_PM_TEST_SUSPEND is not set +CONFIG_PM_SLEEP_DEBUG=y +# CONFIG_DPM_WATCHDOG is not set +CONFIG_PM_TRACE=y +CONFIG_PM_TRACE_RTC=y +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y +CONFIG_PM_GENERIC_DOMAINS_SLEEP=y +CONFIG_PM_GENERIC_DOMAINS_OF=y +CONFIG_ENERGY_MODEL=y +CONFIG_ARCH_SUPPORTS_ACPI=y +CONFIG_ACPI=y +CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y +CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y +CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y +# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SPCR_TABLE=y +CONFIG_ACPI_LPIT=y +CONFIG_ACPI_SLEEP=y +# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y +# CONFIG_ACPI_EC_DEBUGFS is not set +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=y +CONFIG_ACPI_VIDEO=y +CONFIG_ACPI_FAN=y +CONFIG_ACPI_TAD=m +CONFIG_ACPI_DOCK=y +CONFIG_ACPI_CPU_FREQ_PSS=y +CONFIG_ACPI_PROCESSOR_CSTATE=y +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_CPPC_LIB=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_IPMI=m +CONFIG_ACPI_HOTPLUG_CPU=y +CONFIG_ACPI_PROCESSOR_AGGREGATOR=y +CONFIG_ACPI_THERMAL=y +CONFIG_ACPI_NUMA=y +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +# CONFIG_ACPI_DEBUG is not set 
+CONFIG_ACPI_PCI_SLOT=y +CONFIG_ACPI_CONTAINER=y +CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_HOTPLUG_IOAPIC=y +CONFIG_ACPI_SBS=m +CONFIG_ACPI_HED=y +# CONFIG_ACPI_CUSTOM_METHOD is not set +CONFIG_ACPI_BGRT=y +# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set +CONFIG_ACPI_NFIT=m +# CONFIG_NFIT_SECURITY_DEBUG is not set +CONFIG_ACPI_HMAT=y +CONFIG_HAVE_ACPI_APEI=y +CONFIG_HAVE_ACPI_APEI_NMI=y +CONFIG_ACPI_APEI=y +CONFIG_ACPI_APEI_GHES=y +CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_ACPI_APEI_MEMORY_FAILURE=y +CONFIG_ACPI_APEI_EINJ=m +CONFIG_ACPI_APEI_ERST_DEBUG=m +CONFIG_DPTF_POWER=m +CONFIG_ACPI_WATCHDOG=y +CONFIG_ACPI_EXTLOG=m +CONFIG_ACPI_ADXL=y +CONFIG_PMIC_OPREGION=y +CONFIG_CRC_PMIC_OPREGION=y +CONFIG_XPOWER_PMIC_OPREGION=y +CONFIG_BXT_WC_PMIC_OPREGION=y +CONFIG_CHT_WC_PMIC_OPREGION=y +CONFIG_CHT_DC_TI_PMIC_OPREGION=y +CONFIG_ACPI_CONFIGFS=m +CONFIG_TPS68470_PMIC_OPREGION=y +CONFIG_X86_PM_TIMER=y +CONFIG_SFI=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=m +CONFIG_CPU_FREQ_GOV_USERSPACE=m +CONFIG_CPU_FREQ_GOV_ONDEMAND=m +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + +# +# CPU frequency scaling drivers +# +CONFIG_CPUFREQ_DT=m +CONFIG_CPUFREQ_DT_PLATDEV=y +CONFIG_X86_INTEL_PSTATE=y +CONFIG_X86_PCC_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ_CPB=y +CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_AMD_FREQ_SENSITIVITY=m +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +CONFIG_X86_P4_CLOCKMOD=m + +# +# shared options +# +CONFIG_X86_SPEEDSTEP_LIB=m +# end of CPU Frequency scaling + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +CONFIG_CPU_IDLE_GOV_TEO=y +# CONFIG_CPU_IDLE_GOV_HALTPOLL is not set +CONFIG_HALTPOLL_CPUIDLE=y +# end of CPU Idle + +CONFIG_INTEL_IDLE=y +# end of Power management and ACPI options + +# +# Bus options (PCI etc.) +# +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_XEN=y +CONFIG_MMCONF_FAM10H=y +# CONFIG_PCI_CNB20LE_QUIRK is not set +# CONFIG_ISA_BUS is not set +CONFIG_ISA_DMA_API=y +CONFIG_AMD_NB=y +# CONFIG_X86_SYSFB is not set +# end of Bus options (PCI etc.) 
+ +# +# Binary Emulations +# +CONFIG_IA32_EMULATION=y +# CONFIG_X86_X32 is not set +CONFIG_COMPAT_32=y +CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y +CONFIG_SYSVIPC_COMPAT=y +# end of Binary Emulations + +CONFIG_X86_DEV_DMA_OPS=y + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DMIID=y +CONFIG_DMI_SYSFS=m +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=m +CONFIG_FW_CFG_SYSFS=m +# CONFIG_FW_CFG_SYSFS_CMDLINE is not set +CONFIG_GOOGLE_FIRMWARE=y +# CONFIG_GOOGLE_SMI is not set +CONFIG_GOOGLE_COREBOOT_TABLE=m +CONFIG_GOOGLE_MEMCONSOLE=m +# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set +CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m +CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m +CONFIG_GOOGLE_VPD=m + +# +# EFI (Extensible Firmware Interface) Support +# +# CONFIG_EFI_VARS is not set +CONFIG_EFI_ESRT=y +# CONFIG_EFI_FAKE_MEMMAP is not set +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_CAPSULE_LOADER=m +# CONFIG_EFI_TEST is not set +CONFIG_APPLE_PROPERTIES=y +CONFIG_RESET_ATTACK_MITIGATION=y +CONFIG_EFI_RCI2_TABLE=y +# end of EFI (Extensible Firmware Interface) Support + +CONFIG_UEFI_CPER=y +CONFIG_UEFI_CPER_X86=y +CONFIG_EFI_DEV_PATH_PARSER=y +CONFIG_EFI_EARLYCON=y + +# +# Tegra firmware driver +# +# end of Tegra firmware driver +# end of Firmware Drivers + +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_IRQFD=y +CONFIG_HAVE_KVM_IRQ_ROUTING=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_MMIO=y +CONFIG_KVM_ASYNC_PF=y +CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y +CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y +CONFIG_KVM_COMPAT=y +CONFIG_HAVE_KVM_IRQ_BYPASS=y +CONFIG_HAVE_KVM_NO_POLL=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +CONFIG_KVM_AMD_SEV=y +CONFIG_KVM_MMU_AUDIT=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_SCSI=m +CONFIG_VHOST_VSOCK=m +CONFIG_VHOST=m +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set + +# +# General architecture-dependent options +# +CONFIG_HOTPLUG_SMT=y +CONFIG_OPROFILE=m +# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_OPROFILE_NMI_TIMER=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_OPTPROBES=y +CONFIG_KPROBES_ON_FTRACE=y +CONFIG_UPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y +CONFIG_KRETPROBES=y +CONFIG_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_OPTPROBES=y +CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y +CONFIG_HAVE_NMI=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_SET_DIRECT_MAP=y +CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y +CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y +CONFIG_HAVE_ASM_MODVERSIONS=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_RSEQ=y +CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_PERF_EVENTS_NMI=y +CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y +CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y +CONFIG_HAVE_RCU_TABLE_FREE=y +CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y 
+CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP_FILTER=y +CONFIG_HAVE_ARCH_STACKLEAK=y +CONFIG_HAVE_STACKPROTECTOR=y +CONFIG_CC_HAS_STACKPROTECTOR_NONE=y +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y +CONFIG_HAVE_CONTEXT_TRACKING=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_MOVE_PMD=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_HAVE_ARCH_SOFT_DIRTY=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_HAVE_EXIT_THREAD=y +CONFIG_ARCH_MMAP_RND_BITS=32 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_COPY_THREAD_TLS=y +CONFIG_HAVE_STACK_VALIDATION=y +CONFIG_HAVE_RELIABLE_STACKTRACE=y +CONFIG_ISA_BUS_API=y +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +CONFIG_64BIT_TIME=y +CONFIG_COMPAT_32BIT_TIME=y +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_ARCH_HAS_REFCOUNT=y +CONFIG_REFCOUNT_FULL=y +CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y +CONFIG_ARCH_USE_MEMREMAP_PROT=y +CONFIG_LOCK_EVENT_COUNTS=y +CONFIG_ARCH_HAS_MEM_ENCRYPT=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +# end of GCOV-based kernel profiling + +CONFIG_PLUGIN_HOSTCC="g++" +CONFIG_HAVE_GCC_PLUGINS=y +CONFIG_GCC_PLUGINS=y + +# +# GCC plugins +# +# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set +CONFIG_GCC_PLUGIN_LATENT_ENTROPY=y +# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set +# end of GCC plugins +# end of General architecture-dependent options + +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULE_SIG_FORMAT=y +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +# CONFIG_MODVERSIONS is not set +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +# CONFIG_MODULE_SIG_FORCE is not set +CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set +# CONFIG_MODULE_SIG_SHA224 is not set +# CONFIG_MODULE_SIG_SHA256 is not set +# CONFIG_MODULE_SIG_SHA384 is not set +CONFIG_MODULE_SIG_SHA512=y +CONFIG_MODULE_SIG_HASH="sha512" +CONFIG_MODULE_COMPRESS=y +# CONFIG_MODULE_COMPRESS_GZIP is not set +CONFIG_MODULE_COMPRESS_XZ=y +# CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is not set +CONFIG_UNUSED_SYMBOLS=y +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLK_SCSI_REQUEST=y +CONFIG_BLK_DEV_BSG=y +CONFIG_BLK_DEV_BSGLIB=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_DEV_THROTTLING_LOW=y +# CONFIG_BLK_CMDLINE_PARSER is not set +CONFIG_BLK_WBT=y +CONFIG_BLK_CGROUP_IOLATENCY=y +# CONFIG_BLK_CGROUP_IOCOST is not set +CONFIG_BLK_WBT_MQ=y +CONFIG_BLK_DEBUG_FS=y +CONFIG_BLK_DEBUG_FS_ZONED=y +CONFIG_BLK_SED_OPAL=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_AIX_PARTITION=y +# CONFIG_OSF_PARTITION is not set +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y 
+CONFIG_SOLARIS_X86_PARTITION=y +# CONFIG_UNIXWARE_DISKLABEL is not set +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +# CONFIG_SGI_PARTITION is not set +# CONFIG_ULTRIX_PARTITION is not set +# CONFIG_SUN_PARTITION is not set +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set +# CONFIG_CMDLINE_PARTITION is not set +# end of Partition Types + +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_MQ_VIRTIO=y +CONFIG_BLK_MQ_RDMA=y +CONFIG_BLK_PM=y + +# +# IO Schedulers +# +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +# CONFIG_BFQ_CGROUP_DEBUG is not set +# end of IO Schedulers + +CONFIG_PREEMPT_NOTIFIERS=y +CONFIG_PADATA=y +CONFIG_ASN1=y +CONFIG_UNINLINE_SPIN_UNLOCK=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y +CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_FREEZER=y + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +CONFIG_BINFMT_MISC=y +CONFIG_COREDUMP=y +# end of Executable file formats + +# +# Memory Management options +# +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_HAVE_MEMBLOCK_NODE_MAP=y +CONFIG_HAVE_FAST_GUP=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_HAVE_BOOTMEM_INFO_NODE=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTPLUG_SPARSE=y +CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MEMORY_BALLOON=y +CONFIG_BALLOON_COMPACTION=y +CONFIG_COMPACTION=y +CONFIG_MIGRATION=y +CONFIG_CONTIG_ALLOC=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y +CONFIG_HWPOISON_INJECT=m +CONFIG_TRANSPARENT_HUGEPAGE=y +# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set +CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y +CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_THP_SWAP=y +CONFIG_TRANSPARENT_HUGE_PAGECACHE=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +# CONFIG_CMA is not set +CONFIG_ZSWAP=y +CONFIG_ZPOOL=y +CONFIG_ZBUD=y +CONFIG_Z3FOLD=y +CONFIG_ZSMALLOC=y +# CONFIG_PGTABLE_MAPPING is not set +# CONFIG_ZSMALLOC_STAT is not set +CONFIG_GENERIC_EARLY_IOREMAP=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set +CONFIG_ARCH_HAS_PTE_DEVMAP=y +CONFIG_ZONE_DEVICE=y +CONFIG_DEV_PAGEMAP_OPS=y +CONFIG_HMM_MIRROR=y +CONFIG_DEVICE_PRIVATE=y +CONFIG_FRAME_VECTOR=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y +CONFIG_ARCH_HAS_PKEYS=y +# CONFIG_PERCPU_STATS is not set +# CONFIG_GUP_BENCHMARK is not set +# CONFIG_READ_ONLY_THP_FOR_FS is not set +CONFIG_ARCH_HAS_PTE_SPECIAL=y +# end of Memory Management options + +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_INGRESS=y +CONFIG_NET_EGRESS=y +CONFIG_SKB_EXTENSIONS=y + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_DIAG=y +CONFIG_UNIX=y +CONFIG_UNIX_SCM=y +CONFIG_UNIX_DIAG=y +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y +CONFIG_XFRM_ALGO=m +CONFIG_XFRM_USER=m 
+CONFIG_XFRM_INTERFACE=m +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_XFRM_IPCOMP=m +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +# CONFIG_IP_FIB_TRIE_STATS is not set +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_CLASSID=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IP_TUNNEL=m +CONFIG_NET_IPGRE=m +# CONFIG_NET_IPGRE_BROADCAST is not set +CONFIG_IP_MROUTE_COMMON=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESP_OFFLOAD=m +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_DIAG=m +CONFIG_INET_TCP_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_INET_RAW_DIAG=m +CONFIG_INET_DIAG_DESTROY=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=y +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_NV=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m +CONFIG_TCP_CONG_CDG=m +CONFIG_TCP_CONG_BBR=m +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_RENO is not set +CONFIG_DEFAULT_TCP_CONG="cubic" +CONFIG_TCP_MD5SIG=y +CONFIG_IPV6=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESP_OFFLOAD=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_ILA=m +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y +CONFIG_IPV6_NDISC_NODETYPE=y +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IPV6_SEG6_HMAC=y +CONFIG_IPV6_SEG6_BPF=y +CONFIG_NETLABEL=y +CONFIG_NETWORK_SECMARK=y +CONFIG_NET_PTP_CLASSIFY=y +CONFIG_NETWORK_PHY_TIMESTAMPING=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_FAMILY_BRIDGE=y +CONFIG_NETFILTER_FAMILY_ARP=y +CONFIG_NETFILTER_NETLINK_ACCT=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NETFILTER_NETLINK_OSF=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_LOG_COMMON=m +CONFIG_NF_LOG_NETDEV=m +CONFIG_NETFILTER_CONNCOUNT=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_GRE=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m 
+CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y +CONFIG_NF_NAT=m +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_SET=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_CONNLIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_OBJREF=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NF_DUP_NETDEV=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES=m + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +CONFIG_NETFILTER_XT_CONNMARK=m +CONFIG_NETFILTER_XT_SET=m + +# +# Xtables targets +# +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LED=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_RATEEST=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ECN=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m 
+CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +# end of Core Netfilter Configuration + +CONFIG_IP_SET=m +CONFIG_IP_SET_MAX=256 +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPMARK=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_IPMAC=m +CONFIG_IP_SET_HASH_MAC=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=15 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_AH_ESP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_FO=m +CONFIG_IP_VS_OVF=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_MH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS MH scheduler +# +CONFIG_IP_VS_MH_TAB_INDEX=12 + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PE_SIP=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +CONFIG_NF_SOCKET_IPV4=m +CONFIG_NF_TPROXY_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_REJECT_IPV4=m +CONFIG_NFT_DUP_IPV4=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_FLOW_TABLE_IPV4=m +CONFIG_NF_DUP_IPV4=m +CONFIG_NF_LOG_ARP=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_REJECT_IPV4=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_SYNPROXY=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +# end of IP: Netfilter Configuration + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_SOCKET_IPV6=m +CONFIG_NF_TPROXY_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_REJECT_IPV6=m +CONFIG_NFT_DUP_IPV6=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NF_FLOW_TABLE_IPV6=m +CONFIG_NF_DUP_IPV6=m +CONFIG_NF_REJECT_IPV6=m +CONFIG_NF_LOG_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_SRH=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_TARGET_SYNPROXY=m +CONFIG_IP6_NF_MANGLE=m 
+CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_IP6_NF_TARGET_NPT=m +# end of IPv6: Netfilter Configuration + +CONFIG_NF_DEFRAG_IPV6=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_BRIDGE_REJECT=m +CONFIG_NF_LOG_BRIDGE=m +CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +# CONFIG_BPFILTER is not set +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m + +# +# DCCP CCIDs Configuration +# +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=y +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_TFRC_LIB=y +# end of DCCP CCIDs Configuration + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +# end of DCCP Kernel Hacking + +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_OBJCNT is not set +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set +CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set +CONFIG_SCTP_COOKIE_HMAC_MD5=y +CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +# CONFIG_RDS_DEBUG is not set +CONFIG_TIPC=m +CONFIG_TIPC_MEDIA_IB=y +CONFIG_TIPC_MEDIA_UDP=y +CONFIG_TIPC_DIAG=m +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_L2TP=m +# CONFIG_L2TP_DEBUGFS is not set +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_STP=m +CONFIG_GARP=m +CONFIG_MRP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_HAVE_NET_DSA=y +CONFIG_NET_DSA=m +CONFIG_NET_DSA_TAG_8021Q=m +CONFIG_NET_DSA_TAG_BRCM_COMMON=m +CONFIG_NET_DSA_TAG_BRCM=m +CONFIG_NET_DSA_TAG_BRCM_PREPEND=m +CONFIG_NET_DSA_TAG_GSWIP=m +CONFIG_NET_DSA_TAG_DSA=m +CONFIG_NET_DSA_TAG_EDSA=m +CONFIG_NET_DSA_TAG_MTK=m +CONFIG_NET_DSA_TAG_KSZ=m +CONFIG_NET_DSA_TAG_QCA=m +CONFIG_NET_DSA_TAG_LAN9303=m +CONFIG_NET_DSA_TAG_SJA1105=m +CONFIG_NET_DSA_TAG_TRAILER=m +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_VLAN_8021Q_MVRP=y +# CONFIG_DECNET is not set +CONFIG_LLC=m +CONFIG_LLC2=m +# CONFIG_ATALK is not set +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +CONFIG_PHONET=m +CONFIG_6LOWPAN=m +# CONFIG_6LOWPAN_DEBUGFS is not set +CONFIG_6LOWPAN_NHC=m +CONFIG_6LOWPAN_NHC_DEST=m +CONFIG_6LOWPAN_NHC_FRAGMENT=m +CONFIG_6LOWPAN_NHC_HOP=m +CONFIG_6LOWPAN_NHC_IPV6=m +CONFIG_6LOWPAN_NHC_MOBILITY=m +CONFIG_6LOWPAN_NHC_ROUTING=m +CONFIG_6LOWPAN_NHC_UDP=m +CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m +CONFIG_6LOWPAN_GHC_UDP=m +CONFIG_6LOWPAN_GHC_ICMPV6=m +CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m +CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m +CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y +CONFIG_IEEE802154_SOCKET=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m 
+CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_CBS=m +CONFIG_NET_SCH_ETF=m +CONFIG_NET_SCH_TAPRIO=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_SKBPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=y +CONFIG_NET_SCH_CAKE=m +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_HHF=m +CONFIG_NET_SCH_PIE=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_DEFAULT=y +# CONFIG_DEFAULT_FQ is not set +# CONFIG_DEFAULT_CODEL is not set +CONFIG_DEFAULT_FQ_CODEL=y +# CONFIG_DEFAULT_SFQ is not set +# CONFIG_DEFAULT_PFIFO_FAST is not set +CONFIG_DEFAULT_NET_SCH="fq_codel" + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=m +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_CANID=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_IPT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_MPLS=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_CTINFO=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +# CONFIG_NET_TC_SKB_EXT is not set +CONFIG_NET_SCH_FIFO=y +CONFIG_DCB=y +CONFIG_DNS_RESOLVER=m +CONFIG_BATMAN_ADV=m +CONFIG_BATMAN_ADV_BATMAN_V=y +CONFIG_BATMAN_ADV_BLA=y +CONFIG_BATMAN_ADV_DAT=y +CONFIG_BATMAN_ADV_NC=y +CONFIG_BATMAN_ADV_MCAST=y +# CONFIG_BATMAN_ADV_DEBUGFS is not set +# CONFIG_BATMAN_ADV_DEBUG is not set +CONFIG_BATMAN_ADV_SYSFS=y +# CONFIG_BATMAN_ADV_TRACING is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m +CONFIG_VSOCKETS=m +CONFIG_VSOCKETS_DIAG=m +CONFIG_VMWARE_VMCI_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS_COMMON=m +CONFIG_HYPERV_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m +CONFIG_HSR=m +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_NCSI=y +CONFIG_NCSI_OEM_CMD_GET_MAC=y +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_XPS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_STREAM_PARSER=y +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +CONFIG_NET_PKTGEN=m +CONFIG_NET_DROP_MONITOR=y +# end of Network testing +# end of Networking options + +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_MKISS=m +CONFIG_6PACK=m +CONFIG_BPQETHER=m +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m 
+CONFIG_BAYCOM_PAR=m +CONFIG_YAM=m +# end of AX.25 network device drivers + +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_GW=m +# CONFIG_CAN_J1939 is not set + +# +# CAN Device Drivers +# +CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_CALC_BITTIMING=y +CONFIG_CAN_FLEXCAN=m +CONFIG_CAN_GRCAN=m +CONFIG_CAN_JANZ_ICAN3=m +# CONFIG_CAN_KVASER_PCIEFD is not set +CONFIG_CAN_C_CAN=m +CONFIG_CAN_C_CAN_PLATFORM=m +CONFIG_CAN_C_CAN_PCI=m +CONFIG_CAN_CC770=m +# CONFIG_CAN_CC770_ISA is not set +CONFIG_CAN_CC770_PLATFORM=m +CONFIG_CAN_IFI_CANFD=m +CONFIG_CAN_M_CAN=m +# CONFIG_CAN_M_CAN_PLATFORM is not set +# CONFIG_CAN_M_CAN_TCAN4X5X is not set +CONFIG_CAN_PEAK_PCIEFD=m +CONFIG_CAN_SJA1000=m +CONFIG_CAN_EMS_PCI=m +# CONFIG_CAN_EMS_PCMCIA is not set +# CONFIG_CAN_F81601 is not set +CONFIG_CAN_KVASER_PCI=m +CONFIG_CAN_PEAK_PCI=m +CONFIG_CAN_PEAK_PCIEC=y +CONFIG_CAN_PEAK_PCMCIA=m +CONFIG_CAN_PLX_PCI=m +# CONFIG_CAN_SJA1000_ISA is not set +CONFIG_CAN_SJA1000_PLATFORM=m +CONFIG_CAN_SOFTING=m +CONFIG_CAN_SOFTING_CS=m + +# +# CAN SPI interfaces +# +CONFIG_CAN_HI311X=m +CONFIG_CAN_MCP251X=m +# end of CAN SPI interfaces + +# +# CAN USB interfaces +# +CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_EMS_USB=m +CONFIG_CAN_ESD_USB2=m +CONFIG_CAN_GS_USB=m +CONFIG_CAN_KVASER_USB=m +CONFIG_CAN_MCBA_USB=m +CONFIG_CAN_PEAK_USB=m +CONFIG_CAN_UCAN=m +# end of CAN USB interfaces + +# CONFIG_CAN_DEBUG_DEVICES is not set +# end of CAN Device Drivers + +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_BT_6LOWPAN=m +CONFIG_BT_LEDS=y +# CONFIG_BT_SELFTEST is not set +# CONFIG_BT_DEBUGFS is not set + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_QCA=m +CONFIG_BT_HCIBTUSB=m +CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_MTK=y +CONFIG_BT_HCIBTUSB_RTL=y +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_3WIRE=y +CONFIG_BT_HCIUART_INTEL=y +CONFIG_BT_HCIUART_BCM=y +CONFIG_BT_HCIUART_RTL=y +CONFIG_BT_HCIUART_QCA=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIUART_MRVL=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_BT_ATH3K=m +CONFIG_BT_WILINK=m +CONFIG_BT_MTKSDIO=m +CONFIG_BT_MTKUART=m +CONFIG_BT_HCIRSI=m +# end of Bluetooth device drivers + +CONFIG_AF_RXRPC=m +CONFIG_AF_RXRPC_IPV6=y +# CONFIG_AF_RXRPC_INJECT_LOSS is not set +# CONFIG_AF_RXRPC_DEBUG is not set +CONFIG_RXKAD=y +CONFIG_AF_KCM=m +CONFIG_STREAM_PARSER=y +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y +CONFIG_CFG80211=m +# CONFIG_NL80211_TESTMODE is not set +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +# CONFIG_CFG80211_CERTIFICATION_ONUS is not set +CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y +CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y +CONFIG_CFG80211_DEFAULT_PS=y +# CONFIG_CFG80211_DEBUGFS is not set +CONFIG_CFG80211_CRDA_SUPPORT=y +CONFIG_CFG80211_WEXT=y +CONFIG_CFG80211_WEXT_EXPORT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m 
+CONFIG_LIB80211_CRYPT_TKIP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +CONFIG_MAC80211_MESH=y +CONFIG_MAC80211_LEDS=y +# CONFIG_MAC80211_DEBUGFS is not set +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_WIMAX=m +CONFIG_WIMAX_DEBUG_LEVEL=8 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=m +CONFIG_NET_9P=m +CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_XEN=m +CONFIG_NET_9P_RDMA=m +# CONFIG_NET_9P_DEBUG is not set +CONFIG_CAIF=m +# CONFIG_CAIF_DEBUG is not set +CONFIG_CAIF_NETDEV=m +CONFIG_CAIF_USB=m +CONFIG_CEPH_LIB=m +CONFIG_CEPH_LIB_PRETTYDEBUG=y +CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y +CONFIG_NFC=m +CONFIG_NFC_DIGITAL=m +CONFIG_NFC_NCI=m +CONFIG_NFC_NCI_SPI=m +CONFIG_NFC_NCI_UART=m +CONFIG_NFC_HCI=m +CONFIG_NFC_SHDLC=y + +# +# Near Field Communication (NFC) devices +# +CONFIG_NFC_TRF7970A=m +CONFIG_NFC_MEI_PHY=m +CONFIG_NFC_SIM=m +CONFIG_NFC_PORT100=m +CONFIG_NFC_FDP=m +CONFIG_NFC_FDP_I2C=m +CONFIG_NFC_PN544=m +CONFIG_NFC_PN544_I2C=m +CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m +CONFIG_NFC_MICROREAD=m +CONFIG_NFC_MICROREAD_I2C=m +CONFIG_NFC_MICROREAD_MEI=m +CONFIG_NFC_MRVL=m +CONFIG_NFC_MRVL_USB=m +CONFIG_NFC_MRVL_UART=m +CONFIG_NFC_MRVL_I2C=m +CONFIG_NFC_MRVL_SPI=m +CONFIG_NFC_ST21NFCA=m +CONFIG_NFC_ST21NFCA_I2C=m +CONFIG_NFC_ST_NCI=m +CONFIG_NFC_ST_NCI_I2C=m +CONFIG_NFC_ST_NCI_SPI=m +CONFIG_NFC_NXP_NCI=m +CONFIG_NFC_NXP_NCI_I2C=m +CONFIG_NFC_S3FWRN5=m +CONFIG_NFC_S3FWRN5_I2C=m +CONFIG_NFC_ST95HF=m +# end of Near Field Communication (NFC) devices + +CONFIG_PSAMPLE=m +CONFIG_NET_IFE=m +CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y +CONFIG_SOCK_VALIDATE_XMIT=y +CONFIG_NET_SOCK_MSG=y +CONFIG_NET_DEVLINK=y +CONFIG_PAGE_POOL=y +CONFIG_FAILOVER=m +CONFIG_HAVE_EBPF_JIT=y + +# +# Device Drivers +# +CONFIG_HAVE_EISA=y +# CONFIG_EISA is not set +CONFIG_HAVE_PCI=y +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI_PCIE=y +CONFIG_PCIEAER=y +# CONFIG_PCIEAER_INJECT is not set +CONFIG_PCIE_ECRC=y +CONFIG_PCIEASPM=y +# CONFIG_PCIEASPM_DEBUG is not set +CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_POWERSAVE is not set +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_PTM=y +# CONFIG_PCIE_BW is not set +CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +CONFIG_PCI_QUIRKS=y +# CONFIG_PCI_DEBUG is not set +CONFIG_PCI_REALLOC_ENABLE_AUTO=y +CONFIG_PCI_STUB=y +CONFIG_PCI_PF_STUB=m +CONFIG_XEN_PCIDEV_FRONTEND=m +CONFIG_PCI_ATS=y +CONFIG_PCI_ECAM=y +CONFIG_PCI_LOCKLESS_CONFIG=y +CONFIG_PCI_IOV=y +CONFIG_PCI_PRI=y +CONFIG_PCI_PASID=y +CONFIG_PCI_P2PDMA=y +CONFIG_PCI_LABEL=y +CONFIG_PCI_HYPERV=m +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +CONFIG_HOTPLUG_PCI_ACPI_IBM=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m +CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +CONFIG_HOTPLUG_PCI_SHPC=y + +# +# PCI controller drivers +# + +# +# Cadence PCIe controllers support +# +CONFIG_PCIE_CADENCE=y +CONFIG_PCIE_CADENCE_HOST=y +CONFIG_PCIE_CADENCE_EP=y +# end of Cadence PCIe controllers support + +CONFIG_PCI_FTPCI100=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCIE_XILINX=y +CONFIG_VMD=m +CONFIG_PCI_HYPERV_INTERFACE=m + +# +# DesignWare PCI Core Support 
+# +CONFIG_PCIE_DW=y +CONFIG_PCIE_DW_HOST=y +CONFIG_PCIE_DW_EP=y +CONFIG_PCIE_DW_PLAT=y +CONFIG_PCIE_DW_PLAT_HOST=y +CONFIG_PCIE_DW_PLAT_EP=y +CONFIG_PCI_MESON=y +# end of DesignWare PCI Core Support +# end of PCI controller drivers + +# +# PCI Endpoint +# +CONFIG_PCI_ENDPOINT=y +CONFIG_PCI_ENDPOINT_CONFIGFS=y +# CONFIG_PCI_EPF_TEST is not set +# end of PCI Endpoint + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m +# end of PCI switch controller drivers + +CONFIG_PCCARD=m +CONFIG_PCMCIA=m +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=m +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +CONFIG_PD6729=m +CONFIG_I82092=m +CONFIG_PCCARD_NONSTATIC=y +CONFIG_RAPIDIO=m +CONFIG_RAPIDIO_TSI721=m +CONFIG_RAPIDIO_DISC_TIMEOUT=30 +CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y +CONFIG_RAPIDIO_DMA_ENGINE=y +# CONFIG_RAPIDIO_DEBUG is not set +CONFIG_RAPIDIO_ENUM_BASIC=m +CONFIG_RAPIDIO_CHMAN=m +CONFIG_RAPIDIO_MPORT_CDEV=m + +# +# RapidIO Switch drivers +# +CONFIG_RAPIDIO_TSI57X=m +CONFIG_RAPIDIO_CPS_XX=m +CONFIG_RAPIDIO_TSI568=m +CONFIG_RAPIDIO_CPS_GEN2=m +CONFIG_RAPIDIO_RXS_GEN3=m +# end of RapidIO Switch drivers + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y + +# +# Firmware loader +# +CONFIG_FW_LOADER=y +CONFIG_FW_LOADER_PAGED_BUF=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_FW_LOADER_USER_HELPER is not set +CONFIG_FW_LOADER_COMPRESS=y +# end of Firmware loader + +CONFIG_WANT_DEV_COREDUMP=y +CONFIG_ALLOW_DEV_COREDUMP=y +CONFIG_DEV_COREDUMP=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +CONFIG_HMEM_REPORTING=y +# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set +CONFIG_SYS_HYPERVISOR=y +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=y +CONFIG_REGMAP_SLIMBUS=m +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_SPMI=m +CONFIG_REGMAP_W1=m +CONFIG_REGMAP_MMIO=y +CONFIG_REGMAP_IRQ=y +CONFIG_REGMAP_SCCB=m +CONFIG_REGMAP_I3C=m +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +# end of Generic Driver Options + +# +# Bus devices +# +# CONFIG_MOXTET is not set +CONFIG_SIMPLE_PM_BUS=y +# end of Bus devices + +CONFIG_CONNECTOR=y +CONFIG_PROC_EVENTS=y +CONFIG_GNSS=m +CONFIG_GNSS_SERIAL=m +CONFIG_GNSS_MTK_SERIAL=m +CONFIG_GNSS_SIRF_SERIAL=m +CONFIG_GNSS_UBX_SERIAL=m +CONFIG_MTD=m +CONFIG_MTD_TESTS=m + +# +# Partition parsers +# +CONFIG_MTD_AR7_PARTS=m +CONFIG_MTD_CMDLINE_PARTS=m +CONFIG_MTD_OF_PARTS=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 +# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set +# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set +# end of Partition parsers + +# +# User Modules And Translation Layers +# +CONFIG_MTD_BLKDEVS=m +CONFIG_MTD_BLOCK=m +CONFIG_MTD_BLOCK_RO=m +CONFIG_FTL=m +CONFIG_NFTL=m +CONFIG_NFTL_RW=y +CONFIG_INFTL=m +CONFIG_RFD_FTL=m +CONFIG_SSFDC=m +CONFIG_SM_FTL=m +CONFIG_MTD_OOPS=m +CONFIG_MTD_SWAP=m +CONFIG_MTD_PARTITIONED_MASTER=y + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +CONFIG_MTD_CFI_INTELEXT=m +CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +CONFIG_MTD_CFI_UTIL=m +CONFIG_MTD_RAM=m 
+CONFIG_MTD_ROM=m +CONFIG_MTD_ABSENT=m +# end of RAM/ROM/Flash chip drivers + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +# CONFIG_MTD_PHYSMAP_COMPAT is not set +CONFIG_MTD_PHYSMAP_OF=y +CONFIG_MTD_PHYSMAP_VERSATILE=y +CONFIG_MTD_PHYSMAP_GEMINI=y +CONFIG_MTD_PHYSMAP_GPIO_ADDR=y +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICHXROM=m +CONFIG_MTD_ESB2ROM=m +CONFIG_MTD_CK804XROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m +CONFIG_MTD_PCMCIA=m +# CONFIG_MTD_PCMCIA_ANONYMOUS is not set +CONFIG_MTD_INTEL_VR_NOR=m +CONFIG_MTD_PLATRAM=m +# end of Mapping drivers for chip access + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m +# CONFIG_MTD_PMC551_BUGFIX is not set +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_DATAFLASH=m +# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set +CONFIG_MTD_DATAFLASH_OTP=y +CONFIG_MTD_MCHP23K256=m +CONFIG_MTD_SST25L=m +CONFIG_MTD_SLRAM=m +CONFIG_MTD_PHRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLOCK2MTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOCG3=m +CONFIG_BCH_CONST_M=14 +CONFIG_BCH_CONST_T=4 +# end of Self-contained MTD device drivers + +CONFIG_MTD_NAND_CORE=m +CONFIG_MTD_ONENAND=m +# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set +CONFIG_MTD_ONENAND_GENERIC=m +CONFIG_MTD_ONENAND_OTP=y +CONFIG_MTD_ONENAND_2X_PROGRAM=y +CONFIG_MTD_NAND_ECC_SW_HAMMING=m +CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y +CONFIG_MTD_RAW_NAND=m +CONFIG_MTD_NAND_ECC_SW_BCH=y + +# +# Raw/parallel NAND flash controllers +# +CONFIG_MTD_NAND_DENALI=m +CONFIG_MTD_NAND_DENALI_PCI=m +CONFIG_MTD_NAND_DENALI_DT=m +CONFIG_MTD_NAND_CAFE=m +# CONFIG_MTD_NAND_MXIC is not set +CONFIG_MTD_NAND_GPIO=m +CONFIG_MTD_NAND_PLATFORM=m + +# +# Misc +# +CONFIG_MTD_SM_COMMON=m +CONFIG_MTD_NAND_NANDSIM=m +CONFIG_MTD_NAND_RICOH=m +CONFIG_MTD_NAND_DISKONCHIP=m +# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 +CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y +CONFIG_MTD_SPI_NAND=m + +# +# LPDDR & LPDDR2 PCM memory drivers +# +CONFIG_MTD_LPDDR=m +CONFIG_MTD_QINFO_PROBE=m +# end of LPDDR & LPDDR2 PCM memory drivers + +CONFIG_MTD_SPI_NOR=m +CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y +CONFIG_SPI_MTK_QUADSPI=m +CONFIG_SPI_INTEL_SPI=m +CONFIG_SPI_INTEL_SPI_PCI=m +CONFIG_SPI_INTEL_SPI_PLATFORM=m +CONFIG_MTD_UBI=m +CONFIG_MTD_UBI_WL_THRESHOLD=4096 +CONFIG_MTD_UBI_BEB_LIMIT=20 +CONFIG_MTD_UBI_FASTMAP=y +CONFIG_MTD_UBI_GLUEBI=m +CONFIG_MTD_UBI_BLOCK=y +CONFIG_MTD_HYPERBUS=m +CONFIG_DTC=y +CONFIG_OF=y +# CONFIG_OF_UNITTEST is not set +CONFIG_OF_FLATTREE=y +CONFIG_OF_KOBJ=y +CONFIG_OF_DYNAMIC=y +CONFIG_OF_ADDRESS=y +CONFIG_OF_IRQ=y +CONFIG_OF_NET=y +CONFIG_OF_MDIO=m +CONFIG_OF_RESOLVE=y +CONFIG_OF_OVERLAY=y +CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_PC_PCMCIA=m +CONFIG_PARPORT_AX88796=m +CONFIG_PARPORT_1284=y +CONFIG_PARPORT_NOT_PC=y +CONFIG_PNP=y +CONFIG_PNP_DEBUG_MESSAGES=y + +# +# Protocols +# +CONFIG_PNPACPI=y +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_BLK_DEV_FD=m +CONFIG_CDROM=m +# CONFIG_PARIDE is not set +CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m +CONFIG_ZRAM=m +CONFIG_ZRAM_WRITEBACK=y +# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m +# 
CONFIG_DRBD_FAULT_INJECTION is not set +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_SKD=m +CONFIG_BLK_DEV_SX8=m +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_CDROM_PKTCDVD=m +CONFIG_CDROM_PKTCDVD_BUFFERS=8 +# CONFIG_CDROM_PKTCDVD_WCACHE is not set +CONFIG_ATA_OVER_ETH=m +CONFIG_XEN_BLKDEV_FRONTEND=m +CONFIG_XEN_BLKDEV_BACKEND=m +CONFIG_VIRTIO_BLK=m +# CONFIG_VIRTIO_BLK_SCSI is not set +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_RSXX=m + +# +# NVME Support +# +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y +CONFIG_NVME_MULTIPATH=y +CONFIG_NVME_FABRICS=m +CONFIG_NVME_RDMA=m +CONFIG_NVME_FC=m +CONFIG_NVME_TCP=m +CONFIG_NVME_TARGET=m +CONFIG_NVME_TARGET_LOOP=m +CONFIG_NVME_TARGET_RDMA=m +CONFIG_NVME_TARGET_FC=m +CONFIG_NVME_TARGET_FCLOOP=m +CONFIG_NVME_TARGET_TCP=m +# end of NVME Support + +# +# Misc devices +# +CONFIG_SENSORS_LIS3LV02D=m +CONFIG_AD525X_DPOT=m +CONFIG_AD525X_DPOT_I2C=m +CONFIG_AD525X_DPOT_SPI=m +# CONFIG_DUMMY_IRQ is not set +CONFIG_IBM_ASM=m +CONFIG_PHANTOM=m +CONFIG_TIFM_CORE=m +CONFIG_TIFM_7XX1=m +CONFIG_ICS932S401=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_HP_ILO=m +CONFIG_APDS9802ALS=m +CONFIG_ISL29003=m +CONFIG_ISL29020=m +CONFIG_SENSORS_TSL2550=m +CONFIG_SENSORS_BH1770=m +CONFIG_SENSORS_APDS990X=m +CONFIG_HMC6352=m +CONFIG_DS1682=m +CONFIG_VMWARE_BALLOON=m +CONFIG_LATTICE_ECP3_CONFIG=m +# CONFIG_SRAM is not set +CONFIG_PCI_ENDPOINT_TEST=m +CONFIG_XILINX_SDFEC=m +CONFIG_MISC_RTSX=m +CONFIG_PVPANIC=m +CONFIG_C2PORT=m +CONFIG_C2PORT_DURAMAR_2150=m + +# +# EEPROM support +# +CONFIG_EEPROM_AT24=m +# CONFIG_EEPROM_AT25 is not set +CONFIG_EEPROM_LEGACY=m +CONFIG_EEPROM_MAX6875=m +CONFIG_EEPROM_93CX6=m +# CONFIG_EEPROM_93XX46 is not set +CONFIG_EEPROM_IDT_89HPESX=m +CONFIG_EEPROM_EE1004=m +# end of EEPROM support + +CONFIG_CB710_CORE=m +# CONFIG_CB710_DEBUG is not set +CONFIG_CB710_DEBUG_ASSUMPTIONS=y + +# +# Texas Instruments shared transport line discipline +# +CONFIG_TI_ST=m +# end of Texas Instruments shared transport line discipline + +CONFIG_SENSORS_LIS3_I2C=m +CONFIG_ALTERA_STAPL=m +CONFIG_INTEL_MEI=m +CONFIG_INTEL_MEI_ME=m +CONFIG_INTEL_MEI_TXE=m +CONFIG_INTEL_MEI_HDCP=m +CONFIG_VMWARE_VMCI=m + +# +# Intel MIC & related support +# + +# +# Intel MIC Bus Driver +# +CONFIG_INTEL_MIC_BUS=m + +# +# SCIF Bus Driver +# +CONFIG_SCIF_BUS=m + +# +# VOP Bus Driver +# +CONFIG_VOP_BUS=m + +# +# Intel MIC Host Driver +# +CONFIG_INTEL_MIC_HOST=m + +# +# Intel MIC Card Driver +# +CONFIG_INTEL_MIC_CARD=m + +# +# SCIF Driver +# +CONFIG_SCIF=m + +# +# Intel MIC Coprocessor State Management (COSM) Drivers +# +CONFIG_MIC_COSM=m + +# +# VOP Driver +# +CONFIG_VOP=m +CONFIG_VHOST_RING=m +# end of Intel MIC & related support + +CONFIG_GENWQE=m +CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 +CONFIG_ECHO=m +CONFIG_MISC_ALCOR_PCI=m +CONFIG_MISC_RTSX_PCI=m +CONFIG_MISC_RTSX_USB=m +CONFIG_HABANA_AI=m +# end of Misc devices + +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +CONFIG_SCSI_MOD=m +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=m +CONFIG_SCSI_DMA=y +CONFIG_SCSI_NETLINK=y +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=m +CONFIG_CHR_DEV_ST=m +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y + +# +# SCSI Transports +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SAS_ATA=y 
+CONFIG_SCSI_SAS_HOST_SMP=y +CONFIG_SCSI_SRP_ATTRS=m +# end of SCSI Transports + +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=m +CONFIG_ISCSI_BOOT_SYSFS=m +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_SCSI_BNX2X_FCOE=m +CONFIG_BE2ISCSI=m +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m +CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +CONFIG_AIC7XXX_DEBUG_ENABLE=y +CONFIG_AIC7XXX_DEBUG_MASK=0 +CONFIG_AIC7XXX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +CONFIG_AIC79XX_DEBUG_ENABLE=y +CONFIG_AIC79XX_DEBUG_MASK=0 +CONFIG_AIC79XX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC94XX=m +CONFIG_AIC94XX_DEBUG=y +CONFIG_SCSI_MVSAS=m +CONFIG_SCSI_MVSAS_DEBUG=y +CONFIG_SCSI_MVSAS_TASKLET=y +CONFIG_SCSI_MVUMI=m +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_ARCMSR=m +CONFIG_SCSI_ESAS2R=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT3SAS=m +CONFIG_SCSI_MPT2SAS_MAX_SGE=128 +CONFIG_SCSI_MPT3SAS_MAX_SGE=128 +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_SMARTPQI=m +CONFIG_SCSI_UFSHCD=m +CONFIG_SCSI_UFSHCD_PCI=m +# CONFIG_SCSI_UFS_DWC_TC_PCI is not set +CONFIG_SCSI_UFSHCD_PLATFORM=m +CONFIG_SCSI_UFS_CDNS_PLATFORM=m +# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set +CONFIG_SCSI_UFS_BSG=y +CONFIG_SCSI_HPTIOP=m +CONFIG_SCSI_BUSLOGIC=m +CONFIG_SCSI_FLASHPOINT=y +CONFIG_SCSI_MYRB=m +CONFIG_SCSI_MYRS=m +CONFIG_VMWARE_PVSCSI=m +CONFIG_XEN_SCSI_FRONTEND=m +CONFIG_HYPERV_STORAGE=m +CONFIG_LIBFC=m +CONFIG_LIBFCOE=m +CONFIG_FCOE=m +CONFIG_FCOE_FNIC=m +CONFIG_SCSI_SNIC=m +# CONFIG_SCSI_SNIC_DEBUG_FS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_FDOMAIN=m +CONFIG_SCSI_FDOMAIN_PCI=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_ISCI=m +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_STEX=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +CONFIG_SCSI_SYM53C8XX_MMIO=y +CONFIG_SCSI_IPR=m +CONFIG_SCSI_IPR_TRACE=y +CONFIG_SCSI_IPR_DUMP=y +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA_FC=m +CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_QEDI=m +CONFIG_QEDF=m +CONFIG_SCSI_LPFC=m +# CONFIG_SCSI_LPFC_DEBUG_FS is not set +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_WD719X=m +CONFIG_SCSI_DEBUG=m +CONFIG_SCSI_PMCRAID=m +CONFIG_SCSI_PM8001=m +CONFIG_SCSI_BFA_FC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_CHELSIO_FCOE=m +CONFIG_SCSI_LOWLEVEL_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_QLOGIC=m +CONFIG_PCMCIA_SYM53C500=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +# end of SCSI device support + +CONFIG_ATA=m +CONFIG_ATA_VERBOSE_ERROR=y +CONFIG_ATA_ACPI=y +CONFIG_SATA_ZPODD=y +CONFIG_SATA_PMP=y + +# +# Controllers with non-SFF native interface +# +CONFIG_SATA_AHCI=m +CONFIG_SATA_MOBILE_LPM_POLICY=3 +CONFIG_SATA_AHCI_PLATFORM=m +CONFIG_AHCI_CEVA=m +CONFIG_AHCI_QORIQ=m +CONFIG_SATA_INIC162X=m +CONFIG_SATA_ACARD_AHCI=m +CONFIG_SATA_SIL24=m +CONFIG_ATA_SFF=y + +# +# SFF controllers with custom DMA interface +# +CONFIG_PDC_ADMA=m +CONFIG_SATA_QSTOR=m 
+CONFIG_SATA_SX4=m +CONFIG_ATA_BMDMA=y + +# +# SATA SFF controllers with BMDMA +# +CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set +CONFIG_SATA_MV=m +CONFIG_SATA_NV=m +CONFIG_SATA_PROMISE=m +CONFIG_SATA_SIL=m +CONFIG_SATA_SIS=m +CONFIG_SATA_SVW=m +CONFIG_SATA_ULI=m +CONFIG_SATA_VIA=m +CONFIG_SATA_VITESSE=m + +# +# PATA SFF controllers with BMDMA +# +CONFIG_PATA_ALI=m +CONFIG_PATA_AMD=m +CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATIIXP=m +CONFIG_PATA_ATP867X=m +CONFIG_PATA_CMD64X=m +CONFIG_PATA_CYPRESS=m +CONFIG_PATA_EFAR=m +CONFIG_PATA_HPT366=m +CONFIG_PATA_HPT37X=m +CONFIG_PATA_HPT3X2N=m +CONFIG_PATA_HPT3X3=m +CONFIG_PATA_HPT3X3_DMA=y +CONFIG_PATA_IT8213=m +CONFIG_PATA_IT821X=m +CONFIG_PATA_JMICRON=m +CONFIG_PATA_MARVELL=m +CONFIG_PATA_NETCELL=m +CONFIG_PATA_NINJA32=m +CONFIG_PATA_NS87415=m +CONFIG_PATA_OLDPIIX=m +CONFIG_PATA_OPTIDMA=m +CONFIG_PATA_PDC2027X=m +CONFIG_PATA_PDC_OLD=m +CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m +CONFIG_PATA_SCH=m +CONFIG_PATA_SERVERWORKS=m +CONFIG_PATA_SIL680=m +CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m +CONFIG_PATA_TRIFLEX=m +CONFIG_PATA_VIA=m +CONFIG_PATA_WINBOND=m + +# +# PIO-only SFF controllers +# +CONFIG_PATA_CMD640_PCI=m +CONFIG_PATA_MPIIX=m +CONFIG_PATA_NS87410=m +CONFIG_PATA_OPTI=m +CONFIG_PATA_PCMCIA=m +# CONFIG_PATA_PLATFORM is not set +CONFIG_PATA_RZ1000=m + +# +# Generic fallback / legacy drivers +# +CONFIG_PATA_ACPI=m +CONFIG_ATA_GENERIC=m +CONFIG_PATA_LEGACY=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +# CONFIG_BCACHE_DEBUG is not set +# CONFIG_BCACHE_CLOSURES_DEBUG is not set +CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +# CONFIG_DM_DEBUG is not set +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m +CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_CACHE=m +CONFIG_DM_CACHE_SMQ=m +CONFIG_DM_WRITECACHE=m +CONFIG_DM_ERA=m +# CONFIG_DM_CLONE is not set +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_DELAY=m +CONFIG_DM_DUST=m +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DM_SWITCH=m +CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_TARGET_CORE=m +CONFIG_TCM_IBLOCK=m +CONFIG_TCM_FILEIO=m +CONFIG_TCM_PSCSI=m +CONFIG_TCM_USER2=m +CONFIG_LOOPBACK_TARGET=m +CONFIG_TCM_FC=m +CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m +CONFIG_SBP_TARGET=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m +CONFIG_FUSION_MAX_SGE=128 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set + +# +# IEEE 1394 (FireWire) support +# +CONFIG_FIREWIRE=m +CONFIG_FIREWIRE_OHCI=m +CONFIG_FIREWIRE_SBP2=m +CONFIG_FIREWIRE_NET=m +CONFIG_FIREWIRE_NOSY=m +# end of IEEE 1394 (FireWire) support + +CONFIG_MACINTOSH_DRIVERS=y +CONFIG_MAC_EMUMOUSEBTN=m +CONFIG_NETDEVICES=y +CONFIG_MII=m +CONFIG_NET_CORE=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_EQUALIZER=m +CONFIG_NET_FC=y +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m 
+CONFIG_NET_TEAM_MODE_LOADBALANCE=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_IPVLAN_L3S=y +CONFIG_IPVLAN=m +CONFIG_IPVTAP=m +CONFIG_VXLAN=m +CONFIG_GENEVE=m +CONFIG_GTP=m +CONFIG_MACSEC=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +CONFIG_NTB_NETDEV=m +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 +CONFIG_TUN=m +CONFIG_TAP=m +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m +CONFIG_SUNGEM_PHY=m +# CONFIG_ARCNET is not set +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m +# CONFIG_ATM_NICSTAR_USE_SUNI is not set +# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E=m +CONFIG_ATM_FORE200E_USE_TASKLET=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y +CONFIG_ATM_SOLOS=m + +# +# CAIF transport drivers +# +CONFIG_CAIF_TTY=m +CONFIG_CAIF_SPI_SLAVE=m +CONFIG_CAIF_SPI_SYNC=y +CONFIG_CAIF_HSI=m +CONFIG_CAIF_VIRTIO=m + +# +# Distributed Switch Architecture drivers +# +CONFIG_B53=m +# CONFIG_B53_SPI_DRIVER is not set +CONFIG_B53_MDIO_DRIVER=m +CONFIG_B53_MMAP_DRIVER=m +CONFIG_B53_SRAB_DRIVER=m +CONFIG_B53_SERDES=m +CONFIG_NET_DSA_BCM_SF2=m +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_LANTIQ_GSWIP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_MV88E6060=m +CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477=m +# CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C is not set +CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m +# CONFIG_NET_DSA_MICROCHIP_KSZ8795 is not set +CONFIG_NET_DSA_MV88E6XXX=m +CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y +CONFIG_NET_DSA_MV88E6XXX_PTP=y +CONFIG_NET_DSA_SJA1105=m +CONFIG_NET_DSA_SJA1105_PTP=y +# CONFIG_NET_DSA_SJA1105_TAS is not set +CONFIG_NET_DSA_QCA8K=m +CONFIG_NET_DSA_REALTEK_SMI=m +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m +CONFIG_NET_DSA_VITESSE_VSC73XX=m +CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m +CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m +# end of Distributed Switch Architecture drivers + +CONFIG_ETHERNET=y +CONFIG_MDIO=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_3C589=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_NET_VENDOR_ADAPTEC=y +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_NET_VENDOR_AGERE=y +CONFIG_ET131X=m +CONFIG_NET_VENDOR_ALACRITECH=y +CONFIG_SLICOSS=m +CONFIG_NET_VENDOR_ALTEON=y +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_ALTERA_TSE=m +CONFIG_NET_VENDOR_AMAZON=y +CONFIG_ENA_ETHERNET=m +CONFIG_NET_VENDOR_AMD=y +CONFIG_AMD8111_ETH=m +CONFIG_PCNET32=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_AMD_XGBE=m +CONFIG_AMD_XGBE_DCB=y +CONFIG_AMD_XGBE_HAVE_ECC=y +CONFIG_NET_VENDOR_AQUANTIA=y +CONFIG_AQTION=m +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ATHEROS=y +CONFIG_ATL2=m +CONFIG_ATL1=m +CONFIG_ATL1E=m +CONFIG_ATL1C=m +CONFIG_ALX=m +CONFIG_NET_VENDOR_AURORA=y +CONFIG_AURORA_NB8800=m +CONFIG_NET_VENDOR_BROADCOM=y +CONFIG_B44=m 
+CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y +CONFIG_BCMGENET=m +CONFIG_BNX2=m +CONFIG_CNIC=m +CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y +CONFIG_BNX2X=m +CONFIG_BNX2X_SRIOV=y +CONFIG_SYSTEMPORT=m +CONFIG_BNXT=m +CONFIG_BNXT_SRIOV=y +CONFIG_BNXT_FLOWER_OFFLOAD=y +CONFIG_BNXT_DCB=y +CONFIG_BNXT_HWMON=y +CONFIG_NET_VENDOR_BROCADE=y +CONFIG_BNA=m +CONFIG_NET_VENDOR_CADENCE=y +CONFIG_MACB=m +CONFIG_MACB_USE_HWSTAMP=y +CONFIG_MACB_PCI=m +CONFIG_NET_VENDOR_CAVIUM=y +CONFIG_THUNDER_NIC_PF=m +CONFIG_THUNDER_NIC_VF=m +CONFIG_THUNDER_NIC_BGX=m +CONFIG_THUNDER_NIC_RGX=m +CONFIG_CAVIUM_PTP=m +CONFIG_LIQUIDIO=m +CONFIG_LIQUIDIO_VF=m +CONFIG_NET_VENDOR_CHELSIO=y +CONFIG_CHELSIO_T1=m +CONFIG_CHELSIO_T1_1G=y +CONFIG_CHELSIO_T3=m +CONFIG_CHELSIO_T4=m +CONFIG_CHELSIO_T4_DCB=y +CONFIG_CHELSIO_T4_FCOE=y +CONFIG_CHELSIO_T4VF=m +CONFIG_CHELSIO_LIB=m +CONFIG_NET_VENDOR_CISCO=y +CONFIG_ENIC=m +CONFIG_NET_VENDOR_CORTINA=y +CONFIG_GEMINI_ETHERNET=m +CONFIG_CX_ECAT=m +CONFIG_DNET=m +CONFIG_NET_VENDOR_DEC=y +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_DE2104X_DSL=0 +CONFIG_TULIP=m +CONFIG_TULIP_MWI=y +CONFIG_TULIP_MMIO=y +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_ULI526X=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_NET_VENDOR_DLINK=y +CONFIG_DL2K=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_NET_VENDOR_EMULEX=y +CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y +CONFIG_BE2NET_BE2=y +CONFIG_BE2NET_BE3=y +CONFIG_BE2NET_LANCER=y +CONFIG_BE2NET_SKYHAWK=y +CONFIG_NET_VENDOR_EZCHIP=y +CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_FUJITSU=y +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_NET_VENDOR_GOOGLE=y +CONFIG_GVE=m +CONFIG_NET_VENDOR_HP=y +CONFIG_HP100=m +CONFIG_NET_VENDOR_HUAWEI=y +CONFIG_HINIC=m +CONFIG_NET_VENDOR_I825XX=y +CONFIG_NET_VENDOR_INTEL=y +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_E1000E_HWTS=y +CONFIG_IGB=m +CONFIG_IGB_HWMON=y +CONFIG_IGB_DCA=y +CONFIG_IGBVF=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_IXGBE_HWMON=y +CONFIG_IXGBE_DCA=y +CONFIG_IXGBE_DCB=y +# CONFIG_IXGBE_IPSEC is not set +CONFIG_IXGBEVF=m +CONFIG_IXGBEVF_IPSEC=y +CONFIG_I40E=m +CONFIG_I40E_DCB=y +CONFIG_IAVF=m +CONFIG_I40EVF=m +CONFIG_ICE=m +CONFIG_FM10K=m +CONFIG_IGC=m +CONFIG_JME=m +CONFIG_NET_VENDOR_MARVELL=y +CONFIG_MVMDIO=m +CONFIG_SKGE=m +# CONFIG_SKGE_DEBUG is not set +CONFIG_SKGE_GENESIS=y +CONFIG_SKY2=m +# CONFIG_SKY2_DEBUG is not set +CONFIG_NET_VENDOR_MELLANOX=y +CONFIG_MLX4_EN=m +CONFIG_MLX4_EN_DCB=y +CONFIG_MLX4_CORE=m +CONFIG_MLX4_DEBUG=y +CONFIG_MLX4_CORE_GEN2=y +CONFIG_MLX5_CORE=m +CONFIG_MLX5_ACCEL=y +CONFIG_MLX5_FPGA=y +CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_EN_ARFS=y +CONFIG_MLX5_EN_RXNFC=y +CONFIG_MLX5_MPFS=y +CONFIG_MLX5_ESWITCH=y +CONFIG_MLX5_CORE_EN_DCB=y +CONFIG_MLX5_CORE_IPOIB=y +CONFIG_MLX5_FPGA_IPSEC=y +CONFIG_MLX5_EN_IPSEC=y +CONFIG_MLX5_FPGA_TLS=y +CONFIG_MLX5_TLS=y +CONFIG_MLX5_EN_TLS=y +CONFIG_MLX5_SW_STEERING=y +CONFIG_MLXSW_CORE=m +CONFIG_MLXSW_CORE_HWMON=y +CONFIG_MLXSW_CORE_THERMAL=y +CONFIG_MLXSW_PCI=m +CONFIG_MLXSW_I2C=m +CONFIG_MLXSW_SWITCHIB=m +CONFIG_MLXSW_SWITCHX2=m +CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y +CONFIG_MLXSW_MINIMAL=m +CONFIG_MLXFW=m +CONFIG_NET_VENDOR_MICREL=y +CONFIG_KS8842=m +CONFIG_KS8851=m +CONFIG_KS8851_MLL=m +CONFIG_KSZ884X_PCI=m +CONFIG_NET_VENDOR_MICROCHIP=y +CONFIG_ENC28J60=m +# CONFIG_ENC28J60_WRITEVERIFY is not set +CONFIG_ENCX24J600=m +CONFIG_LAN743X=m +CONFIG_NET_VENDOR_MICROSEMI=y +CONFIG_MSCC_OCELOT_SWITCH=m +CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m 
+CONFIG_NET_VENDOR_MYRI=y +CONFIG_MYRI10GE=m +CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m +CONFIG_NET_VENDOR_NATSEMI=y +CONFIG_NATSEMI=m +CONFIG_NS83820=m +CONFIG_NET_VENDOR_NETERION=y +CONFIG_S2IO=m +CONFIG_VXGE=m +# CONFIG_VXGE_DEBUG_TRACE_ALL is not set +CONFIG_NET_VENDOR_NETRONOME=y +CONFIG_NFP=m +CONFIG_NFP_APP_FLOWER=y +CONFIG_NFP_APP_ABM_NIC=y +# CONFIG_NFP_DEBUG is not set +CONFIG_NET_VENDOR_NI=y +CONFIG_NI_XGE_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_8390=y +CONFIG_PCMCIA_AXNET=m +CONFIG_NE2K_PCI=m +CONFIG_PCMCIA_PCNET=m +CONFIG_NET_VENDOR_NVIDIA=y +CONFIG_FORCEDETH=m +CONFIG_NET_VENDOR_OKI=y +CONFIG_ETHOC=m +CONFIG_NET_VENDOR_PACKET_ENGINES=y +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_NET_VENDOR_PENSANDO=y +# CONFIG_IONIC is not set +CONFIG_NET_VENDOR_QLOGIC=y +CONFIG_QLA3XXX=m +CONFIG_QLCNIC=m +CONFIG_QLCNIC_SRIOV=y +CONFIG_QLCNIC_DCB=y +CONFIG_QLCNIC_HWMON=y +CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QED_LL2=y +CONFIG_QED_SRIOV=y +CONFIG_QEDE=m +CONFIG_QED_RDMA=y +CONFIG_QED_ISCSI=y +CONFIG_QED_FCOE=y +CONFIG_QED_OOO=y +CONFIG_NET_VENDOR_QUALCOMM=y +CONFIG_QCA7000=m +CONFIG_QCA7000_SPI=m +CONFIG_QCA7000_UART=m +CONFIG_QCOM_EMAC=m +CONFIG_RMNET=m +CONFIG_NET_VENDOR_RDC=y +CONFIG_R6040=m +CONFIG_NET_VENDOR_REALTEK=y +CONFIG_ATP=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_TUNE_TWISTER=y +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_R8169=m +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_ROCKER=m +CONFIG_NET_VENDOR_SAMSUNG=y +CONFIG_SXGBE_ETH=m +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SOLARFLARE=y +CONFIG_SFC=m +CONFIG_SFC_MTD=y +CONFIG_SFC_MCDI_MON=y +CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y +CONFIG_SFC_FALCON=m +CONFIG_SFC_FALCON_MTD=y +CONFIG_NET_VENDOR_SILAN=y +CONFIG_SC92031=m +CONFIG_NET_VENDOR_SIS=y +CONFIG_SIS900=m +CONFIG_SIS190=m +CONFIG_NET_VENDOR_SMSC=y +CONFIG_PCMCIA_SMC91C92=m +CONFIG_EPIC100=m +CONFIG_SMSC911X=m +CONFIG_SMSC9420=m +CONFIG_NET_VENDOR_SOCIONEXT=y +CONFIG_NET_VENDOR_STMICRO=y +CONFIG_STMMAC_ETH=m +# CONFIG_STMMAC_SELFTESTS is not set +CONFIG_STMMAC_PLATFORM=m +CONFIG_DWMAC_DWC_QOS_ETH=m +CONFIG_DWMAC_GENERIC=m +CONFIG_STMMAC_PCI=m +CONFIG_NET_VENDOR_SUN=y +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_CASSINI=m +CONFIG_NIU=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m +CONFIG_NET_VENDOR_TEHUTI=y +CONFIG_TEHUTI=m +CONFIG_NET_VENDOR_TI=y +# CONFIG_TI_CPSW_PHY_SEL is not set +CONFIG_TLAN=m +CONFIG_NET_VENDOR_VIA=y +CONFIG_VIA_RHINE=m +CONFIG_VIA_RHINE_MMIO=y +CONFIG_VIA_VELOCITY=m +CONFIG_NET_VENDOR_WIZNET=y +CONFIG_WIZNET_W5100=m +CONFIG_WIZNET_W5300=m +# CONFIG_WIZNET_BUS_DIRECT is not set +# CONFIG_WIZNET_BUS_INDIRECT is not set +CONFIG_WIZNET_BUS_ANY=y +CONFIG_WIZNET_W5100_SPI=m +CONFIG_NET_VENDOR_XILINX=y +CONFIG_XILINX_AXI_EMAC=m +CONFIG_XILINX_LL_TEMAC=m +CONFIG_NET_VENDOR_XIRCOM=y +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_FDDI=m +CONFIG_DEFXX=m +CONFIG_DEFXX_MMIO=y +CONFIG_SKFP=m +# CONFIG_HIPPI is not set +CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +CONFIG_MDIO_BCM_UNIMAC=m +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_BUS_MUX=m +CONFIG_MDIO_BUS_MUX_GPIO=m +CONFIG_MDIO_BUS_MUX_MMIOREG=m +CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m +CONFIG_MDIO_CAVIUM=m +CONFIG_MDIO_GPIO=m +CONFIG_MDIO_HISI_FEMAC=m +CONFIG_MDIO_I2C=m +CONFIG_MDIO_MSCC_MIIM=m +CONFIG_MDIO_OCTEON=m +CONFIG_MDIO_THUNDER=m +CONFIG_PHYLINK=m +CONFIG_PHYLIB=m +CONFIG_SWPHY=y +CONFIG_LED_TRIGGER_PHY=y + +# +# MII PHY device drivers +# +CONFIG_SFP=m +# CONFIG_ADIN_PHY is not set 
+CONFIG_AMD_PHY=m +CONFIG_AQUANTIA_PHY=m +CONFIG_AX88796B_PHY=m +CONFIG_AT803X_PHY=m +CONFIG_BCM7XXX_PHY=m +CONFIG_BCM87XX_PHY=m +CONFIG_BCM_NET_PHYLIB=m +CONFIG_BROADCOM_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_CORTINA_PHY=m +CONFIG_DAVICOM_PHY=m +CONFIG_DP83822_PHY=m +CONFIG_DP83TC811_PHY=m +CONFIG_DP83848_PHY=m +CONFIG_DP83867_PHY=m +CONFIG_FIXED_PHY=m +CONFIG_ICPLUS_PHY=m +CONFIG_INTEL_XWAY_PHY=m +CONFIG_LSI_ET1011C_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_MARVELL_10G_PHY=m +CONFIG_MICREL_PHY=m +CONFIG_MICROCHIP_PHY=m +CONFIG_MICROCHIP_T1_PHY=m +CONFIG_MICROSEMI_PHY=m +CONFIG_NATIONAL_PHY=m +CONFIG_NXP_TJA11XX_PHY=m +CONFIG_QSEMI_PHY=m +CONFIG_REALTEK_PHY=m +CONFIG_RENESAS_PHY=m +CONFIG_ROCKCHIP_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_STE10XP=m +CONFIG_TERANETICS_PHY=m +CONFIG_VITESSE_PHY=m +CONFIG_XILINX_GMII2RGMII=m +CONFIG_MICREL_KS8995MA=m +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOATM=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_SLIP=m +CONFIG_SLHC=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y +CONFIG_USB_NET_DRIVERS=m +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=m +CONFIG_USB_USBNET=m +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_CDC_EEM=m +CONFIG_USB_NET_CDC_NCM=m +CONFIG_USB_NET_HUAWEI_CDC_NCM=m +CONFIG_USB_NET_CDC_MBIM=m +CONFIG_USB_NET_DM9601=m +CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=m +CONFIG_USB_NET_GL620A=m +CONFIG_USB_NET_NET1080=m +CONFIG_USB_NET_PLUSB=m +CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +CONFIG_USB_NET_CDC_SUBSET_ENABLE=m +CONFIG_USB_NET_CDC_SUBSET=m +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_KC2190=y +CONFIG_USB_NET_ZAURUS=m +CONFIG_USB_NET_CX82310_ETH=m +CONFIG_USB_NET_KALMIA=m +CONFIG_USB_NET_QMI_WWAN=m +CONFIG_USB_HSO=m +CONFIG_USB_NET_INT51X1=m +CONFIG_USB_CDC_PHONET=m +CONFIG_USB_IPHETH=m +CONFIG_USB_SIERRA_NET=m +CONFIG_USB_VL600=m +CONFIG_USB_NET_CH9200=m +CONFIG_USB_NET_AQC111=m +CONFIG_WLAN=y +# CONFIG_WIRELESS_WDS is not set +CONFIG_WLAN_VENDOR_ADMTEK=y +CONFIG_ADM8211=m +CONFIG_ATH_COMMON=m +CONFIG_WLAN_VENDOR_ATH=y +# CONFIG_ATH_DEBUG is not set +CONFIG_ATH5K=m +# CONFIG_ATH5K_DEBUG is not set +# CONFIG_ATH5K_TRACER is not set +CONFIG_ATH5K_PCI=y +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +# CONFIG_ATH9K_DEBUGFS is not set +CONFIG_ATH9K_DYNACK=y +CONFIG_ATH9K_WOW=y +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +# CONFIG_ATH9K_PCI_NO_EEPROM is not set +CONFIG_ATH9K_HTC=m +# CONFIG_ATH9K_HTC_DEBUGFS is not set +CONFIG_ATH9K_HWRNG=y +CONFIG_CARL9170=m +CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_WPC=y +# CONFIG_CARL9170_HWRNG is not set +CONFIG_ATH6KL=m +CONFIG_ATH6KL_SDIO=m +CONFIG_ATH6KL_USB=m +# CONFIG_ATH6KL_DEBUG is not set +# CONFIG_ATH6KL_TRACING is not set +CONFIG_AR5523=m +CONFIG_WIL6210=m +CONFIG_WIL6210_ISR_COR=y +CONFIG_WIL6210_TRACING=y +# CONFIG_WIL6210_DEBUGFS is not set +CONFIG_ATH10K=m +CONFIG_ATH10K_CE=y +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_AHB=y +CONFIG_ATH10K_SDIO=m +CONFIG_ATH10K_USB=m +# CONFIG_ATH10K_DEBUG is not set +# CONFIG_ATH10K_DEBUGFS is 
not set +# CONFIG_ATH10K_TRACING is not set +CONFIG_WCN36XX=m +# CONFIG_WCN36XX_DEBUGFS is not set +CONFIG_WLAN_VENDOR_ATMEL=y +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_AT76C50X_USB=m +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +CONFIG_B43_SDIO=y +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +CONFIG_B43LEGACY=m +CONFIG_B43LEGACY_PCI_AUTOSELECT=y +CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y +CONFIG_B43LEGACY_LEDS=y +CONFIG_B43LEGACY_HWRNG=y +CONFIG_B43LEGACY_DEBUG=y +CONFIG_B43LEGACY_DMA=y +CONFIG_B43LEGACY_PIO=y +CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y +# CONFIG_B43LEGACY_DMA_MODE is not set +# CONFIG_B43LEGACY_PIO_MODE is not set +CONFIG_BRCMUTIL=m +CONFIG_BRCMSMAC=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_PROTO_MSGBUF=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +CONFIG_BRCMFMAC_PCIE=y +# CONFIG_BRCM_TRACING is not set +CONFIG_BRCMDBG=y +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_AIRO=m +CONFIG_AIRO_CS=m +CONFIG_WLAN_VENDOR_INTEL=y +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLEGACY=m +CONFIG_IWL4965=m +CONFIG_IWL3945=m + +# +# iwl3945 / iwl4965 Debugging Options +# +# CONFIG_IWLEGACY_DEBUG is not set +# end of iwl3945 / iwl4965 Debugging Options + +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_LEDS=y +CONFIG_IWLDVM=m +CONFIG_IWLMVM=m +CONFIG_IWLWIFI_OPMODE_MODULAR=y +# CONFIG_IWLWIFI_BCAST_FILTERING is not set + +# +# Debugging Options +# +# CONFIG_IWLWIFI_DEBUG is not set +# CONFIG_IWLWIFI_DEVICE_TRACING is not set +# end of Debugging Options + +CONFIG_WLAN_VENDOR_INTERSIL=y +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +CONFIG_HOSTAP_FIRMWARE_NVRAM=y +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_HERMES=m +CONFIG_HERMES_PRISM=y +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m +CONFIG_ORINOCO_USB=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_SPI=m +# CONFIG_P54_SPI_DEFAULT_EEPROM is not set +CONFIG_P54_LEDS=y +CONFIG_PRISM54=m +CONFIG_WLAN_VENDOR_MARVELL=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +CONFIG_LIBERTAS_SDIO=m +CONFIG_LIBERTAS_SPI=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_LIBERTAS_MESH=y +CONFIG_LIBERTAS_THINFIRM=m +# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set +CONFIG_LIBERTAS_THINFIRM_USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_MWIFIEX_PCIE=m +CONFIG_MWIFIEX_USB=m +CONFIG_MWL8K=m +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_MT76_CORE=m +CONFIG_MT76_LEDS=y +CONFIG_MT76_USB=m +CONFIG_MT76x02_LIB=m +CONFIG_MT76x02_USB=m +CONFIG_MT76x0_COMMON=m +CONFIG_MT76x0U=m +CONFIG_MT76x0E=m +CONFIG_MT76x2_COMMON=m +CONFIG_MT76x2E=m +CONFIG_MT76x2U=m +CONFIG_MT7603E=m +CONFIG_MT7615E=m +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +CONFIG_RT2400PCI=m +CONFIG_RT2500PCI=m +CONFIG_RT61PCI=m +CONFIG_RT2800PCI=m +CONFIG_RT2800PCI_RT33XX=y +CONFIG_RT2800PCI_RT35XX=y 
+CONFIG_RT2800PCI_RT53XX=y +CONFIG_RT2800PCI_RT3290=y +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2800_LIB_MMIO=m +CONFIG_RT2X00_LIB_MMIO=m +CONFIG_RT2X00_LIB_PCI=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m +CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +CONFIG_RTL8180=m +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +CONFIG_RTL8192CE=m +CONFIG_RTL8192SE=m +CONFIG_RTL8192DE=m +CONFIG_RTL8723AE=m +CONFIG_RTL8723BE=m +CONFIG_RTL8188EE=m +CONFIG_RTL8192EE=m +CONFIG_RTL8821AE=m +CONFIG_RTL8192CU=m +CONFIG_RTLWIFI=m +CONFIG_RTLWIFI_PCI=m +CONFIG_RTLWIFI_USB=m +CONFIG_RTLWIFI_DEBUG=y +CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8723_COMMON=m +CONFIG_RTLBTCOEXIST=m +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_RTW88=m +CONFIG_RTW88_CORE=m +CONFIG_RTW88_PCI=m +CONFIG_RTW88_8822BE=y +CONFIG_RTW88_8822CE=y +# CONFIG_RTW88_DEBUG is not set +# CONFIG_RTW88_DEBUGFS is not set +CONFIG_WLAN_VENDOR_RSI=y +CONFIG_RSI_91X=m +# CONFIG_RSI_DEBUGFS is not set +CONFIG_RSI_SDIO=m +CONFIG_RSI_USB=m +CONFIG_RSI_COEX=y +CONFIG_WLAN_VENDOR_ST=y +CONFIG_CW1200=m +CONFIG_CW1200_WLAN_SDIO=m +CONFIG_CW1200_WLAN_SPI=m +CONFIG_WLAN_VENDOR_TI=y +CONFIG_WL1251=m +CONFIG_WL1251_SPI=m +CONFIG_WL1251_SDIO=m +CONFIG_WL12XX=m +CONFIG_WL18XX=m +CONFIG_WLCORE=m +CONFIG_WLCORE_SPI=m +CONFIG_WLCORE_SDIO=m +CONFIG_WILINK_PLATFORM_DATA=y +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_WLAN_VENDOR_QUANTENNA=y +CONFIG_QTNFMAC=m +CONFIG_QTNFMAC_PCIE=m +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_WL3501=m +CONFIG_MAC80211_HWSIM=m +CONFIG_USB_NET_RNDIS_WLAN=m +CONFIG_VIRT_WIFI=m + +# +# WiMAX Wireless Broadband devices +# +CONFIG_WIMAX_I2400M=m +CONFIG_WIMAX_I2400M_USB=m +CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 +# end of WiMAX Wireless Broadband devices + +# CONFIG_WAN is not set +CONFIG_IEEE802154_DRIVERS=m +CONFIG_IEEE802154_FAKELB=m +CONFIG_IEEE802154_AT86RF230=m +# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set +CONFIG_IEEE802154_MRF24J40=m +CONFIG_IEEE802154_CC2520=m +CONFIG_IEEE802154_ATUSB=m +CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set +CONFIG_IEEE802154_MCR20A=m +CONFIG_IEEE802154_HWSIM=m +CONFIG_XEN_NETDEV_FRONTEND=m +CONFIG_XEN_NETDEV_BACKEND=m +CONFIG_VMXNET3=m +CONFIG_FUJITSU_ES=m +CONFIG_THUNDERBOLT_NET=m +CONFIG_HYPERV_NET=m +CONFIG_NETDEVSIM=m +CONFIG_NET_FAILOVER=m +CONFIG_ISDN=y +CONFIG_ISDN_CAPI=m +CONFIG_CAPI_TRACE=y +CONFIG_ISDN_CAPI_CAPI20=m +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_MISDN=m +CONFIG_MISDN_DSP=m +CONFIG_MISDN_L1OIP=m + +# +# mISDN hardware drivers +# +CONFIG_MISDN_HFCPCI=m +CONFIG_MISDN_HFCMULTI=m +CONFIG_MISDN_HFCUSB=m +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_HDLC=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_NVM=y +CONFIG_NVM_PBLK=m +# CONFIG_NVM_PBLK_DEBUG is not set + +# +# Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=m +CONFIG_INPUT_FF_MEMLESS=m +CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m +CONFIG_INPUT_MATRIXKMAP=m + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=m +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 
+CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADC=m +CONFIG_KEYBOARD_ADP5520=m +CONFIG_KEYBOARD_ADP5588=m +CONFIG_KEYBOARD_ADP5589=m +CONFIG_KEYBOARD_APPLESPI=m +CONFIG_KEYBOARD_ATKBD=m +CONFIG_KEYBOARD_QT1050=m +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_QT2160=m +CONFIG_KEYBOARD_DLINK_DIR685=m +CONFIG_KEYBOARD_LKKBD=m +CONFIG_KEYBOARD_GPIO=m +CONFIG_KEYBOARD_GPIO_POLLED=m +CONFIG_KEYBOARD_TCA6416=m +CONFIG_KEYBOARD_TCA8418=m +CONFIG_KEYBOARD_MATRIX=m +CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_LM8333=m +CONFIG_KEYBOARD_MAX7359=m +CONFIG_KEYBOARD_MCS=m +CONFIG_KEYBOARD_MPR121=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_KEYBOARD_OPENCORES=m +CONFIG_KEYBOARD_SAMSUNG=m +CONFIG_KEYBOARD_STOWAWAY=m +CONFIG_KEYBOARD_SUNKBD=m +CONFIG_KEYBOARD_STMPE=m +CONFIG_KEYBOARD_OMAP4=m +CONFIG_KEYBOARD_TC3589X=m +CONFIG_KEYBOARD_TM2_TOUCHKEY=m +CONFIG_KEYBOARD_TWL4030=m +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_CROS_EC=m +CONFIG_KEYBOARD_CAP11XX=m +CONFIG_KEYBOARD_BCM=m +CONFIG_KEYBOARD_MTK_PMIC=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_BYD=y +CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y +CONFIG_MOUSE_PS2_CYPRESS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y +CONFIG_MOUSE_PS2_SENTELIC=y +CONFIG_MOUSE_PS2_TOUCHKIT=y +CONFIG_MOUSE_PS2_FOCALTECH=y +CONFIG_MOUSE_PS2_VMMOUSE=y +CONFIG_MOUSE_PS2_SMBUS=y +CONFIG_MOUSE_SERIAL=m +CONFIG_MOUSE_APPLETOUCH=m +CONFIG_MOUSE_BCM5974=m +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=m +CONFIG_MOUSE_ELAN_I2C_I2C=y +CONFIG_MOUSE_ELAN_I2C_SMBUS=y +CONFIG_MOUSE_VSXXXAA=m +CONFIG_MOUSE_GPIO=m +CONFIG_MOUSE_SYNAPTICS_I2C=m +CONFIG_MOUSE_SYNAPTICS_USB=m +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=m +CONFIG_JOYSTICK_IFORCE_232=m +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m +CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDJOY=m +CONFIG_JOYSTICK_ZHENHUA=m +CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +CONFIG_JOYSTICK_AS5011=m +CONFIG_JOYSTICK_JOYDUMP=m +CONFIG_JOYSTICK_XPAD=m +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +CONFIG_JOYSTICK_PXRC=m +# CONFIG_JOYSTICK_FSIA6B is not set +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=m +CONFIG_TABLET_USB_AIPTEK=m +CONFIG_TABLET_USB_GTCO=m +CONFIG_TABLET_USB_HANWANG=m +CONFIG_TABLET_USB_KBTAB=m +CONFIG_TABLET_USB_PEGASUS=m +CONFIG_TABLET_SERIAL_WACOM4=m +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_PROPERTIES=y +CONFIG_TOUCHSCREEN_88PM860X=m +CONFIG_TOUCHSCREEN_ADS7846=m +CONFIG_TOUCHSCREEN_AD7877=m +CONFIG_TOUCHSCREEN_AD7879=m +CONFIG_TOUCHSCREEN_AD7879_I2C=m +CONFIG_TOUCHSCREEN_AD7879_SPI=m +CONFIG_TOUCHSCREEN_ADC=m +CONFIG_TOUCHSCREEN_AR1021_I2C=m +CONFIG_TOUCHSCREEN_ATMEL_MXT=m +CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y +CONFIG_TOUCHSCREEN_AUO_PIXCIR=m +CONFIG_TOUCHSCREEN_BU21013=m +CONFIG_TOUCHSCREEN_BU21029=m 
+CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m +CONFIG_TOUCHSCREEN_CY8CTMG110=m +CONFIG_TOUCHSCREEN_CYTTSP_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP_SPI=m +CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m +CONFIG_TOUCHSCREEN_DA9034=m +CONFIG_TOUCHSCREEN_DA9052=m +CONFIG_TOUCHSCREEN_DYNAPRO=m +CONFIG_TOUCHSCREEN_HAMPSHIRE=m +CONFIG_TOUCHSCREEN_EETI=m +CONFIG_TOUCHSCREEN_EGALAX=m +CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m +CONFIG_TOUCHSCREEN_EXC3000=m +CONFIG_TOUCHSCREEN_FUJITSU=m +CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_HIDEEP=m +CONFIG_TOUCHSCREEN_ILI210X=m +CONFIG_TOUCHSCREEN_S6SY761=m +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_TOUCHSCREEN_EKTF2127=m +CONFIG_TOUCHSCREEN_ELAN=m +CONFIG_TOUCHSCREEN_ELO=m +CONFIG_TOUCHSCREEN_WACOM_W8001=m +CONFIG_TOUCHSCREEN_WACOM_I2C=m +CONFIG_TOUCHSCREEN_MAX11801=m +CONFIG_TOUCHSCREEN_MCS5000=m +CONFIG_TOUCHSCREEN_MMS114=m +CONFIG_TOUCHSCREEN_MELFAS_MIP4=m +CONFIG_TOUCHSCREEN_MTOUCH=m +CONFIG_TOUCHSCREEN_IMX6UL_TSC=m +CONFIG_TOUCHSCREEN_INEXIO=m +CONFIG_TOUCHSCREEN_MK712=m +CONFIG_TOUCHSCREEN_PENMOUNT=m +CONFIG_TOUCHSCREEN_EDT_FT5X06=m +CONFIG_TOUCHSCREEN_TOUCHRIGHT=m +CONFIG_TOUCHSCREEN_TOUCHWIN=m +CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m +CONFIG_TOUCHSCREEN_UCB1400=m +CONFIG_TOUCHSCREEN_PIXCIR=m +CONFIG_TOUCHSCREEN_WDT87XX_I2C=m +CONFIG_TOUCHSCREEN_WM831X=m +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_TOUCHSCREEN_WM9705=y +CONFIG_TOUCHSCREEN_WM9712=y +CONFIG_TOUCHSCREEN_WM9713=y +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_MC13783=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y +CONFIG_TOUCHSCREEN_USB_PANJIT=y +CONFIG_TOUCHSCREEN_USB_3M=y +CONFIG_TOUCHSCREEN_USB_ITM=y +CONFIG_TOUCHSCREEN_USB_ETURBO=y +CONFIG_TOUCHSCREEN_USB_GUNZE=y +CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y +CONFIG_TOUCHSCREEN_USB_IRTOUCH=y +CONFIG_TOUCHSCREEN_USB_IDEALTEK=y +CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y +CONFIG_TOUCHSCREEN_USB_GOTOP=y +CONFIG_TOUCHSCREEN_USB_JASTEC=y +CONFIG_TOUCHSCREEN_USB_ELO=y +CONFIG_TOUCHSCREEN_USB_E2I=y +CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y +CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y +CONFIG_TOUCHSCREEN_USB_NEXIO=y +CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y +CONFIG_TOUCHSCREEN_TOUCHIT213=m +CONFIG_TOUCHSCREEN_TSC_SERIO=m +CONFIG_TOUCHSCREEN_TSC200X_CORE=m +CONFIG_TOUCHSCREEN_TSC2004=m +CONFIG_TOUCHSCREEN_TSC2005=m +CONFIG_TOUCHSCREEN_TSC2007=m +CONFIG_TOUCHSCREEN_TSC2007_IIO=y +CONFIG_TOUCHSCREEN_PCAP=m +CONFIG_TOUCHSCREEN_RM_TS=m +CONFIG_TOUCHSCREEN_SILEAD=m +CONFIG_TOUCHSCREEN_SIS_I2C=m +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMFTS=m +CONFIG_TOUCHSCREEN_STMPE=m +CONFIG_TOUCHSCREEN_SUR40=m +CONFIG_TOUCHSCREEN_SURFACE3_SPI=m +CONFIG_TOUCHSCREEN_SX8654=m +CONFIG_TOUCHSCREEN_TPS6507X=m +CONFIG_TOUCHSCREEN_ZET6223=m +CONFIG_TOUCHSCREEN_ZFORCE=m +CONFIG_TOUCHSCREEN_COLIBRI_VF50=m +CONFIG_TOUCHSCREEN_ROHM_BU21023=m +CONFIG_TOUCHSCREEN_IQS5XX=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_88PM860X_ONKEY=m +CONFIG_INPUT_88PM80X_ONKEY=m +CONFIG_INPUT_AD714X=m +CONFIG_INPUT_AD714X_I2C=m +CONFIG_INPUT_AD714X_SPI=m +CONFIG_INPUT_ARIZONA_HAPTICS=m +CONFIG_INPUT_ATMEL_CAPTOUCH=m +CONFIG_INPUT_BMA150=m +CONFIG_INPUT_E3X0_BUTTON=m +CONFIG_INPUT_MSM_VIBRATOR=m +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_MAX77650_ONKEY=m +CONFIG_INPUT_MAX77693_HAPTIC=m +CONFIG_INPUT_MAX8925_ONKEY=m +CONFIG_INPUT_MAX8997_HAPTIC=m +CONFIG_INPUT_MC13783_PWRBUTTON=m +CONFIG_INPUT_MMA8450=m +CONFIG_INPUT_APANEL=m +CONFIG_INPUT_GP2A=m +CONFIG_INPUT_GPIO_BEEPER=m +CONFIG_INPUT_GPIO_DECODER=m +CONFIG_INPUT_GPIO_VIBRA=m 
+CONFIG_INPUT_CPCAP_PWRBUTTON=m +CONFIG_INPUT_ATLAS_BTNS=m +CONFIG_INPUT_ATI_REMOTE2=m +CONFIG_INPUT_KEYSPAN_REMOTE=m +CONFIG_INPUT_KXTJ9=m +# CONFIG_INPUT_KXTJ9_POLLED_MODE is not set +CONFIG_INPUT_POWERMATE=m +CONFIG_INPUT_YEALINK=m +CONFIG_INPUT_CM109=m +CONFIG_INPUT_REGULATOR_HAPTIC=m +CONFIG_INPUT_RETU_PWRBUTTON=m +CONFIG_INPUT_TPS65218_PWRBUTTON=m +CONFIG_INPUT_AXP20X_PEK=m +CONFIG_INPUT_TWL4030_PWRBUTTON=m +CONFIG_INPUT_TWL4030_VIBRA=m +CONFIG_INPUT_TWL6040_VIBRA=m +CONFIG_INPUT_UINPUT=m +CONFIG_INPUT_PALMAS_PWRBUTTON=m +CONFIG_INPUT_PCF50633_PMU=m +CONFIG_INPUT_PCF8574=m +CONFIG_INPUT_PWM_BEEPER=m +CONFIG_INPUT_PWM_VIBRA=m +CONFIG_INPUT_RK805_PWRKEY=m +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +CONFIG_INPUT_DA9052_ONKEY=m +CONFIG_INPUT_DA9055_ONKEY=m +CONFIG_INPUT_DA9063_ONKEY=m +CONFIG_INPUT_WM831X_ON=m +CONFIG_INPUT_PCAP=m +CONFIG_INPUT_ADXL34X=m +CONFIG_INPUT_ADXL34X_I2C=m +CONFIG_INPUT_ADXL34X_SPI=m +CONFIG_INPUT_IMS_PCU=m +CONFIG_INPUT_CMA3000=m +CONFIG_INPUT_CMA3000_I2C=m +CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m +CONFIG_INPUT_IDEAPAD_SLIDEBAR=m +CONFIG_INPUT_SOC_BUTTON_ARRAY=m +CONFIG_INPUT_DRV260X_HAPTICS=m +CONFIG_INPUT_DRV2665_HAPTICS=m +CONFIG_INPUT_DRV2667_HAPTICS=m +CONFIG_INPUT_RAVE_SP_PWRBUTTON=m +CONFIG_INPUT_STPMIC1_ONKEY=m +CONFIG_RMI4_CORE=m +CONFIG_RMI4_I2C=m +CONFIG_RMI4_SPI=m +CONFIG_RMI4_SMB=m +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=m +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +CONFIG_RMI4_F34=y +# CONFIG_RMI4_F54 is not set +CONFIG_RMI4_F55=y + +# +# Hardware I/O ports +# +CONFIG_SERIO=m +CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y +CONFIG_SERIO_I8042=m +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m +CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_LIBPS2=m +CONFIG_SERIO_RAW=m +CONFIG_SERIO_ALTERA_PS2=m +CONFIG_SERIO_PS2MULT=m +CONFIG_SERIO_ARC_PS2=m +# CONFIG_SERIO_APBPS2 is not set +CONFIG_HYPERV_KEYBOARD=m +CONFIG_SERIO_GPIO_PS2=m +CONFIG_USERIO=m +CONFIG_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_FM801=m +# end of Hardware I/O ports +# end of Input device support + +# +# Character devices +# +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y +CONFIG_VT_CONSOLE_SLEEP=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +# CONFIG_LEGACY_PTYS is not set +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +CONFIG_CYZ_INTR=y +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_SYNCLINK_GT=m +CONFIG_NOZOMI=m +CONFIG_ISI=m +CONFIG_N_HDLC=m +CONFIG_N_GSM=m +CONFIG_TRACE_ROUTER=m +CONFIG_TRACE_SINK=m +CONFIG_NULL_TTY=m +CONFIG_LDISC_AUTOLOAD=y +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set + +# +# Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_PNP=y +CONFIG_SERIAL_8250_FINTEK=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DMA=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=m +CONFIG_SERIAL_8250_CS=m +CONFIG_SERIAL_8250_MEN_MCB=m +CONFIG_SERIAL_8250_NR_UARTS=32 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_ASPEED_VUART=m +CONFIG_SERIAL_8250_SHARE_IRQ=y +# CONFIG_SERIAL_8250_DETECT_IRQ is not set +CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_8250_DWLIB=y +CONFIG_SERIAL_8250_DW=m +CONFIG_SERIAL_8250_RT288X=y +CONFIG_SERIAL_8250_LPSS=y +CONFIG_SERIAL_8250_MID=y +CONFIG_SERIAL_OF_PLATFORM=m + +# 
+# Non-8250 serial port support +# +CONFIG_SERIAL_MAX3100=m +CONFIG_SERIAL_MAX310X=m +CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_SERIAL_JSM=m +CONFIG_SERIAL_SIFIVE=m +CONFIG_SERIAL_SCCNXP=m +CONFIG_SERIAL_SC16IS7XX_CORE=m +CONFIG_SERIAL_SC16IS7XX=m +CONFIG_SERIAL_SC16IS7XX_I2C=y +CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_ALTERA_JTAGUART=m +CONFIG_SERIAL_ALTERA_UART=m +CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 +CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 +CONFIG_SERIAL_IFX6X60=m +CONFIG_SERIAL_XILINX_PS_UART=m +CONFIG_SERIAL_ARC=m +CONFIG_SERIAL_ARC_NR_PORTS=1 +CONFIG_SERIAL_RP2=m +CONFIG_SERIAL_RP2_NR_UARTS=32 +CONFIG_SERIAL_FSL_LPUART=m +# CONFIG_SERIAL_FSL_LINFLEXUART is not set +CONFIG_SERIAL_CONEXANT_DIGICOLOR=m +CONFIG_SERIAL_MEN_Z135=m +# end of Serial drivers + +CONFIG_SERIAL_MCTRL_GPIO=y +CONFIG_SERIAL_DEV_BUS=y +CONFIG_SERIAL_DEV_CTRL_TTYPORT=y +# CONFIG_TTY_PRINTK is not set +CONFIG_PRINTER=m +# CONFIG_LP_CONSOLE is not set +CONFIG_PPDEV=m +CONFIG_HVC_DRIVER=y +CONFIG_HVC_IRQ=y +CONFIG_HVC_XEN=y +CONFIG_HVC_XEN_FRONTEND=y +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DMI_DECODE=y +CONFIG_IPMI_PLAT_DATA=y +# CONFIG_IPMI_PANIC_EVENT is not set +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m +CONFIG_IPMI_SSIF=m +CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m +CONFIG_IPMB_DEVICE_INTERFACE=m +CONFIG_HW_RANDOM=m +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_HW_RANDOM_INTEL=m +CONFIG_HW_RANDOM_AMD=m +CONFIG_HW_RANDOM_VIA=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_NVRAM=m +CONFIG_APPLICOM=m + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m +CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m +CONFIG_SCR24X=m +CONFIG_IPWIRELESS=m +# end of PCMCIA character devices + +CONFIG_MWAVE=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +CONFIG_HPET=y +CONFIG_HPET_MMAP=y +CONFIG_HPET_MMAP_DEFAULT=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TCG_TPM=m +CONFIG_HW_RANDOM_TPM=y +CONFIG_TCG_TIS_CORE=m +CONFIG_TCG_TIS=m +CONFIG_TCG_TIS_SPI=m +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m +CONFIG_TCG_INFINEON=m +CONFIG_TCG_XEN=m +CONFIG_TCG_CRB=m +CONFIG_TCG_VTPM_PROXY=m +CONFIG_TCG_TIS_ST33ZP24=m +CONFIG_TCG_TIS_ST33ZP24_I2C=m +CONFIG_TCG_TIS_ST33ZP24_SPI=m +CONFIG_TELCLOCK=m +# CONFIG_DEVPORT is not set +CONFIG_XILLYBUS=m +CONFIG_XILLYBUS_PCIE=m +CONFIG_XILLYBUS_OF=m +# end of Character devices + +# CONFIG_RANDOM_TRUST_CPU is not set +# CONFIG_RANDOM_TRUST_BOOTLOADER is not set + +# +# I2C support +# +CONFIG_I2C=y +CONFIG_ACPI_I2C_OPREGION=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_COMPAT=y +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m + +# +# Multiplexer I2C Chip support +# +CONFIG_I2C_ARB_GPIO_CHALLENGE=m +CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_GPMUX=m +CONFIG_I2C_MUX_LTC4306=m +CONFIG_I2C_MUX_PCA9541=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_MUX_PINCTRL=m +CONFIG_I2C_MUX_REG=m +CONFIG_I2C_DEMUX_PINCTRL=m +CONFIG_I2C_MUX_MLXCPLD=m +# end of Multiplexer I2C Chip support + +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_SMBUS=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCA=m + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI1563=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m +CONFIG_I2C_AMD756_S4882=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_AMD_MP2=m +CONFIG_I2C_I801=m +CONFIG_I2C_ISCH=m +CONFIG_I2C_ISMT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_CHT_WC=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_NFORCE2_S4985=m +CONFIG_I2C_NVIDIA_GPU=m 
+CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m + +# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CBUS_GPIO=m +CONFIG_I2C_DESIGNWARE_CORE=y +CONFIG_I2C_DESIGNWARE_PLATFORM=y +CONFIG_I2C_DESIGNWARE_SLAVE=y +CONFIG_I2C_DESIGNWARE_PCI=m +CONFIG_I2C_DESIGNWARE_BAYTRAIL=y +CONFIG_I2C_EMEV2=m +CONFIG_I2C_GPIO=m +# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +CONFIG_I2C_KEMPLD=m +CONFIG_I2C_OCORES=m +CONFIG_I2C_PCA_PLATFORM=m +CONFIG_I2C_RK3X=m +CONFIG_I2C_SIMTEC=m +CONFIG_I2C_XILINX=m + +# +# External I2C/SMBus adapter drivers +# +CONFIG_I2C_DIOLAN_U2C=m +CONFIG_I2C_DLN2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_PARPORT_LIGHT=m +CONFIG_I2C_ROBOTFUZZ_OSIF=m +CONFIG_I2C_TAOS_EVM=m +CONFIG_I2C_TINY_USB=m +CONFIG_I2C_VIPERBOARD=m + +# +# Other I2C/SMBus bus drivers +# +CONFIG_I2C_MLXCPLD=m +CONFIG_I2C_CROS_EC_TUNNEL=m +CONFIG_I2C_FSI=m +# end of I2C Hardware Bus support + +# CONFIG_I2C_STUB is not set +CONFIG_I2C_SLAVE=y +CONFIG_I2C_SLAVE_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# end of I2C support + +CONFIG_I3C=m +CONFIG_CDNS_I3C_MASTER=m +CONFIG_DW_I3C_MASTER=m +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y +CONFIG_SPI_MEM=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_ALTERA=m +CONFIG_SPI_AXI_SPI_ENGINE=m +CONFIG_SPI_BITBANG=m +CONFIG_SPI_BUTTERFLY=m +CONFIG_SPI_CADENCE=m +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_PCI=m +CONFIG_SPI_DW_MID_DMA=y +CONFIG_SPI_DW_MMIO=m +CONFIG_SPI_DLN2=m +CONFIG_SPI_NXP_FLEXSPI=m +CONFIG_SPI_GPIO=m +CONFIG_SPI_LM70_LLP=m +CONFIG_SPI_FSL_LIB=m +CONFIG_SPI_FSL_SPI=m +CONFIG_SPI_OC_TINY=m +CONFIG_SPI_PXA2XX=m +CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_SC18IS602=m +CONFIG_SPI_SIFIVE=m +CONFIG_SPI_MXIC=m +CONFIG_SPI_XCOMM=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_ZYNQMP_GQSPI=m + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=m +CONFIG_SPI_LOOPBACK_TEST=m +CONFIG_SPI_TLE62X0=m +CONFIG_SPI_SLAVE=y +CONFIG_SPI_SLAVE_TIME=m +CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m +CONFIG_SPMI=m +CONFIG_HSI=m +CONFIG_HSI_BOARDINFO=y + +# +# HSI controllers +# + +# +# HSI clients +# +CONFIG_HSI_CHAR=m +CONFIG_PPS=y +# CONFIG_PPS_DEBUG is not set + +# +# PPS clients support +# +CONFIG_PPS_CLIENT_KTIMER=m +CONFIG_PPS_CLIENT_LDISC=m +CONFIG_PPS_CLIENT_PARPORT=m +CONFIG_PPS_CLIENT_GPIO=m + +# +# PPS generators support +# + +# +# PTP clock support +# +CONFIG_PTP_1588_CLOCK=y +CONFIG_DP83640_PHY=m +CONFIG_PTP_1588_CLOCK_KVM=m +# end of PTP clock support + +CONFIG_PINCTRL=y +CONFIG_GENERIC_PINCTRL_GROUPS=y +CONFIG_PINMUX=y +CONFIG_GENERIC_PINMUX_FUNCTIONS=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +CONFIG_PINCTRL_AS3722=m +CONFIG_PINCTRL_AXP209=m +CONFIG_PINCTRL_AMD=m +CONFIG_PINCTRL_MCP23S08=m +CONFIG_PINCTRL_SINGLE=m +CONFIG_PINCTRL_SX150X=y +CONFIG_PINCTRL_STMFX=m +CONFIG_PINCTRL_MAX77620=m +CONFIG_PINCTRL_PALMAS=m +CONFIG_PINCTRL_RK805=m +CONFIG_PINCTRL_OCELOT=y +CONFIG_PINCTRL_BAYTRAIL=y +CONFIG_PINCTRL_CHERRYVIEW=y +CONFIG_PINCTRL_INTEL=y +CONFIG_PINCTRL_BROXTON=y +CONFIG_PINCTRL_CANNONLAKE=y +CONFIG_PINCTRL_CEDARFORK=y +CONFIG_PINCTRL_DENVERTON=y +CONFIG_PINCTRL_GEMINILAKE=y +CONFIG_PINCTRL_ICELAKE=y +CONFIG_PINCTRL_LEWISBURG=y +CONFIG_PINCTRL_SUNRISEPOINT=y +CONFIG_PINCTRL_LOCHNAGAR=m +CONFIG_PINCTRL_MADERA=m +CONFIG_PINCTRL_CS47L15=y +CONFIG_PINCTRL_CS47L35=y +CONFIG_PINCTRL_CS47L85=y +CONFIG_PINCTRL_CS47L90=y 
+CONFIG_PINCTRL_CS47L92=y +CONFIG_GPIOLIB=y +CONFIG_GPIOLIB_FASTPATH_LIMIT=512 +CONFIG_OF_GPIO=y +CONFIG_GPIO_ACPI=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC=y +CONFIG_GPIO_MAX730X=m + +# +# Memory mapped GPIO drivers +# +CONFIG_GPIO_74XX_MMIO=m +CONFIG_GPIO_ALTERA=m +CONFIG_GPIO_AMDPT=m +CONFIG_GPIO_CADENCE=m +CONFIG_GPIO_DWAPB=m +CONFIG_GPIO_EXAR=m +CONFIG_GPIO_FTGPIO010=y +CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_GRGPIO=m +CONFIG_GPIO_HLWD=m +CONFIG_GPIO_ICH=m +CONFIG_GPIO_LYNXPOINT=m +CONFIG_GPIO_MB86S7X=m +CONFIG_GPIO_MENZ127=m +CONFIG_GPIO_SAMA5D2_PIOBU=m +CONFIG_GPIO_SIOX=m +CONFIG_GPIO_SYSCON=m +CONFIG_GPIO_VX855=m +CONFIG_GPIO_XILINX=m +CONFIG_GPIO_AMD_FCH=m +# end of Memory mapped GPIO drivers + +# +# Port-mapped I/O GPIO drivers +# +CONFIG_GPIO_F7188X=m +CONFIG_GPIO_IT87=m +CONFIG_GPIO_SCH=m +CONFIG_GPIO_SCH311X=m +CONFIG_GPIO_WINBOND=m +CONFIG_GPIO_WS16C48=m +# end of Port-mapped I/O GPIO drivers + +# +# I2C GPIO expanders +# +CONFIG_GPIO_ADP5588=m +CONFIG_GPIO_ADNP=m +CONFIG_GPIO_GW_PLD=m +CONFIG_GPIO_MAX7300=m +CONFIG_GPIO_MAX732X=m +CONFIG_GPIO_PCA953X=m +CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_TPIC2810=m +# end of I2C GPIO expanders + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ADP5520=m +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_BD70528=m +CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_CRYSTAL_COVE=m +CONFIG_GPIO_DA9052=m +CONFIG_GPIO_DA9055=m +CONFIG_GPIO_DLN2=m +CONFIG_GPIO_JANZ_TTL=m +CONFIG_GPIO_KEMPLD=m +CONFIG_GPIO_LP3943=m +CONFIG_GPIO_LP873X=m +CONFIG_GPIO_LP87565=m +CONFIG_GPIO_MADERA=m +CONFIG_GPIO_MAX77620=m +CONFIG_GPIO_MAX77650=m +CONFIG_GPIO_PALMAS=y +CONFIG_GPIO_RC5T583=y +CONFIG_GPIO_STMPE=y +CONFIG_GPIO_TC3589X=y +CONFIG_GPIO_TPS65086=m +CONFIG_GPIO_TPS65218=m +CONFIG_GPIO_TPS6586X=y +CONFIG_GPIO_TPS65910=y +CONFIG_GPIO_TPS65912=m +CONFIG_GPIO_TPS68470=y +CONFIG_GPIO_TQMX86=m +CONFIG_GPIO_TWL4030=m +CONFIG_GPIO_TWL6040=m +CONFIG_GPIO_UCB1400=m +CONFIG_GPIO_WHISKEY_COVE=m +CONFIG_GPIO_WM831X=m +CONFIG_GPIO_WM8350=m +CONFIG_GPIO_WM8994=m +# end of MFD GPIO expanders + +# +# PCI GPIO expanders +# +CONFIG_GPIO_AMD8111=m +CONFIG_GPIO_ML_IOH=m +CONFIG_GPIO_PCI_IDIO_16=m +CONFIG_GPIO_PCIE_IDIO_24=m +CONFIG_GPIO_RDC321X=m +CONFIG_GPIO_SODAVILLE=y +# end of PCI GPIO expanders + +# +# SPI GPIO expanders +# +CONFIG_GPIO_74X164=m +CONFIG_GPIO_MAX3191X=m +CONFIG_GPIO_MAX7301=m +CONFIG_GPIO_MC33880=m +CONFIG_GPIO_PISOSR=m +CONFIG_GPIO_XRA1403=m +# end of SPI GPIO expanders + +# +# USB GPIO expanders +# +CONFIG_GPIO_VIPERBOARD=m +# end of USB GPIO expanders + +CONFIG_GPIO_MOCKUP=m +CONFIG_W1=m +CONFIG_W1_CON=y + +# +# 1-wire Bus Masters +# +CONFIG_W1_MASTER_MATROX=m +CONFIG_W1_MASTER_DS2490=m +CONFIG_W1_MASTER_DS2482=m +CONFIG_W1_MASTER_DS1WM=m +CONFIG_W1_MASTER_GPIO=m +# CONFIG_W1_MASTER_SGI is not set +# end of 1-wire Bus Masters + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +CONFIG_W1_SLAVE_SMEM=m +CONFIG_W1_SLAVE_DS2405=m +CONFIG_W1_SLAVE_DS2408=m +# CONFIG_W1_SLAVE_DS2408_READBACK is not set +CONFIG_W1_SLAVE_DS2413=m +CONFIG_W1_SLAVE_DS2406=m +CONFIG_W1_SLAVE_DS2423=m +CONFIG_W1_SLAVE_DS2805=m +CONFIG_W1_SLAVE_DS2431=m +CONFIG_W1_SLAVE_DS2433=m +# CONFIG_W1_SLAVE_DS2433_CRC is not set +CONFIG_W1_SLAVE_DS2438=m +# CONFIG_W1_SLAVE_DS250X is not set +CONFIG_W1_SLAVE_DS2780=m +CONFIG_W1_SLAVE_DS2781=m +CONFIG_W1_SLAVE_DS28E04=m +CONFIG_W1_SLAVE_DS28E17=m +# end of 1-wire Slaves + +CONFIG_POWER_AVS=y +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_AS3722=y +CONFIG_POWER_RESET_GPIO=y +CONFIG_POWER_RESET_GPIO_RESTART=y 
+CONFIG_POWER_RESET_LTC2952=y +CONFIG_POWER_RESET_RESTART=y +CONFIG_POWER_RESET_SYSCON=y +CONFIG_POWER_RESET_SYSCON_POWEROFF=y +CONFIG_REBOOT_MODE=m +CONFIG_SYSCON_REBOOT_MODE=m +CONFIG_NVMEM_REBOOT_MODE=m +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_POWER_SUPPLY_HWMON=y +CONFIG_PDA_POWER=m +CONFIG_GENERIC_ADC_BATTERY=m +CONFIG_MAX8925_POWER=m +CONFIG_WM831X_BACKUP=m +CONFIG_WM831X_POWER=m +CONFIG_WM8350_POWER=m +CONFIG_TEST_POWER=m +CONFIG_BATTERY_88PM860X=m +CONFIG_CHARGER_ADP5061=m +CONFIG_BATTERY_ACT8945A=m +CONFIG_BATTERY_CPCAP=m +CONFIG_BATTERY_DS2760=m +CONFIG_BATTERY_DS2780=m +CONFIG_BATTERY_DS2781=m +CONFIG_BATTERY_DS2782=m +CONFIG_BATTERY_LEGO_EV3=m +CONFIG_BATTERY_SBS=m +CONFIG_CHARGER_SBS=m +CONFIG_MANAGER_SBS=m +CONFIG_BATTERY_BQ27XXX=m +CONFIG_BATTERY_BQ27XXX_I2C=m +CONFIG_BATTERY_BQ27XXX_HDQ=m +# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set +CONFIG_BATTERY_DA9030=m +CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_DA9150=m +CONFIG_BATTERY_DA9150=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m +CONFIG_AXP288_CHARGER=m +CONFIG_AXP288_FUEL_GAUGE=m +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_BATTERY_MAX1721X=m +CONFIG_BATTERY_TWL4030_MADC=m +CONFIG_CHARGER_88PM860X=m +CONFIG_CHARGER_PCF50633=m +CONFIG_BATTERY_RX51=m +CONFIG_CHARGER_ISP1704=m +CONFIG_CHARGER_MAX8903=m +CONFIG_CHARGER_TWL4030=m +CONFIG_CHARGER_LP8727=m +CONFIG_CHARGER_LP8788=m +CONFIG_CHARGER_GPIO=m +CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_LT3651=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_DETECTOR_MAX14656=m +CONFIG_CHARGER_MAX77650=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_MAX8997=m +CONFIG_CHARGER_MAX8998=m +CONFIG_CHARGER_BQ2415X=m +CONFIG_CHARGER_BQ24190=m +CONFIG_CHARGER_BQ24257=m +CONFIG_CHARGER_BQ24735=m +CONFIG_CHARGER_BQ25890=m +CONFIG_CHARGER_SMB347=m +CONFIG_CHARGER_TPS65090=m +CONFIG_CHARGER_TPS65217=m +CONFIG_BATTERY_GAUGE_LTC2941=m +CONFIG_BATTERY_RT5033=m +CONFIG_CHARGER_RT9455=m +CONFIG_CHARGER_CROS_USBPD=m +CONFIG_CHARGER_UCS1002=m +CONFIG_CHARGER_BD70528=m +CONFIG_CHARGER_WILCO=m +CONFIG_HWMON=y +CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +CONFIG_SENSORS_ABITUGURU=m +CONFIG_SENSORS_ABITUGURU3=m +CONFIG_SENSORS_AD7314=m +CONFIG_SENSORS_AD7414=m +CONFIG_SENSORS_AD7418=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM1026=m +CONFIG_SENSORS_ADM1029=m +CONFIG_SENSORS_ADM1031=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_ADT7X10=m +CONFIG_SENSORS_ADT7310=m +CONFIG_SENSORS_ADT7410=m +CONFIG_SENSORS_ADT7411=m +CONFIG_SENSORS_ADT7462=m +CONFIG_SENSORS_ADT7470=m +CONFIG_SENSORS_ADT7475=m +# CONFIG_SENSORS_AS370 is not set +CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m +CONFIG_SENSORS_FAM15H_POWER=m +CONFIG_SENSORS_APPLESMC=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m +CONFIG_SENSORS_ATXP1=m +CONFIG_SENSORS_DS620=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_DELL_SMM=m +CONFIG_SENSORS_DA9052_ADC=m +CONFIG_SENSORS_DA9055=m +CONFIG_SENSORS_I5K_AMB=m +CONFIG_SENSORS_F71805F=m +CONFIG_SENSORS_F71882FG=m +CONFIG_SENSORS_F75375S=m +CONFIG_SENSORS_MC13783_ADC=m +CONFIG_SENSORS_FSCHMD=m +CONFIG_SENSORS_FTSTEUTATES=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_G760A=m +CONFIG_SENSORS_G762=m +CONFIG_SENSORS_GPIO_FAN=m +CONFIG_SENSORS_HIH6130=m +CONFIG_SENSORS_IBMAEM=m +CONFIG_SENSORS_IBMPEX=m +CONFIG_SENSORS_IIO_HWMON=m +CONFIG_SENSORS_I5500=m +CONFIG_SENSORS_CORETEMP=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_JC42=m 
+CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LINEAGE=m +CONFIG_SENSORS_LOCHNAGAR=m +CONFIG_SENSORS_LTC2945=m +CONFIG_SENSORS_LTC2990=m +CONFIG_SENSORS_LTC4151=m +CONFIG_SENSORS_LTC4215=m +CONFIG_SENSORS_LTC4222=m +CONFIG_SENSORS_LTC4245=m +CONFIG_SENSORS_LTC4260=m +CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_MAX1111=m +CONFIG_SENSORS_MAX16065=m +CONFIG_SENSORS_MAX1619=m +CONFIG_SENSORS_MAX1668=m +CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m +CONFIG_SENSORS_MAX6621=m +CONFIG_SENSORS_MAX6639=m +CONFIG_SENSORS_MAX6642=m +CONFIG_SENSORS_MAX6650=m +CONFIG_SENSORS_MAX6697=m +CONFIG_SENSORS_MAX31790=m +CONFIG_SENSORS_MCP3021=m +CONFIG_SENSORS_MLXREG_FAN=m +CONFIG_SENSORS_TC654=m +CONFIG_SENSORS_MENF21BMC_HWMON=m +CONFIG_SENSORS_ADCXX=m +CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM70=m +CONFIG_SENSORS_LM73=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM77=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_LM93=m +CONFIG_SENSORS_LM95234=m +CONFIG_SENSORS_LM95241=m +CONFIG_SENSORS_LM95245=m +CONFIG_SENSORS_PC87360=m +CONFIG_SENSORS_PC87427=m +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_SENSORS_NCT6683=m +CONFIG_SENSORS_NCT6775=m +CONFIG_SENSORS_NCT7802=m +CONFIG_SENSORS_NCT7904=m +CONFIG_SENSORS_NPCM7XX=m +CONFIG_SENSORS_PCF8591=m +CONFIG_PMBUS=m +CONFIG_SENSORS_PMBUS=m +CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_IBM_CFFPS=m +# CONFIG_SENSORS_INSPUR_IPSPS is not set +CONFIG_SENSORS_IR35221=m +CONFIG_SENSORS_IR38064=m +CONFIG_SENSORS_IRPS5401=m +CONFIG_SENSORS_ISL68137=m +CONFIG_SENSORS_LM25066=m +CONFIG_SENSORS_LTC2978=m +# CONFIG_SENSORS_LTC2978_REGULATOR is not set +CONFIG_SENSORS_LTC3815=m +CONFIG_SENSORS_MAX16064=m +CONFIG_SENSORS_MAX20751=m +CONFIG_SENSORS_MAX31785=m +CONFIG_SENSORS_MAX34440=m +CONFIG_SENSORS_MAX8688=m +CONFIG_SENSORS_PXE1610=m +CONFIG_SENSORS_TPS40422=m +CONFIG_SENSORS_TPS53679=m +CONFIG_SENSORS_UCD9000=m +CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_PWM_FAN=m +CONFIG_SENSORS_SHT15=m +CONFIG_SENSORS_SHT21=m +CONFIG_SENSORS_SHT3x=m +CONFIG_SENSORS_SHTC1=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_DME1737=m +CONFIG_SENSORS_EMC1403=m +CONFIG_SENSORS_EMC2103=m +CONFIG_SENSORS_EMC6W201=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_SMSC47M192=m +CONFIG_SENSORS_SMSC47B397=m +CONFIG_SENSORS_SCH56XX_COMMON=m +CONFIG_SENSORS_SCH5627=m +CONFIG_SENSORS_SCH5636=m +CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SMM665=m +CONFIG_SENSORS_ADC128D818=m +CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_ADS7871=m +CONFIG_SENSORS_AMC6821=m +CONFIG_SENSORS_INA209=m +CONFIG_SENSORS_INA2XX=m +CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_TC74=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_TMP102=m +CONFIG_SENSORS_TMP103=m +CONFIG_SENSORS_TMP108=m +CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_VIA_CPUTEMP=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83773G=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83791D=m +CONFIG_SENSORS_W83792D=m +CONFIG_SENSORS_W83793=m +CONFIG_SENSORS_W83795=m +# CONFIG_SENSORS_W83795_FANCTRL is not set +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83L786NG=m +CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM831X=m +CONFIG_SENSORS_WM8350=m +CONFIG_SENSORS_XGENE=m + +# +# ACPI drivers +# +CONFIG_SENSORS_ACPI_POWER=m +CONFIG_SENSORS_ATK0110=m +CONFIG_THERMAL=y +# CONFIG_THERMAL_STATISTICS is not set +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 +CONFIG_THERMAL_HWMON=y 
+CONFIG_THERMAL_OF=y +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set +CONFIG_THERMAL_GOV_FAIR_SHARE=y +CONFIG_THERMAL_GOV_STEP_WISE=y +CONFIG_THERMAL_GOV_BANG_BANG=y +CONFIG_THERMAL_GOV_USER_SPACE=y +CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y +CONFIG_CPU_THERMAL=y +CONFIG_CLOCK_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +# CONFIG_THERMAL_EMULATION is not set +CONFIG_THERMAL_MMIO=m +CONFIG_MAX77620_THERMAL=m +CONFIG_QORIQ_THERMAL=m +CONFIG_DA9062_THERMAL=m + +# +# Intel thermal drivers +# +CONFIG_INTEL_POWERCLAMP=m +CONFIG_X86_PKG_TEMP_THERMAL=m +CONFIG_INTEL_SOC_DTS_IOSF_CORE=m +CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# +CONFIG_INT340X_THERMAL=m +CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m +CONFIG_PROC_THERMAL_MMIO_RAPL=y +# end of ACPI INT340X thermal drivers + +CONFIG_INTEL_BXT_PMIC_THERMAL=m +CONFIG_INTEL_PCH_THERMAL=m +# end of Intel thermal drivers + +CONFIG_GENERIC_ADC_THERMAL=m +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +CONFIG_WATCHDOG_OPEN_TIMEOUT=0 +CONFIG_WATCHDOG_SYSFS=y + +# +# Watchdog Pretimeout Governors +# +CONFIG_WATCHDOG_PRETIMEOUT_GOV=y +CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y +# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set +CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set +CONFIG_BD70528_WATCHDOG=m +CONFIG_DA9052_WATCHDOG=m +CONFIG_DA9055_WATCHDOG=m +CONFIG_DA9063_WATCHDOG=m +CONFIG_DA9062_WATCHDOG=m +CONFIG_GPIO_WATCHDOG=m +CONFIG_MENF21BMC_WATCHDOG=m +CONFIG_MENZ069_WATCHDOG=m +CONFIG_WDAT_WDT=m +CONFIG_WM831X_WATCHDOG=m +CONFIG_WM8350_WATCHDOG=m +CONFIG_XILINX_WATCHDOG=m +CONFIG_ZIIRAVE_WATCHDOG=m +CONFIG_RAVE_SP_WATCHDOG=m +CONFIG_MLX_WDT=m +CONFIG_CADENCE_WATCHDOG=m +CONFIG_DW_WATCHDOG=m +CONFIG_RN5T618_WATCHDOG=m +CONFIG_TWL4030_WATCHDOG=m +CONFIG_MAX63XX_WATCHDOG=m +CONFIG_MAX77620_WATCHDOG=m +CONFIG_RETU_WATCHDOG=m +CONFIG_STPMIC1_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_EBC_C384_WDT=m +CONFIG_F71808E_WDT=m +CONFIG_SP5100_TCO=m +CONFIG_SBC_FITPC2_WATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_IBMASR=m +CONFIG_WAFER_WDT=m +CONFIG_I6300ESB_WDT=m +CONFIG_IE6XX_WDT=m +CONFIG_ITCO_WDT=m +CONFIG_ITCO_VENDOR_SUPPORT=y +CONFIG_IT8712F_WDT=m +CONFIG_IT87_WDT=m +CONFIG_HP_WATCHDOG=m +CONFIG_HPWDT_NMI_DECODING=y +CONFIG_KEMPLD_WDT=m +CONFIG_SC1200_WDT=m +CONFIG_PC87413_WDT=m +CONFIG_NV_TCO=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_SMSC_SCH311X_WDT=m +CONFIG_SMSC37B787_WDT=m +CONFIG_TQMX86_WDT=m +CONFIG_VIA_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_W83977F_WDT=m +CONFIG_MACHZ_WDT=m +CONFIG_SBC_EPX_C3_WATCHDOG=m +CONFIG_INTEL_MEI_WDT=m +CONFIG_NI903X_WDT=m +CONFIG_NIC7018_WDT=m +CONFIG_MEN_A21_WDT=m +CONFIG_XEN_WDT=m + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_PCMCIAHOST_POSSIBLE=y +CONFIG_SSB_PCMCIAHOST=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y 
+CONFIG_SSB_SDIOHOST=y +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +CONFIG_SSB_DRIVER_GPIO=y +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +# CONFIG_BCMA_HOST_SOC is not set +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +CONFIG_BCMA_DRIVER_GPIO=y +# CONFIG_BCMA_DEBUG is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +CONFIG_MFD_ACT8945A=m +CONFIG_MFD_AS3711=y +CONFIG_MFD_AS3722=m +CONFIG_PMIC_ADP5520=y +CONFIG_MFD_AAT2870_CORE=y +CONFIG_MFD_ATMEL_FLEXCOM=m +CONFIG_MFD_ATMEL_HLCDC=m +CONFIG_MFD_BCM590XX=m +CONFIG_MFD_BD9571MWV=m +CONFIG_MFD_AXP20X=m +CONFIG_MFD_AXP20X_I2C=m +CONFIG_MFD_CROS_EC_DEV=m +CONFIG_MFD_MADERA=m +CONFIG_MFD_MADERA_I2C=m +CONFIG_MFD_MADERA_SPI=m +CONFIG_MFD_CS47L15=y +CONFIG_MFD_CS47L35=y +CONFIG_MFD_CS47L85=y +CONFIG_MFD_CS47L90=y +CONFIG_MFD_CS47L92=y +CONFIG_PMIC_DA903X=y +CONFIG_PMIC_DA9052=y +CONFIG_MFD_DA9052_SPI=y +CONFIG_MFD_DA9052_I2C=y +CONFIG_MFD_DA9055=y +CONFIG_MFD_DA9062=m +CONFIG_MFD_DA9063=m +CONFIG_MFD_DA9150=m +CONFIG_MFD_DLN2=m +CONFIG_MFD_MC13XXX=m +CONFIG_MFD_MC13XXX_SPI=m +CONFIG_MFD_MC13XXX_I2C=m +CONFIG_MFD_HI6421_PMIC=m +CONFIG_HTC_PASIC3=m +CONFIG_HTC_I2CPLD=y +CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m +CONFIG_LPC_ICH=m +CONFIG_LPC_SCH=m +CONFIG_INTEL_SOC_PMIC=y +CONFIG_INTEL_SOC_PMIC_BXTWC=m +CONFIG_INTEL_SOC_PMIC_CHTWC=y +CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m +CONFIG_MFD_INTEL_LPSS=m +CONFIG_MFD_INTEL_LPSS_ACPI=m +CONFIG_MFD_INTEL_LPSS_PCI=m +CONFIG_MFD_JANZ_CMODIO=m +CONFIG_MFD_KEMPLD=m +CONFIG_MFD_88PM800=m +CONFIG_MFD_88PM805=m +CONFIG_MFD_88PM860X=y +CONFIG_MFD_MAX14577=m +CONFIG_MFD_MAX77620=y +CONFIG_MFD_MAX77650=m +CONFIG_MFD_MAX77686=m +CONFIG_MFD_MAX77693=m +CONFIG_MFD_MAX77843=y +CONFIG_MFD_MAX8907=m +CONFIG_MFD_MAX8925=y +CONFIG_MFD_MAX8997=y +CONFIG_MFD_MAX8998=y +CONFIG_MFD_MT6397=m +CONFIG_MFD_MENF21BMC=m +CONFIG_EZX_PCAP=y +CONFIG_MFD_CPCAP=m +CONFIG_MFD_VIPERBOARD=m +CONFIG_MFD_RETU=m +CONFIG_MFD_PCF50633=m +CONFIG_PCF50633_ADC=m +CONFIG_PCF50633_GPIO=m +CONFIG_UCB1400_CORE=m +CONFIG_MFD_RDC321X=m +CONFIG_MFD_RT5033=m +CONFIG_MFD_RC5T583=y +CONFIG_MFD_RK808=m +CONFIG_MFD_RN5T618=m +CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SI476X_CORE=m +CONFIG_MFD_SM501=m +CONFIG_MFD_SM501_GPIO=y +CONFIG_MFD_SKY81452=m +CONFIG_MFD_SMSC=y +CONFIG_ABX500_CORE=y +CONFIG_AB3100_CORE=y +CONFIG_AB3100_OTP=y +CONFIG_MFD_STMPE=y + +# +# STMicroelectronics STMPE Interface Drivers +# +CONFIG_STMPE_I2C=y +CONFIG_STMPE_SPI=y +# end of STMicroelectronics STMPE Interface Drivers + +CONFIG_MFD_SYSCON=y +CONFIG_MFD_TI_AM335X_TSCADC=m +CONFIG_MFD_LP3943=m +CONFIG_MFD_LP8788=y +CONFIG_MFD_TI_LMU=m +CONFIG_MFD_PALMAS=y +CONFIG_TPS6105X=m +CONFIG_TPS65010=m +CONFIG_TPS6507X=m +CONFIG_MFD_TPS65086=m +CONFIG_MFD_TPS65090=y +CONFIG_MFD_TPS65217=m +CONFIG_MFD_TPS68470=y +CONFIG_MFD_TI_LP873X=m +CONFIG_MFD_TI_LP87565=m +CONFIG_MFD_TPS65218=m +CONFIG_MFD_TPS6586X=y +CONFIG_MFD_TPS65910=y +CONFIG_MFD_TPS65912=m +CONFIG_MFD_TPS65912_I2C=m +CONFIG_MFD_TPS65912_SPI=m +CONFIG_MFD_TPS80031=y +CONFIG_TWL4030_CORE=y +CONFIG_MFD_TWL4030_AUDIO=y +CONFIG_TWL6040_CORE=y +CONFIG_MFD_WL1273_CORE=m +CONFIG_MFD_LM3533=m +CONFIG_MFD_TC3589X=y +CONFIG_MFD_TQMX86=m +CONFIG_MFD_VX855=m +CONFIG_MFD_LOCHNAGAR=y +CONFIG_MFD_ARIZONA=y +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +CONFIG_MFD_CS47L24=y +CONFIG_MFD_WM5102=y +CONFIG_MFD_WM5110=y +CONFIG_MFD_WM8997=y +CONFIG_MFD_WM8998=y +CONFIG_MFD_WM8400=y +CONFIG_MFD_WM831X=y +CONFIG_MFD_WM831X_I2C=y +CONFIG_MFD_WM831X_SPI=y 
+CONFIG_MFD_WM8350=y +CONFIG_MFD_WM8350_I2C=y +CONFIG_MFD_WM8994=m +CONFIG_MFD_ROHM_BD718XX=m +CONFIG_MFD_ROHM_BD70528=m +CONFIG_MFD_STPMIC1=m +CONFIG_MFD_STMFX=m +CONFIG_RAVE_SP_CORE=m +# end of Multifunction device drivers + +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set +CONFIG_REGULATOR_FIXED_VOLTAGE=m +CONFIG_REGULATOR_VIRTUAL_CONSUMER=m +CONFIG_REGULATOR_USERSPACE_CONSUMER=m +CONFIG_REGULATOR_88PG86X=m +CONFIG_REGULATOR_88PM800=m +CONFIG_REGULATOR_88PM8607=m +CONFIG_REGULATOR_ACT8865=m +CONFIG_REGULATOR_ACT8945A=m +CONFIG_REGULATOR_AD5398=m +CONFIG_REGULATOR_ANATOP=m +CONFIG_REGULATOR_AAT2870=m +CONFIG_REGULATOR_AB3100=m +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +CONFIG_REGULATOR_AS3711=m +CONFIG_REGULATOR_AS3722=m +CONFIG_REGULATOR_AXP20X=m +CONFIG_REGULATOR_BCM590XX=m +CONFIG_REGULATOR_BD70528=m +CONFIG_REGULATOR_BD718XX=m +CONFIG_REGULATOR_BD9571MWV=m +CONFIG_REGULATOR_CPCAP=m +CONFIG_REGULATOR_DA903X=m +CONFIG_REGULATOR_DA9052=m +CONFIG_REGULATOR_DA9055=m +CONFIG_REGULATOR_DA9062=m +CONFIG_REGULATOR_DA9063=m +CONFIG_REGULATOR_DA9210=m +CONFIG_REGULATOR_DA9211=m +CONFIG_REGULATOR_FAN53555=m +CONFIG_REGULATOR_GPIO=m +CONFIG_REGULATOR_HI6421=m +CONFIG_REGULATOR_HI6421V530=m +CONFIG_REGULATOR_ISL9305=m +CONFIG_REGULATOR_ISL6271A=m +CONFIG_REGULATOR_LM363X=m +CONFIG_REGULATOR_LOCHNAGAR=m +CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_LP3972=m +CONFIG_REGULATOR_LP872X=m +CONFIG_REGULATOR_LP873X=m +CONFIG_REGULATOR_LP8755=m +CONFIG_REGULATOR_LP87565=m +CONFIG_REGULATOR_LP8788=m +CONFIG_REGULATOR_LTC3589=m +CONFIG_REGULATOR_LTC3676=m +CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX77620=m +CONFIG_REGULATOR_MAX77650=m +CONFIG_REGULATOR_MAX8649=m +CONFIG_REGULATOR_MAX8660=m +CONFIG_REGULATOR_MAX8907=m +CONFIG_REGULATOR_MAX8925=m +CONFIG_REGULATOR_MAX8952=m +CONFIG_REGULATOR_MAX8973=m +CONFIG_REGULATOR_MAX8997=m +CONFIG_REGULATOR_MAX8998=m +CONFIG_REGULATOR_MAX77686=m +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MAX77802=m +CONFIG_REGULATOR_MC13XXX_CORE=m +CONFIG_REGULATOR_MC13783=m +CONFIG_REGULATOR_MC13892=m +CONFIG_REGULATOR_MCP16502=m +CONFIG_REGULATOR_MT6311=m +CONFIG_REGULATOR_MT6323=m +CONFIG_REGULATOR_MT6397=m +CONFIG_REGULATOR_PALMAS=m +CONFIG_REGULATOR_PCAP=m +CONFIG_REGULATOR_PCF50633=m +CONFIG_REGULATOR_PFUZE100=m +CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m +CONFIG_REGULATOR_PV88090=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_SPMI=m +CONFIG_REGULATOR_RC5T583=m +CONFIG_REGULATOR_RK808=m +CONFIG_REGULATOR_RN5T618=m +CONFIG_REGULATOR_RT5033=m +CONFIG_REGULATOR_S2MPA01=m +CONFIG_REGULATOR_S2MPS11=m +CONFIG_REGULATOR_S5M8767=m +CONFIG_REGULATOR_SKY81452=m +CONFIG_REGULATOR_SLG51000=m +CONFIG_REGULATOR_STPMIC1=m +CONFIG_REGULATOR_SY8106A=m +# CONFIG_REGULATOR_SY8824X is not set +CONFIG_REGULATOR_TPS51632=m +CONFIG_REGULATOR_TPS6105X=m +CONFIG_REGULATOR_TPS62360=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m +CONFIG_REGULATOR_TPS65086=m +CONFIG_REGULATOR_TPS65090=m +CONFIG_REGULATOR_TPS65132=m +CONFIG_REGULATOR_TPS65217=m +CONFIG_REGULATOR_TPS65218=m +CONFIG_REGULATOR_TPS6524X=m +CONFIG_REGULATOR_TPS6586X=m +CONFIG_REGULATOR_TPS65910=m +CONFIG_REGULATOR_TPS65912=m +CONFIG_REGULATOR_TPS80031=m +CONFIG_REGULATOR_TWL4030=m +CONFIG_REGULATOR_VCTRL=m +CONFIG_REGULATOR_WM831X=m +CONFIG_REGULATOR_WM8350=m +CONFIG_REGULATOR_WM8400=m +CONFIG_REGULATOR_WM8994=m +CONFIG_CEC_CORE=y +CONFIG_CEC_NOTIFIER=y +CONFIG_CEC_PIN=y +CONFIG_RC_CORE=m +CONFIG_RC_MAP=m +CONFIG_LIRC=y +CONFIG_RC_DECODERS=y 
+CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m +CONFIG_IR_RC6_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_SANYO_DECODER=m +CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_IR_IMON_DECODER=m +CONFIG_IR_RCMM_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_RC_ATI_REMOTE=m +CONFIG_IR_ENE=m +CONFIG_IR_HIX5HD2=m +CONFIG_IR_IMON=m +CONFIG_IR_IMON_RAW=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_ITE_CIR=m +CONFIG_IR_FINTEK=m +CONFIG_IR_NUVOTON=m +CONFIG_IR_REDRAT3=m +CONFIG_IR_SPI=m +CONFIG_IR_STREAMZAP=m +CONFIG_IR_WINBOND_CIR=m +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_LOOPBACK=m +CONFIG_IR_GPIO_CIR=m +CONFIG_IR_GPIO_TX=m +CONFIG_IR_PWM_TX=m +CONFIG_IR_SERIAL=m +CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m +CONFIG_RC_XBOX_DVD=m +CONFIG_MEDIA_SUPPORT=m + +# +# Multimedia core support +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_CEC_SUPPORT=y +# CONFIG_CEC_PIN_ERROR_INJ is not set +CONFIG_MEDIA_CONTROLLER=y +CONFIG_MEDIA_CONTROLLER_DVB=y +# CONFIG_MEDIA_CONTROLLER_REQUEST_API is not set +CONFIG_VIDEO_DEV=m +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_VIDEO_V4L2=m +CONFIG_VIDEO_V4L2_I2C=y +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_MEM2MEM_DEV=m +CONFIG_V4L2_FLASH_LED_CLASS=m +CONFIG_V4L2_FWNODE=m +CONFIG_VIDEOBUF_GEN=m +CONFIG_VIDEOBUF_DMA_SG=m +CONFIG_VIDEOBUF_VMALLOC=m +CONFIG_DVB_CORE=m +CONFIG_DVB_MMAP=y +CONFIG_DVB_NET=y +CONFIG_TTPCI_EEPROM=m +CONFIG_DVB_MAX_ADAPTERS=16 +# CONFIG_DVB_DYNAMIC_MINORS is not set +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set +# CONFIG_DVB_ULE_DEBUG is not set + +# +# Media drivers +# +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +CONFIG_USB_VIDEO_CLASS=m +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y +CONFIG_USB_GSPCA=m +CONFIG_USB_M5602=m +CONFIG_USB_STV06XX=m +CONFIG_USB_GL860=m +CONFIG_USB_GSPCA_BENQ=m +CONFIG_USB_GSPCA_CONEX=m +CONFIG_USB_GSPCA_CPIA1=m +CONFIG_USB_GSPCA_DTCS033=m +CONFIG_USB_GSPCA_ETOMS=m +CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m +CONFIG_USB_GSPCA_JL2005BCD=m +CONFIG_USB_GSPCA_KINECT=m +CONFIG_USB_GSPCA_KONICA=m +CONFIG_USB_GSPCA_MARS=m +CONFIG_USB_GSPCA_MR97310A=m +CONFIG_USB_GSPCA_NW80X=m +CONFIG_USB_GSPCA_OV519=m +CONFIG_USB_GSPCA_OV534=m +CONFIG_USB_GSPCA_OV534_9=m +CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m +CONFIG_USB_GSPCA_PAC7311=m +CONFIG_USB_GSPCA_SE401=m +CONFIG_USB_GSPCA_SN9C2028=m +CONFIG_USB_GSPCA_SN9C20X=m +CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SPCA1528=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_SQ930X=m +CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_STK1135=m +CONFIG_USB_GSPCA_STV0680=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TOPRO=m +CONFIG_USB_GSPCA_TOUPTEK=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_VICAM=m +CONFIG_USB_GSPCA_XIRLINK_CIT=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_USB_PWC=m +# CONFIG_USB_PWC_DEBUG is not set +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_VIDEO_CPIA2=m +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m +CONFIG_VIDEO_USBTV=m + +# +# Analog TV USB devices +# +CONFIG_VIDEO_PVRUSB2=m 
+CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_USBVISION=m +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m +CONFIG_VIDEO_GO7007=m +CONFIG_VIDEO_GO7007_USB=m +CONFIG_VIDEO_GO7007_LOADER=m +CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_TM6000=m +CONFIG_VIDEO_TM6000_ALSA=m +CONFIG_VIDEO_TM6000_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_CXUSB_ANALOG=y +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_V2=m +CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_AF9035=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_USB_DRV=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_AS102=m + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m +CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +CONFIG_USB_AIRSPY=m +CONFIG_USB_HACKRF=m +CONFIG_USB_MSI2500=m + +# +# USB HDMI CEC adapters +# +CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m +CONFIG_MEDIA_PCI_SUPPORT=y + +# +# Media capture support +# +CONFIG_VIDEO_MEYE=m +CONFIG_VIDEO_SOLO6X10=m +CONFIG_VIDEO_TW5864=m +CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m + +# +# Media capture/analog TV support +# +CONFIG_VIDEO_IVTV=m +# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set +CONFIG_VIDEO_IVTV_ALSA=m +CONFIG_VIDEO_FB_IVTV=m +# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set +CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DT3155=m + +# +# Media capture/analog/hybrid TV support +# +CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_CX18_ALSA=m +CONFIG_VIDEO_CX23885=m +CONFIG_MEDIA_ALTERA_CI=m +CONFIG_VIDEO_CX25821=m +CONFIG_VIDEO_CX25821_ALSA=m +CONFIG_VIDEO_CX88=m +CONFIG_VIDEO_CX88_ALSA=m +CONFIG_VIDEO_CX88_BLACKBIRD=m +CONFIG_VIDEO_CX88_DVB=m +CONFIG_VIDEO_CX88_ENABLE_VP3054=y +CONFIG_VIDEO_CX88_VP3054=m +CONFIG_VIDEO_CX88_MPEG=m +CONFIG_VIDEO_BT848=m +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m +CONFIG_VIDEO_SAA7134_RC=y +CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_GO7007=m +CONFIG_VIDEO_SAA7164=m + +# +# Media digital TV PCI Adapters +# +CONFIG_DVB_AV7110_IR=y +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m 
+CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m +CONFIG_DVB_B2C2_FLEXCOP_PCI=m +# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set +CONFIG_DVB_PLUTO2=m +CONFIG_DVB_DM1105=m +CONFIG_DVB_PT1=m +CONFIG_DVB_PT3=m +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m +CONFIG_DVB_NGENE=m +CONFIG_DVB_DDBRIDGE=m +# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set +CONFIG_DVB_SMIPCIE=m +CONFIG_DVB_NETUP_UNIDVB=m +CONFIG_VIDEO_IPU3_CIO2=m +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CAFE_CCIC=m +CONFIG_VIDEO_CADENCE=y +CONFIG_VIDEO_CADENCE_CSI2RX=m +CONFIG_VIDEO_CADENCE_CSI2TX=m +CONFIG_VIDEO_ASPEED=m +CONFIG_VIDEO_MUX=m +CONFIG_VIDEO_XILINX=m +CONFIG_VIDEO_XILINX_TPG=m +CONFIG_VIDEO_XILINX_VTC=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m +CONFIG_VIDEO_SH_VEU=m +CONFIG_V4L_TEST_DRIVERS=y +CONFIG_VIDEO_VIMC=m +CONFIG_VIDEO_VIVID=m +CONFIG_VIDEO_VIVID_CEC=y +CONFIG_VIDEO_VIVID_MAX_DEVS=64 +CONFIG_VIDEO_VIM2M=m +CONFIG_VIDEO_VICODEC=m +CONFIG_DVB_PLATFORM_DRIVERS=y +CONFIG_CEC_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CROS_EC_CEC=m +CONFIG_CEC_GPIO=m +CONFIG_VIDEO_SECO_CEC=m +CONFIG_VIDEO_SECO_RC=y +CONFIG_SDR_PLATFORM_DRIVERS=y + +# +# Supported MMC/SDIO adapters +# +CONFIG_SMS_SDIO_DRV=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_TEA575X=m +CONFIG_RADIO_SI470X=m +CONFIG_USB_SI470X=m +CONFIG_I2C_SI470X=m +CONFIG_RADIO_SI4713=m +CONFIG_USB_SI4713=m +CONFIG_PLATFORM_SI4713=m +CONFIG_I2C_SI4713=m +CONFIG_RADIO_SI476X=m +CONFIG_USB_MR800=m +CONFIG_USB_DSBR=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_SHARK=m +CONFIG_RADIO_SHARK2=m +CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m +CONFIG_USB_MA901=m +CONFIG_RADIO_TEA5764=m +CONFIG_RADIO_SAA7706H=m +CONFIG_RADIO_TEF6862=m +CONFIG_RADIO_WL1273=m + +# +# Texas Instruments WL128x FM driver (ST based) +# +CONFIG_RADIO_WL128X=m +# end of Texas Instruments WL128x FM driver (ST based) + +# +# Supported FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_INPUT=y +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_V4L2=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +CONFIG_VIDEOBUF2_DMA_SG=m +CONFIG_VIDEOBUF2_DVB=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set +CONFIG_VIDEO_V4L2_TPG=m + +# +# Media ancillary drivers (tuners, sensors, i2c, spi, frontends) +# +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y +CONFIG_MEDIA_ATTACH=y +CONFIG_VIDEO_IR_I2C=m + +# +# I2C Encoders, decoders, sensors and other helper chips +# + +# +# Audio decoders, processors and mixers +# +CONFIG_VIDEO_TVAUDIO=m +CONFIG_VIDEO_TDA7432=m +CONFIG_VIDEO_TDA9840=m +CONFIG_VIDEO_TDA1997X=m +CONFIG_VIDEO_TEA6415C=m +CONFIG_VIDEO_TEA6420=m +CONFIG_VIDEO_MSP3400=m +CONFIG_VIDEO_CS3308=m +CONFIG_VIDEO_CS5345=m +CONFIG_VIDEO_CS53L32A=m +CONFIG_VIDEO_TLV320AIC23B=m +CONFIG_VIDEO_UDA1342=m +CONFIG_VIDEO_WM8775=m +CONFIG_VIDEO_WM8739=m +CONFIG_VIDEO_VP27SMPX=m +CONFIG_VIDEO_SONY_BTF_MPX=m + +# +# RDS decoders +# +CONFIG_VIDEO_SAA6588=m + +# +# Video decoders +# +CONFIG_VIDEO_ADV7180=m +CONFIG_VIDEO_ADV7183=m +CONFIG_VIDEO_ADV748X=m +CONFIG_VIDEO_ADV7604=m +CONFIG_VIDEO_ADV7604_CEC=y +CONFIG_VIDEO_ADV7842=m +CONFIG_VIDEO_ADV7842_CEC=y +CONFIG_VIDEO_BT819=m +CONFIG_VIDEO_BT856=m +CONFIG_VIDEO_BT866=m +CONFIG_VIDEO_KS0127=m +CONFIG_VIDEO_ML86V7667=m 
+CONFIG_VIDEO_SAA7110=m +CONFIG_VIDEO_SAA711X=m +CONFIG_VIDEO_TC358743=m +CONFIG_VIDEO_TC358743_CEC=y +CONFIG_VIDEO_TVP514X=m +CONFIG_VIDEO_TVP5150=m +CONFIG_VIDEO_TVP7002=m +CONFIG_VIDEO_TW2804=m +CONFIG_VIDEO_TW9903=m +CONFIG_VIDEO_TW9906=m +CONFIG_VIDEO_TW9910=m +CONFIG_VIDEO_VPX3220=m + +# +# Video and audio decoders +# +CONFIG_VIDEO_SAA717X=m +CONFIG_VIDEO_CX25840=m + +# +# Video encoders +# +CONFIG_VIDEO_SAA7127=m +CONFIG_VIDEO_SAA7185=m +CONFIG_VIDEO_ADV7170=m +CONFIG_VIDEO_ADV7175=m +CONFIG_VIDEO_ADV7343=m +CONFIG_VIDEO_ADV7393=m +CONFIG_VIDEO_AD9389B=m +CONFIG_VIDEO_AK881X=m +CONFIG_VIDEO_THS8200=m + +# +# Camera sensor devices +# +CONFIG_VIDEO_APTINA_PLL=m +CONFIG_VIDEO_SMIAPP_PLL=m +CONFIG_VIDEO_IMX214=m +CONFIG_VIDEO_IMX258=m +CONFIG_VIDEO_IMX274=m +CONFIG_VIDEO_IMX319=m +CONFIG_VIDEO_IMX355=m +CONFIG_VIDEO_OV2640=m +CONFIG_VIDEO_OV2659=m +CONFIG_VIDEO_OV2680=m +CONFIG_VIDEO_OV2685=m +CONFIG_VIDEO_OV5640=m +CONFIG_VIDEO_OV5645=m +CONFIG_VIDEO_OV5647=m +CONFIG_VIDEO_OV6650=m +CONFIG_VIDEO_OV5670=m +# CONFIG_VIDEO_OV5675 is not set +CONFIG_VIDEO_OV5695=m +CONFIG_VIDEO_OV7251=m +CONFIG_VIDEO_OV772X=m +CONFIG_VIDEO_OV7640=m +CONFIG_VIDEO_OV7670=m +CONFIG_VIDEO_OV7740=m +CONFIG_VIDEO_OV8856=m +CONFIG_VIDEO_OV9640=m +CONFIG_VIDEO_OV9650=m +CONFIG_VIDEO_OV13858=m +CONFIG_VIDEO_VS6624=m +CONFIG_VIDEO_MT9M001=m +CONFIG_VIDEO_MT9M032=m +CONFIG_VIDEO_MT9M111=m +CONFIG_VIDEO_MT9P031=m +CONFIG_VIDEO_MT9T001=m +CONFIG_VIDEO_MT9T112=m +CONFIG_VIDEO_MT9V011=m +CONFIG_VIDEO_MT9V032=m +CONFIG_VIDEO_MT9V111=m +CONFIG_VIDEO_SR030PC30=m +CONFIG_VIDEO_NOON010PC30=m +CONFIG_VIDEO_M5MOLS=m +CONFIG_VIDEO_RJ54N1=m +CONFIG_VIDEO_S5K6AA=m +CONFIG_VIDEO_S5K6A3=m +CONFIG_VIDEO_S5K4ECGX=m +CONFIG_VIDEO_S5K5BAF=m +CONFIG_VIDEO_SMIAPP=m +CONFIG_VIDEO_ET8EK8=m +CONFIG_VIDEO_S5C73M3=m + +# +# Lens drivers +# +CONFIG_VIDEO_AD5820=m +CONFIG_VIDEO_AK7375=m +CONFIG_VIDEO_DW9714=m +CONFIG_VIDEO_DW9807_VCM=m + +# +# Flash devices +# +CONFIG_VIDEO_ADP1653=m +CONFIG_VIDEO_LM3560=m +CONFIG_VIDEO_LM3646=m + +# +# Video improvement chips +# +CONFIG_VIDEO_UPD64031A=m +CONFIG_VIDEO_UPD64083=m + +# +# Audio/Video compression chips +# +CONFIG_VIDEO_SAA6752HS=m + +# +# SDR tuner chips +# +CONFIG_SDR_MAX2175=m + +# +# Miscellaneous helper chips +# +CONFIG_VIDEO_THS7303=m +CONFIG_VIDEO_M52790=m +CONFIG_VIDEO_I2C=m +CONFIG_VIDEO_ST_MIPID02=m +# end of I2C Encoders, decoders, sensors and other helper chips + +# +# SPI helper chips +# +CONFIG_VIDEO_GS1662=m +# end of SPI helper chips + +# +# Media SPI Adapters +# +CONFIG_CXD2880_SPI_DRV=m +# end of Media SPI Adapters + +CONFIG_MEDIA_TUNER=m + +# +# Customize TV tuners +# +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA18250=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_MSI001=m +CONFIG_MEDIA_TUNER_MT20XX=m +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m +CONFIG_MEDIA_TUNER_MT2266=m +CONFIG_MEDIA_TUNER_MT2131=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_XC2028=m +CONFIG_MEDIA_TUNER_XC5000=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88RS6000T=m +CONFIG_MEDIA_TUNER_TUA9001=m 
+CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_IT913X=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_MXL301RF=m +CONFIG_MEDIA_TUNER_QM1D1C0042=m +CONFIG_MEDIA_TUNER_QM1D1B0004=m +# end of Customize TV tuners + +# +# Customise DVB Frontends +# + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV0910=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_STV6111=m +CONFIG_DVB_MXL5XX=m +CONFIG_DVB_M88DS3103=m + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_TDA18271C2DD=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m + +# +# DVB-S (satellite) frontends +# +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m +CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_CX24117=m +CONFIG_DVB_CX24120=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_TS2020=m +CONFIG_DVB_DS3000=m +CONFIG_DVB_MB86A16=m +CONFIG_DVB_TDA10071=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m +CONFIG_DVB_S5H1432=m +CONFIG_DVB_DRXD=m +CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_DIB9000=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m +CONFIG_DVB_STV0367=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +CONFIG_DVB_SI2168=m +CONFIG_DVB_AS102_FE=m +CONFIG_DVB_ZD1301_DEMOD=m +CONFIG_DVB_GP8PSK_FE=m +CONFIG_DVB_CXD2880=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_S921=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +CONFIG_DVB_TC90522=m +CONFIG_DVB_MN88443X=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_DRX39XYJ=m +CONFIG_DVB_LNBH25=m +CONFIG_DVB_LNBH29=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_LNBP22=m +CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_A8293=m +CONFIG_DVB_LGS8GL5=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DVB_ATBM8830=m +CONFIG_DVB_TDA665x=m +CONFIG_DVB_IX2505V=m +CONFIG_DVB_M88RS2000=m +CONFIG_DVB_AF9033=m +CONFIG_DVB_HORUS3A=m +CONFIG_DVB_ASCOT2E=m +CONFIG_DVB_HELENE=m + +# +# Common Interface (EN50221) controller drivers +# +CONFIG_DVB_CXD2099=m +CONFIG_DVB_SP2=m + +# +# Tools to develop new frontends +# +CONFIG_DVB_DUMMY_FE=m +# end of Customise DVB Frontends + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m 
+CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +CONFIG_INTEL_GTT=m +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=10 +CONFIG_VGA_SWITCHEROO=y +CONFIG_DRM=m +CONFIG_DRM_MIPI_DBI=m +CONFIG_DRM_MIPI_DSI=y +CONFIG_DRM_DP_AUX_CHARDEV=y +# CONFIG_DRM_DEBUG_SELFTEST is not set +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_KMS_FB_HELPER=y +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set +CONFIG_DRM_LOAD_EDID_FIRMWARE=y +CONFIG_DRM_DP_CEC=y +CONFIG_DRM_TTM=m +CONFIG_DRM_VRAM_HELPER=m +CONFIG_DRM_GEM_CMA_HELPER=y +CONFIG_DRM_KMS_CMA_HELPER=y +CONFIG_DRM_GEM_SHMEM_HELPER=y +CONFIG_DRM_SCHED=m + +# +# I2C encoder or helper chips +# +CONFIG_DRM_I2C_CH7006=m +CONFIG_DRM_I2C_SIL164=m +CONFIG_DRM_I2C_NXP_TDA998X=m +CONFIG_DRM_I2C_NXP_TDA9950=m +# end of I2C encoder or helper chips + +# +# ARM devices +# +CONFIG_DRM_KOMEDA=m +# end of ARM devices + +CONFIG_DRM_RADEON=m +CONFIG_DRM_RADEON_USERPTR=y +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_AMDGPU_USERPTR=y +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set + +# +# ACP (Audio CoProcessor) Configuration +# +CONFIG_DRM_AMD_ACP=y +# end of ACP (Audio CoProcessor) Configuration + +# +# Display Engine Configuration +# +CONFIG_DRM_AMD_DC=y +CONFIG_DRM_AMD_DC_DCN1_0=y +CONFIG_DRM_AMD_DC_DCN2_0=y +# CONFIG_DRM_AMD_DC_DCN2_1 is not set +CONFIG_DRM_AMD_DC_DSC_SUPPORT=y +# CONFIG_DEBUG_KERNEL_DC is not set +# end of Display Engine Configuration + +CONFIG_HSA_AMD=y +CONFIG_DRM_NOUVEAU=m +# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +# CONFIG_NOUVEAU_DEBUG_MMU is not set +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_NOUVEAU_SVM=y +CONFIG_DRM_I915=m +CONFIG_DRM_I915_ALPHA_SUPPORT=y +CONFIG_DRM_I915_FORCE_PROBE="*" +CONFIG_DRM_I915_CAPTURE_ERROR=y +CONFIG_DRM_I915_COMPRESS_ERROR=y +CONFIG_DRM_I915_USERPTR=y +CONFIG_DRM_I915_GVT=y +CONFIG_DRM_I915_GVT_KVMGT=m + +# +# drm/i915 Debugging +# +# CONFIG_DRM_I915_WERROR is not set +# CONFIG_DRM_I915_DEBUG is not set +# CONFIG_DRM_I915_DEBUG_MMIO is not set +# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set +# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set +# CONFIG_DRM_I915_DEBUG_GUC is not set +# CONFIG_DRM_I915_SELFTEST is not set +# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set +# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set +# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set +# end of drm/i915 Debugging + +# +# drm/i915 Profile Guided Optimisation +# +CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 +CONFIG_DRM_I915_SPIN_REQUEST=5 +# end of drm/i915 Profile Guided Optimisation + +CONFIG_DRM_VGEM=m +CONFIG_DRM_VKMS=m +CONFIG_DRM_VMWGFX=m +CONFIG_DRM_VMWGFX_FBCON=y +CONFIG_DRM_GMA500=m +CONFIG_DRM_GMA600=y +CONFIG_DRM_GMA3600=y +CONFIG_DRM_UDL=m +CONFIG_DRM_AST=m +CONFIG_DRM_MGAG200=m +CONFIG_DRM_CIRRUS_QEMU=m +CONFIG_DRM_RCAR_DW_HDMI=m +CONFIG_DRM_RCAR_LVDS=m +CONFIG_DRM_QXL=m +CONFIG_DRM_BOCHS=m +CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +CONFIG_DRM_PANEL_ARM_VERSATILE=m +CONFIG_DRM_PANEL_LVDS=m +CONFIG_DRM_PANEL_SIMPLE=m +CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m +CONFIG_DRM_PANEL_ILITEK_IL9322=m +CONFIG_DRM_PANEL_ILITEK_ILI9881C=m +CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m +CONFIG_DRM_PANEL_JDI_LT070ME05000=m +CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m +CONFIG_DRM_PANEL_SAMSUNG_LD9040=m +# CONFIG_DRM_PANEL_LG_LB035Q02 is not set +CONFIG_DRM_PANEL_LG_LG4573=m +# CONFIG_DRM_PANEL_NEC_NL8048HL11 is not set +# CONFIG_DRM_PANEL_NOVATEK_NT39016 is not set +CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m 
+CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m +CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m +CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m +CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m +# CONFIG_DRM_PANEL_RAYDIUM_RM67191 is not set +CONFIG_DRM_PANEL_RAYDIUM_RM68200=m +CONFIG_DRM_PANEL_ROCKTECH_JH057N00900=m +CONFIG_DRM_PANEL_RONBO_RB070D30=m +CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m +CONFIG_DRM_PANEL_SEIKO_43WVF1G=m +CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m +# CONFIG_DRM_PANEL_SHARP_LS037V7DW01 is not set +CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m +CONFIG_DRM_PANEL_SITRONIX_ST7701=m +CONFIG_DRM_PANEL_SITRONIX_ST7789V=m +# CONFIG_DRM_PANEL_SONY_ACX565AKM is not set +# CONFIG_DRM_PANEL_TPO_TD028TTEC1 is not set +# CONFIG_DRM_PANEL_TPO_TD043MTEA1 is not set +CONFIG_DRM_PANEL_TPO_TPG110=m +CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m +# end of Display Panels + +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# Display Interface Bridges +# +CONFIG_DRM_ANALOGIX_ANX78XX=m +CONFIG_DRM_CDNS_DSI=m +CONFIG_DRM_DUMB_VGA_DAC=m +CONFIG_DRM_LVDS_ENCODER=m +CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m +CONFIG_DRM_NXP_PTN3460=m +CONFIG_DRM_PARADE_PS8622=m +CONFIG_DRM_SIL_SII8620=m +CONFIG_DRM_SII902X=m +CONFIG_DRM_SII9234=m +CONFIG_DRM_THINE_THC63LVD1024=m +CONFIG_DRM_TOSHIBA_TC358764=m +CONFIG_DRM_TOSHIBA_TC358767=m +CONFIG_DRM_TI_TFP410=m +CONFIG_DRM_TI_SN65DSI86=m +CONFIG_DRM_I2C_ADV7511=m +CONFIG_DRM_I2C_ADV7511_AUDIO=y +CONFIG_DRM_I2C_ADV7533=y +CONFIG_DRM_I2C_ADV7511_CEC=y +CONFIG_DRM_DW_HDMI=m +CONFIG_DRM_DW_HDMI_AHB_AUDIO=m +CONFIG_DRM_DW_HDMI_I2S_AUDIO=m +CONFIG_DRM_DW_HDMI_CEC=m +# end of Display Interface Bridges + +# CONFIG_DRM_ETNAVIV is not set +CONFIG_DRM_ARCPGU=m +CONFIG_DRM_MXS=y +CONFIG_DRM_MXSFB=m +# CONFIG_DRM_GM12U320 is not set +CONFIG_TINYDRM_HX8357D=m +CONFIG_TINYDRM_ILI9225=m +CONFIG_TINYDRM_ILI9341=m +CONFIG_TINYDRM_MI0283QT=m +CONFIG_TINYDRM_REPAPER=m +CONFIG_TINYDRM_ST7586=m +CONFIG_TINYDRM_ST7735R=m +CONFIG_DRM_XEN=y +CONFIG_DRM_XEN_FRONTEND=m +CONFIG_DRM_VBOXVIDEO=m +# CONFIG_DRM_LEGACY is not set +CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y + +# +# Frame buffer Devices +# +CONFIG_FB_CMDLINE=y +CONFIG_FB_NOTIFY=y +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_BOOT_VESA_SUPPORT=y +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +# CONFIG_FB_FOREIGN_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +CONFIG_FB_BACKLIGHT=m +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +CONFIG_FB_VESA=y +CONFIG_FB_EFI=y +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_INTEL is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_VIA is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set
+# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set +# CONFIG_FB_SM501 is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +CONFIG_XEN_FBDEV_FRONTEND=m +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +CONFIG_FB_HYPERV=m +CONFIG_FB_SIMPLE=y +# CONFIG_FB_SSD1307 is not set +# CONFIG_FB_SM712 is not set +# end of Frame buffer Devices + +# +# Backlight & LCD device support +# +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_L4F00242T03=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI922X=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m +CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m +CONFIG_LCD_AMS369FG06=m +CONFIG_LCD_LMS501KF03=m +CONFIG_LCD_HX8357=m +CONFIG_LCD_OTM3225A=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_BACKLIGHT_GENERIC=m +CONFIG_BACKLIGHT_LM3533=m +CONFIG_BACKLIGHT_PWM=m +CONFIG_BACKLIGHT_DA903X=m +CONFIG_BACKLIGHT_DA9052=m +CONFIG_BACKLIGHT_MAX8925=m +CONFIG_BACKLIGHT_APPLE=m +CONFIG_BACKLIGHT_PM8941_WLED=m +CONFIG_BACKLIGHT_SAHARA=m +CONFIG_BACKLIGHT_WM831X=m +CONFIG_BACKLIGHT_ADP5520=m +CONFIG_BACKLIGHT_ADP8860=m +CONFIG_BACKLIGHT_ADP8870=m +CONFIG_BACKLIGHT_88PM860X=m +CONFIG_BACKLIGHT_PCF50633=m +CONFIG_BACKLIGHT_AAT2870=m +CONFIG_BACKLIGHT_LM3630A=m +CONFIG_BACKLIGHT_LM3639=m +CONFIG_BACKLIGHT_LP855X=m +CONFIG_BACKLIGHT_LP8788=m +CONFIG_BACKLIGHT_PANDORA=m +CONFIG_BACKLIGHT_SKY81452=m +CONFIG_BACKLIGHT_TPS65217=m +CONFIG_BACKLIGHT_AS3711=m +CONFIG_BACKLIGHT_GPIO=m +CONFIG_BACKLIGHT_LV5207LP=m +CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m +CONFIG_BACKLIGHT_RAVE_SP=m +# end of Backlight & LCD device support + +CONFIG_VIDEOMODE_HELPERS=y +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 +# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y +# end of Console display driver support + +# CONFIG_LOGO is not set +# end of Graphics support + +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_SEQ_DEVICE=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=m +CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_PCM_TIMER=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +# CONFIG_SND_SUPPORT_OLD_API is not set +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +CONFIG_SND_VERBOSE_PRINTK=y +CONFIG_SND_DEBUG=y +# CONFIG_SND_DEBUG_VERBOSE is not set +# CONFIG_SND_PCM_XRUN_DEBUG is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_SEQ_MIDI_EVENT=m +CONFIG_SND_SEQ_MIDI=m +CONFIG_SND_SEQ_MIDI_EMUL=m +CONFIG_SND_SEQ_VIRMIDI=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL3_LIB_SEQ=m
+CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +# CONFIG_SND_PCSP is not set +CONFIG_SND_DUMMY=m +CONFIG_SND_ALOOP=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ASIHPI=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +# CONFIG_SND_BT87X_OVERCLOCK is not set +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m +CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m +CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_ES1968_INPUT=y +CONFIG_SND_ES1968_RADIO=y +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X_BOOL=y +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LOLA=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MAESTRO3_INPUT=y +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m + +# +# HD-Audio +# +CONFIG_SND_HDA=m +CONFIG_SND_HDA_INTEL=m +# CONFIG_SND_HDA_INTEL_DETECT_DMIC is not set +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_RECONFIG=y +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_BEEP_MODE=1 +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_ANALOG=m +CONFIG_SND_HDA_CODEC_SIGMATEL=m +CONFIG_SND_HDA_CODEC_VIA=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_HDA_CODEC_CIRRUS=m +CONFIG_SND_HDA_CODEC_CONEXANT=m +CONFIG_SND_HDA_CODEC_CA0110=m +CONFIG_SND_HDA_CODEC_CA0132=m +CONFIG_SND_HDA_CODEC_CA0132_DSP=y +CONFIG_SND_HDA_CODEC_CMEDIA=m +CONFIG_SND_HDA_CODEC_SI3054=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +# end of HD-Audio + +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_COMPONENT=y +CONFIG_SND_HDA_I915=y +CONFIG_SND_HDA_EXT_CORE=m +CONFIG_SND_HDA_PREALLOC_SIZE=4096 +CONFIG_SND_INTEL_NHLT=m +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y +CONFIG_SND_USB_US122L=m +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_FIREWIRE=y +CONFIG_SND_FIREWIRE_LIB=m +CONFIG_SND_DICE=m +CONFIG_SND_OXFW=m +CONFIG_SND_ISIGHT=m +CONFIG_SND_FIREWORKS=m +CONFIG_SND_BEBOB=m +CONFIG_SND_FIREWIRE_DIGI00X=m +CONFIG_SND_FIREWIRE_TASCAM=m 
+CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_AC97_BUS=y +CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +CONFIG_SND_SOC_TOPOLOGY=y +CONFIG_SND_SOC_ACPI=m +CONFIG_SND_SOC_AMD_ACP=m +CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m +CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m +CONFIG_SND_SOC_AMD_ACP3x=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_SOC_MIKROE_PROTO=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +# CONFIG_SND_SOC_FSL_ASRC is not set +# CONFIG_SND_SOC_FSL_SAI is not set +# CONFIG_SND_SOC_FSL_AUDMIX is not set +# CONFIG_SND_SOC_FSL_SSI is not set +# CONFIG_SND_SOC_FSL_SPDIF is not set +# CONFIG_SND_SOC_FSL_ESAI is not set +# CONFIG_SND_SOC_FSL_MICFIL is not set +# CONFIG_SND_SOC_IMX_AUDMUX is not set +# end of SoC Audio for Freescale CPUs + +CONFIG_SND_I2S_HI6210_I2S=m +CONFIG_SND_SOC_IMG=y +CONFIG_SND_SOC_IMG_I2S_IN=m +CONFIG_SND_SOC_IMG_I2S_OUT=m +CONFIG_SND_SOC_IMG_PARALLEL_OUT=m +CONFIG_SND_SOC_IMG_SPDIF_IN=m +CONFIG_SND_SOC_IMG_SPDIF_OUT=m +CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m +CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y +CONFIG_SND_SST_IPC=m +CONFIG_SND_SST_IPC_PCI=m +CONFIG_SND_SST_IPC_ACPI=m +CONFIG_SND_SOC_INTEL_SST_ACPI=m +CONFIG_SND_SOC_INTEL_SST=m +CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m +CONFIG_SND_SOC_INTEL_HASWELL=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m +CONFIG_SND_SOC_INTEL_SKYLAKE=m +CONFIG_SND_SOC_INTEL_SKL=m +CONFIG_SND_SOC_INTEL_APL=m +CONFIG_SND_SOC_INTEL_KBL=m +CONFIG_SND_SOC_INTEL_GLK=m +CONFIG_SND_SOC_INTEL_CNL=m +CONFIG_SND_SOC_INTEL_CFL=m +CONFIG_SND_SOC_INTEL_CML_H=m +CONFIG_SND_SOC_INTEL_CML_LP=m +CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m +CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m +# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set +CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m +CONFIG_SND_SOC_ACPI_INTEL_MATCH=m +CONFIG_SND_SOC_INTEL_MACH=y +CONFIG_SND_SOC_INTEL_HASWELL_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m +CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m +# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set +CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m +CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m +CONFIG_SND_SOC_MTK_BTCVSD=m +# CONFIG_SND_SOC_SOF_TOPLEVEL is not set + +# +# STMicroelectronics STM32 SOC audio support +# +# end of STMicroelectronics STM32 SOC audio support + +CONFIG_SND_SOC_XILINX_I2S=m +CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m +CONFIG_SND_SOC_XILINX_SPDIF=m +CONFIG_SND_SOC_XTFPGA_I2S=m 
+CONFIG_ZX_TDM=m +CONFIG_SND_SOC_I2C_AND_SPI=m + +# +# CODEC drivers +# +CONFIG_SND_SOC_AC97_CODEC=m +CONFIG_SND_SOC_ADAU_UTILS=m +CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m +CONFIG_SND_SOC_ADAU7002=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4118=m +CONFIG_SND_SOC_AK4458=m +CONFIG_SND_SOC_AK4554=m +CONFIG_SND_SOC_AK4613=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK5386=m +CONFIG_SND_SOC_AK5558=m +CONFIG_SND_SOC_ALC5623=m +CONFIG_SND_SOC_BD28623=m +# CONFIG_SND_SOC_BT_SCO is not set +CONFIG_SND_SOC_CPCAP=m +CONFIG_SND_SOC_CROS_EC_CODEC=m +CONFIG_SND_SOC_CS35L32=m +CONFIG_SND_SOC_CS35L33=m +CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m +CONFIG_SND_SOC_CS35L36=m +CONFIG_SND_SOC_CS42L42=m +CONFIG_SND_SOC_CS42L51=m +CONFIG_SND_SOC_CS42L51_I2C=m +CONFIG_SND_SOC_CS42L52=m +CONFIG_SND_SOC_CS42L56=m +CONFIG_SND_SOC_CS42L73=m +CONFIG_SND_SOC_CS4265=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +CONFIG_SND_SOC_CS4271_SPI=m +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +CONFIG_SND_SOC_CS43130=m +CONFIG_SND_SOC_CS4341=m +CONFIG_SND_SOC_CS4349=m +CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_CX2072X=m +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m +CONFIG_SND_SOC_ES7241=m +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8328=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_GTM601=m +CONFIG_SND_SOC_HDAC_HDMI=m +CONFIG_SND_SOC_INNO_RK3036=m +CONFIG_SND_SOC_LOCHNAGAR_SC=m +CONFIG_SND_SOC_MAX98088=m +CONFIG_SND_SOC_MAX98090=m +CONFIG_SND_SOC_MAX98357A=m +CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX9867=m +CONFIG_SND_SOC_MAX98927=m +CONFIG_SND_SOC_MAX98373=m +CONFIG_SND_SOC_MAX9860=m +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m +CONFIG_SND_SOC_PCM1681=m +CONFIG_SND_SOC_PCM1789=m +CONFIG_SND_SOC_PCM1789_I2C=m +CONFIG_SND_SOC_PCM179X=m +CONFIG_SND_SOC_PCM179X_I2C=m +CONFIG_SND_SOC_PCM179X_SPI=m +CONFIG_SND_SOC_PCM186X=m +CONFIG_SND_SOC_PCM186X_I2C=m +CONFIG_SND_SOC_PCM186X_SPI=m +CONFIG_SND_SOC_PCM3060=m +CONFIG_SND_SOC_PCM3060_I2C=m +CONFIG_SND_SOC_PCM3060_SPI=m +CONFIG_SND_SOC_PCM3168A=m +CONFIG_SND_SOC_PCM3168A_I2C=m +CONFIG_SND_SOC_PCM3168A_SPI=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_RK3328=m +CONFIG_SND_SOC_RL6231=m +CONFIG_SND_SOC_RL6347A=m +CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m +CONFIG_SND_SOC_RT5514=m +CONFIG_SND_SOC_RT5514_SPI=m +CONFIG_SND_SOC_RT5616=m +CONFIG_SND_SOC_RT5631=m +CONFIG_SND_SOC_RT5640=m +CONFIG_SND_SOC_RT5645=m +CONFIG_SND_SOC_RT5651=m +CONFIG_SND_SOC_RT5660=m +CONFIG_SND_SOC_RT5663=m +CONFIG_SND_SOC_RT5670=m +CONFIG_SND_SOC_RT5677=m +CONFIG_SND_SOC_RT5677_SPI=m +CONFIG_SND_SOC_RT5682=m +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SI476X=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m +CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m +CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2305=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_SSM2602_SPI=m +CONFIG_SND_SOC_SSM2602_I2C=m +CONFIG_SND_SOC_SSM4567=m +CONFIG_SND_SOC_STA32X=m +CONFIG_SND_SOC_STA350=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SOC_TAS2552=m +CONFIG_SND_SOC_TAS5086=m +CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m +CONFIG_SND_SOC_TAS6424=m +CONFIG_SND_SOC_TDA7419=m +CONFIG_SND_SOC_TFA9879=m +CONFIG_SND_SOC_TLV320AIC23=m 
+CONFIG_SND_SOC_TLV320AIC23_I2C=m +CONFIG_SND_SOC_TLV320AIC23_SPI=m +CONFIG_SND_SOC_TLV320AIC31XX=m +CONFIG_SND_SOC_TLV320AIC32X4=m +CONFIG_SND_SOC_TLV320AIC32X4_I2C=m +CONFIG_SND_SOC_TLV320AIC32X4_SPI=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TS3A227E=m +CONFIG_SND_SOC_TSCS42XX=m +CONFIG_SND_SOC_TSCS454=m +# CONFIG_SND_SOC_UDA1334 is not set +CONFIG_SND_SOC_WCD9335=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8524=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8737=m +CONFIG_SND_SOC_WM8741=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8770=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8782=m +CONFIG_SND_SOC_WM8804=m +CONFIG_SND_SOC_WM8804_I2C=m +CONFIG_SND_SOC_WM8804_SPI=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8904=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8962=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_WM8985=m +CONFIG_SND_SOC_ZX_AUD96P22=m +CONFIG_SND_SOC_MAX9759=m +CONFIG_SND_SOC_MT6351=m +CONFIG_SND_SOC_MT6358=m +CONFIG_SND_SOC_NAU8540=m +CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8822=m +CONFIG_SND_SOC_NAU8824=m +CONFIG_SND_SOC_NAU8825=m +CONFIG_SND_SOC_TPA6130A2=m +# end of CODEC drivers + +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_AUDIO_GRAPH_CARD=m +CONFIG_SND_X86=y +CONFIG_HDMI_LPE_AUDIO=m +CONFIG_SND_SYNTH_EMUX=m +CONFIG_SND_XEN_FRONTEND=m +CONFIG_AC97_BUS=m + +# +# HID support +# +CONFIG_HID=m +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=m +CONFIG_HID_GENERIC=m + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m +CONFIG_HID_ACRUX=m +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=m +CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m +CONFIG_HID_AUREAL=m +CONFIG_HID_BELKIN=m +CONFIG_HID_BETOP_FF=m +CONFIG_HID_BIGBEN_FF=m +CONFIG_HID_CHERRY=m +CONFIG_HID_CHICONY=m +CONFIG_HID_CORSAIR=m +CONFIG_HID_COUGAR=m +CONFIG_HID_MACALLY=m +CONFIG_HID_PRODIKEYS=m +CONFIG_HID_CMEDIA=m +CONFIG_HID_CP2112=m +# CONFIG_HID_CREATIVE_SB0540 is not set +CONFIG_HID_CYPRESS=m +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=m +CONFIG_HID_ELAN=m +CONFIG_HID_ELECOM=m +CONFIG_HID_ELO=m +CONFIG_HID_EZKEY=m +CONFIG_HID_GEMBIRD=m +CONFIG_HID_GFRM=m +CONFIG_HID_HOLTEK=m +CONFIG_HOLTEK_FF=y +CONFIG_HID_GOOGLE_HAMMER=m +CONFIG_HID_GT683R=m +CONFIG_HID_KEYTOUCH=m +CONFIG_HID_KYE=m +CONFIG_HID_UCLOGIC=m +CONFIG_HID_WALTOP=m +CONFIG_HID_VIEWSONIC=m +CONFIG_HID_GYRATION=m +CONFIG_HID_ICADE=m +CONFIG_HID_ITE=m +CONFIG_HID_JABRA=m +CONFIG_HID_TWINHAN=m +CONFIG_HID_KENSINGTON=m +CONFIG_HID_LCPOWER=m +CONFIG_HID_LED=m +CONFIG_HID_LENOVO=m +CONFIG_HID_LOGITECH=m +CONFIG_HID_LOGITECH_DJ=m +CONFIG_HID_LOGITECH_HIDPP=m +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +CONFIG_HID_MAGICMOUSE=m +CONFIG_HID_MALTRON=m +CONFIG_HID_MAYFLASH=m +CONFIG_HID_REDRAGON=m +CONFIG_HID_MICROSOFT=m +CONFIG_HID_MONTEREY=m +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m +CONFIG_HID_NTRIG=m +CONFIG_HID_ORTEK=m +CONFIG_HID_PANTHERLORD=m +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=m +CONFIG_HID_PETALYNX=m +CONFIG_HID_PICOLCD=m +CONFIG_HID_PICOLCD_FB=y +CONFIG_HID_PICOLCD_BACKLIGHT=y +CONFIG_HID_PICOLCD_LCD=y +CONFIG_HID_PICOLCD_LEDS=y +CONFIG_HID_PICOLCD_CIR=y +CONFIG_HID_PLANTRONICS=m +CONFIG_HID_PRIMAX=m +CONFIG_HID_RETRODE=m +CONFIG_HID_ROCCAT=m +CONFIG_HID_SAITEK=m +CONFIG_HID_SAMSUNG=m +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y +CONFIG_HID_SPEEDLINK=m +CONFIG_HID_STEAM=m 
+CONFIG_HID_STEELSERIES=m +CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m +CONFIG_HID_GREENASIA=m +CONFIG_GREENASIA_FF=y +CONFIG_HID_HYPERV_MOUSE=m +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=m +CONFIG_HID_TOPSEED=m +CONFIG_HID_THINGM=m +CONFIG_HID_THRUSTMASTER=m +CONFIG_THRUSTMASTER_FF=y +CONFIG_HID_UDRAW_PS3=m +CONFIG_HID_U2FZERO=m +CONFIG_HID_WACOM=m +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=m +CONFIG_HID_ZEROPLUS=m +CONFIG_ZEROPLUS_FF=y +CONFIG_HID_ZYDACRON=m +CONFIG_HID_SENSOR_HUB=m +# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set +CONFIG_HID_ALPS=m +# end of Special HID drivers + +# +# USB HID support +# +CONFIG_USB_HID=m +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y + +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +# end of USB HID Boot Protocol drivers +# end of USB HID support + +# +# I2C HID support +# +CONFIG_I2C_HID=m +# end of I2C HID support + +# +# Intel ISH HID support +# +CONFIG_INTEL_ISH_HID=m +CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m +# end of Intel ISH HID support +# end of HID support + +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +CONFIG_USB_LED_TRIG=y +CONFIG_USB_ULPI_BUS=m +# CONFIG_USB_CONN_GPIO is not set +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=y +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +CONFIG_USB_DYNAMIC_MINORS=y +# CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_LEDS_TRIGGER_USBPORT=m +CONFIG_USB_AUTOSUSPEND_DELAY=2 +CONFIG_USB_MON=m + +# +# USB Host Controller Drivers +# +CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +# CONFIG_USB_XHCI_DBGCAP is not set +CONFIG_USB_XHCI_PCI=m +CONFIG_USB_XHCI_PLATFORM=m +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_TT_NEWSCHED=y +CONFIG_USB_EHCI_PCI=m +CONFIG_USB_EHCI_FSL=m +CONFIG_USB_EHCI_HCD_PLATFORM=m +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_FOTG210_HCD=m +CONFIG_USB_MAX3421_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PCI=m +# CONFIG_USB_OHCI_HCD_SSB is not set +CONFIG_USB_OHCI_HCD_PLATFORM=m +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +# CONFIG_USB_SL811_HCD_ISO is not set +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_HCD_BCMA=m +CONFIG_USB_HCD_SSB=m +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_REALTEK=m +CONFIG_REALTEK_AUTOPM=y +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m +CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m +CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_STORAGE_ENE_UB6250=m +CONFIG_USB_UAS=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USBIP_CORE=m +CONFIG_USBIP_VHCI_HCD=m +CONFIG_USBIP_VHCI_HC_PORTS=8 +CONFIG_USBIP_VHCI_NR_HCS=1 +CONFIG_USBIP_HOST=m +CONFIG_USBIP_VUDC=m +# CONFIG_USBIP_DEBUG is not set +# CONFIG_USB_CDNS3 is not set +CONFIG_USB_MUSB_HDRC=m +# CONFIG_USB_MUSB_HOST is not set +# CONFIG_USB_MUSB_GADGET is not set 
+CONFIG_USB_MUSB_DUAL_ROLE=y + +# +# Platform Glue Layer +# + +# +# MUSB DMA mode +# +# CONFIG_MUSB_PIO_ONLY is not set +CONFIG_USB_DWC3=m +CONFIG_USB_DWC3_ULPI=y +# CONFIG_USB_DWC3_HOST is not set +# CONFIG_USB_DWC3_GADGET is not set +CONFIG_USB_DWC3_DUAL_ROLE=y + +# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_PCI=m +CONFIG_USB_DWC3_HAPS=m +CONFIG_USB_DWC3_OF_SIMPLE=m +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_HOST is not set + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PERIPHERAL is not set +CONFIG_USB_DWC2_DUAL_ROLE=y +CONFIG_USB_DWC2_PCI=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +CONFIG_USB_CHIPIDEA=m +CONFIG_USB_CHIPIDEA_OF=m +CONFIG_USB_CHIPIDEA_PCI=m +CONFIG_USB_CHIPIDEA_UDC=y +CONFIG_USB_CHIPIDEA_HOST=y +CONFIG_USB_ISP1760=m +CONFIG_USB_ISP1760_HCD=y +CONFIG_USB_ISP1761_UDC=y +# CONFIG_USB_ISP1760_HOST_ROLE is not set +# CONFIG_USB_ISP1760_GADGET_ROLE is not set +CONFIG_USB_ISP1760_DUAL_ROLE=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=y +CONFIG_USB_SERIAL_CONSOLE=y +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_SIMPLE=m +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_F81232=m +CONFIG_USB_SERIAL_F8153X=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_METRO=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7715_PARPORT=y +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MXUPORT=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QCAUX=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_SAFE=m +# CONFIG_USB_SERIAL_SAFE_PADDED is not set +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_WWAN=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_XSENS_MT=m +CONFIG_USB_SERIAL_WISHBONE=m +CONFIG_USB_SERIAL_SSU100=m +CONFIG_USB_SERIAL_QT2=m +CONFIG_USB_SERIAL_UPD78F0730=m +CONFIG_USB_SERIAL_DEBUG=m + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m +CONFIG_USB_SEVSEG=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m +CONFIG_USB_APPLEDISPLAY=m +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +CONFIG_USB_TRANCEVIBRATOR=m +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_EHSET_TEST_FIXTURE=m +CONFIG_USB_ISIGHTFW=m +CONFIG_USB_YUREX=m +CONFIG_USB_EZUSB_FX2=m +CONFIG_USB_HUB_USB251XB=m +CONFIG_USB_HSIC_USB3503=m +CONFIG_USB_HSIC_USB4604=m +CONFIG_USB_LINK_LAYER_TEST=m +CONFIG_USB_CHAOSKEY=m +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y 
+CONFIG_NOP_USB_XCEIV=m +CONFIG_USB_GPIO_VBUS=m +CONFIG_TAHVO_USB=m +# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set +CONFIG_USB_ISP1301=m +# end of USB Physical Layer drivers + +CONFIG_USB_GADGET=m +# CONFIG_USB_GADGET_DEBUG is not set +# CONFIG_USB_GADGET_DEBUG_FILES is not set +# CONFIG_USB_GADGET_DEBUG_FS is not set +CONFIG_USB_GADGET_VBUS_DRAW=2 +CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 +CONFIG_U_SERIAL_CONSOLE=y + +# +# USB Peripheral Controller +# +CONFIG_USB_FOTG210_UDC=m +CONFIG_USB_GR_UDC=m +CONFIG_USB_R8A66597=m +CONFIG_USB_PXA27X=m +CONFIG_USB_MV_UDC=m +CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m +CONFIG_USB_SNP_UDC_PLAT=m +CONFIG_USB_M66592=m +CONFIG_USB_BDC_UDC=m + +# +# Platform Support +# +CONFIG_USB_BDC_PCI=m +CONFIG_USB_AMD5536UDC=m +CONFIG_USB_NET2272=m +CONFIG_USB_NET2272_DMA=y +CONFIG_USB_NET2280=m +CONFIG_USB_GOKU=m +CONFIG_USB_EG20T=m +CONFIG_USB_GADGET_XILINX=m +CONFIG_USB_DUMMY_HCD=m +# end of USB Peripheral Controller + +CONFIG_USB_LIBCOMPOSITE=m +CONFIG_USB_F_ACM=m +CONFIG_USB_F_SS_LB=m +CONFIG_USB_U_SERIAL=m +CONFIG_USB_U_ETHER=m +CONFIG_USB_U_AUDIO=m +CONFIG_USB_F_SERIAL=m +CONFIG_USB_F_OBEX=m +CONFIG_USB_F_NCM=m +CONFIG_USB_F_ECM=m +CONFIG_USB_F_PHONET=m +CONFIG_USB_F_EEM=m +CONFIG_USB_F_SUBSET=m +CONFIG_USB_F_RNDIS=m +CONFIG_USB_F_MASS_STORAGE=m +CONFIG_USB_F_FS=m +CONFIG_USB_F_UAC1=m +CONFIG_USB_F_UAC1_LEGACY=m +CONFIG_USB_F_UAC2=m +CONFIG_USB_F_UVC=m +CONFIG_USB_F_MIDI=m +CONFIG_USB_F_HID=m +CONFIG_USB_F_PRINTER=m +CONFIG_USB_F_TCM=m +CONFIG_USB_CONFIGFS=m +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_OBEX=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_ECM_SUBSET=y +CONFIG_USB_CONFIGFS_RNDIS=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_PHONET=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_LB_SS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_UAC1=y +CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_USB_CONFIGFS_F_PRINTER=y +CONFIG_USB_CONFIGFS_F_TCM=y +CONFIG_USB_ZERO=m +CONFIG_USB_AUDIO=m +# CONFIG_GADGET_UAC1 is not set +CONFIG_USB_ETH=m +CONFIG_USB_ETH_RNDIS=y +CONFIG_USB_ETH_EEM=y +CONFIG_USB_G_NCM=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FUNCTIONFS=m +CONFIG_USB_FUNCTIONFS_ETH=y +CONFIG_USB_FUNCTIONFS_RNDIS=y +CONFIG_USB_FUNCTIONFS_GENERIC=y +CONFIG_USB_MASS_STORAGE=m +CONFIG_USB_GADGET_TARGET=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_MIDI_GADGET=m +CONFIG_USB_G_PRINTER=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_USB_G_NOKIA=m +CONFIG_USB_G_ACM_MS=m +CONFIG_USB_G_MULTI=m +CONFIG_USB_G_MULTI_RNDIS=y +CONFIG_USB_G_MULTI_CDC=y +CONFIG_USB_G_HID=m +CONFIG_USB_G_DBGP=m +# CONFIG_USB_G_DBGP_PRINTK is not set +CONFIG_USB_G_DBGP_SERIAL=y +CONFIG_USB_G_WEBCAM=m +CONFIG_TYPEC=m +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_RT1711H=m +CONFIG_TYPEC_FUSB302=m +CONFIG_TYPEC_WCOVE=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_CCG=m +CONFIG_UCSI_ACPI=m +CONFIG_TYPEC_TPS6598X=m + +# +# USB Type-C Multiplexer/DeMultiplexer Switch support +# +CONFIG_TYPEC_MUX_PI3USB30532=m +# end of USB Type-C Multiplexer/DeMultiplexer Switch support + +# +# USB Type-C Alternate Mode drivers +# +CONFIG_TYPEC_DP_ALTMODE=m +CONFIG_TYPEC_NVIDIA_ALTMODE=m +# end of USB Type-C Alternate Mode drivers + +CONFIG_USB_ROLE_SWITCH=m +CONFIG_USB_ROLES_INTEL_XHCI=m +CONFIG_MMC=m +CONFIG_PWRSEQ_EMMC=m +CONFIG_PWRSEQ_SD8787=m +CONFIG_PWRSEQ_SIMPLE=m +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_MINORS=8 +CONFIG_SDIO_UART=m 
+CONFIG_MMC_TEST=m + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_IO_ACCESSORS=y +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=y +CONFIG_MMC_SDHCI_ACPI=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_SDHCI_OF_ARASAN=m +# CONFIG_MMC_SDHCI_OF_ASPEED is not set +CONFIG_MMC_SDHCI_OF_AT91=m +CONFIG_MMC_SDHCI_OF_DWCMSHC=m +CONFIG_MMC_SDHCI_CADENCE=m +CONFIG_MMC_SDHCI_F_SDH30=m +CONFIG_MMC_WBSD=m +CONFIG_MMC_ALCOR=m +CONFIG_MMC_TIFM_SD=m +CONFIG_MMC_SPI=m +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MMC_VUB300=m +CONFIG_MMC_USHC=m +CONFIG_MMC_USDHI6ROL0=m +CONFIG_MMC_REALTEK_PCI=m +CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_CQHCI=m +CONFIG_MMC_TOSHIBA_PCI=m +CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m +CONFIG_MMC_SDHCI_OMAP=m +CONFIG_MMC_SDHCI_AM654=m +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m +CONFIG_MS_BLOCK=m + +# +# MemoryStick Host Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_MEMSTICK_R592=m +CONFIG_MEMSTICK_REALTEK_PCI=m +CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_CLASS_FLASH=m +CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y + +# +# LED drivers +# +CONFIG_LEDS_88PM860X=m +CONFIG_LEDS_AAT1290=m +CONFIG_LEDS_AN30259A=m +CONFIG_LEDS_APU=m +CONFIG_LEDS_AS3645A=m +CONFIG_LEDS_BCM6328=m +CONFIG_LEDS_BCM6358=m +CONFIG_LEDS_CPCAP=m +CONFIG_LEDS_CR0014114=m +CONFIG_LEDS_LM3530=m +CONFIG_LEDS_LM3532=m +CONFIG_LEDS_LM3533=m +CONFIG_LEDS_LM3642=m +CONFIG_LEDS_LM3692X=m +CONFIG_LEDS_LM3601X=m +CONFIG_LEDS_MT6323=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_PCA9532_GPIO=y +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_LP3952=m +# CONFIG_LEDS_LP5521 is not set +# CONFIG_LEDS_LP5523 is not set +# CONFIG_LEDS_LP5562 is not set +# CONFIG_LEDS_LP8501 is not set +CONFIG_LEDS_LP8788=m +CONFIG_LEDS_LP8860=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +CONFIG_LEDS_PCA955X_GPIO=y +CONFIG_LEDS_PCA963X=m +CONFIG_LEDS_WM831X_STATUS=m +CONFIG_LEDS_WM8350=m +CONFIG_LEDS_DA903X=m +CONFIG_LEDS_DA9052=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_PWM=m +CONFIG_LEDS_REGULATOR=m +CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m +CONFIG_LEDS_ADP5520=m +CONFIG_LEDS_MC13783=m +CONFIG_LEDS_TCA6507=m +CONFIG_LEDS_TLC591XX=m +CONFIG_LEDS_MAX77650=m +CONFIG_LEDS_MAX77693=m +CONFIG_LEDS_MAX8997=m +CONFIG_LEDS_LM355x=m +CONFIG_LEDS_MENF21BMC=m +CONFIG_LEDS_KTD2692=m +CONFIG_LEDS_IS31FL319X=m +CONFIG_LEDS_IS31FL32XX=m + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +CONFIG_LEDS_BLINKM=m +CONFIG_LEDS_SYSCON=y +CONFIG_LEDS_MLXCPLD=m +CONFIG_LEDS_MLXREG=m +CONFIG_LEDS_USER=m +CONFIG_LEDS_NIC78BX=m +CONFIG_LEDS_SPI_BYTE=m +CONFIG_LEDS_TI_LMU_COMMON=m +CONFIG_LEDS_LM3697=m +CONFIG_LEDS_LM36274=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_ONESHOT=m +CONFIG_LEDS_TRIGGER_DISK=y +CONFIG_LEDS_TRIGGER_MTD=y +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_ACTIVITY=m +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=m +CONFIG_LEDS_TRIGGER_CAMERA=m +CONFIG_LEDS_TRIGGER_PANIC=y +CONFIG_LEDS_TRIGGER_NETDEV=m +CONFIG_LEDS_TRIGGER_PATTERN=m +CONFIG_LEDS_TRIGGER_AUDIO=m +CONFIG_ACCESSIBILITY=y 
+CONFIG_A11Y_BRAILLE_CONSOLE=y +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_QIB=m +CONFIG_INFINIBAND_QIB_DCA=y +CONFIG_INFINIBAND_CXGB3=m +CONFIG_INFINIBAND_CXGB4=m +CONFIG_INFINIBAND_EFA=m +CONFIG_INFINIBAND_I40IW=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_INFINIBAND_OCRDMA=m +CONFIG_INFINIBAND_VMWARE_PVRDMA=m +CONFIG_INFINIBAND_USNIC=m +CONFIG_INFINIBAND_BNXT_RE=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +# CONFIG_SDMA_VERBOSITY is not set +CONFIG_INFINIBAND_QEDR=m +CONFIG_INFINIBAND_RDMAVT=m +CONFIG_RDMA_RXE=m +CONFIG_RDMA_SIW=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_SRPT=m +CONFIG_INFINIBAND_ISER=m +CONFIG_INFINIBAND_ISERT=m +CONFIG_INFINIBAND_OPA_VNIC=m +CONFIG_EDAC_ATOMIC_SCRUB=y +CONFIG_EDAC_SUPPORT=y +CONFIG_EDAC=y +CONFIG_EDAC_LEGACY_SYSFS=y +# CONFIG_EDAC_DEBUG is not set +CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_GHES=y +CONFIG_EDAC_AMD64=m +# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set +CONFIG_EDAC_E752X=m +CONFIG_EDAC_I82975X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m +CONFIG_EDAC_X38=m +CONFIG_EDAC_I5400=m +CONFIG_EDAC_I7CORE=m +CONFIG_EDAC_I5000=m +CONFIG_EDAC_I5100=m +CONFIG_EDAC_I7300=m +CONFIG_EDAC_SBRIDGE=m +CONFIG_EDAC_SKX=m +CONFIG_EDAC_I10NM=m +CONFIG_EDAC_PND2=m +CONFIG_RTC_LIB=y +CONFIG_RTC_MC146818_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y +CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_88PM860X=m +CONFIG_RTC_DRV_88PM80X=m +CONFIG_RTC_DRV_ABB5ZES3=m +CONFIG_RTC_DRV_ABEOZ9=m +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_AS3722=m +CONFIG_RTC_DRV_DS1307=m +CONFIG_RTC_DRV_DS1307_CENTURY=y +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1374_WDT=y +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_HYM8563=m +CONFIG_RTC_DRV_LP8788=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_MAX8907=m +CONFIG_RTC_DRV_MAX8925=m +CONFIG_RTC_DRV_MAX8998=m +CONFIG_RTC_DRV_MAX8997=m +CONFIG_RTC_DRV_MAX77686=m +CONFIG_RTC_DRV_RK808=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_ISL12026=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8523=m +CONFIG_RTC_DRV_PCF85063=m +CONFIG_RTC_DRV_PCF85363=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BD70528=m +CONFIG_RTC_DRV_BQ32K=m +CONFIG_RTC_DRV_TWL4030=m +CONFIG_RTC_DRV_PALMAS=m +CONFIG_RTC_DRV_TPS6586X=m +CONFIG_RTC_DRV_TPS65910=m +CONFIG_RTC_DRV_TPS80031=m +CONFIG_RTC_DRV_RC5T583=m +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_RX8025=m +CONFIG_RTC_DRV_EM3027=m +CONFIG_RTC_DRV_RV3028=m +CONFIG_RTC_DRV_RV8803=m +CONFIG_RTC_DRV_S5M=m +CONFIG_RTC_DRV_SD3078=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T93=m +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m 
+CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1343=m +CONFIG_RTC_DRV_DS1347=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6916=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RX4581=m +CONFIG_RTC_DRV_RX6110=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_PCF2123=m +CONFIG_RTC_DRV_MCP795=m +CONFIG_RTC_I2C_AND_SPI=y + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=y +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1685_FAMILY=m +CONFIG_RTC_DRV_DS1685=y +# CONFIG_RTC_DRV_DS1689 is not set +# CONFIG_RTC_DRV_DS17285 is not set +# CONFIG_RTC_DRV_DS17485 is not set +# CONFIG_RTC_DRV_DS17885 is not set +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_DS2404=m +CONFIG_RTC_DRV_DA9052=m +CONFIG_RTC_DRV_DA9055=m +CONFIG_RTC_DRV_DA9063=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_WM831X=m +CONFIG_RTC_DRV_WM8350=m +CONFIG_RTC_DRV_PCF50633=m +CONFIG_RTC_DRV_AB3100=m +CONFIG_RTC_DRV_ZYNQMP=m +CONFIG_RTC_DRV_CROS_EC=m + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_CADENCE=m +CONFIG_RTC_DRV_FTRTC010=m +CONFIG_RTC_DRV_PCAP=m +CONFIG_RTC_DRV_MC13XXX=m +CONFIG_RTC_DRV_SNVS=m +CONFIG_RTC_DRV_MT6397=m +CONFIG_RTC_DRV_R7301=m +CONFIG_RTC_DRV_CPCAP=m + +# +# HID Sensor RTC drivers +# +CONFIG_RTC_DRV_HID_SENSOR_TIME=m +CONFIG_RTC_DRV_WILCO_EC=m +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_ACPI=y +CONFIG_DMA_OF=y +CONFIG_ALTERA_MSGDMA=m +CONFIG_DW_AXI_DMAC=m +CONFIG_FSL_EDMA=m +CONFIG_INTEL_IDMA64=m +CONFIG_INTEL_IOATDMA=m +CONFIG_INTEL_MIC_X100_DMA=m +CONFIG_QCOM_HIDMA_MGMT=m +CONFIG_QCOM_HIDMA=m +CONFIG_DW_DMAC_CORE=y +CONFIG_DW_DMAC=y +CONFIG_DW_DMAC_PCI=y +CONFIG_DW_EDMA=m +CONFIG_DW_EDMA_PCIE=m +CONFIG_HSU_DMA=y + +# +# DMA Clients +# +CONFIG_ASYNC_TX_DMA=y +# CONFIG_DMATEST is not set +CONFIG_DMA_ENGINE_RAID=y + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# CONFIG_SW_SYNC is not set +CONFIG_UDMABUF=y +# CONFIG_DMABUF_SELFTESTS is not set +# end of DMABUF options + +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_HD44780=m +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_IMG_ASCII_LCD=m +CONFIG_HT16K33=m +CONFIG_PARPORT_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set +# CONFIG_CHARLCD_BL_OFF is not set +# CONFIG_CHARLCD_BL_ON is not set +CONFIG_CHARLCD_BL_FLASH=y +CONFIG_PANEL=m +CONFIG_CHARLCD=m +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_UIO_DMEM_GENIRQ=m +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +CONFIG_UIO_PCI_GENERIC=m +CONFIG_UIO_NETX=m +CONFIG_UIO_PRUSS=m +CONFIG_UIO_MF624=m +CONFIG_UIO_HV_GENERIC=m +CONFIG_VFIO_IOMMU_TYPE1=m +CONFIG_VFIO_VIRQFD=m +CONFIG_VFIO=m +# CONFIG_VFIO_NOIOMMU is not set +CONFIG_VFIO_PCI=m +CONFIG_VFIO_PCI_VGA=y +CONFIG_VFIO_PCI_MMAP=y +CONFIG_VFIO_PCI_INTX=y +CONFIG_VFIO_PCI_IGD=y +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VBOXGUEST=m +CONFIG_VIRTIO=m +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_PMEM=m +CONFIG_VIRTIO_BALLOON=m 
+CONFIG_VIRTIO_INPUT=m +CONFIG_VIRTIO_MMIO=m +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y + +# +# Microsoft Hyper-V guest support +# +CONFIG_HYPERV=m +CONFIG_HYPERV_TIMER=y +CONFIG_HYPERV_UTILS=m +CONFIG_HYPERV_BALLOON=m +# end of Microsoft Hyper-V guest support + +# +# Xen driver support +# +CONFIG_XEN_BALLOON=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 +CONFIG_XEN_SCRUB_PAGES_DEFAULT=y +CONFIG_XEN_DEV_EVTCHN=m +CONFIG_XEN_BACKEND=y +CONFIG_XENFS=m +CONFIG_XEN_COMPAT_XENFS=y +CONFIG_XEN_SYS_HYPERVISOR=y +CONFIG_XEN_XENBUS_FRONTEND=y +CONFIG_XEN_GNTDEV=m +CONFIG_XEN_GNTDEV_DMABUF=y +CONFIG_XEN_GRANT_DEV_ALLOC=m +CONFIG_XEN_GRANT_DMA_ALLOC=y +CONFIG_SWIOTLB_XEN=y +CONFIG_XEN_PCIDEV_BACKEND=m +CONFIG_XEN_PVCALLS_FRONTEND=m +CONFIG_XEN_PVCALLS_BACKEND=y +CONFIG_XEN_SCSI_BACKEND=m +CONFIG_XEN_PRIVCMD=m +CONFIG_XEN_ACPI_PROCESSOR=m +CONFIG_XEN_MCE_LOG=y +CONFIG_XEN_HAVE_PVMMU=y +CONFIG_XEN_EFI=y +CONFIG_XEN_AUTO_XLATE=y +CONFIG_XEN_ACPI=y +CONFIG_XEN_SYMS=y +CONFIG_XEN_HAVE_VPMU=y +CONFIG_XEN_FRONT_PGDIR_SHBUF=m +# end of Xen driver support + +# CONFIG_GREYBUS is not set +CONFIG_STAGING=y +CONFIG_PRISM2_USB=m +CONFIG_COMEDI=m +# CONFIG_COMEDI_DEBUG is not set +CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 +CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 +CONFIG_COMEDI_MISC_DRIVERS=y +CONFIG_COMEDI_BOND=m +CONFIG_COMEDI_TEST=m +CONFIG_COMEDI_PARPORT=m +# CONFIG_COMEDI_ISA_DRIVERS is not set +CONFIG_COMEDI_PCI_DRIVERS=m +CONFIG_COMEDI_8255_PCI=m +CONFIG_COMEDI_ADDI_WATCHDOG=m +CONFIG_COMEDI_ADDI_APCI_1032=m +CONFIG_COMEDI_ADDI_APCI_1500=m +CONFIG_COMEDI_ADDI_APCI_1516=m +CONFIG_COMEDI_ADDI_APCI_1564=m +CONFIG_COMEDI_ADDI_APCI_16XX=m +CONFIG_COMEDI_ADDI_APCI_2032=m +CONFIG_COMEDI_ADDI_APCI_2200=m +CONFIG_COMEDI_ADDI_APCI_3120=m +CONFIG_COMEDI_ADDI_APCI_3501=m +CONFIG_COMEDI_ADDI_APCI_3XXX=m +CONFIG_COMEDI_ADL_PCI6208=m +CONFIG_COMEDI_ADL_PCI7X3X=m +CONFIG_COMEDI_ADL_PCI8164=m +CONFIG_COMEDI_ADL_PCI9111=m +CONFIG_COMEDI_ADL_PCI9118=m +CONFIG_COMEDI_ADV_PCI1710=m +CONFIG_COMEDI_ADV_PCI1720=m +CONFIG_COMEDI_ADV_PCI1723=m +CONFIG_COMEDI_ADV_PCI1724=m +CONFIG_COMEDI_ADV_PCI1760=m +CONFIG_COMEDI_ADV_PCI_DIO=m +CONFIG_COMEDI_AMPLC_DIO200_PCI=m +CONFIG_COMEDI_AMPLC_PC236_PCI=m +CONFIG_COMEDI_AMPLC_PC263_PCI=m +CONFIG_COMEDI_AMPLC_PCI224=m +CONFIG_COMEDI_AMPLC_PCI230=m +CONFIG_COMEDI_CONTEC_PCI_DIO=m +CONFIG_COMEDI_DAS08_PCI=m +CONFIG_COMEDI_DT3000=m +CONFIG_COMEDI_DYNA_PCI10XX=m +CONFIG_COMEDI_GSC_HPDI=m +CONFIG_COMEDI_MF6X4=m +CONFIG_COMEDI_ICP_MULTI=m +CONFIG_COMEDI_DAQBOARD2000=m +CONFIG_COMEDI_JR3_PCI=m +CONFIG_COMEDI_KE_COUNTER=m +CONFIG_COMEDI_CB_PCIDAS64=m +CONFIG_COMEDI_CB_PCIDAS=m +CONFIG_COMEDI_CB_PCIDDA=m +CONFIG_COMEDI_CB_PCIMDAS=m +CONFIG_COMEDI_CB_PCIMDDA=m +CONFIG_COMEDI_ME4000=m +CONFIG_COMEDI_ME_DAQ=m +CONFIG_COMEDI_NI_6527=m +CONFIG_COMEDI_NI_65XX=m +CONFIG_COMEDI_NI_660X=m +CONFIG_COMEDI_NI_670X=m +CONFIG_COMEDI_NI_LABPC_PCI=m +CONFIG_COMEDI_NI_PCIDIO=m +CONFIG_COMEDI_NI_PCIMIO=m +CONFIG_COMEDI_RTD520=m +CONFIG_COMEDI_S626=m +CONFIG_COMEDI_MITE=m +CONFIG_COMEDI_NI_TIOCMD=m +CONFIG_COMEDI_PCMCIA_DRIVERS=m +CONFIG_COMEDI_CB_DAS16_CS=m +CONFIG_COMEDI_DAS08_CS=m +CONFIG_COMEDI_NI_DAQ_700_CS=m +CONFIG_COMEDI_NI_DAQ_DIO24_CS=m +CONFIG_COMEDI_NI_LABPC_CS=m +CONFIG_COMEDI_NI_MIO_CS=m +CONFIG_COMEDI_QUATECH_DAQP_CS=m +CONFIG_COMEDI_USB_DRIVERS=m +CONFIG_COMEDI_DT9812=m +CONFIG_COMEDI_NI_USB6501=m +CONFIG_COMEDI_USBDUX=m +CONFIG_COMEDI_USBDUXFAST=m +CONFIG_COMEDI_USBDUXSIGMA=m +CONFIG_COMEDI_VMK80XX=m +CONFIG_COMEDI_8254=m +CONFIG_COMEDI_8255=m +CONFIG_COMEDI_8255_SA=m 
+CONFIG_COMEDI_KCOMEDILIB=m +CONFIG_COMEDI_AMPLC_DIO200=m +CONFIG_COMEDI_AMPLC_PC236=m +CONFIG_COMEDI_DAS08=m +CONFIG_COMEDI_NI_LABPC=m +CONFIG_COMEDI_NI_TIO=m +CONFIG_COMEDI_NI_ROUTING=m +CONFIG_RTL8192U=m +CONFIG_RTLLIB=m +CONFIG_RTLLIB_CRYPTO_CCMP=m +CONFIG_RTLLIB_CRYPTO_TKIP=m +CONFIG_RTLLIB_CRYPTO_WEP=m +CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +CONFIG_88EU_AP_MODE=y +CONFIG_RTS5208=m +CONFIG_VT6655=m +CONFIG_VT6656=m + +# +# IIO staging drivers +# + +# +# Accelerometers +# +CONFIG_ADIS16203=m +CONFIG_ADIS16240=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD7816=m +CONFIG_AD7192=m +CONFIG_AD7280=m +# end of Analog to digital converters + +# +# Analog digital bi-direction converters +# +CONFIG_ADT7316=m +CONFIG_ADT7316_SPI=m +CONFIG_ADT7316_I2C=m +# end of Analog digital bi-direction converters + +# +# Capacitance to digital converters +# +CONFIG_AD7150=m +CONFIG_AD7746=m +# end of Capacitance to digital converters + +# +# Direct Digital Synthesis +# +CONFIG_AD9832=m +CONFIG_AD9834=m +# end of Direct Digital Synthesis + +# +# Network Analyzer, Impedance Converters +# +CONFIG_AD5933=m +# end of Network Analyzer, Impedance Converters + +# +# Active energy metering IC +# +CONFIG_ADE7854=m +CONFIG_ADE7854_I2C=m +CONFIG_ADE7854_SPI=m +# end of Active energy metering IC + +# +# Resolver to digital converters +# +CONFIG_AD2S1210=m +# end of Resolver to digital converters +# end of IIO staging drivers + +# CONFIG_FB_SM750 is not set + +# +# Speakup console speech +# +CONFIG_SPEAKUP=m +CONFIG_SPEAKUP_SYNTH_ACNTSA=m +CONFIG_SPEAKUP_SYNTH_APOLLO=m +CONFIG_SPEAKUP_SYNTH_AUDPTR=m +CONFIG_SPEAKUP_SYNTH_BNS=m +CONFIG_SPEAKUP_SYNTH_DECTLK=m +CONFIG_SPEAKUP_SYNTH_DECEXT=m +CONFIG_SPEAKUP_SYNTH_LTLK=m +CONFIG_SPEAKUP_SYNTH_SOFT=m +CONFIG_SPEAKUP_SYNTH_SPKOUT=m +CONFIG_SPEAKUP_SYNTH_TXPRT=m +CONFIG_SPEAKUP_SYNTH_DUMMY=m +# end of Speakup console speech + +CONFIG_STAGING_MEDIA=y +CONFIG_VIDEO_IPU3_IMGU=m + +# +# soc_camera sensor drivers +# + +# +# Android +# +# end of Android + +CONFIG_STAGING_BOARD=y +CONFIG_LTE_GDM724X=m +CONFIG_FIREWIRE_SERIAL=m +CONFIG_FWTTY_MAX_TOTAL_PORTS=64 +CONFIG_FWTTY_MAX_CARD_PORTS=32 +CONFIG_GS_FPGABOOT=m +CONFIG_UNISYSSPAR=y +CONFIG_UNISYS_VISORNIC=m +CONFIG_UNISYS_VISORINPUT=m +CONFIG_UNISYS_VISORHBA=m +CONFIG_COMMON_CLK_XLNX_CLKWZRD=m +# CONFIG_FB_TFT is not set +CONFIG_WILC1000=m +CONFIG_WILC1000_SDIO=m +CONFIG_WILC1000_SPI=m +# CONFIG_WILC1000_HW_OOB_INTR is not set +CONFIG_MOST=m +CONFIG_MOST_CDEV=m +CONFIG_MOST_NET=m +CONFIG_MOST_SOUND=m +CONFIG_MOST_VIDEO=m +CONFIG_MOST_DIM2=m +CONFIG_MOST_I2C=m +CONFIG_MOST_USB=m +CONFIG_KS7010=m +CONFIG_PI433=m + +# +# Gasket devices +# +CONFIG_STAGING_GASKET_FRAMEWORK=m +CONFIG_STAGING_APEX_DRIVER=m +# end of Gasket devices + +CONFIG_XIL_AXIS_FIFO=m +CONFIG_FIELDBUS_DEV=m +CONFIG_HMS_ANYBUSS_BUS=m +CONFIG_ARCX_ANYBUS_CONTROLLER=m +CONFIG_HMS_PROFINET=m +CONFIG_KPC2000=y +CONFIG_KPC2000_CORE=m +CONFIG_KPC2000_SPI=m +CONFIG_KPC2000_I2C=m +CONFIG_KPC2000_DMA=m + +# +# ISDN CAPI drivers +# +CONFIG_CAPI_AVM=y +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m +CONFIG_ISDN_DRV_GIGASET=m +CONFIG_GIGASET_CAPI=y +CONFIG_GIGASET_BASE=m +CONFIG_GIGASET_M105=m +CONFIG_GIGASET_M101=m +# CONFIG_GIGASET_DEBUG is not set +CONFIG_HYSDN=m +CONFIG_HYSDN_CAPI=y +# end of ISDN CAPI drivers + +CONFIG_USB_WUSB=m +CONFIG_USB_WUSB_CBAF=m +# 
CONFIG_USB_WUSB_CBAF_DEBUG is not set +CONFIG_USB_WHCI_HCD=m +CONFIG_USB_HWA_HCD=m +CONFIG_UWB=m +CONFIG_UWB_HWA=m +CONFIG_UWB_WHCI=m +CONFIG_UWB_I1480U=m +# CONFIG_EXFAT_FS is not set +CONFIG_QLGE=m +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACER_WMI=m +CONFIG_ACER_WIRELESS=m +CONFIG_ACERHDF=m +CONFIG_ALIENWARE_WMI=m +CONFIG_ASUS_LAPTOP=m +CONFIG_DCDBAS=m +CONFIG_DELL_SMBIOS=m +CONFIG_DELL_SMBIOS_WMI=y +CONFIG_DELL_SMBIOS_SMM=y +CONFIG_DELL_LAPTOP=m +CONFIG_DELL_WMI=m +CONFIG_DELL_WMI_DESCRIPTOR=m +CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m +CONFIG_DELL_SMO8800=m +CONFIG_DELL_RBTN=m +# CONFIG_DELL_RBU is not set +CONFIG_FUJITSU_LAPTOP=m +CONFIG_FUJITSU_TABLET=m +CONFIG_AMILO_RFKILL=m +CONFIG_GPD_POCKET_FAN=m +CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m +CONFIG_HP_WMI=m +CONFIG_LG_LAPTOP=m +CONFIG_MSI_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_SONY_LAPTOP=m +CONFIG_SONYPI_COMPAT=y +CONFIG_IDEAPAD_LAPTOP=m +CONFIG_SURFACE3_WMI=m +CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +CONFIG_SENSORS_HDAPS=m +CONFIG_INTEL_MENLOW=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_ASUS_WMI=m +CONFIG_ASUS_NB_WMI=m +CONFIG_EEEPC_WMI=m +CONFIG_ASUS_WIRELESS=m +CONFIG_ACPI_WMI=m +CONFIG_WMI_BMOF=m +CONFIG_INTEL_WMI_THUNDERBOLT=m +CONFIG_XIAOMI_WMI=m +CONFIG_MSI_WMI=m +CONFIG_PEAQ_WMI=m +CONFIG_TOPSTAR_LAPTOP=m +CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_TOSHIBA_HAPS=m +CONFIG_TOSHIBA_WMI=m +CONFIG_ACPI_CMPC=m +CONFIG_INTEL_CHT_INT33FE=m +CONFIG_INTEL_INT0002_VGPIO=m +CONFIG_INTEL_HID_EVENT=m +CONFIG_INTEL_VBTN=m +CONFIG_INTEL_IPS=m +CONFIG_INTEL_PMC_CORE=y +CONFIG_IBM_RTL=m +CONFIG_SAMSUNG_LAPTOP=m +CONFIG_MXM_WMI=m +CONFIG_INTEL_OAKTRAIL=m +CONFIG_SAMSUNG_Q10=m +CONFIG_APPLE_GMUX=m +CONFIG_INTEL_RST=m +CONFIG_INTEL_SMARTCONNECT=m +CONFIG_INTEL_PMC_IPC=m +CONFIG_INTEL_BXTWC_PMIC_TMU=m +CONFIG_SURFACE_PRO3_BUTTON=m +CONFIG_SURFACE_3_BUTTON=m +CONFIG_INTEL_PUNIT_IPC=m +CONFIG_INTEL_TELEMETRY=m +CONFIG_MLX_PLATFORM=m +CONFIG_INTEL_TURBO_MAX_3=y +CONFIG_TOUCHSCREEN_DMI=y +CONFIG_INTEL_CHTDC_TI_PWRBTN=m +CONFIG_I2C_MULTI_INSTANTIATE=m +CONFIG_INTEL_ATOMISP2_PM=m +CONFIG_HUAWEI_WMI=m +CONFIG_PCENGINES_APU2=m + +# +# Intel Speed Select Technology interface support +# +CONFIG_INTEL_SPEED_SELECT_INTERFACE=m +# end of Intel Speed Select Technology interface support + +CONFIG_PMC_ATOM=y +CONFIG_MFD_CROS_EC=m +CONFIG_CHROME_PLATFORMS=y +CONFIG_CHROMEOS_LAPTOP=m +CONFIG_CHROMEOS_PSTORE=m +CONFIG_CHROMEOS_TBMC=m +CONFIG_CROS_EC=m +CONFIG_CROS_EC_I2C=m +CONFIG_CROS_EC_RPMSG=m +CONFIG_CROS_EC_ISHTP=m +CONFIG_CROS_EC_SPI=m +CONFIG_CROS_EC_LPC=m +CONFIG_CROS_EC_PROTO=y +CONFIG_CROS_KBD_LED_BACKLIGHT=m +CONFIG_CROS_EC_CHARDEV=m +CONFIG_CROS_EC_LIGHTBAR=m +CONFIG_CROS_EC_VBC=m +# CONFIG_CROS_EC_DEBUGFS is not set +CONFIG_CROS_EC_SYSFS=m +CONFIG_CROS_USBPD_LOGGER=m +CONFIG_WILCO_EC=m +# CONFIG_WILCO_EC_DEBUGFS is not set +CONFIG_WILCO_EC_EVENTS=m +CONFIG_WILCO_EC_TELEMETRY=m +CONFIG_MELLANOX_PLATFORM=y +CONFIG_MLXREG_HOTPLUG=m +CONFIG_MLXREG_IO=m +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y + +# +# Common Clock Framework +# +CONFIG_COMMON_CLK_WM831X=m +CONFIG_CLK_HSDK=y +CONFIG_COMMON_CLK_MAX77686=m +CONFIG_COMMON_CLK_MAX9485=m +CONFIG_COMMON_CLK_RK808=m +CONFIG_COMMON_CLK_SI5341=m +CONFIG_COMMON_CLK_SI5351=m +CONFIG_COMMON_CLK_SI514=m +CONFIG_COMMON_CLK_SI544=m 
+CONFIG_COMMON_CLK_SI570=m +CONFIG_COMMON_CLK_CDCE706=m +CONFIG_COMMON_CLK_CDCE925=m +CONFIG_COMMON_CLK_CS2000_CP=m +CONFIG_COMMON_CLK_S2MPS11=m +CONFIG_CLK_TWL6040=m +CONFIG_COMMON_CLK_LOCHNAGAR=m +CONFIG_COMMON_CLK_PALMAS=m +CONFIG_COMMON_CLK_PWM=m +CONFIG_COMMON_CLK_VC5=m +CONFIG_COMMON_CLK_BD718XX=m +CONFIG_COMMON_CLK_FIXED_MMIO=y +# end of Common Clock Framework + +CONFIG_HWSPINLOCK=y + +# +# Clock Source drivers +# +CONFIG_CLKEVT_I8253=y +CONFIG_I8253_LOCK=y +CONFIG_CLKBLD_I8253=y +# end of Clock Source drivers + +CONFIG_MAILBOX=y +CONFIG_PLATFORM_MHU=m +CONFIG_PCC=y +CONFIG_ALTERA_MBOX=m +CONFIG_MAILBOX_TEST=m +CONFIG_IOMMU_IOVA=y +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +# end of Generic IOMMU Pagetable Support + +# CONFIG_IOMMU_DEBUGFS is not set +# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +CONFIG_OF_IOMMU=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_V2=y +CONFIG_DMAR_TABLE=y +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU_FLOPPY_WA=y +CONFIG_IRQ_REMAP=y +CONFIG_HYPERV_IOMMU=y + +# +# Remoteproc drivers +# +# CONFIG_REMOTEPROC is not set +# end of Remoteproc drivers + +# +# Rpmsg drivers +# +CONFIG_RPMSG=m +CONFIG_RPMSG_CHAR=m +CONFIG_RPMSG_QCOM_GLINK_NATIVE=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m +CONFIG_RPMSG_VIRTIO=m +# end of Rpmsg drivers + +CONFIG_SOUNDWIRE=y + +# +# SoundWire Devices +# +CONFIG_SOUNDWIRE_CADENCE=m +CONFIG_SOUNDWIRE_INTEL=m + +# +# SOC (System On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# +# end of Amlogic SoC drivers + +# +# Aspeed SoC drivers +# +# end of Aspeed SoC drivers + +# +# Broadcom SoC drivers +# +# end of Broadcom SoC drivers + +# +# NXP/Freescale QorIQ SoC drivers +# +# end of NXP/Freescale QorIQ SoC drivers + +# +# i.MX SoC drivers +# +# end of i.MX SoC drivers + +# +# Qualcomm SoC drivers +# +# end of Qualcomm SoC drivers + +CONFIG_SOC_TI=y + +# +# Xilinx SoC drivers +# +CONFIG_XILINX_VCU=m +# end of Xilinx SoC drivers +# end of SOC (System On Chip) specific Drivers + +CONFIG_PM_DEVFREQ=y + +# +# DEVFREQ Governors +# +CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m +CONFIG_DEVFREQ_GOV_PERFORMANCE=m +CONFIG_DEVFREQ_GOV_POWERSAVE=m +CONFIG_DEVFREQ_GOV_USERSPACE=m +CONFIG_DEVFREQ_GOV_PASSIVE=m + +# +# DEVFREQ Drivers +# +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +CONFIG_EXTCON_ADC_JACK=m +CONFIG_EXTCON_ARIZONA=m +CONFIG_EXTCON_AXP288=m +CONFIG_EXTCON_FSA9480=m +CONFIG_EXTCON_GPIO=m +CONFIG_EXTCON_INTEL_INT3496=m +CONFIG_EXTCON_INTEL_CHT_WC=m +CONFIG_EXTCON_MAX14577=m +CONFIG_EXTCON_MAX3355=m +CONFIG_EXTCON_MAX77693=m +CONFIG_EXTCON_MAX77843=m +CONFIG_EXTCON_MAX8997=m +CONFIG_EXTCON_PALMAS=m +CONFIG_EXTCON_PTN5150=m +CONFIG_EXTCON_RT8973A=m +CONFIG_EXTCON_SM5502=m +CONFIG_EXTCON_USB_GPIO=m +CONFIG_EXTCON_USBC_CROS_EC=m +CONFIG_MEMORY=y +CONFIG_IIO=m +CONFIG_IIO_BUFFER=y +CONFIG_IIO_BUFFER_CB=m +CONFIG_IIO_BUFFER_HW_CONSUMER=m +CONFIG_IIO_KFIFO_BUF=m +CONFIG_IIO_TRIGGERED_BUFFER=m +CONFIG_IIO_CONFIGFS=m +CONFIG_IIO_TRIGGER=y +CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 +CONFIG_IIO_SW_DEVICE=m +CONFIG_IIO_SW_TRIGGER=m +CONFIG_IIO_TRIGGERED_EVENT=m + +# +# Accelerometers +# +CONFIG_ADIS16201=m +CONFIG_ADIS16209=m +CONFIG_ADXL372=m +CONFIG_ADXL372_SPI=m +CONFIG_ADXL372_I2C=m +CONFIG_BMA180=m +CONFIG_BMA220=m +CONFIG_BMC150_ACCEL=m +CONFIG_BMC150_ACCEL_I2C=m +CONFIG_BMC150_ACCEL_SPI=m +CONFIG_DA280=m +CONFIG_DA311=m +CONFIG_DMARD06=m +CONFIG_DMARD09=m +CONFIG_DMARD10=m +CONFIG_HID_SENSOR_ACCEL_3D=m 
+CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m +CONFIG_IIO_ST_ACCEL_3AXIS=m +CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m +CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m +CONFIG_KXSD9=m +CONFIG_KXSD9_SPI=m +CONFIG_KXSD9_I2C=m +CONFIG_KXCJK1013=m +CONFIG_MC3230=m +CONFIG_MMA7455=m +CONFIG_MMA7455_I2C=m +CONFIG_MMA7455_SPI=m +CONFIG_MMA7660=m +CONFIG_MMA8452=m +CONFIG_MMA9551_CORE=m +CONFIG_MMA9551=m +CONFIG_MMA9553=m +CONFIG_MXC4005=m +CONFIG_MXC6255=m +CONFIG_SCA3000=m +CONFIG_STK8312=m +CONFIG_STK8BA50=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD_SIGMA_DELTA=m +CONFIG_AD7124=m +CONFIG_AD7266=m +CONFIG_AD7291=m +CONFIG_AD7298=m +CONFIG_AD7476=m +CONFIG_AD7606=m +CONFIG_AD7606_IFACE_PARALLEL=m +CONFIG_AD7606_IFACE_SPI=m +CONFIG_AD7766=m +CONFIG_AD7768_1=m +CONFIG_AD7780=m +CONFIG_AD7791=m +CONFIG_AD7793=m +CONFIG_AD7887=m +CONFIG_AD7923=m +CONFIG_AD7949=m +CONFIG_AD799X=m +CONFIG_AXP20X_ADC=m +CONFIG_AXP288_ADC=m +CONFIG_CC10001_ADC=m +CONFIG_CPCAP_ADC=m +CONFIG_DA9150_GPADC=m +CONFIG_DLN2_ADC=m +CONFIG_ENVELOPE_DETECTOR=m +CONFIG_HI8435=m +CONFIG_HX711=m +CONFIG_INA2XX_ADC=m +CONFIG_LP8788_ADC=m +CONFIG_LTC2471=m +CONFIG_LTC2485=m +CONFIG_LTC2497=m +CONFIG_MAX1027=m +CONFIG_MAX11100=m +CONFIG_MAX1118=m +CONFIG_MAX1363=m +CONFIG_MAX9611=m +CONFIG_MCP320X=m +CONFIG_MCP3422=m +CONFIG_MCP3911=m +CONFIG_MEN_Z188_ADC=m +CONFIG_NAU7802=m +CONFIG_PALMAS_GPADC=m +CONFIG_QCOM_VADC_COMMON=m +CONFIG_QCOM_SPMI_IADC=m +CONFIG_QCOM_SPMI_VADC=m +CONFIG_QCOM_SPMI_ADC5=m +CONFIG_SD_ADC_MODULATOR=m +CONFIG_STMPE_ADC=m +CONFIG_TI_ADC081C=m +CONFIG_TI_ADC0832=m +CONFIG_TI_ADC084S021=m +CONFIG_TI_ADC12138=m +CONFIG_TI_ADC108S102=m +CONFIG_TI_ADC128S052=m +CONFIG_TI_ADC161S626=m +CONFIG_TI_ADS1015=m +CONFIG_TI_ADS7950=m +CONFIG_TI_ADS8344=m +CONFIG_TI_ADS8688=m +CONFIG_TI_ADS124S08=m +CONFIG_TI_AM335X_ADC=m +CONFIG_TI_TLC4541=m +CONFIG_TWL4030_MADC=m +CONFIG_TWL6030_GPADC=m +CONFIG_VF610_ADC=m +CONFIG_VIPERBOARD_ADC=m +CONFIG_XILINX_XADC=m +# end of Analog to digital converters + +# +# Analog Front Ends +# +CONFIG_IIO_RESCALE=m +# end of Analog Front Ends + +# +# Amplifiers +# +CONFIG_AD8366=m +# end of Amplifiers + +# +# Chemical Sensors +# +CONFIG_ATLAS_PH_SENSOR=m +CONFIG_BME680=m +CONFIG_BME680_I2C=m +CONFIG_BME680_SPI=m +CONFIG_CCS811=m +CONFIG_IAQCORE=m +CONFIG_PMS7003=m +CONFIG_SENSIRION_SGP30=m +CONFIG_SPS30=m +CONFIG_VZ89X=m +# end of Chemical Sensors + +CONFIG_IIO_CROS_EC_SENSORS_CORE=m +CONFIG_IIO_CROS_EC_SENSORS=m +CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m + +# +# Hid Sensor IIO Common +# +CONFIG_HID_SENSOR_IIO_COMMON=m +CONFIG_HID_SENSOR_IIO_TRIGGER=m +# end of Hid Sensor IIO Common + +CONFIG_IIO_MS_SENSORS_I2C=m + +# +# SSP Sensor Common +# +CONFIG_IIO_SSP_SENSORS_COMMONS=m +CONFIG_IIO_SSP_SENSORHUB=m +# end of SSP Sensor Common + +CONFIG_IIO_ST_SENSORS_I2C=m +CONFIG_IIO_ST_SENSORS_SPI=m +CONFIG_IIO_ST_SENSORS_CORE=m + +# +# Digital to analog converters +# +CONFIG_AD5064=m +CONFIG_AD5360=m +CONFIG_AD5380=m +CONFIG_AD5421=m +CONFIG_AD5446=m +CONFIG_AD5449=m +CONFIG_AD5592R_BASE=m +CONFIG_AD5592R=m +CONFIG_AD5593R=m +CONFIG_AD5504=m +CONFIG_AD5624R_SPI=m +CONFIG_LTC1660=m +CONFIG_LTC2632=m +CONFIG_AD5686=m +CONFIG_AD5686_SPI=m +CONFIG_AD5696_I2C=m +CONFIG_AD5755=m +CONFIG_AD5758=m +CONFIG_AD5761=m +CONFIG_AD5764=m +CONFIG_AD5791=m +CONFIG_AD7303=m +CONFIG_AD8801=m +CONFIG_DPOT_DAC=m +CONFIG_DS4424=m +CONFIG_M62332=m +CONFIG_MAX517=m +CONFIG_MAX5821=m +CONFIG_MCP4725=m +CONFIG_MCP4922=m +CONFIG_TI_DAC082S085=m +CONFIG_TI_DAC5571=m +CONFIG_TI_DAC7311=m +CONFIG_TI_DAC7612=m +CONFIG_VF610_DAC=m +# end of Digital 
to analog converters + +# +# IIO dummy driver +# +# CONFIG_IIO_SIMPLE_DUMMY is not set +# end of IIO dummy driver + +# +# Frequency Synthesizers DDS/PLL +# + +# +# Clock Generator/Distribution +# +CONFIG_AD9523=m +# end of Clock Generator/Distribution + +# +# Phase-Locked Loop (PLL) frequency synthesizers +# +CONFIG_ADF4350=m +CONFIG_ADF4371=m +# end of Phase-Locked Loop (PLL) frequency synthesizers +# end of Frequency Synthesizers DDS/PLL + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16080=m +CONFIG_ADIS16130=m +CONFIG_ADIS16136=m +CONFIG_ADIS16260=m +CONFIG_ADXRS450=m +CONFIG_BMG160=m +CONFIG_BMG160_I2C=m +CONFIG_BMG160_SPI=m +CONFIG_FXAS21002C=m +CONFIG_FXAS21002C_I2C=m +CONFIG_FXAS21002C_SPI=m +CONFIG_HID_SENSOR_GYRO_3D=m +CONFIG_MPU3050=m +CONFIG_MPU3050_I2C=m +CONFIG_IIO_ST_GYRO_3AXIS=m +CONFIG_IIO_ST_GYRO_I2C_3AXIS=m +CONFIG_IIO_ST_GYRO_SPI_3AXIS=m +CONFIG_ITG3200=m +# end of Digital gyroscope sensors + +# +# Health Sensors +# + +# +# Heart Rate Monitors +# +CONFIG_AFE4403=m +CONFIG_AFE4404=m +CONFIG_MAX30100=m +CONFIG_MAX30102=m +# end of Heart Rate Monitors +# end of Health Sensors + +# +# Humidity sensors +# +CONFIG_AM2315=m +CONFIG_DHT11=m +CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m +CONFIG_HTS221=m +CONFIG_HTS221_I2C=m +CONFIG_HTS221_SPI=m +CONFIG_HTU21=m +CONFIG_SI7005=m +CONFIG_SI7020=m +# end of Humidity sensors + +# +# Inertial measurement units +# +CONFIG_ADIS16400=m +# CONFIG_ADIS16460 is not set +CONFIG_ADIS16480=m +CONFIG_BMI160=m +CONFIG_BMI160_I2C=m +CONFIG_BMI160_SPI=m +CONFIG_KMX61=m +CONFIG_INV_MPU6050_IIO=m +CONFIG_INV_MPU6050_I2C=m +CONFIG_INV_MPU6050_SPI=m +CONFIG_IIO_ST_LSM6DSX=m +CONFIG_IIO_ST_LSM6DSX_I2C=m +CONFIG_IIO_ST_LSM6DSX_SPI=m +CONFIG_IIO_ST_LSM6DSX_I3C=m +# end of Inertial measurement units + +CONFIG_IIO_ADIS_LIB=m +CONFIG_IIO_ADIS_LIB_BUFFER=y + +# +# Light sensors +# +CONFIG_ACPI_ALS=m +CONFIG_ADJD_S311=m +CONFIG_AL3320A=m +CONFIG_APDS9300=m +CONFIG_APDS9960=m +CONFIG_BH1750=m +CONFIG_BH1780=m +CONFIG_CM32181=m +CONFIG_CM3232=m +CONFIG_CM3323=m +CONFIG_CM3605=m +CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m +CONFIG_GP2AP020A00F=m +CONFIG_SENSORS_ISL29018=m +CONFIG_SENSORS_ISL29028=m +CONFIG_ISL29125=m +CONFIG_HID_SENSOR_ALS=m +CONFIG_HID_SENSOR_PROX=m +CONFIG_JSA1212=m +CONFIG_RPR0521=m +CONFIG_SENSORS_LM3533=m +CONFIG_LTR501=m +CONFIG_LV0104CS=m +CONFIG_MAX44000=m +CONFIG_MAX44009=m +# CONFIG_NOA1305 is not set +CONFIG_OPT3001=m +CONFIG_PA12203001=m +CONFIG_SI1133=m +CONFIG_SI1145=m +CONFIG_STK3310=m +CONFIG_ST_UVIS25=m +CONFIG_ST_UVIS25_I2C=m +CONFIG_ST_UVIS25_SPI=m +CONFIG_TCS3414=m +CONFIG_TCS3472=m +CONFIG_SENSORS_TSL2563=m +CONFIG_TSL2583=m +CONFIG_TSL2772=m +CONFIG_TSL4531=m +CONFIG_US5182D=m +CONFIG_VCNL4000=m +CONFIG_VCNL4035=m +CONFIG_VEML6070=m +CONFIG_VL6180=m +CONFIG_ZOPT2201=m +# end of Light sensors + +# +# Magnetometer sensors +# +CONFIG_AK8974=m +CONFIG_AK8975=m +CONFIG_AK09911=m +CONFIG_BMC150_MAGN=m +CONFIG_BMC150_MAGN_I2C=m +CONFIG_BMC150_MAGN_SPI=m +CONFIG_MAG3110=m +CONFIG_HID_SENSOR_MAGNETOMETER_3D=m +CONFIG_MMC35240=m +CONFIG_IIO_ST_MAGN_3AXIS=m +CONFIG_IIO_ST_MAGN_I2C_3AXIS=m +CONFIG_IIO_ST_MAGN_SPI_3AXIS=m +CONFIG_SENSORS_HMC5843=m +CONFIG_SENSORS_HMC5843_I2C=m +CONFIG_SENSORS_HMC5843_SPI=m +CONFIG_SENSORS_RM3100=m +CONFIG_SENSORS_RM3100_I2C=m +CONFIG_SENSORS_RM3100_SPI=m +# end of Magnetometer sensors + +# +# Multiplexers +# +CONFIG_IIO_MUX=m +# end of Multiplexers + +# +# Inclinometer sensors +# +CONFIG_HID_SENSOR_INCLINOMETER_3D=m +CONFIG_HID_SENSOR_DEVICE_ROTATION=m +# end of Inclinometer sensors + +# +# 
Triggers - standalone +# +CONFIG_IIO_HRTIMER_TRIGGER=m +CONFIG_IIO_INTERRUPT_TRIGGER=m +CONFIG_IIO_TIGHTLOOP_TRIGGER=m +CONFIG_IIO_SYSFS_TRIGGER=m +# end of Triggers - standalone + +# +# Digital potentiometers +# +CONFIG_AD5272=m +CONFIG_DS1803=m +# CONFIG_MAX5432 is not set +CONFIG_MAX5481=m +CONFIG_MAX5487=m +CONFIG_MCP4018=m +CONFIG_MCP4131=m +CONFIG_MCP4531=m +CONFIG_MCP41010=m +CONFIG_TPL0102=m +# end of Digital potentiometers + +# +# Digital potentiostats +# +CONFIG_LMP91000=m +# end of Digital potentiostats + +# +# Pressure sensors +# +CONFIG_ABP060MG=m +CONFIG_BMP280=m +CONFIG_BMP280_I2C=m +CONFIG_BMP280_SPI=m +CONFIG_IIO_CROS_EC_BARO=m +CONFIG_DPS310=m +CONFIG_HID_SENSOR_PRESS=m +CONFIG_HP03=m +CONFIG_MPL115=m +CONFIG_MPL115_I2C=m +CONFIG_MPL115_SPI=m +CONFIG_MPL3115=m +CONFIG_MS5611=m +CONFIG_MS5611_I2C=m +CONFIG_MS5611_SPI=m +CONFIG_MS5637=m +CONFIG_IIO_ST_PRESS=m +CONFIG_IIO_ST_PRESS_I2C=m +CONFIG_IIO_ST_PRESS_SPI=m +CONFIG_T5403=m +CONFIG_HP206C=m +CONFIG_ZPA2326=m +CONFIG_ZPA2326_I2C=m +CONFIG_ZPA2326_SPI=m +# end of Pressure sensors + +# +# Lightning sensors +# +CONFIG_AS3935=m +# end of Lightning sensors + +# +# Proximity and distance sensors +# +CONFIG_ISL29501=m +CONFIG_LIDAR_LITE_V2=m +CONFIG_MB1232=m +CONFIG_RFD77402=m +CONFIG_SRF04=m +CONFIG_SX9500=m +CONFIG_SRF08=m +CONFIG_VL53L0X_I2C=m +# end of Proximity and distance sensors + +# +# Resolver to digital converters +# +CONFIG_AD2S90=m +CONFIG_AD2S1200=m +# end of Resolver to digital converters + +# +# Temperature sensors +# +CONFIG_MAXIM_THERMOCOUPLE=m +CONFIG_HID_SENSOR_TEMP=m +CONFIG_MLX90614=m +CONFIG_MLX90632=m +CONFIG_TMP006=m +CONFIG_TMP007=m +CONFIG_TSYS01=m +CONFIG_TSYS02D=m +CONFIG_MAX31856=m +# end of Temperature sensors + +CONFIG_NTB=m +CONFIG_NTB_MSI=y +CONFIG_NTB_AMD=m +CONFIG_NTB_IDT=m +CONFIG_NTB_INTEL=m +CONFIG_NTB_SWITCHTEC=m +# CONFIG_NTB_PINGPONG is not set +# CONFIG_NTB_TOOL is not set +# CONFIG_NTB_PERF is not set +# CONFIG_NTB_MSI_TEST is not set +CONFIG_NTB_TRANSPORT=m +CONFIG_VME_BUS=y + +# +# VME Bridge Drivers +# +CONFIG_VME_CA91CX42=m +CONFIG_VME_TSI148=m +# CONFIG_VME_FAKE is not set + +# +# VME Board Drivers +# +CONFIG_VMIVME_7805=m + +# +# VME Device Drivers +# +CONFIG_VME_USER=m +CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +CONFIG_PWM_ATMEL_HLCDC_PWM=m +CONFIG_PWM_CRC=y +CONFIG_PWM_CROS_EC=m +CONFIG_PWM_FSL_FTM=m +CONFIG_PWM_LP3943=m +CONFIG_PWM_LPSS=m +CONFIG_PWM_LPSS_PCI=m +CONFIG_PWM_LPSS_PLATFORM=m +CONFIG_PWM_PCA9685=m +CONFIG_PWM_STMPE=y +CONFIG_PWM_TWL=m +CONFIG_PWM_TWL_LED=m + +# +# IRQ chip support +# +CONFIG_IRQCHIP=y +CONFIG_AL_FIC=y +CONFIG_MADERA_IRQ=m +# end of IRQ chip support + +CONFIG_IPACK_BUS=m +CONFIG_BOARD_TPCI200=m +CONFIG_SERIAL_IPOCTAL=m +CONFIG_RESET_CONTROLLER=y +CONFIG_RESET_TI_SYSCON=m + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_GENERIC_PHY_MIPI_DPHY=y +CONFIG_BCM_KONA_USB2_PHY=m +CONFIG_PHY_CADENCE_DP=m +CONFIG_PHY_CADENCE_DPHY=m +CONFIG_PHY_CADENCE_SIERRA=m +CONFIG_PHY_FSL_IMX8MQ_USB=m +CONFIG_PHY_MIXEL_MIPI_DPHY=m +CONFIG_PHY_PXA_28NM_HSIC=m +CONFIG_PHY_PXA_28NM_USB2=m +CONFIG_PHY_CPCAP_USB=m +CONFIG_PHY_MAPPHONE_MDM6600=m +CONFIG_PHY_OCELOT_SERDES=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_PHY_QCOM_USB_HSIC=m +CONFIG_PHY_SAMSUNG_USB2=m +CONFIG_PHY_TUSB1210=m +# end of PHY Subsystem + +CONFIG_POWERCAP=y +CONFIG_INTEL_RAPL_CORE=m +CONFIG_INTEL_RAPL=m +CONFIG_IDLE_INJECT=y +CONFIG_MCB=m +CONFIG_MCB_PCI=m +CONFIG_MCB_LPC=m + +# +# Performance monitor support +# +# end of Performance monitor support + +CONFIG_RAS=y +CONFIG_RAS_CEC=y +# CONFIG_RAS_CEC_DEBUG is not set 
+CONFIG_THUNDERBOLT=m + +# +# Android +# +# CONFIG_ANDROID is not set +# end of Android + +CONFIG_LIBNVDIMM=y +CONFIG_BLK_DEV_PMEM=m +CONFIG_ND_BLK=m +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=m +CONFIG_BTT=y +CONFIG_ND_PFN=m +CONFIG_NVDIMM_PFN=y +CONFIG_NVDIMM_DAX=y +CONFIG_OF_PMEM=m +CONFIG_DAX_DRIVER=y +CONFIG_DAX=y +CONFIG_DEV_DAX=m +CONFIG_DEV_DAX_PMEM=m +CONFIG_DEV_DAX_KMEM=m +CONFIG_DEV_DAX_PMEM_COMPAT=m +CONFIG_NVMEM=y +CONFIG_NVMEM_SYSFS=y +CONFIG_RAVE_SP_EEPROM=m + +# +# HW tracing support +# +CONFIG_STM=m +CONFIG_STM_PROTO_BASIC=m +CONFIG_STM_PROTO_SYS_T=m +# CONFIG_STM_DUMMY is not set +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +CONFIG_STM_SOURCE_FTRACE=m +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_ACPI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m +# CONFIG_INTEL_TH_DEBUG is not set +# end of HW tracing support + +CONFIG_FPGA=m +CONFIG_ALTERA_PR_IP_CORE=m +CONFIG_ALTERA_PR_IP_CORE_PLAT=m +CONFIG_FPGA_MGR_ALTERA_PS_SPI=m +CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_FPGA_MGR_ICE40_SPI=m +CONFIG_FPGA_MGR_MACHXO2_SPI=m +CONFIG_FPGA_BRIDGE=m +CONFIG_ALTERA_FREEZE_BRIDGE=m +CONFIG_XILINX_PR_DECOUPLER=m +CONFIG_FPGA_REGION=m +CONFIG_OF_FPGA_REGION=m +CONFIG_FPGA_DFL=m +CONFIG_FPGA_DFL_FME=m +CONFIG_FPGA_DFL_FME_MGR=m +CONFIG_FPGA_DFL_FME_BRIDGE=m +CONFIG_FPGA_DFL_FME_REGION=m +CONFIG_FPGA_DFL_AFU=m +CONFIG_FPGA_DFL_PCI=m +CONFIG_FSI=m +CONFIG_FSI_NEW_DEV_NODE=y +CONFIG_FSI_MASTER_GPIO=m +CONFIG_FSI_MASTER_HUB=m +CONFIG_FSI_SCOM=m +CONFIG_FSI_SBEFIFO=m +CONFIG_FSI_OCC=m +CONFIG_MULTIPLEXER=m + +# +# Multiplexer drivers +# +CONFIG_MUX_ADG792A=m +CONFIG_MUX_ADGS1408=m +CONFIG_MUX_GPIO=m +CONFIG_MUX_MMIO=m +# end of Multiplexer drivers + +CONFIG_PM_OPP=y +CONFIG_UNISYS_VISORBUS=m +CONFIG_SIOX=m +CONFIG_SIOX_BUS_GPIO=m +CONFIG_SLIMBUS=m +CONFIG_SLIM_QCOM_CTRL=m +CONFIG_INTERCONNECT=m +CONFIG_COUNTER=m +CONFIG_FTM_QUADDEC=m +# end of Device Drivers + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_VALIDATE_FS_PARSER=y +CONFIG_FS_IOMAP=y +# CONFIG_EXT2_FS is not set +# CONFIG_EXT3_FS is not set +CONFIG_EXT4_FS=m +CONFIG_EXT4_USE_FOR_EXT2=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_XFS_ONLINE_SCRUB=y +CONFIG_XFS_ONLINE_REPAIR=y +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_OCFS2_FS_O2CB=m +CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m +CONFIG_OCFS2_FS_STATS=y +CONFIG_OCFS2_DEBUG_MASKLOG=y +# CONFIG_OCFS2_DEBUG_FS is not set +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +# CONFIG_BTRFS_FS_REF_VERIFY is not set +CONFIG_NILFS2_FS=m +CONFIG_F2FS_FS=m +CONFIG_F2FS_STAT_FS=y +CONFIG_F2FS_FS_XATTR=y +CONFIG_F2FS_FS_POSIX_ACL=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y +# CONFIG_F2FS_IO_TRACE is not set +# CONFIG_F2FS_FAULT_INJECTION is not set +CONFIG_FS_DAX=y 
+CONFIG_FS_DAX_PMD=y +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FILE_LOCKING=y +# CONFIG_MANDATORY_FILE_LOCKING is not set +CONFIG_FS_ENCRYPTION=y +# CONFIG_FS_VERITY is not set +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QUOTA_DEBUG is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_QUOTACTL_COMPAT=y +CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS_FS=y +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +# CONFIG_VIRTIO_FS is not set +CONFIG_OVERLAY_FS=m +CONFIG_OVERLAY_FS_REDIRECT_DIR=y +# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set +CONFIG_OVERLAY_FS_INDEX=y +CONFIG_OVERLAY_FS_XINO_AUTO=y +CONFIG_OVERLAY_FS_METACOPY=y + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +CONFIG_FSCACHE_HISTOGRAM=y +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set +# end of Caches + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +# end of CD-ROM/DVD Filesystems + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" +CONFIG_FAT_DEFAULT_UTF8=y +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y +# end of DOS/FAT/NT Filesystems + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +# CONFIG_PROC_KCORE is not set +# CONFIG_PROC_VMCORE is not set +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_PROC_CHILDREN=y +CONFIG_PROC_PID_ARCH_STATUS=y +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_MEMFD_CREATE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=y +CONFIG_EFIVAR_FS=y +# end of Pseudo filesystems + +CONFIG_MISC_FILESYSTEMS=y +CONFIG_ORANGEFS_FS=m +# CONFIG_ADFS_FS is not set +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=m +# CONFIG_ECRYPT_FS_MESSAGING is not set +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_FS_WBUF_VERIFY is not set +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_RTIME=y +CONFIG_UBIFS_FS=m +# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +CONFIG_UBIFS_FS_ZSTD=y +CONFIG_UBIFS_ATIME_SUPPORT=y +CONFIG_UBIFS_FS_XATTR=y +CONFIG_UBIFS_FS_SECURITY=y +CONFIG_UBIFS_FS_AUTHENTICATION=y +CONFIG_CRAMFS=m +CONFIG_CRAMFS_BLOCKDEV=y +CONFIG_CRAMFS_MTD=y +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +CONFIG_SQUASHFS_DECOMP_MULTI=y +# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set +# CONFIG_SQUASHFS_EMBEDDED is not set +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +# CONFIG_VXFS_FS is not set +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +# CONFIG_HPFS_FS 
is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX6FS_FS is not set +CONFIG_ROMFS_FS=m +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_PSTORE=y +CONFIG_PSTORE_DEFLATE_COMPRESS=m +CONFIG_PSTORE_LZO_COMPRESS=m +CONFIG_PSTORE_LZ4_COMPRESS=m +CONFIG_PSTORE_LZ4HC_COMPRESS=m +# CONFIG_PSTORE_842_COMPRESS is not set +CONFIG_PSTORE_ZSTD_COMPRESS=y +CONFIG_PSTORE_COMPRESS=y +# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y +CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" +# CONFIG_PSTORE_CONSOLE is not set +# CONFIG_PSTORE_PMSG is not set +# CONFIG_PSTORE_FTRACE is not set +CONFIG_PSTORE_RAM=y +# CONFIG_SYSV_FS is not set +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EROFS_FS=m +# CONFIG_EROFS_FS_DEBUG is not set +CONFIG_EROFS_FS_XATTR=y +CONFIG_EROFS_FS_POSIX_ACL=y +CONFIG_EROFS_FS_SECURITY=y +CONFIG_EROFS_FS_ZIP=y +CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V2=m +CONFIG_NFS_V3=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_NFS_V4_SECURITY_LABEL=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +CONFIG_NFSD=m +CONFIG_NFSD_V2_ACL=y +CONFIG_NFSD_V3=y +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_PNFS=y +CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_NFSD_SCSILAYOUT=y +# CONFIG_NFSD_FLEXFILELAYOUT is not set +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_GRACE_PERIOD=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y +CONFIG_SUNRPC_DEBUG=y +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y +CONFIG_CEPH_FS_SECURITY_LABEL=y +CONFIG_CIFS=m +# CONFIG_CIFS_STATS2 is not set +# CONFIG_CIFS_ALLOW_INSECURE_LEGACY is not set +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_DEBUG=y +# CONFIG_CIFS_DEBUG2 is not set +# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set +CONFIG_CIFS_DFS_UPCALL=y +# CONFIG_CIFS_SMB_DIRECT is not set +CONFIG_CIFS_FSCACHE=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +# CONFIG_AFS_DEBUG is not set +CONFIG_AFS_FSCACHE=y +# CONFIG_AFS_DEBUG_CURSOR is not set +CONFIG_9P_FS=m +CONFIG_9P_FSCACHE=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m 
+CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_MAC_ROMAN=m +CONFIG_NLS_MAC_CELTIC=m +CONFIG_NLS_MAC_CENTEURO=m +CONFIG_NLS_MAC_CROATIAN=m +CONFIG_NLS_MAC_CYRILLIC=m +CONFIG_NLS_MAC_GAELIC=m +CONFIG_NLS_MAC_GREEK=m +CONFIG_NLS_MAC_ICELAND=m +CONFIG_NLS_MAC_INUIT=m +CONFIG_NLS_MAC_ROMANIAN=m +CONFIG_NLS_MAC_TURKISH=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set +CONFIG_UNICODE=y +# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set +# end of File systems + +# +# Security options +# +CONFIG_KEYS=y +CONFIG_KEYS_COMPAT=y +CONFIG_KEYS_REQUEST_CACHE=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y +CONFIG_TRUSTED_KEYS=m +CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_DH_OPERATIONS=y +CONFIG_SECURITY_DMESG_RESTRICT=y +CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y +CONFIG_SECURITY_TIOCSTI_RESTRICT=y +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_PAGE_TABLE_ISOLATION=y +CONFIG_SECURITY_INFINIBAND=y +CONFIG_SECURITY_NETWORK_XFRM=y +CONFIG_SECURITY_PATH=y +# CONFIG_INTEL_TXT is not set +CONFIG_LSM_MMAP_MIN_ADDR=65536 +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +CONFIG_HARDENED_USERCOPY=y +# CONFIG_HARDENED_USERCOPY_FALLBACK is not set +# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set +CONFIG_FORTIFY_SOURCE=y +# CONFIG_FORTIFY_SOURCE_STRICT_STRING is not set +# CONFIG_STATIC_USERMODEHELPER is not set +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +# CONFIG_SECURITY_SELINUX_DISABLE is not set +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_AVC_STATS=y +CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 +CONFIG_SECURITY_SMACK=y +CONFIG_SECURITY_SMACK_BRINGUP=y +CONFIG_SECURITY_SMACK_NETFILTER=y +CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y +CONFIG_SECURITY_TOMOYO=y +CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 +CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 +# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set +CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" +CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" +# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set +CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_APPARMOR_HASH=y +CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y +# CONFIG_SECURITY_APPARMOR_DEBUG is not set +# CONFIG_SECURITY_LOADPIN is not set +CONFIG_SECURITY_YAMA=y +CONFIG_SECURITY_SAFESETID=y +# CONFIG_SECURITY_LOCKDOWN_LSM is not set +# CONFIG_INTEGRITY is not set +# CONFIG_DEFAULT_SECURITY_SELINUX is not set +# CONFIG_DEFAULT_SECURITY_SMACK is not set +# CONFIG_DEFAULT_SECURITY_TOMOYO is not set +# CONFIG_DEFAULT_SECURITY_APPARMOR is not set +CONFIG_DEFAULT_SECURITY_DAC=y +CONFIG_LSM="yama" + +# +# Kernel hardening options +# +CONFIG_GCC_PLUGIN_STRUCTLEAK=y + +# +# Memory initialization +# +# CONFIG_INIT_STACK_NONE is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set +CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y +# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set +CONFIG_GCC_PLUGIN_STACKLEAK=y +CONFIG_STACKLEAK_TRACK_MIN_SIZE=100 +# CONFIG_STACKLEAK_METRICS is not set +# CONFIG_STACKLEAK_RUNTIME_DISABLE is not set +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +CONFIG_INIT_ON_FREE_DEFAULT_ON=y +CONFIG_PAGE_SANITIZE_VERIFY=y +CONFIG_SLAB_SANITIZE_VERIFY=y +# end of Memory initialization +# end of Kernel hardening options +# end of Security options + 
+CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_BLKCIPHER=y +CONFIG_CRYPTO_BLKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=y +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=y +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y +CONFIG_CRYPTO_GF128MUL=y +CONFIG_CRYPTO_NULL=y +CONFIG_CRYPTO_NULL2=y +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_SIMD=m +CONFIG_CRYPTO_GLUE_HELPER_X86=m +CONFIG_CRYPTO_ENGINE=m + +# +# Public-key cryptography +# +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=y +CONFIG_CRYPTO_ECC=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_ECHAINIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=y +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTR=y +CONFIG_CRYPTO_CTS=y +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_OFB=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=y +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_NHPOLY1305=m +CONFIG_CRYPTO_NHPOLY1305_SSE2=m +CONFIG_CRYPTO_NHPOLY1305_AVX2=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ESSIV=m + +# +# Hash modes +# +CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_CRC32_PCLMUL=m +CONFIG_CRYPTO_XXHASH=m +CONFIG_CRYPTO_CRCT10DIF=y +CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m +CONFIG_CRYPTO_GHASH=y +CONFIG_CRYPTO_POLY1305=m +CONFIG_CRYPTO_POLY1305_X86_64=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA1_SSSE3=m +CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA512_SSSE3=m +CONFIG_CRYPTO_LIB_SHA256=y +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m +CONFIG_CRYPTO_STREEBOG=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m + +# +# Ciphers +# +CONFIG_CRYPTO_LIB_AES=y +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES_NI_INTEL=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_LIB_ARC4=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_BLOWFISH_COMMON=m +CONFIG_CRYPTO_BLOWFISH_X86_64=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAMELLIA_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m +CONFIG_CRYPTO_CAST_COMMON=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST5_AVX_X86_64=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_CAST6_AVX_X86_64=m +CONFIG_CRYPTO_LIB_DES=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_DES3_EDE_X86_64=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_CHACHA20=m +CONFIG_CRYPTO_CHACHA20_X86_64=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m 
+CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +CONFIG_CRYPTO_TWOFISH_X86_64=m +CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m +CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_842=m +CONFIG_CRYPTO_LZ4=m +CONFIG_CRYPTO_LZ4HC=m +CONFIG_CRYPTO_ZSTD=y + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_DRBG_MENU=y +CONFIG_CRYPTO_DRBG_HMAC=y +CONFIG_CRYPTO_DRBG_HASH=y +CONFIG_CRYPTO_DRBG_CTR=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_JITTERENTROPY=y +CONFIG_CRYPTO_USER_API=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +# CONFIG_CRYPTO_STATS is not set +CONFIG_CRYPTO_HASH_INFO=y +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +CONFIG_CRYPTO_DEV_ATMEL_I2C=m +CONFIG_CRYPTO_DEV_ATMEL_ECC=m +CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m +CONFIG_CRYPTO_DEV_CCP=y +CONFIG_CRYPTO_DEV_CCP_DD=m +CONFIG_CRYPTO_DEV_SP_CCP=y +CONFIG_CRYPTO_DEV_CCP_CRYPTO=m +CONFIG_CRYPTO_DEV_SP_PSP=y +# CONFIG_CRYPTO_DEV_CCP_DEBUGFS is not set +CONFIG_CRYPTO_DEV_QAT=m +CONFIG_CRYPTO_DEV_QAT_DH895xCC=m +CONFIG_CRYPTO_DEV_QAT_C3XXX=m +CONFIG_CRYPTO_DEV_QAT_C62X=m +CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m +CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m +CONFIG_CRYPTO_DEV_QAT_C62XVF=m +CONFIG_CRYPTO_DEV_NITROX=m +CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m +CONFIG_CRYPTO_DEV_CHELSIO=m +CONFIG_CHELSIO_IPSEC_INLINE=y +CONFIG_CRYPTO_DEV_CHELSIO_TLS=m +CONFIG_CRYPTO_DEV_VIRTIO=m +# CONFIG_CRYPTO_DEV_SAFEXCEL is not set +CONFIG_CRYPTO_DEV_CCREE=m +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS8_PRIVATE_KEY_PARSER=m +CONFIG_TPM_KEY_PARSER=m +CONFIG_PKCS7_MESSAGE_PARSER=y +# CONFIG_PKCS7_TEST_KEY is not set +CONFIG_SIGNED_PE_FILE_VERIFICATION=y + +# +# Certificates for signature checking +# +CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" +# end of Certificates for signature checking + +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_RAID6_PQ_BENCHMARK=y +CONFIG_PACKING=y +CONFIG_BITREVERSE=y +CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_CORDIC=m +CONFIG_RATIONAL=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_GENERIC_IOMAP=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_CRC_CCITT=y +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=y +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set +CONFIG_CRC64=m +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_CRC8=m +CONFIG_XXHASH=y +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_842_COMPRESS=m +CONFIG_842_DECOMPRESS=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=y +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=m +CONFIG_LZ4HC_COMPRESS=m +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y +CONFIG_XZ_DEC=y +CONFIG_XZ_DEC_X86=y +CONFIG_XZ_DEC_POWERPC=y +CONFIG_XZ_DEC_IA64=y +CONFIG_XZ_DEC_ARM=y +CONFIG_XZ_DEC_ARMTHUMB=y 
+CONFIG_XZ_DEC_SPARC=y +CONFIG_XZ_DEC_BCJ=y +# CONFIG_XZ_DEC_TEST is not set +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZO=y +CONFIG_DECOMPRESS_LZ4=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=y +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_BCH=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_BTREE=y +CONFIG_INTERVAL_TREE=y +CONFIG_XARRAY_MULTI=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y +CONFIG_DMA_VIRT_OPS=y +CONFIG_SWIOTLB=y +# CONFIG_DMA_API_DEBUG is not set +CONFIG_SGL_ALLOC=y +CONFIG_IOMMU_HELPER=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_LRU_CACHE=m +CONFIG_CLZ_TAB=y +CONFIG_IRQ_POLL=y +CONFIG_MPILIB=y +CONFIG_DIMLIB=y +CONFIG_LIBFDT=y +CONFIG_OID_REGISTRY=y +CONFIG_UCS2_STRING=y +CONFIG_HAVE_GENERIC_VDSO=y +CONFIG_GENERIC_GETTIMEOFDAY=y +CONFIG_FONT_SUPPORT=y +CONFIG_FONTS=y +# CONFIG_FONT_8x8 is not set +CONFIG_FONT_8x16=y +# CONFIG_FONT_6x11 is not set +# CONFIG_FONT_7x14 is not set +# CONFIG_FONT_PEARL_8x8 is not set +# CONFIG_FONT_ACORN_8x8 is not set +# CONFIG_FONT_MINI_4x6 is not set +# CONFIG_FONT_6x10 is not set +# CONFIG_FONT_10x18 is not set +# CONFIG_FONT_SUN8x16 is not set +# CONFIG_FONT_SUN12x22 is not set +# CONFIG_FONT_TER16x32 is not set +CONFIG_FONT_AUTOSELECT=y +CONFIG_SG_POOL=y +CONFIG_ARCH_HAS_PMEM_API=y +CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y +CONFIG_ARCH_HAS_UACCESS_MCSAFE=y +CONFIG_ARCH_STACKWALK=y +CONFIG_SBITMAP=y +CONFIG_PARMAN=m +CONFIG_OBJAGG=m +# CONFIG_STRING_SELFTEST is not set +# end of Library routines + +# +# Kernel hacking +# + +# +# printk and dmesg options +# +CONFIG_PRINTK_TIME=y +# CONFIG_PRINTK_CALLER is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 +CONFIG_CONSOLE_LOGLEVEL_QUIET=1 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 +# CONFIG_BOOT_PRINTK_DELAY is not set +CONFIG_DYNAMIC_DEBUG=y +# end of printk and dmesg options + +# +# Compile-time checks and compiler options +# +# CONFIG_DEBUG_INFO is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 +CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +CONFIG_DEBUG_FS=y +# CONFIG_HEADERS_INSTALL is not set +CONFIG_OPTIMIZE_INLINING=y +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +# CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE is not set +CONFIG_STACK_VALIDATION=y +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +# end of Compile-time checks and compiler options + +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_MISC=y + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_OWNER is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_OBJECTS is not set +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_VM is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +CONFIG_DEBUG_MEMORY_INIT=y +# CONFIG_DEBUG_PER_CPU_MAPS is not 
set +CONFIG_HAVE_ARCH_KASAN=y +CONFIG_CC_HAS_KASAN_GENERIC=y +# CONFIG_KASAN is not set +CONFIG_KASAN_STACK=1 +# end of Memory Debugging + +CONFIG_ARCH_HAS_KCOV=y +CONFIG_CC_HAS_SANCOV_TRACE_PC=y +# CONFIG_KCOV is not set +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Lockups and Hangs +# +CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y +CONFIG_HARDLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 +# CONFIG_WQ_WATCHDOG is not set +# end of Debug Lockups and Hangs + +CONFIG_PANIC_ON_OOPS=y +CONFIG_PANIC_ON_OOPS_VALUE=1 +CONFIG_PANIC_TIMEOUT=0 +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_INFO=y +CONFIG_SCHEDSTATS=y +CONFIG_SCHED_STACK_END_CHECK=y +# CONFIG_DEBUG_TIMEKEEPING is not set +CONFIG_DEBUG_PREEMPT=y + +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +CONFIG_LOCK_DEBUGGING_SUPPORT=y +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +# end of Lock Debugging (spinlocks, mutexes, etc...) + +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set +CONFIG_DEBUG_BUGVERBOSE=y +CONFIG_DEBUG_LIST=y +# CONFIG_DEBUG_PLIST is not set +CONFIG_DEBUG_SG=y +CONFIG_DEBUG_NOTIFIERS=y +CONFIG_DEBUG_CREDENTIALS=y + +# +# RCU Debugging +# +# CONFIG_RCU_PERF_TEST is not set +# CONFIG_RCU_TORTURE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# end of RCU Debugging + +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +CONFIG_FUNCTION_ERROR_INJECTION=y +# CONFIG_FAULT_INJECTION is not set +CONFIG_LATENCYTOP=y +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_FENTRY=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACER_MAX_TRACE=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y +CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_RING_BUFFER_ALLOW_SWAP=y +CONFIG_TRACING=y +CONFIG_GENERIC_TRACER=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y +# CONFIG_PREEMPTIRQ_EVENTS is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +CONFIG_SCHED_TRACER=y +CONFIG_HWLAT_TRACER=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_TRACER_SNAPSHOT=y +# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +CONFIG_STACK_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KPROBE_EVENTS=y +# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set +CONFIG_UPROBE_EVENTS=y 
+CONFIG_BPF_EVENTS=y +CONFIG_DYNAMIC_EVENTS=y +CONFIG_PROBE_EVENTS=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_FUNCTION_PROFILER=y +# CONFIG_BPF_KPROBE_OVERRIDE is not set +CONFIG_FTRACE_MCOUNT_RECORD=y +# CONFIG_FTRACE_STARTUP_TEST is not set +CONFIG_MMIOTRACE=y +# CONFIG_HIST_TRIGGERS is not set +# CONFIG_MMIOTRACE_TEST is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_PREEMPTIRQ_DELAY_TEST is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_LKDTM=m +# CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_SORT is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_RBTREE_TEST is not set +# CONFIG_REED_SOLOMON_TEST is not set +# CONFIG_INTERVAL_TREE_TEST is not set +# CONFIG_PERCPU_TEST is not set +# CONFIG_ATOMIC64_SELFTEST is not set +# CONFIG_ASYNC_RAID6_TEST is not set +# CONFIG_TEST_HEXDUMP is not set +# CONFIG_TEST_STRING_HELPERS is not set +# CONFIG_TEST_STRSCPY is not set +# CONFIG_TEST_KSTRTOX is not set +# CONFIG_TEST_PRINTF is not set +# CONFIG_TEST_BITMAP is not set +# CONFIG_TEST_BITFIELD is not set +# CONFIG_TEST_UUID is not set +# CONFIG_TEST_XARRAY is not set +# CONFIG_TEST_OVERFLOW is not set +# CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set +# CONFIG_TEST_IDA is not set +# CONFIG_TEST_PARMAN is not set +# CONFIG_TEST_LKM is not set +# CONFIG_TEST_VMALLOC is not set +# CONFIG_TEST_USER_COPY is not set +# CONFIG_TEST_BPF is not set +# CONFIG_TEST_BLACKHOLE_DEV is not set +# CONFIG_FIND_BIT_BENCHMARK is not set +# CONFIG_TEST_FIRMWARE is not set +# CONFIG_TEST_SYSCTL is not set +# CONFIG_TEST_UDELAY is not set +# CONFIG_TEST_STATIC_KEYS is not set +# CONFIG_TEST_KMOD is not set +# CONFIG_TEST_MEMCAT_P is not set +# CONFIG_TEST_OBJAGG is not set +# CONFIG_TEST_STACKINIT is not set +# CONFIG_TEST_MEMINIT is not set +# CONFIG_MEMTEST is not set +CONFIG_BUG_ON_DATA_CORRUPTION=y +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# CONFIG_UBSAN is not set +CONFIG_UBSAN_ALIGNMENT=y +CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +# CONFIG_X86_VERBOSE_BOOTUP is not set +CONFIG_EARLY_PRINTK=y +# CONFIG_EARLY_PRINTK_DBGP is not set +# CONFIG_EARLY_PRINTK_USB_XDBC is not set +CONFIG_X86_PTDUMP_CORE=y +# CONFIG_X86_PTDUMP is not set +# CONFIG_EFI_PGT_DUMP is not set +CONFIG_DEBUG_WX=y +CONFIG_DOUBLEFAULT=y +# CONFIG_DEBUG_TLBFLUSH is not set +# CONFIG_IOMMU_DEBUG is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEBUG_BOOT_PARAMS=y +# CONFIG_CPA_DEBUG is not set +# CONFIG_DEBUG_ENTRY is not set +# CONFIG_DEBUG_NMI_SELFTEST is not set +# CONFIG_X86_DEBUG_FPU is not set +# CONFIG_PUNIT_ATOM_DEBUG is not set +CONFIG_UNWINDER_ORC=y +# CONFIG_UNWINDER_FRAME_POINTER is not set +# CONFIG_UNWINDER_GUESS is not set +# end of Kernel hacking diff --git a/linux-tkg/linux-tkg-config/5.7/90-cleanup.hook b/linux-tkg/linux-tkg-config/5.7/90-cleanup.hook new file mode 100644 index 0000000..99f5221 --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.7/90-cleanup.hook @@ -0,0 +1,14 @@ +[Trigger] +Type = File +Operation = Install +Operation = Upgrade +Operation = Remove +Target = 
usr/lib/modules/*/ +Target = !usr/lib/modules/*/?* + +[Action] +Description = Cleaning up... +When = PostTransaction +Exec = /usr/share/libalpm/scripts/cleanup +NeedsTargets + diff --git a/linux-tkg/linux-tkg-config/5.7/cleanup b/linux-tkg/linux-tkg-config/5.7/cleanup new file mode 100755 index 0000000..c00c08d --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.7/cleanup @@ -0,0 +1,10 @@ +#!/bin/bash + +for _f in /usr/lib/modules/*tkg*; do + if [[ ! -e ${_f}/vmlinuz ]]; then + rm -rf "$_f" + fi +done + +# vim:set ft=sh sw=2 et: + diff --git a/linux-tkg/linux-tkg-config/5.7/config.x86_64 b/linux-tkg/linux-tkg-config/5.7/config.x86_64 new file mode 100644 index 0000000..1014972 --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.7/config.x86_64 @@ -0,0 +1,10864 @@ +# +# Automatically generated file; DO NOT EDIT. +# Linux/x86 5.7.11-arch1 Kernel Configuration +# + +# +# Compiler: gcc (GCC) 10.1.0 +# +CONFIG_CC_IS_GCC=y +CONFIG_GCC_VERSION=100100 +CONFIG_LD_VERSION=234000000 +CONFIG_CLANG_VERSION=0 +CONFIG_CC_CAN_LINK=y +CONFIG_CC_HAS_ASM_GOTO=y +CONFIG_CC_HAS_ASM_INLINE=y +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_TABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_INIT_ENV_ARG_LIMIT=32 +# CONFIG_COMPILE_TEST is not set +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_BUILD_SALT="" +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_XZ=y +CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set +CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set +CONFIG_DEFAULT_HOSTNAME="archlinux" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +CONFIG_CROSS_MEMORY_ATTACH=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_HAVE_ARCH_AUDITSYSCALL=y +CONFIG_AUDITSYSCALL=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_GENERIC_IRQ_MIGRATION=y +CONFIG_HARDIRQS_SW_RESEND=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_SIM=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +CONFIG_IRQ_MSI_IOMMU=y +CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y +CONFIG_GENERIC_IRQ_RESERVATION_MODE=y +CONFIG_IRQ_FORCED_THREADING=y +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +# end of IRQ subsystem + +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_ARCH_CLOCKSOURCE_INIT=y +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y +CONFIG_GENERIC_CMOS_UPDATE=y + +# +# Timers subsystem +# +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ_COMMON=y +# CONFIG_HZ_PERIODIC is not set +CONFIG_NO_HZ_IDLE=y +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +# end of Timers subsystem + +# CONFIG_PREEMPT_NONE is not set +# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_PREEMPT=y +CONFIG_PREEMPT_COUNT=y +CONFIG_PREEMPTION=y + +# +# CPU/Task time and stats accounting +# +CONFIG_TICK_CPU_ACCOUNTING=y +# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_SCHED_AVG_IRQ=y +# CONFIG_SCHED_THERMAL_PRESSURE is not set +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y 
+CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_PSI=y +# CONFIG_PSI_DEFAULT_DISABLED is not set +# end of CPU/Task time and stats accounting + +CONFIG_CPU_ISOLATION=y + +# +# RCU Subsystem +# +CONFIG_TREE_RCU=y +CONFIG_PREEMPT_RCU=y +CONFIG_RCU_EXPERT=y +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_TASKS_RCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +CONFIG_RCU_FANOUT=64 +CONFIG_RCU_FANOUT_LEAF=16 +CONFIG_RCU_FAST_NO_HZ=y +CONFIG_RCU_BOOST=y +CONFIG_RCU_BOOST_DELAY=500 +# CONFIG_RCU_NOCB_CPU is not set +# end of RCU Subsystem + +CONFIG_BUILD_BIN2C=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_IKHEADERS is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y + +# +# Scheduler features +# +CONFIG_UCLAMP_TASK=y +CONFIG_UCLAMP_BUCKETS_COUNT=5 +# end of Scheduler features + +CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y +CONFIG_CC_HAS_INT128=y +CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_NUMA_BALANCING=y +CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y +CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_SWAP_ENABLED=y +CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +# CONFIG_RT_GROUP_SCHED is not set +CONFIG_UCLAMP_TASK_GROUP=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_TIME_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_USER_NS_UNPRIVILEGED=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_AUTOGROUP=y +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_RD_XZ=y +CONFIG_RD_LZO=y +CONFIG_RD_LZ4=y +CONFIG_BOOT_CONFIG=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_SYSCTL=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_HAVE_PCSPKR_PLATFORM=y +CONFIG_BPF=y +CONFIG_EXPERT=y +# CONFIG_UID16 is not set +CONFIG_MULTIUSER=y +CONFIG_SGETMASK_SYSCALL=y +# CONFIG_SYSFS_SYSCALL is not set +CONFIG_FHANDLE=y +CONFIG_POSIX_TIMERS=y +CONFIG_PRINTK=y +CONFIG_PRINTK_NMI=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_IO_URING=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_MEMBARRIER=y +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_BPF_LSM=y +CONFIG_BPF_SYSCALL=y +CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +# CONFIG_USERFAULTFD is not set +CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_RSEQ=y +# CONFIG_DEBUG_RSEQ is not set +# CONFIG_EMBEDDED is not set +CONFIG_HAVE_PERF_EVENTS=y +# CONFIG_PC104 is not set + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +# end of Kernel Performance Events And Counters + +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set +# CONFIG_COMPAT_BRK is not set +# 
CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +CONFIG_SLAB_MERGE_DEFAULT=y +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_SLUB_CPU_PARTIAL=y +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +# end of General setup + +CONFIG_64BIT=y +CONFIG_X86_64=y +CONFIG_X86=y +CONFIG_INSTRUCTION_DECODER=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_MMU=y +CONFIG_ARCH_MMAP_RND_BITS_MIN=28 +CONFIG_ARCH_MMAP_RND_BITS_MAX=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_ARCH_HAS_FILTER_PGPROT=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_WANT_GENERAL_HUGETLB=y +CONFIG_ZONE_DMA32=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y +CONFIG_X86_64_SMP=y +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_DYNAMIC_PHYSICAL_MASK=y +CONFIG_PGTABLE_LEVELS=5 +CONFIG_CC_HAS_SANE_STACKPROTECTOR=y + +# +# Processor type and features +# +CONFIG_ZONE_DMA=y +CONFIG_SMP=y +CONFIG_X86_FEATURE_NAMES=y +CONFIG_X86_X2APIC=y +CONFIG_X86_MPPARSE=y +# CONFIG_GOLDFISH is not set +CONFIG_RETPOLINE=y +CONFIG_X86_CPU_RESCTRL=y +# CONFIG_X86_EXTENDED_PLATFORM is not set +CONFIG_X86_INTEL_LPSS=y +CONFIG_X86_AMD_PLATFORM_DEVICE=y +CONFIG_IOSF_MBI=y +# CONFIG_IOSF_MBI_DEBUG is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_PARAVIRT_XXL=y +# CONFIG_PARAVIRT_DEBUG is not set +CONFIG_PARAVIRT_SPINLOCKS=y +CONFIG_X86_HV_CALLBACK_VECTOR=y +CONFIG_XEN=y +CONFIG_XEN_PV=y +CONFIG_XEN_PV_SMP=y +CONFIG_XEN_DOM0=y +CONFIG_XEN_PVHVM=y +CONFIG_XEN_PVHVM_SMP=y +CONFIG_XEN_512GB=y +CONFIG_XEN_SAVE_RESTORE=y +# CONFIG_XEN_DEBUG_FS is not set +CONFIG_XEN_PVH=y +CONFIG_KVM_GUEST=y +CONFIG_ARCH_CPUIDLE_HALTPOLL=y +CONFIG_PVH=y +# CONFIG_KVM_DEBUG_FS is not set +CONFIG_PARAVIRT_TIME_ACCOUNTING=y +CONFIG_PARAVIRT_CLOCK=y +CONFIG_JAILHOUSE_GUEST=y +CONFIG_ACRN_GUEST=y +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +# CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_IA32_FEAT_CTL=y +CONFIG_X86_VMX_FEATURE_NAMES=y +CONFIG_PROCESSOR_SELECT=y +CONFIG_CPU_SUP_INTEL=y +CONFIG_CPU_SUP_AMD=y +CONFIG_CPU_SUP_HYGON=y +CONFIG_CPU_SUP_CENTAUR=y +CONFIG_CPU_SUP_ZHAOXIN=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y +CONFIG_GART_IOMMU=y +# CONFIG_MAXSMP is not set +CONFIG_NR_CPUS_RANGE_BEGIN=2 +CONFIG_NR_CPUS_RANGE_END=512 +CONFIG_NR_CPUS_DEFAULT=64 +CONFIG_NR_CPUS=320 +CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y +CONFIG_SCHED_MC_PRIO=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y +CONFIG_X86_MCE_INJECT=m +CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# 
+CONFIG_PERF_EVENTS_INTEL_UNCORE=m +CONFIG_PERF_EVENTS_INTEL_RAPL=m +CONFIG_PERF_EVENTS_INTEL_CSTATE=m +CONFIG_PERF_EVENTS_AMD_POWER=m +# end of Performance monitoring + +CONFIG_X86_16BIT=y +CONFIG_X86_ESPFIX64=y +CONFIG_X86_VSYSCALL_EMULATION=y +CONFIG_X86_IOPL_IOPERM=y +CONFIG_I8K=m +CONFIG_MICROCODE=y +CONFIG_MICROCODE_INTEL=y +CONFIG_MICROCODE_AMD=y +CONFIG_MICROCODE_OLD_INTERFACE=y +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +CONFIG_X86_5LEVEL=y +CONFIG_X86_DIRECT_GBPAGES=y +# CONFIG_X86_CPA_STATISTICS is not set +CONFIG_AMD_MEM_ENCRYPT=y +# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set +CONFIG_NUMA=y +CONFIG_AMD_NUMA=y +CONFIG_X86_64_ACPI_NUMA=y +CONFIG_NODES_SPAN_OTHER_NODES=y +# CONFIG_NUMA_EMU is not set +CONFIG_NODES_SHIFT=5 +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_MEMORY_PROBE=y +CONFIG_ARCH_PROC_KCORE_TEXT=y +CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_X86_PMEM_LEGACY_DEVICE=y +CONFIG_X86_PMEM_LEGACY=m +CONFIG_X86_CHECK_BIOS_CORRUPTION=y +CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y +CONFIG_X86_RESERVE_LOW=64 +CONFIG_MTRR=y +CONFIG_MTRR_SANITIZER=y +CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 +CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 +CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y +CONFIG_ARCH_RANDOM=y +CONFIG_X86_SMAP=y +CONFIG_X86_UMIP=y +CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +# CONFIG_X86_INTEL_TSX_MODE_OFF is not set +# CONFIG_X86_INTEL_TSX_MODE_ON is not set +CONFIG_X86_INTEL_TSX_MODE_AUTO=y +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_SECCOMP=y +# CONFIG_HZ_100 is not set +# CONFIG_HZ_250 is not set +CONFIG_HZ_300=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=300 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_ARCH_HAS_KEXEC_PURGATORY=y +# CONFIG_KEXEC_SIG is not set +CONFIG_CRASH_DUMP=y +CONFIG_KEXEC_JUMP=y +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_DYNAMIC_MEMORY_LAYOUT=y +CONFIG_RANDOMIZE_MEMORY=y +CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 +CONFIG_HOTPLUG_CPU=y +# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set +# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_COMPAT_VDSO is not set +# CONFIG_LEGACY_VSYSCALL_EMULATE is not set +CONFIG_LEGACY_VSYSCALL_XONLY=y +# CONFIG_LEGACY_VSYSCALL_NONE is not set +# CONFIG_CMDLINE_BOOL is not set +CONFIG_MODIFY_LDT_SYSCALL=y +CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set +# end of Processor type and features + +CONFIG_ARCH_HAS_ADD_PAGES=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_USE_PERCPU_NUMA_NODE_ID=y +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y +CONFIG_ARCH_ENABLE_THP_MIGRATION=y + +# +# Power management and ACPI options +# +CONFIG_ARCH_HIBERNATION_HEADER=y +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +# CONFIG_SUSPEND_SKIP_SYNC is not set +CONFIG_HIBERNATE_CALLBACKS=y +CONFIG_HIBERNATION=y +CONFIG_PM_STD_PARTITION="" +CONFIG_PM_SLEEP=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=100 +CONFIG_PM_WAKELOCKS_GC=y +CONFIG_PM=y +CONFIG_PM_DEBUG=y +CONFIG_PM_ADVANCED_DEBUG=y +# CONFIG_PM_TEST_SUSPEND is not set +CONFIG_PM_SLEEP_DEBUG=y +# CONFIG_DPM_WATCHDOG is not set +CONFIG_PM_TRACE=y +CONFIG_PM_TRACE_RTC=y +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y +CONFIG_PM_GENERIC_DOMAINS_SLEEP=y +CONFIG_PM_GENERIC_DOMAINS_OF=y +CONFIG_ENERGY_MODEL=y 
+CONFIG_ARCH_SUPPORTS_ACPI=y +CONFIG_ACPI=y +CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y +CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y +CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y +# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SPCR_TABLE=y +CONFIG_ACPI_LPIT=y +CONFIG_ACPI_SLEEP=y +# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y +CONFIG_ACPI_EC_DEBUGFS=y +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=y +CONFIG_ACPI_VIDEO=y +CONFIG_ACPI_FAN=y +CONFIG_ACPI_TAD=m +CONFIG_ACPI_DOCK=y +CONFIG_ACPI_CPU_FREQ_PSS=y +CONFIG_ACPI_PROCESSOR_CSTATE=y +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_CPPC_LIB=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_IPMI=m +CONFIG_ACPI_HOTPLUG_CPU=y +CONFIG_ACPI_PROCESSOR_AGGREGATOR=y +CONFIG_ACPI_THERMAL=y +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_DEBUG=y +CONFIG_ACPI_PCI_SLOT=y +CONFIG_ACPI_CONTAINER=y +CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_HOTPLUG_IOAPIC=y +CONFIG_ACPI_SBS=m +CONFIG_ACPI_HED=y +CONFIG_ACPI_CUSTOM_METHOD=m +CONFIG_ACPI_BGRT=y +# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set +CONFIG_ACPI_NFIT=m +# CONFIG_NFIT_SECURITY_DEBUG is not set +CONFIG_ACPI_NUMA=y +CONFIG_ACPI_HMAT=y +CONFIG_HAVE_ACPI_APEI=y +CONFIG_HAVE_ACPI_APEI_NMI=y +CONFIG_ACPI_APEI=y +CONFIG_ACPI_APEI_GHES=y +CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_ACPI_APEI_MEMORY_FAILURE=y +CONFIG_ACPI_APEI_EINJ=m +CONFIG_ACPI_APEI_ERST_DEBUG=m +CONFIG_DPTF_POWER=m +CONFIG_ACPI_WATCHDOG=y +CONFIG_ACPI_EXTLOG=m +CONFIG_ACPI_ADXL=y +CONFIG_PMIC_OPREGION=y +CONFIG_BYTCRC_PMIC_OPREGION=y +CONFIG_CHTCRC_PMIC_OPREGION=y +CONFIG_XPOWER_PMIC_OPREGION=y +CONFIG_BXT_WC_PMIC_OPREGION=y +CONFIG_CHT_WC_PMIC_OPREGION=y +CONFIG_CHT_DC_TI_PMIC_OPREGION=y +CONFIG_ACPI_CONFIGFS=m +CONFIG_TPS68470_PMIC_OPREGION=y +CONFIG_X86_PM_TIMER=y +CONFIG_SFI=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=m +CONFIG_CPU_FREQ_GOV_USERSPACE=m +CONFIG_CPU_FREQ_GOV_ONDEMAND=m +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + +# +# CPU frequency scaling drivers +# +CONFIG_CPUFREQ_DT=m +CONFIG_CPUFREQ_DT_PLATDEV=y +CONFIG_X86_INTEL_PSTATE=y +CONFIG_X86_PCC_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ_CPB=y +CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_AMD_FREQ_SENSITIVITY=m +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +CONFIG_X86_P4_CLOCKMOD=m + +# +# shared options +# +CONFIG_X86_SPEEDSTEP_LIB=m +# end of CPU Frequency scaling + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +CONFIG_CPU_IDLE_GOV_TEO=y +CONFIG_CPU_IDLE_GOV_HALTPOLL=y +CONFIG_HALTPOLL_CPUIDLE=m +# end of CPU Idle + +CONFIG_INTEL_IDLE=y +# end of Power management and ACPI options + +# +# Bus options (PCI etc.) +# +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_XEN=y +CONFIG_MMCONF_FAM10H=y +# CONFIG_PCI_CNB20LE_QUIRK is not set +# CONFIG_ISA_BUS is not set +CONFIG_ISA_DMA_API=y +CONFIG_AMD_NB=y +# CONFIG_X86_SYSFB is not set +# end of Bus options (PCI etc.) 
+ +# +# Binary Emulations +# +CONFIG_IA32_EMULATION=y +# CONFIG_X86_X32 is not set +CONFIG_COMPAT_32=y +CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y +CONFIG_SYSVIPC_COMPAT=y +# end of Binary Emulations + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DMIID=y +CONFIG_DMI_SYSFS=m +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=m +CONFIG_FW_CFG_SYSFS=m +# CONFIG_FW_CFG_SYSFS_CMDLINE is not set +CONFIG_GOOGLE_FIRMWARE=y +# CONFIG_GOOGLE_SMI is not set +CONFIG_GOOGLE_COREBOOT_TABLE=m +CONFIG_GOOGLE_MEMCONSOLE=m +# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set +CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m +CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m +CONFIG_GOOGLE_VPD=m + +# +# EFI (Extensible Firmware Interface) Support +# +# CONFIG_EFI_VARS is not set +CONFIG_EFI_ESRT=y +CONFIG_EFI_RUNTIME_MAP=y +# CONFIG_EFI_FAKE_MEMMAP is not set +CONFIG_EFI_SOFT_RESERVE=y +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_CAPSULE_LOADER=m +# CONFIG_EFI_TEST is not set +CONFIG_APPLE_PROPERTIES=y +# CONFIG_RESET_ATTACK_MITIGATION is not set +CONFIG_EFI_RCI2_TABLE=y +# CONFIG_EFI_DISABLE_PCI_DMA is not set +# end of EFI (Extensible Firmware Interface) Support + +CONFIG_EFI_EMBEDDED_FIRMWARE=y +CONFIG_UEFI_CPER=y +CONFIG_UEFI_CPER_X86=y +CONFIG_EFI_DEV_PATH_PARSER=y +CONFIG_EFI_EARLYCON=y + +# +# Tegra firmware driver +# +# end of Tegra firmware driver +# end of Firmware Drivers + +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_IRQFD=y +CONFIG_HAVE_KVM_IRQ_ROUTING=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_MMIO=y +CONFIG_KVM_ASYNC_PF=y +CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y +CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y +CONFIG_KVM_COMPAT=y +CONFIG_HAVE_KVM_IRQ_BYPASS=y +CONFIG_HAVE_KVM_NO_POLL=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_WERROR=y +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +CONFIG_KVM_AMD_SEV=y +CONFIG_KVM_MMU_AUDIT=y +CONFIG_AS_AVX512=y +CONFIG_AS_SHA1_NI=y +CONFIG_AS_SHA256_NI=y + +# +# General architecture-dependent options +# +CONFIG_CRASH_CORE=y +CONFIG_KEXEC_CORE=y +CONFIG_HOTPLUG_SMT=y +CONFIG_OPROFILE=m +# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_OPROFILE_NMI_TIMER=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_OPTPROBES=y +CONFIG_KPROBES_ON_FTRACE=y +CONFIG_UPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y +CONFIG_KRETPROBES=y +CONFIG_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_OPTPROBES=y +CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y +CONFIG_HAVE_NMI=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_SET_DIRECT_MAP=y +CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y +CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y +CONFIG_HAVE_ASM_MODVERSIONS=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_RSEQ=y +CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_PERF_EVENTS_NMI=y +CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y +CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y +CONFIG_MMU_GATHER_TABLE_FREE=y +CONFIG_MMU_GATHER_RCU_TABLE_FREE=y 
+CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y +CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP_FILTER=y +CONFIG_HAVE_ARCH_STACKLEAK=y +CONFIG_HAVE_STACKPROTECTOR=y +CONFIG_CC_HAS_STACKPROTECTOR_NONE=y +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y +CONFIG_HAVE_CONTEXT_TRACKING=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_MOVE_PMD=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_HAVE_ARCH_SOFT_DIRTY=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_HAVE_EXIT_THREAD=y +CONFIG_ARCH_MMAP_RND_BITS=28 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8 +CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_COPY_THREAD_TLS=y +CONFIG_HAVE_STACK_VALIDATION=y +CONFIG_HAVE_RELIABLE_STACKTRACE=y +CONFIG_ISA_BUS_API=y +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +CONFIG_COMPAT_32BIT_TIME=y +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y +CONFIG_ARCH_USE_MEMREMAP_PROT=y +CONFIG_LOCK_EVENT_COUNTS=y +CONFIG_ARCH_HAS_MEM_ENCRYPT=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +# end of GCOV-based kernel profiling + +CONFIG_HAVE_GCC_PLUGINS=y +CONFIG_GCC_PLUGINS=y +# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set +# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set +# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set +# end of General architecture-dependent options + +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULE_SIG_FORMAT=y +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +# CONFIG_MODVERSIONS is not set +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +# CONFIG_MODULE_SIG_FORCE is not set +CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set +# CONFIG_MODULE_SIG_SHA224 is not set +# CONFIG_MODULE_SIG_SHA256 is not set +# CONFIG_MODULE_SIG_SHA384 is not set +CONFIG_MODULE_SIG_SHA512=y +CONFIG_MODULE_SIG_HASH="sha512" +CONFIG_MODULE_COMPRESS=y +# CONFIG_MODULE_COMPRESS_GZIP is not set +CONFIG_MODULE_COMPRESS_XZ=y +CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS=y +CONFIG_UNUSED_SYMBOLS=y +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLK_RQ_ALLOC_TIME=y +CONFIG_BLK_SCSI_REQUEST=y +CONFIG_BLK_CGROUP_RWSTAT=y +CONFIG_BLK_DEV_BSG=y +CONFIG_BLK_DEV_BSGLIB=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_INTEGRITY_T10=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_DEV_THROTTLING_LOW=y +# CONFIG_BLK_CMDLINE_PARSER is not set +CONFIG_BLK_WBT=y +CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_WBT_MQ=y +CONFIG_BLK_DEBUG_FS=y +CONFIG_BLK_DEBUG_FS_ZONED=y +CONFIG_BLK_SED_OPAL=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_AIX_PARTITION=y +# CONFIG_OSF_PARTITION is not set +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y 
+CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +# CONFIG_UNIXWARE_DISKLABEL is not set +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +# CONFIG_SGI_PARTITION is not set +# CONFIG_ULTRIX_PARTITION is not set +# CONFIG_SUN_PARTITION is not set +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set +# CONFIG_CMDLINE_PARTITION is not set +# end of Partition Types + +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_MQ_VIRTIO=y +CONFIG_BLK_MQ_RDMA=y +CONFIG_BLK_PM=y + +# +# IO Schedulers +# +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +# CONFIG_BFQ_CGROUP_DEBUG is not set +# end of IO Schedulers + +CONFIG_PREEMPT_NOTIFIERS=y +CONFIG_PADATA=y +CONFIG_ASN1=y +CONFIG_UNINLINE_SPIN_UNLOCK=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y +CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y +CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_FREEZER=y + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +CONFIG_BINFMT_MISC=y +CONFIG_COREDUMP=y +# end of Executable file formats + +# +# Memory Management options +# +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_HAVE_MEMBLOCK_NODE_MAP=y +CONFIG_HAVE_FAST_GUP=y +CONFIG_NUMA_KEEP_MEMINFO=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_HAVE_BOOTMEM_INFO_NODE=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTPLUG_SPARSE=y +CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MEMORY_BALLOON=y +CONFIG_BALLOON_COMPACTION=y +CONFIG_COMPACTION=y +CONFIG_PAGE_REPORTING=y +CONFIG_MIGRATION=y +CONFIG_CONTIG_ALLOC=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y +CONFIG_HWPOISON_INJECT=m +CONFIG_TRANSPARENT_HUGEPAGE=y +# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set +CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y +CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_THP_SWAP=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +# CONFIG_CMA is not set +# CONFIG_MEM_SOFT_DIRTY is not set +CONFIG_ZSWAP=y +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set +CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD is not set +CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4" +# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set +CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD=y +# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set +CONFIG_ZSWAP_ZPOOL_DEFAULT="z3fold" +CONFIG_ZSWAP_DEFAULT_ON=y +CONFIG_ZPOOL=y +CONFIG_ZBUD=y +CONFIG_Z3FOLD=y +CONFIG_ZSMALLOC=y +# CONFIG_PGTABLE_MAPPING is not set +# CONFIG_ZSMALLOC_STAT is not set +CONFIG_GENERIC_EARLY_IOREMAP=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set +CONFIG_ARCH_HAS_PTE_DEVMAP=y +CONFIG_ZONE_DEVICE=y 
+CONFIG_DEV_PAGEMAP_OPS=y +CONFIG_HMM_MIRROR=y +CONFIG_DEVICE_PRIVATE=y +CONFIG_FRAME_VECTOR=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y +CONFIG_ARCH_HAS_PKEYS=y +# CONFIG_PERCPU_STATS is not set +# CONFIG_GUP_BENCHMARK is not set +CONFIG_READ_ONLY_THP_FOR_FS=y +CONFIG_ARCH_HAS_PTE_SPECIAL=y +CONFIG_MAPPING_DIRTY_HELPERS=y +# end of Memory Management options + +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_INGRESS=y +CONFIG_NET_EGRESS=y +CONFIG_NET_REDIRECT=y +CONFIG_SKB_EXTENSIONS=y + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_DIAG=y +CONFIG_UNIX=y +CONFIG_UNIX_SCM=y +CONFIG_UNIX_DIAG=y +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +# CONFIG_TLS_TOE is not set +CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y +CONFIG_XFRM_ALGO=m +CONFIG_XFRM_USER=m +CONFIG_XFRM_INTERFACE=m +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_XFRM_IPCOMP=m +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +# CONFIG_IP_FIB_TRIE_STATS is not set +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_CLASSID=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IP_TUNNEL=m +CONFIG_NET_IPGRE=m +# CONFIG_NET_IPGRE_BROADCAST is not set +CONFIG_IP_MROUTE_COMMON=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESP_OFFLOAD=m +CONFIG_INET_ESPINTCP=y +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_DIAG=m +CONFIG_INET_TCP_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_INET_RAW_DIAG=m +CONFIG_INET_DIAG_DESTROY=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=y +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_NV=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m +CONFIG_TCP_CONG_CDG=m +CONFIG_TCP_CONG_BBR=m +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_RENO is not set +CONFIG_DEFAULT_TCP_CONG="cubic" +CONFIG_TCP_MD5SIG=y +CONFIG_IPV6=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESP_OFFLOAD=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_ILA=m +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y +CONFIG_IPV6_NDISC_NODETYPE=y +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IPV6_SEG6_HMAC=y +CONFIG_IPV6_SEG6_BPF=y +CONFIG_IPV6_RPL_LWTUNNEL=y +CONFIG_NETLABEL=y +CONFIG_MPTCP=y +CONFIG_MPTCP_IPV6=y +# CONFIG_MPTCP_HMAC_TEST is not set +CONFIG_NETWORK_SECMARK=y +CONFIG_NET_PTP_CLASSIFY=y +CONFIG_NETWORK_PHY_TIMESTAMPING=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_FAMILY_BRIDGE=y 
+CONFIG_NETFILTER_FAMILY_ARP=y +CONFIG_NETFILTER_NETLINK_ACCT=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NETFILTER_NETLINK_OSF=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_LOG_COMMON=m +CONFIG_NF_LOG_NETDEV=m +CONFIG_NETFILTER_CONNCOUNT=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_GRE=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y +CONFIG_NF_NAT=m +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_CONNLIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_OBJREF=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NF_DUP_NETDEV=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES=m + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +CONFIG_NETFILTER_XT_CONNMARK=m +CONFIG_NETFILTER_XT_SET=m + +# +# Xtables targets +# +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LED=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_RATEEST=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m 
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ECN=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +# end of Core Netfilter Configuration + +CONFIG_IP_SET=m +CONFIG_IP_SET_MAX=256 +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPMARK=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_IPMAC=m +CONFIG_IP_SET_HASH_MAC=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=15 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_AH_ESP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_FO=m +CONFIG_IP_VS_OVF=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_MH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS MH scheduler +# +CONFIG_IP_VS_MH_TAB_INDEX=12 + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PE_SIP=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +CONFIG_NF_SOCKET_IPV4=m +CONFIG_NF_TPROXY_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_REJECT_IPV4=m +CONFIG_NFT_DUP_IPV4=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_FLOW_TABLE_IPV4=m +CONFIG_NF_DUP_IPV4=m +CONFIG_NF_LOG_ARP=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_REJECT_IPV4=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_SYNPROXY=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m 
+CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +# end of IP: Netfilter Configuration + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_SOCKET_IPV6=m +CONFIG_NF_TPROXY_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_REJECT_IPV6=m +CONFIG_NFT_DUP_IPV6=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NF_FLOW_TABLE_IPV6=m +CONFIG_NF_DUP_IPV6=m +CONFIG_NF_REJECT_IPV6=m +CONFIG_NF_LOG_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_SRH=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_TARGET_SYNPROXY=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_IP6_NF_TARGET_NPT=m +# end of IPv6: Netfilter Configuration + +CONFIG_NF_DEFRAG_IPV6=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_BRIDGE_REJECT=m +CONFIG_NF_LOG_BRIDGE=m +CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +# CONFIG_BPFILTER is not set +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m + +# +# DCCP CCIDs Configuration +# +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=y +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_TFRC_LIB=y +# end of DCCP CCIDs Configuration + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +# end of DCCP Kernel Hacking + +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_OBJCNT is not set +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set +CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set +CONFIG_SCTP_COOKIE_HMAC_MD5=y +CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +# CONFIG_RDS_DEBUG is not set +CONFIG_TIPC=m +CONFIG_TIPC_MEDIA_IB=y +CONFIG_TIPC_MEDIA_UDP=y +CONFIG_TIPC_CRYPTO=y +CONFIG_TIPC_DIAG=m +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_L2TP=m +# CONFIG_L2TP_DEBUGFS is not set +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_STP=m +CONFIG_GARP=m +CONFIG_MRP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_HAVE_NET_DSA=y +CONFIG_NET_DSA=m +CONFIG_NET_DSA_TAG_8021Q=m +CONFIG_NET_DSA_TAG_AR9331=m +CONFIG_NET_DSA_TAG_BRCM_COMMON=m +CONFIG_NET_DSA_TAG_BRCM=m +CONFIG_NET_DSA_TAG_BRCM_PREPEND=m +CONFIG_NET_DSA_TAG_GSWIP=m +CONFIG_NET_DSA_TAG_DSA=m +CONFIG_NET_DSA_TAG_EDSA=m +CONFIG_NET_DSA_TAG_MTK=m +CONFIG_NET_DSA_TAG_KSZ=m +CONFIG_NET_DSA_TAG_OCELOT=m +CONFIG_NET_DSA_TAG_QCA=m +CONFIG_NET_DSA_TAG_LAN9303=m +CONFIG_NET_DSA_TAG_SJA1105=m +CONFIG_NET_DSA_TAG_TRAILER=m +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y 
+CONFIG_VLAN_8021Q_MVRP=y +# CONFIG_DECNET is not set +CONFIG_LLC=m +CONFIG_LLC2=m +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=m +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +CONFIG_PHONET=m +CONFIG_6LOWPAN=m +# CONFIG_6LOWPAN_DEBUGFS is not set +CONFIG_6LOWPAN_NHC=m +CONFIG_6LOWPAN_NHC_DEST=m +CONFIG_6LOWPAN_NHC_FRAGMENT=m +CONFIG_6LOWPAN_NHC_HOP=m +CONFIG_6LOWPAN_NHC_IPV6=m +CONFIG_6LOWPAN_NHC_MOBILITY=m +CONFIG_6LOWPAN_NHC_ROUTING=m +CONFIG_6LOWPAN_NHC_UDP=m +CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m +CONFIG_6LOWPAN_GHC_UDP=m +CONFIG_6LOWPAN_GHC_ICMPV6=m +CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m +CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m +CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y +CONFIG_IEEE802154_SOCKET=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_CBS=m +CONFIG_NET_SCH_ETF=m +CONFIG_NET_SCH_TAPRIO=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_SKBPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=y +CONFIG_NET_SCH_CAKE=m +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_HHF=m +CONFIG_NET_SCH_PIE=m +CONFIG_NET_SCH_FQ_PIE=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_ETS=m +CONFIG_NET_SCH_DEFAULT=y +# CONFIG_DEFAULT_FQ is not set +# CONFIG_DEFAULT_CODEL is not set +CONFIG_DEFAULT_FQ_CODEL=y +# CONFIG_DEFAULT_SFQ is not set +# CONFIG_DEFAULT_PFIFO_FAST is not set +CONFIG_DEFAULT_NET_SCH="fq_codel" + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=m +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_CANID=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_IPT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_MPLS=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_CTINFO=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +CONFIG_NET_TC_SKB_EXT=y +CONFIG_NET_SCH_FIFO=y +CONFIG_DCB=y +CONFIG_DNS_RESOLVER=m +CONFIG_BATMAN_ADV=m +CONFIG_BATMAN_ADV_BATMAN_V=y +CONFIG_BATMAN_ADV_BLA=y +CONFIG_BATMAN_ADV_DAT=y +CONFIG_BATMAN_ADV_NC=y +CONFIG_BATMAN_ADV_MCAST=y +CONFIG_BATMAN_ADV_DEBUGFS=y +# CONFIG_BATMAN_ADV_DEBUG is not set +CONFIG_BATMAN_ADV_SYSFS=y +# CONFIG_BATMAN_ADV_TRACING is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m 
+CONFIG_VSOCKETS=m +CONFIG_VSOCKETS_DIAG=m +CONFIG_VSOCKETS_LOOPBACK=m +CONFIG_VMWARE_VMCI_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS_COMMON=m +CONFIG_HYPERV_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m +CONFIG_HSR=m +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_NCSI=y +CONFIG_NCSI_OEM_CMD_GET_MAC=y +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_XPS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_STREAM_PARSER=y +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +CONFIG_NET_PKTGEN=m +CONFIG_NET_DROP_MONITOR=y +# end of Network testing +# end of Networking options + +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_MKISS=m +CONFIG_6PACK=m +CONFIG_BPQETHER=m +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m +CONFIG_BAYCOM_PAR=m +CONFIG_YAM=m +# end of AX.25 network device drivers + +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_GW=m +CONFIG_CAN_J1939=m + +# +# CAN Device Drivers +# +CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_CALC_BITTIMING=y +CONFIG_CAN_FLEXCAN=m +CONFIG_CAN_GRCAN=m +CONFIG_CAN_JANZ_ICAN3=m +CONFIG_CAN_KVASER_PCIEFD=m +CONFIG_CAN_C_CAN=m +CONFIG_CAN_C_CAN_PLATFORM=m +CONFIG_CAN_C_CAN_PCI=m +CONFIG_CAN_CC770=m +# CONFIG_CAN_CC770_ISA is not set +CONFIG_CAN_CC770_PLATFORM=m +CONFIG_CAN_IFI_CANFD=m +CONFIG_CAN_M_CAN=m +CONFIG_CAN_M_CAN_PLATFORM=m +CONFIG_CAN_M_CAN_TCAN4X5X=m +CONFIG_CAN_PEAK_PCIEFD=m +CONFIG_CAN_SJA1000=m +CONFIG_CAN_EMS_PCI=m +# CONFIG_CAN_EMS_PCMCIA is not set +CONFIG_CAN_F81601=m +CONFIG_CAN_KVASER_PCI=m +CONFIG_CAN_PEAK_PCI=m +CONFIG_CAN_PEAK_PCIEC=y +CONFIG_CAN_PEAK_PCMCIA=m +CONFIG_CAN_PLX_PCI=m +# CONFIG_CAN_SJA1000_ISA is not set +CONFIG_CAN_SJA1000_PLATFORM=m +CONFIG_CAN_SOFTING=m +CONFIG_CAN_SOFTING_CS=m + +# +# CAN SPI interfaces +# +CONFIG_CAN_HI311X=m +CONFIG_CAN_MCP251X=m +# end of CAN SPI interfaces + +# +# CAN USB interfaces +# +CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_EMS_USB=m +CONFIG_CAN_ESD_USB2=m +CONFIG_CAN_GS_USB=m +CONFIG_CAN_KVASER_USB=m +CONFIG_CAN_MCBA_USB=m +CONFIG_CAN_PEAK_USB=m +CONFIG_CAN_UCAN=m +# end of CAN USB interfaces + +# CONFIG_CAN_DEBUG_DEVICES is not set +# end of CAN Device Drivers + +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_BT_6LOWPAN=m +CONFIG_BT_LEDS=y +# CONFIG_BT_SELFTEST is not set +CONFIG_BT_DEBUGFS=y + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_QCA=m +CONFIG_BT_HCIBTUSB=m +CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_MTK=y +CONFIG_BT_HCIBTUSB_RTL=y +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_3WIRE=y +CONFIG_BT_HCIUART_INTEL=y +CONFIG_BT_HCIUART_BCM=y +CONFIG_BT_HCIUART_RTL=y +CONFIG_BT_HCIUART_QCA=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIUART_MRVL=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m 
+CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_BT_ATH3K=m +CONFIG_BT_MTKSDIO=m +CONFIG_BT_MTKUART=m +CONFIG_BT_HCIRSI=m +# end of Bluetooth device drivers + +CONFIG_AF_RXRPC=m +CONFIG_AF_RXRPC_IPV6=y +# CONFIG_AF_RXRPC_INJECT_LOSS is not set +CONFIG_AF_RXRPC_DEBUG=y +CONFIG_RXKAD=y +CONFIG_AF_KCM=m +CONFIG_STREAM_PARSER=y +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y +CONFIG_CFG80211=m +# CONFIG_NL80211_TESTMODE is not set +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +# CONFIG_CFG80211_CERTIFICATION_ONUS is not set +CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y +CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y +CONFIG_CFG80211_DEFAULT_PS=y +CONFIG_CFG80211_DEBUGFS=y +CONFIG_CFG80211_CRDA_SUPPORT=y +CONFIG_CFG80211_WEXT=y +CONFIG_CFG80211_WEXT_EXPORT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m +CONFIG_LIB80211_CRYPT_TKIP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +CONFIG_MAC80211_MESH=y +CONFIG_MAC80211_LEDS=y +CONFIG_MAC80211_DEBUGFS=y +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_WIMAX=m +CONFIG_WIMAX_DEBUG_LEVEL=8 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=m +CONFIG_NET_9P=m +CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_XEN=m +CONFIG_NET_9P_RDMA=m +# CONFIG_NET_9P_DEBUG is not set +CONFIG_CAIF=m +# CONFIG_CAIF_DEBUG is not set +CONFIG_CAIF_NETDEV=m +CONFIG_CAIF_USB=m +CONFIG_CEPH_LIB=m +CONFIG_CEPH_LIB_PRETTYDEBUG=y +CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y +CONFIG_NFC=m +CONFIG_NFC_DIGITAL=m +CONFIG_NFC_NCI=m +CONFIG_NFC_NCI_SPI=m +CONFIG_NFC_NCI_UART=m +CONFIG_NFC_HCI=m +CONFIG_NFC_SHDLC=y + +# +# Near Field Communication (NFC) devices +# +CONFIG_NFC_TRF7970A=m +CONFIG_NFC_MEI_PHY=m +CONFIG_NFC_SIM=m +CONFIG_NFC_PORT100=m +CONFIG_NFC_FDP=m +CONFIG_NFC_FDP_I2C=m +CONFIG_NFC_PN544=m +CONFIG_NFC_PN544_I2C=m +CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m +CONFIG_NFC_PN532_UART=m +CONFIG_NFC_MICROREAD=m +CONFIG_NFC_MICROREAD_I2C=m +CONFIG_NFC_MICROREAD_MEI=m +CONFIG_NFC_MRVL=m +CONFIG_NFC_MRVL_USB=m +CONFIG_NFC_MRVL_UART=m +CONFIG_NFC_MRVL_I2C=m +CONFIG_NFC_MRVL_SPI=m +CONFIG_NFC_ST21NFCA=m +CONFIG_NFC_ST21NFCA_I2C=m +CONFIG_NFC_ST_NCI=m +CONFIG_NFC_ST_NCI_I2C=m +CONFIG_NFC_ST_NCI_SPI=m +CONFIG_NFC_NXP_NCI=m +CONFIG_NFC_NXP_NCI_I2C=m +CONFIG_NFC_S3FWRN5=m +CONFIG_NFC_S3FWRN5_I2C=m +CONFIG_NFC_ST95HF=m +# end of Near Field Communication (NFC) devices + +CONFIG_PSAMPLE=m +CONFIG_NET_IFE=m +CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y +CONFIG_SOCK_VALIDATE_XMIT=y +CONFIG_NET_SOCK_MSG=y +CONFIG_NET_DEVLINK=y +CONFIG_PAGE_POOL=y +CONFIG_FAILOVER=m +CONFIG_ETHTOOL_NETLINK=y +CONFIG_HAVE_EBPF_JIT=y + +# +# Device Drivers +# +CONFIG_HAVE_EISA=y +# CONFIG_EISA is not set +CONFIG_HAVE_PCI=y +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI_PCIE=y +CONFIG_PCIEAER=y +# CONFIG_PCIEAER_INJECT is not set +CONFIG_PCIE_ECRC=y +CONFIG_PCIEASPM=y +CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_POWERSAVE is not set +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_PTM=y +# CONFIG_PCIE_BW is not set +CONFIG_PCIE_EDR=y 
+CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +CONFIG_PCI_QUIRKS=y +# CONFIG_PCI_DEBUG is not set +CONFIG_PCI_REALLOC_ENABLE_AUTO=y +CONFIG_PCI_STUB=y +CONFIG_PCI_PF_STUB=m +CONFIG_XEN_PCIDEV_FRONTEND=m +CONFIG_PCI_ATS=y +CONFIG_PCI_ECAM=y +CONFIG_PCI_LOCKLESS_CONFIG=y +CONFIG_PCI_IOV=y +CONFIG_PCI_PRI=y +CONFIG_PCI_PASID=y +CONFIG_PCI_P2PDMA=y +CONFIG_PCI_LABEL=y +CONFIG_PCI_HYPERV=m +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +CONFIG_HOTPLUG_PCI_ACPI_IBM=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m +CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +CONFIG_HOTPLUG_PCI_SHPC=y + +# +# PCI controller drivers +# +CONFIG_PCI_FTPCI100=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCIE_XILINX=y +CONFIG_VMD=m +CONFIG_PCI_HYPERV_INTERFACE=m + +# +# DesignWare PCI Core Support +# +CONFIG_PCIE_DW=y +CONFIG_PCIE_DW_HOST=y +CONFIG_PCIE_DW_EP=y +CONFIG_PCIE_DW_PLAT=y +CONFIG_PCIE_DW_PLAT_HOST=y +CONFIG_PCIE_DW_PLAT_EP=y +CONFIG_PCIE_INTEL_GW=y +CONFIG_PCI_MESON=y +# end of DesignWare PCI Core Support + +# +# Mobiveil PCIe Core Support +# +# end of Mobiveil PCIe Core Support + +# +# Cadence PCIe controllers support +# +CONFIG_PCIE_CADENCE=y +CONFIG_PCIE_CADENCE_HOST=y +CONFIG_PCIE_CADENCE_EP=y +CONFIG_PCIE_CADENCE_PLAT=y +CONFIG_PCIE_CADENCE_PLAT_HOST=y +CONFIG_PCIE_CADENCE_PLAT_EP=y +# end of Cadence PCIe controllers support +# end of PCI controller drivers + +# +# PCI Endpoint +# +CONFIG_PCI_ENDPOINT=y +CONFIG_PCI_ENDPOINT_CONFIGFS=y +# CONFIG_PCI_EPF_TEST is not set +# end of PCI Endpoint + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m +# end of PCI switch controller drivers + +CONFIG_PCCARD=m +CONFIG_PCMCIA=m +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=m +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +CONFIG_PD6729=m +CONFIG_I82092=m +CONFIG_PCCARD_NONSTATIC=y +CONFIG_RAPIDIO=m +CONFIG_RAPIDIO_TSI721=m +CONFIG_RAPIDIO_DISC_TIMEOUT=30 +CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y +CONFIG_RAPIDIO_DMA_ENGINE=y +# CONFIG_RAPIDIO_DEBUG is not set +CONFIG_RAPIDIO_ENUM_BASIC=m +CONFIG_RAPIDIO_CHMAN=m +CONFIG_RAPIDIO_MPORT_CDEV=m + +# +# RapidIO Switch drivers +# +CONFIG_RAPIDIO_TSI57X=m +CONFIG_RAPIDIO_CPS_XX=m +CONFIG_RAPIDIO_TSI568=m +CONFIG_RAPIDIO_CPS_GEN2=m +CONFIG_RAPIDIO_RXS_GEN3=m +# end of RapidIO Switch drivers + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y + +# +# Firmware loader +# +CONFIG_FW_LOADER=y +CONFIG_FW_LOADER_PAGED_BUF=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_FW_LOADER_USER_HELPER is not set +CONFIG_FW_LOADER_COMPRESS=y +CONFIG_FW_CACHE=y +# end of Firmware loader + +CONFIG_WANT_DEV_COREDUMP=y +CONFIG_ALLOW_DEV_COREDUMP=y +CONFIG_DEV_COREDUMP=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +CONFIG_HMEM_REPORTING=y +# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set +CONFIG_SYS_HYPERVISOR=y +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=y +CONFIG_REGMAP_SLIMBUS=m +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_SPMI=m +CONFIG_REGMAP_W1=m +CONFIG_REGMAP_MMIO=y +CONFIG_REGMAP_IRQ=y +CONFIG_REGMAP_SOUNDWIRE=m +CONFIG_REGMAP_SCCB=m +CONFIG_REGMAP_I3C=m +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +# end of Generic Driver Options + +# +# Bus devices +# +CONFIG_MOXTET=m +CONFIG_SIMPLE_PM_BUS=y +CONFIG_MHI_BUS=m +# 
end of Bus devices + +CONFIG_CONNECTOR=y +CONFIG_PROC_EVENTS=y +CONFIG_GNSS=m +CONFIG_GNSS_SERIAL=m +CONFIG_GNSS_MTK_SERIAL=m +CONFIG_GNSS_SIRF_SERIAL=m +CONFIG_GNSS_UBX_SERIAL=m +CONFIG_MTD=m +CONFIG_MTD_TESTS=m + +# +# Partition parsers +# +CONFIG_MTD_AR7_PARTS=m +CONFIG_MTD_CMDLINE_PARTS=m +CONFIG_MTD_OF_PARTS=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 +# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set +# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set +# end of Partition parsers + +# +# User Modules And Translation Layers +# +CONFIG_MTD_BLKDEVS=m +CONFIG_MTD_BLOCK=m +CONFIG_MTD_BLOCK_RO=m +CONFIG_FTL=m +CONFIG_NFTL=m +CONFIG_NFTL_RW=y +CONFIG_INFTL=m +CONFIG_RFD_FTL=m +CONFIG_SSFDC=m +CONFIG_SM_FTL=m +CONFIG_MTD_OOPS=m +CONFIG_MTD_SWAP=m +CONFIG_MTD_PARTITIONED_MASTER=y + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +CONFIG_MTD_CFI_INTELEXT=m +CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +CONFIG_MTD_CFI_UTIL=m +CONFIG_MTD_RAM=m +CONFIG_MTD_ROM=m +CONFIG_MTD_ABSENT=m +# end of RAM/ROM/Flash chip drivers + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +# CONFIG_MTD_PHYSMAP_COMPAT is not set +CONFIG_MTD_PHYSMAP_OF=y +CONFIG_MTD_PHYSMAP_VERSATILE=y +CONFIG_MTD_PHYSMAP_GEMINI=y +CONFIG_MTD_PHYSMAP_GPIO_ADDR=y +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICHXROM=m +CONFIG_MTD_ESB2ROM=m +CONFIG_MTD_CK804XROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m +CONFIG_MTD_PCMCIA=m +# CONFIG_MTD_PCMCIA_ANONYMOUS is not set +CONFIG_MTD_INTEL_VR_NOR=m +CONFIG_MTD_PLATRAM=m +# end of Mapping drivers for chip access + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m +# CONFIG_MTD_PMC551_BUGFIX is not set +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_DATAFLASH=m +# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set +CONFIG_MTD_DATAFLASH_OTP=y +CONFIG_MTD_MCHP23K256=m +CONFIG_MTD_SST25L=m +CONFIG_MTD_SLRAM=m +CONFIG_MTD_PHRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLOCK2MTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOCG3=m +CONFIG_BCH_CONST_M=14 +CONFIG_BCH_CONST_T=4 +# end of Self-contained MTD device drivers + +CONFIG_MTD_NAND_CORE=m +CONFIG_MTD_ONENAND=m +# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set +CONFIG_MTD_ONENAND_GENERIC=m +CONFIG_MTD_ONENAND_OTP=y +CONFIG_MTD_ONENAND_2X_PROGRAM=y +CONFIG_MTD_NAND_ECC_SW_HAMMING=m +CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y +CONFIG_MTD_RAW_NAND=m +CONFIG_MTD_NAND_ECC_SW_BCH=y + +# +# Raw/parallel NAND flash controllers +# +CONFIG_MTD_NAND_DENALI=m +CONFIG_MTD_NAND_DENALI_PCI=m +CONFIG_MTD_NAND_DENALI_DT=m +CONFIG_MTD_NAND_CAFE=m +CONFIG_MTD_NAND_MXIC=m +CONFIG_MTD_NAND_GPIO=m +CONFIG_MTD_NAND_PLATFORM=m +CONFIG_MTD_NAND_CADENCE=m + +# +# Misc +# +CONFIG_MTD_SM_COMMON=m +CONFIG_MTD_NAND_NANDSIM=m +CONFIG_MTD_NAND_RICOH=m +CONFIG_MTD_NAND_DISKONCHIP=m +# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 +CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y +CONFIG_MTD_SPI_NAND=m + +# +# LPDDR & LPDDR2 PCM memory drivers +# +CONFIG_MTD_LPDDR=m +CONFIG_MTD_QINFO_PROBE=m +# end of LPDDR & LPDDR2 PCM memory drivers + +CONFIG_MTD_SPI_NOR=m +CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y +CONFIG_SPI_INTEL_SPI=m 
+CONFIG_SPI_INTEL_SPI_PCI=m +CONFIG_SPI_INTEL_SPI_PLATFORM=m +CONFIG_MTD_UBI=m +CONFIG_MTD_UBI_WL_THRESHOLD=4096 +CONFIG_MTD_UBI_BEB_LIMIT=20 +CONFIG_MTD_UBI_FASTMAP=y +CONFIG_MTD_UBI_GLUEBI=m +CONFIG_MTD_UBI_BLOCK=y +CONFIG_MTD_HYPERBUS=m +CONFIG_DTC=y +CONFIG_OF=y +# CONFIG_OF_UNITTEST is not set +CONFIG_OF_FLATTREE=y +CONFIG_OF_KOBJ=y +CONFIG_OF_DYNAMIC=y +CONFIG_OF_ADDRESS=y +CONFIG_OF_IRQ=y +CONFIG_OF_NET=y +CONFIG_OF_MDIO=m +CONFIG_OF_RESOLVE=y +CONFIG_OF_OVERLAY=y +CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_PC_PCMCIA=m +CONFIG_PARPORT_AX88796=m +CONFIG_PARPORT_1284=y +CONFIG_PARPORT_NOT_PC=y +CONFIG_PNP=y +CONFIG_PNP_DEBUG_MESSAGES=y + +# +# Protocols +# +CONFIG_PNPACPI=y +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_BLK_DEV_FD=m +CONFIG_CDROM=m +# CONFIG_PARIDE is not set +CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m +CONFIG_ZRAM=m +CONFIG_ZRAM_WRITEBACK=y +# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m +# CONFIG_DRBD_FAULT_INJECTION is not set +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_SKD=m +CONFIG_BLK_DEV_SX8=m +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_CDROM_PKTCDVD=m +CONFIG_CDROM_PKTCDVD_BUFFERS=8 +# CONFIG_CDROM_PKTCDVD_WCACHE is not set +CONFIG_ATA_OVER_ETH=m +CONFIG_XEN_BLKDEV_FRONTEND=m +CONFIG_XEN_BLKDEV_BACKEND=m +CONFIG_VIRTIO_BLK=m +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_RSXX=m + +# +# NVME Support +# +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y +CONFIG_NVME_MULTIPATH=y +CONFIG_NVME_HWMON=y +CONFIG_NVME_FABRICS=m +CONFIG_NVME_RDMA=m +CONFIG_NVME_FC=m +CONFIG_NVME_TCP=m +CONFIG_NVME_TARGET=m +CONFIG_NVME_TARGET_LOOP=m +CONFIG_NVME_TARGET_RDMA=m +CONFIG_NVME_TARGET_FC=m +CONFIG_NVME_TARGET_FCLOOP=m +CONFIG_NVME_TARGET_TCP=m +# end of NVME Support + +# +# Misc devices +# +CONFIG_SENSORS_LIS3LV02D=m +CONFIG_AD525X_DPOT=m +CONFIG_AD525X_DPOT_I2C=m +CONFIG_AD525X_DPOT_SPI=m +# CONFIG_DUMMY_IRQ is not set +CONFIG_IBM_ASM=m +CONFIG_PHANTOM=m +CONFIG_TIFM_CORE=m +CONFIG_TIFM_7XX1=m +CONFIG_ICS932S401=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_HP_ILO=m +CONFIG_APDS9802ALS=m +CONFIG_ISL29003=m +CONFIG_ISL29020=m +CONFIG_SENSORS_TSL2550=m +CONFIG_SENSORS_BH1770=m +CONFIG_SENSORS_APDS990X=m +CONFIG_HMC6352=m +CONFIG_DS1682=m +CONFIG_VMWARE_BALLOON=m +CONFIG_LATTICE_ECP3_CONFIG=m +# CONFIG_SRAM is not set +CONFIG_PCI_ENDPOINT_TEST=m +CONFIG_XILINX_SDFEC=m +CONFIG_MISC_RTSX=m +CONFIG_PVPANIC=m +CONFIG_C2PORT=m +CONFIG_C2PORT_DURAMAR_2150=m + +# +# EEPROM support +# +CONFIG_EEPROM_AT24=m +# CONFIG_EEPROM_AT25 is not set +CONFIG_EEPROM_LEGACY=m +CONFIG_EEPROM_MAX6875=m +CONFIG_EEPROM_93CX6=m +# CONFIG_EEPROM_93XX46 is not set +CONFIG_EEPROM_IDT_89HPESX=m +CONFIG_EEPROM_EE1004=m +# end of EEPROM support + +CONFIG_CB710_CORE=m +# CONFIG_CB710_DEBUG is not set +CONFIG_CB710_DEBUG_ASSUMPTIONS=y + +# +# Texas Instruments shared transport line discipline +# +CONFIG_TI_ST=m +# end of Texas Instruments shared transport line discipline + +CONFIG_SENSORS_LIS3_I2C=m +CONFIG_ALTERA_STAPL=m +CONFIG_INTEL_MEI=m +CONFIG_INTEL_MEI_ME=m +CONFIG_INTEL_MEI_TXE=m +CONFIG_INTEL_MEI_HDCP=m +CONFIG_VMWARE_VMCI=m + +# +# Intel MIC & related support +# +CONFIG_INTEL_MIC_BUS=m +CONFIG_SCIF_BUS=m +CONFIG_VOP_BUS=m +CONFIG_INTEL_MIC_HOST=m +CONFIG_INTEL_MIC_CARD=m +CONFIG_SCIF=m +CONFIG_MIC_COSM=m +CONFIG_VOP=m +# end of 
Intel MIC & related support + +CONFIG_GENWQE=m +CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 +CONFIG_ECHO=m +CONFIG_MISC_ALCOR_PCI=m +CONFIG_MISC_RTSX_PCI=m +CONFIG_MISC_RTSX_USB=m +CONFIG_HABANA_AI=m +CONFIG_UACCE=m +# end of Misc devices + +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +CONFIG_SCSI_MOD=y +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=y +CONFIG_SCSI_DMA=y +CONFIG_SCSI_NETLINK=y +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=y +CONFIG_CHR_DEV_ST=m +CONFIG_BLK_DEV_SR=m +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y + +# +# SCSI Transports +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SAS_ATA=y +CONFIG_SCSI_SAS_HOST_SMP=y +CONFIG_SCSI_SRP_ATTRS=m +# end of SCSI Transports + +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=m +CONFIG_ISCSI_BOOT_SYSFS=m +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_SCSI_BNX2X_FCOE=m +CONFIG_BE2ISCSI=m +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m +CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +CONFIG_AIC7XXX_DEBUG_ENABLE=y +CONFIG_AIC7XXX_DEBUG_MASK=0 +CONFIG_AIC7XXX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +CONFIG_AIC79XX_DEBUG_ENABLE=y +CONFIG_AIC79XX_DEBUG_MASK=0 +CONFIG_AIC79XX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC94XX=m +CONFIG_AIC94XX_DEBUG=y +CONFIG_SCSI_MVSAS=m +CONFIG_SCSI_MVSAS_DEBUG=y +CONFIG_SCSI_MVSAS_TASKLET=y +CONFIG_SCSI_MVUMI=m +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_ARCMSR=m +CONFIG_SCSI_ESAS2R=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT3SAS=m +CONFIG_SCSI_MPT2SAS_MAX_SGE=128 +CONFIG_SCSI_MPT3SAS_MAX_SGE=128 +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_SMARTPQI=m +CONFIG_SCSI_UFSHCD=m +CONFIG_SCSI_UFSHCD_PCI=m +# CONFIG_SCSI_UFS_DWC_TC_PCI is not set +CONFIG_SCSI_UFSHCD_PLATFORM=m +CONFIG_SCSI_UFS_CDNS_PLATFORM=m +# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set +CONFIG_SCSI_UFS_BSG=y +CONFIG_SCSI_HPTIOP=m +CONFIG_SCSI_BUSLOGIC=m +CONFIG_SCSI_FLASHPOINT=y +CONFIG_SCSI_MYRB=m +CONFIG_SCSI_MYRS=m +CONFIG_VMWARE_PVSCSI=m +CONFIG_XEN_SCSI_FRONTEND=m +CONFIG_HYPERV_STORAGE=m +CONFIG_LIBFC=m +CONFIG_LIBFCOE=m +CONFIG_FCOE=m +CONFIG_FCOE_FNIC=m +CONFIG_SCSI_SNIC=m +# CONFIG_SCSI_SNIC_DEBUG_FS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_FDOMAIN=m +CONFIG_SCSI_FDOMAIN_PCI=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_ISCI=m +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_STEX=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +CONFIG_SCSI_SYM53C8XX_MMIO=y +CONFIG_SCSI_IPR=m +CONFIG_SCSI_IPR_TRACE=y +CONFIG_SCSI_IPR_DUMP=y +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA_FC=m +CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_QEDI=m +CONFIG_QEDF=m +CONFIG_SCSI_LPFC=m +# CONFIG_SCSI_LPFC_DEBUG_FS is not set +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_WD719X=m 
+CONFIG_SCSI_DEBUG=m +CONFIG_SCSI_PMCRAID=m +CONFIG_SCSI_PM8001=m +CONFIG_SCSI_BFA_FC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_CHELSIO_FCOE=m +CONFIG_SCSI_LOWLEVEL_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_QLOGIC=m +CONFIG_PCMCIA_SYM53C500=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +# end of SCSI device support + +CONFIG_ATA=y +CONFIG_SATA_HOST=y +CONFIG_PATA_TIMINGS=y +CONFIG_ATA_VERBOSE_ERROR=y +CONFIG_ATA_FORCE=y +CONFIG_ATA_ACPI=y +CONFIG_SATA_ZPODD=y +CONFIG_SATA_PMP=y + +# +# Controllers with non-SFF native interface +# +CONFIG_SATA_AHCI=y +CONFIG_SATA_MOBILE_LPM_POLICY=3 +CONFIG_SATA_AHCI_PLATFORM=m +CONFIG_AHCI_CEVA=m +CONFIG_AHCI_QORIQ=m +CONFIG_SATA_INIC162X=m +CONFIG_SATA_ACARD_AHCI=m +CONFIG_SATA_SIL24=m +CONFIG_ATA_SFF=y + +# +# SFF controllers with custom DMA interface +# +CONFIG_PDC_ADMA=m +CONFIG_SATA_QSTOR=m +CONFIG_SATA_SX4=m +CONFIG_ATA_BMDMA=y + +# +# SATA SFF controllers with BMDMA +# +CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set +CONFIG_SATA_MV=m +CONFIG_SATA_NV=m +CONFIG_SATA_PROMISE=m +CONFIG_SATA_SIL=m +CONFIG_SATA_SIS=m +CONFIG_SATA_SVW=m +CONFIG_SATA_ULI=m +CONFIG_SATA_VIA=m +CONFIG_SATA_VITESSE=m + +# +# PATA SFF controllers with BMDMA +# +CONFIG_PATA_ALI=m +CONFIG_PATA_AMD=m +CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATIIXP=m +CONFIG_PATA_ATP867X=m +CONFIG_PATA_CMD64X=m +CONFIG_PATA_CYPRESS=m +CONFIG_PATA_EFAR=m +CONFIG_PATA_HPT366=m +CONFIG_PATA_HPT37X=m +CONFIG_PATA_HPT3X2N=m +CONFIG_PATA_HPT3X3=m +CONFIG_PATA_HPT3X3_DMA=y +CONFIG_PATA_IT8213=m +CONFIG_PATA_IT821X=m +CONFIG_PATA_JMICRON=m +CONFIG_PATA_MARVELL=m +CONFIG_PATA_NETCELL=m +CONFIG_PATA_NINJA32=m +CONFIG_PATA_NS87415=m +CONFIG_PATA_OLDPIIX=m +CONFIG_PATA_OPTIDMA=m +CONFIG_PATA_PDC2027X=m +CONFIG_PATA_PDC_OLD=m +CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m +CONFIG_PATA_SCH=m +CONFIG_PATA_SERVERWORKS=m +CONFIG_PATA_SIL680=m +CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m +CONFIG_PATA_TRIFLEX=m +CONFIG_PATA_VIA=m +CONFIG_PATA_WINBOND=m + +# +# PIO-only SFF controllers +# +CONFIG_PATA_CMD640_PCI=m +CONFIG_PATA_MPIIX=m +CONFIG_PATA_NS87410=m +CONFIG_PATA_OPTI=m +CONFIG_PATA_PCMCIA=m +# CONFIG_PATA_PLATFORM is not set +CONFIG_PATA_RZ1000=m + +# +# Generic fallback / legacy drivers +# +CONFIG_PATA_ACPI=m +CONFIG_ATA_GENERIC=m +CONFIG_PATA_LEGACY=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +# CONFIG_BCACHE_DEBUG is not set +# CONFIG_BCACHE_CLOSURES_DEBUG is not set +CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +CONFIG_DM_DEBUG=y +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m +CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_CACHE=m +CONFIG_DM_CACHE_SMQ=m +CONFIG_DM_WRITECACHE=m +CONFIG_DM_ERA=m +CONFIG_DM_CLONE=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_DELAY=m +CONFIG_DM_DUST=m +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DM_SWITCH=m +CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_TARGET_CORE=m +CONFIG_TCM_IBLOCK=m 
+CONFIG_TCM_FILEIO=m +CONFIG_TCM_PSCSI=m +CONFIG_TCM_USER2=m +CONFIG_LOOPBACK_TARGET=m +CONFIG_TCM_FC=m +CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m +CONFIG_SBP_TARGET=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m +CONFIG_FUSION_MAX_SGE=128 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set + +# +# IEEE 1394 (FireWire) support +# +CONFIG_FIREWIRE=m +CONFIG_FIREWIRE_OHCI=m +CONFIG_FIREWIRE_SBP2=m +CONFIG_FIREWIRE_NET=m +CONFIG_FIREWIRE_NOSY=m +# end of IEEE 1394 (FireWire) support + +CONFIG_MACINTOSH_DRIVERS=y +CONFIG_MAC_EMUMOUSEBTN=m +CONFIG_NETDEVICES=y +CONFIG_MII=m +CONFIG_NET_CORE=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_WIREGUARD=m +# CONFIG_WIREGUARD_DEBUG is not set +CONFIG_EQUALIZER=m +CONFIG_NET_FC=y +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m +CONFIG_NET_TEAM_MODE_LOADBALANCE=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_IPVLAN_L3S=y +CONFIG_IPVLAN=m +CONFIG_IPVTAP=m +CONFIG_VXLAN=m +CONFIG_GENEVE=m +CONFIG_BAREUDP=m +CONFIG_GTP=m +CONFIG_MACSEC=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +CONFIG_NTB_NETDEV=m +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 +CONFIG_TUN=m +CONFIG_TAP=m +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m +CONFIG_SUNGEM_PHY=m +# CONFIG_ARCNET is not set +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m +# CONFIG_ATM_NICSTAR_USE_SUNI is not set +# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E=m +CONFIG_ATM_FORE200E_USE_TASKLET=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y +CONFIG_ATM_SOLOS=m +CONFIG_CAIF_DRIVERS=y +CONFIG_CAIF_TTY=m +CONFIG_CAIF_SPI_SLAVE=m +CONFIG_CAIF_SPI_SYNC=y +CONFIG_CAIF_HSI=m +CONFIG_CAIF_VIRTIO=m + +# +# Distributed Switch Architecture drivers +# +CONFIG_B53=m +# CONFIG_B53_SPI_DRIVER is not set +CONFIG_B53_MDIO_DRIVER=m +CONFIG_B53_MMAP_DRIVER=m +CONFIG_B53_SRAB_DRIVER=m +CONFIG_B53_SERDES=m +CONFIG_NET_DSA_BCM_SF2=m +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_LANTIQ_GSWIP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_MV88E6060=m +CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m +CONFIG_NET_DSA_MV88E6XXX=m +CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y +CONFIG_NET_DSA_MV88E6XXX_PTP=y +CONFIG_NET_DSA_AR9331=m +CONFIG_NET_DSA_SJA1105=m +CONFIG_NET_DSA_SJA1105_PTP=y +CONFIG_NET_DSA_SJA1105_TAS=y +CONFIG_NET_DSA_QCA8K=m +CONFIG_NET_DSA_REALTEK_SMI=m +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m +CONFIG_NET_DSA_VITESSE_VSC73XX=m 
+CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m +CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m +# end of Distributed Switch Architecture drivers + +CONFIG_ETHERNET=y +CONFIG_MDIO=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_3C589=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_NET_VENDOR_ADAPTEC=y +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_NET_VENDOR_AGERE=y +CONFIG_ET131X=m +CONFIG_NET_VENDOR_ALACRITECH=y +CONFIG_SLICOSS=m +CONFIG_NET_VENDOR_ALTEON=y +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_ALTERA_TSE=m +CONFIG_NET_VENDOR_AMAZON=y +CONFIG_ENA_ETHERNET=m +CONFIG_NET_VENDOR_AMD=y +CONFIG_AMD8111_ETH=m +CONFIG_PCNET32=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_AMD_XGBE=m +CONFIG_AMD_XGBE_DCB=y +CONFIG_AMD_XGBE_HAVE_ECC=y +CONFIG_NET_VENDOR_AQUANTIA=y +CONFIG_AQTION=m +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ATHEROS=y +CONFIG_ATL2=m +CONFIG_ATL1=m +CONFIG_ATL1E=m +CONFIG_ATL1C=m +CONFIG_ALX=m +CONFIG_NET_VENDOR_AURORA=y +CONFIG_AURORA_NB8800=m +CONFIG_NET_VENDOR_BROADCOM=y +CONFIG_B44=m +CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y +CONFIG_BCMGENET=m +CONFIG_BNX2=m +CONFIG_CNIC=m +CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y +CONFIG_BNX2X=m +CONFIG_BNX2X_SRIOV=y +CONFIG_SYSTEMPORT=m +CONFIG_BNXT=m +CONFIG_BNXT_SRIOV=y +CONFIG_BNXT_FLOWER_OFFLOAD=y +CONFIG_BNXT_DCB=y +CONFIG_BNXT_HWMON=y +CONFIG_NET_VENDOR_BROCADE=y +CONFIG_BNA=m +CONFIG_NET_VENDOR_CADENCE=y +CONFIG_MACB=m +CONFIG_MACB_USE_HWSTAMP=y +CONFIG_MACB_PCI=m +CONFIG_NET_VENDOR_CAVIUM=y +CONFIG_THUNDER_NIC_PF=m +CONFIG_THUNDER_NIC_VF=m +CONFIG_THUNDER_NIC_BGX=m +CONFIG_THUNDER_NIC_RGX=m +CONFIG_CAVIUM_PTP=m +CONFIG_LIQUIDIO=m +CONFIG_LIQUIDIO_VF=m +CONFIG_NET_VENDOR_CHELSIO=y +CONFIG_CHELSIO_T1=m +CONFIG_CHELSIO_T1_1G=y +CONFIG_CHELSIO_T3=m +CONFIG_CHELSIO_T4=m +CONFIG_CHELSIO_T4_DCB=y +CONFIG_CHELSIO_T4_FCOE=y +CONFIG_CHELSIO_T4VF=m +CONFIG_CHELSIO_LIB=m +CONFIG_NET_VENDOR_CISCO=y +CONFIG_ENIC=m +CONFIG_NET_VENDOR_CORTINA=y +CONFIG_GEMINI_ETHERNET=m +CONFIG_CX_ECAT=m +CONFIG_DNET=m +CONFIG_NET_VENDOR_DEC=y +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_DE2104X_DSL=0 +CONFIG_TULIP=m +CONFIG_TULIP_MWI=y +CONFIG_TULIP_MMIO=y +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_ULI526X=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_NET_VENDOR_DLINK=y +CONFIG_DL2K=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_NET_VENDOR_EMULEX=y +CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y +CONFIG_BE2NET_BE2=y +CONFIG_BE2NET_BE3=y +CONFIG_BE2NET_LANCER=y +CONFIG_BE2NET_SKYHAWK=y +CONFIG_NET_VENDOR_EZCHIP=y +CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_FUJITSU=y +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_NET_VENDOR_GOOGLE=y +CONFIG_GVE=m +CONFIG_NET_VENDOR_HUAWEI=y +CONFIG_HINIC=m +CONFIG_NET_VENDOR_I825XX=y +CONFIG_NET_VENDOR_INTEL=y +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_E1000E_HWTS=y +CONFIG_IGB=m +CONFIG_IGB_HWMON=y +CONFIG_IGB_DCA=y +CONFIG_IGBVF=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_IXGBE_HWMON=y +CONFIG_IXGBE_DCA=y +CONFIG_IXGBE_DCB=y +# CONFIG_IXGBE_IPSEC is not set +CONFIG_IXGBEVF=m +CONFIG_IXGBEVF_IPSEC=y +CONFIG_I40E=m +CONFIG_I40E_DCB=y +CONFIG_IAVF=m +CONFIG_I40EVF=m +CONFIG_ICE=m +CONFIG_FM10K=m +CONFIG_IGC=m +CONFIG_JME=m +CONFIG_NET_VENDOR_MARVELL=y +CONFIG_MVMDIO=m +CONFIG_SKGE=m +# CONFIG_SKGE_DEBUG is not set +CONFIG_SKGE_GENESIS=y +CONFIG_SKY2=m +# CONFIG_SKY2_DEBUG is not set +CONFIG_NET_VENDOR_MELLANOX=y +CONFIG_MLX4_EN=m +CONFIG_MLX4_EN_DCB=y +CONFIG_MLX4_CORE=m +CONFIG_MLX4_DEBUG=y +CONFIG_MLX4_CORE_GEN2=y 
+CONFIG_MLX5_CORE=m +CONFIG_MLX5_ACCEL=y +CONFIG_MLX5_FPGA=y +CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_EN_ARFS=y +CONFIG_MLX5_EN_RXNFC=y +CONFIG_MLX5_MPFS=y +CONFIG_MLX5_ESWITCH=y +CONFIG_MLX5_TC_CT=y +CONFIG_MLX5_CORE_EN_DCB=y +CONFIG_MLX5_CORE_IPOIB=y +CONFIG_MLX5_FPGA_IPSEC=y +CONFIG_MLX5_EN_IPSEC=y +CONFIG_MLX5_FPGA_TLS=y +CONFIG_MLX5_TLS=y +CONFIG_MLX5_EN_TLS=y +CONFIG_MLX5_SW_STEERING=y +CONFIG_MLXSW_CORE=m +CONFIG_MLXSW_CORE_HWMON=y +CONFIG_MLXSW_CORE_THERMAL=y +CONFIG_MLXSW_PCI=m +CONFIG_MLXSW_I2C=m +CONFIG_MLXSW_SWITCHIB=m +CONFIG_MLXSW_SWITCHX2=m +CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y +CONFIG_MLXSW_MINIMAL=m +CONFIG_MLXFW=m +CONFIG_NET_VENDOR_MICREL=y +CONFIG_KS8842=m +CONFIG_KS8851=m +CONFIG_KS8851_MLL=m +CONFIG_KSZ884X_PCI=m +CONFIG_NET_VENDOR_MICROCHIP=y +CONFIG_ENC28J60=m +# CONFIG_ENC28J60_WRITEVERIFY is not set +CONFIG_ENCX24J600=m +CONFIG_LAN743X=m +CONFIG_NET_VENDOR_MICROSEMI=y +CONFIG_MSCC_OCELOT_SWITCH=m +CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m +CONFIG_NET_VENDOR_MYRI=y +CONFIG_MYRI10GE=m +CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m +CONFIG_NET_VENDOR_NATSEMI=y +CONFIG_NATSEMI=m +CONFIG_NS83820=m +CONFIG_NET_VENDOR_NETERION=y +CONFIG_S2IO=m +CONFIG_VXGE=m +# CONFIG_VXGE_DEBUG_TRACE_ALL is not set +CONFIG_NET_VENDOR_NETRONOME=y +CONFIG_NFP=m +CONFIG_NFP_APP_FLOWER=y +CONFIG_NFP_APP_ABM_NIC=y +# CONFIG_NFP_DEBUG is not set +CONFIG_NET_VENDOR_NI=y +CONFIG_NI_XGE_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_8390=y +CONFIG_PCMCIA_AXNET=m +CONFIG_NE2K_PCI=m +CONFIG_PCMCIA_PCNET=m +CONFIG_NET_VENDOR_NVIDIA=y +CONFIG_FORCEDETH=m +CONFIG_NET_VENDOR_OKI=y +CONFIG_ETHOC=m +CONFIG_NET_VENDOR_PACKET_ENGINES=y +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_NET_VENDOR_PENSANDO=y +CONFIG_IONIC=m +CONFIG_NET_VENDOR_QLOGIC=y +CONFIG_QLA3XXX=m +CONFIG_QLCNIC=m +CONFIG_QLCNIC_SRIOV=y +CONFIG_QLCNIC_DCB=y +CONFIG_QLCNIC_HWMON=y +CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QED_LL2=y +CONFIG_QED_SRIOV=y +CONFIG_QEDE=m +CONFIG_QED_RDMA=y +CONFIG_QED_ISCSI=y +CONFIG_QED_FCOE=y +CONFIG_QED_OOO=y +CONFIG_NET_VENDOR_QUALCOMM=y +CONFIG_QCA7000=m +CONFIG_QCA7000_SPI=m +CONFIG_QCA7000_UART=m +CONFIG_QCOM_EMAC=m +CONFIG_RMNET=m +CONFIG_NET_VENDOR_RDC=y +CONFIG_R6040=m +CONFIG_NET_VENDOR_REALTEK=y +CONFIG_ATP=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_TUNE_TWISTER=y +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_R8169=m +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_ROCKER=m +CONFIG_NET_VENDOR_SAMSUNG=y +CONFIG_SXGBE_ETH=m +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SOLARFLARE=y +CONFIG_SFC=m +CONFIG_SFC_MTD=y +CONFIG_SFC_MCDI_MON=y +CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y +CONFIG_SFC_FALCON=m +CONFIG_SFC_FALCON_MTD=y +CONFIG_NET_VENDOR_SILAN=y +CONFIG_SC92031=m +CONFIG_NET_VENDOR_SIS=y +CONFIG_SIS900=m +CONFIG_SIS190=m +CONFIG_NET_VENDOR_SMSC=y +CONFIG_PCMCIA_SMC91C92=m +CONFIG_EPIC100=m +CONFIG_SMSC911X=m +CONFIG_SMSC9420=m +CONFIG_NET_VENDOR_SOCIONEXT=y +CONFIG_NET_VENDOR_STMICRO=y +CONFIG_STMMAC_ETH=m +# CONFIG_STMMAC_SELFTESTS is not set +CONFIG_STMMAC_PLATFORM=m +CONFIG_DWMAC_DWC_QOS_ETH=m +CONFIG_DWMAC_GENERIC=m +CONFIG_DWMAC_INTEL=m +CONFIG_STMMAC_PCI=m +CONFIG_NET_VENDOR_SUN=y +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_CASSINI=m +CONFIG_NIU=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m +CONFIG_NET_VENDOR_TEHUTI=y +CONFIG_TEHUTI=m +CONFIG_NET_VENDOR_TI=y +# CONFIG_TI_CPSW_PHY_SEL is not set +CONFIG_TLAN=m +CONFIG_NET_VENDOR_VIA=y +CONFIG_VIA_RHINE=m +CONFIG_VIA_RHINE_MMIO=y 
+CONFIG_VIA_VELOCITY=m +CONFIG_NET_VENDOR_WIZNET=y +CONFIG_WIZNET_W5100=m +CONFIG_WIZNET_W5300=m +# CONFIG_WIZNET_BUS_DIRECT is not set +# CONFIG_WIZNET_BUS_INDIRECT is not set +CONFIG_WIZNET_BUS_ANY=y +CONFIG_WIZNET_W5100_SPI=m +CONFIG_NET_VENDOR_XILINX=y +CONFIG_XILINX_AXI_EMAC=m +CONFIG_XILINX_LL_TEMAC=m +CONFIG_NET_VENDOR_XIRCOM=y +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_FDDI=m +CONFIG_DEFXX=m +CONFIG_DEFXX_MMIO=y +CONFIG_SKFP=m +# CONFIG_HIPPI is not set +CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +CONFIG_MDIO_BCM_UNIMAC=m +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_BUS_MUX=m +CONFIG_MDIO_BUS_MUX_GPIO=m +CONFIG_MDIO_BUS_MUX_MMIOREG=m +CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m +CONFIG_MDIO_CAVIUM=m +CONFIG_MDIO_GPIO=m +CONFIG_MDIO_HISI_FEMAC=m +CONFIG_MDIO_I2C=m +CONFIG_MDIO_IPQ8064=m +CONFIG_MDIO_MSCC_MIIM=m +CONFIG_MDIO_MVUSB=m +CONFIG_MDIO_OCTEON=m +CONFIG_MDIO_THUNDER=m +CONFIG_MDIO_XPCS=m +CONFIG_PHYLINK=m +CONFIG_PHYLIB=m +CONFIG_SWPHY=y +CONFIG_LED_TRIGGER_PHY=y + +# +# MII PHY device drivers +# +CONFIG_SFP=m +CONFIG_ADIN_PHY=m +CONFIG_AMD_PHY=m +CONFIG_AQUANTIA_PHY=m +CONFIG_AX88796B_PHY=m +CONFIG_BCM7XXX_PHY=m +CONFIG_BCM87XX_PHY=m +CONFIG_BCM_NET_PHYLIB=m +CONFIG_BROADCOM_PHY=m +CONFIG_BCM84881_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_CORTINA_PHY=m +CONFIG_DAVICOM_PHY=m +CONFIG_DP83822_PHY=m +CONFIG_DP83TC811_PHY=m +CONFIG_DP83848_PHY=m +CONFIG_DP83867_PHY=m +CONFIG_DP83869_PHY=m +CONFIG_FIXED_PHY=m +CONFIG_ICPLUS_PHY=m +CONFIG_INTEL_XWAY_PHY=m +CONFIG_LSI_ET1011C_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_MARVELL_10G_PHY=m +CONFIG_MICREL_PHY=m +CONFIG_MICROCHIP_PHY=m +CONFIG_MICROCHIP_T1_PHY=m +CONFIG_MICROSEMI_PHY=m +CONFIG_NATIONAL_PHY=m +CONFIG_NXP_TJA11XX_PHY=m +CONFIG_AT803X_PHY=m +CONFIG_QSEMI_PHY=m +CONFIG_REALTEK_PHY=m +CONFIG_RENESAS_PHY=m +CONFIG_ROCKCHIP_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_STE10XP=m +CONFIG_TERANETICS_PHY=m +CONFIG_VITESSE_PHY=m +CONFIG_XILINX_GMII2RGMII=m +CONFIG_MICREL_KS8995MA=m +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOATM=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_SLIP=m +CONFIG_SLHC=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y +CONFIG_USB_NET_DRIVERS=m +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=m +CONFIG_USB_USBNET=m +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_CDC_EEM=m +CONFIG_USB_NET_CDC_NCM=m +CONFIG_USB_NET_HUAWEI_CDC_NCM=m +CONFIG_USB_NET_CDC_MBIM=m +CONFIG_USB_NET_DM9601=m +CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=m +CONFIG_USB_NET_GL620A=m +CONFIG_USB_NET_NET1080=m +CONFIG_USB_NET_PLUSB=m +CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +CONFIG_USB_NET_CDC_SUBSET_ENABLE=m +CONFIG_USB_NET_CDC_SUBSET=m +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_KC2190=y +CONFIG_USB_NET_ZAURUS=m +CONFIG_USB_NET_CX82310_ETH=m +CONFIG_USB_NET_KALMIA=m +CONFIG_USB_NET_QMI_WWAN=m +CONFIG_USB_HSO=m +CONFIG_USB_NET_INT51X1=m +CONFIG_USB_CDC_PHONET=m +CONFIG_USB_IPHETH=m +CONFIG_USB_SIERRA_NET=m +CONFIG_USB_VL600=m +CONFIG_USB_NET_CH9200=m +CONFIG_USB_NET_AQC111=m +CONFIG_WLAN=y +# CONFIG_WIRELESS_WDS is not set +CONFIG_WLAN_VENDOR_ADMTEK=y +CONFIG_ADM8211=m +CONFIG_ATH_COMMON=m 
+CONFIG_WLAN_VENDOR_ATH=y +# CONFIG_ATH_DEBUG is not set +CONFIG_ATH5K=m +CONFIG_ATH5K_DEBUG=y +CONFIG_ATH5K_TRACER=y +CONFIG_ATH5K_PCI=y +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_COMMON_DEBUG=y +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +CONFIG_ATH9K_DEBUGFS=y +CONFIG_ATH9K_STATION_STATISTICS=y +CONFIG_ATH9K_DYNACK=y +CONFIG_ATH9K_WOW=y +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +CONFIG_ATH9K_PCI_NO_EEPROM=m +CONFIG_ATH9K_HTC=m +CONFIG_ATH9K_HTC_DEBUGFS=y +CONFIG_ATH9K_HWRNG=y +CONFIG_ATH9K_COMMON_SPECTRAL=y +CONFIG_CARL9170=m +CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_DEBUGFS=y +CONFIG_CARL9170_WPC=y +# CONFIG_CARL9170_HWRNG is not set +CONFIG_ATH6KL=m +CONFIG_ATH6KL_SDIO=m +CONFIG_ATH6KL_USB=m +CONFIG_ATH6KL_DEBUG=y +CONFIG_ATH6KL_TRACING=y +CONFIG_AR5523=m +CONFIG_WIL6210=m +CONFIG_WIL6210_ISR_COR=y +CONFIG_WIL6210_TRACING=y +CONFIG_WIL6210_DEBUGFS=y +CONFIG_ATH10K=m +CONFIG_ATH10K_CE=y +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_AHB=y +CONFIG_ATH10K_SDIO=m +CONFIG_ATH10K_USB=m +CONFIG_ATH10K_DEBUG=y +CONFIG_ATH10K_DEBUGFS=y +CONFIG_ATH10K_SPECTRAL=y +CONFIG_ATH10K_TRACING=y +CONFIG_WCN36XX=m +CONFIG_WCN36XX_DEBUGFS=y +CONFIG_WLAN_VENDOR_ATMEL=y +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_AT76C50X_USB=m +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +CONFIG_B43_SDIO=y +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +CONFIG_B43LEGACY=m +CONFIG_B43LEGACY_PCI_AUTOSELECT=y +CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y +CONFIG_B43LEGACY_LEDS=y +CONFIG_B43LEGACY_HWRNG=y +CONFIG_B43LEGACY_DEBUG=y +CONFIG_B43LEGACY_DMA=y +CONFIG_B43LEGACY_PIO=y +CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y +# CONFIG_B43LEGACY_DMA_MODE is not set +# CONFIG_B43LEGACY_PIO_MODE is not set +CONFIG_BRCMUTIL=m +CONFIG_BRCMSMAC=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_PROTO_MSGBUF=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +CONFIG_BRCMFMAC_PCIE=y +CONFIG_BRCM_TRACING=y +CONFIG_BRCMDBG=y +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_AIRO=m +CONFIG_AIRO_CS=m +CONFIG_WLAN_VENDOR_INTEL=y +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLEGACY=m +CONFIG_IWL4965=m +CONFIG_IWL3945=m + +# +# iwl3945 / iwl4965 Debugging Options +# +CONFIG_IWLEGACY_DEBUG=y +CONFIG_IWLEGACY_DEBUGFS=y +# end of iwl3945 / iwl4965 Debugging Options + +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_LEDS=y +CONFIG_IWLDVM=m +CONFIG_IWLMVM=m +CONFIG_IWLWIFI_OPMODE_MODULAR=y +# CONFIG_IWLWIFI_BCAST_FILTERING is not set + +# +# Debugging Options +# +CONFIG_IWLWIFI_DEBUG=y +CONFIG_IWLWIFI_DEBUGFS=y +CONFIG_IWLWIFI_DEVICE_TRACING=y +# end of Debugging Options + +CONFIG_WLAN_VENDOR_INTERSIL=y +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +CONFIG_HOSTAP_FIRMWARE_NVRAM=y +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_HERMES=m +CONFIG_HERMES_PRISM=y +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m 
+CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m +CONFIG_ORINOCO_USB=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_SPI=m +# CONFIG_P54_SPI_DEFAULT_EEPROM is not set +CONFIG_P54_LEDS=y +CONFIG_PRISM54=m +CONFIG_WLAN_VENDOR_MARVELL=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +CONFIG_LIBERTAS_SDIO=m +CONFIG_LIBERTAS_SPI=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_LIBERTAS_MESH=y +CONFIG_LIBERTAS_THINFIRM=m +# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set +CONFIG_LIBERTAS_THINFIRM_USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_MWIFIEX_PCIE=m +CONFIG_MWIFIEX_USB=m +CONFIG_MWL8K=m +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_MT76_CORE=m +CONFIG_MT76_LEDS=y +CONFIG_MT76_USB=m +CONFIG_MT76x02_LIB=m +CONFIG_MT76x02_USB=m +CONFIG_MT76x0_COMMON=m +CONFIG_MT76x0U=m +CONFIG_MT76x0E=m +CONFIG_MT76x2_COMMON=m +CONFIG_MT76x2E=m +CONFIG_MT76x2U=m +CONFIG_MT7603E=m +CONFIG_MT7615E=m +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +CONFIG_RT2400PCI=m +CONFIG_RT2500PCI=m +CONFIG_RT61PCI=m +CONFIG_RT2800PCI=m +CONFIG_RT2800PCI_RT33XX=y +CONFIG_RT2800PCI_RT35XX=y +CONFIG_RT2800PCI_RT53XX=y +CONFIG_RT2800PCI_RT3290=y +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2800_LIB_MMIO=m +CONFIG_RT2X00_LIB_MMIO=m +CONFIG_RT2X00_LIB_PCI=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m +CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +CONFIG_RT2X00_LIB_DEBUGFS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +CONFIG_RTL8180=m +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +CONFIG_RTL8192CE=m +CONFIG_RTL8192SE=m +CONFIG_RTL8192DE=m +CONFIG_RTL8723AE=m +CONFIG_RTL8723BE=m +CONFIG_RTL8188EE=m +CONFIG_RTL8192EE=m +CONFIG_RTL8821AE=m +CONFIG_RTL8192CU=m +CONFIG_RTLWIFI=m +CONFIG_RTLWIFI_PCI=m +CONFIG_RTLWIFI_USB=m +CONFIG_RTLWIFI_DEBUG=y +CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8723_COMMON=m +CONFIG_RTLBTCOEXIST=m +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_RTW88=m +CONFIG_RTW88_CORE=m +CONFIG_RTW88_PCI=m +CONFIG_RTW88_8822BE=y +CONFIG_RTW88_8822CE=y +CONFIG_RTW88_DEBUG=y +CONFIG_RTW88_DEBUGFS=y +CONFIG_WLAN_VENDOR_RSI=y +CONFIG_RSI_91X=m +CONFIG_RSI_DEBUGFS=y +CONFIG_RSI_SDIO=m +CONFIG_RSI_USB=m +CONFIG_RSI_COEX=y +CONFIG_WLAN_VENDOR_ST=y +CONFIG_CW1200=m +CONFIG_CW1200_WLAN_SDIO=m +CONFIG_CW1200_WLAN_SPI=m +CONFIG_WLAN_VENDOR_TI=y +CONFIG_WL1251=m +CONFIG_WL1251_SPI=m +CONFIG_WL1251_SDIO=m +CONFIG_WL12XX=m +CONFIG_WL18XX=m +CONFIG_WLCORE=m +CONFIG_WLCORE_SPI=m +CONFIG_WLCORE_SDIO=m +CONFIG_WILINK_PLATFORM_DATA=y +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_WLAN_VENDOR_QUANTENNA=y +CONFIG_QTNFMAC=m +CONFIG_QTNFMAC_PCIE=m +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_WL3501=m +CONFIG_MAC80211_HWSIM=m +CONFIG_USB_NET_RNDIS_WLAN=m +CONFIG_VIRT_WIFI=m + +# +# WiMAX Wireless Broadband devices +# +CONFIG_WIMAX_I2400M=m +CONFIG_WIMAX_I2400M_USB=m +CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 +# end of WiMAX Wireless Broadband devices + +# CONFIG_WAN is not set +CONFIG_IEEE802154_DRIVERS=m +CONFIG_IEEE802154_FAKELB=m +CONFIG_IEEE802154_AT86RF230=m +# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set +CONFIG_IEEE802154_MRF24J40=m +CONFIG_IEEE802154_CC2520=m +CONFIG_IEEE802154_ATUSB=m +CONFIG_IEEE802154_ADF7242=m 
+CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set +CONFIG_IEEE802154_MCR20A=m +CONFIG_IEEE802154_HWSIM=m +CONFIG_XEN_NETDEV_FRONTEND=m +CONFIG_XEN_NETDEV_BACKEND=m +CONFIG_VMXNET3=m +CONFIG_FUJITSU_ES=m +CONFIG_USB4_NET=m +CONFIG_HYPERV_NET=m +CONFIG_NETDEVSIM=m +CONFIG_NET_FAILOVER=m +CONFIG_ISDN=y +CONFIG_ISDN_CAPI=y +CONFIG_CAPI_TRACE=y +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_MISDN=m +CONFIG_MISDN_DSP=m +CONFIG_MISDN_L1OIP=m + +# +# mISDN hardware drivers +# +CONFIG_MISDN_HFCPCI=m +CONFIG_MISDN_HFCMULTI=m +CONFIG_MISDN_HFCUSB=m +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_HDLC=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_NVM=y +CONFIG_NVM_PBLK=m +# CONFIG_NVM_PBLK_DEBUG is not set + +# +# Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=m +CONFIG_INPUT_FF_MEMLESS=m +CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m +CONFIG_INPUT_MATRIXKMAP=m + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=m +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADC=m +CONFIG_KEYBOARD_ADP5520=m +CONFIG_KEYBOARD_ADP5588=m +CONFIG_KEYBOARD_ADP5589=m +CONFIG_KEYBOARD_APPLESPI=m +CONFIG_KEYBOARD_ATKBD=m +CONFIG_KEYBOARD_QT1050=m +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_QT2160=m +CONFIG_KEYBOARD_DLINK_DIR685=m +CONFIG_KEYBOARD_LKKBD=m +CONFIG_KEYBOARD_GPIO=m +CONFIG_KEYBOARD_GPIO_POLLED=m +CONFIG_KEYBOARD_TCA6416=m +CONFIG_KEYBOARD_TCA8418=m +CONFIG_KEYBOARD_MATRIX=m +CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_LM8333=m +CONFIG_KEYBOARD_MAX7359=m +CONFIG_KEYBOARD_MCS=m +CONFIG_KEYBOARD_MPR121=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_KEYBOARD_OPENCORES=m +CONFIG_KEYBOARD_SAMSUNG=m +CONFIG_KEYBOARD_STOWAWAY=m +CONFIG_KEYBOARD_SUNKBD=m +CONFIG_KEYBOARD_STMPE=m +CONFIG_KEYBOARD_IQS62X=m +CONFIG_KEYBOARD_OMAP4=m +CONFIG_KEYBOARD_TC3589X=m +CONFIG_KEYBOARD_TM2_TOUCHKEY=m +CONFIG_KEYBOARD_TWL4030=m +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_CROS_EC=m +CONFIG_KEYBOARD_CAP11XX=m +CONFIG_KEYBOARD_BCM=m +CONFIG_KEYBOARD_MTK_PMIC=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_BYD=y +CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y +CONFIG_MOUSE_PS2_CYPRESS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y +CONFIG_MOUSE_PS2_SENTELIC=y +CONFIG_MOUSE_PS2_TOUCHKIT=y +CONFIG_MOUSE_PS2_FOCALTECH=y +CONFIG_MOUSE_PS2_VMMOUSE=y +CONFIG_MOUSE_PS2_SMBUS=y +CONFIG_MOUSE_SERIAL=m +CONFIG_MOUSE_APPLETOUCH=m +CONFIG_MOUSE_BCM5974=m +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=m +CONFIG_MOUSE_ELAN_I2C_I2C=y +CONFIG_MOUSE_ELAN_I2C_SMBUS=y +CONFIG_MOUSE_VSXXXAA=m +CONFIG_MOUSE_GPIO=m +CONFIG_MOUSE_SYNAPTICS_I2C=m +CONFIG_MOUSE_SYNAPTICS_USB=m +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=m +CONFIG_JOYSTICK_IFORCE_232=m +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m 
+CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDJOY=m +CONFIG_JOYSTICK_ZHENHUA=m +CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +CONFIG_JOYSTICK_AS5011=m +CONFIG_JOYSTICK_JOYDUMP=m +CONFIG_JOYSTICK_XPAD=m +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +CONFIG_JOYSTICK_PXRC=m +CONFIG_JOYSTICK_FSIA6B=m +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=m +CONFIG_TABLET_USB_AIPTEK=m +CONFIG_TABLET_USB_GTCO=m +CONFIG_TABLET_USB_HANWANG=m +CONFIG_TABLET_USB_KBTAB=m +CONFIG_TABLET_USB_PEGASUS=m +CONFIG_TABLET_SERIAL_WACOM4=m +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_PROPERTIES=y +CONFIG_TOUCHSCREEN_88PM860X=m +CONFIG_TOUCHSCREEN_ADS7846=m +CONFIG_TOUCHSCREEN_AD7877=m +CONFIG_TOUCHSCREEN_AD7879=m +CONFIG_TOUCHSCREEN_AD7879_I2C=m +CONFIG_TOUCHSCREEN_AD7879_SPI=m +CONFIG_TOUCHSCREEN_ADC=m +CONFIG_TOUCHSCREEN_AR1021_I2C=m +CONFIG_TOUCHSCREEN_ATMEL_MXT=m +CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y +CONFIG_TOUCHSCREEN_AUO_PIXCIR=m +CONFIG_TOUCHSCREEN_BU21013=m +CONFIG_TOUCHSCREEN_BU21029=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m +CONFIG_TOUCHSCREEN_CY8CTMG110=m +CONFIG_TOUCHSCREEN_CYTTSP_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP_SPI=m +CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m +CONFIG_TOUCHSCREEN_DA9034=m +CONFIG_TOUCHSCREEN_DA9052=m +CONFIG_TOUCHSCREEN_DYNAPRO=m +CONFIG_TOUCHSCREEN_HAMPSHIRE=m +CONFIG_TOUCHSCREEN_EETI=m +CONFIG_TOUCHSCREEN_EGALAX=m +CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m +CONFIG_TOUCHSCREEN_EXC3000=m +CONFIG_TOUCHSCREEN_FUJITSU=m +CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_HIDEEP=m +CONFIG_TOUCHSCREEN_ILI210X=m +CONFIG_TOUCHSCREEN_S6SY761=m +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_TOUCHSCREEN_EKTF2127=m +CONFIG_TOUCHSCREEN_ELAN=m +CONFIG_TOUCHSCREEN_ELO=m +CONFIG_TOUCHSCREEN_WACOM_W8001=m +CONFIG_TOUCHSCREEN_WACOM_I2C=m +CONFIG_TOUCHSCREEN_MAX11801=m +CONFIG_TOUCHSCREEN_MCS5000=m +CONFIG_TOUCHSCREEN_MMS114=m +CONFIG_TOUCHSCREEN_MELFAS_MIP4=m +CONFIG_TOUCHSCREEN_MTOUCH=m +CONFIG_TOUCHSCREEN_IMX6UL_TSC=m +CONFIG_TOUCHSCREEN_INEXIO=m +CONFIG_TOUCHSCREEN_MK712=m +CONFIG_TOUCHSCREEN_PENMOUNT=m +CONFIG_TOUCHSCREEN_EDT_FT5X06=m +CONFIG_TOUCHSCREEN_TOUCHRIGHT=m +CONFIG_TOUCHSCREEN_TOUCHWIN=m +CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m +CONFIG_TOUCHSCREEN_UCB1400=m +CONFIG_TOUCHSCREEN_PIXCIR=m +CONFIG_TOUCHSCREEN_WDT87XX_I2C=m +CONFIG_TOUCHSCREEN_WM831X=m +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_TOUCHSCREEN_WM9705=y +CONFIG_TOUCHSCREEN_WM9712=y +CONFIG_TOUCHSCREEN_WM9713=y +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_MC13783=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y +CONFIG_TOUCHSCREEN_USB_PANJIT=y +CONFIG_TOUCHSCREEN_USB_3M=y +CONFIG_TOUCHSCREEN_USB_ITM=y +CONFIG_TOUCHSCREEN_USB_ETURBO=y +CONFIG_TOUCHSCREEN_USB_GUNZE=y +CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y +CONFIG_TOUCHSCREEN_USB_IRTOUCH=y +CONFIG_TOUCHSCREEN_USB_IDEALTEK=y +CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y +CONFIG_TOUCHSCREEN_USB_GOTOP=y +CONFIG_TOUCHSCREEN_USB_JASTEC=y +CONFIG_TOUCHSCREEN_USB_ELO=y +CONFIG_TOUCHSCREEN_USB_E2I=y +CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y +CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y +CONFIG_TOUCHSCREEN_USB_NEXIO=y +CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y +CONFIG_TOUCHSCREEN_TOUCHIT213=m +CONFIG_TOUCHSCREEN_TSC_SERIO=m +CONFIG_TOUCHSCREEN_TSC200X_CORE=m +CONFIG_TOUCHSCREEN_TSC2004=m +CONFIG_TOUCHSCREEN_TSC2005=m +CONFIG_TOUCHSCREEN_TSC2007=m +CONFIG_TOUCHSCREEN_TSC2007_IIO=y 
+CONFIG_TOUCHSCREEN_PCAP=m +CONFIG_TOUCHSCREEN_RM_TS=m +CONFIG_TOUCHSCREEN_SILEAD=m +CONFIG_TOUCHSCREEN_SIS_I2C=m +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMFTS=m +CONFIG_TOUCHSCREEN_STMPE=m +CONFIG_TOUCHSCREEN_SUR40=m +CONFIG_TOUCHSCREEN_SURFACE3_SPI=m +CONFIG_TOUCHSCREEN_SX8654=m +CONFIG_TOUCHSCREEN_TPS6507X=m +CONFIG_TOUCHSCREEN_ZET6223=m +CONFIG_TOUCHSCREEN_ZFORCE=m +CONFIG_TOUCHSCREEN_COLIBRI_VF50=m +CONFIG_TOUCHSCREEN_ROHM_BU21023=m +CONFIG_TOUCHSCREEN_IQS5XX=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_88PM860X_ONKEY=m +CONFIG_INPUT_88PM80X_ONKEY=m +CONFIG_INPUT_AD714X=m +CONFIG_INPUT_AD714X_I2C=m +CONFIG_INPUT_AD714X_SPI=m +CONFIG_INPUT_ARIZONA_HAPTICS=m +CONFIG_INPUT_ATMEL_CAPTOUCH=m +CONFIG_INPUT_BMA150=m +CONFIG_INPUT_E3X0_BUTTON=m +CONFIG_INPUT_MSM_VIBRATOR=m +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_MAX77650_ONKEY=m +CONFIG_INPUT_MAX77693_HAPTIC=m +CONFIG_INPUT_MAX8925_ONKEY=m +CONFIG_INPUT_MAX8997_HAPTIC=m +CONFIG_INPUT_MC13783_PWRBUTTON=m +CONFIG_INPUT_MMA8450=m +CONFIG_INPUT_APANEL=m +CONFIG_INPUT_GP2A=m +CONFIG_INPUT_GPIO_BEEPER=m +CONFIG_INPUT_GPIO_DECODER=m +CONFIG_INPUT_GPIO_VIBRA=m +CONFIG_INPUT_CPCAP_PWRBUTTON=m +CONFIG_INPUT_ATLAS_BTNS=m +CONFIG_INPUT_ATI_REMOTE2=m +CONFIG_INPUT_KEYSPAN_REMOTE=m +CONFIG_INPUT_KXTJ9=m +CONFIG_INPUT_POWERMATE=m +CONFIG_INPUT_YEALINK=m +CONFIG_INPUT_CM109=m +CONFIG_INPUT_REGULATOR_HAPTIC=m +CONFIG_INPUT_RETU_PWRBUTTON=m +CONFIG_INPUT_TPS65218_PWRBUTTON=m +CONFIG_INPUT_AXP20X_PEK=m +CONFIG_INPUT_TWL4030_PWRBUTTON=m +CONFIG_INPUT_TWL4030_VIBRA=m +CONFIG_INPUT_TWL6040_VIBRA=m +CONFIG_INPUT_UINPUT=m +CONFIG_INPUT_PALMAS_PWRBUTTON=m +CONFIG_INPUT_PCF50633_PMU=m +CONFIG_INPUT_PCF8574=m +CONFIG_INPUT_PWM_BEEPER=m +CONFIG_INPUT_PWM_VIBRA=m +CONFIG_INPUT_RK805_PWRKEY=m +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +CONFIG_INPUT_DA9052_ONKEY=m +CONFIG_INPUT_DA9055_ONKEY=m +CONFIG_INPUT_DA9063_ONKEY=m +CONFIG_INPUT_WM831X_ON=m +CONFIG_INPUT_PCAP=m +CONFIG_INPUT_ADXL34X=m +CONFIG_INPUT_ADXL34X_I2C=m +CONFIG_INPUT_ADXL34X_SPI=m +CONFIG_INPUT_IMS_PCU=m +CONFIG_INPUT_CMA3000=m +CONFIG_INPUT_CMA3000_I2C=m +CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m +CONFIG_INPUT_IDEAPAD_SLIDEBAR=m +CONFIG_INPUT_SOC_BUTTON_ARRAY=m +CONFIG_INPUT_DRV260X_HAPTICS=m +CONFIG_INPUT_DRV2665_HAPTICS=m +CONFIG_INPUT_DRV2667_HAPTICS=m +CONFIG_INPUT_RAVE_SP_PWRBUTTON=m +CONFIG_INPUT_STPMIC1_ONKEY=m +CONFIG_RMI4_CORE=m +CONFIG_RMI4_I2C=m +CONFIG_RMI4_SPI=m +CONFIG_RMI4_SMB=m +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=m +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +CONFIG_RMI4_F34=y +# CONFIG_RMI4_F54 is not set +CONFIG_RMI4_F55=y + +# +# Hardware I/O ports +# +CONFIG_SERIO=m +CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y +CONFIG_SERIO_I8042=m +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m +CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_LIBPS2=m +CONFIG_SERIO_RAW=m +CONFIG_SERIO_ALTERA_PS2=m +CONFIG_SERIO_PS2MULT=m +CONFIG_SERIO_ARC_PS2=m +# CONFIG_SERIO_APBPS2 is not set +CONFIG_HYPERV_KEYBOARD=m +CONFIG_SERIO_GPIO_PS2=m +CONFIG_USERIO=m +CONFIG_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_FM801=m +# end of Hardware I/O ports +# end of Input device support + +# +# Character devices +# +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y +CONFIG_VT_CONSOLE_SLEEP=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +# CONFIG_LEGACY_PTYS is not set +CONFIG_LDISC_AUTOLOAD=y + +# +# Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# 
CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_PNP=y +# CONFIG_SERIAL_8250_16550A_VARIANTS is not set +CONFIG_SERIAL_8250_FINTEK=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DMA=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=m +CONFIG_SERIAL_8250_CS=m +CONFIG_SERIAL_8250_MEN_MCB=m +CONFIG_SERIAL_8250_NR_UARTS=32 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_ASPEED_VUART=m +CONFIG_SERIAL_8250_SHARE_IRQ=y +# CONFIG_SERIAL_8250_DETECT_IRQ is not set +CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_8250_DWLIB=y +CONFIG_SERIAL_8250_DW=m +CONFIG_SERIAL_8250_RT288X=y +CONFIG_SERIAL_8250_LPSS=y +CONFIG_SERIAL_8250_MID=y +CONFIG_SERIAL_OF_PLATFORM=m + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_MAX3100=m +CONFIG_SERIAL_MAX310X=m +CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_SERIAL_JSM=m +CONFIG_SERIAL_SIFIVE=m +CONFIG_SERIAL_SCCNXP=m +CONFIG_SERIAL_SC16IS7XX_CORE=m +CONFIG_SERIAL_SC16IS7XX=m +CONFIG_SERIAL_SC16IS7XX_I2C=y +CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_ALTERA_JTAGUART=m +CONFIG_SERIAL_ALTERA_UART=m +CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 +CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 +CONFIG_SERIAL_IFX6X60=m +CONFIG_SERIAL_XILINX_PS_UART=m +CONFIG_SERIAL_ARC=m +CONFIG_SERIAL_ARC_NR_PORTS=1 +CONFIG_SERIAL_RP2=m +CONFIG_SERIAL_RP2_NR_UARTS=32 +CONFIG_SERIAL_FSL_LPUART=m +CONFIG_SERIAL_FSL_LINFLEXUART=m +CONFIG_SERIAL_CONEXANT_DIGICOLOR=m +CONFIG_SERIAL_MEN_Z135=m +CONFIG_SERIAL_SPRD=m +# end of Serial drivers + +CONFIG_SERIAL_MCTRL_GPIO=y +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +CONFIG_CYZ_INTR=y +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_SYNCLINK_GT=m +CONFIG_ISI=m +CONFIG_N_HDLC=m +CONFIG_N_GSM=m +CONFIG_NOZOMI=m +CONFIG_NULL_TTY=m +CONFIG_TRACE_ROUTER=m +CONFIG_TRACE_SINK=m +CONFIG_HVC_DRIVER=y +CONFIG_HVC_IRQ=y +CONFIG_HVC_XEN=y +CONFIG_HVC_XEN_FRONTEND=y +CONFIG_SERIAL_DEV_BUS=y +CONFIG_SERIAL_DEV_CTRL_TTYPORT=y +# CONFIG_TTY_PRINTK is not set +CONFIG_PRINTER=m +# CONFIG_LP_CONSOLE is not set +CONFIG_PPDEV=m +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DMI_DECODE=y +CONFIG_IPMI_PLAT_DATA=y +# CONFIG_IPMI_PANIC_EVENT is not set +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m +CONFIG_IPMI_SSIF=m +CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m +CONFIG_IPMB_DEVICE_INTERFACE=m +CONFIG_HW_RANDOM=m +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_HW_RANDOM_INTEL=m +CONFIG_HW_RANDOM_AMD=m +CONFIG_HW_RANDOM_VIA=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_APPLICOM=m + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m +CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m +CONFIG_SCR24X=m +CONFIG_IPWIRELESS=m +# end of PCMCIA character devices + +CONFIG_MWAVE=m +CONFIG_DEVMEM=y +# CONFIG_DEVKMEM is not set +CONFIG_NVRAM=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +CONFIG_DEVPORT=y +CONFIG_HPET=y +CONFIG_HPET_MMAP=y +CONFIG_HPET_MMAP_DEFAULT=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TCG_TPM=m +CONFIG_HW_RANDOM_TPM=y +CONFIG_TCG_TIS_CORE=m +CONFIG_TCG_TIS=m +CONFIG_TCG_TIS_SPI=m +CONFIG_TCG_TIS_SPI_CR50=y +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m +CONFIG_TCG_INFINEON=m +CONFIG_TCG_XEN=m +CONFIG_TCG_CRB=m +CONFIG_TCG_VTPM_PROXY=m +CONFIG_TCG_TIS_ST33ZP24=m +CONFIG_TCG_TIS_ST33ZP24_I2C=m +CONFIG_TCG_TIS_ST33ZP24_SPI=m +CONFIG_TELCLOCK=m +CONFIG_XILLYBUS=m 
+CONFIG_XILLYBUS_PCIE=m +CONFIG_XILLYBUS_OF=m +# end of Character devices + +# CONFIG_RANDOM_TRUST_CPU is not set +# CONFIG_RANDOM_TRUST_BOOTLOADER is not set + +# +# I2C support +# +CONFIG_I2C=y +CONFIG_ACPI_I2C_OPREGION=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_COMPAT=y +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m + +# +# Multiplexer I2C Chip support +# +CONFIG_I2C_ARB_GPIO_CHALLENGE=m +CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_GPMUX=m +CONFIG_I2C_MUX_LTC4306=m +CONFIG_I2C_MUX_PCA9541=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_MUX_PINCTRL=m +CONFIG_I2C_MUX_REG=m +CONFIG_I2C_DEMUX_PINCTRL=m +CONFIG_I2C_MUX_MLXCPLD=m +# end of Multiplexer I2C Chip support + +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_SMBUS=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCA=m + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI1563=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m +CONFIG_I2C_AMD756_S4882=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_AMD_MP2=m +CONFIG_I2C_I801=m +CONFIG_I2C_ISCH=m +CONFIG_I2C_ISMT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_CHT_WC=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_NFORCE2_S4985=m +CONFIG_I2C_NVIDIA_GPU=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m + +# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CBUS_GPIO=m +CONFIG_I2C_DESIGNWARE_CORE=y +CONFIG_I2C_DESIGNWARE_PLATFORM=y +CONFIG_I2C_DESIGNWARE_SLAVE=y +CONFIG_I2C_DESIGNWARE_PCI=m +CONFIG_I2C_DESIGNWARE_BAYTRAIL=y +CONFIG_I2C_EMEV2=m +CONFIG_I2C_GPIO=m +# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +CONFIG_I2C_KEMPLD=m +CONFIG_I2C_OCORES=m +CONFIG_I2C_PCA_PLATFORM=m +CONFIG_I2C_RK3X=m +CONFIG_I2C_SIMTEC=m +CONFIG_I2C_XILINX=m + +# +# External I2C/SMBus adapter drivers +# +CONFIG_I2C_DIOLAN_U2C=m +CONFIG_I2C_DLN2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_ROBOTFUZZ_OSIF=m +CONFIG_I2C_TAOS_EVM=m +CONFIG_I2C_TINY_USB=m +CONFIG_I2C_VIPERBOARD=m + +# +# Other I2C/SMBus bus drivers +# +CONFIG_I2C_MLXCPLD=m +CONFIG_I2C_CROS_EC_TUNNEL=m +CONFIG_I2C_FSI=m +# end of I2C Hardware Bus support + +CONFIG_I2C_STUB=m +CONFIG_I2C_SLAVE=y +CONFIG_I2C_SLAVE_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# end of I2C support + +CONFIG_I3C=m +CONFIG_CDNS_I3C_MASTER=m +CONFIG_DW_I3C_MASTER=m +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y +CONFIG_SPI_MEM=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_ALTERA=m +CONFIG_SPI_AXI_SPI_ENGINE=m +CONFIG_SPI_BITBANG=m +CONFIG_SPI_BUTTERFLY=m +CONFIG_SPI_CADENCE=m +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_PCI=m +CONFIG_SPI_DW_MID_DMA=y +CONFIG_SPI_DW_MMIO=m +CONFIG_SPI_DLN2=m +CONFIG_SPI_FSI=m +CONFIG_SPI_NXP_FLEXSPI=m +CONFIG_SPI_GPIO=m +CONFIG_SPI_LM70_LLP=m +CONFIG_SPI_FSL_LIB=m +CONFIG_SPI_FSL_SPI=m +CONFIG_SPI_OC_TINY=m +CONFIG_SPI_PXA2XX=m +CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_SC18IS602=m +CONFIG_SPI_SIFIVE=m +CONFIG_SPI_MXIC=m +CONFIG_SPI_XCOMM=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_ZYNQMP_GQSPI=m + +# +# SPI Multiplexer support +# +CONFIG_SPI_MUX=m + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=m +CONFIG_SPI_LOOPBACK_TEST=m +CONFIG_SPI_TLE62X0=m +CONFIG_SPI_SLAVE=y +CONFIG_SPI_SLAVE_TIME=m +CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m +CONFIG_SPMI=m +CONFIG_HSI=m +CONFIG_HSI_BOARDINFO=y + +# +# HSI controllers +# + +# +# HSI clients +# +CONFIG_HSI_CHAR=m +CONFIG_PPS=y +# CONFIG_PPS_DEBUG is not set + +# +# PPS clients support +# +CONFIG_PPS_CLIENT_KTIMER=m 
+CONFIG_PPS_CLIENT_LDISC=m +CONFIG_PPS_CLIENT_PARPORT=m +CONFIG_PPS_CLIENT_GPIO=m + +# +# PPS generators support +# + +# +# PTP clock support +# +CONFIG_PTP_1588_CLOCK=y +CONFIG_DP83640_PHY=m +CONFIG_PTP_1588_CLOCK_INES=m +CONFIG_PTP_1588_CLOCK_KVM=m +CONFIG_PTP_1588_CLOCK_IDT82P33=m +CONFIG_PTP_1588_CLOCK_IDTCM=m +CONFIG_PTP_1588_CLOCK_VMW=m +# end of PTP clock support + +CONFIG_PINCTRL=y +CONFIG_GENERIC_PINCTRL_GROUPS=y +CONFIG_PINMUX=y +CONFIG_GENERIC_PINMUX_FUNCTIONS=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +CONFIG_PINCTRL_AS3722=m +CONFIG_PINCTRL_AXP209=m +CONFIG_PINCTRL_AMD=m +CONFIG_PINCTRL_DA9062=m +CONFIG_PINCTRL_MCP23S08=m +CONFIG_PINCTRL_SINGLE=m +CONFIG_PINCTRL_SX150X=y +CONFIG_PINCTRL_STMFX=m +CONFIG_PINCTRL_MAX77620=m +CONFIG_PINCTRL_PALMAS=m +CONFIG_PINCTRL_RK805=m +CONFIG_PINCTRL_OCELOT=y +CONFIG_PINCTRL_BAYTRAIL=y +CONFIG_PINCTRL_CHERRYVIEW=y +CONFIG_PINCTRL_LYNXPOINT=y +CONFIG_PINCTRL_INTEL=y +CONFIG_PINCTRL_BROXTON=y +CONFIG_PINCTRL_CANNONLAKE=y +CONFIG_PINCTRL_CEDARFORK=y +CONFIG_PINCTRL_DENVERTON=y +CONFIG_PINCTRL_GEMINILAKE=y +CONFIG_PINCTRL_ICELAKE=y +CONFIG_PINCTRL_LEWISBURG=y +CONFIG_PINCTRL_SUNRISEPOINT=y +CONFIG_PINCTRL_TIGERLAKE=y +CONFIG_PINCTRL_LOCHNAGAR=m +CONFIG_PINCTRL_MADERA=m +CONFIG_PINCTRL_CS47L15=y +CONFIG_PINCTRL_CS47L35=y +CONFIG_PINCTRL_CS47L85=y +CONFIG_PINCTRL_CS47L90=y +CONFIG_PINCTRL_CS47L92=y +CONFIG_PINCTRL_EQUILIBRIUM=m +CONFIG_GPIOLIB=y +CONFIG_GPIOLIB_FASTPATH_LIMIT=512 +CONFIG_OF_GPIO=y +CONFIG_GPIO_ACPI=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC=y +CONFIG_GPIO_MAX730X=m + +# +# Memory mapped GPIO drivers +# +CONFIG_GPIO_74XX_MMIO=m +CONFIG_GPIO_ALTERA=m +CONFIG_GPIO_AMDPT=m +CONFIG_GPIO_CADENCE=m +CONFIG_GPIO_DWAPB=m +CONFIG_GPIO_EXAR=m +CONFIG_GPIO_FTGPIO010=y +CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_GRGPIO=m +CONFIG_GPIO_HLWD=m +CONFIG_GPIO_ICH=m +CONFIG_GPIO_LOGICVC=m +CONFIG_GPIO_MB86S7X=m +CONFIG_GPIO_MENZ127=m +CONFIG_GPIO_SAMA5D2_PIOBU=m +CONFIG_GPIO_SIFIVE=y +CONFIG_GPIO_SIOX=m +CONFIG_GPIO_SYSCON=m +CONFIG_GPIO_VX855=m +CONFIG_GPIO_WCD934X=m +CONFIG_GPIO_XILINX=m +CONFIG_GPIO_AMD_FCH=m +# end of Memory mapped GPIO drivers + +# +# Port-mapped I/O GPIO drivers +# +CONFIG_GPIO_F7188X=m +CONFIG_GPIO_IT87=m +CONFIG_GPIO_SCH=m +CONFIG_GPIO_SCH311X=m +CONFIG_GPIO_WINBOND=m +CONFIG_GPIO_WS16C48=m +# end of Port-mapped I/O GPIO drivers + +# +# I2C GPIO expanders +# +CONFIG_GPIO_ADP5588=m +CONFIG_GPIO_ADNP=m +CONFIG_GPIO_GW_PLD=m +CONFIG_GPIO_MAX7300=m +CONFIG_GPIO_MAX732X=m +CONFIG_GPIO_PCA953X=m +CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_TPIC2810=m +# end of I2C GPIO expanders + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ADP5520=m +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_BD70528=m +CONFIG_GPIO_BD71828=m +CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_CRYSTAL_COVE=m +CONFIG_GPIO_DA9052=m +CONFIG_GPIO_DA9055=m +CONFIG_GPIO_DLN2=m +CONFIG_GPIO_JANZ_TTL=m +CONFIG_GPIO_KEMPLD=m +CONFIG_GPIO_LP3943=m +CONFIG_GPIO_LP873X=m +CONFIG_GPIO_LP87565=m +CONFIG_GPIO_MADERA=m +CONFIG_GPIO_MAX77620=m +CONFIG_GPIO_MAX77650=m +CONFIG_GPIO_PALMAS=y +CONFIG_GPIO_RC5T583=y +CONFIG_GPIO_STMPE=y +CONFIG_GPIO_TC3589X=y +CONFIG_GPIO_TPS65086=m +CONFIG_GPIO_TPS65218=m +CONFIG_GPIO_TPS6586X=y +CONFIG_GPIO_TPS65910=y +CONFIG_GPIO_TPS65912=m +CONFIG_GPIO_TPS68470=y +CONFIG_GPIO_TQMX86=m +CONFIG_GPIO_TWL4030=m +CONFIG_GPIO_TWL6040=m +CONFIG_GPIO_UCB1400=m +CONFIG_GPIO_WHISKEY_COVE=m +CONFIG_GPIO_WM831X=m +CONFIG_GPIO_WM8350=m +CONFIG_GPIO_WM8994=m +# end of MFD GPIO expanders + +# +# PCI 
GPIO expanders +# +CONFIG_GPIO_AMD8111=m +CONFIG_GPIO_ML_IOH=m +CONFIG_GPIO_PCI_IDIO_16=m +CONFIG_GPIO_PCIE_IDIO_24=m +CONFIG_GPIO_RDC321X=m +CONFIG_GPIO_SODAVILLE=y +# end of PCI GPIO expanders + +# +# SPI GPIO expanders +# +CONFIG_GPIO_74X164=m +CONFIG_GPIO_MAX3191X=m +CONFIG_GPIO_MAX7301=m +CONFIG_GPIO_MC33880=m +CONFIG_GPIO_PISOSR=m +CONFIG_GPIO_XRA1403=m +CONFIG_GPIO_MOXTET=m +# end of SPI GPIO expanders + +# +# USB GPIO expanders +# +CONFIG_GPIO_VIPERBOARD=m +# end of USB GPIO expanders + +CONFIG_GPIO_MOCKUP=m +CONFIG_W1=m +CONFIG_W1_CON=y + +# +# 1-wire Bus Masters +# +CONFIG_W1_MASTER_MATROX=m +CONFIG_W1_MASTER_DS2490=m +CONFIG_W1_MASTER_DS2482=m +CONFIG_W1_MASTER_DS1WM=m +CONFIG_W1_MASTER_GPIO=m +CONFIG_W1_MASTER_SGI=m +# end of 1-wire Bus Masters + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +CONFIG_W1_SLAVE_SMEM=m +CONFIG_W1_SLAVE_DS2405=m +CONFIG_W1_SLAVE_DS2408=m +# CONFIG_W1_SLAVE_DS2408_READBACK is not set +CONFIG_W1_SLAVE_DS2413=m +CONFIG_W1_SLAVE_DS2406=m +CONFIG_W1_SLAVE_DS2423=m +CONFIG_W1_SLAVE_DS2805=m +CONFIG_W1_SLAVE_DS2430=m +CONFIG_W1_SLAVE_DS2431=m +CONFIG_W1_SLAVE_DS2433=m +# CONFIG_W1_SLAVE_DS2433_CRC is not set +CONFIG_W1_SLAVE_DS2438=m +CONFIG_W1_SLAVE_DS250X=m +CONFIG_W1_SLAVE_DS2780=m +CONFIG_W1_SLAVE_DS2781=m +CONFIG_W1_SLAVE_DS28E04=m +CONFIG_W1_SLAVE_DS28E17=m +# end of 1-wire Slaves + +CONFIG_POWER_AVS=y +CONFIG_QCOM_CPR=m +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_AS3722=y +CONFIG_POWER_RESET_GPIO=y +CONFIG_POWER_RESET_GPIO_RESTART=y +CONFIG_POWER_RESET_LTC2952=y +CONFIG_POWER_RESET_MT6323=y +CONFIG_POWER_RESET_RESTART=y +CONFIG_POWER_RESET_SYSCON=y +CONFIG_POWER_RESET_SYSCON_POWEROFF=y +CONFIG_REBOOT_MODE=m +CONFIG_SYSCON_REBOOT_MODE=m +CONFIG_NVMEM_REBOOT_MODE=m +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_POWER_SUPPLY_HWMON=y +CONFIG_PDA_POWER=m +CONFIG_GENERIC_ADC_BATTERY=m +CONFIG_MAX8925_POWER=m +CONFIG_WM831X_BACKUP=m +CONFIG_WM831X_POWER=m +CONFIG_WM8350_POWER=m +CONFIG_TEST_POWER=m +CONFIG_BATTERY_88PM860X=m +CONFIG_CHARGER_ADP5061=m +CONFIG_BATTERY_ACT8945A=m +CONFIG_BATTERY_CPCAP=m +CONFIG_BATTERY_DS2760=m +CONFIG_BATTERY_DS2780=m +CONFIG_BATTERY_DS2781=m +CONFIG_BATTERY_DS2782=m +CONFIG_BATTERY_LEGO_EV3=m +CONFIG_BATTERY_SBS=m +CONFIG_CHARGER_SBS=m +CONFIG_MANAGER_SBS=m +CONFIG_BATTERY_BQ27XXX=m +CONFIG_BATTERY_BQ27XXX_I2C=m +CONFIG_BATTERY_BQ27XXX_HDQ=m +# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set +CONFIG_BATTERY_DA9030=m +CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_DA9150=m +CONFIG_BATTERY_DA9150=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m +CONFIG_AXP288_CHARGER=m +CONFIG_AXP288_FUEL_GAUGE=m +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_BATTERY_MAX1721X=m +CONFIG_BATTERY_TWL4030_MADC=m +CONFIG_CHARGER_88PM860X=m +CONFIG_CHARGER_PCF50633=m +CONFIG_BATTERY_RX51=m +CONFIG_CHARGER_ISP1704=m +CONFIG_CHARGER_MAX8903=m +CONFIG_CHARGER_TWL4030=m +CONFIG_CHARGER_LP8727=m +CONFIG_CHARGER_LP8788=m +CONFIG_CHARGER_GPIO=m +CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_LT3651=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_DETECTOR_MAX14656=m +CONFIG_CHARGER_MAX77650=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_MAX8997=m +CONFIG_CHARGER_MAX8998=m +CONFIG_CHARGER_BQ2415X=m +CONFIG_CHARGER_BQ24190=m +CONFIG_CHARGER_BQ24257=m +CONFIG_CHARGER_BQ24735=m +CONFIG_CHARGER_BQ25890=m +CONFIG_CHARGER_SMB347=m +CONFIG_CHARGER_TPS65090=m +CONFIG_CHARGER_TPS65217=m +CONFIG_BATTERY_GAUGE_LTC2941=m +CONFIG_BATTERY_RT5033=m +CONFIG_CHARGER_RT9455=m +CONFIG_CHARGER_CROS_USBPD=m 
+CONFIG_CHARGER_UCS1002=m +CONFIG_CHARGER_BD70528=m +CONFIG_CHARGER_WILCO=m +CONFIG_HWMON=y +CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +CONFIG_SENSORS_ABITUGURU=m +CONFIG_SENSORS_ABITUGURU3=m +CONFIG_SENSORS_AD7314=m +CONFIG_SENSORS_AD7414=m +CONFIG_SENSORS_AD7418=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM1026=m +CONFIG_SENSORS_ADM1029=m +CONFIG_SENSORS_ADM1031=m +CONFIG_SENSORS_ADM1177=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_ADT7X10=m +CONFIG_SENSORS_ADT7310=m +CONFIG_SENSORS_ADT7410=m +CONFIG_SENSORS_ADT7411=m +CONFIG_SENSORS_ADT7462=m +CONFIG_SENSORS_ADT7470=m +CONFIG_SENSORS_ADT7475=m +CONFIG_SENSORS_AS370=m +CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_AXI_FAN_CONTROL=m +CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m +CONFIG_SENSORS_FAM15H_POWER=m +CONFIG_SENSORS_APPLESMC=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m +CONFIG_SENSORS_ATXP1=m +CONFIG_SENSORS_DRIVETEMP=m +CONFIG_SENSORS_DS620=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_DELL_SMM=m +CONFIG_SENSORS_DA9052_ADC=m +CONFIG_SENSORS_DA9055=m +CONFIG_SENSORS_I5K_AMB=m +CONFIG_SENSORS_F71805F=m +CONFIG_SENSORS_F71882FG=m +CONFIG_SENSORS_F75375S=m +CONFIG_SENSORS_MC13783_ADC=m +CONFIG_SENSORS_FSCHMD=m +CONFIG_SENSORS_FTSTEUTATES=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_G760A=m +CONFIG_SENSORS_G762=m +CONFIG_SENSORS_GPIO_FAN=m +CONFIG_SENSORS_HIH6130=m +CONFIG_SENSORS_IBMAEM=m +CONFIG_SENSORS_IBMPEX=m +CONFIG_SENSORS_IIO_HWMON=m +CONFIG_SENSORS_I5500=m +CONFIG_SENSORS_CORETEMP=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_JC42=m +CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LINEAGE=m +CONFIG_SENSORS_LOCHNAGAR=m +CONFIG_SENSORS_LTC2945=m +CONFIG_SENSORS_LTC2947=m +CONFIG_SENSORS_LTC2947_I2C=m +CONFIG_SENSORS_LTC2947_SPI=m +CONFIG_SENSORS_LTC2990=m +CONFIG_SENSORS_LTC4151=m +CONFIG_SENSORS_LTC4215=m +CONFIG_SENSORS_LTC4222=m +CONFIG_SENSORS_LTC4245=m +CONFIG_SENSORS_LTC4260=m +CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_MAX1111=m +CONFIG_SENSORS_MAX16065=m +CONFIG_SENSORS_MAX1619=m +CONFIG_SENSORS_MAX1668=m +CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m +CONFIG_SENSORS_MAX31730=m +CONFIG_SENSORS_MAX6621=m +CONFIG_SENSORS_MAX6639=m +CONFIG_SENSORS_MAX6642=m +CONFIG_SENSORS_MAX6650=m +CONFIG_SENSORS_MAX6697=m +CONFIG_SENSORS_MAX31790=m +CONFIG_SENSORS_MCP3021=m +CONFIG_SENSORS_MLXREG_FAN=m +CONFIG_SENSORS_TC654=m +CONFIG_SENSORS_MENF21BMC_HWMON=m +CONFIG_SENSORS_ADCXX=m +CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM70=m +CONFIG_SENSORS_LM73=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM77=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_LM93=m +CONFIG_SENSORS_LM95234=m +CONFIG_SENSORS_LM95241=m +CONFIG_SENSORS_LM95245=m +CONFIG_SENSORS_PC87360=m +CONFIG_SENSORS_PC87427=m +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_SENSORS_NCT6683=m +CONFIG_SENSORS_NCT6775=m +CONFIG_SENSORS_NCT7802=m +CONFIG_SENSORS_NCT7904=m +CONFIG_SENSORS_NPCM7XX=m +CONFIG_SENSORS_PCF8591=m +CONFIG_PMBUS=m +CONFIG_SENSORS_PMBUS=m +CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_BEL_PFE=m +CONFIG_SENSORS_IBM_CFFPS=m +CONFIG_SENSORS_INSPUR_IPSPS=m +CONFIG_SENSORS_IR35221=m +CONFIG_SENSORS_IR38064=m +CONFIG_SENSORS_IRPS5401=m +CONFIG_SENSORS_ISL68137=m +CONFIG_SENSORS_LM25066=m +CONFIG_SENSORS_LTC2978=m +# CONFIG_SENSORS_LTC2978_REGULATOR is not set +CONFIG_SENSORS_LTC3815=m +CONFIG_SENSORS_MAX16064=m +CONFIG_SENSORS_MAX20730=m +CONFIG_SENSORS_MAX20751=m 
+CONFIG_SENSORS_MAX31785=m +CONFIG_SENSORS_MAX34440=m +CONFIG_SENSORS_MAX8688=m +CONFIG_SENSORS_PXE1610=m +CONFIG_SENSORS_TPS40422=m +CONFIG_SENSORS_TPS53679=m +CONFIG_SENSORS_UCD9000=m +CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_XDPE122=m +CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_PWM_FAN=m +CONFIG_SENSORS_SHT15=m +CONFIG_SENSORS_SHT21=m +CONFIG_SENSORS_SHT3x=m +CONFIG_SENSORS_SHTC1=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_DME1737=m +CONFIG_SENSORS_EMC1403=m +CONFIG_SENSORS_EMC2103=m +CONFIG_SENSORS_EMC6W201=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_SMSC47M192=m +CONFIG_SENSORS_SMSC47B397=m +CONFIG_SENSORS_SCH56XX_COMMON=m +CONFIG_SENSORS_SCH5627=m +CONFIG_SENSORS_SCH5636=m +CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SMM665=m +CONFIG_SENSORS_ADC128D818=m +CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_ADS7871=m +CONFIG_SENSORS_AMC6821=m +CONFIG_SENSORS_INA209=m +CONFIG_SENSORS_INA2XX=m +CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_TC74=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_TMP102=m +CONFIG_SENSORS_TMP103=m +CONFIG_SENSORS_TMP108=m +CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_TMP513=m +CONFIG_SENSORS_VIA_CPUTEMP=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83773G=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83791D=m +CONFIG_SENSORS_W83792D=m +CONFIG_SENSORS_W83793=m +CONFIG_SENSORS_W83795=m +# CONFIG_SENSORS_W83795_FANCTRL is not set +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83L786NG=m +CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM831X=m +CONFIG_SENSORS_WM8350=m +CONFIG_SENSORS_XGENE=m + +# +# ACPI drivers +# +CONFIG_SENSORS_ACPI_POWER=m +CONFIG_SENSORS_ATK0110=m +CONFIG_THERMAL=y +# CONFIG_THERMAL_STATISTICS is not set +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 +CONFIG_THERMAL_HWMON=y +CONFIG_THERMAL_OF=y +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set +CONFIG_THERMAL_GOV_FAIR_SHARE=y +CONFIG_THERMAL_GOV_STEP_WISE=y +CONFIG_THERMAL_GOV_BANG_BANG=y +CONFIG_THERMAL_GOV_USER_SPACE=y +CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y +CONFIG_CPU_THERMAL=y +CONFIG_CPU_FREQ_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y +CONFIG_CLOCK_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +# CONFIG_THERMAL_EMULATION is not set +CONFIG_THERMAL_MMIO=m +CONFIG_MAX77620_THERMAL=m +CONFIG_QORIQ_THERMAL=m +CONFIG_DA9062_THERMAL=m + +# +# Intel thermal drivers +# +CONFIG_INTEL_POWERCLAMP=m +CONFIG_X86_PKG_TEMP_THERMAL=m +CONFIG_INTEL_SOC_DTS_IOSF_CORE=m +CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# +CONFIG_INT340X_THERMAL=m +CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m +CONFIG_PROC_THERMAL_MMIO_RAPL=y +# end of ACPI INT340X thermal drivers + +CONFIG_INTEL_BXT_PMIC_THERMAL=m +CONFIG_INTEL_PCH_THERMAL=m +# end of Intel thermal drivers + +# CONFIG_TI_SOC_THERMAL is not set +CONFIG_GENERIC_ADC_THERMAL=m +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +CONFIG_WATCHDOG_OPEN_TIMEOUT=0 +CONFIG_WATCHDOG_SYSFS=y + +# +# Watchdog Pretimeout Governors +# +CONFIG_WATCHDOG_PRETIMEOUT_GOV=y +CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y +# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set +CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +# 
CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set +CONFIG_BD70528_WATCHDOG=m +CONFIG_DA9052_WATCHDOG=m +CONFIG_DA9055_WATCHDOG=m +CONFIG_DA9063_WATCHDOG=m +CONFIG_DA9062_WATCHDOG=m +CONFIG_GPIO_WATCHDOG=m +CONFIG_MENF21BMC_WATCHDOG=m +CONFIG_MENZ069_WATCHDOG=m +CONFIG_WDAT_WDT=m +CONFIG_WM831X_WATCHDOG=m +CONFIG_WM8350_WATCHDOG=m +CONFIG_XILINX_WATCHDOG=m +CONFIG_ZIIRAVE_WATCHDOG=m +CONFIG_RAVE_SP_WATCHDOG=m +CONFIG_MLX_WDT=m +CONFIG_CADENCE_WATCHDOG=m +CONFIG_DW_WATCHDOG=m +CONFIG_RN5T618_WATCHDOG=m +CONFIG_TWL4030_WATCHDOG=m +CONFIG_MAX63XX_WATCHDOG=m +CONFIG_MAX77620_WATCHDOG=m +CONFIG_RETU_WATCHDOG=m +CONFIG_STPMIC1_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_EBC_C384_WDT=m +CONFIG_F71808E_WDT=m +CONFIG_SP5100_TCO=m +CONFIG_SBC_FITPC2_WATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_IBMASR=m +CONFIG_WAFER_WDT=m +CONFIG_I6300ESB_WDT=m +CONFIG_IE6XX_WDT=m +CONFIG_ITCO_WDT=m +CONFIG_ITCO_VENDOR_SUPPORT=y +CONFIG_IT8712F_WDT=m +CONFIG_IT87_WDT=m +CONFIG_HP_WATCHDOG=m +CONFIG_HPWDT_NMI_DECODING=y +CONFIG_KEMPLD_WDT=m +CONFIG_SC1200_WDT=m +CONFIG_PC87413_WDT=m +CONFIG_NV_TCO=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_SMSC_SCH311X_WDT=m +CONFIG_SMSC37B787_WDT=m +CONFIG_TQMX86_WDT=m +CONFIG_VIA_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_W83977F_WDT=m +CONFIG_MACHZ_WDT=m +CONFIG_SBC_EPX_C3_WATCHDOG=m +CONFIG_INTEL_MEI_WDT=m +CONFIG_NI903X_WDT=m +CONFIG_NIC7018_WDT=m +CONFIG_MEN_A21_WDT=m +CONFIG_XEN_WDT=m + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_PCMCIAHOST_POSSIBLE=y +CONFIG_SSB_PCMCIAHOST=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y +CONFIG_SSB_SDIOHOST=y +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +CONFIG_SSB_DRIVER_GPIO=y +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +# CONFIG_BCMA_HOST_SOC is not set +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +CONFIG_BCMA_DRIVER_GPIO=y +# CONFIG_BCMA_DEBUG is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +CONFIG_MFD_ACT8945A=m +CONFIG_MFD_AS3711=y +CONFIG_MFD_AS3722=m +CONFIG_PMIC_ADP5520=y +CONFIG_MFD_AAT2870_CORE=y +CONFIG_MFD_ATMEL_FLEXCOM=m +CONFIG_MFD_ATMEL_HLCDC=m +CONFIG_MFD_BCM590XX=m +CONFIG_MFD_BD9571MWV=m +CONFIG_MFD_AXP20X=m +CONFIG_MFD_AXP20X_I2C=m +CONFIG_MFD_CROS_EC_DEV=m +CONFIG_MFD_MADERA=m +CONFIG_MFD_MADERA_I2C=m +CONFIG_MFD_MADERA_SPI=m +CONFIG_MFD_CS47L15=y +CONFIG_MFD_CS47L35=y +CONFIG_MFD_CS47L85=y +CONFIG_MFD_CS47L90=y +CONFIG_MFD_CS47L92=y +CONFIG_PMIC_DA903X=y +CONFIG_PMIC_DA9052=y +CONFIG_MFD_DA9052_SPI=y +CONFIG_MFD_DA9052_I2C=y +CONFIG_MFD_DA9055=y +CONFIG_MFD_DA9062=m +CONFIG_MFD_DA9063=m +CONFIG_MFD_DA9150=m +CONFIG_MFD_DLN2=m +CONFIG_MFD_MC13XXX=m +CONFIG_MFD_MC13XXX_SPI=m +CONFIG_MFD_MC13XXX_I2C=m +CONFIG_MFD_HI6421_PMIC=m +CONFIG_HTC_PASIC3=m +CONFIG_HTC_I2CPLD=y +CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m +CONFIG_LPC_ICH=m +CONFIG_LPC_SCH=m +CONFIG_INTEL_SOC_PMIC=y +CONFIG_INTEL_SOC_PMIC_BXTWC=m +CONFIG_INTEL_SOC_PMIC_CHTWC=y +CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m +CONFIG_MFD_INTEL_LPSS=m +CONFIG_MFD_INTEL_LPSS_ACPI=m +CONFIG_MFD_INTEL_LPSS_PCI=m +CONFIG_MFD_IQS62X=m +CONFIG_MFD_JANZ_CMODIO=m +CONFIG_MFD_KEMPLD=m +CONFIG_MFD_88PM800=m +CONFIG_MFD_88PM805=m 
+CONFIG_MFD_88PM860X=y +CONFIG_MFD_MAX14577=m +CONFIG_MFD_MAX77620=y +CONFIG_MFD_MAX77650=m +CONFIG_MFD_MAX77686=m +CONFIG_MFD_MAX77693=m +CONFIG_MFD_MAX77843=y +CONFIG_MFD_MAX8907=m +CONFIG_MFD_MAX8925=y +CONFIG_MFD_MAX8997=y +CONFIG_MFD_MAX8998=y +CONFIG_MFD_MT6397=m +CONFIG_MFD_MENF21BMC=m +CONFIG_EZX_PCAP=y +CONFIG_MFD_CPCAP=m +CONFIG_MFD_VIPERBOARD=m +CONFIG_MFD_RETU=m +CONFIG_MFD_PCF50633=m +CONFIG_PCF50633_ADC=m +CONFIG_PCF50633_GPIO=m +CONFIG_UCB1400_CORE=m +CONFIG_MFD_RDC321X=m +CONFIG_MFD_RT5033=m +CONFIG_MFD_RC5T583=y +CONFIG_MFD_RK808=m +CONFIG_MFD_RN5T618=m +CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SI476X_CORE=m +CONFIG_MFD_SM501=m +CONFIG_MFD_SM501_GPIO=y +CONFIG_MFD_SKY81452=m +CONFIG_MFD_SMSC=y +CONFIG_ABX500_CORE=y +CONFIG_AB3100_CORE=y +CONFIG_AB3100_OTP=y +CONFIG_MFD_STMPE=y + +# +# STMicroelectronics STMPE Interface Drivers +# +CONFIG_STMPE_I2C=y +CONFIG_STMPE_SPI=y +# end of STMicroelectronics STMPE Interface Drivers + +CONFIG_MFD_SYSCON=y +CONFIG_MFD_TI_AM335X_TSCADC=m +CONFIG_MFD_LP3943=m +CONFIG_MFD_LP8788=y +CONFIG_MFD_TI_LMU=m +CONFIG_MFD_PALMAS=y +CONFIG_TPS6105X=m +CONFIG_TPS65010=m +CONFIG_TPS6507X=m +CONFIG_MFD_TPS65086=m +CONFIG_MFD_TPS65090=y +CONFIG_MFD_TPS65217=m +CONFIG_MFD_TPS68470=y +CONFIG_MFD_TI_LP873X=m +CONFIG_MFD_TI_LP87565=m +CONFIG_MFD_TPS65218=m +CONFIG_MFD_TPS6586X=y +CONFIG_MFD_TPS65910=y +CONFIG_MFD_TPS65912=m +CONFIG_MFD_TPS65912_I2C=m +CONFIG_MFD_TPS65912_SPI=m +CONFIG_MFD_TPS80031=y +CONFIG_TWL4030_CORE=y +CONFIG_MFD_TWL4030_AUDIO=y +CONFIG_TWL6040_CORE=y +CONFIG_MFD_WL1273_CORE=m +CONFIG_MFD_LM3533=m +CONFIG_MFD_TC3589X=y +CONFIG_MFD_TQMX86=m +CONFIG_MFD_VX855=m +CONFIG_MFD_LOCHNAGAR=y +CONFIG_MFD_ARIZONA=y +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +CONFIG_MFD_CS47L24=y +CONFIG_MFD_WM5102=y +CONFIG_MFD_WM5110=y +CONFIG_MFD_WM8997=y +CONFIG_MFD_WM8998=y +CONFIG_MFD_WM8400=y +CONFIG_MFD_WM831X=y +CONFIG_MFD_WM831X_I2C=y +CONFIG_MFD_WM831X_SPI=y +CONFIG_MFD_WM8350=y +CONFIG_MFD_WM8350_I2C=y +CONFIG_MFD_WM8994=m +CONFIG_MFD_ROHM_BD718XX=m +CONFIG_MFD_ROHM_BD70528=m +CONFIG_MFD_ROHM_BD71828=m +CONFIG_MFD_STPMIC1=m +CONFIG_MFD_STMFX=m +CONFIG_MFD_WCD934X=m +CONFIG_RAVE_SP_CORE=m +# end of Multifunction device drivers + +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set +CONFIG_REGULATOR_FIXED_VOLTAGE=m +CONFIG_REGULATOR_VIRTUAL_CONSUMER=m +CONFIG_REGULATOR_USERSPACE_CONSUMER=m +CONFIG_REGULATOR_88PG86X=m +CONFIG_REGULATOR_88PM800=m +CONFIG_REGULATOR_88PM8607=m +CONFIG_REGULATOR_ACT8865=m +CONFIG_REGULATOR_ACT8945A=m +CONFIG_REGULATOR_AD5398=m +CONFIG_REGULATOR_AAT2870=m +CONFIG_REGULATOR_AB3100=m +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +CONFIG_REGULATOR_AS3711=m +CONFIG_REGULATOR_AS3722=m +CONFIG_REGULATOR_AXP20X=m +CONFIG_REGULATOR_BCM590XX=m +CONFIG_REGULATOR_BD70528=m +CONFIG_REGULATOR_BD71828=m +CONFIG_REGULATOR_BD718XX=m +CONFIG_REGULATOR_BD9571MWV=m +CONFIG_REGULATOR_CPCAP=m +CONFIG_REGULATOR_DA903X=m +CONFIG_REGULATOR_DA9052=m +CONFIG_REGULATOR_DA9055=m +CONFIG_REGULATOR_DA9062=m +CONFIG_REGULATOR_DA9063=m +CONFIG_REGULATOR_DA9210=m +CONFIG_REGULATOR_DA9211=m +CONFIG_REGULATOR_FAN53555=m +CONFIG_REGULATOR_GPIO=m +CONFIG_REGULATOR_HI6421=m +CONFIG_REGULATOR_HI6421V530=m +CONFIG_REGULATOR_ISL9305=m +CONFIG_REGULATOR_ISL6271A=m +CONFIG_REGULATOR_LM363X=m +CONFIG_REGULATOR_LOCHNAGAR=m +CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_LP3972=m +CONFIG_REGULATOR_LP872X=m +CONFIG_REGULATOR_LP873X=m +CONFIG_REGULATOR_LP8755=m +CONFIG_REGULATOR_LP87565=m +CONFIG_REGULATOR_LP8788=m +CONFIG_REGULATOR_LTC3589=m 
+CONFIG_REGULATOR_LTC3676=m +CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX77620=m +CONFIG_REGULATOR_MAX77650=m +CONFIG_REGULATOR_MAX8649=m +CONFIG_REGULATOR_MAX8660=m +CONFIG_REGULATOR_MAX8907=m +CONFIG_REGULATOR_MAX8925=m +CONFIG_REGULATOR_MAX8952=m +CONFIG_REGULATOR_MAX8973=m +CONFIG_REGULATOR_MAX8997=m +CONFIG_REGULATOR_MAX8998=m +CONFIG_REGULATOR_MAX77686=m +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MAX77802=m +CONFIG_REGULATOR_MC13XXX_CORE=m +CONFIG_REGULATOR_MC13783=m +CONFIG_REGULATOR_MC13892=m +CONFIG_REGULATOR_MCP16502=m +CONFIG_REGULATOR_MP5416=m +CONFIG_REGULATOR_MP8859=m +CONFIG_REGULATOR_MP886X=m +CONFIG_REGULATOR_MPQ7920=m +CONFIG_REGULATOR_MT6311=m +CONFIG_REGULATOR_MT6323=m +CONFIG_REGULATOR_MT6397=m +CONFIG_REGULATOR_PALMAS=m +CONFIG_REGULATOR_PCAP=m +CONFIG_REGULATOR_PCF50633=m +CONFIG_REGULATOR_PFUZE100=m +CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m +CONFIG_REGULATOR_PV88090=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_SPMI=m +CONFIG_REGULATOR_RC5T583=m +CONFIG_REGULATOR_RK808=m +CONFIG_REGULATOR_RN5T618=m +CONFIG_REGULATOR_ROHM=m +CONFIG_REGULATOR_RT5033=m +CONFIG_REGULATOR_S2MPA01=m +CONFIG_REGULATOR_S2MPS11=m +CONFIG_REGULATOR_S5M8767=m +CONFIG_REGULATOR_SKY81452=m +CONFIG_REGULATOR_SLG51000=m +CONFIG_REGULATOR_STPMIC1=m +CONFIG_REGULATOR_SY8106A=m +CONFIG_REGULATOR_SY8824X=m +CONFIG_REGULATOR_TPS51632=m +CONFIG_REGULATOR_TPS6105X=m +CONFIG_REGULATOR_TPS62360=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m +CONFIG_REGULATOR_TPS65086=m +CONFIG_REGULATOR_TPS65090=m +CONFIG_REGULATOR_TPS65132=m +CONFIG_REGULATOR_TPS65217=m +CONFIG_REGULATOR_TPS65218=m +CONFIG_REGULATOR_TPS6524X=m +CONFIG_REGULATOR_TPS6586X=m +CONFIG_REGULATOR_TPS65910=m +CONFIG_REGULATOR_TPS65912=m +CONFIG_REGULATOR_TPS80031=m +CONFIG_REGULATOR_TWL4030=m +CONFIG_REGULATOR_VCTRL=m +CONFIG_REGULATOR_WM831X=m +CONFIG_REGULATOR_WM8350=m +CONFIG_REGULATOR_WM8400=m +CONFIG_REGULATOR_WM8994=m +CONFIG_CEC_CORE=m +CONFIG_CEC_NOTIFIER=y +CONFIG_CEC_PIN=y +CONFIG_RC_CORE=m +CONFIG_RC_MAP=m +CONFIG_LIRC=y +CONFIG_RC_DECODERS=y +CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m +CONFIG_IR_RC6_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_SANYO_DECODER=m +CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_IR_IMON_DECODER=m +CONFIG_IR_RCMM_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_RC_ATI_REMOTE=m +CONFIG_IR_ENE=m +CONFIG_IR_HIX5HD2=m +CONFIG_IR_IMON=m +CONFIG_IR_IMON_RAW=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_ITE_CIR=m +CONFIG_IR_FINTEK=m +CONFIG_IR_NUVOTON=m +CONFIG_IR_REDRAT3=m +CONFIG_IR_SPI=m +CONFIG_IR_STREAMZAP=m +CONFIG_IR_WINBOND_CIR=m +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_LOOPBACK=m +CONFIG_IR_GPIO_CIR=m +CONFIG_IR_GPIO_TX=m +CONFIG_IR_PWM_TX=m +CONFIG_IR_SERIAL=m +CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m +CONFIG_RC_XBOX_DVD=m +CONFIG_MEDIA_SUPPORT=m + +# +# Multimedia core support +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_CEC_SUPPORT=y +CONFIG_MEDIA_CEC_RC=y +# CONFIG_CEC_PIN_ERROR_INJ is not set +CONFIG_MEDIA_CONTROLLER=y +CONFIG_MEDIA_CONTROLLER_DVB=y +# CONFIG_MEDIA_CONTROLLER_REQUEST_API is not set +CONFIG_VIDEO_DEV=m +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_VIDEO_V4L2=m +CONFIG_VIDEO_V4L2_I2C=y +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_TUNER=m 
+CONFIG_V4L2_MEM2MEM_DEV=m +CONFIG_V4L2_FLASH_LED_CLASS=m +CONFIG_V4L2_FWNODE=m +CONFIG_VIDEOBUF_GEN=m +CONFIG_VIDEOBUF_DMA_SG=m +CONFIG_VIDEOBUF_VMALLOC=m +CONFIG_DVB_CORE=m +CONFIG_DVB_MMAP=y +CONFIG_DVB_NET=y +CONFIG_TTPCI_EEPROM=m +CONFIG_DVB_MAX_ADAPTERS=16 +# CONFIG_DVB_DYNAMIC_MINORS is not set +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set +# CONFIG_DVB_ULE_DEBUG is not set + +# +# Media drivers +# +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +CONFIG_USB_VIDEO_CLASS=m +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y +CONFIG_USB_GSPCA=m +CONFIG_USB_M5602=m +CONFIG_USB_STV06XX=m +CONFIG_USB_GL860=m +CONFIG_USB_GSPCA_BENQ=m +CONFIG_USB_GSPCA_CONEX=m +CONFIG_USB_GSPCA_CPIA1=m +CONFIG_USB_GSPCA_DTCS033=m +CONFIG_USB_GSPCA_ETOMS=m +CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m +CONFIG_USB_GSPCA_JL2005BCD=m +CONFIG_USB_GSPCA_KINECT=m +CONFIG_USB_GSPCA_KONICA=m +CONFIG_USB_GSPCA_MARS=m +CONFIG_USB_GSPCA_MR97310A=m +CONFIG_USB_GSPCA_NW80X=m +CONFIG_USB_GSPCA_OV519=m +CONFIG_USB_GSPCA_OV534=m +CONFIG_USB_GSPCA_OV534_9=m +CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m +CONFIG_USB_GSPCA_PAC7311=m +CONFIG_USB_GSPCA_SE401=m +CONFIG_USB_GSPCA_SN9C2028=m +CONFIG_USB_GSPCA_SN9C20X=m +CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SPCA1528=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_SQ930X=m +CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_STK1135=m +CONFIG_USB_GSPCA_STV0680=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TOPRO=m +CONFIG_USB_GSPCA_TOUPTEK=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_VICAM=m +CONFIG_USB_GSPCA_XIRLINK_CIT=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_USB_PWC=m +# CONFIG_USB_PWC_DEBUG is not set +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_VIDEO_CPIA2=m +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m +CONFIG_VIDEO_USBTV=m + +# +# Analog TV USB devices +# +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m +CONFIG_VIDEO_GO7007=m +CONFIG_VIDEO_GO7007_USB=m +CONFIG_VIDEO_GO7007_LOADER=m +CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_TM6000=m +CONFIG_VIDEO_TM6000_ALSA=m +CONFIG_VIDEO_TM6000_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_CXUSB_ANALOG=y +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_V2=m +CONFIG_DVB_USB_AF9015=m 
+CONFIG_DVB_USB_AF9035=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_USB_DRV=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_AS102=m + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m +CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +CONFIG_USB_AIRSPY=m +CONFIG_USB_HACKRF=m +CONFIG_USB_MSI2500=m + +# +# USB HDMI CEC adapters +# +CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m +CONFIG_MEDIA_PCI_SUPPORT=y + +# +# Media capture support +# +CONFIG_VIDEO_MEYE=m +CONFIG_VIDEO_SOLO6X10=m +CONFIG_VIDEO_TW5864=m +CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m + +# +# Media capture/analog TV support +# +CONFIG_VIDEO_IVTV=m +# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set +CONFIG_VIDEO_IVTV_ALSA=m +CONFIG_VIDEO_FB_IVTV=m +# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set +CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DT3155=m + +# +# Media capture/analog/hybrid TV support +# +CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_CX18_ALSA=m +CONFIG_VIDEO_CX23885=m +CONFIG_MEDIA_ALTERA_CI=m +CONFIG_VIDEO_CX25821=m +CONFIG_VIDEO_CX25821_ALSA=m +CONFIG_VIDEO_CX88=m +CONFIG_VIDEO_CX88_ALSA=m +CONFIG_VIDEO_CX88_BLACKBIRD=m +CONFIG_VIDEO_CX88_DVB=m +CONFIG_VIDEO_CX88_ENABLE_VP3054=y +CONFIG_VIDEO_CX88_VP3054=m +CONFIG_VIDEO_CX88_MPEG=m +CONFIG_VIDEO_BT848=m +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m +CONFIG_VIDEO_SAA7134_RC=y +CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_GO7007=m +CONFIG_VIDEO_SAA7164=m + +# +# Media digital TV PCI Adapters +# +CONFIG_DVB_AV7110_IR=y +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m +CONFIG_DVB_B2C2_FLEXCOP_PCI=m +# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set +CONFIG_DVB_PLUTO2=m +CONFIG_DVB_DM1105=m +CONFIG_DVB_PT1=m +CONFIG_DVB_PT3=m +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m +CONFIG_DVB_NGENE=m +CONFIG_DVB_DDBRIDGE=m +# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set +CONFIG_DVB_SMIPCIE=m +CONFIG_DVB_NETUP_UNIDVB=m +CONFIG_VIDEO_IPU3_CIO2=m +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CAFE_CCIC=m +CONFIG_VIDEO_CADENCE=y +CONFIG_VIDEO_CADENCE_CSI2RX=m +CONFIG_VIDEO_CADENCE_CSI2TX=m +CONFIG_VIDEO_ASPEED=m +CONFIG_VIDEO_MUX=m +CONFIG_VIDEO_XILINX=m +CONFIG_VIDEO_XILINX_TPG=m +CONFIG_VIDEO_XILINX_VTC=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m +CONFIG_VIDEO_SH_VEU=m +CONFIG_V4L_TEST_DRIVERS=y +CONFIG_VIDEO_VIMC=m +CONFIG_VIDEO_VIVID=m +CONFIG_VIDEO_VIVID_CEC=y +CONFIG_VIDEO_VIVID_MAX_DEVS=64 +CONFIG_VIDEO_VIM2M=m +CONFIG_VIDEO_VICODEC=m +CONFIG_DVB_PLATFORM_DRIVERS=y +CONFIG_CEC_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CROS_EC_CEC=m +CONFIG_CEC_GPIO=m +CONFIG_VIDEO_SECO_CEC=m +CONFIG_VIDEO_SECO_RC=y +CONFIG_SDR_PLATFORM_DRIVERS=y + +# +# Supported MMC/SDIO adapters +# +CONFIG_SMS_SDIO_DRV=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_TEA575X=m +CONFIG_RADIO_SI470X=m +CONFIG_USB_SI470X=m +CONFIG_I2C_SI470X=m +CONFIG_RADIO_SI4713=m +CONFIG_USB_SI4713=m +CONFIG_PLATFORM_SI4713=m +CONFIG_I2C_SI4713=m 
+CONFIG_RADIO_SI476X=m +CONFIG_USB_MR800=m +CONFIG_USB_DSBR=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_SHARK=m +CONFIG_RADIO_SHARK2=m +CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m +CONFIG_USB_MA901=m +CONFIG_RADIO_TEA5764=m +CONFIG_RADIO_SAA7706H=m +CONFIG_RADIO_TEF6862=m +CONFIG_RADIO_WL1273=m + +# +# Texas Instruments WL128x FM driver (ST based) +# +CONFIG_RADIO_WL128X=m +# end of Texas Instruments WL128x FM driver (ST based) + +# +# Supported FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_INPUT=y +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_V4L2=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +CONFIG_VIDEOBUF2_DMA_SG=m +CONFIG_VIDEOBUF2_DVB=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set +CONFIG_VIDEO_V4L2_TPG=m + +# +# Media ancillary drivers (tuners, sensors, i2c, spi, frontends) +# +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y +CONFIG_MEDIA_ATTACH=y +CONFIG_VIDEO_IR_I2C=m + +# +# I2C Encoders, decoders, sensors and other helper chips +# + +# +# Audio decoders, processors and mixers +# +CONFIG_VIDEO_TVAUDIO=m +CONFIG_VIDEO_TDA7432=m +CONFIG_VIDEO_TDA9840=m +CONFIG_VIDEO_TDA1997X=m +CONFIG_VIDEO_TEA6415C=m +CONFIG_VIDEO_TEA6420=m +CONFIG_VIDEO_MSP3400=m +CONFIG_VIDEO_CS3308=m +CONFIG_VIDEO_CS5345=m +CONFIG_VIDEO_CS53L32A=m +CONFIG_VIDEO_TLV320AIC23B=m +CONFIG_VIDEO_UDA1342=m +CONFIG_VIDEO_WM8775=m +CONFIG_VIDEO_WM8739=m +CONFIG_VIDEO_VP27SMPX=m +CONFIG_VIDEO_SONY_BTF_MPX=m + +# +# RDS decoders +# +CONFIG_VIDEO_SAA6588=m + +# +# Video decoders +# +CONFIG_VIDEO_ADV7180=m +CONFIG_VIDEO_ADV7183=m +CONFIG_VIDEO_ADV748X=m +CONFIG_VIDEO_ADV7604=m +CONFIG_VIDEO_ADV7604_CEC=y +CONFIG_VIDEO_ADV7842=m +CONFIG_VIDEO_ADV7842_CEC=y +CONFIG_VIDEO_BT819=m +CONFIG_VIDEO_BT856=m +CONFIG_VIDEO_BT866=m +CONFIG_VIDEO_KS0127=m +CONFIG_VIDEO_ML86V7667=m +CONFIG_VIDEO_SAA7110=m +CONFIG_VIDEO_SAA711X=m +CONFIG_VIDEO_TC358743=m +CONFIG_VIDEO_TC358743_CEC=y +CONFIG_VIDEO_TVP514X=m +CONFIG_VIDEO_TVP5150=m +CONFIG_VIDEO_TVP7002=m +CONFIG_VIDEO_TW2804=m +CONFIG_VIDEO_TW9903=m +CONFIG_VIDEO_TW9906=m +CONFIG_VIDEO_TW9910=m +CONFIG_VIDEO_VPX3220=m + +# +# Video and audio decoders +# +CONFIG_VIDEO_SAA717X=m +CONFIG_VIDEO_CX25840=m + +# +# Video encoders +# +CONFIG_VIDEO_SAA7127=m +CONFIG_VIDEO_SAA7185=m +CONFIG_VIDEO_ADV7170=m +CONFIG_VIDEO_ADV7175=m +CONFIG_VIDEO_ADV7343=m +CONFIG_VIDEO_ADV7393=m +CONFIG_VIDEO_AD9389B=m +CONFIG_VIDEO_AK881X=m +CONFIG_VIDEO_THS8200=m + +# +# Camera sensor devices +# +CONFIG_VIDEO_APTINA_PLL=m +CONFIG_VIDEO_SMIAPP_PLL=m +CONFIG_VIDEO_HI556=m +CONFIG_VIDEO_IMX214=m +CONFIG_VIDEO_IMX219=m +CONFIG_VIDEO_IMX258=m +CONFIG_VIDEO_IMX274=m +CONFIG_VIDEO_IMX290=m +CONFIG_VIDEO_IMX319=m +CONFIG_VIDEO_IMX355=m +CONFIG_VIDEO_OV2640=m +CONFIG_VIDEO_OV2659=m +CONFIG_VIDEO_OV2680=m +CONFIG_VIDEO_OV2685=m +CONFIG_VIDEO_OV5640=m +CONFIG_VIDEO_OV5645=m +CONFIG_VIDEO_OV5647=m +CONFIG_VIDEO_OV6650=m +CONFIG_VIDEO_OV5670=m +CONFIG_VIDEO_OV5675=m +CONFIG_VIDEO_OV5695=m +CONFIG_VIDEO_OV7251=m +CONFIG_VIDEO_OV772X=m +CONFIG_VIDEO_OV7640=m +CONFIG_VIDEO_OV7670=m +CONFIG_VIDEO_OV7740=m +CONFIG_VIDEO_OV8856=m +CONFIG_VIDEO_OV9640=m +CONFIG_VIDEO_OV9650=m +CONFIG_VIDEO_OV13858=m +CONFIG_VIDEO_VS6624=m +CONFIG_VIDEO_MT9M001=m +CONFIG_VIDEO_MT9M032=m +CONFIG_VIDEO_MT9M111=m +CONFIG_VIDEO_MT9P031=m 
+CONFIG_VIDEO_MT9T001=m +CONFIG_VIDEO_MT9T112=m +CONFIG_VIDEO_MT9V011=m +CONFIG_VIDEO_MT9V032=m +CONFIG_VIDEO_MT9V111=m +CONFIG_VIDEO_SR030PC30=m +CONFIG_VIDEO_NOON010PC30=m +CONFIG_VIDEO_M5MOLS=m +CONFIG_VIDEO_RJ54N1=m +CONFIG_VIDEO_S5K6AA=m +CONFIG_VIDEO_S5K6A3=m +CONFIG_VIDEO_S5K4ECGX=m +CONFIG_VIDEO_S5K5BAF=m +CONFIG_VIDEO_SMIAPP=m +CONFIG_VIDEO_ET8EK8=m +CONFIG_VIDEO_S5C73M3=m + +# +# Lens drivers +# +CONFIG_VIDEO_AD5820=m +CONFIG_VIDEO_AK7375=m +CONFIG_VIDEO_DW9714=m +CONFIG_VIDEO_DW9807_VCM=m + +# +# Flash devices +# +CONFIG_VIDEO_ADP1653=m +CONFIG_VIDEO_LM3560=m +CONFIG_VIDEO_LM3646=m + +# +# Video improvement chips +# +CONFIG_VIDEO_UPD64031A=m +CONFIG_VIDEO_UPD64083=m + +# +# Audio/Video compression chips +# +CONFIG_VIDEO_SAA6752HS=m + +# +# SDR tuner chips +# +CONFIG_SDR_MAX2175=m + +# +# Miscellaneous helper chips +# +CONFIG_VIDEO_THS7303=m +CONFIG_VIDEO_M52790=m +CONFIG_VIDEO_I2C=m +CONFIG_VIDEO_ST_MIPID02=m +# end of I2C Encoders, decoders, sensors and other helper chips + +# +# SPI helper chips +# +CONFIG_VIDEO_GS1662=m +# end of SPI helper chips + +# +# Media SPI Adapters +# +CONFIG_CXD2880_SPI_DRV=m +# end of Media SPI Adapters + +CONFIG_MEDIA_TUNER=m + +# +# Customize TV tuners +# +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA18250=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_MSI001=m +CONFIG_MEDIA_TUNER_MT20XX=m +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m +CONFIG_MEDIA_TUNER_MT2266=m +CONFIG_MEDIA_TUNER_MT2131=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_XC2028=m +CONFIG_MEDIA_TUNER_XC5000=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88RS6000T=m +CONFIG_MEDIA_TUNER_TUA9001=m +CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_IT913X=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_MXL301RF=m +CONFIG_MEDIA_TUNER_QM1D1C0042=m +CONFIG_MEDIA_TUNER_QM1D1B0004=m +# end of Customize TV tuners + +# +# Customise DVB Frontends +# + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV0910=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_STV6111=m +CONFIG_DVB_MXL5XX=m +CONFIG_DVB_M88DS3103=m + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_TDA18271C2DD=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m + +# +# DVB-S (satellite) frontends +# +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m +CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_CX24117=m +CONFIG_DVB_CX24120=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_TS2020=m +CONFIG_DVB_DS3000=m +CONFIG_DVB_MB86A16=m +CONFIG_DVB_TDA10071=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m 
+CONFIG_DVB_S5H1432=m +CONFIG_DVB_DRXD=m +CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_DIB9000=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m +CONFIG_DVB_STV0367=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +CONFIG_DVB_SI2168=m +CONFIG_DVB_AS102_FE=m +CONFIG_DVB_ZD1301_DEMOD=m +CONFIG_DVB_GP8PSK_FE=m +CONFIG_DVB_CXD2880=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_S921=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +CONFIG_DVB_TC90522=m +CONFIG_DVB_MN88443X=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_DRX39XYJ=m +CONFIG_DVB_LNBH25=m +CONFIG_DVB_LNBH29=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_LNBP22=m +CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_A8293=m +CONFIG_DVB_LGS8GL5=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DVB_ATBM8830=m +CONFIG_DVB_TDA665x=m +CONFIG_DVB_IX2505V=m +CONFIG_DVB_M88RS2000=m +CONFIG_DVB_AF9033=m +CONFIG_DVB_HORUS3A=m +CONFIG_DVB_ASCOT2E=m +CONFIG_DVB_HELENE=m + +# +# Common Interface (EN50221) controller drivers +# +CONFIG_DVB_CXD2099=m +CONFIG_DVB_SP2=m + +# +# Tools to develop new frontends +# +CONFIG_DVB_DUMMY_FE=m +# end of Customise DVB Frontends + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +CONFIG_INTEL_GTT=m +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=10 +CONFIG_VGA_SWITCHEROO=y +CONFIG_DRM=m +CONFIG_DRM_MIPI_DBI=m +CONFIG_DRM_MIPI_DSI=y +CONFIG_DRM_DP_AUX_CHARDEV=y +# CONFIG_DRM_DEBUG_SELFTEST is not set +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_KMS_FB_HELPER=y +# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set +CONFIG_DRM_LOAD_EDID_FIRMWARE=y +CONFIG_DRM_DP_CEC=y +CONFIG_DRM_TTM=m +CONFIG_DRM_TTM_DMA_PAGE_POOL=y +CONFIG_DRM_VRAM_HELPER=m +CONFIG_DRM_TTM_HELPER=m +CONFIG_DRM_GEM_CMA_HELPER=y +CONFIG_DRM_KMS_CMA_HELPER=y +CONFIG_DRM_GEM_SHMEM_HELPER=y +CONFIG_DRM_SCHED=m + +# +# I2C encoder or helper chips +# +CONFIG_DRM_I2C_CH7006=m +CONFIG_DRM_I2C_SIL164=m +CONFIG_DRM_I2C_NXP_TDA998X=m +CONFIG_DRM_I2C_NXP_TDA9950=m +# end of I2C encoder or helper chips + +# +# ARM devices +# +CONFIG_DRM_KOMEDA=m +# end of ARM devices + +CONFIG_DRM_RADEON=m +CONFIG_DRM_RADEON_USERPTR=y +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_AMDGPU_USERPTR=y +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set + +# +# ACP (Audio CoProcessor) Configuration +# +CONFIG_DRM_AMD_ACP=y +# end of ACP (Audio CoProcessor) Configuration + +# +# Display Engine Configuration +# +CONFIG_DRM_AMD_DC=y +CONFIG_DRM_AMD_DC_DCN=y 
+CONFIG_DRM_AMD_DC_HDCP=y +# CONFIG_DEBUG_KERNEL_DC is not set +# end of Display Engine Configuration + +CONFIG_HSA_AMD=y +CONFIG_DRM_NOUVEAU=m +# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +# CONFIG_NOUVEAU_DEBUG_MMU is not set +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_NOUVEAU_SVM=y +CONFIG_DRM_I915=m +CONFIG_DRM_I915_FORCE_PROBE="*" +CONFIG_DRM_I915_CAPTURE_ERROR=y +CONFIG_DRM_I915_COMPRESS_ERROR=y +CONFIG_DRM_I915_USERPTR=y +CONFIG_DRM_I915_GVT=y +CONFIG_DRM_I915_GVT_KVMGT=m + +# +# drm/i915 Debugging +# +# CONFIG_DRM_I915_WERROR is not set +# CONFIG_DRM_I915_DEBUG is not set +# CONFIG_DRM_I915_DEBUG_MMIO is not set +# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set +# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set +# CONFIG_DRM_I915_DEBUG_GUC is not set +# CONFIG_DRM_I915_SELFTEST is not set +# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set +# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set +# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set +# end of drm/i915 Debugging + +# +# drm/i915 Profile Guided Optimisation +# +CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 +CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 +CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 +CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT=8000 +CONFIG_DRM_I915_STOP_TIMEOUT=100 +CONFIG_DRM_I915_TIMESLICE_DURATION=1 +# end of drm/i915 Profile Guided Optimisation + +CONFIG_DRM_VGEM=m +CONFIG_DRM_VKMS=m +CONFIG_DRM_VMWGFX=m +CONFIG_DRM_VMWGFX_FBCON=y +CONFIG_DRM_GMA500=m +CONFIG_DRM_GMA600=y +CONFIG_DRM_GMA3600=y +CONFIG_DRM_UDL=m +CONFIG_DRM_AST=m +CONFIG_DRM_MGAG200=m +CONFIG_DRM_CIRRUS_QEMU=m +CONFIG_DRM_RCAR_DW_HDMI=m +CONFIG_DRM_RCAR_LVDS=m +CONFIG_DRM_QXL=m +CONFIG_DRM_BOCHS=m +CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +CONFIG_DRM_PANEL_ARM_VERSATILE=m +CONFIG_DRM_PANEL_BOE_HIMAX8279D=m +CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=m +CONFIG_DRM_PANEL_LVDS=m +CONFIG_DRM_PANEL_SIMPLE=m +CONFIG_DRM_PANEL_ELIDA_KD35T133=m +CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02=m +CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m +CONFIG_DRM_PANEL_ILITEK_IL9322=m +CONFIG_DRM_PANEL_ILITEK_ILI9881C=m +CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m +CONFIG_DRM_PANEL_JDI_LT070ME05000=m +CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m +CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829=m +CONFIG_DRM_PANEL_SAMSUNG_LD9040=m +CONFIG_DRM_PANEL_LG_LB035Q02=m +CONFIG_DRM_PANEL_LG_LG4573=m +CONFIG_DRM_PANEL_NEC_NL8048HL11=m +CONFIG_DRM_PANEL_NOVATEK_NT35510=m +CONFIG_DRM_PANEL_NOVATEK_NT39016=m +CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m +CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m +CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m +CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m +CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m +CONFIG_DRM_PANEL_RAYDIUM_RM67191=m +CONFIG_DRM_PANEL_RAYDIUM_RM68200=m +CONFIG_DRM_PANEL_ROCKTECH_JH057N00900=m +CONFIG_DRM_PANEL_RONBO_RB070D30=m +CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01=m +CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m +CONFIG_DRM_PANEL_SEIKO_43WVF1G=m +CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m +CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m +CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m +CONFIG_DRM_PANEL_SITRONIX_ST7701=m +CONFIG_DRM_PANEL_SITRONIX_ST7789V=m +CONFIG_DRM_PANEL_SONY_ACX424AKP=m +CONFIG_DRM_PANEL_SONY_ACX565AKM=m +CONFIG_DRM_PANEL_TPO_TD028TTEC1=m +CONFIG_DRM_PANEL_TPO_TD043MTEA1=m +CONFIG_DRM_PANEL_TPO_TPG110=m +CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m +CONFIG_DRM_PANEL_XINPENG_XPP055C272=m +# end of Display 
Panels + +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# Display Interface Bridges +# +CONFIG_DRM_CDNS_DSI=m +CONFIG_DRM_DISPLAY_CONNECTOR=m +CONFIG_DRM_LVDS_CODEC=m +CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m +CONFIG_DRM_NXP_PTN3460=m +CONFIG_DRM_PARADE_PS8622=m +CONFIG_DRM_PARADE_PS8640=m +CONFIG_DRM_SIL_SII8620=m +CONFIG_DRM_SII902X=m +CONFIG_DRM_SII9234=m +CONFIG_DRM_SIMPLE_BRIDGE=m +CONFIG_DRM_THINE_THC63LVD1024=m +CONFIG_DRM_TOSHIBA_TC358764=m +CONFIG_DRM_TOSHIBA_TC358767=m +CONFIG_DRM_TOSHIBA_TC358768=m +CONFIG_DRM_TI_TFP410=m +CONFIG_DRM_TI_SN65DSI86=m +CONFIG_DRM_TI_TPD12S015=m +CONFIG_DRM_ANALOGIX_ANX6345=m +CONFIG_DRM_ANALOGIX_ANX78XX=m +CONFIG_DRM_ANALOGIX_DP=m +CONFIG_DRM_I2C_ADV7511=m +CONFIG_DRM_I2C_ADV7511_AUDIO=y +CONFIG_DRM_I2C_ADV7511_CEC=y +CONFIG_DRM_DW_HDMI=m +CONFIG_DRM_DW_HDMI_AHB_AUDIO=m +CONFIG_DRM_DW_HDMI_I2S_AUDIO=m +CONFIG_DRM_DW_HDMI_CEC=m +# end of Display Interface Bridges + +# CONFIG_DRM_ETNAVIV is not set +CONFIG_DRM_ARCPGU=m +CONFIG_DRM_MXS=y +CONFIG_DRM_MXSFB=m +CONFIG_DRM_GM12U320=m +CONFIG_TINYDRM_HX8357D=m +CONFIG_TINYDRM_ILI9225=m +CONFIG_TINYDRM_ILI9341=m +CONFIG_TINYDRM_ILI9486=m +CONFIG_TINYDRM_MI0283QT=m +CONFIG_TINYDRM_REPAPER=m +CONFIG_TINYDRM_ST7586=m +CONFIG_TINYDRM_ST7735R=m +CONFIG_DRM_XEN=y +CONFIG_DRM_XEN_FRONTEND=m +CONFIG_DRM_VBOXVIDEO=m +# CONFIG_DRM_LEGACY is not set +CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y + +# +# Frame buffer Devices +# +CONFIG_FB_CMDLINE=y +CONFIG_FB_NOTIFY=y +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_BOOT_VESA_SUPPORT=y +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +# CONFIG_FB_FOREIGN_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +CONFIG_FB_BACKLIGHT=m +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +CONFIG_FB_VESA=y +CONFIG_FB_EFI=y +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_INTEL is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_VIA is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set +# CONFIG_FB_SM501 is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +CONFIG_XEN_FBDEV_FRONTEND=m +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +CONFIG_FB_HYPERV=m +CONFIG_FB_SIMPLE=y +# CONFIG_FB_SSD1307 is not set +# CONFIG_FB_SM712 is not set +# end of Frame buffer Devices + +# +# Backlight & LCD device support +# +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_L4F00242T03=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI922X=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m 
+CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m +CONFIG_LCD_AMS369FG06=m +CONFIG_LCD_LMS501KF03=m +CONFIG_LCD_HX8357=m +CONFIG_LCD_OTM3225A=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_BACKLIGHT_GENERIC=m +CONFIG_BACKLIGHT_LM3533=m +CONFIG_BACKLIGHT_PWM=m +CONFIG_BACKLIGHT_DA903X=m +CONFIG_BACKLIGHT_DA9052=m +CONFIG_BACKLIGHT_MAX8925=m +CONFIG_BACKLIGHT_APPLE=m +CONFIG_BACKLIGHT_QCOM_WLED=m +CONFIG_BACKLIGHT_SAHARA=m +CONFIG_BACKLIGHT_WM831X=m +CONFIG_BACKLIGHT_ADP5520=m +CONFIG_BACKLIGHT_ADP8860=m +CONFIG_BACKLIGHT_ADP8870=m +CONFIG_BACKLIGHT_88PM860X=m +CONFIG_BACKLIGHT_PCF50633=m +CONFIG_BACKLIGHT_AAT2870=m +CONFIG_BACKLIGHT_LM3630A=m +CONFIG_BACKLIGHT_LM3639=m +CONFIG_BACKLIGHT_LP855X=m +CONFIG_BACKLIGHT_LP8788=m +CONFIG_BACKLIGHT_PANDORA=m +CONFIG_BACKLIGHT_SKY81452=m +CONFIG_BACKLIGHT_TPS65217=m +CONFIG_BACKLIGHT_AS3711=m +CONFIG_BACKLIGHT_GPIO=m +CONFIG_BACKLIGHT_LV5207LP=m +CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m +CONFIG_BACKLIGHT_RAVE_SP=m +CONFIG_BACKLIGHT_LED=m +# end of Backlight & LCD device support + +CONFIG_VIDEOMODE_HELPERS=y +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 +# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y +# end of Console display driver support + +# CONFIG_LOGO is not set +# end of Graphics support + +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_SEQ_DEVICE=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=m +CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_PCM_TIMER=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +# CONFIG_SND_SUPPORT_OLD_API is not set +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +CONFIG_SND_VERBOSE_PRINTK=y +CONFIG_SND_DEBUG=y +# CONFIG_SND_DEBUG_VERBOSE is not set +# CONFIG_SND_PCM_XRUN_DEBUG is not set +# CONFIG_SND_CTL_VALIDATION is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_SEQ_MIDI_EVENT=m +CONFIG_SND_SEQ_MIDI=m +CONFIG_SND_SEQ_MIDI_EMUL=m +CONFIG_SND_SEQ_VIRMIDI=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL3_LIB_SEQ=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +# CONFIG_SND_PCSP is not set +CONFIG_SND_DUMMY=m +CONFIG_SND_ALOOP=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ASIHPI=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +# CONFIG_SND_BT87X_OVERCLOCK is not set +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m 
+CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m +CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m +CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_ES1968_INPUT=y +CONFIG_SND_ES1968_RADIO=y +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X_BOOL=y +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LOLA=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MAESTRO3_INPUT=y +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m + +# +# HD-Audio +# +CONFIG_SND_HDA=m +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_RECONFIG=y +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_BEEP_MODE=1 +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_ANALOG=m +CONFIG_SND_HDA_CODEC_SIGMATEL=m +CONFIG_SND_HDA_CODEC_VIA=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_HDA_CODEC_CIRRUS=m +CONFIG_SND_HDA_CODEC_CONEXANT=m +CONFIG_SND_HDA_CODEC_CA0110=m +CONFIG_SND_HDA_CODEC_CA0132=m +CONFIG_SND_HDA_CODEC_CA0132_DSP=y +CONFIG_SND_HDA_CODEC_CMEDIA=m +CONFIG_SND_HDA_CODEC_SI3054=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +# end of HD-Audio + +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_COMPONENT=y +CONFIG_SND_HDA_I915=y +CONFIG_SND_HDA_EXT_CORE=m +CONFIG_SND_HDA_PREALLOC_SIZE=0 +CONFIG_SND_INTEL_NHLT=y +CONFIG_SND_INTEL_DSP_CONFIG=m +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y +CONFIG_SND_USB_US122L=m +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_FIREWIRE=y +CONFIG_SND_FIREWIRE_LIB=m +CONFIG_SND_DICE=m +CONFIG_SND_OXFW=m +CONFIG_SND_ISIGHT=m +CONFIG_SND_FIREWORKS=m +CONFIG_SND_BEBOB=m +CONFIG_SND_FIREWIRE_DIGI00X=m +CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_AC97_BUS=y +CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +CONFIG_SND_SOC_TOPOLOGY=y +CONFIG_SND_SOC_ACPI=m +CONFIG_SND_SOC_AMD_ACP=m +CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m +CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m +CONFIG_SND_SOC_AMD_ACP3x=m +CONFIG_SND_SOC_AMD_RV_RT5682_MACH=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_SOC_MIKROE_PROTO=m +CONFIG_SND_BCM63XX_I2S_WHISTLER=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +# CONFIG_SND_SOC_FSL_ASRC is not set +# CONFIG_SND_SOC_FSL_SAI is not set 
+# CONFIG_SND_SOC_FSL_AUDMIX is not set +# CONFIG_SND_SOC_FSL_SSI is not set +# CONFIG_SND_SOC_FSL_SPDIF is not set +# CONFIG_SND_SOC_FSL_ESAI is not set +# CONFIG_SND_SOC_FSL_MICFIL is not set +# CONFIG_SND_SOC_IMX_AUDMUX is not set +# end of SoC Audio for Freescale CPUs + +CONFIG_SND_I2S_HI6210_I2S=m +CONFIG_SND_SOC_IMG=y +CONFIG_SND_SOC_IMG_I2S_IN=m +CONFIG_SND_SOC_IMG_I2S_OUT=m +CONFIG_SND_SOC_IMG_PARALLEL_OUT=m +CONFIG_SND_SOC_IMG_SPDIF_IN=m +CONFIG_SND_SOC_IMG_SPDIF_OUT=m +CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m +CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y +CONFIG_SND_SST_IPC=m +CONFIG_SND_SST_IPC_PCI=m +CONFIG_SND_SST_IPC_ACPI=m +CONFIG_SND_SOC_INTEL_SST_ACPI=m +CONFIG_SND_SOC_INTEL_SST=m +CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m +CONFIG_SND_SOC_INTEL_HASWELL=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m +CONFIG_SND_SOC_INTEL_SKYLAKE=m +CONFIG_SND_SOC_INTEL_SKL=m +CONFIG_SND_SOC_INTEL_APL=m +CONFIG_SND_SOC_INTEL_KBL=m +CONFIG_SND_SOC_INTEL_GLK=m +CONFIG_SND_SOC_INTEL_CNL=m +CONFIG_SND_SOC_INTEL_CFL=m +CONFIG_SND_SOC_INTEL_CML_H=m +CONFIG_SND_SOC_INTEL_CML_LP=m +CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m +CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m +# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set +CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m +CONFIG_SND_SOC_ACPI_INTEL_MATCH=m +CONFIG_SND_SOC_INTEL_MACH=y +# CONFIG_SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES is not set +CONFIG_SND_SOC_INTEL_HASWELL_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5650_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m +CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m +# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set +CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m +CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m +CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m +CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH=m +CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m +CONFIG_SND_SOC_INTEL_SOF_DA7219_MAX98373_MACH=m +CONFIG_SND_SOC_MTK_BTCVSD=m +CONFIG_SND_SOC_SOF_TOPLEVEL=y +CONFIG_SND_SOC_SOF_PCI=m +CONFIG_SND_SOC_SOF_ACPI=m +CONFIG_SND_SOC_SOF_OF=m +# CONFIG_SND_SOC_SOF_DEBUG_PROBES is not set +# CONFIG_SND_SOC_SOF_DEVELOPER_SUPPORT is not set +CONFIG_SND_SOC_SOF=m +CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y +CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y +CONFIG_SND_SOC_SOF_INTEL_ACPI=m +CONFIG_SND_SOC_SOF_INTEL_PCI=m +CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m +CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m +CONFIG_SND_SOC_SOF_INTEL_COMMON=m 
+CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y +CONFIG_SND_SOC_SOF_MERRIFIELD=m +CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_APOLLOLAKE=m +CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_GEMINILAKE=m +CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_CANNONLAKE=m +CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_COFFEELAKE=m +CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ICELAKE=m +CONFIG_SND_SOC_SOF_COMETLAKE_LP=m +CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y +CONFIG_SND_SOC_SOF_COMETLAKE_H=m +CONFIG_SND_SOC_SOF_COMETLAKE_H_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE=m +CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ELKHARTLAKE=m +CONFIG_SND_SOC_SOF_JASPERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_JASPERLAKE=m +CONFIG_SND_SOC_SOF_HDA_COMMON=m +CONFIG_SND_SOC_SOF_HDA_LINK=y +CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y +# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set +CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m +CONFIG_SND_SOC_SOF_HDA=m +CONFIG_SND_SOC_SOF_XTENSA=m + +# +# STMicroelectronics STM32 SOC audio support +# +# end of STMicroelectronics STM32 SOC audio support + +CONFIG_SND_SOC_XILINX_I2S=m +CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m +CONFIG_SND_SOC_XILINX_SPDIF=m +CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m +CONFIG_SND_SOC_I2C_AND_SPI=m + +# +# CODEC drivers +# +CONFIG_SND_SOC_AC97_CODEC=m +CONFIG_SND_SOC_ADAU_UTILS=m +CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m +CONFIG_SND_SOC_ADAU7002=m +CONFIG_SND_SOC_ADAU7118=m +CONFIG_SND_SOC_ADAU7118_HW=m +CONFIG_SND_SOC_ADAU7118_I2C=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4118=m +CONFIG_SND_SOC_AK4458=m +CONFIG_SND_SOC_AK4554=m +CONFIG_SND_SOC_AK4613=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK5386=m +CONFIG_SND_SOC_AK5558=m +CONFIG_SND_SOC_ALC5623=m +CONFIG_SND_SOC_BD28623=m +# CONFIG_SND_SOC_BT_SCO is not set +CONFIG_SND_SOC_CPCAP=m +CONFIG_SND_SOC_CROS_EC_CODEC=m +CONFIG_SND_SOC_CS35L32=m +CONFIG_SND_SOC_CS35L33=m +CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m +CONFIG_SND_SOC_CS35L36=m +CONFIG_SND_SOC_CS42L42=m +CONFIG_SND_SOC_CS42L51=m +CONFIG_SND_SOC_CS42L51_I2C=m +CONFIG_SND_SOC_CS42L52=m +CONFIG_SND_SOC_CS42L56=m +CONFIG_SND_SOC_CS42L73=m +CONFIG_SND_SOC_CS4265=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +CONFIG_SND_SOC_CS4271_SPI=m +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +CONFIG_SND_SOC_CS43130=m +CONFIG_SND_SOC_CS4341=m +CONFIG_SND_SOC_CS4349=m +CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_CX2072X=m +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m +CONFIG_SND_SOC_ES7241=m +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8328=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_GTM601=m +CONFIG_SND_SOC_HDAC_HDMI=m +CONFIG_SND_SOC_HDAC_HDA=m +CONFIG_SND_SOC_INNO_RK3036=m +CONFIG_SND_SOC_LOCHNAGAR_SC=m +CONFIG_SND_SOC_MAX98088=m +CONFIG_SND_SOC_MAX98090=m +CONFIG_SND_SOC_MAX98357A=m +CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX9867=m +CONFIG_SND_SOC_MAX98927=m +CONFIG_SND_SOC_MAX98373=m +CONFIG_SND_SOC_MAX9860=m +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m +CONFIG_SND_SOC_PCM1681=m +CONFIG_SND_SOC_PCM1789=m +CONFIG_SND_SOC_PCM1789_I2C=m +CONFIG_SND_SOC_PCM179X=m +CONFIG_SND_SOC_PCM179X_I2C=m +CONFIG_SND_SOC_PCM179X_SPI=m +CONFIG_SND_SOC_PCM186X=m 
+CONFIG_SND_SOC_PCM186X_I2C=m +CONFIG_SND_SOC_PCM186X_SPI=m +CONFIG_SND_SOC_PCM3060=m +CONFIG_SND_SOC_PCM3060_I2C=m +CONFIG_SND_SOC_PCM3060_SPI=m +CONFIG_SND_SOC_PCM3168A=m +CONFIG_SND_SOC_PCM3168A_I2C=m +CONFIG_SND_SOC_PCM3168A_SPI=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_RK3328=m +CONFIG_SND_SOC_RL6231=m +CONFIG_SND_SOC_RL6347A=m +CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m +CONFIG_SND_SOC_RT1011=m +CONFIG_SND_SOC_RT1015=m +CONFIG_SND_SOC_RT1308_SDW=m +CONFIG_SND_SOC_RT5514=m +CONFIG_SND_SOC_RT5514_SPI=m +CONFIG_SND_SOC_RT5616=m +CONFIG_SND_SOC_RT5631=m +CONFIG_SND_SOC_RT5640=m +CONFIG_SND_SOC_RT5645=m +CONFIG_SND_SOC_RT5651=m +CONFIG_SND_SOC_RT5660=m +CONFIG_SND_SOC_RT5663=m +CONFIG_SND_SOC_RT5670=m +CONFIG_SND_SOC_RT5677=m +CONFIG_SND_SOC_RT5677_SPI=m +CONFIG_SND_SOC_RT5682=m +CONFIG_SND_SOC_RT5682_SDW=m +CONFIG_SND_SOC_RT700=m +CONFIG_SND_SOC_RT700_SDW=m +CONFIG_SND_SOC_RT711=m +CONFIG_SND_SOC_RT711_SDW=m +CONFIG_SND_SOC_RT715=m +CONFIG_SND_SOC_RT715_SDW=m +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SI476X=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m +CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m +CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2305=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_SSM2602_SPI=m +CONFIG_SND_SOC_SSM2602_I2C=m +CONFIG_SND_SOC_SSM4567=m +CONFIG_SND_SOC_STA32X=m +CONFIG_SND_SOC_STA350=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SOC_TAS2552=m +CONFIG_SND_SOC_TAS2562=m +CONFIG_SND_SOC_TAS2770=m +CONFIG_SND_SOC_TAS5086=m +CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m +CONFIG_SND_SOC_TAS6424=m +CONFIG_SND_SOC_TDA7419=m +CONFIG_SND_SOC_TFA9879=m +CONFIG_SND_SOC_TLV320AIC23=m +CONFIG_SND_SOC_TLV320AIC23_I2C=m +CONFIG_SND_SOC_TLV320AIC23_SPI=m +CONFIG_SND_SOC_TLV320AIC31XX=m +CONFIG_SND_SOC_TLV320AIC32X4=m +CONFIG_SND_SOC_TLV320AIC32X4_I2C=m +CONFIG_SND_SOC_TLV320AIC32X4_SPI=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TLV320ADCX140=m +CONFIG_SND_SOC_TS3A227E=m +CONFIG_SND_SOC_TSCS42XX=m +CONFIG_SND_SOC_TSCS454=m +CONFIG_SND_SOC_UDA1334=m +CONFIG_SND_SOC_WCD9335=m +CONFIG_SND_SOC_WCD934X=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8524=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8737=m +CONFIG_SND_SOC_WM8741=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8770=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8782=m +CONFIG_SND_SOC_WM8804=m +CONFIG_SND_SOC_WM8804_I2C=m +CONFIG_SND_SOC_WM8804_SPI=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8904=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8962=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_WM8985=m +CONFIG_SND_SOC_WSA881X=m +CONFIG_SND_SOC_ZX_AUD96P22=m +CONFIG_SND_SOC_MAX9759=m +CONFIG_SND_SOC_MT6351=m +CONFIG_SND_SOC_MT6358=m +CONFIG_SND_SOC_MT6660=m +CONFIG_SND_SOC_NAU8540=m +CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8822=m +CONFIG_SND_SOC_NAU8824=m +CONFIG_SND_SOC_NAU8825=m +CONFIG_SND_SOC_TPA6130A2=m +# end of CODEC drivers + +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_AUDIO_GRAPH_CARD=m +CONFIG_SND_X86=y +CONFIG_HDMI_LPE_AUDIO=m +CONFIG_SND_SYNTH_EMUX=m +CONFIG_SND_XEN_FRONTEND=m +CONFIG_AC97_BUS=m + +# +# HID support +# +CONFIG_HID=m +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=m +CONFIG_HID_GENERIC=m + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m +CONFIG_HID_ACRUX=m 
+CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=m +CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m +CONFIG_HID_AUREAL=m +CONFIG_HID_BELKIN=m +CONFIG_HID_BETOP_FF=m +CONFIG_HID_BIGBEN_FF=m +CONFIG_HID_CHERRY=m +CONFIG_HID_CHICONY=m +CONFIG_HID_CORSAIR=m +CONFIG_HID_COUGAR=m +CONFIG_HID_MACALLY=m +CONFIG_HID_PRODIKEYS=m +CONFIG_HID_CMEDIA=m +CONFIG_HID_CP2112=m +CONFIG_HID_CREATIVE_SB0540=m +CONFIG_HID_CYPRESS=m +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=m +CONFIG_HID_ELAN=m +CONFIG_HID_ELECOM=m +CONFIG_HID_ELO=m +CONFIG_HID_EZKEY=m +CONFIG_HID_GEMBIRD=m +CONFIG_HID_GFRM=m +CONFIG_HID_GLORIOUS=m +CONFIG_HID_HOLTEK=m +CONFIG_HOLTEK_FF=y +CONFIG_HID_GOOGLE_HAMMER=m +CONFIG_HID_GT683R=m +CONFIG_HID_KEYTOUCH=m +CONFIG_HID_KYE=m +CONFIG_HID_UCLOGIC=m +CONFIG_HID_WALTOP=m +CONFIG_HID_VIEWSONIC=m +CONFIG_HID_GYRATION=m +CONFIG_HID_ICADE=m +CONFIG_HID_ITE=m +CONFIG_HID_JABRA=m +CONFIG_HID_TWINHAN=m +CONFIG_HID_KENSINGTON=m +CONFIG_HID_LCPOWER=m +CONFIG_HID_LED=m +CONFIG_HID_LENOVO=m +CONFIG_HID_LOGITECH=m +CONFIG_HID_LOGITECH_DJ=m +CONFIG_HID_LOGITECH_HIDPP=m +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +CONFIG_HID_MAGICMOUSE=m +CONFIG_HID_MALTRON=m +CONFIG_HID_MAYFLASH=m +CONFIG_HID_REDRAGON=m +CONFIG_HID_MICROSOFT=m +CONFIG_HID_MONTEREY=m +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m +CONFIG_HID_NTRIG=m +CONFIG_HID_ORTEK=m +CONFIG_HID_PANTHERLORD=m +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=m +CONFIG_HID_PETALYNX=m +CONFIG_HID_PICOLCD=m +CONFIG_HID_PICOLCD_FB=y +CONFIG_HID_PICOLCD_BACKLIGHT=y +CONFIG_HID_PICOLCD_LCD=y +CONFIG_HID_PICOLCD_LEDS=y +CONFIG_HID_PICOLCD_CIR=y +CONFIG_HID_PLANTRONICS=m +CONFIG_HID_PRIMAX=m +CONFIG_HID_RETRODE=m +CONFIG_HID_ROCCAT=m +CONFIG_HID_SAITEK=m +CONFIG_HID_SAMSUNG=m +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y +CONFIG_HID_SPEEDLINK=m +CONFIG_HID_STEAM=m +CONFIG_HID_STEELSERIES=m +CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m +CONFIG_HID_GREENASIA=m +CONFIG_GREENASIA_FF=y +CONFIG_HID_HYPERV_MOUSE=m +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=m +CONFIG_HID_TOPSEED=m +CONFIG_HID_THINGM=m +CONFIG_HID_THRUSTMASTER=m +CONFIG_THRUSTMASTER_FF=y +CONFIG_HID_UDRAW_PS3=m +CONFIG_HID_U2FZERO=m +CONFIG_HID_WACOM=m +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=m +CONFIG_HID_ZEROPLUS=m +CONFIG_ZEROPLUS_FF=y +CONFIG_HID_ZYDACRON=m +CONFIG_HID_SENSOR_HUB=m +# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set +CONFIG_HID_ALPS=m +CONFIG_HID_MCP2221=m +# end of Special HID drivers + +# +# USB HID support +# +CONFIG_USB_HID=m +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y + +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +# end of USB HID Boot Protocol drivers +# end of USB HID support + +# +# I2C HID support +# +CONFIG_I2C_HID=m +# end of I2C HID support + +# +# Intel ISH HID support +# +CONFIG_INTEL_ISH_HID=m +CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m +# end of Intel ISH HID support +# end of HID support + +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +CONFIG_USB_LED_TRIG=y +CONFIG_USB_ULPI_BUS=m +CONFIG_USB_CONN_GPIO=m +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=y +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +CONFIG_USB_DYNAMIC_MINORS=y +# CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_LEDS_TRIGGER_USBPORT=m +CONFIG_USB_AUTOSUSPEND_DELAY=2 +CONFIG_USB_MON=m + +# +# USB Host Controller Drivers +# 
+CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +# CONFIG_USB_XHCI_DBGCAP is not set +CONFIG_USB_XHCI_PCI=m +CONFIG_USB_XHCI_PLATFORM=m +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_TT_NEWSCHED=y +CONFIG_USB_EHCI_PCI=m +CONFIG_USB_EHCI_FSL=m +CONFIG_USB_EHCI_HCD_PLATFORM=m +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_FOTG210_HCD=m +CONFIG_USB_MAX3421_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PCI=m +# CONFIG_USB_OHCI_HCD_SSB is not set +CONFIG_USB_OHCI_HCD_PLATFORM=m +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +# CONFIG_USB_SL811_HCD_ISO is not set +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_HCD_BCMA=m +CONFIG_USB_HCD_SSB=m +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_REALTEK=m +CONFIG_REALTEK_AUTOPM=y +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m +CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m +CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_STORAGE_ENE_UB6250=m +CONFIG_USB_UAS=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USBIP_CORE=m +CONFIG_USBIP_VHCI_HCD=m +CONFIG_USBIP_VHCI_HC_PORTS=8 +CONFIG_USBIP_VHCI_NR_HCS=1 +CONFIG_USBIP_HOST=m +CONFIG_USBIP_VUDC=m +# CONFIG_USBIP_DEBUG is not set +CONFIG_USB_CDNS3=m +CONFIG_USB_CDNS3_GADGET=y +CONFIG_USB_CDNS3_HOST=y +CONFIG_USB_CDNS3_PCI_WRAP=m +CONFIG_USB_MUSB_HDRC=m +# CONFIG_USB_MUSB_HOST is not set +# CONFIG_USB_MUSB_GADGET is not set +CONFIG_USB_MUSB_DUAL_ROLE=y + +# +# Platform Glue Layer +# + +# +# MUSB DMA mode +# +# CONFIG_MUSB_PIO_ONLY is not set +CONFIG_USB_DWC3=m +CONFIG_USB_DWC3_ULPI=y +# CONFIG_USB_DWC3_HOST is not set +# CONFIG_USB_DWC3_GADGET is not set +CONFIG_USB_DWC3_DUAL_ROLE=y + +# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_PCI=m +CONFIG_USB_DWC3_HAPS=m +CONFIG_USB_DWC3_OF_SIMPLE=m +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_HOST is not set + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PERIPHERAL is not set +CONFIG_USB_DWC2_DUAL_ROLE=y +CONFIG_USB_DWC2_PCI=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +CONFIG_USB_CHIPIDEA=m +CONFIG_USB_CHIPIDEA_OF=m +CONFIG_USB_CHIPIDEA_PCI=m +CONFIG_USB_CHIPIDEA_UDC=y +CONFIG_USB_CHIPIDEA_HOST=y +CONFIG_USB_ISP1760=m +CONFIG_USB_ISP1760_HCD=y +CONFIG_USB_ISP1761_UDC=y +# CONFIG_USB_ISP1760_HOST_ROLE is not set +# CONFIG_USB_ISP1760_GADGET_ROLE is not set +CONFIG_USB_ISP1760_DUAL_ROLE=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=y +CONFIG_USB_SERIAL_CONSOLE=y +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_SIMPLE=m +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m 
+CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_F81232=m +CONFIG_USB_SERIAL_F8153X=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_METRO=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7715_PARPORT=y +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MXUPORT=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QCAUX=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_SAFE=m +# CONFIG_USB_SERIAL_SAFE_PADDED is not set +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_WWAN=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_XSENS_MT=m +CONFIG_USB_SERIAL_WISHBONE=m +CONFIG_USB_SERIAL_SSU100=m +CONFIG_USB_SERIAL_QT2=m +CONFIG_USB_SERIAL_UPD78F0730=m +CONFIG_USB_SERIAL_DEBUG=m + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m +CONFIG_USB_SEVSEG=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m +CONFIG_USB_APPLEDISPLAY=m +CONFIG_APPLE_MFI_FASTCHARGE=m +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +CONFIG_USB_TRANCEVIBRATOR=m +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_EHSET_TEST_FIXTURE=m +CONFIG_USB_ISIGHTFW=m +CONFIG_USB_YUREX=m +CONFIG_USB_EZUSB_FX2=m +CONFIG_USB_HUB_USB251XB=m +CONFIG_USB_HSIC_USB3503=m +CONFIG_USB_HSIC_USB4604=m +CONFIG_USB_LINK_LAYER_TEST=m +CONFIG_USB_CHAOSKEY=m +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y +CONFIG_NOP_USB_XCEIV=m +CONFIG_USB_GPIO_VBUS=m +CONFIG_TAHVO_USB=m +# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set +CONFIG_USB_ISP1301=m +# end of USB Physical Layer drivers + +CONFIG_USB_GADGET=m +# CONFIG_USB_GADGET_DEBUG is not set +# CONFIG_USB_GADGET_DEBUG_FILES is not set +# CONFIG_USB_GADGET_DEBUG_FS is not set +CONFIG_USB_GADGET_VBUS_DRAW=2 +CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 +CONFIG_U_SERIAL_CONSOLE=y + +# +# USB Peripheral Controller +# +CONFIG_USB_FOTG210_UDC=m +CONFIG_USB_GR_UDC=m +CONFIG_USB_R8A66597=m +CONFIG_USB_PXA27X=m +CONFIG_USB_MV_UDC=m +CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m +CONFIG_USB_SNP_UDC_PLAT=m +CONFIG_USB_M66592=m +CONFIG_USB_BDC_UDC=m + +# +# Platform Support +# +CONFIG_USB_BDC_PCI=m +CONFIG_USB_AMD5536UDC=m +CONFIG_USB_NET2272=m +CONFIG_USB_NET2272_DMA=y +CONFIG_USB_NET2280=m +CONFIG_USB_GOKU=m +CONFIG_USB_EG20T=m +CONFIG_USB_GADGET_XILINX=m +CONFIG_USB_MAX3420_UDC=m +CONFIG_USB_DUMMY_HCD=m +# end of USB Peripheral Controller + +CONFIG_USB_LIBCOMPOSITE=m +CONFIG_USB_F_ACM=m +CONFIG_USB_F_SS_LB=m +CONFIG_USB_U_SERIAL=m +CONFIG_USB_U_ETHER=m +CONFIG_USB_U_AUDIO=m +CONFIG_USB_F_SERIAL=m +CONFIG_USB_F_OBEX=m +CONFIG_USB_F_NCM=m +CONFIG_USB_F_ECM=m +CONFIG_USB_F_PHONET=m +CONFIG_USB_F_EEM=m +CONFIG_USB_F_SUBSET=m +CONFIG_USB_F_RNDIS=m +CONFIG_USB_F_MASS_STORAGE=m +CONFIG_USB_F_FS=m +CONFIG_USB_F_UAC1=m +CONFIG_USB_F_UAC1_LEGACY=m +CONFIG_USB_F_UAC2=m +CONFIG_USB_F_UVC=m +CONFIG_USB_F_MIDI=m +CONFIG_USB_F_HID=m +CONFIG_USB_F_PRINTER=m +CONFIG_USB_F_TCM=m +CONFIG_USB_CONFIGFS=m 
+CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_OBEX=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_ECM_SUBSET=y +CONFIG_USB_CONFIGFS_RNDIS=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_PHONET=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_LB_SS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_UAC1=y +CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_USB_CONFIGFS_F_PRINTER=y +CONFIG_USB_CONFIGFS_F_TCM=y + +# +# USB Gadget precomposed configurations +# +CONFIG_USB_ZERO=m +CONFIG_USB_AUDIO=m +# CONFIG_GADGET_UAC1 is not set +CONFIG_USB_ETH=m +CONFIG_USB_ETH_RNDIS=y +CONFIG_USB_ETH_EEM=y +CONFIG_USB_G_NCM=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FUNCTIONFS=m +CONFIG_USB_FUNCTIONFS_ETH=y +CONFIG_USB_FUNCTIONFS_RNDIS=y +CONFIG_USB_FUNCTIONFS_GENERIC=y +CONFIG_USB_MASS_STORAGE=m +CONFIG_USB_GADGET_TARGET=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_MIDI_GADGET=m +CONFIG_USB_G_PRINTER=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_USB_G_NOKIA=m +CONFIG_USB_G_ACM_MS=m +CONFIG_USB_G_MULTI=m +CONFIG_USB_G_MULTI_RNDIS=y +CONFIG_USB_G_MULTI_CDC=y +CONFIG_USB_G_HID=m +CONFIG_USB_G_DBGP=m +# CONFIG_USB_G_DBGP_PRINTK is not set +CONFIG_USB_G_DBGP_SERIAL=y +CONFIG_USB_G_WEBCAM=m +CONFIG_USB_RAW_GADGET=m +# end of USB Gadget precomposed configurations + +CONFIG_TYPEC=m +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_RT1711H=m +CONFIG_TYPEC_FUSB302=m +CONFIG_TYPEC_WCOVE=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_CCG=m +CONFIG_UCSI_ACPI=m +CONFIG_TYPEC_HD3SS3220=m +CONFIG_TYPEC_TPS6598X=m + +# +# USB Type-C Multiplexer/DeMultiplexer Switch support +# +CONFIG_TYPEC_MUX_PI3USB30532=m +CONFIG_TYPEC_MUX_INTEL_PMC=m +# end of USB Type-C Multiplexer/DeMultiplexer Switch support + +# +# USB Type-C Alternate Mode drivers +# +CONFIG_TYPEC_DP_ALTMODE=m +CONFIG_TYPEC_NVIDIA_ALTMODE=m +# end of USB Type-C Alternate Mode drivers + +CONFIG_USB_ROLE_SWITCH=m +CONFIG_USB_ROLES_INTEL_XHCI=m +CONFIG_MMC=m +CONFIG_PWRSEQ_EMMC=m +CONFIG_PWRSEQ_SD8787=m +CONFIG_PWRSEQ_SIMPLE=m +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_MINORS=8 +CONFIG_SDIO_UART=m +CONFIG_MMC_TEST=m + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_IO_ACCESSORS=y +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=y +CONFIG_MMC_SDHCI_ACPI=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_SDHCI_OF_ARASAN=m +CONFIG_MMC_SDHCI_OF_ASPEED=m +CONFIG_MMC_SDHCI_OF_AT91=m +CONFIG_MMC_SDHCI_OF_DWCMSHC=m +CONFIG_MMC_SDHCI_CADENCE=m +CONFIG_MMC_SDHCI_F_SDH30=m +CONFIG_MMC_SDHCI_MILBEAUT=m +CONFIG_MMC_WBSD=m +CONFIG_MMC_ALCOR=m +CONFIG_MMC_TIFM_SD=m +CONFIG_MMC_SPI=m +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MMC_VUB300=m +CONFIG_MMC_USHC=m +CONFIG_MMC_USDHI6ROL0=m +CONFIG_MMC_REALTEK_PCI=m +CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_CQHCI=m +CONFIG_MMC_HSQ=m +CONFIG_MMC_TOSHIBA_PCI=m +CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m +CONFIG_MMC_SDHCI_OMAP=m +CONFIG_MMC_SDHCI_AM654=m +CONFIG_MMC_SDHCI_EXTERNAL_DMA=y +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m +CONFIG_MS_BLOCK=m + +# +# MemoryStick Host Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_MEMSTICK_R592=m +CONFIG_MEMSTICK_REALTEK_PCI=m +CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_CLASS_FLASH=m 
+CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y + +# +# LED drivers +# +CONFIG_LEDS_88PM860X=m +CONFIG_LEDS_AAT1290=m +CONFIG_LEDS_AN30259A=m +CONFIG_LEDS_APU=m +CONFIG_LEDS_AS3645A=m +CONFIG_LEDS_BCM6328=m +CONFIG_LEDS_BCM6358=m +CONFIG_LEDS_CPCAP=m +CONFIG_LEDS_CR0014114=m +CONFIG_LEDS_EL15203000=m +CONFIG_LEDS_LM3530=m +CONFIG_LEDS_LM3532=m +CONFIG_LEDS_LM3533=m +CONFIG_LEDS_LM3642=m +CONFIG_LEDS_LM3692X=m +CONFIG_LEDS_LM3601X=m +CONFIG_LEDS_MT6323=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_PCA9532_GPIO=y +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_LP3952=m +# CONFIG_LEDS_LP5521 is not set +# CONFIG_LEDS_LP5523 is not set +# CONFIG_LEDS_LP5562 is not set +# CONFIG_LEDS_LP8501 is not set +CONFIG_LEDS_LP8788=m +CONFIG_LEDS_LP8860=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +CONFIG_LEDS_PCA955X_GPIO=y +CONFIG_LEDS_PCA963X=m +CONFIG_LEDS_WM831X_STATUS=m +CONFIG_LEDS_WM8350=m +CONFIG_LEDS_DA903X=m +CONFIG_LEDS_DA9052=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_PWM=m +CONFIG_LEDS_REGULATOR=m +CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m +CONFIG_LEDS_ADP5520=m +CONFIG_LEDS_MC13783=m +CONFIG_LEDS_TCA6507=m +CONFIG_LEDS_TLC591XX=m +CONFIG_LEDS_MAX77650=m +CONFIG_LEDS_MAX77693=m +CONFIG_LEDS_MAX8997=m +CONFIG_LEDS_LM355x=m +CONFIG_LEDS_MENF21BMC=m +CONFIG_LEDS_KTD2692=m +CONFIG_LEDS_IS31FL319X=m +CONFIG_LEDS_IS31FL32XX=m + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +CONFIG_LEDS_BLINKM=m +CONFIG_LEDS_SYSCON=y +CONFIG_LEDS_MLXCPLD=m +CONFIG_LEDS_MLXREG=m +CONFIG_LEDS_USER=m +CONFIG_LEDS_NIC78BX=m +CONFIG_LEDS_SPI_BYTE=m +CONFIG_LEDS_TI_LMU_COMMON=m +CONFIG_LEDS_LM3697=m +CONFIG_LEDS_LM36274=m +CONFIG_LEDS_TPS6105X=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_ONESHOT=m +CONFIG_LEDS_TRIGGER_DISK=y +CONFIG_LEDS_TRIGGER_MTD=y +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_ACTIVITY=m +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=m +CONFIG_LEDS_TRIGGER_CAMERA=m +CONFIG_LEDS_TRIGGER_PANIC=y +CONFIG_LEDS_TRIGGER_NETDEV=m +CONFIG_LEDS_TRIGGER_PATTERN=m +CONFIG_LEDS_TRIGGER_AUDIO=m +CONFIG_ACCESSIBILITY=y +CONFIG_A11Y_BRAILLE_CONSOLE=y +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_QIB=m +CONFIG_INFINIBAND_QIB_DCA=y +CONFIG_INFINIBAND_CXGB4=m +CONFIG_INFINIBAND_EFA=m +CONFIG_INFINIBAND_I40IW=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_INFINIBAND_OCRDMA=m +CONFIG_INFINIBAND_VMWARE_PVRDMA=m +CONFIG_INFINIBAND_USNIC=m +CONFIG_INFINIBAND_BNXT_RE=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +# CONFIG_SDMA_VERBOSITY is not set +CONFIG_INFINIBAND_QEDR=m +CONFIG_INFINIBAND_RDMAVT=m +CONFIG_RDMA_RXE=m +CONFIG_RDMA_SIW=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_SRPT=m +CONFIG_INFINIBAND_ISER=m +CONFIG_INFINIBAND_ISERT=m +CONFIG_INFINIBAND_OPA_VNIC=m +CONFIG_EDAC_ATOMIC_SCRUB=y +CONFIG_EDAC_SUPPORT=y +CONFIG_EDAC=y 
+CONFIG_EDAC_LEGACY_SYSFS=y +# CONFIG_EDAC_DEBUG is not set +CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_GHES=y +CONFIG_EDAC_AMD64=m +# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set +CONFIG_EDAC_E752X=m +CONFIG_EDAC_I82975X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m +CONFIG_EDAC_X38=m +CONFIG_EDAC_I5400=m +CONFIG_EDAC_I7CORE=m +CONFIG_EDAC_I5000=m +CONFIG_EDAC_I5100=m +CONFIG_EDAC_I7300=m +CONFIG_EDAC_SBRIDGE=m +CONFIG_EDAC_SKX=m +CONFIG_EDAC_I10NM=m +CONFIG_EDAC_PND2=m +CONFIG_RTC_LIB=y +CONFIG_RTC_MC146818_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y +CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_88PM860X=m +CONFIG_RTC_DRV_88PM80X=m +CONFIG_RTC_DRV_ABB5ZES3=m +CONFIG_RTC_DRV_ABEOZ9=m +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_AS3722=m +CONFIG_RTC_DRV_DS1307=m +CONFIG_RTC_DRV_DS1307_CENTURY=y +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1374_WDT=y +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_HYM8563=m +CONFIG_RTC_DRV_LP8788=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_MAX8907=m +CONFIG_RTC_DRV_MAX8925=m +CONFIG_RTC_DRV_MAX8998=m +CONFIG_RTC_DRV_MAX8997=m +CONFIG_RTC_DRV_MAX77686=m +CONFIG_RTC_DRV_RK808=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_ISL12026=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8523=m +CONFIG_RTC_DRV_PCF85063=m +CONFIG_RTC_DRV_PCF85363=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BD70528=m +CONFIG_RTC_DRV_BQ32K=m +CONFIG_RTC_DRV_TWL4030=m +CONFIG_RTC_DRV_PALMAS=m +CONFIG_RTC_DRV_TPS6586X=m +CONFIG_RTC_DRV_TPS65910=m +CONFIG_RTC_DRV_TPS80031=m +CONFIG_RTC_DRV_RC5T583=m +CONFIG_RTC_DRV_RC5T619=m +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_RX8025=m +CONFIG_RTC_DRV_EM3027=m +CONFIG_RTC_DRV_RV3028=m +CONFIG_RTC_DRV_RV8803=m +CONFIG_RTC_DRV_S5M=m +CONFIG_RTC_DRV_SD3078=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T93=m +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m +CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1343=m +CONFIG_RTC_DRV_DS1347=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6916=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RX4581=m +CONFIG_RTC_DRV_RX6110=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_PCF2123=m +CONFIG_RTC_DRV_MCP795=m +CONFIG_RTC_I2C_AND_SPI=y + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=y +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1685_FAMILY=m +CONFIG_RTC_DRV_DS1685=y +# CONFIG_RTC_DRV_DS1689 is not set +# CONFIG_RTC_DRV_DS17285 is not set +# CONFIG_RTC_DRV_DS17485 is not set +# CONFIG_RTC_DRV_DS17885 is not set +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_DS2404=m +CONFIG_RTC_DRV_DA9052=m +CONFIG_RTC_DRV_DA9055=m +CONFIG_RTC_DRV_DA9063=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_WM831X=m +CONFIG_RTC_DRV_WM8350=m 
+CONFIG_RTC_DRV_PCF50633=m +CONFIG_RTC_DRV_AB3100=m +CONFIG_RTC_DRV_ZYNQMP=m +CONFIG_RTC_DRV_CROS_EC=m + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_CADENCE=m +CONFIG_RTC_DRV_FTRTC010=m +CONFIG_RTC_DRV_PCAP=m +CONFIG_RTC_DRV_MC13XXX=m +CONFIG_RTC_DRV_MT6397=m +CONFIG_RTC_DRV_R7301=m +CONFIG_RTC_DRV_CPCAP=m + +# +# HID Sensor RTC drivers +# +CONFIG_RTC_DRV_HID_SENSOR_TIME=m +CONFIG_RTC_DRV_WILCO_EC=m +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_ACPI=y +CONFIG_DMA_OF=y +CONFIG_ALTERA_MSGDMA=m +CONFIG_DW_AXI_DMAC=m +CONFIG_FSL_EDMA=m +CONFIG_INTEL_IDMA64=m +CONFIG_INTEL_IDXD=m +CONFIG_INTEL_IOATDMA=m +CONFIG_INTEL_MIC_X100_DMA=m +CONFIG_PLX_DMA=m +CONFIG_QCOM_HIDMA_MGMT=m +CONFIG_QCOM_HIDMA=m +CONFIG_DW_DMAC_CORE=y +CONFIG_DW_DMAC=y +CONFIG_DW_DMAC_PCI=y +CONFIG_DW_EDMA=m +CONFIG_DW_EDMA_PCIE=m +CONFIG_HSU_DMA=y +CONFIG_SF_PDMA=m + +# +# DMA Clients +# +CONFIG_ASYNC_TX_DMA=y +# CONFIG_DMATEST is not set +CONFIG_DMA_ENGINE_RAID=y + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# CONFIG_SW_SYNC is not set +CONFIG_UDMABUF=y +# CONFIG_DMABUF_MOVE_NOTIFY is not set +# CONFIG_DMABUF_SELFTESTS is not set +CONFIG_DMABUF_HEAPS=y +CONFIG_DMABUF_HEAPS_SYSTEM=y +# end of DMABUF options + +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_HD44780=m +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_IMG_ASCII_LCD=m +CONFIG_HT16K33=m +CONFIG_PARPORT_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set +# CONFIG_CHARLCD_BL_OFF is not set +# CONFIG_CHARLCD_BL_ON is not set +CONFIG_CHARLCD_BL_FLASH=y +CONFIG_PANEL=m +CONFIG_CHARLCD=m +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_UIO_DMEM_GENIRQ=m +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +CONFIG_UIO_PCI_GENERIC=m +CONFIG_UIO_NETX=m +CONFIG_UIO_PRUSS=m +CONFIG_UIO_MF624=m +CONFIG_UIO_HV_GENERIC=m +CONFIG_VFIO_IOMMU_TYPE1=m +CONFIG_VFIO_VIRQFD=m +CONFIG_VFIO=m +# CONFIG_VFIO_NOIOMMU is not set +CONFIG_VFIO_PCI=m +CONFIG_VFIO_PCI_VGA=y +CONFIG_VFIO_PCI_MMAP=y +CONFIG_VFIO_PCI_INTX=y +CONFIG_VFIO_PCI_IGD=y +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VBOXGUEST=m +CONFIG_VIRTIO=y +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_VDPA=m +CONFIG_VIRTIO_PMEM=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=m +CONFIG_VIRTIO_MMIO=m +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_IFCVF=m +CONFIG_VHOST_IOTLB=m +CONFIG_VHOST_RING=m +CONFIG_VHOST_DPN=y +CONFIG_VHOST=m +CONFIG_VHOST_MENU=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_SCSI=m +CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set + +# +# Microsoft Hyper-V guest support +# +CONFIG_HYPERV=m +CONFIG_HYPERV_TIMER=y +CONFIG_HYPERV_UTILS=m +CONFIG_HYPERV_BALLOON=m +# end of Microsoft Hyper-V guest support + +# +# Xen driver support +# +CONFIG_XEN_BALLOON=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 +CONFIG_XEN_SCRUB_PAGES_DEFAULT=y +CONFIG_XEN_DEV_EVTCHN=m +CONFIG_XEN_BACKEND=y +CONFIG_XENFS=m +CONFIG_XEN_COMPAT_XENFS=y +CONFIG_XEN_SYS_HYPERVISOR=y +CONFIG_XEN_XENBUS_FRONTEND=y +CONFIG_XEN_GNTDEV=m +CONFIG_XEN_GNTDEV_DMABUF=y +CONFIG_XEN_GRANT_DEV_ALLOC=m +CONFIG_XEN_GRANT_DMA_ALLOC=y +CONFIG_SWIOTLB_XEN=y +CONFIG_XEN_PCIDEV_BACKEND=m +CONFIG_XEN_PVCALLS_FRONTEND=m +CONFIG_XEN_PVCALLS_BACKEND=y 
+CONFIG_XEN_SCSI_BACKEND=m +CONFIG_XEN_PRIVCMD=m +CONFIG_XEN_ACPI_PROCESSOR=m +CONFIG_XEN_MCE_LOG=y +CONFIG_XEN_HAVE_PVMMU=y +CONFIG_XEN_EFI=y +CONFIG_XEN_AUTO_XLATE=y +CONFIG_XEN_ACPI=y +CONFIG_XEN_SYMS=y +CONFIG_XEN_HAVE_VPMU=y +CONFIG_XEN_FRONT_PGDIR_SHBUF=m +# end of Xen driver support + +# CONFIG_GREYBUS is not set +CONFIG_STAGING=y +CONFIG_PRISM2_USB=m +CONFIG_COMEDI=m +# CONFIG_COMEDI_DEBUG is not set +CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 +CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 +CONFIG_COMEDI_MISC_DRIVERS=y +CONFIG_COMEDI_BOND=m +CONFIG_COMEDI_TEST=m +CONFIG_COMEDI_PARPORT=m +# CONFIG_COMEDI_ISA_DRIVERS is not set +CONFIG_COMEDI_PCI_DRIVERS=m +CONFIG_COMEDI_8255_PCI=m +CONFIG_COMEDI_ADDI_WATCHDOG=m +CONFIG_COMEDI_ADDI_APCI_1032=m +CONFIG_COMEDI_ADDI_APCI_1500=m +CONFIG_COMEDI_ADDI_APCI_1516=m +CONFIG_COMEDI_ADDI_APCI_1564=m +CONFIG_COMEDI_ADDI_APCI_16XX=m +CONFIG_COMEDI_ADDI_APCI_2032=m +CONFIG_COMEDI_ADDI_APCI_2200=m +CONFIG_COMEDI_ADDI_APCI_3120=m +CONFIG_COMEDI_ADDI_APCI_3501=m +CONFIG_COMEDI_ADDI_APCI_3XXX=m +CONFIG_COMEDI_ADL_PCI6208=m +CONFIG_COMEDI_ADL_PCI7X3X=m +CONFIG_COMEDI_ADL_PCI8164=m +CONFIG_COMEDI_ADL_PCI9111=m +CONFIG_COMEDI_ADL_PCI9118=m +CONFIG_COMEDI_ADV_PCI1710=m +CONFIG_COMEDI_ADV_PCI1720=m +CONFIG_COMEDI_ADV_PCI1723=m +CONFIG_COMEDI_ADV_PCI1724=m +CONFIG_COMEDI_ADV_PCI1760=m +CONFIG_COMEDI_ADV_PCI_DIO=m +CONFIG_COMEDI_AMPLC_DIO200_PCI=m +CONFIG_COMEDI_AMPLC_PC236_PCI=m +CONFIG_COMEDI_AMPLC_PC263_PCI=m +CONFIG_COMEDI_AMPLC_PCI224=m +CONFIG_COMEDI_AMPLC_PCI230=m +CONFIG_COMEDI_CONTEC_PCI_DIO=m +CONFIG_COMEDI_DAS08_PCI=m +CONFIG_COMEDI_DT3000=m +CONFIG_COMEDI_DYNA_PCI10XX=m +CONFIG_COMEDI_GSC_HPDI=m +CONFIG_COMEDI_MF6X4=m +CONFIG_COMEDI_ICP_MULTI=m +CONFIG_COMEDI_DAQBOARD2000=m +CONFIG_COMEDI_JR3_PCI=m +CONFIG_COMEDI_KE_COUNTER=m +CONFIG_COMEDI_CB_PCIDAS64=m +CONFIG_COMEDI_CB_PCIDAS=m +CONFIG_COMEDI_CB_PCIDDA=m +CONFIG_COMEDI_CB_PCIMDAS=m +CONFIG_COMEDI_CB_PCIMDDA=m +CONFIG_COMEDI_ME4000=m +CONFIG_COMEDI_ME_DAQ=m +CONFIG_COMEDI_NI_6527=m +CONFIG_COMEDI_NI_65XX=m +CONFIG_COMEDI_NI_660X=m +CONFIG_COMEDI_NI_670X=m +CONFIG_COMEDI_NI_LABPC_PCI=m +CONFIG_COMEDI_NI_PCIDIO=m +CONFIG_COMEDI_NI_PCIMIO=m +CONFIG_COMEDI_RTD520=m +CONFIG_COMEDI_S626=m +CONFIG_COMEDI_MITE=m +CONFIG_COMEDI_NI_TIOCMD=m +CONFIG_COMEDI_PCMCIA_DRIVERS=m +CONFIG_COMEDI_CB_DAS16_CS=m +CONFIG_COMEDI_DAS08_CS=m +CONFIG_COMEDI_NI_DAQ_700_CS=m +CONFIG_COMEDI_NI_DAQ_DIO24_CS=m +CONFIG_COMEDI_NI_LABPC_CS=m +CONFIG_COMEDI_NI_MIO_CS=m +CONFIG_COMEDI_QUATECH_DAQP_CS=m +CONFIG_COMEDI_USB_DRIVERS=m +CONFIG_COMEDI_DT9812=m +CONFIG_COMEDI_NI_USB6501=m +CONFIG_COMEDI_USBDUX=m +CONFIG_COMEDI_USBDUXFAST=m +CONFIG_COMEDI_USBDUXSIGMA=m +CONFIG_COMEDI_VMK80XX=m +CONFIG_COMEDI_8254=m +CONFIG_COMEDI_8255=m +CONFIG_COMEDI_8255_SA=m +CONFIG_COMEDI_KCOMEDILIB=m +CONFIG_COMEDI_AMPLC_DIO200=m +CONFIG_COMEDI_AMPLC_PC236=m +CONFIG_COMEDI_DAS08=m +CONFIG_COMEDI_NI_LABPC=m +CONFIG_COMEDI_NI_TIO=m +CONFIG_COMEDI_NI_ROUTING=m +CONFIG_RTL8192U=m +CONFIG_RTLLIB=m +CONFIG_RTLLIB_CRYPTO_CCMP=m +CONFIG_RTLLIB_CRYPTO_TKIP=m +CONFIG_RTLLIB_CRYPTO_WEP=m +CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +CONFIG_88EU_AP_MODE=y +CONFIG_RTS5208=m +CONFIG_VT6655=m +CONFIG_VT6656=m + +# +# IIO staging drivers +# + +# +# Accelerometers +# +CONFIG_ADIS16203=m +CONFIG_ADIS16240=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD7816=m +CONFIG_AD7280=m +# end of Analog to digital converters + +# +# Analog digital bi-direction converters +# +CONFIG_ADT7316=m +CONFIG_ADT7316_SPI=m 
+CONFIG_ADT7316_I2C=m +# end of Analog digital bi-direction converters + +# +# Capacitance to digital converters +# +CONFIG_AD7150=m +CONFIG_AD7746=m +# end of Capacitance to digital converters + +# +# Direct Digital Synthesis +# +CONFIG_AD9832=m +CONFIG_AD9834=m +# end of Direct Digital Synthesis + +# +# Network Analyzer, Impedance Converters +# +CONFIG_AD5933=m +# end of Network Analyzer, Impedance Converters + +# +# Active energy metering IC +# +CONFIG_ADE7854=m +CONFIG_ADE7854_I2C=m +CONFIG_ADE7854_SPI=m +# end of Active energy metering IC + +# +# Resolver to digital converters +# +CONFIG_AD2S1210=m +# end of Resolver to digital converters +# end of IIO staging drivers + +# CONFIG_FB_SM750 is not set + +# +# Speakup console speech +# +CONFIG_SPEAKUP=m +CONFIG_SPEAKUP_SYNTH_ACNTSA=m +CONFIG_SPEAKUP_SYNTH_APOLLO=m +CONFIG_SPEAKUP_SYNTH_AUDPTR=m +CONFIG_SPEAKUP_SYNTH_BNS=m +CONFIG_SPEAKUP_SYNTH_DECTLK=m +CONFIG_SPEAKUP_SYNTH_DECEXT=m +CONFIG_SPEAKUP_SYNTH_LTLK=m +CONFIG_SPEAKUP_SYNTH_SOFT=m +CONFIG_SPEAKUP_SYNTH_SPKOUT=m +CONFIG_SPEAKUP_SYNTH_TXPRT=m +CONFIG_SPEAKUP_SYNTH_DUMMY=m +# end of Speakup console speech + +CONFIG_STAGING_MEDIA=y +CONFIG_VIDEO_IPU3_IMGU=m + +# +# soc_camera sensor drivers +# +CONFIG_VIDEO_USBVISION=m + +# +# Android +# +# end of Android + +CONFIG_STAGING_BOARD=y +CONFIG_LTE_GDM724X=m +CONFIG_FIREWIRE_SERIAL=m +CONFIG_FWTTY_MAX_TOTAL_PORTS=64 +CONFIG_FWTTY_MAX_CARD_PORTS=32 +CONFIG_GS_FPGABOOT=m +CONFIG_UNISYSSPAR=y +CONFIG_UNISYS_VISORNIC=m +CONFIG_UNISYS_VISORINPUT=m +CONFIG_UNISYS_VISORHBA=m +CONFIG_COMMON_CLK_XLNX_CLKWZRD=m +# CONFIG_FB_TFT is not set +CONFIG_WILC1000=m +CONFIG_WILC1000_SDIO=m +CONFIG_WILC1000_SPI=m +# CONFIG_WILC1000_HW_OOB_INTR is not set +CONFIG_MOST_COMPONENTS=m +CONFIG_MOST_CDEV=m +CONFIG_MOST_NET=m +CONFIG_MOST_SOUND=m +CONFIG_MOST_VIDEO=m +CONFIG_MOST_DIM2=m +CONFIG_MOST_I2C=m +CONFIG_MOST_USB=m +CONFIG_KS7010=m +CONFIG_PI433=m + +# +# Gasket devices +# +CONFIG_STAGING_GASKET_FRAMEWORK=m +CONFIG_STAGING_APEX_DRIVER=m +# end of Gasket devices + +CONFIG_XIL_AXIS_FIFO=m +CONFIG_FIELDBUS_DEV=m +CONFIG_HMS_ANYBUSS_BUS=m +CONFIG_ARCX_ANYBUS_CONTROLLER=m +CONFIG_HMS_PROFINET=m +CONFIG_KPC2000=y +CONFIG_KPC2000_CORE=m +CONFIG_KPC2000_SPI=m +CONFIG_KPC2000_I2C=m +CONFIG_KPC2000_DMA=m +CONFIG_QLGE=m +CONFIG_WFX=m +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACPI_WMI=m +CONFIG_WMI_BMOF=m +CONFIG_ALIENWARE_WMI=m +CONFIG_HUAWEI_WMI=m +CONFIG_INTEL_WMI_THUNDERBOLT=m +CONFIG_MXM_WMI=m +CONFIG_PEAQ_WMI=m +CONFIG_XIAOMI_WMI=m +CONFIG_ACERHDF=m +CONFIG_ACER_WIRELESS=m +CONFIG_ACER_WMI=m +CONFIG_APPLE_GMUX=m +CONFIG_ASUS_LAPTOP=m +CONFIG_ASUS_WIRELESS=m +CONFIG_ASUS_WMI=m +CONFIG_ASUS_NB_WMI=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_EEEPC_WMI=m +CONFIG_DCDBAS=m +CONFIG_DELL_SMBIOS=m +CONFIG_DELL_SMBIOS_WMI=y +CONFIG_DELL_SMBIOS_SMM=y +CONFIG_DELL_LAPTOP=m +CONFIG_DELL_RBTN=m +# CONFIG_DELL_RBU is not set +CONFIG_DELL_SMO8800=m +CONFIG_DELL_WMI=m +CONFIG_DELL_WMI_DESCRIPTOR=m +CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m +CONFIG_AMILO_RFKILL=m +CONFIG_FUJITSU_LAPTOP=m +CONFIG_FUJITSU_TABLET=m +CONFIG_GPD_POCKET_FAN=m +CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m +CONFIG_HP_WMI=m +CONFIG_IBM_RTL=m +CONFIG_IDEAPAD_LAPTOP=m +CONFIG_SENSORS_HDAPS=m +CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +CONFIG_INTEL_ATOMISP2_PM=m +CONFIG_INTEL_CHT_INT33FE=m 
+CONFIG_INTEL_HID_EVENT=m +CONFIG_INTEL_INT0002_VGPIO=m +CONFIG_INTEL_MENLOW=m +CONFIG_INTEL_OAKTRAIL=m +CONFIG_INTEL_VBTN=m +CONFIG_SURFACE3_WMI=m +CONFIG_SURFACE_3_BUTTON=m +CONFIG_SURFACE_3_POWER_OPREGION=m +CONFIG_SURFACE_PRO3_BUTTON=m +CONFIG_MSI_LAPTOP=m +CONFIG_MSI_WMI=m +CONFIG_PCENGINES_APU2=m +CONFIG_SAMSUNG_LAPTOP=m +CONFIG_SAMSUNG_Q10=m +CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_TOSHIBA_HAPS=m +CONFIG_TOSHIBA_WMI=m +CONFIG_ACPI_CMPC=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_LG_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_SONY_LAPTOP=m +CONFIG_SONYPI_COMPAT=y +CONFIG_SYSTEM76_ACPI=m +CONFIG_TOPSTAR_LAPTOP=m +CONFIG_I2C_MULTI_INSTANTIATE=m +CONFIG_MLX_PLATFORM=m +CONFIG_TOUCHSCREEN_DMI=y +CONFIG_INTEL_IPS=m +CONFIG_INTEL_RST=m +CONFIG_INTEL_SMARTCONNECT=m + +# +# Intel Speed Select Technology interface support +# +CONFIG_INTEL_SPEED_SELECT_INTERFACE=m +# end of Intel Speed Select Technology interface support + +CONFIG_INTEL_TURBO_MAX_3=y +CONFIG_INTEL_UNCORE_FREQ_CONTROL=m +CONFIG_INTEL_BXTWC_PMIC_TMU=m +CONFIG_INTEL_CHTDC_TI_PWRBTN=m +CONFIG_INTEL_PMC_CORE=y +CONFIG_INTEL_PMC_IPC=m +CONFIG_INTEL_PUNIT_IPC=m +CONFIG_INTEL_TELEMETRY=m +CONFIG_PMC_ATOM=y +CONFIG_MFD_CROS_EC=m +CONFIG_CHROME_PLATFORMS=y +CONFIG_CHROMEOS_LAPTOP=m +CONFIG_CHROMEOS_PSTORE=m +CONFIG_CHROMEOS_TBMC=m +CONFIG_CROS_EC=m +CONFIG_CROS_EC_I2C=m +CONFIG_CROS_EC_RPMSG=m +CONFIG_CROS_EC_ISHTP=m +CONFIG_CROS_EC_SPI=m +CONFIG_CROS_EC_LPC=m +CONFIG_CROS_EC_PROTO=y +CONFIG_CROS_KBD_LED_BACKLIGHT=m +CONFIG_CROS_EC_CHARDEV=m +CONFIG_CROS_EC_LIGHTBAR=m +CONFIG_CROS_EC_VBC=m +CONFIG_CROS_EC_DEBUGFS=m +CONFIG_CROS_EC_SENSORHUB=m +CONFIG_CROS_EC_SYSFS=m +CONFIG_CROS_EC_TYPEC=m +CONFIG_CROS_USBPD_LOGGER=m +CONFIG_CROS_USBPD_NOTIFY=m +CONFIG_WILCO_EC=m +CONFIG_WILCO_EC_DEBUGFS=m +CONFIG_WILCO_EC_EVENTS=m +CONFIG_WILCO_EC_TELEMETRY=m +CONFIG_MELLANOX_PLATFORM=y +CONFIG_MLXREG_HOTPLUG=m +CONFIG_MLXREG_IO=m +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y + +# +# Common Clock Framework +# +CONFIG_COMMON_CLK_WM831X=m +CONFIG_CLK_HSDK=y +CONFIG_COMMON_CLK_MAX77686=m +CONFIG_COMMON_CLK_MAX9485=m +CONFIG_COMMON_CLK_RK808=m +CONFIG_COMMON_CLK_SI5341=m +CONFIG_COMMON_CLK_SI5351=m +CONFIG_COMMON_CLK_SI514=m +CONFIG_COMMON_CLK_SI544=m +CONFIG_COMMON_CLK_SI570=m +CONFIG_COMMON_CLK_CDCE706=m +CONFIG_COMMON_CLK_CDCE925=m +CONFIG_COMMON_CLK_CS2000_CP=m +CONFIG_COMMON_CLK_S2MPS11=m +CONFIG_CLK_TWL6040=m +CONFIG_COMMON_CLK_LOCHNAGAR=m +CONFIG_COMMON_CLK_PALMAS=m +CONFIG_COMMON_CLK_PWM=m +CONFIG_COMMON_CLK_VC5=m +CONFIG_COMMON_CLK_BD718XX=m +CONFIG_COMMON_CLK_FIXED_MMIO=y +# end of Common Clock Framework + +CONFIG_HWSPINLOCK=y + +# +# Clock Source drivers +# +CONFIG_TIMER_OF=y +CONFIG_TIMER_PROBE=y +CONFIG_CLKEVT_I8253=y +CONFIG_I8253_LOCK=y +CONFIG_CLKBLD_I8253=y +CONFIG_CLKSRC_MMIO=y +CONFIG_MICROCHIP_PIT64B=y +# end of Clock Source drivers + +CONFIG_MAILBOX=y +CONFIG_PLATFORM_MHU=m +CONFIG_PCC=y +CONFIG_ALTERA_MBOX=m +CONFIG_MAILBOX_TEST=m +CONFIG_IOMMU_IOVA=y +CONFIG_IOASID=y +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +# end of Generic IOMMU Pagetable Support + +# CONFIG_IOMMU_DEBUGFS is not set +# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +CONFIG_OF_IOMMU=y +CONFIG_IOMMU_DMA=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_V2=y +CONFIG_DMAR_TABLE=y +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU_FLOPPY_WA=y +# CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set +CONFIG_IRQ_REMAP=y +CONFIG_HYPERV_IOMMU=y + 
+# +# Remoteproc drivers +# +CONFIG_REMOTEPROC=y +# end of Remoteproc drivers + +# +# Rpmsg drivers +# +CONFIG_RPMSG=m +CONFIG_RPMSG_CHAR=m +CONFIG_RPMSG_QCOM_GLINK_NATIVE=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m +CONFIG_RPMSG_VIRTIO=m +# end of Rpmsg drivers + +CONFIG_SOUNDWIRE=m + +# +# SoundWire Devices +# +CONFIG_SOUNDWIRE_CADENCE=m +CONFIG_SOUNDWIRE_INTEL=m +CONFIG_SOUNDWIRE_QCOM=m + +# +# SOC (System On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# +# end of Amlogic SoC drivers + +# +# Aspeed SoC drivers +# +# end of Aspeed SoC drivers + +# +# Broadcom SoC drivers +# +# end of Broadcom SoC drivers + +# +# NXP/Freescale QorIQ SoC drivers +# +# end of NXP/Freescale QorIQ SoC drivers + +# +# i.MX SoC drivers +# +# end of i.MX SoC drivers + +# +# Qualcomm SoC drivers +# +# end of Qualcomm SoC drivers + +CONFIG_SOC_TI=y + +# +# Xilinx SoC drivers +# +CONFIG_XILINX_VCU=m +# end of Xilinx SoC drivers +# end of SOC (System On Chip) specific Drivers + +CONFIG_PM_DEVFREQ=y + +# +# DEVFREQ Governors +# +CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m +CONFIG_DEVFREQ_GOV_PERFORMANCE=m +CONFIG_DEVFREQ_GOV_POWERSAVE=m +CONFIG_DEVFREQ_GOV_USERSPACE=m +CONFIG_DEVFREQ_GOV_PASSIVE=m + +# +# DEVFREQ Drivers +# +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +CONFIG_EXTCON_ADC_JACK=m +CONFIG_EXTCON_ARIZONA=m +CONFIG_EXTCON_AXP288=m +CONFIG_EXTCON_FSA9480=m +CONFIG_EXTCON_GPIO=m +CONFIG_EXTCON_INTEL_INT3496=m +CONFIG_EXTCON_INTEL_CHT_WC=m +CONFIG_EXTCON_MAX14577=m +CONFIG_EXTCON_MAX3355=m +CONFIG_EXTCON_MAX77693=m +CONFIG_EXTCON_MAX77843=m +CONFIG_EXTCON_MAX8997=m +CONFIG_EXTCON_PALMAS=m +CONFIG_EXTCON_PTN5150=m +CONFIG_EXTCON_RT8973A=m +CONFIG_EXTCON_SM5502=m +CONFIG_EXTCON_USB_GPIO=m +CONFIG_EXTCON_USBC_CROS_EC=m +CONFIG_MEMORY=y +CONFIG_IIO=m +CONFIG_IIO_BUFFER=y +CONFIG_IIO_BUFFER_CB=m +CONFIG_IIO_BUFFER_HW_CONSUMER=m +CONFIG_IIO_KFIFO_BUF=m +CONFIG_IIO_TRIGGERED_BUFFER=m +CONFIG_IIO_CONFIGFS=m +CONFIG_IIO_TRIGGER=y +CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 +CONFIG_IIO_SW_DEVICE=m +CONFIG_IIO_SW_TRIGGER=m +CONFIG_IIO_TRIGGERED_EVENT=m + +# +# Accelerometers +# +CONFIG_ADIS16201=m +CONFIG_ADIS16209=m +CONFIG_ADXL372=m +CONFIG_ADXL372_SPI=m +CONFIG_ADXL372_I2C=m +CONFIG_BMA180=m +CONFIG_BMA220=m +CONFIG_BMA400=m +CONFIG_BMA400_I2C=m +CONFIG_BMC150_ACCEL=m +CONFIG_BMC150_ACCEL_I2C=m +CONFIG_BMC150_ACCEL_SPI=m +CONFIG_DA280=m +CONFIG_DA311=m +CONFIG_DMARD06=m +CONFIG_DMARD09=m +CONFIG_DMARD10=m +CONFIG_HID_SENSOR_ACCEL_3D=m +CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m +CONFIG_IIO_ST_ACCEL_3AXIS=m +CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m +CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m +CONFIG_KXSD9=m +CONFIG_KXSD9_SPI=m +CONFIG_KXSD9_I2C=m +CONFIG_KXCJK1013=m +CONFIG_MC3230=m +CONFIG_MMA7455=m +CONFIG_MMA7455_I2C=m +CONFIG_MMA7455_SPI=m +CONFIG_MMA7660=m +CONFIG_MMA8452=m +CONFIG_MMA9551_CORE=m +CONFIG_MMA9551=m +CONFIG_MMA9553=m +CONFIG_MXC4005=m +CONFIG_MXC6255=m +CONFIG_SCA3000=m +CONFIG_STK8312=m +CONFIG_STK8BA50=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD_SIGMA_DELTA=m +CONFIG_AD7091R5=m +CONFIG_AD7124=m +CONFIG_AD7192=m +CONFIG_AD7266=m +CONFIG_AD7291=m +CONFIG_AD7292=m +CONFIG_AD7298=m +CONFIG_AD7476=m +CONFIG_AD7606=m +CONFIG_AD7606_IFACE_PARALLEL=m +CONFIG_AD7606_IFACE_SPI=m +CONFIG_AD7766=m +CONFIG_AD7768_1=m +CONFIG_AD7780=m +CONFIG_AD7791=m +CONFIG_AD7793=m +CONFIG_AD7887=m +CONFIG_AD7923=m +CONFIG_AD7949=m +CONFIG_AD799X=m +CONFIG_AXP20X_ADC=m +CONFIG_AXP288_ADC=m +CONFIG_CC10001_ADC=m +CONFIG_CPCAP_ADC=m +CONFIG_DA9150_GPADC=m +CONFIG_DLN2_ADC=m +CONFIG_ENVELOPE_DETECTOR=m 
+CONFIG_HI8435=m +CONFIG_HX711=m +CONFIG_INA2XX_ADC=m +CONFIG_LP8788_ADC=m +CONFIG_LTC2471=m +CONFIG_LTC2485=m +CONFIG_LTC2496=m +CONFIG_LTC2497=m +CONFIG_MAX1027=m +CONFIG_MAX11100=m +CONFIG_MAX1118=m +CONFIG_MAX1363=m +CONFIG_MAX9611=m +CONFIG_MCP320X=m +CONFIG_MCP3422=m +CONFIG_MCP3911=m +CONFIG_MEN_Z188_ADC=m +CONFIG_NAU7802=m +CONFIG_PALMAS_GPADC=m +CONFIG_QCOM_VADC_COMMON=m +CONFIG_QCOM_SPMI_IADC=m +CONFIG_QCOM_SPMI_VADC=m +CONFIG_QCOM_SPMI_ADC5=m +CONFIG_RN5T618_ADC=m +CONFIG_SD_ADC_MODULATOR=m +CONFIG_STMPE_ADC=m +CONFIG_TI_ADC081C=m +CONFIG_TI_ADC0832=m +CONFIG_TI_ADC084S021=m +CONFIG_TI_ADC12138=m +CONFIG_TI_ADC108S102=m +CONFIG_TI_ADC128S052=m +CONFIG_TI_ADC161S626=m +CONFIG_TI_ADS1015=m +CONFIG_TI_ADS7950=m +CONFIG_TI_ADS8344=m +CONFIG_TI_ADS8688=m +CONFIG_TI_ADS124S08=m +CONFIG_TI_AM335X_ADC=m +CONFIG_TI_TLC4541=m +CONFIG_TWL4030_MADC=m +CONFIG_TWL6030_GPADC=m +CONFIG_VF610_ADC=m +CONFIG_VIPERBOARD_ADC=m +CONFIG_XILINX_XADC=m +# end of Analog to digital converters + +# +# Analog Front Ends +# +CONFIG_IIO_RESCALE=m +# end of Analog Front Ends + +# +# Amplifiers +# +CONFIG_AD8366=m +CONFIG_HMC425=m +# end of Amplifiers + +# +# Chemical Sensors +# +CONFIG_ATLAS_PH_SENSOR=m +CONFIG_BME680=m +CONFIG_BME680_I2C=m +CONFIG_BME680_SPI=m +CONFIG_CCS811=m +CONFIG_IAQCORE=m +CONFIG_PMS7003=m +CONFIG_SENSIRION_SGP30=m +CONFIG_SPS30=m +CONFIG_VZ89X=m +# end of Chemical Sensors + +CONFIG_IIO_CROS_EC_SENSORS_CORE=m +CONFIG_IIO_CROS_EC_SENSORS=m +CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m + +# +# Hid Sensor IIO Common +# +CONFIG_HID_SENSOR_IIO_COMMON=m +CONFIG_HID_SENSOR_IIO_TRIGGER=m +# end of Hid Sensor IIO Common + +CONFIG_IIO_MS_SENSORS_I2C=m + +# +# SSP Sensor Common +# +CONFIG_IIO_SSP_SENSORS_COMMONS=m +CONFIG_IIO_SSP_SENSORHUB=m +# end of SSP Sensor Common + +CONFIG_IIO_ST_SENSORS_I2C=m +CONFIG_IIO_ST_SENSORS_SPI=m +CONFIG_IIO_ST_SENSORS_CORE=m + +# +# Digital to analog converters +# +CONFIG_AD5064=m +CONFIG_AD5360=m +CONFIG_AD5380=m +CONFIG_AD5421=m +CONFIG_AD5446=m +CONFIG_AD5449=m +CONFIG_AD5592R_BASE=m +CONFIG_AD5592R=m +CONFIG_AD5593R=m +CONFIG_AD5504=m +CONFIG_AD5624R_SPI=m +CONFIG_AD5686=m +CONFIG_AD5686_SPI=m +CONFIG_AD5696_I2C=m +CONFIG_AD5755=m +CONFIG_AD5758=m +CONFIG_AD5761=m +CONFIG_AD5764=m +CONFIG_AD5770R=m +CONFIG_AD5791=m +CONFIG_AD7303=m +CONFIG_AD8801=m +CONFIG_DPOT_DAC=m +CONFIG_DS4424=m +CONFIG_LTC1660=m +CONFIG_LTC2632=m +CONFIG_M62332=m +CONFIG_MAX517=m +CONFIG_MAX5821=m +CONFIG_MCP4725=m +CONFIG_MCP4922=m +CONFIG_TI_DAC082S085=m +CONFIG_TI_DAC5571=m +CONFIG_TI_DAC7311=m +CONFIG_TI_DAC7612=m +CONFIG_VF610_DAC=m +# end of Digital to analog converters + +# +# IIO dummy driver +# +# CONFIG_IIO_SIMPLE_DUMMY is not set +# end of IIO dummy driver + +# +# Frequency Synthesizers DDS/PLL +# + +# +# Clock Generator/Distribution +# +CONFIG_AD9523=m +# end of Clock Generator/Distribution + +# +# Phase-Locked Loop (PLL) frequency synthesizers +# +CONFIG_ADF4350=m +CONFIG_ADF4371=m +# end of Phase-Locked Loop (PLL) frequency synthesizers +# end of Frequency Synthesizers DDS/PLL + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16080=m +CONFIG_ADIS16130=m +CONFIG_ADIS16136=m +CONFIG_ADIS16260=m +CONFIG_ADXRS450=m +CONFIG_BMG160=m +CONFIG_BMG160_I2C=m +CONFIG_BMG160_SPI=m +CONFIG_FXAS21002C=m +CONFIG_FXAS21002C_I2C=m +CONFIG_FXAS21002C_SPI=m +CONFIG_HID_SENSOR_GYRO_3D=m +CONFIG_MPU3050=m +CONFIG_MPU3050_I2C=m +CONFIG_IIO_ST_GYRO_3AXIS=m +CONFIG_IIO_ST_GYRO_I2C_3AXIS=m +CONFIG_IIO_ST_GYRO_SPI_3AXIS=m +CONFIG_ITG3200=m +# end of Digital gyroscope sensors + +# +# Health Sensors +# + +# 
+# Heart Rate Monitors +# +CONFIG_AFE4403=m +CONFIG_AFE4404=m +CONFIG_MAX30100=m +CONFIG_MAX30102=m +# end of Heart Rate Monitors +# end of Health Sensors + +# +# Humidity sensors +# +CONFIG_AM2315=m +CONFIG_DHT11=m +CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m +CONFIG_HTS221=m +CONFIG_HTS221_I2C=m +CONFIG_HTS221_SPI=m +CONFIG_HTU21=m +CONFIG_SI7005=m +CONFIG_SI7020=m +# end of Humidity sensors + +# +# Inertial measurement units +# +CONFIG_ADIS16400=m +CONFIG_ADIS16460=m +CONFIG_ADIS16480=m +CONFIG_BMI160=m +CONFIG_BMI160_I2C=m +CONFIG_BMI160_SPI=m +CONFIG_FXOS8700=m +CONFIG_FXOS8700_I2C=m +CONFIG_FXOS8700_SPI=m +CONFIG_KMX61=m +CONFIG_INV_MPU6050_IIO=m +CONFIG_INV_MPU6050_I2C=m +CONFIG_INV_MPU6050_SPI=m +CONFIG_IIO_ST_LSM6DSX=m +CONFIG_IIO_ST_LSM6DSX_I2C=m +CONFIG_IIO_ST_LSM6DSX_SPI=m +CONFIG_IIO_ST_LSM6DSX_I3C=m +# end of Inertial measurement units + +CONFIG_IIO_ADIS_LIB=m +CONFIG_IIO_ADIS_LIB_BUFFER=y + +# +# Light sensors +# +CONFIG_ACPI_ALS=m +CONFIG_ADJD_S311=m +CONFIG_ADUX1020=m +CONFIG_AL3010=m +CONFIG_AL3320A=m +CONFIG_APDS9300=m +CONFIG_APDS9960=m +CONFIG_BH1750=m +CONFIG_BH1780=m +CONFIG_CM32181=m +CONFIG_CM3232=m +CONFIG_CM3323=m +CONFIG_CM3605=m +CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m +CONFIG_GP2AP002=m +CONFIG_GP2AP020A00F=m +CONFIG_IQS621_ALS=m +CONFIG_SENSORS_ISL29018=m +CONFIG_SENSORS_ISL29028=m +CONFIG_ISL29125=m +CONFIG_HID_SENSOR_ALS=m +CONFIG_HID_SENSOR_PROX=m +CONFIG_JSA1212=m +CONFIG_RPR0521=m +CONFIG_SENSORS_LM3533=m +CONFIG_LTR501=m +CONFIG_LV0104CS=m +CONFIG_MAX44000=m +CONFIG_MAX44009=m +CONFIG_NOA1305=m +CONFIG_OPT3001=m +CONFIG_PA12203001=m +CONFIG_SI1133=m +CONFIG_SI1145=m +CONFIG_STK3310=m +CONFIG_ST_UVIS25=m +CONFIG_ST_UVIS25_I2C=m +CONFIG_ST_UVIS25_SPI=m +CONFIG_TCS3414=m +CONFIG_TCS3472=m +CONFIG_SENSORS_TSL2563=m +CONFIG_TSL2583=m +CONFIG_TSL2772=m +CONFIG_TSL4531=m +CONFIG_US5182D=m +CONFIG_VCNL4000=m +CONFIG_VCNL4035=m +CONFIG_VEML6030=m +CONFIG_VEML6070=m +CONFIG_VL6180=m +CONFIG_ZOPT2201=m +# end of Light sensors + +# +# Magnetometer sensors +# +CONFIG_AK8974=m +CONFIG_AK8975=m +CONFIG_AK09911=m +CONFIG_BMC150_MAGN=m +CONFIG_BMC150_MAGN_I2C=m +CONFIG_BMC150_MAGN_SPI=m +CONFIG_MAG3110=m +CONFIG_HID_SENSOR_MAGNETOMETER_3D=m +CONFIG_MMC35240=m +CONFIG_IIO_ST_MAGN_3AXIS=m +CONFIG_IIO_ST_MAGN_I2C_3AXIS=m +CONFIG_IIO_ST_MAGN_SPI_3AXIS=m +CONFIG_SENSORS_HMC5843=m +CONFIG_SENSORS_HMC5843_I2C=m +CONFIG_SENSORS_HMC5843_SPI=m +CONFIG_SENSORS_RM3100=m +CONFIG_SENSORS_RM3100_I2C=m +CONFIG_SENSORS_RM3100_SPI=m +# end of Magnetometer sensors + +# +# Multiplexers +# +CONFIG_IIO_MUX=m +# end of Multiplexers + +# +# Inclinometer sensors +# +CONFIG_HID_SENSOR_INCLINOMETER_3D=m +CONFIG_HID_SENSOR_DEVICE_ROTATION=m +# end of Inclinometer sensors + +# +# Triggers - standalone +# +CONFIG_IIO_HRTIMER_TRIGGER=m +CONFIG_IIO_INTERRUPT_TRIGGER=m +CONFIG_IIO_TIGHTLOOP_TRIGGER=m +CONFIG_IIO_SYSFS_TRIGGER=m +# end of Triggers - standalone + +# +# Linear and angular position sensors +# +CONFIG_IQS624_POS=m +# end of Linear and angular position sensors + +# +# Digital potentiometers +# +CONFIG_AD5272=m +CONFIG_DS1803=m +CONFIG_MAX5432=m +CONFIG_MAX5481=m +CONFIG_MAX5487=m +CONFIG_MCP4018=m +CONFIG_MCP4131=m +CONFIG_MCP4531=m +CONFIG_MCP41010=m +CONFIG_TPL0102=m +# end of Digital potentiometers + +# +# Digital potentiostats +# +CONFIG_LMP91000=m +# end of Digital potentiostats + +# +# Pressure sensors +# +CONFIG_ABP060MG=m +CONFIG_BMP280=m +CONFIG_BMP280_I2C=m +CONFIG_BMP280_SPI=m +CONFIG_IIO_CROS_EC_BARO=m +CONFIG_DLHL60D=m +CONFIG_DPS310=m +CONFIG_HID_SENSOR_PRESS=m 
+CONFIG_HP03=m +CONFIG_ICP10100=m +CONFIG_MPL115=m +CONFIG_MPL115_I2C=m +CONFIG_MPL115_SPI=m +CONFIG_MPL3115=m +CONFIG_MS5611=m +CONFIG_MS5611_I2C=m +CONFIG_MS5611_SPI=m +CONFIG_MS5637=m +CONFIG_IIO_ST_PRESS=m +CONFIG_IIO_ST_PRESS_I2C=m +CONFIG_IIO_ST_PRESS_SPI=m +CONFIG_T5403=m +CONFIG_HP206C=m +CONFIG_ZPA2326=m +CONFIG_ZPA2326_I2C=m +CONFIG_ZPA2326_SPI=m +# end of Pressure sensors + +# +# Lightning sensors +# +CONFIG_AS3935=m +# end of Lightning sensors + +# +# Proximity and distance sensors +# +CONFIG_ISL29501=m +CONFIG_LIDAR_LITE_V2=m +CONFIG_MB1232=m +CONFIG_PING=m +CONFIG_RFD77402=m +CONFIG_SRF04=m +CONFIG_SX9500=m +CONFIG_SRF08=m +CONFIG_VL53L0X_I2C=m +# end of Proximity and distance sensors + +# +# Resolver to digital converters +# +CONFIG_AD2S90=m +CONFIG_AD2S1200=m +# end of Resolver to digital converters + +# +# Temperature sensors +# +CONFIG_IQS620AT_TEMP=m +CONFIG_LTC2983=m +CONFIG_MAXIM_THERMOCOUPLE=m +CONFIG_HID_SENSOR_TEMP=m +CONFIG_MLX90614=m +CONFIG_MLX90632=m +CONFIG_TMP006=m +CONFIG_TMP007=m +CONFIG_TSYS01=m +CONFIG_TSYS02D=m +CONFIG_MAX31856=m +# end of Temperature sensors + +CONFIG_NTB=m +CONFIG_NTB_MSI=y +CONFIG_NTB_AMD=m +CONFIG_NTB_IDT=m +CONFIG_NTB_INTEL=m +CONFIG_NTB_SWITCHTEC=m +# CONFIG_NTB_PINGPONG is not set +# CONFIG_NTB_TOOL is not set +# CONFIG_NTB_PERF is not set +# CONFIG_NTB_MSI_TEST is not set +CONFIG_NTB_TRANSPORT=m +CONFIG_VME_BUS=y + +# +# VME Bridge Drivers +# +CONFIG_VME_CA91CX42=m +CONFIG_VME_TSI148=m +# CONFIG_VME_FAKE is not set + +# +# VME Board Drivers +# +CONFIG_VMIVME_7805=m + +# +# VME Device Drivers +# +CONFIG_VME_USER=m +CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +# CONFIG_PWM_DEBUG is not set +CONFIG_PWM_ATMEL_HLCDC_PWM=m +CONFIG_PWM_CRC=y +CONFIG_PWM_CROS_EC=m +CONFIG_PWM_FSL_FTM=m +CONFIG_PWM_LP3943=m +CONFIG_PWM_LPSS=m +CONFIG_PWM_LPSS_PCI=m +CONFIG_PWM_LPSS_PLATFORM=m +CONFIG_PWM_PCA9685=m +CONFIG_PWM_STMPE=y +CONFIG_PWM_TWL=m +CONFIG_PWM_TWL_LED=m + +# +# IRQ chip support +# +CONFIG_IRQCHIP=y +CONFIG_AL_FIC=y +CONFIG_MADERA_IRQ=m +# end of IRQ chip support + +CONFIG_IPACK_BUS=m +CONFIG_BOARD_TPCI200=m +CONFIG_SERIAL_IPOCTAL=m +CONFIG_RESET_CONTROLLER=y +CONFIG_RESET_BRCMSTB_RESCAL=y +CONFIG_RESET_INTEL_GW=y +CONFIG_RESET_TI_SYSCON=m + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_GENERIC_PHY_MIPI_DPHY=y +CONFIG_BCM_KONA_USB2_PHY=m +CONFIG_PHY_CADENCE_TORRENT=m +CONFIG_PHY_CADENCE_DPHY=m +CONFIG_PHY_CADENCE_SIERRA=m +CONFIG_PHY_FSL_IMX8MQ_USB=m +CONFIG_PHY_MIXEL_MIPI_DPHY=m +CONFIG_PHY_PXA_28NM_HSIC=m +CONFIG_PHY_PXA_28NM_USB2=m +CONFIG_PHY_CPCAP_USB=m +CONFIG_PHY_MAPPHONE_MDM6600=m +CONFIG_PHY_OCELOT_SERDES=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_PHY_QCOM_USB_HSIC=m +CONFIG_PHY_SAMSUNG_USB2=m +CONFIG_PHY_TUSB1210=m +CONFIG_PHY_INTEL_EMMC=m +# end of PHY Subsystem + +CONFIG_POWERCAP=y +CONFIG_INTEL_RAPL_CORE=m +CONFIG_INTEL_RAPL=m +CONFIG_IDLE_INJECT=y +CONFIG_MCB=m +CONFIG_MCB_PCI=m +CONFIG_MCB_LPC=m + +# +# Performance monitor support +# +# end of Performance monitor support + +CONFIG_RAS=y +CONFIG_RAS_CEC=y +# CONFIG_RAS_CEC_DEBUG is not set +CONFIG_USB4=m + +# +# Android +# +# CONFIG_ANDROID is not set +# end of Android + +CONFIG_LIBNVDIMM=y +CONFIG_BLK_DEV_PMEM=m +CONFIG_ND_BLK=m +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=m +CONFIG_BTT=y +CONFIG_ND_PFN=m +CONFIG_NVDIMM_PFN=y +CONFIG_NVDIMM_DAX=y +CONFIG_OF_PMEM=m +CONFIG_DAX_DRIVER=y +CONFIG_DAX=y +CONFIG_DEV_DAX=m +CONFIG_DEV_DAX_PMEM=m +CONFIG_DEV_DAX_HMEM=m +CONFIG_DEV_DAX_KMEM=m +CONFIG_DEV_DAX_PMEM_COMPAT=m +CONFIG_NVMEM=y +CONFIG_NVMEM_SYSFS=y +CONFIG_NVMEM_SPMI_SDAM=m 
+CONFIG_RAVE_SP_EEPROM=m + +# +# HW tracing support +# +CONFIG_STM=m +CONFIG_STM_PROTO_BASIC=m +CONFIG_STM_PROTO_SYS_T=m +# CONFIG_STM_DUMMY is not set +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +CONFIG_STM_SOURCE_FTRACE=m +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_ACPI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m +# CONFIG_INTEL_TH_DEBUG is not set +# end of HW tracing support + +CONFIG_FPGA=m +CONFIG_ALTERA_PR_IP_CORE=m +CONFIG_ALTERA_PR_IP_CORE_PLAT=m +CONFIG_FPGA_MGR_ALTERA_PS_SPI=m +CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_FPGA_MGR_ICE40_SPI=m +CONFIG_FPGA_MGR_MACHXO2_SPI=m +CONFIG_FPGA_BRIDGE=m +CONFIG_ALTERA_FREEZE_BRIDGE=m +CONFIG_XILINX_PR_DECOUPLER=m +CONFIG_FPGA_REGION=m +CONFIG_OF_FPGA_REGION=m +CONFIG_FPGA_DFL=m +CONFIG_FPGA_DFL_FME=m +CONFIG_FPGA_DFL_FME_MGR=m +CONFIG_FPGA_DFL_FME_BRIDGE=m +CONFIG_FPGA_DFL_FME_REGION=m +CONFIG_FPGA_DFL_AFU=m +CONFIG_FPGA_DFL_PCI=m +CONFIG_FSI=m +CONFIG_FSI_NEW_DEV_NODE=y +CONFIG_FSI_MASTER_GPIO=m +CONFIG_FSI_MASTER_HUB=m +CONFIG_FSI_MASTER_ASPEED=m +CONFIG_FSI_SCOM=m +CONFIG_FSI_SBEFIFO=m +CONFIG_FSI_OCC=m +CONFIG_TEE=m + +# +# TEE drivers +# +CONFIG_AMDTEE=m +# end of TEE drivers + +CONFIG_MULTIPLEXER=m + +# +# Multiplexer drivers +# +CONFIG_MUX_ADG792A=m +CONFIG_MUX_ADGS1408=m +CONFIG_MUX_GPIO=m +CONFIG_MUX_MMIO=m +# end of Multiplexer drivers + +CONFIG_PM_OPP=y +CONFIG_UNISYS_VISORBUS=m +CONFIG_SIOX=m +CONFIG_SIOX_BUS_GPIO=m +CONFIG_SLIMBUS=m +CONFIG_SLIM_QCOM_CTRL=m +CONFIG_INTERCONNECT=m +CONFIG_COUNTER=m +CONFIG_FTM_QUADDEC=m +CONFIG_MOST=m +# end of Device Drivers + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_VALIDATE_FS_PARSER=y +CONFIG_FS_IOMAP=y +# CONFIG_EXT2_FS is not set +# CONFIG_EXT3_FS is not set +CONFIG_EXT4_FS=m +CONFIG_EXT4_USE_FOR_EXT2=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_XFS_ONLINE_SCRUB=y +CONFIG_XFS_ONLINE_REPAIR=y +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_OCFS2_FS_O2CB=m +CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m +CONFIG_OCFS2_FS_STATS=y +CONFIG_OCFS2_DEBUG_MASKLOG=y +# CONFIG_OCFS2_DEBUG_FS is not set +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +# CONFIG_BTRFS_FS_REF_VERIFY is not set +CONFIG_NILFS2_FS=m +CONFIG_F2FS_FS=m +CONFIG_F2FS_STAT_FS=y +CONFIG_F2FS_FS_XATTR=y +CONFIG_F2FS_FS_POSIX_ACL=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y +# CONFIG_F2FS_IO_TRACE is not set +# CONFIG_F2FS_FAULT_INJECTION is not set +CONFIG_F2FS_FS_COMPRESSION=y +CONFIG_F2FS_FS_LZO=y +CONFIG_F2FS_FS_LZ4=y +CONFIG_F2FS_FS_ZSTD=y +CONFIG_ZONEFS_FS=m +CONFIG_FS_DAX=y +CONFIG_FS_DAX_PMD=y +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FILE_LOCKING=y +# CONFIG_MANDATORY_FILE_LOCKING is not set +CONFIG_FS_ENCRYPTION=y 
+CONFIG_FS_ENCRYPTION_ALGS=m +CONFIG_FS_VERITY=y +# CONFIG_FS_VERITY_DEBUG is not set +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QUOTA_DEBUG is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_QUOTACTL_COMPAT=y +CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS_FS=y +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m +CONFIG_OVERLAY_FS=m +CONFIG_OVERLAY_FS_REDIRECT_DIR=y +# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set +CONFIG_OVERLAY_FS_INDEX=y +CONFIG_OVERLAY_FS_XINO_AUTO=y +CONFIG_OVERLAY_FS_METACOPY=y + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +CONFIG_FSCACHE_HISTOGRAM=y +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set +# end of Caches + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +# end of CD-ROM/DVD Filesystems + +# +# DOS/FAT/EXFAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" +CONFIG_FAT_DEFAULT_UTF8=y +CONFIG_EXFAT_FS=m +CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y +# end of DOS/FAT/EXFAT/NT Filesystems + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_PROC_VMCORE=y +CONFIG_PROC_VMCORE_DEVICE_DUMP=y +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_PROC_CHILDREN=y +CONFIG_PROC_PID_ARCH_STATUS=y +CONFIG_PROC_CPU_RESCTRL=y +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_MEMFD_CREATE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=y +CONFIG_EFIVAR_FS=y +# end of Pseudo filesystems + +CONFIG_MISC_FILESYSTEMS=y +CONFIG_ORANGEFS_FS=m +# CONFIG_ADFS_FS is not set +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=m +# CONFIG_ECRYPT_FS_MESSAGING is not set +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_FS_WBUF_VERIFY is not set +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_RTIME=y +CONFIG_UBIFS_FS=m +# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +CONFIG_UBIFS_FS_ZSTD=y +CONFIG_UBIFS_ATIME_SUPPORT=y +CONFIG_UBIFS_FS_XATTR=y +CONFIG_UBIFS_FS_SECURITY=y +CONFIG_UBIFS_FS_AUTHENTICATION=y +CONFIG_CRAMFS=m +CONFIG_CRAMFS_BLOCKDEV=y +CONFIG_CRAMFS_MTD=y +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +CONFIG_SQUASHFS_DECOMP_MULTI=y +# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set +# CONFIG_SQUASHFS_EMBEDDED is not set +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +# CONFIG_VXFS_FS is not set +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +# 
CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX6FS_FS is not set +CONFIG_ROMFS_FS=m +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_PSTORE=y +CONFIG_PSTORE_DEFLATE_COMPRESS=m +CONFIG_PSTORE_LZO_COMPRESS=m +CONFIG_PSTORE_LZ4_COMPRESS=m +CONFIG_PSTORE_LZ4HC_COMPRESS=m +# CONFIG_PSTORE_842_COMPRESS is not set +CONFIG_PSTORE_ZSTD_COMPRESS=y +CONFIG_PSTORE_COMPRESS=y +# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y +CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" +# CONFIG_PSTORE_CONSOLE is not set +# CONFIG_PSTORE_PMSG is not set +# CONFIG_PSTORE_FTRACE is not set +CONFIG_PSTORE_RAM=y +# CONFIG_SYSV_FS is not set +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EROFS_FS=m +# CONFIG_EROFS_FS_DEBUG is not set +CONFIG_EROFS_FS_XATTR=y +CONFIG_EROFS_FS_POSIX_ACL=y +CONFIG_EROFS_FS_SECURITY=y +CONFIG_EROFS_FS_ZIP=y +CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 +CONFIG_VBOXSF_FS=m +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V2=m +CONFIG_NFS_V3=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_NFS_V4_SECURITY_LABEL=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +CONFIG_NFS_DEBUG=y +# CONFIG_NFS_DISABLE_UDP_SUPPORT is not set +CONFIG_NFSD=m +CONFIG_NFSD_V2_ACL=y +CONFIG_NFSD_V3=y +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_PNFS=y +CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_NFSD_SCSILAYOUT=y +# CONFIG_NFSD_FLEXFILELAYOUT is not set +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_GRACE_PERIOD=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y +CONFIG_SUNRPC_DEBUG=y +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y +CONFIG_CEPH_FS_SECURITY_LABEL=y +CONFIG_CIFS=m +# CONFIG_CIFS_STATS2 is not set +CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y +# CONFIG_CIFS_WEAK_PW_HASH is not set +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_CIFS_DEBUG=y +# CONFIG_CIFS_DEBUG2 is not set +# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set +CONFIG_CIFS_DFS_UPCALL=y +# CONFIG_CIFS_SMB_DIRECT is not set +CONFIG_CIFS_FSCACHE=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +# CONFIG_AFS_DEBUG is not set +CONFIG_AFS_FSCACHE=y +# CONFIG_AFS_DEBUG_CURSOR is not set +CONFIG_9P_FS=m +CONFIG_9P_FSCACHE=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m 
+CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_MAC_ROMAN=m +CONFIG_NLS_MAC_CELTIC=m +CONFIG_NLS_MAC_CENTEURO=m +CONFIG_NLS_MAC_CROATIAN=m +CONFIG_NLS_MAC_CYRILLIC=m +CONFIG_NLS_MAC_GAELIC=m +CONFIG_NLS_MAC_GREEK=m +CONFIG_NLS_MAC_ICELAND=m +CONFIG_NLS_MAC_INUIT=m +CONFIG_NLS_MAC_ROMANIAN=m +CONFIG_NLS_MAC_TURKISH=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set +CONFIG_UNICODE=y +# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set +CONFIG_IO_WQ=y +# end of File systems + +# +# Security options +# +CONFIG_KEYS=y +CONFIG_KEYS_REQUEST_CACHE=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y +CONFIG_TRUSTED_KEYS=m +CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_DH_OPERATIONS=y +# CONFIG_SECURITY_DMESG_RESTRICT is not set +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_PAGE_TABLE_ISOLATION=y +CONFIG_SECURITY_INFINIBAND=y +CONFIG_SECURITY_NETWORK_XFRM=y +CONFIG_SECURITY_PATH=y +# CONFIG_INTEL_TXT is not set +CONFIG_LSM_MMAP_MIN_ADDR=65536 +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_HARDENED_USERCOPY_FALLBACK=y +# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set +CONFIG_FORTIFY_SOURCE=y +# CONFIG_STATIC_USERMODEHELPER is not set +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +# CONFIG_SECURITY_SELINUX_DISABLE is not set +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_AVC_STATS=y +CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 +CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 +CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 +CONFIG_SECURITY_SMACK=y +CONFIG_SECURITY_SMACK_BRINGUP=y +CONFIG_SECURITY_SMACK_NETFILTER=y +CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y +CONFIG_SECURITY_TOMOYO=y +CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 +CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 +# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set +CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" +CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" +# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set +CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_APPARMOR_HASH=y +CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y +# CONFIG_SECURITY_APPARMOR_DEBUG is not set +# CONFIG_SECURITY_LOADPIN is not set +CONFIG_SECURITY_YAMA=y +CONFIG_SECURITY_SAFESETID=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set +CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y +# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set +# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set +# CONFIG_INTEGRITY is not set +# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set +# CONFIG_DEFAULT_SECURITY_SELINUX is not set +# CONFIG_DEFAULT_SECURITY_SMACK is not set +# CONFIG_DEFAULT_SECURITY_TOMOYO is not set +# CONFIG_DEFAULT_SECURITY_APPARMOR is not set +CONFIG_DEFAULT_SECURITY_DAC=y +CONFIG_LSM="lockdown,yama" + +# +# Kernel hardening options +# +CONFIG_GCC_PLUGIN_STRUCTLEAK=y + +# +# Memory initialization +# +# CONFIG_INIT_STACK_NONE is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set +CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y +# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set +# 
CONFIG_GCC_PLUGIN_STACKLEAK is not set +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set +# end of Memory initialization +# end of Kernel hardening options +# end of Security options + +CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=y +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=y +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y +CONFIG_CRYPTO_GF128MUL=y +CONFIG_CRYPTO_NULL=y +CONFIG_CRYPTO_NULL2=y +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_SIMD=m +CONFIG_CRYPTO_GLUE_HELPER_X86=m +CONFIG_CRYPTO_ENGINE=m + +# +# Public-key cryptography +# +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=y +CONFIG_CRYPTO_ECC=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m +CONFIG_CRYPTO_CURVE25519=m +CONFIG_CRYPTO_CURVE25519_X86=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_ECHAINIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTR=y +CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_OFB=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_NHPOLY1305=m +CONFIG_CRYPTO_NHPOLY1305_SSE2=m +CONFIG_CRYPTO_NHPOLY1305_AVX2=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ESSIV=m + +# +# Hash modes +# +CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_CRC32_PCLMUL=m +CONFIG_CRYPTO_XXHASH=m +CONFIG_CRYPTO_BLAKE2B=m +CONFIG_CRYPTO_BLAKE2S=m +CONFIG_CRYPTO_BLAKE2S_X86=m +CONFIG_CRYPTO_CRCT10DIF=y +CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m +CONFIG_CRYPTO_GHASH=y +CONFIG_CRYPTO_POLY1305=m +CONFIG_CRYPTO_POLY1305_X86_64=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA1_SSSE3=m +CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA512_SSSE3=m +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m +CONFIG_CRYPTO_STREEBOG=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES_NI_INTEL=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_BLOWFISH_COMMON=m +CONFIG_CRYPTO_BLOWFISH_X86_64=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAMELLIA_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m +CONFIG_CRYPTO_CAST_COMMON=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST5_AVX_X86_64=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_CAST6_AVX_X86_64=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_DES3_EDE_X86_64=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m 
+CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_CHACHA20=m +CONFIG_CRYPTO_CHACHA20_X86_64=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m +CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +CONFIG_CRYPTO_TWOFISH_X86_64=m +CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m +CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_LZO=m +CONFIG_CRYPTO_842=m +CONFIG_CRYPTO_LZ4=y +CONFIG_CRYPTO_LZ4HC=m +CONFIG_CRYPTO_ZSTD=y + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_DRBG_MENU=y +CONFIG_CRYPTO_DRBG_HMAC=y +CONFIG_CRYPTO_DRBG_HASH=y +CONFIG_CRYPTO_DRBG_CTR=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_JITTERENTROPY=y +CONFIG_CRYPTO_USER_API=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +# CONFIG_CRYPTO_STATS is not set +CONFIG_CRYPTO_HASH_INFO=y + +# +# Crypto library routines +# +CONFIG_CRYPTO_LIB_AES=y +CONFIG_CRYPTO_LIB_ARC4=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m +CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m +CONFIG_CRYPTO_LIB_BLAKE2S=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m +CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m +CONFIG_CRYPTO_LIB_CHACHA=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_DES=m +CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 +CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m +CONFIG_CRYPTO_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m +CONFIG_CRYPTO_LIB_SHA256=y +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +CONFIG_CRYPTO_DEV_ATMEL_I2C=m +CONFIG_CRYPTO_DEV_ATMEL_ECC=m +CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m +CONFIG_CRYPTO_DEV_CCP=y +CONFIG_CRYPTO_DEV_CCP_DD=m +CONFIG_CRYPTO_DEV_SP_CCP=y +CONFIG_CRYPTO_DEV_CCP_CRYPTO=m +CONFIG_CRYPTO_DEV_SP_PSP=y +CONFIG_CRYPTO_DEV_CCP_DEBUGFS=y +CONFIG_CRYPTO_DEV_QAT=m +CONFIG_CRYPTO_DEV_QAT_DH895xCC=m +CONFIG_CRYPTO_DEV_QAT_C3XXX=m +CONFIG_CRYPTO_DEV_QAT_C62X=m +CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m +CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m +CONFIG_CRYPTO_DEV_QAT_C62XVF=m +CONFIG_CRYPTO_DEV_NITROX=m +CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m +CONFIG_CRYPTO_DEV_CHELSIO=m +CONFIG_CHELSIO_IPSEC_INLINE=y +CONFIG_CHELSIO_TLS_DEVICE=y +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_CRYPTO_DEV_SAFEXCEL=m +CONFIG_CRYPTO_DEV_CCREE=m +CONFIG_CRYPTO_DEV_AMLOGIC_GXL=m +CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS8_PRIVATE_KEY_PARSER=m +CONFIG_TPM_KEY_PARSER=m +CONFIG_PKCS7_MESSAGE_PARSER=y +# CONFIG_PKCS7_TEST_KEY is not set +CONFIG_SIGNED_PE_FILE_VERIFICATION=y + +# +# Certificates for signature checking +# +CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" +# end of Certificates for signature checking + +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_RAID6_PQ_BENCHMARK=y +CONFIG_PACKING=y +CONFIG_BITREVERSE=y +CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_CORDIC=m 
+CONFIG_RATIONAL=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_GENERIC_IOMAP=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_CRC_CCITT=y +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=y +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set +CONFIG_CRC64=m +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_CRC8=m +CONFIG_XXHASH=y +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_842_COMPRESS=m +CONFIG_842_DECOMPRESS=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=y +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=y +CONFIG_LZ4HC_COMPRESS=m +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y +CONFIG_XZ_DEC=y +CONFIG_XZ_DEC_X86=y +CONFIG_XZ_DEC_POWERPC=y +CONFIG_XZ_DEC_IA64=y +CONFIG_XZ_DEC_ARM=y +CONFIG_XZ_DEC_ARMTHUMB=y +CONFIG_XZ_DEC_SPARC=y +CONFIG_XZ_DEC_BCJ=y +# CONFIG_XZ_DEC_TEST is not set +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZO=y +CONFIG_DECOMPRESS_LZ4=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=y +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_BCH=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_BTREE=y +CONFIG_INTERVAL_TREE=y +CONFIG_XARRAY_MULTI=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y +CONFIG_DMA_VIRT_OPS=y +CONFIG_SWIOTLB=y +# CONFIG_DMA_API_DEBUG is not set +CONFIG_SGL_ALLOC=y +CONFIG_IOMMU_HELPER=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_LRU_CACHE=m +CONFIG_CLZ_TAB=y +CONFIG_IRQ_POLL=y +CONFIG_MPILIB=y +CONFIG_DIMLIB=y +CONFIG_LIBFDT=y +CONFIG_OID_REGISTRY=y +CONFIG_UCS2_STRING=y +CONFIG_HAVE_GENERIC_VDSO=y +CONFIG_GENERIC_GETTIMEOFDAY=y +CONFIG_GENERIC_VDSO_TIME_NS=y +CONFIG_FONT_SUPPORT=y +CONFIG_FONTS=y +# CONFIG_FONT_8x8 is not set +CONFIG_FONT_8x16=y +# CONFIG_FONT_6x11 is not set +# CONFIG_FONT_7x14 is not set +# CONFIG_FONT_PEARL_8x8 is not set +# CONFIG_FONT_ACORN_8x8 is not set +# CONFIG_FONT_MINI_4x6 is not set +# CONFIG_FONT_6x10 is not set +# CONFIG_FONT_10x18 is not set +# CONFIG_FONT_SUN8x16 is not set +# CONFIG_FONT_SUN12x22 is not set +CONFIG_FONT_TER16x32=y +CONFIG_SG_POOL=y +CONFIG_ARCH_HAS_PMEM_API=y +CONFIG_MEMREGION=y +CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y +CONFIG_ARCH_HAS_UACCESS_MCSAFE=y +CONFIG_ARCH_STACKWALK=y +CONFIG_SBITMAP=y +CONFIG_PARMAN=m +CONFIG_OBJAGG=m +# CONFIG_STRING_SELFTEST is not set +# end of Library routines + +# +# Kernel hacking +# + +# +# printk and dmesg options +# +CONFIG_PRINTK_TIME=y +# CONFIG_PRINTK_CALLER is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 +CONFIG_CONSOLE_LOGLEVEL_QUIET=1 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 +# CONFIG_BOOT_PRINTK_DELAY is not set +CONFIG_DYNAMIC_DEBUG=y +CONFIG_SYMBOLIC_ERRNAME=y +CONFIG_DEBUG_BUGVERBOSE=y +# end of printk and dmesg options + +# +# Compile-time checks and compiler options +# +CONFIG_DEBUG_INFO=y +# CONFIG_DEBUG_INFO_REDUCED is not set +# CONFIG_DEBUG_INFO_SPLIT is not set +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_DEBUG_INFO_BTF=y +# CONFIG_GDB_SCRIPTS is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 
+CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +# CONFIG_HEADERS_INSTALL is not set +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +CONFIG_STACK_VALIDATION=y +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +# end of Compile-time checks and compiler options + +# +# Generic Kernel Debugging Instruments +# +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" +CONFIG_DEBUG_FS=y +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# CONFIG_UBSAN is not set +# end of Generic Kernel Debugging Instruments + +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_MISC=y + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_OWNER is not set +CONFIG_PAGE_POISONING=y +CONFIG_PAGE_POISONING_NO_SANITY=y +CONFIG_PAGE_POISONING_ZERO=y +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +CONFIG_GENERIC_PTDUMP=y +CONFIG_PTDUMP_CORE=y +# CONFIG_PTDUMP_DEBUGFS is not set +# CONFIG_DEBUG_OBJECTS is not set +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +CONFIG_SCHED_STACK_END_CHECK=y +# CONFIG_DEBUG_VM is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +CONFIG_DEBUG_MEMORY_INIT=y +# CONFIG_DEBUG_PER_CPU_MAPS is not set +CONFIG_HAVE_ARCH_KASAN=y +CONFIG_HAVE_ARCH_KASAN_VMALLOC=y +CONFIG_CC_HAS_KASAN_GENERIC=y +# CONFIG_KASAN is not set +CONFIG_KASAN_STACK=1 +# end of Memory Debugging + +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Oops, Lockups and Hangs +# +# CONFIG_PANIC_ON_OOPS is not set +CONFIG_PANIC_ON_OOPS_VALUE=0 +CONFIG_PANIC_TIMEOUT=0 +CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y +CONFIG_HARDLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 +# CONFIG_WQ_WATCHDOG is not set +# CONFIG_TEST_LOCKUP is not set +# end of Debug Oops, Lockups and Hangs + +# +# Scheduler Debugging +# +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_INFO=y +CONFIG_SCHEDSTATS=y +# end of Scheduler Debugging + +# CONFIG_DEBUG_TIMEKEEPING is not set +CONFIG_DEBUG_PREEMPT=y + +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +CONFIG_LOCK_DEBUGGING_SUPPORT=y +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +# end of Lock Debugging (spinlocks, mutexes, etc...) 
+ +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set + +# +# Debug kernel data structures +# +# CONFIG_DEBUG_LIST is not set +# CONFIG_DEBUG_PLIST is not set +# CONFIG_DEBUG_SG is not set +# CONFIG_DEBUG_NOTIFIERS is not set +# CONFIG_BUG_ON_DATA_CORRUPTION is not set +# end of Debug kernel data structures + +# CONFIG_DEBUG_CREDENTIALS is not set + +# +# RCU Debugging +# +# CONFIG_RCU_PERF_TEST is not set +# CONFIG_RCU_TORTURE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# end of RCU Debugging + +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set +CONFIG_LATENCYTOP=y +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_FENTRY=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACER_MAX_TRACE=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y +CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_RING_BUFFER_ALLOW_SWAP=y +CONFIG_TRACING=y +CONFIG_GENERIC_TRACER=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +# CONFIG_BOOTTIME_TRACING is not set +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y +# CONFIG_PREEMPTIRQ_EVENTS is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +CONFIG_SCHED_TRACER=y +CONFIG_HWLAT_TRACER=y +CONFIG_MMIOTRACE=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_TRACER_SNAPSHOT=y +# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KPROBE_EVENTS=y +# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set +CONFIG_UPROBE_EVENTS=y +CONFIG_BPF_EVENTS=y +CONFIG_DYNAMIC_EVENTS=y +CONFIG_PROBE_EVENTS=y +CONFIG_BPF_KPROBE_OVERRIDE=y +CONFIG_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_MAP=y +CONFIG_HIST_TRIGGERS=y +# CONFIG_TRACE_EVENT_INJECT is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_MMIOTRACE_TEST is not set +# CONFIG_PREEMPTIRQ_DELAY_TEST is not set +# CONFIG_SYNTH_EVENT_GEN_TEST is not set +# CONFIG_KPROBE_EVENT_GEN_TEST is not set +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +# CONFIG_SAMPLES is not set +CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y +CONFIG_STRICT_DEVMEM=y +CONFIG_IO_STRICT_DEVMEM=y + +# +# x86 Debugging +# +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +# CONFIG_X86_VERBOSE_BOOTUP is not set +CONFIG_EARLY_PRINTK=y +# CONFIG_EARLY_PRINTK_DBGP is not set +# CONFIG_EARLY_PRINTK_USB_XDBC is not set +# CONFIG_EFI_PGT_DUMP is not set +CONFIG_DEBUG_WX=y +CONFIG_DOUBLEFAULT=y +# CONFIG_DEBUG_TLBFLUSH is not set +# CONFIG_IOMMU_DEBUG is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEBUG_BOOT_PARAMS=y +# CONFIG_CPA_DEBUG is not set +# CONFIG_DEBUG_ENTRY is not set +# CONFIG_DEBUG_NMI_SELFTEST is not 
set +# CONFIG_X86_DEBUG_FPU is not set +# CONFIG_PUNIT_ATOM_DEBUG is not set +CONFIG_UNWINDER_ORC=y +# CONFIG_UNWINDER_FRAME_POINTER is not set +# CONFIG_UNWINDER_GUESS is not set +# end of x86 Debugging + +# +# Kernel Testing and Coverage +# +# CONFIG_KUNIT is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +CONFIG_FUNCTION_ERROR_INJECTION=y +# CONFIG_FAULT_INJECTION is not set +CONFIG_ARCH_HAS_KCOV=y +CONFIG_CC_HAS_SANCOV_TRACE_PC=y +# CONFIG_KCOV is not set +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_LKDTM=m +# CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_MIN_HEAP is not set +# CONFIG_TEST_SORT is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_RBTREE_TEST is not set +# CONFIG_REED_SOLOMON_TEST is not set +# CONFIG_INTERVAL_TREE_TEST is not set +# CONFIG_PERCPU_TEST is not set +# CONFIG_ATOMIC64_SELFTEST is not set +# CONFIG_ASYNC_RAID6_TEST is not set +# CONFIG_TEST_HEXDUMP is not set +# CONFIG_TEST_STRING_HELPERS is not set +# CONFIG_TEST_STRSCPY is not set +# CONFIG_TEST_KSTRTOX is not set +# CONFIG_TEST_PRINTF is not set +# CONFIG_TEST_BITMAP is not set +# CONFIG_TEST_BITFIELD is not set +# CONFIG_TEST_UUID is not set +# CONFIG_TEST_XARRAY is not set +# CONFIG_TEST_OVERFLOW is not set +# CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set +# CONFIG_TEST_IDA is not set +# CONFIG_TEST_PARMAN is not set +# CONFIG_TEST_LKM is not set +# CONFIG_TEST_VMALLOC is not set +# CONFIG_TEST_USER_COPY is not set +# CONFIG_TEST_BPF is not set +# CONFIG_TEST_BLACKHOLE_DEV is not set +# CONFIG_FIND_BIT_BENCHMARK is not set +# CONFIG_TEST_FIRMWARE is not set +# CONFIG_TEST_SYSCTL is not set +# CONFIG_TEST_UDELAY is not set +# CONFIG_TEST_STATIC_KEYS is not set +# CONFIG_TEST_KMOD is not set +# CONFIG_TEST_MEMCAT_P is not set +# CONFIG_TEST_OBJAGG is not set +# CONFIG_TEST_STACKINIT is not set +# CONFIG_TEST_MEMINIT is not set +# CONFIG_MEMTEST is not set +# CONFIG_HYPERV_TESTING is not set +# end of Kernel Testing and Coverage +# end of Kernel hacking diff --git a/linux-tkg/linux-tkg-config/5.7/config_hardened.x86_64 b/linux-tkg/linux-tkg-config/5.7/config_hardened.x86_64 new file mode 100644 index 0000000..105f167 --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.7/config_hardened.x86_64 @@ -0,0 +1,10839 @@ +# +# Automatically generated file; DO NOT EDIT. 
+# Linux/x86 5.7.8 Kernel Configuration +# + +# +# Compiler: gcc (GCC) 10.1.0 +# +CONFIG_CC_IS_GCC=y +CONFIG_GCC_VERSION=100100 +CONFIG_LD_VERSION=234000000 +CONFIG_CLANG_VERSION=0 +CONFIG_CC_CAN_LINK=y +CONFIG_CC_HAS_ASM_GOTO=y +CONFIG_CC_HAS_ASM_INLINE=y +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_TABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_INIT_ENV_ARG_LIMIT=32 +# CONFIG_COMPILE_TEST is not set +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_BUILD_SALT="" +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_XZ=y +CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set +CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set +CONFIG_DEFAULT_HOSTNAME="archlinux" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +CONFIG_CROSS_MEMORY_ATTACH=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_HAVE_ARCH_AUDITSYSCALL=y +CONFIG_AUDITSYSCALL=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_GENERIC_IRQ_MIGRATION=y +CONFIG_HARDIRQS_SW_RESEND=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_SIM=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +CONFIG_IRQ_MSI_IOMMU=y +CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y +CONFIG_GENERIC_IRQ_RESERVATION_MODE=y +CONFIG_IRQ_FORCED_THREADING=y +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +# end of IRQ subsystem + +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_ARCH_CLOCKSOURCE_INIT=y +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y +CONFIG_GENERIC_CMOS_UPDATE=y + +# +# Timers subsystem +# +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ_COMMON=y +# CONFIG_HZ_PERIODIC is not set +CONFIG_NO_HZ_IDLE=y +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +# end of Timers subsystem + +# CONFIG_PREEMPT_NONE is not set +# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_PREEMPT=y +CONFIG_PREEMPT_COUNT=y +CONFIG_PREEMPTION=y + +# +# CPU/Task time and stats accounting +# +CONFIG_TICK_CPU_ACCOUNTING=y +# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_SCHED_AVG_IRQ=y +# CONFIG_SCHED_THERMAL_PRESSURE is not set +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_PSI=y +# CONFIG_PSI_DEFAULT_DISABLED is not set +# end of CPU/Task time and stats accounting + +CONFIG_CPU_ISOLATION=y + +# +# RCU Subsystem +# +CONFIG_TREE_RCU=y +CONFIG_PREEMPT_RCU=y +CONFIG_RCU_EXPERT=y +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_TASKS_RCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +CONFIG_RCU_FANOUT=64 +CONFIG_RCU_FANOUT_LEAF=16 +CONFIG_RCU_FAST_NO_HZ=y +CONFIG_RCU_BOOST=y +CONFIG_RCU_BOOST_DELAY=500 +# CONFIG_RCU_NOCB_CPU is not set +# end of RCU Subsystem + +CONFIG_BUILD_BIN2C=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_IKHEADERS is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y + +# +# Scheduler features +# +CONFIG_UCLAMP_TASK=y +CONFIG_UCLAMP_BUCKETS_COUNT=5 +# 
end of Scheduler features + +CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y +CONFIG_CC_HAS_INT128=y +CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_NUMA_BALANCING=y +CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y +CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_SWAP_ENABLED=y +CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +# CONFIG_RT_GROUP_SCHED is not set +CONFIG_UCLAMP_TASK_GROUP=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_TIME_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +# CONFIG_USER_NS_UNPRIVILEGED is not set +CONFIG_PID_NS=y +CONFIG_NET_NS=y +# CONFIG_CHECKPOINT_RESTORE is not set +CONFIG_SCHED_AUTOGROUP=y +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_RD_XZ=y +CONFIG_RD_LZO=y +CONFIG_RD_LZ4=y +CONFIG_BOOT_CONFIG=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_SYSCTL=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_HAVE_PCSPKR_PLATFORM=y +CONFIG_BPF=y +CONFIG_EXPERT=y +# CONFIG_UID16 is not set +CONFIG_MULTIUSER=y +CONFIG_SGETMASK_SYSCALL=y +# CONFIG_SYSFS_SYSCALL is not set +CONFIG_FHANDLE=y +CONFIG_POSIX_TIMERS=y +CONFIG_PRINTK=y +CONFIG_PRINTK_NMI=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_IO_URING=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_MEMBARRIER=y +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_BPF_LSM=y +CONFIG_BPF_SYSCALL=y +CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +# CONFIG_USERFAULTFD is not set +CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_RSEQ=y +# CONFIG_DEBUG_RSEQ is not set +# CONFIG_EMBEDDED is not set +CONFIG_HAVE_PERF_EVENTS=y +# CONFIG_PC104 is not set + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +# end of Kernel Performance Events And Counters + +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +# CONFIG_SLAB_MERGE_DEFAULT is not set +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SLAB_CANARY=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_SLUB_CPU_PARTIAL=y +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +# end of General setup + +CONFIG_64BIT=y +CONFIG_X86_64=y +CONFIG_X86=y +CONFIG_INSTRUCTION_DECODER=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_MMU=y +CONFIG_ARCH_MMAP_RND_BITS_MIN=28 +CONFIG_ARCH_MMAP_RND_BITS_MAX=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y 
+CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_ARCH_HAS_FILTER_PGPROT=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_WANT_GENERAL_HUGETLB=y +CONFIG_ZONE_DMA32=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y +CONFIG_X86_64_SMP=y +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_DYNAMIC_PHYSICAL_MASK=y +CONFIG_PGTABLE_LEVELS=5 +CONFIG_CC_HAS_SANE_STACKPROTECTOR=y + +# +# Processor type and features +# +CONFIG_ZONE_DMA=y +CONFIG_SMP=y +CONFIG_X86_FEATURE_NAMES=y +CONFIG_X86_X2APIC=y +CONFIG_X86_MPPARSE=y +# CONFIG_GOLDFISH is not set +CONFIG_RETPOLINE=y +CONFIG_X86_CPU_RESCTRL=y +# CONFIG_X86_EXTENDED_PLATFORM is not set +CONFIG_X86_INTEL_LPSS=y +CONFIG_X86_AMD_PLATFORM_DEVICE=y +CONFIG_IOSF_MBI=y +# CONFIG_IOSF_MBI_DEBUG is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_PARAVIRT_XXL=y +# CONFIG_PARAVIRT_DEBUG is not set +CONFIG_PARAVIRT_SPINLOCKS=y +CONFIG_X86_HV_CALLBACK_VECTOR=y +CONFIG_XEN=y +CONFIG_XEN_PV=y +CONFIG_XEN_PV_SMP=y +CONFIG_XEN_DOM0=y +CONFIG_XEN_PVHVM=y +CONFIG_XEN_PVHVM_SMP=y +CONFIG_XEN_512GB=y +CONFIG_XEN_SAVE_RESTORE=y +# CONFIG_XEN_DEBUG_FS is not set +CONFIG_XEN_PVH=y +CONFIG_KVM_GUEST=y +CONFIG_ARCH_CPUIDLE_HALTPOLL=y +CONFIG_PVH=y +# CONFIG_KVM_DEBUG_FS is not set +CONFIG_PARAVIRT_TIME_ACCOUNTING=y +CONFIG_PARAVIRT_CLOCK=y +CONFIG_JAILHOUSE_GUEST=y +CONFIG_ACRN_GUEST=y +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +# CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_IA32_FEAT_CTL=y +CONFIG_X86_VMX_FEATURE_NAMES=y +CONFIG_PROCESSOR_SELECT=y +CONFIG_CPU_SUP_INTEL=y +CONFIG_CPU_SUP_AMD=y +CONFIG_CPU_SUP_HYGON=y +CONFIG_CPU_SUP_CENTAUR=y +CONFIG_CPU_SUP_ZHAOXIN=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y +CONFIG_GART_IOMMU=y +# CONFIG_MAXSMP is not set +CONFIG_NR_CPUS_RANGE_BEGIN=2 +CONFIG_NR_CPUS_RANGE_END=512 +CONFIG_NR_CPUS_DEFAULT=64 +CONFIG_NR_CPUS=320 +CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y +CONFIG_SCHED_MC_PRIO=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y +CONFIG_X86_MCE_INJECT=m +CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# +CONFIG_PERF_EVENTS_INTEL_UNCORE=m +CONFIG_PERF_EVENTS_INTEL_RAPL=m +CONFIG_PERF_EVENTS_INTEL_CSTATE=m +CONFIG_PERF_EVENTS_AMD_POWER=m +# end of Performance monitoring + +CONFIG_X86_VSYSCALL_EMULATION=y +CONFIG_X86_IOPL_IOPERM=y +CONFIG_I8K=m +CONFIG_MICROCODE=y +CONFIG_MICROCODE_INTEL=y +CONFIG_MICROCODE_AMD=y +CONFIG_MICROCODE_OLD_INTERFACE=y +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +CONFIG_X86_5LEVEL=y +CONFIG_X86_DIRECT_GBPAGES=y +# CONFIG_X86_CPA_STATISTICS is not set +CONFIG_AMD_MEM_ENCRYPT=y +CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT=y +CONFIG_NUMA=y +CONFIG_AMD_NUMA=y +CONFIG_X86_64_ACPI_NUMA=y +CONFIG_NODES_SPAN_OTHER_NODES=y +# CONFIG_NUMA_EMU is not set +CONFIG_NODES_SHIFT=5 +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y 
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_MEMORY_PROBE=y +CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_X86_PMEM_LEGACY_DEVICE=y +CONFIG_X86_PMEM_LEGACY=m +CONFIG_X86_CHECK_BIOS_CORRUPTION=y +CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y +CONFIG_X86_RESERVE_LOW=64 +CONFIG_MTRR=y +CONFIG_MTRR_SANITIZER=y +CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 +CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 +CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y +CONFIG_ARCH_RANDOM=y +CONFIG_X86_SMAP=y +CONFIG_X86_UMIP=y +CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +# CONFIG_X86_INTEL_TSX_MODE_OFF is not set +# CONFIG_X86_INTEL_TSX_MODE_ON is not set +CONFIG_X86_INTEL_TSX_MODE_AUTO=y +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_SECCOMP=y +# CONFIG_HZ_100 is not set +# CONFIG_HZ_250 is not set +CONFIG_HZ_300=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=300 +CONFIG_SCHED_HRTICK=y +# CONFIG_KEXEC is not set +# CONFIG_KEXEC_FILE is not set +CONFIG_CRASH_DUMP=y +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x1000000 +CONFIG_DYNAMIC_MEMORY_LAYOUT=y +CONFIG_RANDOMIZE_MEMORY=y +CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 +CONFIG_HOTPLUG_CPU=y +# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set +# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_COMPAT_VDSO is not set +# CONFIG_LEGACY_VSYSCALL_EMULATE is not set +# CONFIG_LEGACY_VSYSCALL_XONLY is not set +CONFIG_LEGACY_VSYSCALL_NONE=y +CONFIG_CMDLINE_BOOL=y +CONFIG_CMDLINE="pti=on page_alloc.shuffle=1" +# CONFIG_CMDLINE_OVERRIDE is not set +# CONFIG_MODIFY_LDT_SYSCALL is not set +CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set +# end of Processor type and features + +CONFIG_ARCH_HAS_ADD_PAGES=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_USE_PERCPU_NUMA_NODE_ID=y +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y +CONFIG_ARCH_ENABLE_THP_MIGRATION=y + +# +# Power management and ACPI options +# +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +# CONFIG_SUSPEND_SKIP_SYNC is not set +CONFIG_HIBERNATE_CALLBACKS=y +# CONFIG_HIBERNATION is not set +CONFIG_PM_SLEEP=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=100 +CONFIG_PM_WAKELOCKS_GC=y +CONFIG_PM=y +CONFIG_PM_DEBUG=y +CONFIG_PM_ADVANCED_DEBUG=y +# CONFIG_PM_TEST_SUSPEND is not set +CONFIG_PM_SLEEP_DEBUG=y +# CONFIG_DPM_WATCHDOG is not set +CONFIG_PM_TRACE=y +CONFIG_PM_TRACE_RTC=y +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y +CONFIG_PM_GENERIC_DOMAINS_SLEEP=y +CONFIG_PM_GENERIC_DOMAINS_OF=y +CONFIG_ENERGY_MODEL=y +CONFIG_ARCH_SUPPORTS_ACPI=y +CONFIG_ACPI=y +CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y +CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y +CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y +# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SPCR_TABLE=y +CONFIG_ACPI_LPIT=y +CONFIG_ACPI_SLEEP=y +# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y +# CONFIG_ACPI_EC_DEBUGFS is not set +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=y +CONFIG_ACPI_VIDEO=y +CONFIG_ACPI_FAN=y +CONFIG_ACPI_TAD=m +CONFIG_ACPI_DOCK=y +CONFIG_ACPI_CPU_FREQ_PSS=y +CONFIG_ACPI_PROCESSOR_CSTATE=y +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_CPPC_LIB=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_IPMI=m +CONFIG_ACPI_HOTPLUG_CPU=y +CONFIG_ACPI_PROCESSOR_AGGREGATOR=y +CONFIG_ACPI_THERMAL=y +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +# CONFIG_ACPI_DEBUG is not set 
+CONFIG_ACPI_PCI_SLOT=y +CONFIG_ACPI_CONTAINER=y +CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_HOTPLUG_IOAPIC=y +CONFIG_ACPI_SBS=m +CONFIG_ACPI_HED=y +# CONFIG_ACPI_CUSTOM_METHOD is not set +CONFIG_ACPI_BGRT=y +# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set +CONFIG_ACPI_NFIT=m +# CONFIG_NFIT_SECURITY_DEBUG is not set +CONFIG_ACPI_NUMA=y +CONFIG_ACPI_HMAT=y +CONFIG_HAVE_ACPI_APEI=y +CONFIG_HAVE_ACPI_APEI_NMI=y +CONFIG_ACPI_APEI=y +CONFIG_ACPI_APEI_GHES=y +CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_ACPI_APEI_MEMORY_FAILURE=y +CONFIG_ACPI_APEI_EINJ=m +CONFIG_ACPI_APEI_ERST_DEBUG=m +CONFIG_DPTF_POWER=m +CONFIG_ACPI_WATCHDOG=y +CONFIG_ACPI_EXTLOG=m +CONFIG_ACPI_ADXL=y +CONFIG_PMIC_OPREGION=y +CONFIG_BYTCRC_PMIC_OPREGION=y +CONFIG_CHTCRC_PMIC_OPREGION=y +CONFIG_XPOWER_PMIC_OPREGION=y +CONFIG_BXT_WC_PMIC_OPREGION=y +CONFIG_CHT_WC_PMIC_OPREGION=y +CONFIG_CHT_DC_TI_PMIC_OPREGION=y +CONFIG_ACPI_CONFIGFS=m +CONFIG_TPS68470_PMIC_OPREGION=y +CONFIG_X86_PM_TIMER=y +CONFIG_SFI=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=m +CONFIG_CPU_FREQ_GOV_USERSPACE=m +CONFIG_CPU_FREQ_GOV_ONDEMAND=m +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + +# +# CPU frequency scaling drivers +# +CONFIG_CPUFREQ_DT=m +CONFIG_CPUFREQ_DT_PLATDEV=y +CONFIG_X86_INTEL_PSTATE=y +CONFIG_X86_PCC_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ_CPB=y +CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_AMD_FREQ_SENSITIVITY=m +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +CONFIG_X86_P4_CLOCKMOD=m + +# +# shared options +# +CONFIG_X86_SPEEDSTEP_LIB=m +# end of CPU Frequency scaling + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +CONFIG_CPU_IDLE_GOV_TEO=y +CONFIG_CPU_IDLE_GOV_HALTPOLL=y +CONFIG_HALTPOLL_CPUIDLE=m +# end of CPU Idle + +CONFIG_INTEL_IDLE=y +# end of Power management and ACPI options + +# +# Bus options (PCI etc.) +# +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_XEN=y +CONFIG_MMCONF_FAM10H=y +# CONFIG_PCI_CNB20LE_QUIRK is not set +# CONFIG_ISA_BUS is not set +CONFIG_ISA_DMA_API=y +CONFIG_AMD_NB=y +# CONFIG_X86_SYSFB is not set +# end of Bus options (PCI etc.) 
+ +# +# Binary Emulations +# +CONFIG_IA32_EMULATION=y +# CONFIG_X86_X32 is not set +CONFIG_COMPAT_32=y +CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y +CONFIG_SYSVIPC_COMPAT=y +# end of Binary Emulations + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DMIID=y +CONFIG_DMI_SYSFS=m +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=m +CONFIG_FW_CFG_SYSFS=m +# CONFIG_FW_CFG_SYSFS_CMDLINE is not set +CONFIG_GOOGLE_FIRMWARE=y +# CONFIG_GOOGLE_SMI is not set +CONFIG_GOOGLE_COREBOOT_TABLE=m +CONFIG_GOOGLE_MEMCONSOLE=m +# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set +CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m +CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m +CONFIG_GOOGLE_VPD=m + +# +# EFI (Extensible Firmware Interface) Support +# +# CONFIG_EFI_VARS is not set +CONFIG_EFI_ESRT=y +# CONFIG_EFI_FAKE_MEMMAP is not set +CONFIG_EFI_SOFT_RESERVE=y +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_CAPSULE_LOADER=m +# CONFIG_EFI_TEST is not set +CONFIG_APPLE_PROPERTIES=y +CONFIG_RESET_ATTACK_MITIGATION=y +CONFIG_EFI_RCI2_TABLE=y +# CONFIG_EFI_DISABLE_PCI_DMA is not set +# end of EFI (Extensible Firmware Interface) Support + +CONFIG_EFI_EMBEDDED_FIRMWARE=y +CONFIG_UEFI_CPER=y +CONFIG_UEFI_CPER_X86=y +CONFIG_EFI_DEV_PATH_PARSER=y +CONFIG_EFI_EARLYCON=y + +# +# Tegra firmware driver +# +# end of Tegra firmware driver +# end of Firmware Drivers + +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_IRQFD=y +CONFIG_HAVE_KVM_IRQ_ROUTING=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_MMIO=y +CONFIG_KVM_ASYNC_PF=y +CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y +CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y +CONFIG_KVM_COMPAT=y +CONFIG_HAVE_KVM_IRQ_BYPASS=y +CONFIG_HAVE_KVM_NO_POLL=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_WERROR=y +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +CONFIG_KVM_AMD_SEV=y +CONFIG_KVM_MMU_AUDIT=y +CONFIG_AS_AVX512=y +CONFIG_AS_SHA1_NI=y +CONFIG_AS_SHA256_NI=y + +# +# General architecture-dependent options +# +CONFIG_HOTPLUG_SMT=y +CONFIG_OPROFILE=m +# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_OPROFILE_NMI_TIMER=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_OPTPROBES=y +CONFIG_KPROBES_ON_FTRACE=y +CONFIG_UPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y +CONFIG_KRETPROBES=y +CONFIG_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_OPTPROBES=y +CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y +CONFIG_HAVE_NMI=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_SET_DIRECT_MAP=y +CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y +CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y +CONFIG_HAVE_ASM_MODVERSIONS=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_RSEQ=y +CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_PERF_EVENTS_NMI=y +CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y +CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y +CONFIG_MMU_GATHER_TABLE_FREE=y +CONFIG_MMU_GATHER_RCU_TABLE_FREE=y +CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y 
+CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y +CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP_FILTER=y +CONFIG_HAVE_ARCH_STACKLEAK=y +CONFIG_HAVE_STACKPROTECTOR=y +CONFIG_CC_HAS_STACKPROTECTOR_NONE=y +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y +CONFIG_HAVE_CONTEXT_TRACKING=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_MOVE_PMD=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_HAVE_ARCH_SOFT_DIRTY=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_HAVE_EXIT_THREAD=y +CONFIG_ARCH_MMAP_RND_BITS=32 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_COPY_THREAD_TLS=y +CONFIG_HAVE_STACK_VALIDATION=y +CONFIG_HAVE_RELIABLE_STACKTRACE=y +CONFIG_ISA_BUS_API=y +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +CONFIG_COMPAT_32BIT_TIME=y +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y +CONFIG_ARCH_USE_MEMREMAP_PROT=y +CONFIG_LOCK_EVENT_COUNTS=y +CONFIG_ARCH_HAS_MEM_ENCRYPT=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +# end of GCOV-based kernel profiling + +CONFIG_HAVE_GCC_PLUGINS=y +CONFIG_GCC_PLUGINS=y +# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set +CONFIG_GCC_PLUGIN_LATENT_ENTROPY=y +# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set +# end of General architecture-dependent options + +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULE_SIG_FORMAT=y +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +# CONFIG_MODVERSIONS is not set +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +# CONFIG_MODULE_SIG_FORCE is not set +CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set +# CONFIG_MODULE_SIG_SHA224 is not set +# CONFIG_MODULE_SIG_SHA256 is not set +# CONFIG_MODULE_SIG_SHA384 is not set +CONFIG_MODULE_SIG_SHA512=y +CONFIG_MODULE_SIG_HASH="sha512" +CONFIG_MODULE_COMPRESS=y +# CONFIG_MODULE_COMPRESS_GZIP is not set +CONFIG_MODULE_COMPRESS_XZ=y +# CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is not set +CONFIG_UNUSED_SYMBOLS=y +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLK_RQ_ALLOC_TIME=y +CONFIG_BLK_SCSI_REQUEST=y +CONFIG_BLK_CGROUP_RWSTAT=y +CONFIG_BLK_DEV_BSG=y +CONFIG_BLK_DEV_BSGLIB=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_INTEGRITY_T10=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_DEV_THROTTLING_LOW=y +# CONFIG_BLK_CMDLINE_PARSER is not set +CONFIG_BLK_WBT=y +CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_WBT_MQ=y +CONFIG_BLK_DEBUG_FS=y +CONFIG_BLK_DEBUG_FS_ZONED=y +CONFIG_BLK_SED_OPAL=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_AIX_PARTITION=y +# CONFIG_OSF_PARTITION is not set +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +# 
CONFIG_UNIXWARE_DISKLABEL is not set +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +# CONFIG_SGI_PARTITION is not set +# CONFIG_ULTRIX_PARTITION is not set +# CONFIG_SUN_PARTITION is not set +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set +# CONFIG_CMDLINE_PARTITION is not set +# end of Partition Types + +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_MQ_VIRTIO=y +CONFIG_BLK_MQ_RDMA=y +CONFIG_BLK_PM=y + +# +# IO Schedulers +# +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +# CONFIG_BFQ_CGROUP_DEBUG is not set +# end of IO Schedulers + +CONFIG_PREEMPT_NOTIFIERS=y +CONFIG_PADATA=y +CONFIG_ASN1=y +CONFIG_UNINLINE_SPIN_UNLOCK=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y +CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y +CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_FREEZER=y + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +CONFIG_BINFMT_MISC=y +CONFIG_COREDUMP=y +# end of Executable file formats + +# +# Memory Management options +# +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_HAVE_MEMBLOCK_NODE_MAP=y +CONFIG_HAVE_FAST_GUP=y +CONFIG_NUMA_KEEP_MEMINFO=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_HAVE_BOOTMEM_INFO_NODE=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTPLUG_SPARSE=y +CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MEMORY_BALLOON=y +CONFIG_BALLOON_COMPACTION=y +CONFIG_COMPACTION=y +CONFIG_PAGE_REPORTING=y +CONFIG_MIGRATION=y +CONFIG_CONTIG_ALLOC=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y +CONFIG_HWPOISON_INJECT=m +CONFIG_TRANSPARENT_HUGEPAGE=y +# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set +CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y +CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_THP_SWAP=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +# CONFIG_CMA is not set +CONFIG_ZSWAP=y +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set +CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD is not set +CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4" +# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set +CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD=y +# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set +CONFIG_ZSWAP_ZPOOL_DEFAULT="z3fold" +CONFIG_ZSWAP_DEFAULT_ON=y +CONFIG_ZPOOL=y +CONFIG_ZBUD=y +CONFIG_Z3FOLD=y +CONFIG_ZSMALLOC=y +# CONFIG_PGTABLE_MAPPING is not set +# CONFIG_ZSMALLOC_STAT is not set +CONFIG_GENERIC_EARLY_IOREMAP=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set +CONFIG_ARCH_HAS_PTE_DEVMAP=y +CONFIG_ZONE_DEVICE=y +CONFIG_DEV_PAGEMAP_OPS=y +CONFIG_HMM_MIRROR=y +CONFIG_DEVICE_PRIVATE=y +CONFIG_FRAME_VECTOR=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y 
+CONFIG_ARCH_HAS_PKEYS=y +# CONFIG_PERCPU_STATS is not set +# CONFIG_GUP_BENCHMARK is not set +CONFIG_READ_ONLY_THP_FOR_FS=y +CONFIG_ARCH_HAS_PTE_SPECIAL=y +CONFIG_MAPPING_DIRTY_HELPERS=y +# end of Memory Management options + +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_INGRESS=y +CONFIG_NET_EGRESS=y +CONFIG_NET_REDIRECT=y +CONFIG_SKB_EXTENSIONS=y + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_DIAG=y +CONFIG_UNIX=y +CONFIG_UNIX_SCM=y +CONFIG_UNIX_DIAG=y +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +# CONFIG_TLS_TOE is not set +CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y +CONFIG_XFRM_ALGO=m +CONFIG_XFRM_USER=m +CONFIG_XFRM_INTERFACE=m +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_XFRM_IPCOMP=m +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +# CONFIG_IP_FIB_TRIE_STATS is not set +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_CLASSID=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IP_TUNNEL=m +CONFIG_NET_IPGRE=m +# CONFIG_NET_IPGRE_BROADCAST is not set +CONFIG_IP_MROUTE_COMMON=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESP_OFFLOAD=m +CONFIG_INET_ESPINTCP=y +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_DIAG=m +CONFIG_INET_TCP_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_INET_RAW_DIAG=m +CONFIG_INET_DIAG_DESTROY=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=y +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_NV=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m +CONFIG_TCP_CONG_CDG=m +CONFIG_TCP_CONG_BBR=m +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_RENO is not set +CONFIG_DEFAULT_TCP_CONG="cubic" +CONFIG_TCP_MD5SIG=y +# CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON is not set +CONFIG_IPV6=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESP_OFFLOAD=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_ILA=m +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y +CONFIG_IPV6_NDISC_NODETYPE=y +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IPV6_SEG6_HMAC=y +CONFIG_IPV6_SEG6_BPF=y +CONFIG_IPV6_RPL_LWTUNNEL=y +CONFIG_NETLABEL=y +CONFIG_MPTCP=y +CONFIG_MPTCP_IPV6=y +# CONFIG_MPTCP_HMAC_TEST is not set +CONFIG_NETWORK_SECMARK=y +CONFIG_NET_PTP_CLASSIFY=y +CONFIG_NETWORK_PHY_TIMESTAMPING=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_FAMILY_BRIDGE=y +CONFIG_NETFILTER_FAMILY_ARP=y +CONFIG_NETFILTER_NETLINK_ACCT=m +CONFIG_NETFILTER_NETLINK_QUEUE=m 
+CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NETFILTER_NETLINK_OSF=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_LOG_COMMON=m +CONFIG_NF_LOG_NETDEV=m +CONFIG_NETFILTER_CONNCOUNT=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_GRE=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y +CONFIG_NF_NAT=m +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_CONNLIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_OBJREF=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NF_DUP_NETDEV=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES=m + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +CONFIG_NETFILTER_XT_CONNMARK=m +CONFIG_NETFILTER_XT_SET=m + +# +# Xtables targets +# +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LED=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_RATEEST=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m 
+CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ECN=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +# end of Core Netfilter Configuration + +CONFIG_IP_SET=m +CONFIG_IP_SET_MAX=256 +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPMARK=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_IPMAC=m +CONFIG_IP_SET_HASH_MAC=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=15 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_AH_ESP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_FO=m +CONFIG_IP_VS_OVF=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_MH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS MH scheduler +# +CONFIG_IP_VS_MH_TAB_INDEX=12 + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PE_SIP=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +CONFIG_NF_SOCKET_IPV4=m +CONFIG_NF_TPROXY_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_REJECT_IPV4=m +CONFIG_NFT_DUP_IPV4=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_FLOW_TABLE_IPV4=m +CONFIG_NF_DUP_IPV4=m +CONFIG_NF_LOG_ARP=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_REJECT_IPV4=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_SYNPROXY=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +# 
end of IP: Netfilter Configuration + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_SOCKET_IPV6=m +CONFIG_NF_TPROXY_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_REJECT_IPV6=m +CONFIG_NFT_DUP_IPV6=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NF_FLOW_TABLE_IPV6=m +CONFIG_NF_DUP_IPV6=m +CONFIG_NF_REJECT_IPV6=m +CONFIG_NF_LOG_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_SRH=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_TARGET_SYNPROXY=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_IP6_NF_TARGET_NPT=m +# end of IPv6: Netfilter Configuration + +CONFIG_NF_DEFRAG_IPV6=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_BRIDGE_REJECT=m +CONFIG_NF_LOG_BRIDGE=m +CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +# CONFIG_BPFILTER is not set +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m + +# +# DCCP CCIDs Configuration +# +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=y +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_TFRC_LIB=y +# end of DCCP CCIDs Configuration + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +# end of DCCP Kernel Hacking + +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_OBJCNT is not set +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set +CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set +CONFIG_SCTP_COOKIE_HMAC_MD5=y +CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +# CONFIG_RDS_DEBUG is not set +CONFIG_TIPC=m +CONFIG_TIPC_MEDIA_IB=y +CONFIG_TIPC_MEDIA_UDP=y +CONFIG_TIPC_CRYPTO=y +CONFIG_TIPC_DIAG=m +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_L2TP=m +# CONFIG_L2TP_DEBUGFS is not set +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_STP=m +CONFIG_GARP=m +CONFIG_MRP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_HAVE_NET_DSA=y +CONFIG_NET_DSA=m +CONFIG_NET_DSA_TAG_8021Q=m +CONFIG_NET_DSA_TAG_AR9331=m +CONFIG_NET_DSA_TAG_BRCM_COMMON=m +CONFIG_NET_DSA_TAG_BRCM=m +CONFIG_NET_DSA_TAG_BRCM_PREPEND=m +CONFIG_NET_DSA_TAG_GSWIP=m +CONFIG_NET_DSA_TAG_DSA=m +CONFIG_NET_DSA_TAG_EDSA=m +CONFIG_NET_DSA_TAG_MTK=m +CONFIG_NET_DSA_TAG_KSZ=m +CONFIG_NET_DSA_TAG_OCELOT=m +CONFIG_NET_DSA_TAG_QCA=m +CONFIG_NET_DSA_TAG_LAN9303=m +CONFIG_NET_DSA_TAG_SJA1105=m +CONFIG_NET_DSA_TAG_TRAILER=m +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_VLAN_8021Q_MVRP=y +# CONFIG_DECNET is not set +CONFIG_LLC=m +CONFIG_LLC2=m +# CONFIG_ATALK is not set +# 
CONFIG_X25 is not set +# CONFIG_LAPB is not set +CONFIG_PHONET=m +CONFIG_6LOWPAN=m +# CONFIG_6LOWPAN_DEBUGFS is not set +CONFIG_6LOWPAN_NHC=m +CONFIG_6LOWPAN_NHC_DEST=m +CONFIG_6LOWPAN_NHC_FRAGMENT=m +CONFIG_6LOWPAN_NHC_HOP=m +CONFIG_6LOWPAN_NHC_IPV6=m +CONFIG_6LOWPAN_NHC_MOBILITY=m +CONFIG_6LOWPAN_NHC_ROUTING=m +CONFIG_6LOWPAN_NHC_UDP=m +CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m +CONFIG_6LOWPAN_GHC_UDP=m +CONFIG_6LOWPAN_GHC_ICMPV6=m +CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m +CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m +CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y +CONFIG_IEEE802154_SOCKET=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_CBS=m +CONFIG_NET_SCH_ETF=m +CONFIG_NET_SCH_TAPRIO=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_SKBPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=y +CONFIG_NET_SCH_CAKE=m +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_HHF=m +CONFIG_NET_SCH_PIE=m +CONFIG_NET_SCH_FQ_PIE=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_ETS=m +CONFIG_NET_SCH_DEFAULT=y +# CONFIG_DEFAULT_FQ is not set +# CONFIG_DEFAULT_CODEL is not set +CONFIG_DEFAULT_FQ_CODEL=y +# CONFIG_DEFAULT_SFQ is not set +# CONFIG_DEFAULT_PFIFO_FAST is not set +CONFIG_DEFAULT_NET_SCH="fq_codel" + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=m +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_CANID=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_IPT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_MPLS=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_CTINFO=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +CONFIG_NET_TC_SKB_EXT=y +CONFIG_NET_SCH_FIFO=y +CONFIG_DCB=y +CONFIG_DNS_RESOLVER=m +CONFIG_BATMAN_ADV=m +CONFIG_BATMAN_ADV_BATMAN_V=y +CONFIG_BATMAN_ADV_BLA=y +CONFIG_BATMAN_ADV_DAT=y +CONFIG_BATMAN_ADV_NC=y +CONFIG_BATMAN_ADV_MCAST=y +# CONFIG_BATMAN_ADV_DEBUGFS is not set +# CONFIG_BATMAN_ADV_DEBUG is not set +CONFIG_BATMAN_ADV_SYSFS=y +# CONFIG_BATMAN_ADV_TRACING is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m +CONFIG_VSOCKETS=m +CONFIG_VSOCKETS_DIAG=m +CONFIG_VSOCKETS_LOOPBACK=m +CONFIG_VMWARE_VMCI_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m 
+CONFIG_VIRTIO_VSOCKETS_COMMON=m +CONFIG_HYPERV_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m +CONFIG_HSR=m +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_NET_NCSI=y +CONFIG_NCSI_OEM_CMD_GET_MAC=y +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_XPS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_STREAM_PARSER=y +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +CONFIG_NET_PKTGEN=m +CONFIG_NET_DROP_MONITOR=y +# end of Network testing +# end of Networking options + +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_MKISS=m +CONFIG_6PACK=m +CONFIG_BPQETHER=m +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m +CONFIG_BAYCOM_PAR=m +CONFIG_YAM=m +# end of AX.25 network device drivers + +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_GW=m +CONFIG_CAN_J1939=m + +# +# CAN Device Drivers +# +CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_CALC_BITTIMING=y +CONFIG_CAN_FLEXCAN=m +CONFIG_CAN_GRCAN=m +CONFIG_CAN_JANZ_ICAN3=m +CONFIG_CAN_KVASER_PCIEFD=m +CONFIG_CAN_C_CAN=m +CONFIG_CAN_C_CAN_PLATFORM=m +CONFIG_CAN_C_CAN_PCI=m +CONFIG_CAN_CC770=m +# CONFIG_CAN_CC770_ISA is not set +CONFIG_CAN_CC770_PLATFORM=m +CONFIG_CAN_IFI_CANFD=m +CONFIG_CAN_M_CAN=m +CONFIG_CAN_M_CAN_PLATFORM=m +CONFIG_CAN_M_CAN_TCAN4X5X=m +CONFIG_CAN_PEAK_PCIEFD=m +CONFIG_CAN_SJA1000=m +CONFIG_CAN_EMS_PCI=m +# CONFIG_CAN_EMS_PCMCIA is not set +CONFIG_CAN_F81601=m +CONFIG_CAN_KVASER_PCI=m +CONFIG_CAN_PEAK_PCI=m +CONFIG_CAN_PEAK_PCIEC=y +CONFIG_CAN_PEAK_PCMCIA=m +CONFIG_CAN_PLX_PCI=m +# CONFIG_CAN_SJA1000_ISA is not set +CONFIG_CAN_SJA1000_PLATFORM=m +CONFIG_CAN_SOFTING=m +CONFIG_CAN_SOFTING_CS=m + +# +# CAN SPI interfaces +# +CONFIG_CAN_HI311X=m +CONFIG_CAN_MCP251X=m +# end of CAN SPI interfaces + +# +# CAN USB interfaces +# +CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_EMS_USB=m +CONFIG_CAN_ESD_USB2=m +CONFIG_CAN_GS_USB=m +CONFIG_CAN_KVASER_USB=m +CONFIG_CAN_MCBA_USB=m +CONFIG_CAN_PEAK_USB=m +CONFIG_CAN_UCAN=m +# end of CAN USB interfaces + +# CONFIG_CAN_DEBUG_DEVICES is not set +# end of CAN Device Drivers + +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_BT_6LOWPAN=m +CONFIG_BT_LEDS=y +# CONFIG_BT_SELFTEST is not set +# CONFIG_BT_DEBUGFS is not set + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_QCA=m +CONFIG_BT_HCIBTUSB=m +CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_MTK=y +CONFIG_BT_HCIBTUSB_RTL=y +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_3WIRE=y +CONFIG_BT_HCIUART_INTEL=y +CONFIG_BT_HCIUART_BCM=y +CONFIG_BT_HCIUART_RTL=y +CONFIG_BT_HCIUART_QCA=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIUART_MRVL=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_BT_ATH3K=m +CONFIG_BT_MTKSDIO=m +CONFIG_BT_MTKUART=m 
+CONFIG_BT_HCIRSI=m +# end of Bluetooth device drivers + +CONFIG_AF_RXRPC=m +CONFIG_AF_RXRPC_IPV6=y +# CONFIG_AF_RXRPC_INJECT_LOSS is not set +# CONFIG_AF_RXRPC_DEBUG is not set +CONFIG_RXKAD=y +CONFIG_AF_KCM=m +CONFIG_STREAM_PARSER=y +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y +CONFIG_CFG80211=m +# CONFIG_NL80211_TESTMODE is not set +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +# CONFIG_CFG80211_CERTIFICATION_ONUS is not set +CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y +CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y +CONFIG_CFG80211_DEFAULT_PS=y +# CONFIG_CFG80211_DEBUGFS is not set +CONFIG_CFG80211_CRDA_SUPPORT=y +CONFIG_CFG80211_WEXT=y +CONFIG_CFG80211_WEXT_EXPORT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m +CONFIG_LIB80211_CRYPT_TKIP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +CONFIG_MAC80211_MESH=y +CONFIG_MAC80211_LEDS=y +# CONFIG_MAC80211_DEBUGFS is not set +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_WIMAX=m +CONFIG_WIMAX_DEBUG_LEVEL=8 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=m +CONFIG_NET_9P=m +CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_XEN=m +CONFIG_NET_9P_RDMA=m +# CONFIG_NET_9P_DEBUG is not set +CONFIG_CAIF=m +# CONFIG_CAIF_DEBUG is not set +CONFIG_CAIF_NETDEV=m +CONFIG_CAIF_USB=m +CONFIG_CEPH_LIB=m +CONFIG_CEPH_LIB_PRETTYDEBUG=y +CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y +CONFIG_NFC=m +CONFIG_NFC_DIGITAL=m +CONFIG_NFC_NCI=m +CONFIG_NFC_NCI_SPI=m +CONFIG_NFC_NCI_UART=m +CONFIG_NFC_HCI=m +CONFIG_NFC_SHDLC=y + +# +# Near Field Communication (NFC) devices +# +CONFIG_NFC_TRF7970A=m +CONFIG_NFC_MEI_PHY=m +CONFIG_NFC_SIM=m +CONFIG_NFC_PORT100=m +CONFIG_NFC_FDP=m +CONFIG_NFC_FDP_I2C=m +CONFIG_NFC_PN544=m +CONFIG_NFC_PN544_I2C=m +CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m +CONFIG_NFC_PN532_UART=m +CONFIG_NFC_MICROREAD=m +CONFIG_NFC_MICROREAD_I2C=m +CONFIG_NFC_MICROREAD_MEI=m +CONFIG_NFC_MRVL=m +CONFIG_NFC_MRVL_USB=m +CONFIG_NFC_MRVL_UART=m +CONFIG_NFC_MRVL_I2C=m +CONFIG_NFC_MRVL_SPI=m +CONFIG_NFC_ST21NFCA=m +CONFIG_NFC_ST21NFCA_I2C=m +CONFIG_NFC_ST_NCI=m +CONFIG_NFC_ST_NCI_I2C=m +CONFIG_NFC_ST_NCI_SPI=m +CONFIG_NFC_NXP_NCI=m +CONFIG_NFC_NXP_NCI_I2C=m +CONFIG_NFC_S3FWRN5=m +CONFIG_NFC_S3FWRN5_I2C=m +CONFIG_NFC_ST95HF=m +# end of Near Field Communication (NFC) devices + +CONFIG_PSAMPLE=m +CONFIG_NET_IFE=m +CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y +CONFIG_SOCK_VALIDATE_XMIT=y +CONFIG_NET_SOCK_MSG=y +CONFIG_NET_DEVLINK=y +CONFIG_PAGE_POOL=y +CONFIG_FAILOVER=m +CONFIG_ETHTOOL_NETLINK=y +CONFIG_HAVE_EBPF_JIT=y + +# +# Device Drivers +# +CONFIG_HAVE_EISA=y +# CONFIG_EISA is not set +CONFIG_HAVE_PCI=y +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI_PCIE=y +CONFIG_PCIEAER=y +# CONFIG_PCIEAER_INJECT is not set +CONFIG_PCIE_ECRC=y +CONFIG_PCIEASPM=y +CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_POWERSAVE is not set +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_PTM=y +# CONFIG_PCIE_BW is not set +CONFIG_PCIE_EDR=y +CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +CONFIG_PCI_QUIRKS=y +# CONFIG_PCI_DEBUG is 
not set +CONFIG_PCI_REALLOC_ENABLE_AUTO=y +CONFIG_PCI_STUB=y +CONFIG_PCI_PF_STUB=m +CONFIG_XEN_PCIDEV_FRONTEND=m +CONFIG_PCI_ATS=y +CONFIG_PCI_ECAM=y +CONFIG_PCI_LOCKLESS_CONFIG=y +CONFIG_PCI_IOV=y +CONFIG_PCI_PRI=y +CONFIG_PCI_PASID=y +CONFIG_PCI_P2PDMA=y +CONFIG_PCI_LABEL=y +CONFIG_PCI_HYPERV=m +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +CONFIG_HOTPLUG_PCI_ACPI_IBM=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m +CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +CONFIG_HOTPLUG_PCI_SHPC=y + +# +# PCI controller drivers +# +CONFIG_PCI_FTPCI100=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCIE_XILINX=y +CONFIG_VMD=m +CONFIG_PCI_HYPERV_INTERFACE=m + +# +# DesignWare PCI Core Support +# +CONFIG_PCIE_DW=y +CONFIG_PCIE_DW_HOST=y +CONFIG_PCIE_DW_EP=y +CONFIG_PCIE_DW_PLAT=y +CONFIG_PCIE_DW_PLAT_HOST=y +CONFIG_PCIE_DW_PLAT_EP=y +CONFIG_PCIE_INTEL_GW=y +CONFIG_PCI_MESON=y +# end of DesignWare PCI Core Support + +# +# Mobiveil PCIe Core Support +# +# end of Mobiveil PCIe Core Support + +# +# Cadence PCIe controllers support +# +CONFIG_PCIE_CADENCE=y +CONFIG_PCIE_CADENCE_HOST=y +CONFIG_PCIE_CADENCE_EP=y +CONFIG_PCIE_CADENCE_PLAT=y +CONFIG_PCIE_CADENCE_PLAT_HOST=y +CONFIG_PCIE_CADENCE_PLAT_EP=y +# end of Cadence PCIe controllers support +# end of PCI controller drivers + +# +# PCI Endpoint +# +CONFIG_PCI_ENDPOINT=y +CONFIG_PCI_ENDPOINT_CONFIGFS=y +# CONFIG_PCI_EPF_TEST is not set +# end of PCI Endpoint + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m +# end of PCI switch controller drivers + +CONFIG_PCCARD=m +CONFIG_PCMCIA=m +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=m +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +CONFIG_PD6729=m +CONFIG_I82092=m +CONFIG_PCCARD_NONSTATIC=y +CONFIG_RAPIDIO=m +CONFIG_RAPIDIO_TSI721=m +CONFIG_RAPIDIO_DISC_TIMEOUT=30 +CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y +CONFIG_RAPIDIO_DMA_ENGINE=y +# CONFIG_RAPIDIO_DEBUG is not set +CONFIG_RAPIDIO_ENUM_BASIC=m +CONFIG_RAPIDIO_CHMAN=m +CONFIG_RAPIDIO_MPORT_CDEV=m + +# +# RapidIO Switch drivers +# +CONFIG_RAPIDIO_TSI57X=m +CONFIG_RAPIDIO_CPS_XX=m +CONFIG_RAPIDIO_TSI568=m +CONFIG_RAPIDIO_CPS_GEN2=m +CONFIG_RAPIDIO_RXS_GEN3=m +# end of RapidIO Switch drivers + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y + +# +# Firmware loader +# +CONFIG_FW_LOADER=y +CONFIG_FW_LOADER_PAGED_BUF=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_FW_LOADER_USER_HELPER is not set +CONFIG_FW_LOADER_COMPRESS=y +CONFIG_FW_CACHE=y +# end of Firmware loader + +CONFIG_WANT_DEV_COREDUMP=y +CONFIG_ALLOW_DEV_COREDUMP=y +CONFIG_DEV_COREDUMP=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +CONFIG_HMEM_REPORTING=y +# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set +CONFIG_SYS_HYPERVISOR=y +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=y +CONFIG_REGMAP_SLIMBUS=m +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_SPMI=m +CONFIG_REGMAP_W1=m +CONFIG_REGMAP_MMIO=y +CONFIG_REGMAP_IRQ=y +CONFIG_REGMAP_SOUNDWIRE=m +CONFIG_REGMAP_SCCB=m +CONFIG_REGMAP_I3C=m +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +# end of Generic Driver Options + +# +# Bus devices +# +CONFIG_MOXTET=m +CONFIG_SIMPLE_PM_BUS=y +CONFIG_MHI_BUS=m +# end of Bus devices + +CONFIG_CONNECTOR=y +CONFIG_PROC_EVENTS=y +CONFIG_GNSS=m 
+CONFIG_GNSS_SERIAL=m +CONFIG_GNSS_MTK_SERIAL=m +CONFIG_GNSS_SIRF_SERIAL=m +CONFIG_GNSS_UBX_SERIAL=m +CONFIG_MTD=m +CONFIG_MTD_TESTS=m + +# +# Partition parsers +# +CONFIG_MTD_AR7_PARTS=m +CONFIG_MTD_CMDLINE_PARTS=m +CONFIG_MTD_OF_PARTS=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 +# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set +# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set +# end of Partition parsers + +# +# User Modules And Translation Layers +# +CONFIG_MTD_BLKDEVS=m +CONFIG_MTD_BLOCK=m +CONFIG_MTD_BLOCK_RO=m +CONFIG_FTL=m +CONFIG_NFTL=m +CONFIG_NFTL_RW=y +CONFIG_INFTL=m +CONFIG_RFD_FTL=m +CONFIG_SSFDC=m +CONFIG_SM_FTL=m +CONFIG_MTD_OOPS=m +CONFIG_MTD_SWAP=m +CONFIG_MTD_PARTITIONED_MASTER=y + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +CONFIG_MTD_CFI_INTELEXT=m +CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +CONFIG_MTD_CFI_UTIL=m +CONFIG_MTD_RAM=m +CONFIG_MTD_ROM=m +CONFIG_MTD_ABSENT=m +# end of RAM/ROM/Flash chip drivers + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +# CONFIG_MTD_PHYSMAP_COMPAT is not set +CONFIG_MTD_PHYSMAP_OF=y +CONFIG_MTD_PHYSMAP_VERSATILE=y +CONFIG_MTD_PHYSMAP_GEMINI=y +CONFIG_MTD_PHYSMAP_GPIO_ADDR=y +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICHXROM=m +CONFIG_MTD_ESB2ROM=m +CONFIG_MTD_CK804XROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m +CONFIG_MTD_PCMCIA=m +# CONFIG_MTD_PCMCIA_ANONYMOUS is not set +CONFIG_MTD_INTEL_VR_NOR=m +CONFIG_MTD_PLATRAM=m +# end of Mapping drivers for chip access + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m +# CONFIG_MTD_PMC551_BUGFIX is not set +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_DATAFLASH=m +# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set +CONFIG_MTD_DATAFLASH_OTP=y +CONFIG_MTD_MCHP23K256=m +CONFIG_MTD_SST25L=m +CONFIG_MTD_SLRAM=m +CONFIG_MTD_PHRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLOCK2MTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOCG3=m +CONFIG_BCH_CONST_M=14 +CONFIG_BCH_CONST_T=4 +# end of Self-contained MTD device drivers + +CONFIG_MTD_NAND_CORE=m +CONFIG_MTD_ONENAND=m +# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set +CONFIG_MTD_ONENAND_GENERIC=m +CONFIG_MTD_ONENAND_OTP=y +CONFIG_MTD_ONENAND_2X_PROGRAM=y +CONFIG_MTD_NAND_ECC_SW_HAMMING=m +CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y +CONFIG_MTD_RAW_NAND=m +CONFIG_MTD_NAND_ECC_SW_BCH=y + +# +# Raw/parallel NAND flash controllers +# +CONFIG_MTD_NAND_DENALI=m +CONFIG_MTD_NAND_DENALI_PCI=m +CONFIG_MTD_NAND_DENALI_DT=m +CONFIG_MTD_NAND_CAFE=m +CONFIG_MTD_NAND_MXIC=m +CONFIG_MTD_NAND_GPIO=m +CONFIG_MTD_NAND_PLATFORM=m +CONFIG_MTD_NAND_CADENCE=m + +# +# Misc +# +CONFIG_MTD_SM_COMMON=m +CONFIG_MTD_NAND_NANDSIM=m +CONFIG_MTD_NAND_RICOH=m +CONFIG_MTD_NAND_DISKONCHIP=m +# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 +CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y +CONFIG_MTD_SPI_NAND=m + +# +# LPDDR & LPDDR2 PCM memory drivers +# +CONFIG_MTD_LPDDR=m +CONFIG_MTD_QINFO_PROBE=m +# end of LPDDR & LPDDR2 PCM memory drivers + +CONFIG_MTD_SPI_NOR=m +CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y +CONFIG_SPI_INTEL_SPI=m +CONFIG_SPI_INTEL_SPI_PCI=m +CONFIG_SPI_INTEL_SPI_PLATFORM=m +CONFIG_MTD_UBI=m 
+CONFIG_MTD_UBI_WL_THRESHOLD=4096 +CONFIG_MTD_UBI_BEB_LIMIT=20 +CONFIG_MTD_UBI_FASTMAP=y +CONFIG_MTD_UBI_GLUEBI=m +CONFIG_MTD_UBI_BLOCK=y +CONFIG_MTD_HYPERBUS=m +CONFIG_DTC=y +CONFIG_OF=y +# CONFIG_OF_UNITTEST is not set +CONFIG_OF_FLATTREE=y +CONFIG_OF_KOBJ=y +CONFIG_OF_DYNAMIC=y +CONFIG_OF_ADDRESS=y +CONFIG_OF_IRQ=y +CONFIG_OF_NET=y +CONFIG_OF_MDIO=m +CONFIG_OF_RESOLVE=y +CONFIG_OF_OVERLAY=y +CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_PC_PCMCIA=m +CONFIG_PARPORT_AX88796=m +CONFIG_PARPORT_1284=y +CONFIG_PARPORT_NOT_PC=y +CONFIG_PNP=y +CONFIG_PNP_DEBUG_MESSAGES=y + +# +# Protocols +# +CONFIG_PNPACPI=y +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_BLK_DEV_FD=m +CONFIG_CDROM=m +# CONFIG_PARIDE is not set +CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m +CONFIG_ZRAM=m +CONFIG_ZRAM_WRITEBACK=y +# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m +# CONFIG_DRBD_FAULT_INJECTION is not set +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_SKD=m +CONFIG_BLK_DEV_SX8=m +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_CDROM_PKTCDVD=m +CONFIG_CDROM_PKTCDVD_BUFFERS=8 +# CONFIG_CDROM_PKTCDVD_WCACHE is not set +CONFIG_ATA_OVER_ETH=m +CONFIG_XEN_BLKDEV_FRONTEND=m +CONFIG_XEN_BLKDEV_BACKEND=m +CONFIG_VIRTIO_BLK=m +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_RSXX=m + +# +# NVME Support +# +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y +CONFIG_NVME_MULTIPATH=y +CONFIG_NVME_HWMON=y +CONFIG_NVME_FABRICS=m +CONFIG_NVME_RDMA=m +CONFIG_NVME_FC=m +CONFIG_NVME_TCP=m +CONFIG_NVME_TARGET=m +CONFIG_NVME_TARGET_LOOP=m +CONFIG_NVME_TARGET_RDMA=m +CONFIG_NVME_TARGET_FC=m +CONFIG_NVME_TARGET_FCLOOP=m +CONFIG_NVME_TARGET_TCP=m +# end of NVME Support + +# +# Misc devices +# +CONFIG_SENSORS_LIS3LV02D=m +CONFIG_AD525X_DPOT=m +CONFIG_AD525X_DPOT_I2C=m +CONFIG_AD525X_DPOT_SPI=m +# CONFIG_DUMMY_IRQ is not set +CONFIG_IBM_ASM=m +CONFIG_PHANTOM=m +CONFIG_TIFM_CORE=m +CONFIG_TIFM_7XX1=m +CONFIG_ICS932S401=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_HP_ILO=m +CONFIG_APDS9802ALS=m +CONFIG_ISL29003=m +CONFIG_ISL29020=m +CONFIG_SENSORS_TSL2550=m +CONFIG_SENSORS_BH1770=m +CONFIG_SENSORS_APDS990X=m +CONFIG_HMC6352=m +CONFIG_DS1682=m +CONFIG_VMWARE_BALLOON=m +CONFIG_LATTICE_ECP3_CONFIG=m +# CONFIG_SRAM is not set +CONFIG_PCI_ENDPOINT_TEST=m +CONFIG_XILINX_SDFEC=m +CONFIG_MISC_RTSX=m +CONFIG_PVPANIC=m +CONFIG_C2PORT=m +CONFIG_C2PORT_DURAMAR_2150=m + +# +# EEPROM support +# +CONFIG_EEPROM_AT24=m +# CONFIG_EEPROM_AT25 is not set +CONFIG_EEPROM_LEGACY=m +CONFIG_EEPROM_MAX6875=m +CONFIG_EEPROM_93CX6=m +# CONFIG_EEPROM_93XX46 is not set +CONFIG_EEPROM_IDT_89HPESX=m +CONFIG_EEPROM_EE1004=m +# end of EEPROM support + +CONFIG_CB710_CORE=m +# CONFIG_CB710_DEBUG is not set +CONFIG_CB710_DEBUG_ASSUMPTIONS=y + +# +# Texas Instruments shared transport line discipline +# +CONFIG_TI_ST=m +# end of Texas Instruments shared transport line discipline + +CONFIG_SENSORS_LIS3_I2C=m +CONFIG_ALTERA_STAPL=m +CONFIG_INTEL_MEI=m +CONFIG_INTEL_MEI_ME=m +CONFIG_INTEL_MEI_TXE=m +CONFIG_INTEL_MEI_HDCP=m +CONFIG_VMWARE_VMCI=m + +# +# Intel MIC & related support +# +CONFIG_INTEL_MIC_BUS=m +CONFIG_SCIF_BUS=m +CONFIG_VOP_BUS=m +CONFIG_INTEL_MIC_HOST=m +CONFIG_INTEL_MIC_CARD=m +CONFIG_SCIF=m +CONFIG_MIC_COSM=m +CONFIG_VOP=m +# end of Intel MIC & related support + +CONFIG_GENWQE=m 
+CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 +CONFIG_ECHO=m +CONFIG_MISC_ALCOR_PCI=m +CONFIG_MISC_RTSX_PCI=m +CONFIG_MISC_RTSX_USB=m +CONFIG_HABANA_AI=m +CONFIG_UACCE=m +# end of Misc devices + +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +CONFIG_SCSI_MOD=y +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=y +CONFIG_SCSI_DMA=y +CONFIG_SCSI_NETLINK=y +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=y +CONFIG_CHR_DEV_ST=m +CONFIG_BLK_DEV_SR=m +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y + +# +# SCSI Transports +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SAS_ATA=y +CONFIG_SCSI_SAS_HOST_SMP=y +CONFIG_SCSI_SRP_ATTRS=m +# end of SCSI Transports + +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=m +CONFIG_ISCSI_BOOT_SYSFS=m +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_SCSI_BNX2X_FCOE=m +CONFIG_BE2ISCSI=m +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m +CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +CONFIG_AIC7XXX_DEBUG_ENABLE=y +CONFIG_AIC7XXX_DEBUG_MASK=0 +CONFIG_AIC7XXX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +CONFIG_AIC79XX_DEBUG_ENABLE=y +CONFIG_AIC79XX_DEBUG_MASK=0 +CONFIG_AIC79XX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC94XX=m +CONFIG_AIC94XX_DEBUG=y +CONFIG_SCSI_MVSAS=m +CONFIG_SCSI_MVSAS_DEBUG=y +CONFIG_SCSI_MVSAS_TASKLET=y +CONFIG_SCSI_MVUMI=m +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_ARCMSR=m +CONFIG_SCSI_ESAS2R=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT3SAS=m +CONFIG_SCSI_MPT2SAS_MAX_SGE=128 +CONFIG_SCSI_MPT3SAS_MAX_SGE=128 +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_SMARTPQI=m +CONFIG_SCSI_UFSHCD=m +CONFIG_SCSI_UFSHCD_PCI=m +# CONFIG_SCSI_UFS_DWC_TC_PCI is not set +CONFIG_SCSI_UFSHCD_PLATFORM=m +CONFIG_SCSI_UFS_CDNS_PLATFORM=m +# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set +CONFIG_SCSI_UFS_BSG=y +CONFIG_SCSI_HPTIOP=m +CONFIG_SCSI_BUSLOGIC=m +CONFIG_SCSI_FLASHPOINT=y +CONFIG_SCSI_MYRB=m +CONFIG_SCSI_MYRS=m +CONFIG_VMWARE_PVSCSI=m +CONFIG_XEN_SCSI_FRONTEND=m +CONFIG_HYPERV_STORAGE=m +CONFIG_LIBFC=m +CONFIG_LIBFCOE=m +CONFIG_FCOE=m +CONFIG_FCOE_FNIC=m +CONFIG_SCSI_SNIC=m +# CONFIG_SCSI_SNIC_DEBUG_FS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_FDOMAIN=m +CONFIG_SCSI_FDOMAIN_PCI=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_ISCI=m +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_STEX=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +CONFIG_SCSI_SYM53C8XX_MMIO=y +CONFIG_SCSI_IPR=m +CONFIG_SCSI_IPR_TRACE=y +CONFIG_SCSI_IPR_DUMP=y +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA_FC=m +CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_QEDI=m +CONFIG_QEDF=m +CONFIG_SCSI_LPFC=m +# CONFIG_SCSI_LPFC_DEBUG_FS is not set +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_WD719X=m +CONFIG_SCSI_DEBUG=m +CONFIG_SCSI_PMCRAID=m 
+CONFIG_SCSI_PM8001=m +CONFIG_SCSI_BFA_FC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_CHELSIO_FCOE=m +CONFIG_SCSI_LOWLEVEL_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_QLOGIC=m +CONFIG_PCMCIA_SYM53C500=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +# end of SCSI device support + +CONFIG_ATA=y +CONFIG_SATA_HOST=y +CONFIG_PATA_TIMINGS=y +CONFIG_ATA_VERBOSE_ERROR=y +CONFIG_ATA_FORCE=y +CONFIG_ATA_ACPI=y +CONFIG_SATA_ZPODD=y +CONFIG_SATA_PMP=y + +# +# Controllers with non-SFF native interface +# +CONFIG_SATA_AHCI=y +CONFIG_SATA_MOBILE_LPM_POLICY=3 +CONFIG_SATA_AHCI_PLATFORM=m +CONFIG_AHCI_CEVA=m +CONFIG_AHCI_QORIQ=m +CONFIG_SATA_INIC162X=m +CONFIG_SATA_ACARD_AHCI=m +CONFIG_SATA_SIL24=m +CONFIG_ATA_SFF=y + +# +# SFF controllers with custom DMA interface +# +CONFIG_PDC_ADMA=m +CONFIG_SATA_QSTOR=m +CONFIG_SATA_SX4=m +CONFIG_ATA_BMDMA=y + +# +# SATA SFF controllers with BMDMA +# +CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set +CONFIG_SATA_MV=m +CONFIG_SATA_NV=m +CONFIG_SATA_PROMISE=m +CONFIG_SATA_SIL=m +CONFIG_SATA_SIS=m +CONFIG_SATA_SVW=m +CONFIG_SATA_ULI=m +CONFIG_SATA_VIA=m +CONFIG_SATA_VITESSE=m + +# +# PATA SFF controllers with BMDMA +# +CONFIG_PATA_ALI=m +CONFIG_PATA_AMD=m +CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATIIXP=m +CONFIG_PATA_ATP867X=m +CONFIG_PATA_CMD64X=m +CONFIG_PATA_CYPRESS=m +CONFIG_PATA_EFAR=m +CONFIG_PATA_HPT366=m +CONFIG_PATA_HPT37X=m +CONFIG_PATA_HPT3X2N=m +CONFIG_PATA_HPT3X3=m +CONFIG_PATA_HPT3X3_DMA=y +CONFIG_PATA_IT8213=m +CONFIG_PATA_IT821X=m +CONFIG_PATA_JMICRON=m +CONFIG_PATA_MARVELL=m +CONFIG_PATA_NETCELL=m +CONFIG_PATA_NINJA32=m +CONFIG_PATA_NS87415=m +CONFIG_PATA_OLDPIIX=m +CONFIG_PATA_OPTIDMA=m +CONFIG_PATA_PDC2027X=m +CONFIG_PATA_PDC_OLD=m +CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m +CONFIG_PATA_SCH=m +CONFIG_PATA_SERVERWORKS=m +CONFIG_PATA_SIL680=m +CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m +CONFIG_PATA_TRIFLEX=m +CONFIG_PATA_VIA=m +CONFIG_PATA_WINBOND=m + +# +# PIO-only SFF controllers +# +CONFIG_PATA_CMD640_PCI=m +CONFIG_PATA_MPIIX=m +CONFIG_PATA_NS87410=m +CONFIG_PATA_OPTI=m +CONFIG_PATA_PCMCIA=m +# CONFIG_PATA_PLATFORM is not set +CONFIG_PATA_RZ1000=m + +# +# Generic fallback / legacy drivers +# +CONFIG_PATA_ACPI=m +CONFIG_ATA_GENERIC=m +CONFIG_PATA_LEGACY=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +# CONFIG_BCACHE_DEBUG is not set +# CONFIG_BCACHE_CLOSURES_DEBUG is not set +CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +# CONFIG_DM_DEBUG is not set +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m +CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_CACHE=m +CONFIG_DM_CACHE_SMQ=m +CONFIG_DM_WRITECACHE=m +CONFIG_DM_ERA=m +CONFIG_DM_CLONE=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_DELAY=m +CONFIG_DM_DUST=m +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DM_SWITCH=m +CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_TARGET_CORE=m +CONFIG_TCM_IBLOCK=m +CONFIG_TCM_FILEIO=m +CONFIG_TCM_PSCSI=m 
+CONFIG_TCM_USER2=m +CONFIG_LOOPBACK_TARGET=m +CONFIG_TCM_FC=m +CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m +CONFIG_SBP_TARGET=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m +CONFIG_FUSION_MAX_SGE=128 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set + +# +# IEEE 1394 (FireWire) support +# +CONFIG_FIREWIRE=m +CONFIG_FIREWIRE_OHCI=m +CONFIG_FIREWIRE_SBP2=m +CONFIG_FIREWIRE_NET=m +CONFIG_FIREWIRE_NOSY=m +# end of IEEE 1394 (FireWire) support + +CONFIG_MACINTOSH_DRIVERS=y +CONFIG_MAC_EMUMOUSEBTN=m +CONFIG_NETDEVICES=y +CONFIG_MII=m +CONFIG_NET_CORE=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_WIREGUARD=m +# CONFIG_WIREGUARD_DEBUG is not set +CONFIG_EQUALIZER=m +CONFIG_NET_FC=y +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m +CONFIG_NET_TEAM_MODE_LOADBALANCE=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_IPVLAN_L3S=y +CONFIG_IPVLAN=m +CONFIG_IPVTAP=m +CONFIG_VXLAN=m +CONFIG_GENEVE=m +CONFIG_BAREUDP=m +CONFIG_GTP=m +CONFIG_MACSEC=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +CONFIG_NTB_NETDEV=m +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 +CONFIG_TUN=m +CONFIG_TAP=m +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m +CONFIG_SUNGEM_PHY=m +# CONFIG_ARCNET is not set +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m +# CONFIG_ATM_NICSTAR_USE_SUNI is not set +# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E=m +CONFIG_ATM_FORE200E_USE_TASKLET=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y +CONFIG_ATM_SOLOS=m +CONFIG_CAIF_DRIVERS=y +CONFIG_CAIF_TTY=m +CONFIG_CAIF_SPI_SLAVE=m +CONFIG_CAIF_SPI_SYNC=y +CONFIG_CAIF_HSI=m +CONFIG_CAIF_VIRTIO=m + +# +# Distributed Switch Architecture drivers +# +CONFIG_B53=m +# CONFIG_B53_SPI_DRIVER is not set +CONFIG_B53_MDIO_DRIVER=m +CONFIG_B53_MMAP_DRIVER=m +CONFIG_B53_SRAB_DRIVER=m +CONFIG_B53_SERDES=m +CONFIG_NET_DSA_BCM_SF2=m +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_LANTIQ_GSWIP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_MV88E6060=m +CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m +CONFIG_NET_DSA_MV88E6XXX=m +CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y +CONFIG_NET_DSA_MV88E6XXX_PTP=y +CONFIG_NET_DSA_AR9331=m +CONFIG_NET_DSA_SJA1105=m +CONFIG_NET_DSA_SJA1105_PTP=y +CONFIG_NET_DSA_SJA1105_TAS=y +CONFIG_NET_DSA_QCA8K=m +CONFIG_NET_DSA_REALTEK_SMI=m +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m +CONFIG_NET_DSA_VITESSE_VSC73XX=m +CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m 
+CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m +# end of Distributed Switch Architecture drivers + +CONFIG_ETHERNET=y +CONFIG_MDIO=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_3C589=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_NET_VENDOR_ADAPTEC=y +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_NET_VENDOR_AGERE=y +CONFIG_ET131X=m +CONFIG_NET_VENDOR_ALACRITECH=y +CONFIG_SLICOSS=m +CONFIG_NET_VENDOR_ALTEON=y +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_ALTERA_TSE=m +CONFIG_NET_VENDOR_AMAZON=y +CONFIG_ENA_ETHERNET=m +CONFIG_NET_VENDOR_AMD=y +CONFIG_AMD8111_ETH=m +CONFIG_PCNET32=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_AMD_XGBE=m +CONFIG_AMD_XGBE_DCB=y +CONFIG_AMD_XGBE_HAVE_ECC=y +CONFIG_NET_VENDOR_AQUANTIA=y +CONFIG_AQTION=m +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ATHEROS=y +CONFIG_ATL2=m +CONFIG_ATL1=m +CONFIG_ATL1E=m +CONFIG_ATL1C=m +CONFIG_ALX=m +CONFIG_NET_VENDOR_AURORA=y +CONFIG_AURORA_NB8800=m +CONFIG_NET_VENDOR_BROADCOM=y +CONFIG_B44=m +CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y +CONFIG_BCMGENET=m +CONFIG_BNX2=m +CONFIG_CNIC=m +CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y +CONFIG_BNX2X=m +CONFIG_BNX2X_SRIOV=y +CONFIG_SYSTEMPORT=m +CONFIG_BNXT=m +CONFIG_BNXT_SRIOV=y +CONFIG_BNXT_FLOWER_OFFLOAD=y +CONFIG_BNXT_DCB=y +CONFIG_BNXT_HWMON=y +CONFIG_NET_VENDOR_BROCADE=y +CONFIG_BNA=m +CONFIG_NET_VENDOR_CADENCE=y +CONFIG_MACB=m +CONFIG_MACB_USE_HWSTAMP=y +CONFIG_MACB_PCI=m +CONFIG_NET_VENDOR_CAVIUM=y +CONFIG_THUNDER_NIC_PF=m +CONFIG_THUNDER_NIC_VF=m +CONFIG_THUNDER_NIC_BGX=m +CONFIG_THUNDER_NIC_RGX=m +CONFIG_CAVIUM_PTP=m +CONFIG_LIQUIDIO=m +CONFIG_LIQUIDIO_VF=m +CONFIG_NET_VENDOR_CHELSIO=y +CONFIG_CHELSIO_T1=m +CONFIG_CHELSIO_T1_1G=y +CONFIG_CHELSIO_T3=m +CONFIG_CHELSIO_T4=m +CONFIG_CHELSIO_T4_DCB=y +CONFIG_CHELSIO_T4_FCOE=y +CONFIG_CHELSIO_T4VF=m +CONFIG_CHELSIO_LIB=m +CONFIG_NET_VENDOR_CISCO=y +CONFIG_ENIC=m +CONFIG_NET_VENDOR_CORTINA=y +CONFIG_GEMINI_ETHERNET=m +CONFIG_CX_ECAT=m +CONFIG_DNET=m +CONFIG_NET_VENDOR_DEC=y +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_DE2104X_DSL=0 +CONFIG_TULIP=m +CONFIG_TULIP_MWI=y +CONFIG_TULIP_MMIO=y +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_ULI526X=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_NET_VENDOR_DLINK=y +CONFIG_DL2K=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_NET_VENDOR_EMULEX=y +CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y +CONFIG_BE2NET_BE2=y +CONFIG_BE2NET_BE3=y +CONFIG_BE2NET_LANCER=y +CONFIG_BE2NET_SKYHAWK=y +CONFIG_NET_VENDOR_EZCHIP=y +CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_FUJITSU=y +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_NET_VENDOR_GOOGLE=y +CONFIG_GVE=m +CONFIG_NET_VENDOR_HUAWEI=y +CONFIG_HINIC=m +CONFIG_NET_VENDOR_I825XX=y +CONFIG_NET_VENDOR_INTEL=y +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_E1000E_HWTS=y +CONFIG_IGB=m +CONFIG_IGB_HWMON=y +CONFIG_IGB_DCA=y +CONFIG_IGBVF=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_IXGBE_HWMON=y +CONFIG_IXGBE_DCA=y +CONFIG_IXGBE_DCB=y +# CONFIG_IXGBE_IPSEC is not set +CONFIG_IXGBEVF=m +CONFIG_IXGBEVF_IPSEC=y +CONFIG_I40E=m +CONFIG_I40E_DCB=y +CONFIG_IAVF=m +CONFIG_I40EVF=m +CONFIG_ICE=m +CONFIG_FM10K=m +CONFIG_IGC=m +CONFIG_JME=m +CONFIG_NET_VENDOR_MARVELL=y +CONFIG_MVMDIO=m +CONFIG_SKGE=m +# CONFIG_SKGE_DEBUG is not set +CONFIG_SKGE_GENESIS=y +CONFIG_SKY2=m +# CONFIG_SKY2_DEBUG is not set +CONFIG_NET_VENDOR_MELLANOX=y +CONFIG_MLX4_EN=m +CONFIG_MLX4_EN_DCB=y +CONFIG_MLX4_CORE=m +CONFIG_MLX4_DEBUG=y +CONFIG_MLX4_CORE_GEN2=y +CONFIG_MLX5_CORE=m 
+CONFIG_MLX5_ACCEL=y +CONFIG_MLX5_FPGA=y +CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_EN_ARFS=y +CONFIG_MLX5_EN_RXNFC=y +CONFIG_MLX5_MPFS=y +CONFIG_MLX5_ESWITCH=y +CONFIG_MLX5_TC_CT=y +CONFIG_MLX5_CORE_EN_DCB=y +CONFIG_MLX5_CORE_IPOIB=y +CONFIG_MLX5_FPGA_IPSEC=y +CONFIG_MLX5_EN_IPSEC=y +CONFIG_MLX5_FPGA_TLS=y +CONFIG_MLX5_TLS=y +CONFIG_MLX5_EN_TLS=y +CONFIG_MLX5_SW_STEERING=y +CONFIG_MLXSW_CORE=m +CONFIG_MLXSW_CORE_HWMON=y +CONFIG_MLXSW_CORE_THERMAL=y +CONFIG_MLXSW_PCI=m +CONFIG_MLXSW_I2C=m +CONFIG_MLXSW_SWITCHIB=m +CONFIG_MLXSW_SWITCHX2=m +CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y +CONFIG_MLXSW_MINIMAL=m +CONFIG_MLXFW=m +CONFIG_NET_VENDOR_MICREL=y +CONFIG_KS8842=m +CONFIG_KS8851=m +CONFIG_KS8851_MLL=m +CONFIG_KSZ884X_PCI=m +CONFIG_NET_VENDOR_MICROCHIP=y +CONFIG_ENC28J60=m +# CONFIG_ENC28J60_WRITEVERIFY is not set +CONFIG_ENCX24J600=m +CONFIG_LAN743X=m +CONFIG_NET_VENDOR_MICROSEMI=y +CONFIG_MSCC_OCELOT_SWITCH=m +CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m +CONFIG_NET_VENDOR_MYRI=y +CONFIG_MYRI10GE=m +CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m +CONFIG_NET_VENDOR_NATSEMI=y +CONFIG_NATSEMI=m +CONFIG_NS83820=m +CONFIG_NET_VENDOR_NETERION=y +CONFIG_S2IO=m +CONFIG_VXGE=m +# CONFIG_VXGE_DEBUG_TRACE_ALL is not set +CONFIG_NET_VENDOR_NETRONOME=y +CONFIG_NFP=m +CONFIG_NFP_APP_FLOWER=y +CONFIG_NFP_APP_ABM_NIC=y +# CONFIG_NFP_DEBUG is not set +CONFIG_NET_VENDOR_NI=y +CONFIG_NI_XGE_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_8390=y +CONFIG_PCMCIA_AXNET=m +CONFIG_NE2K_PCI=m +CONFIG_PCMCIA_PCNET=m +CONFIG_NET_VENDOR_NVIDIA=y +CONFIG_FORCEDETH=m +CONFIG_NET_VENDOR_OKI=y +CONFIG_ETHOC=m +CONFIG_NET_VENDOR_PACKET_ENGINES=y +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_NET_VENDOR_PENSANDO=y +CONFIG_IONIC=m +CONFIG_NET_VENDOR_QLOGIC=y +CONFIG_QLA3XXX=m +CONFIG_QLCNIC=m +CONFIG_QLCNIC_SRIOV=y +CONFIG_QLCNIC_DCB=y +CONFIG_QLCNIC_HWMON=y +CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QED_LL2=y +CONFIG_QED_SRIOV=y +CONFIG_QEDE=m +CONFIG_QED_RDMA=y +CONFIG_QED_ISCSI=y +CONFIG_QED_FCOE=y +CONFIG_QED_OOO=y +CONFIG_NET_VENDOR_QUALCOMM=y +CONFIG_QCA7000=m +CONFIG_QCA7000_SPI=m +CONFIG_QCA7000_UART=m +CONFIG_QCOM_EMAC=m +CONFIG_RMNET=m +CONFIG_NET_VENDOR_RDC=y +CONFIG_R6040=m +CONFIG_NET_VENDOR_REALTEK=y +CONFIG_ATP=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_TUNE_TWISTER=y +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_R8169=m +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_ROCKER=m +CONFIG_NET_VENDOR_SAMSUNG=y +CONFIG_SXGBE_ETH=m +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SOLARFLARE=y +CONFIG_SFC=m +CONFIG_SFC_MTD=y +CONFIG_SFC_MCDI_MON=y +CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y +CONFIG_SFC_FALCON=m +CONFIG_SFC_FALCON_MTD=y +CONFIG_NET_VENDOR_SILAN=y +CONFIG_SC92031=m +CONFIG_NET_VENDOR_SIS=y +CONFIG_SIS900=m +CONFIG_SIS190=m +CONFIG_NET_VENDOR_SMSC=y +CONFIG_PCMCIA_SMC91C92=m +CONFIG_EPIC100=m +CONFIG_SMSC911X=m +CONFIG_SMSC9420=m +CONFIG_NET_VENDOR_SOCIONEXT=y +CONFIG_NET_VENDOR_STMICRO=y +CONFIG_STMMAC_ETH=m +# CONFIG_STMMAC_SELFTESTS is not set +CONFIG_STMMAC_PLATFORM=m +CONFIG_DWMAC_DWC_QOS_ETH=m +CONFIG_DWMAC_GENERIC=m +CONFIG_DWMAC_INTEL=m +CONFIG_STMMAC_PCI=m +CONFIG_NET_VENDOR_SUN=y +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_CASSINI=m +CONFIG_NIU=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m +CONFIG_NET_VENDOR_TEHUTI=y +CONFIG_TEHUTI=m +CONFIG_NET_VENDOR_TI=y +# CONFIG_TI_CPSW_PHY_SEL is not set +CONFIG_TLAN=m +CONFIG_NET_VENDOR_VIA=y +CONFIG_VIA_RHINE=m +CONFIG_VIA_RHINE_MMIO=y +CONFIG_VIA_VELOCITY=m 
+CONFIG_NET_VENDOR_WIZNET=y +CONFIG_WIZNET_W5100=m +CONFIG_WIZNET_W5300=m +# CONFIG_WIZNET_BUS_DIRECT is not set +# CONFIG_WIZNET_BUS_INDIRECT is not set +CONFIG_WIZNET_BUS_ANY=y +CONFIG_WIZNET_W5100_SPI=m +CONFIG_NET_VENDOR_XILINX=y +CONFIG_XILINX_AXI_EMAC=m +CONFIG_XILINX_LL_TEMAC=m +CONFIG_NET_VENDOR_XIRCOM=y +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_FDDI=m +CONFIG_DEFXX=m +CONFIG_DEFXX_MMIO=y +CONFIG_SKFP=m +# CONFIG_HIPPI is not set +CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +CONFIG_MDIO_BCM_UNIMAC=m +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_BUS_MUX=m +CONFIG_MDIO_BUS_MUX_GPIO=m +CONFIG_MDIO_BUS_MUX_MMIOREG=m +CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m +CONFIG_MDIO_CAVIUM=m +CONFIG_MDIO_GPIO=m +CONFIG_MDIO_HISI_FEMAC=m +CONFIG_MDIO_I2C=m +CONFIG_MDIO_IPQ8064=m +CONFIG_MDIO_MSCC_MIIM=m +CONFIG_MDIO_MVUSB=m +CONFIG_MDIO_OCTEON=m +CONFIG_MDIO_THUNDER=m +CONFIG_MDIO_XPCS=m +CONFIG_PHYLINK=m +CONFIG_PHYLIB=m +CONFIG_SWPHY=y +CONFIG_LED_TRIGGER_PHY=y + +# +# MII PHY device drivers +# +CONFIG_SFP=m +CONFIG_ADIN_PHY=m +CONFIG_AMD_PHY=m +CONFIG_AQUANTIA_PHY=m +CONFIG_AX88796B_PHY=m +CONFIG_BCM7XXX_PHY=m +CONFIG_BCM87XX_PHY=m +CONFIG_BCM_NET_PHYLIB=m +CONFIG_BROADCOM_PHY=m +CONFIG_BCM84881_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_CORTINA_PHY=m +CONFIG_DAVICOM_PHY=m +CONFIG_DP83822_PHY=m +CONFIG_DP83TC811_PHY=m +CONFIG_DP83848_PHY=m +CONFIG_DP83867_PHY=m +CONFIG_DP83869_PHY=m +CONFIG_FIXED_PHY=m +CONFIG_ICPLUS_PHY=m +CONFIG_INTEL_XWAY_PHY=m +CONFIG_LSI_ET1011C_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_MARVELL_10G_PHY=m +CONFIG_MICREL_PHY=m +CONFIG_MICROCHIP_PHY=m +CONFIG_MICROCHIP_T1_PHY=m +CONFIG_MICROSEMI_PHY=m +CONFIG_NATIONAL_PHY=m +CONFIG_NXP_TJA11XX_PHY=m +CONFIG_AT803X_PHY=m +CONFIG_QSEMI_PHY=m +CONFIG_REALTEK_PHY=m +CONFIG_RENESAS_PHY=m +CONFIG_ROCKCHIP_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_STE10XP=m +CONFIG_TERANETICS_PHY=m +CONFIG_VITESSE_PHY=m +CONFIG_XILINX_GMII2RGMII=m +CONFIG_MICREL_KS8995MA=m +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOATM=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_SLIP=m +CONFIG_SLHC=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y +CONFIG_USB_NET_DRIVERS=m +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=m +CONFIG_USB_USBNET=m +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_CDC_EEM=m +CONFIG_USB_NET_CDC_NCM=m +CONFIG_USB_NET_HUAWEI_CDC_NCM=m +CONFIG_USB_NET_CDC_MBIM=m +CONFIG_USB_NET_DM9601=m +CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=m +CONFIG_USB_NET_GL620A=m +CONFIG_USB_NET_NET1080=m +CONFIG_USB_NET_PLUSB=m +CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +CONFIG_USB_NET_CDC_SUBSET_ENABLE=m +CONFIG_USB_NET_CDC_SUBSET=m +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_KC2190=y +CONFIG_USB_NET_ZAURUS=m +CONFIG_USB_NET_CX82310_ETH=m +CONFIG_USB_NET_KALMIA=m +CONFIG_USB_NET_QMI_WWAN=m +CONFIG_USB_HSO=m +CONFIG_USB_NET_INT51X1=m +CONFIG_USB_CDC_PHONET=m +CONFIG_USB_IPHETH=m +CONFIG_USB_SIERRA_NET=m +CONFIG_USB_VL600=m +CONFIG_USB_NET_CH9200=m +CONFIG_USB_NET_AQC111=m +CONFIG_WLAN=y +# CONFIG_WIRELESS_WDS is not set +CONFIG_WLAN_VENDOR_ADMTEK=y +CONFIG_ADM8211=m +CONFIG_ATH_COMMON=m +CONFIG_WLAN_VENDOR_ATH=y +# 
CONFIG_ATH_DEBUG is not set +CONFIG_ATH5K=m +# CONFIG_ATH5K_DEBUG is not set +# CONFIG_ATH5K_TRACER is not set +CONFIG_ATH5K_PCI=y +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +# CONFIG_ATH9K_DEBUGFS is not set +CONFIG_ATH9K_DYNACK=y +CONFIG_ATH9K_WOW=y +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +CONFIG_ATH9K_PCI_NO_EEPROM=m +CONFIG_ATH9K_HTC=m +# CONFIG_ATH9K_HTC_DEBUGFS is not set +CONFIG_ATH9K_HWRNG=y +CONFIG_CARL9170=m +CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_WPC=y +# CONFIG_CARL9170_HWRNG is not set +CONFIG_ATH6KL=m +CONFIG_ATH6KL_SDIO=m +CONFIG_ATH6KL_USB=m +# CONFIG_ATH6KL_DEBUG is not set +# CONFIG_ATH6KL_TRACING is not set +CONFIG_AR5523=m +CONFIG_WIL6210=m +CONFIG_WIL6210_ISR_COR=y +CONFIG_WIL6210_TRACING=y +# CONFIG_WIL6210_DEBUGFS is not set +CONFIG_ATH10K=m +CONFIG_ATH10K_CE=y +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_AHB=y +CONFIG_ATH10K_SDIO=m +CONFIG_ATH10K_USB=m +# CONFIG_ATH10K_DEBUG is not set +# CONFIG_ATH10K_DEBUGFS is not set +# CONFIG_ATH10K_TRACING is not set +CONFIG_WCN36XX=m +# CONFIG_WCN36XX_DEBUGFS is not set +CONFIG_WLAN_VENDOR_ATMEL=y +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_AT76C50X_USB=m +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +CONFIG_B43_SDIO=y +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +CONFIG_B43LEGACY=m +CONFIG_B43LEGACY_PCI_AUTOSELECT=y +CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y +CONFIG_B43LEGACY_LEDS=y +CONFIG_B43LEGACY_HWRNG=y +CONFIG_B43LEGACY_DEBUG=y +CONFIG_B43LEGACY_DMA=y +CONFIG_B43LEGACY_PIO=y +CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y +# CONFIG_B43LEGACY_DMA_MODE is not set +# CONFIG_B43LEGACY_PIO_MODE is not set +CONFIG_BRCMUTIL=m +CONFIG_BRCMSMAC=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_PROTO_MSGBUF=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +CONFIG_BRCMFMAC_PCIE=y +# CONFIG_BRCM_TRACING is not set +CONFIG_BRCMDBG=y +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_AIRO=m +CONFIG_AIRO_CS=m +CONFIG_WLAN_VENDOR_INTEL=y +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLEGACY=m +CONFIG_IWL4965=m +CONFIG_IWL3945=m + +# +# iwl3945 / iwl4965 Debugging Options +# +# CONFIG_IWLEGACY_DEBUG is not set +# end of iwl3945 / iwl4965 Debugging Options + +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_LEDS=y +CONFIG_IWLDVM=m +CONFIG_IWLMVM=m +CONFIG_IWLWIFI_OPMODE_MODULAR=y +# CONFIG_IWLWIFI_BCAST_FILTERING is not set + +# +# Debugging Options +# +# CONFIG_IWLWIFI_DEBUG is not set +# CONFIG_IWLWIFI_DEVICE_TRACING is not set +# end of Debugging Options + +CONFIG_WLAN_VENDOR_INTERSIL=y +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +CONFIG_HOSTAP_FIRMWARE_NVRAM=y +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_HERMES=m +CONFIG_HERMES_PRISM=y +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m 
+CONFIG_ORINOCO_USB=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_SPI=m +# CONFIG_P54_SPI_DEFAULT_EEPROM is not set +CONFIG_P54_LEDS=y +CONFIG_PRISM54=m +CONFIG_WLAN_VENDOR_MARVELL=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +CONFIG_LIBERTAS_SDIO=m +CONFIG_LIBERTAS_SPI=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_LIBERTAS_MESH=y +CONFIG_LIBERTAS_THINFIRM=m +# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set +CONFIG_LIBERTAS_THINFIRM_USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_MWIFIEX_PCIE=m +CONFIG_MWIFIEX_USB=m +CONFIG_MWL8K=m +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_MT76_CORE=m +CONFIG_MT76_LEDS=y +CONFIG_MT76_USB=m +CONFIG_MT76x02_LIB=m +CONFIG_MT76x02_USB=m +CONFIG_MT76x0_COMMON=m +CONFIG_MT76x0U=m +CONFIG_MT76x0E=m +CONFIG_MT76x2_COMMON=m +CONFIG_MT76x2E=m +CONFIG_MT76x2U=m +CONFIG_MT7603E=m +CONFIG_MT7615E=m +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +CONFIG_RT2400PCI=m +CONFIG_RT2500PCI=m +CONFIG_RT61PCI=m +CONFIG_RT2800PCI=m +CONFIG_RT2800PCI_RT33XX=y +CONFIG_RT2800PCI_RT35XX=y +CONFIG_RT2800PCI_RT53XX=y +CONFIG_RT2800PCI_RT3290=y +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2800_LIB_MMIO=m +CONFIG_RT2X00_LIB_MMIO=m +CONFIG_RT2X00_LIB_PCI=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m +CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +CONFIG_RTL8180=m +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +CONFIG_RTL8192CE=m +CONFIG_RTL8192SE=m +CONFIG_RTL8192DE=m +CONFIG_RTL8723AE=m +CONFIG_RTL8723BE=m +CONFIG_RTL8188EE=m +CONFIG_RTL8192EE=m +CONFIG_RTL8821AE=m +CONFIG_RTL8192CU=m +CONFIG_RTLWIFI=m +CONFIG_RTLWIFI_PCI=m +CONFIG_RTLWIFI_USB=m +CONFIG_RTLWIFI_DEBUG=y +CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8723_COMMON=m +CONFIG_RTLBTCOEXIST=m +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_RTW88=m +CONFIG_RTW88_CORE=m +CONFIG_RTW88_PCI=m +CONFIG_RTW88_8822BE=y +CONFIG_RTW88_8822CE=y +# CONFIG_RTW88_DEBUG is not set +# CONFIG_RTW88_DEBUGFS is not set +CONFIG_WLAN_VENDOR_RSI=y +CONFIG_RSI_91X=m +# CONFIG_RSI_DEBUGFS is not set +CONFIG_RSI_SDIO=m +CONFIG_RSI_USB=m +CONFIG_RSI_COEX=y +CONFIG_WLAN_VENDOR_ST=y +CONFIG_CW1200=m +CONFIG_CW1200_WLAN_SDIO=m +CONFIG_CW1200_WLAN_SPI=m +CONFIG_WLAN_VENDOR_TI=y +CONFIG_WL1251=m +CONFIG_WL1251_SPI=m +CONFIG_WL1251_SDIO=m +CONFIG_WL12XX=m +CONFIG_WL18XX=m +CONFIG_WLCORE=m +CONFIG_WLCORE_SPI=m +CONFIG_WLCORE_SDIO=m +CONFIG_WILINK_PLATFORM_DATA=y +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_WLAN_VENDOR_QUANTENNA=y +CONFIG_QTNFMAC=m +CONFIG_QTNFMAC_PCIE=m +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_WL3501=m +CONFIG_MAC80211_HWSIM=m +CONFIG_USB_NET_RNDIS_WLAN=m +CONFIG_VIRT_WIFI=m + +# +# WiMAX Wireless Broadband devices +# +CONFIG_WIMAX_I2400M=m +CONFIG_WIMAX_I2400M_USB=m +CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 +# end of WiMAX Wireless Broadband devices + +# CONFIG_WAN is not set +CONFIG_IEEE802154_DRIVERS=m +CONFIG_IEEE802154_FAKELB=m +CONFIG_IEEE802154_AT86RF230=m +# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set +CONFIG_IEEE802154_MRF24J40=m +CONFIG_IEEE802154_CC2520=m +CONFIG_IEEE802154_ATUSB=m +CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set 
+CONFIG_IEEE802154_MCR20A=m +CONFIG_IEEE802154_HWSIM=m +CONFIG_XEN_NETDEV_FRONTEND=m +CONFIG_XEN_NETDEV_BACKEND=m +CONFIG_VMXNET3=m +CONFIG_FUJITSU_ES=m +CONFIG_USB4_NET=m +CONFIG_HYPERV_NET=m +CONFIG_NETDEVSIM=m +CONFIG_NET_FAILOVER=m +CONFIG_ISDN=y +CONFIG_ISDN_CAPI=y +CONFIG_CAPI_TRACE=y +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_MISDN=m +CONFIG_MISDN_DSP=m +CONFIG_MISDN_L1OIP=m + +# +# mISDN hardware drivers +# +CONFIG_MISDN_HFCPCI=m +CONFIG_MISDN_HFCMULTI=m +CONFIG_MISDN_HFCUSB=m +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_HDLC=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_NVM=y +CONFIG_NVM_PBLK=m +# CONFIG_NVM_PBLK_DEBUG is not set + +# +# Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=m +CONFIG_INPUT_FF_MEMLESS=m +CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m +CONFIG_INPUT_MATRIXKMAP=m + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=m +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADC=m +CONFIG_KEYBOARD_ADP5520=m +CONFIG_KEYBOARD_ADP5588=m +CONFIG_KEYBOARD_ADP5589=m +CONFIG_KEYBOARD_APPLESPI=m +CONFIG_KEYBOARD_ATKBD=m +CONFIG_KEYBOARD_QT1050=m +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_QT2160=m +CONFIG_KEYBOARD_DLINK_DIR685=m +CONFIG_KEYBOARD_LKKBD=m +CONFIG_KEYBOARD_GPIO=m +CONFIG_KEYBOARD_GPIO_POLLED=m +CONFIG_KEYBOARD_TCA6416=m +CONFIG_KEYBOARD_TCA8418=m +CONFIG_KEYBOARD_MATRIX=m +CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_LM8333=m +CONFIG_KEYBOARD_MAX7359=m +CONFIG_KEYBOARD_MCS=m +CONFIG_KEYBOARD_MPR121=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_KEYBOARD_OPENCORES=m +CONFIG_KEYBOARD_SAMSUNG=m +CONFIG_KEYBOARD_STOWAWAY=m +CONFIG_KEYBOARD_SUNKBD=m +CONFIG_KEYBOARD_STMPE=m +CONFIG_KEYBOARD_IQS62X=m +CONFIG_KEYBOARD_OMAP4=m +CONFIG_KEYBOARD_TC3589X=m +CONFIG_KEYBOARD_TM2_TOUCHKEY=m +CONFIG_KEYBOARD_TWL4030=m +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_CROS_EC=m +CONFIG_KEYBOARD_CAP11XX=m +CONFIG_KEYBOARD_BCM=m +CONFIG_KEYBOARD_MTK_PMIC=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_BYD=y +CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y +CONFIG_MOUSE_PS2_CYPRESS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y +CONFIG_MOUSE_PS2_SENTELIC=y +CONFIG_MOUSE_PS2_TOUCHKIT=y +CONFIG_MOUSE_PS2_FOCALTECH=y +CONFIG_MOUSE_PS2_VMMOUSE=y +CONFIG_MOUSE_PS2_SMBUS=y +CONFIG_MOUSE_SERIAL=m +CONFIG_MOUSE_APPLETOUCH=m +CONFIG_MOUSE_BCM5974=m +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=m +CONFIG_MOUSE_ELAN_I2C_I2C=y +CONFIG_MOUSE_ELAN_I2C_SMBUS=y +CONFIG_MOUSE_VSXXXAA=m +CONFIG_MOUSE_GPIO=m +CONFIG_MOUSE_SYNAPTICS_I2C=m +CONFIG_MOUSE_SYNAPTICS_USB=m +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=m +CONFIG_JOYSTICK_IFORCE_232=m +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m +CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDJOY=m +CONFIG_JOYSTICK_ZHENHUA=m 
+CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +CONFIG_JOYSTICK_AS5011=m +CONFIG_JOYSTICK_JOYDUMP=m +CONFIG_JOYSTICK_XPAD=m +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +CONFIG_JOYSTICK_PXRC=m +CONFIG_JOYSTICK_FSIA6B=m +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=m +CONFIG_TABLET_USB_AIPTEK=m +CONFIG_TABLET_USB_GTCO=m +CONFIG_TABLET_USB_HANWANG=m +CONFIG_TABLET_USB_KBTAB=m +CONFIG_TABLET_USB_PEGASUS=m +CONFIG_TABLET_SERIAL_WACOM4=m +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_PROPERTIES=y +CONFIG_TOUCHSCREEN_88PM860X=m +CONFIG_TOUCHSCREEN_ADS7846=m +CONFIG_TOUCHSCREEN_AD7877=m +CONFIG_TOUCHSCREEN_AD7879=m +CONFIG_TOUCHSCREEN_AD7879_I2C=m +CONFIG_TOUCHSCREEN_AD7879_SPI=m +CONFIG_TOUCHSCREEN_ADC=m +CONFIG_TOUCHSCREEN_AR1021_I2C=m +CONFIG_TOUCHSCREEN_ATMEL_MXT=m +CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y +CONFIG_TOUCHSCREEN_AUO_PIXCIR=m +CONFIG_TOUCHSCREEN_BU21013=m +CONFIG_TOUCHSCREEN_BU21029=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m +CONFIG_TOUCHSCREEN_CY8CTMG110=m +CONFIG_TOUCHSCREEN_CYTTSP_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP_SPI=m +CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m +CONFIG_TOUCHSCREEN_DA9034=m +CONFIG_TOUCHSCREEN_DA9052=m +CONFIG_TOUCHSCREEN_DYNAPRO=m +CONFIG_TOUCHSCREEN_HAMPSHIRE=m +CONFIG_TOUCHSCREEN_EETI=m +CONFIG_TOUCHSCREEN_EGALAX=m +CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m +CONFIG_TOUCHSCREEN_EXC3000=m +CONFIG_TOUCHSCREEN_FUJITSU=m +CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_HIDEEP=m +CONFIG_TOUCHSCREEN_ILI210X=m +CONFIG_TOUCHSCREEN_S6SY761=m +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_TOUCHSCREEN_EKTF2127=m +CONFIG_TOUCHSCREEN_ELAN=m +CONFIG_TOUCHSCREEN_ELO=m +CONFIG_TOUCHSCREEN_WACOM_W8001=m +CONFIG_TOUCHSCREEN_WACOM_I2C=m +CONFIG_TOUCHSCREEN_MAX11801=m +CONFIG_TOUCHSCREEN_MCS5000=m +CONFIG_TOUCHSCREEN_MMS114=m +CONFIG_TOUCHSCREEN_MELFAS_MIP4=m +CONFIG_TOUCHSCREEN_MTOUCH=m +CONFIG_TOUCHSCREEN_IMX6UL_TSC=m +CONFIG_TOUCHSCREEN_INEXIO=m +CONFIG_TOUCHSCREEN_MK712=m +CONFIG_TOUCHSCREEN_PENMOUNT=m +CONFIG_TOUCHSCREEN_EDT_FT5X06=m +CONFIG_TOUCHSCREEN_TOUCHRIGHT=m +CONFIG_TOUCHSCREEN_TOUCHWIN=m +CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m +CONFIG_TOUCHSCREEN_UCB1400=m +CONFIG_TOUCHSCREEN_PIXCIR=m +CONFIG_TOUCHSCREEN_WDT87XX_I2C=m +CONFIG_TOUCHSCREEN_WM831X=m +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_TOUCHSCREEN_WM9705=y +CONFIG_TOUCHSCREEN_WM9712=y +CONFIG_TOUCHSCREEN_WM9713=y +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_MC13783=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y +CONFIG_TOUCHSCREEN_USB_PANJIT=y +CONFIG_TOUCHSCREEN_USB_3M=y +CONFIG_TOUCHSCREEN_USB_ITM=y +CONFIG_TOUCHSCREEN_USB_ETURBO=y +CONFIG_TOUCHSCREEN_USB_GUNZE=y +CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y +CONFIG_TOUCHSCREEN_USB_IRTOUCH=y +CONFIG_TOUCHSCREEN_USB_IDEALTEK=y +CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y +CONFIG_TOUCHSCREEN_USB_GOTOP=y +CONFIG_TOUCHSCREEN_USB_JASTEC=y +CONFIG_TOUCHSCREEN_USB_ELO=y +CONFIG_TOUCHSCREEN_USB_E2I=y +CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y +CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y +CONFIG_TOUCHSCREEN_USB_NEXIO=y +CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y +CONFIG_TOUCHSCREEN_TOUCHIT213=m +CONFIG_TOUCHSCREEN_TSC_SERIO=m +CONFIG_TOUCHSCREEN_TSC200X_CORE=m +CONFIG_TOUCHSCREEN_TSC2004=m +CONFIG_TOUCHSCREEN_TSC2005=m +CONFIG_TOUCHSCREEN_TSC2007=m +CONFIG_TOUCHSCREEN_TSC2007_IIO=y +CONFIG_TOUCHSCREEN_PCAP=m +CONFIG_TOUCHSCREEN_RM_TS=m +CONFIG_TOUCHSCREEN_SILEAD=m 
+CONFIG_TOUCHSCREEN_SIS_I2C=m +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMFTS=m +CONFIG_TOUCHSCREEN_STMPE=m +CONFIG_TOUCHSCREEN_SUR40=m +CONFIG_TOUCHSCREEN_SURFACE3_SPI=m +CONFIG_TOUCHSCREEN_SX8654=m +CONFIG_TOUCHSCREEN_TPS6507X=m +CONFIG_TOUCHSCREEN_ZET6223=m +CONFIG_TOUCHSCREEN_ZFORCE=m +CONFIG_TOUCHSCREEN_COLIBRI_VF50=m +CONFIG_TOUCHSCREEN_ROHM_BU21023=m +CONFIG_TOUCHSCREEN_IQS5XX=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_88PM860X_ONKEY=m +CONFIG_INPUT_88PM80X_ONKEY=m +CONFIG_INPUT_AD714X=m +CONFIG_INPUT_AD714X_I2C=m +CONFIG_INPUT_AD714X_SPI=m +CONFIG_INPUT_ARIZONA_HAPTICS=m +CONFIG_INPUT_ATMEL_CAPTOUCH=m +CONFIG_INPUT_BMA150=m +CONFIG_INPUT_E3X0_BUTTON=m +CONFIG_INPUT_MSM_VIBRATOR=m +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_MAX77650_ONKEY=m +CONFIG_INPUT_MAX77693_HAPTIC=m +CONFIG_INPUT_MAX8925_ONKEY=m +CONFIG_INPUT_MAX8997_HAPTIC=m +CONFIG_INPUT_MC13783_PWRBUTTON=m +CONFIG_INPUT_MMA8450=m +CONFIG_INPUT_APANEL=m +CONFIG_INPUT_GP2A=m +CONFIG_INPUT_GPIO_BEEPER=m +CONFIG_INPUT_GPIO_DECODER=m +CONFIG_INPUT_GPIO_VIBRA=m +CONFIG_INPUT_CPCAP_PWRBUTTON=m +CONFIG_INPUT_ATLAS_BTNS=m +CONFIG_INPUT_ATI_REMOTE2=m +CONFIG_INPUT_KEYSPAN_REMOTE=m +CONFIG_INPUT_KXTJ9=m +CONFIG_INPUT_POWERMATE=m +CONFIG_INPUT_YEALINK=m +CONFIG_INPUT_CM109=m +CONFIG_INPUT_REGULATOR_HAPTIC=m +CONFIG_INPUT_RETU_PWRBUTTON=m +CONFIG_INPUT_TPS65218_PWRBUTTON=m +CONFIG_INPUT_AXP20X_PEK=m +CONFIG_INPUT_TWL4030_PWRBUTTON=m +CONFIG_INPUT_TWL4030_VIBRA=m +CONFIG_INPUT_TWL6040_VIBRA=m +CONFIG_INPUT_UINPUT=m +CONFIG_INPUT_PALMAS_PWRBUTTON=m +CONFIG_INPUT_PCF50633_PMU=m +CONFIG_INPUT_PCF8574=m +CONFIG_INPUT_PWM_BEEPER=m +CONFIG_INPUT_PWM_VIBRA=m +CONFIG_INPUT_RK805_PWRKEY=m +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +CONFIG_INPUT_DA9052_ONKEY=m +CONFIG_INPUT_DA9055_ONKEY=m +CONFIG_INPUT_DA9063_ONKEY=m +CONFIG_INPUT_WM831X_ON=m +CONFIG_INPUT_PCAP=m +CONFIG_INPUT_ADXL34X=m +CONFIG_INPUT_ADXL34X_I2C=m +CONFIG_INPUT_ADXL34X_SPI=m +CONFIG_INPUT_IMS_PCU=m +CONFIG_INPUT_CMA3000=m +CONFIG_INPUT_CMA3000_I2C=m +CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m +CONFIG_INPUT_IDEAPAD_SLIDEBAR=m +CONFIG_INPUT_SOC_BUTTON_ARRAY=m +CONFIG_INPUT_DRV260X_HAPTICS=m +CONFIG_INPUT_DRV2665_HAPTICS=m +CONFIG_INPUT_DRV2667_HAPTICS=m +CONFIG_INPUT_RAVE_SP_PWRBUTTON=m +CONFIG_INPUT_STPMIC1_ONKEY=m +CONFIG_RMI4_CORE=m +CONFIG_RMI4_I2C=m +CONFIG_RMI4_SPI=m +CONFIG_RMI4_SMB=m +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=m +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +CONFIG_RMI4_F34=y +# CONFIG_RMI4_F54 is not set +CONFIG_RMI4_F55=y + +# +# Hardware I/O ports +# +CONFIG_SERIO=m +CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y +CONFIG_SERIO_I8042=m +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m +CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_LIBPS2=m +CONFIG_SERIO_RAW=m +CONFIG_SERIO_ALTERA_PS2=m +CONFIG_SERIO_PS2MULT=m +CONFIG_SERIO_ARC_PS2=m +# CONFIG_SERIO_APBPS2 is not set +CONFIG_HYPERV_KEYBOARD=m +CONFIG_SERIO_GPIO_PS2=m +CONFIG_USERIO=m +CONFIG_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_FM801=m +# end of Hardware I/O ports +# end of Input device support + +# +# Character devices +# +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y +CONFIG_VT_CONSOLE_SLEEP=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +# CONFIG_LEGACY_PTYS is not set +CONFIG_LDISC_AUTOLOAD=y + +# +# Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_PNP=y +# 
CONFIG_SERIAL_8250_16550A_VARIANTS is not set +CONFIG_SERIAL_8250_FINTEK=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DMA=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=m +CONFIG_SERIAL_8250_CS=m +CONFIG_SERIAL_8250_MEN_MCB=m +CONFIG_SERIAL_8250_NR_UARTS=32 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_ASPEED_VUART=m +CONFIG_SERIAL_8250_SHARE_IRQ=y +# CONFIG_SERIAL_8250_DETECT_IRQ is not set +CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_8250_DWLIB=y +CONFIG_SERIAL_8250_DW=m +CONFIG_SERIAL_8250_RT288X=y +CONFIG_SERIAL_8250_LPSS=y +CONFIG_SERIAL_8250_MID=y +CONFIG_SERIAL_OF_PLATFORM=m + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_MAX3100=m +CONFIG_SERIAL_MAX310X=m +CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_SERIAL_JSM=m +CONFIG_SERIAL_SIFIVE=m +CONFIG_SERIAL_SCCNXP=m +CONFIG_SERIAL_SC16IS7XX_CORE=m +CONFIG_SERIAL_SC16IS7XX=m +CONFIG_SERIAL_SC16IS7XX_I2C=y +CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_ALTERA_JTAGUART=m +CONFIG_SERIAL_ALTERA_UART=m +CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 +CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 +CONFIG_SERIAL_IFX6X60=m +CONFIG_SERIAL_XILINX_PS_UART=m +CONFIG_SERIAL_ARC=m +CONFIG_SERIAL_ARC_NR_PORTS=1 +CONFIG_SERIAL_RP2=m +CONFIG_SERIAL_RP2_NR_UARTS=32 +CONFIG_SERIAL_FSL_LPUART=m +CONFIG_SERIAL_FSL_LINFLEXUART=m +CONFIG_SERIAL_CONEXANT_DIGICOLOR=m +CONFIG_SERIAL_MEN_Z135=m +CONFIG_SERIAL_SPRD=m +# end of Serial drivers + +CONFIG_SERIAL_MCTRL_GPIO=y +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +CONFIG_CYZ_INTR=y +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_SYNCLINK_GT=m +CONFIG_ISI=m +CONFIG_N_HDLC=m +CONFIG_N_GSM=m +CONFIG_NOZOMI=m +CONFIG_NULL_TTY=m +CONFIG_TRACE_ROUTER=m +CONFIG_TRACE_SINK=m +CONFIG_HVC_DRIVER=y +CONFIG_HVC_IRQ=y +CONFIG_HVC_XEN=y +CONFIG_HVC_XEN_FRONTEND=y +CONFIG_SERIAL_DEV_BUS=y +CONFIG_SERIAL_DEV_CTRL_TTYPORT=y +# CONFIG_TTY_PRINTK is not set +CONFIG_PRINTER=m +# CONFIG_LP_CONSOLE is not set +CONFIG_PPDEV=m +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DMI_DECODE=y +CONFIG_IPMI_PLAT_DATA=y +# CONFIG_IPMI_PANIC_EVENT is not set +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m +CONFIG_IPMI_SSIF=m +CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m +CONFIG_IPMB_DEVICE_INTERFACE=m +CONFIG_HW_RANDOM=m +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_HW_RANDOM_INTEL=m +CONFIG_HW_RANDOM_AMD=m +CONFIG_HW_RANDOM_VIA=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_APPLICOM=m + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m +CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m +CONFIG_SCR24X=m +CONFIG_IPWIRELESS=m +# end of PCMCIA character devices + +CONFIG_MWAVE=m +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_NVRAM=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +# CONFIG_DEVPORT is not set +CONFIG_HPET=y +CONFIG_HPET_MMAP=y +CONFIG_HPET_MMAP_DEFAULT=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TCG_TPM=m +CONFIG_HW_RANDOM_TPM=y +CONFIG_TCG_TIS_CORE=m +CONFIG_TCG_TIS=m +CONFIG_TCG_TIS_SPI=m +CONFIG_TCG_TIS_SPI_CR50=y +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m +CONFIG_TCG_INFINEON=m +CONFIG_TCG_XEN=m +CONFIG_TCG_CRB=m +CONFIG_TCG_VTPM_PROXY=m +CONFIG_TCG_TIS_ST33ZP24=m +CONFIG_TCG_TIS_ST33ZP24_I2C=m +CONFIG_TCG_TIS_ST33ZP24_SPI=m +CONFIG_TELCLOCK=m +CONFIG_XILLYBUS=m +CONFIG_XILLYBUS_PCIE=m +CONFIG_XILLYBUS_OF=m +# end of 
Character devices + +# CONFIG_RANDOM_TRUST_CPU is not set +# CONFIG_RANDOM_TRUST_BOOTLOADER is not set + +# +# I2C support +# +CONFIG_I2C=y +CONFIG_ACPI_I2C_OPREGION=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_COMPAT=y +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m + +# +# Multiplexer I2C Chip support +# +CONFIG_I2C_ARB_GPIO_CHALLENGE=m +CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_GPMUX=m +CONFIG_I2C_MUX_LTC4306=m +CONFIG_I2C_MUX_PCA9541=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_MUX_PINCTRL=m +CONFIG_I2C_MUX_REG=m +CONFIG_I2C_DEMUX_PINCTRL=m +CONFIG_I2C_MUX_MLXCPLD=m +# end of Multiplexer I2C Chip support + +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_SMBUS=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCA=m + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI1563=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m +CONFIG_I2C_AMD756_S4882=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_AMD_MP2=m +CONFIG_I2C_I801=m +CONFIG_I2C_ISCH=m +CONFIG_I2C_ISMT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_CHT_WC=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_NFORCE2_S4985=m +CONFIG_I2C_NVIDIA_GPU=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m + +# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CBUS_GPIO=m +CONFIG_I2C_DESIGNWARE_CORE=y +CONFIG_I2C_DESIGNWARE_PLATFORM=y +CONFIG_I2C_DESIGNWARE_SLAVE=y +CONFIG_I2C_DESIGNWARE_PCI=m +CONFIG_I2C_DESIGNWARE_BAYTRAIL=y +CONFIG_I2C_EMEV2=m +CONFIG_I2C_GPIO=m +# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +CONFIG_I2C_KEMPLD=m +CONFIG_I2C_OCORES=m +CONFIG_I2C_PCA_PLATFORM=m +CONFIG_I2C_RK3X=m +CONFIG_I2C_SIMTEC=m +CONFIG_I2C_XILINX=m + +# +# External I2C/SMBus adapter drivers +# +CONFIG_I2C_DIOLAN_U2C=m +CONFIG_I2C_DLN2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_ROBOTFUZZ_OSIF=m +CONFIG_I2C_TAOS_EVM=m +CONFIG_I2C_TINY_USB=m +CONFIG_I2C_VIPERBOARD=m + +# +# Other I2C/SMBus bus drivers +# +CONFIG_I2C_MLXCPLD=m +CONFIG_I2C_CROS_EC_TUNNEL=m +CONFIG_I2C_FSI=m +# end of I2C Hardware Bus support + +CONFIG_I2C_STUB=m +CONFIG_I2C_SLAVE=y +CONFIG_I2C_SLAVE_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# end of I2C support + +CONFIG_I3C=m +CONFIG_CDNS_I3C_MASTER=m +CONFIG_DW_I3C_MASTER=m +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y +CONFIG_SPI_MEM=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_ALTERA=m +CONFIG_SPI_AXI_SPI_ENGINE=m +CONFIG_SPI_BITBANG=m +CONFIG_SPI_BUTTERFLY=m +CONFIG_SPI_CADENCE=m +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_PCI=m +CONFIG_SPI_DW_MID_DMA=y +CONFIG_SPI_DW_MMIO=m +CONFIG_SPI_DLN2=m +CONFIG_SPI_FSI=m +CONFIG_SPI_NXP_FLEXSPI=m +CONFIG_SPI_GPIO=m +CONFIG_SPI_LM70_LLP=m +CONFIG_SPI_FSL_LIB=m +CONFIG_SPI_FSL_SPI=m +CONFIG_SPI_OC_TINY=m +CONFIG_SPI_PXA2XX=m +CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_SC18IS602=m +CONFIG_SPI_SIFIVE=m +CONFIG_SPI_MXIC=m +CONFIG_SPI_XCOMM=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_ZYNQMP_GQSPI=m + +# +# SPI Multiplexer support +# +CONFIG_SPI_MUX=m + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=m +CONFIG_SPI_LOOPBACK_TEST=m +CONFIG_SPI_TLE62X0=m +CONFIG_SPI_SLAVE=y +CONFIG_SPI_SLAVE_TIME=m +CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m +CONFIG_SPMI=m +CONFIG_HSI=m +CONFIG_HSI_BOARDINFO=y + +# +# HSI controllers +# + +# +# HSI clients +# +CONFIG_HSI_CHAR=m +CONFIG_PPS=y +# CONFIG_PPS_DEBUG is not set + +# +# PPS clients support +# +CONFIG_PPS_CLIENT_KTIMER=m +CONFIG_PPS_CLIENT_LDISC=m +CONFIG_PPS_CLIENT_PARPORT=m 
+CONFIG_PPS_CLIENT_GPIO=m + +# +# PPS generators support +# + +# +# PTP clock support +# +CONFIG_PTP_1588_CLOCK=y +CONFIG_DP83640_PHY=m +CONFIG_PTP_1588_CLOCK_INES=m +CONFIG_PTP_1588_CLOCK_KVM=m +CONFIG_PTP_1588_CLOCK_IDT82P33=m +CONFIG_PTP_1588_CLOCK_IDTCM=m +CONFIG_PTP_1588_CLOCK_VMW=m +# end of PTP clock support + +CONFIG_PINCTRL=y +CONFIG_GENERIC_PINCTRL_GROUPS=y +CONFIG_PINMUX=y +CONFIG_GENERIC_PINMUX_FUNCTIONS=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +CONFIG_PINCTRL_AS3722=m +CONFIG_PINCTRL_AXP209=m +CONFIG_PINCTRL_AMD=m +CONFIG_PINCTRL_DA9062=m +CONFIG_PINCTRL_MCP23S08=m +CONFIG_PINCTRL_SINGLE=m +CONFIG_PINCTRL_SX150X=y +CONFIG_PINCTRL_STMFX=m +CONFIG_PINCTRL_MAX77620=m +CONFIG_PINCTRL_PALMAS=m +CONFIG_PINCTRL_RK805=m +CONFIG_PINCTRL_OCELOT=y +CONFIG_PINCTRL_BAYTRAIL=y +CONFIG_PINCTRL_CHERRYVIEW=y +CONFIG_PINCTRL_LYNXPOINT=y +CONFIG_PINCTRL_INTEL=y +CONFIG_PINCTRL_BROXTON=y +CONFIG_PINCTRL_CANNONLAKE=y +CONFIG_PINCTRL_CEDARFORK=y +CONFIG_PINCTRL_DENVERTON=y +CONFIG_PINCTRL_GEMINILAKE=y +CONFIG_PINCTRL_ICELAKE=y +CONFIG_PINCTRL_LEWISBURG=y +CONFIG_PINCTRL_SUNRISEPOINT=y +CONFIG_PINCTRL_TIGERLAKE=y +CONFIG_PINCTRL_LOCHNAGAR=m +CONFIG_PINCTRL_MADERA=m +CONFIG_PINCTRL_CS47L15=y +CONFIG_PINCTRL_CS47L35=y +CONFIG_PINCTRL_CS47L85=y +CONFIG_PINCTRL_CS47L90=y +CONFIG_PINCTRL_CS47L92=y +CONFIG_PINCTRL_EQUILIBRIUM=m +CONFIG_GPIOLIB=y +CONFIG_GPIOLIB_FASTPATH_LIMIT=512 +CONFIG_OF_GPIO=y +CONFIG_GPIO_ACPI=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC=y +CONFIG_GPIO_MAX730X=m + +# +# Memory mapped GPIO drivers +# +CONFIG_GPIO_74XX_MMIO=m +CONFIG_GPIO_ALTERA=m +CONFIG_GPIO_AMDPT=m +CONFIG_GPIO_CADENCE=m +CONFIG_GPIO_DWAPB=m +CONFIG_GPIO_EXAR=m +CONFIG_GPIO_FTGPIO010=y +CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_GRGPIO=m +CONFIG_GPIO_HLWD=m +CONFIG_GPIO_ICH=m +CONFIG_GPIO_LOGICVC=m +CONFIG_GPIO_MB86S7X=m +CONFIG_GPIO_MENZ127=m +CONFIG_GPIO_SAMA5D2_PIOBU=m +CONFIG_GPIO_SIFIVE=y +CONFIG_GPIO_SIOX=m +CONFIG_GPIO_SYSCON=m +CONFIG_GPIO_VX855=m +CONFIG_GPIO_WCD934X=m +CONFIG_GPIO_XILINX=m +CONFIG_GPIO_AMD_FCH=m +# end of Memory mapped GPIO drivers + +# +# Port-mapped I/O GPIO drivers +# +CONFIG_GPIO_F7188X=m +CONFIG_GPIO_IT87=m +CONFIG_GPIO_SCH=m +CONFIG_GPIO_SCH311X=m +CONFIG_GPIO_WINBOND=m +CONFIG_GPIO_WS16C48=m +# end of Port-mapped I/O GPIO drivers + +# +# I2C GPIO expanders +# +CONFIG_GPIO_ADP5588=m +CONFIG_GPIO_ADNP=m +CONFIG_GPIO_GW_PLD=m +CONFIG_GPIO_MAX7300=m +CONFIG_GPIO_MAX732X=m +CONFIG_GPIO_PCA953X=m +CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_TPIC2810=m +# end of I2C GPIO expanders + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ADP5520=m +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_BD70528=m +CONFIG_GPIO_BD71828=m +CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_CRYSTAL_COVE=m +CONFIG_GPIO_DA9052=m +CONFIG_GPIO_DA9055=m +CONFIG_GPIO_DLN2=m +CONFIG_GPIO_JANZ_TTL=m +CONFIG_GPIO_KEMPLD=m +CONFIG_GPIO_LP3943=m +CONFIG_GPIO_LP873X=m +CONFIG_GPIO_LP87565=m +CONFIG_GPIO_MADERA=m +CONFIG_GPIO_MAX77620=m +CONFIG_GPIO_MAX77650=m +CONFIG_GPIO_PALMAS=y +CONFIG_GPIO_RC5T583=y +CONFIG_GPIO_STMPE=y +CONFIG_GPIO_TC3589X=y +CONFIG_GPIO_TPS65086=m +CONFIG_GPIO_TPS65218=m +CONFIG_GPIO_TPS6586X=y +CONFIG_GPIO_TPS65910=y +CONFIG_GPIO_TPS65912=m +CONFIG_GPIO_TPS68470=y +CONFIG_GPIO_TQMX86=m +CONFIG_GPIO_TWL4030=m +CONFIG_GPIO_TWL6040=m +CONFIG_GPIO_UCB1400=m +CONFIG_GPIO_WHISKEY_COVE=m +CONFIG_GPIO_WM831X=m +CONFIG_GPIO_WM8350=m +CONFIG_GPIO_WM8994=m +# end of MFD GPIO expanders + +# +# PCI GPIO expanders +# +CONFIG_GPIO_AMD8111=m 
+CONFIG_GPIO_ML_IOH=m +CONFIG_GPIO_PCI_IDIO_16=m +CONFIG_GPIO_PCIE_IDIO_24=m +CONFIG_GPIO_RDC321X=m +CONFIG_GPIO_SODAVILLE=y +# end of PCI GPIO expanders + +# +# SPI GPIO expanders +# +CONFIG_GPIO_74X164=m +CONFIG_GPIO_MAX3191X=m +CONFIG_GPIO_MAX7301=m +CONFIG_GPIO_MC33880=m +CONFIG_GPIO_PISOSR=m +CONFIG_GPIO_XRA1403=m +CONFIG_GPIO_MOXTET=m +# end of SPI GPIO expanders + +# +# USB GPIO expanders +# +CONFIG_GPIO_VIPERBOARD=m +# end of USB GPIO expanders + +CONFIG_GPIO_MOCKUP=m +CONFIG_W1=m +CONFIG_W1_CON=y + +# +# 1-wire Bus Masters +# +CONFIG_W1_MASTER_MATROX=m +CONFIG_W1_MASTER_DS2490=m +CONFIG_W1_MASTER_DS2482=m +CONFIG_W1_MASTER_DS1WM=m +CONFIG_W1_MASTER_GPIO=m +CONFIG_W1_MASTER_SGI=m +# end of 1-wire Bus Masters + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +CONFIG_W1_SLAVE_SMEM=m +CONFIG_W1_SLAVE_DS2405=m +CONFIG_W1_SLAVE_DS2408=m +# CONFIG_W1_SLAVE_DS2408_READBACK is not set +CONFIG_W1_SLAVE_DS2413=m +CONFIG_W1_SLAVE_DS2406=m +CONFIG_W1_SLAVE_DS2423=m +CONFIG_W1_SLAVE_DS2805=m +CONFIG_W1_SLAVE_DS2430=m +CONFIG_W1_SLAVE_DS2431=m +CONFIG_W1_SLAVE_DS2433=m +# CONFIG_W1_SLAVE_DS2433_CRC is not set +CONFIG_W1_SLAVE_DS2438=m +CONFIG_W1_SLAVE_DS250X=m +CONFIG_W1_SLAVE_DS2780=m +CONFIG_W1_SLAVE_DS2781=m +CONFIG_W1_SLAVE_DS28E04=m +CONFIG_W1_SLAVE_DS28E17=m +# end of 1-wire Slaves + +CONFIG_POWER_AVS=y +CONFIG_QCOM_CPR=m +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_AS3722=y +CONFIG_POWER_RESET_GPIO=y +CONFIG_POWER_RESET_GPIO_RESTART=y +CONFIG_POWER_RESET_LTC2952=y +CONFIG_POWER_RESET_MT6323=y +CONFIG_POWER_RESET_RESTART=y +CONFIG_POWER_RESET_SYSCON=y +CONFIG_POWER_RESET_SYSCON_POWEROFF=y +CONFIG_REBOOT_MODE=m +CONFIG_SYSCON_REBOOT_MODE=m +CONFIG_NVMEM_REBOOT_MODE=m +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_POWER_SUPPLY_HWMON=y +CONFIG_PDA_POWER=m +CONFIG_GENERIC_ADC_BATTERY=m +CONFIG_MAX8925_POWER=m +CONFIG_WM831X_BACKUP=m +CONFIG_WM831X_POWER=m +CONFIG_WM8350_POWER=m +CONFIG_TEST_POWER=m +CONFIG_BATTERY_88PM860X=m +CONFIG_CHARGER_ADP5061=m +CONFIG_BATTERY_ACT8945A=m +CONFIG_BATTERY_CPCAP=m +CONFIG_BATTERY_DS2760=m +CONFIG_BATTERY_DS2780=m +CONFIG_BATTERY_DS2781=m +CONFIG_BATTERY_DS2782=m +CONFIG_BATTERY_LEGO_EV3=m +CONFIG_BATTERY_SBS=m +CONFIG_CHARGER_SBS=m +CONFIG_MANAGER_SBS=m +CONFIG_BATTERY_BQ27XXX=m +CONFIG_BATTERY_BQ27XXX_I2C=m +CONFIG_BATTERY_BQ27XXX_HDQ=m +# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set +CONFIG_BATTERY_DA9030=m +CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_DA9150=m +CONFIG_BATTERY_DA9150=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m +CONFIG_AXP288_CHARGER=m +CONFIG_AXP288_FUEL_GAUGE=m +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_BATTERY_MAX1721X=m +CONFIG_BATTERY_TWL4030_MADC=m +CONFIG_CHARGER_88PM860X=m +CONFIG_CHARGER_PCF50633=m +CONFIG_BATTERY_RX51=m +CONFIG_CHARGER_ISP1704=m +CONFIG_CHARGER_MAX8903=m +CONFIG_CHARGER_TWL4030=m +CONFIG_CHARGER_LP8727=m +CONFIG_CHARGER_LP8788=m +CONFIG_CHARGER_GPIO=m +CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_LT3651=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_DETECTOR_MAX14656=m +CONFIG_CHARGER_MAX77650=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_MAX8997=m +CONFIG_CHARGER_MAX8998=m +CONFIG_CHARGER_BQ2415X=m +CONFIG_CHARGER_BQ24190=m +CONFIG_CHARGER_BQ24257=m +CONFIG_CHARGER_BQ24735=m +CONFIG_CHARGER_BQ25890=m +CONFIG_CHARGER_SMB347=m +CONFIG_CHARGER_TPS65090=m +CONFIG_CHARGER_TPS65217=m +CONFIG_BATTERY_GAUGE_LTC2941=m +CONFIG_BATTERY_RT5033=m +CONFIG_CHARGER_RT9455=m +CONFIG_CHARGER_CROS_USBPD=m +CONFIG_CHARGER_UCS1002=m +CONFIG_CHARGER_BD70528=m 
+CONFIG_CHARGER_WILCO=m +CONFIG_HWMON=y +CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +CONFIG_SENSORS_ABITUGURU=m +CONFIG_SENSORS_ABITUGURU3=m +CONFIG_SENSORS_AD7314=m +CONFIG_SENSORS_AD7414=m +CONFIG_SENSORS_AD7418=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM1026=m +CONFIG_SENSORS_ADM1029=m +CONFIG_SENSORS_ADM1031=m +CONFIG_SENSORS_ADM1177=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_ADT7X10=m +CONFIG_SENSORS_ADT7310=m +CONFIG_SENSORS_ADT7410=m +CONFIG_SENSORS_ADT7411=m +CONFIG_SENSORS_ADT7462=m +CONFIG_SENSORS_ADT7470=m +CONFIG_SENSORS_ADT7475=m +CONFIG_SENSORS_AS370=m +CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_AXI_FAN_CONTROL=m +CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m +CONFIG_SENSORS_FAM15H_POWER=m +CONFIG_SENSORS_APPLESMC=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m +CONFIG_SENSORS_ATXP1=m +CONFIG_SENSORS_DRIVETEMP=m +CONFIG_SENSORS_DS620=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_DELL_SMM=m +CONFIG_SENSORS_DA9052_ADC=m +CONFIG_SENSORS_DA9055=m +CONFIG_SENSORS_I5K_AMB=m +CONFIG_SENSORS_F71805F=m +CONFIG_SENSORS_F71882FG=m +CONFIG_SENSORS_F75375S=m +CONFIG_SENSORS_MC13783_ADC=m +CONFIG_SENSORS_FSCHMD=m +CONFIG_SENSORS_FTSTEUTATES=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_G760A=m +CONFIG_SENSORS_G762=m +CONFIG_SENSORS_GPIO_FAN=m +CONFIG_SENSORS_HIH6130=m +CONFIG_SENSORS_IBMAEM=m +CONFIG_SENSORS_IBMPEX=m +CONFIG_SENSORS_IIO_HWMON=m +CONFIG_SENSORS_I5500=m +CONFIG_SENSORS_CORETEMP=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_JC42=m +CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LINEAGE=m +CONFIG_SENSORS_LOCHNAGAR=m +CONFIG_SENSORS_LTC2945=m +CONFIG_SENSORS_LTC2947=m +CONFIG_SENSORS_LTC2947_I2C=m +CONFIG_SENSORS_LTC2947_SPI=m +CONFIG_SENSORS_LTC2990=m +CONFIG_SENSORS_LTC4151=m +CONFIG_SENSORS_LTC4215=m +CONFIG_SENSORS_LTC4222=m +CONFIG_SENSORS_LTC4245=m +CONFIG_SENSORS_LTC4260=m +CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_MAX1111=m +CONFIG_SENSORS_MAX16065=m +CONFIG_SENSORS_MAX1619=m +CONFIG_SENSORS_MAX1668=m +CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m +CONFIG_SENSORS_MAX31730=m +CONFIG_SENSORS_MAX6621=m +CONFIG_SENSORS_MAX6639=m +CONFIG_SENSORS_MAX6642=m +CONFIG_SENSORS_MAX6650=m +CONFIG_SENSORS_MAX6697=m +CONFIG_SENSORS_MAX31790=m +CONFIG_SENSORS_MCP3021=m +CONFIG_SENSORS_MLXREG_FAN=m +CONFIG_SENSORS_TC654=m +CONFIG_SENSORS_MENF21BMC_HWMON=m +CONFIG_SENSORS_ADCXX=m +CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM70=m +CONFIG_SENSORS_LM73=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM77=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_LM93=m +CONFIG_SENSORS_LM95234=m +CONFIG_SENSORS_LM95241=m +CONFIG_SENSORS_LM95245=m +CONFIG_SENSORS_PC87360=m +CONFIG_SENSORS_PC87427=m +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_SENSORS_NCT6683=m +CONFIG_SENSORS_NCT6775=m +CONFIG_SENSORS_NCT7802=m +CONFIG_SENSORS_NCT7904=m +CONFIG_SENSORS_NPCM7XX=m +CONFIG_SENSORS_PCF8591=m +CONFIG_PMBUS=m +CONFIG_SENSORS_PMBUS=m +CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_BEL_PFE=m +CONFIG_SENSORS_IBM_CFFPS=m +CONFIG_SENSORS_INSPUR_IPSPS=m +CONFIG_SENSORS_IR35221=m +CONFIG_SENSORS_IR38064=m +CONFIG_SENSORS_IRPS5401=m +CONFIG_SENSORS_ISL68137=m +CONFIG_SENSORS_LM25066=m +CONFIG_SENSORS_LTC2978=m +# CONFIG_SENSORS_LTC2978_REGULATOR is not set +CONFIG_SENSORS_LTC3815=m +CONFIG_SENSORS_MAX16064=m +CONFIG_SENSORS_MAX20730=m +CONFIG_SENSORS_MAX20751=m +CONFIG_SENSORS_MAX31785=m +CONFIG_SENSORS_MAX34440=m 
+CONFIG_SENSORS_MAX8688=m +CONFIG_SENSORS_PXE1610=m +CONFIG_SENSORS_TPS40422=m +CONFIG_SENSORS_TPS53679=m +CONFIG_SENSORS_UCD9000=m +CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_XDPE122=m +CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_PWM_FAN=m +CONFIG_SENSORS_SHT15=m +CONFIG_SENSORS_SHT21=m +CONFIG_SENSORS_SHT3x=m +CONFIG_SENSORS_SHTC1=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_DME1737=m +CONFIG_SENSORS_EMC1403=m +CONFIG_SENSORS_EMC2103=m +CONFIG_SENSORS_EMC6W201=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_SMSC47M192=m +CONFIG_SENSORS_SMSC47B397=m +CONFIG_SENSORS_SCH56XX_COMMON=m +CONFIG_SENSORS_SCH5627=m +CONFIG_SENSORS_SCH5636=m +CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SMM665=m +CONFIG_SENSORS_ADC128D818=m +CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_ADS7871=m +CONFIG_SENSORS_AMC6821=m +CONFIG_SENSORS_INA209=m +CONFIG_SENSORS_INA2XX=m +CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_TC74=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_TMP102=m +CONFIG_SENSORS_TMP103=m +CONFIG_SENSORS_TMP108=m +CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_TMP513=m +CONFIG_SENSORS_VIA_CPUTEMP=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83773G=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83791D=m +CONFIG_SENSORS_W83792D=m +CONFIG_SENSORS_W83793=m +CONFIG_SENSORS_W83795=m +# CONFIG_SENSORS_W83795_FANCTRL is not set +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83L786NG=m +CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM831X=m +CONFIG_SENSORS_WM8350=m +CONFIG_SENSORS_XGENE=m + +# +# ACPI drivers +# +CONFIG_SENSORS_ACPI_POWER=m +CONFIG_SENSORS_ATK0110=m +CONFIG_THERMAL=y +# CONFIG_THERMAL_STATISTICS is not set +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 +CONFIG_THERMAL_HWMON=y +CONFIG_THERMAL_OF=y +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set +CONFIG_THERMAL_GOV_FAIR_SHARE=y +CONFIG_THERMAL_GOV_STEP_WISE=y +CONFIG_THERMAL_GOV_BANG_BANG=y +CONFIG_THERMAL_GOV_USER_SPACE=y +CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y +CONFIG_CPU_THERMAL=y +CONFIG_CPU_FREQ_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y +CONFIG_CLOCK_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +# CONFIG_THERMAL_EMULATION is not set +CONFIG_THERMAL_MMIO=m +CONFIG_MAX77620_THERMAL=m +CONFIG_QORIQ_THERMAL=m +CONFIG_DA9062_THERMAL=m + +# +# Intel thermal drivers +# +CONFIG_INTEL_POWERCLAMP=m +CONFIG_X86_PKG_TEMP_THERMAL=m +CONFIG_INTEL_SOC_DTS_IOSF_CORE=m +CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# +CONFIG_INT340X_THERMAL=m +CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m +CONFIG_PROC_THERMAL_MMIO_RAPL=y +# end of ACPI INT340X thermal drivers + +CONFIG_INTEL_BXT_PMIC_THERMAL=m +CONFIG_INTEL_PCH_THERMAL=m +# end of Intel thermal drivers + +# CONFIG_TI_SOC_THERMAL is not set +CONFIG_GENERIC_ADC_THERMAL=m +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +CONFIG_WATCHDOG_OPEN_TIMEOUT=0 +CONFIG_WATCHDOG_SYSFS=y + +# +# Watchdog Pretimeout Governors +# +CONFIG_WATCHDOG_PRETIMEOUT_GOV=y +CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y +# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set +CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set 
+CONFIG_BD70528_WATCHDOG=m +CONFIG_DA9052_WATCHDOG=m +CONFIG_DA9055_WATCHDOG=m +CONFIG_DA9063_WATCHDOG=m +CONFIG_DA9062_WATCHDOG=m +CONFIG_GPIO_WATCHDOG=m +CONFIG_MENF21BMC_WATCHDOG=m +CONFIG_MENZ069_WATCHDOG=m +CONFIG_WDAT_WDT=m +CONFIG_WM831X_WATCHDOG=m +CONFIG_WM8350_WATCHDOG=m +CONFIG_XILINX_WATCHDOG=m +CONFIG_ZIIRAVE_WATCHDOG=m +CONFIG_RAVE_SP_WATCHDOG=m +CONFIG_MLX_WDT=m +CONFIG_CADENCE_WATCHDOG=m +CONFIG_DW_WATCHDOG=m +CONFIG_RN5T618_WATCHDOG=m +CONFIG_TWL4030_WATCHDOG=m +CONFIG_MAX63XX_WATCHDOG=m +CONFIG_MAX77620_WATCHDOG=m +CONFIG_RETU_WATCHDOG=m +CONFIG_STPMIC1_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_EBC_C384_WDT=m +CONFIG_F71808E_WDT=m +CONFIG_SP5100_TCO=m +CONFIG_SBC_FITPC2_WATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_IBMASR=m +CONFIG_WAFER_WDT=m +CONFIG_I6300ESB_WDT=m +CONFIG_IE6XX_WDT=m +CONFIG_ITCO_WDT=m +CONFIG_ITCO_VENDOR_SUPPORT=y +CONFIG_IT8712F_WDT=m +CONFIG_IT87_WDT=m +CONFIG_HP_WATCHDOG=m +CONFIG_HPWDT_NMI_DECODING=y +CONFIG_KEMPLD_WDT=m +CONFIG_SC1200_WDT=m +CONFIG_PC87413_WDT=m +CONFIG_NV_TCO=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_SMSC_SCH311X_WDT=m +CONFIG_SMSC37B787_WDT=m +CONFIG_TQMX86_WDT=m +CONFIG_VIA_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_W83977F_WDT=m +CONFIG_MACHZ_WDT=m +CONFIG_SBC_EPX_C3_WATCHDOG=m +CONFIG_INTEL_MEI_WDT=m +CONFIG_NI903X_WDT=m +CONFIG_NIC7018_WDT=m +CONFIG_MEN_A21_WDT=m +CONFIG_XEN_WDT=m + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_PCMCIAHOST_POSSIBLE=y +CONFIG_SSB_PCMCIAHOST=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y +CONFIG_SSB_SDIOHOST=y +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +CONFIG_SSB_DRIVER_GPIO=y +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +# CONFIG_BCMA_HOST_SOC is not set +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +CONFIG_BCMA_DRIVER_GPIO=y +# CONFIG_BCMA_DEBUG is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +CONFIG_MFD_ACT8945A=m +CONFIG_MFD_AS3711=y +CONFIG_MFD_AS3722=m +CONFIG_PMIC_ADP5520=y +CONFIG_MFD_AAT2870_CORE=y +CONFIG_MFD_ATMEL_FLEXCOM=m +CONFIG_MFD_ATMEL_HLCDC=m +CONFIG_MFD_BCM590XX=m +CONFIG_MFD_BD9571MWV=m +CONFIG_MFD_AXP20X=m +CONFIG_MFD_AXP20X_I2C=m +CONFIG_MFD_CROS_EC_DEV=m +CONFIG_MFD_MADERA=m +CONFIG_MFD_MADERA_I2C=m +CONFIG_MFD_MADERA_SPI=m +CONFIG_MFD_CS47L15=y +CONFIG_MFD_CS47L35=y +CONFIG_MFD_CS47L85=y +CONFIG_MFD_CS47L90=y +CONFIG_MFD_CS47L92=y +CONFIG_PMIC_DA903X=y +CONFIG_PMIC_DA9052=y +CONFIG_MFD_DA9052_SPI=y +CONFIG_MFD_DA9052_I2C=y +CONFIG_MFD_DA9055=y +CONFIG_MFD_DA9062=m +CONFIG_MFD_DA9063=m +CONFIG_MFD_DA9150=m +CONFIG_MFD_DLN2=m +CONFIG_MFD_MC13XXX=m +CONFIG_MFD_MC13XXX_SPI=m +CONFIG_MFD_MC13XXX_I2C=m +CONFIG_MFD_HI6421_PMIC=m +CONFIG_HTC_PASIC3=m +CONFIG_HTC_I2CPLD=y +CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m +CONFIG_LPC_ICH=m +CONFIG_LPC_SCH=m +CONFIG_INTEL_SOC_PMIC=y +CONFIG_INTEL_SOC_PMIC_BXTWC=m +CONFIG_INTEL_SOC_PMIC_CHTWC=y +CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m +CONFIG_MFD_INTEL_LPSS=m +CONFIG_MFD_INTEL_LPSS_ACPI=m +CONFIG_MFD_INTEL_LPSS_PCI=m +CONFIG_MFD_IQS62X=m +CONFIG_MFD_JANZ_CMODIO=m +CONFIG_MFD_KEMPLD=m +CONFIG_MFD_88PM800=m +CONFIG_MFD_88PM805=m +CONFIG_MFD_88PM860X=y +CONFIG_MFD_MAX14577=m 
+CONFIG_MFD_MAX77620=y +CONFIG_MFD_MAX77650=m +CONFIG_MFD_MAX77686=m +CONFIG_MFD_MAX77693=m +CONFIG_MFD_MAX77843=y +CONFIG_MFD_MAX8907=m +CONFIG_MFD_MAX8925=y +CONFIG_MFD_MAX8997=y +CONFIG_MFD_MAX8998=y +CONFIG_MFD_MT6397=m +CONFIG_MFD_MENF21BMC=m +CONFIG_EZX_PCAP=y +CONFIG_MFD_CPCAP=m +CONFIG_MFD_VIPERBOARD=m +CONFIG_MFD_RETU=m +CONFIG_MFD_PCF50633=m +CONFIG_PCF50633_ADC=m +CONFIG_PCF50633_GPIO=m +CONFIG_UCB1400_CORE=m +CONFIG_MFD_RDC321X=m +CONFIG_MFD_RT5033=m +CONFIG_MFD_RC5T583=y +CONFIG_MFD_RK808=m +CONFIG_MFD_RN5T618=m +CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SI476X_CORE=m +CONFIG_MFD_SM501=m +CONFIG_MFD_SM501_GPIO=y +CONFIG_MFD_SKY81452=m +CONFIG_MFD_SMSC=y +CONFIG_ABX500_CORE=y +CONFIG_AB3100_CORE=y +CONFIG_AB3100_OTP=y +CONFIG_MFD_STMPE=y + +# +# STMicroelectronics STMPE Interface Drivers +# +CONFIG_STMPE_I2C=y +CONFIG_STMPE_SPI=y +# end of STMicroelectronics STMPE Interface Drivers + +CONFIG_MFD_SYSCON=y +CONFIG_MFD_TI_AM335X_TSCADC=m +CONFIG_MFD_LP3943=m +CONFIG_MFD_LP8788=y +CONFIG_MFD_TI_LMU=m +CONFIG_MFD_PALMAS=y +CONFIG_TPS6105X=m +CONFIG_TPS65010=m +CONFIG_TPS6507X=m +CONFIG_MFD_TPS65086=m +CONFIG_MFD_TPS65090=y +CONFIG_MFD_TPS65217=m +CONFIG_MFD_TPS68470=y +CONFIG_MFD_TI_LP873X=m +CONFIG_MFD_TI_LP87565=m +CONFIG_MFD_TPS65218=m +CONFIG_MFD_TPS6586X=y +CONFIG_MFD_TPS65910=y +CONFIG_MFD_TPS65912=m +CONFIG_MFD_TPS65912_I2C=m +CONFIG_MFD_TPS65912_SPI=m +CONFIG_MFD_TPS80031=y +CONFIG_TWL4030_CORE=y +CONFIG_MFD_TWL4030_AUDIO=y +CONFIG_TWL6040_CORE=y +CONFIG_MFD_WL1273_CORE=m +CONFIG_MFD_LM3533=m +CONFIG_MFD_TC3589X=y +CONFIG_MFD_TQMX86=m +CONFIG_MFD_VX855=m +CONFIG_MFD_LOCHNAGAR=y +CONFIG_MFD_ARIZONA=y +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +CONFIG_MFD_CS47L24=y +CONFIG_MFD_WM5102=y +CONFIG_MFD_WM5110=y +CONFIG_MFD_WM8997=y +CONFIG_MFD_WM8998=y +CONFIG_MFD_WM8400=y +CONFIG_MFD_WM831X=y +CONFIG_MFD_WM831X_I2C=y +CONFIG_MFD_WM831X_SPI=y +CONFIG_MFD_WM8350=y +CONFIG_MFD_WM8350_I2C=y +CONFIG_MFD_WM8994=m +CONFIG_MFD_ROHM_BD718XX=m +CONFIG_MFD_ROHM_BD70528=m +CONFIG_MFD_ROHM_BD71828=m +CONFIG_MFD_STPMIC1=m +CONFIG_MFD_STMFX=m +CONFIG_MFD_WCD934X=m +CONFIG_RAVE_SP_CORE=m +# end of Multifunction device drivers + +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set +CONFIG_REGULATOR_FIXED_VOLTAGE=m +CONFIG_REGULATOR_VIRTUAL_CONSUMER=m +CONFIG_REGULATOR_USERSPACE_CONSUMER=m +CONFIG_REGULATOR_88PG86X=m +CONFIG_REGULATOR_88PM800=m +CONFIG_REGULATOR_88PM8607=m +CONFIG_REGULATOR_ACT8865=m +CONFIG_REGULATOR_ACT8945A=m +CONFIG_REGULATOR_AD5398=m +CONFIG_REGULATOR_AAT2870=m +CONFIG_REGULATOR_AB3100=m +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +CONFIG_REGULATOR_AS3711=m +CONFIG_REGULATOR_AS3722=m +CONFIG_REGULATOR_AXP20X=m +CONFIG_REGULATOR_BCM590XX=m +CONFIG_REGULATOR_BD70528=m +CONFIG_REGULATOR_BD71828=m +CONFIG_REGULATOR_BD718XX=m +CONFIG_REGULATOR_BD9571MWV=m +CONFIG_REGULATOR_CPCAP=m +CONFIG_REGULATOR_DA903X=m +CONFIG_REGULATOR_DA9052=m +CONFIG_REGULATOR_DA9055=m +CONFIG_REGULATOR_DA9062=m +CONFIG_REGULATOR_DA9063=m +CONFIG_REGULATOR_DA9210=m +CONFIG_REGULATOR_DA9211=m +CONFIG_REGULATOR_FAN53555=m +CONFIG_REGULATOR_GPIO=m +CONFIG_REGULATOR_HI6421=m +CONFIG_REGULATOR_HI6421V530=m +CONFIG_REGULATOR_ISL9305=m +CONFIG_REGULATOR_ISL6271A=m +CONFIG_REGULATOR_LM363X=m +CONFIG_REGULATOR_LOCHNAGAR=m +CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_LP3972=m +CONFIG_REGULATOR_LP872X=m +CONFIG_REGULATOR_LP873X=m +CONFIG_REGULATOR_LP8755=m +CONFIG_REGULATOR_LP87565=m +CONFIG_REGULATOR_LP8788=m +CONFIG_REGULATOR_LTC3589=m +CONFIG_REGULATOR_LTC3676=m 
+CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX77620=m +CONFIG_REGULATOR_MAX77650=m +CONFIG_REGULATOR_MAX8649=m +CONFIG_REGULATOR_MAX8660=m +CONFIG_REGULATOR_MAX8907=m +CONFIG_REGULATOR_MAX8925=m +CONFIG_REGULATOR_MAX8952=m +CONFIG_REGULATOR_MAX8973=m +CONFIG_REGULATOR_MAX8997=m +CONFIG_REGULATOR_MAX8998=m +CONFIG_REGULATOR_MAX77686=m +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MAX77802=m +CONFIG_REGULATOR_MC13XXX_CORE=m +CONFIG_REGULATOR_MC13783=m +CONFIG_REGULATOR_MC13892=m +CONFIG_REGULATOR_MCP16502=m +CONFIG_REGULATOR_MP5416=m +CONFIG_REGULATOR_MP8859=m +CONFIG_REGULATOR_MP886X=m +CONFIG_REGULATOR_MPQ7920=m +CONFIG_REGULATOR_MT6311=m +CONFIG_REGULATOR_MT6323=m +CONFIG_REGULATOR_MT6397=m +CONFIG_REGULATOR_PALMAS=m +CONFIG_REGULATOR_PCAP=m +CONFIG_REGULATOR_PCF50633=m +CONFIG_REGULATOR_PFUZE100=m +CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m +CONFIG_REGULATOR_PV88090=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_SPMI=m +CONFIG_REGULATOR_RC5T583=m +CONFIG_REGULATOR_RK808=m +CONFIG_REGULATOR_RN5T618=m +CONFIG_REGULATOR_ROHM=m +CONFIG_REGULATOR_RT5033=m +CONFIG_REGULATOR_S2MPA01=m +CONFIG_REGULATOR_S2MPS11=m +CONFIG_REGULATOR_S5M8767=m +CONFIG_REGULATOR_SKY81452=m +CONFIG_REGULATOR_SLG51000=m +CONFIG_REGULATOR_STPMIC1=m +CONFIG_REGULATOR_SY8106A=m +CONFIG_REGULATOR_SY8824X=m +CONFIG_REGULATOR_TPS51632=m +CONFIG_REGULATOR_TPS6105X=m +CONFIG_REGULATOR_TPS62360=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m +CONFIG_REGULATOR_TPS65086=m +CONFIG_REGULATOR_TPS65090=m +CONFIG_REGULATOR_TPS65132=m +CONFIG_REGULATOR_TPS65217=m +CONFIG_REGULATOR_TPS65218=m +CONFIG_REGULATOR_TPS6524X=m +CONFIG_REGULATOR_TPS6586X=m +CONFIG_REGULATOR_TPS65910=m +CONFIG_REGULATOR_TPS65912=m +CONFIG_REGULATOR_TPS80031=m +CONFIG_REGULATOR_TWL4030=m +CONFIG_REGULATOR_VCTRL=m +CONFIG_REGULATOR_WM831X=m +CONFIG_REGULATOR_WM8350=m +CONFIG_REGULATOR_WM8400=m +CONFIG_REGULATOR_WM8994=m +CONFIG_CEC_CORE=m +CONFIG_CEC_NOTIFIER=y +CONFIG_CEC_PIN=y +CONFIG_RC_CORE=m +CONFIG_RC_MAP=m +CONFIG_LIRC=y +CONFIG_RC_DECODERS=y +CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m +CONFIG_IR_RC6_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_SANYO_DECODER=m +CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_IR_IMON_DECODER=m +CONFIG_IR_RCMM_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_RC_ATI_REMOTE=m +CONFIG_IR_ENE=m +CONFIG_IR_HIX5HD2=m +CONFIG_IR_IMON=m +CONFIG_IR_IMON_RAW=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_ITE_CIR=m +CONFIG_IR_FINTEK=m +CONFIG_IR_NUVOTON=m +CONFIG_IR_REDRAT3=m +CONFIG_IR_SPI=m +CONFIG_IR_STREAMZAP=m +CONFIG_IR_WINBOND_CIR=m +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_LOOPBACK=m +CONFIG_IR_GPIO_CIR=m +CONFIG_IR_GPIO_TX=m +CONFIG_IR_PWM_TX=m +CONFIG_IR_SERIAL=m +CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m +CONFIG_RC_XBOX_DVD=m +CONFIG_MEDIA_SUPPORT=m + +# +# Multimedia core support +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_CEC_SUPPORT=y +CONFIG_MEDIA_CEC_RC=y +# CONFIG_CEC_PIN_ERROR_INJ is not set +CONFIG_MEDIA_CONTROLLER=y +CONFIG_MEDIA_CONTROLLER_DVB=y +# CONFIG_MEDIA_CONTROLLER_REQUEST_API is not set +CONFIG_VIDEO_DEV=m +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_VIDEO_V4L2=m +CONFIG_VIDEO_V4L2_I2C=y +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_MEM2MEM_DEV=m 
+CONFIG_V4L2_FLASH_LED_CLASS=m +CONFIG_V4L2_FWNODE=m +CONFIG_VIDEOBUF_GEN=m +CONFIG_VIDEOBUF_DMA_SG=m +CONFIG_VIDEOBUF_VMALLOC=m +CONFIG_DVB_CORE=m +CONFIG_DVB_MMAP=y +CONFIG_DVB_NET=y +CONFIG_TTPCI_EEPROM=m +CONFIG_DVB_MAX_ADAPTERS=16 +# CONFIG_DVB_DYNAMIC_MINORS is not set +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set +# CONFIG_DVB_ULE_DEBUG is not set + +# +# Media drivers +# +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +CONFIG_USB_VIDEO_CLASS=m +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y +CONFIG_USB_GSPCA=m +CONFIG_USB_M5602=m +CONFIG_USB_STV06XX=m +CONFIG_USB_GL860=m +CONFIG_USB_GSPCA_BENQ=m +CONFIG_USB_GSPCA_CONEX=m +CONFIG_USB_GSPCA_CPIA1=m +CONFIG_USB_GSPCA_DTCS033=m +CONFIG_USB_GSPCA_ETOMS=m +CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m +CONFIG_USB_GSPCA_JL2005BCD=m +CONFIG_USB_GSPCA_KINECT=m +CONFIG_USB_GSPCA_KONICA=m +CONFIG_USB_GSPCA_MARS=m +CONFIG_USB_GSPCA_MR97310A=m +CONFIG_USB_GSPCA_NW80X=m +CONFIG_USB_GSPCA_OV519=m +CONFIG_USB_GSPCA_OV534=m +CONFIG_USB_GSPCA_OV534_9=m +CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m +CONFIG_USB_GSPCA_PAC7311=m +CONFIG_USB_GSPCA_SE401=m +CONFIG_USB_GSPCA_SN9C2028=m +CONFIG_USB_GSPCA_SN9C20X=m +CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SPCA1528=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_SQ930X=m +CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_STK1135=m +CONFIG_USB_GSPCA_STV0680=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TOPRO=m +CONFIG_USB_GSPCA_TOUPTEK=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_VICAM=m +CONFIG_USB_GSPCA_XIRLINK_CIT=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_USB_PWC=m +# CONFIG_USB_PWC_DEBUG is not set +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_VIDEO_CPIA2=m +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m +CONFIG_VIDEO_USBTV=m + +# +# Analog TV USB devices +# +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m +CONFIG_VIDEO_GO7007=m +CONFIG_VIDEO_GO7007_USB=m +CONFIG_VIDEO_GO7007_LOADER=m +CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_TM6000=m +CONFIG_VIDEO_TM6000_ALSA=m +CONFIG_VIDEO_TM6000_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_CXUSB_ANALOG=y +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_V2=m +CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_AF9035=m 
+CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_USB_DRV=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_AS102=m + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m +CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +CONFIG_USB_AIRSPY=m +CONFIG_USB_HACKRF=m +CONFIG_USB_MSI2500=m + +# +# USB HDMI CEC adapters +# +CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m +CONFIG_MEDIA_PCI_SUPPORT=y + +# +# Media capture support +# +CONFIG_VIDEO_MEYE=m +CONFIG_VIDEO_SOLO6X10=m +CONFIG_VIDEO_TW5864=m +CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m + +# +# Media capture/analog TV support +# +CONFIG_VIDEO_IVTV=m +# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set +CONFIG_VIDEO_IVTV_ALSA=m +CONFIG_VIDEO_FB_IVTV=m +# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set +CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DT3155=m + +# +# Media capture/analog/hybrid TV support +# +CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_CX18_ALSA=m +CONFIG_VIDEO_CX23885=m +CONFIG_MEDIA_ALTERA_CI=m +CONFIG_VIDEO_CX25821=m +CONFIG_VIDEO_CX25821_ALSA=m +CONFIG_VIDEO_CX88=m +CONFIG_VIDEO_CX88_ALSA=m +CONFIG_VIDEO_CX88_BLACKBIRD=m +CONFIG_VIDEO_CX88_DVB=m +CONFIG_VIDEO_CX88_ENABLE_VP3054=y +CONFIG_VIDEO_CX88_VP3054=m +CONFIG_VIDEO_CX88_MPEG=m +CONFIG_VIDEO_BT848=m +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m +CONFIG_VIDEO_SAA7134_RC=y +CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_GO7007=m +CONFIG_VIDEO_SAA7164=m + +# +# Media digital TV PCI Adapters +# +CONFIG_DVB_AV7110_IR=y +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m +CONFIG_DVB_B2C2_FLEXCOP_PCI=m +# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set +CONFIG_DVB_PLUTO2=m +CONFIG_DVB_DM1105=m +CONFIG_DVB_PT1=m +CONFIG_DVB_PT3=m +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m +CONFIG_DVB_NGENE=m +CONFIG_DVB_DDBRIDGE=m +# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set +CONFIG_DVB_SMIPCIE=m +CONFIG_DVB_NETUP_UNIDVB=m +CONFIG_VIDEO_IPU3_CIO2=m +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CAFE_CCIC=m +CONFIG_VIDEO_CADENCE=y +CONFIG_VIDEO_CADENCE_CSI2RX=m +CONFIG_VIDEO_CADENCE_CSI2TX=m +CONFIG_VIDEO_ASPEED=m +CONFIG_VIDEO_MUX=m +CONFIG_VIDEO_XILINX=m +CONFIG_VIDEO_XILINX_TPG=m +CONFIG_VIDEO_XILINX_VTC=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m +CONFIG_VIDEO_SH_VEU=m +CONFIG_V4L_TEST_DRIVERS=y +CONFIG_VIDEO_VIMC=m +CONFIG_VIDEO_VIVID=m +CONFIG_VIDEO_VIVID_CEC=y +CONFIG_VIDEO_VIVID_MAX_DEVS=64 +CONFIG_VIDEO_VIM2M=m +CONFIG_VIDEO_VICODEC=m +CONFIG_DVB_PLATFORM_DRIVERS=y +CONFIG_CEC_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CROS_EC_CEC=m +CONFIG_CEC_GPIO=m +CONFIG_VIDEO_SECO_CEC=m +CONFIG_VIDEO_SECO_RC=y +CONFIG_SDR_PLATFORM_DRIVERS=y + +# +# Supported MMC/SDIO adapters +# +CONFIG_SMS_SDIO_DRV=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_TEA575X=m +CONFIG_RADIO_SI470X=m +CONFIG_USB_SI470X=m +CONFIG_I2C_SI470X=m +CONFIG_RADIO_SI4713=m +CONFIG_USB_SI4713=m +CONFIG_PLATFORM_SI4713=m +CONFIG_I2C_SI4713=m +CONFIG_RADIO_SI476X=m +CONFIG_USB_MR800=m 
+CONFIG_USB_DSBR=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_SHARK=m +CONFIG_RADIO_SHARK2=m +CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m +CONFIG_USB_MA901=m +CONFIG_RADIO_TEA5764=m +CONFIG_RADIO_SAA7706H=m +CONFIG_RADIO_TEF6862=m +CONFIG_RADIO_WL1273=m + +# +# Texas Instruments WL128x FM driver (ST based) +# +CONFIG_RADIO_WL128X=m +# end of Texas Instruments WL128x FM driver (ST based) + +# +# Supported FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_INPUT=y +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_V4L2=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +CONFIG_VIDEOBUF2_DMA_SG=m +CONFIG_VIDEOBUF2_DVB=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set +CONFIG_VIDEO_V4L2_TPG=m + +# +# Media ancillary drivers (tuners, sensors, i2c, spi, frontends) +# +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y +CONFIG_MEDIA_ATTACH=y +CONFIG_VIDEO_IR_I2C=m + +# +# I2C Encoders, decoders, sensors and other helper chips +# + +# +# Audio decoders, processors and mixers +# +CONFIG_VIDEO_TVAUDIO=m +CONFIG_VIDEO_TDA7432=m +CONFIG_VIDEO_TDA9840=m +CONFIG_VIDEO_TDA1997X=m +CONFIG_VIDEO_TEA6415C=m +CONFIG_VIDEO_TEA6420=m +CONFIG_VIDEO_MSP3400=m +CONFIG_VIDEO_CS3308=m +CONFIG_VIDEO_CS5345=m +CONFIG_VIDEO_CS53L32A=m +CONFIG_VIDEO_TLV320AIC23B=m +CONFIG_VIDEO_UDA1342=m +CONFIG_VIDEO_WM8775=m +CONFIG_VIDEO_WM8739=m +CONFIG_VIDEO_VP27SMPX=m +CONFIG_VIDEO_SONY_BTF_MPX=m + +# +# RDS decoders +# +CONFIG_VIDEO_SAA6588=m + +# +# Video decoders +# +CONFIG_VIDEO_ADV7180=m +CONFIG_VIDEO_ADV7183=m +CONFIG_VIDEO_ADV748X=m +CONFIG_VIDEO_ADV7604=m +CONFIG_VIDEO_ADV7604_CEC=y +CONFIG_VIDEO_ADV7842=m +CONFIG_VIDEO_ADV7842_CEC=y +CONFIG_VIDEO_BT819=m +CONFIG_VIDEO_BT856=m +CONFIG_VIDEO_BT866=m +CONFIG_VIDEO_KS0127=m +CONFIG_VIDEO_ML86V7667=m +CONFIG_VIDEO_SAA7110=m +CONFIG_VIDEO_SAA711X=m +CONFIG_VIDEO_TC358743=m +CONFIG_VIDEO_TC358743_CEC=y +CONFIG_VIDEO_TVP514X=m +CONFIG_VIDEO_TVP5150=m +CONFIG_VIDEO_TVP7002=m +CONFIG_VIDEO_TW2804=m +CONFIG_VIDEO_TW9903=m +CONFIG_VIDEO_TW9906=m +CONFIG_VIDEO_TW9910=m +CONFIG_VIDEO_VPX3220=m + +# +# Video and audio decoders +# +CONFIG_VIDEO_SAA717X=m +CONFIG_VIDEO_CX25840=m + +# +# Video encoders +# +CONFIG_VIDEO_SAA7127=m +CONFIG_VIDEO_SAA7185=m +CONFIG_VIDEO_ADV7170=m +CONFIG_VIDEO_ADV7175=m +CONFIG_VIDEO_ADV7343=m +CONFIG_VIDEO_ADV7393=m +CONFIG_VIDEO_AD9389B=m +CONFIG_VIDEO_AK881X=m +CONFIG_VIDEO_THS8200=m + +# +# Camera sensor devices +# +CONFIG_VIDEO_APTINA_PLL=m +CONFIG_VIDEO_SMIAPP_PLL=m +CONFIG_VIDEO_HI556=m +CONFIG_VIDEO_IMX214=m +CONFIG_VIDEO_IMX219=m +CONFIG_VIDEO_IMX258=m +CONFIG_VIDEO_IMX274=m +CONFIG_VIDEO_IMX290=m +CONFIG_VIDEO_IMX319=m +CONFIG_VIDEO_IMX355=m +CONFIG_VIDEO_OV2640=m +CONFIG_VIDEO_OV2659=m +CONFIG_VIDEO_OV2680=m +CONFIG_VIDEO_OV2685=m +CONFIG_VIDEO_OV5640=m +CONFIG_VIDEO_OV5645=m +CONFIG_VIDEO_OV5647=m +CONFIG_VIDEO_OV6650=m +CONFIG_VIDEO_OV5670=m +CONFIG_VIDEO_OV5675=m +CONFIG_VIDEO_OV5695=m +CONFIG_VIDEO_OV7251=m +CONFIG_VIDEO_OV772X=m +CONFIG_VIDEO_OV7640=m +CONFIG_VIDEO_OV7670=m +CONFIG_VIDEO_OV7740=m +CONFIG_VIDEO_OV8856=m +CONFIG_VIDEO_OV9640=m +CONFIG_VIDEO_OV9650=m +CONFIG_VIDEO_OV13858=m +CONFIG_VIDEO_VS6624=m +CONFIG_VIDEO_MT9M001=m +CONFIG_VIDEO_MT9M032=m +CONFIG_VIDEO_MT9M111=m +CONFIG_VIDEO_MT9P031=m +CONFIG_VIDEO_MT9T001=m +CONFIG_VIDEO_MT9T112=m 
+CONFIG_VIDEO_MT9V011=m +CONFIG_VIDEO_MT9V032=m +CONFIG_VIDEO_MT9V111=m +CONFIG_VIDEO_SR030PC30=m +CONFIG_VIDEO_NOON010PC30=m +CONFIG_VIDEO_M5MOLS=m +CONFIG_VIDEO_RJ54N1=m +CONFIG_VIDEO_S5K6AA=m +CONFIG_VIDEO_S5K6A3=m +CONFIG_VIDEO_S5K4ECGX=m +CONFIG_VIDEO_S5K5BAF=m +CONFIG_VIDEO_SMIAPP=m +CONFIG_VIDEO_ET8EK8=m +CONFIG_VIDEO_S5C73M3=m + +# +# Lens drivers +# +CONFIG_VIDEO_AD5820=m +CONFIG_VIDEO_AK7375=m +CONFIG_VIDEO_DW9714=m +CONFIG_VIDEO_DW9807_VCM=m + +# +# Flash devices +# +CONFIG_VIDEO_ADP1653=m +CONFIG_VIDEO_LM3560=m +CONFIG_VIDEO_LM3646=m + +# +# Video improvement chips +# +CONFIG_VIDEO_UPD64031A=m +CONFIG_VIDEO_UPD64083=m + +# +# Audio/Video compression chips +# +CONFIG_VIDEO_SAA6752HS=m + +# +# SDR tuner chips +# +CONFIG_SDR_MAX2175=m + +# +# Miscellaneous helper chips +# +CONFIG_VIDEO_THS7303=m +CONFIG_VIDEO_M52790=m +CONFIG_VIDEO_I2C=m +CONFIG_VIDEO_ST_MIPID02=m +# end of I2C Encoders, decoders, sensors and other helper chips + +# +# SPI helper chips +# +CONFIG_VIDEO_GS1662=m +# end of SPI helper chips + +# +# Media SPI Adapters +# +CONFIG_CXD2880_SPI_DRV=m +# end of Media SPI Adapters + +CONFIG_MEDIA_TUNER=m + +# +# Customize TV tuners +# +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA18250=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_MSI001=m +CONFIG_MEDIA_TUNER_MT20XX=m +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m +CONFIG_MEDIA_TUNER_MT2266=m +CONFIG_MEDIA_TUNER_MT2131=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_XC2028=m +CONFIG_MEDIA_TUNER_XC5000=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88RS6000T=m +CONFIG_MEDIA_TUNER_TUA9001=m +CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_IT913X=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_MXL301RF=m +CONFIG_MEDIA_TUNER_QM1D1C0042=m +CONFIG_MEDIA_TUNER_QM1D1B0004=m +# end of Customize TV tuners + +# +# Customise DVB Frontends +# + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV0910=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_STV6111=m +CONFIG_DVB_MXL5XX=m +CONFIG_DVB_M88DS3103=m + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_TDA18271C2DD=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m + +# +# DVB-S (satellite) frontends +# +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m +CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_CX24117=m +CONFIG_DVB_CX24120=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_TS2020=m +CONFIG_DVB_DS3000=m +CONFIG_DVB_MB86A16=m +CONFIG_DVB_TDA10071=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m +CONFIG_DVB_S5H1432=m +CONFIG_DVB_DRXD=m 
+CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_DIB9000=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m +CONFIG_DVB_STV0367=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +CONFIG_DVB_SI2168=m +CONFIG_DVB_AS102_FE=m +CONFIG_DVB_ZD1301_DEMOD=m +CONFIG_DVB_GP8PSK_FE=m +CONFIG_DVB_CXD2880=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_S921=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +CONFIG_DVB_TC90522=m +CONFIG_DVB_MN88443X=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_DRX39XYJ=m +CONFIG_DVB_LNBH25=m +CONFIG_DVB_LNBH29=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_LNBP22=m +CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_A8293=m +CONFIG_DVB_LGS8GL5=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DVB_ATBM8830=m +CONFIG_DVB_TDA665x=m +CONFIG_DVB_IX2505V=m +CONFIG_DVB_M88RS2000=m +CONFIG_DVB_AF9033=m +CONFIG_DVB_HORUS3A=m +CONFIG_DVB_ASCOT2E=m +CONFIG_DVB_HELENE=m + +# +# Common Interface (EN50221) controller drivers +# +CONFIG_DVB_CXD2099=m +CONFIG_DVB_SP2=m + +# +# Tools to develop new frontends +# +CONFIG_DVB_DUMMY_FE=m +# end of Customise DVB Frontends + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +CONFIG_INTEL_GTT=m +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=10 +CONFIG_VGA_SWITCHEROO=y +CONFIG_DRM=m +CONFIG_DRM_MIPI_DBI=m +CONFIG_DRM_MIPI_DSI=y +CONFIG_DRM_DP_AUX_CHARDEV=y +# CONFIG_DRM_DEBUG_SELFTEST is not set +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_KMS_FB_HELPER=y +# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set +CONFIG_DRM_LOAD_EDID_FIRMWARE=y +CONFIG_DRM_DP_CEC=y +CONFIG_DRM_TTM=m +CONFIG_DRM_TTM_DMA_PAGE_POOL=y +CONFIG_DRM_VRAM_HELPER=m +CONFIG_DRM_TTM_HELPER=m +CONFIG_DRM_GEM_CMA_HELPER=y +CONFIG_DRM_KMS_CMA_HELPER=y +CONFIG_DRM_GEM_SHMEM_HELPER=y +CONFIG_DRM_SCHED=m + +# +# I2C encoder or helper chips +# +CONFIG_DRM_I2C_CH7006=m +CONFIG_DRM_I2C_SIL164=m +CONFIG_DRM_I2C_NXP_TDA998X=m +CONFIG_DRM_I2C_NXP_TDA9950=m +# end of I2C encoder or helper chips + +# +# ARM devices +# +CONFIG_DRM_KOMEDA=m +# end of ARM devices + +CONFIG_DRM_RADEON=m +CONFIG_DRM_RADEON_USERPTR=y +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_AMDGPU_USERPTR=y +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set + +# +# ACP (Audio CoProcessor) Configuration +# +CONFIG_DRM_AMD_ACP=y +# end of ACP (Audio CoProcessor) Configuration + +# +# Display Engine Configuration +# +CONFIG_DRM_AMD_DC=y +CONFIG_DRM_AMD_DC_DCN=y +CONFIG_DRM_AMD_DC_HDCP=y +# CONFIG_DEBUG_KERNEL_DC is not set 
+# end of Display Engine Configuration + +CONFIG_HSA_AMD=y +CONFIG_DRM_NOUVEAU=m +# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +# CONFIG_NOUVEAU_DEBUG_MMU is not set +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_NOUVEAU_SVM=y +CONFIG_DRM_I915=m +CONFIG_DRM_I915_FORCE_PROBE="*" +CONFIG_DRM_I915_CAPTURE_ERROR=y +CONFIG_DRM_I915_COMPRESS_ERROR=y +CONFIG_DRM_I915_USERPTR=y +CONFIG_DRM_I915_GVT=y +CONFIG_DRM_I915_GVT_KVMGT=m + +# +# drm/i915 Debugging +# +# CONFIG_DRM_I915_WERROR is not set +# CONFIG_DRM_I915_DEBUG is not set +# CONFIG_DRM_I915_DEBUG_MMIO is not set +# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set +# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set +# CONFIG_DRM_I915_DEBUG_GUC is not set +# CONFIG_DRM_I915_SELFTEST is not set +# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set +# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set +# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set +# end of drm/i915 Debugging + +# +# drm/i915 Profile Guided Optimisation +# +CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 +CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 +CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 +CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT=8000 +CONFIG_DRM_I915_STOP_TIMEOUT=100 +CONFIG_DRM_I915_TIMESLICE_DURATION=1 +# end of drm/i915 Profile Guided Optimisation + +CONFIG_DRM_VGEM=m +CONFIG_DRM_VKMS=m +CONFIG_DRM_VMWGFX=m +CONFIG_DRM_VMWGFX_FBCON=y +CONFIG_DRM_GMA500=m +CONFIG_DRM_GMA600=y +CONFIG_DRM_GMA3600=y +CONFIG_DRM_UDL=m +CONFIG_DRM_AST=m +CONFIG_DRM_MGAG200=m +CONFIG_DRM_CIRRUS_QEMU=m +CONFIG_DRM_RCAR_DW_HDMI=m +CONFIG_DRM_RCAR_LVDS=m +CONFIG_DRM_QXL=m +CONFIG_DRM_BOCHS=m +CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +CONFIG_DRM_PANEL_ARM_VERSATILE=m +CONFIG_DRM_PANEL_BOE_HIMAX8279D=m +CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=m +CONFIG_DRM_PANEL_LVDS=m +CONFIG_DRM_PANEL_SIMPLE=m +CONFIG_DRM_PANEL_ELIDA_KD35T133=m +CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02=m +CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m +CONFIG_DRM_PANEL_ILITEK_IL9322=m +CONFIG_DRM_PANEL_ILITEK_ILI9881C=m +CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m +CONFIG_DRM_PANEL_JDI_LT070ME05000=m +CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m +CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829=m +CONFIG_DRM_PANEL_SAMSUNG_LD9040=m +CONFIG_DRM_PANEL_LG_LB035Q02=m +CONFIG_DRM_PANEL_LG_LG4573=m +CONFIG_DRM_PANEL_NEC_NL8048HL11=m +CONFIG_DRM_PANEL_NOVATEK_NT35510=m +CONFIG_DRM_PANEL_NOVATEK_NT39016=m +CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m +CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m +CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m +CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m +CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m +CONFIG_DRM_PANEL_RAYDIUM_RM67191=m +CONFIG_DRM_PANEL_RAYDIUM_RM68200=m +CONFIG_DRM_PANEL_ROCKTECH_JH057N00900=m +CONFIG_DRM_PANEL_RONBO_RB070D30=m +CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01=m +CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m +CONFIG_DRM_PANEL_SEIKO_43WVF1G=m +CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m +CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m +CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m +CONFIG_DRM_PANEL_SITRONIX_ST7701=m +CONFIG_DRM_PANEL_SITRONIX_ST7789V=m +CONFIG_DRM_PANEL_SONY_ACX424AKP=m +CONFIG_DRM_PANEL_SONY_ACX565AKM=m +CONFIG_DRM_PANEL_TPO_TD028TTEC1=m +CONFIG_DRM_PANEL_TPO_TD043MTEA1=m +CONFIG_DRM_PANEL_TPO_TPG110=m +CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m +CONFIG_DRM_PANEL_XINPENG_XPP055C272=m +# end of Display Panels + +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# 
Display Interface Bridges +# +CONFIG_DRM_CDNS_DSI=m +CONFIG_DRM_DISPLAY_CONNECTOR=m +CONFIG_DRM_LVDS_CODEC=m +CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m +CONFIG_DRM_NXP_PTN3460=m +CONFIG_DRM_PARADE_PS8622=m +CONFIG_DRM_PARADE_PS8640=m +CONFIG_DRM_SIL_SII8620=m +CONFIG_DRM_SII902X=m +CONFIG_DRM_SII9234=m +CONFIG_DRM_SIMPLE_BRIDGE=m +CONFIG_DRM_THINE_THC63LVD1024=m +CONFIG_DRM_TOSHIBA_TC358764=m +CONFIG_DRM_TOSHIBA_TC358767=m +CONFIG_DRM_TOSHIBA_TC358768=m +CONFIG_DRM_TI_TFP410=m +CONFIG_DRM_TI_SN65DSI86=m +CONFIG_DRM_TI_TPD12S015=m +CONFIG_DRM_ANALOGIX_ANX6345=m +CONFIG_DRM_ANALOGIX_ANX78XX=m +CONFIG_DRM_ANALOGIX_DP=m +CONFIG_DRM_I2C_ADV7511=m +CONFIG_DRM_I2C_ADV7511_AUDIO=y +CONFIG_DRM_I2C_ADV7511_CEC=y +CONFIG_DRM_DW_HDMI=m +CONFIG_DRM_DW_HDMI_AHB_AUDIO=m +CONFIG_DRM_DW_HDMI_I2S_AUDIO=m +CONFIG_DRM_DW_HDMI_CEC=m +# end of Display Interface Bridges + +# CONFIG_DRM_ETNAVIV is not set +CONFIG_DRM_ARCPGU=m +CONFIG_DRM_MXS=y +CONFIG_DRM_MXSFB=m +CONFIG_DRM_GM12U320=m +CONFIG_TINYDRM_HX8357D=m +CONFIG_TINYDRM_ILI9225=m +CONFIG_TINYDRM_ILI9341=m +CONFIG_TINYDRM_ILI9486=m +CONFIG_TINYDRM_MI0283QT=m +CONFIG_TINYDRM_REPAPER=m +CONFIG_TINYDRM_ST7586=m +CONFIG_TINYDRM_ST7735R=m +CONFIG_DRM_XEN=y +CONFIG_DRM_XEN_FRONTEND=m +CONFIG_DRM_VBOXVIDEO=m +# CONFIG_DRM_LEGACY is not set +CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y + +# +# Frame buffer Devices +# +CONFIG_FB_CMDLINE=y +CONFIG_FB_NOTIFY=y +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_BOOT_VESA_SUPPORT=y +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +# CONFIG_FB_FOREIGN_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +CONFIG_FB_BACKLIGHT=m +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +CONFIG_FB_VESA=y +CONFIG_FB_EFI=y +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_INTEL is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_VIA is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set +# CONFIG_FB_SM501 is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +CONFIG_XEN_FBDEV_FRONTEND=m +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +CONFIG_FB_HYPERV=m +CONFIG_FB_SIMPLE=y +# CONFIG_FB_SSD1307 is not set +# CONFIG_FB_SM712 is not set +# end of Frame buffer Devices + +# +# Backlight & LCD device support +# +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_L4F00242T03=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI922X=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m +CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m 
+CONFIG_LCD_AMS369FG06=m +CONFIG_LCD_LMS501KF03=m +CONFIG_LCD_HX8357=m +CONFIG_LCD_OTM3225A=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_BACKLIGHT_GENERIC=m +CONFIG_BACKLIGHT_LM3533=m +CONFIG_BACKLIGHT_PWM=m +CONFIG_BACKLIGHT_DA903X=m +CONFIG_BACKLIGHT_DA9052=m +CONFIG_BACKLIGHT_MAX8925=m +CONFIG_BACKLIGHT_APPLE=m +CONFIG_BACKLIGHT_QCOM_WLED=m +CONFIG_BACKLIGHT_SAHARA=m +CONFIG_BACKLIGHT_WM831X=m +CONFIG_BACKLIGHT_ADP5520=m +CONFIG_BACKLIGHT_ADP8860=m +CONFIG_BACKLIGHT_ADP8870=m +CONFIG_BACKLIGHT_88PM860X=m +CONFIG_BACKLIGHT_PCF50633=m +CONFIG_BACKLIGHT_AAT2870=m +CONFIG_BACKLIGHT_LM3630A=m +CONFIG_BACKLIGHT_LM3639=m +CONFIG_BACKLIGHT_LP855X=m +CONFIG_BACKLIGHT_LP8788=m +CONFIG_BACKLIGHT_PANDORA=m +CONFIG_BACKLIGHT_SKY81452=m +CONFIG_BACKLIGHT_TPS65217=m +CONFIG_BACKLIGHT_AS3711=m +CONFIG_BACKLIGHT_GPIO=m +CONFIG_BACKLIGHT_LV5207LP=m +CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m +CONFIG_BACKLIGHT_RAVE_SP=m +CONFIG_BACKLIGHT_LED=m +# end of Backlight & LCD device support + +CONFIG_VIDEOMODE_HELPERS=y +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 +# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y +# end of Console display driver support + +# CONFIG_LOGO is not set +# end of Graphics support + +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_SEQ_DEVICE=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=m +CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_PCM_TIMER=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +# CONFIG_SND_SUPPORT_OLD_API is not set +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +CONFIG_SND_VERBOSE_PRINTK=y +CONFIG_SND_DEBUG=y +# CONFIG_SND_DEBUG_VERBOSE is not set +# CONFIG_SND_PCM_XRUN_DEBUG is not set +# CONFIG_SND_CTL_VALIDATION is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_SEQ_MIDI_EVENT=m +CONFIG_SND_SEQ_MIDI=m +CONFIG_SND_SEQ_MIDI_EMUL=m +CONFIG_SND_SEQ_VIRMIDI=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL3_LIB_SEQ=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +# CONFIG_SND_PCSP is not set +CONFIG_SND_DUMMY=m +CONFIG_SND_ALOOP=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ASIHPI=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +# CONFIG_SND_BT87X_OVERCLOCK is not set +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m 
+CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m +CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_ES1968_INPUT=y +CONFIG_SND_ES1968_RADIO=y +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X_BOOL=y +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LOLA=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MAESTRO3_INPUT=y +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m + +# +# HD-Audio +# +CONFIG_SND_HDA=m +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_RECONFIG=y +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_BEEP_MODE=1 +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_ANALOG=m +CONFIG_SND_HDA_CODEC_SIGMATEL=m +CONFIG_SND_HDA_CODEC_VIA=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_HDA_CODEC_CIRRUS=m +CONFIG_SND_HDA_CODEC_CONEXANT=m +CONFIG_SND_HDA_CODEC_CA0110=m +CONFIG_SND_HDA_CODEC_CA0132=m +CONFIG_SND_HDA_CODEC_CA0132_DSP=y +CONFIG_SND_HDA_CODEC_CMEDIA=m +CONFIG_SND_HDA_CODEC_SI3054=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +# end of HD-Audio + +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_COMPONENT=y +CONFIG_SND_HDA_I915=y +CONFIG_SND_HDA_EXT_CORE=m +CONFIG_SND_HDA_PREALLOC_SIZE=0 +CONFIG_SND_INTEL_NHLT=y +CONFIG_SND_INTEL_DSP_CONFIG=m +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y +CONFIG_SND_USB_US122L=m +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_FIREWIRE=y +CONFIG_SND_FIREWIRE_LIB=m +CONFIG_SND_DICE=m +CONFIG_SND_OXFW=m +CONFIG_SND_ISIGHT=m +CONFIG_SND_FIREWORKS=m +CONFIG_SND_BEBOB=m +CONFIG_SND_FIREWIRE_DIGI00X=m +CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_AC97_BUS=y +CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +CONFIG_SND_SOC_TOPOLOGY=y +CONFIG_SND_SOC_ACPI=m +CONFIG_SND_SOC_AMD_ACP=m +CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m +CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m +CONFIG_SND_SOC_AMD_ACP3x=m +CONFIG_SND_SOC_AMD_RV_RT5682_MACH=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_SOC_MIKROE_PROTO=m +CONFIG_SND_BCM63XX_I2S_WHISTLER=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +# CONFIG_SND_SOC_FSL_ASRC is not set +# CONFIG_SND_SOC_FSL_SAI is not set +# CONFIG_SND_SOC_FSL_AUDMIX is not set +# 
CONFIG_SND_SOC_FSL_SSI is not set +# CONFIG_SND_SOC_FSL_SPDIF is not set +# CONFIG_SND_SOC_FSL_ESAI is not set +# CONFIG_SND_SOC_FSL_MICFIL is not set +# CONFIG_SND_SOC_IMX_AUDMUX is not set +# end of SoC Audio for Freescale CPUs + +CONFIG_SND_I2S_HI6210_I2S=m +CONFIG_SND_SOC_IMG=y +CONFIG_SND_SOC_IMG_I2S_IN=m +CONFIG_SND_SOC_IMG_I2S_OUT=m +CONFIG_SND_SOC_IMG_PARALLEL_OUT=m +CONFIG_SND_SOC_IMG_SPDIF_IN=m +CONFIG_SND_SOC_IMG_SPDIF_OUT=m +CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m +CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y +CONFIG_SND_SST_IPC=m +CONFIG_SND_SST_IPC_PCI=m +CONFIG_SND_SST_IPC_ACPI=m +CONFIG_SND_SOC_INTEL_SST_ACPI=m +CONFIG_SND_SOC_INTEL_SST=m +CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m +CONFIG_SND_SOC_INTEL_HASWELL=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m +CONFIG_SND_SOC_INTEL_SKYLAKE=m +CONFIG_SND_SOC_INTEL_SKL=m +CONFIG_SND_SOC_INTEL_APL=m +CONFIG_SND_SOC_INTEL_KBL=m +CONFIG_SND_SOC_INTEL_GLK=m +CONFIG_SND_SOC_INTEL_CNL=m +CONFIG_SND_SOC_INTEL_CFL=m +CONFIG_SND_SOC_INTEL_CML_H=m +CONFIG_SND_SOC_INTEL_CML_LP=m +CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m +CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m +# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set +CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m +CONFIG_SND_SOC_ACPI_INTEL_MATCH=m +CONFIG_SND_SOC_INTEL_MACH=y +# CONFIG_SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES is not set +CONFIG_SND_SOC_INTEL_HASWELL_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5650_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m +CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m +# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set +CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m +CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m +CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m +CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH=m +CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m +CONFIG_SND_SOC_INTEL_SOF_DA7219_MAX98373_MACH=m +CONFIG_SND_SOC_MTK_BTCVSD=m +CONFIG_SND_SOC_SOF_TOPLEVEL=y +CONFIG_SND_SOC_SOF_PCI=m +CONFIG_SND_SOC_SOF_ACPI=m +CONFIG_SND_SOC_SOF_OF=m +# CONFIG_SND_SOC_SOF_DEBUG_PROBES is not set +# CONFIG_SND_SOC_SOF_DEVELOPER_SUPPORT is not set +CONFIG_SND_SOC_SOF=m +CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y +CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y +CONFIG_SND_SOC_SOF_INTEL_ACPI=m +CONFIG_SND_SOC_SOF_INTEL_PCI=m +CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m +CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m +CONFIG_SND_SOC_SOF_INTEL_COMMON=m +CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y 
+CONFIG_SND_SOC_SOF_MERRIFIELD=m +CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_APOLLOLAKE=m +CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_GEMINILAKE=m +CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_CANNONLAKE=m +CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_COFFEELAKE=m +CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ICELAKE=m +CONFIG_SND_SOC_SOF_COMETLAKE_LP=m +CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y +CONFIG_SND_SOC_SOF_COMETLAKE_H=m +CONFIG_SND_SOC_SOF_COMETLAKE_H_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE=m +CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ELKHARTLAKE=m +CONFIG_SND_SOC_SOF_JASPERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_JASPERLAKE=m +CONFIG_SND_SOC_SOF_HDA_COMMON=m +CONFIG_SND_SOC_SOF_HDA_LINK=y +CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y +# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set +CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m +CONFIG_SND_SOC_SOF_HDA=m +CONFIG_SND_SOC_SOF_XTENSA=m + +# +# STMicroelectronics STM32 SOC audio support +# +# end of STMicroelectronics STM32 SOC audio support + +CONFIG_SND_SOC_XILINX_I2S=m +CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m +CONFIG_SND_SOC_XILINX_SPDIF=m +CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m +CONFIG_SND_SOC_I2C_AND_SPI=m + +# +# CODEC drivers +# +CONFIG_SND_SOC_AC97_CODEC=m +CONFIG_SND_SOC_ADAU_UTILS=m +CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m +CONFIG_SND_SOC_ADAU7002=m +CONFIG_SND_SOC_ADAU7118=m +CONFIG_SND_SOC_ADAU7118_HW=m +CONFIG_SND_SOC_ADAU7118_I2C=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4118=m +CONFIG_SND_SOC_AK4458=m +CONFIG_SND_SOC_AK4554=m +CONFIG_SND_SOC_AK4613=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK5386=m +CONFIG_SND_SOC_AK5558=m +CONFIG_SND_SOC_ALC5623=m +CONFIG_SND_SOC_BD28623=m +# CONFIG_SND_SOC_BT_SCO is not set +CONFIG_SND_SOC_CPCAP=m +CONFIG_SND_SOC_CROS_EC_CODEC=m +CONFIG_SND_SOC_CS35L32=m +CONFIG_SND_SOC_CS35L33=m +CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m +CONFIG_SND_SOC_CS35L36=m +CONFIG_SND_SOC_CS42L42=m +CONFIG_SND_SOC_CS42L51=m +CONFIG_SND_SOC_CS42L51_I2C=m +CONFIG_SND_SOC_CS42L52=m +CONFIG_SND_SOC_CS42L56=m +CONFIG_SND_SOC_CS42L73=m +CONFIG_SND_SOC_CS4265=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +CONFIG_SND_SOC_CS4271_SPI=m +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +CONFIG_SND_SOC_CS43130=m +CONFIG_SND_SOC_CS4341=m +CONFIG_SND_SOC_CS4349=m +CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_CX2072X=m +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m +CONFIG_SND_SOC_ES7241=m +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8328=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_GTM601=m +CONFIG_SND_SOC_HDAC_HDMI=m +CONFIG_SND_SOC_HDAC_HDA=m +CONFIG_SND_SOC_INNO_RK3036=m +CONFIG_SND_SOC_LOCHNAGAR_SC=m +CONFIG_SND_SOC_MAX98088=m +CONFIG_SND_SOC_MAX98090=m +CONFIG_SND_SOC_MAX98357A=m +CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX9867=m +CONFIG_SND_SOC_MAX98927=m +CONFIG_SND_SOC_MAX98373=m +CONFIG_SND_SOC_MAX9860=m +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m +CONFIG_SND_SOC_PCM1681=m +CONFIG_SND_SOC_PCM1789=m +CONFIG_SND_SOC_PCM1789_I2C=m +CONFIG_SND_SOC_PCM179X=m +CONFIG_SND_SOC_PCM179X_I2C=m +CONFIG_SND_SOC_PCM179X_SPI=m +CONFIG_SND_SOC_PCM186X=m +CONFIG_SND_SOC_PCM186X_I2C=m +CONFIG_SND_SOC_PCM186X_SPI=m 
+CONFIG_SND_SOC_PCM3060=m +CONFIG_SND_SOC_PCM3060_I2C=m +CONFIG_SND_SOC_PCM3060_SPI=m +CONFIG_SND_SOC_PCM3168A=m +CONFIG_SND_SOC_PCM3168A_I2C=m +CONFIG_SND_SOC_PCM3168A_SPI=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_RK3328=m +CONFIG_SND_SOC_RL6231=m +CONFIG_SND_SOC_RL6347A=m +CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m +CONFIG_SND_SOC_RT1011=m +CONFIG_SND_SOC_RT1015=m +CONFIG_SND_SOC_RT1308_SDW=m +CONFIG_SND_SOC_RT5514=m +CONFIG_SND_SOC_RT5514_SPI=m +CONFIG_SND_SOC_RT5616=m +CONFIG_SND_SOC_RT5631=m +CONFIG_SND_SOC_RT5640=m +CONFIG_SND_SOC_RT5645=m +CONFIG_SND_SOC_RT5651=m +CONFIG_SND_SOC_RT5660=m +CONFIG_SND_SOC_RT5663=m +CONFIG_SND_SOC_RT5670=m +CONFIG_SND_SOC_RT5677=m +CONFIG_SND_SOC_RT5677_SPI=m +CONFIG_SND_SOC_RT5682=m +CONFIG_SND_SOC_RT5682_SDW=m +CONFIG_SND_SOC_RT700=m +CONFIG_SND_SOC_RT700_SDW=m +CONFIG_SND_SOC_RT711=m +CONFIG_SND_SOC_RT711_SDW=m +CONFIG_SND_SOC_RT715=m +CONFIG_SND_SOC_RT715_SDW=m +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SI476X=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m +CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m +CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2305=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_SSM2602_SPI=m +CONFIG_SND_SOC_SSM2602_I2C=m +CONFIG_SND_SOC_SSM4567=m +CONFIG_SND_SOC_STA32X=m +CONFIG_SND_SOC_STA350=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SOC_TAS2552=m +CONFIG_SND_SOC_TAS2562=m +CONFIG_SND_SOC_TAS2770=m +CONFIG_SND_SOC_TAS5086=m +CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m +CONFIG_SND_SOC_TAS6424=m +CONFIG_SND_SOC_TDA7419=m +CONFIG_SND_SOC_TFA9879=m +CONFIG_SND_SOC_TLV320AIC23=m +CONFIG_SND_SOC_TLV320AIC23_I2C=m +CONFIG_SND_SOC_TLV320AIC23_SPI=m +CONFIG_SND_SOC_TLV320AIC31XX=m +CONFIG_SND_SOC_TLV320AIC32X4=m +CONFIG_SND_SOC_TLV320AIC32X4_I2C=m +CONFIG_SND_SOC_TLV320AIC32X4_SPI=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TLV320ADCX140=m +CONFIG_SND_SOC_TS3A227E=m +CONFIG_SND_SOC_TSCS42XX=m +CONFIG_SND_SOC_TSCS454=m +CONFIG_SND_SOC_UDA1334=m +CONFIG_SND_SOC_WCD9335=m +CONFIG_SND_SOC_WCD934X=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8524=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8737=m +CONFIG_SND_SOC_WM8741=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8770=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8782=m +CONFIG_SND_SOC_WM8804=m +CONFIG_SND_SOC_WM8804_I2C=m +CONFIG_SND_SOC_WM8804_SPI=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8904=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8962=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_WM8985=m +CONFIG_SND_SOC_WSA881X=m +CONFIG_SND_SOC_ZX_AUD96P22=m +CONFIG_SND_SOC_MAX9759=m +CONFIG_SND_SOC_MT6351=m +CONFIG_SND_SOC_MT6358=m +CONFIG_SND_SOC_MT6660=m +CONFIG_SND_SOC_NAU8540=m +CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8822=m +CONFIG_SND_SOC_NAU8824=m +CONFIG_SND_SOC_NAU8825=m +CONFIG_SND_SOC_TPA6130A2=m +# end of CODEC drivers + +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_AUDIO_GRAPH_CARD=m +CONFIG_SND_X86=y +CONFIG_HDMI_LPE_AUDIO=m +CONFIG_SND_SYNTH_EMUX=m +CONFIG_SND_XEN_FRONTEND=m +CONFIG_AC97_BUS=m + +# +# HID support +# +CONFIG_HID=m +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=m +CONFIG_HID_GENERIC=m + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m +CONFIG_HID_ACRUX=m +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=m 
+CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m +CONFIG_HID_AUREAL=m +CONFIG_HID_BELKIN=m +CONFIG_HID_BETOP_FF=m +CONFIG_HID_BIGBEN_FF=m +CONFIG_HID_CHERRY=m +CONFIG_HID_CHICONY=m +CONFIG_HID_CORSAIR=m +CONFIG_HID_COUGAR=m +CONFIG_HID_MACALLY=m +CONFIG_HID_PRODIKEYS=m +CONFIG_HID_CMEDIA=m +CONFIG_HID_CP2112=m +CONFIG_HID_CREATIVE_SB0540=m +CONFIG_HID_CYPRESS=m +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=m +CONFIG_HID_ELAN=m +CONFIG_HID_ELECOM=m +CONFIG_HID_ELO=m +CONFIG_HID_EZKEY=m +CONFIG_HID_GEMBIRD=m +CONFIG_HID_GFRM=m +CONFIG_HID_GLORIOUS=m +CONFIG_HID_HOLTEK=m +CONFIG_HOLTEK_FF=y +CONFIG_HID_GOOGLE_HAMMER=m +CONFIG_HID_GT683R=m +CONFIG_HID_KEYTOUCH=m +CONFIG_HID_KYE=m +CONFIG_HID_UCLOGIC=m +CONFIG_HID_WALTOP=m +CONFIG_HID_VIEWSONIC=m +CONFIG_HID_GYRATION=m +CONFIG_HID_ICADE=m +CONFIG_HID_ITE=m +CONFIG_HID_JABRA=m +CONFIG_HID_TWINHAN=m +CONFIG_HID_KENSINGTON=m +CONFIG_HID_LCPOWER=m +CONFIG_HID_LED=m +CONFIG_HID_LENOVO=m +CONFIG_HID_LOGITECH=m +CONFIG_HID_LOGITECH_DJ=m +CONFIG_HID_LOGITECH_HIDPP=m +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +CONFIG_HID_MAGICMOUSE=m +CONFIG_HID_MALTRON=m +CONFIG_HID_MAYFLASH=m +CONFIG_HID_REDRAGON=m +CONFIG_HID_MICROSOFT=m +CONFIG_HID_MONTEREY=m +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m +CONFIG_HID_NTRIG=m +CONFIG_HID_ORTEK=m +CONFIG_HID_PANTHERLORD=m +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=m +CONFIG_HID_PETALYNX=m +CONFIG_HID_PICOLCD=m +CONFIG_HID_PICOLCD_FB=y +CONFIG_HID_PICOLCD_BACKLIGHT=y +CONFIG_HID_PICOLCD_LCD=y +CONFIG_HID_PICOLCD_LEDS=y +CONFIG_HID_PICOLCD_CIR=y +CONFIG_HID_PLANTRONICS=m +CONFIG_HID_PRIMAX=m +CONFIG_HID_RETRODE=m +CONFIG_HID_ROCCAT=m +CONFIG_HID_SAITEK=m +CONFIG_HID_SAMSUNG=m +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y +CONFIG_HID_SPEEDLINK=m +CONFIG_HID_STEAM=m +CONFIG_HID_STEELSERIES=m +CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m +CONFIG_HID_GREENASIA=m +CONFIG_GREENASIA_FF=y +CONFIG_HID_HYPERV_MOUSE=m +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=m +CONFIG_HID_TOPSEED=m +CONFIG_HID_THINGM=m +CONFIG_HID_THRUSTMASTER=m +CONFIG_THRUSTMASTER_FF=y +CONFIG_HID_UDRAW_PS3=m +CONFIG_HID_U2FZERO=m +CONFIG_HID_WACOM=m +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=m +CONFIG_HID_ZEROPLUS=m +CONFIG_ZEROPLUS_FF=y +CONFIG_HID_ZYDACRON=m +CONFIG_HID_SENSOR_HUB=m +# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set +CONFIG_HID_ALPS=m +CONFIG_HID_MCP2221=m +# end of Special HID drivers + +# +# USB HID support +# +CONFIG_USB_HID=m +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y + +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +# end of USB HID Boot Protocol drivers +# end of USB HID support + +# +# I2C HID support +# +CONFIG_I2C_HID=m +# end of I2C HID support + +# +# Intel ISH HID support +# +CONFIG_INTEL_ISH_HID=m +CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m +# end of Intel ISH HID support +# end of HID support + +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +CONFIG_USB_LED_TRIG=y +CONFIG_USB_ULPI_BUS=m +CONFIG_USB_CONN_GPIO=m +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=y +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +CONFIG_USB_DYNAMIC_MINORS=y +# CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_LEDS_TRIGGER_USBPORT=m +CONFIG_USB_AUTOSUSPEND_DELAY=2 +CONFIG_USB_MON=m + +# +# USB Host Controller Drivers +# +CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +# 
CONFIG_USB_XHCI_DBGCAP is not set +CONFIG_USB_XHCI_PCI=m +CONFIG_USB_XHCI_PLATFORM=m +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_TT_NEWSCHED=y +CONFIG_USB_EHCI_PCI=m +CONFIG_USB_EHCI_FSL=m +CONFIG_USB_EHCI_HCD_PLATFORM=m +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_FOTG210_HCD=m +CONFIG_USB_MAX3421_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PCI=m +# CONFIG_USB_OHCI_HCD_SSB is not set +CONFIG_USB_OHCI_HCD_PLATFORM=m +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +# CONFIG_USB_SL811_HCD_ISO is not set +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_HCD_BCMA=m +CONFIG_USB_HCD_SSB=m +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_REALTEK=m +CONFIG_REALTEK_AUTOPM=y +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m +CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m +CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_STORAGE_ENE_UB6250=m +CONFIG_USB_UAS=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USBIP_CORE=m +CONFIG_USBIP_VHCI_HCD=m +CONFIG_USBIP_VHCI_HC_PORTS=8 +CONFIG_USBIP_VHCI_NR_HCS=1 +CONFIG_USBIP_HOST=m +CONFIG_USBIP_VUDC=m +# CONFIG_USBIP_DEBUG is not set +CONFIG_USB_CDNS3=m +CONFIG_USB_CDNS3_GADGET=y +CONFIG_USB_CDNS3_HOST=y +CONFIG_USB_CDNS3_PCI_WRAP=m +CONFIG_USB_MUSB_HDRC=m +# CONFIG_USB_MUSB_HOST is not set +# CONFIG_USB_MUSB_GADGET is not set +CONFIG_USB_MUSB_DUAL_ROLE=y + +# +# Platform Glue Layer +# + +# +# MUSB DMA mode +# +# CONFIG_MUSB_PIO_ONLY is not set +CONFIG_USB_DWC3=m +CONFIG_USB_DWC3_ULPI=y +# CONFIG_USB_DWC3_HOST is not set +# CONFIG_USB_DWC3_GADGET is not set +CONFIG_USB_DWC3_DUAL_ROLE=y + +# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_PCI=m +CONFIG_USB_DWC3_HAPS=m +CONFIG_USB_DWC3_OF_SIMPLE=m +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_HOST is not set + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PERIPHERAL is not set +CONFIG_USB_DWC2_DUAL_ROLE=y +CONFIG_USB_DWC2_PCI=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +CONFIG_USB_CHIPIDEA=m +CONFIG_USB_CHIPIDEA_OF=m +CONFIG_USB_CHIPIDEA_PCI=m +CONFIG_USB_CHIPIDEA_UDC=y +CONFIG_USB_CHIPIDEA_HOST=y +CONFIG_USB_ISP1760=m +CONFIG_USB_ISP1760_HCD=y +CONFIG_USB_ISP1761_UDC=y +# CONFIG_USB_ISP1760_HOST_ROLE is not set +# CONFIG_USB_ISP1760_GADGET_ROLE is not set +CONFIG_USB_ISP1760_DUAL_ROLE=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=y +CONFIG_USB_SERIAL_CONSOLE=y +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_SIMPLE=m +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_F81232=m 
+CONFIG_USB_SERIAL_F8153X=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_METRO=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7715_PARPORT=y +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MXUPORT=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QCAUX=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_SAFE=m +# CONFIG_USB_SERIAL_SAFE_PADDED is not set +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_WWAN=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_XSENS_MT=m +CONFIG_USB_SERIAL_WISHBONE=m +CONFIG_USB_SERIAL_SSU100=m +CONFIG_USB_SERIAL_QT2=m +CONFIG_USB_SERIAL_UPD78F0730=m +CONFIG_USB_SERIAL_DEBUG=m + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m +CONFIG_USB_SEVSEG=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m +CONFIG_USB_APPLEDISPLAY=m +CONFIG_APPLE_MFI_FASTCHARGE=m +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +CONFIG_USB_TRANCEVIBRATOR=m +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_EHSET_TEST_FIXTURE=m +CONFIG_USB_ISIGHTFW=m +CONFIG_USB_YUREX=m +CONFIG_USB_EZUSB_FX2=m +CONFIG_USB_HUB_USB251XB=m +CONFIG_USB_HSIC_USB3503=m +CONFIG_USB_HSIC_USB4604=m +CONFIG_USB_LINK_LAYER_TEST=m +CONFIG_USB_CHAOSKEY=m +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y +CONFIG_NOP_USB_XCEIV=m +CONFIG_USB_GPIO_VBUS=m +CONFIG_TAHVO_USB=m +# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set +CONFIG_USB_ISP1301=m +# end of USB Physical Layer drivers + +CONFIG_USB_GADGET=m +# CONFIG_USB_GADGET_DEBUG is not set +# CONFIG_USB_GADGET_DEBUG_FILES is not set +# CONFIG_USB_GADGET_DEBUG_FS is not set +CONFIG_USB_GADGET_VBUS_DRAW=2 +CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 +CONFIG_U_SERIAL_CONSOLE=y + +# +# USB Peripheral Controller +# +CONFIG_USB_FOTG210_UDC=m +CONFIG_USB_GR_UDC=m +CONFIG_USB_R8A66597=m +CONFIG_USB_PXA27X=m +CONFIG_USB_MV_UDC=m +CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m +CONFIG_USB_SNP_UDC_PLAT=m +CONFIG_USB_M66592=m +CONFIG_USB_BDC_UDC=m + +# +# Platform Support +# +CONFIG_USB_BDC_PCI=m +CONFIG_USB_AMD5536UDC=m +CONFIG_USB_NET2272=m +CONFIG_USB_NET2272_DMA=y +CONFIG_USB_NET2280=m +CONFIG_USB_GOKU=m +CONFIG_USB_EG20T=m +CONFIG_USB_GADGET_XILINX=m +CONFIG_USB_MAX3420_UDC=m +CONFIG_USB_DUMMY_HCD=m +# end of USB Peripheral Controller + +CONFIG_USB_LIBCOMPOSITE=m +CONFIG_USB_F_ACM=m +CONFIG_USB_F_SS_LB=m +CONFIG_USB_U_SERIAL=m +CONFIG_USB_U_ETHER=m +CONFIG_USB_U_AUDIO=m +CONFIG_USB_F_SERIAL=m +CONFIG_USB_F_OBEX=m +CONFIG_USB_F_NCM=m +CONFIG_USB_F_ECM=m +CONFIG_USB_F_PHONET=m +CONFIG_USB_F_EEM=m +CONFIG_USB_F_SUBSET=m +CONFIG_USB_F_RNDIS=m +CONFIG_USB_F_MASS_STORAGE=m +CONFIG_USB_F_FS=m +CONFIG_USB_F_UAC1=m +CONFIG_USB_F_UAC1_LEGACY=m +CONFIG_USB_F_UAC2=m +CONFIG_USB_F_UVC=m +CONFIG_USB_F_MIDI=m +CONFIG_USB_F_HID=m +CONFIG_USB_F_PRINTER=m +CONFIG_USB_F_TCM=m +CONFIG_USB_CONFIGFS=m +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y 
+CONFIG_USB_CONFIGFS_OBEX=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_ECM_SUBSET=y +CONFIG_USB_CONFIGFS_RNDIS=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_PHONET=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_LB_SS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_UAC1=y +CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_USB_CONFIGFS_F_PRINTER=y +CONFIG_USB_CONFIGFS_F_TCM=y + +# +# USB Gadget precomposed configurations +# +CONFIG_USB_ZERO=m +CONFIG_USB_AUDIO=m +# CONFIG_GADGET_UAC1 is not set +CONFIG_USB_ETH=m +CONFIG_USB_ETH_RNDIS=y +CONFIG_USB_ETH_EEM=y +CONFIG_USB_G_NCM=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FUNCTIONFS=m +CONFIG_USB_FUNCTIONFS_ETH=y +CONFIG_USB_FUNCTIONFS_RNDIS=y +CONFIG_USB_FUNCTIONFS_GENERIC=y +CONFIG_USB_MASS_STORAGE=m +CONFIG_USB_GADGET_TARGET=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_MIDI_GADGET=m +CONFIG_USB_G_PRINTER=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_USB_G_NOKIA=m +CONFIG_USB_G_ACM_MS=m +CONFIG_USB_G_MULTI=m +CONFIG_USB_G_MULTI_RNDIS=y +CONFIG_USB_G_MULTI_CDC=y +CONFIG_USB_G_HID=m +CONFIG_USB_G_DBGP=m +# CONFIG_USB_G_DBGP_PRINTK is not set +CONFIG_USB_G_DBGP_SERIAL=y +CONFIG_USB_G_WEBCAM=m +CONFIG_USB_RAW_GADGET=m +# end of USB Gadget precomposed configurations + +CONFIG_TYPEC=m +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_RT1711H=m +CONFIG_TYPEC_FUSB302=m +CONFIG_TYPEC_WCOVE=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_CCG=m +CONFIG_UCSI_ACPI=m +CONFIG_TYPEC_HD3SS3220=m +CONFIG_TYPEC_TPS6598X=m + +# +# USB Type-C Multiplexer/DeMultiplexer Switch support +# +CONFIG_TYPEC_MUX_PI3USB30532=m +CONFIG_TYPEC_MUX_INTEL_PMC=m +# end of USB Type-C Multiplexer/DeMultiplexer Switch support + +# +# USB Type-C Alternate Mode drivers +# +CONFIG_TYPEC_DP_ALTMODE=m +CONFIG_TYPEC_NVIDIA_ALTMODE=m +# end of USB Type-C Alternate Mode drivers + +CONFIG_USB_ROLE_SWITCH=m +CONFIG_USB_ROLES_INTEL_XHCI=m +CONFIG_MMC=m +CONFIG_PWRSEQ_EMMC=m +CONFIG_PWRSEQ_SD8787=m +CONFIG_PWRSEQ_SIMPLE=m +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_MINORS=8 +CONFIG_SDIO_UART=m +CONFIG_MMC_TEST=m + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_IO_ACCESSORS=y +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=y +CONFIG_MMC_SDHCI_ACPI=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_SDHCI_OF_ARASAN=m +CONFIG_MMC_SDHCI_OF_ASPEED=m +CONFIG_MMC_SDHCI_OF_AT91=m +CONFIG_MMC_SDHCI_OF_DWCMSHC=m +CONFIG_MMC_SDHCI_CADENCE=m +CONFIG_MMC_SDHCI_F_SDH30=m +CONFIG_MMC_SDHCI_MILBEAUT=m +CONFIG_MMC_WBSD=m +CONFIG_MMC_ALCOR=m +CONFIG_MMC_TIFM_SD=m +CONFIG_MMC_SPI=m +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MMC_VUB300=m +CONFIG_MMC_USHC=m +CONFIG_MMC_USDHI6ROL0=m +CONFIG_MMC_REALTEK_PCI=m +CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_CQHCI=m +CONFIG_MMC_HSQ=m +CONFIG_MMC_TOSHIBA_PCI=m +CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m +CONFIG_MMC_SDHCI_OMAP=m +CONFIG_MMC_SDHCI_AM654=m +CONFIG_MMC_SDHCI_EXTERNAL_DMA=y +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m +CONFIG_MS_BLOCK=m + +# +# MemoryStick Host Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_MEMSTICK_R592=m +CONFIG_MEMSTICK_REALTEK_PCI=m +CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_CLASS_FLASH=m +CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y + +# +# LED drivers 
+# +CONFIG_LEDS_88PM860X=m +CONFIG_LEDS_AAT1290=m +CONFIG_LEDS_AN30259A=m +CONFIG_LEDS_APU=m +CONFIG_LEDS_AS3645A=m +CONFIG_LEDS_BCM6328=m +CONFIG_LEDS_BCM6358=m +CONFIG_LEDS_CPCAP=m +CONFIG_LEDS_CR0014114=m +CONFIG_LEDS_EL15203000=m +CONFIG_LEDS_LM3530=m +CONFIG_LEDS_LM3532=m +CONFIG_LEDS_LM3533=m +CONFIG_LEDS_LM3642=m +CONFIG_LEDS_LM3692X=m +CONFIG_LEDS_LM3601X=m +CONFIG_LEDS_MT6323=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_PCA9532_GPIO=y +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_LP3952=m +# CONFIG_LEDS_LP5521 is not set +# CONFIG_LEDS_LP5523 is not set +# CONFIG_LEDS_LP5562 is not set +# CONFIG_LEDS_LP8501 is not set +CONFIG_LEDS_LP8788=m +CONFIG_LEDS_LP8860=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +CONFIG_LEDS_PCA955X_GPIO=y +CONFIG_LEDS_PCA963X=m +CONFIG_LEDS_WM831X_STATUS=m +CONFIG_LEDS_WM8350=m +CONFIG_LEDS_DA903X=m +CONFIG_LEDS_DA9052=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_PWM=m +CONFIG_LEDS_REGULATOR=m +CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m +CONFIG_LEDS_ADP5520=m +CONFIG_LEDS_MC13783=m +CONFIG_LEDS_TCA6507=m +CONFIG_LEDS_TLC591XX=m +CONFIG_LEDS_MAX77650=m +CONFIG_LEDS_MAX77693=m +CONFIG_LEDS_MAX8997=m +CONFIG_LEDS_LM355x=m +CONFIG_LEDS_MENF21BMC=m +CONFIG_LEDS_KTD2692=m +CONFIG_LEDS_IS31FL319X=m +CONFIG_LEDS_IS31FL32XX=m + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +CONFIG_LEDS_BLINKM=m +CONFIG_LEDS_SYSCON=y +CONFIG_LEDS_MLXCPLD=m +CONFIG_LEDS_MLXREG=m +CONFIG_LEDS_USER=m +CONFIG_LEDS_NIC78BX=m +CONFIG_LEDS_SPI_BYTE=m +CONFIG_LEDS_TI_LMU_COMMON=m +CONFIG_LEDS_LM3697=m +CONFIG_LEDS_LM36274=m +CONFIG_LEDS_TPS6105X=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_ONESHOT=m +CONFIG_LEDS_TRIGGER_DISK=y +CONFIG_LEDS_TRIGGER_MTD=y +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_ACTIVITY=m +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=m +CONFIG_LEDS_TRIGGER_CAMERA=m +CONFIG_LEDS_TRIGGER_PANIC=y +CONFIG_LEDS_TRIGGER_NETDEV=m +CONFIG_LEDS_TRIGGER_PATTERN=m +CONFIG_LEDS_TRIGGER_AUDIO=m +CONFIG_ACCESSIBILITY=y +CONFIG_A11Y_BRAILLE_CONSOLE=y +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_QIB=m +CONFIG_INFINIBAND_QIB_DCA=y +CONFIG_INFINIBAND_CXGB4=m +CONFIG_INFINIBAND_EFA=m +CONFIG_INFINIBAND_I40IW=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_INFINIBAND_OCRDMA=m +CONFIG_INFINIBAND_VMWARE_PVRDMA=m +CONFIG_INFINIBAND_USNIC=m +CONFIG_INFINIBAND_BNXT_RE=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +# CONFIG_SDMA_VERBOSITY is not set +CONFIG_INFINIBAND_QEDR=m +CONFIG_INFINIBAND_RDMAVT=m +CONFIG_RDMA_RXE=m +CONFIG_RDMA_SIW=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_SRPT=m +CONFIG_INFINIBAND_ISER=m +CONFIG_INFINIBAND_ISERT=m +CONFIG_INFINIBAND_OPA_VNIC=m +CONFIG_EDAC_ATOMIC_SCRUB=y +CONFIG_EDAC_SUPPORT=y +CONFIG_EDAC=y +CONFIG_EDAC_LEGACY_SYSFS=y +# CONFIG_EDAC_DEBUG is not set 
+CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_GHES=y +CONFIG_EDAC_AMD64=m +# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set +CONFIG_EDAC_E752X=m +CONFIG_EDAC_I82975X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m +CONFIG_EDAC_X38=m +CONFIG_EDAC_I5400=m +CONFIG_EDAC_I7CORE=m +CONFIG_EDAC_I5000=m +CONFIG_EDAC_I5100=m +CONFIG_EDAC_I7300=m +CONFIG_EDAC_SBRIDGE=m +CONFIG_EDAC_SKX=m +CONFIG_EDAC_I10NM=m +CONFIG_EDAC_PND2=m +CONFIG_RTC_LIB=y +CONFIG_RTC_MC146818_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y +CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_88PM860X=m +CONFIG_RTC_DRV_88PM80X=m +CONFIG_RTC_DRV_ABB5ZES3=m +CONFIG_RTC_DRV_ABEOZ9=m +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_AS3722=m +CONFIG_RTC_DRV_DS1307=m +CONFIG_RTC_DRV_DS1307_CENTURY=y +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1374_WDT=y +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_HYM8563=m +CONFIG_RTC_DRV_LP8788=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_MAX8907=m +CONFIG_RTC_DRV_MAX8925=m +CONFIG_RTC_DRV_MAX8998=m +CONFIG_RTC_DRV_MAX8997=m +CONFIG_RTC_DRV_MAX77686=m +CONFIG_RTC_DRV_RK808=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_ISL12026=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8523=m +CONFIG_RTC_DRV_PCF85063=m +CONFIG_RTC_DRV_PCF85363=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BD70528=m +CONFIG_RTC_DRV_BQ32K=m +CONFIG_RTC_DRV_TWL4030=m +CONFIG_RTC_DRV_PALMAS=m +CONFIG_RTC_DRV_TPS6586X=m +CONFIG_RTC_DRV_TPS65910=m +CONFIG_RTC_DRV_TPS80031=m +CONFIG_RTC_DRV_RC5T583=m +CONFIG_RTC_DRV_RC5T619=m +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_RX8025=m +CONFIG_RTC_DRV_EM3027=m +CONFIG_RTC_DRV_RV3028=m +CONFIG_RTC_DRV_RV8803=m +CONFIG_RTC_DRV_S5M=m +CONFIG_RTC_DRV_SD3078=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T93=m +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m +CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1343=m +CONFIG_RTC_DRV_DS1347=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6916=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RX4581=m +CONFIG_RTC_DRV_RX6110=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_PCF2123=m +CONFIG_RTC_DRV_MCP795=m +CONFIG_RTC_I2C_AND_SPI=y + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=y +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1685_FAMILY=m +CONFIG_RTC_DRV_DS1685=y +# CONFIG_RTC_DRV_DS1689 is not set +# CONFIG_RTC_DRV_DS17285 is not set +# CONFIG_RTC_DRV_DS17485 is not set +# CONFIG_RTC_DRV_DS17885 is not set +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_DS2404=m +CONFIG_RTC_DRV_DA9052=m +CONFIG_RTC_DRV_DA9055=m +CONFIG_RTC_DRV_DA9063=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_WM831X=m +CONFIG_RTC_DRV_WM8350=m +CONFIG_RTC_DRV_PCF50633=m +CONFIG_RTC_DRV_AB3100=m 
+CONFIG_RTC_DRV_ZYNQMP=m +CONFIG_RTC_DRV_CROS_EC=m + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_CADENCE=m +CONFIG_RTC_DRV_FTRTC010=m +CONFIG_RTC_DRV_PCAP=m +CONFIG_RTC_DRV_MC13XXX=m +CONFIG_RTC_DRV_MT6397=m +CONFIG_RTC_DRV_R7301=m +CONFIG_RTC_DRV_CPCAP=m + +# +# HID Sensor RTC drivers +# +CONFIG_RTC_DRV_HID_SENSOR_TIME=m +CONFIG_RTC_DRV_WILCO_EC=m +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_ACPI=y +CONFIG_DMA_OF=y +CONFIG_ALTERA_MSGDMA=m +CONFIG_DW_AXI_DMAC=m +CONFIG_FSL_EDMA=m +CONFIG_INTEL_IDMA64=m +CONFIG_INTEL_IDXD=m +CONFIG_INTEL_IOATDMA=m +CONFIG_INTEL_MIC_X100_DMA=m +CONFIG_PLX_DMA=m +CONFIG_QCOM_HIDMA_MGMT=m +CONFIG_QCOM_HIDMA=m +CONFIG_DW_DMAC_CORE=y +CONFIG_DW_DMAC=y +CONFIG_DW_DMAC_PCI=y +CONFIG_DW_EDMA=m +CONFIG_DW_EDMA_PCIE=m +CONFIG_HSU_DMA=y +CONFIG_SF_PDMA=m + +# +# DMA Clients +# +CONFIG_ASYNC_TX_DMA=y +# CONFIG_DMATEST is not set +CONFIG_DMA_ENGINE_RAID=y + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# CONFIG_SW_SYNC is not set +CONFIG_UDMABUF=y +# CONFIG_DMABUF_MOVE_NOTIFY is not set +# CONFIG_DMABUF_SELFTESTS is not set +CONFIG_DMABUF_HEAPS=y +CONFIG_DMABUF_HEAPS_SYSTEM=y +# end of DMABUF options + +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_HD44780=m +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_IMG_ASCII_LCD=m +CONFIG_HT16K33=m +CONFIG_PARPORT_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set +# CONFIG_CHARLCD_BL_OFF is not set +# CONFIG_CHARLCD_BL_ON is not set +CONFIG_CHARLCD_BL_FLASH=y +CONFIG_PANEL=m +CONFIG_CHARLCD=m +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_UIO_DMEM_GENIRQ=m +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +CONFIG_UIO_PCI_GENERIC=m +CONFIG_UIO_NETX=m +CONFIG_UIO_PRUSS=m +CONFIG_UIO_MF624=m +CONFIG_UIO_HV_GENERIC=m +CONFIG_VFIO_IOMMU_TYPE1=m +CONFIG_VFIO_VIRQFD=m +CONFIG_VFIO=m +# CONFIG_VFIO_NOIOMMU is not set +CONFIG_VFIO_PCI=m +CONFIG_VFIO_PCI_VGA=y +CONFIG_VFIO_PCI_MMAP=y +CONFIG_VFIO_PCI_INTX=y +CONFIG_VFIO_PCI_IGD=y +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VBOXGUEST=m +CONFIG_VIRTIO=y +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_VDPA=m +CONFIG_VIRTIO_PMEM=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=m +CONFIG_VIRTIO_MMIO=m +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_IFCVF=m +CONFIG_VHOST_IOTLB=m +CONFIG_VHOST_RING=m +CONFIG_VHOST_DPN=y +CONFIG_VHOST=m +CONFIG_VHOST_MENU=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_SCSI=m +CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set + +# +# Microsoft Hyper-V guest support +# +CONFIG_HYPERV=m +CONFIG_HYPERV_TIMER=y +CONFIG_HYPERV_UTILS=m +CONFIG_HYPERV_BALLOON=m +# end of Microsoft Hyper-V guest support + +# +# Xen driver support +# +CONFIG_XEN_BALLOON=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 +CONFIG_XEN_SCRUB_PAGES_DEFAULT=y +CONFIG_XEN_DEV_EVTCHN=m +CONFIG_XEN_BACKEND=y +CONFIG_XENFS=m +CONFIG_XEN_COMPAT_XENFS=y +CONFIG_XEN_SYS_HYPERVISOR=y +CONFIG_XEN_XENBUS_FRONTEND=y +CONFIG_XEN_GNTDEV=m +CONFIG_XEN_GNTDEV_DMABUF=y +CONFIG_XEN_GRANT_DEV_ALLOC=m +CONFIG_XEN_GRANT_DMA_ALLOC=y +CONFIG_SWIOTLB_XEN=y +CONFIG_XEN_PCIDEV_BACKEND=m +CONFIG_XEN_PVCALLS_FRONTEND=m +CONFIG_XEN_PVCALLS_BACKEND=y +CONFIG_XEN_SCSI_BACKEND=m +CONFIG_XEN_PRIVCMD=m 
+CONFIG_XEN_ACPI_PROCESSOR=m +CONFIG_XEN_MCE_LOG=y +CONFIG_XEN_HAVE_PVMMU=y +CONFIG_XEN_EFI=y +CONFIG_XEN_AUTO_XLATE=y +CONFIG_XEN_ACPI=y +CONFIG_XEN_SYMS=y +CONFIG_XEN_HAVE_VPMU=y +CONFIG_XEN_FRONT_PGDIR_SHBUF=m +# end of Xen driver support + +# CONFIG_GREYBUS is not set +CONFIG_STAGING=y +CONFIG_PRISM2_USB=m +CONFIG_COMEDI=m +# CONFIG_COMEDI_DEBUG is not set +CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 +CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 +CONFIG_COMEDI_MISC_DRIVERS=y +CONFIG_COMEDI_BOND=m +CONFIG_COMEDI_TEST=m +CONFIG_COMEDI_PARPORT=m +# CONFIG_COMEDI_ISA_DRIVERS is not set +CONFIG_COMEDI_PCI_DRIVERS=m +CONFIG_COMEDI_8255_PCI=m +CONFIG_COMEDI_ADDI_WATCHDOG=m +CONFIG_COMEDI_ADDI_APCI_1032=m +CONFIG_COMEDI_ADDI_APCI_1500=m +CONFIG_COMEDI_ADDI_APCI_1516=m +CONFIG_COMEDI_ADDI_APCI_1564=m +CONFIG_COMEDI_ADDI_APCI_16XX=m +CONFIG_COMEDI_ADDI_APCI_2032=m +CONFIG_COMEDI_ADDI_APCI_2200=m +CONFIG_COMEDI_ADDI_APCI_3120=m +CONFIG_COMEDI_ADDI_APCI_3501=m +CONFIG_COMEDI_ADDI_APCI_3XXX=m +CONFIG_COMEDI_ADL_PCI6208=m +CONFIG_COMEDI_ADL_PCI7X3X=m +CONFIG_COMEDI_ADL_PCI8164=m +CONFIG_COMEDI_ADL_PCI9111=m +CONFIG_COMEDI_ADL_PCI9118=m +CONFIG_COMEDI_ADV_PCI1710=m +CONFIG_COMEDI_ADV_PCI1720=m +CONFIG_COMEDI_ADV_PCI1723=m +CONFIG_COMEDI_ADV_PCI1724=m +CONFIG_COMEDI_ADV_PCI1760=m +CONFIG_COMEDI_ADV_PCI_DIO=m +CONFIG_COMEDI_AMPLC_DIO200_PCI=m +CONFIG_COMEDI_AMPLC_PC236_PCI=m +CONFIG_COMEDI_AMPLC_PC263_PCI=m +CONFIG_COMEDI_AMPLC_PCI224=m +CONFIG_COMEDI_AMPLC_PCI230=m +CONFIG_COMEDI_CONTEC_PCI_DIO=m +CONFIG_COMEDI_DAS08_PCI=m +CONFIG_COMEDI_DT3000=m +CONFIG_COMEDI_DYNA_PCI10XX=m +CONFIG_COMEDI_GSC_HPDI=m +CONFIG_COMEDI_MF6X4=m +CONFIG_COMEDI_ICP_MULTI=m +CONFIG_COMEDI_DAQBOARD2000=m +CONFIG_COMEDI_JR3_PCI=m +CONFIG_COMEDI_KE_COUNTER=m +CONFIG_COMEDI_CB_PCIDAS64=m +CONFIG_COMEDI_CB_PCIDAS=m +CONFIG_COMEDI_CB_PCIDDA=m +CONFIG_COMEDI_CB_PCIMDAS=m +CONFIG_COMEDI_CB_PCIMDDA=m +CONFIG_COMEDI_ME4000=m +CONFIG_COMEDI_ME_DAQ=m +CONFIG_COMEDI_NI_6527=m +CONFIG_COMEDI_NI_65XX=m +CONFIG_COMEDI_NI_660X=m +CONFIG_COMEDI_NI_670X=m +CONFIG_COMEDI_NI_LABPC_PCI=m +CONFIG_COMEDI_NI_PCIDIO=m +CONFIG_COMEDI_NI_PCIMIO=m +CONFIG_COMEDI_RTD520=m +CONFIG_COMEDI_S626=m +CONFIG_COMEDI_MITE=m +CONFIG_COMEDI_NI_TIOCMD=m +CONFIG_COMEDI_PCMCIA_DRIVERS=m +CONFIG_COMEDI_CB_DAS16_CS=m +CONFIG_COMEDI_DAS08_CS=m +CONFIG_COMEDI_NI_DAQ_700_CS=m +CONFIG_COMEDI_NI_DAQ_DIO24_CS=m +CONFIG_COMEDI_NI_LABPC_CS=m +CONFIG_COMEDI_NI_MIO_CS=m +CONFIG_COMEDI_QUATECH_DAQP_CS=m +CONFIG_COMEDI_USB_DRIVERS=m +CONFIG_COMEDI_DT9812=m +CONFIG_COMEDI_NI_USB6501=m +CONFIG_COMEDI_USBDUX=m +CONFIG_COMEDI_USBDUXFAST=m +CONFIG_COMEDI_USBDUXSIGMA=m +CONFIG_COMEDI_VMK80XX=m +CONFIG_COMEDI_8254=m +CONFIG_COMEDI_8255=m +CONFIG_COMEDI_8255_SA=m +CONFIG_COMEDI_KCOMEDILIB=m +CONFIG_COMEDI_AMPLC_DIO200=m +CONFIG_COMEDI_AMPLC_PC236=m +CONFIG_COMEDI_DAS08=m +CONFIG_COMEDI_NI_LABPC=m +CONFIG_COMEDI_NI_TIO=m +CONFIG_COMEDI_NI_ROUTING=m +CONFIG_RTL8192U=m +CONFIG_RTLLIB=m +CONFIG_RTLLIB_CRYPTO_CCMP=m +CONFIG_RTLLIB_CRYPTO_TKIP=m +CONFIG_RTLLIB_CRYPTO_WEP=m +CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +CONFIG_88EU_AP_MODE=y +CONFIG_RTS5208=m +CONFIG_VT6655=m +CONFIG_VT6656=m + +# +# IIO staging drivers +# + +# +# Accelerometers +# +CONFIG_ADIS16203=m +CONFIG_ADIS16240=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD7816=m +CONFIG_AD7280=m +# end of Analog to digital converters + +# +# Analog digital bi-direction converters +# +CONFIG_ADT7316=m +CONFIG_ADT7316_SPI=m +CONFIG_ADT7316_I2C=m +# end of Analog digital bi-direction 
converters + +# +# Capacitance to digital converters +# +CONFIG_AD7150=m +CONFIG_AD7746=m +# end of Capacitance to digital converters + +# +# Direct Digital Synthesis +# +CONFIG_AD9832=m +CONFIG_AD9834=m +# end of Direct Digital Synthesis + +# +# Network Analyzer, Impedance Converters +# +CONFIG_AD5933=m +# end of Network Analyzer, Impedance Converters + +# +# Active energy metering IC +# +CONFIG_ADE7854=m +CONFIG_ADE7854_I2C=m +CONFIG_ADE7854_SPI=m +# end of Active energy metering IC + +# +# Resolver to digital converters +# +CONFIG_AD2S1210=m +# end of Resolver to digital converters +# end of IIO staging drivers + +# CONFIG_FB_SM750 is not set + +# +# Speakup console speech +# +CONFIG_SPEAKUP=m +CONFIG_SPEAKUP_SYNTH_ACNTSA=m +CONFIG_SPEAKUP_SYNTH_APOLLO=m +CONFIG_SPEAKUP_SYNTH_AUDPTR=m +CONFIG_SPEAKUP_SYNTH_BNS=m +CONFIG_SPEAKUP_SYNTH_DECTLK=m +CONFIG_SPEAKUP_SYNTH_DECEXT=m +CONFIG_SPEAKUP_SYNTH_LTLK=m +CONFIG_SPEAKUP_SYNTH_SOFT=m +CONFIG_SPEAKUP_SYNTH_SPKOUT=m +CONFIG_SPEAKUP_SYNTH_TXPRT=m +CONFIG_SPEAKUP_SYNTH_DUMMY=m +# end of Speakup console speech + +CONFIG_STAGING_MEDIA=y +CONFIG_VIDEO_IPU3_IMGU=m + +# +# soc_camera sensor drivers +# +CONFIG_VIDEO_USBVISION=m + +# +# Android +# +# end of Android + +CONFIG_STAGING_BOARD=y +CONFIG_LTE_GDM724X=m +CONFIG_FIREWIRE_SERIAL=m +CONFIG_FWTTY_MAX_TOTAL_PORTS=64 +CONFIG_FWTTY_MAX_CARD_PORTS=32 +CONFIG_GS_FPGABOOT=m +CONFIG_UNISYSSPAR=y +CONFIG_UNISYS_VISORNIC=m +CONFIG_UNISYS_VISORINPUT=m +CONFIG_UNISYS_VISORHBA=m +CONFIG_COMMON_CLK_XLNX_CLKWZRD=m +# CONFIG_FB_TFT is not set +CONFIG_WILC1000=m +CONFIG_WILC1000_SDIO=m +CONFIG_WILC1000_SPI=m +# CONFIG_WILC1000_HW_OOB_INTR is not set +CONFIG_MOST_COMPONENTS=m +CONFIG_MOST_CDEV=m +CONFIG_MOST_NET=m +CONFIG_MOST_SOUND=m +CONFIG_MOST_VIDEO=m +CONFIG_MOST_DIM2=m +CONFIG_MOST_I2C=m +CONFIG_MOST_USB=m +CONFIG_KS7010=m +CONFIG_PI433=m + +# +# Gasket devices +# +CONFIG_STAGING_GASKET_FRAMEWORK=m +CONFIG_STAGING_APEX_DRIVER=m +# end of Gasket devices + +CONFIG_XIL_AXIS_FIFO=m +CONFIG_FIELDBUS_DEV=m +CONFIG_HMS_ANYBUSS_BUS=m +CONFIG_ARCX_ANYBUS_CONTROLLER=m +CONFIG_HMS_PROFINET=m +CONFIG_KPC2000=y +CONFIG_KPC2000_CORE=m +CONFIG_KPC2000_SPI=m +CONFIG_KPC2000_I2C=m +CONFIG_KPC2000_DMA=m +CONFIG_QLGE=m +CONFIG_WFX=m +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACPI_WMI=m +CONFIG_WMI_BMOF=m +CONFIG_ALIENWARE_WMI=m +CONFIG_HUAWEI_WMI=m +CONFIG_INTEL_WMI_THUNDERBOLT=m +CONFIG_MXM_WMI=m +CONFIG_PEAQ_WMI=m +CONFIG_XIAOMI_WMI=m +CONFIG_ACERHDF=m +CONFIG_ACER_WIRELESS=m +CONFIG_ACER_WMI=m +CONFIG_APPLE_GMUX=m +CONFIG_ASUS_LAPTOP=m +CONFIG_ASUS_WIRELESS=m +CONFIG_ASUS_WMI=m +CONFIG_ASUS_NB_WMI=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_EEEPC_WMI=m +CONFIG_DCDBAS=m +CONFIG_DELL_SMBIOS=m +CONFIG_DELL_SMBIOS_WMI=y +CONFIG_DELL_SMBIOS_SMM=y +CONFIG_DELL_LAPTOP=m +CONFIG_DELL_RBTN=m +# CONFIG_DELL_RBU is not set +CONFIG_DELL_SMO8800=m +CONFIG_DELL_WMI=m +CONFIG_DELL_WMI_DESCRIPTOR=m +CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m +CONFIG_AMILO_RFKILL=m +CONFIG_FUJITSU_LAPTOP=m +CONFIG_FUJITSU_TABLET=m +CONFIG_GPD_POCKET_FAN=m +CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m +CONFIG_HP_WMI=m +CONFIG_IBM_RTL=m +CONFIG_IDEAPAD_LAPTOP=m +CONFIG_SENSORS_HDAPS=m +CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +CONFIG_INTEL_ATOMISP2_PM=m +CONFIG_INTEL_CHT_INT33FE=m +CONFIG_INTEL_HID_EVENT=m +CONFIG_INTEL_INT0002_VGPIO=m 
+CONFIG_INTEL_MENLOW=m +CONFIG_INTEL_OAKTRAIL=m +CONFIG_INTEL_VBTN=m +CONFIG_SURFACE3_WMI=m +CONFIG_SURFACE_3_BUTTON=m +CONFIG_SURFACE_3_POWER_OPREGION=m +CONFIG_SURFACE_PRO3_BUTTON=m +CONFIG_MSI_LAPTOP=m +CONFIG_MSI_WMI=m +CONFIG_PCENGINES_APU2=m +CONFIG_SAMSUNG_LAPTOP=m +CONFIG_SAMSUNG_Q10=m +CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_TOSHIBA_HAPS=m +CONFIG_TOSHIBA_WMI=m +CONFIG_ACPI_CMPC=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_LG_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_SONY_LAPTOP=m +CONFIG_SONYPI_COMPAT=y +CONFIG_SYSTEM76_ACPI=m +CONFIG_TOPSTAR_LAPTOP=m +CONFIG_I2C_MULTI_INSTANTIATE=m +CONFIG_MLX_PLATFORM=m +CONFIG_TOUCHSCREEN_DMI=y +CONFIG_INTEL_IPS=m +CONFIG_INTEL_RST=m +CONFIG_INTEL_SMARTCONNECT=m + +# +# Intel Speed Select Technology interface support +# +CONFIG_INTEL_SPEED_SELECT_INTERFACE=m +# end of Intel Speed Select Technology interface support + +CONFIG_INTEL_TURBO_MAX_3=y +CONFIG_INTEL_UNCORE_FREQ_CONTROL=m +CONFIG_INTEL_BXTWC_PMIC_TMU=m +CONFIG_INTEL_CHTDC_TI_PWRBTN=m +CONFIG_INTEL_PMC_CORE=y +CONFIG_INTEL_PMC_IPC=m +CONFIG_INTEL_PUNIT_IPC=m +CONFIG_INTEL_TELEMETRY=m +CONFIG_PMC_ATOM=y +CONFIG_MFD_CROS_EC=m +CONFIG_CHROME_PLATFORMS=y +CONFIG_CHROMEOS_LAPTOP=m +CONFIG_CHROMEOS_PSTORE=m +CONFIG_CHROMEOS_TBMC=m +CONFIG_CROS_EC=m +CONFIG_CROS_EC_I2C=m +CONFIG_CROS_EC_RPMSG=m +CONFIG_CROS_EC_ISHTP=m +CONFIG_CROS_EC_SPI=m +CONFIG_CROS_EC_LPC=m +CONFIG_CROS_EC_PROTO=y +CONFIG_CROS_KBD_LED_BACKLIGHT=m +CONFIG_CROS_EC_CHARDEV=m +CONFIG_CROS_EC_LIGHTBAR=m +CONFIG_CROS_EC_VBC=m +# CONFIG_CROS_EC_DEBUGFS is not set +CONFIG_CROS_EC_SENSORHUB=m +CONFIG_CROS_EC_SYSFS=m +CONFIG_CROS_EC_TYPEC=m +CONFIG_CROS_USBPD_LOGGER=m +CONFIG_CROS_USBPD_NOTIFY=m +CONFIG_WILCO_EC=m +# CONFIG_WILCO_EC_DEBUGFS is not set +CONFIG_WILCO_EC_EVENTS=m +CONFIG_WILCO_EC_TELEMETRY=m +CONFIG_MELLANOX_PLATFORM=y +CONFIG_MLXREG_HOTPLUG=m +CONFIG_MLXREG_IO=m +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y + +# +# Common Clock Framework +# +CONFIG_COMMON_CLK_WM831X=m +CONFIG_CLK_HSDK=y +CONFIG_COMMON_CLK_MAX77686=m +CONFIG_COMMON_CLK_MAX9485=m +CONFIG_COMMON_CLK_RK808=m +CONFIG_COMMON_CLK_SI5341=m +CONFIG_COMMON_CLK_SI5351=m +CONFIG_COMMON_CLK_SI514=m +CONFIG_COMMON_CLK_SI544=m +CONFIG_COMMON_CLK_SI570=m +CONFIG_COMMON_CLK_CDCE706=m +CONFIG_COMMON_CLK_CDCE925=m +CONFIG_COMMON_CLK_CS2000_CP=m +CONFIG_COMMON_CLK_S2MPS11=m +CONFIG_CLK_TWL6040=m +CONFIG_COMMON_CLK_LOCHNAGAR=m +CONFIG_COMMON_CLK_PALMAS=m +CONFIG_COMMON_CLK_PWM=m +CONFIG_COMMON_CLK_VC5=m +CONFIG_COMMON_CLK_BD718XX=m +CONFIG_COMMON_CLK_FIXED_MMIO=y +# end of Common Clock Framework + +CONFIG_HWSPINLOCK=y + +# +# Clock Source drivers +# +CONFIG_TIMER_OF=y +CONFIG_TIMER_PROBE=y +CONFIG_CLKEVT_I8253=y +CONFIG_I8253_LOCK=y +CONFIG_CLKBLD_I8253=y +CONFIG_CLKSRC_MMIO=y +CONFIG_MICROCHIP_PIT64B=y +# end of Clock Source drivers + +CONFIG_MAILBOX=y +CONFIG_PLATFORM_MHU=m +CONFIG_PCC=y +CONFIG_ALTERA_MBOX=m +CONFIG_MAILBOX_TEST=m +CONFIG_IOMMU_IOVA=y +CONFIG_IOASID=y +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +# end of Generic IOMMU Pagetable Support + +# CONFIG_IOMMU_DEBUGFS is not set +# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +CONFIG_OF_IOMMU=y +CONFIG_IOMMU_DMA=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_V2=y +CONFIG_DMAR_TABLE=y +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU_FLOPPY_WA=y +# CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set +CONFIG_IRQ_REMAP=y +CONFIG_HYPERV_IOMMU=y + +# +# Remoteproc drivers +# 
+CONFIG_REMOTEPROC=y +# end of Remoteproc drivers + +# +# Rpmsg drivers +# +CONFIG_RPMSG=m +CONFIG_RPMSG_CHAR=m +CONFIG_RPMSG_QCOM_GLINK_NATIVE=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m +CONFIG_RPMSG_VIRTIO=m +# end of Rpmsg drivers + +CONFIG_SOUNDWIRE=m + +# +# SoundWire Devices +# +CONFIG_SOUNDWIRE_CADENCE=m +CONFIG_SOUNDWIRE_INTEL=m +CONFIG_SOUNDWIRE_QCOM=m + +# +# SOC (System On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# +# end of Amlogic SoC drivers + +# +# Aspeed SoC drivers +# +# end of Aspeed SoC drivers + +# +# Broadcom SoC drivers +# +# end of Broadcom SoC drivers + +# +# NXP/Freescale QorIQ SoC drivers +# +# end of NXP/Freescale QorIQ SoC drivers + +# +# i.MX SoC drivers +# +# end of i.MX SoC drivers + +# +# Qualcomm SoC drivers +# +# end of Qualcomm SoC drivers + +CONFIG_SOC_TI=y + +# +# Xilinx SoC drivers +# +CONFIG_XILINX_VCU=m +# end of Xilinx SoC drivers +# end of SOC (System On Chip) specific Drivers + +CONFIG_PM_DEVFREQ=y + +# +# DEVFREQ Governors +# +CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m +CONFIG_DEVFREQ_GOV_PERFORMANCE=m +CONFIG_DEVFREQ_GOV_POWERSAVE=m +CONFIG_DEVFREQ_GOV_USERSPACE=m +CONFIG_DEVFREQ_GOV_PASSIVE=m + +# +# DEVFREQ Drivers +# +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +CONFIG_EXTCON_ADC_JACK=m +CONFIG_EXTCON_ARIZONA=m +CONFIG_EXTCON_AXP288=m +CONFIG_EXTCON_FSA9480=m +CONFIG_EXTCON_GPIO=m +CONFIG_EXTCON_INTEL_INT3496=m +CONFIG_EXTCON_INTEL_CHT_WC=m +CONFIG_EXTCON_MAX14577=m +CONFIG_EXTCON_MAX3355=m +CONFIG_EXTCON_MAX77693=m +CONFIG_EXTCON_MAX77843=m +CONFIG_EXTCON_MAX8997=m +CONFIG_EXTCON_PALMAS=m +CONFIG_EXTCON_PTN5150=m +CONFIG_EXTCON_RT8973A=m +CONFIG_EXTCON_SM5502=m +CONFIG_EXTCON_USB_GPIO=m +CONFIG_EXTCON_USBC_CROS_EC=m +CONFIG_MEMORY=y +CONFIG_IIO=m +CONFIG_IIO_BUFFER=y +CONFIG_IIO_BUFFER_CB=m +CONFIG_IIO_BUFFER_HW_CONSUMER=m +CONFIG_IIO_KFIFO_BUF=m +CONFIG_IIO_TRIGGERED_BUFFER=m +CONFIG_IIO_CONFIGFS=m +CONFIG_IIO_TRIGGER=y +CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 +CONFIG_IIO_SW_DEVICE=m +CONFIG_IIO_SW_TRIGGER=m +CONFIG_IIO_TRIGGERED_EVENT=m + +# +# Accelerometers +# +CONFIG_ADIS16201=m +CONFIG_ADIS16209=m +CONFIG_ADXL372=m +CONFIG_ADXL372_SPI=m +CONFIG_ADXL372_I2C=m +CONFIG_BMA180=m +CONFIG_BMA220=m +CONFIG_BMA400=m +CONFIG_BMA400_I2C=m +CONFIG_BMC150_ACCEL=m +CONFIG_BMC150_ACCEL_I2C=m +CONFIG_BMC150_ACCEL_SPI=m +CONFIG_DA280=m +CONFIG_DA311=m +CONFIG_DMARD06=m +CONFIG_DMARD09=m +CONFIG_DMARD10=m +CONFIG_HID_SENSOR_ACCEL_3D=m +CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m +CONFIG_IIO_ST_ACCEL_3AXIS=m +CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m +CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m +CONFIG_KXSD9=m +CONFIG_KXSD9_SPI=m +CONFIG_KXSD9_I2C=m +CONFIG_KXCJK1013=m +CONFIG_MC3230=m +CONFIG_MMA7455=m +CONFIG_MMA7455_I2C=m +CONFIG_MMA7455_SPI=m +CONFIG_MMA7660=m +CONFIG_MMA8452=m +CONFIG_MMA9551_CORE=m +CONFIG_MMA9551=m +CONFIG_MMA9553=m +CONFIG_MXC4005=m +CONFIG_MXC6255=m +CONFIG_SCA3000=m +CONFIG_STK8312=m +CONFIG_STK8BA50=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD_SIGMA_DELTA=m +CONFIG_AD7091R5=m +CONFIG_AD7124=m +CONFIG_AD7192=m +CONFIG_AD7266=m +CONFIG_AD7291=m +CONFIG_AD7292=m +CONFIG_AD7298=m +CONFIG_AD7476=m +CONFIG_AD7606=m +CONFIG_AD7606_IFACE_PARALLEL=m +CONFIG_AD7606_IFACE_SPI=m +CONFIG_AD7766=m +CONFIG_AD7768_1=m +CONFIG_AD7780=m +CONFIG_AD7791=m +CONFIG_AD7793=m +CONFIG_AD7887=m +CONFIG_AD7923=m +CONFIG_AD7949=m +CONFIG_AD799X=m +CONFIG_AXP20X_ADC=m +CONFIG_AXP288_ADC=m +CONFIG_CC10001_ADC=m +CONFIG_CPCAP_ADC=m +CONFIG_DA9150_GPADC=m +CONFIG_DLN2_ADC=m +CONFIG_ENVELOPE_DETECTOR=m +CONFIG_HI8435=m 
+CONFIG_HX711=m +CONFIG_INA2XX_ADC=m +CONFIG_LP8788_ADC=m +CONFIG_LTC2471=m +CONFIG_LTC2485=m +CONFIG_LTC2496=m +CONFIG_LTC2497=m +CONFIG_MAX1027=m +CONFIG_MAX11100=m +CONFIG_MAX1118=m +CONFIG_MAX1363=m +CONFIG_MAX9611=m +CONFIG_MCP320X=m +CONFIG_MCP3422=m +CONFIG_MCP3911=m +CONFIG_MEN_Z188_ADC=m +CONFIG_NAU7802=m +CONFIG_PALMAS_GPADC=m +CONFIG_QCOM_VADC_COMMON=m +CONFIG_QCOM_SPMI_IADC=m +CONFIG_QCOM_SPMI_VADC=m +CONFIG_QCOM_SPMI_ADC5=m +CONFIG_RN5T618_ADC=m +CONFIG_SD_ADC_MODULATOR=m +CONFIG_STMPE_ADC=m +CONFIG_TI_ADC081C=m +CONFIG_TI_ADC0832=m +CONFIG_TI_ADC084S021=m +CONFIG_TI_ADC12138=m +CONFIG_TI_ADC108S102=m +CONFIG_TI_ADC128S052=m +CONFIG_TI_ADC161S626=m +CONFIG_TI_ADS1015=m +CONFIG_TI_ADS7950=m +CONFIG_TI_ADS8344=m +CONFIG_TI_ADS8688=m +CONFIG_TI_ADS124S08=m +CONFIG_TI_AM335X_ADC=m +CONFIG_TI_TLC4541=m +CONFIG_TWL4030_MADC=m +CONFIG_TWL6030_GPADC=m +CONFIG_VF610_ADC=m +CONFIG_VIPERBOARD_ADC=m +CONFIG_XILINX_XADC=m +# end of Analog to digital converters + +# +# Analog Front Ends +# +CONFIG_IIO_RESCALE=m +# end of Analog Front Ends + +# +# Amplifiers +# +CONFIG_AD8366=m +CONFIG_HMC425=m +# end of Amplifiers + +# +# Chemical Sensors +# +CONFIG_ATLAS_PH_SENSOR=m +CONFIG_BME680=m +CONFIG_BME680_I2C=m +CONFIG_BME680_SPI=m +CONFIG_CCS811=m +CONFIG_IAQCORE=m +CONFIG_PMS7003=m +CONFIG_SENSIRION_SGP30=m +CONFIG_SPS30=m +CONFIG_VZ89X=m +# end of Chemical Sensors + +CONFIG_IIO_CROS_EC_SENSORS_CORE=m +CONFIG_IIO_CROS_EC_SENSORS=m +CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m + +# +# Hid Sensor IIO Common +# +CONFIG_HID_SENSOR_IIO_COMMON=m +CONFIG_HID_SENSOR_IIO_TRIGGER=m +# end of Hid Sensor IIO Common + +CONFIG_IIO_MS_SENSORS_I2C=m + +# +# SSP Sensor Common +# +CONFIG_IIO_SSP_SENSORS_COMMONS=m +CONFIG_IIO_SSP_SENSORHUB=m +# end of SSP Sensor Common + +CONFIG_IIO_ST_SENSORS_I2C=m +CONFIG_IIO_ST_SENSORS_SPI=m +CONFIG_IIO_ST_SENSORS_CORE=m + +# +# Digital to analog converters +# +CONFIG_AD5064=m +CONFIG_AD5360=m +CONFIG_AD5380=m +CONFIG_AD5421=m +CONFIG_AD5446=m +CONFIG_AD5449=m +CONFIG_AD5592R_BASE=m +CONFIG_AD5592R=m +CONFIG_AD5593R=m +CONFIG_AD5504=m +CONFIG_AD5624R_SPI=m +CONFIG_AD5686=m +CONFIG_AD5686_SPI=m +CONFIG_AD5696_I2C=m +CONFIG_AD5755=m +CONFIG_AD5758=m +CONFIG_AD5761=m +CONFIG_AD5764=m +CONFIG_AD5770R=m +CONFIG_AD5791=m +CONFIG_AD7303=m +CONFIG_AD8801=m +CONFIG_DPOT_DAC=m +CONFIG_DS4424=m +CONFIG_LTC1660=m +CONFIG_LTC2632=m +CONFIG_M62332=m +CONFIG_MAX517=m +CONFIG_MAX5821=m +CONFIG_MCP4725=m +CONFIG_MCP4922=m +CONFIG_TI_DAC082S085=m +CONFIG_TI_DAC5571=m +CONFIG_TI_DAC7311=m +CONFIG_TI_DAC7612=m +CONFIG_VF610_DAC=m +# end of Digital to analog converters + +# +# IIO dummy driver +# +# CONFIG_IIO_SIMPLE_DUMMY is not set +# end of IIO dummy driver + +# +# Frequency Synthesizers DDS/PLL +# + +# +# Clock Generator/Distribution +# +CONFIG_AD9523=m +# end of Clock Generator/Distribution + +# +# Phase-Locked Loop (PLL) frequency synthesizers +# +CONFIG_ADF4350=m +CONFIG_ADF4371=m +# end of Phase-Locked Loop (PLL) frequency synthesizers +# end of Frequency Synthesizers DDS/PLL + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16080=m +CONFIG_ADIS16130=m +CONFIG_ADIS16136=m +CONFIG_ADIS16260=m +CONFIG_ADXRS450=m +CONFIG_BMG160=m +CONFIG_BMG160_I2C=m +CONFIG_BMG160_SPI=m +CONFIG_FXAS21002C=m +CONFIG_FXAS21002C_I2C=m +CONFIG_FXAS21002C_SPI=m +CONFIG_HID_SENSOR_GYRO_3D=m +CONFIG_MPU3050=m +CONFIG_MPU3050_I2C=m +CONFIG_IIO_ST_GYRO_3AXIS=m +CONFIG_IIO_ST_GYRO_I2C_3AXIS=m +CONFIG_IIO_ST_GYRO_SPI_3AXIS=m +CONFIG_ITG3200=m +# end of Digital gyroscope sensors + +# +# Health Sensors +# + +# +# Heart Rate 
Monitors +# +CONFIG_AFE4403=m +CONFIG_AFE4404=m +CONFIG_MAX30100=m +CONFIG_MAX30102=m +# end of Heart Rate Monitors +# end of Health Sensors + +# +# Humidity sensors +# +CONFIG_AM2315=m +CONFIG_DHT11=m +CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m +CONFIG_HTS221=m +CONFIG_HTS221_I2C=m +CONFIG_HTS221_SPI=m +CONFIG_HTU21=m +CONFIG_SI7005=m +CONFIG_SI7020=m +# end of Humidity sensors + +# +# Inertial measurement units +# +CONFIG_ADIS16400=m +CONFIG_ADIS16460=m +CONFIG_ADIS16480=m +CONFIG_BMI160=m +CONFIG_BMI160_I2C=m +CONFIG_BMI160_SPI=m +CONFIG_FXOS8700=m +CONFIG_FXOS8700_I2C=m +CONFIG_FXOS8700_SPI=m +CONFIG_KMX61=m +CONFIG_INV_MPU6050_IIO=m +CONFIG_INV_MPU6050_I2C=m +CONFIG_INV_MPU6050_SPI=m +CONFIG_IIO_ST_LSM6DSX=m +CONFIG_IIO_ST_LSM6DSX_I2C=m +CONFIG_IIO_ST_LSM6DSX_SPI=m +CONFIG_IIO_ST_LSM6DSX_I3C=m +# end of Inertial measurement units + +CONFIG_IIO_ADIS_LIB=m +CONFIG_IIO_ADIS_LIB_BUFFER=y + +# +# Light sensors +# +CONFIG_ACPI_ALS=m +CONFIG_ADJD_S311=m +CONFIG_ADUX1020=m +CONFIG_AL3010=m +CONFIG_AL3320A=m +CONFIG_APDS9300=m +CONFIG_APDS9960=m +CONFIG_BH1750=m +CONFIG_BH1780=m +CONFIG_CM32181=m +CONFIG_CM3232=m +CONFIG_CM3323=m +CONFIG_CM3605=m +CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m +CONFIG_GP2AP002=m +CONFIG_GP2AP020A00F=m +CONFIG_IQS621_ALS=m +CONFIG_SENSORS_ISL29018=m +CONFIG_SENSORS_ISL29028=m +CONFIG_ISL29125=m +CONFIG_HID_SENSOR_ALS=m +CONFIG_HID_SENSOR_PROX=m +CONFIG_JSA1212=m +CONFIG_RPR0521=m +CONFIG_SENSORS_LM3533=m +CONFIG_LTR501=m +CONFIG_LV0104CS=m +CONFIG_MAX44000=m +CONFIG_MAX44009=m +CONFIG_NOA1305=m +CONFIG_OPT3001=m +CONFIG_PA12203001=m +CONFIG_SI1133=m +CONFIG_SI1145=m +CONFIG_STK3310=m +CONFIG_ST_UVIS25=m +CONFIG_ST_UVIS25_I2C=m +CONFIG_ST_UVIS25_SPI=m +CONFIG_TCS3414=m +CONFIG_TCS3472=m +CONFIG_SENSORS_TSL2563=m +CONFIG_TSL2583=m +CONFIG_TSL2772=m +CONFIG_TSL4531=m +CONFIG_US5182D=m +CONFIG_VCNL4000=m +CONFIG_VCNL4035=m +CONFIG_VEML6030=m +CONFIG_VEML6070=m +CONFIG_VL6180=m +CONFIG_ZOPT2201=m +# end of Light sensors + +# +# Magnetometer sensors +# +CONFIG_AK8974=m +CONFIG_AK8975=m +CONFIG_AK09911=m +CONFIG_BMC150_MAGN=m +CONFIG_BMC150_MAGN_I2C=m +CONFIG_BMC150_MAGN_SPI=m +CONFIG_MAG3110=m +CONFIG_HID_SENSOR_MAGNETOMETER_3D=m +CONFIG_MMC35240=m +CONFIG_IIO_ST_MAGN_3AXIS=m +CONFIG_IIO_ST_MAGN_I2C_3AXIS=m +CONFIG_IIO_ST_MAGN_SPI_3AXIS=m +CONFIG_SENSORS_HMC5843=m +CONFIG_SENSORS_HMC5843_I2C=m +CONFIG_SENSORS_HMC5843_SPI=m +CONFIG_SENSORS_RM3100=m +CONFIG_SENSORS_RM3100_I2C=m +CONFIG_SENSORS_RM3100_SPI=m +# end of Magnetometer sensors + +# +# Multiplexers +# +CONFIG_IIO_MUX=m +# end of Multiplexers + +# +# Inclinometer sensors +# +CONFIG_HID_SENSOR_INCLINOMETER_3D=m +CONFIG_HID_SENSOR_DEVICE_ROTATION=m +# end of Inclinometer sensors + +# +# Triggers - standalone +# +CONFIG_IIO_HRTIMER_TRIGGER=m +CONFIG_IIO_INTERRUPT_TRIGGER=m +CONFIG_IIO_TIGHTLOOP_TRIGGER=m +CONFIG_IIO_SYSFS_TRIGGER=m +# end of Triggers - standalone + +# +# Linear and angular position sensors +# +CONFIG_IQS624_POS=m +# end of Linear and angular position sensors + +# +# Digital potentiometers +# +CONFIG_AD5272=m +CONFIG_DS1803=m +CONFIG_MAX5432=m +CONFIG_MAX5481=m +CONFIG_MAX5487=m +CONFIG_MCP4018=m +CONFIG_MCP4131=m +CONFIG_MCP4531=m +CONFIG_MCP41010=m +CONFIG_TPL0102=m +# end of Digital potentiometers + +# +# Digital potentiostats +# +CONFIG_LMP91000=m +# end of Digital potentiostats + +# +# Pressure sensors +# +CONFIG_ABP060MG=m +CONFIG_BMP280=m +CONFIG_BMP280_I2C=m +CONFIG_BMP280_SPI=m +CONFIG_IIO_CROS_EC_BARO=m +CONFIG_DLHL60D=m +CONFIG_DPS310=m +CONFIG_HID_SENSOR_PRESS=m +CONFIG_HP03=m 
+CONFIG_ICP10100=m +CONFIG_MPL115=m +CONFIG_MPL115_I2C=m +CONFIG_MPL115_SPI=m +CONFIG_MPL3115=m +CONFIG_MS5611=m +CONFIG_MS5611_I2C=m +CONFIG_MS5611_SPI=m +CONFIG_MS5637=m +CONFIG_IIO_ST_PRESS=m +CONFIG_IIO_ST_PRESS_I2C=m +CONFIG_IIO_ST_PRESS_SPI=m +CONFIG_T5403=m +CONFIG_HP206C=m +CONFIG_ZPA2326=m +CONFIG_ZPA2326_I2C=m +CONFIG_ZPA2326_SPI=m +# end of Pressure sensors + +# +# Lightning sensors +# +CONFIG_AS3935=m +# end of Lightning sensors + +# +# Proximity and distance sensors +# +CONFIG_ISL29501=m +CONFIG_LIDAR_LITE_V2=m +CONFIG_MB1232=m +CONFIG_PING=m +CONFIG_RFD77402=m +CONFIG_SRF04=m +CONFIG_SX9500=m +CONFIG_SRF08=m +CONFIG_VL53L0X_I2C=m +# end of Proximity and distance sensors + +# +# Resolver to digital converters +# +CONFIG_AD2S90=m +CONFIG_AD2S1200=m +# end of Resolver to digital converters + +# +# Temperature sensors +# +CONFIG_IQS620AT_TEMP=m +CONFIG_LTC2983=m +CONFIG_MAXIM_THERMOCOUPLE=m +CONFIG_HID_SENSOR_TEMP=m +CONFIG_MLX90614=m +CONFIG_MLX90632=m +CONFIG_TMP006=m +CONFIG_TMP007=m +CONFIG_TSYS01=m +CONFIG_TSYS02D=m +CONFIG_MAX31856=m +# end of Temperature sensors + +CONFIG_NTB=m +CONFIG_NTB_MSI=y +CONFIG_NTB_AMD=m +CONFIG_NTB_IDT=m +CONFIG_NTB_INTEL=m +CONFIG_NTB_SWITCHTEC=m +# CONFIG_NTB_PINGPONG is not set +# CONFIG_NTB_TOOL is not set +# CONFIG_NTB_PERF is not set +# CONFIG_NTB_MSI_TEST is not set +CONFIG_NTB_TRANSPORT=m +CONFIG_VME_BUS=y + +# +# VME Bridge Drivers +# +CONFIG_VME_CA91CX42=m +CONFIG_VME_TSI148=m +# CONFIG_VME_FAKE is not set + +# +# VME Board Drivers +# +CONFIG_VMIVME_7805=m + +# +# VME Device Drivers +# +CONFIG_VME_USER=m +CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +# CONFIG_PWM_DEBUG is not set +CONFIG_PWM_ATMEL_HLCDC_PWM=m +CONFIG_PWM_CRC=y +CONFIG_PWM_CROS_EC=m +CONFIG_PWM_FSL_FTM=m +CONFIG_PWM_LP3943=m +CONFIG_PWM_LPSS=m +CONFIG_PWM_LPSS_PCI=m +CONFIG_PWM_LPSS_PLATFORM=m +CONFIG_PWM_PCA9685=m +CONFIG_PWM_STMPE=y +CONFIG_PWM_TWL=m +CONFIG_PWM_TWL_LED=m + +# +# IRQ chip support +# +CONFIG_IRQCHIP=y +CONFIG_AL_FIC=y +CONFIG_MADERA_IRQ=m +# end of IRQ chip support + +CONFIG_IPACK_BUS=m +CONFIG_BOARD_TPCI200=m +CONFIG_SERIAL_IPOCTAL=m +CONFIG_RESET_CONTROLLER=y +CONFIG_RESET_BRCMSTB_RESCAL=y +CONFIG_RESET_INTEL_GW=y +CONFIG_RESET_TI_SYSCON=m + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_GENERIC_PHY_MIPI_DPHY=y +CONFIG_BCM_KONA_USB2_PHY=m +CONFIG_PHY_CADENCE_TORRENT=m +CONFIG_PHY_CADENCE_DPHY=m +CONFIG_PHY_CADENCE_SIERRA=m +CONFIG_PHY_FSL_IMX8MQ_USB=m +CONFIG_PHY_MIXEL_MIPI_DPHY=m +CONFIG_PHY_PXA_28NM_HSIC=m +CONFIG_PHY_PXA_28NM_USB2=m +CONFIG_PHY_CPCAP_USB=m +CONFIG_PHY_MAPPHONE_MDM6600=m +CONFIG_PHY_OCELOT_SERDES=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_PHY_QCOM_USB_HSIC=m +CONFIG_PHY_SAMSUNG_USB2=m +CONFIG_PHY_TUSB1210=m +CONFIG_PHY_INTEL_EMMC=m +# end of PHY Subsystem + +CONFIG_POWERCAP=y +CONFIG_INTEL_RAPL_CORE=m +CONFIG_INTEL_RAPL=m +CONFIG_IDLE_INJECT=y +CONFIG_MCB=m +CONFIG_MCB_PCI=m +CONFIG_MCB_LPC=m + +# +# Performance monitor support +# +# end of Performance monitor support + +CONFIG_RAS=y +CONFIG_RAS_CEC=y +# CONFIG_RAS_CEC_DEBUG is not set +CONFIG_USB4=m + +# +# Android +# +# CONFIG_ANDROID is not set +# end of Android + +CONFIG_LIBNVDIMM=y +CONFIG_BLK_DEV_PMEM=m +CONFIG_ND_BLK=m +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=m +CONFIG_BTT=y +CONFIG_ND_PFN=m +CONFIG_NVDIMM_PFN=y +CONFIG_NVDIMM_DAX=y +CONFIG_OF_PMEM=m +CONFIG_DAX_DRIVER=y +CONFIG_DAX=y +CONFIG_DEV_DAX=m +CONFIG_DEV_DAX_PMEM=m +CONFIG_DEV_DAX_HMEM=m +CONFIG_DEV_DAX_KMEM=m +CONFIG_DEV_DAX_PMEM_COMPAT=m +CONFIG_NVMEM=y +CONFIG_NVMEM_SYSFS=y +CONFIG_NVMEM_SPMI_SDAM=m +CONFIG_RAVE_SP_EEPROM=m + +# +# 
HW tracing support +# +CONFIG_STM=m +CONFIG_STM_PROTO_BASIC=m +CONFIG_STM_PROTO_SYS_T=m +# CONFIG_STM_DUMMY is not set +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +CONFIG_STM_SOURCE_FTRACE=m +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_ACPI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m +# CONFIG_INTEL_TH_DEBUG is not set +# end of HW tracing support + +CONFIG_FPGA=m +CONFIG_ALTERA_PR_IP_CORE=m +CONFIG_ALTERA_PR_IP_CORE_PLAT=m +CONFIG_FPGA_MGR_ALTERA_PS_SPI=m +CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_FPGA_MGR_ICE40_SPI=m +CONFIG_FPGA_MGR_MACHXO2_SPI=m +CONFIG_FPGA_BRIDGE=m +CONFIG_ALTERA_FREEZE_BRIDGE=m +CONFIG_XILINX_PR_DECOUPLER=m +CONFIG_FPGA_REGION=m +CONFIG_OF_FPGA_REGION=m +CONFIG_FPGA_DFL=m +CONFIG_FPGA_DFL_FME=m +CONFIG_FPGA_DFL_FME_MGR=m +CONFIG_FPGA_DFL_FME_BRIDGE=m +CONFIG_FPGA_DFL_FME_REGION=m +CONFIG_FPGA_DFL_AFU=m +CONFIG_FPGA_DFL_PCI=m +CONFIG_FSI=m +CONFIG_FSI_NEW_DEV_NODE=y +CONFIG_FSI_MASTER_GPIO=m +CONFIG_FSI_MASTER_HUB=m +CONFIG_FSI_MASTER_ASPEED=m +CONFIG_FSI_SCOM=m +CONFIG_FSI_SBEFIFO=m +CONFIG_FSI_OCC=m +CONFIG_TEE=m + +# +# TEE drivers +# +CONFIG_AMDTEE=m +# end of TEE drivers + +CONFIG_MULTIPLEXER=m + +# +# Multiplexer drivers +# +CONFIG_MUX_ADG792A=m +CONFIG_MUX_ADGS1408=m +CONFIG_MUX_GPIO=m +CONFIG_MUX_MMIO=m +# end of Multiplexer drivers + +CONFIG_PM_OPP=y +CONFIG_UNISYS_VISORBUS=m +CONFIG_SIOX=m +CONFIG_SIOX_BUS_GPIO=m +CONFIG_SLIMBUS=m +CONFIG_SLIM_QCOM_CTRL=m +CONFIG_INTERCONNECT=m +CONFIG_COUNTER=m +CONFIG_FTM_QUADDEC=m +CONFIG_MOST=m +# end of Device Drivers + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_VALIDATE_FS_PARSER=y +CONFIG_FS_IOMAP=y +# CONFIG_EXT2_FS is not set +# CONFIG_EXT3_FS is not set +CONFIG_EXT4_FS=m +CONFIG_EXT4_USE_FOR_EXT2=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_XFS_ONLINE_SCRUB=y +CONFIG_XFS_ONLINE_REPAIR=y +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_OCFS2_FS_O2CB=m +CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m +CONFIG_OCFS2_FS_STATS=y +CONFIG_OCFS2_DEBUG_MASKLOG=y +# CONFIG_OCFS2_DEBUG_FS is not set +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +# CONFIG_BTRFS_FS_REF_VERIFY is not set +CONFIG_NILFS2_FS=m +CONFIG_F2FS_FS=m +CONFIG_F2FS_STAT_FS=y +CONFIG_F2FS_FS_XATTR=y +CONFIG_F2FS_FS_POSIX_ACL=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y +# CONFIG_F2FS_IO_TRACE is not set +# CONFIG_F2FS_FAULT_INJECTION is not set +CONFIG_F2FS_FS_COMPRESSION=y +CONFIG_F2FS_FS_LZO=y +CONFIG_F2FS_FS_LZ4=y +CONFIG_F2FS_FS_ZSTD=y +CONFIG_ZONEFS_FS=m +CONFIG_FS_DAX=y +CONFIG_FS_DAX_PMD=y +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FILE_LOCKING=y +# CONFIG_MANDATORY_FILE_LOCKING is not set +CONFIG_FS_ENCRYPTION=y +CONFIG_FS_ENCRYPTION_ALGS=m +CONFIG_FS_VERITY=y +# 
CONFIG_FS_VERITY_DEBUG is not set +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QUOTA_DEBUG is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_QUOTACTL_COMPAT=y +CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS_FS=y +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m +CONFIG_OVERLAY_FS=m +CONFIG_OVERLAY_FS_REDIRECT_DIR=y +# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set +CONFIG_OVERLAY_FS_INDEX=y +CONFIG_OVERLAY_FS_XINO_AUTO=y +CONFIG_OVERLAY_FS_METACOPY=y + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +CONFIG_FSCACHE_HISTOGRAM=y +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set +# end of Caches + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +# end of CD-ROM/DVD Filesystems + +# +# DOS/FAT/EXFAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" +CONFIG_FAT_DEFAULT_UTF8=y +CONFIG_EXFAT_FS=m +CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y +# end of DOS/FAT/EXFAT/NT Filesystems + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +# CONFIG_PROC_KCORE is not set +# CONFIG_PROC_VMCORE is not set +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_PROC_CHILDREN=y +CONFIG_PROC_PID_ARCH_STATUS=y +CONFIG_PROC_CPU_RESCTRL=y +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_MEMFD_CREATE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=y +CONFIG_EFIVAR_FS=y +# end of Pseudo filesystems + +CONFIG_MISC_FILESYSTEMS=y +CONFIG_ORANGEFS_FS=m +# CONFIG_ADFS_FS is not set +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=m +# CONFIG_ECRYPT_FS_MESSAGING is not set +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_FS_WBUF_VERIFY is not set +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_RTIME=y +CONFIG_UBIFS_FS=m +# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +CONFIG_UBIFS_FS_ZSTD=y +CONFIG_UBIFS_ATIME_SUPPORT=y +CONFIG_UBIFS_FS_XATTR=y +CONFIG_UBIFS_FS_SECURITY=y +CONFIG_UBIFS_FS_AUTHENTICATION=y +CONFIG_CRAMFS=m +CONFIG_CRAMFS_BLOCKDEV=y +CONFIG_CRAMFS_MTD=y +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +CONFIG_SQUASHFS_DECOMP_MULTI=y +# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set +# CONFIG_SQUASHFS_EMBEDDED is not set +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +# CONFIG_VXFS_FS is not set +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# 
CONFIG_QNX6FS_FS is not set +CONFIG_ROMFS_FS=m +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_PSTORE=y +CONFIG_PSTORE_DEFLATE_COMPRESS=m +CONFIG_PSTORE_LZO_COMPRESS=m +CONFIG_PSTORE_LZ4_COMPRESS=m +CONFIG_PSTORE_LZ4HC_COMPRESS=m +# CONFIG_PSTORE_842_COMPRESS is not set +CONFIG_PSTORE_ZSTD_COMPRESS=y +CONFIG_PSTORE_COMPRESS=y +# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y +CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" +# CONFIG_PSTORE_CONSOLE is not set +# CONFIG_PSTORE_PMSG is not set +# CONFIG_PSTORE_FTRACE is not set +CONFIG_PSTORE_RAM=y +# CONFIG_SYSV_FS is not set +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EROFS_FS=m +# CONFIG_EROFS_FS_DEBUG is not set +CONFIG_EROFS_FS_XATTR=y +CONFIG_EROFS_FS_POSIX_ACL=y +CONFIG_EROFS_FS_SECURITY=y +CONFIG_EROFS_FS_ZIP=y +CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 +CONFIG_VBOXSF_FS=m +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V2=m +CONFIG_NFS_V3=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_NFS_V4_SECURITY_LABEL=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +# CONFIG_NFS_DISABLE_UDP_SUPPORT is not set +CONFIG_NFSD=m +CONFIG_NFSD_V2_ACL=y +CONFIG_NFSD_V3=y +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_PNFS=y +CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_NFSD_SCSILAYOUT=y +# CONFIG_NFSD_FLEXFILELAYOUT is not set +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_GRACE_PERIOD=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y +CONFIG_SUNRPC_DEBUG=y +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y +CONFIG_CEPH_FS_SECURITY_LABEL=y +CONFIG_CIFS=m +# CONFIG_CIFS_STATS2 is not set +# CONFIG_CIFS_ALLOW_INSECURE_LEGACY is not set +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_DEBUG=y +# CONFIG_CIFS_DEBUG2 is not set +# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set +CONFIG_CIFS_DFS_UPCALL=y +# CONFIG_CIFS_SMB_DIRECT is not set +CONFIG_CIFS_FSCACHE=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +# CONFIG_AFS_DEBUG is not set +CONFIG_AFS_FSCACHE=y +# CONFIG_AFS_DEBUG_CURSOR is not set +CONFIG_9P_FS=m +CONFIG_9P_FSCACHE=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m 
+CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_MAC_ROMAN=m +CONFIG_NLS_MAC_CELTIC=m +CONFIG_NLS_MAC_CENTEURO=m +CONFIG_NLS_MAC_CROATIAN=m +CONFIG_NLS_MAC_CYRILLIC=m +CONFIG_NLS_MAC_GAELIC=m +CONFIG_NLS_MAC_GREEK=m +CONFIG_NLS_MAC_ICELAND=m +CONFIG_NLS_MAC_INUIT=m +CONFIG_NLS_MAC_ROMANIAN=m +CONFIG_NLS_MAC_TURKISH=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set +CONFIG_UNICODE=y +# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set +CONFIG_IO_WQ=y +# end of File systems + +# +# Security options +# +CONFIG_KEYS=y +CONFIG_KEYS_REQUEST_CACHE=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_BIG_KEYS=y +CONFIG_TRUSTED_KEYS=m +CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_DH_OPERATIONS=y +CONFIG_SECURITY_DMESG_RESTRICT=y +CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y +CONFIG_SECURITY_TIOCSTI_RESTRICT=y +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_PAGE_TABLE_ISOLATION=y +CONFIG_SECURITY_INFINIBAND=y +CONFIG_SECURITY_NETWORK_XFRM=y +CONFIG_SECURITY_PATH=y +# CONFIG_INTEL_TXT is not set +CONFIG_LSM_MMAP_MIN_ADDR=65536 +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +CONFIG_HARDENED_USERCOPY=y +# CONFIG_HARDENED_USERCOPY_FALLBACK is not set +# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set +CONFIG_FORTIFY_SOURCE=y +# CONFIG_FORTIFY_SOURCE_STRICT_STRING is not set +# CONFIG_STATIC_USERMODEHELPER is not set +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +# CONFIG_SECURITY_SELINUX_DISABLE is not set +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_AVC_STATS=y +CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 +CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 +CONFIG_SECURITY_SMACK=y +CONFIG_SECURITY_SMACK_BRINGUP=y +CONFIG_SECURITY_SMACK_NETFILTER=y +CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y +CONFIG_SECURITY_TOMOYO=y +CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 +CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 +# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set +CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" +CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" +# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set +CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_APPARMOR_HASH=y +CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y +# CONFIG_SECURITY_APPARMOR_DEBUG is not set +# CONFIG_SECURITY_LOADPIN is not set +CONFIG_SECURITY_YAMA=y +CONFIG_SECURITY_SAFESETID=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set +CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y +# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set +# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set +# CONFIG_INTEGRITY is not set +# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set +# CONFIG_DEFAULT_SECURITY_SELINUX is not set +# CONFIG_DEFAULT_SECURITY_SMACK is not set +# CONFIG_DEFAULT_SECURITY_TOMOYO is not set +# CONFIG_DEFAULT_SECURITY_APPARMOR is not set +CONFIG_DEFAULT_SECURITY_DAC=y +CONFIG_LSM="lockdown,yama" + +# +# Kernel hardening options +# +CONFIG_GCC_PLUGIN_STRUCTLEAK=y + +# +# Memory initialization +# +# CONFIG_INIT_STACK_NONE is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set +CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y +# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set +CONFIG_GCC_PLUGIN_STACKLEAK=y +CONFIG_STACKLEAK_TRACK_MIN_SIZE=100 +# 
CONFIG_STACKLEAK_METRICS is not set +# CONFIG_STACKLEAK_RUNTIME_DISABLE is not set +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +CONFIG_INIT_ON_FREE_DEFAULT_ON=y +CONFIG_PAGE_SANITIZE_VERIFY=y +CONFIG_SLAB_SANITIZE_VERIFY=y +# end of Memory initialization +# end of Kernel hardening options +# end of Security options + +CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=y +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=y +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y +CONFIG_CRYPTO_GF128MUL=y +CONFIG_CRYPTO_NULL=y +CONFIG_CRYPTO_NULL2=y +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_SIMD=m +CONFIG_CRYPTO_GLUE_HELPER_X86=m +CONFIG_CRYPTO_ENGINE=m + +# +# Public-key cryptography +# +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=y +CONFIG_CRYPTO_ECC=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m +CONFIG_CRYPTO_CURVE25519=m +CONFIG_CRYPTO_CURVE25519_X86=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_ECHAINIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTR=y +CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_OFB=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_NHPOLY1305=m +CONFIG_CRYPTO_NHPOLY1305_SSE2=m +CONFIG_CRYPTO_NHPOLY1305_AVX2=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ESSIV=m + +# +# Hash modes +# +CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_CRC32_PCLMUL=m +CONFIG_CRYPTO_XXHASH=m +CONFIG_CRYPTO_BLAKE2B=m +CONFIG_CRYPTO_BLAKE2S=m +CONFIG_CRYPTO_BLAKE2S_X86=m +CONFIG_CRYPTO_CRCT10DIF=y +CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m +CONFIG_CRYPTO_GHASH=y +CONFIG_CRYPTO_POLY1305=m +CONFIG_CRYPTO_POLY1305_X86_64=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA1_SSSE3=m +CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA512_SSSE3=m +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m +CONFIG_CRYPTO_STREEBOG=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES_NI_INTEL=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_BLOWFISH_COMMON=m +CONFIG_CRYPTO_BLOWFISH_X86_64=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAMELLIA_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m +CONFIG_CRYPTO_CAST_COMMON=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST5_AVX_X86_64=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_CAST6_AVX_X86_64=m 
+CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_DES3_EDE_X86_64=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_CHACHA20=m +CONFIG_CRYPTO_CHACHA20_X86_64=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m +CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +CONFIG_CRYPTO_TWOFISH_X86_64=m +CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m +CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_LZO=m +CONFIG_CRYPTO_842=m +CONFIG_CRYPTO_LZ4=y +CONFIG_CRYPTO_LZ4HC=m +CONFIG_CRYPTO_ZSTD=y + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_DRBG_MENU=y +CONFIG_CRYPTO_DRBG_HMAC=y +CONFIG_CRYPTO_DRBG_HASH=y +CONFIG_CRYPTO_DRBG_CTR=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_JITTERENTROPY=y +CONFIG_CRYPTO_USER_API=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +# CONFIG_CRYPTO_STATS is not set +CONFIG_CRYPTO_HASH_INFO=y + +# +# Crypto library routines +# +CONFIG_CRYPTO_LIB_AES=y +CONFIG_CRYPTO_LIB_ARC4=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m +CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m +CONFIG_CRYPTO_LIB_BLAKE2S=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m +CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m +CONFIG_CRYPTO_LIB_CHACHA=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_DES=m +CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 +CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m +CONFIG_CRYPTO_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m +CONFIG_CRYPTO_LIB_SHA256=y +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +CONFIG_CRYPTO_DEV_ATMEL_I2C=m +CONFIG_CRYPTO_DEV_ATMEL_ECC=m +CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m +CONFIG_CRYPTO_DEV_CCP=y +CONFIG_CRYPTO_DEV_CCP_DD=m +CONFIG_CRYPTO_DEV_SP_CCP=y +CONFIG_CRYPTO_DEV_CCP_CRYPTO=m +CONFIG_CRYPTO_DEV_SP_PSP=y +# CONFIG_CRYPTO_DEV_CCP_DEBUGFS is not set +CONFIG_CRYPTO_DEV_QAT=m +CONFIG_CRYPTO_DEV_QAT_DH895xCC=m +CONFIG_CRYPTO_DEV_QAT_C3XXX=m +CONFIG_CRYPTO_DEV_QAT_C62X=m +CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m +CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m +CONFIG_CRYPTO_DEV_QAT_C62XVF=m +CONFIG_CRYPTO_DEV_NITROX=m +CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m +CONFIG_CRYPTO_DEV_CHELSIO=m +CONFIG_CHELSIO_IPSEC_INLINE=y +CONFIG_CHELSIO_TLS_DEVICE=y +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_CRYPTO_DEV_SAFEXCEL=m +CONFIG_CRYPTO_DEV_CCREE=m +CONFIG_CRYPTO_DEV_AMLOGIC_GXL=m +# CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG is not set +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS8_PRIVATE_KEY_PARSER=m +CONFIG_TPM_KEY_PARSER=m +CONFIG_PKCS7_MESSAGE_PARSER=y +# CONFIG_PKCS7_TEST_KEY is not set +CONFIG_SIGNED_PE_FILE_VERIFICATION=y + +# +# Certificates for signature checking +# +CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" +# end of Certificates for signature checking + +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_RAID6_PQ_BENCHMARK=y +CONFIG_PACKING=y +CONFIG_BITREVERSE=y 
+CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_CORDIC=m +CONFIG_RATIONAL=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_GENERIC_IOMAP=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_CRC_CCITT=y +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=y +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set +CONFIG_CRC64=m +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_CRC8=m +CONFIG_XXHASH=y +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_842_COMPRESS=m +CONFIG_842_DECOMPRESS=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=y +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=y +CONFIG_LZ4HC_COMPRESS=m +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y +CONFIG_XZ_DEC=y +CONFIG_XZ_DEC_X86=y +CONFIG_XZ_DEC_POWERPC=y +CONFIG_XZ_DEC_IA64=y +CONFIG_XZ_DEC_ARM=y +CONFIG_XZ_DEC_ARMTHUMB=y +CONFIG_XZ_DEC_SPARC=y +CONFIG_XZ_DEC_BCJ=y +# CONFIG_XZ_DEC_TEST is not set +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZO=y +CONFIG_DECOMPRESS_LZ4=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=y +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_BCH=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_BTREE=y +CONFIG_INTERVAL_TREE=y +CONFIG_XARRAY_MULTI=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y +CONFIG_DMA_VIRT_OPS=y +CONFIG_SWIOTLB=y +# CONFIG_DMA_API_DEBUG is not set +CONFIG_SGL_ALLOC=y +CONFIG_IOMMU_HELPER=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_LRU_CACHE=m +CONFIG_CLZ_TAB=y +CONFIG_IRQ_POLL=y +CONFIG_MPILIB=y +CONFIG_DIMLIB=y +CONFIG_LIBFDT=y +CONFIG_OID_REGISTRY=y +CONFIG_UCS2_STRING=y +CONFIG_HAVE_GENERIC_VDSO=y +CONFIG_GENERIC_GETTIMEOFDAY=y +CONFIG_GENERIC_VDSO_TIME_NS=y +CONFIG_FONT_SUPPORT=y +CONFIG_FONTS=y +# CONFIG_FONT_8x8 is not set +CONFIG_FONT_8x16=y +# CONFIG_FONT_6x11 is not set +# CONFIG_FONT_7x14 is not set +# CONFIG_FONT_PEARL_8x8 is not set +# CONFIG_FONT_ACORN_8x8 is not set +# CONFIG_FONT_MINI_4x6 is not set +# CONFIG_FONT_6x10 is not set +# CONFIG_FONT_10x18 is not set +# CONFIG_FONT_SUN8x16 is not set +# CONFIG_FONT_SUN12x22 is not set +CONFIG_FONT_TER16x32=y +CONFIG_SG_POOL=y +CONFIG_ARCH_HAS_PMEM_API=y +CONFIG_MEMREGION=y +CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y +CONFIG_ARCH_HAS_UACCESS_MCSAFE=y +CONFIG_ARCH_STACKWALK=y +CONFIG_SBITMAP=y +CONFIG_PARMAN=m +CONFIG_OBJAGG=m +# CONFIG_STRING_SELFTEST is not set +# end of Library routines + +# +# Kernel hacking +# + +# +# printk and dmesg options +# +CONFIG_PRINTK_TIME=y +# CONFIG_PRINTK_CALLER is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 +CONFIG_CONSOLE_LOGLEVEL_QUIET=1 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 +# CONFIG_BOOT_PRINTK_DELAY is not set +CONFIG_DYNAMIC_DEBUG=y +CONFIG_SYMBOLIC_ERRNAME=y +CONFIG_DEBUG_BUGVERBOSE=y +# end of printk and dmesg options + +# +# Compile-time checks and compiler options +# +# CONFIG_DEBUG_INFO is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 
+CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +# CONFIG_HEADERS_INSTALL is not set +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +# CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE is not set +CONFIG_STACK_VALIDATION=y +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +# end of Compile-time checks and compiler options + +# +# Generic Kernel Debugging Instruments +# +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" +CONFIG_DEBUG_FS=y +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# CONFIG_UBSAN is not set +# end of Generic Kernel Debugging Instruments + +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_MISC=y + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_OWNER is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +CONFIG_GENERIC_PTDUMP=y +CONFIG_PTDUMP_CORE=y +# CONFIG_PTDUMP_DEBUGFS is not set +# CONFIG_DEBUG_OBJECTS is not set +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +CONFIG_SCHED_STACK_END_CHECK=y +# CONFIG_DEBUG_VM is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +CONFIG_DEBUG_MEMORY_INIT=y +# CONFIG_DEBUG_PER_CPU_MAPS is not set +CONFIG_HAVE_ARCH_KASAN=y +CONFIG_HAVE_ARCH_KASAN_VMALLOC=y +CONFIG_CC_HAS_KASAN_GENERIC=y +# CONFIG_KASAN is not set +CONFIG_KASAN_STACK=1 +# end of Memory Debugging + +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Oops, Lockups and Hangs +# +CONFIG_PANIC_ON_OOPS=y +CONFIG_PANIC_ON_OOPS_VALUE=1 +CONFIG_PANIC_TIMEOUT=0 +CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y +CONFIG_HARDLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 +# CONFIG_WQ_WATCHDOG is not set +# CONFIG_TEST_LOCKUP is not set +# end of Debug Oops, Lockups and Hangs + +# +# Scheduler Debugging +# +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_INFO=y +CONFIG_SCHEDSTATS=y +# end of Scheduler Debugging + +# CONFIG_DEBUG_TIMEKEEPING is not set +CONFIG_DEBUG_PREEMPT=y + +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +CONFIG_LOCK_DEBUGGING_SUPPORT=y +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +# end of Lock Debugging (spinlocks, mutexes, etc...) 
+ +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set + +# +# Debug kernel data structures +# +CONFIG_DEBUG_LIST=y +# CONFIG_DEBUG_PLIST is not set +CONFIG_DEBUG_SG=y +CONFIG_DEBUG_NOTIFIERS=y +CONFIG_BUG_ON_DATA_CORRUPTION=y +# end of Debug kernel data structures + +CONFIG_DEBUG_CREDENTIALS=y + +# +# RCU Debugging +# +# CONFIG_RCU_PERF_TEST is not set +# CONFIG_RCU_TORTURE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# end of RCU Debugging + +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set +CONFIG_LATENCYTOP=y +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_FENTRY=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACER_MAX_TRACE=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y +CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_RING_BUFFER_ALLOW_SWAP=y +CONFIG_TRACING=y +CONFIG_GENERIC_TRACER=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +# CONFIG_BOOTTIME_TRACING is not set +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y +# CONFIG_PREEMPTIRQ_EVENTS is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +CONFIG_SCHED_TRACER=y +CONFIG_HWLAT_TRACER=y +CONFIG_MMIOTRACE=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_TRACER_SNAPSHOT=y +# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KPROBE_EVENTS=y +# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set +CONFIG_UPROBE_EVENTS=y +CONFIG_BPF_EVENTS=y +CONFIG_DYNAMIC_EVENTS=y +CONFIG_PROBE_EVENTS=y +# CONFIG_BPF_KPROBE_OVERRIDE is not set +CONFIG_FTRACE_MCOUNT_RECORD=y +# CONFIG_HIST_TRIGGERS is not set +# CONFIG_TRACE_EVENT_INJECT is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_MMIOTRACE_TEST is not set +# CONFIG_PREEMPTIRQ_DELAY_TEST is not set +# CONFIG_KPROBE_EVENT_GEN_TEST is not set +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +# CONFIG_SAMPLES is not set +CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y +# CONFIG_STRICT_DEVMEM is not set + +# +# x86 Debugging +# +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +# CONFIG_X86_VERBOSE_BOOTUP is not set +CONFIG_EARLY_PRINTK=y +# CONFIG_EARLY_PRINTK_DBGP is not set +# CONFIG_EARLY_PRINTK_USB_XDBC is not set +# CONFIG_EFI_PGT_DUMP is not set +CONFIG_DEBUG_WX=y +CONFIG_DOUBLEFAULT=y +# CONFIG_DEBUG_TLBFLUSH is not set +# CONFIG_IOMMU_DEBUG is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEBUG_BOOT_PARAMS=y +# CONFIG_CPA_DEBUG is not set +# CONFIG_DEBUG_ENTRY is not set +# CONFIG_DEBUG_NMI_SELFTEST is not set +# CONFIG_X86_DEBUG_FPU is not set +# CONFIG_PUNIT_ATOM_DEBUG is not set +CONFIG_UNWINDER_ORC=y +# 
CONFIG_UNWINDER_FRAME_POINTER is not set +# CONFIG_UNWINDER_GUESS is not set +# end of x86 Debugging + +# +# Kernel Testing and Coverage +# +# CONFIG_KUNIT is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +CONFIG_FUNCTION_ERROR_INJECTION=y +# CONFIG_FAULT_INJECTION is not set +CONFIG_ARCH_HAS_KCOV=y +CONFIG_CC_HAS_SANCOV_TRACE_PC=y +# CONFIG_KCOV is not set +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_LKDTM=m +# CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_MIN_HEAP is not set +# CONFIG_TEST_SORT is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_RBTREE_TEST is not set +# CONFIG_REED_SOLOMON_TEST is not set +# CONFIG_INTERVAL_TREE_TEST is not set +# CONFIG_PERCPU_TEST is not set +# CONFIG_ATOMIC64_SELFTEST is not set +# CONFIG_ASYNC_RAID6_TEST is not set +# CONFIG_TEST_HEXDUMP is not set +# CONFIG_TEST_STRING_HELPERS is not set +# CONFIG_TEST_STRSCPY is not set +# CONFIG_TEST_KSTRTOX is not set +# CONFIG_TEST_PRINTF is not set +# CONFIG_TEST_BITMAP is not set +# CONFIG_TEST_BITFIELD is not set +# CONFIG_TEST_UUID is not set +# CONFIG_TEST_XARRAY is not set +# CONFIG_TEST_OVERFLOW is not set +# CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set +# CONFIG_TEST_IDA is not set +# CONFIG_TEST_PARMAN is not set +# CONFIG_TEST_LKM is not set +# CONFIG_TEST_VMALLOC is not set +# CONFIG_TEST_USER_COPY is not set +# CONFIG_TEST_BPF is not set +# CONFIG_TEST_BLACKHOLE_DEV is not set +# CONFIG_FIND_BIT_BENCHMARK is not set +# CONFIG_TEST_FIRMWARE is not set +# CONFIG_TEST_SYSCTL is not set +# CONFIG_TEST_UDELAY is not set +# CONFIG_TEST_STATIC_KEYS is not set +# CONFIG_TEST_KMOD is not set +# CONFIG_TEST_MEMCAT_P is not set +# CONFIG_TEST_OBJAGG is not set +# CONFIG_TEST_STACKINIT is not set +# CONFIG_TEST_MEMINIT is not set +# CONFIG_MEMTEST is not set +# CONFIG_HYPERV_TESTING is not set +# end of Kernel Testing and Coverage +# end of Kernel hacking diff --git a/linux-tkg/linux-tkg-config/5.8/90-cleanup.hook b/linux-tkg/linux-tkg-config/5.8/90-cleanup.hook new file mode 100644 index 0000000..99f5221 --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.8/90-cleanup.hook @@ -0,0 +1,14 @@ +[Trigger] +Type = File +Operation = Install +Operation = Upgrade +Operation = Remove +Target = usr/lib/modules/*/ +Target = !usr/lib/modules/*/?* + +[Action] +Description = Cleaning up... +When = PostTransaction +Exec = /usr/share/libalpm/scripts/cleanup +NeedsTargets + diff --git a/linux-tkg/linux-tkg-config/5.8/cleanup b/linux-tkg/linux-tkg-config/5.8/cleanup new file mode 100755 index 0000000..c00c08d --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.8/cleanup @@ -0,0 +1,10 @@ +#!/bin/bash + +for _f in /usr/lib/modules/*tkg*; do + if [[ ! -e ${_f}/vmlinuz ]]; then + rm -rf "$_f" + fi +done + +# vim:set ft=sh sw=2 et: + diff --git a/linux-tkg/linux-tkg-config/5.8/config.x86_64 b/linux-tkg/linux-tkg-config/5.8/config.x86_64 new file mode 100644 index 0000000..9a691fd --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.8/config.x86_64 @@ -0,0 +1,11019 @@ +# +# Automatically generated file; DO NOT EDIT. 
+# Linux/x86 5.8.0-arch1 Kernel Configuration +# +CONFIG_CC_VERSION_TEXT="gcc (GCC) 10.1.0" +CONFIG_CC_IS_GCC=y +CONFIG_GCC_VERSION=100100 +CONFIG_LD_VERSION=234000000 +CONFIG_CLANG_VERSION=0 +CONFIG_CC_CAN_LINK=y +CONFIG_CC_CAN_LINK_STATIC=y +CONFIG_CC_HAS_ASM_GOTO=y +CONFIG_CC_HAS_ASM_INLINE=y +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_TABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_INIT_ENV_ARG_LIMIT=32 +# CONFIG_COMPILE_TEST is not set +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_BUILD_SALT="" +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_XZ=y +CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set +CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set +CONFIG_DEFAULT_INIT="" +CONFIG_DEFAULT_HOSTNAME="archlinux" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +CONFIG_WATCH_QUEUE=y +CONFIG_CROSS_MEMORY_ATTACH=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_HAVE_ARCH_AUDITSYSCALL=y +CONFIG_AUDITSYSCALL=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_GENERIC_IRQ_MIGRATION=y +CONFIG_HARDIRQS_SW_RESEND=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_SIM=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +CONFIG_IRQ_MSI_IOMMU=y +CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y +CONFIG_GENERIC_IRQ_RESERVATION_MODE=y +CONFIG_IRQ_FORCED_THREADING=y +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +# end of IRQ subsystem + +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_ARCH_CLOCKSOURCE_INIT=y +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y +CONFIG_GENERIC_CMOS_UPDATE=y + +# +# Timers subsystem +# +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ_COMMON=y +# CONFIG_HZ_PERIODIC is not set +CONFIG_NO_HZ_IDLE=y +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +# end of Timers subsystem + +# CONFIG_PREEMPT_NONE is not set +# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_PREEMPT=y +CONFIG_PREEMPT_COUNT=y +CONFIG_PREEMPTION=y + +# +# CPU/Task time and stats accounting +# +CONFIG_TICK_CPU_ACCOUNTING=y +# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_SCHED_AVG_IRQ=y +# CONFIG_SCHED_THERMAL_PRESSURE is not set +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_PSI=y +# CONFIG_PSI_DEFAULT_DISABLED is not set +# end of CPU/Task time and stats accounting + +CONFIG_CPU_ISOLATION=y + +# +# RCU Subsystem +# +CONFIG_TREE_RCU=y +CONFIG_PREEMPT_RCU=y +CONFIG_RCU_EXPERT=y +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_TASKS_RCU_GENERIC=y +CONFIG_TASKS_RCU=y +CONFIG_TASKS_RUDE_RCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +CONFIG_RCU_FANOUT=64 +CONFIG_RCU_FANOUT_LEAF=16 +CONFIG_RCU_FAST_NO_HZ=y +CONFIG_RCU_BOOST=y +CONFIG_RCU_BOOST_DELAY=500 +# CONFIG_RCU_NOCB_CPU is not set +# CONFIG_TASKS_TRACE_RCU_READ_MB is not set +# end of RCU Subsystem + +CONFIG_BUILD_BIN2C=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_IKHEADERS is not set +CONFIG_LOG_BUF_SHIFT=17 
+CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y + +# +# Scheduler features +# +CONFIG_UCLAMP_TASK=y +CONFIG_UCLAMP_BUCKETS_COUNT=5 +# end of Scheduler features + +CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y +CONFIG_CC_HAS_INT128=y +CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_NUMA_BALANCING=y +CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y +CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +# CONFIG_RT_GROUP_SCHED is not set +CONFIG_UCLAMP_TASK_GROUP=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_TIME_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_USER_NS_UNPRIVILEGED=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_AUTOGROUP=y +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_RD_XZ=y +CONFIG_RD_LZO=y +CONFIG_RD_LZ4=y +CONFIG_BOOT_CONFIG=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_SYSCTL=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_HAVE_PCSPKR_PLATFORM=y +CONFIG_BPF=y +CONFIG_EXPERT=y +# CONFIG_UID16 is not set +CONFIG_MULTIUSER=y +CONFIG_SGETMASK_SYSCALL=y +# CONFIG_SYSFS_SYSCALL is not set +CONFIG_FHANDLE=y +CONFIG_POSIX_TIMERS=y +CONFIG_PRINTK=y +CONFIG_PRINTK_NMI=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_IO_URING=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_HAVE_ARCH_USERFAULTFD_WP=y +CONFIG_MEMBARRIER=y +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_BPF_LSM=y +CONFIG_BPF_SYSCALL=y +CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +CONFIG_USERFAULTFD=y +CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_RSEQ=y +# CONFIG_DEBUG_RSEQ is not set +# CONFIG_EMBEDDED is not set +CONFIG_HAVE_PERF_EVENTS=y +# CONFIG_PC104 is not set + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +# end of Kernel Performance Events And Counters + +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +CONFIG_SLAB_MERGE_DEFAULT=y +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_SLUB_CPU_PARTIAL=y +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +# end of General setup + +CONFIG_64BIT=y +CONFIG_X86_64=y +CONFIG_X86=y +CONFIG_INSTRUCTION_DECODER=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_MMU=y +CONFIG_ARCH_MMAP_RND_BITS_MIN=28 +CONFIG_ARCH_MMAP_RND_BITS_MAX=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 
+CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_ARCH_HAS_FILTER_PGPROT=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_WANT_GENERAL_HUGETLB=y +CONFIG_ZONE_DMA32=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y +CONFIG_X86_64_SMP=y +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_DYNAMIC_PHYSICAL_MASK=y +CONFIG_PGTABLE_LEVELS=5 +CONFIG_CC_HAS_SANE_STACKPROTECTOR=y + +# +# Processor type and features +# +CONFIG_ZONE_DMA=y +CONFIG_SMP=y +CONFIG_X86_FEATURE_NAMES=y +CONFIG_X86_X2APIC=y +CONFIG_X86_MPPARSE=y +# CONFIG_GOLDFISH is not set +CONFIG_RETPOLINE=y +CONFIG_X86_CPU_RESCTRL=y +# CONFIG_X86_EXTENDED_PLATFORM is not set +CONFIG_X86_INTEL_LPSS=y +CONFIG_X86_AMD_PLATFORM_DEVICE=y +CONFIG_IOSF_MBI=y +# CONFIG_IOSF_MBI_DEBUG is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_PARAVIRT_XXL=y +# CONFIG_PARAVIRT_DEBUG is not set +CONFIG_PARAVIRT_SPINLOCKS=y +CONFIG_X86_HV_CALLBACK_VECTOR=y +CONFIG_XEN=y +CONFIG_XEN_PV=y +CONFIG_XEN_PV_SMP=y +CONFIG_XEN_DOM0=y +CONFIG_XEN_PVHVM=y +CONFIG_XEN_PVHVM_SMP=y +CONFIG_XEN_512GB=y +CONFIG_XEN_SAVE_RESTORE=y +# CONFIG_XEN_DEBUG_FS is not set +CONFIG_XEN_PVH=y +CONFIG_KVM_GUEST=y +CONFIG_ARCH_CPUIDLE_HALTPOLL=y +CONFIG_PVH=y +CONFIG_PARAVIRT_TIME_ACCOUNTING=y +CONFIG_PARAVIRT_CLOCK=y +CONFIG_JAILHOUSE_GUEST=y +CONFIG_ACRN_GUEST=y +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +# CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_IA32_FEAT_CTL=y +CONFIG_X86_VMX_FEATURE_NAMES=y +CONFIG_PROCESSOR_SELECT=y +CONFIG_CPU_SUP_INTEL=y +CONFIG_CPU_SUP_AMD=y +CONFIG_CPU_SUP_HYGON=y +CONFIG_CPU_SUP_CENTAUR=y +CONFIG_CPU_SUP_ZHAOXIN=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y +CONFIG_GART_IOMMU=y +# CONFIG_MAXSMP is not set +CONFIG_NR_CPUS_RANGE_BEGIN=2 +CONFIG_NR_CPUS_RANGE_END=512 +CONFIG_NR_CPUS_DEFAULT=64 +CONFIG_NR_CPUS=320 +CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y +CONFIG_SCHED_MC_PRIO=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y +CONFIG_X86_MCE_INJECT=m +CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# +CONFIG_PERF_EVENTS_INTEL_UNCORE=m +CONFIG_PERF_EVENTS_INTEL_RAPL=m +CONFIG_PERF_EVENTS_INTEL_CSTATE=m +CONFIG_PERF_EVENTS_AMD_POWER=m +# end of Performance monitoring + +CONFIG_X86_16BIT=y +CONFIG_X86_ESPFIX64=y +CONFIG_X86_VSYSCALL_EMULATION=y +CONFIG_X86_IOPL_IOPERM=y +CONFIG_I8K=m +CONFIG_MICROCODE=y +CONFIG_MICROCODE_INTEL=y +CONFIG_MICROCODE_AMD=y +CONFIG_MICROCODE_OLD_INTERFACE=y +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +CONFIG_X86_5LEVEL=y +CONFIG_X86_DIRECT_GBPAGES=y +# CONFIG_X86_CPA_STATISTICS is not set +CONFIG_AMD_MEM_ENCRYPT=y +# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set +CONFIG_NUMA=y +CONFIG_AMD_NUMA=y +CONFIG_X86_64_ACPI_NUMA=y +# CONFIG_NUMA_EMU is not set 
+CONFIG_NODES_SHIFT=5 +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_MEMORY_PROBE=y +CONFIG_ARCH_PROC_KCORE_TEXT=y +CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_X86_PMEM_LEGACY_DEVICE=y +CONFIG_X86_PMEM_LEGACY=m +CONFIG_X86_CHECK_BIOS_CORRUPTION=y +CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y +CONFIG_X86_RESERVE_LOW=64 +CONFIG_MTRR=y +CONFIG_MTRR_SANITIZER=y +CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 +CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 +CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y +CONFIG_ARCH_RANDOM=y +CONFIG_X86_SMAP=y +CONFIG_X86_UMIP=y +CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +# CONFIG_X86_INTEL_TSX_MODE_OFF is not set +# CONFIG_X86_INTEL_TSX_MODE_ON is not set +CONFIG_X86_INTEL_TSX_MODE_AUTO=y +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_SECCOMP=y +# CONFIG_HZ_100 is not set +# CONFIG_HZ_250 is not set +CONFIG_HZ_300=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=300 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_ARCH_HAS_KEXEC_PURGATORY=y +# CONFIG_KEXEC_SIG is not set +CONFIG_CRASH_DUMP=y +CONFIG_KEXEC_JUMP=y +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_DYNAMIC_MEMORY_LAYOUT=y +CONFIG_RANDOMIZE_MEMORY=y +CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 +CONFIG_HOTPLUG_CPU=y +# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set +# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_COMPAT_VDSO is not set +# CONFIG_LEGACY_VSYSCALL_EMULATE is not set +CONFIG_LEGACY_VSYSCALL_XONLY=y +# CONFIG_LEGACY_VSYSCALL_NONE is not set +# CONFIG_CMDLINE_BOOL is not set +CONFIG_MODIFY_LDT_SYSCALL=y +CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set +# end of Processor type and features + +CONFIG_ARCH_HAS_ADD_PAGES=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_USE_PERCPU_NUMA_NODE_ID=y +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y +CONFIG_ARCH_ENABLE_THP_MIGRATION=y + +# +# Power management and ACPI options +# +CONFIG_ARCH_HIBERNATION_HEADER=y +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +# CONFIG_SUSPEND_SKIP_SYNC is not set +CONFIG_HIBERNATE_CALLBACKS=y +CONFIG_HIBERNATION=y +CONFIG_HIBERNATION_SNAPSHOT_DEV=y +CONFIG_PM_STD_PARTITION="" +CONFIG_PM_SLEEP=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=100 +CONFIG_PM_WAKELOCKS_GC=y +CONFIG_PM=y +CONFIG_PM_DEBUG=y +CONFIG_PM_ADVANCED_DEBUG=y +# CONFIG_PM_TEST_SUSPEND is not set +CONFIG_PM_SLEEP_DEBUG=y +# CONFIG_DPM_WATCHDOG is not set +CONFIG_PM_TRACE=y +CONFIG_PM_TRACE_RTC=y +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y +CONFIG_PM_GENERIC_DOMAINS_SLEEP=y +CONFIG_PM_GENERIC_DOMAINS_OF=y +CONFIG_ENERGY_MODEL=y +CONFIG_ARCH_SUPPORTS_ACPI=y +CONFIG_ACPI=y +CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y +CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y +CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y +# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SPCR_TABLE=y +CONFIG_ACPI_LPIT=y +CONFIG_ACPI_SLEEP=y +# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y +CONFIG_ACPI_EC_DEBUGFS=y +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=y +CONFIG_ACPI_VIDEO=y +CONFIG_ACPI_FAN=y +CONFIG_ACPI_TAD=m +CONFIG_ACPI_DOCK=y +CONFIG_ACPI_CPU_FREQ_PSS=y +CONFIG_ACPI_PROCESSOR_CSTATE=y +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_CPPC_LIB=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_IPMI=m +CONFIG_ACPI_HOTPLUG_CPU=y 
+CONFIG_ACPI_PROCESSOR_AGGREGATOR=y +CONFIG_ACPI_THERMAL=y +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_DEBUG=y +CONFIG_ACPI_PCI_SLOT=y +CONFIG_ACPI_CONTAINER=y +CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_HOTPLUG_IOAPIC=y +CONFIG_ACPI_SBS=m +CONFIG_ACPI_HED=y +CONFIG_ACPI_CUSTOM_METHOD=m +CONFIG_ACPI_BGRT=y +# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set +CONFIG_ACPI_NFIT=m +# CONFIG_NFIT_SECURITY_DEBUG is not set +CONFIG_ACPI_NUMA=y +CONFIG_ACPI_HMAT=y +CONFIG_HAVE_ACPI_APEI=y +CONFIG_HAVE_ACPI_APEI_NMI=y +CONFIG_ACPI_APEI=y +CONFIG_ACPI_APEI_GHES=y +CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_ACPI_APEI_MEMORY_FAILURE=y +CONFIG_ACPI_APEI_EINJ=m +CONFIG_ACPI_APEI_ERST_DEBUG=m +CONFIG_DPTF_POWER=m +CONFIG_ACPI_WATCHDOG=y +CONFIG_ACPI_EXTLOG=m +CONFIG_ACPI_ADXL=y +CONFIG_PMIC_OPREGION=y +CONFIG_BYTCRC_PMIC_OPREGION=y +CONFIG_CHTCRC_PMIC_OPREGION=y +CONFIG_XPOWER_PMIC_OPREGION=y +CONFIG_BXT_WC_PMIC_OPREGION=y +CONFIG_CHT_WC_PMIC_OPREGION=y +CONFIG_CHT_DC_TI_PMIC_OPREGION=y +CONFIG_ACPI_CONFIGFS=m +CONFIG_TPS68470_PMIC_OPREGION=y +CONFIG_X86_PM_TIMER=y +CONFIG_SFI=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=m +CONFIG_CPU_FREQ_GOV_USERSPACE=m +CONFIG_CPU_FREQ_GOV_ONDEMAND=m +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + +# +# CPU frequency scaling drivers +# +CONFIG_CPUFREQ_DT=m +CONFIG_CPUFREQ_DT_PLATDEV=y +CONFIG_X86_INTEL_PSTATE=y +CONFIG_X86_PCC_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ_CPB=y +CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_AMD_FREQ_SENSITIVITY=m +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +CONFIG_X86_P4_CLOCKMOD=m + +# +# shared options +# +CONFIG_X86_SPEEDSTEP_LIB=m +# end of CPU Frequency scaling + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +CONFIG_CPU_IDLE_GOV_TEO=y +CONFIG_CPU_IDLE_GOV_HALTPOLL=y +CONFIG_HALTPOLL_CPUIDLE=m +# end of CPU Idle + +CONFIG_INTEL_IDLE=y +# end of Power management and ACPI options + +# +# Bus options (PCI etc.) +# +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_XEN=y +CONFIG_MMCONF_FAM10H=y +# CONFIG_PCI_CNB20LE_QUIRK is not set +# CONFIG_ISA_BUS is not set +CONFIG_ISA_DMA_API=y +CONFIG_AMD_NB=y +# CONFIG_X86_SYSFB is not set +# end of Bus options (PCI etc.) 
+ +# +# Binary Emulations +# +CONFIG_IA32_EMULATION=y +# CONFIG_X86_X32 is not set +CONFIG_COMPAT_32=y +CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y +CONFIG_SYSVIPC_COMPAT=y +# end of Binary Emulations + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DMIID=y +CONFIG_DMI_SYSFS=m +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=m +CONFIG_FW_CFG_SYSFS=m +# CONFIG_FW_CFG_SYSFS_CMDLINE is not set +CONFIG_GOOGLE_FIRMWARE=y +# CONFIG_GOOGLE_SMI is not set +CONFIG_GOOGLE_COREBOOT_TABLE=m +CONFIG_GOOGLE_MEMCONSOLE=m +# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set +CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m +CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m +CONFIG_GOOGLE_VPD=m + +# +# EFI (Extensible Firmware Interface) Support +# +# CONFIG_EFI_VARS is not set +CONFIG_EFI_ESRT=y +CONFIG_EFI_RUNTIME_MAP=y +# CONFIG_EFI_FAKE_MEMMAP is not set +CONFIG_EFI_SOFT_RESERVE=y +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y +CONFIG_EFI_CAPSULE_LOADER=m +# CONFIG_EFI_TEST is not set +CONFIG_APPLE_PROPERTIES=y +# CONFIG_RESET_ATTACK_MITIGATION is not set +CONFIG_EFI_RCI2_TABLE=y +# CONFIG_EFI_DISABLE_PCI_DMA is not set +# end of EFI (Extensible Firmware Interface) Support + +CONFIG_EFI_EMBEDDED_FIRMWARE=y +CONFIG_UEFI_CPER=y +CONFIG_UEFI_CPER_X86=y +CONFIG_EFI_DEV_PATH_PARSER=y +CONFIG_EFI_EARLYCON=y + +# +# Tegra firmware driver +# +# end of Tegra firmware driver +# end of Firmware Drivers + +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_IRQFD=y +CONFIG_HAVE_KVM_IRQ_ROUTING=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_MMIO=y +CONFIG_KVM_ASYNC_PF=y +CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y +CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y +CONFIG_KVM_COMPAT=y +CONFIG_HAVE_KVM_IRQ_BYPASS=y +CONFIG_HAVE_KVM_NO_POLL=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_WERROR=y +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +CONFIG_KVM_AMD_SEV=y +CONFIG_KVM_MMU_AUDIT=y +CONFIG_AS_AVX512=y +CONFIG_AS_SHA1_NI=y +CONFIG_AS_SHA256_NI=y +CONFIG_AS_TPAUSE=y + +# +# General architecture-dependent options +# +CONFIG_CRASH_CORE=y +CONFIG_KEXEC_CORE=y +CONFIG_HOTPLUG_SMT=y +CONFIG_OPROFILE=m +# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_OPROFILE_NMI_TIMER=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_OPTPROBES=y +CONFIG_KPROBES_ON_FTRACE=y +CONFIG_UPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y +CONFIG_KRETPROBES=y +CONFIG_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_OPTPROBES=y +CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y +CONFIG_HAVE_NMI=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_SET_DIRECT_MAP=y +CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y +CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y +CONFIG_HAVE_ASM_MODVERSIONS=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_RSEQ=y +CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_PERF_EVENTS_NMI=y +CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y +CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y 
+CONFIG_MMU_GATHER_TABLE_FREE=y +CONFIG_MMU_GATHER_RCU_TABLE_FREE=y +CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y +CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP_FILTER=y +CONFIG_HAVE_ARCH_STACKLEAK=y +CONFIG_HAVE_STACKPROTECTOR=y +CONFIG_CC_HAS_STACKPROTECTOR_NONE=y +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y +CONFIG_HAVE_CONTEXT_TRACKING=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_MOVE_PMD=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_HAVE_ARCH_SOFT_DIRTY=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_HAVE_EXIT_THREAD=y +CONFIG_ARCH_MMAP_RND_BITS=28 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8 +CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_COPY_THREAD_TLS=y +CONFIG_HAVE_STACK_VALIDATION=y +CONFIG_HAVE_RELIABLE_STACKTRACE=y +CONFIG_ISA_BUS_API=y +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +CONFIG_COMPAT_32BIT_TIME=y +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y +CONFIG_ARCH_USE_MEMREMAP_PROT=y +CONFIG_LOCK_EVENT_COUNTS=y +CONFIG_ARCH_HAS_MEM_ENCRYPT=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +# end of GCOV-based kernel profiling + +CONFIG_HAVE_GCC_PLUGINS=y +CONFIG_GCC_PLUGINS=y +# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set +# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set +# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set +# end of General architecture-dependent options + +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULE_SIG_FORMAT=y +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +# CONFIG_MODVERSIONS is not set +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +# CONFIG_MODULE_SIG_FORCE is not set +CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set +# CONFIG_MODULE_SIG_SHA224 is not set +# CONFIG_MODULE_SIG_SHA256 is not set +# CONFIG_MODULE_SIG_SHA384 is not set +CONFIG_MODULE_SIG_SHA512=y +CONFIG_MODULE_SIG_HASH="sha512" +CONFIG_MODULE_COMPRESS=y +# CONFIG_MODULE_COMPRESS_GZIP is not set +CONFIG_MODULE_COMPRESS_XZ=y +CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS=y +CONFIG_UNUSED_SYMBOLS=y +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLK_RQ_ALLOC_TIME=y +CONFIG_BLK_SCSI_REQUEST=y +CONFIG_BLK_CGROUP_RWSTAT=y +CONFIG_BLK_DEV_BSG=y +CONFIG_BLK_DEV_BSGLIB=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_INTEGRITY_T10=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_DEV_THROTTLING_LOW=y +# CONFIG_BLK_CMDLINE_PARSER is not set +CONFIG_BLK_WBT=y +CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_WBT_MQ=y +CONFIG_BLK_DEBUG_FS=y +CONFIG_BLK_DEBUG_FS_ZONED=y +CONFIG_BLK_SED_OPAL=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_AIX_PARTITION=y +# CONFIG_OSF_PARTITION is not set +# CONFIG_AMIGA_PARTITION is not set 
+# CONFIG_ATARI_PARTITION is not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +# CONFIG_UNIXWARE_DISKLABEL is not set +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +# CONFIG_SGI_PARTITION is not set +# CONFIG_ULTRIX_PARTITION is not set +# CONFIG_SUN_PARTITION is not set +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set +# CONFIG_CMDLINE_PARTITION is not set +# end of Partition Types + +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_MQ_VIRTIO=y +CONFIG_BLK_MQ_RDMA=y +CONFIG_BLK_PM=y + +# +# IO Schedulers +# +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +# CONFIG_BFQ_CGROUP_DEBUG is not set +# end of IO Schedulers + +CONFIG_PREEMPT_NOTIFIERS=y +CONFIG_PADATA=y +CONFIG_ASN1=y +CONFIG_UNINLINE_SPIN_UNLOCK=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y +CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y +CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_FREEZER=y + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +CONFIG_BINFMT_MISC=y +CONFIG_COREDUMP=y +# end of Executable file formats + +# +# Memory Management options +# +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_HAVE_FAST_GUP=y +CONFIG_NUMA_KEEP_MEMINFO=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_HAVE_BOOTMEM_INFO_NODE=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTPLUG_SPARSE=y +CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MEMORY_BALLOON=y +CONFIG_BALLOON_COMPACTION=y +CONFIG_COMPACTION=y +CONFIG_PAGE_REPORTING=y +CONFIG_MIGRATION=y +CONFIG_CONTIG_ALLOC=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y +CONFIG_HWPOISON_INJECT=m +CONFIG_TRANSPARENT_HUGEPAGE=y +# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set +CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y +CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_THP_SWAP=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +# CONFIG_CMA is not set +CONFIG_MEM_SOFT_DIRTY=y +CONFIG_ZSWAP=y +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set +CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD is not set +CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4" +# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set +CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD=y +# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set +CONFIG_ZSWAP_ZPOOL_DEFAULT="z3fold" +CONFIG_ZSWAP_DEFAULT_ON=y +CONFIG_ZPOOL=y +CONFIG_ZBUD=y +CONFIG_Z3FOLD=y +CONFIG_ZSMALLOC=y +# CONFIG_ZSMALLOC_PGTABLE_MAPPING is not set +# CONFIG_ZSMALLOC_STAT is not set +CONFIG_GENERIC_EARLY_IOREMAP=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set 
+CONFIG_ARCH_HAS_PTE_DEVMAP=y +CONFIG_ZONE_DEVICE=y +CONFIG_DEV_PAGEMAP_OPS=y +CONFIG_HMM_MIRROR=y +CONFIG_DEVICE_PRIVATE=y +CONFIG_FRAME_VECTOR=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y +CONFIG_ARCH_HAS_PKEYS=y +# CONFIG_PERCPU_STATS is not set +# CONFIG_GUP_BENCHMARK is not set +CONFIG_READ_ONLY_THP_FOR_FS=y +CONFIG_ARCH_HAS_PTE_SPECIAL=y +CONFIG_MAPPING_DIRTY_HELPERS=y +# end of Memory Management options + +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_INGRESS=y +CONFIG_NET_EGRESS=y +CONFIG_NET_REDIRECT=y +CONFIG_SKB_EXTENSIONS=y + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_DIAG=y +CONFIG_UNIX=y +CONFIG_UNIX_SCM=y +CONFIG_UNIX_DIAG=y +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +# CONFIG_TLS_TOE is not set +CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y +CONFIG_XFRM_ALGO=m +CONFIG_XFRM_USER=m +CONFIG_XFRM_INTERFACE=m +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_XFRM_AH=m +CONFIG_XFRM_ESP=m +CONFIG_XFRM_IPCOMP=m +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_XFRM_ESPINTCP=y +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +# CONFIG_IP_FIB_TRIE_STATS is not set +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_CLASSID=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IP_TUNNEL=m +CONFIG_NET_IPGRE=m +# CONFIG_NET_IPGRE_BROADCAST is not set +CONFIG_IP_MROUTE_COMMON=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESP_OFFLOAD=m +CONFIG_INET_ESPINTCP=y +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_DIAG=m +CONFIG_INET_TCP_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_INET_RAW_DIAG=m +CONFIG_INET_DIAG_DESTROY=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=y +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_NV=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m +CONFIG_TCP_CONG_CDG=m +CONFIG_TCP_CONG_BBR=m +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_RENO is not set +CONFIG_DEFAULT_TCP_CONG="cubic" +CONFIG_TCP_MD5SIG=y +CONFIG_IPV6=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESP_OFFLOAD=m +CONFIG_INET6_ESPINTCP=y +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_ILA=m +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y +CONFIG_IPV6_NDISC_NODETYPE=y +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IPV6_SEG6_HMAC=y +CONFIG_IPV6_SEG6_BPF=y +CONFIG_IPV6_RPL_LWTUNNEL=y +CONFIG_NETLABEL=y +CONFIG_MPTCP=y +CONFIG_MPTCP_IPV6=y +# CONFIG_MPTCP_HMAC_TEST is not set +CONFIG_NETWORK_SECMARK=y +CONFIG_NET_PTP_CLASSIFY=y +CONFIG_NETWORK_PHY_TIMESTAMPING=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core 
Netfilter Configuration +# +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_FAMILY_BRIDGE=y +CONFIG_NETFILTER_FAMILY_ARP=y +CONFIG_NETFILTER_NETLINK_ACCT=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NETFILTER_NETLINK_OSF=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_LOG_COMMON=m +CONFIG_NF_LOG_NETDEV=m +CONFIG_NETFILTER_CONNCOUNT=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_GRE=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y +CONFIG_NF_NAT=m +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_CONNLIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_OBJREF=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NF_DUP_NETDEV=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES=m + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +CONFIG_NETFILTER_XT_CONNMARK=m +CONFIG_NETFILTER_XT_SET=m + +# +# Xtables targets +# +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LED=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_RATEEST=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m 
+CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ECN=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +# end of Core Netfilter Configuration + +CONFIG_IP_SET=m +CONFIG_IP_SET_MAX=256 +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPMARK=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_IPMAC=m +CONFIG_IP_SET_HASH_MAC=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=15 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_AH_ESP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_FO=m +CONFIG_IP_VS_OVF=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_MH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS MH scheduler +# +CONFIG_IP_VS_MH_TAB_INDEX=12 + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PE_SIP=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +CONFIG_NF_SOCKET_IPV4=m +CONFIG_NF_TPROXY_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_REJECT_IPV4=m +CONFIG_NFT_DUP_IPV4=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_FLOW_TABLE_IPV4=m +CONFIG_NF_DUP_IPV4=m +CONFIG_NF_LOG_ARP=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_REJECT_IPV4=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_SYNPROXY=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m 
+CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +# end of IP: Netfilter Configuration + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_SOCKET_IPV6=m +CONFIG_NF_TPROXY_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_REJECT_IPV6=m +CONFIG_NFT_DUP_IPV6=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NF_FLOW_TABLE_IPV6=m +CONFIG_NF_DUP_IPV6=m +CONFIG_NF_REJECT_IPV6=m +CONFIG_NF_LOG_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_SRH=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_TARGET_SYNPROXY=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_IP6_NF_TARGET_NPT=m +# end of IPv6: Netfilter Configuration + +CONFIG_NF_DEFRAG_IPV6=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_BRIDGE_REJECT=m +CONFIG_NF_LOG_BRIDGE=m +CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +# CONFIG_BPFILTER is not set +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m + +# +# DCCP CCIDs Configuration +# +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=y +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_TFRC_LIB=y +# end of DCCP CCIDs Configuration + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +# end of DCCP Kernel Hacking + +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_OBJCNT is not set +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set +CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set +CONFIG_SCTP_COOKIE_HMAC_MD5=y +CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +# CONFIG_RDS_DEBUG is not set +CONFIG_TIPC=m +CONFIG_TIPC_MEDIA_IB=y +CONFIG_TIPC_MEDIA_UDP=y +CONFIG_TIPC_CRYPTO=y +CONFIG_TIPC_DIAG=m +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_L2TP=m +# CONFIG_L2TP_DEBUGFS is not set +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_STP=m +CONFIG_GARP=m +CONFIG_MRP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_BRIDGE_MRP=y +CONFIG_HAVE_NET_DSA=y +CONFIG_NET_DSA=m +CONFIG_NET_DSA_TAG_8021Q=m +CONFIG_NET_DSA_TAG_AR9331=m +CONFIG_NET_DSA_TAG_BRCM_COMMON=m +CONFIG_NET_DSA_TAG_BRCM=m +CONFIG_NET_DSA_TAG_BRCM_PREPEND=m +CONFIG_NET_DSA_TAG_GSWIP=m +CONFIG_NET_DSA_TAG_DSA=m +CONFIG_NET_DSA_TAG_EDSA=m +CONFIG_NET_DSA_TAG_MTK=m +CONFIG_NET_DSA_TAG_KSZ=m +CONFIG_NET_DSA_TAG_OCELOT=m +CONFIG_NET_DSA_TAG_QCA=m 
+CONFIG_NET_DSA_TAG_LAN9303=m +CONFIG_NET_DSA_TAG_SJA1105=m +CONFIG_NET_DSA_TAG_TRAILER=m +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_VLAN_8021Q_MVRP=y +# CONFIG_DECNET is not set +CONFIG_LLC=m +CONFIG_LLC2=m +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=m +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +CONFIG_PHONET=m +CONFIG_6LOWPAN=m +# CONFIG_6LOWPAN_DEBUGFS is not set +CONFIG_6LOWPAN_NHC=m +CONFIG_6LOWPAN_NHC_DEST=m +CONFIG_6LOWPAN_NHC_FRAGMENT=m +CONFIG_6LOWPAN_NHC_HOP=m +CONFIG_6LOWPAN_NHC_IPV6=m +CONFIG_6LOWPAN_NHC_MOBILITY=m +CONFIG_6LOWPAN_NHC_ROUTING=m +CONFIG_6LOWPAN_NHC_UDP=m +CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m +CONFIG_6LOWPAN_GHC_UDP=m +CONFIG_6LOWPAN_GHC_ICMPV6=m +CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m +CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m +CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y +CONFIG_IEEE802154_SOCKET=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_CBS=m +CONFIG_NET_SCH_ETF=m +CONFIG_NET_SCH_TAPRIO=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_SKBPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=y +CONFIG_NET_SCH_CAKE=m +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_HHF=m +CONFIG_NET_SCH_PIE=m +CONFIG_NET_SCH_FQ_PIE=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_ETS=m +CONFIG_NET_SCH_DEFAULT=y +# CONFIG_DEFAULT_FQ is not set +# CONFIG_DEFAULT_CODEL is not set +CONFIG_DEFAULT_FQ_CODEL=y +# CONFIG_DEFAULT_SFQ is not set +# CONFIG_DEFAULT_PFIFO_FAST is not set +CONFIG_DEFAULT_NET_SCH="fq_codel" + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=m +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_CANID=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_IPT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_MPLS=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_CTINFO=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m +CONFIG_NET_ACT_GATE=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +CONFIG_NET_TC_SKB_EXT=y +CONFIG_NET_SCH_FIFO=y +CONFIG_DCB=y +CONFIG_DNS_RESOLVER=m +CONFIG_BATMAN_ADV=m +CONFIG_BATMAN_ADV_BATMAN_V=y +CONFIG_BATMAN_ADV_BLA=y +CONFIG_BATMAN_ADV_DAT=y +CONFIG_BATMAN_ADV_NC=y +CONFIG_BATMAN_ADV_MCAST=y +CONFIG_BATMAN_ADV_DEBUGFS=y +# CONFIG_BATMAN_ADV_DEBUG is not set 
+CONFIG_BATMAN_ADV_SYSFS=y +# CONFIG_BATMAN_ADV_TRACING is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m +CONFIG_VSOCKETS=m +CONFIG_VSOCKETS_DIAG=m +CONFIG_VSOCKETS_LOOPBACK=m +CONFIG_VMWARE_VMCI_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS_COMMON=m +CONFIG_HYPERV_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m +CONFIG_HSR=m +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_QRTR=m +CONFIG_QRTR_SMD=m +CONFIG_QRTR_TUN=m +CONFIG_QRTR_MHI=m +CONFIG_NET_NCSI=y +CONFIG_NCSI_OEM_CMD_GET_MAC=y +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_XPS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_STREAM_PARSER=y +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +CONFIG_NET_PKTGEN=m +CONFIG_NET_DROP_MONITOR=y +# end of Network testing +# end of Networking options + +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_MKISS=m +CONFIG_6PACK=m +CONFIG_BPQETHER=m +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m +CONFIG_BAYCOM_PAR=m +CONFIG_YAM=m +# end of AX.25 network device drivers + +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_GW=m +CONFIG_CAN_J1939=m + +# +# CAN Device Drivers +# +CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_CALC_BITTIMING=y +CONFIG_CAN_FLEXCAN=m +CONFIG_CAN_GRCAN=m +CONFIG_CAN_JANZ_ICAN3=m +CONFIG_CAN_KVASER_PCIEFD=m +CONFIG_CAN_C_CAN=m +CONFIG_CAN_C_CAN_PLATFORM=m +CONFIG_CAN_C_CAN_PCI=m +CONFIG_CAN_CC770=m +# CONFIG_CAN_CC770_ISA is not set +CONFIG_CAN_CC770_PLATFORM=m +CONFIG_CAN_IFI_CANFD=m +CONFIG_CAN_M_CAN=m +CONFIG_CAN_M_CAN_PLATFORM=m +CONFIG_CAN_M_CAN_TCAN4X5X=m +CONFIG_CAN_PEAK_PCIEFD=m +CONFIG_CAN_SJA1000=m +CONFIG_CAN_EMS_PCI=m +# CONFIG_CAN_EMS_PCMCIA is not set +CONFIG_CAN_F81601=m +CONFIG_CAN_KVASER_PCI=m +CONFIG_CAN_PEAK_PCI=m +CONFIG_CAN_PEAK_PCIEC=y +CONFIG_CAN_PEAK_PCMCIA=m +CONFIG_CAN_PLX_PCI=m +# CONFIG_CAN_SJA1000_ISA is not set +CONFIG_CAN_SJA1000_PLATFORM=m +CONFIG_CAN_SOFTING=m +CONFIG_CAN_SOFTING_CS=m + +# +# CAN SPI interfaces +# +CONFIG_CAN_HI311X=m +CONFIG_CAN_MCP251X=m +# end of CAN SPI interfaces + +# +# CAN USB interfaces +# +CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_EMS_USB=m +CONFIG_CAN_ESD_USB2=m +CONFIG_CAN_GS_USB=m +CONFIG_CAN_KVASER_USB=m +CONFIG_CAN_MCBA_USB=m +CONFIG_CAN_PEAK_USB=m +CONFIG_CAN_UCAN=m +# end of CAN USB interfaces + +# CONFIG_CAN_DEBUG_DEVICES is not set +# end of CAN Device Drivers + +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_BT_6LOWPAN=m +CONFIG_BT_LEDS=y +CONFIG_BT_MSFTEXT=y +CONFIG_BT_DEBUGFS=y +# CONFIG_BT_SELFTEST is not set + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_QCA=m +CONFIG_BT_HCIBTUSB=m +CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_MTK=y +CONFIG_BT_HCIBTUSB_RTL=y +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_3WIRE=y +CONFIG_BT_HCIUART_INTEL=y 
+CONFIG_BT_HCIUART_BCM=y +CONFIG_BT_HCIUART_RTL=y +CONFIG_BT_HCIUART_QCA=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIUART_MRVL=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_BT_ATH3K=m +CONFIG_BT_MTKSDIO=m +CONFIG_BT_MTKUART=m +CONFIG_BT_HCIRSI=m +# end of Bluetooth device drivers + +CONFIG_AF_RXRPC=m +CONFIG_AF_RXRPC_IPV6=y +# CONFIG_AF_RXRPC_INJECT_LOSS is not set +CONFIG_AF_RXRPC_DEBUG=y +CONFIG_RXKAD=y +CONFIG_AF_KCM=m +CONFIG_STREAM_PARSER=y +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y +CONFIG_CFG80211=m +# CONFIG_NL80211_TESTMODE is not set +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +# CONFIG_CFG80211_CERTIFICATION_ONUS is not set +CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y +CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y +CONFIG_CFG80211_DEFAULT_PS=y +CONFIG_CFG80211_DEBUGFS=y +CONFIG_CFG80211_CRDA_SUPPORT=y +CONFIG_CFG80211_WEXT=y +CONFIG_CFG80211_WEXT_EXPORT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m +CONFIG_LIB80211_CRYPT_TKIP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +CONFIG_MAC80211_MESH=y +CONFIG_MAC80211_LEDS=y +CONFIG_MAC80211_DEBUGFS=y +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_WIMAX=m +CONFIG_WIMAX_DEBUG_LEVEL=8 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=m +CONFIG_NET_9P=m +CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_XEN=m +CONFIG_NET_9P_RDMA=m +# CONFIG_NET_9P_DEBUG is not set +CONFIG_CAIF=m +# CONFIG_CAIF_DEBUG is not set +CONFIG_CAIF_NETDEV=m +CONFIG_CAIF_USB=m +CONFIG_CEPH_LIB=m +CONFIG_CEPH_LIB_PRETTYDEBUG=y +CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y +CONFIG_NFC=m +CONFIG_NFC_DIGITAL=m +CONFIG_NFC_NCI=m +CONFIG_NFC_NCI_SPI=m +CONFIG_NFC_NCI_UART=m +CONFIG_NFC_HCI=m +CONFIG_NFC_SHDLC=y + +# +# Near Field Communication (NFC) devices +# +CONFIG_NFC_TRF7970A=m +CONFIG_NFC_MEI_PHY=m +CONFIG_NFC_SIM=m +CONFIG_NFC_PORT100=m +CONFIG_NFC_FDP=m +CONFIG_NFC_FDP_I2C=m +CONFIG_NFC_PN544=m +CONFIG_NFC_PN544_I2C=m +CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m +CONFIG_NFC_PN532_UART=m +CONFIG_NFC_MICROREAD=m +CONFIG_NFC_MICROREAD_I2C=m +CONFIG_NFC_MICROREAD_MEI=m +CONFIG_NFC_MRVL=m +CONFIG_NFC_MRVL_USB=m +CONFIG_NFC_MRVL_UART=m +CONFIG_NFC_MRVL_I2C=m +CONFIG_NFC_MRVL_SPI=m +CONFIG_NFC_ST21NFCA=m +CONFIG_NFC_ST21NFCA_I2C=m +CONFIG_NFC_ST_NCI=m +CONFIG_NFC_ST_NCI_I2C=m +CONFIG_NFC_ST_NCI_SPI=m +CONFIG_NFC_NXP_NCI=m +CONFIG_NFC_NXP_NCI_I2C=m +CONFIG_NFC_S3FWRN5=m +CONFIG_NFC_S3FWRN5_I2C=m +CONFIG_NFC_ST95HF=m +# end of Near Field Communication (NFC) devices + +CONFIG_PSAMPLE=m +CONFIG_NET_IFE=m +CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y +CONFIG_SOCK_VALIDATE_XMIT=y +CONFIG_NET_SOCK_MSG=y +CONFIG_NET_DEVLINK=y +CONFIG_PAGE_POOL=y +CONFIG_FAILOVER=m +CONFIG_HAVE_EBPF_JIT=y + +# +# Device Drivers +# +CONFIG_HAVE_EISA=y +# CONFIG_EISA is not set +CONFIG_HAVE_PCI=y +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI_PCIE=y +CONFIG_PCIEAER=y +# CONFIG_PCIEAER_INJECT is not set +CONFIG_PCIE_ECRC=y +CONFIG_PCIEASPM=y 
+CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_POWERSAVE is not set +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_PTM=y +# CONFIG_PCIE_BW is not set +CONFIG_PCIE_EDR=y +CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +CONFIG_PCI_QUIRKS=y +# CONFIG_PCI_DEBUG is not set +CONFIG_PCI_REALLOC_ENABLE_AUTO=y +CONFIG_PCI_STUB=y +CONFIG_PCI_PF_STUB=m +CONFIG_XEN_PCIDEV_FRONTEND=m +CONFIG_PCI_ATS=y +CONFIG_PCI_ECAM=y +CONFIG_PCI_LOCKLESS_CONFIG=y +CONFIG_PCI_IOV=y +CONFIG_PCI_PRI=y +CONFIG_PCI_PASID=y +CONFIG_PCI_P2PDMA=y +CONFIG_PCI_LABEL=y +CONFIG_PCI_HYPERV=m +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +CONFIG_HOTPLUG_PCI_ACPI_IBM=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m +CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +CONFIG_HOTPLUG_PCI_SHPC=y + +# +# PCI controller drivers +# +CONFIG_PCI_FTPCI100=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCIE_XILINX=y +CONFIG_VMD=m +CONFIG_PCI_HYPERV_INTERFACE=m + +# +# DesignWare PCI Core Support +# +CONFIG_PCIE_DW=y +CONFIG_PCIE_DW_HOST=y +CONFIG_PCIE_DW_EP=y +CONFIG_PCIE_DW_PLAT=y +CONFIG_PCIE_DW_PLAT_HOST=y +CONFIG_PCIE_DW_PLAT_EP=y +CONFIG_PCIE_INTEL_GW=y +CONFIG_PCI_MESON=y +# end of DesignWare PCI Core Support + +# +# Mobiveil PCIe Core Support +# +# end of Mobiveil PCIe Core Support + +# +# Cadence PCIe controllers support +# +CONFIG_PCIE_CADENCE=y +CONFIG_PCIE_CADENCE_HOST=y +CONFIG_PCIE_CADENCE_EP=y +CONFIG_PCIE_CADENCE_PLAT=y +CONFIG_PCIE_CADENCE_PLAT_HOST=y +CONFIG_PCIE_CADENCE_PLAT_EP=y +# end of Cadence PCIe controllers support +# end of PCI controller drivers + +# +# PCI Endpoint +# +CONFIG_PCI_ENDPOINT=y +CONFIG_PCI_ENDPOINT_CONFIGFS=y +# CONFIG_PCI_EPF_TEST is not set +# end of PCI Endpoint + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m +# end of PCI switch controller drivers + +CONFIG_PCCARD=m +CONFIG_PCMCIA=m +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=m +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +CONFIG_PD6729=m +CONFIG_I82092=m +CONFIG_PCCARD_NONSTATIC=y +CONFIG_RAPIDIO=m +CONFIG_RAPIDIO_TSI721=m +CONFIG_RAPIDIO_DISC_TIMEOUT=30 +CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y +CONFIG_RAPIDIO_DMA_ENGINE=y +# CONFIG_RAPIDIO_DEBUG is not set +CONFIG_RAPIDIO_ENUM_BASIC=m +CONFIG_RAPIDIO_CHMAN=m +CONFIG_RAPIDIO_MPORT_CDEV=m + +# +# RapidIO Switch drivers +# +CONFIG_RAPIDIO_TSI57X=m +CONFIG_RAPIDIO_CPS_XX=m +CONFIG_RAPIDIO_TSI568=m +CONFIG_RAPIDIO_CPS_GEN2=m +CONFIG_RAPIDIO_RXS_GEN3=m +# end of RapidIO Switch drivers + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y + +# +# Firmware loader +# +CONFIG_FW_LOADER=y +CONFIG_FW_LOADER_PAGED_BUF=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_FW_LOADER_USER_HELPER is not set +CONFIG_FW_LOADER_COMPRESS=y +CONFIG_FW_CACHE=y +# end of Firmware loader + +CONFIG_WANT_DEV_COREDUMP=y +CONFIG_ALLOW_DEV_COREDUMP=y +CONFIG_DEV_COREDUMP=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +CONFIG_HMEM_REPORTING=y +# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set +CONFIG_SYS_HYPERVISOR=y +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=y +CONFIG_REGMAP_SLIMBUS=m +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_SPMI=m +CONFIG_REGMAP_W1=m +CONFIG_REGMAP_MMIO=y 
+CONFIG_REGMAP_IRQ=y +CONFIG_REGMAP_SOUNDWIRE=m +CONFIG_REGMAP_SCCB=m +CONFIG_REGMAP_I3C=m +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +# end of Generic Driver Options + +# +# Bus devices +# +CONFIG_MOXTET=m +CONFIG_SIMPLE_PM_BUS=y +CONFIG_MHI_BUS=m +# end of Bus devices + +CONFIG_CONNECTOR=y +CONFIG_PROC_EVENTS=y +CONFIG_GNSS=m +CONFIG_GNSS_SERIAL=m +CONFIG_GNSS_MTK_SERIAL=m +CONFIG_GNSS_SIRF_SERIAL=m +CONFIG_GNSS_UBX_SERIAL=m +CONFIG_MTD=m +CONFIG_MTD_TESTS=m + +# +# Partition parsers +# +CONFIG_MTD_AR7_PARTS=m +CONFIG_MTD_CMDLINE_PARTS=m +CONFIG_MTD_OF_PARTS=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 +# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set +# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set +# end of Partition parsers + +# +# User Modules And Translation Layers +# +CONFIG_MTD_BLKDEVS=m +CONFIG_MTD_BLOCK=m +CONFIG_MTD_BLOCK_RO=m +CONFIG_FTL=m +CONFIG_NFTL=m +CONFIG_NFTL_RW=y +CONFIG_INFTL=m +CONFIG_RFD_FTL=m +CONFIG_SSFDC=m +CONFIG_SM_FTL=m +CONFIG_MTD_OOPS=m +CONFIG_MTD_PSTORE=m +CONFIG_MTD_SWAP=m +CONFIG_MTD_PARTITIONED_MASTER=y + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +CONFIG_MTD_CFI_INTELEXT=m +CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +CONFIG_MTD_CFI_UTIL=m +CONFIG_MTD_RAM=m +CONFIG_MTD_ROM=m +CONFIG_MTD_ABSENT=m +# end of RAM/ROM/Flash chip drivers + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +# CONFIG_MTD_PHYSMAP_COMPAT is not set +CONFIG_MTD_PHYSMAP_OF=y +CONFIG_MTD_PHYSMAP_VERSATILE=y +CONFIG_MTD_PHYSMAP_GEMINI=y +CONFIG_MTD_PHYSMAP_GPIO_ADDR=y +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICHXROM=m +CONFIG_MTD_ESB2ROM=m +CONFIG_MTD_CK804XROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m +CONFIG_MTD_PCMCIA=m +# CONFIG_MTD_PCMCIA_ANONYMOUS is not set +CONFIG_MTD_INTEL_VR_NOR=m +CONFIG_MTD_PLATRAM=m +# end of Mapping drivers for chip access + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m +# CONFIG_MTD_PMC551_BUGFIX is not set +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_DATAFLASH=m +# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set +CONFIG_MTD_DATAFLASH_OTP=y +CONFIG_MTD_MCHP23K256=m +CONFIG_MTD_SST25L=m +CONFIG_MTD_SLRAM=m +CONFIG_MTD_PHRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLOCK2MTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOCG3=m +CONFIG_BCH_CONST_M=14 +CONFIG_BCH_CONST_T=4 +# end of Self-contained MTD device drivers + +CONFIG_MTD_NAND_CORE=m +CONFIG_MTD_ONENAND=m +# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set +CONFIG_MTD_ONENAND_GENERIC=m +CONFIG_MTD_ONENAND_OTP=y +CONFIG_MTD_ONENAND_2X_PROGRAM=y +CONFIG_MTD_NAND_ECC_SW_HAMMING=m +CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y +CONFIG_MTD_RAW_NAND=m +CONFIG_MTD_NAND_ECC_SW_BCH=y + +# +# Raw/parallel NAND flash controllers +# +CONFIG_MTD_NAND_DENALI=m +CONFIG_MTD_NAND_DENALI_PCI=m +CONFIG_MTD_NAND_DENALI_DT=m +CONFIG_MTD_NAND_CAFE=m +CONFIG_MTD_NAND_MXIC=m +CONFIG_MTD_NAND_GPIO=m +CONFIG_MTD_NAND_PLATFORM=m +CONFIG_MTD_NAND_CADENCE=m +CONFIG_MTD_NAND_ARASAN=m + +# +# Misc +# +CONFIG_MTD_SM_COMMON=m +CONFIG_MTD_NAND_NANDSIM=m +CONFIG_MTD_NAND_RICOH=m +CONFIG_MTD_NAND_DISKONCHIP=m +# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set 
+CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 +CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y +CONFIG_MTD_SPI_NAND=m + +# +# LPDDR & LPDDR2 PCM memory drivers +# +CONFIG_MTD_LPDDR=m +CONFIG_MTD_QINFO_PROBE=m +# end of LPDDR & LPDDR2 PCM memory drivers + +CONFIG_MTD_SPI_NOR=m +CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y +CONFIG_SPI_INTEL_SPI=m +CONFIG_SPI_INTEL_SPI_PCI=m +CONFIG_SPI_INTEL_SPI_PLATFORM=m +CONFIG_MTD_UBI=m +CONFIG_MTD_UBI_WL_THRESHOLD=4096 +CONFIG_MTD_UBI_BEB_LIMIT=20 +CONFIG_MTD_UBI_FASTMAP=y +CONFIG_MTD_UBI_GLUEBI=m +CONFIG_MTD_UBI_BLOCK=y +CONFIG_MTD_HYPERBUS=m +CONFIG_DTC=y +CONFIG_OF=y +# CONFIG_OF_UNITTEST is not set +CONFIG_OF_FLATTREE=y +CONFIG_OF_EARLY_FLATTREE=y +CONFIG_OF_KOBJ=y +CONFIG_OF_DYNAMIC=y +CONFIG_OF_ADDRESS=y +CONFIG_OF_IRQ=y +CONFIG_OF_NET=y +CONFIG_OF_MDIO=m +CONFIG_OF_RESERVED_MEM=y +CONFIG_OF_RESOLVE=y +CONFIG_OF_OVERLAY=y +CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_PC_PCMCIA=m +CONFIG_PARPORT_AX88796=m +CONFIG_PARPORT_1284=y +CONFIG_PARPORT_NOT_PC=y +CONFIG_PNP=y +CONFIG_PNP_DEBUG_MESSAGES=y + +# +# Protocols +# +CONFIG_PNPACPI=y +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_BLK_DEV_FD=m +CONFIG_CDROM=m +# CONFIG_PARIDE is not set +CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m +CONFIG_ZRAM=m +CONFIG_ZRAM_WRITEBACK=y +# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m +# CONFIG_DRBD_FAULT_INJECTION is not set +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_SKD=m +CONFIG_BLK_DEV_SX8=m +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_CDROM_PKTCDVD=m +CONFIG_CDROM_PKTCDVD_BUFFERS=8 +# CONFIG_CDROM_PKTCDVD_WCACHE is not set +CONFIG_ATA_OVER_ETH=m +CONFIG_XEN_BLKDEV_FRONTEND=m +CONFIG_XEN_BLKDEV_BACKEND=m +CONFIG_VIRTIO_BLK=m +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_RSXX=m +CONFIG_BLK_DEV_RNBD=y +CONFIG_BLK_DEV_RNBD_CLIENT=m +CONFIG_BLK_DEV_RNBD_SERVER=m + +# +# NVME Support +# +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y +CONFIG_NVME_MULTIPATH=y +CONFIG_NVME_HWMON=y +CONFIG_NVME_FABRICS=m +CONFIG_NVME_RDMA=m +CONFIG_NVME_FC=m +CONFIG_NVME_TCP=m +CONFIG_NVME_TARGET=m +CONFIG_NVME_TARGET_LOOP=m +CONFIG_NVME_TARGET_RDMA=m +CONFIG_NVME_TARGET_FC=m +CONFIG_NVME_TARGET_FCLOOP=m +CONFIG_NVME_TARGET_TCP=m +# end of NVME Support + +# +# Misc devices +# +CONFIG_SENSORS_LIS3LV02D=m +CONFIG_AD525X_DPOT=m +CONFIG_AD525X_DPOT_I2C=m +CONFIG_AD525X_DPOT_SPI=m +# CONFIG_DUMMY_IRQ is not set +CONFIG_IBM_ASM=m +CONFIG_PHANTOM=m +CONFIG_TIFM_CORE=m +CONFIG_TIFM_7XX1=m +CONFIG_ICS932S401=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_HP_ILO=m +CONFIG_APDS9802ALS=m +CONFIG_ISL29003=m +CONFIG_ISL29020=m +CONFIG_SENSORS_TSL2550=m +CONFIG_SENSORS_BH1770=m +CONFIG_SENSORS_APDS990X=m +CONFIG_HMC6352=m +CONFIG_DS1682=m +CONFIG_VMWARE_BALLOON=m +CONFIG_LATTICE_ECP3_CONFIG=m +# CONFIG_SRAM is not set +CONFIG_PCI_ENDPOINT_TEST=m +CONFIG_XILINX_SDFEC=m +CONFIG_MISC_RTSX=m +CONFIG_PVPANIC=m +CONFIG_C2PORT=m +CONFIG_C2PORT_DURAMAR_2150=m + +# +# EEPROM support +# +CONFIG_EEPROM_AT24=m +# CONFIG_EEPROM_AT25 is not set +CONFIG_EEPROM_LEGACY=m +CONFIG_EEPROM_MAX6875=m +CONFIG_EEPROM_93CX6=m +# CONFIG_EEPROM_93XX46 is not set +CONFIG_EEPROM_IDT_89HPESX=m +CONFIG_EEPROM_EE1004=m +# end of EEPROM support + +CONFIG_CB710_CORE=m +# CONFIG_CB710_DEBUG is not set +CONFIG_CB710_DEBUG_ASSUMPTIONS=y + +# +# Texas Instruments shared transport line 
discipline +# +CONFIG_TI_ST=m +# end of Texas Instruments shared transport line discipline + +CONFIG_SENSORS_LIS3_I2C=m +CONFIG_ALTERA_STAPL=m +CONFIG_INTEL_MEI=m +CONFIG_INTEL_MEI_ME=m +CONFIG_INTEL_MEI_TXE=m +CONFIG_INTEL_MEI_HDCP=m +CONFIG_VMWARE_VMCI=m + +# +# Intel MIC & related support +# +CONFIG_INTEL_MIC_BUS=m +CONFIG_SCIF_BUS=m +CONFIG_VOP_BUS=m +CONFIG_INTEL_MIC_HOST=m +CONFIG_INTEL_MIC_CARD=m +CONFIG_SCIF=m +CONFIG_MIC_COSM=m +CONFIG_VOP=m +# end of Intel MIC & related support + +CONFIG_GENWQE=m +CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 +CONFIG_ECHO=m +CONFIG_MISC_ALCOR_PCI=m +CONFIG_MISC_RTSX_PCI=m +CONFIG_MISC_RTSX_USB=m +CONFIG_HABANA_AI=m +CONFIG_UACCE=m +# end of Misc devices + +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +CONFIG_SCSI_MOD=y +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=y +CONFIG_SCSI_DMA=y +CONFIG_SCSI_NETLINK=y +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=y +CONFIG_CHR_DEV_ST=m +CONFIG_BLK_DEV_SR=m +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y + +# +# SCSI Transports +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SAS_ATA=y +CONFIG_SCSI_SAS_HOST_SMP=y +CONFIG_SCSI_SRP_ATTRS=m +# end of SCSI Transports + +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=m +CONFIG_ISCSI_BOOT_SYSFS=m +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_SCSI_BNX2X_FCOE=m +CONFIG_BE2ISCSI=m +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m +CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +CONFIG_AIC7XXX_DEBUG_ENABLE=y +CONFIG_AIC7XXX_DEBUG_MASK=0 +CONFIG_AIC7XXX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +CONFIG_AIC79XX_DEBUG_ENABLE=y +CONFIG_AIC79XX_DEBUG_MASK=0 +CONFIG_AIC79XX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC94XX=m +CONFIG_AIC94XX_DEBUG=y +CONFIG_SCSI_MVSAS=m +CONFIG_SCSI_MVSAS_DEBUG=y +CONFIG_SCSI_MVSAS_TASKLET=y +CONFIG_SCSI_MVUMI=m +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_ARCMSR=m +CONFIG_SCSI_ESAS2R=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT3SAS=m +CONFIG_SCSI_MPT2SAS_MAX_SGE=128 +CONFIG_SCSI_MPT3SAS_MAX_SGE=128 +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_SMARTPQI=m +CONFIG_SCSI_UFSHCD=m +CONFIG_SCSI_UFSHCD_PCI=m +# CONFIG_SCSI_UFS_DWC_TC_PCI is not set +CONFIG_SCSI_UFSHCD_PLATFORM=m +CONFIG_SCSI_UFS_CDNS_PLATFORM=m +# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set +CONFIG_SCSI_UFS_BSG=y +CONFIG_SCSI_HPTIOP=m +CONFIG_SCSI_BUSLOGIC=m +CONFIG_SCSI_FLASHPOINT=y +CONFIG_SCSI_MYRB=m +CONFIG_SCSI_MYRS=m +CONFIG_VMWARE_PVSCSI=m +CONFIG_XEN_SCSI_FRONTEND=m +CONFIG_HYPERV_STORAGE=m +CONFIG_LIBFC=m +CONFIG_LIBFCOE=m +CONFIG_FCOE=m +CONFIG_FCOE_FNIC=m +CONFIG_SCSI_SNIC=m +# CONFIG_SCSI_SNIC_DEBUG_FS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_FDOMAIN=m +CONFIG_SCSI_FDOMAIN_PCI=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_ISCI=m +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_STEX=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 
+CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +CONFIG_SCSI_SYM53C8XX_MMIO=y +CONFIG_SCSI_IPR=m +CONFIG_SCSI_IPR_TRACE=y +CONFIG_SCSI_IPR_DUMP=y +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA_FC=m +CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_QEDI=m +CONFIG_QEDF=m +CONFIG_SCSI_LPFC=m +# CONFIG_SCSI_LPFC_DEBUG_FS is not set +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_WD719X=m +CONFIG_SCSI_DEBUG=m +CONFIG_SCSI_PMCRAID=m +CONFIG_SCSI_PM8001=m +CONFIG_SCSI_BFA_FC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_CHELSIO_FCOE=m +CONFIG_SCSI_LOWLEVEL_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_QLOGIC=m +CONFIG_PCMCIA_SYM53C500=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +# end of SCSI device support + +CONFIG_ATA=y +CONFIG_SATA_HOST=y +CONFIG_PATA_TIMINGS=y +CONFIG_ATA_VERBOSE_ERROR=y +CONFIG_ATA_FORCE=y +CONFIG_ATA_ACPI=y +CONFIG_SATA_ZPODD=y +CONFIG_SATA_PMP=y + +# +# Controllers with non-SFF native interface +# +CONFIG_SATA_AHCI=y +CONFIG_SATA_MOBILE_LPM_POLICY=3 +CONFIG_SATA_AHCI_PLATFORM=m +CONFIG_AHCI_CEVA=m +CONFIG_AHCI_QORIQ=m +CONFIG_SATA_INIC162X=m +CONFIG_SATA_ACARD_AHCI=m +CONFIG_SATA_SIL24=m +CONFIG_ATA_SFF=y + +# +# SFF controllers with custom DMA interface +# +CONFIG_PDC_ADMA=m +CONFIG_SATA_QSTOR=m +CONFIG_SATA_SX4=m +CONFIG_ATA_BMDMA=y + +# +# SATA SFF controllers with BMDMA +# +CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set +CONFIG_SATA_MV=m +CONFIG_SATA_NV=m +CONFIG_SATA_PROMISE=m +CONFIG_SATA_SIL=m +CONFIG_SATA_SIS=m +CONFIG_SATA_SVW=m +CONFIG_SATA_ULI=m +CONFIG_SATA_VIA=m +CONFIG_SATA_VITESSE=m + +# +# PATA SFF controllers with BMDMA +# +CONFIG_PATA_ALI=m +CONFIG_PATA_AMD=m +CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATIIXP=m +CONFIG_PATA_ATP867X=m +CONFIG_PATA_CMD64X=m +CONFIG_PATA_CYPRESS=m +CONFIG_PATA_EFAR=m +CONFIG_PATA_HPT366=m +CONFIG_PATA_HPT37X=m +CONFIG_PATA_HPT3X2N=m +CONFIG_PATA_HPT3X3=m +CONFIG_PATA_HPT3X3_DMA=y +CONFIG_PATA_IT8213=m +CONFIG_PATA_IT821X=m +CONFIG_PATA_JMICRON=m +CONFIG_PATA_MARVELL=m +CONFIG_PATA_NETCELL=m +CONFIG_PATA_NINJA32=m +CONFIG_PATA_NS87415=m +CONFIG_PATA_OLDPIIX=m +CONFIG_PATA_OPTIDMA=m +CONFIG_PATA_PDC2027X=m +CONFIG_PATA_PDC_OLD=m +CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m +CONFIG_PATA_SCH=m +CONFIG_PATA_SERVERWORKS=m +CONFIG_PATA_SIL680=m +CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m +CONFIG_PATA_TRIFLEX=m +CONFIG_PATA_VIA=m +CONFIG_PATA_WINBOND=m + +# +# PIO-only SFF controllers +# +CONFIG_PATA_CMD640_PCI=m +CONFIG_PATA_MPIIX=m +CONFIG_PATA_NS87410=m +CONFIG_PATA_OPTI=m +CONFIG_PATA_PCMCIA=m +# CONFIG_PATA_PLATFORM is not set +CONFIG_PATA_RZ1000=m + +# +# Generic fallback / legacy drivers +# +CONFIG_PATA_ACPI=m +CONFIG_ATA_GENERIC=m +CONFIG_PATA_LEGACY=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +# CONFIG_BCACHE_DEBUG is not set +# CONFIG_BCACHE_CLOSURES_DEBUG is not set +CONFIG_BCACHE_ASYNC_REGISTRAION=y +CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +CONFIG_DM_DEBUG=y +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m +CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_CACHE=m +CONFIG_DM_CACHE_SMQ=m 
+CONFIG_DM_WRITECACHE=m +CONFIG_DM_EBS=m +CONFIG_DM_ERA=m +CONFIG_DM_CLONE=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_DELAY=m +CONFIG_DM_DUST=m +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DM_SWITCH=m +CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_TARGET_CORE=m +CONFIG_TCM_IBLOCK=m +CONFIG_TCM_FILEIO=m +CONFIG_TCM_PSCSI=m +CONFIG_TCM_USER2=m +CONFIG_LOOPBACK_TARGET=m +CONFIG_TCM_FC=m +CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m +CONFIG_SBP_TARGET=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m +CONFIG_FUSION_MAX_SGE=128 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set + +# +# IEEE 1394 (FireWire) support +# +CONFIG_FIREWIRE=m +CONFIG_FIREWIRE_OHCI=m +CONFIG_FIREWIRE_SBP2=m +CONFIG_FIREWIRE_NET=m +CONFIG_FIREWIRE_NOSY=m +# end of IEEE 1394 (FireWire) support + +CONFIG_MACINTOSH_DRIVERS=y +CONFIG_MAC_EMUMOUSEBTN=m +CONFIG_NETDEVICES=y +CONFIG_MII=m +CONFIG_NET_CORE=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_WIREGUARD=m +# CONFIG_WIREGUARD_DEBUG is not set +CONFIG_EQUALIZER=m +CONFIG_NET_FC=y +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m +CONFIG_NET_TEAM_MODE_LOADBALANCE=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_IPVLAN_L3S=y +CONFIG_IPVLAN=m +CONFIG_IPVTAP=m +CONFIG_VXLAN=m +CONFIG_GENEVE=m +CONFIG_BAREUDP=m +CONFIG_GTP=m +CONFIG_MACSEC=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +CONFIG_NTB_NETDEV=m +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 +CONFIG_TUN=m +CONFIG_TAP=m +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m +CONFIG_SUNGEM_PHY=m +# CONFIG_ARCNET is not set +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m +# CONFIG_ATM_NICSTAR_USE_SUNI is not set +# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E=m +CONFIG_ATM_FORE200E_USE_TASKLET=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y +CONFIG_ATM_SOLOS=m +CONFIG_CAIF_DRIVERS=y +CONFIG_CAIF_TTY=m +CONFIG_CAIF_SPI_SLAVE=m +CONFIG_CAIF_SPI_SYNC=y +CONFIG_CAIF_HSI=m +CONFIG_CAIF_VIRTIO=m + +# +# Distributed Switch Architecture drivers +# +CONFIG_B53=m +# CONFIG_B53_SPI_DRIVER is not set +CONFIG_B53_MDIO_DRIVER=m +CONFIG_B53_MMAP_DRIVER=m +CONFIG_B53_SRAB_DRIVER=m +CONFIG_B53_SERDES=m +CONFIG_NET_DSA_BCM_SF2=m +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_LANTIQ_GSWIP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_MV88E6060=m +CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477=m 
+CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m +CONFIG_NET_DSA_MV88E6XXX=m +CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y +CONFIG_NET_DSA_MV88E6XXX_PTP=y +CONFIG_NET_DSA_AR9331=m +CONFIG_NET_DSA_SJA1105=m +CONFIG_NET_DSA_SJA1105_PTP=y +CONFIG_NET_DSA_SJA1105_TAS=y +CONFIG_NET_DSA_SJA1105_VL=y +CONFIG_NET_DSA_QCA8K=m +CONFIG_NET_DSA_REALTEK_SMI=m +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m +CONFIG_NET_DSA_VITESSE_VSC73XX=m +CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m +CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m +# end of Distributed Switch Architecture drivers + +CONFIG_ETHERNET=y +CONFIG_MDIO=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_3C589=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_NET_VENDOR_ADAPTEC=y +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_NET_VENDOR_AGERE=y +CONFIG_ET131X=m +CONFIG_NET_VENDOR_ALACRITECH=y +CONFIG_SLICOSS=m +CONFIG_NET_VENDOR_ALTEON=y +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_ALTERA_TSE=m +CONFIG_NET_VENDOR_AMAZON=y +CONFIG_ENA_ETHERNET=m +CONFIG_NET_VENDOR_AMD=y +CONFIG_AMD8111_ETH=m +CONFIG_PCNET32=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_AMD_XGBE=m +CONFIG_AMD_XGBE_DCB=y +CONFIG_AMD_XGBE_HAVE_ECC=y +CONFIG_NET_VENDOR_AQUANTIA=y +CONFIG_AQTION=m +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ATHEROS=y +CONFIG_ATL2=m +CONFIG_ATL1=m +CONFIG_ATL1E=m +CONFIG_ATL1C=m +CONFIG_ALX=m +CONFIG_NET_VENDOR_AURORA=y +CONFIG_AURORA_NB8800=m +CONFIG_NET_VENDOR_BROADCOM=y +CONFIG_B44=m +CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y +CONFIG_BCMGENET=m +CONFIG_BNX2=m +CONFIG_CNIC=m +CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y +CONFIG_BNX2X=m +CONFIG_BNX2X_SRIOV=y +CONFIG_SYSTEMPORT=m +CONFIG_BNXT=m +CONFIG_BNXT_SRIOV=y +CONFIG_BNXT_FLOWER_OFFLOAD=y +CONFIG_BNXT_DCB=y +CONFIG_BNXT_HWMON=y +CONFIG_NET_VENDOR_BROCADE=y +CONFIG_BNA=m +CONFIG_NET_VENDOR_CADENCE=y +CONFIG_MACB=m +CONFIG_MACB_USE_HWSTAMP=y +CONFIG_MACB_PCI=m +CONFIG_NET_VENDOR_CAVIUM=y +CONFIG_THUNDER_NIC_PF=m +CONFIG_THUNDER_NIC_VF=m +CONFIG_THUNDER_NIC_BGX=m +CONFIG_THUNDER_NIC_RGX=m +CONFIG_CAVIUM_PTP=m +CONFIG_LIQUIDIO=m +CONFIG_LIQUIDIO_VF=m +CONFIG_NET_VENDOR_CHELSIO=y +CONFIG_CHELSIO_T1=m +CONFIG_CHELSIO_T1_1G=y +CONFIG_CHELSIO_T3=m +CONFIG_CHELSIO_T4=m +CONFIG_CHELSIO_T4_DCB=y +CONFIG_CHELSIO_T4_FCOE=y +CONFIG_CHELSIO_T4VF=m +CONFIG_CHELSIO_LIB=m +CONFIG_NET_VENDOR_CISCO=y +CONFIG_ENIC=m +CONFIG_NET_VENDOR_CORTINA=y +CONFIG_GEMINI_ETHERNET=m +CONFIG_CX_ECAT=m +CONFIG_DNET=m +CONFIG_NET_VENDOR_DEC=y +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_DE2104X_DSL=0 +CONFIG_TULIP=m +CONFIG_TULIP_MWI=y +CONFIG_TULIP_MMIO=y +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_ULI526X=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_NET_VENDOR_DLINK=y +CONFIG_DL2K=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_NET_VENDOR_EMULEX=y +CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y +CONFIG_BE2NET_BE2=y +CONFIG_BE2NET_BE3=y +CONFIG_BE2NET_LANCER=y +CONFIG_BE2NET_SKYHAWK=y +CONFIG_NET_VENDOR_EZCHIP=y +CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_FUJITSU=y +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_NET_VENDOR_GOOGLE=y +CONFIG_GVE=m +CONFIG_NET_VENDOR_HUAWEI=y +CONFIG_HINIC=m +CONFIG_NET_VENDOR_I825XX=y +CONFIG_NET_VENDOR_INTEL=y +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_E1000E_HWTS=y +CONFIG_IGB=m +CONFIG_IGB_HWMON=y +CONFIG_IGB_DCA=y +CONFIG_IGBVF=m 
+CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_IXGBE_HWMON=y +CONFIG_IXGBE_DCA=y +CONFIG_IXGBE_DCB=y +# CONFIG_IXGBE_IPSEC is not set +CONFIG_IXGBEVF=m +CONFIG_IXGBEVF_IPSEC=y +CONFIG_I40E=m +CONFIG_I40E_DCB=y +CONFIG_IAVF=m +CONFIG_I40EVF=m +CONFIG_ICE=m +CONFIG_FM10K=m +CONFIG_IGC=m +CONFIG_JME=m +CONFIG_NET_VENDOR_MARVELL=y +CONFIG_MVMDIO=m +CONFIG_SKGE=m +# CONFIG_SKGE_DEBUG is not set +CONFIG_SKGE_GENESIS=y +CONFIG_SKY2=m +# CONFIG_SKY2_DEBUG is not set +CONFIG_NET_VENDOR_MELLANOX=y +CONFIG_MLX4_EN=m +CONFIG_MLX4_EN_DCB=y +CONFIG_MLX4_CORE=m +CONFIG_MLX4_DEBUG=y +CONFIG_MLX4_CORE_GEN2=y +CONFIG_MLX5_CORE=m +CONFIG_MLX5_ACCEL=y +CONFIG_MLX5_FPGA=y +CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_EN_ARFS=y +CONFIG_MLX5_EN_RXNFC=y +CONFIG_MLX5_MPFS=y +CONFIG_MLX5_ESWITCH=y +CONFIG_MLX5_CLS_ACT=y +CONFIG_MLX5_TC_CT=y +CONFIG_MLX5_CORE_EN_DCB=y +CONFIG_MLX5_CORE_IPOIB=y +CONFIG_MLX5_FPGA_IPSEC=y +CONFIG_MLX5_EN_IPSEC=y +CONFIG_MLX5_FPGA_TLS=y +CONFIG_MLX5_TLS=y +CONFIG_MLX5_EN_TLS=y +CONFIG_MLX5_SW_STEERING=y +CONFIG_MLXSW_CORE=m +CONFIG_MLXSW_CORE_HWMON=y +CONFIG_MLXSW_CORE_THERMAL=y +CONFIG_MLXSW_PCI=m +CONFIG_MLXSW_I2C=m +CONFIG_MLXSW_SWITCHIB=m +CONFIG_MLXSW_SWITCHX2=m +CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y +CONFIG_MLXSW_MINIMAL=m +CONFIG_MLXFW=m +CONFIG_NET_VENDOR_MICREL=y +CONFIG_KS8842=m +CONFIG_KS8851=m +CONFIG_KS8851_MLL=m +CONFIG_KSZ884X_PCI=m +CONFIG_NET_VENDOR_MICROCHIP=y +CONFIG_ENC28J60=m +# CONFIG_ENC28J60_WRITEVERIFY is not set +CONFIG_ENCX24J600=m +CONFIG_LAN743X=m +CONFIG_NET_VENDOR_MICROSEMI=y +CONFIG_MSCC_OCELOT_SWITCH=m +CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m +CONFIG_NET_VENDOR_MYRI=y +CONFIG_MYRI10GE=m +CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m +CONFIG_NET_VENDOR_NATSEMI=y +CONFIG_NATSEMI=m +CONFIG_NS83820=m +CONFIG_NET_VENDOR_NETERION=y +CONFIG_S2IO=m +CONFIG_VXGE=m +# CONFIG_VXGE_DEBUG_TRACE_ALL is not set +CONFIG_NET_VENDOR_NETRONOME=y +CONFIG_NFP=m +CONFIG_NFP_APP_FLOWER=y +CONFIG_NFP_APP_ABM_NIC=y +# CONFIG_NFP_DEBUG is not set +CONFIG_NET_VENDOR_NI=y +CONFIG_NI_XGE_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_8390=y +CONFIG_PCMCIA_AXNET=m +CONFIG_NE2K_PCI=m +CONFIG_PCMCIA_PCNET=m +CONFIG_NET_VENDOR_NVIDIA=y +CONFIG_FORCEDETH=m +CONFIG_NET_VENDOR_OKI=y +CONFIG_ETHOC=m +CONFIG_NET_VENDOR_PACKET_ENGINES=y +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_NET_VENDOR_PENSANDO=y +CONFIG_IONIC=m +CONFIG_NET_VENDOR_QLOGIC=y +CONFIG_QLA3XXX=m +CONFIG_QLCNIC=m +CONFIG_QLCNIC_SRIOV=y +CONFIG_QLCNIC_DCB=y +CONFIG_QLCNIC_HWMON=y +CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QED_LL2=y +CONFIG_QED_SRIOV=y +CONFIG_QEDE=m +CONFIG_QED_RDMA=y +CONFIG_QED_ISCSI=y +CONFIG_QED_FCOE=y +CONFIG_QED_OOO=y +CONFIG_NET_VENDOR_QUALCOMM=y +CONFIG_QCA7000=m +CONFIG_QCA7000_SPI=m +CONFIG_QCA7000_UART=m +CONFIG_QCOM_EMAC=m +CONFIG_RMNET=m +CONFIG_NET_VENDOR_RDC=y +CONFIG_R6040=m +CONFIG_NET_VENDOR_REALTEK=y +CONFIG_ATP=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_TUNE_TWISTER=y +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_R8169=m +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_ROCKER=m +CONFIG_NET_VENDOR_SAMSUNG=y +CONFIG_SXGBE_ETH=m +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SOLARFLARE=y +CONFIG_SFC=m +CONFIG_SFC_MTD=y +CONFIG_SFC_MCDI_MON=y +CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y +CONFIG_SFC_FALCON=m +CONFIG_SFC_FALCON_MTD=y +CONFIG_NET_VENDOR_SILAN=y +CONFIG_SC92031=m +CONFIG_NET_VENDOR_SIS=y +CONFIG_SIS900=m +CONFIG_SIS190=m +CONFIG_NET_VENDOR_SMSC=y +CONFIG_PCMCIA_SMC91C92=m +CONFIG_EPIC100=m +CONFIG_SMSC911X=m 
+CONFIG_SMSC9420=m +CONFIG_NET_VENDOR_SOCIONEXT=y +CONFIG_NET_VENDOR_STMICRO=y +CONFIG_STMMAC_ETH=m +# CONFIG_STMMAC_SELFTESTS is not set +CONFIG_STMMAC_PLATFORM=m +CONFIG_DWMAC_DWC_QOS_ETH=m +CONFIG_DWMAC_GENERIC=m +CONFIG_DWMAC_INTEL=m +CONFIG_STMMAC_PCI=m +CONFIG_NET_VENDOR_SUN=y +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_CASSINI=m +CONFIG_NIU=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m +CONFIG_NET_VENDOR_TEHUTI=y +CONFIG_TEHUTI=m +CONFIG_NET_VENDOR_TI=y +# CONFIG_TI_CPSW_PHY_SEL is not set +CONFIG_TLAN=m +CONFIG_NET_VENDOR_VIA=y +CONFIG_VIA_RHINE=m +CONFIG_VIA_RHINE_MMIO=y +CONFIG_VIA_VELOCITY=m +CONFIG_NET_VENDOR_WIZNET=y +CONFIG_WIZNET_W5100=m +CONFIG_WIZNET_W5300=m +# CONFIG_WIZNET_BUS_DIRECT is not set +# CONFIG_WIZNET_BUS_INDIRECT is not set +CONFIG_WIZNET_BUS_ANY=y +CONFIG_WIZNET_W5100_SPI=m +CONFIG_NET_VENDOR_XILINX=y +CONFIG_XILINX_AXI_EMAC=m +CONFIG_XILINX_LL_TEMAC=m +CONFIG_NET_VENDOR_XIRCOM=y +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_FDDI=m +CONFIG_DEFXX=m +CONFIG_DEFXX_MMIO=y +CONFIG_SKFP=m +# CONFIG_HIPPI is not set +CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +CONFIG_MDIO_BCM_UNIMAC=m +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_BUS_MUX=m +CONFIG_MDIO_BUS_MUX_GPIO=m +CONFIG_MDIO_BUS_MUX_MMIOREG=m +CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m +CONFIG_MDIO_CAVIUM=m +CONFIG_MDIO_GPIO=m +CONFIG_MDIO_HISI_FEMAC=m +CONFIG_MDIO_I2C=m +CONFIG_MDIO_IPQ4019=m +CONFIG_MDIO_IPQ8064=m +CONFIG_MDIO_MSCC_MIIM=m +CONFIG_MDIO_MVUSB=m +CONFIG_MDIO_OCTEON=m +CONFIG_MDIO_THUNDER=m +CONFIG_MDIO_XPCS=m +CONFIG_PHYLINK=m +CONFIG_PHYLIB=m +CONFIG_SWPHY=y +CONFIG_LED_TRIGGER_PHY=y + +# +# MII PHY device drivers +# +CONFIG_SFP=m +CONFIG_ADIN_PHY=m +CONFIG_AMD_PHY=m +CONFIG_AQUANTIA_PHY=m +CONFIG_AX88796B_PHY=m +CONFIG_BCM7XXX_PHY=m +CONFIG_BCM87XX_PHY=m +CONFIG_BCM_NET_PHYLIB=m +CONFIG_BROADCOM_PHY=m +CONFIG_BCM54140_PHY=m +CONFIG_BCM84881_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_CORTINA_PHY=m +CONFIG_DAVICOM_PHY=m +CONFIG_DP83822_PHY=m +CONFIG_DP83TC811_PHY=m +CONFIG_DP83848_PHY=m +CONFIG_DP83867_PHY=m +CONFIG_DP83869_PHY=m +CONFIG_FIXED_PHY=m +CONFIG_ICPLUS_PHY=m +CONFIG_INTEL_XWAY_PHY=m +CONFIG_LSI_ET1011C_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_MARVELL_10G_PHY=m +CONFIG_MICREL_PHY=m +CONFIG_MICROCHIP_PHY=m +CONFIG_MICROCHIP_T1_PHY=m +CONFIG_MICROSEMI_PHY=m +CONFIG_NATIONAL_PHY=m +CONFIG_NXP_TJA11XX_PHY=m +CONFIG_AT803X_PHY=m +CONFIG_QSEMI_PHY=m +CONFIG_REALTEK_PHY=m +CONFIG_RENESAS_PHY=m +CONFIG_ROCKCHIP_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_STE10XP=m +CONFIG_TERANETICS_PHY=m +CONFIG_VITESSE_PHY=m +CONFIG_XILINX_GMII2RGMII=m +CONFIG_MICREL_KS8995MA=m +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOATM=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_SLIP=m +CONFIG_SLHC=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y +CONFIG_USB_NET_DRIVERS=m +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=m +CONFIG_USB_USBNET=m +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_CDC_EEM=m +CONFIG_USB_NET_CDC_NCM=m +CONFIG_USB_NET_HUAWEI_CDC_NCM=m +CONFIG_USB_NET_CDC_MBIM=m +CONFIG_USB_NET_DM9601=m +CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=m +CONFIG_USB_NET_GL620A=m +CONFIG_USB_NET_NET1080=m +CONFIG_USB_NET_PLUSB=m 
+CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +CONFIG_USB_NET_CDC_SUBSET_ENABLE=m +CONFIG_USB_NET_CDC_SUBSET=m +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_KC2190=y +CONFIG_USB_NET_ZAURUS=m +CONFIG_USB_NET_CX82310_ETH=m +CONFIG_USB_NET_KALMIA=m +CONFIG_USB_NET_QMI_WWAN=m +CONFIG_USB_HSO=m +CONFIG_USB_NET_INT51X1=m +CONFIG_USB_CDC_PHONET=m +CONFIG_USB_IPHETH=m +CONFIG_USB_SIERRA_NET=m +CONFIG_USB_VL600=m +CONFIG_USB_NET_CH9200=m +CONFIG_USB_NET_AQC111=m +CONFIG_WLAN=y +# CONFIG_WIRELESS_WDS is not set +CONFIG_WLAN_VENDOR_ADMTEK=y +CONFIG_ADM8211=m +CONFIG_ATH_COMMON=m +CONFIG_WLAN_VENDOR_ATH=y +# CONFIG_ATH_DEBUG is not set +CONFIG_ATH5K=m +CONFIG_ATH5K_DEBUG=y +CONFIG_ATH5K_TRACER=y +CONFIG_ATH5K_PCI=y +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_COMMON_DEBUG=y +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +CONFIG_ATH9K_DEBUGFS=y +CONFIG_ATH9K_STATION_STATISTICS=y +CONFIG_ATH9K_DYNACK=y +CONFIG_ATH9K_WOW=y +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +CONFIG_ATH9K_PCI_NO_EEPROM=m +CONFIG_ATH9K_HTC=m +CONFIG_ATH9K_HTC_DEBUGFS=y +CONFIG_ATH9K_HWRNG=y +CONFIG_ATH9K_COMMON_SPECTRAL=y +CONFIG_CARL9170=m +CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_DEBUGFS=y +CONFIG_CARL9170_WPC=y +# CONFIG_CARL9170_HWRNG is not set +CONFIG_ATH6KL=m +CONFIG_ATH6KL_SDIO=m +CONFIG_ATH6KL_USB=m +CONFIG_ATH6KL_DEBUG=y +CONFIG_ATH6KL_TRACING=y +CONFIG_AR5523=m +CONFIG_WIL6210=m +CONFIG_WIL6210_ISR_COR=y +CONFIG_WIL6210_TRACING=y +CONFIG_WIL6210_DEBUGFS=y +CONFIG_ATH10K=m +CONFIG_ATH10K_CE=y +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_AHB=y +CONFIG_ATH10K_SDIO=m +CONFIG_ATH10K_USB=m +CONFIG_ATH10K_DEBUG=y +CONFIG_ATH10K_DEBUGFS=y +CONFIG_ATH10K_SPECTRAL=y +CONFIG_ATH10K_TRACING=y +CONFIG_WCN36XX=m +CONFIG_WCN36XX_DEBUGFS=y +CONFIG_WLAN_VENDOR_ATMEL=y +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_AT76C50X_USB=m +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +CONFIG_B43_SDIO=y +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +CONFIG_B43LEGACY=m +CONFIG_B43LEGACY_PCI_AUTOSELECT=y +CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y +CONFIG_B43LEGACY_LEDS=y +CONFIG_B43LEGACY_HWRNG=y +CONFIG_B43LEGACY_DEBUG=y +CONFIG_B43LEGACY_DMA=y +CONFIG_B43LEGACY_PIO=y +CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y +# CONFIG_B43LEGACY_DMA_MODE is not set +# CONFIG_B43LEGACY_PIO_MODE is not set +CONFIG_BRCMUTIL=m +CONFIG_BRCMSMAC=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_PROTO_MSGBUF=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +CONFIG_BRCMFMAC_PCIE=y +CONFIG_BRCM_TRACING=y +CONFIG_BRCMDBG=y +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_AIRO=m +CONFIG_AIRO_CS=m +CONFIG_WLAN_VENDOR_INTEL=y +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLEGACY=m +CONFIG_IWL4965=m +CONFIG_IWL3945=m + +# +# iwl3945 / iwl4965 Debugging Options +# +CONFIG_IWLEGACY_DEBUG=y +CONFIG_IWLEGACY_DEBUGFS=y 
+# end of iwl3945 / iwl4965 Debugging Options + +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_LEDS=y +CONFIG_IWLDVM=m +CONFIG_IWLMVM=m +CONFIG_IWLWIFI_OPMODE_MODULAR=y +# CONFIG_IWLWIFI_BCAST_FILTERING is not set + +# +# Debugging Options +# +CONFIG_IWLWIFI_DEBUG=y +CONFIG_IWLWIFI_DEBUGFS=y +CONFIG_IWLWIFI_DEVICE_TRACING=y +# end of Debugging Options + +CONFIG_WLAN_VENDOR_INTERSIL=y +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +CONFIG_HOSTAP_FIRMWARE_NVRAM=y +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_HERMES=m +CONFIG_HERMES_PRISM=y +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m +CONFIG_ORINOCO_USB=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_SPI=m +# CONFIG_P54_SPI_DEFAULT_EEPROM is not set +CONFIG_P54_LEDS=y +CONFIG_PRISM54=m +CONFIG_WLAN_VENDOR_MARVELL=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +CONFIG_LIBERTAS_SDIO=m +CONFIG_LIBERTAS_SPI=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_LIBERTAS_MESH=y +CONFIG_LIBERTAS_THINFIRM=m +# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set +CONFIG_LIBERTAS_THINFIRM_USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_MWIFIEX_PCIE=m +CONFIG_MWIFIEX_USB=m +CONFIG_MWL8K=m +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_MT76_CORE=m +CONFIG_MT76_LEDS=y +CONFIG_MT76_USB=m +CONFIG_MT76x02_LIB=m +CONFIG_MT76x02_USB=m +CONFIG_MT76x0_COMMON=m +CONFIG_MT76x0U=m +CONFIG_MT76x0E=m +CONFIG_MT76x2_COMMON=m +CONFIG_MT76x2E=m +CONFIG_MT76x2U=m +CONFIG_MT7603E=m +CONFIG_MT7615_COMMON=m +CONFIG_MT7615E=m +CONFIG_MT7663U=m +CONFIG_MT7915E=m +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +CONFIG_RT2400PCI=m +CONFIG_RT2500PCI=m +CONFIG_RT61PCI=m +CONFIG_RT2800PCI=m +CONFIG_RT2800PCI_RT33XX=y +CONFIG_RT2800PCI_RT35XX=y +CONFIG_RT2800PCI_RT53XX=y +CONFIG_RT2800PCI_RT3290=y +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2800_LIB_MMIO=m +CONFIG_RT2X00_LIB_MMIO=m +CONFIG_RT2X00_LIB_PCI=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m +CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +CONFIG_RT2X00_LIB_DEBUGFS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +CONFIG_RTL8180=m +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +CONFIG_RTL8192CE=m +CONFIG_RTL8192SE=m +CONFIG_RTL8192DE=m +CONFIG_RTL8723AE=m +CONFIG_RTL8723BE=m +CONFIG_RTL8188EE=m +CONFIG_RTL8192EE=m +CONFIG_RTL8821AE=m +CONFIG_RTL8192CU=m +CONFIG_RTLWIFI=m +CONFIG_RTLWIFI_PCI=m +CONFIG_RTLWIFI_USB=m +CONFIG_RTLWIFI_DEBUG=y +CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8723_COMMON=m +CONFIG_RTLBTCOEXIST=m +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_RTW88=m +CONFIG_RTW88_CORE=m +CONFIG_RTW88_PCI=m +CONFIG_RTW88_8822B=m +CONFIG_RTW88_8822C=m +CONFIG_RTW88_8723D=m +CONFIG_RTW88_8822BE=m +CONFIG_RTW88_8822CE=m +CONFIG_RTW88_8723DE=m +CONFIG_RTW88_DEBUG=y +CONFIG_RTW88_DEBUGFS=y +CONFIG_WLAN_VENDOR_RSI=y +CONFIG_RSI_91X=m +CONFIG_RSI_DEBUGFS=y +CONFIG_RSI_SDIO=m +CONFIG_RSI_USB=m +CONFIG_RSI_COEX=y +CONFIG_WLAN_VENDOR_ST=y +CONFIG_CW1200=m +CONFIG_CW1200_WLAN_SDIO=m +CONFIG_CW1200_WLAN_SPI=m +CONFIG_WLAN_VENDOR_TI=y +CONFIG_WL1251=m +CONFIG_WL1251_SPI=m +CONFIG_WL1251_SDIO=m +CONFIG_WL12XX=m +CONFIG_WL18XX=m +CONFIG_WLCORE=m +CONFIG_WLCORE_SPI=m +CONFIG_WLCORE_SDIO=m 
+CONFIG_WILINK_PLATFORM_DATA=y +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_WLAN_VENDOR_QUANTENNA=y +CONFIG_QTNFMAC=m +CONFIG_QTNFMAC_PCIE=m +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_WL3501=m +CONFIG_MAC80211_HWSIM=m +CONFIG_USB_NET_RNDIS_WLAN=m +CONFIG_VIRT_WIFI=m + +# +# WiMAX Wireless Broadband devices +# +CONFIG_WIMAX_I2400M=m +CONFIG_WIMAX_I2400M_USB=m +CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 +# end of WiMAX Wireless Broadband devices + +# CONFIG_WAN is not set +CONFIG_IEEE802154_DRIVERS=m +CONFIG_IEEE802154_FAKELB=m +CONFIG_IEEE802154_AT86RF230=m +# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set +CONFIG_IEEE802154_MRF24J40=m +CONFIG_IEEE802154_CC2520=m +CONFIG_IEEE802154_ATUSB=m +CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set +CONFIG_IEEE802154_MCR20A=m +CONFIG_IEEE802154_HWSIM=m +CONFIG_XEN_NETDEV_FRONTEND=m +CONFIG_XEN_NETDEV_BACKEND=m +CONFIG_VMXNET3=m +CONFIG_FUJITSU_ES=m +CONFIG_USB4_NET=m +CONFIG_HYPERV_NET=m +CONFIG_NETDEVSIM=m +CONFIG_NET_FAILOVER=m +CONFIG_ISDN=y +CONFIG_ISDN_CAPI=y +CONFIG_CAPI_TRACE=y +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_MISDN=m +CONFIG_MISDN_DSP=m +CONFIG_MISDN_L1OIP=m + +# +# mISDN hardware drivers +# +CONFIG_MISDN_HFCPCI=m +CONFIG_MISDN_HFCMULTI=m +CONFIG_MISDN_HFCUSB=m +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_HDLC=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_NVM=y +CONFIG_NVM_PBLK=m +# CONFIG_NVM_PBLK_DEBUG is not set + +# +# Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=m +CONFIG_INPUT_FF_MEMLESS=m +CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m +CONFIG_INPUT_MATRIXKMAP=m + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=m +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADC=m +CONFIG_KEYBOARD_ADP5520=m +CONFIG_KEYBOARD_ADP5588=m +CONFIG_KEYBOARD_ADP5589=m +CONFIG_KEYBOARD_APPLESPI=m +CONFIG_KEYBOARD_ATKBD=m +CONFIG_KEYBOARD_QT1050=m +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_QT2160=m +CONFIG_KEYBOARD_DLINK_DIR685=m +CONFIG_KEYBOARD_LKKBD=m +CONFIG_KEYBOARD_GPIO=m +CONFIG_KEYBOARD_GPIO_POLLED=m +CONFIG_KEYBOARD_TCA6416=m +CONFIG_KEYBOARD_TCA8418=m +CONFIG_KEYBOARD_MATRIX=m +CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_LM8333=m +CONFIG_KEYBOARD_MAX7359=m +CONFIG_KEYBOARD_MCS=m +CONFIG_KEYBOARD_MPR121=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_KEYBOARD_OPENCORES=m +CONFIG_KEYBOARD_SAMSUNG=m +CONFIG_KEYBOARD_STOWAWAY=m +CONFIG_KEYBOARD_SUNKBD=m +CONFIG_KEYBOARD_STMPE=m +CONFIG_KEYBOARD_IQS62X=m +CONFIG_KEYBOARD_OMAP4=m +CONFIG_KEYBOARD_TC3589X=m +CONFIG_KEYBOARD_TM2_TOUCHKEY=m +CONFIG_KEYBOARD_TWL4030=m +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_CROS_EC=m +CONFIG_KEYBOARD_CAP11XX=m +CONFIG_KEYBOARD_BCM=m +CONFIG_KEYBOARD_MTK_PMIC=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_BYD=y +CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y +CONFIG_MOUSE_PS2_CYPRESS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y +CONFIG_MOUSE_PS2_SENTELIC=y +CONFIG_MOUSE_PS2_TOUCHKIT=y +CONFIG_MOUSE_PS2_FOCALTECH=y +CONFIG_MOUSE_PS2_VMMOUSE=y +CONFIG_MOUSE_PS2_SMBUS=y +CONFIG_MOUSE_SERIAL=m 
+CONFIG_MOUSE_APPLETOUCH=m +CONFIG_MOUSE_BCM5974=m +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=m +CONFIG_MOUSE_ELAN_I2C_I2C=y +CONFIG_MOUSE_ELAN_I2C_SMBUS=y +CONFIG_MOUSE_VSXXXAA=m +CONFIG_MOUSE_GPIO=m +CONFIG_MOUSE_SYNAPTICS_I2C=m +CONFIG_MOUSE_SYNAPTICS_USB=m +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=m +CONFIG_JOYSTICK_IFORCE_232=m +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m +CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDJOY=m +CONFIG_JOYSTICK_ZHENHUA=m +CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +CONFIG_JOYSTICK_AS5011=m +CONFIG_JOYSTICK_JOYDUMP=m +CONFIG_JOYSTICK_XPAD=m +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +CONFIG_JOYSTICK_PXRC=m +CONFIG_JOYSTICK_FSIA6B=m +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=m +CONFIG_TABLET_USB_AIPTEK=m +CONFIG_TABLET_USB_GTCO=m +CONFIG_TABLET_USB_HANWANG=m +CONFIG_TABLET_USB_KBTAB=m +CONFIG_TABLET_USB_PEGASUS=m +CONFIG_TABLET_SERIAL_WACOM4=m +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_PROPERTIES=y +CONFIG_TOUCHSCREEN_88PM860X=m +CONFIG_TOUCHSCREEN_ADS7846=m +CONFIG_TOUCHSCREEN_AD7877=m +CONFIG_TOUCHSCREEN_AD7879=m +CONFIG_TOUCHSCREEN_AD7879_I2C=m +CONFIG_TOUCHSCREEN_AD7879_SPI=m +CONFIG_TOUCHSCREEN_ADC=m +CONFIG_TOUCHSCREEN_AR1021_I2C=m +CONFIG_TOUCHSCREEN_ATMEL_MXT=m +CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y +CONFIG_TOUCHSCREEN_AUO_PIXCIR=m +CONFIG_TOUCHSCREEN_BU21013=m +CONFIG_TOUCHSCREEN_BU21029=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m +CONFIG_TOUCHSCREEN_CY8CTMA140=m +CONFIG_TOUCHSCREEN_CY8CTMG110=m +CONFIG_TOUCHSCREEN_CYTTSP_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP_SPI=m +CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m +CONFIG_TOUCHSCREEN_DA9034=m +CONFIG_TOUCHSCREEN_DA9052=m +CONFIG_TOUCHSCREEN_DYNAPRO=m +CONFIG_TOUCHSCREEN_HAMPSHIRE=m +CONFIG_TOUCHSCREEN_EETI=m +CONFIG_TOUCHSCREEN_EGALAX=m +CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m +CONFIG_TOUCHSCREEN_EXC3000=m +CONFIG_TOUCHSCREEN_FUJITSU=m +CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_HIDEEP=m +CONFIG_TOUCHSCREEN_ILI210X=m +CONFIG_TOUCHSCREEN_S6SY761=m +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_TOUCHSCREEN_EKTF2127=m +CONFIG_TOUCHSCREEN_ELAN=m +CONFIG_TOUCHSCREEN_ELO=m +CONFIG_TOUCHSCREEN_WACOM_W8001=m +CONFIG_TOUCHSCREEN_WACOM_I2C=m +CONFIG_TOUCHSCREEN_MAX11801=m +CONFIG_TOUCHSCREEN_MCS5000=m +CONFIG_TOUCHSCREEN_MMS114=m +CONFIG_TOUCHSCREEN_MELFAS_MIP4=m +CONFIG_TOUCHSCREEN_MTOUCH=m +CONFIG_TOUCHSCREEN_IMX6UL_TSC=m +CONFIG_TOUCHSCREEN_INEXIO=m +CONFIG_TOUCHSCREEN_MK712=m +CONFIG_TOUCHSCREEN_PENMOUNT=m +CONFIG_TOUCHSCREEN_EDT_FT5X06=m +CONFIG_TOUCHSCREEN_TOUCHRIGHT=m +CONFIG_TOUCHSCREEN_TOUCHWIN=m +CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m +CONFIG_TOUCHSCREEN_UCB1400=m +CONFIG_TOUCHSCREEN_PIXCIR=m +CONFIG_TOUCHSCREEN_WDT87XX_I2C=m +CONFIG_TOUCHSCREEN_WM831X=m +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_TOUCHSCREEN_WM9705=y +CONFIG_TOUCHSCREEN_WM9712=y +CONFIG_TOUCHSCREEN_WM9713=y +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_MC13783=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y 
+CONFIG_TOUCHSCREEN_USB_PANJIT=y +CONFIG_TOUCHSCREEN_USB_3M=y +CONFIG_TOUCHSCREEN_USB_ITM=y +CONFIG_TOUCHSCREEN_USB_ETURBO=y +CONFIG_TOUCHSCREEN_USB_GUNZE=y +CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y +CONFIG_TOUCHSCREEN_USB_IRTOUCH=y +CONFIG_TOUCHSCREEN_USB_IDEALTEK=y +CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y +CONFIG_TOUCHSCREEN_USB_GOTOP=y +CONFIG_TOUCHSCREEN_USB_JASTEC=y +CONFIG_TOUCHSCREEN_USB_ELO=y +CONFIG_TOUCHSCREEN_USB_E2I=y +CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y +CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y +CONFIG_TOUCHSCREEN_USB_NEXIO=y +CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y +CONFIG_TOUCHSCREEN_TOUCHIT213=m +CONFIG_TOUCHSCREEN_TSC_SERIO=m +CONFIG_TOUCHSCREEN_TSC200X_CORE=m +CONFIG_TOUCHSCREEN_TSC2004=m +CONFIG_TOUCHSCREEN_TSC2005=m +CONFIG_TOUCHSCREEN_TSC2007=m +CONFIG_TOUCHSCREEN_TSC2007_IIO=y +CONFIG_TOUCHSCREEN_PCAP=m +CONFIG_TOUCHSCREEN_RM_TS=m +CONFIG_TOUCHSCREEN_SILEAD=m +CONFIG_TOUCHSCREEN_SIS_I2C=m +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMFTS=m +CONFIG_TOUCHSCREEN_STMPE=m +CONFIG_TOUCHSCREEN_SUR40=m +CONFIG_TOUCHSCREEN_SURFACE3_SPI=m +CONFIG_TOUCHSCREEN_SX8654=m +CONFIG_TOUCHSCREEN_TPS6507X=m +CONFIG_TOUCHSCREEN_ZET6223=m +CONFIG_TOUCHSCREEN_ZFORCE=m +CONFIG_TOUCHSCREEN_COLIBRI_VF50=m +CONFIG_TOUCHSCREEN_ROHM_BU21023=m +CONFIG_TOUCHSCREEN_IQS5XX=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_88PM860X_ONKEY=m +CONFIG_INPUT_88PM80X_ONKEY=m +CONFIG_INPUT_AD714X=m +CONFIG_INPUT_AD714X_I2C=m +CONFIG_INPUT_AD714X_SPI=m +CONFIG_INPUT_ARIZONA_HAPTICS=m +CONFIG_INPUT_ATMEL_CAPTOUCH=m +CONFIG_INPUT_BMA150=m +CONFIG_INPUT_E3X0_BUTTON=m +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_MAX77650_ONKEY=m +CONFIG_INPUT_MAX77693_HAPTIC=m +CONFIG_INPUT_MAX8925_ONKEY=m +CONFIG_INPUT_MAX8997_HAPTIC=m +CONFIG_INPUT_MC13783_PWRBUTTON=m +CONFIG_INPUT_MMA8450=m +CONFIG_INPUT_APANEL=m +CONFIG_INPUT_GPIO_BEEPER=m +CONFIG_INPUT_GPIO_DECODER=m +CONFIG_INPUT_GPIO_VIBRA=m +CONFIG_INPUT_CPCAP_PWRBUTTON=m +CONFIG_INPUT_ATLAS_BTNS=m +CONFIG_INPUT_ATI_REMOTE2=m +CONFIG_INPUT_KEYSPAN_REMOTE=m +CONFIG_INPUT_KXTJ9=m +CONFIG_INPUT_POWERMATE=m +CONFIG_INPUT_YEALINK=m +CONFIG_INPUT_CM109=m +CONFIG_INPUT_REGULATOR_HAPTIC=m +CONFIG_INPUT_RETU_PWRBUTTON=m +CONFIG_INPUT_TPS65218_PWRBUTTON=m +CONFIG_INPUT_AXP20X_PEK=m +CONFIG_INPUT_TWL4030_PWRBUTTON=m +CONFIG_INPUT_TWL4030_VIBRA=m +CONFIG_INPUT_TWL6040_VIBRA=m +CONFIG_INPUT_UINPUT=m +CONFIG_INPUT_PALMAS_PWRBUTTON=m +CONFIG_INPUT_PCF50633_PMU=m +CONFIG_INPUT_PCF8574=m +CONFIG_INPUT_PWM_BEEPER=m +CONFIG_INPUT_PWM_VIBRA=m +CONFIG_INPUT_RK805_PWRKEY=m +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +CONFIG_INPUT_DA9052_ONKEY=m +CONFIG_INPUT_DA9055_ONKEY=m +CONFIG_INPUT_DA9063_ONKEY=m +CONFIG_INPUT_WM831X_ON=m +CONFIG_INPUT_PCAP=m +CONFIG_INPUT_ADXL34X=m +CONFIG_INPUT_ADXL34X_I2C=m +CONFIG_INPUT_ADXL34X_SPI=m +CONFIG_INPUT_IMS_PCU=m +CONFIG_INPUT_IQS269A=m +CONFIG_INPUT_CMA3000=m +CONFIG_INPUT_CMA3000_I2C=m +CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m +CONFIG_INPUT_IDEAPAD_SLIDEBAR=m +CONFIG_INPUT_SOC_BUTTON_ARRAY=m +CONFIG_INPUT_DRV260X_HAPTICS=m +CONFIG_INPUT_DRV2665_HAPTICS=m +CONFIG_INPUT_DRV2667_HAPTICS=m +CONFIG_INPUT_RAVE_SP_PWRBUTTON=m +CONFIG_INPUT_STPMIC1_ONKEY=m +CONFIG_RMI4_CORE=m +CONFIG_RMI4_I2C=m +CONFIG_RMI4_SPI=m +CONFIG_RMI4_SMB=m +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=m +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +CONFIG_RMI4_F34=y +# CONFIG_RMI4_F54 is not set +CONFIG_RMI4_F55=y + +# +# Hardware I/O ports +# +CONFIG_SERIO=m +CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y +CONFIG_SERIO_I8042=m +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m 
+CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_LIBPS2=m +CONFIG_SERIO_RAW=m +CONFIG_SERIO_ALTERA_PS2=m +CONFIG_SERIO_PS2MULT=m +CONFIG_SERIO_ARC_PS2=m +# CONFIG_SERIO_APBPS2 is not set +CONFIG_HYPERV_KEYBOARD=m +CONFIG_SERIO_GPIO_PS2=m +CONFIG_USERIO=m +CONFIG_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_FM801=m +# end of Hardware I/O ports +# end of Input device support + +# +# Character devices +# +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y +CONFIG_VT_CONSOLE_SLEEP=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +# CONFIG_LEGACY_PTYS is not set +CONFIG_LDISC_AUTOLOAD=y + +# +# Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_PNP=y +# CONFIG_SERIAL_8250_16550A_VARIANTS is not set +CONFIG_SERIAL_8250_FINTEK=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DMA=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=m +CONFIG_SERIAL_8250_CS=m +CONFIG_SERIAL_8250_MEN_MCB=m +CONFIG_SERIAL_8250_NR_UARTS=32 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_ASPEED_VUART=m +CONFIG_SERIAL_8250_SHARE_IRQ=y +# CONFIG_SERIAL_8250_DETECT_IRQ is not set +CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_8250_DWLIB=y +CONFIG_SERIAL_8250_DW=m +CONFIG_SERIAL_8250_RT288X=y +CONFIG_SERIAL_8250_LPSS=y +CONFIG_SERIAL_8250_MID=y +CONFIG_SERIAL_OF_PLATFORM=m + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_MAX3100=m +CONFIG_SERIAL_MAX310X=m +CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_SERIAL_JSM=m +CONFIG_SERIAL_SIFIVE=m +CONFIG_SERIAL_LANTIQ=m +CONFIG_SERIAL_SCCNXP=m +CONFIG_SERIAL_SC16IS7XX_CORE=m +CONFIG_SERIAL_SC16IS7XX=m +CONFIG_SERIAL_SC16IS7XX_I2C=y +CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_ALTERA_JTAGUART=m +CONFIG_SERIAL_ALTERA_UART=m +CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 +CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 +CONFIG_SERIAL_IFX6X60=m +CONFIG_SERIAL_XILINX_PS_UART=m +CONFIG_SERIAL_ARC=m +CONFIG_SERIAL_ARC_NR_PORTS=1 +CONFIG_SERIAL_RP2=m +CONFIG_SERIAL_RP2_NR_UARTS=32 +CONFIG_SERIAL_FSL_LPUART=m +CONFIG_SERIAL_FSL_LINFLEXUART=m +CONFIG_SERIAL_CONEXANT_DIGICOLOR=m +CONFIG_SERIAL_MEN_Z135=m +CONFIG_SERIAL_SPRD=m +# end of Serial drivers + +CONFIG_SERIAL_MCTRL_GPIO=y +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +CONFIG_CYZ_INTR=y +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_SYNCLINK_GT=m +CONFIG_ISI=m +CONFIG_N_HDLC=m +CONFIG_N_GSM=m +CONFIG_NOZOMI=m +CONFIG_NULL_TTY=m +CONFIG_TRACE_ROUTER=m +CONFIG_TRACE_SINK=m +CONFIG_HVC_DRIVER=y +CONFIG_HVC_IRQ=y +CONFIG_HVC_XEN=y +CONFIG_HVC_XEN_FRONTEND=y +CONFIG_SERIAL_DEV_BUS=y +CONFIG_SERIAL_DEV_CTRL_TTYPORT=y +# CONFIG_TTY_PRINTK is not set +CONFIG_PRINTER=m +# CONFIG_LP_CONSOLE is not set +CONFIG_PPDEV=m +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DMI_DECODE=y +CONFIG_IPMI_PLAT_DATA=y +# CONFIG_IPMI_PANIC_EVENT is not set +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m +CONFIG_IPMI_SSIF=m +CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m +CONFIG_IPMB_DEVICE_INTERFACE=m +CONFIG_HW_RANDOM=m +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_HW_RANDOM_INTEL=m +CONFIG_HW_RANDOM_AMD=m +CONFIG_HW_RANDOM_VIA=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_HW_RANDOM_CCTRNG=m +CONFIG_APPLICOM=m + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m 
+CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m +CONFIG_SCR24X=m +CONFIG_IPWIRELESS=m +# end of PCMCIA character devices + +CONFIG_MWAVE=m +CONFIG_DEVMEM=y +# CONFIG_DEVKMEM is not set +CONFIG_NVRAM=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +CONFIG_DEVPORT=y +CONFIG_HPET=y +CONFIG_HPET_MMAP=y +CONFIG_HPET_MMAP_DEFAULT=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TCG_TPM=m +CONFIG_HW_RANDOM_TPM=y +CONFIG_TCG_TIS_CORE=m +CONFIG_TCG_TIS=m +CONFIG_TCG_TIS_SPI=m +CONFIG_TCG_TIS_SPI_CR50=y +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m +CONFIG_TCG_INFINEON=m +CONFIG_TCG_XEN=m +CONFIG_TCG_CRB=m +CONFIG_TCG_VTPM_PROXY=m +CONFIG_TCG_TIS_ST33ZP24=m +CONFIG_TCG_TIS_ST33ZP24_I2C=m +CONFIG_TCG_TIS_ST33ZP24_SPI=m +CONFIG_TELCLOCK=m +CONFIG_XILLYBUS=m +CONFIG_XILLYBUS_PCIE=m +CONFIG_XILLYBUS_OF=m +# end of Character devices + +# CONFIG_RANDOM_TRUST_CPU is not set +# CONFIG_RANDOM_TRUST_BOOTLOADER is not set + +# +# I2C support +# +CONFIG_I2C=y +CONFIG_ACPI_I2C_OPREGION=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_COMPAT=y +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m + +# +# Multiplexer I2C Chip support +# +CONFIG_I2C_ARB_GPIO_CHALLENGE=m +CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_GPMUX=m +CONFIG_I2C_MUX_LTC4306=m +CONFIG_I2C_MUX_PCA9541=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_MUX_PINCTRL=m +CONFIG_I2C_MUX_REG=m +CONFIG_I2C_DEMUX_PINCTRL=m +CONFIG_I2C_MUX_MLXCPLD=m +# end of Multiplexer I2C Chip support + +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_SMBUS=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCA=m + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI1563=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m +CONFIG_I2C_AMD756_S4882=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_AMD_MP2=m +CONFIG_I2C_I801=m +CONFIG_I2C_ISCH=m +CONFIG_I2C_ISMT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_CHT_WC=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_NFORCE2_S4985=m +CONFIG_I2C_NVIDIA_GPU=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m + +# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CBUS_GPIO=m +CONFIG_I2C_DESIGNWARE_CORE=y +CONFIG_I2C_DESIGNWARE_SLAVE=y +CONFIG_I2C_DESIGNWARE_PLATFORM=y +CONFIG_I2C_DESIGNWARE_BAYTRAIL=y +CONFIG_I2C_DESIGNWARE_PCI=m +CONFIG_I2C_EMEV2=m +CONFIG_I2C_GPIO=m +# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +CONFIG_I2C_KEMPLD=m +CONFIG_I2C_OCORES=m +CONFIG_I2C_PCA_PLATFORM=m +CONFIG_I2C_RK3X=m +CONFIG_I2C_SIMTEC=m +CONFIG_I2C_XILINX=m + +# +# External I2C/SMBus adapter drivers +# +CONFIG_I2C_DIOLAN_U2C=m +CONFIG_I2C_DLN2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_ROBOTFUZZ_OSIF=m +CONFIG_I2C_TAOS_EVM=m +CONFIG_I2C_TINY_USB=m +CONFIG_I2C_VIPERBOARD=m + +# +# Other I2C/SMBus bus drivers +# +CONFIG_I2C_MLXCPLD=m +CONFIG_I2C_CROS_EC_TUNNEL=m +CONFIG_I2C_FSI=m +# end of I2C Hardware Bus support + +CONFIG_I2C_STUB=m +CONFIG_I2C_SLAVE=y +CONFIG_I2C_SLAVE_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# end of I2C support + +CONFIG_I3C=m +CONFIG_CDNS_I3C_MASTER=m +CONFIG_DW_I3C_MASTER=m +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y +CONFIG_SPI_MEM=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_ALTERA=m +CONFIG_SPI_AXI_SPI_ENGINE=m +CONFIG_SPI_BITBANG=m +CONFIG_SPI_BUTTERFLY=m +CONFIG_SPI_CADENCE=m +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_DMA=y +CONFIG_SPI_DW_PCI=m +CONFIG_SPI_DW_MMIO=m +CONFIG_SPI_DLN2=m 
+CONFIG_SPI_FSI=m +CONFIG_SPI_NXP_FLEXSPI=m +CONFIG_SPI_GPIO=m +CONFIG_SPI_LM70_LLP=m +CONFIG_SPI_FSL_LIB=m +CONFIG_SPI_FSL_SPI=m +CONFIG_SPI_OC_TINY=m +CONFIG_SPI_PXA2XX=m +CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_SC18IS602=m +CONFIG_SPI_SIFIVE=m +CONFIG_SPI_MXIC=m +CONFIG_SPI_XCOMM=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_ZYNQMP_GQSPI=m +CONFIG_SPI_AMD=m + +# +# SPI Multiplexer support +# +CONFIG_SPI_MUX=m + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=m +CONFIG_SPI_LOOPBACK_TEST=m +CONFIG_SPI_TLE62X0=m +CONFIG_SPI_SLAVE=y +CONFIG_SPI_SLAVE_TIME=m +CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m +CONFIG_SPMI=m +CONFIG_HSI=m +CONFIG_HSI_BOARDINFO=y + +# +# HSI controllers +# + +# +# HSI clients +# +CONFIG_HSI_CHAR=m +CONFIG_PPS=y +# CONFIG_PPS_DEBUG is not set + +# +# PPS clients support +# +CONFIG_PPS_CLIENT_KTIMER=m +CONFIG_PPS_CLIENT_LDISC=m +CONFIG_PPS_CLIENT_PARPORT=m +CONFIG_PPS_CLIENT_GPIO=m + +# +# PPS generators support +# + +# +# PTP clock support +# +CONFIG_PTP_1588_CLOCK=y +CONFIG_DP83640_PHY=m +CONFIG_PTP_1588_CLOCK_INES=m +CONFIG_PTP_1588_CLOCK_KVM=m +CONFIG_PTP_1588_CLOCK_IDT82P33=m +CONFIG_PTP_1588_CLOCK_IDTCM=m +CONFIG_PTP_1588_CLOCK_VMW=m +# end of PTP clock support + +CONFIG_PINCTRL=y +CONFIG_GENERIC_PINCTRL_GROUPS=y +CONFIG_PINMUX=y +CONFIG_GENERIC_PINMUX_FUNCTIONS=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +CONFIG_PINCTRL_AS3722=m +CONFIG_PINCTRL_AXP209=m +CONFIG_PINCTRL_AMD=m +CONFIG_PINCTRL_DA9062=m +CONFIG_PINCTRL_MCP23S08_I2C=m +CONFIG_PINCTRL_MCP23S08_SPI=m +CONFIG_PINCTRL_MCP23S08=m +CONFIG_PINCTRL_SINGLE=m +CONFIG_PINCTRL_SX150X=y +CONFIG_PINCTRL_STMFX=m +CONFIG_PINCTRL_MAX77620=m +CONFIG_PINCTRL_PALMAS=m +CONFIG_PINCTRL_RK805=m +CONFIG_PINCTRL_OCELOT=y +CONFIG_PINCTRL_BAYTRAIL=y +CONFIG_PINCTRL_CHERRYVIEW=y +CONFIG_PINCTRL_LYNXPOINT=y +CONFIG_PINCTRL_INTEL=y +CONFIG_PINCTRL_BROXTON=y +CONFIG_PINCTRL_CANNONLAKE=y +CONFIG_PINCTRL_CEDARFORK=y +CONFIG_PINCTRL_DENVERTON=y +CONFIG_PINCTRL_GEMINILAKE=y +CONFIG_PINCTRL_ICELAKE=y +CONFIG_PINCTRL_JASPERLAKE=y +CONFIG_PINCTRL_LEWISBURG=y +CONFIG_PINCTRL_SUNRISEPOINT=y +CONFIG_PINCTRL_TIGERLAKE=y +CONFIG_PINCTRL_LOCHNAGAR=m +CONFIG_PINCTRL_MADERA=m +CONFIG_PINCTRL_CS47L15=y +CONFIG_PINCTRL_CS47L35=y +CONFIG_PINCTRL_CS47L85=y +CONFIG_PINCTRL_CS47L90=y +CONFIG_PINCTRL_CS47L92=y +CONFIG_PINCTRL_EQUILIBRIUM=m +CONFIG_GPIOLIB=y +CONFIG_GPIOLIB_FASTPATH_LIMIT=512 +CONFIG_OF_GPIO=y +CONFIG_GPIO_ACPI=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC=y +CONFIG_GPIO_MAX730X=m + +# +# Memory mapped GPIO drivers +# +CONFIG_GPIO_74XX_MMIO=m +CONFIG_GPIO_ALTERA=m +CONFIG_GPIO_AMDPT=m +CONFIG_GPIO_CADENCE=m +CONFIG_GPIO_DWAPB=m +CONFIG_GPIO_EXAR=m +CONFIG_GPIO_FTGPIO010=y +CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_GRGPIO=m +CONFIG_GPIO_HLWD=m +CONFIG_GPIO_ICH=m +CONFIG_GPIO_LOGICVC=m +CONFIG_GPIO_MB86S7X=m +CONFIG_GPIO_MENZ127=m +CONFIG_GPIO_SAMA5D2_PIOBU=m +CONFIG_GPIO_SIFIVE=y +CONFIG_GPIO_SIOX=m +CONFIG_GPIO_SYSCON=m +CONFIG_GPIO_VX855=m +CONFIG_GPIO_WCD934X=m +CONFIG_GPIO_XILINX=m +CONFIG_GPIO_AMD_FCH=m +# end of Memory mapped GPIO drivers + +# +# Port-mapped I/O GPIO drivers +# +CONFIG_GPIO_F7188X=m +CONFIG_GPIO_IT87=m +CONFIG_GPIO_SCH=m +CONFIG_GPIO_SCH311X=m +CONFIG_GPIO_WINBOND=m +CONFIG_GPIO_WS16C48=m +# end of Port-mapped I/O GPIO drivers + +# +# I2C GPIO expanders +# +CONFIG_GPIO_ADP5588=m +CONFIG_GPIO_ADNP=m +CONFIG_GPIO_GW_PLD=m +CONFIG_GPIO_MAX7300=m +CONFIG_GPIO_MAX732X=m +CONFIG_GPIO_PCA953X=m +CONFIG_GPIO_PCA953X_IRQ=y 
+CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_TPIC2810=m +# end of I2C GPIO expanders + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ADP5520=m +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_BD70528=m +CONFIG_GPIO_BD71828=m +CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_CRYSTAL_COVE=m +CONFIG_GPIO_DA9052=m +CONFIG_GPIO_DA9055=m +CONFIG_GPIO_DLN2=m +CONFIG_GPIO_JANZ_TTL=m +CONFIG_GPIO_KEMPLD=m +CONFIG_GPIO_LP3943=m +CONFIG_GPIO_LP873X=m +CONFIG_GPIO_LP87565=m +CONFIG_GPIO_MADERA=m +CONFIG_GPIO_MAX77620=m +CONFIG_GPIO_MAX77650=m +CONFIG_GPIO_MSIC=y +CONFIG_GPIO_PALMAS=y +CONFIG_GPIO_RC5T583=y +CONFIG_GPIO_STMPE=y +CONFIG_GPIO_TC3589X=y +CONFIG_GPIO_TPS65086=m +CONFIG_GPIO_TPS65218=m +CONFIG_GPIO_TPS6586X=y +CONFIG_GPIO_TPS65910=y +CONFIG_GPIO_TPS65912=m +CONFIG_GPIO_TPS68470=y +CONFIG_GPIO_TQMX86=m +CONFIG_GPIO_TWL4030=m +CONFIG_GPIO_TWL6040=m +CONFIG_GPIO_UCB1400=m +CONFIG_GPIO_WHISKEY_COVE=m +CONFIG_GPIO_WM831X=m +CONFIG_GPIO_WM8350=m +CONFIG_GPIO_WM8994=m +# end of MFD GPIO expanders + +# +# PCI GPIO expanders +# +CONFIG_GPIO_AMD8111=m +CONFIG_GPIO_ML_IOH=m +CONFIG_GPIO_PCI_IDIO_16=m +CONFIG_GPIO_PCIE_IDIO_24=m +CONFIG_GPIO_RDC321X=m +CONFIG_GPIO_SODAVILLE=y +# end of PCI GPIO expanders + +# +# SPI GPIO expanders +# +CONFIG_GPIO_74X164=m +CONFIG_GPIO_MAX3191X=m +CONFIG_GPIO_MAX7301=m +CONFIG_GPIO_MC33880=m +CONFIG_GPIO_PISOSR=m +CONFIG_GPIO_XRA1403=m +CONFIG_GPIO_MOXTET=m +# end of SPI GPIO expanders + +# +# USB GPIO expanders +# +CONFIG_GPIO_VIPERBOARD=m +# end of USB GPIO expanders + +CONFIG_GPIO_AGGREGATOR=m +CONFIG_GPIO_MOCKUP=m +CONFIG_W1=m +CONFIG_W1_CON=y + +# +# 1-wire Bus Masters +# +CONFIG_W1_MASTER_MATROX=m +CONFIG_W1_MASTER_DS2490=m +CONFIG_W1_MASTER_DS2482=m +CONFIG_W1_MASTER_DS1WM=m +CONFIG_W1_MASTER_GPIO=m +CONFIG_W1_MASTER_SGI=m +# end of 1-wire Bus Masters + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +CONFIG_W1_SLAVE_SMEM=m +CONFIG_W1_SLAVE_DS2405=m +CONFIG_W1_SLAVE_DS2408=m +# CONFIG_W1_SLAVE_DS2408_READBACK is not set +CONFIG_W1_SLAVE_DS2413=m +CONFIG_W1_SLAVE_DS2406=m +CONFIG_W1_SLAVE_DS2423=m +CONFIG_W1_SLAVE_DS2805=m +CONFIG_W1_SLAVE_DS2430=m +CONFIG_W1_SLAVE_DS2431=m +CONFIG_W1_SLAVE_DS2433=m +# CONFIG_W1_SLAVE_DS2433_CRC is not set +CONFIG_W1_SLAVE_DS2438=m +CONFIG_W1_SLAVE_DS250X=m +CONFIG_W1_SLAVE_DS2780=m +CONFIG_W1_SLAVE_DS2781=m +CONFIG_W1_SLAVE_DS28E04=m +CONFIG_W1_SLAVE_DS28E17=m +# end of 1-wire Slaves + +CONFIG_POWER_AVS=y +CONFIG_QCOM_CPR=m +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_AS3722=y +CONFIG_POWER_RESET_GPIO=y +CONFIG_POWER_RESET_GPIO_RESTART=y +CONFIG_POWER_RESET_LTC2952=y +CONFIG_POWER_RESET_MT6323=y +CONFIG_POWER_RESET_RESTART=y +CONFIG_POWER_RESET_SYSCON=y +CONFIG_POWER_RESET_SYSCON_POWEROFF=y +CONFIG_REBOOT_MODE=m +CONFIG_SYSCON_REBOOT_MODE=m +CONFIG_NVMEM_REBOOT_MODE=m +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_POWER_SUPPLY_HWMON=y +CONFIG_PDA_POWER=m +CONFIG_GENERIC_ADC_BATTERY=m +CONFIG_MAX8925_POWER=m +CONFIG_WM831X_BACKUP=m +CONFIG_WM831X_POWER=m +CONFIG_WM8350_POWER=m +CONFIG_TEST_POWER=m +CONFIG_BATTERY_88PM860X=m +CONFIG_CHARGER_ADP5061=m +CONFIG_BATTERY_ACT8945A=m +CONFIG_BATTERY_CPCAP=m +CONFIG_BATTERY_CW2015=m +CONFIG_BATTERY_DS2760=m +CONFIG_BATTERY_DS2780=m +CONFIG_BATTERY_DS2781=m +CONFIG_BATTERY_DS2782=m +CONFIG_BATTERY_LEGO_EV3=m +CONFIG_BATTERY_SBS=m +CONFIG_CHARGER_SBS=m +CONFIG_MANAGER_SBS=m +CONFIG_BATTERY_BQ27XXX=m +CONFIG_BATTERY_BQ27XXX_I2C=m +CONFIG_BATTERY_BQ27XXX_HDQ=m +# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set +CONFIG_BATTERY_DA9030=m +CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_DA9150=m +CONFIG_BATTERY_DA9150=m 
+CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m +CONFIG_AXP288_CHARGER=m +CONFIG_AXP288_FUEL_GAUGE=m +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_BATTERY_MAX1721X=m +CONFIG_BATTERY_TWL4030_MADC=m +CONFIG_CHARGER_88PM860X=m +CONFIG_CHARGER_PCF50633=m +CONFIG_BATTERY_RX51=m +CONFIG_CHARGER_ISP1704=m +CONFIG_CHARGER_MAX8903=m +CONFIG_CHARGER_TWL4030=m +CONFIG_CHARGER_LP8727=m +CONFIG_CHARGER_LP8788=m +CONFIG_CHARGER_GPIO=m +CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_LT3651=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_DETECTOR_MAX14656=m +CONFIG_CHARGER_MAX77650=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_MAX8997=m +CONFIG_CHARGER_MAX8998=m +CONFIG_CHARGER_MP2629=m +CONFIG_CHARGER_BQ2415X=m +CONFIG_CHARGER_BQ24190=m +CONFIG_CHARGER_BQ24257=m +CONFIG_CHARGER_BQ24735=m +CONFIG_CHARGER_BQ25890=m +CONFIG_CHARGER_SMB347=m +CONFIG_CHARGER_TPS65090=m +CONFIG_CHARGER_TPS65217=m +CONFIG_BATTERY_GAUGE_LTC2941=m +CONFIG_BATTERY_RT5033=m +CONFIG_CHARGER_RT9455=m +CONFIG_CHARGER_CROS_USBPD=m +CONFIG_CHARGER_UCS1002=m +CONFIG_CHARGER_BD70528=m +CONFIG_CHARGER_BD99954=m +CONFIG_CHARGER_WILCO=m +CONFIG_HWMON=y +CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +CONFIG_SENSORS_ABITUGURU=m +CONFIG_SENSORS_ABITUGURU3=m +CONFIG_SENSORS_AD7314=m +CONFIG_SENSORS_AD7414=m +CONFIG_SENSORS_AD7418=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM1026=m +CONFIG_SENSORS_ADM1029=m +CONFIG_SENSORS_ADM1031=m +CONFIG_SENSORS_ADM1177=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_ADT7X10=m +CONFIG_SENSORS_ADT7310=m +CONFIG_SENSORS_ADT7410=m +CONFIG_SENSORS_ADT7411=m +CONFIG_SENSORS_ADT7462=m +CONFIG_SENSORS_ADT7470=m +CONFIG_SENSORS_ADT7475=m +CONFIG_SENSORS_AS370=m +CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_AXI_FAN_CONTROL=m +CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m +CONFIG_SENSORS_FAM15H_POWER=m +CONFIG_SENSORS_AMD_ENERGY=m +CONFIG_SENSORS_APPLESMC=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m +CONFIG_SENSORS_ATXP1=m +CONFIG_SENSORS_DRIVETEMP=m +CONFIG_SENSORS_DS620=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_DELL_SMM=m +CONFIG_SENSORS_DA9052_ADC=m +CONFIG_SENSORS_DA9055=m +CONFIG_SENSORS_I5K_AMB=m +CONFIG_SENSORS_F71805F=m +CONFIG_SENSORS_F71882FG=m +CONFIG_SENSORS_F75375S=m +CONFIG_SENSORS_GSC=m +CONFIG_SENSORS_MC13783_ADC=m +CONFIG_SENSORS_FSCHMD=m +CONFIG_SENSORS_FTSTEUTATES=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_G760A=m +CONFIG_SENSORS_G762=m +CONFIG_SENSORS_GPIO_FAN=m +CONFIG_SENSORS_HIH6130=m +CONFIG_SENSORS_IBMAEM=m +CONFIG_SENSORS_IBMPEX=m +CONFIG_SENSORS_IIO_HWMON=m +CONFIG_SENSORS_I5500=m +CONFIG_SENSORS_CORETEMP=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_JC42=m +CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LINEAGE=m +CONFIG_SENSORS_LOCHNAGAR=m +CONFIG_SENSORS_LTC2945=m +CONFIG_SENSORS_LTC2947=m +CONFIG_SENSORS_LTC2947_I2C=m +CONFIG_SENSORS_LTC2947_SPI=m +CONFIG_SENSORS_LTC2990=m +CONFIG_SENSORS_LTC4151=m +CONFIG_SENSORS_LTC4215=m +CONFIG_SENSORS_LTC4222=m +CONFIG_SENSORS_LTC4245=m +CONFIG_SENSORS_LTC4260=m +CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_MAX1111=m +CONFIG_SENSORS_MAX16065=m +CONFIG_SENSORS_MAX1619=m +CONFIG_SENSORS_MAX1668=m +CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m +CONFIG_SENSORS_MAX31730=m +CONFIG_SENSORS_MAX6621=m +CONFIG_SENSORS_MAX6639=m +CONFIG_SENSORS_MAX6642=m +CONFIG_SENSORS_MAX6650=m +CONFIG_SENSORS_MAX6697=m +CONFIG_SENSORS_MAX31790=m +CONFIG_SENSORS_MCP3021=m +CONFIG_SENSORS_MLXREG_FAN=m +CONFIG_SENSORS_TC654=m +CONFIG_SENSORS_MENF21BMC_HWMON=m 
+CONFIG_SENSORS_ADCXX=m +CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM70=m +CONFIG_SENSORS_LM73=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM77=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_LM93=m +CONFIG_SENSORS_LM95234=m +CONFIG_SENSORS_LM95241=m +CONFIG_SENSORS_LM95245=m +CONFIG_SENSORS_PC87360=m +CONFIG_SENSORS_PC87427=m +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_SENSORS_NCT6683=m +CONFIG_SENSORS_NCT6775=m +CONFIG_SENSORS_NCT7802=m +CONFIG_SENSORS_NCT7904=m +CONFIG_SENSORS_NPCM7XX=m +CONFIG_SENSORS_PCF8591=m +CONFIG_PMBUS=m +CONFIG_SENSORS_PMBUS=m +CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_BEL_PFE=m +CONFIG_SENSORS_IBM_CFFPS=m +CONFIG_SENSORS_INSPUR_IPSPS=m +CONFIG_SENSORS_IR35221=m +CONFIG_SENSORS_IR38064=m +CONFIG_SENSORS_IRPS5401=m +CONFIG_SENSORS_ISL68137=m +CONFIG_SENSORS_LM25066=m +CONFIG_SENSORS_LTC2978=m +# CONFIG_SENSORS_LTC2978_REGULATOR is not set +CONFIG_SENSORS_LTC3815=m +CONFIG_SENSORS_MAX16064=m +CONFIG_SENSORS_MAX16601=m +CONFIG_SENSORS_MAX20730=m +CONFIG_SENSORS_MAX20751=m +CONFIG_SENSORS_MAX31785=m +CONFIG_SENSORS_MAX34440=m +CONFIG_SENSORS_MAX8688=m +CONFIG_SENSORS_PXE1610=m +CONFIG_SENSORS_TPS40422=m +CONFIG_SENSORS_TPS53679=m +CONFIG_SENSORS_UCD9000=m +CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_XDPE122=m +CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_PWM_FAN=m +CONFIG_SENSORS_SHT15=m +CONFIG_SENSORS_SHT21=m +CONFIG_SENSORS_SHT3x=m +CONFIG_SENSORS_SHTC1=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_DME1737=m +CONFIG_SENSORS_EMC1403=m +CONFIG_SENSORS_EMC2103=m +CONFIG_SENSORS_EMC6W201=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_SMSC47M192=m +CONFIG_SENSORS_SMSC47B397=m +CONFIG_SENSORS_SCH56XX_COMMON=m +CONFIG_SENSORS_SCH5627=m +CONFIG_SENSORS_SCH5636=m +CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SMM665=m +CONFIG_SENSORS_ADC128D818=m +CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_ADS7871=m +CONFIG_SENSORS_AMC6821=m +CONFIG_SENSORS_INA209=m +CONFIG_SENSORS_INA2XX=m +CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_TC74=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_TMP102=m +CONFIG_SENSORS_TMP103=m +CONFIG_SENSORS_TMP108=m +CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_TMP513=m +CONFIG_SENSORS_VIA_CPUTEMP=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83773G=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83791D=m +CONFIG_SENSORS_W83792D=m +CONFIG_SENSORS_W83793=m +CONFIG_SENSORS_W83795=m +# CONFIG_SENSORS_W83795_FANCTRL is not set +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83L786NG=m +CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM831X=m +CONFIG_SENSORS_WM8350=m +CONFIG_SENSORS_XGENE=m + +# +# ACPI drivers +# +CONFIG_SENSORS_ACPI_POWER=m +CONFIG_SENSORS_ATK0110=m +CONFIG_THERMAL=y +# CONFIG_THERMAL_STATISTICS is not set +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 +CONFIG_THERMAL_HWMON=y +CONFIG_THERMAL_OF=y +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set +CONFIG_THERMAL_GOV_FAIR_SHARE=y +CONFIG_THERMAL_GOV_STEP_WISE=y +CONFIG_THERMAL_GOV_BANG_BANG=y +CONFIG_THERMAL_GOV_USER_SPACE=y +CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y +CONFIG_CPU_THERMAL=y +CONFIG_CPU_FREQ_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y +CONFIG_CLOCK_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +# CONFIG_THERMAL_EMULATION is not set 
+CONFIG_THERMAL_MMIO=m +CONFIG_MAX77620_THERMAL=m +CONFIG_DA9062_THERMAL=m + +# +# Intel thermal drivers +# +CONFIG_INTEL_POWERCLAMP=m +CONFIG_X86_PKG_TEMP_THERMAL=m +CONFIG_INTEL_SOC_DTS_IOSF_CORE=m +CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# +CONFIG_INT340X_THERMAL=m +CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m +CONFIG_PROC_THERMAL_MMIO_RAPL=y +# end of ACPI INT340X thermal drivers + +CONFIG_INTEL_BXT_PMIC_THERMAL=m +CONFIG_INTEL_PCH_THERMAL=m +# end of Intel thermal drivers + +# CONFIG_TI_SOC_THERMAL is not set +CONFIG_GENERIC_ADC_THERMAL=m +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +CONFIG_WATCHDOG_OPEN_TIMEOUT=0 +CONFIG_WATCHDOG_SYSFS=y + +# +# Watchdog Pretimeout Governors +# +CONFIG_WATCHDOG_PRETIMEOUT_GOV=y +CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y +# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set +CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set +CONFIG_BD70528_WATCHDOG=m +CONFIG_DA9052_WATCHDOG=m +CONFIG_DA9055_WATCHDOG=m +CONFIG_DA9063_WATCHDOG=m +CONFIG_DA9062_WATCHDOG=m +CONFIG_GPIO_WATCHDOG=m +CONFIG_MENF21BMC_WATCHDOG=m +CONFIG_MENZ069_WATCHDOG=m +CONFIG_WDAT_WDT=m +CONFIG_WM831X_WATCHDOG=m +CONFIG_WM8350_WATCHDOG=m +CONFIG_XILINX_WATCHDOG=m +CONFIG_ZIIRAVE_WATCHDOG=m +CONFIG_RAVE_SP_WATCHDOG=m +CONFIG_MLX_WDT=m +CONFIG_CADENCE_WATCHDOG=m +CONFIG_DW_WATCHDOG=m +CONFIG_RN5T618_WATCHDOG=m +CONFIG_TWL4030_WATCHDOG=m +CONFIG_MAX63XX_WATCHDOG=m +CONFIG_MAX77620_WATCHDOG=m +CONFIG_RETU_WATCHDOG=m +CONFIG_STPMIC1_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_EBC_C384_WDT=m +CONFIG_F71808E_WDT=m +CONFIG_SP5100_TCO=m +CONFIG_SBC_FITPC2_WATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_IBMASR=m +CONFIG_WAFER_WDT=m +CONFIG_I6300ESB_WDT=m +CONFIG_IE6XX_WDT=m +CONFIG_ITCO_WDT=m +CONFIG_ITCO_VENDOR_SUPPORT=y +CONFIG_IT8712F_WDT=m +CONFIG_IT87_WDT=m +CONFIG_HP_WATCHDOG=m +CONFIG_HPWDT_NMI_DECODING=y +CONFIG_KEMPLD_WDT=m +CONFIG_SC1200_WDT=m +CONFIG_PC87413_WDT=m +CONFIG_NV_TCO=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_SMSC_SCH311X_WDT=m +CONFIG_SMSC37B787_WDT=m +CONFIG_TQMX86_WDT=m +CONFIG_VIA_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_W83977F_WDT=m +CONFIG_MACHZ_WDT=m +CONFIG_SBC_EPX_C3_WATCHDOG=m +CONFIG_INTEL_MEI_WDT=m +CONFIG_NI903X_WDT=m +CONFIG_NIC7018_WDT=m +CONFIG_MEN_A21_WDT=m +CONFIG_XEN_WDT=m + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_PCMCIAHOST_POSSIBLE=y +CONFIG_SSB_PCMCIAHOST=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y +CONFIG_SSB_SDIOHOST=y +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +CONFIG_SSB_DRIVER_GPIO=y +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +# CONFIG_BCMA_HOST_SOC is not set +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +CONFIG_BCMA_DRIVER_GPIO=y +# CONFIG_BCMA_DEBUG is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +CONFIG_MFD_ACT8945A=m +CONFIG_MFD_AS3711=y +CONFIG_MFD_AS3722=m 
+CONFIG_PMIC_ADP5520=y +CONFIG_MFD_AAT2870_CORE=y +CONFIG_MFD_ATMEL_FLEXCOM=m +CONFIG_MFD_ATMEL_HLCDC=m +CONFIG_MFD_BCM590XX=m +CONFIG_MFD_BD9571MWV=m +CONFIG_MFD_AXP20X=m +CONFIG_MFD_AXP20X_I2C=m +CONFIG_MFD_CROS_EC_DEV=m +CONFIG_MFD_MADERA=m +CONFIG_MFD_MADERA_I2C=m +CONFIG_MFD_MADERA_SPI=m +CONFIG_MFD_CS47L15=y +CONFIG_MFD_CS47L35=y +CONFIG_MFD_CS47L85=y +CONFIG_MFD_CS47L90=y +CONFIG_MFD_CS47L92=y +CONFIG_PMIC_DA903X=y +CONFIG_PMIC_DA9052=y +CONFIG_MFD_DA9052_SPI=y +CONFIG_MFD_DA9052_I2C=y +CONFIG_MFD_DA9055=y +CONFIG_MFD_DA9062=m +CONFIG_MFD_DA9063=m +CONFIG_MFD_DA9150=m +CONFIG_MFD_DLN2=m +CONFIG_MFD_GATEWORKS_GSC=m +CONFIG_MFD_MC13XXX=m +CONFIG_MFD_MC13XXX_SPI=m +CONFIG_MFD_MC13XXX_I2C=m +CONFIG_MFD_MP2629=m +CONFIG_MFD_HI6421_PMIC=m +CONFIG_HTC_PASIC3=m +CONFIG_HTC_I2CPLD=y +CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m +CONFIG_LPC_ICH=m +CONFIG_LPC_SCH=m +CONFIG_INTEL_SOC_PMIC=y +CONFIG_INTEL_SOC_PMIC_BXTWC=m +CONFIG_INTEL_SOC_PMIC_CHTWC=y +CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m +CONFIG_INTEL_SOC_PMIC_MRFLD=m +CONFIG_MFD_INTEL_LPSS=m +CONFIG_MFD_INTEL_LPSS_ACPI=m +CONFIG_MFD_INTEL_LPSS_PCI=m +CONFIG_MFD_INTEL_MSIC=y +CONFIG_MFD_INTEL_PMC_BXT=m +CONFIG_MFD_IQS62X=m +CONFIG_MFD_JANZ_CMODIO=m +CONFIG_MFD_KEMPLD=m +CONFIG_MFD_88PM800=m +CONFIG_MFD_88PM805=m +CONFIG_MFD_88PM860X=y +CONFIG_MFD_MAX14577=m +CONFIG_MFD_MAX77620=y +CONFIG_MFD_MAX77650=m +CONFIG_MFD_MAX77686=m +CONFIG_MFD_MAX77693=m +CONFIG_MFD_MAX77843=y +CONFIG_MFD_MAX8907=m +CONFIG_MFD_MAX8925=y +CONFIG_MFD_MAX8997=y +CONFIG_MFD_MAX8998=y +CONFIG_MFD_MT6360=m +CONFIG_MFD_MT6397=m +CONFIG_MFD_MENF21BMC=m +CONFIG_EZX_PCAP=y +CONFIG_MFD_CPCAP=m +CONFIG_MFD_VIPERBOARD=m +CONFIG_MFD_RETU=m +CONFIG_MFD_PCF50633=m +CONFIG_PCF50633_ADC=m +CONFIG_PCF50633_GPIO=m +CONFIG_UCB1400_CORE=m +CONFIG_MFD_RDC321X=m +CONFIG_MFD_RT5033=m +CONFIG_MFD_RC5T583=y +CONFIG_MFD_RK808=m +CONFIG_MFD_RN5T618=m +CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SI476X_CORE=m +CONFIG_MFD_SM501=m +CONFIG_MFD_SM501_GPIO=y +CONFIG_MFD_SKY81452=m +CONFIG_MFD_SMSC=y +CONFIG_ABX500_CORE=y +CONFIG_AB3100_CORE=y +CONFIG_AB3100_OTP=y +CONFIG_MFD_STMPE=y + +# +# STMicroelectronics STMPE Interface Drivers +# +CONFIG_STMPE_I2C=y +CONFIG_STMPE_SPI=y +# end of STMicroelectronics STMPE Interface Drivers + +CONFIG_MFD_SYSCON=y +CONFIG_MFD_TI_AM335X_TSCADC=m +CONFIG_MFD_LP3943=m +CONFIG_MFD_LP8788=y +CONFIG_MFD_TI_LMU=m +CONFIG_MFD_PALMAS=y +CONFIG_TPS6105X=m +CONFIG_TPS65010=m +CONFIG_TPS6507X=m +CONFIG_MFD_TPS65086=m +CONFIG_MFD_TPS65090=y +CONFIG_MFD_TPS65217=m +CONFIG_MFD_TPS68470=y +CONFIG_MFD_TI_LP873X=m +CONFIG_MFD_TI_LP87565=m +CONFIG_MFD_TPS65218=m +CONFIG_MFD_TPS6586X=y +CONFIG_MFD_TPS65910=y +CONFIG_MFD_TPS65912=m +CONFIG_MFD_TPS65912_I2C=m +CONFIG_MFD_TPS65912_SPI=m +CONFIG_MFD_TPS80031=y +CONFIG_TWL4030_CORE=y +CONFIG_MFD_TWL4030_AUDIO=y +CONFIG_TWL6040_CORE=y +CONFIG_MFD_WL1273_CORE=m +CONFIG_MFD_LM3533=m +CONFIG_MFD_TC3589X=y +CONFIG_MFD_TQMX86=m +CONFIG_MFD_VX855=m +CONFIG_MFD_LOCHNAGAR=y +CONFIG_MFD_ARIZONA=y +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +CONFIG_MFD_CS47L24=y +CONFIG_MFD_WM5102=y +CONFIG_MFD_WM5110=y +CONFIG_MFD_WM8997=y +CONFIG_MFD_WM8998=y +CONFIG_MFD_WM8400=y +CONFIG_MFD_WM831X=y +CONFIG_MFD_WM831X_I2C=y +CONFIG_MFD_WM831X_SPI=y +CONFIG_MFD_WM8350=y +CONFIG_MFD_WM8350_I2C=y +CONFIG_MFD_WM8994=m +CONFIG_MFD_ROHM_BD718XX=m +CONFIG_MFD_ROHM_BD70528=m +CONFIG_MFD_ROHM_BD71828=m +CONFIG_MFD_STPMIC1=m +CONFIG_MFD_STMFX=m +CONFIG_MFD_WCD934X=m +CONFIG_RAVE_SP_CORE=m +# end of Multifunction device drivers + +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set 
+CONFIG_REGULATOR_FIXED_VOLTAGE=m +CONFIG_REGULATOR_VIRTUAL_CONSUMER=m +CONFIG_REGULATOR_USERSPACE_CONSUMER=m +CONFIG_REGULATOR_88PG86X=m +CONFIG_REGULATOR_88PM800=m +CONFIG_REGULATOR_88PM8607=m +CONFIG_REGULATOR_ACT8865=m +CONFIG_REGULATOR_ACT8945A=m +CONFIG_REGULATOR_AD5398=m +CONFIG_REGULATOR_AAT2870=m +CONFIG_REGULATOR_AB3100=m +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +CONFIG_REGULATOR_AS3711=m +CONFIG_REGULATOR_AS3722=m +CONFIG_REGULATOR_AXP20X=m +CONFIG_REGULATOR_BCM590XX=m +CONFIG_REGULATOR_BD70528=m +CONFIG_REGULATOR_BD71828=m +CONFIG_REGULATOR_BD718XX=m +CONFIG_REGULATOR_BD9571MWV=m +CONFIG_REGULATOR_CPCAP=m +CONFIG_REGULATOR_DA903X=m +CONFIG_REGULATOR_DA9052=m +CONFIG_REGULATOR_DA9055=m +CONFIG_REGULATOR_DA9062=m +CONFIG_REGULATOR_DA9063=m +CONFIG_REGULATOR_DA9210=m +CONFIG_REGULATOR_DA9211=m +CONFIG_REGULATOR_FAN53555=m +CONFIG_REGULATOR_GPIO=m +CONFIG_REGULATOR_HI6421=m +CONFIG_REGULATOR_HI6421V530=m +CONFIG_REGULATOR_ISL9305=m +CONFIG_REGULATOR_ISL6271A=m +CONFIG_REGULATOR_LM363X=m +CONFIG_REGULATOR_LOCHNAGAR=m +CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_LP3972=m +CONFIG_REGULATOR_LP872X=m +CONFIG_REGULATOR_LP873X=m +CONFIG_REGULATOR_LP8755=m +CONFIG_REGULATOR_LP87565=m +CONFIG_REGULATOR_LP8788=m +CONFIG_REGULATOR_LTC3589=m +CONFIG_REGULATOR_LTC3676=m +CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX77620=m +CONFIG_REGULATOR_MAX77650=m +CONFIG_REGULATOR_MAX8649=m +CONFIG_REGULATOR_MAX8660=m +CONFIG_REGULATOR_MAX8907=m +CONFIG_REGULATOR_MAX8925=m +CONFIG_REGULATOR_MAX8952=m +CONFIG_REGULATOR_MAX8973=m +CONFIG_REGULATOR_MAX8997=m +CONFIG_REGULATOR_MAX8998=m +CONFIG_REGULATOR_MAX77686=m +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MAX77802=m +CONFIG_REGULATOR_MAX77826=m +CONFIG_REGULATOR_MC13XXX_CORE=m +CONFIG_REGULATOR_MC13783=m +CONFIG_REGULATOR_MC13892=m +CONFIG_REGULATOR_MCP16502=m +CONFIG_REGULATOR_MP5416=m +CONFIG_REGULATOR_MP8859=m +CONFIG_REGULATOR_MP886X=m +CONFIG_REGULATOR_MPQ7920=m +CONFIG_REGULATOR_MT6311=m +CONFIG_REGULATOR_MT6323=m +CONFIG_REGULATOR_MT6358=m +CONFIG_REGULATOR_MT6397=m +CONFIG_REGULATOR_PALMAS=m +CONFIG_REGULATOR_PCAP=m +CONFIG_REGULATOR_PCF50633=m +CONFIG_REGULATOR_PFUZE100=m +CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m +CONFIG_REGULATOR_PV88090=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_SPMI=m +CONFIG_REGULATOR_RC5T583=m +CONFIG_REGULATOR_RK808=m +CONFIG_REGULATOR_RN5T618=m +CONFIG_REGULATOR_ROHM=m +CONFIG_REGULATOR_RT5033=m +CONFIG_REGULATOR_S2MPA01=m +CONFIG_REGULATOR_S2MPS11=m +CONFIG_REGULATOR_S5M8767=m +CONFIG_REGULATOR_SKY81452=m +CONFIG_REGULATOR_SLG51000=m +CONFIG_REGULATOR_STPMIC1=m +CONFIG_REGULATOR_SY8106A=m +CONFIG_REGULATOR_SY8824X=m +CONFIG_REGULATOR_TPS51632=m +CONFIG_REGULATOR_TPS6105X=m +CONFIG_REGULATOR_TPS62360=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m +CONFIG_REGULATOR_TPS65086=m +CONFIG_REGULATOR_TPS65090=m +CONFIG_REGULATOR_TPS65132=m +CONFIG_REGULATOR_TPS65217=m +CONFIG_REGULATOR_TPS65218=m +CONFIG_REGULATOR_TPS6524X=m +CONFIG_REGULATOR_TPS6586X=m +CONFIG_REGULATOR_TPS65910=m +CONFIG_REGULATOR_TPS65912=m +CONFIG_REGULATOR_TPS80031=m +CONFIG_REGULATOR_TWL4030=m +CONFIG_REGULATOR_VCTRL=m +CONFIG_REGULATOR_WM831X=m +CONFIG_REGULATOR_WM8350=m +CONFIG_REGULATOR_WM8400=m +CONFIG_REGULATOR_WM8994=m +CONFIG_RC_CORE=m +CONFIG_RC_MAP=m +CONFIG_LIRC=y +CONFIG_RC_DECODERS=y +CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m +CONFIG_IR_RC6_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_SANYO_DECODER=m 
+CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_IR_IMON_DECODER=m +CONFIG_IR_RCMM_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_RC_ATI_REMOTE=m +CONFIG_IR_ENE=m +CONFIG_IR_HIX5HD2=m +CONFIG_IR_IMON=m +CONFIG_IR_IMON_RAW=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_ITE_CIR=m +CONFIG_IR_FINTEK=m +CONFIG_IR_NUVOTON=m +CONFIG_IR_REDRAT3=m +CONFIG_IR_SPI=m +CONFIG_IR_STREAMZAP=m +CONFIG_IR_WINBOND_CIR=m +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_LOOPBACK=m +CONFIG_IR_GPIO_CIR=m +CONFIG_IR_GPIO_TX=m +CONFIG_IR_PWM_TX=m +CONFIG_IR_SERIAL=m +CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m +CONFIG_RC_XBOX_DVD=m +CONFIG_CEC_CORE=m +CONFIG_CEC_NOTIFIER=y +CONFIG_CEC_PIN=y +CONFIG_MEDIA_CEC_RC=y +# CONFIG_CEC_PIN_ERROR_INJ is not set +CONFIG_MEDIA_CEC_SUPPORT=y +CONFIG_CEC_CROS_EC=m +CONFIG_CEC_GPIO=m +CONFIG_CEC_SECO=m +CONFIG_CEC_SECO_RC=y +CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m +CONFIG_MEDIA_SUPPORT=m +# CONFIG_MEDIA_SUPPORT_FILTER is not set +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y + +# +# Media device types +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_PLATFORM_SUPPORT=y +CONFIG_MEDIA_TEST_SUPPORT=y +# end of Media device types + +# +# Media core support +# +CONFIG_VIDEO_DEV=m +CONFIG_MEDIA_CONTROLLER=y +CONFIG_DVB_CORE=m +# end of Media core support + +# +# Video4Linux options +# +CONFIG_VIDEO_V4L2=m +CONFIG_VIDEO_V4L2_I2C=y +CONFIG_VIDEO_V4L2_SUBDEV_API=y +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_MEM2MEM_DEV=m +CONFIG_V4L2_FLASH_LED_CLASS=m +CONFIG_V4L2_FWNODE=m +CONFIG_VIDEOBUF_GEN=m +CONFIG_VIDEOBUF_DMA_SG=m +CONFIG_VIDEOBUF_VMALLOC=m +# end of Video4Linux options + +# +# Media controller options +# +CONFIG_MEDIA_CONTROLLER_DVB=y +CONFIG_MEDIA_CONTROLLER_REQUEST_API=y + +# +# Please notice that the enabled Media controller Request API is EXPERIMENTAL +# +# end of Media controller options + +# +# Digital TV options +# +CONFIG_DVB_MMAP=y +CONFIG_DVB_NET=y +CONFIG_DVB_MAX_ADAPTERS=16 +# CONFIG_DVB_DYNAMIC_MINORS is not set +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set +# CONFIG_DVB_ULE_DEBUG is not set +# end of Digital TV options + +# +# Media drivers +# +CONFIG_TTPCI_EEPROM=m +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +CONFIG_USB_VIDEO_CLASS=m +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y +CONFIG_USB_GSPCA=m +CONFIG_USB_M5602=m +CONFIG_USB_STV06XX=m +CONFIG_USB_GL860=m +CONFIG_USB_GSPCA_BENQ=m +CONFIG_USB_GSPCA_CONEX=m +CONFIG_USB_GSPCA_CPIA1=m +CONFIG_USB_GSPCA_DTCS033=m +CONFIG_USB_GSPCA_ETOMS=m +CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m +CONFIG_USB_GSPCA_JL2005BCD=m +CONFIG_USB_GSPCA_KINECT=m +CONFIG_USB_GSPCA_KONICA=m +CONFIG_USB_GSPCA_MARS=m +CONFIG_USB_GSPCA_MR97310A=m +CONFIG_USB_GSPCA_NW80X=m +CONFIG_USB_GSPCA_OV519=m +CONFIG_USB_GSPCA_OV534=m +CONFIG_USB_GSPCA_OV534_9=m +CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m +CONFIG_USB_GSPCA_PAC7311=m +CONFIG_USB_GSPCA_SE401=m +CONFIG_USB_GSPCA_SN9C2028=m +CONFIG_USB_GSPCA_SN9C20X=m +CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SPCA1528=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_SQ930X=m +CONFIG_USB_GSPCA_STK014=m 
+CONFIG_USB_GSPCA_STK1135=m +CONFIG_USB_GSPCA_STV0680=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TOPRO=m +CONFIG_USB_GSPCA_TOUPTEK=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_VICAM=m +CONFIG_USB_GSPCA_XIRLINK_CIT=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_USB_PWC=m +# CONFIG_USB_PWC_DEBUG is not set +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_VIDEO_CPIA2=m +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m +CONFIG_VIDEO_USBTV=m + +# +# Analog TV USB devices +# +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m +CONFIG_VIDEO_GO7007=m +CONFIG_VIDEO_GO7007_USB=m +CONFIG_VIDEO_GO7007_LOADER=m +CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_TM6000=m +CONFIG_VIDEO_TM6000_ALSA=m +CONFIG_VIDEO_TM6000_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_CXUSB_ANALOG=y +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_V2=m +CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_AF9035=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_USB_DRV=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_AS102=m + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m +CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +CONFIG_USB_AIRSPY=m +CONFIG_USB_HACKRF=m +CONFIG_USB_MSI2500=m +CONFIG_MEDIA_PCI_SUPPORT=y + +# +# Media capture support +# +CONFIG_VIDEO_MEYE=m +CONFIG_VIDEO_SOLO6X10=m +CONFIG_VIDEO_TW5864=m +CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m + +# +# Media capture/analog TV support +# +CONFIG_VIDEO_IVTV=m +# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set +CONFIG_VIDEO_IVTV_ALSA=m +CONFIG_VIDEO_FB_IVTV=m +# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set +CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DT3155=m + +# +# Media capture/analog/hybrid TV support +# +CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_CX18_ALSA=m +CONFIG_VIDEO_CX23885=m +CONFIG_MEDIA_ALTERA_CI=m +CONFIG_VIDEO_CX25821=m +CONFIG_VIDEO_CX25821_ALSA=m +CONFIG_VIDEO_CX88=m +CONFIG_VIDEO_CX88_ALSA=m +CONFIG_VIDEO_CX88_BLACKBIRD=m +CONFIG_VIDEO_CX88_DVB=m 
+CONFIG_VIDEO_CX88_ENABLE_VP3054=y +CONFIG_VIDEO_CX88_VP3054=m +CONFIG_VIDEO_CX88_MPEG=m +CONFIG_VIDEO_BT848=m +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m +CONFIG_VIDEO_SAA7134_RC=y +CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_GO7007=m +CONFIG_VIDEO_SAA7164=m + +# +# Media digital TV PCI Adapters +# +CONFIG_DVB_AV7110_IR=y +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m +CONFIG_DVB_B2C2_FLEXCOP_PCI=m +# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set +CONFIG_DVB_PLUTO2=m +CONFIG_DVB_DM1105=m +CONFIG_DVB_PT1=m +CONFIG_DVB_PT3=m +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m +CONFIG_DVB_NGENE=m +CONFIG_DVB_DDBRIDGE=m +# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set +CONFIG_DVB_SMIPCIE=m +CONFIG_DVB_NETUP_UNIDVB=m +CONFIG_VIDEO_IPU3_CIO2=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_TEA575X=m +CONFIG_RADIO_SI470X=m +CONFIG_USB_SI470X=m +CONFIG_I2C_SI470X=m +CONFIG_RADIO_SI4713=m +CONFIG_USB_SI4713=m +CONFIG_PLATFORM_SI4713=m +CONFIG_I2C_SI4713=m +CONFIG_RADIO_SI476X=m +CONFIG_USB_MR800=m +CONFIG_USB_DSBR=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_SHARK=m +CONFIG_RADIO_SHARK2=m +CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m +CONFIG_USB_MA901=m +CONFIG_RADIO_TEA5764=m +CONFIG_RADIO_SAA7706H=m +CONFIG_RADIO_TEF6862=m +CONFIG_RADIO_WL1273=m +CONFIG_RADIO_WL128X=m +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_V4L2=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +CONFIG_VIDEOBUF2_DMA_SG=m +CONFIG_VIDEOBUF2_DVB=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set +CONFIG_VIDEO_V4L2_TPG=m +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CAFE_CCIC=m +CONFIG_VIDEO_CADENCE=y +CONFIG_VIDEO_CADENCE_CSI2RX=m +CONFIG_VIDEO_CADENCE_CSI2TX=m +CONFIG_VIDEO_ASPEED=m +CONFIG_VIDEO_MUX=m +CONFIG_VIDEO_XILINX=m +CONFIG_VIDEO_XILINX_TPG=m +CONFIG_VIDEO_XILINX_VTC=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m +CONFIG_DVB_PLATFORM_DRIVERS=y +CONFIG_SDR_PLATFORM_DRIVERS=y + +# +# MMC/SDIO DVB adapters +# +CONFIG_SMS_SDIO_DRV=m +CONFIG_V4L_TEST_DRIVERS=y +CONFIG_VIDEO_VIMC=m +CONFIG_VIDEO_VIVID=m +CONFIG_VIDEO_VIVID_CEC=y +CONFIG_VIDEO_VIVID_MAX_DEVS=64 +CONFIG_VIDEO_VIM2M=m +CONFIG_VIDEO_VICODEC=m + +# +# FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_INPUT=y +# end of Media drivers + +# +# Media ancillary drivers +# +CONFIG_MEDIA_ATTACH=y + +# +# IR I2C driver auto-selected by 'Autoselect ancillary drivers' +# +CONFIG_VIDEO_IR_I2C=m + +# +# Audio decoders, processors and mixers +# +CONFIG_VIDEO_TVAUDIO=m +CONFIG_VIDEO_TDA7432=m +CONFIG_VIDEO_TDA9840=m +CONFIG_VIDEO_TDA1997X=m +CONFIG_VIDEO_TEA6415C=m +CONFIG_VIDEO_TEA6420=m +CONFIG_VIDEO_MSP3400=m +CONFIG_VIDEO_CS3308=m +CONFIG_VIDEO_CS5345=m +CONFIG_VIDEO_CS53L32A=m +CONFIG_VIDEO_TLV320AIC23B=m +CONFIG_VIDEO_UDA1342=m +CONFIG_VIDEO_WM8775=m +CONFIG_VIDEO_WM8739=m +CONFIG_VIDEO_VP27SMPX=m +CONFIG_VIDEO_SONY_BTF_MPX=m +# end of Audio decoders, processors and mixers + +# +# RDS decoders +# +CONFIG_VIDEO_SAA6588=m +# end of RDS decoders + +# +# Video decoders +# +CONFIG_VIDEO_ADV7180=m +CONFIG_VIDEO_ADV7183=m +CONFIG_VIDEO_ADV748X=m +CONFIG_VIDEO_ADV7604=m +CONFIG_VIDEO_ADV7604_CEC=y 
+CONFIG_VIDEO_ADV7842=m +CONFIG_VIDEO_ADV7842_CEC=y +CONFIG_VIDEO_BT819=m +CONFIG_VIDEO_BT856=m +CONFIG_VIDEO_BT866=m +CONFIG_VIDEO_KS0127=m +CONFIG_VIDEO_ML86V7667=m +CONFIG_VIDEO_SAA7110=m +CONFIG_VIDEO_SAA711X=m +CONFIG_VIDEO_TC358743=m +CONFIG_VIDEO_TC358743_CEC=y +CONFIG_VIDEO_TVP514X=m +CONFIG_VIDEO_TVP5150=m +CONFIG_VIDEO_TVP7002=m +CONFIG_VIDEO_TW2804=m +CONFIG_VIDEO_TW9903=m +CONFIG_VIDEO_TW9906=m +CONFIG_VIDEO_TW9910=m +CONFIG_VIDEO_VPX3220=m + +# +# Video and audio decoders +# +CONFIG_VIDEO_SAA717X=m +CONFIG_VIDEO_CX25840=m +# end of Video decoders + +# +# Video encoders +# +CONFIG_VIDEO_SAA7127=m +CONFIG_VIDEO_SAA7185=m +CONFIG_VIDEO_ADV7170=m +CONFIG_VIDEO_ADV7175=m +CONFIG_VIDEO_ADV7343=m +CONFIG_VIDEO_ADV7393=m +CONFIG_VIDEO_AD9389B=m +CONFIG_VIDEO_AK881X=m +CONFIG_VIDEO_THS8200=m +# end of Video encoders + +# +# Video improvement chips +# +CONFIG_VIDEO_UPD64031A=m +CONFIG_VIDEO_UPD64083=m +# end of Video improvement chips + +# +# Audio/Video compression chips +# +CONFIG_VIDEO_SAA6752HS=m +# end of Audio/Video compression chips + +# +# SDR tuner chips +# +CONFIG_SDR_MAX2175=m +# end of SDR tuner chips + +# +# Miscellaneous helper chips +# +CONFIG_VIDEO_THS7303=m +CONFIG_VIDEO_M52790=m +CONFIG_VIDEO_I2C=m +CONFIG_VIDEO_ST_MIPID02=m +# end of Miscellaneous helper chips + +# +# Camera sensor devices +# +CONFIG_VIDEO_APTINA_PLL=m +CONFIG_VIDEO_SMIAPP_PLL=m +CONFIG_VIDEO_HI556=m +CONFIG_VIDEO_IMX214=m +CONFIG_VIDEO_IMX219=m +CONFIG_VIDEO_IMX258=m +CONFIG_VIDEO_IMX274=m +CONFIG_VIDEO_IMX290=m +CONFIG_VIDEO_IMX319=m +CONFIG_VIDEO_IMX355=m +CONFIG_VIDEO_OV2640=m +CONFIG_VIDEO_OV2659=m +CONFIG_VIDEO_OV2680=m +CONFIG_VIDEO_OV2685=m +CONFIG_VIDEO_OV2740=m +CONFIG_VIDEO_OV5640=m +CONFIG_VIDEO_OV5645=m +CONFIG_VIDEO_OV5647=m +CONFIG_VIDEO_OV6650=m +CONFIG_VIDEO_OV5670=m +CONFIG_VIDEO_OV5675=m +CONFIG_VIDEO_OV5695=m +CONFIG_VIDEO_OV7251=m +CONFIG_VIDEO_OV772X=m +CONFIG_VIDEO_OV7640=m +CONFIG_VIDEO_OV7670=m +CONFIG_VIDEO_OV7740=m +CONFIG_VIDEO_OV8856=m +CONFIG_VIDEO_OV9640=m +CONFIG_VIDEO_OV9650=m +CONFIG_VIDEO_OV13858=m +CONFIG_VIDEO_VS6624=m +CONFIG_VIDEO_MT9M001=m +CONFIG_VIDEO_MT9M032=m +CONFIG_VIDEO_MT9M111=m +CONFIG_VIDEO_MT9P031=m +CONFIG_VIDEO_MT9T001=m +CONFIG_VIDEO_MT9T112=m +CONFIG_VIDEO_MT9V011=m +CONFIG_VIDEO_MT9V032=m +CONFIG_VIDEO_MT9V111=m +CONFIG_VIDEO_SR030PC30=m +CONFIG_VIDEO_NOON010PC30=m +CONFIG_VIDEO_M5MOLS=m +CONFIG_VIDEO_RJ54N1=m +CONFIG_VIDEO_S5K6AA=m +CONFIG_VIDEO_S5K6A3=m +CONFIG_VIDEO_S5K4ECGX=m +CONFIG_VIDEO_S5K5BAF=m +CONFIG_VIDEO_SMIAPP=m +CONFIG_VIDEO_ET8EK8=m +CONFIG_VIDEO_S5C73M3=m +# end of Camera sensor devices + +# +# Lens drivers +# +CONFIG_VIDEO_AD5820=m +CONFIG_VIDEO_AK7375=m +CONFIG_VIDEO_DW9714=m +CONFIG_VIDEO_DW9807_VCM=m +# end of Lens drivers + +# +# Flash devices +# +CONFIG_VIDEO_ADP1653=m +CONFIG_VIDEO_LM3560=m +CONFIG_VIDEO_LM3646=m +# end of Flash devices + +# +# SPI helper chips +# +CONFIG_VIDEO_GS1662=m +# end of SPI helper chips + +# +# Media SPI Adapters +# +CONFIG_CXD2880_SPI_DRV=m +# end of Media SPI Adapters + +CONFIG_MEDIA_TUNER=m + +# +# Customize TV tuners +# +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA18250=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_MSI001=m +CONFIG_MEDIA_TUNER_MT20XX=m +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m +CONFIG_MEDIA_TUNER_MT2266=m +CONFIG_MEDIA_TUNER_MT2131=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_XC2028=m 
+CONFIG_MEDIA_TUNER_XC5000=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88RS6000T=m +CONFIG_MEDIA_TUNER_TUA9001=m +CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_IT913X=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_MXL301RF=m +CONFIG_MEDIA_TUNER_QM1D1C0042=m +CONFIG_MEDIA_TUNER_QM1D1B0004=m +# end of Customize TV tuners + +# +# Customise DVB Frontends +# + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV0910=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_STV6111=m +CONFIG_DVB_MXL5XX=m +CONFIG_DVB_M88DS3103=m + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_TDA18271C2DD=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m + +# +# DVB-S (satellite) frontends +# +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m +CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_CX24117=m +CONFIG_DVB_CX24120=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_TS2020=m +CONFIG_DVB_DS3000=m +CONFIG_DVB_MB86A16=m +CONFIG_DVB_TDA10071=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m +CONFIG_DVB_S5H1432=m +CONFIG_DVB_DRXD=m +CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_DIB9000=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m +CONFIG_DVB_STV0367=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +CONFIG_DVB_SI2168=m +CONFIG_DVB_AS102_FE=m +CONFIG_DVB_ZD1301_DEMOD=m +CONFIG_DVB_GP8PSK_FE=m +CONFIG_DVB_CXD2880=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_S921=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +CONFIG_DVB_TC90522=m +CONFIG_DVB_MN88443X=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_DRX39XYJ=m +CONFIG_DVB_LNBH25=m +CONFIG_DVB_LNBH29=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_LNBP22=m +CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_A8293=m +CONFIG_DVB_LGS8GL5=m +CONFIG_DVB_LGS8GXX=m 
+CONFIG_DVB_ATBM8830=m +CONFIG_DVB_TDA665x=m +CONFIG_DVB_IX2505V=m +CONFIG_DVB_M88RS2000=m +CONFIG_DVB_AF9033=m +CONFIG_DVB_HORUS3A=m +CONFIG_DVB_ASCOT2E=m +CONFIG_DVB_HELENE=m + +# +# Common Interface (EN50221) controller drivers +# +CONFIG_DVB_CXD2099=m +CONFIG_DVB_SP2=m +# end of Customise DVB Frontends + +# +# Tools to develop new frontends +# +CONFIG_DVB_DUMMY_FE=m +# end of Media ancillary drivers + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +CONFIG_INTEL_GTT=m +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=10 +CONFIG_VGA_SWITCHEROO=y +CONFIG_DRM=m +CONFIG_DRM_MIPI_DBI=m +CONFIG_DRM_MIPI_DSI=y +CONFIG_DRM_DP_AUX_CHARDEV=y +# CONFIG_DRM_DEBUG_SELFTEST is not set +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_KMS_FB_HELPER=y +# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set +CONFIG_DRM_LOAD_EDID_FIRMWARE=y +CONFIG_DRM_DP_CEC=y +CONFIG_DRM_TTM=m +CONFIG_DRM_TTM_DMA_PAGE_POOL=y +CONFIG_DRM_VRAM_HELPER=m +CONFIG_DRM_TTM_HELPER=m +CONFIG_DRM_GEM_CMA_HELPER=y +CONFIG_DRM_KMS_CMA_HELPER=y +CONFIG_DRM_GEM_SHMEM_HELPER=y +CONFIG_DRM_SCHED=m + +# +# I2C encoder or helper chips +# +CONFIG_DRM_I2C_CH7006=m +CONFIG_DRM_I2C_SIL164=m +CONFIG_DRM_I2C_NXP_TDA998X=m +CONFIG_DRM_I2C_NXP_TDA9950=m +# end of I2C encoder or helper chips + +# +# ARM devices +# +CONFIG_DRM_KOMEDA=m +# end of ARM devices + +CONFIG_DRM_RADEON=m +CONFIG_DRM_RADEON_USERPTR=y +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_AMDGPU_USERPTR=y +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set + +# +# ACP (Audio CoProcessor) Configuration +# +CONFIG_DRM_AMD_ACP=y +# end of ACP (Audio CoProcessor) Configuration + +# +# Display Engine Configuration +# +CONFIG_DRM_AMD_DC=y +CONFIG_DRM_AMD_DC_DCN=y +CONFIG_DRM_AMD_DC_HDCP=y +# CONFIG_DEBUG_KERNEL_DC is not set +# end of Display Engine Configuration + +CONFIG_HSA_AMD=y +CONFIG_DRM_NOUVEAU=m +# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +# CONFIG_NOUVEAU_DEBUG_MMU is not set +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_NOUVEAU_SVM=y +CONFIG_DRM_I915=m +CONFIG_DRM_I915_FORCE_PROBE="*" +CONFIG_DRM_I915_CAPTURE_ERROR=y +CONFIG_DRM_I915_COMPRESS_ERROR=y +CONFIG_DRM_I915_USERPTR=y +CONFIG_DRM_I915_GVT=y +CONFIG_DRM_I915_GVT_KVMGT=m + +# +# drm/i915 Debugging +# +# CONFIG_DRM_I915_WERROR is not set +# CONFIG_DRM_I915_DEBUG is not set +# CONFIG_DRM_I915_DEBUG_MMIO is not set +# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set +# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set +# CONFIG_DRM_I915_DEBUG_GUC is not set +# CONFIG_DRM_I915_SELFTEST is not set +# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set +# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set +# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set +# end of drm/i915 Debugging + +# +# drm/i915 Profile Guided Optimisation +# +CONFIG_DRM_I915_FENCE_TIMEOUT=10000 +CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 +CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 +CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 +CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT=8000 +CONFIG_DRM_I915_STOP_TIMEOUT=100 +CONFIG_DRM_I915_TIMESLICE_DURATION=1 +# end of drm/i915 Profile Guided Optimisation + +CONFIG_DRM_VGEM=m +CONFIG_DRM_VKMS=m +CONFIG_DRM_VMWGFX=m +CONFIG_DRM_VMWGFX_FBCON=y +CONFIG_DRM_GMA500=m +CONFIG_DRM_GMA600=y +CONFIG_DRM_GMA3600=y +CONFIG_DRM_UDL=m +CONFIG_DRM_AST=m +CONFIG_DRM_MGAG200=m +CONFIG_DRM_RCAR_DW_HDMI=m +CONFIG_DRM_RCAR_LVDS=m 
+CONFIG_DRM_QXL=m +CONFIG_DRM_BOCHS=m +CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +CONFIG_DRM_PANEL_ARM_VERSATILE=m +CONFIG_DRM_PANEL_ASUS_Z00T_TM5P5_NT35596=m +CONFIG_DRM_PANEL_BOE_HIMAX8279D=m +CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=m +CONFIG_DRM_PANEL_LVDS=m +CONFIG_DRM_PANEL_SIMPLE=m +CONFIG_DRM_PANEL_ELIDA_KD35T133=m +CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02=m +CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m +CONFIG_DRM_PANEL_ILITEK_IL9322=m +CONFIG_DRM_PANEL_ILITEK_ILI9881C=m +CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m +CONFIG_DRM_PANEL_JDI_LT070ME05000=m +CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m +CONFIG_DRM_PANEL_LEADTEK_LTK050H3146W=m +CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829=m +CONFIG_DRM_PANEL_SAMSUNG_LD9040=m +CONFIG_DRM_PANEL_LG_LB035Q02=m +CONFIG_DRM_PANEL_LG_LG4573=m +CONFIG_DRM_PANEL_NEC_NL8048HL11=m +CONFIG_DRM_PANEL_NOVATEK_NT35510=m +CONFIG_DRM_PANEL_NOVATEK_NT39016=m +CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m +CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m +CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m +CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m +CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m +CONFIG_DRM_PANEL_RAYDIUM_RM67191=m +CONFIG_DRM_PANEL_RAYDIUM_RM68200=m +CONFIG_DRM_PANEL_ROCKTECH_JH057N00900=m +CONFIG_DRM_PANEL_RONBO_RB070D30=m +CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01=m +CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m +CONFIG_DRM_PANEL_SEIKO_43WVF1G=m +CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m +CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m +CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m +CONFIG_DRM_PANEL_SITRONIX_ST7701=m +CONFIG_DRM_PANEL_SITRONIX_ST7789V=m +CONFIG_DRM_PANEL_SONY_ACX424AKP=m +CONFIG_DRM_PANEL_SONY_ACX565AKM=m +CONFIG_DRM_PANEL_TPO_TD028TTEC1=m +CONFIG_DRM_PANEL_TPO_TD043MTEA1=m +CONFIG_DRM_PANEL_TPO_TPG110=m +CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m +CONFIG_DRM_PANEL_VISIONOX_RM69299=m +CONFIG_DRM_PANEL_XINPENG_XPP055C272=m +# end of Display Panels + +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# Display Interface Bridges +# +CONFIG_DRM_CDNS_DSI=m +CONFIG_DRM_CHRONTEL_CH7033=m +CONFIG_DRM_DISPLAY_CONNECTOR=m +CONFIG_DRM_LVDS_CODEC=m +CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m +CONFIG_DRM_NWL_MIPI_DSI=m +CONFIG_DRM_NXP_PTN3460=m +CONFIG_DRM_PARADE_PS8622=m +CONFIG_DRM_PARADE_PS8640=m +CONFIG_DRM_SIL_SII8620=m +CONFIG_DRM_SII902X=m +CONFIG_DRM_SII9234=m +CONFIG_DRM_SIMPLE_BRIDGE=m +CONFIG_DRM_THINE_THC63LVD1024=m +CONFIG_DRM_TOSHIBA_TC358764=m +CONFIG_DRM_TOSHIBA_TC358767=m +CONFIG_DRM_TOSHIBA_TC358768=m +CONFIG_DRM_TI_TFP410=m +CONFIG_DRM_TI_SN65DSI86=m +CONFIG_DRM_TI_TPD12S015=m +CONFIG_DRM_ANALOGIX_ANX6345=m +CONFIG_DRM_ANALOGIX_ANX78XX=m +CONFIG_DRM_ANALOGIX_DP=m +CONFIG_DRM_I2C_ADV7511=m +CONFIG_DRM_I2C_ADV7511_AUDIO=y +CONFIG_DRM_I2C_ADV7511_CEC=y +CONFIG_DRM_DW_HDMI=m +CONFIG_DRM_DW_HDMI_AHB_AUDIO=m +CONFIG_DRM_DW_HDMI_I2S_AUDIO=m +CONFIG_DRM_DW_HDMI_CEC=m +# end of Display Interface Bridges + +# CONFIG_DRM_ETNAVIV is not set +CONFIG_DRM_ARCPGU=m +CONFIG_DRM_MXS=y +CONFIG_DRM_MXSFB=m +CONFIG_DRM_CIRRUS_QEMU=m +CONFIG_DRM_GM12U320=m +CONFIG_TINYDRM_HX8357D=m +CONFIG_TINYDRM_ILI9225=m +CONFIG_TINYDRM_ILI9341=m +CONFIG_TINYDRM_ILI9486=m +CONFIG_TINYDRM_MI0283QT=m +CONFIG_TINYDRM_REPAPER=m +CONFIG_TINYDRM_ST7586=m +CONFIG_TINYDRM_ST7735R=m +CONFIG_DRM_XEN=y +CONFIG_DRM_XEN_FRONTEND=m +CONFIG_DRM_VBOXVIDEO=m +# CONFIG_DRM_LEGACY is not set +CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y + +# +# Frame buffer Devices +# +CONFIG_FB_CMDLINE=y 
+CONFIG_FB_NOTIFY=y +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_BOOT_VESA_SUPPORT=y +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +# CONFIG_FB_FOREIGN_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +CONFIG_FB_BACKLIGHT=m +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +CONFIG_FB_VESA=y +CONFIG_FB_EFI=y +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_INTEL is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_VIA is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set +# CONFIG_FB_SM501 is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +CONFIG_XEN_FBDEV_FRONTEND=m +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +CONFIG_FB_HYPERV=m +CONFIG_FB_SIMPLE=y +# CONFIG_FB_SSD1307 is not set +# CONFIG_FB_SM712 is not set +# end of Frame buffer Devices + +# +# Backlight & LCD device support +# +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_L4F00242T03=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI922X=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m +CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m +CONFIG_LCD_AMS369FG06=m +CONFIG_LCD_LMS501KF03=m +CONFIG_LCD_HX8357=m +CONFIG_LCD_OTM3225A=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_BACKLIGHT_GENERIC=m +CONFIG_BACKLIGHT_LM3533=m +CONFIG_BACKLIGHT_PWM=m +CONFIG_BACKLIGHT_DA903X=m +CONFIG_BACKLIGHT_DA9052=m +CONFIG_BACKLIGHT_MAX8925=m +CONFIG_BACKLIGHT_APPLE=m +CONFIG_BACKLIGHT_QCOM_WLED=m +CONFIG_BACKLIGHT_SAHARA=m +CONFIG_BACKLIGHT_WM831X=m +CONFIG_BACKLIGHT_ADP5520=m +CONFIG_BACKLIGHT_ADP8860=m +CONFIG_BACKLIGHT_ADP8870=m +CONFIG_BACKLIGHT_88PM860X=m +CONFIG_BACKLIGHT_PCF50633=m +CONFIG_BACKLIGHT_AAT2870=m +CONFIG_BACKLIGHT_LM3630A=m +CONFIG_BACKLIGHT_LM3639=m +CONFIG_BACKLIGHT_LP855X=m +CONFIG_BACKLIGHT_LP8788=m +CONFIG_BACKLIGHT_PANDORA=m +CONFIG_BACKLIGHT_SKY81452=m +CONFIG_BACKLIGHT_TPS65217=m +CONFIG_BACKLIGHT_AS3711=m +CONFIG_BACKLIGHT_GPIO=m +CONFIG_BACKLIGHT_LV5207LP=m +CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m +CONFIG_BACKLIGHT_RAVE_SP=m +CONFIG_BACKLIGHT_LED=m +# end of Backlight & LCD device support + +CONFIG_VIDEOMODE_HELPERS=y +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 +# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y 
+CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y +# end of Console display driver support + +# CONFIG_LOGO is not set +# end of Graphics support + +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_SEQ_DEVICE=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=m +CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_PCM_TIMER=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +# CONFIG_SND_SUPPORT_OLD_API is not set +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +CONFIG_SND_VERBOSE_PRINTK=y +CONFIG_SND_DEBUG=y +# CONFIG_SND_DEBUG_VERBOSE is not set +# CONFIG_SND_PCM_XRUN_DEBUG is not set +# CONFIG_SND_CTL_VALIDATION is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_SEQ_MIDI_EVENT=m +CONFIG_SND_SEQ_MIDI=m +CONFIG_SND_SEQ_MIDI_EMUL=m +CONFIG_SND_SEQ_VIRMIDI=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL3_LIB_SEQ=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +# CONFIG_SND_PCSP is not set +CONFIG_SND_DUMMY=m +CONFIG_SND_ALOOP=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ASIHPI=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +# CONFIG_SND_BT87X_OVERCLOCK is not set +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m +CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m +CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_ES1968_INPUT=y +CONFIG_SND_ES1968_RADIO=y +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X_BOOL=y +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LOLA=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MAESTRO3_INPUT=y +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m + +# +# HD-Audio +# +CONFIG_SND_HDA=m +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_RECONFIG=y +CONFIG_SND_HDA_INPUT_BEEP=y 
+CONFIG_SND_HDA_INPUT_BEEP_MODE=1 +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_ANALOG=m +CONFIG_SND_HDA_CODEC_SIGMATEL=m +CONFIG_SND_HDA_CODEC_VIA=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_HDA_CODEC_CIRRUS=m +CONFIG_SND_HDA_CODEC_CONEXANT=m +CONFIG_SND_HDA_CODEC_CA0110=m +CONFIG_SND_HDA_CODEC_CA0132=m +CONFIG_SND_HDA_CODEC_CA0132_DSP=y +CONFIG_SND_HDA_CODEC_CMEDIA=m +CONFIG_SND_HDA_CODEC_SI3054=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +# end of HD-Audio + +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_COMPONENT=y +CONFIG_SND_HDA_I915=y +CONFIG_SND_HDA_EXT_CORE=m +CONFIG_SND_HDA_PREALLOC_SIZE=0 +CONFIG_SND_INTEL_NHLT=y +CONFIG_SND_INTEL_DSP_CONFIG=m +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y +CONFIG_SND_USB_US122L=m +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_FIREWIRE=y +CONFIG_SND_FIREWIRE_LIB=m +CONFIG_SND_DICE=m +CONFIG_SND_OXFW=m +CONFIG_SND_ISIGHT=m +CONFIG_SND_FIREWORKS=m +CONFIG_SND_BEBOB=m +CONFIG_SND_FIREWIRE_DIGI00X=m +CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_AC97_BUS=y +CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +CONFIG_SND_SOC_TOPOLOGY=y +CONFIG_SND_SOC_ACPI=m +CONFIG_SND_SOC_AMD_ACP=m +CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m +CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m +CONFIG_SND_SOC_AMD_ACP3x=m +CONFIG_SND_SOC_AMD_RV_RT5682_MACH=m +CONFIG_SND_SOC_AMD_RENOIR=m +CONFIG_SND_SOC_AMD_RENOIR_MACH=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_SOC_MIKROE_PROTO=m +CONFIG_SND_BCM63XX_I2S_WHISTLER=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +# CONFIG_SND_SOC_FSL_ASRC is not set +# CONFIG_SND_SOC_FSL_SAI is not set +# CONFIG_SND_SOC_FSL_AUDMIX is not set +# CONFIG_SND_SOC_FSL_SSI is not set +# CONFIG_SND_SOC_FSL_SPDIF is not set +# CONFIG_SND_SOC_FSL_ESAI is not set +# CONFIG_SND_SOC_FSL_MICFIL is not set +# CONFIG_SND_SOC_IMX_AUDMUX is not set +# end of SoC Audio for Freescale CPUs + +CONFIG_SND_I2S_HI6210_I2S=m +CONFIG_SND_SOC_IMG=y +CONFIG_SND_SOC_IMG_I2S_IN=m +CONFIG_SND_SOC_IMG_I2S_OUT=m +CONFIG_SND_SOC_IMG_PARALLEL_OUT=m +CONFIG_SND_SOC_IMG_SPDIF_IN=m +CONFIG_SND_SOC_IMG_SPDIF_OUT=m +CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m +CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y +CONFIG_SND_SST_IPC=m +CONFIG_SND_SST_IPC_PCI=m +CONFIG_SND_SST_IPC_ACPI=m +CONFIG_SND_SOC_INTEL_SST_ACPI=m +CONFIG_SND_SOC_INTEL_SST=m +CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m +CONFIG_SND_SOC_INTEL_HASWELL=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m +CONFIG_SND_SOC_INTEL_SKYLAKE=m +CONFIG_SND_SOC_INTEL_SKL=m +CONFIG_SND_SOC_INTEL_APL=m +CONFIG_SND_SOC_INTEL_KBL=m +CONFIG_SND_SOC_INTEL_GLK=m +CONFIG_SND_SOC_INTEL_CNL=m +CONFIG_SND_SOC_INTEL_CFL=m +CONFIG_SND_SOC_INTEL_CML_H=m +CONFIG_SND_SOC_INTEL_CML_LP=m +CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m +CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m +# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set +CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m 
+CONFIG_SND_SOC_ACPI_INTEL_MATCH=m +CONFIG_SND_SOC_INTEL_MACH=y +# CONFIG_SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES is not set +CONFIG_SND_SOC_INTEL_HASWELL_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5650_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m +CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m +# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set +CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m +CONFIG_SND_SOC_INTEL_SOF_WM8804_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m +CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m +CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m +CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH=m +CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m +CONFIG_SND_SOC_INTEL_SOF_DA7219_MAX98373_MACH=m +CONFIG_SND_SOC_INTEL_EHL_RT5660_MACH=m +CONFIG_SND_SOC_MTK_BTCVSD=m +CONFIG_SND_SOC_SOF_TOPLEVEL=y +CONFIG_SND_SOC_SOF_PCI=m +CONFIG_SND_SOC_SOF_ACPI=m +CONFIG_SND_SOC_SOF_OF=m +# CONFIG_SND_SOC_SOF_DEBUG_PROBES is not set +# CONFIG_SND_SOC_SOF_DEVELOPER_SUPPORT is not set +CONFIG_SND_SOC_SOF=m +CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y +CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y +CONFIG_SND_SOC_SOF_INTEL_ACPI=m +CONFIG_SND_SOC_SOF_INTEL_PCI=m +CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m +CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m +CONFIG_SND_SOC_SOF_INTEL_COMMON=m +CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y +CONFIG_SND_SOC_SOF_MERRIFIELD=m +CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_APOLLOLAKE=m +CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_GEMINILAKE=m +CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_CANNONLAKE=m +CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_COFFEELAKE=m +CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ICELAKE=m +CONFIG_SND_SOC_SOF_COMETLAKE=m +CONFIG_SND_SOC_SOF_COMETLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE=m +CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ELKHARTLAKE=m +CONFIG_SND_SOC_SOF_JASPERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_JASPERLAKE=m +CONFIG_SND_SOC_SOF_HDA_COMMON=m +CONFIG_SND_SOC_SOF_HDA_LINK=y +CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y +# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set +CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m +CONFIG_SND_SOC_SOF_HDA=m +CONFIG_SND_SOC_SOF_XTENSA=m + +# +# STMicroelectronics STM32 SOC audio support +# +# end of STMicroelectronics STM32 SOC audio support + +CONFIG_SND_SOC_XILINX_I2S=m +CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m +CONFIG_SND_SOC_XILINX_SPDIF=m 
+CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m +CONFIG_SND_SOC_I2C_AND_SPI=m + +# +# CODEC drivers +# +CONFIG_SND_SOC_AC97_CODEC=m +CONFIG_SND_SOC_ADAU_UTILS=m +CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m +CONFIG_SND_SOC_ADAU7002=m +CONFIG_SND_SOC_ADAU7118=m +CONFIG_SND_SOC_ADAU7118_HW=m +CONFIG_SND_SOC_ADAU7118_I2C=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4118=m +CONFIG_SND_SOC_AK4458=m +CONFIG_SND_SOC_AK4554=m +CONFIG_SND_SOC_AK4613=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK5386=m +CONFIG_SND_SOC_AK5558=m +CONFIG_SND_SOC_ALC5623=m +CONFIG_SND_SOC_BD28623=m +# CONFIG_SND_SOC_BT_SCO is not set +CONFIG_SND_SOC_CPCAP=m +CONFIG_SND_SOC_CROS_EC_CODEC=m +CONFIG_SND_SOC_CS35L32=m +CONFIG_SND_SOC_CS35L33=m +CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m +CONFIG_SND_SOC_CS35L36=m +CONFIG_SND_SOC_CS42L42=m +CONFIG_SND_SOC_CS42L51=m +CONFIG_SND_SOC_CS42L51_I2C=m +CONFIG_SND_SOC_CS42L52=m +CONFIG_SND_SOC_CS42L56=m +CONFIG_SND_SOC_CS42L73=m +CONFIG_SND_SOC_CS4265=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +CONFIG_SND_SOC_CS4271_SPI=m +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +CONFIG_SND_SOC_CS43130=m +CONFIG_SND_SOC_CS4341=m +CONFIG_SND_SOC_CS4349=m +CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_CX2072X=m +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m +CONFIG_SND_SOC_ES7241=m +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8328=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_GTM601=m +CONFIG_SND_SOC_HDAC_HDMI=m +CONFIG_SND_SOC_HDAC_HDA=m +CONFIG_SND_SOC_INNO_RK3036=m +CONFIG_SND_SOC_LOCHNAGAR_SC=m +CONFIG_SND_SOC_MAX98088=m +CONFIG_SND_SOC_MAX98090=m +CONFIG_SND_SOC_MAX98357A=m +CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX9867=m +CONFIG_SND_SOC_MAX98927=m +CONFIG_SND_SOC_MAX98373=m +CONFIG_SND_SOC_MAX98390=m +CONFIG_SND_SOC_MAX9860=m +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m +CONFIG_SND_SOC_PCM1681=m +CONFIG_SND_SOC_PCM1789=m +CONFIG_SND_SOC_PCM1789_I2C=m +CONFIG_SND_SOC_PCM179X=m +CONFIG_SND_SOC_PCM179X_I2C=m +CONFIG_SND_SOC_PCM179X_SPI=m +CONFIG_SND_SOC_PCM186X=m +CONFIG_SND_SOC_PCM186X_I2C=m +CONFIG_SND_SOC_PCM186X_SPI=m +CONFIG_SND_SOC_PCM3060=m +CONFIG_SND_SOC_PCM3060_I2C=m +CONFIG_SND_SOC_PCM3060_SPI=m +CONFIG_SND_SOC_PCM3168A=m +CONFIG_SND_SOC_PCM3168A_I2C=m +CONFIG_SND_SOC_PCM3168A_SPI=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_RK3328=m +CONFIG_SND_SOC_RL6231=m +CONFIG_SND_SOC_RL6347A=m +CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m +CONFIG_SND_SOC_RT1011=m +CONFIG_SND_SOC_RT1015=m +CONFIG_SND_SOC_RT1308_SDW=m +CONFIG_SND_SOC_RT5514=m +CONFIG_SND_SOC_RT5514_SPI=m +CONFIG_SND_SOC_RT5616=m +CONFIG_SND_SOC_RT5631=m +CONFIG_SND_SOC_RT5640=m +CONFIG_SND_SOC_RT5645=m +CONFIG_SND_SOC_RT5651=m +CONFIG_SND_SOC_RT5660=m +CONFIG_SND_SOC_RT5663=m +CONFIG_SND_SOC_RT5670=m +CONFIG_SND_SOC_RT5677=m +CONFIG_SND_SOC_RT5677_SPI=m +CONFIG_SND_SOC_RT5682=m +CONFIG_SND_SOC_RT5682_I2C=m +CONFIG_SND_SOC_RT5682_SDW=m +CONFIG_SND_SOC_RT700=m +CONFIG_SND_SOC_RT700_SDW=m +CONFIG_SND_SOC_RT711=m +CONFIG_SND_SOC_RT711_SDW=m +CONFIG_SND_SOC_RT715=m +CONFIG_SND_SOC_RT715_SDW=m +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SI476X=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m +CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m 
+CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2305=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_SSM2602_SPI=m +CONFIG_SND_SOC_SSM2602_I2C=m +CONFIG_SND_SOC_SSM4567=m +CONFIG_SND_SOC_STA32X=m +CONFIG_SND_SOC_STA350=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SOC_TAS2552=m +CONFIG_SND_SOC_TAS2562=m +CONFIG_SND_SOC_TAS2770=m +CONFIG_SND_SOC_TAS5086=m +CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m +CONFIG_SND_SOC_TAS6424=m +CONFIG_SND_SOC_TDA7419=m +CONFIG_SND_SOC_TFA9879=m +CONFIG_SND_SOC_TLV320AIC23=m +CONFIG_SND_SOC_TLV320AIC23_I2C=m +CONFIG_SND_SOC_TLV320AIC23_SPI=m +CONFIG_SND_SOC_TLV320AIC31XX=m +CONFIG_SND_SOC_TLV320AIC32X4=m +CONFIG_SND_SOC_TLV320AIC32X4_I2C=m +CONFIG_SND_SOC_TLV320AIC32X4_SPI=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TLV320ADCX140=m +CONFIG_SND_SOC_TS3A227E=m +CONFIG_SND_SOC_TSCS42XX=m +CONFIG_SND_SOC_TSCS454=m +CONFIG_SND_SOC_UDA1334=m +CONFIG_SND_SOC_WCD9335=m +CONFIG_SND_SOC_WCD934X=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8524=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8737=m +CONFIG_SND_SOC_WM8741=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8770=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8782=m +CONFIG_SND_SOC_WM8804=m +CONFIG_SND_SOC_WM8804_I2C=m +CONFIG_SND_SOC_WM8804_SPI=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8904=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8962=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_WM8985=m +CONFIG_SND_SOC_WSA881X=m +CONFIG_SND_SOC_ZL38060=m +CONFIG_SND_SOC_ZX_AUD96P22=m +CONFIG_SND_SOC_MAX9759=m +CONFIG_SND_SOC_MT6351=m +CONFIG_SND_SOC_MT6358=m +CONFIG_SND_SOC_MT6660=m +CONFIG_SND_SOC_NAU8540=m +CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8822=m +CONFIG_SND_SOC_NAU8824=m +CONFIG_SND_SOC_NAU8825=m +CONFIG_SND_SOC_TPA6130A2=m +# end of CODEC drivers + +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_AUDIO_GRAPH_CARD=m +CONFIG_SND_X86=y +CONFIG_HDMI_LPE_AUDIO=m +CONFIG_SND_SYNTH_EMUX=m +CONFIG_SND_XEN_FRONTEND=m +CONFIG_AC97_BUS=m + +# +# HID support +# +CONFIG_HID=m +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=m +CONFIG_HID_GENERIC=m + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m +CONFIG_HID_ACRUX=m +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=m +CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m +CONFIG_HID_AUREAL=m +CONFIG_HID_BELKIN=m +CONFIG_HID_BETOP_FF=m +CONFIG_HID_BIGBEN_FF=m +CONFIG_HID_CHERRY=m +CONFIG_HID_CHICONY=m +CONFIG_HID_CORSAIR=m +CONFIG_HID_COUGAR=m +CONFIG_HID_MACALLY=m +CONFIG_HID_PRODIKEYS=m +CONFIG_HID_CMEDIA=m +CONFIG_HID_CP2112=m +CONFIG_HID_CREATIVE_SB0540=m +CONFIG_HID_CYPRESS=m +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=m +CONFIG_HID_ELAN=m +CONFIG_HID_ELECOM=m +CONFIG_HID_ELO=m +CONFIG_HID_EZKEY=m +CONFIG_HID_GEMBIRD=m +CONFIG_HID_GFRM=m +CONFIG_HID_GLORIOUS=m +CONFIG_HID_HOLTEK=m +CONFIG_HOLTEK_FF=y +CONFIG_HID_GOOGLE_HAMMER=m +CONFIG_HID_GT683R=m +CONFIG_HID_KEYTOUCH=m +CONFIG_HID_KYE=m +CONFIG_HID_UCLOGIC=m +CONFIG_HID_WALTOP=m +CONFIG_HID_VIEWSONIC=m +CONFIG_HID_GYRATION=m +CONFIG_HID_ICADE=m +CONFIG_HID_ITE=m +CONFIG_HID_JABRA=m +CONFIG_HID_TWINHAN=m +CONFIG_HID_KENSINGTON=m +CONFIG_HID_LCPOWER=m +CONFIG_HID_LED=m +CONFIG_HID_LENOVO=m +CONFIG_HID_LOGITECH=m +CONFIG_HID_LOGITECH_DJ=m +CONFIG_HID_LOGITECH_HIDPP=m +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +CONFIG_HID_MAGICMOUSE=m 
+CONFIG_HID_MALTRON=m +CONFIG_HID_MAYFLASH=m +CONFIG_HID_REDRAGON=m +CONFIG_HID_MICROSOFT=m +CONFIG_HID_MONTEREY=m +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m +CONFIG_HID_NTRIG=m +CONFIG_HID_ORTEK=m +CONFIG_HID_PANTHERLORD=m +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=m +CONFIG_HID_PETALYNX=m +CONFIG_HID_PICOLCD=m +CONFIG_HID_PICOLCD_FB=y +CONFIG_HID_PICOLCD_BACKLIGHT=y +CONFIG_HID_PICOLCD_LCD=y +CONFIG_HID_PICOLCD_LEDS=y +CONFIG_HID_PICOLCD_CIR=y +CONFIG_HID_PLANTRONICS=m +CONFIG_HID_PRIMAX=m +CONFIG_HID_RETRODE=m +CONFIG_HID_ROCCAT=m +CONFIG_HID_SAITEK=m +CONFIG_HID_SAMSUNG=m +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y +CONFIG_HID_SPEEDLINK=m +CONFIG_HID_STEAM=m +CONFIG_HID_STEELSERIES=m +CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m +CONFIG_HID_GREENASIA=m +CONFIG_GREENASIA_FF=y +CONFIG_HID_HYPERV_MOUSE=m +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=m +CONFIG_HID_TOPSEED=m +CONFIG_HID_THINGM=m +CONFIG_HID_THRUSTMASTER=m +CONFIG_THRUSTMASTER_FF=y +CONFIG_HID_UDRAW_PS3=m +CONFIG_HID_U2FZERO=m +CONFIG_HID_WACOM=m +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=m +CONFIG_HID_ZEROPLUS=m +CONFIG_ZEROPLUS_FF=y +CONFIG_HID_ZYDACRON=m +CONFIG_HID_SENSOR_HUB=m +# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set +CONFIG_HID_ALPS=m +CONFIG_HID_MCP2221=m +# end of Special HID drivers + +# +# USB HID support +# +CONFIG_USB_HID=m +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y + +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +# end of USB HID Boot Protocol drivers +# end of USB HID support + +# +# I2C HID support +# +CONFIG_I2C_HID=m +# end of I2C HID support + +# +# Intel ISH HID support +# +CONFIG_INTEL_ISH_HID=m +CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m +# end of Intel ISH HID support +# end of HID support + +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +CONFIG_USB_LED_TRIG=y +CONFIG_USB_ULPI_BUS=m +CONFIG_USB_CONN_GPIO=m +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=y +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +CONFIG_USB_DYNAMIC_MINORS=y +# CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_LEDS_TRIGGER_USBPORT=m +CONFIG_USB_AUTOSUSPEND_DELAY=2 +CONFIG_USB_MON=m + +# +# USB Host Controller Drivers +# +CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +# CONFIG_USB_XHCI_DBGCAP is not set +CONFIG_USB_XHCI_PCI=m +CONFIG_USB_XHCI_PCI_RENESAS=m +CONFIG_USB_XHCI_PLATFORM=m +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_TT_NEWSCHED=y +CONFIG_USB_EHCI_PCI=m +CONFIG_USB_EHCI_FSL=m +CONFIG_USB_EHCI_HCD_PLATFORM=m +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_FOTG210_HCD=m +CONFIG_USB_MAX3421_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PCI=m +# CONFIG_USB_OHCI_HCD_SSB is not set +CONFIG_USB_OHCI_HCD_PLATFORM=m +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +# CONFIG_USB_SL811_HCD_ISO is not set +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_HCD_BCMA=m +CONFIG_USB_HCD_SSB=m +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_REALTEK=m +CONFIG_REALTEK_AUTOPM=y +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m 
+CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m +CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_STORAGE_ENE_UB6250=m +CONFIG_USB_UAS=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USBIP_CORE=m +CONFIG_USBIP_VHCI_HCD=m +CONFIG_USBIP_VHCI_HC_PORTS=8 +CONFIG_USBIP_VHCI_NR_HCS=1 +CONFIG_USBIP_HOST=m +CONFIG_USBIP_VUDC=m +# CONFIG_USBIP_DEBUG is not set +CONFIG_USB_CDNS3=m +CONFIG_USB_CDNS3_GADGET=y +CONFIG_USB_CDNS3_HOST=y +CONFIG_USB_CDNS3_PCI_WRAP=m +CONFIG_USB_MUSB_HDRC=m +# CONFIG_USB_MUSB_HOST is not set +# CONFIG_USB_MUSB_GADGET is not set +CONFIG_USB_MUSB_DUAL_ROLE=y + +# +# Platform Glue Layer +# + +# +# MUSB DMA mode +# +# CONFIG_MUSB_PIO_ONLY is not set +CONFIG_USB_DWC3=m +CONFIG_USB_DWC3_ULPI=y +# CONFIG_USB_DWC3_HOST is not set +# CONFIG_USB_DWC3_GADGET is not set +CONFIG_USB_DWC3_DUAL_ROLE=y + +# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_PCI=m +CONFIG_USB_DWC3_HAPS=m +CONFIG_USB_DWC3_OF_SIMPLE=m +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_HOST is not set + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PERIPHERAL is not set +CONFIG_USB_DWC2_DUAL_ROLE=y +CONFIG_USB_DWC2_PCI=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +CONFIG_USB_CHIPIDEA=m +CONFIG_USB_CHIPIDEA_UDC=y +CONFIG_USB_CHIPIDEA_HOST=y +CONFIG_USB_CHIPIDEA_PCI=m +CONFIG_USB_CHIPIDEA_MSM=m +CONFIG_USB_CHIPIDEA_IMX=m +CONFIG_USB_CHIPIDEA_GENERIC=m +CONFIG_USB_CHIPIDEA_TEGRA=m +CONFIG_USB_ISP1760=m +CONFIG_USB_ISP1760_HCD=y +CONFIG_USB_ISP1761_UDC=y +# CONFIG_USB_ISP1760_HOST_ROLE is not set +# CONFIG_USB_ISP1760_GADGET_ROLE is not set +CONFIG_USB_ISP1760_DUAL_ROLE=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=y +CONFIG_USB_SERIAL_CONSOLE=y +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_SIMPLE=m +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_F81232=m +CONFIG_USB_SERIAL_F8153X=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_METRO=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7715_PARPORT=y +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MXUPORT=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QCAUX=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_SAFE=m +# CONFIG_USB_SERIAL_SAFE_PADDED is not set +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_WWAN=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_XSENS_MT=m +CONFIG_USB_SERIAL_WISHBONE=m +CONFIG_USB_SERIAL_SSU100=m +CONFIG_USB_SERIAL_QT2=m +CONFIG_USB_SERIAL_UPD78F0730=m 
+CONFIG_USB_SERIAL_DEBUG=m + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m +CONFIG_USB_SEVSEG=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m +CONFIG_USB_APPLEDISPLAY=m +CONFIG_APPLE_MFI_FASTCHARGE=m +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +CONFIG_USB_TRANCEVIBRATOR=m +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_EHSET_TEST_FIXTURE=m +CONFIG_USB_ISIGHTFW=m +CONFIG_USB_YUREX=m +CONFIG_USB_EZUSB_FX2=m +CONFIG_USB_HUB_USB251XB=m +CONFIG_USB_HSIC_USB3503=m +CONFIG_USB_HSIC_USB4604=m +CONFIG_USB_LINK_LAYER_TEST=m +CONFIG_USB_CHAOSKEY=m +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y +CONFIG_NOP_USB_XCEIV=m +CONFIG_USB_GPIO_VBUS=m +CONFIG_TAHVO_USB=m +# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set +CONFIG_USB_ISP1301=m +# end of USB Physical Layer drivers + +CONFIG_USB_GADGET=m +# CONFIG_USB_GADGET_DEBUG is not set +# CONFIG_USB_GADGET_DEBUG_FILES is not set +# CONFIG_USB_GADGET_DEBUG_FS is not set +CONFIG_USB_GADGET_VBUS_DRAW=2 +CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 +CONFIG_U_SERIAL_CONSOLE=y + +# +# USB Peripheral Controller +# +CONFIG_USB_FOTG210_UDC=m +CONFIG_USB_GR_UDC=m +CONFIG_USB_R8A66597=m +CONFIG_USB_PXA27X=m +CONFIG_USB_MV_UDC=m +CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m +CONFIG_USB_SNP_UDC_PLAT=m +CONFIG_USB_M66592=m +CONFIG_USB_BDC_UDC=m + +# +# Platform Support +# +CONFIG_USB_BDC_PCI=m +CONFIG_USB_AMD5536UDC=m +CONFIG_USB_NET2272=m +CONFIG_USB_NET2272_DMA=y +CONFIG_USB_NET2280=m +CONFIG_USB_GOKU=m +CONFIG_USB_EG20T=m +CONFIG_USB_GADGET_XILINX=m +CONFIG_USB_MAX3420_UDC=m +CONFIG_USB_DUMMY_HCD=m +# end of USB Peripheral Controller + +CONFIG_USB_LIBCOMPOSITE=m +CONFIG_USB_F_ACM=m +CONFIG_USB_F_SS_LB=m +CONFIG_USB_U_SERIAL=m +CONFIG_USB_U_ETHER=m +CONFIG_USB_U_AUDIO=m +CONFIG_USB_F_SERIAL=m +CONFIG_USB_F_OBEX=m +CONFIG_USB_F_NCM=m +CONFIG_USB_F_ECM=m +CONFIG_USB_F_PHONET=m +CONFIG_USB_F_EEM=m +CONFIG_USB_F_SUBSET=m +CONFIG_USB_F_RNDIS=m +CONFIG_USB_F_MASS_STORAGE=m +CONFIG_USB_F_FS=m +CONFIG_USB_F_UAC1=m +CONFIG_USB_F_UAC1_LEGACY=m +CONFIG_USB_F_UAC2=m +CONFIG_USB_F_UVC=m +CONFIG_USB_F_MIDI=m +CONFIG_USB_F_HID=m +CONFIG_USB_F_PRINTER=m +CONFIG_USB_F_TCM=m +CONFIG_USB_CONFIGFS=m +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_OBEX=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_ECM_SUBSET=y +CONFIG_USB_CONFIGFS_RNDIS=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_PHONET=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_LB_SS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_UAC1=y +CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_USB_CONFIGFS_F_PRINTER=y +CONFIG_USB_CONFIGFS_F_TCM=y + +# +# USB Gadget precomposed configurations +# +CONFIG_USB_ZERO=m +CONFIG_USB_AUDIO=m +# CONFIG_GADGET_UAC1 is not set +CONFIG_USB_ETH=m +CONFIG_USB_ETH_RNDIS=y +CONFIG_USB_ETH_EEM=y +CONFIG_USB_G_NCM=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FUNCTIONFS=m +CONFIG_USB_FUNCTIONFS_ETH=y +CONFIG_USB_FUNCTIONFS_RNDIS=y +CONFIG_USB_FUNCTIONFS_GENERIC=y +CONFIG_USB_MASS_STORAGE=m +CONFIG_USB_GADGET_TARGET=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_MIDI_GADGET=m +CONFIG_USB_G_PRINTER=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_USB_G_NOKIA=m 
+CONFIG_USB_G_ACM_MS=m +CONFIG_USB_G_MULTI=m +CONFIG_USB_G_MULTI_RNDIS=y +CONFIG_USB_G_MULTI_CDC=y +CONFIG_USB_G_HID=m +CONFIG_USB_G_DBGP=m +# CONFIG_USB_G_DBGP_PRINTK is not set +CONFIG_USB_G_DBGP_SERIAL=y +CONFIG_USB_G_WEBCAM=m +CONFIG_USB_RAW_GADGET=m +# end of USB Gadget precomposed configurations + +CONFIG_TYPEC=m +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_RT1711H=m +CONFIG_TYPEC_FUSB302=m +CONFIG_TYPEC_WCOVE=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_CCG=m +CONFIG_UCSI_ACPI=m +CONFIG_TYPEC_HD3SS3220=m +CONFIG_TYPEC_TPS6598X=m + +# +# USB Type-C Multiplexer/DeMultiplexer Switch support +# +CONFIG_TYPEC_MUX_PI3USB30532=m +CONFIG_TYPEC_MUX_INTEL_PMC=m +# end of USB Type-C Multiplexer/DeMultiplexer Switch support + +# +# USB Type-C Alternate Mode drivers +# +CONFIG_TYPEC_DP_ALTMODE=m +CONFIG_TYPEC_NVIDIA_ALTMODE=m +# end of USB Type-C Alternate Mode drivers + +CONFIG_USB_ROLE_SWITCH=m +CONFIG_USB_ROLES_INTEL_XHCI=m +CONFIG_MMC=m +CONFIG_PWRSEQ_EMMC=m +CONFIG_PWRSEQ_SD8787=m +CONFIG_PWRSEQ_SIMPLE=m +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_MINORS=8 +CONFIG_SDIO_UART=m +CONFIG_MMC_TEST=m + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_IO_ACCESSORS=y +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=y +CONFIG_MMC_SDHCI_ACPI=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_SDHCI_OF_ARASAN=m +CONFIG_MMC_SDHCI_OF_ASPEED=m +CONFIG_MMC_SDHCI_OF_AT91=m +CONFIG_MMC_SDHCI_OF_DWCMSHC=m +CONFIG_MMC_SDHCI_CADENCE=m +CONFIG_MMC_SDHCI_F_SDH30=m +CONFIG_MMC_SDHCI_MILBEAUT=m +CONFIG_MMC_WBSD=m +CONFIG_MMC_ALCOR=m +CONFIG_MMC_TIFM_SD=m +CONFIG_MMC_SPI=m +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MMC_VUB300=m +CONFIG_MMC_USHC=m +CONFIG_MMC_USDHI6ROL0=m +CONFIG_MMC_REALTEK_PCI=m +CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_CQHCI=m +CONFIG_MMC_HSQ=m +CONFIG_MMC_TOSHIBA_PCI=m +CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m +CONFIG_MMC_SDHCI_OMAP=m +CONFIG_MMC_SDHCI_AM654=m +CONFIG_MMC_SDHCI_EXTERNAL_DMA=y +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m +CONFIG_MS_BLOCK=m + +# +# MemoryStick Host Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_MEMSTICK_R592=m +CONFIG_MEMSTICK_REALTEK_PCI=m +CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_CLASS_FLASH=m +CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y + +# +# LED drivers +# +CONFIG_LEDS_88PM860X=m +CONFIG_LEDS_AAT1290=m +CONFIG_LEDS_AN30259A=m +CONFIG_LEDS_APU=m +CONFIG_LEDS_AS3645A=m +CONFIG_LEDS_AW2013=m +CONFIG_LEDS_BCM6328=m +CONFIG_LEDS_BCM6358=m +CONFIG_LEDS_CPCAP=m +CONFIG_LEDS_CR0014114=m +CONFIG_LEDS_EL15203000=m +CONFIG_LEDS_LM3530=m +CONFIG_LEDS_LM3532=m +CONFIG_LEDS_LM3533=m +CONFIG_LEDS_LM3642=m +CONFIG_LEDS_LM3692X=m +CONFIG_LEDS_LM3601X=m +CONFIG_LEDS_MT6323=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_PCA9532_GPIO=y +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_LP3952=m +# CONFIG_LEDS_LP5521 is not set +# CONFIG_LEDS_LP5523 is not set +# CONFIG_LEDS_LP5562 is not set +# CONFIG_LEDS_LP8501 is not set +CONFIG_LEDS_LP8788=m +CONFIG_LEDS_LP8860=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +CONFIG_LEDS_PCA955X_GPIO=y +CONFIG_LEDS_PCA963X=m +CONFIG_LEDS_WM831X_STATUS=m +CONFIG_LEDS_WM8350=m +CONFIG_LEDS_DA903X=m +CONFIG_LEDS_DA9052=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_PWM=m +CONFIG_LEDS_REGULATOR=m +CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m +CONFIG_LEDS_ADP5520=m 
+CONFIG_LEDS_MC13783=m +CONFIG_LEDS_TCA6507=m +CONFIG_LEDS_TLC591XX=m +CONFIG_LEDS_MAX77650=m +CONFIG_LEDS_MAX77693=m +CONFIG_LEDS_MAX8997=m +CONFIG_LEDS_LM355x=m +CONFIG_LEDS_MENF21BMC=m +CONFIG_LEDS_KTD2692=m +CONFIG_LEDS_IS31FL319X=m +CONFIG_LEDS_IS31FL32XX=m + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +CONFIG_LEDS_BLINKM=m +CONFIG_LEDS_SYSCON=y +CONFIG_LEDS_MLXCPLD=m +CONFIG_LEDS_MLXREG=m +CONFIG_LEDS_USER=m +CONFIG_LEDS_NIC78BX=m +CONFIG_LEDS_SPI_BYTE=m +CONFIG_LEDS_TI_LMU_COMMON=m +CONFIG_LEDS_LM3697=m +CONFIG_LEDS_LM36274=m +CONFIG_LEDS_TPS6105X=m +CONFIG_LEDS_SGM3140=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_ONESHOT=m +CONFIG_LEDS_TRIGGER_DISK=y +CONFIG_LEDS_TRIGGER_MTD=y +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_ACTIVITY=m +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=m +CONFIG_LEDS_TRIGGER_CAMERA=m +CONFIG_LEDS_TRIGGER_PANIC=y +CONFIG_LEDS_TRIGGER_NETDEV=m +CONFIG_LEDS_TRIGGER_PATTERN=m +CONFIG_LEDS_TRIGGER_AUDIO=m +CONFIG_ACCESSIBILITY=y +CONFIG_A11Y_BRAILLE_CONSOLE=y +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_QIB=m +CONFIG_INFINIBAND_QIB_DCA=y +CONFIG_INFINIBAND_CXGB4=m +CONFIG_INFINIBAND_EFA=m +CONFIG_INFINIBAND_I40IW=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_INFINIBAND_OCRDMA=m +CONFIG_INFINIBAND_VMWARE_PVRDMA=m +CONFIG_INFINIBAND_USNIC=m +CONFIG_INFINIBAND_BNXT_RE=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +# CONFIG_SDMA_VERBOSITY is not set +CONFIG_INFINIBAND_QEDR=m +CONFIG_INFINIBAND_RDMAVT=m +CONFIG_RDMA_RXE=m +CONFIG_RDMA_SIW=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_SRPT=m +CONFIG_INFINIBAND_ISER=m +CONFIG_INFINIBAND_ISERT=m +CONFIG_INFINIBAND_RTRS=m +CONFIG_INFINIBAND_RTRS_CLIENT=m +CONFIG_INFINIBAND_RTRS_SERVER=m +CONFIG_INFINIBAND_OPA_VNIC=m +CONFIG_EDAC_ATOMIC_SCRUB=y +CONFIG_EDAC_SUPPORT=y +CONFIG_EDAC=y +CONFIG_EDAC_LEGACY_SYSFS=y +# CONFIG_EDAC_DEBUG is not set +CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_GHES=y +CONFIG_EDAC_AMD64=m +# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set +CONFIG_EDAC_E752X=m +CONFIG_EDAC_I82975X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m +CONFIG_EDAC_X38=m +CONFIG_EDAC_I5400=m +CONFIG_EDAC_I7CORE=m +CONFIG_EDAC_I5000=m +CONFIG_EDAC_I5100=m +CONFIG_EDAC_I7300=m +CONFIG_EDAC_SBRIDGE=m +CONFIG_EDAC_SKX=m +CONFIG_EDAC_I10NM=m +CONFIG_EDAC_PND2=m +CONFIG_RTC_LIB=y +CONFIG_RTC_MC146818_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y +CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_88PM860X=m +CONFIG_RTC_DRV_88PM80X=m +CONFIG_RTC_DRV_ABB5ZES3=m 
+CONFIG_RTC_DRV_ABEOZ9=m +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_AS3722=m +CONFIG_RTC_DRV_DS1307=m +CONFIG_RTC_DRV_DS1307_CENTURY=y +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1374_WDT=y +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_HYM8563=m +CONFIG_RTC_DRV_LP8788=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_MAX8907=m +CONFIG_RTC_DRV_MAX8925=m +CONFIG_RTC_DRV_MAX8998=m +CONFIG_RTC_DRV_MAX8997=m +CONFIG_RTC_DRV_MAX77686=m +CONFIG_RTC_DRV_RK808=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_ISL12026=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8523=m +CONFIG_RTC_DRV_PCF85063=m +CONFIG_RTC_DRV_PCF85363=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BD70528=m +CONFIG_RTC_DRV_BQ32K=m +CONFIG_RTC_DRV_TWL4030=m +CONFIG_RTC_DRV_PALMAS=m +CONFIG_RTC_DRV_TPS6586X=m +CONFIG_RTC_DRV_TPS65910=m +CONFIG_RTC_DRV_TPS80031=m +CONFIG_RTC_DRV_RC5T583=m +CONFIG_RTC_DRV_RC5T619=m +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_RX8025=m +CONFIG_RTC_DRV_EM3027=m +CONFIG_RTC_DRV_RV3028=m +CONFIG_RTC_DRV_RV8803=m +CONFIG_RTC_DRV_S5M=m +CONFIG_RTC_DRV_SD3078=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T93=m +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m +CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1343=m +CONFIG_RTC_DRV_DS1347=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6916=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RX4581=m +CONFIG_RTC_DRV_RX6110=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_PCF2123=m +CONFIG_RTC_DRV_MCP795=m +CONFIG_RTC_I2C_AND_SPI=y + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=y +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1685_FAMILY=m +CONFIG_RTC_DRV_DS1685=y +# CONFIG_RTC_DRV_DS1689 is not set +# CONFIG_RTC_DRV_DS17285 is not set +# CONFIG_RTC_DRV_DS17485 is not set +# CONFIG_RTC_DRV_DS17885 is not set +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_DS2404=m +CONFIG_RTC_DRV_DA9052=m +CONFIG_RTC_DRV_DA9055=m +CONFIG_RTC_DRV_DA9063=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_WM831X=m +CONFIG_RTC_DRV_WM8350=m +CONFIG_RTC_DRV_PCF50633=m +CONFIG_RTC_DRV_AB3100=m +CONFIG_RTC_DRV_ZYNQMP=m +CONFIG_RTC_DRV_CROS_EC=m + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_CADENCE=m +CONFIG_RTC_DRV_FTRTC010=m +CONFIG_RTC_DRV_PCAP=m +CONFIG_RTC_DRV_MC13XXX=m +CONFIG_RTC_DRV_MT6397=m +CONFIG_RTC_DRV_R7301=m +CONFIG_RTC_DRV_CPCAP=m + +# +# HID Sensor RTC drivers +# +CONFIG_RTC_DRV_HID_SENSOR_TIME=m +CONFIG_RTC_DRV_WILCO_EC=m +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_ACPI=y +CONFIG_DMA_OF=y +CONFIG_ALTERA_MSGDMA=m +CONFIG_DW_AXI_DMAC=m +CONFIG_FSL_EDMA=m +CONFIG_INTEL_IDMA64=m +CONFIG_INTEL_IDXD=m +CONFIG_INTEL_IOATDMA=m +CONFIG_INTEL_MIC_X100_DMA=m +CONFIG_PLX_DMA=m +CONFIG_QCOM_HIDMA_MGMT=m +CONFIG_QCOM_HIDMA=m +CONFIG_DW_DMAC_CORE=y +CONFIG_DW_DMAC=y +CONFIG_DW_DMAC_PCI=y +CONFIG_DW_EDMA=m +CONFIG_DW_EDMA_PCIE=m +CONFIG_HSU_DMA=y +CONFIG_SF_PDMA=m + +# +# DMA Clients +# +CONFIG_ASYNC_TX_DMA=y +# 
CONFIG_DMATEST is not set +CONFIG_DMA_ENGINE_RAID=y + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# CONFIG_SW_SYNC is not set +CONFIG_UDMABUF=y +# CONFIG_DMABUF_MOVE_NOTIFY is not set +# CONFIG_DMABUF_SELFTESTS is not set +CONFIG_DMABUF_HEAPS=y +CONFIG_DMABUF_HEAPS_SYSTEM=y +# end of DMABUF options + +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_HD44780=m +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_IMG_ASCII_LCD=m +CONFIG_HT16K33=m +CONFIG_PARPORT_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set +# CONFIG_CHARLCD_BL_OFF is not set +# CONFIG_CHARLCD_BL_ON is not set +CONFIG_CHARLCD_BL_FLASH=y +CONFIG_PANEL=m +CONFIG_CHARLCD=m +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_UIO_DMEM_GENIRQ=m +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +CONFIG_UIO_PCI_GENERIC=m +CONFIG_UIO_NETX=m +CONFIG_UIO_PRUSS=m +CONFIG_UIO_MF624=m +CONFIG_UIO_HV_GENERIC=m +CONFIG_VFIO_IOMMU_TYPE1=m +CONFIG_VFIO_VIRQFD=m +CONFIG_VFIO=m +# CONFIG_VFIO_NOIOMMU is not set +CONFIG_VFIO_PCI=m +CONFIG_VFIO_PCI_VGA=y +CONFIG_VFIO_PCI_MMAP=y +CONFIG_VFIO_PCI_INTX=y +CONFIG_VFIO_PCI_IGD=y +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VBOXGUEST=m +CONFIG_VIRTIO=y +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_VDPA=m +CONFIG_VIRTIO_PMEM=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_MEM=m +CONFIG_VIRTIO_INPUT=m +CONFIG_VIRTIO_MMIO=m +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_IFCVF=m +CONFIG_VHOST_IOTLB=m +CONFIG_VHOST_RING=m +CONFIG_VHOST=m +CONFIG_VHOST_MENU=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_SCSI=m +CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set + +# +# Microsoft Hyper-V guest support +# +CONFIG_HYPERV=m +CONFIG_HYPERV_TIMER=y +CONFIG_HYPERV_UTILS=m +CONFIG_HYPERV_BALLOON=m +# end of Microsoft Hyper-V guest support + +# +# Xen driver support +# +CONFIG_XEN_BALLOON=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 +CONFIG_XEN_SCRUB_PAGES_DEFAULT=y +CONFIG_XEN_DEV_EVTCHN=m +CONFIG_XEN_BACKEND=y +CONFIG_XENFS=m +CONFIG_XEN_COMPAT_XENFS=y +CONFIG_XEN_SYS_HYPERVISOR=y +CONFIG_XEN_XENBUS_FRONTEND=y +CONFIG_XEN_GNTDEV=m +CONFIG_XEN_GNTDEV_DMABUF=y +CONFIG_XEN_GRANT_DEV_ALLOC=m +CONFIG_XEN_GRANT_DMA_ALLOC=y +CONFIG_SWIOTLB_XEN=y +CONFIG_XEN_PCIDEV_BACKEND=m +CONFIG_XEN_PVCALLS_FRONTEND=m +CONFIG_XEN_PVCALLS_BACKEND=y +CONFIG_XEN_SCSI_BACKEND=m +CONFIG_XEN_PRIVCMD=m +CONFIG_XEN_ACPI_PROCESSOR=m +CONFIG_XEN_MCE_LOG=y +CONFIG_XEN_HAVE_PVMMU=y +CONFIG_XEN_EFI=y +CONFIG_XEN_AUTO_XLATE=y +CONFIG_XEN_ACPI=y +CONFIG_XEN_SYMS=y +CONFIG_XEN_HAVE_VPMU=y +CONFIG_XEN_FRONT_PGDIR_SHBUF=m +# end of Xen driver support + +# CONFIG_GREYBUS is not set +CONFIG_STAGING=y +CONFIG_PRISM2_USB=m +CONFIG_COMEDI=m +# CONFIG_COMEDI_DEBUG is not set +CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 +CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 +CONFIG_COMEDI_MISC_DRIVERS=y +CONFIG_COMEDI_BOND=m +CONFIG_COMEDI_TEST=m +CONFIG_COMEDI_PARPORT=m +# CONFIG_COMEDI_ISA_DRIVERS is not set +CONFIG_COMEDI_PCI_DRIVERS=m +CONFIG_COMEDI_8255_PCI=m +CONFIG_COMEDI_ADDI_WATCHDOG=m +CONFIG_COMEDI_ADDI_APCI_1032=m +CONFIG_COMEDI_ADDI_APCI_1500=m +CONFIG_COMEDI_ADDI_APCI_1516=m +CONFIG_COMEDI_ADDI_APCI_1564=m +CONFIG_COMEDI_ADDI_APCI_16XX=m +CONFIG_COMEDI_ADDI_APCI_2032=m +CONFIG_COMEDI_ADDI_APCI_2200=m +CONFIG_COMEDI_ADDI_APCI_3120=m 
+CONFIG_COMEDI_ADDI_APCI_3501=m +CONFIG_COMEDI_ADDI_APCI_3XXX=m +CONFIG_COMEDI_ADL_PCI6208=m +CONFIG_COMEDI_ADL_PCI7X3X=m +CONFIG_COMEDI_ADL_PCI8164=m +CONFIG_COMEDI_ADL_PCI9111=m +CONFIG_COMEDI_ADL_PCI9118=m +CONFIG_COMEDI_ADV_PCI1710=m +CONFIG_COMEDI_ADV_PCI1720=m +CONFIG_COMEDI_ADV_PCI1723=m +CONFIG_COMEDI_ADV_PCI1724=m +CONFIG_COMEDI_ADV_PCI1760=m +CONFIG_COMEDI_ADV_PCI_DIO=m +CONFIG_COMEDI_AMPLC_DIO200_PCI=m +CONFIG_COMEDI_AMPLC_PC236_PCI=m +CONFIG_COMEDI_AMPLC_PC263_PCI=m +CONFIG_COMEDI_AMPLC_PCI224=m +CONFIG_COMEDI_AMPLC_PCI230=m +CONFIG_COMEDI_CONTEC_PCI_DIO=m +CONFIG_COMEDI_DAS08_PCI=m +CONFIG_COMEDI_DT3000=m +CONFIG_COMEDI_DYNA_PCI10XX=m +CONFIG_COMEDI_GSC_HPDI=m +CONFIG_COMEDI_MF6X4=m +CONFIG_COMEDI_ICP_MULTI=m +CONFIG_COMEDI_DAQBOARD2000=m +CONFIG_COMEDI_JR3_PCI=m +CONFIG_COMEDI_KE_COUNTER=m +CONFIG_COMEDI_CB_PCIDAS64=m +CONFIG_COMEDI_CB_PCIDAS=m +CONFIG_COMEDI_CB_PCIDDA=m +CONFIG_COMEDI_CB_PCIMDAS=m +CONFIG_COMEDI_CB_PCIMDDA=m +CONFIG_COMEDI_ME4000=m +CONFIG_COMEDI_ME_DAQ=m +CONFIG_COMEDI_NI_6527=m +CONFIG_COMEDI_NI_65XX=m +CONFIG_COMEDI_NI_660X=m +CONFIG_COMEDI_NI_670X=m +CONFIG_COMEDI_NI_LABPC_PCI=m +CONFIG_COMEDI_NI_PCIDIO=m +CONFIG_COMEDI_NI_PCIMIO=m +CONFIG_COMEDI_RTD520=m +CONFIG_COMEDI_S626=m +CONFIG_COMEDI_MITE=m +CONFIG_COMEDI_NI_TIOCMD=m +CONFIG_COMEDI_PCMCIA_DRIVERS=m +CONFIG_COMEDI_CB_DAS16_CS=m +CONFIG_COMEDI_DAS08_CS=m +CONFIG_COMEDI_NI_DAQ_700_CS=m +CONFIG_COMEDI_NI_DAQ_DIO24_CS=m +CONFIG_COMEDI_NI_LABPC_CS=m +CONFIG_COMEDI_NI_MIO_CS=m +CONFIG_COMEDI_QUATECH_DAQP_CS=m +CONFIG_COMEDI_USB_DRIVERS=m +CONFIG_COMEDI_DT9812=m +CONFIG_COMEDI_NI_USB6501=m +CONFIG_COMEDI_USBDUX=m +CONFIG_COMEDI_USBDUXFAST=m +CONFIG_COMEDI_USBDUXSIGMA=m +CONFIG_COMEDI_VMK80XX=m +CONFIG_COMEDI_8254=m +CONFIG_COMEDI_8255=m +CONFIG_COMEDI_8255_SA=m +CONFIG_COMEDI_KCOMEDILIB=m +CONFIG_COMEDI_AMPLC_DIO200=m +CONFIG_COMEDI_AMPLC_PC236=m +CONFIG_COMEDI_DAS08=m +CONFIG_COMEDI_NI_LABPC=m +CONFIG_COMEDI_NI_TIO=m +CONFIG_COMEDI_NI_ROUTING=m +CONFIG_RTL8192U=m +CONFIG_RTLLIB=m +CONFIG_RTLLIB_CRYPTO_CCMP=m +CONFIG_RTLLIB_CRYPTO_TKIP=m +CONFIG_RTLLIB_CRYPTO_WEP=m +CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +CONFIG_88EU_AP_MODE=y +CONFIG_RTS5208=m +CONFIG_VT6655=m +CONFIG_VT6656=m + +# +# IIO staging drivers +# + +# +# Accelerometers +# +CONFIG_ADIS16203=m +CONFIG_ADIS16240=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD7816=m +CONFIG_AD7280=m +# end of Analog to digital converters + +# +# Analog digital bi-direction converters +# +CONFIG_ADT7316=m +CONFIG_ADT7316_SPI=m +CONFIG_ADT7316_I2C=m +# end of Analog digital bi-direction converters + +# +# Capacitance to digital converters +# +CONFIG_AD7150=m +CONFIG_AD7746=m +# end of Capacitance to digital converters + +# +# Direct Digital Synthesis +# +CONFIG_AD9832=m +CONFIG_AD9834=m +# end of Direct Digital Synthesis + +# +# Network Analyzer, Impedance Converters +# +CONFIG_AD5933=m +# end of Network Analyzer, Impedance Converters + +# +# Active energy metering IC +# +CONFIG_ADE7854=m +CONFIG_ADE7854_I2C=m +CONFIG_ADE7854_SPI=m +# end of Active energy metering IC + +# +# Resolver to digital converters +# +CONFIG_AD2S1210=m +# end of Resolver to digital converters +# end of IIO staging drivers + +# CONFIG_FB_SM750 is not set + +# +# Speakup console speech +# +CONFIG_SPEAKUP=m +CONFIG_SPEAKUP_SYNTH_ACNTSA=m +CONFIG_SPEAKUP_SYNTH_APOLLO=m +CONFIG_SPEAKUP_SYNTH_AUDPTR=m +CONFIG_SPEAKUP_SYNTH_BNS=m +CONFIG_SPEAKUP_SYNTH_DECTLK=m +CONFIG_SPEAKUP_SYNTH_DECEXT=m +CONFIG_SPEAKUP_SYNTH_LTLK=m 
+CONFIG_SPEAKUP_SYNTH_SOFT=m +CONFIG_SPEAKUP_SYNTH_SPKOUT=m +CONFIG_SPEAKUP_SYNTH_TXPRT=m +CONFIG_SPEAKUP_SYNTH_DUMMY=m +# end of Speakup console speech + +CONFIG_STAGING_MEDIA=y +CONFIG_INTEL_ATOMISP=y +CONFIG_VIDEO_ATOMISP=m +CONFIG_VIDEO_ATOMISP_ISP2401=y +CONFIG_VIDEO_ATOMISP_OV5693=m +CONFIG_VIDEO_ATOMISP_OV2722=m +CONFIG_VIDEO_ATOMISP_GC2235=m +CONFIG_VIDEO_ATOMISP_MSRLIST_HELPER=m +CONFIG_VIDEO_ATOMISP_MT9M114=m +CONFIG_VIDEO_ATOMISP_GC0310=m +CONFIG_VIDEO_ATOMISP_OV2680=m +CONFIG_VIDEO_ATOMISP_LM3554=m +CONFIG_VIDEO_IPU3_IMGU=m + +# +# soc_camera sensor drivers +# +CONFIG_VIDEO_USBVISION=m + +# +# Android +# +# end of Android + +CONFIG_STAGING_BOARD=y +CONFIG_LTE_GDM724X=m +CONFIG_FIREWIRE_SERIAL=m +CONFIG_FWTTY_MAX_TOTAL_PORTS=64 +CONFIG_FWTTY_MAX_CARD_PORTS=32 +CONFIG_GS_FPGABOOT=m +CONFIG_UNISYSSPAR=y +CONFIG_UNISYS_VISORNIC=m +CONFIG_UNISYS_VISORINPUT=m +CONFIG_UNISYS_VISORHBA=m +CONFIG_COMMON_CLK_XLNX_CLKWZRD=m +# CONFIG_FB_TFT is not set +CONFIG_WILC1000=m +CONFIG_WILC1000_SDIO=m +CONFIG_WILC1000_SPI=m +# CONFIG_WILC1000_HW_OOB_INTR is not set +CONFIG_MOST_COMPONENTS=m +CONFIG_MOST_CDEV=m +CONFIG_MOST_NET=m +CONFIG_MOST_SOUND=m +CONFIG_MOST_VIDEO=m +CONFIG_MOST_DIM2=m +CONFIG_MOST_I2C=m +CONFIG_MOST_USB=m +CONFIG_KS7010=m +CONFIG_PI433=m + +# +# Gasket devices +# +CONFIG_STAGING_GASKET_FRAMEWORK=m +CONFIG_STAGING_APEX_DRIVER=m +# end of Gasket devices + +CONFIG_XIL_AXIS_FIFO=m +CONFIG_FIELDBUS_DEV=m +CONFIG_HMS_ANYBUSS_BUS=m +CONFIG_ARCX_ANYBUS_CONTROLLER=m +CONFIG_HMS_PROFINET=m +CONFIG_KPC2000=y +CONFIG_KPC2000_CORE=m +CONFIG_KPC2000_SPI=m +CONFIG_KPC2000_I2C=m +CONFIG_KPC2000_DMA=m +CONFIG_QLGE=m +CONFIG_WFX=m +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACPI_WMI=m +CONFIG_WMI_BMOF=m +CONFIG_ALIENWARE_WMI=m +CONFIG_HUAWEI_WMI=m +CONFIG_INTEL_WMI_SBL_FW_UPDATE=m +CONFIG_INTEL_WMI_THUNDERBOLT=m +CONFIG_MXM_WMI=m +CONFIG_PEAQ_WMI=m +CONFIG_XIAOMI_WMI=m +CONFIG_ACERHDF=m +CONFIG_ACER_WIRELESS=m +CONFIG_ACER_WMI=m +CONFIG_APPLE_GMUX=m +CONFIG_ASUS_LAPTOP=m +CONFIG_ASUS_WIRELESS=m +CONFIG_ASUS_WMI=m +CONFIG_ASUS_NB_WMI=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_EEEPC_WMI=m +CONFIG_DCDBAS=m +CONFIG_DELL_SMBIOS=m +CONFIG_DELL_SMBIOS_WMI=y +CONFIG_DELL_SMBIOS_SMM=y +CONFIG_DELL_LAPTOP=m +CONFIG_DELL_RBTN=m +# CONFIG_DELL_RBU is not set +CONFIG_DELL_SMO8800=m +CONFIG_DELL_WMI=m +CONFIG_DELL_WMI_DESCRIPTOR=m +CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m +CONFIG_AMILO_RFKILL=m +CONFIG_FUJITSU_LAPTOP=m +CONFIG_FUJITSU_TABLET=m +CONFIG_GPD_POCKET_FAN=m +CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m +CONFIG_HP_WMI=m +CONFIG_IBM_RTL=m +CONFIG_IDEAPAD_LAPTOP=m +CONFIG_SENSORS_HDAPS=m +CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +CONFIG_INTEL_CHT_INT33FE=m +CONFIG_INTEL_HID_EVENT=m +CONFIG_INTEL_INT0002_VGPIO=m +CONFIG_INTEL_MENLOW=m +CONFIG_INTEL_OAKTRAIL=m +CONFIG_INTEL_VBTN=m +CONFIG_SURFACE3_WMI=m +CONFIG_SURFACE_3_BUTTON=m +CONFIG_SURFACE_3_POWER_OPREGION=m +CONFIG_SURFACE_PRO3_BUTTON=m +CONFIG_MSI_LAPTOP=m +CONFIG_MSI_WMI=m +CONFIG_PCENGINES_APU2=m +CONFIG_SAMSUNG_LAPTOP=m +CONFIG_SAMSUNG_Q10=m +CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_TOSHIBA_HAPS=m +CONFIG_TOSHIBA_WMI=m +CONFIG_ACPI_CMPC=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_LG_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_SONY_LAPTOP=m +CONFIG_SONYPI_COMPAT=y +CONFIG_SYSTEM76_ACPI=m +CONFIG_TOPSTAR_LAPTOP=m 
+CONFIG_I2C_MULTI_INSTANTIATE=m +CONFIG_MLX_PLATFORM=m +CONFIG_TOUCHSCREEN_DMI=y +CONFIG_INTEL_IPS=m +CONFIG_INTEL_RST=m +CONFIG_INTEL_SMARTCONNECT=m + +# +# Intel Speed Select Technology interface support +# +CONFIG_INTEL_SPEED_SELECT_INTERFACE=m +# end of Intel Speed Select Technology interface support + +CONFIG_INTEL_TURBO_MAX_3=y +CONFIG_INTEL_UNCORE_FREQ_CONTROL=m +CONFIG_INTEL_BXTWC_PMIC_TMU=m +CONFIG_INTEL_CHTDC_TI_PWRBTN=m +CONFIG_INTEL_MFLD_THERMAL=m +CONFIG_INTEL_MID_POWER_BUTTON=m +CONFIG_INTEL_MRFLD_PWRBTN=m +CONFIG_INTEL_PMC_CORE=y +CONFIG_INTEL_PUNIT_IPC=m +CONFIG_INTEL_SCU_IPC=y +CONFIG_INTEL_SCU=y +CONFIG_INTEL_SCU_PCI=y +CONFIG_INTEL_SCU_PLATFORM=m +CONFIG_INTEL_SCU_IPC_UTIL=m +CONFIG_INTEL_TELEMETRY=m +CONFIG_PMC_ATOM=y +CONFIG_MFD_CROS_EC=m +CONFIG_CHROME_PLATFORMS=y +CONFIG_CHROMEOS_LAPTOP=m +CONFIG_CHROMEOS_PSTORE=m +CONFIG_CHROMEOS_TBMC=m +CONFIG_CROS_EC=m +CONFIG_CROS_EC_I2C=m +CONFIG_CROS_EC_RPMSG=m +CONFIG_CROS_EC_ISHTP=m +CONFIG_CROS_EC_SPI=m +CONFIG_CROS_EC_LPC=m +CONFIG_CROS_EC_PROTO=y +CONFIG_CROS_KBD_LED_BACKLIGHT=m +CONFIG_CROS_EC_CHARDEV=m +CONFIG_CROS_EC_LIGHTBAR=m +CONFIG_CROS_EC_VBC=m +CONFIG_CROS_EC_DEBUGFS=m +CONFIG_CROS_EC_SENSORHUB=m +CONFIG_CROS_EC_SYSFS=m +CONFIG_CROS_EC_TYPEC=m +CONFIG_CROS_USBPD_LOGGER=m +CONFIG_CROS_USBPD_NOTIFY=m +CONFIG_WILCO_EC=m +CONFIG_WILCO_EC_DEBUGFS=m +CONFIG_WILCO_EC_EVENTS=m +CONFIG_WILCO_EC_TELEMETRY=m +CONFIG_MELLANOX_PLATFORM=y +CONFIG_MLXREG_HOTPLUG=m +CONFIG_MLXREG_IO=m +CONFIG_HAVE_CLK=y +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y +CONFIG_COMMON_CLK_WM831X=m +# CONFIG_CLK_HSDK is not set +CONFIG_COMMON_CLK_MAX77686=m +CONFIG_COMMON_CLK_MAX9485=m +CONFIG_COMMON_CLK_RK808=m +CONFIG_COMMON_CLK_SI5341=m +CONFIG_COMMON_CLK_SI5351=m +CONFIG_COMMON_CLK_SI514=m +CONFIG_COMMON_CLK_SI544=m +CONFIG_COMMON_CLK_SI570=m +CONFIG_COMMON_CLK_CDCE706=m +CONFIG_COMMON_CLK_CDCE925=m +CONFIG_COMMON_CLK_CS2000_CP=m +CONFIG_COMMON_CLK_S2MPS11=m +CONFIG_CLK_TWL6040=m +CONFIG_COMMON_CLK_LOCHNAGAR=m +CONFIG_COMMON_CLK_PALMAS=m +CONFIG_COMMON_CLK_PWM=m +CONFIG_COMMON_CLK_VC5=m +CONFIG_COMMON_CLK_BD718XX=m +CONFIG_COMMON_CLK_FIXED_MMIO=y +CONFIG_CLK_LGM_CGU=y +CONFIG_HWSPINLOCK=y + +# +# Clock Source drivers +# +CONFIG_TIMER_OF=y +CONFIG_TIMER_PROBE=y +CONFIG_CLKEVT_I8253=y +CONFIG_I8253_LOCK=y +CONFIG_CLKBLD_I8253=y +CONFIG_CLKSRC_MMIO=y +CONFIG_MICROCHIP_PIT64B=y +# end of Clock Source drivers + +CONFIG_MAILBOX=y +CONFIG_PLATFORM_MHU=m +CONFIG_PCC=y +CONFIG_ALTERA_MBOX=m +CONFIG_MAILBOX_TEST=m +CONFIG_IOMMU_IOVA=y +CONFIG_IOASID=y +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +# end of Generic IOMMU Pagetable Support + +# CONFIG_IOMMU_DEBUGFS is not set +# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +CONFIG_OF_IOMMU=y +CONFIG_IOMMU_DMA=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_V2=y +CONFIG_DMAR_TABLE=y +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU_FLOPPY_WA=y +# CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set +CONFIG_IRQ_REMAP=y +CONFIG_HYPERV_IOMMU=y + +# +# Remoteproc drivers +# +CONFIG_REMOTEPROC=y +# end of Remoteproc drivers + +# +# Rpmsg drivers +# +CONFIG_RPMSG=m +CONFIG_RPMSG_CHAR=m +CONFIG_RPMSG_QCOM_GLINK=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m +CONFIG_RPMSG_VIRTIO=m +# end of Rpmsg drivers + +CONFIG_SOUNDWIRE=m + +# +# SoundWire Devices +# +CONFIG_SOUNDWIRE_CADENCE=m +CONFIG_SOUNDWIRE_INTEL=m +CONFIG_SOUNDWIRE_QCOM=m + +# +# SOC (System On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# +# 
end of Amlogic SoC drivers + +# +# Aspeed SoC drivers +# +# end of Aspeed SoC drivers + +# +# Broadcom SoC drivers +# +# end of Broadcom SoC drivers + +# +# NXP/Freescale QorIQ SoC drivers +# +# end of NXP/Freescale QorIQ SoC drivers + +# +# i.MX SoC drivers +# +# end of i.MX SoC drivers + +# +# Qualcomm SoC drivers +# +# end of Qualcomm SoC drivers + +CONFIG_SOC_TI=y + +# +# Xilinx SoC drivers +# +CONFIG_XILINX_VCU=m +# end of Xilinx SoC drivers +# end of SOC (System On Chip) specific Drivers + +CONFIG_PM_DEVFREQ=y + +# +# DEVFREQ Governors +# +CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m +CONFIG_DEVFREQ_GOV_PERFORMANCE=m +CONFIG_DEVFREQ_GOV_POWERSAVE=m +CONFIG_DEVFREQ_GOV_USERSPACE=m +CONFIG_DEVFREQ_GOV_PASSIVE=m + +# +# DEVFREQ Drivers +# +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +CONFIG_EXTCON_ADC_JACK=m +CONFIG_EXTCON_ARIZONA=m +CONFIG_EXTCON_AXP288=m +CONFIG_EXTCON_FSA9480=m +CONFIG_EXTCON_GPIO=m +CONFIG_EXTCON_INTEL_INT3496=m +CONFIG_EXTCON_INTEL_CHT_WC=m +CONFIG_EXTCON_INTEL_MRFLD=m +CONFIG_EXTCON_MAX14577=m +CONFIG_EXTCON_MAX3355=m +CONFIG_EXTCON_MAX77693=m +CONFIG_EXTCON_MAX77843=m +CONFIG_EXTCON_MAX8997=m +CONFIG_EXTCON_PALMAS=m +CONFIG_EXTCON_PTN5150=m +CONFIG_EXTCON_RT8973A=m +CONFIG_EXTCON_SM5502=m +CONFIG_EXTCON_USB_GPIO=m +CONFIG_EXTCON_USBC_CROS_EC=m +CONFIG_MEMORY=y +CONFIG_IIO=m +CONFIG_IIO_BUFFER=y +CONFIG_IIO_BUFFER_CB=m +CONFIG_IIO_BUFFER_DMA=m +CONFIG_IIO_BUFFER_DMAENGINE=m +CONFIG_IIO_BUFFER_HW_CONSUMER=m +CONFIG_IIO_KFIFO_BUF=m +CONFIG_IIO_TRIGGERED_BUFFER=m +CONFIG_IIO_CONFIGFS=m +CONFIG_IIO_TRIGGER=y +CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 +CONFIG_IIO_SW_DEVICE=m +CONFIG_IIO_SW_TRIGGER=m +CONFIG_IIO_TRIGGERED_EVENT=m + +# +# Accelerometers +# +CONFIG_ADIS16201=m +CONFIG_ADIS16209=m +CONFIG_ADXL372=m +CONFIG_ADXL372_SPI=m +CONFIG_ADXL372_I2C=m +CONFIG_BMA220=m +CONFIG_BMA400=m +CONFIG_BMA400_I2C=m +CONFIG_BMC150_ACCEL=m +CONFIG_BMC150_ACCEL_I2C=m +CONFIG_BMC150_ACCEL_SPI=m +CONFIG_DA280=m +CONFIG_DA311=m +CONFIG_DMARD06=m +CONFIG_DMARD09=m +CONFIG_DMARD10=m +CONFIG_HID_SENSOR_ACCEL_3D=m +CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m +CONFIG_IIO_ST_ACCEL_3AXIS=m +CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m +CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m +CONFIG_KXSD9=m +CONFIG_KXSD9_SPI=m +CONFIG_KXSD9_I2C=m +CONFIG_KXCJK1013=m +CONFIG_MC3230=m +CONFIG_MMA7455=m +CONFIG_MMA7455_I2C=m +CONFIG_MMA7455_SPI=m +CONFIG_MMA7660=m +CONFIG_MMA8452=m +CONFIG_MMA9551_CORE=m +CONFIG_MMA9551=m +CONFIG_MMA9553=m +CONFIG_MXC4005=m +CONFIG_MXC6255=m +CONFIG_SCA3000=m +CONFIG_STK8312=m +CONFIG_STK8BA50=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD_SIGMA_DELTA=m +CONFIG_AD7091R5=m +CONFIG_AD7124=m +CONFIG_AD7192=m +CONFIG_AD7266=m +CONFIG_AD7291=m +CONFIG_AD7292=m +CONFIG_AD7298=m +CONFIG_AD7476=m +CONFIG_AD7606=m +CONFIG_AD7606_IFACE_PARALLEL=m +CONFIG_AD7606_IFACE_SPI=m +CONFIG_AD7766=m +CONFIG_AD7768_1=m +CONFIG_AD7780=m +CONFIG_AD7791=m +CONFIG_AD7793=m +CONFIG_AD7887=m +CONFIG_AD7923=m +CONFIG_AD7949=m +CONFIG_AD799X=m +CONFIG_AD9467=m +CONFIG_ADI_AXI_ADC=m +CONFIG_AXP20X_ADC=m +CONFIG_AXP288_ADC=m +CONFIG_CC10001_ADC=m +CONFIG_CPCAP_ADC=m +CONFIG_DA9150_GPADC=m +CONFIG_DLN2_ADC=m +CONFIG_ENVELOPE_DETECTOR=m +CONFIG_HI8435=m +CONFIG_HX711=m +CONFIG_INA2XX_ADC=m +CONFIG_INTEL_MRFLD_ADC=m +CONFIG_LP8788_ADC=m +CONFIG_LTC2471=m +CONFIG_LTC2485=m +CONFIG_LTC2496=m +CONFIG_LTC2497=m +CONFIG_MAX1027=m +CONFIG_MAX11100=m +CONFIG_MAX1118=m +CONFIG_MAX1241=m +CONFIG_MAX1363=m +CONFIG_MAX9611=m +CONFIG_MCP320X=m +CONFIG_MCP3422=m +CONFIG_MCP3911=m +CONFIG_MEN_Z188_ADC=m 
+CONFIG_MP2629_ADC=m +CONFIG_NAU7802=m +CONFIG_PALMAS_GPADC=m +CONFIG_QCOM_VADC_COMMON=m +CONFIG_QCOM_SPMI_IADC=m +CONFIG_QCOM_SPMI_VADC=m +CONFIG_QCOM_SPMI_ADC5=m +CONFIG_RN5T618_ADC=m +CONFIG_SD_ADC_MODULATOR=m +CONFIG_STMPE_ADC=m +CONFIG_TI_ADC081C=m +CONFIG_TI_ADC0832=m +CONFIG_TI_ADC084S021=m +CONFIG_TI_ADC12138=m +CONFIG_TI_ADC108S102=m +CONFIG_TI_ADC128S052=m +CONFIG_TI_ADC161S626=m +CONFIG_TI_ADS1015=m +CONFIG_TI_ADS7950=m +CONFIG_TI_ADS8344=m +CONFIG_TI_ADS8688=m +CONFIG_TI_ADS124S08=m +CONFIG_TI_AM335X_ADC=m +CONFIG_TI_TLC4541=m +CONFIG_TWL4030_MADC=m +CONFIG_TWL6030_GPADC=m +CONFIG_VF610_ADC=m +CONFIG_VIPERBOARD_ADC=m +CONFIG_XILINX_XADC=m +# end of Analog to digital converters + +# +# Analog Front Ends +# +CONFIG_IIO_RESCALE=m +# end of Analog Front Ends + +# +# Amplifiers +# +CONFIG_AD8366=m +CONFIG_HMC425=m +# end of Amplifiers + +# +# Chemical Sensors +# +CONFIG_ATLAS_PH_SENSOR=m +CONFIG_ATLAS_EZO_SENSOR=m +CONFIG_BME680=m +CONFIG_BME680_I2C=m +CONFIG_BME680_SPI=m +CONFIG_CCS811=m +CONFIG_IAQCORE=m +CONFIG_PMS7003=m +CONFIG_SENSIRION_SGP30=m +CONFIG_SPS30=m +CONFIG_VZ89X=m +# end of Chemical Sensors + +CONFIG_IIO_CROS_EC_SENSORS_CORE=m +CONFIG_IIO_CROS_EC_SENSORS=m +CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m + +# +# Hid Sensor IIO Common +# +CONFIG_HID_SENSOR_IIO_COMMON=m +CONFIG_HID_SENSOR_IIO_TRIGGER=m +# end of Hid Sensor IIO Common + +CONFIG_IIO_MS_SENSORS_I2C=m + +# +# SSP Sensor Common +# +CONFIG_IIO_SSP_SENSORS_COMMONS=m +CONFIG_IIO_SSP_SENSORHUB=m +# end of SSP Sensor Common + +CONFIG_IIO_ST_SENSORS_I2C=m +CONFIG_IIO_ST_SENSORS_SPI=m +CONFIG_IIO_ST_SENSORS_CORE=m + +# +# Digital to analog converters +# +CONFIG_AD5064=m +CONFIG_AD5360=m +CONFIG_AD5380=m +CONFIG_AD5421=m +CONFIG_AD5446=m +CONFIG_AD5449=m +CONFIG_AD5592R_BASE=m +CONFIG_AD5592R=m +CONFIG_AD5593R=m +CONFIG_AD5504=m +CONFIG_AD5624R_SPI=m +CONFIG_AD5686=m +CONFIG_AD5686_SPI=m +CONFIG_AD5696_I2C=m +CONFIG_AD5755=m +CONFIG_AD5758=m +CONFIG_AD5761=m +CONFIG_AD5764=m +CONFIG_AD5770R=m +CONFIG_AD5791=m +CONFIG_AD7303=m +CONFIG_AD8801=m +CONFIG_DPOT_DAC=m +CONFIG_DS4424=m +CONFIG_LTC1660=m +CONFIG_LTC2632=m +CONFIG_M62332=m +CONFIG_MAX517=m +CONFIG_MAX5821=m +CONFIG_MCP4725=m +CONFIG_MCP4922=m +CONFIG_TI_DAC082S085=m +CONFIG_TI_DAC5571=m +CONFIG_TI_DAC7311=m +CONFIG_TI_DAC7612=m +CONFIG_VF610_DAC=m +# end of Digital to analog converters + +# +# IIO dummy driver +# +# CONFIG_IIO_SIMPLE_DUMMY is not set +# end of IIO dummy driver + +# +# Frequency Synthesizers DDS/PLL +# + +# +# Clock Generator/Distribution +# +CONFIG_AD9523=m +# end of Clock Generator/Distribution + +# +# Phase-Locked Loop (PLL) frequency synthesizers +# +CONFIG_ADF4350=m +CONFIG_ADF4371=m +# end of Phase-Locked Loop (PLL) frequency synthesizers +# end of Frequency Synthesizers DDS/PLL + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16080=m +CONFIG_ADIS16130=m +CONFIG_ADIS16136=m +CONFIG_ADIS16260=m +CONFIG_ADXRS450=m +CONFIG_BMG160=m +CONFIG_BMG160_I2C=m +CONFIG_BMG160_SPI=m +CONFIG_FXAS21002C=m +CONFIG_FXAS21002C_I2C=m +CONFIG_FXAS21002C_SPI=m +CONFIG_HID_SENSOR_GYRO_3D=m +CONFIG_MPU3050=m +CONFIG_MPU3050_I2C=m +CONFIG_IIO_ST_GYRO_3AXIS=m +CONFIG_IIO_ST_GYRO_I2C_3AXIS=m +CONFIG_IIO_ST_GYRO_SPI_3AXIS=m +CONFIG_ITG3200=m +# end of Digital gyroscope sensors + +# +# Health Sensors +# + +# +# Heart Rate Monitors +# +CONFIG_AFE4403=m +CONFIG_AFE4404=m +CONFIG_MAX30100=m +CONFIG_MAX30102=m +# end of Heart Rate Monitors +# end of Health Sensors + +# +# Humidity sensors +# +CONFIG_AM2315=m +CONFIG_DHT11=m +CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m 
+CONFIG_HTS221=m +CONFIG_HTS221_I2C=m +CONFIG_HTS221_SPI=m +CONFIG_HTU21=m +CONFIG_SI7005=m +CONFIG_SI7020=m +# end of Humidity sensors + +# +# Inertial measurement units +# +CONFIG_ADIS16400=m +CONFIG_ADIS16460=m +CONFIG_ADIS16475=m +CONFIG_ADIS16480=m +CONFIG_BMI160=m +CONFIG_BMI160_I2C=m +CONFIG_BMI160_SPI=m +CONFIG_FXOS8700=m +CONFIG_FXOS8700_I2C=m +CONFIG_FXOS8700_SPI=m +CONFIG_KMX61=m +CONFIG_INV_MPU6050_IIO=m +CONFIG_INV_MPU6050_I2C=m +CONFIG_INV_MPU6050_SPI=m +CONFIG_IIO_ST_LSM6DSX=m +CONFIG_IIO_ST_LSM6DSX_I2C=m +CONFIG_IIO_ST_LSM6DSX_SPI=m +CONFIG_IIO_ST_LSM6DSX_I3C=m +# end of Inertial measurement units + +CONFIG_IIO_ADIS_LIB=m +CONFIG_IIO_ADIS_LIB_BUFFER=y + +# +# Light sensors +# +CONFIG_ACPI_ALS=m +CONFIG_ADJD_S311=m +CONFIG_ADUX1020=m +CONFIG_AL3010=m +CONFIG_AL3320A=m +CONFIG_APDS9300=m +CONFIG_APDS9960=m +CONFIG_BH1750=m +CONFIG_BH1780=m +CONFIG_CM32181=m +CONFIG_CM3232=m +CONFIG_CM3323=m +CONFIG_CM3605=m +CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m +CONFIG_GP2AP002=m +CONFIG_GP2AP020A00F=m +CONFIG_IQS621_ALS=m +CONFIG_SENSORS_ISL29018=m +CONFIG_SENSORS_ISL29028=m +CONFIG_ISL29125=m +CONFIG_HID_SENSOR_ALS=m +CONFIG_HID_SENSOR_PROX=m +CONFIG_JSA1212=m +CONFIG_RPR0521=m +CONFIG_SENSORS_LM3533=m +CONFIG_LTR501=m +CONFIG_LV0104CS=m +CONFIG_MAX44000=m +CONFIG_MAX44009=m +CONFIG_NOA1305=m +CONFIG_OPT3001=m +CONFIG_PA12203001=m +CONFIG_SI1133=m +CONFIG_SI1145=m +CONFIG_STK3310=m +CONFIG_ST_UVIS25=m +CONFIG_ST_UVIS25_I2C=m +CONFIG_ST_UVIS25_SPI=m +CONFIG_TCS3414=m +CONFIG_TCS3472=m +CONFIG_SENSORS_TSL2563=m +CONFIG_TSL2583=m +CONFIG_TSL2772=m +CONFIG_TSL4531=m +CONFIG_US5182D=m +CONFIG_VCNL4000=m +CONFIG_VCNL4035=m +CONFIG_VEML6030=m +CONFIG_VEML6070=m +CONFIG_VL6180=m +CONFIG_ZOPT2201=m +# end of Light sensors + +# +# Magnetometer sensors +# +CONFIG_AK8974=m +CONFIG_AK8975=m +CONFIG_AK09911=m +CONFIG_BMC150_MAGN=m +CONFIG_BMC150_MAGN_I2C=m +CONFIG_BMC150_MAGN_SPI=m +CONFIG_MAG3110=m +CONFIG_HID_SENSOR_MAGNETOMETER_3D=m +CONFIG_MMC35240=m +CONFIG_IIO_ST_MAGN_3AXIS=m +CONFIG_IIO_ST_MAGN_I2C_3AXIS=m +CONFIG_IIO_ST_MAGN_SPI_3AXIS=m +CONFIG_SENSORS_HMC5843=m +CONFIG_SENSORS_HMC5843_I2C=m +CONFIG_SENSORS_HMC5843_SPI=m +CONFIG_SENSORS_RM3100=m +CONFIG_SENSORS_RM3100_I2C=m +CONFIG_SENSORS_RM3100_SPI=m +# end of Magnetometer sensors + +# +# Multiplexers +# +CONFIG_IIO_MUX=m +# end of Multiplexers + +# +# Inclinometer sensors +# +CONFIG_HID_SENSOR_INCLINOMETER_3D=m +CONFIG_HID_SENSOR_DEVICE_ROTATION=m +# end of Inclinometer sensors + +# +# Triggers - standalone +# +CONFIG_IIO_HRTIMER_TRIGGER=m +CONFIG_IIO_INTERRUPT_TRIGGER=m +CONFIG_IIO_TIGHTLOOP_TRIGGER=m +CONFIG_IIO_SYSFS_TRIGGER=m +# end of Triggers - standalone + +# +# Linear and angular position sensors +# +CONFIG_IQS624_POS=m +# end of Linear and angular position sensors + +# +# Digital potentiometers +# +CONFIG_AD5272=m +CONFIG_DS1803=m +CONFIG_MAX5432=m +CONFIG_MAX5481=m +CONFIG_MAX5487=m +CONFIG_MCP4018=m +CONFIG_MCP4131=m +CONFIG_MCP4531=m +CONFIG_MCP41010=m +CONFIG_TPL0102=m +# end of Digital potentiometers + +# +# Digital potentiostats +# +CONFIG_LMP91000=m +# end of Digital potentiostats + +# +# Pressure sensors +# +CONFIG_ABP060MG=m +CONFIG_BMP280=m +CONFIG_BMP280_I2C=m +CONFIG_BMP280_SPI=m +CONFIG_IIO_CROS_EC_BARO=m +CONFIG_DLHL60D=m +CONFIG_DPS310=m +CONFIG_HID_SENSOR_PRESS=m +CONFIG_HP03=m +CONFIG_ICP10100=m +CONFIG_MPL115=m +CONFIG_MPL115_I2C=m +CONFIG_MPL115_SPI=m +CONFIG_MPL3115=m +CONFIG_MS5611=m +CONFIG_MS5611_I2C=m +CONFIG_MS5611_SPI=m +CONFIG_MS5637=m +CONFIG_IIO_ST_PRESS=m +CONFIG_IIO_ST_PRESS_I2C=m 
+CONFIG_IIO_ST_PRESS_SPI=m +CONFIG_T5403=m +CONFIG_HP206C=m +CONFIG_ZPA2326=m +CONFIG_ZPA2326_I2C=m +CONFIG_ZPA2326_SPI=m +# end of Pressure sensors + +# +# Lightning sensors +# +CONFIG_AS3935=m +# end of Lightning sensors + +# +# Proximity and distance sensors +# +CONFIG_ISL29501=m +CONFIG_LIDAR_LITE_V2=m +CONFIG_MB1232=m +CONFIG_PING=m +CONFIG_RFD77402=m +CONFIG_SRF04=m +CONFIG_SX9310=m +CONFIG_SX9500=m +CONFIG_SRF08=m +CONFIG_VCNL3020=m +CONFIG_VL53L0X_I2C=m +# end of Proximity and distance sensors + +# +# Resolver to digital converters +# +CONFIG_AD2S90=m +CONFIG_AD2S1200=m +# end of Resolver to digital converters + +# +# Temperature sensors +# +CONFIG_IQS620AT_TEMP=m +CONFIG_LTC2983=m +CONFIG_MAXIM_THERMOCOUPLE=m +CONFIG_HID_SENSOR_TEMP=m +CONFIG_MLX90614=m +CONFIG_MLX90632=m +CONFIG_TMP006=m +CONFIG_TMP007=m +CONFIG_TSYS01=m +CONFIG_TSYS02D=m +CONFIG_MAX31856=m +# end of Temperature sensors + +CONFIG_NTB=m +CONFIG_NTB_MSI=y +CONFIG_NTB_AMD=m +CONFIG_NTB_IDT=m +CONFIG_NTB_INTEL=m +CONFIG_NTB_SWITCHTEC=m +# CONFIG_NTB_PINGPONG is not set +# CONFIG_NTB_TOOL is not set +# CONFIG_NTB_PERF is not set +# CONFIG_NTB_MSI_TEST is not set +CONFIG_NTB_TRANSPORT=m +CONFIG_VME_BUS=y + +# +# VME Bridge Drivers +# +CONFIG_VME_CA91CX42=m +CONFIG_VME_TSI148=m +# CONFIG_VME_FAKE is not set + +# +# VME Board Drivers +# +CONFIG_VMIVME_7805=m + +# +# VME Device Drivers +# +CONFIG_VME_USER=m +CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +# CONFIG_PWM_DEBUG is not set +CONFIG_PWM_ATMEL_HLCDC_PWM=m +CONFIG_PWM_CRC=y +CONFIG_PWM_CROS_EC=m +CONFIG_PWM_FSL_FTM=m +CONFIG_PWM_IQS620A=m +CONFIG_PWM_LP3943=m +CONFIG_PWM_LPSS=m +CONFIG_PWM_LPSS_PCI=m +CONFIG_PWM_LPSS_PLATFORM=m +CONFIG_PWM_PCA9685=m +CONFIG_PWM_STMPE=y +CONFIG_PWM_TWL=m +CONFIG_PWM_TWL_LED=m + +# +# IRQ chip support +# +CONFIG_IRQCHIP=y +CONFIG_AL_FIC=y +CONFIG_MADERA_IRQ=m +# end of IRQ chip support + +CONFIG_IPACK_BUS=m +CONFIG_BOARD_TPCI200=m +CONFIG_SERIAL_IPOCTAL=m +CONFIG_RESET_CONTROLLER=y +CONFIG_RESET_BRCMSTB_RESCAL=y +CONFIG_RESET_INTEL_GW=y +CONFIG_RESET_TI_SYSCON=m + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_GENERIC_PHY_MIPI_DPHY=y +CONFIG_BCM_KONA_USB2_PHY=m +CONFIG_PHY_CADENCE_TORRENT=m +CONFIG_PHY_CADENCE_DPHY=m +CONFIG_PHY_CADENCE_SIERRA=m +CONFIG_PHY_CADENCE_SALVO=m +CONFIG_PHY_FSL_IMX8MQ_USB=m +CONFIG_PHY_MIXEL_MIPI_DPHY=m +CONFIG_PHY_PXA_28NM_HSIC=m +CONFIG_PHY_PXA_28NM_USB2=m +CONFIG_PHY_CPCAP_USB=m +CONFIG_PHY_MAPPHONE_MDM6600=m +CONFIG_PHY_OCELOT_SERDES=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_PHY_QCOM_USB_HSIC=m +CONFIG_PHY_SAMSUNG_USB2=m +CONFIG_PHY_TUSB1210=m +CONFIG_PHY_INTEL_COMBO=y +CONFIG_PHY_INTEL_EMMC=m +# end of PHY Subsystem + +CONFIG_POWERCAP=y +CONFIG_INTEL_RAPL_CORE=m +CONFIG_INTEL_RAPL=m +CONFIG_IDLE_INJECT=y +CONFIG_MCB=m +CONFIG_MCB_PCI=m +CONFIG_MCB_LPC=m + +# +# Performance monitor support +# +# end of Performance monitor support + +CONFIG_RAS=y +CONFIG_RAS_CEC=y +# CONFIG_RAS_CEC_DEBUG is not set +CONFIG_USB4=m + +# +# Android +# +# CONFIG_ANDROID is not set +# end of Android + +CONFIG_LIBNVDIMM=y +CONFIG_BLK_DEV_PMEM=m +CONFIG_ND_BLK=m +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=m +CONFIG_BTT=y +CONFIG_ND_PFN=m +CONFIG_NVDIMM_PFN=y +CONFIG_NVDIMM_DAX=y +CONFIG_OF_PMEM=m +CONFIG_DAX_DRIVER=y +CONFIG_DAX=y +CONFIG_DEV_DAX=m +CONFIG_DEV_DAX_PMEM=m +CONFIG_DEV_DAX_HMEM=m +CONFIG_DEV_DAX_KMEM=m +CONFIG_DEV_DAX_PMEM_COMPAT=m +CONFIG_NVMEM=y +CONFIG_NVMEM_SYSFS=y +CONFIG_NVMEM_SPMI_SDAM=m +CONFIG_RAVE_SP_EEPROM=m + +# +# HW tracing support +# +CONFIG_STM=m +CONFIG_STM_PROTO_BASIC=m +CONFIG_STM_PROTO_SYS_T=m +# CONFIG_STM_DUMMY is 
not set +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +CONFIG_STM_SOURCE_FTRACE=m +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_ACPI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m +# CONFIG_INTEL_TH_DEBUG is not set +# end of HW tracing support + +CONFIG_FPGA=m +CONFIG_ALTERA_PR_IP_CORE=m +CONFIG_ALTERA_PR_IP_CORE_PLAT=m +CONFIG_FPGA_MGR_ALTERA_PS_SPI=m +CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_FPGA_MGR_ICE40_SPI=m +CONFIG_FPGA_MGR_MACHXO2_SPI=m +CONFIG_FPGA_BRIDGE=m +CONFIG_ALTERA_FREEZE_BRIDGE=m +CONFIG_XILINX_PR_DECOUPLER=m +CONFIG_FPGA_REGION=m +CONFIG_OF_FPGA_REGION=m +CONFIG_FPGA_DFL=m +CONFIG_FPGA_DFL_FME=m +CONFIG_FPGA_DFL_FME_MGR=m +CONFIG_FPGA_DFL_FME_BRIDGE=m +CONFIG_FPGA_DFL_FME_REGION=m +CONFIG_FPGA_DFL_AFU=m +CONFIG_FPGA_DFL_PCI=m +CONFIG_FSI=m +CONFIG_FSI_NEW_DEV_NODE=y +CONFIG_FSI_MASTER_GPIO=m +CONFIG_FSI_MASTER_HUB=m +CONFIG_FSI_MASTER_ASPEED=m +CONFIG_FSI_SCOM=m +CONFIG_FSI_SBEFIFO=m +CONFIG_FSI_OCC=m +CONFIG_TEE=m + +# +# TEE drivers +# +CONFIG_AMDTEE=m +# end of TEE drivers + +CONFIG_MULTIPLEXER=m + +# +# Multiplexer drivers +# +CONFIG_MUX_ADG792A=m +CONFIG_MUX_ADGS1408=m +CONFIG_MUX_GPIO=m +CONFIG_MUX_MMIO=m +# end of Multiplexer drivers + +CONFIG_PM_OPP=y +CONFIG_UNISYS_VISORBUS=m +CONFIG_SIOX=m +CONFIG_SIOX_BUS_GPIO=m +CONFIG_SLIMBUS=m +CONFIG_SLIM_QCOM_CTRL=m +CONFIG_INTERCONNECT=y +CONFIG_COUNTER=m +CONFIG_FTM_QUADDEC=m +CONFIG_MOST=m +# end of Device Drivers + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_VALIDATE_FS_PARSER=y +CONFIG_FS_IOMAP=y +# CONFIG_EXT2_FS is not set +# CONFIG_EXT3_FS is not set +CONFIG_EXT4_FS=m +CONFIG_EXT4_USE_FOR_EXT2=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_XFS_ONLINE_SCRUB=y +CONFIG_XFS_ONLINE_REPAIR=y +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_OCFS2_FS_O2CB=m +CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m +CONFIG_OCFS2_FS_STATS=y +CONFIG_OCFS2_DEBUG_MASKLOG=y +# CONFIG_OCFS2_DEBUG_FS is not set +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +# CONFIG_BTRFS_FS_REF_VERIFY is not set +CONFIG_NILFS2_FS=m +CONFIG_F2FS_FS=m +CONFIG_F2FS_STAT_FS=y +CONFIG_F2FS_FS_XATTR=y +CONFIG_F2FS_FS_POSIX_ACL=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y +# CONFIG_F2FS_IO_TRACE is not set +# CONFIG_F2FS_FAULT_INJECTION is not set +CONFIG_F2FS_FS_COMPRESSION=y +CONFIG_F2FS_FS_LZO=y +CONFIG_F2FS_FS_LZ4=y +CONFIG_F2FS_FS_ZSTD=y +CONFIG_F2FS_FS_LZORLE=y +CONFIG_ZONEFS_FS=m +CONFIG_FS_DAX=y +CONFIG_FS_DAX_PMD=y +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FILE_LOCKING=y +# CONFIG_MANDATORY_FILE_LOCKING is not set +CONFIG_FS_ENCRYPTION=y +CONFIG_FS_ENCRYPTION_ALGS=m +CONFIG_FS_VERITY=y +# CONFIG_FS_VERITY_DEBUG is not set +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y +CONFIG_FSNOTIFY=y 
+CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QUOTA_DEBUG is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_QUOTACTL_COMPAT=y +CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS_FS=y +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m +CONFIG_OVERLAY_FS=m +CONFIG_OVERLAY_FS_REDIRECT_DIR=y +# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set +CONFIG_OVERLAY_FS_INDEX=y +CONFIG_OVERLAY_FS_XINO_AUTO=y +CONFIG_OVERLAY_FS_METACOPY=y + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +CONFIG_FSCACHE_HISTOGRAM=y +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set +# end of Caches + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +# end of CD-ROM/DVD Filesystems + +# +# DOS/FAT/EXFAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" +CONFIG_FAT_DEFAULT_UTF8=y +CONFIG_EXFAT_FS=m +CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y +# end of DOS/FAT/EXFAT/NT Filesystems + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_PROC_VMCORE=y +CONFIG_PROC_VMCORE_DEVICE_DUMP=y +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_PROC_CHILDREN=y +CONFIG_PROC_PID_ARCH_STATUS=y +CONFIG_PROC_CPU_RESCTRL=y +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_MEMFD_CREATE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=y +CONFIG_EFIVAR_FS=y +# end of Pseudo filesystems + +CONFIG_MISC_FILESYSTEMS=y +CONFIG_ORANGEFS_FS=m +# CONFIG_ADFS_FS is not set +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=m +# CONFIG_ECRYPT_FS_MESSAGING is not set +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_FS_WBUF_VERIFY is not set +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_RTIME=y +CONFIG_UBIFS_FS=m +# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +CONFIG_UBIFS_FS_ZSTD=y +CONFIG_UBIFS_ATIME_SUPPORT=y +CONFIG_UBIFS_FS_XATTR=y +CONFIG_UBIFS_FS_SECURITY=y +CONFIG_UBIFS_FS_AUTHENTICATION=y +CONFIG_CRAMFS=m +CONFIG_CRAMFS_BLOCKDEV=y +CONFIG_CRAMFS_MTD=y +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +CONFIG_SQUASHFS_DECOMP_MULTI=y +# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set +# CONFIG_SQUASHFS_EMBEDDED is not set +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +# CONFIG_VXFS_FS is not set +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX6FS_FS is not set +CONFIG_ROMFS_FS=m +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# 
CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_PSTORE=y +CONFIG_PSTORE_DEFLATE_COMPRESS=m +CONFIG_PSTORE_LZO_COMPRESS=m +CONFIG_PSTORE_LZ4_COMPRESS=m +CONFIG_PSTORE_LZ4HC_COMPRESS=m +# CONFIG_PSTORE_842_COMPRESS is not set +CONFIG_PSTORE_ZSTD_COMPRESS=y +CONFIG_PSTORE_COMPRESS=y +# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y +CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" +# CONFIG_PSTORE_CONSOLE is not set +# CONFIG_PSTORE_PMSG is not set +# CONFIG_PSTORE_FTRACE is not set +CONFIG_PSTORE_RAM=y +CONFIG_PSTORE_ZONE=m +CONFIG_PSTORE_BLK=m +CONFIG_PSTORE_BLK_BLKDEV="" +CONFIG_PSTORE_BLK_KMSG_SIZE=64 +CONFIG_PSTORE_BLK_MAX_REASON=2 +# CONFIG_SYSV_FS is not set +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EROFS_FS=m +# CONFIG_EROFS_FS_DEBUG is not set +CONFIG_EROFS_FS_XATTR=y +CONFIG_EROFS_FS_POSIX_ACL=y +CONFIG_EROFS_FS_SECURITY=y +CONFIG_EROFS_FS_ZIP=y +CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 +CONFIG_VBOXSF_FS=m +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V2=m +CONFIG_NFS_V3=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_NFS_V4_SECURITY_LABEL=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +CONFIG_NFS_DEBUG=y +# CONFIG_NFS_DISABLE_UDP_SUPPORT is not set +CONFIG_NFSD=m +CONFIG_NFSD_V2_ACL=y +CONFIG_NFSD_V3=y +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_PNFS=y +CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_NFSD_SCSILAYOUT=y +# CONFIG_NFSD_FLEXFILELAYOUT is not set +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_GRACE_PERIOD=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y +CONFIG_SUNRPC_DEBUG=y +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y +CONFIG_CEPH_FS_SECURITY_LABEL=y +CONFIG_CIFS=m +# CONFIG_CIFS_STATS2 is not set +CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y +# CONFIG_CIFS_WEAK_PW_HASH is not set +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_CIFS_DEBUG=y +# CONFIG_CIFS_DEBUG2 is not set +# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set +CONFIG_CIFS_DFS_UPCALL=y +# CONFIG_CIFS_SMB_DIRECT is not set +CONFIG_CIFS_FSCACHE=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +# CONFIG_AFS_DEBUG is not set +CONFIG_AFS_FSCACHE=y +# CONFIG_AFS_DEBUG_CURSOR is not set +CONFIG_9P_FS=m +CONFIG_9P_FSCACHE=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m 
+CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_MAC_ROMAN=m +CONFIG_NLS_MAC_CELTIC=m +CONFIG_NLS_MAC_CENTEURO=m +CONFIG_NLS_MAC_CROATIAN=m +CONFIG_NLS_MAC_CYRILLIC=m +CONFIG_NLS_MAC_GAELIC=m +CONFIG_NLS_MAC_GREEK=m +CONFIG_NLS_MAC_ICELAND=m +CONFIG_NLS_MAC_INUIT=m +CONFIG_NLS_MAC_ROMANIAN=m +CONFIG_NLS_MAC_TURKISH=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set +CONFIG_UNICODE=y +# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set +CONFIG_IO_WQ=y +# end of File systems + +# +# Security options +# +CONFIG_KEYS=y +CONFIG_KEYS_REQUEST_CACHE=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_TRUSTED_KEYS=m +CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_DH_OPERATIONS=y +CONFIG_KEY_NOTIFICATIONS=y +# CONFIG_SECURITY_DMESG_RESTRICT is not set +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_PAGE_TABLE_ISOLATION=y +CONFIG_SECURITY_INFINIBAND=y +CONFIG_SECURITY_NETWORK_XFRM=y +CONFIG_SECURITY_PATH=y +# CONFIG_INTEL_TXT is not set +CONFIG_LSM_MMAP_MIN_ADDR=65536 +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_HARDENED_USERCOPY_FALLBACK=y +# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set +CONFIG_FORTIFY_SOURCE=y +# CONFIG_STATIC_USERMODEHELPER is not set +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +# CONFIG_SECURITY_SELINUX_DISABLE is not set +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_AVC_STATS=y +CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 +CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 +CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 +CONFIG_SECURITY_SMACK=y +CONFIG_SECURITY_SMACK_BRINGUP=y +CONFIG_SECURITY_SMACK_NETFILTER=y +CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y +CONFIG_SECURITY_TOMOYO=y +CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 +CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 +# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set +CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" +CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" +# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not set +CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_APPARMOR_HASH=y +CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y +# CONFIG_SECURITY_APPARMOR_DEBUG is not set +# CONFIG_SECURITY_LOADPIN is not set +CONFIG_SECURITY_YAMA=y +CONFIG_SECURITY_SAFESETID=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set +CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y +# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set +# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set +# CONFIG_INTEGRITY is not set +# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set +# CONFIG_DEFAULT_SECURITY_SELINUX is not set +# CONFIG_DEFAULT_SECURITY_SMACK is not set +# CONFIG_DEFAULT_SECURITY_TOMOYO is not set +# CONFIG_DEFAULT_SECURITY_APPARMOR is not set +CONFIG_DEFAULT_SECURITY_DAC=y +CONFIG_LSM="lockdown,yama" + +# +# Kernel hardening options +# +CONFIG_GCC_PLUGIN_STRUCTLEAK=y + +# +# Memory initialization +# +# CONFIG_INIT_STACK_NONE is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set +CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y +# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set +# 
CONFIG_GCC_PLUGIN_STACKLEAK is not set +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set +# end of Memory initialization +# end of Kernel hardening options +# end of Security options + +CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=y +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=y +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y +CONFIG_CRYPTO_GF128MUL=y +CONFIG_CRYPTO_NULL=y +CONFIG_CRYPTO_NULL2=y +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_SIMD=m +CONFIG_CRYPTO_GLUE_HELPER_X86=m +CONFIG_CRYPTO_ENGINE=m + +# +# Public-key cryptography +# +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=y +CONFIG_CRYPTO_ECC=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m +CONFIG_CRYPTO_CURVE25519=m +CONFIG_CRYPTO_CURVE25519_X86=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_ECHAINIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTR=y +CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_OFB=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_NHPOLY1305=m +CONFIG_CRYPTO_NHPOLY1305_SSE2=m +CONFIG_CRYPTO_NHPOLY1305_AVX2=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ESSIV=m + +# +# Hash modes +# +CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_CRC32_PCLMUL=m +CONFIG_CRYPTO_XXHASH=m +CONFIG_CRYPTO_BLAKE2B=m +CONFIG_CRYPTO_BLAKE2S=m +CONFIG_CRYPTO_BLAKE2S_X86=m +CONFIG_CRYPTO_CRCT10DIF=y +CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m +CONFIG_CRYPTO_GHASH=y +CONFIG_CRYPTO_POLY1305=m +CONFIG_CRYPTO_POLY1305_X86_64=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA1_SSSE3=m +CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA512_SSSE3=m +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m +CONFIG_CRYPTO_STREEBOG=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES_NI_INTEL=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_BLOWFISH_COMMON=m +CONFIG_CRYPTO_BLOWFISH_X86_64=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAMELLIA_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m +CONFIG_CRYPTO_CAST_COMMON=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST5_AVX_X86_64=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_CAST6_AVX_X86_64=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_DES3_EDE_X86_64=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m 
+CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_CHACHA20=m +CONFIG_CRYPTO_CHACHA20_X86_64=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m +CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +CONFIG_CRYPTO_TWOFISH_X86_64=m +CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m +CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_LZO=m +CONFIG_CRYPTO_842=m +CONFIG_CRYPTO_LZ4=y +CONFIG_CRYPTO_LZ4HC=m +CONFIG_CRYPTO_ZSTD=y + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_DRBG_MENU=y +CONFIG_CRYPTO_DRBG_HMAC=y +CONFIG_CRYPTO_DRBG_HASH=y +CONFIG_CRYPTO_DRBG_CTR=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_JITTERENTROPY=y +CONFIG_CRYPTO_USER_API=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +# CONFIG_CRYPTO_STATS is not set +CONFIG_CRYPTO_HASH_INFO=y + +# +# Crypto library routines +# +CONFIG_CRYPTO_LIB_AES=y +CONFIG_CRYPTO_LIB_ARC4=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m +CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m +CONFIG_CRYPTO_LIB_BLAKE2S=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m +CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m +CONFIG_CRYPTO_LIB_CHACHA=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_DES=m +CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 +CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m +CONFIG_CRYPTO_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m +CONFIG_CRYPTO_LIB_SHA256=y +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +CONFIG_CRYPTO_DEV_ATMEL_I2C=m +CONFIG_CRYPTO_DEV_ATMEL_ECC=m +CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m +CONFIG_CRYPTO_DEV_CCP=y +CONFIG_CRYPTO_DEV_CCP_DD=m +CONFIG_CRYPTO_DEV_SP_CCP=y +CONFIG_CRYPTO_DEV_CCP_CRYPTO=m +CONFIG_CRYPTO_DEV_SP_PSP=y +CONFIG_CRYPTO_DEV_CCP_DEBUGFS=y +CONFIG_CRYPTO_DEV_QAT=m +CONFIG_CRYPTO_DEV_QAT_DH895xCC=m +CONFIG_CRYPTO_DEV_QAT_C3XXX=m +CONFIG_CRYPTO_DEV_QAT_C62X=m +CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m +CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m +CONFIG_CRYPTO_DEV_QAT_C62XVF=m +CONFIG_CRYPTO_DEV_NITROX=m +CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m +CONFIG_CRYPTO_DEV_CHELSIO=m +CONFIG_CHELSIO_IPSEC_INLINE=y +CONFIG_CHELSIO_TLS_DEVICE=y +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_CRYPTO_DEV_SAFEXCEL=m +CONFIG_CRYPTO_DEV_CCREE=m +CONFIG_CRYPTO_DEV_AMLOGIC_GXL=m +CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS8_PRIVATE_KEY_PARSER=m +CONFIG_TPM_KEY_PARSER=m +CONFIG_PKCS7_MESSAGE_PARSER=y +# CONFIG_PKCS7_TEST_KEY is not set +CONFIG_SIGNED_PE_FILE_VERIFICATION=y + +# +# Certificates for signature checking +# +CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" +# end of Certificates for signature checking + +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_RAID6_PQ_BENCHMARK=y +CONFIG_LINEAR_RANGES=y +CONFIG_PACKING=y +CONFIG_BITREVERSE=y +CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y 
+CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_CORDIC=m +# CONFIG_PRIME_NUMBERS is not set +CONFIG_RATIONAL=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_GENERIC_IOMAP=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_ARCH_USE_SYM_ANNOTATIONS=y +CONFIG_CRC_CCITT=y +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=y +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set +CONFIG_CRC64=m +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_CRC8=m +CONFIG_XXHASH=y +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_842_COMPRESS=m +CONFIG_842_DECOMPRESS=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=y +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=y +CONFIG_LZ4HC_COMPRESS=m +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y +CONFIG_XZ_DEC=y +CONFIG_XZ_DEC_X86=y +CONFIG_XZ_DEC_POWERPC=y +CONFIG_XZ_DEC_IA64=y +CONFIG_XZ_DEC_ARM=y +CONFIG_XZ_DEC_ARMTHUMB=y +CONFIG_XZ_DEC_SPARC=y +CONFIG_XZ_DEC_BCJ=y +# CONFIG_XZ_DEC_TEST is not set +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZO=y +CONFIG_DECOMPRESS_LZ4=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=y +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_BCH=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_BTREE=y +CONFIG_INTERVAL_TREE=y +CONFIG_XARRAY_MULTI=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_DMA_DECLARE_COHERENT=y +CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y +CONFIG_DMA_VIRT_OPS=y +CONFIG_SWIOTLB=y +CONFIG_DMA_COHERENT_POOL=y +# CONFIG_DMA_API_DEBUG is not set +CONFIG_SGL_ALLOC=y +CONFIG_IOMMU_HELPER=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_LRU_CACHE=m +CONFIG_CLZ_TAB=y +CONFIG_IRQ_POLL=y +CONFIG_MPILIB=y +CONFIG_DIMLIB=y +CONFIG_LIBFDT=y +CONFIG_OID_REGISTRY=y +CONFIG_UCS2_STRING=y +CONFIG_HAVE_GENERIC_VDSO=y +CONFIG_GENERIC_GETTIMEOFDAY=y +CONFIG_GENERIC_VDSO_TIME_NS=y +CONFIG_FONT_SUPPORT=y +CONFIG_FONTS=y +# CONFIG_FONT_8x8 is not set +CONFIG_FONT_8x16=y +# CONFIG_FONT_6x11 is not set +# CONFIG_FONT_7x14 is not set +# CONFIG_FONT_PEARL_8x8 is not set +# CONFIG_FONT_ACORN_8x8 is not set +# CONFIG_FONT_MINI_4x6 is not set +# CONFIG_FONT_6x10 is not set +# CONFIG_FONT_10x18 is not set +# CONFIG_FONT_SUN8x16 is not set +# CONFIG_FONT_SUN12x22 is not set +CONFIG_FONT_TER16x32=y +CONFIG_SG_POOL=y +CONFIG_ARCH_HAS_PMEM_API=y +CONFIG_MEMREGION=y +CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y +CONFIG_ARCH_HAS_UACCESS_MCSAFE=y +CONFIG_ARCH_STACKWALK=y +CONFIG_SBITMAP=y +CONFIG_PARMAN=m +CONFIG_OBJAGG=m +# CONFIG_STRING_SELFTEST is not set +# end of Library routines + +# +# Kernel hacking +# + +# +# printk and dmesg options +# +CONFIG_PRINTK_TIME=y +# CONFIG_PRINTK_CALLER is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 +CONFIG_CONSOLE_LOGLEVEL_QUIET=1 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 +# CONFIG_BOOT_PRINTK_DELAY is not set +CONFIG_DYNAMIC_DEBUG=y +CONFIG_DYNAMIC_DEBUG_CORE=y +CONFIG_SYMBOLIC_ERRNAME=y +CONFIG_DEBUG_BUGVERBOSE=y +# end of printk and dmesg options + +# +# Compile-time checks and compiler options +# +CONFIG_DEBUG_INFO=y +# 
CONFIG_DEBUG_INFO_REDUCED is not set +# CONFIG_DEBUG_INFO_COMPRESSED is not set +# CONFIG_DEBUG_INFO_SPLIT is not set +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_DEBUG_INFO_BTF=y +# CONFIG_GDB_SCRIPTS is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 +CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +# CONFIG_HEADERS_INSTALL is not set +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +CONFIG_STACK_VALIDATION=y +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +# end of Compile-time checks and compiler options + +# +# Generic Kernel Debugging Instruments +# +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" +CONFIG_DEBUG_FS=y +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# CONFIG_UBSAN is not set +# end of Generic Kernel Debugging Instruments + +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_MISC=y + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_OWNER is not set +CONFIG_PAGE_POISONING=y +CONFIG_PAGE_POISONING_NO_SANITY=y +CONFIG_PAGE_POISONING_ZERO=y +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +CONFIG_ARCH_HAS_DEBUG_WX=y +CONFIG_DEBUG_WX=y +CONFIG_GENERIC_PTDUMP=y +CONFIG_PTDUMP_CORE=y +# CONFIG_PTDUMP_DEBUGFS is not set +# CONFIG_DEBUG_OBJECTS is not set +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +CONFIG_SCHED_STACK_END_CHECK=y +CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y +# CONFIG_DEBUG_VM is not set +# CONFIG_DEBUG_VM_PGTABLE is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +CONFIG_DEBUG_MEMORY_INIT=y +# CONFIG_DEBUG_PER_CPU_MAPS is not set +CONFIG_HAVE_ARCH_KASAN=y +CONFIG_HAVE_ARCH_KASAN_VMALLOC=y +CONFIG_CC_HAS_KASAN_GENERIC=y +CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y +# CONFIG_KASAN is not set +CONFIG_KASAN_STACK=1 +# end of Memory Debugging + +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Oops, Lockups and Hangs +# +# CONFIG_PANIC_ON_OOPS is not set +CONFIG_PANIC_ON_OOPS_VALUE=0 +CONFIG_PANIC_TIMEOUT=0 +CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y +CONFIG_HARDLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 +# CONFIG_WQ_WATCHDOG is not set +# CONFIG_TEST_LOCKUP is not set +# end of Debug Oops, Lockups and Hangs + +# +# Scheduler Debugging +# +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_INFO=y +CONFIG_SCHEDSTATS=y +# end of Scheduler Debugging + +# CONFIG_DEBUG_TIMEKEEPING is not set +CONFIG_DEBUG_PREEMPT=y + +# +# Lock Debugging (spinlocks, mutexes, etc...) 
+# +CONFIG_LOCK_DEBUGGING_SUPPORT=y +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +# end of Lock Debugging (spinlocks, mutexes, etc...) + +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set + +# +# Debug kernel data structures +# +# CONFIG_DEBUG_LIST is not set +# CONFIG_DEBUG_PLIST is not set +# CONFIG_DEBUG_SG is not set +# CONFIG_DEBUG_NOTIFIERS is not set +# CONFIG_BUG_ON_DATA_CORRUPTION is not set +# end of Debug kernel data structures + +# CONFIG_DEBUG_CREDENTIALS is not set + +# +# RCU Debugging +# +# CONFIG_RCU_PERF_TEST is not set +# CONFIG_RCU_TORTURE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# end of RCU Debugging + +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set +CONFIG_LATENCYTOP=y +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_FENTRY=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACER_MAX_TRACE=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y +CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_RING_BUFFER_ALLOW_SWAP=y +CONFIG_TRACING=y +CONFIG_GENERIC_TRACER=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +# CONFIG_BOOTTIME_TRACING is not set +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +CONFIG_SCHED_TRACER=y +CONFIG_HWLAT_TRACER=y +CONFIG_MMIOTRACE=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_TRACER_SNAPSHOT=y +# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KPROBE_EVENTS=y +# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set +CONFIG_UPROBE_EVENTS=y +CONFIG_BPF_EVENTS=y +CONFIG_DYNAMIC_EVENTS=y +CONFIG_PROBE_EVENTS=y +CONFIG_BPF_KPROBE_OVERRIDE=y +CONFIG_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_MAP=y +CONFIG_SYNTH_EVENTS=y +CONFIG_HIST_TRIGGERS=y +# CONFIG_TRACE_EVENT_INJECT is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_MMIOTRACE_TEST is not set +# CONFIG_PREEMPTIRQ_DELAY_TEST is not set +# CONFIG_SYNTH_EVENT_GEN_TEST is not set +# CONFIG_KPROBE_EVENT_GEN_TEST is not set +# CONFIG_HIST_TRIGGERS_DEBUG is not set +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KCSAN=y +CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y +CONFIG_STRICT_DEVMEM=y +CONFIG_IO_STRICT_DEVMEM=y + +# +# x86 Debugging +# +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +# 
CONFIG_X86_VERBOSE_BOOTUP is not set +CONFIG_EARLY_PRINTK=y +# CONFIG_EARLY_PRINTK_DBGP is not set +# CONFIG_EARLY_PRINTK_USB_XDBC is not set +# CONFIG_EFI_PGT_DUMP is not set +# CONFIG_DEBUG_TLBFLUSH is not set +# CONFIG_IOMMU_DEBUG is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEBUG_BOOT_PARAMS=y +# CONFIG_CPA_DEBUG is not set +# CONFIG_DEBUG_ENTRY is not set +# CONFIG_DEBUG_NMI_SELFTEST is not set +# CONFIG_X86_DEBUG_FPU is not set +# CONFIG_PUNIT_ATOM_DEBUG is not set +CONFIG_UNWINDER_ORC=y +# CONFIG_UNWINDER_FRAME_POINTER is not set +# CONFIG_UNWINDER_GUESS is not set +# end of x86 Debugging + +# +# Kernel Testing and Coverage +# +# CONFIG_KUNIT is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +CONFIG_FUNCTION_ERROR_INJECTION=y +# CONFIG_FAULT_INJECTION is not set +CONFIG_ARCH_HAS_KCOV=y +CONFIG_CC_HAS_SANCOV_TRACE_PC=y +# CONFIG_KCOV is not set +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_LKDTM=m +# CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_MIN_HEAP is not set +# CONFIG_TEST_SORT is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_RBTREE_TEST is not set +# CONFIG_REED_SOLOMON_TEST is not set +# CONFIG_INTERVAL_TREE_TEST is not set +# CONFIG_PERCPU_TEST is not set +# CONFIG_ATOMIC64_SELFTEST is not set +# CONFIG_ASYNC_RAID6_TEST is not set +# CONFIG_TEST_HEXDUMP is not set +# CONFIG_TEST_STRING_HELPERS is not set +# CONFIG_TEST_STRSCPY is not set +# CONFIG_TEST_KSTRTOX is not set +# CONFIG_TEST_PRINTF is not set +# CONFIG_TEST_BITMAP is not set +# CONFIG_TEST_BITFIELD is not set +# CONFIG_TEST_UUID is not set +# CONFIG_TEST_XARRAY is not set +# CONFIG_TEST_OVERFLOW is not set +# CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set +# CONFIG_TEST_IDA is not set +# CONFIG_TEST_PARMAN is not set +# CONFIG_TEST_LKM is not set +# CONFIG_TEST_BITOPS is not set +# CONFIG_TEST_VMALLOC is not set +# CONFIG_TEST_USER_COPY is not set +# CONFIG_TEST_BPF is not set +# CONFIG_TEST_BLACKHOLE_DEV is not set +# CONFIG_FIND_BIT_BENCHMARK is not set +# CONFIG_TEST_FIRMWARE is not set +# CONFIG_TEST_SYSCTL is not set +# CONFIG_TEST_UDELAY is not set +# CONFIG_TEST_STATIC_KEYS is not set +# CONFIG_TEST_KMOD is not set +# CONFIG_TEST_MEMCAT_P is not set +# CONFIG_TEST_OBJAGG is not set +# CONFIG_TEST_STACKINIT is not set +# CONFIG_TEST_MEMINIT is not set +# CONFIG_TEST_HMM is not set +# CONFIG_MEMTEST is not set +# CONFIG_HYPERV_TESTING is not set +# end of Kernel Testing and Coverage +# end of Kernel hacking diff --git a/linux-tkg/linux-tkg-config/5.9/90-cleanup.hook b/linux-tkg/linux-tkg-config/5.9/90-cleanup.hook new file mode 100644 index 0000000..99f5221 --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.9/90-cleanup.hook @@ -0,0 +1,14 @@ +[Trigger] +Type = File +Operation = Install +Operation = Upgrade +Operation = Remove +Target = usr/lib/modules/*/ +Target = !usr/lib/modules/*/?* + +[Action] +Description = Cleaning up... +When = PostTransaction +Exec = /usr/share/libalpm/scripts/cleanup +NeedsTargets + diff --git a/linux-tkg/linux-tkg-config/5.9/cleanup b/linux-tkg/linux-tkg-config/5.9/cleanup new file mode 100755 index 0000000..c00c08d --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.9/cleanup @@ -0,0 +1,10 @@ +#!/bin/bash + +for _f in /usr/lib/modules/*tkg*; do + if [[ ! 
-e ${_f}/vmlinuz ]]; then + rm -rf "$_f" + fi +done + +# vim:set ft=sh sw=2 et: + diff --git a/linux-tkg/linux-tkg-config/5.9/config.x86_64 b/linux-tkg/linux-tkg-config/5.9/config.x86_64 new file mode 100644 index 0000000..e4944f2 --- /dev/null +++ b/linux-tkg/linux-tkg-config/5.9/config.x86_64 @@ -0,0 +1,11049 @@ +# +# Automatically generated file; DO NOT EDIT. +# Linux/x86 5.9.0-rc4 Kernel Configuration +# +CONFIG_CC_VERSION_TEXT="gcc (TkG-mostlyportable) 10.2.1 20200730" +CONFIG_CC_IS_GCC=y +CONFIG_GCC_VERSION=100201 +CONFIG_LD_VERSION=235000000 +CONFIG_CLANG_VERSION=0 +CONFIG_CC_CAN_LINK=y +CONFIG_CC_CAN_LINK_STATIC=y +CONFIG_CC_HAS_ASM_GOTO=y +CONFIG_CC_HAS_ASM_INLINE=y +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_TABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_INIT_ENV_ARG_LIMIT=32 +# CONFIG_COMPILE_TEST is not set +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_BUILD_SALT="" +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_XZ=y +CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y +CONFIG_HAVE_KERNEL_ZSTD=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set +CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set +# CONFIG_KERNEL_ZSTD is not set +CONFIG_DEFAULT_INIT="" +CONFIG_DEFAULT_HOSTNAME="archlinux" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +# CONFIG_WATCH_QUEUE is not set +CONFIG_CROSS_MEMORY_ATTACH=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_HAVE_ARCH_AUDITSYSCALL=y +CONFIG_AUDITSYSCALL=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_GENERIC_IRQ_MIGRATION=y +CONFIG_HARDIRQS_SW_RESEND=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_SIM=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +CONFIG_IRQ_MSI_IOMMU=y +CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y +CONFIG_GENERIC_IRQ_RESERVATION_MODE=y +CONFIG_IRQ_FORCED_THREADING=y +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +# end of IRQ subsystem + +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_ARCH_CLOCKSOURCE_INIT=y +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y +CONFIG_GENERIC_CMOS_UPDATE=y +CONFIG_HAVE_POSIX_CPU_TIMERS_TASK_WORK=y +CONFIG_POSIX_CPU_TIMERS_TASK_WORK=y + +# +# Timers subsystem +# +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ_COMMON=y +# CONFIG_HZ_PERIODIC is not set +CONFIG_NO_HZ_IDLE=y +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +# end of Timers subsystem + +# CONFIG_PREEMPT_NONE is not set +# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_PREEMPT=y +CONFIG_PREEMPT_COUNT=y +CONFIG_PREEMPTION=y + +# +# CPU/Task time and stats accounting +# +CONFIG_TICK_CPU_ACCOUNTING=y +# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_SCHED_AVG_IRQ=y +# CONFIG_SCHED_THERMAL_PRESSURE is not set +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_PSI=y +# CONFIG_PSI_DEFAULT_DISABLED is not set +# end of CPU/Task time and stats accounting + +CONFIG_CPU_ISOLATION=y + +# +# RCU Subsystem +# +CONFIG_TREE_RCU=y +CONFIG_PREEMPT_RCU=y 
+CONFIG_RCU_EXPERT=y +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_TASKS_RCU_GENERIC=y +CONFIG_TASKS_RCU=y +CONFIG_TASKS_RUDE_RCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +CONFIG_RCU_FANOUT=64 +CONFIG_RCU_FANOUT_LEAF=16 +CONFIG_RCU_FAST_NO_HZ=y +CONFIG_RCU_BOOST=y +CONFIG_RCU_BOOST_DELAY=500 +CONFIG_RCU_NOCB_CPU=y +# CONFIG_TASKS_TRACE_RCU_READ_MB is not set +# end of RCU Subsystem + +CONFIG_BUILD_BIN2C=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +# CONFIG_IKHEADERS is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y + +# +# Scheduler features +# +CONFIG_UCLAMP_TASK=y +CONFIG_UCLAMP_BUCKETS_COUNT=5 +# end of Scheduler features + +CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y +CONFIG_CC_HAS_INT128=y +CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_NUMA_BALANCING=y +CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y +CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_CFS_BANDWIDTH=y +# CONFIG_RT_GROUP_SCHED is not set +CONFIG_UCLAMP_TASK_GROUP=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_TIME_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_USER_NS_UNPRIVILEGED=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +CONFIG_CHECKPOINT_RESTORE=y +CONFIG_SCHED_AUTOGROUP=y +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_RD_XZ=y +CONFIG_RD_LZO=y +CONFIG_RD_LZ4=y +CONFIG_RD_ZSTD=y +CONFIG_BOOT_CONFIG=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +CONFIG_SYSCTL=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_HAVE_PCSPKR_PLATFORM=y +CONFIG_BPF=y +CONFIG_EXPERT=y +# CONFIG_UID16 is not set +CONFIG_MULTIUSER=y +CONFIG_SGETMASK_SYSCALL=y +# CONFIG_SYSFS_SYSCALL is not set +CONFIG_FHANDLE=y +CONFIG_POSIX_TIMERS=y +CONFIG_PRINTK=y +CONFIG_PRINTK_NMI=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_IO_URING=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_MEMBARRIER=y +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_BPF_LSM=y +CONFIG_BPF_SYSCALL=y +CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_BPF_JIT_DEFAULT_ON=y +# CONFIG_USERFAULTFD is not set +CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_RSEQ=y +# CONFIG_DEBUG_RSEQ is not set +# CONFIG_EMBEDDED is not set +CONFIG_HAVE_PERF_EVENTS=y +# CONFIG_PC104 is not set + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +# end of Kernel Performance Events And Counters + +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +CONFIG_SLAB_MERGE_DEFAULT=y +CONFIG_SLAB_FREELIST_RANDOM=y 
+CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SHUFFLE_PAGE_ALLOCATOR=y +CONFIG_SLUB_CPU_PARTIAL=y +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +# end of General setup + +CONFIG_64BIT=y +CONFIG_X86_64=y +CONFIG_X86=y +CONFIG_INSTRUCTION_DECODER=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_MMU=y +CONFIG_ARCH_MMAP_RND_BITS_MIN=28 +CONFIG_ARCH_MMAP_RND_BITS_MAX=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_ARCH_HAS_FILTER_PGPROT=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_WANT_GENERAL_HUGETLB=y +CONFIG_ZONE_DMA32=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y +CONFIG_X86_64_SMP=y +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_DYNAMIC_PHYSICAL_MASK=y +CONFIG_PGTABLE_LEVELS=5 +CONFIG_CC_HAS_SANE_STACKPROTECTOR=y + +# +# Processor type and features +# +CONFIG_ZONE_DMA=y +CONFIG_SMP=y +CONFIG_X86_FEATURE_NAMES=y +CONFIG_X86_X2APIC=y +CONFIG_X86_MPPARSE=y +# CONFIG_GOLDFISH is not set +CONFIG_RETPOLINE=y +CONFIG_X86_CPU_RESCTRL=y +# CONFIG_X86_EXTENDED_PLATFORM is not set +CONFIG_X86_INTEL_LPSS=y +CONFIG_X86_AMD_PLATFORM_DEVICE=y +CONFIG_IOSF_MBI=y +# CONFIG_IOSF_MBI_DEBUG is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +CONFIG_PARAVIRT_XXL=y +# CONFIG_PARAVIRT_DEBUG is not set +CONFIG_PARAVIRT_SPINLOCKS=y +CONFIG_X86_HV_CALLBACK_VECTOR=y +CONFIG_XEN=y +CONFIG_XEN_PV=y +CONFIG_XEN_PV_SMP=y +CONFIG_XEN_DOM0=y +CONFIG_XEN_PVHVM=y +CONFIG_XEN_PVHVM_SMP=y +CONFIG_XEN_512GB=y +CONFIG_XEN_SAVE_RESTORE=y +# CONFIG_XEN_DEBUG_FS is not set +CONFIG_XEN_PVH=y +CONFIG_KVM_GUEST=y +CONFIG_ARCH_CPUIDLE_HALTPOLL=y +CONFIG_PVH=y +CONFIG_PARAVIRT_TIME_ACCOUNTING=y +CONFIG_PARAVIRT_CLOCK=y +CONFIG_JAILHOUSE_GUEST=y +CONFIG_ACRN_GUEST=y +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +# CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_IA32_FEAT_CTL=y +CONFIG_X86_VMX_FEATURE_NAMES=y +CONFIG_PROCESSOR_SELECT=y +CONFIG_CPU_SUP_INTEL=y +CONFIG_CPU_SUP_AMD=y +CONFIG_CPU_SUP_HYGON=y +CONFIG_CPU_SUP_CENTAUR=y +CONFIG_CPU_SUP_ZHAOXIN=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y +CONFIG_GART_IOMMU=y +# CONFIG_MAXSMP is not set +CONFIG_NR_CPUS_RANGE_BEGIN=2 +CONFIG_NR_CPUS_RANGE_END=512 +CONFIG_NR_CPUS_DEFAULT=64 +CONFIG_NR_CPUS=320 +CONFIG_SCHED_SMT=y +CONFIG_SCHED_MC=y +CONFIG_SCHED_MC_PRIO=y +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y +CONFIG_X86_MCE_INJECT=m +CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# +CONFIG_PERF_EVENTS_INTEL_UNCORE=m +CONFIG_PERF_EVENTS_INTEL_RAPL=m +CONFIG_PERF_EVENTS_INTEL_CSTATE=m +CONFIG_PERF_EVENTS_AMD_POWER=m +# end of Performance 
monitoring + +CONFIG_X86_16BIT=y +CONFIG_X86_ESPFIX64=y +CONFIG_X86_VSYSCALL_EMULATION=y +CONFIG_X86_IOPL_IOPERM=y +CONFIG_I8K=m +CONFIG_MICROCODE=y +CONFIG_MICROCODE_INTEL=y +CONFIG_MICROCODE_AMD=y +CONFIG_MICROCODE_OLD_INTERFACE=y +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +CONFIG_X86_5LEVEL=y +CONFIG_X86_DIRECT_GBPAGES=y +# CONFIG_X86_CPA_STATISTICS is not set +CONFIG_AMD_MEM_ENCRYPT=y +# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set +CONFIG_NUMA=y +CONFIG_AMD_NUMA=y +CONFIG_X86_64_ACPI_NUMA=y +# CONFIG_NUMA_EMU is not set +CONFIG_NODES_SHIFT=5 +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_MEMORY_PROBE=y +CONFIG_ARCH_PROC_KCORE_TEXT=y +CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_X86_PMEM_LEGACY_DEVICE=y +CONFIG_X86_PMEM_LEGACY=m +CONFIG_X86_CHECK_BIOS_CORRUPTION=y +CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y +CONFIG_X86_RESERVE_LOW=64 +CONFIG_MTRR=y +CONFIG_MTRR_SANITIZER=y +CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=1 +CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 +CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y +CONFIG_ARCH_RANDOM=y +CONFIG_X86_SMAP=y +CONFIG_X86_UMIP=y +CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +# CONFIG_X86_INTEL_TSX_MODE_OFF is not set +# CONFIG_X86_INTEL_TSX_MODE_ON is not set +CONFIG_X86_INTEL_TSX_MODE_AUTO=y +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_SECCOMP=y +# CONFIG_HZ_100 is not set +# CONFIG_HZ_250 is not set +CONFIG_HZ_300=y +# CONFIG_HZ_1000 is not set +CONFIG_HZ=300 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +CONFIG_KEXEC_FILE=y +CONFIG_ARCH_HAS_KEXEC_PURGATORY=y +# CONFIG_KEXEC_SIG is not set +CONFIG_CRASH_DUMP=y +CONFIG_KEXEC_JUMP=y +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_DYNAMIC_MEMORY_LAYOUT=y +CONFIG_RANDOMIZE_MEMORY=y +CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x1 +CONFIG_HOTPLUG_CPU=y +# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set +# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_COMPAT_VDSO is not set +# CONFIG_LEGACY_VSYSCALL_EMULATE is not set +CONFIG_LEGACY_VSYSCALL_XONLY=y +# CONFIG_LEGACY_VSYSCALL_NONE is not set +# CONFIG_CMDLINE_BOOL is not set +CONFIG_MODIFY_LDT_SYSCALL=y +CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set +# end of Processor type and features + +CONFIG_ARCH_HAS_ADD_PAGES=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_USE_PERCPU_NUMA_NODE_ID=y +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y +CONFIG_ARCH_ENABLE_THP_MIGRATION=y + +# +# Power management and ACPI options +# +CONFIG_ARCH_HIBERNATION_HEADER=y +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +# CONFIG_SUSPEND_SKIP_SYNC is not set +CONFIG_HIBERNATE_CALLBACKS=y +CONFIG_HIBERNATION=y +CONFIG_HIBERNATION_SNAPSHOT_DEV=y +CONFIG_PM_STD_PARTITION="" +CONFIG_PM_SLEEP=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=100 +CONFIG_PM_WAKELOCKS_GC=y +CONFIG_PM=y +CONFIG_PM_DEBUG=y +CONFIG_PM_ADVANCED_DEBUG=y +# CONFIG_PM_TEST_SUSPEND is not set +CONFIG_PM_SLEEP_DEBUG=y +# CONFIG_DPM_WATCHDOG is not set +CONFIG_PM_TRACE=y +CONFIG_PM_TRACE_RTC=y +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y +CONFIG_PM_GENERIC_DOMAINS_SLEEP=y +CONFIG_PM_GENERIC_DOMAINS_OF=y +CONFIG_ENERGY_MODEL=y +CONFIG_ARCH_SUPPORTS_ACPI=y +CONFIG_ACPI=y +CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y +CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y +CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y 
+# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SPCR_TABLE=y +CONFIG_ACPI_LPIT=y +CONFIG_ACPI_SLEEP=y +CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y +CONFIG_ACPI_EC_DEBUGFS=y +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=y +CONFIG_ACPI_VIDEO=y +CONFIG_ACPI_FAN=y +CONFIG_ACPI_TAD=m +CONFIG_ACPI_DOCK=y +CONFIG_ACPI_CPU_FREQ_PSS=y +CONFIG_ACPI_PROCESSOR_CSTATE=y +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_CPPC_LIB=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_IPMI=m +CONFIG_ACPI_HOTPLUG_CPU=y +CONFIG_ACPI_PROCESSOR_AGGREGATOR=y +CONFIG_ACPI_THERMAL=y +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_DEBUG=y +CONFIG_ACPI_PCI_SLOT=y +CONFIG_ACPI_CONTAINER=y +CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_HOTPLUG_IOAPIC=y +CONFIG_ACPI_SBS=m +CONFIG_ACPI_HED=y +CONFIG_ACPI_CUSTOM_METHOD=m +CONFIG_ACPI_BGRT=y +# CONFIG_ACPI_REDUCED_HARDWARE_ONLY is not set +CONFIG_ACPI_NFIT=m +# CONFIG_NFIT_SECURITY_DEBUG is not set +CONFIG_ACPI_NUMA=y +CONFIG_ACPI_HMAT=y +CONFIG_HAVE_ACPI_APEI=y +CONFIG_HAVE_ACPI_APEI_NMI=y +CONFIG_ACPI_APEI=y +CONFIG_ACPI_APEI_GHES=y +CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_ACPI_APEI_MEMORY_FAILURE=y +CONFIG_ACPI_APEI_EINJ=m +CONFIG_ACPI_APEI_ERST_DEBUG=m +CONFIG_DPTF_POWER=m +CONFIG_ACPI_WATCHDOG=y +CONFIG_ACPI_EXTLOG=m +CONFIG_ACPI_ADXL=y +CONFIG_PMIC_OPREGION=y +CONFIG_BYTCRC_PMIC_OPREGION=y +CONFIG_CHTCRC_PMIC_OPREGION=y +CONFIG_XPOWER_PMIC_OPREGION=y +CONFIG_CHT_WC_PMIC_OPREGION=y +CONFIG_CHT_DC_TI_PMIC_OPREGION=y +CONFIG_ACPI_CONFIGFS=m +CONFIG_TPS68470_PMIC_OPREGION=y +CONFIG_X86_PM_TIMER=y +CONFIG_SFI=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=m +CONFIG_CPU_FREQ_GOV_USERSPACE=m +CONFIG_CPU_FREQ_GOV_ONDEMAND=m +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=m +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + +# +# CPU frequency scaling drivers +# +CONFIG_CPUFREQ_DT=m +CONFIG_CPUFREQ_DT_PLATDEV=y +CONFIG_X86_INTEL_PSTATE=y +CONFIG_X86_PCC_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ_CPB=y +CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_AMD_FREQ_SENSITIVITY=m +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +CONFIG_X86_P4_CLOCKMOD=m + +# +# shared options +# +CONFIG_X86_SPEEDSTEP_LIB=m +# end of CPU Frequency scaling + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +CONFIG_CPU_IDLE_GOV_TEO=y +CONFIG_CPU_IDLE_GOV_HALTPOLL=y +CONFIG_HALTPOLL_CPUIDLE=m +# end of CPU Idle + +CONFIG_INTEL_IDLE=y +# end of Power management and ACPI options + +# +# Bus options (PCI etc.) +# +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_XEN=y +CONFIG_MMCONF_FAM10H=y +# CONFIG_PCI_CNB20LE_QUIRK is not set +# CONFIG_ISA_BUS is not set +CONFIG_ISA_DMA_API=y +CONFIG_AMD_NB=y +# CONFIG_X86_SYSFB is not set +# end of Bus options (PCI etc.) 
+ +# +# Binary Emulations +# +CONFIG_IA32_EMULATION=y +# CONFIG_X86_X32 is not set +CONFIG_COMPAT_32=y +CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y +CONFIG_SYSVIPC_COMPAT=y +# end of Binary Emulations + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DMIID=y +CONFIG_DMI_SYSFS=m +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=m +CONFIG_FW_CFG_SYSFS=m +# CONFIG_FW_CFG_SYSFS_CMDLINE is not set +CONFIG_GOOGLE_FIRMWARE=y +# CONFIG_GOOGLE_SMI is not set +CONFIG_GOOGLE_COREBOOT_TABLE=m +CONFIG_GOOGLE_MEMCONSOLE=m +# CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY is not set +CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m +CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m +CONFIG_GOOGLE_VPD=m + +# +# EFI (Extensible Firmware Interface) Support +# +# CONFIG_EFI_VARS is not set +CONFIG_EFI_ESRT=y +CONFIG_EFI_RUNTIME_MAP=y +# CONFIG_EFI_FAKE_MEMMAP is not set +CONFIG_EFI_SOFT_RESERVE=y +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_GENERIC_STUB_INITRD_CMDLINE_LOADER=y +CONFIG_EFI_CAPSULE_LOADER=m +# CONFIG_EFI_TEST is not set +CONFIG_APPLE_PROPERTIES=y +# CONFIG_RESET_ATTACK_MITIGATION is not set +CONFIG_EFI_RCI2_TABLE=y +# CONFIG_EFI_DISABLE_PCI_DMA is not set +# end of EFI (Extensible Firmware Interface) Support + +CONFIG_EFI_EMBEDDED_FIRMWARE=y +CONFIG_UEFI_CPER=y +CONFIG_UEFI_CPER_X86=y +CONFIG_EFI_DEV_PATH_PARSER=y +CONFIG_EFI_EARLYCON=y + +# +# Tegra firmware driver +# +# end of Tegra firmware driver +# end of Firmware Drivers + +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_IRQFD=y +CONFIG_HAVE_KVM_IRQ_ROUTING=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_MMIO=y +CONFIG_KVM_ASYNC_PF=y +CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y +CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y +CONFIG_KVM_COMPAT=y +CONFIG_HAVE_KVM_IRQ_BYPASS=y +CONFIG_HAVE_KVM_NO_POLL=y +CONFIG_KVM_XFER_TO_GUEST_WORK=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_WERROR=y +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +CONFIG_KVM_AMD_SEV=y +CONFIG_KVM_MMU_AUDIT=y +CONFIG_AS_AVX512=y +CONFIG_AS_SHA1_NI=y +CONFIG_AS_SHA256_NI=y +CONFIG_AS_TPAUSE=y + +# +# General architecture-dependent options +# +CONFIG_CRASH_CORE=y +CONFIG_KEXEC_CORE=y +CONFIG_HOTPLUG_SMT=y +CONFIG_GENERIC_ENTRY=y +CONFIG_OPROFILE=m +# CONFIG_OPROFILE_EVENT_MULTIPLEX is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_OPROFILE_NMI_TIMER=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_OPTPROBES=y +CONFIG_KPROBES_ON_FTRACE=y +CONFIG_UPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y +CONFIG_KRETPROBES=y +CONFIG_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_OPTPROBES=y +CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y +CONFIG_HAVE_NMI=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_ARCH_HAS_SET_DIRECT_MAP=y +CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y +CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y +CONFIG_HAVE_ASM_MODVERSIONS=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_RSEQ=y +CONFIG_HAVE_FUNCTION_ARG_ACCESS_API=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_PERF_EVENTS_NMI=y +CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y 
+CONFIG_HAVE_ARCH_JUMP_LABEL_RELATIVE=y +CONFIG_MMU_GATHER_TABLE_FREE=y +CONFIG_MMU_GATHER_RCU_TABLE_FREE=y +CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y +CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP_FILTER=y +CONFIG_HAVE_ARCH_STACKLEAK=y +CONFIG_HAVE_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y +CONFIG_HAVE_CONTEXT_TRACKING=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_MOVE_PMD=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_HAVE_ARCH_SOFT_DIRTY=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_HAVE_EXIT_THREAD=y +CONFIG_ARCH_MMAP_RND_BITS=28 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=8 +CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_STACK_VALIDATION=y +CONFIG_HAVE_RELIABLE_STACKTRACE=y +CONFIG_ISA_BUS_API=y +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +CONFIG_COMPAT_32BIT_TIME=y +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y +CONFIG_ARCH_USE_MEMREMAP_PROT=y +CONFIG_LOCK_EVENT_COUNTS=y +CONFIG_ARCH_HAS_MEM_ENCRYPT=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +# end of GCOV-based kernel profiling + +CONFIG_HAVE_GCC_PLUGINS=y +CONFIG_GCC_PLUGINS=y +# CONFIG_GCC_PLUGIN_CYC_COMPLEXITY is not set +# CONFIG_GCC_PLUGIN_LATENT_ENTROPY is not set +# CONFIG_GCC_PLUGIN_RANDSTRUCT is not set +# end of General architecture-dependent options + +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULE_SIG_FORMAT=y +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +# CONFIG_MODVERSIONS is not set +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +# CONFIG_MODULE_SIG_FORCE is not set +CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set +# CONFIG_MODULE_SIG_SHA224 is not set +# CONFIG_MODULE_SIG_SHA256 is not set +# CONFIG_MODULE_SIG_SHA384 is not set +CONFIG_MODULE_SIG_SHA512=y +CONFIG_MODULE_SIG_HASH="sha512" +CONFIG_MODULE_COMPRESS=y +# CONFIG_MODULE_COMPRESS_GZIP is not set +CONFIG_MODULE_COMPRESS_XZ=y +CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS=y +CONFIG_UNUSED_SYMBOLS=y +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLK_RQ_ALLOC_TIME=y +CONFIG_BLK_SCSI_REQUEST=y +CONFIG_BLK_CGROUP_RWSTAT=y +CONFIG_BLK_DEV_BSG=y +CONFIG_BLK_DEV_BSGLIB=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_INTEGRITY_T10=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_DEV_THROTTLING=y +CONFIG_BLK_DEV_THROTTLING_LOW=y +# CONFIG_BLK_CMDLINE_PARSER is not set +CONFIG_BLK_WBT=y +CONFIG_BLK_CGROUP_IOLATENCY=y +CONFIG_BLK_CGROUP_IOCOST=y +CONFIG_BLK_WBT_MQ=y +CONFIG_BLK_DEBUG_FS=y +CONFIG_BLK_DEBUG_FS_ZONED=y +CONFIG_BLK_SED_OPAL=y +CONFIG_BLK_INLINE_ENCRYPTION=y +CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +# CONFIG_ACORN_PARTITION is not set +CONFIG_AIX_PARTITION=y +# CONFIG_OSF_PARTITION is not set +# CONFIG_AMIGA_PARTITION is not set +# CONFIG_ATARI_PARTITION is 
not set +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +# CONFIG_UNIXWARE_DISKLABEL is not set +CONFIG_LDM_PARTITION=y +# CONFIG_LDM_DEBUG is not set +# CONFIG_SGI_PARTITION is not set +# CONFIG_ULTRIX_PARTITION is not set +# CONFIG_SUN_PARTITION is not set +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +# CONFIG_SYSV68_PARTITION is not set +# CONFIG_CMDLINE_PARTITION is not set +# end of Partition Types + +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_MQ_VIRTIO=y +CONFIG_BLK_MQ_RDMA=y +CONFIG_BLK_PM=y + +# +# IO Schedulers +# +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +# CONFIG_BFQ_CGROUP_DEBUG is not set +# end of IO Schedulers + +CONFIG_PREEMPT_NOTIFIERS=y +CONFIG_PADATA=y +CONFIG_ASN1=y +CONFIG_UNINLINE_SPIN_UNLOCK=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE=y +CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y +CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_FREEZER=y + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +CONFIG_BINFMT_MISC=y +CONFIG_COREDUMP=y +# end of Executable file formats + +# +# Memory Management options +# +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_HAVE_FAST_GUP=y +CONFIG_NUMA_KEEP_MEMINFO=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_HAVE_BOOTMEM_INFO_NODE=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTPLUG_SPARSE=y +CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE=y +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MEMORY_BALLOON=y +CONFIG_BALLOON_COMPACTION=y +CONFIG_COMPACTION=y +CONFIG_PAGE_REPORTING=y +CONFIG_MIGRATION=y +CONFIG_CONTIG_ALLOC=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y +CONFIG_HWPOISON_INJECT=m +CONFIG_TRANSPARENT_HUGEPAGE=y +# CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS is not set +CONFIG_TRANSPARENT_HUGEPAGE_MADVISE=y +CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_THP_SWAP=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +# CONFIG_CMA is not set +# CONFIG_MEM_SOFT_DIRTY is not set +CONFIG_ZSWAP=y +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_842 is not set +CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4HC is not set +# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_ZSTD is not set +CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4" +# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZBUD is not set +CONFIG_ZSWAP_ZPOOL_DEFAULT_Z3FOLD=y +# CONFIG_ZSWAP_ZPOOL_DEFAULT_ZSMALLOC is not set +CONFIG_ZSWAP_ZPOOL_DEFAULT="z3fold" +CONFIG_ZSWAP_DEFAULT_ON=y +CONFIG_ZPOOL=y +CONFIG_ZBUD=y +CONFIG_Z3FOLD=y +CONFIG_ZSMALLOC=y +# CONFIG_ZSMALLOC_PGTABLE_MAPPING is not set +# CONFIG_ZSMALLOC_STAT is not set +CONFIG_GENERIC_EARLY_IOREMAP=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set +CONFIG_ARCH_HAS_PTE_DEVMAP=y +CONFIG_ZONE_DEVICE=y 
+CONFIG_DEV_PAGEMAP_OPS=y +CONFIG_HMM_MIRROR=y +CONFIG_DEVICE_PRIVATE=y +CONFIG_FRAME_VECTOR=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y +CONFIG_ARCH_HAS_PKEYS=y +# CONFIG_PERCPU_STATS is not set +# CONFIG_GUP_BENCHMARK is not set +CONFIG_READ_ONLY_THP_FOR_FS=y +CONFIG_ARCH_HAS_PTE_SPECIAL=y +CONFIG_MAPPING_DIRTY_HELPERS=y +# end of Memory Management options + +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_INGRESS=y +CONFIG_NET_EGRESS=y +CONFIG_NET_REDIRECT=y +CONFIG_SKB_EXTENSIONS=y + +# +# Networking options +# +CONFIG_PACKET=y +CONFIG_PACKET_DIAG=y +CONFIG_UNIX=y +CONFIG_UNIX_SCM=y +CONFIG_UNIX_DIAG=y +CONFIG_TLS=m +CONFIG_TLS_DEVICE=y +# CONFIG_TLS_TOE is not set +CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y +CONFIG_XFRM_ALGO=m +CONFIG_XFRM_USER=m +CONFIG_XFRM_INTERFACE=m +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_XFRM_AH=m +CONFIG_XFRM_ESP=m +CONFIG_XFRM_IPCOMP=m +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_XFRM_ESPINTCP=y +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_XDP_SOCKETS=y +CONFIG_XDP_SOCKETS_DIAG=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +# CONFIG_IP_FIB_TRIE_STATS is not set +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_CLASSID=y +# CONFIG_IP_PNP is not set +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IP_TUNNEL=m +CONFIG_NET_IPGRE=m +# CONFIG_NET_IPGRE_BROADCAST is not set +CONFIG_IP_MROUTE_COMMON=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESP_OFFLOAD=m +CONFIG_INET_ESPINTCP=y +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_DIAG=m +CONFIG_INET_TCP_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_INET_RAW_DIAG=m +CONFIG_INET_DIAG_DESTROY=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=y +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_NV=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m +CONFIG_TCP_CONG_CDG=m +CONFIG_TCP_CONG_BBR=m +CONFIG_DEFAULT_CUBIC=y +# CONFIG_DEFAULT_RENO is not set +CONFIG_DEFAULT_TCP_CONG="cubic" +CONFIG_TCP_MD5SIG=y +CONFIG_IPV6=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESP_OFFLOAD=m +CONFIG_INET6_ESPINTCP=y +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_ILA=m +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y +CONFIG_IPV6_NDISC_NODETYPE=y +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IPV6_SEG6_HMAC=y +CONFIG_IPV6_SEG6_BPF=y +CONFIG_IPV6_RPL_LWTUNNEL=y +CONFIG_NETLABEL=y +CONFIG_MPTCP=y +CONFIG_INET_MPTCP_DIAG=m +CONFIG_MPTCP_IPV6=y +CONFIG_NETWORK_SECMARK=y +CONFIG_NET_PTP_CLASSIFY=y +CONFIG_NETWORK_PHY_TIMESTAMPING=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_INGRESS=y 
+CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_FAMILY_BRIDGE=y +CONFIG_NETFILTER_FAMILY_ARP=y +CONFIG_NETFILTER_NETLINK_ACCT=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NETFILTER_NETLINK_OSF=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_LOG_COMMON=m +CONFIG_NF_LOG_NETDEV=m +CONFIG_NETFILTER_CONNCOUNT=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_GRE=y +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y +CONFIG_NF_NAT=m +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NF_NAT_MASQUERADE=y +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_CONNLIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_NAT=m +CONFIG_NFT_TUNNEL=m +CONFIG_NFT_OBJREF=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_XFRM=m +CONFIG_NFT_SOCKET=m +CONFIG_NFT_OSF=m +CONFIG_NFT_TPROXY=m +CONFIG_NFT_SYNPROXY=m +CONFIG_NF_DUP_NETDEV=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES=m + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +CONFIG_NETFILTER_XT_CONNMARK=m +CONFIG_NETFILTER_XT_SET=m + +# +# Xtables targets +# +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LED=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_RATEEST=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_MASQUERADE=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m 
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ECN=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +# end of Core Netfilter Configuration + +CONFIG_IP_SET=m +CONFIG_IP_SET_MAX=256 +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPMARK=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_IPMAC=m +CONFIG_IP_SET_HASH_MAC=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=15 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_AH_ESP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_FO=m +CONFIG_IP_VS_OVF=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_MH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS MH scheduler +# +CONFIG_IP_VS_MH_TAB_INDEX=12 + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PE_SIP=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +CONFIG_NF_SOCKET_IPV4=m +CONFIG_NF_TPROXY_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_REJECT_IPV4=m +CONFIG_NFT_DUP_IPV4=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_FLOW_TABLE_IPV4=m +CONFIG_NF_DUP_IPV4=m +CONFIG_NF_LOG_ARP=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_REJECT_IPV4=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_SYNPROXY=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m 
+CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +CONFIG_IP_NF_SECURITY=m +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m +# end of IP: Netfilter Configuration + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_SOCKET_IPV6=m +CONFIG_NF_TPROXY_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_REJECT_IPV6=m +CONFIG_NFT_DUP_IPV6=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NF_FLOW_TABLE_IPV6=m +CONFIG_NF_DUP_IPV6=m +CONFIG_NF_REJECT_IPV6=m +CONFIG_NF_LOG_IPV6=m +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_SRH=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_TARGET_SYNPROXY=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +CONFIG_IP6_NF_SECURITY=m +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_IP6_NF_TARGET_NPT=m +# end of IPv6: Netfilter Configuration + +CONFIG_NF_DEFRAG_IPV6=m +CONFIG_NF_TABLES_BRIDGE=m +CONFIG_NFT_BRIDGE_META=m +CONFIG_NFT_BRIDGE_REJECT=m +CONFIG_NF_LOG_BRIDGE=m +CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +# CONFIG_BPFILTER is not set +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m + +# +# DCCP CCIDs Configuration +# +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=y +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_TFRC_LIB=y +# end of DCCP CCIDs Configuration + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +# end of DCCP Kernel Hacking + +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_OBJCNT is not set +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5 is not set +CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1=y +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set +CONFIG_SCTP_COOKIE_HMAC_MD5=y +CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +# CONFIG_RDS_DEBUG is not set +CONFIG_TIPC=m +CONFIG_TIPC_MEDIA_IB=y +CONFIG_TIPC_MEDIA_UDP=y +CONFIG_TIPC_CRYPTO=y +CONFIG_TIPC_DIAG=m +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_L2TP=m +# CONFIG_L2TP_DEBUGFS is not set +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_STP=m +CONFIG_GARP=m +CONFIG_MRP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_BRIDGE_MRP=y +CONFIG_HAVE_NET_DSA=y +CONFIG_NET_DSA=m +CONFIG_NET_DSA_TAG_8021Q=m +CONFIG_NET_DSA_TAG_AR9331=m +CONFIG_NET_DSA_TAG_BRCM_COMMON=m +CONFIG_NET_DSA_TAG_BRCM=m +CONFIG_NET_DSA_TAG_BRCM_PREPEND=m +CONFIG_NET_DSA_TAG_GSWIP=m +CONFIG_NET_DSA_TAG_DSA=m +CONFIG_NET_DSA_TAG_EDSA=m +CONFIG_NET_DSA_TAG_MTK=m +CONFIG_NET_DSA_TAG_KSZ=m +CONFIG_NET_DSA_TAG_RTL4_A=m +CONFIG_NET_DSA_TAG_OCELOT=m +CONFIG_NET_DSA_TAG_QCA=m 
+CONFIG_NET_DSA_TAG_LAN9303=m +CONFIG_NET_DSA_TAG_SJA1105=m +CONFIG_NET_DSA_TAG_TRAILER=m +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_VLAN_8021Q_MVRP=y +# CONFIG_DECNET is not set +CONFIG_LLC=m +CONFIG_LLC2=m +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=m +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +# CONFIG_X25 is not set +# CONFIG_LAPB is not set +CONFIG_PHONET=m +CONFIG_6LOWPAN=m +# CONFIG_6LOWPAN_DEBUGFS is not set +CONFIG_6LOWPAN_NHC=m +CONFIG_6LOWPAN_NHC_DEST=m +CONFIG_6LOWPAN_NHC_FRAGMENT=m +CONFIG_6LOWPAN_NHC_HOP=m +CONFIG_6LOWPAN_NHC_IPV6=m +CONFIG_6LOWPAN_NHC_MOBILITY=m +CONFIG_6LOWPAN_NHC_ROUTING=m +CONFIG_6LOWPAN_NHC_UDP=m +CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m +CONFIG_6LOWPAN_GHC_UDP=m +CONFIG_6LOWPAN_GHC_ICMPV6=m +CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m +CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m +CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y +CONFIG_IEEE802154_SOCKET=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_CBS=m +CONFIG_NET_SCH_ETF=m +CONFIG_NET_SCH_TAPRIO=m +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +CONFIG_NET_SCH_SKBPRIO=m +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=y +CONFIG_NET_SCH_CAKE=m +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_HHF=m +CONFIG_NET_SCH_PIE=m +CONFIG_NET_SCH_FQ_PIE=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +CONFIG_NET_SCH_ETS=m +CONFIG_NET_SCH_DEFAULT=y +# CONFIG_DEFAULT_FQ is not set +# CONFIG_DEFAULT_CODEL is not set +# CONFIG_DEFAULT_FQ_CODEL is not set +# CONFIG_DEFAULT_FQ_PIE is not set +# CONFIG_DEFAULT_SFQ is not set +# CONFIG_DEFAULT_PFIFO_FAST is not set +CONFIG_DEFAULT_NET_SCH="fq_codel" + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=m +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_CANID=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_IPT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +CONFIG_NET_ACT_SIMP=m +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_MPLS=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_CTINFO=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_ACT_CT=m +CONFIG_NET_ACT_GATE=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +CONFIG_NET_TC_SKB_EXT=y +CONFIG_NET_SCH_FIFO=y +CONFIG_DCB=y +CONFIG_DNS_RESOLVER=m +CONFIG_BATMAN_ADV=m +CONFIG_BATMAN_ADV_BATMAN_V=y +CONFIG_BATMAN_ADV_BLA=y +CONFIG_BATMAN_ADV_DAT=y +CONFIG_BATMAN_ADV_NC=y +CONFIG_BATMAN_ADV_MCAST=y +CONFIG_BATMAN_ADV_DEBUGFS=y +# 
CONFIG_BATMAN_ADV_DEBUG is not set +CONFIG_BATMAN_ADV_SYSFS=y +# CONFIG_BATMAN_ADV_TRACING is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m +CONFIG_VSOCKETS=m +CONFIG_VSOCKETS_DIAG=m +CONFIG_VSOCKETS_LOOPBACK=m +CONFIG_VMWARE_VMCI_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS_COMMON=m +CONFIG_HYPERV_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m +CONFIG_HSR=m +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_L3_MASTER_DEV=y +CONFIG_QRTR=m +CONFIG_QRTR_SMD=m +CONFIG_QRTR_TUN=m +CONFIG_QRTR_MHI=m +CONFIG_NET_NCSI=y +CONFIG_NCSI_OEM_CMD_GET_MAC=y +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_XPS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_STREAM_PARSER=y +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +CONFIG_NET_PKTGEN=m +CONFIG_NET_DROP_MONITOR=y +# end of Network testing +# end of Networking options + +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_MKISS=m +CONFIG_6PACK=m +CONFIG_BPQETHER=m +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m +CONFIG_BAYCOM_PAR=m +CONFIG_YAM=m +# end of AX.25 network device drivers + +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_GW=m +CONFIG_CAN_J1939=m + +# +# CAN Device Drivers +# +CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_CALC_BITTIMING=y +CONFIG_CAN_FLEXCAN=m +CONFIG_CAN_GRCAN=m +CONFIG_CAN_JANZ_ICAN3=m +CONFIG_CAN_KVASER_PCIEFD=m +CONFIG_CAN_C_CAN=m +CONFIG_CAN_C_CAN_PLATFORM=m +CONFIG_CAN_C_CAN_PCI=m +CONFIG_CAN_CC770=m +# CONFIG_CAN_CC770_ISA is not set +CONFIG_CAN_CC770_PLATFORM=m +CONFIG_CAN_IFI_CANFD=m +CONFIG_CAN_M_CAN=m +CONFIG_CAN_M_CAN_PLATFORM=m +CONFIG_CAN_M_CAN_TCAN4X5X=m +CONFIG_CAN_PEAK_PCIEFD=m +CONFIG_CAN_SJA1000=m +CONFIG_CAN_EMS_PCI=m +# CONFIG_CAN_EMS_PCMCIA is not set +CONFIG_CAN_F81601=m +CONFIG_CAN_KVASER_PCI=m +CONFIG_CAN_PEAK_PCI=m +CONFIG_CAN_PEAK_PCIEC=y +CONFIG_CAN_PEAK_PCMCIA=m +CONFIG_CAN_PLX_PCI=m +# CONFIG_CAN_SJA1000_ISA is not set +CONFIG_CAN_SJA1000_PLATFORM=m +CONFIG_CAN_SOFTING=m +CONFIG_CAN_SOFTING_CS=m + +# +# CAN SPI interfaces +# +CONFIG_CAN_HI311X=m +CONFIG_CAN_MCP251X=m +# end of CAN SPI interfaces + +# +# CAN USB interfaces +# +CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_EMS_USB=m +CONFIG_CAN_ESD_USB2=m +CONFIG_CAN_GS_USB=m +CONFIG_CAN_KVASER_USB=m +CONFIG_CAN_MCBA_USB=m +CONFIG_CAN_PEAK_USB=m +CONFIG_CAN_UCAN=m +# end of CAN USB interfaces + +# CONFIG_CAN_DEBUG_DEVICES is not set +# end of CAN Device Drivers + +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_BT_6LOWPAN=m +CONFIG_BT_LEDS=y +CONFIG_BT_MSFTEXT=y +CONFIG_BT_DEBUGFS=y +# CONFIG_BT_SELFTEST is not set + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_QCA=m +CONFIG_BT_HCIBTUSB=m +CONFIG_BT_HCIBTUSB_AUTOSUSPEND=y +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_MTK=y +CONFIG_BT_HCIBTUSB_RTL=y +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_3WIRE=y 
+CONFIG_BT_HCIUART_INTEL=y +CONFIG_BT_HCIUART_BCM=y +CONFIG_BT_HCIUART_RTL=y +CONFIG_BT_HCIUART_QCA=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIUART_MRVL=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_BT_ATH3K=m +CONFIG_BT_MTKSDIO=m +CONFIG_BT_MTKUART=m +CONFIG_BT_HCIRSI=m +# end of Bluetooth device drivers + +CONFIG_AF_RXRPC=m +CONFIG_AF_RXRPC_IPV6=y +# CONFIG_AF_RXRPC_INJECT_LOSS is not set +CONFIG_AF_RXRPC_DEBUG=y +CONFIG_RXKAD=y +CONFIG_AF_KCM=m +CONFIG_STREAM_PARSER=y +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y +CONFIG_CFG80211=m +# CONFIG_NL80211_TESTMODE is not set +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +# CONFIG_CFG80211_CERTIFICATION_ONUS is not set +CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y +CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y +CONFIG_CFG80211_DEFAULT_PS=y +CONFIG_CFG80211_DEBUGFS=y +CONFIG_CFG80211_CRDA_SUPPORT=y +CONFIG_CFG80211_WEXT=y +CONFIG_CFG80211_WEXT_EXPORT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m +CONFIG_LIB80211_CRYPT_TKIP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +CONFIG_MAC80211_MESH=y +CONFIG_MAC80211_LEDS=y +CONFIG_MAC80211_DEBUGFS=y +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_WIMAX=m +CONFIG_WIMAX_DEBUG_LEVEL=8 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=m +CONFIG_NET_9P=m +CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_XEN=m +CONFIG_NET_9P_RDMA=m +# CONFIG_NET_9P_DEBUG is not set +CONFIG_CAIF=m +# CONFIG_CAIF_DEBUG is not set +CONFIG_CAIF_NETDEV=m +CONFIG_CAIF_USB=m +CONFIG_CEPH_LIB=m +CONFIG_CEPH_LIB_PRETTYDEBUG=y +CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y +CONFIG_NFC=m +CONFIG_NFC_DIGITAL=m +CONFIG_NFC_NCI=m +CONFIG_NFC_NCI_SPI=m +CONFIG_NFC_NCI_UART=m +CONFIG_NFC_HCI=m +CONFIG_NFC_SHDLC=y + +# +# Near Field Communication (NFC) devices +# +CONFIG_NFC_TRF7970A=m +CONFIG_NFC_MEI_PHY=m +CONFIG_NFC_SIM=m +CONFIG_NFC_PORT100=m +CONFIG_NFC_FDP=m +CONFIG_NFC_FDP_I2C=m +CONFIG_NFC_PN544=m +CONFIG_NFC_PN544_I2C=m +CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m +CONFIG_NFC_PN532_UART=m +CONFIG_NFC_MICROREAD=m +CONFIG_NFC_MICROREAD_I2C=m +CONFIG_NFC_MICROREAD_MEI=m +CONFIG_NFC_MRVL=m +CONFIG_NFC_MRVL_USB=m +CONFIG_NFC_MRVL_UART=m +CONFIG_NFC_MRVL_I2C=m +CONFIG_NFC_MRVL_SPI=m +CONFIG_NFC_ST21NFCA=m +CONFIG_NFC_ST21NFCA_I2C=m +CONFIG_NFC_ST_NCI=m +CONFIG_NFC_ST_NCI_I2C=m +CONFIG_NFC_ST_NCI_SPI=m +CONFIG_NFC_NXP_NCI=m +CONFIG_NFC_NXP_NCI_I2C=m +CONFIG_NFC_S3FWRN5=m +CONFIG_NFC_S3FWRN5_I2C=m +CONFIG_NFC_ST95HF=m +# end of Near Field Communication (NFC) devices + +CONFIG_PSAMPLE=m +CONFIG_NET_IFE=m +CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y +CONFIG_SOCK_VALIDATE_XMIT=y +CONFIG_NET_SOCK_MSG=y +CONFIG_NET_DEVLINK=y +CONFIG_PAGE_POOL=y +CONFIG_FAILOVER=m +CONFIG_ETHTOOL_NETLINK=y +CONFIG_HAVE_EBPF_JIT=y + +# +# Device Drivers +# +CONFIG_HAVE_EISA=y +# CONFIG_EISA is not set +CONFIG_HAVE_PCI=y +CONFIG_PCI=y +CONFIG_PCI_DOMAINS=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI_PCIE=y +CONFIG_PCIEAER=y +# CONFIG_PCIEAER_INJECT is not set 
+CONFIG_PCIE_ECRC=y +CONFIG_PCIEASPM=y +CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_POWERSAVE is not set +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_PTM=y +# CONFIG_PCIE_BW is not set +CONFIG_PCIE_EDR=y +CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +CONFIG_PCI_QUIRKS=y +# CONFIG_PCI_DEBUG is not set +CONFIG_PCI_REALLOC_ENABLE_AUTO=y +CONFIG_PCI_STUB=y +CONFIG_PCI_PF_STUB=m +CONFIG_XEN_PCIDEV_FRONTEND=m +CONFIG_PCI_ATS=y +CONFIG_PCI_ECAM=y +CONFIG_PCI_LOCKLESS_CONFIG=y +CONFIG_PCI_IOV=y +CONFIG_PCI_PRI=y +CONFIG_PCI_PASID=y +CONFIG_PCI_P2PDMA=y +CONFIG_PCI_LABEL=y +CONFIG_PCI_HYPERV=m +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +CONFIG_HOTPLUG_PCI_ACPI_IBM=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m +CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +CONFIG_HOTPLUG_PCI_SHPC=y + +# +# PCI controller drivers +# +CONFIG_PCI_FTPCI100=y +CONFIG_PCI_HOST_COMMON=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCIE_XILINX=y +CONFIG_VMD=m +CONFIG_PCI_HYPERV_INTERFACE=m + +# +# DesignWare PCI Core Support +# +CONFIG_PCIE_DW=y +CONFIG_PCIE_DW_HOST=y +CONFIG_PCIE_DW_EP=y +CONFIG_PCIE_DW_PLAT=y +CONFIG_PCIE_DW_PLAT_HOST=y +CONFIG_PCIE_DW_PLAT_EP=y +CONFIG_PCIE_INTEL_GW=y +CONFIG_PCI_MESON=y +# end of DesignWare PCI Core Support + +# +# Mobiveil PCIe Core Support +# +# end of Mobiveil PCIe Core Support + +# +# Cadence PCIe controllers support +# +CONFIG_PCIE_CADENCE=y +CONFIG_PCIE_CADENCE_HOST=y +CONFIG_PCIE_CADENCE_EP=y +CONFIG_PCIE_CADENCE_PLAT=y +CONFIG_PCIE_CADENCE_PLAT_HOST=y +CONFIG_PCIE_CADENCE_PLAT_EP=y +# CONFIG_PCI_J721E_HOST is not set +# CONFIG_PCI_J721E_EP is not set +# end of Cadence PCIe controllers support +# end of PCI controller drivers + +# +# PCI Endpoint +# +CONFIG_PCI_ENDPOINT=y +CONFIG_PCI_ENDPOINT_CONFIGFS=y +# CONFIG_PCI_EPF_TEST is not set +# end of PCI Endpoint + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m +# end of PCI switch controller drivers + +CONFIG_PCCARD=m +CONFIG_PCMCIA=m +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=m +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +CONFIG_PD6729=m +CONFIG_I82092=m +CONFIG_PCCARD_NONSTATIC=y +CONFIG_RAPIDIO=m +CONFIG_RAPIDIO_TSI721=m +CONFIG_RAPIDIO_DISC_TIMEOUT=30 +CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y +CONFIG_RAPIDIO_DMA_ENGINE=y +# CONFIG_RAPIDIO_DEBUG is not set +CONFIG_RAPIDIO_ENUM_BASIC=m +CONFIG_RAPIDIO_CHMAN=m +CONFIG_RAPIDIO_MPORT_CDEV=m + +# +# RapidIO Switch drivers +# +CONFIG_RAPIDIO_TSI57X=m +CONFIG_RAPIDIO_CPS_XX=m +CONFIG_RAPIDIO_TSI568=m +CONFIG_RAPIDIO_CPS_GEN2=m +CONFIG_RAPIDIO_RXS_GEN3=m +# end of RapidIO Switch drivers + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y + +# +# Firmware loader +# +CONFIG_FW_LOADER=y +CONFIG_FW_LOADER_PAGED_BUF=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_FW_LOADER_USER_HELPER is not set +CONFIG_FW_LOADER_COMPRESS=y +CONFIG_FW_CACHE=y +# end of Firmware loader + +CONFIG_WANT_DEV_COREDUMP=y +CONFIG_ALLOW_DEV_COREDUMP=y +CONFIG_DEV_COREDUMP=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +CONFIG_HMEM_REPORTING=y +# CONFIG_TEST_ASYNC_DRIVER_PROBE is not set +CONFIG_SYS_HYPERVISOR=y +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=y 
+CONFIG_REGMAP_SLIMBUS=m +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_SPMI=m +CONFIG_REGMAP_W1=m +CONFIG_REGMAP_MMIO=y +CONFIG_REGMAP_IRQ=y +CONFIG_REGMAP_SOUNDWIRE=m +CONFIG_REGMAP_SCCB=m +CONFIG_REGMAP_I3C=m +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +# end of Generic Driver Options + +# +# Bus devices +# +CONFIG_MOXTET=m +CONFIG_SIMPLE_PM_BUS=y +CONFIG_MHI_BUS=m +# end of Bus devices + +CONFIG_CONNECTOR=y +CONFIG_PROC_EVENTS=y +CONFIG_GNSS=m +CONFIG_GNSS_SERIAL=m +CONFIG_GNSS_MTK_SERIAL=m +CONFIG_GNSS_SIRF_SERIAL=m +CONFIG_GNSS_UBX_SERIAL=m +CONFIG_MTD=m +CONFIG_MTD_TESTS=m + +# +# Partition parsers +# +CONFIG_MTD_AR7_PARTS=m +CONFIG_MTD_CMDLINE_PARTS=m +CONFIG_MTD_OF_PARTS=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 +# CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED is not set +# CONFIG_MTD_REDBOOT_PARTS_READONLY is not set +# end of Partition parsers + +# +# User Modules And Translation Layers +# +CONFIG_MTD_BLKDEVS=m +CONFIG_MTD_BLOCK=m +CONFIG_MTD_BLOCK_RO=m +CONFIG_FTL=m +CONFIG_NFTL=m +CONFIG_NFTL_RW=y +CONFIG_INFTL=m +CONFIG_RFD_FTL=m +CONFIG_SSFDC=m +CONFIG_SM_FTL=m +CONFIG_MTD_OOPS=m +# CONFIG_MTD_PSTORE is not set +CONFIG_MTD_SWAP=m +CONFIG_MTD_PARTITIONED_MASTER=y + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +CONFIG_MTD_CFI_INTELEXT=m +CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +CONFIG_MTD_CFI_UTIL=m +CONFIG_MTD_RAM=m +CONFIG_MTD_ROM=m +CONFIG_MTD_ABSENT=m +# end of RAM/ROM/Flash chip drivers + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +# CONFIG_MTD_PHYSMAP_COMPAT is not set +CONFIG_MTD_PHYSMAP_OF=y +CONFIG_MTD_PHYSMAP_VERSATILE=y +CONFIG_MTD_PHYSMAP_GEMINI=y +CONFIG_MTD_PHYSMAP_GPIO_ADDR=y +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICHXROM=m +CONFIG_MTD_ESB2ROM=m +CONFIG_MTD_CK804XROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m +CONFIG_MTD_PCMCIA=m +# CONFIG_MTD_PCMCIA_ANONYMOUS is not set +CONFIG_MTD_INTEL_VR_NOR=m +CONFIG_MTD_PLATRAM=m +# end of Mapping drivers for chip access + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m +# CONFIG_MTD_PMC551_BUGFIX is not set +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_DATAFLASH=m +# CONFIG_MTD_DATAFLASH_WRITE_VERIFY is not set +CONFIG_MTD_DATAFLASH_OTP=y +CONFIG_MTD_MCHP23K256=m +CONFIG_MTD_SST25L=m +CONFIG_MTD_SLRAM=m +CONFIG_MTD_PHRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLOCK2MTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOCG3=m +CONFIG_BCH_CONST_M=14 +CONFIG_BCH_CONST_T=4 +# end of Self-contained MTD device drivers + +# +# NAND +# +CONFIG_MTD_NAND_CORE=m +CONFIG_MTD_ONENAND=m +# CONFIG_MTD_ONENAND_VERIFY_WRITE is not set +CONFIG_MTD_ONENAND_GENERIC=m +CONFIG_MTD_ONENAND_OTP=y +CONFIG_MTD_ONENAND_2X_PROGRAM=y +CONFIG_MTD_NAND_ECC_SW_HAMMING=m +CONFIG_MTD_NAND_ECC_SW_HAMMING_SMC=y +CONFIG_MTD_RAW_NAND=m +CONFIG_MTD_NAND_ECC_SW_BCH=y + +# +# Raw/parallel NAND flash controllers +# +CONFIG_MTD_NAND_DENALI=m +CONFIG_MTD_NAND_DENALI_PCI=m +CONFIG_MTD_NAND_DENALI_DT=m +CONFIG_MTD_NAND_CAFE=m +CONFIG_MTD_NAND_MXIC=m +CONFIG_MTD_NAND_GPIO=m +CONFIG_MTD_NAND_PLATFORM=m +CONFIG_MTD_NAND_CADENCE=m +CONFIG_MTD_NAND_ARASAN=m + +# +# Misc +# +CONFIG_MTD_SM_COMMON=m 
+CONFIG_MTD_NAND_NANDSIM=m +CONFIG_MTD_NAND_RICOH=m +CONFIG_MTD_NAND_DISKONCHIP=m +# CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED is not set +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0 +CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y +CONFIG_MTD_SPI_NAND=m +# end of NAND + +# +# LPDDR & LPDDR2 PCM memory drivers +# +CONFIG_MTD_LPDDR=m +CONFIG_MTD_QINFO_PROBE=m +# end of LPDDR & LPDDR2 PCM memory drivers + +CONFIG_MTD_SPI_NOR=m +CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y +CONFIG_SPI_INTEL_SPI=m +CONFIG_SPI_INTEL_SPI_PCI=m +CONFIG_SPI_INTEL_SPI_PLATFORM=m +CONFIG_MTD_UBI=m +CONFIG_MTD_UBI_WL_THRESHOLD=4096 +CONFIG_MTD_UBI_BEB_LIMIT=20 +CONFIG_MTD_UBI_FASTMAP=y +CONFIG_MTD_UBI_GLUEBI=m +CONFIG_MTD_UBI_BLOCK=y +CONFIG_MTD_HYPERBUS=m +CONFIG_DTC=y +CONFIG_OF=y +# CONFIG_OF_UNITTEST is not set +CONFIG_OF_FLATTREE=y +CONFIG_OF_EARLY_FLATTREE=y +CONFIG_OF_KOBJ=y +CONFIG_OF_DYNAMIC=y +CONFIG_OF_ADDRESS=y +CONFIG_OF_IRQ=y +CONFIG_OF_NET=y +CONFIG_OF_MDIO=m +CONFIG_OF_RESERVED_MEM=y +CONFIG_OF_RESOLVE=y +CONFIG_OF_OVERLAY=y +CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_PC_PCMCIA=m +CONFIG_PARPORT_AX88796=m +CONFIG_PARPORT_1284=y +CONFIG_PARPORT_NOT_PC=y +CONFIG_PNP=y +CONFIG_PNP_DEBUG_MESSAGES=y + +# +# Protocols +# +CONFIG_PNPACPI=y +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_BLK_DEV_FD=m +CONFIG_CDROM=m +# CONFIG_PARIDE is not set +CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m +CONFIG_ZRAM=m +CONFIG_ZRAM_WRITEBACK=y +# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=m +CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m +# CONFIG_DRBD_FAULT_INJECTION is not set +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_SKD=m +CONFIG_BLK_DEV_SX8=m +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_CDROM_PKTCDVD=m +CONFIG_CDROM_PKTCDVD_BUFFERS=8 +# CONFIG_CDROM_PKTCDVD_WCACHE is not set +CONFIG_ATA_OVER_ETH=m +CONFIG_XEN_BLKDEV_FRONTEND=m +CONFIG_XEN_BLKDEV_BACKEND=m +CONFIG_VIRTIO_BLK=m +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_RSXX=m + +# +# NVME Support +# +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y +CONFIG_NVME_MULTIPATH=y +CONFIG_NVME_HWMON=y +CONFIG_NVME_FABRICS=m +CONFIG_NVME_RDMA=m +CONFIG_NVME_FC=m +CONFIG_NVME_TCP=m +CONFIG_NVME_TARGET=m +# CONFIG_NVME_TARGET_PASSTHRU is not set +CONFIG_NVME_TARGET_LOOP=m +CONFIG_NVME_TARGET_RDMA=m +CONFIG_NVME_TARGET_FC=m +CONFIG_NVME_TARGET_FCLOOP=m +CONFIG_NVME_TARGET_TCP=m +# end of NVME Support + +# +# Misc devices +# +CONFIG_SENSORS_LIS3LV02D=m +CONFIG_AD525X_DPOT=m +CONFIG_AD525X_DPOT_I2C=m +CONFIG_AD525X_DPOT_SPI=m +# CONFIG_DUMMY_IRQ is not set +CONFIG_IBM_ASM=m +CONFIG_PHANTOM=m +CONFIG_TIFM_CORE=m +CONFIG_TIFM_7XX1=m +CONFIG_ICS932S401=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_HP_ILO=m +CONFIG_APDS9802ALS=m +CONFIG_ISL29003=m +CONFIG_ISL29020=m +CONFIG_SENSORS_TSL2550=m +CONFIG_SENSORS_BH1770=m +CONFIG_SENSORS_APDS990X=m +CONFIG_HMC6352=m +CONFIG_DS1682=m +CONFIG_VMWARE_BALLOON=m +CONFIG_LATTICE_ECP3_CONFIG=m +# CONFIG_SRAM is not set +CONFIG_PCI_ENDPOINT_TEST=m +CONFIG_XILINX_SDFEC=m +CONFIG_MISC_RTSX=m +CONFIG_PVPANIC=m +CONFIG_C2PORT=m +CONFIG_C2PORT_DURAMAR_2150=m + +# +# EEPROM support +# +CONFIG_EEPROM_AT24=m +# CONFIG_EEPROM_AT25 is not set +CONFIG_EEPROM_LEGACY=m +CONFIG_EEPROM_MAX6875=m +CONFIG_EEPROM_93CX6=m +# CONFIG_EEPROM_93XX46 is not set +CONFIG_EEPROM_IDT_89HPESX=m +CONFIG_EEPROM_EE1004=m +# end of EEPROM support + +CONFIG_CB710_CORE=m +# 
CONFIG_CB710_DEBUG is not set +CONFIG_CB710_DEBUG_ASSUMPTIONS=y + +# +# Texas Instruments shared transport line discipline +# +CONFIG_TI_ST=m +# end of Texas Instruments shared transport line discipline + +CONFIG_SENSORS_LIS3_I2C=m +CONFIG_ALTERA_STAPL=m +CONFIG_INTEL_MEI=m +CONFIG_INTEL_MEI_ME=m +CONFIG_INTEL_MEI_TXE=m +CONFIG_INTEL_MEI_HDCP=m +CONFIG_VMWARE_VMCI=m + +# +# Intel MIC & related support +# +CONFIG_INTEL_MIC_BUS=m +CONFIG_SCIF_BUS=m +CONFIG_VOP_BUS=m +CONFIG_INTEL_MIC_HOST=m +CONFIG_INTEL_MIC_CARD=m +CONFIG_SCIF=m +CONFIG_MIC_COSM=m +CONFIG_VOP=m +# end of Intel MIC & related support + +CONFIG_GENWQE=m +CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 +CONFIG_ECHO=m +CONFIG_MISC_ALCOR_PCI=m +CONFIG_MISC_RTSX_PCI=m +CONFIG_MISC_RTSX_USB=m +CONFIG_HABANA_AI=m +CONFIG_UACCE=m +# end of Misc devices + +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +CONFIG_SCSI_MOD=y +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=y +CONFIG_SCSI_DMA=y +CONFIG_SCSI_NETLINK=y +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=y +CONFIG_CHR_DEV_ST=m +CONFIG_BLK_DEV_SR=m +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y + +# +# SCSI Transports +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SAS_ATA=y +CONFIG_SCSI_SAS_HOST_SMP=y +CONFIG_SCSI_SRP_ATTRS=m +# end of SCSI Transports + +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=m +CONFIG_ISCSI_BOOT_SYSFS=m +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_SCSI_BNX2X_FCOE=m +CONFIG_BE2ISCSI=m +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m +CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=32 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +CONFIG_AIC7XXX_DEBUG_ENABLE=y +CONFIG_AIC7XXX_DEBUG_MASK=0 +CONFIG_AIC7XXX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=32 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +CONFIG_AIC79XX_DEBUG_ENABLE=y +CONFIG_AIC79XX_DEBUG_MASK=0 +CONFIG_AIC79XX_REG_PRETTY_PRINT=y +CONFIG_SCSI_AIC94XX=m +CONFIG_AIC94XX_DEBUG=y +CONFIG_SCSI_MVSAS=m +CONFIG_SCSI_MVSAS_DEBUG=y +CONFIG_SCSI_MVSAS_TASKLET=y +CONFIG_SCSI_MVUMI=m +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_ARCMSR=m +CONFIG_SCSI_ESAS2R=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT3SAS=m +CONFIG_SCSI_MPT2SAS_MAX_SGE=128 +CONFIG_SCSI_MPT3SAS_MAX_SGE=128 +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_SMARTPQI=m +CONFIG_SCSI_UFSHCD=m +CONFIG_SCSI_UFSHCD_PCI=m +# CONFIG_SCSI_UFS_DWC_TC_PCI is not set +CONFIG_SCSI_UFSHCD_PLATFORM=m +CONFIG_SCSI_UFS_CDNS_PLATFORM=m +# CONFIG_SCSI_UFS_DWC_TC_PLATFORM is not set +CONFIG_SCSI_UFS_BSG=y +# CONFIG_SCSI_UFS_CRYPTO is not set +CONFIG_SCSI_HPTIOP=m +CONFIG_SCSI_BUSLOGIC=m +CONFIG_SCSI_FLASHPOINT=y +CONFIG_SCSI_MYRB=m +CONFIG_SCSI_MYRS=m +CONFIG_VMWARE_PVSCSI=m +CONFIG_XEN_SCSI_FRONTEND=m +CONFIG_HYPERV_STORAGE=m +CONFIG_LIBFC=m +CONFIG_LIBFCOE=m +CONFIG_FCOE=m +CONFIG_FCOE_FNIC=m +CONFIG_SCSI_SNIC=m +# CONFIG_SCSI_SNIC_DEBUG_FS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_FDOMAIN=m +CONFIG_SCSI_FDOMAIN_PCI=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_ISCI=m +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 
is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_STEX=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +CONFIG_SCSI_SYM53C8XX_MMIO=y +CONFIG_SCSI_IPR=m +CONFIG_SCSI_IPR_TRACE=y +CONFIG_SCSI_IPR_DUMP=y +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA_FC=m +CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_QEDI=m +CONFIG_QEDF=m +CONFIG_SCSI_LPFC=m +# CONFIG_SCSI_LPFC_DEBUG_FS is not set +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_WD719X=m +CONFIG_SCSI_DEBUG=m +CONFIG_SCSI_PMCRAID=m +CONFIG_SCSI_PM8001=m +CONFIG_SCSI_BFA_FC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_CHELSIO_FCOE=m +CONFIG_SCSI_LOWLEVEL_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_FDOMAIN=m +CONFIG_PCMCIA_QLOGIC=m +CONFIG_PCMCIA_SYM53C500=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +# end of SCSI device support + +CONFIG_ATA=y +CONFIG_SATA_HOST=y +CONFIG_PATA_TIMINGS=y +CONFIG_ATA_VERBOSE_ERROR=y +CONFIG_ATA_FORCE=y +CONFIG_ATA_ACPI=y +CONFIG_SATA_ZPODD=y +CONFIG_SATA_PMP=y + +# +# Controllers with non-SFF native interface +# +CONFIG_SATA_AHCI=y +CONFIG_SATA_MOBILE_LPM_POLICY=3 +CONFIG_SATA_AHCI_PLATFORM=m +CONFIG_AHCI_CEVA=m +CONFIG_AHCI_QORIQ=m +CONFIG_SATA_INIC162X=m +CONFIG_SATA_ACARD_AHCI=m +CONFIG_SATA_SIL24=m +CONFIG_ATA_SFF=y + +# +# SFF controllers with custom DMA interface +# +CONFIG_PDC_ADMA=m +CONFIG_SATA_QSTOR=m +CONFIG_SATA_SX4=m +CONFIG_ATA_BMDMA=y + +# +# SATA SFF controllers with BMDMA +# +CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set +CONFIG_SATA_MV=m +CONFIG_SATA_NV=m +CONFIG_SATA_PROMISE=m +CONFIG_SATA_SIL=m +CONFIG_SATA_SIS=m +CONFIG_SATA_SVW=m +CONFIG_SATA_ULI=m +CONFIG_SATA_VIA=m +CONFIG_SATA_VITESSE=m + +# +# PATA SFF controllers with BMDMA +# +CONFIG_PATA_ALI=m +CONFIG_PATA_AMD=m +CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATIIXP=m +CONFIG_PATA_ATP867X=m +CONFIG_PATA_CMD64X=m +CONFIG_PATA_CYPRESS=m +CONFIG_PATA_EFAR=m +CONFIG_PATA_HPT366=m +CONFIG_PATA_HPT37X=m +CONFIG_PATA_HPT3X2N=m +CONFIG_PATA_HPT3X3=m +CONFIG_PATA_HPT3X3_DMA=y +CONFIG_PATA_IT8213=m +CONFIG_PATA_IT821X=m +CONFIG_PATA_JMICRON=m +CONFIG_PATA_MARVELL=m +CONFIG_PATA_NETCELL=m +CONFIG_PATA_NINJA32=m +CONFIG_PATA_NS87415=m +CONFIG_PATA_OLDPIIX=m +CONFIG_PATA_OPTIDMA=m +CONFIG_PATA_PDC2027X=m +CONFIG_PATA_PDC_OLD=m +CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m +CONFIG_PATA_SCH=m +CONFIG_PATA_SERVERWORKS=m +CONFIG_PATA_SIL680=m +CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m +CONFIG_PATA_TRIFLEX=m +CONFIG_PATA_VIA=m +CONFIG_PATA_WINBOND=m + +# +# PIO-only SFF controllers +# +CONFIG_PATA_CMD640_PCI=m +CONFIG_PATA_MPIIX=m +CONFIG_PATA_NS87410=m +CONFIG_PATA_OPTI=m +CONFIG_PATA_PCMCIA=m +# CONFIG_PATA_PLATFORM is not set +CONFIG_PATA_RZ1000=m + +# +# Generic fallback / legacy drivers +# +CONFIG_PATA_ACPI=m +CONFIG_ATA_GENERIC=m +CONFIG_PATA_LEGACY=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +# CONFIG_BCACHE_DEBUG is not set +# CONFIG_BCACHE_CLOSURES_DEBUG is not set +CONFIG_BCACHE_ASYNC_REGISTRATION=y +CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +CONFIG_DM_DEBUG=y +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m 
+CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_CACHE=m +CONFIG_DM_CACHE_SMQ=m +CONFIG_DM_WRITECACHE=m +CONFIG_DM_EBS=m +CONFIG_DM_ERA=m +CONFIG_DM_CLONE=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_MULTIPATH_HST=m +CONFIG_DM_DELAY=m +CONFIG_DM_DUST=m +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +CONFIG_DM_VERITY_VERIFY_ROOTHASH_SIG=y +CONFIG_DM_VERITY_FEC=y +CONFIG_DM_SWITCH=m +CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_TARGET_CORE=m +CONFIG_TCM_IBLOCK=m +CONFIG_TCM_FILEIO=m +CONFIG_TCM_PSCSI=m +CONFIG_TCM_USER2=m +CONFIG_LOOPBACK_TARGET=m +CONFIG_TCM_FC=m +CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m +CONFIG_SBP_TARGET=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m +CONFIG_FUSION_MAX_SGE=128 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +# CONFIG_FUSION_LOGGING is not set + +# +# IEEE 1394 (FireWire) support +# +CONFIG_FIREWIRE=m +CONFIG_FIREWIRE_OHCI=m +CONFIG_FIREWIRE_SBP2=m +CONFIG_FIREWIRE_NET=m +CONFIG_FIREWIRE_NOSY=m +# end of IEEE 1394 (FireWire) support + +CONFIG_MACINTOSH_DRIVERS=y +CONFIG_MAC_EMUMOUSEBTN=m +CONFIG_NETDEVICES=y +CONFIG_MII=m +CONFIG_NET_CORE=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_WIREGUARD=m +# CONFIG_WIREGUARD_DEBUG is not set +CONFIG_EQUALIZER=m +CONFIG_NET_FC=y +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m +CONFIG_NET_TEAM_MODE_LOADBALANCE=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_IPVLAN_L3S=y +CONFIG_IPVLAN=m +CONFIG_IPVTAP=m +CONFIG_VXLAN=m +CONFIG_GENEVE=m +CONFIG_BAREUDP=m +CONFIG_GTP=m +CONFIG_MACSEC=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +CONFIG_NTB_NETDEV=m +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 +CONFIG_TUN=m +CONFIG_TAP=m +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m +CONFIG_SUNGEM_PHY=m +# CONFIG_ARCNET is not set +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m +# CONFIG_ATM_NICSTAR_USE_SUNI is not set +# CONFIG_ATM_NICSTAR_USE_IDT77105 is not set +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E=m +CONFIG_ATM_FORE200E_USE_TASKLET=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y +CONFIG_ATM_SOLOS=m +CONFIG_CAIF_DRIVERS=y +CONFIG_CAIF_TTY=m +CONFIG_CAIF_SPI_SLAVE=m +CONFIG_CAIF_SPI_SYNC=y +CONFIG_CAIF_HSI=m +CONFIG_CAIF_VIRTIO=m + +# +# Distributed Switch Architecture drivers +# +CONFIG_B53=m +# CONFIG_B53_SPI_DRIVER is not set +CONFIG_B53_MDIO_DRIVER=m +CONFIG_B53_MMAP_DRIVER=m +CONFIG_B53_SRAB_DRIVER=m +CONFIG_B53_SERDES=m +CONFIG_NET_DSA_BCM_SF2=m +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_LANTIQ_GSWIP=m 
+CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_MV88E6060=m +CONFIG_NET_DSA_MICROCHIP_KSZ_COMMON=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_I2C=m +CONFIG_NET_DSA_MICROCHIP_KSZ9477_SPI=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795=m +CONFIG_NET_DSA_MICROCHIP_KSZ8795_SPI=m +CONFIG_NET_DSA_MV88E6XXX=m +CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y +CONFIG_NET_DSA_MV88E6XXX_PTP=y +CONFIG_NET_DSA_AR9331=m +CONFIG_NET_DSA_SJA1105=m +CONFIG_NET_DSA_SJA1105_PTP=y +CONFIG_NET_DSA_SJA1105_TAS=y +CONFIG_NET_DSA_SJA1105_VL=y +CONFIG_NET_DSA_QCA8K=m +CONFIG_NET_DSA_REALTEK_SMI=m +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m +CONFIG_NET_DSA_VITESSE_VSC73XX=m +CONFIG_NET_DSA_VITESSE_VSC73XX_SPI=m +CONFIG_NET_DSA_VITESSE_VSC73XX_PLATFORM=m +# end of Distributed Switch Architecture drivers + +CONFIG_ETHERNET=y +CONFIG_MDIO=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_3C589=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_NET_VENDOR_ADAPTEC=y +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_NET_VENDOR_AGERE=y +CONFIG_ET131X=m +CONFIG_NET_VENDOR_ALACRITECH=y +CONFIG_SLICOSS=m +CONFIG_NET_VENDOR_ALTEON=y +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_ALTERA_TSE=m +CONFIG_NET_VENDOR_AMAZON=y +CONFIG_ENA_ETHERNET=m +CONFIG_NET_VENDOR_AMD=y +CONFIG_AMD8111_ETH=m +CONFIG_PCNET32=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_AMD_XGBE=m +CONFIG_AMD_XGBE_DCB=y +CONFIG_AMD_XGBE_HAVE_ECC=y +CONFIG_NET_VENDOR_AQUANTIA=y +CONFIG_AQTION=m +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ATHEROS=y +CONFIG_ATL2=m +CONFIG_ATL1=m +CONFIG_ATL1E=m +CONFIG_ATL1C=m +CONFIG_ALX=m +CONFIG_NET_VENDOR_AURORA=y +CONFIG_AURORA_NB8800=m +CONFIG_NET_VENDOR_BROADCOM=y +CONFIG_B44=m +CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y +CONFIG_BCMGENET=m +CONFIG_BNX2=m +CONFIG_CNIC=m +CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y +CONFIG_BNX2X=m +CONFIG_BNX2X_SRIOV=y +CONFIG_SYSTEMPORT=m +CONFIG_BNXT=m +CONFIG_BNXT_SRIOV=y +CONFIG_BNXT_FLOWER_OFFLOAD=y +CONFIG_BNXT_DCB=y +CONFIG_BNXT_HWMON=y +CONFIG_NET_VENDOR_BROCADE=y +CONFIG_BNA=m +CONFIG_NET_VENDOR_CADENCE=y +CONFIG_MACB=m +CONFIG_MACB_USE_HWSTAMP=y +CONFIG_MACB_PCI=m +CONFIG_NET_VENDOR_CAVIUM=y +CONFIG_THUNDER_NIC_PF=m +CONFIG_THUNDER_NIC_VF=m +CONFIG_THUNDER_NIC_BGX=m +CONFIG_THUNDER_NIC_RGX=m +CONFIG_CAVIUM_PTP=m +CONFIG_LIQUIDIO=m +CONFIG_LIQUIDIO_VF=m +CONFIG_NET_VENDOR_CHELSIO=y +CONFIG_CHELSIO_T1=m +CONFIG_CHELSIO_T1_1G=y +CONFIG_CHELSIO_T3=m +CONFIG_CHELSIO_T4=m +CONFIG_CHELSIO_T4_DCB=y +CONFIG_CHELSIO_T4_FCOE=y +CONFIG_CHELSIO_T4VF=m +CONFIG_CHELSIO_LIB=m +CONFIG_NET_VENDOR_CISCO=y +CONFIG_ENIC=m +CONFIG_NET_VENDOR_CORTINA=y +CONFIG_GEMINI_ETHERNET=m +CONFIG_CX_ECAT=m +CONFIG_DNET=m +CONFIG_NET_VENDOR_DEC=y +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_DE2104X_DSL=0 +CONFIG_TULIP=m +CONFIG_TULIP_MWI=y +CONFIG_TULIP_MMIO=y +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_ULI526X=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_NET_VENDOR_DLINK=y +CONFIG_DL2K=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_NET_VENDOR_EMULEX=y +CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y +CONFIG_BE2NET_BE2=y +CONFIG_BE2NET_BE3=y +CONFIG_BE2NET_LANCER=y +CONFIG_BE2NET_SKYHAWK=y +CONFIG_NET_VENDOR_EZCHIP=y +CONFIG_EZCHIP_NPS_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_FUJITSU=y +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_NET_VENDOR_GOOGLE=y +CONFIG_GVE=m +CONFIG_NET_VENDOR_HUAWEI=y +CONFIG_HINIC=m +CONFIG_NET_VENDOR_I825XX=y +CONFIG_NET_VENDOR_INTEL=y +CONFIG_E100=m 
+CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_E1000E_HWTS=y +CONFIG_IGB=m +CONFIG_IGB_HWMON=y +CONFIG_IGB_DCA=y +CONFIG_IGBVF=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_IXGBE_HWMON=y +CONFIG_IXGBE_DCA=y +CONFIG_IXGBE_DCB=y +# CONFIG_IXGBE_IPSEC is not set +CONFIG_IXGBEVF=m +CONFIG_IXGBEVF_IPSEC=y +CONFIG_I40E=m +CONFIG_I40E_DCB=y +CONFIG_IAVF=m +CONFIG_I40EVF=m +CONFIG_ICE=m +CONFIG_FM10K=m +CONFIG_IGC=m +CONFIG_JME=m +CONFIG_NET_VENDOR_MARVELL=y +CONFIG_MVMDIO=m +CONFIG_SKGE=m +# CONFIG_SKGE_DEBUG is not set +CONFIG_SKGE_GENESIS=y +CONFIG_SKY2=m +# CONFIG_SKY2_DEBUG is not set +CONFIG_NET_VENDOR_MELLANOX=y +CONFIG_MLX4_EN=m +CONFIG_MLX4_EN_DCB=y +CONFIG_MLX4_CORE=m +CONFIG_MLX4_DEBUG=y +CONFIG_MLX4_CORE_GEN2=y +CONFIG_MLX5_CORE=m +CONFIG_MLX5_ACCEL=y +CONFIG_MLX5_FPGA=y +CONFIG_MLX5_CORE_EN=y +CONFIG_MLX5_EN_ARFS=y +CONFIG_MLX5_EN_RXNFC=y +CONFIG_MLX5_MPFS=y +CONFIG_MLX5_ESWITCH=y +CONFIG_MLX5_CLS_ACT=y +CONFIG_MLX5_TC_CT=y +CONFIG_MLX5_CORE_EN_DCB=y +CONFIG_MLX5_CORE_IPOIB=y +CONFIG_MLX5_FPGA_IPSEC=y +# CONFIG_MLX5_IPSEC is not set +CONFIG_MLX5_EN_IPSEC=y +CONFIG_MLX5_FPGA_TLS=y +CONFIG_MLX5_TLS=y +CONFIG_MLX5_EN_TLS=y +CONFIG_MLX5_SW_STEERING=y +CONFIG_MLXSW_CORE=m +CONFIG_MLXSW_CORE_HWMON=y +CONFIG_MLXSW_CORE_THERMAL=y +CONFIG_MLXSW_PCI=m +CONFIG_MLXSW_I2C=m +CONFIG_MLXSW_SWITCHIB=m +CONFIG_MLXSW_SWITCHX2=m +CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y +CONFIG_MLXSW_MINIMAL=m +CONFIG_MLXFW=m +CONFIG_NET_VENDOR_MICREL=y +CONFIG_KS8842=m +CONFIG_KS8851=m +CONFIG_KS8851_MLL=m +CONFIG_KSZ884X_PCI=m +CONFIG_NET_VENDOR_MICROCHIP=y +CONFIG_ENC28J60=m +# CONFIG_ENC28J60_WRITEVERIFY is not set +CONFIG_ENCX24J600=m +CONFIG_LAN743X=m +CONFIG_NET_VENDOR_MICROSEMI=y +CONFIG_MSCC_OCELOT_SWITCH_LIB=m +CONFIG_MSCC_OCELOT_SWITCH=m +CONFIG_NET_VENDOR_MYRI=y +CONFIG_MYRI10GE=m +CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m +CONFIG_NET_VENDOR_NATSEMI=y +CONFIG_NATSEMI=m +CONFIG_NS83820=m +CONFIG_NET_VENDOR_NETERION=y +CONFIG_S2IO=m +CONFIG_VXGE=m +# CONFIG_VXGE_DEBUG_TRACE_ALL is not set +CONFIG_NET_VENDOR_NETRONOME=y +CONFIG_NFP=m +CONFIG_NFP_APP_FLOWER=y +CONFIG_NFP_APP_ABM_NIC=y +# CONFIG_NFP_DEBUG is not set +CONFIG_NET_VENDOR_NI=y +CONFIG_NI_XGE_MANAGEMENT_ENET=m +CONFIG_NET_VENDOR_8390=y +CONFIG_PCMCIA_AXNET=m +CONFIG_NE2K_PCI=m +CONFIG_PCMCIA_PCNET=m +CONFIG_NET_VENDOR_NVIDIA=y +CONFIG_FORCEDETH=m +CONFIG_NET_VENDOR_OKI=y +CONFIG_ETHOC=m +CONFIG_NET_VENDOR_PACKET_ENGINES=y +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_NET_VENDOR_PENSANDO=y +CONFIG_IONIC=m +CONFIG_NET_VENDOR_QLOGIC=y +CONFIG_QLA3XXX=m +CONFIG_QLCNIC=m +CONFIG_QLCNIC_SRIOV=y +CONFIG_QLCNIC_DCB=y +CONFIG_QLCNIC_HWMON=y +CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QED_LL2=y +CONFIG_QED_SRIOV=y +CONFIG_QEDE=m +CONFIG_QED_RDMA=y +CONFIG_QED_ISCSI=y +CONFIG_QED_FCOE=y +CONFIG_QED_OOO=y +CONFIG_NET_VENDOR_QUALCOMM=y +CONFIG_QCA7000=m +CONFIG_QCA7000_SPI=m +CONFIG_QCA7000_UART=m +CONFIG_QCOM_EMAC=m +CONFIG_RMNET=m +CONFIG_NET_VENDOR_RDC=y +CONFIG_R6040=m +CONFIG_NET_VENDOR_REALTEK=y +CONFIG_ATP=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_TUNE_TWISTER=y +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_R8169=m +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_ROCKER=m +CONFIG_NET_VENDOR_SAMSUNG=y +CONFIG_SXGBE_ETH=m +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SOLARFLARE=y +CONFIG_SFC=m +CONFIG_SFC_MTD=y +CONFIG_SFC_MCDI_MON=y +CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y +CONFIG_SFC_FALCON=m +CONFIG_SFC_FALCON_MTD=y +CONFIG_NET_VENDOR_SILAN=y +CONFIG_SC92031=m 
+CONFIG_NET_VENDOR_SIS=y +CONFIG_SIS900=m +CONFIG_SIS190=m +CONFIG_NET_VENDOR_SMSC=y +CONFIG_PCMCIA_SMC91C92=m +CONFIG_EPIC100=m +CONFIG_SMSC911X=m +CONFIG_SMSC9420=m +CONFIG_NET_VENDOR_SOCIONEXT=y +CONFIG_NET_VENDOR_STMICRO=y +CONFIG_STMMAC_ETH=m +# CONFIG_STMMAC_SELFTESTS is not set +CONFIG_STMMAC_PLATFORM=m +CONFIG_DWMAC_DWC_QOS_ETH=m +CONFIG_DWMAC_GENERIC=m +CONFIG_DWMAC_INTEL=m +CONFIG_STMMAC_PCI=m +CONFIG_NET_VENDOR_SUN=y +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_CASSINI=m +CONFIG_NIU=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m +CONFIG_NET_VENDOR_TEHUTI=y +CONFIG_TEHUTI=m +CONFIG_NET_VENDOR_TI=y +# CONFIG_TI_CPSW_PHY_SEL is not set +CONFIG_TLAN=m +CONFIG_NET_VENDOR_VIA=y +CONFIG_VIA_RHINE=m +CONFIG_VIA_RHINE_MMIO=y +CONFIG_VIA_VELOCITY=m +CONFIG_NET_VENDOR_WIZNET=y +CONFIG_WIZNET_W5100=m +CONFIG_WIZNET_W5300=m +# CONFIG_WIZNET_BUS_DIRECT is not set +# CONFIG_WIZNET_BUS_INDIRECT is not set +CONFIG_WIZNET_BUS_ANY=y +CONFIG_WIZNET_W5100_SPI=m +CONFIG_NET_VENDOR_XILINX=y +CONFIG_XILINX_AXI_EMAC=m +CONFIG_XILINX_LL_TEMAC=m +CONFIG_NET_VENDOR_XIRCOM=y +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_FDDI=m +CONFIG_DEFXX=m +CONFIG_DEFXX_MMIO=y +CONFIG_SKFP=m +# CONFIG_HIPPI is not set +CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +CONFIG_MDIO_DEVRES=m +CONFIG_MDIO_BCM_UNIMAC=m +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_BUS_MUX=m +CONFIG_MDIO_BUS_MUX_GPIO=m +CONFIG_MDIO_BUS_MUX_MMIOREG=m +CONFIG_MDIO_BUS_MUX_MULTIPLEXER=m +CONFIG_MDIO_CAVIUM=m +CONFIG_MDIO_GPIO=m +CONFIG_MDIO_HISI_FEMAC=m +CONFIG_MDIO_I2C=m +CONFIG_MDIO_IPQ4019=m +CONFIG_MDIO_IPQ8064=m +CONFIG_MDIO_MSCC_MIIM=m +CONFIG_MDIO_MVUSB=m +CONFIG_MDIO_OCTEON=m +CONFIG_MDIO_THUNDER=m +CONFIG_MDIO_XPCS=m +CONFIG_PHYLINK=m +CONFIG_PHYLIB=m +CONFIG_SWPHY=y +CONFIG_LED_TRIGGER_PHY=y + +# +# MII PHY device drivers +# +CONFIG_SFP=m +CONFIG_ADIN_PHY=m +CONFIG_AMD_PHY=m +CONFIG_AQUANTIA_PHY=m +CONFIG_AX88796B_PHY=m +CONFIG_BCM7XXX_PHY=m +CONFIG_BCM87XX_PHY=m +CONFIG_BCM_NET_PHYLIB=m +CONFIG_BROADCOM_PHY=m +CONFIG_BCM54140_PHY=m +CONFIG_BCM84881_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_CORTINA_PHY=m +CONFIG_DAVICOM_PHY=m +CONFIG_DP83822_PHY=m +CONFIG_DP83TC811_PHY=m +CONFIG_DP83848_PHY=m +CONFIG_DP83867_PHY=m +CONFIG_DP83869_PHY=m +CONFIG_FIXED_PHY=m +CONFIG_ICPLUS_PHY=m +CONFIG_INTEL_XWAY_PHY=m +CONFIG_LSI_ET1011C_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_MARVELL_10G_PHY=m +CONFIG_MICREL_PHY=m +CONFIG_MICROCHIP_PHY=m +CONFIG_MICROCHIP_T1_PHY=m +CONFIG_MICROSEMI_PHY=m +CONFIG_NATIONAL_PHY=m +CONFIG_NXP_TJA11XX_PHY=m +CONFIG_AT803X_PHY=m +CONFIG_QSEMI_PHY=m +CONFIG_REALTEK_PHY=m +CONFIG_RENESAS_PHY=m +CONFIG_ROCKCHIP_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_STE10XP=m +CONFIG_TERANETICS_PHY=m +CONFIG_VITESSE_PHY=m +CONFIG_XILINX_GMII2RGMII=m +CONFIG_MICREL_KS8995MA=m +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOATM=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_SLIP=m +CONFIG_SLHC=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y +CONFIG_USB_NET_DRIVERS=m +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=m +CONFIG_USB_USBNET=m +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_CDC_EEM=m +CONFIG_USB_NET_CDC_NCM=m +CONFIG_USB_NET_HUAWEI_CDC_NCM=m +CONFIG_USB_NET_CDC_MBIM=m +CONFIG_USB_NET_DM9601=m 
+CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=m +CONFIG_USB_NET_GL620A=m +CONFIG_USB_NET_NET1080=m +CONFIG_USB_NET_PLUSB=m +CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +CONFIG_USB_NET_CDC_SUBSET_ENABLE=m +CONFIG_USB_NET_CDC_SUBSET=m +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_KC2190=y +CONFIG_USB_NET_ZAURUS=m +CONFIG_USB_NET_CX82310_ETH=m +CONFIG_USB_NET_KALMIA=m +CONFIG_USB_NET_QMI_WWAN=m +CONFIG_USB_HSO=m +CONFIG_USB_NET_INT51X1=m +CONFIG_USB_CDC_PHONET=m +CONFIG_USB_IPHETH=m +CONFIG_USB_SIERRA_NET=m +CONFIG_USB_VL600=m +CONFIG_USB_NET_CH9200=m +CONFIG_USB_NET_AQC111=m +CONFIG_WLAN=y +# CONFIG_WIRELESS_WDS is not set +CONFIG_WLAN_VENDOR_ADMTEK=y +CONFIG_ADM8211=m +CONFIG_ATH_COMMON=m +CONFIG_WLAN_VENDOR_ATH=y +# CONFIG_ATH_DEBUG is not set +CONFIG_ATH5K=m +CONFIG_ATH5K_DEBUG=y +CONFIG_ATH5K_TRACER=y +CONFIG_ATH5K_PCI=y +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_COMMON_DEBUG=y +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +CONFIG_ATH9K_DEBUGFS=y +CONFIG_ATH9K_STATION_STATISTICS=y +CONFIG_ATH9K_DYNACK=y +CONFIG_ATH9K_WOW=y +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +CONFIG_ATH9K_PCI_NO_EEPROM=m +CONFIG_ATH9K_HTC=m +CONFIG_ATH9K_HTC_DEBUGFS=y +CONFIG_ATH9K_HWRNG=y +CONFIG_ATH9K_COMMON_SPECTRAL=y +CONFIG_CARL9170=m +CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_DEBUGFS=y +CONFIG_CARL9170_WPC=y +# CONFIG_CARL9170_HWRNG is not set +CONFIG_ATH6KL=m +CONFIG_ATH6KL_SDIO=m +CONFIG_ATH6KL_USB=m +CONFIG_ATH6KL_DEBUG=y +CONFIG_ATH6KL_TRACING=y +CONFIG_AR5523=m +CONFIG_WIL6210=m +CONFIG_WIL6210_ISR_COR=y +CONFIG_WIL6210_TRACING=y +CONFIG_WIL6210_DEBUGFS=y +CONFIG_ATH10K=m +CONFIG_ATH10K_CE=y +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_AHB=y +CONFIG_ATH10K_SDIO=m +CONFIG_ATH10K_USB=m +CONFIG_ATH10K_DEBUG=y +CONFIG_ATH10K_DEBUGFS=y +CONFIG_ATH10K_SPECTRAL=y +CONFIG_ATH10K_TRACING=y +CONFIG_WCN36XX=m +CONFIG_WCN36XX_DEBUGFS=y +CONFIG_WLAN_VENDOR_ATMEL=y +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_AT76C50X_USB=m +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +CONFIG_B43_SDIO=y +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +CONFIG_B43LEGACY=m +CONFIG_B43LEGACY_PCI_AUTOSELECT=y +CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y +CONFIG_B43LEGACY_LEDS=y +CONFIG_B43LEGACY_HWRNG=y +CONFIG_B43LEGACY_DEBUG=y +CONFIG_B43LEGACY_DMA=y +CONFIG_B43LEGACY_PIO=y +CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y +# CONFIG_B43LEGACY_DMA_MODE is not set +# CONFIG_B43LEGACY_PIO_MODE is not set +CONFIG_BRCMUTIL=m +CONFIG_BRCMSMAC=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_PROTO_MSGBUF=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +CONFIG_BRCMFMAC_PCIE=y +CONFIG_BRCM_TRACING=y +CONFIG_BRCMDBG=y +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_AIRO=m +CONFIG_AIRO_CS=m +CONFIG_WLAN_VENDOR_INTEL=y +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# 
CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLEGACY=m +CONFIG_IWL4965=m +CONFIG_IWL3945=m + +# +# iwl3945 / iwl4965 Debugging Options +# +CONFIG_IWLEGACY_DEBUG=y +CONFIG_IWLEGACY_DEBUGFS=y +# end of iwl3945 / iwl4965 Debugging Options + +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_LEDS=y +CONFIG_IWLDVM=m +CONFIG_IWLMVM=m +CONFIG_IWLWIFI_OPMODE_MODULAR=y +# CONFIG_IWLWIFI_BCAST_FILTERING is not set + +# +# Debugging Options +# +CONFIG_IWLWIFI_DEBUG=y +CONFIG_IWLWIFI_DEBUGFS=y +CONFIG_IWLWIFI_DEVICE_TRACING=y +# end of Debugging Options + +CONFIG_WLAN_VENDOR_INTERSIL=y +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +CONFIG_HOSTAP_FIRMWARE_NVRAM=y +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_HERMES=m +CONFIG_HERMES_PRISM=y +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m +CONFIG_ORINOCO_USB=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_SPI=m +# CONFIG_P54_SPI_DEFAULT_EEPROM is not set +CONFIG_P54_LEDS=y +CONFIG_PRISM54=m +CONFIG_WLAN_VENDOR_MARVELL=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +CONFIG_LIBERTAS_SDIO=m +CONFIG_LIBERTAS_SPI=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_LIBERTAS_MESH=y +CONFIG_LIBERTAS_THINFIRM=m +# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set +CONFIG_LIBERTAS_THINFIRM_USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_MWIFIEX_PCIE=m +CONFIG_MWIFIEX_USB=m +CONFIG_MWL8K=m +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_MT76_CORE=m +CONFIG_MT76_LEDS=y +CONFIG_MT76_USB=m +CONFIG_MT76x02_LIB=m +CONFIG_MT76x02_USB=m +CONFIG_MT76x0_COMMON=m +CONFIG_MT76x0U=m +CONFIG_MT76x0E=m +CONFIG_MT76x2_COMMON=m +CONFIG_MT76x2E=m +CONFIG_MT76x2U=m +CONFIG_MT7603E=m +CONFIG_MT7615_COMMON=m +CONFIG_MT7615E=m +# CONFIG_MT7663U is not set +# CONFIG_MT7663S is not set +# CONFIG_MT7915E is not set +CONFIG_WLAN_VENDOR_MICROCHIP=y +CONFIG_WILC1000=m +CONFIG_WILC1000_SDIO=m +CONFIG_WILC1000_SPI=m +# CONFIG_WILC1000_HW_OOB_INTR is not set +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +CONFIG_RT2400PCI=m +CONFIG_RT2500PCI=m +CONFIG_RT61PCI=m +CONFIG_RT2800PCI=m +CONFIG_RT2800PCI_RT33XX=y +CONFIG_RT2800PCI_RT35XX=y +CONFIG_RT2800PCI_RT53XX=y +CONFIG_RT2800PCI_RT3290=y +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2800_LIB_MMIO=m +CONFIG_RT2X00_LIB_MMIO=m +CONFIG_RT2X00_LIB_PCI=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m +CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +CONFIG_RT2X00_LIB_DEBUGFS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +CONFIG_RTL8180=m +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +CONFIG_RTL8192CE=m +CONFIG_RTL8192SE=m +CONFIG_RTL8192DE=m +CONFIG_RTL8723AE=m +CONFIG_RTL8723BE=m +CONFIG_RTL8188EE=m +CONFIG_RTL8192EE=m +CONFIG_RTL8821AE=m +CONFIG_RTL8192CU=m +CONFIG_RTLWIFI=m +CONFIG_RTLWIFI_PCI=m +CONFIG_RTLWIFI_USB=m +CONFIG_RTLWIFI_DEBUG=y +CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8723_COMMON=m +CONFIG_RTLBTCOEXIST=m +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_RTW88=m +CONFIG_RTW88_CORE=m +CONFIG_RTW88_PCI=m +CONFIG_RTW88_8822B=m +CONFIG_RTW88_8822C=m +CONFIG_RTW88_8822BE=m +CONFIG_RTW88_8822CE=m +# CONFIG_RTW88_8723DE is not set +# CONFIG_RTW88_8821CE is not set +CONFIG_RTW88_DEBUG=y +CONFIG_RTW88_DEBUGFS=y 
+CONFIG_WLAN_VENDOR_RSI=y +CONFIG_RSI_91X=m +CONFIG_RSI_DEBUGFS=y +CONFIG_RSI_SDIO=m +CONFIG_RSI_USB=m +CONFIG_RSI_COEX=y +CONFIG_WLAN_VENDOR_ST=y +CONFIG_CW1200=m +CONFIG_CW1200_WLAN_SDIO=m +CONFIG_CW1200_WLAN_SPI=m +CONFIG_WLAN_VENDOR_TI=y +CONFIG_WL1251=m +CONFIG_WL1251_SPI=m +CONFIG_WL1251_SDIO=m +CONFIG_WL12XX=m +CONFIG_WL18XX=m +CONFIG_WLCORE=m +CONFIG_WLCORE_SPI=m +CONFIG_WLCORE_SDIO=m +CONFIG_WILINK_PLATFORM_DATA=y +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_WLAN_VENDOR_QUANTENNA=y +CONFIG_QTNFMAC=m +CONFIG_QTNFMAC_PCIE=m +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_WL3501=m +CONFIG_MAC80211_HWSIM=m +CONFIG_USB_NET_RNDIS_WLAN=m +CONFIG_VIRT_WIFI=m + +# +# WiMAX Wireless Broadband devices +# +CONFIG_WIMAX_I2400M=m +CONFIG_WIMAX_I2400M_USB=m +CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 +# end of WiMAX Wireless Broadband devices + +# CONFIG_WAN is not set +CONFIG_IEEE802154_DRIVERS=m +CONFIG_IEEE802154_FAKELB=m +CONFIG_IEEE802154_AT86RF230=m +# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set +CONFIG_IEEE802154_MRF24J40=m +CONFIG_IEEE802154_CC2520=m +CONFIG_IEEE802154_ATUSB=m +CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set +CONFIG_IEEE802154_MCR20A=m +CONFIG_IEEE802154_HWSIM=m +CONFIG_XEN_NETDEV_FRONTEND=m +CONFIG_XEN_NETDEV_BACKEND=m +CONFIG_VMXNET3=m +CONFIG_FUJITSU_ES=m +CONFIG_USB4_NET=m +CONFIG_HYPERV_NET=m +CONFIG_NETDEVSIM=m +CONFIG_NET_FAILOVER=m +CONFIG_ISDN=y +CONFIG_ISDN_CAPI=y +CONFIG_CAPI_TRACE=y +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_MISDN=m +CONFIG_MISDN_DSP=m +CONFIG_MISDN_L1OIP=m + +# +# mISDN hardware drivers +# +CONFIG_MISDN_HFCPCI=m +CONFIG_MISDN_HFCMULTI=m +CONFIG_MISDN_HFCUSB=m +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_HDLC=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_NVM=y +CONFIG_NVM_PBLK=m +# CONFIG_NVM_PBLK_DEBUG is not set + +# +# Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=m +CONFIG_INPUT_FF_MEMLESS=m +CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m +CONFIG_INPUT_MATRIXKMAP=m + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=m +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADC=m +CONFIG_KEYBOARD_ADP5520=m +CONFIG_KEYBOARD_ADP5588=m +CONFIG_KEYBOARD_ADP5589=m +CONFIG_KEYBOARD_APPLESPI=m +CONFIG_KEYBOARD_ATKBD=m +CONFIG_KEYBOARD_QT1050=m +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_QT2160=m +CONFIG_KEYBOARD_DLINK_DIR685=m +CONFIG_KEYBOARD_LKKBD=m +CONFIG_KEYBOARD_GPIO=m +CONFIG_KEYBOARD_GPIO_POLLED=m +CONFIG_KEYBOARD_TCA6416=m +CONFIG_KEYBOARD_TCA8418=m +CONFIG_KEYBOARD_MATRIX=m +CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_LM8333=m +CONFIG_KEYBOARD_MAX7359=m +CONFIG_KEYBOARD_MCS=m +CONFIG_KEYBOARD_MPR121=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_KEYBOARD_OPENCORES=m +CONFIG_KEYBOARD_SAMSUNG=m +CONFIG_KEYBOARD_STOWAWAY=m +CONFIG_KEYBOARD_SUNKBD=m +CONFIG_KEYBOARD_STMPE=m +CONFIG_KEYBOARD_IQS62X=m +CONFIG_KEYBOARD_OMAP4=m +CONFIG_KEYBOARD_TC3589X=m +CONFIG_KEYBOARD_TM2_TOUCHKEY=m +CONFIG_KEYBOARD_TWL4030=m +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_CROS_EC=m +CONFIG_KEYBOARD_CAP11XX=m +CONFIG_KEYBOARD_BCM=m +CONFIG_KEYBOARD_MTK_PMIC=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_BYD=y 
+CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y +CONFIG_MOUSE_PS2_CYPRESS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y +CONFIG_MOUSE_PS2_SENTELIC=y +CONFIG_MOUSE_PS2_TOUCHKIT=y +CONFIG_MOUSE_PS2_FOCALTECH=y +CONFIG_MOUSE_PS2_VMMOUSE=y +CONFIG_MOUSE_PS2_SMBUS=y +CONFIG_MOUSE_SERIAL=m +CONFIG_MOUSE_APPLETOUCH=m +CONFIG_MOUSE_BCM5974=m +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=m +CONFIG_MOUSE_ELAN_I2C_I2C=y +CONFIG_MOUSE_ELAN_I2C_SMBUS=y +CONFIG_MOUSE_VSXXXAA=m +CONFIG_MOUSE_GPIO=m +CONFIG_MOUSE_SYNAPTICS_I2C=m +CONFIG_MOUSE_SYNAPTICS_USB=m +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=m +CONFIG_JOYSTICK_IFORCE_232=m +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m +CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDJOY=m +CONFIG_JOYSTICK_ZHENHUA=m +CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +CONFIG_JOYSTICK_AS5011=m +CONFIG_JOYSTICK_JOYDUMP=m +CONFIG_JOYSTICK_XPAD=m +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +CONFIG_JOYSTICK_PXRC=m +CONFIG_JOYSTICK_FSIA6B=m +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=m +CONFIG_TABLET_USB_AIPTEK=m +CONFIG_TABLET_USB_GTCO=m +CONFIG_TABLET_USB_HANWANG=m +CONFIG_TABLET_USB_KBTAB=m +CONFIG_TABLET_USB_PEGASUS=m +CONFIG_TABLET_SERIAL_WACOM4=m +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_PROPERTIES=y +CONFIG_TOUCHSCREEN_88PM860X=m +CONFIG_TOUCHSCREEN_ADS7846=m +CONFIG_TOUCHSCREEN_AD7877=m +CONFIG_TOUCHSCREEN_AD7879=m +CONFIG_TOUCHSCREEN_AD7879_I2C=m +CONFIG_TOUCHSCREEN_AD7879_SPI=m +CONFIG_TOUCHSCREEN_ADC=m +CONFIG_TOUCHSCREEN_AR1021_I2C=m +CONFIG_TOUCHSCREEN_ATMEL_MXT=m +CONFIG_TOUCHSCREEN_ATMEL_MXT_T37=y +CONFIG_TOUCHSCREEN_AUO_PIXCIR=m +CONFIG_TOUCHSCREEN_BU21013=m +CONFIG_TOUCHSCREEN_BU21029=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8318=m +CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m +CONFIG_TOUCHSCREEN_CY8CTMA140=m +CONFIG_TOUCHSCREEN_CY8CTMG110=m +CONFIG_TOUCHSCREEN_CYTTSP_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP_SPI=m +CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m +CONFIG_TOUCHSCREEN_DA9034=m +CONFIG_TOUCHSCREEN_DA9052=m +CONFIG_TOUCHSCREEN_DYNAPRO=m +CONFIG_TOUCHSCREEN_HAMPSHIRE=m +CONFIG_TOUCHSCREEN_EETI=m +CONFIG_TOUCHSCREEN_EGALAX=m +CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m +CONFIG_TOUCHSCREEN_EXC3000=m +CONFIG_TOUCHSCREEN_FUJITSU=m +CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_HIDEEP=m +CONFIG_TOUCHSCREEN_ILI210X=m +CONFIG_TOUCHSCREEN_S6SY761=m +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_TOUCHSCREEN_EKTF2127=m +CONFIG_TOUCHSCREEN_ELAN=m +CONFIG_TOUCHSCREEN_ELO=m +CONFIG_TOUCHSCREEN_WACOM_W8001=m +CONFIG_TOUCHSCREEN_WACOM_I2C=m +CONFIG_TOUCHSCREEN_MAX11801=m +CONFIG_TOUCHSCREEN_MCS5000=m +CONFIG_TOUCHSCREEN_MMS114=m +CONFIG_TOUCHSCREEN_MELFAS_MIP4=m +CONFIG_TOUCHSCREEN_MTOUCH=m +CONFIG_TOUCHSCREEN_IMX6UL_TSC=m +CONFIG_TOUCHSCREEN_INEXIO=m +CONFIG_TOUCHSCREEN_MK712=m +CONFIG_TOUCHSCREEN_PENMOUNT=m +CONFIG_TOUCHSCREEN_EDT_FT5X06=m 
+CONFIG_TOUCHSCREEN_TOUCHRIGHT=m +CONFIG_TOUCHSCREEN_TOUCHWIN=m +CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m +CONFIG_TOUCHSCREEN_UCB1400=m +CONFIG_TOUCHSCREEN_PIXCIR=m +CONFIG_TOUCHSCREEN_WDT87XX_I2C=m +CONFIG_TOUCHSCREEN_WM831X=m +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_TOUCHSCREEN_WM9705=y +CONFIG_TOUCHSCREEN_WM9712=y +CONFIG_TOUCHSCREEN_WM9713=y +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_MC13783=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y +CONFIG_TOUCHSCREEN_USB_PANJIT=y +CONFIG_TOUCHSCREEN_USB_3M=y +CONFIG_TOUCHSCREEN_USB_ITM=y +CONFIG_TOUCHSCREEN_USB_ETURBO=y +CONFIG_TOUCHSCREEN_USB_GUNZE=y +CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y +CONFIG_TOUCHSCREEN_USB_IRTOUCH=y +CONFIG_TOUCHSCREEN_USB_IDEALTEK=y +CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y +CONFIG_TOUCHSCREEN_USB_GOTOP=y +CONFIG_TOUCHSCREEN_USB_JASTEC=y +CONFIG_TOUCHSCREEN_USB_ELO=y +CONFIG_TOUCHSCREEN_USB_E2I=y +CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y +CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y +CONFIG_TOUCHSCREEN_USB_NEXIO=y +CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y +CONFIG_TOUCHSCREEN_TOUCHIT213=m +CONFIG_TOUCHSCREEN_TSC_SERIO=m +CONFIG_TOUCHSCREEN_TSC200X_CORE=m +CONFIG_TOUCHSCREEN_TSC2004=m +CONFIG_TOUCHSCREEN_TSC2005=m +CONFIG_TOUCHSCREEN_TSC2007=m +CONFIG_TOUCHSCREEN_TSC2007_IIO=y +CONFIG_TOUCHSCREEN_PCAP=m +CONFIG_TOUCHSCREEN_RM_TS=m +CONFIG_TOUCHSCREEN_SILEAD=m +CONFIG_TOUCHSCREEN_SIS_I2C=m +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMFTS=m +CONFIG_TOUCHSCREEN_STMPE=m +CONFIG_TOUCHSCREEN_SUR40=m +CONFIG_TOUCHSCREEN_SURFACE3_SPI=m +CONFIG_TOUCHSCREEN_SX8654=m +CONFIG_TOUCHSCREEN_TPS6507X=m +CONFIG_TOUCHSCREEN_ZET6223=m +CONFIG_TOUCHSCREEN_ZFORCE=m +CONFIG_TOUCHSCREEN_COLIBRI_VF50=m +CONFIG_TOUCHSCREEN_ROHM_BU21023=m +CONFIG_TOUCHSCREEN_IQS5XX=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_88PM860X_ONKEY=m +CONFIG_INPUT_88PM80X_ONKEY=m +CONFIG_INPUT_AD714X=m +CONFIG_INPUT_AD714X_I2C=m +CONFIG_INPUT_AD714X_SPI=m +CONFIG_INPUT_ARIZONA_HAPTICS=m +CONFIG_INPUT_ATMEL_CAPTOUCH=m +CONFIG_INPUT_BMA150=m +CONFIG_INPUT_E3X0_BUTTON=m +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_MAX77650_ONKEY=m +CONFIG_INPUT_MAX77693_HAPTIC=m +CONFIG_INPUT_MAX8925_ONKEY=m +CONFIG_INPUT_MAX8997_HAPTIC=m +CONFIG_INPUT_MC13783_PWRBUTTON=m +CONFIG_INPUT_MMA8450=m +CONFIG_INPUT_APANEL=m +CONFIG_INPUT_GPIO_BEEPER=m +CONFIG_INPUT_GPIO_DECODER=m +CONFIG_INPUT_GPIO_VIBRA=m +CONFIG_INPUT_CPCAP_PWRBUTTON=m +CONFIG_INPUT_ATLAS_BTNS=m +CONFIG_INPUT_ATI_REMOTE2=m +CONFIG_INPUT_KEYSPAN_REMOTE=m +CONFIG_INPUT_KXTJ9=m +CONFIG_INPUT_POWERMATE=m +CONFIG_INPUT_YEALINK=m +CONFIG_INPUT_CM109=m +CONFIG_INPUT_REGULATOR_HAPTIC=m +CONFIG_INPUT_RETU_PWRBUTTON=m +CONFIG_INPUT_TPS65218_PWRBUTTON=m +CONFIG_INPUT_AXP20X_PEK=m +CONFIG_INPUT_TWL4030_PWRBUTTON=m +CONFIG_INPUT_TWL4030_VIBRA=m +CONFIG_INPUT_TWL6040_VIBRA=m +CONFIG_INPUT_UINPUT=m +CONFIG_INPUT_PALMAS_PWRBUTTON=m +CONFIG_INPUT_PCF50633_PMU=m +CONFIG_INPUT_PCF8574=m +CONFIG_INPUT_PWM_BEEPER=m +CONFIG_INPUT_PWM_VIBRA=m +CONFIG_INPUT_RK805_PWRKEY=m +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +CONFIG_INPUT_DA9052_ONKEY=m +CONFIG_INPUT_DA9055_ONKEY=m +CONFIG_INPUT_DA9063_ONKEY=m +CONFIG_INPUT_WM831X_ON=m +CONFIG_INPUT_PCAP=m +CONFIG_INPUT_ADXL34X=m +CONFIG_INPUT_ADXL34X_I2C=m +CONFIG_INPUT_ADXL34X_SPI=m +CONFIG_INPUT_IMS_PCU=m +CONFIG_INPUT_IQS269A=m +CONFIG_INPUT_CMA3000=m +CONFIG_INPUT_CMA3000_I2C=m +CONFIG_INPUT_XEN_KBDDEV_FRONTEND=m +CONFIG_INPUT_IDEAPAD_SLIDEBAR=m +CONFIG_INPUT_SOC_BUTTON_ARRAY=m +CONFIG_INPUT_DRV260X_HAPTICS=m +CONFIG_INPUT_DRV2665_HAPTICS=m +CONFIG_INPUT_DRV2667_HAPTICS=m +CONFIG_INPUT_RAVE_SP_PWRBUTTON=m +CONFIG_INPUT_STPMIC1_ONKEY=m +CONFIG_RMI4_CORE=m 
+CONFIG_RMI4_I2C=m +CONFIG_RMI4_SPI=m +CONFIG_RMI4_SMB=m +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=m +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +CONFIG_RMI4_F34=y +# CONFIG_RMI4_F54 is not set +CONFIG_RMI4_F55=y + +# +# Hardware I/O ports +# +CONFIG_SERIO=m +CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y +CONFIG_SERIO_I8042=m +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m +CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_LIBPS2=m +CONFIG_SERIO_RAW=m +CONFIG_SERIO_ALTERA_PS2=m +CONFIG_SERIO_PS2MULT=m +CONFIG_SERIO_ARC_PS2=m +# CONFIG_SERIO_APBPS2 is not set +CONFIG_HYPERV_KEYBOARD=m +CONFIG_SERIO_GPIO_PS2=m +CONFIG_USERIO=m +CONFIG_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_FM801=m +# end of Hardware I/O ports +# end of Input device support + +# +# Character devices +# +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y +CONFIG_VT_CONSOLE_SLEEP=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +# CONFIG_LEGACY_PTYS is not set +CONFIG_LDISC_AUTOLOAD=y + +# +# Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_PNP=y +# CONFIG_SERIAL_8250_16550A_VARIANTS is not set +CONFIG_SERIAL_8250_FINTEK=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DMA=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=m +CONFIG_SERIAL_8250_CS=m +CONFIG_SERIAL_8250_MEN_MCB=m +CONFIG_SERIAL_8250_NR_UARTS=32 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_ASPEED_VUART=m +CONFIG_SERIAL_8250_SHARE_IRQ=y +# CONFIG_SERIAL_8250_DETECT_IRQ is not set +CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_8250_DWLIB=y +CONFIG_SERIAL_8250_DW=m +CONFIG_SERIAL_8250_RT288X=y +CONFIG_SERIAL_8250_LPSS=y +CONFIG_SERIAL_8250_MID=y +CONFIG_SERIAL_OF_PLATFORM=m + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_MAX3100=m +CONFIG_SERIAL_MAX310X=m +# CONFIG_SERIAL_IMX_EARLYCON is not set +CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_SERIAL_JSM=m +CONFIG_SERIAL_SIFIVE=m +CONFIG_SERIAL_LANTIQ=m +CONFIG_SERIAL_SCCNXP=m +CONFIG_SERIAL_SC16IS7XX_CORE=m +CONFIG_SERIAL_SC16IS7XX=m +CONFIG_SERIAL_SC16IS7XX_I2C=y +CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_ALTERA_JTAGUART=m +CONFIG_SERIAL_ALTERA_UART=m +CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 +CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 +CONFIG_SERIAL_IFX6X60=m +CONFIG_SERIAL_XILINX_PS_UART=m +CONFIG_SERIAL_ARC=m +CONFIG_SERIAL_ARC_NR_PORTS=1 +CONFIG_SERIAL_RP2=m +CONFIG_SERIAL_RP2_NR_UARTS=32 +CONFIG_SERIAL_FSL_LPUART=m +CONFIG_SERIAL_FSL_LINFLEXUART=m +CONFIG_SERIAL_CONEXANT_DIGICOLOR=m +CONFIG_SERIAL_MEN_Z135=m +CONFIG_SERIAL_SPRD=m +# end of Serial drivers + +CONFIG_SERIAL_MCTRL_GPIO=y +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +CONFIG_CYZ_INTR=y +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_SYNCLINK_GT=m +CONFIG_ISI=m +CONFIG_N_HDLC=m +CONFIG_N_GSM=m +CONFIG_NOZOMI=m +CONFIG_NULL_TTY=m +CONFIG_TRACE_ROUTER=m +CONFIG_TRACE_SINK=m +CONFIG_HVC_DRIVER=y +CONFIG_HVC_IRQ=y +CONFIG_HVC_XEN=y +CONFIG_HVC_XEN_FRONTEND=y +CONFIG_SERIAL_DEV_BUS=y +CONFIG_SERIAL_DEV_CTRL_TTYPORT=y +# CONFIG_TTY_PRINTK is not set +CONFIG_PRINTER=m +# CONFIG_LP_CONSOLE is not set +CONFIG_PPDEV=m +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DMI_DECODE=y 
+CONFIG_IPMI_PLAT_DATA=y +# CONFIG_IPMI_PANIC_EVENT is not set +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m +CONFIG_IPMI_SSIF=m +CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m +CONFIG_IPMB_DEVICE_INTERFACE=m +CONFIG_HW_RANDOM=m +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_HW_RANDOM_INTEL=m +CONFIG_HW_RANDOM_AMD=m +# CONFIG_HW_RANDOM_BA431 is not set +CONFIG_HW_RANDOM_VIA=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_HW_RANDOM_CCTRNG=m +CONFIG_APPLICOM=m + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m +CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m +CONFIG_SCR24X=m +CONFIG_IPWIRELESS=m +# end of PCMCIA character devices + +CONFIG_MWAVE=m +CONFIG_DEVMEM=y +# CONFIG_DEVKMEM is not set +CONFIG_NVRAM=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +CONFIG_DEVPORT=y +CONFIG_HPET=y +CONFIG_HPET_MMAP=y +CONFIG_HPET_MMAP_DEFAULT=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TCG_TPM=m +CONFIG_HW_RANDOM_TPM=y +CONFIG_TCG_TIS_CORE=m +CONFIG_TCG_TIS=m +CONFIG_TCG_TIS_SPI=m +CONFIG_TCG_TIS_SPI_CR50=y +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m +CONFIG_TCG_INFINEON=m +CONFIG_TCG_XEN=m +CONFIG_TCG_CRB=m +CONFIG_TCG_VTPM_PROXY=m +CONFIG_TCG_TIS_ST33ZP24=m +CONFIG_TCG_TIS_ST33ZP24_I2C=m +CONFIG_TCG_TIS_ST33ZP24_SPI=m +CONFIG_TELCLOCK=m +CONFIG_XILLYBUS=m +CONFIG_XILLYBUS_PCIE=m +CONFIG_XILLYBUS_OF=m +# end of Character devices + +# CONFIG_RANDOM_TRUST_CPU is not set +# CONFIG_RANDOM_TRUST_BOOTLOADER is not set + +# +# I2C support +# +CONFIG_I2C=y +CONFIG_ACPI_I2C_OPREGION=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_COMPAT=y +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m + +# +# Multiplexer I2C Chip support +# +CONFIG_I2C_ARB_GPIO_CHALLENGE=m +CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_GPMUX=m +CONFIG_I2C_MUX_LTC4306=m +CONFIG_I2C_MUX_PCA9541=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_MUX_PINCTRL=m +CONFIG_I2C_MUX_REG=m +CONFIG_I2C_DEMUX_PINCTRL=m +CONFIG_I2C_MUX_MLXCPLD=m +# end of Multiplexer I2C Chip support + +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_SMBUS=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCA=m + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI1563=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m +CONFIG_I2C_AMD756_S4882=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_AMD_MP2=m +CONFIG_I2C_I801=m +CONFIG_I2C_ISCH=m +CONFIG_I2C_ISMT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_CHT_WC=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_NFORCE2_S4985=m +CONFIG_I2C_NVIDIA_GPU=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m + +# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CBUS_GPIO=m +CONFIG_I2C_DESIGNWARE_CORE=y +CONFIG_I2C_DESIGNWARE_SLAVE=y +CONFIG_I2C_DESIGNWARE_PLATFORM=y +CONFIG_I2C_DESIGNWARE_BAYTRAIL=y +CONFIG_I2C_DESIGNWARE_PCI=m +CONFIG_I2C_EMEV2=m +CONFIG_I2C_GPIO=m +# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +CONFIG_I2C_KEMPLD=m +CONFIG_I2C_OCORES=m +CONFIG_I2C_PCA_PLATFORM=m +CONFIG_I2C_RK3X=m +CONFIG_I2C_SIMTEC=m +CONFIG_I2C_XILINX=m + +# +# External I2C/SMBus adapter drivers +# +CONFIG_I2C_DIOLAN_U2C=m +CONFIG_I2C_DLN2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_ROBOTFUZZ_OSIF=m +CONFIG_I2C_TAOS_EVM=m +CONFIG_I2C_TINY_USB=m +CONFIG_I2C_VIPERBOARD=m + +# +# Other I2C/SMBus bus drivers +# +CONFIG_I2C_MLXCPLD=m +CONFIG_I2C_CROS_EC_TUNNEL=m +CONFIG_I2C_FSI=m +# end of I2C Hardware Bus support + +CONFIG_I2C_STUB=m +CONFIG_I2C_SLAVE=y +CONFIG_I2C_SLAVE_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# 
CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# end of I2C support + +CONFIG_I3C=m +CONFIG_CDNS_I3C_MASTER=m +CONFIG_DW_I3C_MASTER=m +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y +CONFIG_SPI_MEM=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_ALTERA=m +CONFIG_SPI_AXI_SPI_ENGINE=m +CONFIG_SPI_BITBANG=m +CONFIG_SPI_BUTTERFLY=m +CONFIG_SPI_CADENCE=m +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_DMA=y +CONFIG_SPI_DW_PCI=m +CONFIG_SPI_DW_MMIO=m +CONFIG_SPI_DLN2=m +CONFIG_SPI_FSI=m +CONFIG_SPI_NXP_FLEXSPI=m +CONFIG_SPI_GPIO=m +CONFIG_SPI_LM70_LLP=m +CONFIG_SPI_FSL_LIB=m +CONFIG_SPI_FSL_SPI=m +# CONFIG_SPI_LANTIQ_SSC is not set +CONFIG_SPI_OC_TINY=m +CONFIG_SPI_PXA2XX=m +CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_SC18IS602=m +CONFIG_SPI_SIFIVE=m +CONFIG_SPI_MXIC=m +CONFIG_SPI_XCOMM=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_ZYNQMP_GQSPI=m +CONFIG_SPI_AMD=m + +# +# SPI Multiplexer support +# +CONFIG_SPI_MUX=m + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=m +CONFIG_SPI_LOOPBACK_TEST=m +CONFIG_SPI_TLE62X0=m +CONFIG_SPI_SLAVE=y +CONFIG_SPI_SLAVE_TIME=m +CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m +CONFIG_SPMI=m +CONFIG_HSI=m +CONFIG_HSI_BOARDINFO=y + +# +# HSI controllers +# + +# +# HSI clients +# +CONFIG_HSI_CHAR=m +CONFIG_PPS=y +# CONFIG_PPS_DEBUG is not set + +# +# PPS clients support +# +CONFIG_PPS_CLIENT_KTIMER=m +CONFIG_PPS_CLIENT_LDISC=m +CONFIG_PPS_CLIENT_PARPORT=m +CONFIG_PPS_CLIENT_GPIO=m + +# +# PPS generators support +# + +# +# PTP clock support +# +CONFIG_PTP_1588_CLOCK=y +CONFIG_DP83640_PHY=m +CONFIG_PTP_1588_CLOCK_INES=m +CONFIG_PTP_1588_CLOCK_KVM=m +CONFIG_PTP_1588_CLOCK_IDT82P33=m +CONFIG_PTP_1588_CLOCK_IDTCM=m +CONFIG_PTP_1588_CLOCK_VMW=m +# end of PTP clock support + +CONFIG_PINCTRL=y +CONFIG_GENERIC_PINCTRL_GROUPS=y +CONFIG_PINMUX=y +CONFIG_GENERIC_PINMUX_FUNCTIONS=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +CONFIG_PINCTRL_AS3722=m +CONFIG_PINCTRL_AXP209=m +CONFIG_PINCTRL_AMD=m +CONFIG_PINCTRL_DA9062=m +CONFIG_PINCTRL_MCP23S08_I2C=m +CONFIG_PINCTRL_MCP23S08_SPI=m +CONFIG_PINCTRL_MCP23S08=m +CONFIG_PINCTRL_SINGLE=m +CONFIG_PINCTRL_SX150X=y +CONFIG_PINCTRL_STMFX=m +CONFIG_PINCTRL_MAX77620=m +CONFIG_PINCTRL_PALMAS=m +CONFIG_PINCTRL_RK805=m +CONFIG_PINCTRL_OCELOT=y +CONFIG_PINCTRL_BAYTRAIL=y +CONFIG_PINCTRL_CHERRYVIEW=y +CONFIG_PINCTRL_LYNXPOINT=y +CONFIG_PINCTRL_INTEL=y +CONFIG_PINCTRL_BROXTON=y +CONFIG_PINCTRL_CANNONLAKE=y +CONFIG_PINCTRL_CEDARFORK=y +CONFIG_PINCTRL_DENVERTON=y +# CONFIG_PINCTRL_EMMITSBURG is not set +CONFIG_PINCTRL_GEMINILAKE=y +CONFIG_PINCTRL_ICELAKE=y +CONFIG_PINCTRL_JASPERLAKE=y +CONFIG_PINCTRL_LEWISBURG=y +CONFIG_PINCTRL_SUNRISEPOINT=y +CONFIG_PINCTRL_TIGERLAKE=y +CONFIG_PINCTRL_LOCHNAGAR=m +CONFIG_PINCTRL_MADERA=m +CONFIG_PINCTRL_CS47L15=y +CONFIG_PINCTRL_CS47L35=y +CONFIG_PINCTRL_CS47L85=y +CONFIG_PINCTRL_CS47L90=y +CONFIG_PINCTRL_CS47L92=y +CONFIG_PINCTRL_EQUILIBRIUM=m +CONFIG_GPIOLIB=y +CONFIG_GPIOLIB_FASTPATH_LIMIT=512 +CONFIG_OF_GPIO=y +CONFIG_GPIO_ACPI=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC=y +CONFIG_GPIO_MAX730X=m + +# +# Memory mapped GPIO drivers +# +CONFIG_GPIO_74XX_MMIO=m +CONFIG_GPIO_ALTERA=m +CONFIG_GPIO_AMDPT=m +CONFIG_GPIO_CADENCE=m +CONFIG_GPIO_DWAPB=m +CONFIG_GPIO_EXAR=m +CONFIG_GPIO_FTGPIO010=y +CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_GRGPIO=m +CONFIG_GPIO_HLWD=m +CONFIG_GPIO_ICH=m +CONFIG_GPIO_LOGICVC=m +CONFIG_GPIO_MB86S7X=m +CONFIG_GPIO_MENZ127=m +CONFIG_GPIO_SAMA5D2_PIOBU=m 
+CONFIG_GPIO_SIFIVE=y +CONFIG_GPIO_SIOX=m +CONFIG_GPIO_SYSCON=m +CONFIG_GPIO_VX855=m +CONFIG_GPIO_WCD934X=m +CONFIG_GPIO_XILINX=m +CONFIG_GPIO_AMD_FCH=m +# end of Memory mapped GPIO drivers + +# +# Port-mapped I/O GPIO drivers +# +CONFIG_GPIO_F7188X=m +CONFIG_GPIO_IT87=m +CONFIG_GPIO_SCH=m +CONFIG_GPIO_SCH311X=m +CONFIG_GPIO_WINBOND=m +CONFIG_GPIO_WS16C48=m +# end of Port-mapped I/O GPIO drivers + +# +# I2C GPIO expanders +# +CONFIG_GPIO_ADP5588=m +CONFIG_GPIO_ADNP=m +CONFIG_GPIO_GW_PLD=m +CONFIG_GPIO_MAX7300=m +CONFIG_GPIO_MAX732X=m +CONFIG_GPIO_PCA953X=m +# CONFIG_GPIO_PCA953X_IRQ is not set +# CONFIG_GPIO_PCA9570 is not set +CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_TPIC2810=m +# end of I2C GPIO expanders + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ADP5520=m +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_BD70528=m +CONFIG_GPIO_BD71828=m +CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_CRYSTAL_COVE=m +CONFIG_GPIO_DA9052=m +CONFIG_GPIO_DA9055=m +CONFIG_GPIO_DLN2=m +CONFIG_GPIO_JANZ_TTL=m +CONFIG_GPIO_KEMPLD=m +CONFIG_GPIO_LP3943=m +CONFIG_GPIO_LP873X=m +CONFIG_GPIO_LP87565=m +CONFIG_GPIO_MADERA=m +CONFIG_GPIO_MAX77620=m +CONFIG_GPIO_MAX77650=m +# CONFIG_GPIO_MSIC is not set +CONFIG_GPIO_PALMAS=y +CONFIG_GPIO_RC5T583=y +CONFIG_GPIO_STMPE=y +CONFIG_GPIO_TC3589X=y +CONFIG_GPIO_TPS65086=m +CONFIG_GPIO_TPS65218=m +CONFIG_GPIO_TPS6586X=y +CONFIG_GPIO_TPS65910=y +CONFIG_GPIO_TPS65912=m +CONFIG_GPIO_TPS68470=y +CONFIG_GPIO_TQMX86=m +CONFIG_GPIO_TWL4030=m +CONFIG_GPIO_TWL6040=m +CONFIG_GPIO_UCB1400=m +CONFIG_GPIO_WM831X=m +CONFIG_GPIO_WM8350=m +CONFIG_GPIO_WM8994=m +# end of MFD GPIO expanders + +# +# PCI GPIO expanders +# +CONFIG_GPIO_AMD8111=m +CONFIG_GPIO_ML_IOH=m +CONFIG_GPIO_PCI_IDIO_16=m +CONFIG_GPIO_PCIE_IDIO_24=m +CONFIG_GPIO_RDC321X=m +CONFIG_GPIO_SODAVILLE=y +# end of PCI GPIO expanders + +# +# SPI GPIO expanders +# +CONFIG_GPIO_74X164=m +CONFIG_GPIO_MAX3191X=m +CONFIG_GPIO_MAX7301=m +CONFIG_GPIO_MC33880=m +CONFIG_GPIO_PISOSR=m +CONFIG_GPIO_XRA1403=m +CONFIG_GPIO_MOXTET=m +# end of SPI GPIO expanders + +# +# USB GPIO expanders +# +CONFIG_GPIO_VIPERBOARD=m +# end of USB GPIO expanders + +CONFIG_GPIO_AGGREGATOR=m +CONFIG_GPIO_MOCKUP=m +CONFIG_W1=m +CONFIG_W1_CON=y + +# +# 1-wire Bus Masters +# +CONFIG_W1_MASTER_MATROX=m +CONFIG_W1_MASTER_DS2490=m +CONFIG_W1_MASTER_DS2482=m +CONFIG_W1_MASTER_DS1WM=m +CONFIG_W1_MASTER_GPIO=m +CONFIG_W1_MASTER_SGI=m +# end of 1-wire Bus Masters + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +CONFIG_W1_SLAVE_SMEM=m +CONFIG_W1_SLAVE_DS2405=m +CONFIG_W1_SLAVE_DS2408=m +# CONFIG_W1_SLAVE_DS2408_READBACK is not set +CONFIG_W1_SLAVE_DS2413=m +CONFIG_W1_SLAVE_DS2406=m +CONFIG_W1_SLAVE_DS2423=m +CONFIG_W1_SLAVE_DS2805=m +CONFIG_W1_SLAVE_DS2430=m +CONFIG_W1_SLAVE_DS2431=m +CONFIG_W1_SLAVE_DS2433=m +# CONFIG_W1_SLAVE_DS2433_CRC is not set +CONFIG_W1_SLAVE_DS2438=m +CONFIG_W1_SLAVE_DS250X=m +CONFIG_W1_SLAVE_DS2780=m +CONFIG_W1_SLAVE_DS2781=m +CONFIG_W1_SLAVE_DS28E04=m +CONFIG_W1_SLAVE_DS28E17=m +# end of 1-wire Slaves + +CONFIG_POWER_AVS=y +CONFIG_QCOM_CPR=m +CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_AS3722=y +CONFIG_POWER_RESET_GPIO=y +CONFIG_POWER_RESET_GPIO_RESTART=y +CONFIG_POWER_RESET_LTC2952=y +CONFIG_POWER_RESET_MT6323=y +CONFIG_POWER_RESET_RESTART=y +CONFIG_POWER_RESET_SYSCON=y +CONFIG_POWER_RESET_SYSCON_POWEROFF=y +CONFIG_REBOOT_MODE=m +CONFIG_SYSCON_REBOOT_MODE=m +CONFIG_NVMEM_REBOOT_MODE=m +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_POWER_SUPPLY_HWMON=y +CONFIG_PDA_POWER=m +CONFIG_GENERIC_ADC_BATTERY=m +CONFIG_MAX8925_POWER=m +CONFIG_WM831X_BACKUP=m 
+CONFIG_WM831X_POWER=m +CONFIG_WM8350_POWER=m +CONFIG_TEST_POWER=m +CONFIG_BATTERY_88PM860X=m +CONFIG_CHARGER_ADP5061=m +CONFIG_BATTERY_ACT8945A=m +CONFIG_BATTERY_CPCAP=m +CONFIG_BATTERY_CW2015=m +CONFIG_BATTERY_DS2760=m +CONFIG_BATTERY_DS2780=m +CONFIG_BATTERY_DS2781=m +CONFIG_BATTERY_DS2782=m +CONFIG_BATTERY_LEGO_EV3=m +CONFIG_BATTERY_SBS=m +CONFIG_CHARGER_SBS=m +CONFIG_MANAGER_SBS=m +CONFIG_BATTERY_BQ27XXX=m +CONFIG_BATTERY_BQ27XXX_I2C=m +CONFIG_BATTERY_BQ27XXX_HDQ=m +# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set +CONFIG_BATTERY_DA9030=m +CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_DA9150=m +CONFIG_BATTERY_DA9150=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m +CONFIG_AXP288_CHARGER=m +CONFIG_AXP288_FUEL_GAUGE=m +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_BATTERY_MAX1721X=m +CONFIG_BATTERY_TWL4030_MADC=m +CONFIG_CHARGER_88PM860X=m +CONFIG_CHARGER_PCF50633=m +CONFIG_BATTERY_RX51=m +CONFIG_CHARGER_ISP1704=m +CONFIG_CHARGER_MAX8903=m +CONFIG_CHARGER_TWL4030=m +CONFIG_CHARGER_LP8727=m +CONFIG_CHARGER_LP8788=m +CONFIG_CHARGER_GPIO=m +CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_LT3651=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_DETECTOR_MAX14656=m +CONFIG_CHARGER_MAX77650=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_MAX8997=m +CONFIG_CHARGER_MAX8998=m +CONFIG_CHARGER_BQ2415X=m +CONFIG_CHARGER_BQ24190=m +CONFIG_CHARGER_BQ24257=m +CONFIG_CHARGER_BQ24735=m +# CONFIG_CHARGER_BQ2515X is not set +CONFIG_CHARGER_BQ25890=m +CONFIG_CHARGER_SMB347=m +CONFIG_CHARGER_TPS65090=m +CONFIG_CHARGER_TPS65217=m +CONFIG_BATTERY_GAUGE_LTC2941=m +CONFIG_BATTERY_RT5033=m +CONFIG_CHARGER_RT9455=m +CONFIG_CHARGER_CROS_USBPD=m +CONFIG_CHARGER_UCS1002=m +CONFIG_CHARGER_BD70528=m +CONFIG_CHARGER_BD99954=m +CONFIG_CHARGER_WILCO=m +CONFIG_HWMON=y +CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +CONFIG_SENSORS_ABITUGURU=m +CONFIG_SENSORS_ABITUGURU3=m +CONFIG_SENSORS_AD7314=m +CONFIG_SENSORS_AD7414=m +CONFIG_SENSORS_AD7418=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM1026=m +CONFIG_SENSORS_ADM1029=m +CONFIG_SENSORS_ADM1031=m +CONFIG_SENSORS_ADM1177=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_ADT7X10=m +CONFIG_SENSORS_ADT7310=m +CONFIG_SENSORS_ADT7410=m +CONFIG_SENSORS_ADT7411=m +CONFIG_SENSORS_ADT7462=m +CONFIG_SENSORS_ADT7470=m +CONFIG_SENSORS_ADT7475=m +CONFIG_SENSORS_AS370=m +CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_AXI_FAN_CONTROL=m +CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m +CONFIG_SENSORS_FAM15H_POWER=m +CONFIG_SENSORS_AMD_ENERGY=m +CONFIG_SENSORS_APPLESMC=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m +CONFIG_SENSORS_ATXP1=m +# CONFIG_SENSORS_CORSAIR_CPRO is not set +CONFIG_SENSORS_DRIVETEMP=m +CONFIG_SENSORS_DS620=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_DELL_SMM=m +CONFIG_SENSORS_DA9052_ADC=m +CONFIG_SENSORS_DA9055=m +CONFIG_SENSORS_I5K_AMB=m +CONFIG_SENSORS_F71805F=m +CONFIG_SENSORS_F71882FG=m +CONFIG_SENSORS_F75375S=m +# CONFIG_SENSORS_GSC is not set +CONFIG_SENSORS_MC13783_ADC=m +CONFIG_SENSORS_FSCHMD=m +CONFIG_SENSORS_FTSTEUTATES=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_G760A=m +CONFIG_SENSORS_G762=m +CONFIG_SENSORS_GPIO_FAN=m +CONFIG_SENSORS_HIH6130=m +CONFIG_SENSORS_IBMAEM=m +CONFIG_SENSORS_IBMPEX=m +CONFIG_SENSORS_IIO_HWMON=m +CONFIG_SENSORS_I5500=m +CONFIG_SENSORS_CORETEMP=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_JC42=m +CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LINEAGE=m +CONFIG_SENSORS_LOCHNAGAR=m +CONFIG_SENSORS_LTC2945=m +CONFIG_SENSORS_LTC2947=m 
+CONFIG_SENSORS_LTC2947_I2C=m +CONFIG_SENSORS_LTC2947_SPI=m +CONFIG_SENSORS_LTC2990=m +CONFIG_SENSORS_LTC4151=m +CONFIG_SENSORS_LTC4215=m +CONFIG_SENSORS_LTC4222=m +CONFIG_SENSORS_LTC4245=m +CONFIG_SENSORS_LTC4260=m +CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_MAX1111=m +CONFIG_SENSORS_MAX16065=m +CONFIG_SENSORS_MAX1619=m +CONFIG_SENSORS_MAX1668=m +CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m +CONFIG_SENSORS_MAX31730=m +CONFIG_SENSORS_MAX6621=m +CONFIG_SENSORS_MAX6639=m +CONFIG_SENSORS_MAX6642=m +CONFIG_SENSORS_MAX6650=m +CONFIG_SENSORS_MAX6697=m +CONFIG_SENSORS_MAX31790=m +CONFIG_SENSORS_MCP3021=m +CONFIG_SENSORS_MLXREG_FAN=m +CONFIG_SENSORS_TC654=m +CONFIG_SENSORS_MENF21BMC_HWMON=m +CONFIG_SENSORS_ADCXX=m +CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM70=m +CONFIG_SENSORS_LM73=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM77=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_LM93=m +CONFIG_SENSORS_LM95234=m +CONFIG_SENSORS_LM95241=m +CONFIG_SENSORS_LM95245=m +CONFIG_SENSORS_PC87360=m +CONFIG_SENSORS_PC87427=m +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_SENSORS_NCT6683=m +CONFIG_SENSORS_NCT6775=m +CONFIG_SENSORS_NCT7802=m +CONFIG_SENSORS_NCT7904=m +CONFIG_SENSORS_NPCM7XX=m +CONFIG_SENSORS_PCF8591=m +CONFIG_PMBUS=m +CONFIG_SENSORS_PMBUS=m +CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_BEL_PFE=m +CONFIG_SENSORS_IBM_CFFPS=m +CONFIG_SENSORS_INSPUR_IPSPS=m +CONFIG_SENSORS_IR35221=m +CONFIG_SENSORS_IR38064=m +CONFIG_SENSORS_IRPS5401=m +CONFIG_SENSORS_ISL68137=m +CONFIG_SENSORS_LM25066=m +CONFIG_SENSORS_LTC2978=m +# CONFIG_SENSORS_LTC2978_REGULATOR is not set +CONFIG_SENSORS_LTC3815=m +CONFIG_SENSORS_MAX16064=m +CONFIG_SENSORS_MAX16601=m +CONFIG_SENSORS_MAX20730=m +CONFIG_SENSORS_MAX20751=m +CONFIG_SENSORS_MAX31785=m +CONFIG_SENSORS_MAX34440=m +CONFIG_SENSORS_MAX8688=m +CONFIG_SENSORS_PXE1610=m +CONFIG_SENSORS_TPS40422=m +CONFIG_SENSORS_TPS53679=m +CONFIG_SENSORS_UCD9000=m +CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_XDPE122=m +CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_PWM_FAN=m +CONFIG_SENSORS_SHT15=m +CONFIG_SENSORS_SHT21=m +CONFIG_SENSORS_SHT3x=m +CONFIG_SENSORS_SHTC1=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_DME1737=m +CONFIG_SENSORS_EMC1403=m +CONFIG_SENSORS_EMC2103=m +CONFIG_SENSORS_EMC6W201=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_SMSC47M192=m +CONFIG_SENSORS_SMSC47B397=m +CONFIG_SENSORS_SCH56XX_COMMON=m +CONFIG_SENSORS_SCH5627=m +CONFIG_SENSORS_SCH5636=m +CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SMM665=m +CONFIG_SENSORS_ADC128D818=m +CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_ADS7871=m +CONFIG_SENSORS_AMC6821=m +CONFIG_SENSORS_INA209=m +CONFIG_SENSORS_INA2XX=m +CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_TC74=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_TMP102=m +CONFIG_SENSORS_TMP103=m +CONFIG_SENSORS_TMP108=m +CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_TMP513=m +CONFIG_SENSORS_VIA_CPUTEMP=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83773G=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83791D=m +CONFIG_SENSORS_W83792D=m +CONFIG_SENSORS_W83793=m +CONFIG_SENSORS_W83795=m +# CONFIG_SENSORS_W83795_FANCTRL is not set +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83L786NG=m +CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM831X=m +CONFIG_SENSORS_WM8350=m +CONFIG_SENSORS_XGENE=m + +# +# ACPI drivers +# +CONFIG_SENSORS_ACPI_POWER=m +CONFIG_SENSORS_ATK0110=m +CONFIG_THERMAL=y +# 
CONFIG_THERMAL_NETLINK is not set +# CONFIG_THERMAL_STATISTICS is not set +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 +CONFIG_THERMAL_HWMON=y +CONFIG_THERMAL_OF=y +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set +CONFIG_THERMAL_GOV_FAIR_SHARE=y +CONFIG_THERMAL_GOV_STEP_WISE=y +CONFIG_THERMAL_GOV_BANG_BANG=y +CONFIG_THERMAL_GOV_USER_SPACE=y +CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y +CONFIG_CPU_THERMAL=y +CONFIG_CPU_FREQ_THERMAL=y +CONFIG_CPU_IDLE_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +# CONFIG_THERMAL_EMULATION is not set +CONFIG_THERMAL_MMIO=m +CONFIG_MAX77620_THERMAL=m +CONFIG_DA9062_THERMAL=m + +# +# Intel thermal drivers +# +CONFIG_INTEL_POWERCLAMP=m +CONFIG_X86_PKG_TEMP_THERMAL=m +CONFIG_INTEL_SOC_DTS_IOSF_CORE=m +CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# +CONFIG_INT340X_THERMAL=m +CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m +CONFIG_PROC_THERMAL_MMIO_RAPL=y +# end of ACPI INT340X thermal drivers + +CONFIG_INTEL_PCH_THERMAL=m +# end of Intel thermal drivers + +# CONFIG_TI_SOC_THERMAL is not set +CONFIG_GENERIC_ADC_THERMAL=m +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +CONFIG_WATCHDOG_OPEN_TIMEOUT=0 +CONFIG_WATCHDOG_SYSFS=y + +# +# Watchdog Pretimeout Governors +# +CONFIG_WATCHDOG_PRETIMEOUT_GOV=y +CONFIG_WATCHDOG_PRETIMEOUT_GOV_SEL=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y +# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set +CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set +CONFIG_BD70528_WATCHDOG=m +CONFIG_DA9052_WATCHDOG=m +CONFIG_DA9055_WATCHDOG=m +CONFIG_DA9063_WATCHDOG=m +CONFIG_DA9062_WATCHDOG=m +CONFIG_GPIO_WATCHDOG=m +CONFIG_MENF21BMC_WATCHDOG=m +CONFIG_MENZ069_WATCHDOG=m +CONFIG_WDAT_WDT=m +CONFIG_WM831X_WATCHDOG=m +CONFIG_WM8350_WATCHDOG=m +CONFIG_XILINX_WATCHDOG=m +CONFIG_ZIIRAVE_WATCHDOG=m +CONFIG_RAVE_SP_WATCHDOG=m +CONFIG_MLX_WDT=m +CONFIG_CADENCE_WATCHDOG=m +CONFIG_DW_WATCHDOG=m +CONFIG_RN5T618_WATCHDOG=m +CONFIG_TWL4030_WATCHDOG=m +CONFIG_MAX63XX_WATCHDOG=m +CONFIG_MAX77620_WATCHDOG=m +CONFIG_RETU_WATCHDOG=m +CONFIG_STPMIC1_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_EBC_C384_WDT=m +CONFIG_F71808E_WDT=m +CONFIG_SP5100_TCO=m +CONFIG_SBC_FITPC2_WATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_IBMASR=m +CONFIG_WAFER_WDT=m +CONFIG_I6300ESB_WDT=m +CONFIG_IE6XX_WDT=m +CONFIG_ITCO_WDT=m +CONFIG_ITCO_VENDOR_SUPPORT=y +CONFIG_IT8712F_WDT=m +CONFIG_IT87_WDT=m +CONFIG_HP_WATCHDOG=m +CONFIG_HPWDT_NMI_DECODING=y +CONFIG_KEMPLD_WDT=m +CONFIG_SC1200_WDT=m +CONFIG_PC87413_WDT=m +CONFIG_NV_TCO=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_SMSC_SCH311X_WDT=m +CONFIG_SMSC37B787_WDT=m +CONFIG_TQMX86_WDT=m +CONFIG_VIA_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_W83977F_WDT=m +CONFIG_MACHZ_WDT=m +CONFIG_SBC_EPX_C3_WATCHDOG=m +CONFIG_INTEL_MEI_WDT=m +CONFIG_NI903X_WDT=m +CONFIG_NIC7018_WDT=m +CONFIG_MEN_A21_WDT=m +CONFIG_XEN_WDT=m + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y 
+CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_PCMCIAHOST_POSSIBLE=y +CONFIG_SSB_PCMCIAHOST=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y +CONFIG_SSB_SDIOHOST=y +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +CONFIG_SSB_DRIVER_GPIO=y +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +# CONFIG_BCMA_HOST_SOC is not set +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +CONFIG_BCMA_DRIVER_GPIO=y +# CONFIG_BCMA_DEBUG is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +CONFIG_MFD_ACT8945A=m +CONFIG_MFD_AS3711=y +CONFIG_MFD_AS3722=m +CONFIG_PMIC_ADP5520=y +CONFIG_MFD_AAT2870_CORE=y +CONFIG_MFD_ATMEL_FLEXCOM=m +CONFIG_MFD_ATMEL_HLCDC=m +CONFIG_MFD_BCM590XX=m +CONFIG_MFD_BD9571MWV=m +CONFIG_MFD_AXP20X=m +CONFIG_MFD_AXP20X_I2C=m +CONFIG_MFD_CROS_EC_DEV=m +CONFIG_MFD_MADERA=m +CONFIG_MFD_MADERA_I2C=m +CONFIG_MFD_MADERA_SPI=m +CONFIG_MFD_CS47L15=y +CONFIG_MFD_CS47L35=y +CONFIG_MFD_CS47L85=y +CONFIG_MFD_CS47L90=y +CONFIG_MFD_CS47L92=y +CONFIG_PMIC_DA903X=y +CONFIG_PMIC_DA9052=y +CONFIG_MFD_DA9052_SPI=y +CONFIG_MFD_DA9052_I2C=y +CONFIG_MFD_DA9055=y +CONFIG_MFD_DA9062=m +CONFIG_MFD_DA9063=m +CONFIG_MFD_DA9150=m +CONFIG_MFD_DLN2=m +CONFIG_MFD_GATEWORKS_GSC=m +CONFIG_MFD_MC13XXX=m +CONFIG_MFD_MC13XXX_SPI=m +CONFIG_MFD_MC13XXX_I2C=m +CONFIG_MFD_MP2629=m +CONFIG_MFD_HI6421_PMIC=m +CONFIG_HTC_PASIC3=m +CONFIG_HTC_I2CPLD=y +CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m +CONFIG_LPC_ICH=m +CONFIG_LPC_SCH=m +CONFIG_INTEL_SOC_PMIC=y +# CONFIG_INTEL_SOC_PMIC_BXTWC is not set +CONFIG_INTEL_SOC_PMIC_CHTWC=y +CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m +# CONFIG_INTEL_SOC_PMIC_MRFLD is not set +CONFIG_MFD_INTEL_LPSS=m +CONFIG_MFD_INTEL_LPSS_ACPI=m +CONFIG_MFD_INTEL_LPSS_PCI=m +CONFIG_MFD_INTEL_MSIC=y +CONFIG_MFD_INTEL_PMC_BXT=m +CONFIG_MFD_IQS62X=m +CONFIG_MFD_JANZ_CMODIO=m +CONFIG_MFD_KEMPLD=m +CONFIG_MFD_88PM800=m +CONFIG_MFD_88PM805=m +CONFIG_MFD_88PM860X=y +CONFIG_MFD_MAX14577=m +CONFIG_MFD_MAX77620=y +CONFIG_MFD_MAX77650=m +CONFIG_MFD_MAX77686=m +CONFIG_MFD_MAX77693=m +CONFIG_MFD_MAX77843=y +CONFIG_MFD_MAX8907=m +CONFIG_MFD_MAX8925=y +CONFIG_MFD_MAX8997=y +CONFIG_MFD_MAX8998=y +CONFIG_MFD_MT6360=m +CONFIG_MFD_MT6397=m +CONFIG_MFD_MENF21BMC=m +CONFIG_EZX_PCAP=y +CONFIG_MFD_CPCAP=m +CONFIG_MFD_VIPERBOARD=m +CONFIG_MFD_RETU=m +CONFIG_MFD_PCF50633=m +CONFIG_PCF50633_ADC=m +CONFIG_PCF50633_GPIO=m +CONFIG_UCB1400_CORE=m +CONFIG_MFD_RDC321X=m +CONFIG_MFD_RT5033=m +CONFIG_MFD_RC5T583=y +CONFIG_MFD_RK808=m +CONFIG_MFD_RN5T618=m +CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_SI476X_CORE=m +CONFIG_MFD_SM501=m +CONFIG_MFD_SM501_GPIO=y +CONFIG_MFD_SKY81452=m +CONFIG_ABX500_CORE=y +CONFIG_AB3100_CORE=y +CONFIG_AB3100_OTP=y +CONFIG_MFD_STMPE=y + +# +# STMicroelectronics STMPE Interface Drivers +# +CONFIG_STMPE_I2C=y +CONFIG_STMPE_SPI=y +# end of STMicroelectronics STMPE Interface Drivers + +CONFIG_MFD_SYSCON=y +CONFIG_MFD_TI_AM335X_TSCADC=m +CONFIG_MFD_LP3943=m +CONFIG_MFD_LP8788=y +CONFIG_MFD_TI_LMU=m +CONFIG_MFD_PALMAS=y +CONFIG_TPS6105X=m +CONFIG_TPS65010=m +CONFIG_TPS6507X=m +CONFIG_MFD_TPS65086=m +CONFIG_MFD_TPS65090=y +CONFIG_MFD_TPS65217=m +CONFIG_MFD_TPS68470=y +CONFIG_MFD_TI_LP873X=m +CONFIG_MFD_TI_LP87565=m +CONFIG_MFD_TPS65218=m +CONFIG_MFD_TPS6586X=y +CONFIG_MFD_TPS65910=y +CONFIG_MFD_TPS65912=m +CONFIG_MFD_TPS65912_I2C=m +CONFIG_MFD_TPS65912_SPI=m +CONFIG_MFD_TPS80031=y +CONFIG_TWL4030_CORE=y +CONFIG_MFD_TWL4030_AUDIO=y +CONFIG_TWL6040_CORE=y +CONFIG_MFD_WL1273_CORE=m +CONFIG_MFD_LM3533=m 
+CONFIG_MFD_TC3589X=y +CONFIG_MFD_TQMX86=m +CONFIG_MFD_VX855=m +CONFIG_MFD_LOCHNAGAR=y +CONFIG_MFD_ARIZONA=y +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +CONFIG_MFD_CS47L24=y +CONFIG_MFD_WM5102=y +CONFIG_MFD_WM5110=y +CONFIG_MFD_WM8997=y +CONFIG_MFD_WM8998=y +CONFIG_MFD_WM8400=y +CONFIG_MFD_WM831X=y +CONFIG_MFD_WM831X_I2C=y +CONFIG_MFD_WM831X_SPI=y +CONFIG_MFD_WM8350=y +CONFIG_MFD_WM8350_I2C=y +CONFIG_MFD_WM8994=m +CONFIG_MFD_ROHM_BD718XX=m +CONFIG_MFD_ROHM_BD70528=m +CONFIG_MFD_ROHM_BD71828=m +CONFIG_MFD_STPMIC1=m +CONFIG_MFD_STMFX=m +CONFIG_MFD_WCD934X=m +CONFIG_RAVE_SP_CORE=m +# end of Multifunction device drivers + +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set +CONFIG_REGULATOR_FIXED_VOLTAGE=m +CONFIG_REGULATOR_VIRTUAL_CONSUMER=m +CONFIG_REGULATOR_USERSPACE_CONSUMER=m +CONFIG_REGULATOR_88PG86X=m +CONFIG_REGULATOR_88PM800=m +CONFIG_REGULATOR_88PM8607=m +CONFIG_REGULATOR_ACT8865=m +CONFIG_REGULATOR_ACT8945A=m +CONFIG_REGULATOR_AD5398=m +CONFIG_REGULATOR_AAT2870=m +CONFIG_REGULATOR_AB3100=m +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +CONFIG_REGULATOR_AS3711=m +CONFIG_REGULATOR_AS3722=m +CONFIG_REGULATOR_AXP20X=m +CONFIG_REGULATOR_BCM590XX=m +CONFIG_REGULATOR_BD70528=m +CONFIG_REGULATOR_BD71828=m +CONFIG_REGULATOR_BD718XX=m +CONFIG_REGULATOR_BD9571MWV=m +CONFIG_REGULATOR_CPCAP=m +# CONFIG_REGULATOR_CROS_EC is not set +CONFIG_REGULATOR_DA903X=m +CONFIG_REGULATOR_DA9052=m +CONFIG_REGULATOR_DA9055=m +CONFIG_REGULATOR_DA9062=m +CONFIG_REGULATOR_DA9063=m +CONFIG_REGULATOR_DA9210=m +CONFIG_REGULATOR_DA9211=m +CONFIG_REGULATOR_FAN53555=m +# CONFIG_REGULATOR_FAN53880 is not set +CONFIG_REGULATOR_GPIO=m +CONFIG_REGULATOR_HI6421=m +CONFIG_REGULATOR_HI6421V530=m +CONFIG_REGULATOR_ISL9305=m +CONFIG_REGULATOR_ISL6271A=m +CONFIG_REGULATOR_LM363X=m +CONFIG_REGULATOR_LOCHNAGAR=m +CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_LP3972=m +CONFIG_REGULATOR_LP872X=m +CONFIG_REGULATOR_LP873X=m +CONFIG_REGULATOR_LP8755=m +CONFIG_REGULATOR_LP87565=m +CONFIG_REGULATOR_LP8788=m +CONFIG_REGULATOR_LTC3589=m +CONFIG_REGULATOR_LTC3676=m +CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX77620=m +CONFIG_REGULATOR_MAX77650=m +CONFIG_REGULATOR_MAX8649=m +CONFIG_REGULATOR_MAX8660=m +CONFIG_REGULATOR_MAX8907=m +CONFIG_REGULATOR_MAX8925=m +CONFIG_REGULATOR_MAX8952=m +CONFIG_REGULATOR_MAX8973=m +CONFIG_REGULATOR_MAX8997=m +CONFIG_REGULATOR_MAX8998=m +CONFIG_REGULATOR_MAX77686=m +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MAX77802=m +CONFIG_REGULATOR_MAX77826=m +CONFIG_REGULATOR_MC13XXX_CORE=m +CONFIG_REGULATOR_MC13783=m +CONFIG_REGULATOR_MC13892=m +CONFIG_REGULATOR_MCP16502=m +CONFIG_REGULATOR_MP5416=m +CONFIG_REGULATOR_MP8859=m +CONFIG_REGULATOR_MP886X=m +CONFIG_REGULATOR_MPQ7920=m +CONFIG_REGULATOR_MT6311=m +CONFIG_REGULATOR_MT6323=m +CONFIG_REGULATOR_MT6358=m +CONFIG_REGULATOR_MT6397=m +CONFIG_REGULATOR_PALMAS=m +# CONFIG_REGULATOR_PCA9450 is not set +CONFIG_REGULATOR_PCAP=m +CONFIG_REGULATOR_PCF50633=m +CONFIG_REGULATOR_PFUZE100=m +CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m +CONFIG_REGULATOR_PV88090=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_SPMI=m +# CONFIG_REGULATOR_QCOM_USB_VBUS is not set +CONFIG_REGULATOR_RC5T583=m +CONFIG_REGULATOR_RK808=m +CONFIG_REGULATOR_RN5T618=m +CONFIG_REGULATOR_ROHM=m +CONFIG_REGULATOR_RT5033=m +CONFIG_REGULATOR_S2MPA01=m +CONFIG_REGULATOR_S2MPS11=m +CONFIG_REGULATOR_S5M8767=m +CONFIG_REGULATOR_SKY81452=m +CONFIG_REGULATOR_SLG51000=m +CONFIG_REGULATOR_STPMIC1=m +CONFIG_REGULATOR_SY8106A=m 
+CONFIG_REGULATOR_SY8824X=m +# CONFIG_REGULATOR_SY8827N is not set +CONFIG_REGULATOR_TPS51632=m +CONFIG_REGULATOR_TPS6105X=m +CONFIG_REGULATOR_TPS62360=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m +CONFIG_REGULATOR_TPS65086=m +CONFIG_REGULATOR_TPS65090=m +CONFIG_REGULATOR_TPS65132=m +CONFIG_REGULATOR_TPS65217=m +CONFIG_REGULATOR_TPS65218=m +CONFIG_REGULATOR_TPS6524X=m +CONFIG_REGULATOR_TPS6586X=m +CONFIG_REGULATOR_TPS65910=m +CONFIG_REGULATOR_TPS65912=m +CONFIG_REGULATOR_TPS80031=m +CONFIG_REGULATOR_TWL4030=m +CONFIG_REGULATOR_VCTRL=m +CONFIG_REGULATOR_WM831X=m +CONFIG_REGULATOR_WM8350=m +CONFIG_REGULATOR_WM8400=m +CONFIG_REGULATOR_WM8994=m +# CONFIG_REGULATOR_QCOM_LABIBB is not set +CONFIG_RC_CORE=m +CONFIG_RC_MAP=m +CONFIG_LIRC=y +CONFIG_RC_DECODERS=y +CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m +CONFIG_IR_RC6_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_SANYO_DECODER=m +CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_IR_IMON_DECODER=m +CONFIG_IR_RCMM_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_RC_ATI_REMOTE=m +CONFIG_IR_ENE=m +CONFIG_IR_HIX5HD2=m +CONFIG_IR_IMON=m +CONFIG_IR_IMON_RAW=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_ITE_CIR=m +CONFIG_IR_FINTEK=m +CONFIG_IR_NUVOTON=m +CONFIG_IR_REDRAT3=m +CONFIG_IR_SPI=m +CONFIG_IR_STREAMZAP=m +CONFIG_IR_WINBOND_CIR=m +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_LOOPBACK=m +CONFIG_IR_GPIO_CIR=m +CONFIG_IR_GPIO_TX=m +CONFIG_IR_PWM_TX=m +CONFIG_IR_SERIAL=m +CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m +CONFIG_RC_XBOX_DVD=m +# CONFIG_IR_TOY is not set +CONFIG_CEC_CORE=m +CONFIG_CEC_NOTIFIER=y +CONFIG_CEC_PIN=y +CONFIG_MEDIA_CEC_RC=y +# CONFIG_CEC_PIN_ERROR_INJ is not set +CONFIG_MEDIA_CEC_SUPPORT=y +# CONFIG_CEC_CH7322 is not set +# CONFIG_CEC_CROS_EC is not set +CONFIG_CEC_GPIO=m +# CONFIG_CEC_SECO is not set +CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m +CONFIG_MEDIA_SUPPORT=m +# CONFIG_MEDIA_SUPPORT_FILTER is not set +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y + +# +# Media device types +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_PLATFORM_SUPPORT=y +CONFIG_MEDIA_TEST_SUPPORT=y +# end of Media device types + +# +# Media core support +# +CONFIG_VIDEO_DEV=m +CONFIG_MEDIA_CONTROLLER=y +CONFIG_DVB_CORE=m +# end of Media core support + +# +# Video4Linux options +# +CONFIG_VIDEO_V4L2=m +CONFIG_VIDEO_V4L2_I2C=y +CONFIG_VIDEO_V4L2_SUBDEV_API=y +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_MEM2MEM_DEV=m +CONFIG_V4L2_FLASH_LED_CLASS=m +CONFIG_V4L2_FWNODE=m +CONFIG_VIDEOBUF_GEN=m +CONFIG_VIDEOBUF_DMA_SG=m +CONFIG_VIDEOBUF_VMALLOC=m +# end of Video4Linux options + +# +# Media controller options +# +CONFIG_MEDIA_CONTROLLER_DVB=y +CONFIG_MEDIA_CONTROLLER_REQUEST_API=y + +# +# Please notice that the enabled Media controller Request API is EXPERIMENTAL +# +# end of Media controller options + +# +# Digital TV options +# +CONFIG_DVB_MMAP=y +CONFIG_DVB_NET=y +CONFIG_DVB_MAX_ADAPTERS=16 +# CONFIG_DVB_DYNAMIC_MINORS is not set +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set +# CONFIG_DVB_ULE_DEBUG is not set +# end of Digital TV options + +# +# Media drivers +# +CONFIG_TTPCI_EEPROM=m +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +CONFIG_USB_VIDEO_CLASS=m +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y +CONFIG_USB_GSPCA=m +CONFIG_USB_M5602=m +CONFIG_USB_STV06XX=m 
+CONFIG_USB_GL860=m +CONFIG_USB_GSPCA_BENQ=m +CONFIG_USB_GSPCA_CONEX=m +CONFIG_USB_GSPCA_CPIA1=m +CONFIG_USB_GSPCA_DTCS033=m +CONFIG_USB_GSPCA_ETOMS=m +CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m +CONFIG_USB_GSPCA_JL2005BCD=m +CONFIG_USB_GSPCA_KINECT=m +CONFIG_USB_GSPCA_KONICA=m +CONFIG_USB_GSPCA_MARS=m +CONFIG_USB_GSPCA_MR97310A=m +CONFIG_USB_GSPCA_NW80X=m +CONFIG_USB_GSPCA_OV519=m +CONFIG_USB_GSPCA_OV534=m +CONFIG_USB_GSPCA_OV534_9=m +CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m +CONFIG_USB_GSPCA_PAC7311=m +CONFIG_USB_GSPCA_SE401=m +CONFIG_USB_GSPCA_SN9C2028=m +CONFIG_USB_GSPCA_SN9C20X=m +CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SPCA1528=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_SQ930X=m +CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_STK1135=m +CONFIG_USB_GSPCA_STV0680=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TOPRO=m +CONFIG_USB_GSPCA_TOUPTEK=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_VICAM=m +CONFIG_USB_GSPCA_XIRLINK_CIT=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_USB_PWC=m +# CONFIG_USB_PWC_DEBUG is not set +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_VIDEO_CPIA2=m +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m +CONFIG_VIDEO_USBTV=m + +# +# Analog TV USB devices +# +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m +CONFIG_VIDEO_GO7007=m +CONFIG_VIDEO_GO7007_USB=m +CONFIG_VIDEO_GO7007_LOADER=m +CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_TM6000=m +CONFIG_VIDEO_TM6000_ALSA=m +CONFIG_VIDEO_TM6000_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_CXUSB_ANALOG=y +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_V2=m +CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_AF9035=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_USB_DRV=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_AS102=m + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m 
+CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +CONFIG_USB_AIRSPY=m +CONFIG_USB_HACKRF=m +CONFIG_USB_MSI2500=m +CONFIG_MEDIA_PCI_SUPPORT=y + +# +# Media capture support +# +CONFIG_VIDEO_MEYE=m +CONFIG_VIDEO_SOLO6X10=m +CONFIG_VIDEO_TW5864=m +CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m + +# +# Media capture/analog TV support +# +CONFIG_VIDEO_IVTV=m +# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set +CONFIG_VIDEO_IVTV_ALSA=m +CONFIG_VIDEO_FB_IVTV=m +# CONFIG_VIDEO_FB_IVTV_FORCE_PAT is not set +CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DT3155=m + +# +# Media capture/analog/hybrid TV support +# +CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_CX18_ALSA=m +CONFIG_VIDEO_CX23885=m +CONFIG_MEDIA_ALTERA_CI=m +CONFIG_VIDEO_CX25821=m +CONFIG_VIDEO_CX25821_ALSA=m +CONFIG_VIDEO_CX88=m +CONFIG_VIDEO_CX88_ALSA=m +CONFIG_VIDEO_CX88_BLACKBIRD=m +CONFIG_VIDEO_CX88_DVB=m +CONFIG_VIDEO_CX88_ENABLE_VP3054=y +CONFIG_VIDEO_CX88_VP3054=m +CONFIG_VIDEO_CX88_MPEG=m +CONFIG_VIDEO_BT848=m +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m +CONFIG_VIDEO_SAA7134_RC=y +CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_GO7007=m +CONFIG_VIDEO_SAA7164=m + +# +# Media digital TV PCI Adapters +# +CONFIG_DVB_AV7110_IR=y +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m +CONFIG_DVB_B2C2_FLEXCOP_PCI=m +# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set +CONFIG_DVB_PLUTO2=m +CONFIG_DVB_DM1105=m +CONFIG_DVB_PT1=m +CONFIG_DVB_PT3=m +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m +CONFIG_DVB_NGENE=m +CONFIG_DVB_DDBRIDGE=m +# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set +CONFIG_DVB_SMIPCIE=m +CONFIG_DVB_NETUP_UNIDVB=m +CONFIG_VIDEO_IPU3_CIO2=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_TEA575X=m +CONFIG_RADIO_SI470X=m +CONFIG_USB_SI470X=m +CONFIG_I2C_SI470X=m +CONFIG_RADIO_SI4713=m +CONFIG_USB_SI4713=m +CONFIG_PLATFORM_SI4713=m +CONFIG_I2C_SI4713=m +CONFIG_RADIO_SI476X=m +CONFIG_USB_MR800=m +CONFIG_USB_DSBR=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_SHARK=m +CONFIG_RADIO_SHARK2=m +CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m +CONFIG_USB_MA901=m +CONFIG_RADIO_TEA5764=m +CONFIG_RADIO_SAA7706H=m +CONFIG_RADIO_TEF6862=m +CONFIG_RADIO_WL1273=m +CONFIG_RADIO_WL128X=m +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_V4L2=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +CONFIG_VIDEOBUF2_DMA_SG=m +CONFIG_VIDEOBUF2_DVB=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set +CONFIG_VIDEO_V4L2_TPG=m +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CAFE_CCIC=m +CONFIG_VIDEO_CADENCE=y +CONFIG_VIDEO_CADENCE_CSI2RX=m +CONFIG_VIDEO_CADENCE_CSI2TX=m +CONFIG_VIDEO_ASPEED=m +CONFIG_VIDEO_MUX=m +CONFIG_VIDEO_XILINX=m +# CONFIG_VIDEO_XILINX_CSI2RXSS is not set +CONFIG_VIDEO_XILINX_TPG=m +CONFIG_VIDEO_XILINX_VTC=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m +CONFIG_DVB_PLATFORM_DRIVERS=y +CONFIG_SDR_PLATFORM_DRIVERS=y + +# +# MMC/SDIO DVB adapters +# +CONFIG_SMS_SDIO_DRV=m +CONFIG_V4L_TEST_DRIVERS=y +CONFIG_VIDEO_VIMC=m +CONFIG_VIDEO_VIVID=m +CONFIG_VIDEO_VIVID_CEC=y +CONFIG_VIDEO_VIVID_MAX_DEVS=64 
+CONFIG_VIDEO_VIM2M=m +CONFIG_VIDEO_VICODEC=m + +# +# FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_INPUT=y +# end of Media drivers + +# +# Media ancillary drivers +# +CONFIG_MEDIA_ATTACH=y + +# +# IR I2C driver auto-selected by 'Autoselect ancillary drivers' +# +CONFIG_VIDEO_IR_I2C=m + +# +# Audio decoders, processors and mixers +# +CONFIG_VIDEO_TVAUDIO=m +CONFIG_VIDEO_TDA7432=m +CONFIG_VIDEO_TDA9840=m +CONFIG_VIDEO_TDA1997X=m +CONFIG_VIDEO_TEA6415C=m +CONFIG_VIDEO_TEA6420=m +CONFIG_VIDEO_MSP3400=m +CONFIG_VIDEO_CS3308=m +CONFIG_VIDEO_CS5345=m +CONFIG_VIDEO_CS53L32A=m +CONFIG_VIDEO_TLV320AIC23B=m +CONFIG_VIDEO_UDA1342=m +CONFIG_VIDEO_WM8775=m +CONFIG_VIDEO_WM8739=m +CONFIG_VIDEO_VP27SMPX=m +CONFIG_VIDEO_SONY_BTF_MPX=m +# end of Audio decoders, processors and mixers + +# +# RDS decoders +# +CONFIG_VIDEO_SAA6588=m +# end of RDS decoders + +# +# Video decoders +# +CONFIG_VIDEO_ADV7180=m +CONFIG_VIDEO_ADV7183=m +CONFIG_VIDEO_ADV748X=m +CONFIG_VIDEO_ADV7604=m +CONFIG_VIDEO_ADV7604_CEC=y +CONFIG_VIDEO_ADV7842=m +CONFIG_VIDEO_ADV7842_CEC=y +CONFIG_VIDEO_BT819=m +CONFIG_VIDEO_BT856=m +CONFIG_VIDEO_BT866=m +CONFIG_VIDEO_KS0127=m +CONFIG_VIDEO_ML86V7667=m +CONFIG_VIDEO_SAA7110=m +CONFIG_VIDEO_SAA711X=m +CONFIG_VIDEO_TC358743=m +CONFIG_VIDEO_TC358743_CEC=y +CONFIG_VIDEO_TVP514X=m +CONFIG_VIDEO_TVP5150=m +CONFIG_VIDEO_TVP7002=m +CONFIG_VIDEO_TW2804=m +CONFIG_VIDEO_TW9903=m +CONFIG_VIDEO_TW9906=m +CONFIG_VIDEO_TW9910=m +CONFIG_VIDEO_VPX3220=m +# CONFIG_VIDEO_MAX9286 is not set + +# +# Video and audio decoders +# +CONFIG_VIDEO_SAA717X=m +CONFIG_VIDEO_CX25840=m +# end of Video decoders + +# +# Video encoders +# +CONFIG_VIDEO_SAA7127=m +CONFIG_VIDEO_SAA7185=m +CONFIG_VIDEO_ADV7170=m +CONFIG_VIDEO_ADV7175=m +CONFIG_VIDEO_ADV7343=m +CONFIG_VIDEO_ADV7393=m +CONFIG_VIDEO_AD9389B=m +CONFIG_VIDEO_AK881X=m +CONFIG_VIDEO_THS8200=m +# end of Video encoders + +# +# Video improvement chips +# +CONFIG_VIDEO_UPD64031A=m +CONFIG_VIDEO_UPD64083=m +# end of Video improvement chips + +# +# Audio/Video compression chips +# +CONFIG_VIDEO_SAA6752HS=m +# end of Audio/Video compression chips + +# +# SDR tuner chips +# +CONFIG_SDR_MAX2175=m +# end of SDR tuner chips + +# +# Miscellaneous helper chips +# +CONFIG_VIDEO_THS7303=m +CONFIG_VIDEO_M52790=m +CONFIG_VIDEO_I2C=m +CONFIG_VIDEO_ST_MIPID02=m +# end of Miscellaneous helper chips + +# +# Camera sensor devices +# +CONFIG_VIDEO_APTINA_PLL=m +CONFIG_VIDEO_SMIAPP_PLL=m +CONFIG_VIDEO_HI556=m +CONFIG_VIDEO_IMX214=m +CONFIG_VIDEO_IMX219=m +CONFIG_VIDEO_IMX258=m +CONFIG_VIDEO_IMX274=m +CONFIG_VIDEO_IMX290=m +CONFIG_VIDEO_IMX319=m +CONFIG_VIDEO_IMX355=m +CONFIG_VIDEO_OV2640=m +CONFIG_VIDEO_OV2659=m +CONFIG_VIDEO_OV2680=m +CONFIG_VIDEO_OV2685=m +CONFIG_VIDEO_OV2740=m +CONFIG_VIDEO_OV5640=m +CONFIG_VIDEO_OV5645=m +CONFIG_VIDEO_OV5647=m +CONFIG_VIDEO_OV6650=m +CONFIG_VIDEO_OV5670=m +CONFIG_VIDEO_OV5675=m +CONFIG_VIDEO_OV5695=m +CONFIG_VIDEO_OV7251=m +CONFIG_VIDEO_OV772X=m +CONFIG_VIDEO_OV7640=m +CONFIG_VIDEO_OV7670=m +CONFIG_VIDEO_OV7740=m +CONFIG_VIDEO_OV8856=m +CONFIG_VIDEO_OV9640=m +CONFIG_VIDEO_OV9650=m +CONFIG_VIDEO_OV13858=m +CONFIG_VIDEO_VS6624=m +CONFIG_VIDEO_MT9M001=m +CONFIG_VIDEO_MT9M032=m +CONFIG_VIDEO_MT9M111=m +CONFIG_VIDEO_MT9P031=m +CONFIG_VIDEO_MT9T001=m +CONFIG_VIDEO_MT9T112=m +CONFIG_VIDEO_MT9V011=m +CONFIG_VIDEO_MT9V032=m +CONFIG_VIDEO_MT9V111=m +CONFIG_VIDEO_SR030PC30=m +CONFIG_VIDEO_NOON010PC30=m +CONFIG_VIDEO_M5MOLS=m +# CONFIG_VIDEO_RDACM20 is not set +CONFIG_VIDEO_RJ54N1=m +CONFIG_VIDEO_S5K6AA=m +CONFIG_VIDEO_S5K6A3=m 
+CONFIG_VIDEO_S5K4ECGX=m +CONFIG_VIDEO_S5K5BAF=m +CONFIG_VIDEO_SMIAPP=m +CONFIG_VIDEO_ET8EK8=m +CONFIG_VIDEO_S5C73M3=m +# end of Camera sensor devices + +# +# Lens drivers +# +CONFIG_VIDEO_AD5820=m +CONFIG_VIDEO_AK7375=m +CONFIG_VIDEO_DW9714=m +# CONFIG_VIDEO_DW9768 is not set +CONFIG_VIDEO_DW9807_VCM=m +# end of Lens drivers + +# +# Flash devices +# +CONFIG_VIDEO_ADP1653=m +CONFIG_VIDEO_LM3560=m +CONFIG_VIDEO_LM3646=m +# end of Flash devices + +# +# SPI helper chips +# +CONFIG_VIDEO_GS1662=m +# end of SPI helper chips + +# +# Media SPI Adapters +# +CONFIG_CXD2880_SPI_DRV=m +# end of Media SPI Adapters + +CONFIG_MEDIA_TUNER=m + +# +# Customize TV tuners +# +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA18250=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_MSI001=m +CONFIG_MEDIA_TUNER_MT20XX=m +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m +CONFIG_MEDIA_TUNER_MT2266=m +CONFIG_MEDIA_TUNER_MT2131=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_XC2028=m +CONFIG_MEDIA_TUNER_XC5000=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88RS6000T=m +CONFIG_MEDIA_TUNER_TUA9001=m +CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_IT913X=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_MXL301RF=m +CONFIG_MEDIA_TUNER_QM1D1C0042=m +CONFIG_MEDIA_TUNER_QM1D1B0004=m +# end of Customize TV tuners + +# +# Customise DVB Frontends +# + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV0910=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_STV6111=m +CONFIG_DVB_MXL5XX=m +CONFIG_DVB_M88DS3103=m + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_TDA18271C2DD=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m + +# +# DVB-S (satellite) frontends +# +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m +CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_CX24117=m +CONFIG_DVB_CX24120=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_TS2020=m +CONFIG_DVB_DS3000=m +CONFIG_DVB_MB86A16=m +CONFIG_DVB_TDA10071=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m +CONFIG_DVB_S5H1432=m +CONFIG_DVB_DRXD=m +CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_DIB9000=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m +CONFIG_DVB_STV0367=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +CONFIG_DVB_SI2168=m +CONFIG_DVB_AS102_FE=m +CONFIG_DVB_ZD1301_DEMOD=m 
+CONFIG_DVB_GP8PSK_FE=m +CONFIG_DVB_CXD2880=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_S921=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +CONFIG_DVB_TC90522=m +CONFIG_DVB_MN88443X=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_DRX39XYJ=m +CONFIG_DVB_LNBH25=m +CONFIG_DVB_LNBH29=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_LNBP22=m +CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_A8293=m +CONFIG_DVB_LGS8GL5=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DVB_ATBM8830=m +CONFIG_DVB_TDA665x=m +CONFIG_DVB_IX2505V=m +CONFIG_DVB_M88RS2000=m +CONFIG_DVB_AF9033=m +CONFIG_DVB_HORUS3A=m +CONFIG_DVB_ASCOT2E=m +CONFIG_DVB_HELENE=m + +# +# Common Interface (EN50221) controller drivers +# +CONFIG_DVB_CXD2099=m +CONFIG_DVB_SP2=m +# end of Customise DVB Frontends + +# +# Tools to develop new frontends +# +CONFIG_DVB_DUMMY_FE=m +# end of Media ancillary drivers + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +CONFIG_INTEL_GTT=m +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=10 +CONFIG_VGA_SWITCHEROO=y +CONFIG_DRM=m +CONFIG_DRM_MIPI_DBI=m +CONFIG_DRM_MIPI_DSI=y +CONFIG_DRM_DP_AUX_CHARDEV=y +# CONFIG_DRM_DEBUG_SELFTEST is not set +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_KMS_FB_HELPER=y +# CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS is not set +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_FBDEV_LEAK_PHYS_SMEM is not set +CONFIG_DRM_LOAD_EDID_FIRMWARE=y +CONFIG_DRM_DP_CEC=y +CONFIG_DRM_TTM=m +CONFIG_DRM_TTM_DMA_PAGE_POOL=y +CONFIG_DRM_VRAM_HELPER=m +CONFIG_DRM_TTM_HELPER=m +CONFIG_DRM_GEM_CMA_HELPER=y +CONFIG_DRM_KMS_CMA_HELPER=y +CONFIG_DRM_GEM_SHMEM_HELPER=y +CONFIG_DRM_SCHED=m + +# +# I2C encoder or helper chips +# +CONFIG_DRM_I2C_CH7006=m +CONFIG_DRM_I2C_SIL164=m +CONFIG_DRM_I2C_NXP_TDA998X=m +CONFIG_DRM_I2C_NXP_TDA9950=m +# end of I2C encoder or helper chips + +# +# ARM devices +# +CONFIG_DRM_KOMEDA=m +# end of ARM devices + +CONFIG_DRM_RADEON=m +CONFIG_DRM_RADEON_USERPTR=y +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_AMDGPU_USERPTR=y +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set + +# +# ACP (Audio CoProcessor) Configuration +# +CONFIG_DRM_AMD_ACP=y +# end of ACP (Audio CoProcessor) Configuration + +# +# Display Engine Configuration +# +CONFIG_DRM_AMD_DC=y +CONFIG_DRM_AMD_DC_DCN=y +CONFIG_DRM_AMD_DC_DCN3_0=y +CONFIG_DRM_AMD_DC_HDCP=y +# CONFIG_DEBUG_KERNEL_DC is not set +# end of Display Engine Configuration + +CONFIG_HSA_AMD=y +CONFIG_DRM_NOUVEAU=m +# CONFIG_NOUVEAU_LEGACY_CTX_SUPPORT is not set +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +# CONFIG_NOUVEAU_DEBUG_MMU is not set +# CONFIG_NOUVEAU_DEBUG_PUSH is not set +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_NOUVEAU_SVM=y +CONFIG_DRM_I915=m +CONFIG_DRM_I915_FORCE_PROBE="*" +CONFIG_DRM_I915_CAPTURE_ERROR=y 
+CONFIG_DRM_I915_COMPRESS_ERROR=y +CONFIG_DRM_I915_USERPTR=y +CONFIG_DRM_I915_GVT=y +CONFIG_DRM_I915_GVT_KVMGT=m + +# +# drm/i915 Debugging +# +# CONFIG_DRM_I915_WERROR is not set +# CONFIG_DRM_I915_DEBUG is not set +# CONFIG_DRM_I915_DEBUG_MMIO is not set +# CONFIG_DRM_I915_SW_FENCE_DEBUG_OBJECTS is not set +# CONFIG_DRM_I915_SW_FENCE_CHECK_DAG is not set +# CONFIG_DRM_I915_DEBUG_GUC is not set +# CONFIG_DRM_I915_SELFTEST is not set +# CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS is not set +# CONFIG_DRM_I915_DEBUG_VBLANK_EVADE is not set +# CONFIG_DRM_I915_DEBUG_RUNTIME_PM is not set +# end of drm/i915 Debugging + +# +# drm/i915 Profile Guided Optimisation +# +CONFIG_DRM_I915_FENCE_TIMEOUT=10000 +CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 +CONFIG_DRM_I915_HEARTBEAT_INTERVAL=2500 +CONFIG_DRM_I915_PREEMPT_TIMEOUT=640 +CONFIG_DRM_I915_MAX_REQUEST_BUSYWAIT=8000 +CONFIG_DRM_I915_STOP_TIMEOUT=100 +CONFIG_DRM_I915_TIMESLICE_DURATION=1 +# end of drm/i915 Profile Guided Optimisation + +CONFIG_DRM_VGEM=m +CONFIG_DRM_VKMS=m +CONFIG_DRM_VMWGFX=m +CONFIG_DRM_VMWGFX_FBCON=y +CONFIG_DRM_GMA500=m +CONFIG_DRM_GMA600=y +CONFIG_DRM_GMA3600=y +CONFIG_DRM_UDL=m +CONFIG_DRM_AST=m +CONFIG_DRM_MGAG200=m +CONFIG_DRM_RCAR_DW_HDMI=m +CONFIG_DRM_RCAR_LVDS=m +CONFIG_DRM_QXL=m +CONFIG_DRM_BOCHS=m +CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +CONFIG_DRM_PANEL_ARM_VERSATILE=m +CONFIG_DRM_PANEL_ASUS_Z00T_TM5P5_NT35596=m +CONFIG_DRM_PANEL_BOE_HIMAX8279D=m +CONFIG_DRM_PANEL_BOE_TV101WUM_NL6=m +CONFIG_DRM_PANEL_LVDS=m +CONFIG_DRM_PANEL_SIMPLE=m +CONFIG_DRM_PANEL_ELIDA_KD35T133=m +CONFIG_DRM_PANEL_FEIXIN_K101_IM2BA02=m +CONFIG_DRM_PANEL_FEIYANG_FY07024DI26A30D=m +CONFIG_DRM_PANEL_ILITEK_IL9322=m +CONFIG_DRM_PANEL_ILITEK_ILI9881C=m +CONFIG_DRM_PANEL_INNOLUX_P079ZCA=m +CONFIG_DRM_PANEL_JDI_LT070ME05000=m +CONFIG_DRM_PANEL_KINGDISPLAY_KD097D04=m +CONFIG_DRM_PANEL_LEADTEK_LTK050H3146W=m +CONFIG_DRM_PANEL_LEADTEK_LTK500HD1829=m +CONFIG_DRM_PANEL_SAMSUNG_LD9040=m +CONFIG_DRM_PANEL_LG_LB035Q02=m +CONFIG_DRM_PANEL_LG_LG4573=m +CONFIG_DRM_PANEL_NEC_NL8048HL11=m +CONFIG_DRM_PANEL_NOVATEK_NT35510=m +CONFIG_DRM_PANEL_NOVATEK_NT39016=m +CONFIG_DRM_PANEL_OLIMEX_LCD_OLINUXINO=m +CONFIG_DRM_PANEL_ORISETECH_OTM8009A=m +CONFIG_DRM_PANEL_OSD_OSD101T2587_53TS=m +CONFIG_DRM_PANEL_PANASONIC_VVX10F034N00=m +CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m +CONFIG_DRM_PANEL_RAYDIUM_RM67191=m +CONFIG_DRM_PANEL_RAYDIUM_RM68200=m +CONFIG_DRM_PANEL_RONBO_RB070D30=m +CONFIG_DRM_PANEL_SAMSUNG_S6D16D0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E3HA2=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63J0X03=m +CONFIG_DRM_PANEL_SAMSUNG_S6E63M0=m +CONFIG_DRM_PANEL_SAMSUNG_S6E88A0_AMS452EF01=m +CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m +CONFIG_DRM_PANEL_SEIKO_43WVF1G=m +CONFIG_DRM_PANEL_SHARP_LQ101R1SX01=m +CONFIG_DRM_PANEL_SHARP_LS037V7DW01=m +CONFIG_DRM_PANEL_SHARP_LS043T1LE01=m +CONFIG_DRM_PANEL_SITRONIX_ST7701=m +# CONFIG_DRM_PANEL_SITRONIX_ST7703 is not set +CONFIG_DRM_PANEL_SITRONIX_ST7789V=m +CONFIG_DRM_PANEL_SONY_ACX424AKP=m +CONFIG_DRM_PANEL_SONY_ACX565AKM=m +CONFIG_DRM_PANEL_TPO_TD028TTEC1=m +CONFIG_DRM_PANEL_TPO_TD043MTEA1=m +CONFIG_DRM_PANEL_TPO_TPG110=m +CONFIG_DRM_PANEL_TRULY_NT35597_WQXGA=m +CONFIG_DRM_PANEL_VISIONOX_RM69299=m +CONFIG_DRM_PANEL_XINPENG_XPP055C272=m +# end of Display Panels + +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# Display Interface Bridges +# +CONFIG_DRM_CDNS_DSI=m +CONFIG_DRM_CHRONTEL_CH7033=m +CONFIG_DRM_DISPLAY_CONNECTOR=m +CONFIG_DRM_LVDS_CODEC=m +CONFIG_DRM_MEGACHIPS_STDPXXXX_GE_B850V3_FW=m +CONFIG_DRM_NWL_MIPI_DSI=m 
+CONFIG_DRM_NXP_PTN3460=m +CONFIG_DRM_PARADE_PS8622=m +CONFIG_DRM_PARADE_PS8640=m +CONFIG_DRM_SIL_SII8620=m +CONFIG_DRM_SII902X=m +CONFIG_DRM_SII9234=m +CONFIG_DRM_SIMPLE_BRIDGE=m +CONFIG_DRM_THINE_THC63LVD1024=m +CONFIG_DRM_TOSHIBA_TC358764=m +CONFIG_DRM_TOSHIBA_TC358767=m +CONFIG_DRM_TOSHIBA_TC358768=m +CONFIG_DRM_TI_TFP410=m +CONFIG_DRM_TI_SN65DSI86=m +CONFIG_DRM_TI_TPD12S015=m +CONFIG_DRM_ANALOGIX_ANX6345=m +CONFIG_DRM_ANALOGIX_ANX78XX=m +CONFIG_DRM_ANALOGIX_DP=m +CONFIG_DRM_I2C_ADV7511=m +CONFIG_DRM_I2C_ADV7511_AUDIO=y +CONFIG_DRM_I2C_ADV7511_CEC=y +CONFIG_DRM_DW_HDMI=m +CONFIG_DRM_DW_HDMI_AHB_AUDIO=m +CONFIG_DRM_DW_HDMI_I2S_AUDIO=m +CONFIG_DRM_DW_HDMI_CEC=m +# end of Display Interface Bridges + +# CONFIG_DRM_ETNAVIV is not set +CONFIG_DRM_ARCPGU=m +CONFIG_DRM_MXS=y +CONFIG_DRM_MXSFB=m +CONFIG_DRM_CIRRUS_QEMU=m +CONFIG_DRM_GM12U320=m +CONFIG_TINYDRM_HX8357D=m +CONFIG_TINYDRM_ILI9225=m +CONFIG_TINYDRM_ILI9341=m +CONFIG_TINYDRM_ILI9486=m +CONFIG_TINYDRM_MI0283QT=m +CONFIG_TINYDRM_REPAPER=m +CONFIG_TINYDRM_ST7586=m +CONFIG_TINYDRM_ST7735R=m +CONFIG_DRM_XEN=y +CONFIG_DRM_XEN_FRONTEND=m +CONFIG_DRM_VBOXVIDEO=m +# CONFIG_DRM_LEGACY is not set +CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y + +# +# Frame buffer Devices +# +CONFIG_FB_CMDLINE=y +CONFIG_FB_NOTIFY=y +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_BOOT_VESA_SUPPORT=y +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +# CONFIG_FB_FOREIGN_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +CONFIG_FB_BACKLIGHT=m +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +CONFIG_FB_VESA=y +CONFIG_FB_EFI=y +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_INTEL is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_VIA is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set +# CONFIG_FB_SM501 is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +CONFIG_XEN_FBDEV_FRONTEND=m +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +CONFIG_FB_HYPERV=m +CONFIG_FB_SIMPLE=y +# CONFIG_FB_SSD1307 is not set +# CONFIG_FB_SM712 is not set +# end of Frame buffer Devices + +# +# Backlight & LCD device support +# +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_L4F00242T03=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI922X=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m +CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m +CONFIG_LCD_AMS369FG06=m +CONFIG_LCD_LMS501KF03=m +CONFIG_LCD_HX8357=m +CONFIG_LCD_OTM3225A=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y 
+CONFIG_BACKLIGHT_LM3533=m +CONFIG_BACKLIGHT_PWM=m +CONFIG_BACKLIGHT_DA903X=m +CONFIG_BACKLIGHT_DA9052=m +CONFIG_BACKLIGHT_MAX8925=m +CONFIG_BACKLIGHT_APPLE=m +CONFIG_BACKLIGHT_QCOM_WLED=m +CONFIG_BACKLIGHT_SAHARA=m +CONFIG_BACKLIGHT_WM831X=m +CONFIG_BACKLIGHT_ADP5520=m +CONFIG_BACKLIGHT_ADP8860=m +CONFIG_BACKLIGHT_ADP8870=m +CONFIG_BACKLIGHT_88PM860X=m +CONFIG_BACKLIGHT_PCF50633=m +CONFIG_BACKLIGHT_AAT2870=m +CONFIG_BACKLIGHT_LM3630A=m +CONFIG_BACKLIGHT_LM3639=m +CONFIG_BACKLIGHT_LP855X=m +CONFIG_BACKLIGHT_LP8788=m +CONFIG_BACKLIGHT_PANDORA=m +CONFIG_BACKLIGHT_SKY81452=m +CONFIG_BACKLIGHT_TPS65217=m +CONFIG_BACKLIGHT_AS3711=m +CONFIG_BACKLIGHT_GPIO=m +CONFIG_BACKLIGHT_LV5207LP=m +CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m +CONFIG_BACKLIGHT_RAVE_SP=m +CONFIG_BACKLIGHT_LED=m +# end of Backlight & LCD device support + +CONFIG_VIDEOMODE_HELPERS=y +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 +# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER=y +# end of Console display driver support + +# CONFIG_LOGO is not set +# end of Graphics support + +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +# CONFIG_SOUND_OSS_CORE_PRECLAIM is not set +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_SEQ_DEVICE=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=m +CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_PCM_TIMER=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +# CONFIG_SND_SUPPORT_OLD_API is not set +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +CONFIG_SND_VERBOSE_PRINTK=y +CONFIG_SND_DEBUG=y +# CONFIG_SND_DEBUG_VERBOSE is not set +# CONFIG_SND_PCM_XRUN_DEBUG is not set +# CONFIG_SND_CTL_VALIDATION is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_SEQ_MIDI_EVENT=m +CONFIG_SND_SEQ_MIDI=m +CONFIG_SND_SEQ_MIDI_EMUL=m +CONFIG_SND_SEQ_VIRMIDI=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL3_LIB_SEQ=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +# CONFIG_SND_PCSP is not set +CONFIG_SND_DUMMY=m +CONFIG_SND_ALOOP=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ASIHPI=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +# CONFIG_SND_BT87X_OVERCLOCK is not set +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m +CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m 
+CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_ES1968_INPUT=y +CONFIG_SND_ES1968_RADIO=y +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X_BOOL=y +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LOLA=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MAESTRO3_INPUT=y +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m + +# +# HD-Audio +# +CONFIG_SND_HDA=m +CONFIG_SND_HDA_GENERIC_LEDS=y +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_RECONFIG=y +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_BEEP_MODE=1 +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_ANALOG=m +CONFIG_SND_HDA_CODEC_SIGMATEL=m +CONFIG_SND_HDA_CODEC_VIA=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_HDA_CODEC_CIRRUS=m +CONFIG_SND_HDA_CODEC_CONEXANT=m +CONFIG_SND_HDA_CODEC_CA0110=m +CONFIG_SND_HDA_CODEC_CA0132=m +CONFIG_SND_HDA_CODEC_CA0132_DSP=y +CONFIG_SND_HDA_CODEC_CMEDIA=m +CONFIG_SND_HDA_CODEC_SI3054=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +# CONFIG_SND_HDA_INTEL_HDMI_SILENT_STREAM is not set +# end of HD-Audio + +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_COMPONENT=y +CONFIG_SND_HDA_I915=y +CONFIG_SND_HDA_EXT_CORE=m +CONFIG_SND_HDA_PREALLOC_SIZE=0 +CONFIG_SND_INTEL_NHLT=y +CONFIG_SND_INTEL_DSP_CONFIG=m +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_AUDIO_USE_MEDIA_CONTROLLER=y +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y +CONFIG_SND_USB_US122L=m +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_FIREWIRE=y +CONFIG_SND_FIREWIRE_LIB=m +CONFIG_SND_DICE=m +CONFIG_SND_OXFW=m +CONFIG_SND_ISIGHT=m +CONFIG_SND_FIREWORKS=m +CONFIG_SND_BEBOB=m +CONFIG_SND_FIREWIRE_DIGI00X=m +CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_AC97_BUS=y +CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +CONFIG_SND_SOC_TOPOLOGY=y +CONFIG_SND_SOC_ACPI=m +CONFIG_SND_SOC_AMD_ACP=m +CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m +CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m +CONFIG_SND_SOC_AMD_ACP3x=m +CONFIG_SND_SOC_AMD_RV_RT5682_MACH=m +CONFIG_SND_SOC_AMD_RENOIR=m +CONFIG_SND_SOC_AMD_RENOIR_MACH=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_SOC_MIKROE_PROTO=m +CONFIG_SND_BCM63XX_I2S_WHISTLER=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +# CONFIG_SND_SOC_FSL_ASRC is not set +# CONFIG_SND_SOC_FSL_SAI is not set +# CONFIG_SND_SOC_FSL_AUDMIX is not set +# 
CONFIG_SND_SOC_FSL_SSI is not set +# CONFIG_SND_SOC_FSL_SPDIF is not set +# CONFIG_SND_SOC_FSL_ESAI is not set +# CONFIG_SND_SOC_FSL_MICFIL is not set +# CONFIG_SND_SOC_IMX_AUDMUX is not set +# end of SoC Audio for Freescale CPUs + +CONFIG_SND_I2S_HI6210_I2S=m +CONFIG_SND_SOC_IMG=y +CONFIG_SND_SOC_IMG_I2S_IN=m +CONFIG_SND_SOC_IMG_I2S_OUT=m +CONFIG_SND_SOC_IMG_PARALLEL_OUT=m +CONFIG_SND_SOC_IMG_SPDIF_IN=m +CONFIG_SND_SOC_IMG_SPDIF_OUT=m +CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m +CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y +CONFIG_SND_SST_IPC=m +CONFIG_SND_SST_IPC_PCI=m +CONFIG_SND_SST_IPC_ACPI=m +CONFIG_SND_SOC_INTEL_SST_ACPI=m +CONFIG_SND_SOC_INTEL_SST=m +CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m +CONFIG_SND_SOC_INTEL_HASWELL=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m +CONFIG_SND_SOC_INTEL_SKYLAKE=m +CONFIG_SND_SOC_INTEL_SKL=m +CONFIG_SND_SOC_INTEL_APL=m +CONFIG_SND_SOC_INTEL_KBL=m +CONFIG_SND_SOC_INTEL_GLK=m +CONFIG_SND_SOC_INTEL_CNL=m +CONFIG_SND_SOC_INTEL_CFL=m +CONFIG_SND_SOC_INTEL_CML_H=m +CONFIG_SND_SOC_INTEL_CML_LP=m +CONFIG_SND_SOC_INTEL_SKYLAKE_FAMILY=m +CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m +# CONFIG_SND_SOC_INTEL_SKYLAKE_HDAUDIO_CODEC is not set +CONFIG_SND_SOC_INTEL_SKYLAKE_COMMON=m +CONFIG_SND_SOC_ACPI_INTEL_MATCH=m +CONFIG_SND_SOC_INTEL_MACH=y +# CONFIG_SND_SOC_INTEL_USER_FRIENDLY_LONG_NAMES is not set +CONFIG_SND_SOC_INTEL_HASWELL_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5650_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m +CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_CX2072X_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m +# CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH is not set +CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m +CONFIG_SND_SOC_INTEL_SOF_WM8804_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5660_MACH=m +CONFIG_SND_SOC_INTEL_GLK_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SKL_HDA_DSP_GENERIC_MACH=m +CONFIG_SND_SOC_INTEL_SOF_RT5682_MACH=m +CONFIG_SND_SOC_INTEL_SOF_PCM512x_MACH=m +CONFIG_SND_SOC_INTEL_CML_LP_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_SOF_CML_RT1011_RT5682_MACH=m +CONFIG_SND_SOC_INTEL_SOF_DA7219_MAX98373_MACH=m +CONFIG_SND_SOC_INTEL_EHL_RT5660_MACH=m +CONFIG_SND_SOC_MTK_BTCVSD=m +CONFIG_SND_SOC_SOF_TOPLEVEL=y +CONFIG_SND_SOC_SOF_PCI=m +CONFIG_SND_SOC_SOF_ACPI=m +CONFIG_SND_SOC_SOF_OF=m +# CONFIG_SND_SOC_SOF_DEBUG_PROBES is not set +# CONFIG_SND_SOC_SOF_DEVELOPER_SUPPORT is not set +CONFIG_SND_SOC_SOF=m +CONFIG_SND_SOC_SOF_PROBE_WORK_QUEUE=y +CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y +CONFIG_SND_SOC_SOF_INTEL_ACPI=m +CONFIG_SND_SOC_SOF_INTEL_PCI=m +CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m +CONFIG_SND_SOC_SOF_INTEL_ATOM_HIFI_EP=m 
+CONFIG_SND_SOC_SOF_INTEL_COMMON=m +CONFIG_SND_SOC_SOF_MERRIFIELD_SUPPORT=y +CONFIG_SND_SOC_SOF_MERRIFIELD=m +CONFIG_SND_SOC_SOF_APOLLOLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_APOLLOLAKE=m +CONFIG_SND_SOC_SOF_GEMINILAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_GEMINILAKE=m +CONFIG_SND_SOC_SOF_CANNONLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_CANNONLAKE=m +CONFIG_SND_SOC_SOF_COFFEELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_COFFEELAKE=m +CONFIG_SND_SOC_SOF_ICELAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ICELAKE=m +CONFIG_SND_SOC_SOF_COMETLAKE=m +CONFIG_SND_SOC_SOF_COMETLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_COMETLAKE_LP_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_TIGERLAKE=m +CONFIG_SND_SOC_SOF_ELKHARTLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_ELKHARTLAKE=m +CONFIG_SND_SOC_SOF_JASPERLAKE_SUPPORT=y +CONFIG_SND_SOC_SOF_JASPERLAKE=m +CONFIG_SND_SOC_SOF_HDA_COMMON=m +CONFIG_SND_SOC_SOF_HDA_LINK=y +CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y +# CONFIG_SND_SOC_SOF_HDA_ALWAYS_ENABLE_DMI_L1 is not set +CONFIG_SND_SOC_SOF_HDA_LINK_BASELINE=m +CONFIG_SND_SOC_SOF_HDA=m +CONFIG_SND_SOC_SOF_XTENSA=m + +# +# STMicroelectronics STM32 SOC audio support +# +# end of STMicroelectronics STM32 SOC audio support + +CONFIG_SND_SOC_XILINX_I2S=m +CONFIG_SND_SOC_XILINX_AUDIO_FORMATTER=m +CONFIG_SND_SOC_XILINX_SPDIF=m +CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m +CONFIG_SND_SOC_I2C_AND_SPI=m + +# +# CODEC drivers +# +CONFIG_SND_SOC_AC97_CODEC=m +CONFIG_SND_SOC_ADAU_UTILS=m +CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m +CONFIG_SND_SOC_ADAU7002=m +CONFIG_SND_SOC_ADAU7118=m +CONFIG_SND_SOC_ADAU7118_HW=m +CONFIG_SND_SOC_ADAU7118_I2C=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4118=m +CONFIG_SND_SOC_AK4458=m +CONFIG_SND_SOC_AK4554=m +CONFIG_SND_SOC_AK4613=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK5386=m +CONFIG_SND_SOC_AK5558=m +CONFIG_SND_SOC_ALC5623=m +CONFIG_SND_SOC_BD28623=m +# CONFIG_SND_SOC_BT_SCO is not set +CONFIG_SND_SOC_CPCAP=m +CONFIG_SND_SOC_CROS_EC_CODEC=m +CONFIG_SND_SOC_CS35L32=m +CONFIG_SND_SOC_CS35L33=m +CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m +CONFIG_SND_SOC_CS35L36=m +CONFIG_SND_SOC_CS42L42=m +CONFIG_SND_SOC_CS42L51=m +CONFIG_SND_SOC_CS42L51_I2C=m +CONFIG_SND_SOC_CS42L52=m +CONFIG_SND_SOC_CS42L56=m +CONFIG_SND_SOC_CS42L73=m +CONFIG_SND_SOC_CS4265=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +CONFIG_SND_SOC_CS4271_SPI=m +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +CONFIG_SND_SOC_CS43130=m +CONFIG_SND_SOC_CS4341=m +CONFIG_SND_SOC_CS4349=m +CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_CX2072X=m +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m +CONFIG_SND_SOC_ES7241=m +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8328=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_GTM601=m +CONFIG_SND_SOC_HDAC_HDMI=m +CONFIG_SND_SOC_HDAC_HDA=m +CONFIG_SND_SOC_INNO_RK3036=m +CONFIG_SND_SOC_LOCHNAGAR_SC=m +CONFIG_SND_SOC_MAX98088=m +CONFIG_SND_SOC_MAX98090=m +CONFIG_SND_SOC_MAX98357A=m +CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX9867=m +CONFIG_SND_SOC_MAX98927=m +CONFIG_SND_SOC_MAX98373=m +CONFIG_SND_SOC_MAX98373_I2C=m +# CONFIG_SND_SOC_MAX98373_SDW is not set +CONFIG_SND_SOC_MAX98390=m +CONFIG_SND_SOC_MAX9860=m +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m +CONFIG_SND_SOC_PCM1681=m +CONFIG_SND_SOC_PCM1789=m +CONFIG_SND_SOC_PCM1789_I2C=m +CONFIG_SND_SOC_PCM179X=m 
+CONFIG_SND_SOC_PCM179X_I2C=m +CONFIG_SND_SOC_PCM179X_SPI=m +CONFIG_SND_SOC_PCM186X=m +CONFIG_SND_SOC_PCM186X_I2C=m +CONFIG_SND_SOC_PCM186X_SPI=m +CONFIG_SND_SOC_PCM3060=m +CONFIG_SND_SOC_PCM3060_I2C=m +CONFIG_SND_SOC_PCM3060_SPI=m +CONFIG_SND_SOC_PCM3168A=m +CONFIG_SND_SOC_PCM3168A_I2C=m +CONFIG_SND_SOC_PCM3168A_SPI=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_RK3328=m +CONFIG_SND_SOC_RL6231=m +CONFIG_SND_SOC_RL6347A=m +CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m +CONFIG_SND_SOC_RT1011=m +CONFIG_SND_SOC_RT1015=m +CONFIG_SND_SOC_RT1308_SDW=m +CONFIG_SND_SOC_RT5514=m +CONFIG_SND_SOC_RT5514_SPI=m +CONFIG_SND_SOC_RT5616=m +CONFIG_SND_SOC_RT5631=m +CONFIG_SND_SOC_RT5640=m +CONFIG_SND_SOC_RT5645=m +CONFIG_SND_SOC_RT5651=m +CONFIG_SND_SOC_RT5660=m +CONFIG_SND_SOC_RT5663=m +CONFIG_SND_SOC_RT5670=m +CONFIG_SND_SOC_RT5677=m +CONFIG_SND_SOC_RT5677_SPI=m +CONFIG_SND_SOC_RT5682=m +CONFIG_SND_SOC_RT5682_I2C=m +CONFIG_SND_SOC_RT5682_SDW=m +CONFIG_SND_SOC_RT700=m +CONFIG_SND_SOC_RT700_SDW=m +CONFIG_SND_SOC_RT711=m +CONFIG_SND_SOC_RT711_SDW=m +CONFIG_SND_SOC_RT715=m +CONFIG_SND_SOC_RT715_SDW=m +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SI476X=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m +CONFIG_SND_SOC_SIMPLE_AMPLIFIER=m +CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2305=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_SSM2602_SPI=m +CONFIG_SND_SOC_SSM2602_I2C=m +CONFIG_SND_SOC_SSM4567=m +CONFIG_SND_SOC_STA32X=m +CONFIG_SND_SOC_STA350=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SOC_TAS2552=m +CONFIG_SND_SOC_TAS2562=m +CONFIG_SND_SOC_TAS2770=m +CONFIG_SND_SOC_TAS5086=m +CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m +CONFIG_SND_SOC_TAS6424=m +CONFIG_SND_SOC_TDA7419=m +CONFIG_SND_SOC_TFA9879=m +CONFIG_SND_SOC_TLV320AIC23=m +CONFIG_SND_SOC_TLV320AIC23_I2C=m +CONFIG_SND_SOC_TLV320AIC23_SPI=m +CONFIG_SND_SOC_TLV320AIC31XX=m +CONFIG_SND_SOC_TLV320AIC32X4=m +CONFIG_SND_SOC_TLV320AIC32X4_I2C=m +CONFIG_SND_SOC_TLV320AIC32X4_SPI=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TLV320ADCX140=m +CONFIG_SND_SOC_TS3A227E=m +CONFIG_SND_SOC_TSCS42XX=m +CONFIG_SND_SOC_TSCS454=m +CONFIG_SND_SOC_UDA1334=m +CONFIG_SND_SOC_WCD9335=m +CONFIG_SND_SOC_WCD934X=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8524=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8737=m +CONFIG_SND_SOC_WM8741=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8770=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8782=m +CONFIG_SND_SOC_WM8804=m +CONFIG_SND_SOC_WM8804_I2C=m +CONFIG_SND_SOC_WM8804_SPI=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8904=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8962=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_WM8985=m +CONFIG_SND_SOC_WSA881X=m +CONFIG_SND_SOC_ZL38060=m +CONFIG_SND_SOC_ZX_AUD96P22=m +CONFIG_SND_SOC_MAX9759=m +CONFIG_SND_SOC_MT6351=m +CONFIG_SND_SOC_MT6358=m +CONFIG_SND_SOC_MT6660=m +CONFIG_SND_SOC_NAU8540=m +CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8822=m +CONFIG_SND_SOC_NAU8824=m +CONFIG_SND_SOC_NAU8825=m +CONFIG_SND_SOC_TPA6130A2=m +# end of CODEC drivers + +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_AUDIO_GRAPH_CARD=m +CONFIG_SND_X86=y +CONFIG_HDMI_LPE_AUDIO=m +CONFIG_SND_SYNTH_EMUX=m +CONFIG_SND_XEN_FRONTEND=m +CONFIG_AC97_BUS=m + +# +# HID support +# +CONFIG_HID=m +CONFIG_HID_BATTERY_STRENGTH=y 
+CONFIG_HIDRAW=y +CONFIG_UHID=m +CONFIG_HID_GENERIC=m + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m +CONFIG_HID_ACRUX=m +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=m +CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m +CONFIG_HID_AUREAL=m +CONFIG_HID_BELKIN=m +CONFIG_HID_BETOP_FF=m +CONFIG_HID_BIGBEN_FF=m +CONFIG_HID_CHERRY=m +CONFIG_HID_CHICONY=m +CONFIG_HID_CORSAIR=m +CONFIG_HID_COUGAR=m +CONFIG_HID_MACALLY=m +CONFIG_HID_PRODIKEYS=m +CONFIG_HID_CMEDIA=m +CONFIG_HID_CP2112=m +CONFIG_HID_CREATIVE_SB0540=m +CONFIG_HID_CYPRESS=m +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=m +CONFIG_HID_ELAN=m +CONFIG_HID_ELECOM=m +CONFIG_HID_ELO=m +CONFIG_HID_EZKEY=m +CONFIG_HID_GEMBIRD=m +CONFIG_HID_GFRM=m +CONFIG_HID_GLORIOUS=m +CONFIG_HID_HOLTEK=m +CONFIG_HOLTEK_FF=y +CONFIG_HID_GOOGLE_HAMMER=m +CONFIG_HID_GT683R=m +CONFIG_HID_KEYTOUCH=m +CONFIG_HID_KYE=m +CONFIG_HID_UCLOGIC=m +CONFIG_HID_WALTOP=m +CONFIG_HID_VIEWSONIC=m +CONFIG_HID_GYRATION=m +CONFIG_HID_ICADE=m +CONFIG_HID_ITE=m +CONFIG_HID_JABRA=m +CONFIG_HID_TWINHAN=m +CONFIG_HID_KENSINGTON=m +CONFIG_HID_LCPOWER=m +CONFIG_HID_LED=m +CONFIG_HID_LENOVO=m +CONFIG_HID_LOGITECH=m +CONFIG_HID_LOGITECH_DJ=m +CONFIG_HID_LOGITECH_HIDPP=m +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +CONFIG_HID_MAGICMOUSE=m +CONFIG_HID_MALTRON=m +CONFIG_HID_MAYFLASH=m +CONFIG_HID_REDRAGON=m +CONFIG_HID_MICROSOFT=m +CONFIG_HID_MONTEREY=m +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m +CONFIG_HID_NTRIG=m +CONFIG_HID_ORTEK=m +CONFIG_HID_PANTHERLORD=m +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=m +CONFIG_HID_PETALYNX=m +CONFIG_HID_PICOLCD=m +CONFIG_HID_PICOLCD_FB=y +CONFIG_HID_PICOLCD_BACKLIGHT=y +CONFIG_HID_PICOLCD_LCD=y +CONFIG_HID_PICOLCD_LEDS=y +CONFIG_HID_PICOLCD_CIR=y +CONFIG_HID_PLANTRONICS=m +CONFIG_HID_PRIMAX=m +CONFIG_HID_RETRODE=m +CONFIG_HID_ROCCAT=m +CONFIG_HID_SAITEK=m +CONFIG_HID_SAMSUNG=m +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y +CONFIG_HID_SPEEDLINK=m +CONFIG_HID_STEAM=m +CONFIG_HID_STEELSERIES=m +CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m +CONFIG_HID_GREENASIA=m +CONFIG_GREENASIA_FF=y +CONFIG_HID_HYPERV_MOUSE=m +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=m +CONFIG_HID_TOPSEED=m +CONFIG_HID_THINGM=m +CONFIG_HID_THRUSTMASTER=m +CONFIG_THRUSTMASTER_FF=y +CONFIG_HID_UDRAW_PS3=m +CONFIG_HID_U2FZERO=m +CONFIG_HID_WACOM=m +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=m +CONFIG_HID_ZEROPLUS=m +CONFIG_ZEROPLUS_FF=y +CONFIG_HID_ZYDACRON=m +CONFIG_HID_SENSOR_HUB=m +# CONFIG_HID_SENSOR_CUSTOM_SENSOR is not set +CONFIG_HID_ALPS=m +CONFIG_HID_MCP2221=m +# end of Special HID drivers + +# +# USB HID support +# +CONFIG_USB_HID=m +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y + +# +# USB HID Boot Protocol drivers +# +# CONFIG_USB_KBD is not set +# CONFIG_USB_MOUSE is not set +# end of USB HID Boot Protocol drivers +# end of USB HID support + +# +# I2C HID support +# +CONFIG_I2C_HID=m +# end of I2C HID support + +# +# Intel ISH HID support +# +CONFIG_INTEL_ISH_HID=m +CONFIG_INTEL_ISH_FIRMWARE_DOWNLOADER=m +# end of Intel ISH HID support +# end of HID support + +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +CONFIG_USB_LED_TRIG=y +CONFIG_USB_ULPI_BUS=m +CONFIG_USB_CONN_GPIO=m +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=y +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +CONFIG_USB_DYNAMIC_MINORS=y +# CONFIG_USB_OTG is not set +# CONFIG_USB_OTG_PRODUCTLIST is not set +# 
CONFIG_USB_OTG_DISABLE_EXTERNAL_HUB is not set +CONFIG_USB_LEDS_TRIGGER_USBPORT=m +CONFIG_USB_AUTOSUSPEND_DELAY=2 +CONFIG_USB_MON=m + +# +# USB Host Controller Drivers +# +CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +# CONFIG_USB_XHCI_DBGCAP is not set +CONFIG_USB_XHCI_PCI=m +CONFIG_USB_XHCI_PCI_RENESAS=m +CONFIG_USB_XHCI_PLATFORM=m +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_TT_NEWSCHED=y +CONFIG_USB_EHCI_PCI=m +CONFIG_USB_EHCI_FSL=m +CONFIG_USB_EHCI_HCD_PLATFORM=m +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_FOTG210_HCD=m +CONFIG_USB_MAX3421_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PCI=m +# CONFIG_USB_OHCI_HCD_SSB is not set +CONFIG_USB_OHCI_HCD_PLATFORM=m +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +# CONFIG_USB_SL811_HCD_ISO is not set +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_HCD_BCMA=m +CONFIG_USB_HCD_SSB=m +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_REALTEK=m +CONFIG_REALTEK_AUTOPM=y +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m +CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m +CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_STORAGE_ENE_UB6250=m +CONFIG_USB_UAS=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USBIP_CORE=m +CONFIG_USBIP_VHCI_HCD=m +CONFIG_USBIP_VHCI_HC_PORTS=8 +CONFIG_USBIP_VHCI_NR_HCS=1 +CONFIG_USBIP_HOST=m +CONFIG_USBIP_VUDC=m +# CONFIG_USBIP_DEBUG is not set +CONFIG_USB_CDNS3=m +CONFIG_USB_CDNS3_GADGET=y +CONFIG_USB_CDNS3_HOST=y +CONFIG_USB_CDNS3_PCI_WRAP=m +CONFIG_USB_MUSB_HDRC=m +# CONFIG_USB_MUSB_HOST is not set +# CONFIG_USB_MUSB_GADGET is not set +CONFIG_USB_MUSB_DUAL_ROLE=y + +# +# Platform Glue Layer +# + +# +# MUSB DMA mode +# +# CONFIG_MUSB_PIO_ONLY is not set +CONFIG_USB_DWC3=m +CONFIG_USB_DWC3_ULPI=y +# CONFIG_USB_DWC3_HOST is not set +# CONFIG_USB_DWC3_GADGET is not set +CONFIG_USB_DWC3_DUAL_ROLE=y + +# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_PCI=m +CONFIG_USB_DWC3_HAPS=m +CONFIG_USB_DWC3_OF_SIMPLE=m +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_HOST is not set + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PERIPHERAL is not set +CONFIG_USB_DWC2_DUAL_ROLE=y +CONFIG_USB_DWC2_PCI=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +CONFIG_USB_CHIPIDEA=m +CONFIG_USB_CHIPIDEA_UDC=y +CONFIG_USB_CHIPIDEA_HOST=y +CONFIG_USB_CHIPIDEA_PCI=m +CONFIG_USB_CHIPIDEA_MSM=m +CONFIG_USB_CHIPIDEA_IMX=m +CONFIG_USB_CHIPIDEA_GENERIC=m +CONFIG_USB_CHIPIDEA_TEGRA=m +CONFIG_USB_ISP1760=m +CONFIG_USB_ISP1760_HCD=y +CONFIG_USB_ISP1761_UDC=y +# CONFIG_USB_ISP1760_HOST_ROLE is not set +# CONFIG_USB_ISP1760_GADGET_ROLE is not set +CONFIG_USB_ISP1760_DUAL_ROLE=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=y +CONFIG_USB_SERIAL_CONSOLE=y +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_SIMPLE=m +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m 
+CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_F81232=m +CONFIG_USB_SERIAL_F8153X=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_METRO=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7715_PARPORT=y +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MXUPORT=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QCAUX=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_SAFE=m +# CONFIG_USB_SERIAL_SAFE_PADDED is not set +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_WWAN=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_XSENS_MT=m +CONFIG_USB_SERIAL_WISHBONE=m +CONFIG_USB_SERIAL_SSU100=m +CONFIG_USB_SERIAL_QT2=m +CONFIG_USB_SERIAL_UPD78F0730=m +CONFIG_USB_SERIAL_DEBUG=m + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m +CONFIG_USB_SEVSEG=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m +CONFIG_USB_APPLEDISPLAY=m +CONFIG_APPLE_MFI_FASTCHARGE=m +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +CONFIG_USB_TRANCEVIBRATOR=m +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_EHSET_TEST_FIXTURE=m +CONFIG_USB_ISIGHTFW=m +CONFIG_USB_YUREX=m +CONFIG_USB_EZUSB_FX2=m +CONFIG_USB_HUB_USB251XB=m +CONFIG_USB_HSIC_USB3503=m +CONFIG_USB_HSIC_USB4604=m +CONFIG_USB_LINK_LAYER_TEST=m +CONFIG_USB_CHAOSKEY=m +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y +CONFIG_NOP_USB_XCEIV=m +CONFIG_USB_GPIO_VBUS=m +CONFIG_TAHVO_USB=m +# CONFIG_TAHVO_USB_HOST_BY_DEFAULT is not set +CONFIG_USB_ISP1301=m +# end of USB Physical Layer drivers + +CONFIG_USB_GADGET=m +# CONFIG_USB_GADGET_DEBUG is not set +# CONFIG_USB_GADGET_DEBUG_FILES is not set +# CONFIG_USB_GADGET_DEBUG_FS is not set +CONFIG_USB_GADGET_VBUS_DRAW=2 +CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 +CONFIG_U_SERIAL_CONSOLE=y + +# +# USB Peripheral Controller +# +CONFIG_USB_FOTG210_UDC=m +CONFIG_USB_GR_UDC=m +CONFIG_USB_R8A66597=m +CONFIG_USB_PXA27X=m +CONFIG_USB_MV_UDC=m +CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m +CONFIG_USB_SNP_UDC_PLAT=m +CONFIG_USB_M66592=m +CONFIG_USB_BDC_UDC=m + +# +# Platform Support +# +CONFIG_USB_BDC_PCI=m +CONFIG_USB_AMD5536UDC=m +CONFIG_USB_NET2272=m +CONFIG_USB_NET2272_DMA=y +CONFIG_USB_NET2280=m +CONFIG_USB_GOKU=m +CONFIG_USB_EG20T=m +CONFIG_USB_GADGET_XILINX=m +CONFIG_USB_MAX3420_UDC=m +CONFIG_USB_DUMMY_HCD=m +# end of USB Peripheral Controller + +CONFIG_USB_LIBCOMPOSITE=m +CONFIG_USB_F_ACM=m +CONFIG_USB_F_SS_LB=m +CONFIG_USB_U_SERIAL=m +CONFIG_USB_U_ETHER=m +CONFIG_USB_U_AUDIO=m +CONFIG_USB_F_SERIAL=m +CONFIG_USB_F_OBEX=m +CONFIG_USB_F_NCM=m +CONFIG_USB_F_ECM=m +CONFIG_USB_F_PHONET=m +CONFIG_USB_F_EEM=m +CONFIG_USB_F_SUBSET=m +CONFIG_USB_F_RNDIS=m 
+CONFIG_USB_F_MASS_STORAGE=m +CONFIG_USB_F_FS=m +CONFIG_USB_F_UAC1=m +CONFIG_USB_F_UAC1_LEGACY=m +CONFIG_USB_F_UAC2=m +CONFIG_USB_F_UVC=m +CONFIG_USB_F_MIDI=m +CONFIG_USB_F_HID=m +CONFIG_USB_F_PRINTER=m +CONFIG_USB_F_TCM=m +CONFIG_USB_CONFIGFS=m +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_OBEX=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_ECM_SUBSET=y +CONFIG_USB_CONFIGFS_RNDIS=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_PHONET=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_LB_SS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_UAC1=y +CONFIG_USB_CONFIGFS_F_UAC1_LEGACY=y +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_USB_CONFIGFS_F_PRINTER=y +CONFIG_USB_CONFIGFS_F_TCM=y + +# +# USB Gadget precomposed configurations +# +CONFIG_USB_ZERO=m +CONFIG_USB_AUDIO=m +# CONFIG_GADGET_UAC1 is not set +CONFIG_USB_ETH=m +CONFIG_USB_ETH_RNDIS=y +CONFIG_USB_ETH_EEM=y +CONFIG_USB_G_NCM=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FUNCTIONFS=m +CONFIG_USB_FUNCTIONFS_ETH=y +CONFIG_USB_FUNCTIONFS_RNDIS=y +CONFIG_USB_FUNCTIONFS_GENERIC=y +CONFIG_USB_MASS_STORAGE=m +CONFIG_USB_GADGET_TARGET=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_MIDI_GADGET=m +CONFIG_USB_G_PRINTER=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_USB_G_NOKIA=m +CONFIG_USB_G_ACM_MS=m +CONFIG_USB_G_MULTI=m +CONFIG_USB_G_MULTI_RNDIS=y +CONFIG_USB_G_MULTI_CDC=y +CONFIG_USB_G_HID=m +CONFIG_USB_G_DBGP=m +# CONFIG_USB_G_DBGP_PRINTK is not set +CONFIG_USB_G_DBGP_SERIAL=y +CONFIG_USB_G_WEBCAM=m +CONFIG_USB_RAW_GADGET=m +# end of USB Gadget precomposed configurations + +CONFIG_TYPEC=m +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_RT1711H=m +CONFIG_TYPEC_FUSB302=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_CCG=m +CONFIG_UCSI_ACPI=m +CONFIG_TYPEC_HD3SS3220=m +CONFIG_TYPEC_TPS6598X=m + +# +# USB Type-C Multiplexer/DeMultiplexer Switch support +# +CONFIG_TYPEC_MUX_PI3USB30532=m +# CONFIG_TYPEC_MUX_INTEL_PMC is not set +# end of USB Type-C Multiplexer/DeMultiplexer Switch support + +# +# USB Type-C Alternate Mode drivers +# +CONFIG_TYPEC_DP_ALTMODE=m +CONFIG_TYPEC_NVIDIA_ALTMODE=m +# end of USB Type-C Alternate Mode drivers + +CONFIG_USB_ROLE_SWITCH=m +CONFIG_USB_ROLES_INTEL_XHCI=m +CONFIG_MMC=m +CONFIG_PWRSEQ_EMMC=m +CONFIG_PWRSEQ_SD8787=m +CONFIG_PWRSEQ_SIMPLE=m +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_MINORS=8 +CONFIG_SDIO_UART=m +CONFIG_MMC_TEST=m + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_IO_ACCESSORS=y +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=y +CONFIG_MMC_SDHCI_ACPI=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_SDHCI_OF_ARASAN=m +CONFIG_MMC_SDHCI_OF_ASPEED=m +CONFIG_MMC_SDHCI_OF_AT91=m +CONFIG_MMC_SDHCI_OF_DWCMSHC=m +CONFIG_MMC_SDHCI_CADENCE=m +CONFIG_MMC_SDHCI_F_SDH30=m +CONFIG_MMC_SDHCI_MILBEAUT=m +CONFIG_MMC_WBSD=m +CONFIG_MMC_ALCOR=m +CONFIG_MMC_TIFM_SD=m +CONFIG_MMC_SPI=m +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MMC_VUB300=m +CONFIG_MMC_USHC=m +CONFIG_MMC_USDHI6ROL0=m +CONFIG_MMC_REALTEK_PCI=m +CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_CQHCI=m +CONFIG_MMC_HSQ=m +CONFIG_MMC_TOSHIBA_PCI=m +CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m +CONFIG_MMC_SDHCI_OMAP=m +CONFIG_MMC_SDHCI_AM654=m +CONFIG_MMC_SDHCI_EXTERNAL_DMA=y +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m +CONFIG_MS_BLOCK=m + +# +# MemoryStick Host 
Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_MEMSTICK_R592=m +CONFIG_MEMSTICK_REALTEK_PCI=m +CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_CLASS_FLASH=m +# CONFIG_LEDS_CLASS_MULTICOLOR is not set +CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y + +# +# LED drivers +# +CONFIG_LEDS_88PM860X=m +CONFIG_LEDS_AAT1290=m +CONFIG_LEDS_AN30259A=m +CONFIG_LEDS_APU=m +CONFIG_LEDS_AS3645A=m +CONFIG_LEDS_AW2013=m +CONFIG_LEDS_BCM6328=m +CONFIG_LEDS_BCM6358=m +CONFIG_LEDS_CPCAP=m +CONFIG_LEDS_CR0014114=m +CONFIG_LEDS_EL15203000=m +CONFIG_LEDS_LM3530=m +CONFIG_LEDS_LM3532=m +CONFIG_LEDS_LM3533=m +CONFIG_LEDS_LM3642=m +CONFIG_LEDS_LM3692X=m +CONFIG_LEDS_LM3601X=m +CONFIG_LEDS_MT6323=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_PCA9532_GPIO=y +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_LP3952=m +# CONFIG_LEDS_LP55XX_COMMON is not set +CONFIG_LEDS_LP8788=m +CONFIG_LEDS_LP8860=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +CONFIG_LEDS_PCA955X_GPIO=y +CONFIG_LEDS_PCA963X=m +CONFIG_LEDS_WM831X_STATUS=m +CONFIG_LEDS_WM8350=m +CONFIG_LEDS_DA903X=m +CONFIG_LEDS_DA9052=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_PWM=m +CONFIG_LEDS_REGULATOR=m +CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m +CONFIG_LEDS_ADP5520=m +CONFIG_LEDS_MC13783=m +CONFIG_LEDS_TCA6507=m +CONFIG_LEDS_TLC591XX=m +CONFIG_LEDS_MAX77650=m +CONFIG_LEDS_MAX77693=m +CONFIG_LEDS_MAX8997=m +CONFIG_LEDS_LM355x=m +CONFIG_LEDS_MENF21BMC=m +CONFIG_LEDS_KTD2692=m +CONFIG_LEDS_IS31FL319X=m +CONFIG_LEDS_IS31FL32XX=m + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +CONFIG_LEDS_BLINKM=m +CONFIG_LEDS_SYSCON=y +CONFIG_LEDS_MLXCPLD=m +CONFIG_LEDS_MLXREG=m +CONFIG_LEDS_USER=m +CONFIG_LEDS_NIC78BX=m +CONFIG_LEDS_SPI_BYTE=m +CONFIG_LEDS_TI_LMU_COMMON=m +CONFIG_LEDS_LM3697=m +CONFIG_LEDS_LM36274=m +CONFIG_LEDS_TPS6105X=m +CONFIG_LEDS_SGM3140=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_ONESHOT=m +CONFIG_LEDS_TRIGGER_DISK=y +CONFIG_LEDS_TRIGGER_MTD=y +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_ACTIVITY=m +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=m +CONFIG_LEDS_TRIGGER_CAMERA=m +CONFIG_LEDS_TRIGGER_PANIC=y +CONFIG_LEDS_TRIGGER_NETDEV=m +CONFIG_LEDS_TRIGGER_PATTERN=m +CONFIG_LEDS_TRIGGER_AUDIO=m +CONFIG_ACCESSIBILITY=y +CONFIG_A11Y_BRAILLE_CONSOLE=y + +# +# Speakup console speech +# +CONFIG_SPEAKUP=m +CONFIG_SPEAKUP_SYNTH_ACNTSA=m +CONFIG_SPEAKUP_SYNTH_APOLLO=m +CONFIG_SPEAKUP_SYNTH_AUDPTR=m +CONFIG_SPEAKUP_SYNTH_BNS=m +CONFIG_SPEAKUP_SYNTH_DECTLK=m +CONFIG_SPEAKUP_SYNTH_DECEXT=m +CONFIG_SPEAKUP_SYNTH_LTLK=m +CONFIG_SPEAKUP_SYNTH_SOFT=m +CONFIG_SPEAKUP_SYNTH_SPKOUT=m +CONFIG_SPEAKUP_SYNTH_TXPRT=m +CONFIG_SPEAKUP_SYNTH_DUMMY=m +# end of Speakup console speech + +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_QIB=m +CONFIG_INFINIBAND_QIB_DCA=y +CONFIG_INFINIBAND_CXGB4=m +CONFIG_INFINIBAND_EFA=m +CONFIG_INFINIBAND_I40IW=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_INFINIBAND_OCRDMA=m +CONFIG_INFINIBAND_VMWARE_PVRDMA=m 
+CONFIG_INFINIBAND_USNIC=m +CONFIG_INFINIBAND_BNXT_RE=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +# CONFIG_SDMA_VERBOSITY is not set +CONFIG_INFINIBAND_QEDR=m +CONFIG_INFINIBAND_RDMAVT=m +CONFIG_RDMA_RXE=m +CONFIG_RDMA_SIW=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_SRPT=m +CONFIG_INFINIBAND_ISER=m +CONFIG_INFINIBAND_ISERT=m +# CONFIG_INFINIBAND_RTRS_CLIENT is not set +# CONFIG_INFINIBAND_RTRS_SERVER is not set +CONFIG_INFINIBAND_OPA_VNIC=m +CONFIG_EDAC_ATOMIC_SCRUB=y +CONFIG_EDAC_SUPPORT=y +CONFIG_EDAC=y +CONFIG_EDAC_LEGACY_SYSFS=y +# CONFIG_EDAC_DEBUG is not set +CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_GHES=y +CONFIG_EDAC_AMD64=m +# CONFIG_EDAC_AMD64_ERROR_INJECTION is not set +CONFIG_EDAC_E752X=m +CONFIG_EDAC_I82975X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m +CONFIG_EDAC_X38=m +CONFIG_EDAC_I5400=m +CONFIG_EDAC_I7CORE=m +CONFIG_EDAC_I5000=m +CONFIG_EDAC_I5100=m +CONFIG_EDAC_I7300=m +CONFIG_EDAC_SBRIDGE=m +CONFIG_EDAC_SKX=m +CONFIG_EDAC_I10NM=m +CONFIG_EDAC_PND2=m +CONFIG_RTC_LIB=y +CONFIG_RTC_MC146818_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y +CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_88PM860X=m +CONFIG_RTC_DRV_88PM80X=m +CONFIG_RTC_DRV_ABB5ZES3=m +CONFIG_RTC_DRV_ABEOZ9=m +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_AS3722=m +CONFIG_RTC_DRV_DS1307=m +CONFIG_RTC_DRV_DS1307_CENTURY=y +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1374_WDT=y +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_HYM8563=m +CONFIG_RTC_DRV_LP8788=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_MAX8907=m +CONFIG_RTC_DRV_MAX8925=m +CONFIG_RTC_DRV_MAX8998=m +CONFIG_RTC_DRV_MAX8997=m +CONFIG_RTC_DRV_MAX77686=m +CONFIG_RTC_DRV_RK808=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_ISL12026=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8523=m +CONFIG_RTC_DRV_PCF85063=m +CONFIG_RTC_DRV_PCF85363=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BD70528=m +CONFIG_RTC_DRV_BQ32K=m +CONFIG_RTC_DRV_TWL4030=m +CONFIG_RTC_DRV_PALMAS=m +CONFIG_RTC_DRV_TPS6586X=m +CONFIG_RTC_DRV_TPS65910=m +CONFIG_RTC_DRV_TPS80031=m +CONFIG_RTC_DRV_RC5T583=m +CONFIG_RTC_DRV_RC5T619=m +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_RX8025=m +CONFIG_RTC_DRV_EM3027=m +CONFIG_RTC_DRV_RV3028=m +CONFIG_RTC_DRV_RV8803=m +CONFIG_RTC_DRV_S5M=m +CONFIG_RTC_DRV_SD3078=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T93=m +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m +CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1343=m +CONFIG_RTC_DRV_DS1347=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6916=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RX4581=m +CONFIG_RTC_DRV_RX6110=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_PCF2123=m +CONFIG_RTC_DRV_MCP795=m +CONFIG_RTC_I2C_AND_SPI=y + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y + +# +# Platform RTC drivers +# 
+CONFIG_RTC_DRV_CMOS=y +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1685_FAMILY=m +CONFIG_RTC_DRV_DS1685=y +# CONFIG_RTC_DRV_DS1689 is not set +# CONFIG_RTC_DRV_DS17285 is not set +# CONFIG_RTC_DRV_DS17485 is not set +# CONFIG_RTC_DRV_DS17885 is not set +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_DS2404=m +CONFIG_RTC_DRV_DA9052=m +CONFIG_RTC_DRV_DA9055=m +CONFIG_RTC_DRV_DA9063=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_WM831X=m +CONFIG_RTC_DRV_WM8350=m +CONFIG_RTC_DRV_PCF50633=m +CONFIG_RTC_DRV_AB3100=m +CONFIG_RTC_DRV_ZYNQMP=m +CONFIG_RTC_DRV_CROS_EC=m + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_CADENCE=m +CONFIG_RTC_DRV_FTRTC010=m +CONFIG_RTC_DRV_PCAP=m +CONFIG_RTC_DRV_MC13XXX=m +CONFIG_RTC_DRV_MT6397=m +CONFIG_RTC_DRV_R7301=m +CONFIG_RTC_DRV_CPCAP=m + +# +# HID Sensor RTC drivers +# +CONFIG_RTC_DRV_HID_SENSOR_TIME=m +CONFIG_RTC_DRV_WILCO_EC=m +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_ACPI=y +CONFIG_DMA_OF=y +CONFIG_ALTERA_MSGDMA=m +CONFIG_DW_AXI_DMAC=m +CONFIG_FSL_EDMA=m +CONFIG_INTEL_IDMA64=m +CONFIG_INTEL_IDXD=m +CONFIG_INTEL_IOATDMA=m +CONFIG_INTEL_MIC_X100_DMA=m +CONFIG_PLX_DMA=m +# CONFIG_XILINX_ZYNQMP_DPDMA is not set +CONFIG_QCOM_HIDMA_MGMT=m +CONFIG_QCOM_HIDMA=m +CONFIG_DW_DMAC_CORE=y +CONFIG_DW_DMAC=y +CONFIG_DW_DMAC_PCI=y +CONFIG_DW_EDMA=m +CONFIG_DW_EDMA_PCIE=m +CONFIG_HSU_DMA=y +CONFIG_SF_PDMA=m + +# +# DMA Clients +# +CONFIG_ASYNC_TX_DMA=y +# CONFIG_DMATEST is not set +CONFIG_DMA_ENGINE_RAID=y + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# CONFIG_SW_SYNC is not set +CONFIG_UDMABUF=y +# CONFIG_DMABUF_MOVE_NOTIFY is not set +# CONFIG_DMABUF_SELFTESTS is not set +CONFIG_DMABUF_HEAPS=y +CONFIG_DMABUF_HEAPS_SYSTEM=y +# end of DMABUF options + +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_HD44780=m +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_IMG_ASCII_LCD=m +CONFIG_HT16K33=m +CONFIG_PARPORT_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set +# CONFIG_CHARLCD_BL_OFF is not set +# CONFIG_CHARLCD_BL_ON is not set +CONFIG_CHARLCD_BL_FLASH=y +CONFIG_PANEL=m +CONFIG_CHARLCD=m +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_UIO_DMEM_GENIRQ=m +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +CONFIG_UIO_PCI_GENERIC=m +CONFIG_UIO_NETX=m +CONFIG_UIO_PRUSS=m +CONFIG_UIO_MF624=m +CONFIG_UIO_HV_GENERIC=m +CONFIG_VFIO_IOMMU_TYPE1=m +CONFIG_VFIO_VIRQFD=m +CONFIG_VFIO=m +# CONFIG_VFIO_NOIOMMU is not set +CONFIG_VFIO_PCI=m +CONFIG_VFIO_PCI_VGA=y +CONFIG_VFIO_PCI_MMAP=y +CONFIG_VFIO_PCI_INTX=y +CONFIG_VFIO_PCI_IGD=y +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_VIRT_DRIVERS=y +CONFIG_VBOXGUEST=m +CONFIG_VIRTIO=y +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_VDPA=m +CONFIG_VIRTIO_PMEM=m +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_MEM=m +CONFIG_VIRTIO_INPUT=m +CONFIG_VIRTIO_MMIO=m +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y +CONFIG_VDPA=m +CONFIG_VDPA_SIM=m +CONFIG_IFCVF=m +# CONFIG_MLX5_VDPA is not set +CONFIG_VHOST_IOTLB=m +CONFIG_VHOST_RING=m +CONFIG_VHOST=m +CONFIG_VHOST_MENU=y +CONFIG_VHOST_NET=m +CONFIG_VHOST_SCSI=m +CONFIG_VHOST_VSOCK=m +CONFIG_VHOST_VDPA=m +# 
CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set + +# +# Microsoft Hyper-V guest support +# +CONFIG_HYPERV=m +CONFIG_HYPERV_TIMER=y +CONFIG_HYPERV_UTILS=m +CONFIG_HYPERV_BALLOON=m +# end of Microsoft Hyper-V guest support + +# +# Xen driver support +# +CONFIG_XEN_BALLOON=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG=y +CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT=512 +CONFIG_XEN_SCRUB_PAGES_DEFAULT=y +CONFIG_XEN_DEV_EVTCHN=m +CONFIG_XEN_BACKEND=y +CONFIG_XENFS=m +CONFIG_XEN_COMPAT_XENFS=y +CONFIG_XEN_SYS_HYPERVISOR=y +CONFIG_XEN_XENBUS_FRONTEND=y +CONFIG_XEN_GNTDEV=m +CONFIG_XEN_GNTDEV_DMABUF=y +CONFIG_XEN_GRANT_DEV_ALLOC=m +CONFIG_XEN_GRANT_DMA_ALLOC=y +CONFIG_SWIOTLB_XEN=y +CONFIG_XEN_PCIDEV_BACKEND=m +CONFIG_XEN_PVCALLS_FRONTEND=m +CONFIG_XEN_PVCALLS_BACKEND=y +CONFIG_XEN_SCSI_BACKEND=m +CONFIG_XEN_PRIVCMD=m +CONFIG_XEN_ACPI_PROCESSOR=m +CONFIG_XEN_MCE_LOG=y +CONFIG_XEN_HAVE_PVMMU=y +CONFIG_XEN_EFI=y +CONFIG_XEN_AUTO_XLATE=y +CONFIG_XEN_ACPI=y +CONFIG_XEN_SYMS=y +CONFIG_XEN_HAVE_VPMU=y +CONFIG_XEN_FRONT_PGDIR_SHBUF=m +CONFIG_XEN_UNPOPULATED_ALLOC=y +# end of Xen driver support + +# CONFIG_GREYBUS is not set +CONFIG_STAGING=y +CONFIG_PRISM2_USB=m +CONFIG_COMEDI=m +# CONFIG_COMEDI_DEBUG is not set +CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 +CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 +CONFIG_COMEDI_MISC_DRIVERS=y +CONFIG_COMEDI_BOND=m +CONFIG_COMEDI_TEST=m +CONFIG_COMEDI_PARPORT=m +# CONFIG_COMEDI_ISA_DRIVERS is not set +CONFIG_COMEDI_PCI_DRIVERS=m +CONFIG_COMEDI_8255_PCI=m +CONFIG_COMEDI_ADDI_WATCHDOG=m +CONFIG_COMEDI_ADDI_APCI_1032=m +CONFIG_COMEDI_ADDI_APCI_1500=m +CONFIG_COMEDI_ADDI_APCI_1516=m +CONFIG_COMEDI_ADDI_APCI_1564=m +CONFIG_COMEDI_ADDI_APCI_16XX=m +CONFIG_COMEDI_ADDI_APCI_2032=m +CONFIG_COMEDI_ADDI_APCI_2200=m +CONFIG_COMEDI_ADDI_APCI_3120=m +CONFIG_COMEDI_ADDI_APCI_3501=m +CONFIG_COMEDI_ADDI_APCI_3XXX=m +CONFIG_COMEDI_ADL_PCI6208=m +CONFIG_COMEDI_ADL_PCI7X3X=m +CONFIG_COMEDI_ADL_PCI8164=m +CONFIG_COMEDI_ADL_PCI9111=m +CONFIG_COMEDI_ADL_PCI9118=m +CONFIG_COMEDI_ADV_PCI1710=m +CONFIG_COMEDI_ADV_PCI1720=m +CONFIG_COMEDI_ADV_PCI1723=m +CONFIG_COMEDI_ADV_PCI1724=m +CONFIG_COMEDI_ADV_PCI1760=m +CONFIG_COMEDI_ADV_PCI_DIO=m +CONFIG_COMEDI_AMPLC_DIO200_PCI=m +CONFIG_COMEDI_AMPLC_PC236_PCI=m +CONFIG_COMEDI_AMPLC_PC263_PCI=m +CONFIG_COMEDI_AMPLC_PCI224=m +CONFIG_COMEDI_AMPLC_PCI230=m +CONFIG_COMEDI_CONTEC_PCI_DIO=m +CONFIG_COMEDI_DAS08_PCI=m +CONFIG_COMEDI_DT3000=m +CONFIG_COMEDI_DYNA_PCI10XX=m +CONFIG_COMEDI_GSC_HPDI=m +CONFIG_COMEDI_MF6X4=m +CONFIG_COMEDI_ICP_MULTI=m +CONFIG_COMEDI_DAQBOARD2000=m +CONFIG_COMEDI_JR3_PCI=m +CONFIG_COMEDI_KE_COUNTER=m +CONFIG_COMEDI_CB_PCIDAS64=m +CONFIG_COMEDI_CB_PCIDAS=m +CONFIG_COMEDI_CB_PCIDDA=m +CONFIG_COMEDI_CB_PCIMDAS=m +CONFIG_COMEDI_CB_PCIMDDA=m +CONFIG_COMEDI_ME4000=m +CONFIG_COMEDI_ME_DAQ=m +CONFIG_COMEDI_NI_6527=m +CONFIG_COMEDI_NI_65XX=m +CONFIG_COMEDI_NI_660X=m +CONFIG_COMEDI_NI_670X=m +CONFIG_COMEDI_NI_LABPC_PCI=m +CONFIG_COMEDI_NI_PCIDIO=m +CONFIG_COMEDI_NI_PCIMIO=m +CONFIG_COMEDI_RTD520=m +CONFIG_COMEDI_S626=m +CONFIG_COMEDI_MITE=m +CONFIG_COMEDI_NI_TIOCMD=m +CONFIG_COMEDI_PCMCIA_DRIVERS=m +CONFIG_COMEDI_CB_DAS16_CS=m +CONFIG_COMEDI_DAS08_CS=m +CONFIG_COMEDI_NI_DAQ_700_CS=m +CONFIG_COMEDI_NI_DAQ_DIO24_CS=m +CONFIG_COMEDI_NI_LABPC_CS=m +CONFIG_COMEDI_NI_MIO_CS=m +CONFIG_COMEDI_QUATECH_DAQP_CS=m +CONFIG_COMEDI_USB_DRIVERS=m +CONFIG_COMEDI_DT9812=m +CONFIG_COMEDI_NI_USB6501=m +CONFIG_COMEDI_USBDUX=m +CONFIG_COMEDI_USBDUXFAST=m +CONFIG_COMEDI_USBDUXSIGMA=m +CONFIG_COMEDI_VMK80XX=m +CONFIG_COMEDI_8254=m +CONFIG_COMEDI_8255=m +CONFIG_COMEDI_8255_SA=m 
+CONFIG_COMEDI_KCOMEDILIB=m +CONFIG_COMEDI_AMPLC_DIO200=m +CONFIG_COMEDI_AMPLC_PC236=m +CONFIG_COMEDI_DAS08=m +CONFIG_COMEDI_NI_LABPC=m +CONFIG_COMEDI_NI_TIO=m +CONFIG_COMEDI_NI_ROUTING=m +CONFIG_RTL8192U=m +CONFIG_RTLLIB=m +CONFIG_RTLLIB_CRYPTO_CCMP=m +CONFIG_RTLLIB_CRYPTO_TKIP=m +CONFIG_RTLLIB_CRYPTO_WEP=m +CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +CONFIG_88EU_AP_MODE=y +CONFIG_RTS5208=m +CONFIG_VT6655=m +CONFIG_VT6656=m + +# +# IIO staging drivers +# + +# +# Accelerometers +# +CONFIG_ADIS16203=m +CONFIG_ADIS16240=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD7816=m +CONFIG_AD7280=m +# end of Analog to digital converters + +# +# Analog digital bi-direction converters +# +CONFIG_ADT7316=m +CONFIG_ADT7316_SPI=m +CONFIG_ADT7316_I2C=m +# end of Analog digital bi-direction converters + +# +# Capacitance to digital converters +# +CONFIG_AD7150=m +CONFIG_AD7746=m +# end of Capacitance to digital converters + +# +# Direct Digital Synthesis +# +CONFIG_AD9832=m +CONFIG_AD9834=m +# end of Direct Digital Synthesis + +# +# Network Analyzer, Impedance Converters +# +CONFIG_AD5933=m +# end of Network Analyzer, Impedance Converters + +# +# Active energy metering IC +# +CONFIG_ADE7854=m +CONFIG_ADE7854_I2C=m +CONFIG_ADE7854_SPI=m +# end of Active energy metering IC + +# +# Resolver to digital converters +# +CONFIG_AD2S1210=m +# end of Resolver to digital converters +# end of IIO staging drivers + +# CONFIG_FB_SM750 is not set +CONFIG_STAGING_MEDIA=y +# CONFIG_INTEL_ATOMISP is not set +CONFIG_VIDEO_IPU3_IMGU=m +CONFIG_VIDEO_USBVISION=m + +# +# Android +# +# end of Android + +CONFIG_STAGING_BOARD=y +CONFIG_LTE_GDM724X=m +CONFIG_FIREWIRE_SERIAL=m +CONFIG_FWTTY_MAX_TOTAL_PORTS=64 +CONFIG_FWTTY_MAX_CARD_PORTS=32 +CONFIG_GS_FPGABOOT=m +CONFIG_UNISYSSPAR=y +CONFIG_UNISYS_VISORNIC=m +CONFIG_UNISYS_VISORINPUT=m +CONFIG_UNISYS_VISORHBA=m +# CONFIG_FB_TFT is not set +CONFIG_MOST_COMPONENTS=m +CONFIG_MOST_CDEV=m +CONFIG_MOST_NET=m +CONFIG_MOST_SOUND=m +CONFIG_MOST_VIDEO=m +CONFIG_MOST_DIM2=m +CONFIG_MOST_I2C=m +CONFIG_KS7010=m +CONFIG_PI433=m + +# +# Gasket devices +# +CONFIG_STAGING_GASKET_FRAMEWORK=m +CONFIG_STAGING_APEX_DRIVER=m +# end of Gasket devices + +CONFIG_XIL_AXIS_FIFO=m +CONFIG_FIELDBUS_DEV=m +CONFIG_HMS_ANYBUSS_BUS=m +CONFIG_ARCX_ANYBUS_CONTROLLER=m +CONFIG_HMS_PROFINET=m +CONFIG_KPC2000=y +CONFIG_KPC2000_CORE=m +CONFIG_KPC2000_SPI=m +CONFIG_KPC2000_I2C=m +CONFIG_KPC2000_DMA=m +CONFIG_QLGE=m +CONFIG_WFX=m +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACPI_WMI=m +CONFIG_WMI_BMOF=m +CONFIG_ALIENWARE_WMI=m +CONFIG_HUAWEI_WMI=m +CONFIG_INTEL_WMI_SBL_FW_UPDATE=m +CONFIG_INTEL_WMI_THUNDERBOLT=m +CONFIG_MXM_WMI=m +CONFIG_PEAQ_WMI=m +CONFIG_XIAOMI_WMI=m +CONFIG_ACERHDF=m +CONFIG_ACER_WIRELESS=m +CONFIG_ACER_WMI=m +CONFIG_APPLE_GMUX=m +CONFIG_ASUS_LAPTOP=m +CONFIG_ASUS_WIRELESS=m +CONFIG_ASUS_WMI=m +CONFIG_ASUS_NB_WMI=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_EEEPC_WMI=m +CONFIG_DCDBAS=m +CONFIG_DELL_SMBIOS=m +CONFIG_DELL_SMBIOS_WMI=y +CONFIG_DELL_SMBIOS_SMM=y +CONFIG_DELL_LAPTOP=m +CONFIG_DELL_RBTN=m +# CONFIG_DELL_RBU is not set +CONFIG_DELL_SMO8800=m +CONFIG_DELL_WMI=m +CONFIG_DELL_WMI_DESCRIPTOR=m +CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m +CONFIG_AMILO_RFKILL=m +CONFIG_FUJITSU_LAPTOP=m +CONFIG_FUJITSU_TABLET=m +CONFIG_GPD_POCKET_FAN=m +CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m +CONFIG_HP_WMI=m +CONFIG_IBM_RTL=m +CONFIG_IDEAPAD_LAPTOP=m +CONFIG_SENSORS_HDAPS=m +CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES 
is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +# CONFIG_INTEL_ATOMISP2_LED is not set +CONFIG_INTEL_ATOMISP2_PM=m +CONFIG_INTEL_CHT_INT33FE=m +CONFIG_INTEL_HID_EVENT=m +CONFIG_INTEL_INT0002_VGPIO=m +CONFIG_INTEL_MENLOW=m +CONFIG_INTEL_OAKTRAIL=m +CONFIG_INTEL_VBTN=m +CONFIG_SURFACE3_WMI=m +CONFIG_SURFACE_3_BUTTON=m +CONFIG_SURFACE_3_POWER_OPREGION=m +CONFIG_SURFACE_PRO3_BUTTON=m +CONFIG_MSI_LAPTOP=m +CONFIG_MSI_WMI=m +CONFIG_PCENGINES_APU2=m +CONFIG_SAMSUNG_LAPTOP=m +CONFIG_SAMSUNG_Q10=m +CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_TOSHIBA_HAPS=m +CONFIG_TOSHIBA_WMI=m +CONFIG_ACPI_CMPC=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_LG_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_SONY_LAPTOP=m +CONFIG_SONYPI_COMPAT=y +CONFIG_SYSTEM76_ACPI=m +CONFIG_TOPSTAR_LAPTOP=m +CONFIG_I2C_MULTI_INSTANTIATE=m +CONFIG_MLX_PLATFORM=m +CONFIG_TOUCHSCREEN_DMI=y +CONFIG_INTEL_IPS=m +CONFIG_INTEL_RST=m +CONFIG_INTEL_SMARTCONNECT=m + +# +# Intel Speed Select Technology interface support +# +CONFIG_INTEL_SPEED_SELECT_INTERFACE=m +# end of Intel Speed Select Technology interface support + +CONFIG_INTEL_TURBO_MAX_3=y +CONFIG_INTEL_UNCORE_FREQ_CONTROL=m +CONFIG_INTEL_CHTDC_TI_PWRBTN=m +# CONFIG_INTEL_MFLD_THERMAL is not set +# CONFIG_INTEL_MID_POWER_BUTTON is not set +CONFIG_INTEL_PMC_CORE=y +CONFIG_INTEL_PUNIT_IPC=m +CONFIG_INTEL_SCU_IPC=y +CONFIG_INTEL_SCU=y +CONFIG_INTEL_SCU_PCI=y +CONFIG_INTEL_SCU_PLATFORM=m +CONFIG_INTEL_SCU_IPC_UTIL=m +CONFIG_INTEL_TELEMETRY=m +CONFIG_PMC_ATOM=y +CONFIG_MFD_CROS_EC=m +CONFIG_CHROME_PLATFORMS=y +CONFIG_CHROMEOS_LAPTOP=m +CONFIG_CHROMEOS_PSTORE=m +CONFIG_CHROMEOS_TBMC=m +CONFIG_CROS_EC=m +CONFIG_CROS_EC_I2C=m +CONFIG_CROS_EC_RPMSG=m +CONFIG_CROS_EC_ISHTP=m +CONFIG_CROS_EC_SPI=m +CONFIG_CROS_EC_LPC=m +CONFIG_CROS_EC_PROTO=y +CONFIG_CROS_KBD_LED_BACKLIGHT=m +CONFIG_CROS_EC_CHARDEV=m +CONFIG_CROS_EC_LIGHTBAR=m +CONFIG_CROS_EC_VBC=m +CONFIG_CROS_EC_DEBUGFS=m +CONFIG_CROS_EC_SENSORHUB=m +CONFIG_CROS_EC_SYSFS=m +CONFIG_CROS_EC_TYPEC=m +CONFIG_CROS_USBPD_LOGGER=m +CONFIG_CROS_USBPD_NOTIFY=m +CONFIG_WILCO_EC=m +CONFIG_WILCO_EC_DEBUGFS=m +CONFIG_WILCO_EC_EVENTS=m +CONFIG_WILCO_EC_TELEMETRY=m +CONFIG_MELLANOX_PLATFORM=y +CONFIG_MLXREG_HOTPLUG=m +CONFIG_MLXREG_IO=m +CONFIG_HAVE_CLK=y +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y +CONFIG_COMMON_CLK_WM831X=m +CONFIG_CLK_HSDK=y +CONFIG_COMMON_CLK_MAX77686=m +CONFIG_COMMON_CLK_MAX9485=m +CONFIG_COMMON_CLK_RK808=m +CONFIG_COMMON_CLK_SI5341=m +CONFIG_COMMON_CLK_SI5351=m +CONFIG_COMMON_CLK_SI514=m +CONFIG_COMMON_CLK_SI544=m +CONFIG_COMMON_CLK_SI570=m +CONFIG_COMMON_CLK_CDCE706=m +CONFIG_COMMON_CLK_CDCE925=m +CONFIG_COMMON_CLK_CS2000_CP=m +CONFIG_COMMON_CLK_S2MPS11=m +CONFIG_CLK_TWL6040=m +CONFIG_COMMON_CLK_LOCHNAGAR=m +CONFIG_COMMON_CLK_PALMAS=m +CONFIG_COMMON_CLK_PWM=m +CONFIG_COMMON_CLK_VC5=m +CONFIG_COMMON_CLK_BD718XX=m +CONFIG_COMMON_CLK_FIXED_MMIO=y +CONFIG_CLK_LGM_CGU=y +CONFIG_HWSPINLOCK=y + +# +# Clock Source drivers +# +CONFIG_TIMER_OF=y +CONFIG_TIMER_PROBE=y +CONFIG_CLKEVT_I8253=y +CONFIG_I8253_LOCK=y +CONFIG_CLKBLD_I8253=y +CONFIG_CLKSRC_MMIO=y +CONFIG_MICROCHIP_PIT64B=y +# end of Clock Source drivers + +CONFIG_MAILBOX=y +CONFIG_PLATFORM_MHU=m +CONFIG_PCC=y +CONFIG_ALTERA_MBOX=m +CONFIG_MAILBOX_TEST=m +CONFIG_IOMMU_IOVA=y +CONFIG_IOASID=y +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +# end of Generic IOMMU Pagetable Support + +# CONFIG_IOMMU_DEBUGFS 
is not set +# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +CONFIG_OF_IOMMU=y +CONFIG_IOMMU_DMA=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_V2=y +CONFIG_DMAR_TABLE=y +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU_FLOPPY_WA=y +# CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON is not set +CONFIG_IRQ_REMAP=y +CONFIG_HYPERV_IOMMU=y + +# +# Remoteproc drivers +# +CONFIG_REMOTEPROC=y +# CONFIG_REMOTEPROC_CDEV is not set +# end of Remoteproc drivers + +# +# Rpmsg drivers +# +CONFIG_RPMSG=m +CONFIG_RPMSG_CHAR=m +CONFIG_RPMSG_QCOM_GLINK=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m +CONFIG_RPMSG_VIRTIO=m +# end of Rpmsg drivers + +CONFIG_SOUNDWIRE=m + +# +# SoundWire Devices +# +CONFIG_SOUNDWIRE_CADENCE=m +CONFIG_SOUNDWIRE_INTEL=m +CONFIG_SOUNDWIRE_QCOM=m + +# +# SOC (System On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# +# end of Amlogic SoC drivers + +# +# Aspeed SoC drivers +# +# end of Aspeed SoC drivers + +# +# Broadcom SoC drivers +# +# end of Broadcom SoC drivers + +# +# NXP/Freescale QorIQ SoC drivers +# +# end of NXP/Freescale QorIQ SoC drivers + +# +# i.MX SoC drivers +# +# end of i.MX SoC drivers + +# +# Qualcomm SoC drivers +# +# end of Qualcomm SoC drivers + +CONFIG_SOC_TI=y + +# +# Xilinx SoC drivers +# +CONFIG_XILINX_VCU=m +# end of Xilinx SoC drivers +# end of SOC (System On Chip) specific Drivers + +CONFIG_PM_DEVFREQ=y + +# +# DEVFREQ Governors +# +CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=m +CONFIG_DEVFREQ_GOV_PERFORMANCE=m +CONFIG_DEVFREQ_GOV_POWERSAVE=m +CONFIG_DEVFREQ_GOV_USERSPACE=m +CONFIG_DEVFREQ_GOV_PASSIVE=m + +# +# DEVFREQ Drivers +# +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +CONFIG_EXTCON_ADC_JACK=m +CONFIG_EXTCON_ARIZONA=m +CONFIG_EXTCON_AXP288=m +CONFIG_EXTCON_FSA9480=m +CONFIG_EXTCON_GPIO=m +CONFIG_EXTCON_INTEL_INT3496=m +CONFIG_EXTCON_INTEL_CHT_WC=m +CONFIG_EXTCON_MAX14577=m +CONFIG_EXTCON_MAX3355=m +CONFIG_EXTCON_MAX77693=m +CONFIG_EXTCON_MAX77843=m +CONFIG_EXTCON_MAX8997=m +CONFIG_EXTCON_PALMAS=m +CONFIG_EXTCON_PTN5150=m +CONFIG_EXTCON_RT8973A=m +CONFIG_EXTCON_SM5502=m +CONFIG_EXTCON_USB_GPIO=m +CONFIG_EXTCON_USBC_CROS_EC=m +CONFIG_MEMORY=y +CONFIG_IIO=m +CONFIG_IIO_BUFFER=y +CONFIG_IIO_BUFFER_CB=m +CONFIG_IIO_BUFFER_DMA=m +CONFIG_IIO_BUFFER_DMAENGINE=m +CONFIG_IIO_BUFFER_HW_CONSUMER=m +CONFIG_IIO_KFIFO_BUF=m +CONFIG_IIO_TRIGGERED_BUFFER=m +CONFIG_IIO_CONFIGFS=m +CONFIG_IIO_TRIGGER=y +CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 +CONFIG_IIO_SW_DEVICE=m +CONFIG_IIO_SW_TRIGGER=m +CONFIG_IIO_TRIGGERED_EVENT=m + +# +# Accelerometers +# +CONFIG_ADIS16201=m +CONFIG_ADIS16209=m +CONFIG_ADXL372=m +CONFIG_ADXL372_SPI=m +CONFIG_ADXL372_I2C=m +CONFIG_BMA220=m +CONFIG_BMA400=m +CONFIG_BMA400_I2C=m +CONFIG_BMA400_SPI=m +CONFIG_BMC150_ACCEL=m +CONFIG_BMC150_ACCEL_I2C=m +CONFIG_BMC150_ACCEL_SPI=m +CONFIG_DA280=m +CONFIG_DA311=m +CONFIG_DMARD06=m +CONFIG_DMARD09=m +CONFIG_DMARD10=m +CONFIG_HID_SENSOR_ACCEL_3D=m +CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m +CONFIG_IIO_ST_ACCEL_3AXIS=m +CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m +CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m +CONFIG_KXSD9=m +CONFIG_KXSD9_SPI=m +CONFIG_KXSD9_I2C=m +CONFIG_KXCJK1013=m +CONFIG_MC3230=m +CONFIG_MMA7455=m +CONFIG_MMA7455_I2C=m +CONFIG_MMA7455_SPI=m +CONFIG_MMA7660=m +CONFIG_MMA8452=m +CONFIG_MMA9551_CORE=m +CONFIG_MMA9551=m +CONFIG_MMA9553=m +CONFIG_MXC4005=m +CONFIG_MXC6255=m +CONFIG_SCA3000=m +CONFIG_STK8312=m +CONFIG_STK8BA50=m +# end of Accelerometers + +# +# Analog to digital converters +# +CONFIG_AD_SIGMA_DELTA=m +CONFIG_AD7091R5=m +CONFIG_AD7124=m 
+CONFIG_AD7192=m +CONFIG_AD7266=m +CONFIG_AD7291=m +CONFIG_AD7292=m +CONFIG_AD7298=m +CONFIG_AD7476=m +CONFIG_AD7606=m +CONFIG_AD7606_IFACE_PARALLEL=m +CONFIG_AD7606_IFACE_SPI=m +CONFIG_AD7766=m +CONFIG_AD7768_1=m +CONFIG_AD7780=m +CONFIG_AD7791=m +CONFIG_AD7793=m +CONFIG_AD7887=m +CONFIG_AD7923=m +CONFIG_AD7949=m +CONFIG_AD799X=m +CONFIG_AD9467=m +CONFIG_ADI_AXI_ADC=m +CONFIG_AXP20X_ADC=m +CONFIG_AXP288_ADC=m +CONFIG_CC10001_ADC=m +CONFIG_CPCAP_ADC=m +CONFIG_DA9150_GPADC=m +CONFIG_DLN2_ADC=m +CONFIG_ENVELOPE_DETECTOR=m +CONFIG_HI8435=m +CONFIG_HX711=m +CONFIG_INA2XX_ADC=m +CONFIG_LP8788_ADC=m +CONFIG_LTC2471=m +CONFIG_LTC2485=m +CONFIG_LTC2496=m +CONFIG_LTC2497=m +CONFIG_MAX1027=m +CONFIG_MAX11100=m +CONFIG_MAX1118=m +CONFIG_MAX1241=m +CONFIG_MAX1363=m +CONFIG_MAX9611=m +CONFIG_MCP320X=m +CONFIG_MCP3422=m +CONFIG_MCP3911=m +CONFIG_MEN_Z188_ADC=m +# CONFIG_MP2629_ADC is not set +CONFIG_NAU7802=m +CONFIG_PALMAS_GPADC=m +CONFIG_QCOM_VADC_COMMON=m +CONFIG_QCOM_SPMI_IADC=m +CONFIG_QCOM_SPMI_VADC=m +CONFIG_QCOM_SPMI_ADC5=m +CONFIG_RN5T618_ADC=m +CONFIG_SD_ADC_MODULATOR=m +CONFIG_STMPE_ADC=m +CONFIG_TI_ADC081C=m +CONFIG_TI_ADC0832=m +CONFIG_TI_ADC084S021=m +CONFIG_TI_ADC12138=m +CONFIG_TI_ADC108S102=m +CONFIG_TI_ADC128S052=m +CONFIG_TI_ADC161S626=m +CONFIG_TI_ADS1015=m +CONFIG_TI_ADS7950=m +CONFIG_TI_ADS8344=m +CONFIG_TI_ADS8688=m +CONFIG_TI_ADS124S08=m +CONFIG_TI_AM335X_ADC=m +CONFIG_TI_TLC4541=m +CONFIG_TWL4030_MADC=m +CONFIG_TWL6030_GPADC=m +CONFIG_VF610_ADC=m +CONFIG_VIPERBOARD_ADC=m +CONFIG_XILINX_XADC=m +# end of Analog to digital converters + +# +# Analog Front Ends +# +CONFIG_IIO_RESCALE=m +# end of Analog Front Ends + +# +# Amplifiers +# +CONFIG_AD8366=m +CONFIG_HMC425=m +# end of Amplifiers + +# +# Chemical Sensors +# +CONFIG_ATLAS_PH_SENSOR=m +CONFIG_ATLAS_EZO_SENSOR=m +CONFIG_BME680=m +CONFIG_BME680_I2C=m +CONFIG_BME680_SPI=m +CONFIG_CCS811=m +CONFIG_IAQCORE=m +CONFIG_PMS7003=m +# CONFIG_SCD30_CORE is not set +CONFIG_SENSIRION_SGP30=m +CONFIG_SPS30=m +CONFIG_VZ89X=m +# end of Chemical Sensors + +CONFIG_IIO_CROS_EC_SENSORS_CORE=m +CONFIG_IIO_CROS_EC_SENSORS=m +CONFIG_IIO_CROS_EC_SENSORS_LID_ANGLE=m + +# +# Hid Sensor IIO Common +# +CONFIG_HID_SENSOR_IIO_COMMON=m +CONFIG_HID_SENSOR_IIO_TRIGGER=m +# end of Hid Sensor IIO Common + +CONFIG_IIO_MS_SENSORS_I2C=m + +# +# SSP Sensor Common +# +CONFIG_IIO_SSP_SENSORS_COMMONS=m +CONFIG_IIO_SSP_SENSORHUB=m +# end of SSP Sensor Common + +CONFIG_IIO_ST_SENSORS_I2C=m +CONFIG_IIO_ST_SENSORS_SPI=m +CONFIG_IIO_ST_SENSORS_CORE=m + +# +# Digital to analog converters +# +CONFIG_AD5064=m +CONFIG_AD5360=m +CONFIG_AD5380=m +CONFIG_AD5421=m +CONFIG_AD5446=m +CONFIG_AD5449=m +CONFIG_AD5592R_BASE=m +CONFIG_AD5592R=m +CONFIG_AD5593R=m +CONFIG_AD5504=m +CONFIG_AD5624R_SPI=m +CONFIG_AD5686=m +CONFIG_AD5686_SPI=m +CONFIG_AD5696_I2C=m +CONFIG_AD5755=m +CONFIG_AD5758=m +CONFIG_AD5761=m +CONFIG_AD5764=m +CONFIG_AD5770R=m +CONFIG_AD5791=m +CONFIG_AD7303=m +CONFIG_AD8801=m +CONFIG_DPOT_DAC=m +CONFIG_DS4424=m +CONFIG_LTC1660=m +CONFIG_LTC2632=m +CONFIG_M62332=m +CONFIG_MAX517=m +CONFIG_MAX5821=m +CONFIG_MCP4725=m +CONFIG_MCP4922=m +CONFIG_TI_DAC082S085=m +CONFIG_TI_DAC5571=m +CONFIG_TI_DAC7311=m +CONFIG_TI_DAC7612=m +CONFIG_VF610_DAC=m +# end of Digital to analog converters + +# +# IIO dummy driver +# +# CONFIG_IIO_SIMPLE_DUMMY is not set +# end of IIO dummy driver + +# +# Frequency Synthesizers DDS/PLL +# + +# +# Clock Generator/Distribution +# +CONFIG_AD9523=m +# end of Clock Generator/Distribution + +# +# Phase-Locked Loop (PLL) frequency synthesizers +# 
+CONFIG_ADF4350=m +CONFIG_ADF4371=m +# end of Phase-Locked Loop (PLL) frequency synthesizers +# end of Frequency Synthesizers DDS/PLL + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16080=m +CONFIG_ADIS16130=m +CONFIG_ADIS16136=m +CONFIG_ADIS16260=m +CONFIG_ADXRS450=m +CONFIG_BMG160=m +CONFIG_BMG160_I2C=m +CONFIG_BMG160_SPI=m +CONFIG_FXAS21002C=m +CONFIG_FXAS21002C_I2C=m +CONFIG_FXAS21002C_SPI=m +CONFIG_HID_SENSOR_GYRO_3D=m +CONFIG_MPU3050=m +CONFIG_MPU3050_I2C=m +CONFIG_IIO_ST_GYRO_3AXIS=m +CONFIG_IIO_ST_GYRO_I2C_3AXIS=m +CONFIG_IIO_ST_GYRO_SPI_3AXIS=m +CONFIG_ITG3200=m +# end of Digital gyroscope sensors + +# +# Health Sensors +# + +# +# Heart Rate Monitors +# +CONFIG_AFE4403=m +CONFIG_AFE4404=m +CONFIG_MAX30100=m +CONFIG_MAX30102=m +# end of Heart Rate Monitors +# end of Health Sensors + +# +# Humidity sensors +# +CONFIG_AM2315=m +CONFIG_DHT11=m +CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m +CONFIG_HTS221=m +CONFIG_HTS221_I2C=m +CONFIG_HTS221_SPI=m +CONFIG_HTU21=m +CONFIG_SI7005=m +CONFIG_SI7020=m +# end of Humidity sensors + +# +# Inertial measurement units +# +CONFIG_ADIS16400=m +CONFIG_ADIS16460=m +CONFIG_ADIS16475=m +CONFIG_ADIS16480=m +CONFIG_BMI160=m +CONFIG_BMI160_I2C=m +CONFIG_BMI160_SPI=m +CONFIG_FXOS8700=m +CONFIG_FXOS8700_I2C=m +CONFIG_FXOS8700_SPI=m +CONFIG_KMX61=m +# CONFIG_INV_ICM42600_I2C is not set +# CONFIG_INV_ICM42600_SPI is not set +CONFIG_INV_MPU6050_IIO=m +CONFIG_INV_MPU6050_I2C=m +CONFIG_INV_MPU6050_SPI=m +CONFIG_IIO_ST_LSM6DSX=m +CONFIG_IIO_ST_LSM6DSX_I2C=m +CONFIG_IIO_ST_LSM6DSX_SPI=m +CONFIG_IIO_ST_LSM6DSX_I3C=m +# end of Inertial measurement units + +CONFIG_IIO_ADIS_LIB=m +CONFIG_IIO_ADIS_LIB_BUFFER=y + +# +# Light sensors +# +CONFIG_ACPI_ALS=m +CONFIG_ADJD_S311=m +CONFIG_ADUX1020=m +CONFIG_AL3010=m +CONFIG_AL3320A=m +CONFIG_APDS9300=m +CONFIG_APDS9960=m +CONFIG_BH1750=m +CONFIG_BH1780=m +CONFIG_CM32181=m +CONFIG_CM3232=m +CONFIG_CM3323=m +CONFIG_CM3605=m +CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m +CONFIG_GP2AP002=m +CONFIG_GP2AP020A00F=m +CONFIG_IQS621_ALS=m +CONFIG_SENSORS_ISL29018=m +CONFIG_SENSORS_ISL29028=m +CONFIG_ISL29125=m +CONFIG_HID_SENSOR_ALS=m +CONFIG_HID_SENSOR_PROX=m +CONFIG_JSA1212=m +CONFIG_RPR0521=m +CONFIG_SENSORS_LM3533=m +CONFIG_LTR501=m +CONFIG_LV0104CS=m +CONFIG_MAX44000=m +CONFIG_MAX44009=m +CONFIG_NOA1305=m +CONFIG_OPT3001=m +CONFIG_PA12203001=m +CONFIG_SI1133=m +CONFIG_SI1145=m +CONFIG_STK3310=m +CONFIG_ST_UVIS25=m +CONFIG_ST_UVIS25_I2C=m +CONFIG_ST_UVIS25_SPI=m +CONFIG_TCS3414=m +CONFIG_TCS3472=m +CONFIG_SENSORS_TSL2563=m +CONFIG_TSL2583=m +CONFIG_TSL2772=m +CONFIG_TSL4531=m +CONFIG_US5182D=m +CONFIG_VCNL4000=m +CONFIG_VCNL4035=m +CONFIG_VEML6030=m +CONFIG_VEML6070=m +CONFIG_VL6180=m +CONFIG_ZOPT2201=m +# end of Light sensors + +# +# Magnetometer sensors +# +CONFIG_AK8974=m +CONFIG_AK8975=m +CONFIG_AK09911=m +CONFIG_BMC150_MAGN=m +CONFIG_BMC150_MAGN_I2C=m +CONFIG_BMC150_MAGN_SPI=m +CONFIG_MAG3110=m +CONFIG_HID_SENSOR_MAGNETOMETER_3D=m +CONFIG_MMC35240=m +CONFIG_IIO_ST_MAGN_3AXIS=m +CONFIG_IIO_ST_MAGN_I2C_3AXIS=m +CONFIG_IIO_ST_MAGN_SPI_3AXIS=m +CONFIG_SENSORS_HMC5843=m +CONFIG_SENSORS_HMC5843_I2C=m +CONFIG_SENSORS_HMC5843_SPI=m +CONFIG_SENSORS_RM3100=m +CONFIG_SENSORS_RM3100_I2C=m +CONFIG_SENSORS_RM3100_SPI=m +# end of Magnetometer sensors + +# +# Multiplexers +# +CONFIG_IIO_MUX=m +# end of Multiplexers + +# +# Inclinometer sensors +# +CONFIG_HID_SENSOR_INCLINOMETER_3D=m +CONFIG_HID_SENSOR_DEVICE_ROTATION=m +# end of Inclinometer sensors + +# +# Triggers - standalone +# +CONFIG_IIO_HRTIMER_TRIGGER=m 
+CONFIG_IIO_INTERRUPT_TRIGGER=m +CONFIG_IIO_TIGHTLOOP_TRIGGER=m +CONFIG_IIO_SYSFS_TRIGGER=m +# end of Triggers - standalone + +# +# Linear and angular position sensors +# +CONFIG_IQS624_POS=m +# end of Linear and angular position sensors + +# +# Digital potentiometers +# +CONFIG_AD5272=m +CONFIG_DS1803=m +CONFIG_MAX5432=m +CONFIG_MAX5481=m +CONFIG_MAX5487=m +CONFIG_MCP4018=m +CONFIG_MCP4131=m +CONFIG_MCP4531=m +CONFIG_MCP41010=m +CONFIG_TPL0102=m +# end of Digital potentiometers + +# +# Digital potentiostats +# +CONFIG_LMP91000=m +# end of Digital potentiostats + +# +# Pressure sensors +# +CONFIG_ABP060MG=m +CONFIG_BMP280=m +CONFIG_BMP280_I2C=m +CONFIG_BMP280_SPI=m +CONFIG_IIO_CROS_EC_BARO=m +CONFIG_DLHL60D=m +CONFIG_DPS310=m +CONFIG_HID_SENSOR_PRESS=m +CONFIG_HP03=m +CONFIG_ICP10100=m +CONFIG_MPL115=m +CONFIG_MPL115_I2C=m +CONFIG_MPL115_SPI=m +CONFIG_MPL3115=m +CONFIG_MS5611=m +CONFIG_MS5611_I2C=m +CONFIG_MS5611_SPI=m +CONFIG_MS5637=m +CONFIG_IIO_ST_PRESS=m +CONFIG_IIO_ST_PRESS_I2C=m +CONFIG_IIO_ST_PRESS_SPI=m +CONFIG_T5403=m +CONFIG_HP206C=m +CONFIG_ZPA2326=m +CONFIG_ZPA2326_I2C=m +CONFIG_ZPA2326_SPI=m +# end of Pressure sensors + +# +# Lightning sensors +# +CONFIG_AS3935=m +# end of Lightning sensors + +# +# Proximity and distance sensors +# +CONFIG_ISL29501=m +CONFIG_LIDAR_LITE_V2=m +CONFIG_MB1232=m +CONFIG_PING=m +CONFIG_RFD77402=m +CONFIG_SRF04=m +CONFIG_SX9310=m +CONFIG_SX9500=m +CONFIG_SRF08=m +CONFIG_VCNL3020=m +CONFIG_VL53L0X_I2C=m +# end of Proximity and distance sensors + +# +# Resolver to digital converters +# +CONFIG_AD2S90=m +CONFIG_AD2S1200=m +# end of Resolver to digital converters + +# +# Temperature sensors +# +CONFIG_IQS620AT_TEMP=m +CONFIG_LTC2983=m +CONFIG_MAXIM_THERMOCOUPLE=m +CONFIG_HID_SENSOR_TEMP=m +CONFIG_MLX90614=m +CONFIG_MLX90632=m +CONFIG_TMP006=m +CONFIG_TMP007=m +CONFIG_TSYS01=m +CONFIG_TSYS02D=m +CONFIG_MAX31856=m +# end of Temperature sensors + +CONFIG_NTB=m +CONFIG_NTB_MSI=y +CONFIG_NTB_AMD=m +CONFIG_NTB_IDT=m +CONFIG_NTB_INTEL=m +CONFIG_NTB_SWITCHTEC=m +# CONFIG_NTB_PINGPONG is not set +# CONFIG_NTB_TOOL is not set +# CONFIG_NTB_PERF is not set +# CONFIG_NTB_MSI_TEST is not set +CONFIG_NTB_TRANSPORT=m +CONFIG_VME_BUS=y + +# +# VME Bridge Drivers +# +CONFIG_VME_CA91CX42=m +CONFIG_VME_TSI148=m +# CONFIG_VME_FAKE is not set + +# +# VME Board Drivers +# +CONFIG_VMIVME_7805=m + +# +# VME Device Drivers +# +CONFIG_VME_USER=m +CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +# CONFIG_PWM_DEBUG is not set +CONFIG_PWM_ATMEL_HLCDC_PWM=m +CONFIG_PWM_CRC=y +CONFIG_PWM_CROS_EC=m +CONFIG_PWM_FSL_FTM=m +CONFIG_PWM_IQS620A=m +CONFIG_PWM_LP3943=m +CONFIG_PWM_LPSS=m +CONFIG_PWM_LPSS_PCI=m +CONFIG_PWM_LPSS_PLATFORM=m +CONFIG_PWM_PCA9685=m +CONFIG_PWM_STMPE=y +CONFIG_PWM_TWL=m +CONFIG_PWM_TWL_LED=m + +# +# IRQ chip support +# +CONFIG_IRQCHIP=y +CONFIG_AL_FIC=y +CONFIG_MADERA_IRQ=m +# end of IRQ chip support + +CONFIG_IPACK_BUS=m +CONFIG_BOARD_TPCI200=m +CONFIG_SERIAL_IPOCTAL=m +CONFIG_RESET_CONTROLLER=y +CONFIG_RESET_BRCMSTB_RESCAL=y +CONFIG_RESET_INTEL_GW=y +CONFIG_RESET_TI_SYSCON=m + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_GENERIC_PHY_MIPI_DPHY=y +CONFIG_BCM_KONA_USB2_PHY=m +CONFIG_PHY_CADENCE_TORRENT=m +CONFIG_PHY_CADENCE_DPHY=m +CONFIG_PHY_CADENCE_SIERRA=m +CONFIG_PHY_CADENCE_SALVO=m +CONFIG_PHY_FSL_IMX8MQ_USB=m +CONFIG_PHY_MIXEL_MIPI_DPHY=m +CONFIG_PHY_PXA_28NM_HSIC=m +CONFIG_PHY_PXA_28NM_USB2=m +CONFIG_PHY_CPCAP_USB=m +CONFIG_PHY_MAPPHONE_MDM6600=m +CONFIG_PHY_OCELOT_SERDES=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_PHY_QCOM_USB_HSIC=m +CONFIG_PHY_SAMSUNG_USB2=m 
+CONFIG_PHY_TUSB1210=m +CONFIG_PHY_INTEL_COMBO=y +CONFIG_PHY_INTEL_EMMC=m +# end of PHY Subsystem + +CONFIG_POWERCAP=y +CONFIG_INTEL_RAPL_CORE=m +CONFIG_INTEL_RAPL=m +CONFIG_IDLE_INJECT=y +CONFIG_MCB=m +CONFIG_MCB_PCI=m +CONFIG_MCB_LPC=m + +# +# Performance monitor support +# +# end of Performance monitor support + +CONFIG_RAS=y +CONFIG_RAS_CEC=y +# CONFIG_RAS_CEC_DEBUG is not set +CONFIG_USB4=m + +# +# Android +# +# CONFIG_ANDROID is not set +# end of Android + +CONFIG_LIBNVDIMM=y +CONFIG_BLK_DEV_PMEM=m +CONFIG_ND_BLK=m +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=m +CONFIG_BTT=y +CONFIG_ND_PFN=m +CONFIG_NVDIMM_PFN=y +CONFIG_NVDIMM_DAX=y +CONFIG_OF_PMEM=m +CONFIG_DAX_DRIVER=y +CONFIG_DAX=y +CONFIG_DEV_DAX=m +CONFIG_DEV_DAX_PMEM=m +CONFIG_DEV_DAX_HMEM=m +CONFIG_DEV_DAX_KMEM=m +CONFIG_DEV_DAX_PMEM_COMPAT=m +CONFIG_NVMEM=y +CONFIG_NVMEM_SYSFS=y +CONFIG_NVMEM_SPMI_SDAM=m +CONFIG_RAVE_SP_EEPROM=m + +# +# HW tracing support +# +CONFIG_STM=m +CONFIG_STM_PROTO_BASIC=m +CONFIG_STM_PROTO_SYS_T=m +# CONFIG_STM_DUMMY is not set +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +CONFIG_STM_SOURCE_FTRACE=m +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_ACPI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m +# CONFIG_INTEL_TH_DEBUG is not set +# end of HW tracing support + +CONFIG_FPGA=m +CONFIG_ALTERA_PR_IP_CORE=m +CONFIG_ALTERA_PR_IP_CORE_PLAT=m +CONFIG_FPGA_MGR_ALTERA_PS_SPI=m +CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_FPGA_MGR_ICE40_SPI=m +CONFIG_FPGA_MGR_MACHXO2_SPI=m +CONFIG_FPGA_BRIDGE=m +CONFIG_ALTERA_FREEZE_BRIDGE=m +CONFIG_XILINX_PR_DECOUPLER=m +CONFIG_FPGA_REGION=m +CONFIG_OF_FPGA_REGION=m +CONFIG_FPGA_DFL=m +CONFIG_FPGA_DFL_FME=m +CONFIG_FPGA_DFL_FME_MGR=m +CONFIG_FPGA_DFL_FME_BRIDGE=m +CONFIG_FPGA_DFL_FME_REGION=m +CONFIG_FPGA_DFL_AFU=m +CONFIG_FPGA_DFL_PCI=m +CONFIG_FSI=m +CONFIG_FSI_NEW_DEV_NODE=y +CONFIG_FSI_MASTER_GPIO=m +CONFIG_FSI_MASTER_HUB=m +CONFIG_FSI_MASTER_ASPEED=m +CONFIG_FSI_SCOM=m +CONFIG_FSI_SBEFIFO=m +CONFIG_FSI_OCC=m +CONFIG_TEE=m + +# +# TEE drivers +# +CONFIG_AMDTEE=m +# end of TEE drivers + +CONFIG_MULTIPLEXER=m + +# +# Multiplexer drivers +# +CONFIG_MUX_ADG792A=m +CONFIG_MUX_ADGS1408=m +CONFIG_MUX_GPIO=m +CONFIG_MUX_MMIO=m +# end of Multiplexer drivers + +CONFIG_PM_OPP=y +CONFIG_UNISYS_VISORBUS=m +CONFIG_SIOX=m +CONFIG_SIOX_BUS_GPIO=m +CONFIG_SLIMBUS=m +CONFIG_SLIM_QCOM_CTRL=m +# CONFIG_INTERCONNECT is not set +CONFIG_COUNTER=m +CONFIG_FTM_QUADDEC=m +# CONFIG_MICROCHIP_TCB_CAPTURE is not set +CONFIG_MOST=m +# CONFIG_MOST_USB_HDM is not set +# end of Device Drivers + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_VALIDATE_FS_PARSER=y +CONFIG_FS_IOMAP=y +# CONFIG_EXT2_FS is not set +# CONFIG_EXT3_FS is not set +CONFIG_EXT4_FS=m +CONFIG_EXT4_USE_FOR_EXT2=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_XFS_ONLINE_SCRUB=y +CONFIG_XFS_ONLINE_REPAIR=y +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_OCFS2_FS_O2CB=m 
+CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m +CONFIG_OCFS2_FS_STATS=y +CONFIG_OCFS2_DEBUG_MASKLOG=y +# CONFIG_OCFS2_DEBUG_FS is not set +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +# CONFIG_BTRFS_FS_REF_VERIFY is not set +CONFIG_NILFS2_FS=m +CONFIG_F2FS_FS=m +CONFIG_F2FS_STAT_FS=y +CONFIG_F2FS_FS_XATTR=y +CONFIG_F2FS_FS_POSIX_ACL=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y +# CONFIG_F2FS_IO_TRACE is not set +# CONFIG_F2FS_FAULT_INJECTION is not set +CONFIG_F2FS_FS_COMPRESSION=y +CONFIG_F2FS_FS_LZO=y +CONFIG_F2FS_FS_LZ4=y +CONFIG_F2FS_FS_ZSTD=y +CONFIG_F2FS_FS_LZORLE=y +CONFIG_ZONEFS_FS=m +CONFIG_FS_DAX=y +CONFIG_FS_DAX_PMD=y +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FILE_LOCKING=y +# CONFIG_MANDATORY_FILE_LOCKING is not set +CONFIG_FS_ENCRYPTION=y +CONFIG_FS_ENCRYPTION_ALGS=m +# CONFIG_FS_ENCRYPTION_INLINE_CRYPT is not set +CONFIG_FS_VERITY=y +# CONFIG_FS_VERITY_DEBUG is not set +CONFIG_FS_VERITY_BUILTIN_SIGNATURES=y +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QUOTA_DEBUG is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_QUOTACTL_COMPAT=y +CONFIG_AUTOFS4_FS=y +CONFIG_AUTOFS_FS=y +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_VIRTIO_FS=m +CONFIG_OVERLAY_FS=m +CONFIG_OVERLAY_FS_REDIRECT_DIR=y +# CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set +CONFIG_OVERLAY_FS_INDEX=y +CONFIG_OVERLAY_FS_XINO_AUTO=y +CONFIG_OVERLAY_FS_METACOPY=y + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +CONFIG_FSCACHE_HISTOGRAM=y +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set +# end of Caches + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m +# end of CD-ROM/DVD Filesystems + +# +# DOS/FAT/EXFAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1" +CONFIG_FAT_DEFAULT_UTF8=y +CONFIG_EXFAT_FS=m +CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y +# end of DOS/FAT/EXFAT/NT Filesystems + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_PROC_VMCORE=y +CONFIG_PROC_VMCORE_DEVICE_DUMP=y +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +CONFIG_PROC_CHILDREN=y +CONFIG_PROC_PID_ARCH_STATUS=y +CONFIG_PROC_CPU_RESCTRL=y +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +# CONFIG_TMPFS_INODE64 is not set +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_MEMFD_CREATE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=y +CONFIG_EFIVAR_FS=y +# end of Pseudo filesystems + +CONFIG_MISC_FILESYSTEMS=y +CONFIG_ORANGEFS_FS=m +# CONFIG_ADFS_FS is not set +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=m +# CONFIG_ECRYPT_FS_MESSAGING is not set +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +# CONFIG_BFS_FS is not set +# CONFIG_EFS_FS is not set +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +# CONFIG_JFFS2_FS_WBUF_VERIFY is not set +CONFIG_JFFS2_SUMMARY=y 
+CONFIG_JFFS2_FS_XATTR=y +CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +# CONFIG_JFFS2_COMPRESSION_OPTIONS is not set +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_RTIME=y +CONFIG_UBIFS_FS=m +# CONFIG_UBIFS_FS_ADVANCED_COMPR is not set +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +CONFIG_UBIFS_FS_ZSTD=y +CONFIG_UBIFS_ATIME_SUPPORT=y +CONFIG_UBIFS_FS_XATTR=y +CONFIG_UBIFS_FS_SECURITY=y +CONFIG_UBIFS_FS_AUTHENTICATION=y +CONFIG_CRAMFS=m +CONFIG_CRAMFS_BLOCKDEV=y +CONFIG_CRAMFS_MTD=y +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +CONFIG_SQUASHFS_DECOMP_MULTI=y +# CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU is not set +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +# CONFIG_SQUASHFS_4K_DEVBLK_SIZE is not set +# CONFIG_SQUASHFS_EMBEDDED is not set +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +# CONFIG_VXFS_FS is not set +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +# CONFIG_HPFS_FS is not set +# CONFIG_QNX4FS_FS is not set +# CONFIG_QNX6FS_FS is not set +CONFIG_ROMFS_FS=m +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_PSTORE=y +CONFIG_PSTORE_DEFLATE_COMPRESS=m +CONFIG_PSTORE_LZO_COMPRESS=m +CONFIG_PSTORE_LZ4_COMPRESS=m +CONFIG_PSTORE_LZ4HC_COMPRESS=m +# CONFIG_PSTORE_842_COMPRESS is not set +CONFIG_PSTORE_ZSTD_COMPRESS=y +CONFIG_PSTORE_COMPRESS=y +# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT=y +CONFIG_PSTORE_COMPRESS_DEFAULT="zstd" +# CONFIG_PSTORE_CONSOLE is not set +# CONFIG_PSTORE_PMSG is not set +# CONFIG_PSTORE_FTRACE is not set +CONFIG_PSTORE_RAM=y +CONFIG_PSTORE_ZONE=m +CONFIG_PSTORE_BLK=m +CONFIG_PSTORE_BLK_BLKDEV="" +CONFIG_PSTORE_BLK_KMSG_SIZE=64 +CONFIG_PSTORE_BLK_MAX_REASON=2 +# CONFIG_SYSV_FS is not set +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EROFS_FS=m +# CONFIG_EROFS_FS_DEBUG is not set +CONFIG_EROFS_FS_XATTR=y +CONFIG_EROFS_FS_POSIX_ACL=y +CONFIG_EROFS_FS_SECURITY=y +CONFIG_EROFS_FS_ZIP=y +CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=2 +CONFIG_VBOXSF_FS=m +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V2=m +CONFIG_NFS_V3=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_NFS_V4_SECURITY_LABEL=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +CONFIG_NFS_DEBUG=y +# CONFIG_NFS_DISABLE_UDP_SUPPORT is not set +CONFIG_NFSD=m +CONFIG_NFSD_V2_ACL=y +CONFIG_NFSD_V3=y +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_PNFS=y +CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_NFSD_SCSILAYOUT=y +# CONFIG_NFSD_FLEXFILELAYOUT is not set +CONFIG_NFSD_V4_SECURITY_LABEL=y +CONFIG_GRACE_PERIOD=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +CONFIG_SUNRPC_DISABLE_INSECURE_ENCTYPES=y +CONFIG_SUNRPC_DEBUG=y +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y 
+CONFIG_CEPH_FS_SECURITY_LABEL=y +CONFIG_CIFS=m +# CONFIG_CIFS_STATS2 is not set +CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y +# CONFIG_CIFS_WEAK_PW_HASH is not set +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_CIFS_DEBUG=y +# CONFIG_CIFS_DEBUG2 is not set +# CONFIG_CIFS_DEBUG_DUMP_KEYS is not set +CONFIG_CIFS_DFS_UPCALL=y +# CONFIG_CIFS_SMB_DIRECT is not set +CONFIG_CIFS_FSCACHE=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +# CONFIG_AFS_DEBUG is not set +CONFIG_AFS_FSCACHE=y +# CONFIG_AFS_DEBUG_CURSOR is not set +CONFIG_9P_FS=m +CONFIG_9P_FSCACHE=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_MAC_ROMAN=m +CONFIG_NLS_MAC_CELTIC=m +CONFIG_NLS_MAC_CENTEURO=m +CONFIG_NLS_MAC_CROATIAN=m +CONFIG_NLS_MAC_CYRILLIC=m +CONFIG_NLS_MAC_GAELIC=m +CONFIG_NLS_MAC_GREEK=m +CONFIG_NLS_MAC_ICELAND=m +CONFIG_NLS_MAC_INUIT=m +CONFIG_NLS_MAC_ROMANIAN=m +CONFIG_NLS_MAC_TURKISH=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set +CONFIG_UNICODE=y +# CONFIG_UNICODE_NORMALIZATION_SELFTEST is not set +CONFIG_IO_WQ=y +# end of File systems + +# +# Security options +# +CONFIG_KEYS=y +CONFIG_KEYS_REQUEST_CACHE=y +CONFIG_PERSISTENT_KEYRINGS=y +CONFIG_TRUSTED_KEYS=m +CONFIG_ENCRYPTED_KEYS=m +CONFIG_KEY_DH_OPERATIONS=y +# CONFIG_SECURITY_DMESG_RESTRICT is not set +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_PAGE_TABLE_ISOLATION=y +CONFIG_SECURITY_INFINIBAND=y +CONFIG_SECURITY_NETWORK_XFRM=y +CONFIG_SECURITY_PATH=y +# CONFIG_INTEL_TXT is not set +CONFIG_LSM_MMAP_MIN_ADDR=65536 +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_HARDENED_USERCOPY_FALLBACK=y +# CONFIG_HARDENED_USERCOPY_PAGESPAN is not set +CONFIG_FORTIFY_SOURCE=y +# CONFIG_STATIC_USERMODEHELPER is not set +CONFIG_SECURITY_SELINUX=y +CONFIG_SECURITY_SELINUX_BOOTPARAM=y +# CONFIG_SECURITY_SELINUX_DISABLE is not set +CONFIG_SECURITY_SELINUX_DEVELOP=y +CONFIG_SECURITY_SELINUX_AVC_STATS=y +CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=0 +CONFIG_SECURITY_SELINUX_SIDTAB_HASH_BITS=9 +CONFIG_SECURITY_SELINUX_SID2STR_CACHE_SIZE=256 +CONFIG_SECURITY_SMACK=y +CONFIG_SECURITY_SMACK_BRINGUP=y +CONFIG_SECURITY_SMACK_NETFILTER=y +CONFIG_SECURITY_SMACK_APPEND_SIGNALS=y +CONFIG_SECURITY_TOMOYO=y +CONFIG_SECURITY_TOMOYO_MAX_ACCEPT_ENTRY=2048 +CONFIG_SECURITY_TOMOYO_MAX_AUDIT_LOG=1024 +# CONFIG_SECURITY_TOMOYO_OMIT_USERSPACE_LOADER is not set +CONFIG_SECURITY_TOMOYO_POLICY_LOADER="/sbin/tomoyo-init" +CONFIG_SECURITY_TOMOYO_ACTIVATION_TRIGGER="/sbin/init" +# CONFIG_SECURITY_TOMOYO_INSECURE_BUILTIN_SETTING is not 
set +CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_APPARMOR_HASH=y +CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y +# CONFIG_SECURITY_APPARMOR_DEBUG is not set +# CONFIG_SECURITY_LOADPIN is not set +CONFIG_SECURITY_YAMA=y +CONFIG_SECURITY_SAFESETID=y +CONFIG_SECURITY_LOCKDOWN_LSM=y +# CONFIG_SECURITY_LOCKDOWN_LSM_EARLY is not set +CONFIG_LOCK_DOWN_KERNEL_FORCE_NONE=y +# CONFIG_LOCK_DOWN_KERNEL_FORCE_INTEGRITY is not set +# CONFIG_LOCK_DOWN_KERNEL_FORCE_CONFIDENTIALITY is not set +# CONFIG_INTEGRITY is not set +# CONFIG_IMA_SECURE_AND_OR_TRUSTED_BOOT is not set +# CONFIG_DEFAULT_SECURITY_SELINUX is not set +# CONFIG_DEFAULT_SECURITY_SMACK is not set +# CONFIG_DEFAULT_SECURITY_TOMOYO is not set +# CONFIG_DEFAULT_SECURITY_APPARMOR is not set +CONFIG_DEFAULT_SECURITY_DAC=y +CONFIG_LSM="lockdown,yama" + +# +# Kernel hardening options +# +CONFIG_GCC_PLUGIN_STRUCTLEAK=y + +# +# Memory initialization +# +# CONFIG_INIT_STACK_NONE is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_USER is not set +# CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF is not set +CONFIG_GCC_PLUGIN_STRUCTLEAK_BYREF_ALL=y +# CONFIG_GCC_PLUGIN_STRUCTLEAK_VERBOSE is not set +# CONFIG_GCC_PLUGIN_STACKLEAK is not set +CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y +# CONFIG_INIT_ON_FREE_DEFAULT_ON is not set +# end of Memory initialization +# end of Kernel hardening options +# end of Security options + +CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=y +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_SKCIPHER=y +CONFIG_CRYPTO_SKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=y +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=y +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=y +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y +CONFIG_CRYPTO_GF128MUL=y +CONFIG_CRYPTO_NULL=y +CONFIG_CRYPTO_NULL2=y +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_SIMD=m +CONFIG_CRYPTO_GLUE_HELPER_X86=m +CONFIG_CRYPTO_ENGINE=m + +# +# Public-key cryptography +# +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=y +CONFIG_CRYPTO_ECC=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECRDSA=m +CONFIG_CRYPTO_CURVE25519=m +CONFIG_CRYPTO_CURVE25519_X86=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=y +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m +CONFIG_CRYPTO_SEQIV=y +CONFIG_CRYPTO_ECHAINIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTR=y +CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_ECB=m +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_OFB=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m +CONFIG_CRYPTO_KEYWRAP=m +CONFIG_CRYPTO_NHPOLY1305=m +CONFIG_CRYPTO_NHPOLY1305_SSE2=m +CONFIG_CRYPTO_NHPOLY1305_AVX2=m +CONFIG_CRYPTO_ADIANTUM=m +CONFIG_CRYPTO_ESSIV=m + +# +# Hash modes +# +CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_HMAC=y +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_CRC32_PCLMUL=m +CONFIG_CRYPTO_XXHASH=m +CONFIG_CRYPTO_BLAKE2B=m +CONFIG_CRYPTO_BLAKE2S=m +CONFIG_CRYPTO_BLAKE2S_X86=m +CONFIG_CRYPTO_CRCT10DIF=y +CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m +CONFIG_CRYPTO_GHASH=y +CONFIG_CRYPTO_POLY1305=m 
+CONFIG_CRYPTO_POLY1305_X86_64=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA1_SSSE3=m +CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA512_SSSE3=m +CONFIG_CRYPTO_SHA256=y +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m +CONFIG_CRYPTO_STREEBOG=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES_NI_INTEL=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_BLOWFISH_COMMON=m +CONFIG_CRYPTO_BLOWFISH_X86_64=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAMELLIA_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m +CONFIG_CRYPTO_CAST_COMMON=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST5_AVX_X86_64=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_CAST6_AVX_X86_64=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_DES3_EDE_X86_64=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_CHACHA20=m +CONFIG_CRYPTO_CHACHA20_X86_64=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m +CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +CONFIG_CRYPTO_TWOFISH_X86_64=m +CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m +CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=m +CONFIG_CRYPTO_LZO=m +CONFIG_CRYPTO_842=m +CONFIG_CRYPTO_LZ4=y +CONFIG_CRYPTO_LZ4HC=y +CONFIG_CRYPTO_ZSTD=y + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_DRBG_MENU=y +CONFIG_CRYPTO_DRBG_HMAC=y +CONFIG_CRYPTO_DRBG_HASH=y +CONFIG_CRYPTO_DRBG_CTR=y +CONFIG_CRYPTO_DRBG=y +CONFIG_CRYPTO_JITTERENTROPY=y +CONFIG_CRYPTO_USER_API=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +# CONFIG_CRYPTO_STATS is not set +CONFIG_CRYPTO_HASH_INFO=y + +# +# Crypto library routines +# +CONFIG_CRYPTO_LIB_AES=y +CONFIG_CRYPTO_LIB_ARC4=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=m +CONFIG_CRYPTO_LIB_BLAKE2S_GENERIC=m +CONFIG_CRYPTO_LIB_BLAKE2S=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_CHACHA=m +CONFIG_CRYPTO_LIB_CHACHA_GENERIC=m +CONFIG_CRYPTO_LIB_CHACHA=m +CONFIG_CRYPTO_ARCH_HAVE_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_CURVE25519_GENERIC=m +CONFIG_CRYPTO_LIB_CURVE25519=m +CONFIG_CRYPTO_LIB_DES=m +CONFIG_CRYPTO_LIB_POLY1305_RSIZE=11 +CONFIG_CRYPTO_ARCH_HAVE_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_POLY1305_GENERIC=m +CONFIG_CRYPTO_LIB_POLY1305=m +CONFIG_CRYPTO_LIB_CHACHA20POLY1305=m +CONFIG_CRYPTO_LIB_SHA256=y +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +CONFIG_CRYPTO_DEV_ATMEL_I2C=m +CONFIG_CRYPTO_DEV_ATMEL_ECC=m +CONFIG_CRYPTO_DEV_ATMEL_SHA204A=m +CONFIG_CRYPTO_DEV_CCP=y +CONFIG_CRYPTO_DEV_CCP_DD=m +CONFIG_CRYPTO_DEV_SP_CCP=y +CONFIG_CRYPTO_DEV_CCP_CRYPTO=m +CONFIG_CRYPTO_DEV_SP_PSP=y +CONFIG_CRYPTO_DEV_CCP_DEBUGFS=y +CONFIG_CRYPTO_DEV_QAT=m +CONFIG_CRYPTO_DEV_QAT_DH895xCC=m +CONFIG_CRYPTO_DEV_QAT_C3XXX=m +CONFIG_CRYPTO_DEV_QAT_C62X=m +CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m +CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m +CONFIG_CRYPTO_DEV_QAT_C62XVF=m +CONFIG_CRYPTO_DEV_NITROX=m +CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m +CONFIG_CRYPTO_DEV_CHELSIO=m +CONFIG_CHELSIO_IPSEC_INLINE=y 
+CONFIG_CHELSIO_TLS_DEVICE=y +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_CRYPTO_DEV_SAFEXCEL=m +CONFIG_CRYPTO_DEV_CCREE=m +CONFIG_CRYPTO_DEV_AMLOGIC_GXL=m +CONFIG_CRYPTO_DEV_AMLOGIC_GXL_DEBUG=y +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_ASYMMETRIC_TPM_KEY_SUBTYPE=m +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS8_PRIVATE_KEY_PARSER=m +CONFIG_TPM_KEY_PARSER=m +CONFIG_PKCS7_MESSAGE_PARSER=y +# CONFIG_PKCS7_TEST_KEY is not set +CONFIG_SIGNED_PE_FILE_VERIFICATION=y + +# +# Certificates for signature checking +# +CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" +# end of Certificates for signature checking + +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_RAID6_PQ_BENCHMARK=y +CONFIG_LINEAR_RANGES=y +CONFIG_PACKING=y +CONFIG_BITREVERSE=y +CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_CORDIC=m +# CONFIG_PRIME_NUMBERS is not set +CONFIG_RATIONAL=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_GENERIC_IOMAP=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_ARCH_USE_SYM_ANNOTATIONS=y +CONFIG_CRC_CCITT=y +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=y +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set +CONFIG_CRC64=m +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_CRC8=m +CONFIG_XXHASH=y +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_842_COMPRESS=m +CONFIG_842_DECOMPRESS=m +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=y +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=y +CONFIG_LZ4HC_COMPRESS=y +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y +CONFIG_XZ_DEC=y +CONFIG_XZ_DEC_X86=y +CONFIG_XZ_DEC_POWERPC=y +CONFIG_XZ_DEC_IA64=y +CONFIG_XZ_DEC_ARM=y +CONFIG_XZ_DEC_ARMTHUMB=y +CONFIG_XZ_DEC_SPARC=y +CONFIG_XZ_DEC_BCJ=y +# CONFIG_XZ_DEC_TEST is not set +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZO=y +CONFIG_DECOMPRESS_LZ4=y +CONFIG_DECOMPRESS_ZSTD=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=y +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_BCH=m +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_BTREE=y +CONFIG_INTERVAL_TREE=y +CONFIG_XARRAY_MULTI=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +CONFIG_DMA_OPS=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_DMA_DECLARE_COHERENT=y +CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y +CONFIG_DMA_VIRT_OPS=y +CONFIG_SWIOTLB=y +CONFIG_DMA_COHERENT_POOL=y +# CONFIG_DMA_API_DEBUG is not set +CONFIG_SGL_ALLOC=y +CONFIG_IOMMU_HELPER=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_LRU_CACHE=m +CONFIG_CLZ_TAB=y +CONFIG_IRQ_POLL=y +CONFIG_MPILIB=y +CONFIG_DIMLIB=y +CONFIG_LIBFDT=y +CONFIG_OID_REGISTRY=y +CONFIG_UCS2_STRING=y +CONFIG_HAVE_GENERIC_VDSO=y +CONFIG_GENERIC_GETTIMEOFDAY=y +CONFIG_GENERIC_VDSO_TIME_NS=y +CONFIG_FONT_SUPPORT=y +CONFIG_FONTS=y +# 
CONFIG_FONT_8x8 is not set +CONFIG_FONT_8x16=y +# CONFIG_FONT_6x11 is not set +# CONFIG_FONT_7x14 is not set +# CONFIG_FONT_PEARL_8x8 is not set +# CONFIG_FONT_ACORN_8x8 is not set +# CONFIG_FONT_MINI_4x6 is not set +# CONFIG_FONT_6x10 is not set +# CONFIG_FONT_10x18 is not set +# CONFIG_FONT_SUN8x16 is not set +# CONFIG_FONT_SUN12x22 is not set +CONFIG_FONT_TER16x32=y +CONFIG_SG_POOL=y +CONFIG_ARCH_HAS_PMEM_API=y +CONFIG_MEMREGION=y +CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y +CONFIG_ARCH_HAS_UACCESS_MCSAFE=y +CONFIG_ARCH_STACKWALK=y +CONFIG_SBITMAP=y +CONFIG_PARMAN=m +CONFIG_OBJAGG=m +# CONFIG_STRING_SELFTEST is not set +# end of Library routines + +CONFIG_PLDMFW=y + +# +# Kernel hacking +# + +# +# printk and dmesg options +# +CONFIG_PRINTK_TIME=y +# CONFIG_PRINTK_CALLER is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=4 +CONFIG_CONSOLE_LOGLEVEL_QUIET=1 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=4 +# CONFIG_BOOT_PRINTK_DELAY is not set +CONFIG_DYNAMIC_DEBUG=y +CONFIG_DYNAMIC_DEBUG_CORE=y +CONFIG_SYMBOLIC_ERRNAME=y +CONFIG_DEBUG_BUGVERBOSE=y +# end of printk and dmesg options + +# +# Compile-time checks and compiler options +# +CONFIG_DEBUG_INFO=y +# CONFIG_DEBUG_INFO_REDUCED is not set +# CONFIG_DEBUG_INFO_COMPRESSED is not set +# CONFIG_DEBUG_INFO_SPLIT is not set +CONFIG_DEBUG_INFO_DWARF4=y +CONFIG_DEBUG_INFO_BTF=y +# CONFIG_GDB_SCRIPTS is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=2048 +CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +# CONFIG_HEADERS_INSTALL is not set +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +# CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_32B is not set +CONFIG_STACK_VALIDATION=y +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +# end of Compile-time checks and compiler options + +# +# Generic Kernel Debugging Instruments +# +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x0 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_MAGIC_SYSRQ_SERIAL_SEQUENCE="" +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_FS_ALLOW_ALL=y +# CONFIG_DEBUG_FS_DISALLOW_MOUNT is not set +# CONFIG_DEBUG_FS_ALLOW_NONE is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# CONFIG_UBSAN is not set +# end of Generic Kernel Debugging Instruments + +CONFIG_DEBUG_KERNEL=y +CONFIG_DEBUG_MISC=y + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_OWNER is not set +CONFIG_PAGE_POISONING=y +CONFIG_PAGE_POISONING_NO_SANITY=y +CONFIG_PAGE_POISONING_ZERO=y +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +CONFIG_ARCH_HAS_DEBUG_WX=y +CONFIG_DEBUG_WX=y +CONFIG_GENERIC_PTDUMP=y +CONFIG_PTDUMP_CORE=y +# CONFIG_PTDUMP_DEBUGFS is not set +# CONFIG_DEBUG_OBJECTS is not set +# CONFIG_SLUB_DEBUG_ON is not set +# CONFIG_SLUB_STATS is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +CONFIG_SCHED_STACK_END_CHECK=y +CONFIG_ARCH_HAS_DEBUG_VM_PGTABLE=y +# CONFIG_DEBUG_VM is not set +# CONFIG_DEBUG_VM_PGTABLE is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +CONFIG_DEBUG_MEMORY_INIT=y +# CONFIG_DEBUG_PER_CPU_MAPS is not set +CONFIG_HAVE_ARCH_KASAN=y +CONFIG_HAVE_ARCH_KASAN_VMALLOC=y +CONFIG_CC_HAS_KASAN_GENERIC=y +CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y +# CONFIG_KASAN is not set +# end of Memory Debugging + +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Oops, Lockups and Hangs +# +# CONFIG_PANIC_ON_OOPS is not set +CONFIG_PANIC_ON_OOPS_VALUE=0 +CONFIG_PANIC_TIMEOUT=0 
+CONFIG_LOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y +CONFIG_HARDLOCKUP_DETECTOR=y +# CONFIG_BOOTPARAM_HARDLOCKUP_PANIC is not set +CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +CONFIG_DEFAULT_HUNG_TASK_TIMEOUT=120 +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 +# CONFIG_WQ_WATCHDOG is not set +# CONFIG_TEST_LOCKUP is not set +# end of Debug Oops, Lockups and Hangs + +# +# Scheduler Debugging +# +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_INFO=y +CONFIG_SCHEDSTATS=y +# end of Scheduler Debugging + +# CONFIG_DEBUG_TIMEKEEPING is not set +CONFIG_DEBUG_PREEMPT=y + +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +CONFIG_LOCK_DEBUGGING_SUPPORT=y +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +# end of Lock Debugging (spinlocks, mutexes, etc...) + +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set + +# +# Debug kernel data structures +# +# CONFIG_DEBUG_LIST is not set +# CONFIG_DEBUG_PLIST is not set +# CONFIG_DEBUG_SG is not set +# CONFIG_DEBUG_NOTIFIERS is not set +# CONFIG_BUG_ON_DATA_CORRUPTION is not set +# end of Debug kernel data structures + +# CONFIG_DEBUG_CREDENTIALS is not set + +# +# RCU Debugging +# +# CONFIG_RCU_PERF_TEST is not set +# CONFIG_RCU_TORTURE_TEST is not set +# CONFIG_RCU_REF_SCALE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# end of RCU Debugging + +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set +CONFIG_LATENCYTOP=y +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_FENTRY=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACER_MAX_TRACE=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y +CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_RING_BUFFER_ALLOW_SWAP=y +CONFIG_TRACING=y +CONFIG_GENERIC_TRACER=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +# CONFIG_BOOTTIME_TRACING is not set +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_GRAPH_TRACER=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_STACK_TRACER=y +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +CONFIG_SCHED_TRACER=y +CONFIG_HWLAT_TRACER=y +CONFIG_MMIOTRACE=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_TRACER_SNAPSHOT=y +# CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KPROBE_EVENTS=y +# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set +CONFIG_UPROBE_EVENTS=y +CONFIG_BPF_EVENTS=y +CONFIG_DYNAMIC_EVENTS=y 
+CONFIG_PROBE_EVENTS=y +CONFIG_BPF_KPROBE_OVERRIDE=y +CONFIG_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_MAP=y +CONFIG_SYNTH_EVENTS=y +CONFIG_HIST_TRIGGERS=y +# CONFIG_TRACE_EVENT_INJECT is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_MMIOTRACE_TEST is not set +# CONFIG_PREEMPTIRQ_DELAY_TEST is not set +# CONFIG_SYNTH_EVENT_GEN_TEST is not set +# CONFIG_KPROBE_EVENT_GEN_TEST is not set +# CONFIG_HIST_TRIGGERS_DEBUG is not set +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KCSAN=y +CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y +CONFIG_STRICT_DEVMEM=y +CONFIG_IO_STRICT_DEVMEM=y + +# +# x86 Debugging +# +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +CONFIG_TRACE_IRQFLAGS_NMI_SUPPORT=y +# CONFIG_X86_VERBOSE_BOOTUP is not set +CONFIG_EARLY_PRINTK=y +# CONFIG_EARLY_PRINTK_DBGP is not set +# CONFIG_EARLY_PRINTK_USB_XDBC is not set +# CONFIG_EFI_PGT_DUMP is not set +# CONFIG_DEBUG_TLBFLUSH is not set +# CONFIG_IOMMU_DEBUG is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEBUG_BOOT_PARAMS=y +# CONFIG_CPA_DEBUG is not set +# CONFIG_DEBUG_ENTRY is not set +# CONFIG_DEBUG_NMI_SELFTEST is not set +# CONFIG_X86_DEBUG_FPU is not set +# CONFIG_PUNIT_ATOM_DEBUG is not set +CONFIG_UNWINDER_ORC=y +# CONFIG_UNWINDER_FRAME_POINTER is not set +# CONFIG_UNWINDER_GUESS is not set +# end of x86 Debugging + +# +# Kernel Testing and Coverage +# +# CONFIG_KUNIT is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +CONFIG_FUNCTION_ERROR_INJECTION=y +# CONFIG_FAULT_INJECTION is not set +CONFIG_ARCH_HAS_KCOV=y +CONFIG_CC_HAS_SANCOV_TRACE_PC=y +# CONFIG_KCOV is not set +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_LKDTM=m +# CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_MIN_HEAP is not set +# CONFIG_TEST_SORT is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_RBTREE_TEST is not set +# CONFIG_REED_SOLOMON_TEST is not set +# CONFIG_INTERVAL_TREE_TEST is not set +# CONFIG_PERCPU_TEST is not set +# CONFIG_ATOMIC64_SELFTEST is not set +# CONFIG_ASYNC_RAID6_TEST is not set +# CONFIG_TEST_HEXDUMP is not set +# CONFIG_TEST_STRING_HELPERS is not set +# CONFIG_TEST_STRSCPY is not set +# CONFIG_TEST_KSTRTOX is not set +# CONFIG_TEST_PRINTF is not set +# CONFIG_TEST_BITMAP is not set +# CONFIG_TEST_BITFIELD is not set +# CONFIG_TEST_UUID is not set +# CONFIG_TEST_XARRAY is not set +# CONFIG_TEST_OVERFLOW is not set +# CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set +# CONFIG_TEST_IDA is not set +# CONFIG_TEST_PARMAN is not set +# CONFIG_TEST_LKM is not set +# CONFIG_TEST_BITOPS is not set +# CONFIG_TEST_VMALLOC is not set +# CONFIG_TEST_USER_COPY is not set +# CONFIG_TEST_BPF is not set +# CONFIG_TEST_BLACKHOLE_DEV is not set +# CONFIG_FIND_BIT_BENCHMARK is not set +# CONFIG_TEST_FIRMWARE is not set +# CONFIG_TEST_SYSCTL is not set +# CONFIG_TEST_UDELAY is not set +# CONFIG_TEST_STATIC_KEYS is not set +# CONFIG_TEST_KMOD is not set +# CONFIG_TEST_MEMCAT_P is not set +# CONFIG_TEST_OBJAGG is not set +# CONFIG_TEST_STACKINIT is not set +# CONFIG_TEST_MEMINIT is not set +# CONFIG_TEST_HMM is not set +# CONFIG_TEST_FPU is not set +# CONFIG_MEMTEST is not set +# CONFIG_HYPERV_TESTING is not set +# 
end of Kernel Testing and Coverage +# end of Kernel hacking diff --git a/linux-tkg/linux-tkg-config/generic-desktop-profile.cfg b/linux-tkg/linux-tkg-config/generic-desktop-profile.cfg new file mode 100644 index 0000000..ac64d8a --- /dev/null +++ b/linux-tkg/linux-tkg-config/generic-desktop-profile.cfg @@ -0,0 +1,35 @@ +# linux-TkG config file +# Generic Desktop + + +#### KERNEL OPTIONS #### + +# Disable some non-module debugging - See PKGBUILD for the list +_debugdisable="false" + +# LEAVE AN EMPTY VALUE TO BE PROMPTED ABOUT FOLLOWING OPTIONS AT BUILD TIME + +# Set to "true" to disable FUNCTION_TRACER/GRAPH_TRACER, lowering overhead but limiting debugging and analyzing of kernel functions - Kernel default is "false" +_ftracedisable="false" + +# Set to "true" to disable NUMA, lowering overhead, but breaking CUDA/NvEnc on Nvidia equipped systems - Kernel default is "false" +_numadisable="false" + +# Set to "true" to use explicit preemption points to lower latency at the cost of a small throughput loss - Can give a nice perf boost in VMs - Kernel default is "false" +_voluntary_preempt="false" + +# A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience (ZENIFY) - Default is "true" +_zenify="true" + +# compiler optimization level - 1. Optimize for performance (-O2); 2. Optimize harder (-O3); 3. Optimize for size (-Os) - Kernel default is "2" +_compileroptlevel="1" + +# Trust the CPU manufacturer to initialize Linux's CRNG (RANDOM_TRUST_CPU) - Kernel default is "false" +_random_trust_cpu="false" + +# CPU scheduler runqueue sharing - No sharing (RQ_NONE), SMT (hyperthread) siblings (RQ_SMT), Multicore siblings (RQ_MC), Symmetric Multi-Processing (RQ_SMP), NUMA (RQ_ALL) +# Valid values are "none", "smt", "mc", "mc-llc"(for zen), "smp", "all" - Kernel default is "mc" +_runqueue_sharing="mc" + +# Timer frequency - "500", "750" or "1000" - More options available in kernel config prompt when left empty depending on selected cpusched - Kernel default is "750" +_timer_freq="500" diff --git a/linux-tkg/linux-tkg-config/prepare b/linux-tkg/linux-tkg-config/prepare new file mode 100644 index 0000000..a4b9103 --- /dev/null +++ b/linux-tkg/linux-tkg-config/prepare @@ -0,0 +1,1260 @@ +#!/bin/bash + +ver54=72 +ver57=19 +ver58=16 +ver59=1 + +_tkg_initscript() { + # Load external configuration file if present. Available variable values will overwrite customization.cfg ones. + if [ -e "$_EXT_CONFIG_PATH" ]; then + source "$_EXT_CONFIG_PATH" && msg2 "External configuration file $_EXT_CONFIG_PATH will be used to override customization.cfg values." && msg2 "" + fi + + # create build dir early + if [ "$_distro" = "Void" ]; then + _path="${XBPS_BUILDDIR}/${wrksrc}" + else + _path="${_where}" + fi + + if [ -z "$_version" ] && [ ! -e "$_path"/versel ]; then + plain "Which kernel version do you want to install?" + read -rp "`echo $' > 1. 5.8\n 2. 5.9\n 3. 5.7\n 4. 
5.4\nchoice[1-4?]'`" _VERSEL;
+ case $_VERSEL in
+ "2")
+ echo "_basever=59" > "$_path"/versel
+ echo "_basekernel=5.9" >> "$_path"/versel
+ echo "_sub=${ver59}" >> "$_path"/versel
+ ;;
+ "3")
+ echo "_basever=57" > "$_path"/versel
+ echo "_basekernel=5.7" >> "$_path"/versel
+ echo "_sub=${ver57}" >> "$_path"/versel
+ ;;
+ "4")
+ echo "_basever=54" > "$_path"/versel
+ echo "_basekernel=5.4" >> "$_path"/versel
+ echo "_sub=${ver54}" >> "$_path"/versel
+ ;;
+ *)
+ echo "_basever=58" > "$_path"/versel
+ echo "_basekernel=5.8" >> "$_path"/versel
+ echo "_sub=${ver58}" >> "$_path"/versel
+ ;;
+ esac
+ elif [ -n "$_version" ];then
+ case "$_version" in
+ "5.4")
+ echo "_basever=54" > "$_path"/versel
+ echo "_basekernel=5.4" >> "$_path"/versel
+ echo "_sub=${ver54}" >> "$_path"/versel
+ ;;
+ "5.7")
+ echo "_basever=57" > "$_path"/versel
+ echo "_basekernel=5.7" >> "$_path"/versel
+ echo "_sub=${ver57}" >> "$_path"/versel
+ ;;
+ "5.8")
+ echo "_basever=58" > "$_path"/versel
+ echo "_basekernel=5.8" >> "$_path"/versel
+ echo "_sub=${ver58}" >> "$_path"/versel
+ ;;
+ "5.9")
+ echo "_basever=59" > "$_path"/versel
+ echo "_basekernel=5.9" >> "$_path"/versel
+ echo "_sub=${ver59}" >> "$_path"/versel
+ ;;
+ *)
+ error "There is something wrong with your kernel version selection, exiting..."
+ exit 1
+ esac
+ fi
+
+ # source versel early if present
+ if [ -e "${_path}"/versel ]; then
+ source "${_path}"/versel
+ fi
+
+ if [ "$_distro" != "Void" ]; then
+ cp "$_where"/linux-tkg-patches/${_basekernel}/* "$_where" # copy patches inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking
+ cp "$_where"/linux-tkg-config/${_basekernel}/* "$_where" # copy config files and hooks inside the PKGBUILD's dir to preserve makepkg sourcing and md5sum checking
+ else
+ cp "$_where"/linux-tkg-patches/${_basekernel}/* "$_path"
+ cp "$_where"/linux-tkg-config/${_basekernel}/* "$_path"
+ fi
+
+ if [ -z "$_OPTIPROFILE" ] && [ ! -e "$_path"/cpuschedset ]; then
+ # Prompt about optimized configurations. Available variable values will overwrite customization.cfg/external config ones.
+ plain "Do you want to use a predefined optimized profile?"
+ read -rp "`echo $' > 1.Custom\n 2.Ryzen Desktop (Performance)\n 3.Other Desktop (Performance)\nchoice[1-3?]: '`" _OPTIPROFILE;
+ fi
+ if [ "$_OPTIPROFILE" = "2" ]; then
+ source "$_where"/ryzen-desktop-profile.cfg && msg2 "Ryzen Desktop (Performance) profile will be used." && msg2 ""
+ elif [ "$_OPTIPROFILE" = "3" ]; then
+ source "$_where"/generic-desktop-profile.cfg && msg2 "Generic Desktop (Performance) profile will be used." && msg2 ""
+ fi
+
+ # source cpuschedset early if present
+ if [ -e "${_path}"/cpuschedset ]; then
+ source "${_path}"/cpuschedset
+ fi
+
+ # source compilerset early if present
+ if [ -e "${_path}"/compilerset ]; then
+ source "${_path}"/compilerset
+ fi
+
+ # CPU SCHED selector
+ if [ -z "$_cpusched" ] && [ ! -e "${_path}"/cpuschedset ]; then
+ plain "What CPU sched variant do you want to build/install?"
+ if [ "$_basever" = "58" ]; then + prompt="`echo $' > 1.Undead PDS (TkG)\n 2.Project C / PDS\n 3.Project C / BMQ\n 4.CFS\nchoice[1-4?]: '`" + elif [ "$_basever" = "59" ]; then + prompt="`echo $' > 1.Project C / PDS\n 2.Project C / BMQ\n 3.MuQSS\n 4.CFS\nchoice[1-3?]: '`" + else + prompt="`echo $' > 1.PDS\n 2.MuQSS\n 3.BMQ\n 4.CFS\nchoice[1-4?]: '`" + fi + read -rp "$prompt" CONDITION; + if [ "$CONDITION" = "2" ];then + if [ "$_basever" = "58" ]; then + echo "_cpusched=\"pds\"" > "${_path}"/cpuschedset + elif [ "$_basever" = "59" ]; then + echo "_cpusched=\"bmq\"" > "${_path}"/cpuschedset + else + echo "_cpusched=\"MuQSS\"" > "${_path}"/cpuschedset + fi + elif [ "$CONDITION" = "3" ]; then + if [ "$_basever" != "59" ]; then + echo "_cpusched=\"bmq\"" > "${_path}"/cpuschedset + else + echo "_cpusched=\"MuQSS\"" > "${_path}"/cpuschedset + fi + elif [ "$CONDITION" = "4" ]; then + echo "_cpusched=\"cfs\"" > "${_path}"/cpuschedset + else + if [ "$_basever" = "58" ]; then + echo "_cpusched=\"upds\"" > "${_path}"/cpuschedset + else + echo "_cpusched=\"pds\"" > "${_path}"/cpuschedset + fi + fi + if [ -n "$_custom_pkgbase" ]; then + echo "_custom_pkgbase=\"${_custom_pkgbase}\"" >> "${_path}"/cpuschedset + fi + elif [ "$_cpusched" = "upds" ]; then + echo "_cpusched=\"upds\"" > "${_path}"/cpuschedset + elif [ "$_cpusched" = "pds" ]; then + echo "_cpusched=\"pds\"" > "${_path}"/cpuschedset + elif [ "$_cpusched" = "cfs" ]; then + echo "_cpusched=\"cfs\"" > "${_path}"/cpuschedset + elif [ "$_cpusched" = "bmq" ]; then + echo "_cpusched=\"bmq\"" > "${_path}"/cpuschedset + elif [ "$_cpusched" = "muqss" ] || [ "$_cpusched" = "MuQSS" ]; then + echo "_cpusched=\"MuQSS\"" > "${_path}"/cpuschedset + else + if [ "$_nofallback" != "true" ]; then + warning "Something is wrong with your cpusched selection. Do you want to fallback to CFS (default)?" + read -rp "`echo $' > N/y : '`" _fallback; + fi + if [[ "$_fallback" =~ [yY] ]] || [ "$_nofallback" = "true" ]; then + echo "_cpusched=\"cfs\"" > "${_path}"/cpuschedset + else + error "Exiting..." + exit 1 + fi + fi + if [ -n "$_custom_pkgbase" ]; then + echo "_custom_pkgbase=\"${_custom_pkgbase}\"" >> "${_path}"/cpuschedset + fi + + case $_compileroptlevel in + "2") + _compileropt="-O3" + ;; + "3") + _compileropt="-Os" + ;; + *) + _compileropt="-O2" + ;; + esac + +# Compiler selector + if [ -z "$_compiler" ] && [ ! -e "${_path}"/compilerset ]; then + plain "Which compiler do you want to use?" + read -rp "`echo $' > 1.GCC (recommended)\n 2.Clang/LLVM\nchoice[1-2?]: '`" _CONDITION_CMPLR; + if [ "$_CONDITION_CMPLR" = "2" ];then + echo "_compiler_name=\"-llvm\"" > ${_path}/compilerset + echo "llvm_opt=\"LLVM=1\"" >> ${_path}/compilerset + else + echo "_compiler_name=" >> ${_path}/compilerset + fi + elif [ "$_compiler" = "llvm" ]; then + echo "_compiler_name=\"-llvm\"" > ${_path}/compilerset + echo "llvm_opt=\"LLVM=1\"" >> ${_path}/compilerset + else + echo "_compiler_name=" >> ${_path}/compilerset + if [ "$_nofallback" != "true" ] && [ ! -e "${_path}"/compilerset ] && [ "$_compiler" != "llvm" ]; then + warning "Something is wrong with your compiler selection. Do you want to fallback to GCC (default)?" + read -rp "`echo $' > N/y : '`" _fallback; + fi + if [[ "$_fallback" =~ [yY] ]] || [ "$_nofallback" = "true" ]; then + echo "_compiler_name=" >> "${_path}"/compilerset + fi + if [ ! -e "${_path}"/compilerset ]; then + error "Exiting..." 
+ exit 1 + fi + fi + + source "${_path}"/cpuschedset + source "${_path}"/compilerset + source "${_path}"/versel +} + +user_patcher() { + # To patch the user because all your base are belong to us + local _patches=("$_where"/*."${_userpatch_ext}revert") + if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then + if [ "$_user_patches_no_confirm" != "true" ]; then + msg2 "Found ${#_patches[@]} 'to revert' userpatches for ${_userpatch_target}:" + printf '%s\n' "${_patches[@]}" + read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; + fi + if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" = "true" ]; then + for _f in "${_patches[@]}"; do + if [ -e "${_f}" ]; then + msg2 "######################################################" + msg2 "" + msg2 "Reverting your own ${_userpatch_target} patch ${_f}" + msg2 "" + msg2 "######################################################" + patch -Np1 -R < "${_f}" + echo "Reverted your own patch ${_f}" >> "$_where"/last_build_config.log + fi + done + fi + fi + + _patches=("$_where"/*."${_userpatch_ext}patch") + if [ ${#_patches[@]} -ge 2 ] || [ -e "${_patches}" ]; then + if [ "$_user_patches_no_confirm" != "true" ]; then + msg2 "Found ${#_patches[@]} userpatches for ${_userpatch_target}:" + printf '%s\n' "${_patches[@]}" + read -rp "Do you want to install it/them? - Be careful with that ;)"$'\n> N/y : ' _CONDITION; + fi + if [[ "$_CONDITION" =~ [yY] ]] || [ "$_user_patches_no_confirm" = "true" ]; then + for _f in "${_patches[@]}"; do + if [ -e "${_f}" ]; then + msg2 "######################################################" + msg2 "" + msg2 "Applying your own ${_userpatch_target} patch ${_f}" + msg2 "" + msg2 "######################################################" + patch -Np1 < "${_f}" + echo "Applied your own patch ${_f}" >> "$_where"/last_build_config.log + fi + done + fi + fi +} + +_tkg_srcprep() { + + if [ "${_distro}" = "Void" ] && [ -e ${srcdir}/sum_failed ]; then + exit 1 + fi + + if [ "${_distro}" = "Arch" ]; then + msg2 "Setting version..." 
+ scripts/setlocalversion --save-scmversion
+ echo "-$pkgrel-tkg-${_cpusched}-${_compiler_name}" > localversion.10-pkgrel
+ echo "" > localversion.20-pkgname
+
+ # add upstream patch
+ if [ "$_sub" != "0" ]; then
+ msg2 "Patching from $_basekernel to $pkgver"
+ patch -p1 -i "$srcdir"/patch-"${pkgver}"
+ fi
+
+ # ARCH Patches
+ if [ "${_configfile}" = "config_hardened_${_basekernel}.x86_64" ] && [ "${_cpusched}" = "cfs" ]; then
+ msg2 "Using linux hardened patchset"
+ patch -Np1 -i "$srcdir"/0012-linux-hardened.patch
+ else
+ patch -Np1 -i "$srcdir"/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch
+ fi
+ fi
+
+ pwd
+ if [ "$_distro" = "Void" ] && [[ "$_sub" = rc* ]]; then
+ cd ${wrksrc}/linux-${_rc_kern_ver}
+ elif [ "$_distro" = "Void" ]; then
+ cd ${wrksrc}/linux-${_kern_ver}
+ fi
+
+ # graysky's cpu opts - https://github.com/graysky2/kernel_gcc_patch
+ if [ "$_compiler_name" != "-llvm" ]; then
+ msg2 "Applying graysky's cpu opts patch"
+ if [ "${_distro}" = "Arch" ]; then
+ patch -Np1 -i "$srcdir"/enable_additional_cpu_optimizations_for_gcc_v10.1%2B_kernel_v${opt_ver}.patch
+ elif [ "${_distro}" = "Void" ]; then
+ patch -Np1 -i "${wrksrc}"/enable_additional_cpu_optimizations_for_gcc_v10.1+_kernel_v${opt_ver}.patch
+ else
+ patch -Np1 -i "$srcdir"/enable_additional_cpu_optimizations_for_gcc_v10.1+_kernel_v${opt_ver}+.patch
+ fi
+ fi
+
+ # TkG
+ msg2 "Applying clear linux patches"
+ patch -Np1 -i "$srcdir"/0002-clear-patches.patch
+
+ msg2 "Applying glitched base patch"
+ patch -Np1 -i "$srcdir"/0003-glitched-base.patch
+
+ if [ -z $_misc_adds ]; then
+ plain "Enable misc additions? May contain temporary fixes pending upstream or changes that can break on non-Arch."
+ read -rp "`echo $' > [Y]/n : '`" _interactive_misc_adds;
+ if [ "$_interactive_misc_adds" != "n" ] && [ "$_interactive_misc_adds" != "N" ]; then
+ _misc_adds="true"
+ fi
+ fi
+
+ if [ "$_misc_adds" = "true" ] && [ "$_basever" != "54" ]; then
+ msg2 "Applying misc additions patch"
+ patch -Np1 -i "$srcdir"/0012-misc-additions.patch
+ fi
+
+ # prjc/bmq patch rev
+ if [ "$_basever" = "59" ]; then
+ rev=0
+ elif [ "$_basever" = "58" ] || [ "$_basever" = "57" ]; then
+ rev=3
+ fi
+
+ if [ "${_cpusched}" = "MuQSS" ]; then
+ # MuQSS
+ msg2 "Applying MuQSS base patch"
+ patch -Np1 -i "$srcdir"/0004-${_basekernel}-ck1.patch
+
+ if [ "${_aggressive_ondemand}" = "true" ]; then
+ msg2 "Applying MuQSS aggressive ondemand governor patch"
+ patch -Np1 -i "$srcdir"/0004-glitched-ondemand-muqss.patch
+ fi
+
+ msg2 "Applying Glitched MuQSS patch"
+ patch -Np1 -i "$srcdir"/0004-glitched-muqss.patch
+
+ elif [ "${_cpusched}" = "upds" ] || [ "${_cpusched}" = "pds" ]; then
+ # PDS-mq
+ msg2 "Applying PDS base patch"
+ if [ "${_cpusched}" = "upds" ] && [ "$_basever" != "59" ]; then
+ patch -Np1 -i "$srcdir"/0005-v${_basekernel}_undead-pds099o.patch
+ elif [ "$_basever" != "54" ]; then
+ patch -Np1 -i "$srcdir"/0009-prjc_v${_basekernel}-r${rev}.patch
+ fi
+
+ # ondemand patches don't exist for 5.4
+ if [ "$_basever" = "58" ] && [ "${_cpusched}" = "upds" ];then
+ # is it dead or alive
+ doa="-undead"
+ fi
+ if [ "${_aggressive_ondemand}" = "true" ] && [ "$_basever" != "54" ] && [ "${_cpusched}" = "upds" ]; then
+ msg2 "Applying PDS aggressive ondemand governor patch"
+ patch -Np1 -i "$srcdir"/0005${doa}-glitched-ondemand-pds.patch
+ fi
+
+ msg2 "Applying Glitched PDS patch"
+ patch -Np1 -i "$srcdir"/0005${doa}-glitched-pds.patch
+
+ elif [ "${_cpusched}" = "bmq" ]; then
+ # Project C / BMQ
+ msg2 "Applying Project C / BMQ base patch"
+ if [ "$_basever" != "54" ]; then
+ patch -Np1 -i "$srcdir"/0009-prjc_v${_basekernel}-r${rev}.patch
+ else
+ patch -Np1 -i "$srcdir"/0009-bmq_v5.4-r2.patch
+ fi
+
+ if [ "${_aggressive_ondemand}" = "true" ] && [ "$_basever" != "54" ]; then
+ msg2 "Applying BMQ aggressive ondemand governor patch"
+ patch -Np1 -i "$srcdir"/0009-glitched-ondemand-bmq.patch
+ fi
+
+ msg2 "Applying Glitched BMQ patch"
+ patch -Np1 -i "$srcdir"/0009-glitched-bmq.patch
+
+ elif [ "${_cpusched}" = "cfs" ]; then
+ msg2 "Applying Glitched CFS patch"
+ patch -Np1 -i "$srcdir"/0003-glitched-cfs.patch
+ fi
+
+ if [ "$_distro" = "Void" ] && [[ "$_sub" = rc* ]]; then
+ cd ${wrksrc}/linux-${_rc_kern_ver}
+ elif [ "$_distro" = "Void" ] && [[ "$_sub" != rc* ]]; then
+ cd ${wrksrc}/linux-${_kern_ver}
+ fi
+
+ if [ "${_distro}" = "Arch" ] || [ "$_distro" = "Void" ]; then
+ if [ -z "${_configfile}" ]; then
+ _configfile="config.x86_64"
+ fi
+
+ cat "${srcdir}/${_configfile}" > ./.config
+ fi
+
+ # Set some -tkg defaults
+ echo "# CONFIG_DYNAMIC_FAULT is not set" >> ./.config
+ sed -i -e 's/CONFIG_DEFAULT_FQ_CODEL=y/# CONFIG_DEFAULT_FQ_CODEL is not set/' ./.config
+ echo "CONFIG_DEFAULT_CAKE=y" >> ./.config
+ echo "CONFIG_NR_TTY_DEVICES=63" >> ./.config
+ if [ "$_basever" = "54" ]; then
+ echo "CONFIG_TP_SMAPI=m" >> ./.config
+ echo "CONFIG_RAID6_USE_PREFER_GEN=y" >> ./.config
+ fi
+ if [ "$_basever" = "54" ] || [ "$_basever" = "59" ]; then
+ sed -i -e 's/CONFIG_RCU_BOOST_DELAY=500/CONFIG_RCU_BOOST_DELAY=0/' ./.config
+ fi
+ echo "# CONFIG_NTP_PPS is not set" >> ./.config
+ sed -i -e 's/CONFIG_CRYPTO_LZ4=m/CONFIG_CRYPTO_LZ4=y/' ./.config
+ sed -i -e 's/CONFIG_CRYPTO_LZ4HC=m/CONFIG_CRYPTO_LZ4HC=y/' ./.config
+ sed -i -e 's/CONFIG_LZ4_COMPRESS=m/CONFIG_LZ4_COMPRESS=y/' ./.config
+ sed -i -e 's/CONFIG_LZ4HC_COMPRESS=m/CONFIG_LZ4HC_COMPRESS=y/' ./.config
+ sed -i -e 's/CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO=y/# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set/' ./.config
+ sed -i -e 's/# CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4 is not set/CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZ4=y/' ./.config
+ sed -i -e 's/CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lzo"/CONFIG_ZSWAP_COMPRESSOR_DEFAULT="lz4"/' ./.config
+ sed -i -e 's/# CONFIG_CMDLINE_BOOL is not set/CONFIG_CMDLINE_BOOL=y/' ./.config
+ echo "CONFIG_CMDLINE=\"${_custom_commandline}\"" >> ./.config
+ echo "# CONFIG_CMDLINE_OVERRIDE is not set" >> ./.config
+ echo "# CONFIG_X86_P6_NOP is not set" >> ./.config
+
+ # openrgb
+ echo "CONFIG_I2C_NCT6775=m" >> ./.config
+
+ # ccache fix
+ if [ "$_noccache" != "true" ]; then
+ if { [ "$_distro" = "Arch" ] && pacman -Qq ccache &> /dev/null; } || { [ "$_distro" = "Ubuntu" ] && dpkg -l ccache > /dev/null; }\
+ || { [ "$_distro" = "Void" ] && xbps-query -s ccache > /dev/null; } ; then
+ sed -i -e 's/CONFIG_GCC_PLUGINS=y/# CONFIG_GCC_PLUGINS is not set/' ./.config
+ fi
+ # Void uses LibreSSL
+ if [ "$_distro" = "Void" ]; then
+ sed -i -e 's/CONFIG_MODULE_SIG_SHA512=y/# CONFIG_MODULE_SIG_SHA512 is not set/' ./.config
+ sed -i -e 's/# CONFIG_MODULE_SIG_SHA1 is not set/CONFIG_MODULE_SIG_SHA1=y/' ./.config
+ sed -i -e 's/CONFIG_MODULE_SIG_HASH="sha512"/CONFIG_MODULE_SIG_HASH="sha1"/' ./.config
+ fi
+ fi
+ # Skip dbg package creation on non-Arch
+ if [ "$_distro" != "Arch" ]; then
+ sed -i -e 's/CONFIG_DEBUG_INFO.*/CONFIG_DEBUG_INFO=n/' ./.config
+ fi
+ if [ "$_compiler_name" = "-llvm" ]; then
+ if [ "$_basever" = "59" ]; then
+ echo 'CONFIG_INIT_STACK_ALL_PATTERN=n' >> ./.config
+ else
+ echo 'CONFIG_INIT_STACK_ALL=n' >> ./.config
+ fi
+ sed -i -e 's/# 
CONFIG_INIT_ON_FREE_DEFAULT_ON is not set/CONFIG_INIT_ON_FREE_DEFAULT_ON=y/' ./.config + sed -i -e 's/CONFIG_INIT_STACK_NONE=y/CONFIG_INIT_STACK_NONE=n/' ./.config + echo "CONFIG_INIT_STACK_ALL_ZERO=y" >> ./.config + fi + + if [ "$_font_autoselect" != "false" ]; then + sed -i -e 's/CONFIG_FONT_TER16x32=y/# CONFIG_FONT_TER16x32 is not set\nCONFIG_FONT_AUTOSELECT=y/' ./.config + fi + + # Inject cpuopts options + if [ "$_compiler_name" != "-llvm" ]; then + echo "# CONFIG_MK8SSE3 is not set" >> ./.config + echo "# CONFIG_MK10 is not set" >> ./.config + echo "# CONFIG_MBARCELONA is not set" >> ./.config + echo "# CONFIG_MBOBCAT is not set" >> ./.config + echo "# CONFIG_MJAGUAR is not set" >> ./.config + echo "# CONFIG_MBULLDOZER is not set" >> ./.config + echo "# CONFIG_MPILEDRIVER is not set" >> ./.config + echo "# CONFIG_MSTEAMROLLER is not set" >> ./.config + echo "# CONFIG_MEXCAVATOR is not set" >> ./.config + echo "# CONFIG_MZEN is not set" >> ./.config + echo "# CONFIG_MZEN2 is not set" >> ./.config + echo "# CONFIG_MATOM is not set" >> ./.config + echo "# CONFIG_MNEHALEM is not set" >> ./.config + echo "# CONFIG_MWESTMERE is not set" >> ./.config + echo "# CONFIG_MSILVERMONT is not set" >> ./.config + echo "# CONFIG_MSANDYBRIDGE is not set" >> ./.config + echo "# CONFIG_MIVYBRIDGE is not set" >> ./.config + echo "# CONFIG_MHASWELL is not set" >> ./.config + echo "# CONFIG_MBROADWELL is not set" >> ./.config + echo "# CONFIG_MSKYLAKE is not set" >> ./.config + echo "# CONFIG_MSKYLAKEX is not set" >> ./.config + echo "# CONFIG_MCANNONLAKE is not set" >> ./.config + echo "# CONFIG_MICELAKE is not set" >> ./.config + echo "# CONFIG_MGOLDMONT is not set" >> ./.config + echo "# CONFIG_MGOLDMONTPLUS is not set" >> ./.config + echo "# CONFIG_MCASCADELAKE is not set" >> ./.config + echo "# CONFIG_MCOOPERLAKE is not set" >> ./.config + echo "# CONFIG_MTIGERLAKE is not set" >> ./.config + fi + + # Disable some debugging + if [ "${_debugdisable}" = "true" ]; then + sed -i -e 's/CONFIG_SLUB_DEBUG=y/# CONFIG_SLUB_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_DEBUG=y/# CONFIG_PM_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_ADVANCED_DEBUG=y/# CONFIG_PM_ADVANCED_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_PM_SLEEP_DEBUG=y/# CONFIG_PM_SLEEP_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_ACPI_DEBUG=y/# CONFIG_ACPI_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_SCHED_DEBUG=y/# CONFIG_SCHED_DEBUG is not set/' ./.config + sed -i -e 's/CONFIG_LATENCYTOP=y/# CONFIG_LATENCYTOP is not set/' ./.config + sed -i -e 's/CONFIG_DEBUG_PREEMPT=y/# CONFIG_DEBUG_PREEMPT is not set/' ./.config + fi + + if [ "${_cpusched}" = "MuQSS" ]; then + # MuQSS default config + echo "CONFIG_SCHED_MUQSS=y" >> ./.config + elif [ "${_cpusched}" = "pds" ]; then + # PDS default config + echo "CONFIG_SCHED_ALT=y" >> ./.config + echo "CONFIG_SCHED_PDS=y" >> ./.config + echo "# CONFIG_SCHED_BMQ is not set" >> ./.config + elif [ "${_cpusched}" = "upds" ]; then + # PDS default config + echo "CONFIG_SCHED_PDS=y" >> ./.config + elif [ "${_cpusched}" = "bmq" ]; then + # BMQ default config + echo "CONFIG_SCHED_ALT=y" >> ./.config + echo "CONFIG_SCHED_BMQ=y" >> ./.config + echo "# CONFIG_SCHED_PDS is not set" >> ./.config + fi + + if [ "${_cpusched}" = "MuQSS" ] || [ "${_cpusched}" = "pds" ] || [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "upds" ]; then + # Disable CFS + sed -i -e 's/CONFIG_FAIR_GROUP_SCHED=y/# CONFIG_FAIR_GROUP_SCHED is not set/' ./.config + sed -i -e 's/CONFIG_CFS_BANDWIDTH=y/# 
CONFIG_CFS_BANDWIDTH is not set/' ./.config + # sched yield type + if [ -n "$_sched_yield_type" ]; then + CONDITION0="$_sched_yield_type" + # in customization.cfg for linux59-tkg _sched_yield_type is set to 0 + elif [ "$_basever" = "59" ]; then + CONDITION0="0" + else + plain "" + plain "CPU sched_yield_type - Choose what sort of yield sched_yield will perform." + plain "" + plain "For PDS and MuQSS:" + plain "0: No yield." + plain "1: Yield only to better priority/deadline tasks." + plain "2: Expire timeslice and recalculate deadline." + plain "" + plain "For BMQ (experimental) - No recommended value yet, so try for yourself x) :" + plain "0: No yield." + plain "1: Deboost and requeue task. (default)" + plain "2: Set rq skip task." + if [ "${_cpusched}" = "MuQSS" ]; then + read -rp "`echo $'\n 0. Supposedly best option for gaming performance - could lead to stability issues on some (AMD) platforms when combined with MuQSS\n > 1. Default and recommended option for MuQSS - could lead to stability issues on some (Intel) platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; + else + read -rp "`echo $'\n > 0. Recommended option for gaming on PDS - "tkg" default\n 1. Default, but can lead to stability issues on some platforms\n 2. Can be a good option with low rr_interval on MuQSS\n [0-2?]: '`" CONDITION0; + fi + fi + if [ "$_basever" = "54" ] || [ "$_basever" = "57" ] && [ "$_cpusched" != "bmq" ]; then + _sched=${_cpusched} + else + _sched="alt_core" + fi + if [ "$CONDITION0" = "0" ]; then + if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ] && [ "$_basever" != "54" ]; then + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/${_sched}.c + else + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c + fi + elif [ "$CONDITION0" = "1" ]; then + msg2 "Using default CPU sched yield type (1)" + elif [ "$CONDITION0" = "2" ]; then + if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ] && [ "$_basever" != "54" ]; then + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/${_sched}.c + elif [ "${_cpusched}" = "upds" ]; then + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/pds.c + else + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 2;/' ./kernel/sched/"${_cpusched}".c + fi + else + if [ "${_cpusched}" = "MuQSS" ]; then + msg2 "Using default CPU sched yield type (1)" + elif [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ] && [ "$_basever" != "54" ]; then + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/${_sched}.c + elif [ "${_cpusched}" = "upds" ]; then + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/pds.c + else + sed -i -e 's/int sched_yield_type __read_mostly = 1;/int sched_yield_type __read_mostly = 0;/' ./kernel/sched/"${_cpusched}".c + fi + fi + fi + + # Round Robin interval + if [ "${_cpusched}" = "MuQSS" ] || [ "${_cpusched}" = "pds" ] || [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "upds" ]; then + if [ -n "$_rr_interval" ]; then + CONDITION1="$_rr_interval" + else + plain "" + plain "Round Robin interval is the longest duration two tasks with the same nice level will" + plain "be delayed for. 
When CPU time is requested by a task, it receives a time slice equal" + plain "to the rr_interval in addition to a virtual deadline. When using yield_type 2, a low" + plain "value can help offset the disadvantages of rescheduling a process that has yielded." + plain "" + plain "MuQSS default: 6ms" + plain "PDS default: 4ms" + plain "BMQ default: 2ms" + read -rp "`echo $'\n > 0.Keep defaults\n 1.2ms\n 2.4ms\n 3.6ms\n 4.8ms\n [0-4?]: '`" CONDITION1; + fi + if [ "$CONDITION1" = "1" ]; then + msg2 "Using 2ms rr_interval" + _rrvalue="2" + elif [ "$CONDITION1" = "2" ]; then + msg2 "Using 4ms rr_interval" + _rrvalue="4" + elif [ "$CONDITION1" = "3" ]; then + msg2 "Using 6ms rr_interval" + _rrvalue="6" + elif [ "$CONDITION1" = "4" ]; then + msg2 "Using 8ms rr_interval" + _rrvalue="8" + else + msg2 "Using default rr_interval" + _rrvalue="default" + fi + if [ "$_basever" != "54" ]; then + if [ "$_rrvalue" != "default" ]; then + if [ "${_cpusched}" = "MuQSS" ]; then + sed -i -e "s/int rr_interval __read_mostly = 6;/int rr_interval __read_mostly = ${_rrvalue};/" ./kernel/sched/"${_cpusched}".c + elif [ "${_cpusched}" = "upds" ]; then + sed -i -e "s/#define SCHED_DEFAULT_RR (4)/#define SCHED_DEFAULT_RR (${_rrvalue})/" ./kernel/sched/pds.c + elif [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then + sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (${_rrvalue} * 1000 * 1000);/" ./kernel/sched/${_sched}.c + fi + if [ "${_cpusched}" == "bmq" ]; then + echo "CONFIG_SCHED_TIMESLICE=2" >> ./.config + fi + else + if [ "${_cpusched}" = "bmq" ] || [ "${_cpusched}" = "pds" ]; then + sed -i -e "s/u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000);/u64 sched_timeslice_ns __read_mostly = (2 * 1000 * 1000);/" ./kernel/sched/${_sched}.c + fi + fi + else + if [ "$_rrvalue" != "default" ]; then + if [ "${_cpusched}" == "MuQSS" ]; then + sed -i -e "s/int rr_interval __read_mostly = 6;/int rr_interval __read_mostly = ${_rrvalue};/" ./kernel/sched/"${_cpusched}".c + elif [ "${_cpusched}" == "pds" ]; then + sed -i -e "s/#define SCHED_DEFAULT_RR (4)/#define SCHED_DEFAULT_RR (${_rrvalue})/" ./kernel/sched/"${_cpusched}".c + elif [ "${_cpusched}" == "bmq" ]; then + echo "CONFIG_SCHED_TIMESLICE=${_rrvalue}" >> ./.config + fi + else + if [ "${_cpusched}" == "bmq" ]; then + echo "CONFIG_SCHED_TIMESLICE=2" >> ./.config + fi + fi + fi + fi + + # zenify + if [ "$_zenify" = "true" ]; then + echo "CONFIG_ZENIFY=y" >> ./.config + elif [ "$_zenify" = "false" ]; then + echo "# CONFIG_ZENIFY is not set" >> ./.config + fi + + # compiler optimization level + if [ "$_compileroptlevel" = "1" ]; then + if [ "$_basever" != "54" ]; then + echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config + else + echo "# CONFIG_CC_OPTIMIZE_HARDER is not set" >> ./.config + fi + elif [ "$_compileroptlevel" = "2" ]; then + sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config + if [ "$_basever" != "54" ]; then + echo "CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3=y" >> ./.config + else + echo "CONFIG_CC_OPTIMIZE_HARDER=y" >> ./.config + fi + elif [ "$_compileroptlevel" = "3" ]; then + sed -i -e 's/CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y/# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set/' ./.config + sed -i -e 's/# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set/CONFIG_CC_OPTIMIZE_FOR_SIZE=y/' ./.config + if [ "$_basever" != "54" ]; then + echo "# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 is not set" >> ./.config + else + echo "# 
CONFIG_CC_OPTIMIZE_HARDER is not set" >> ./.config + fi + fi + + # cpu opt + if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "native" ]; then + echo "# CONFIG_MNATIVE is not set" >> ./.config + fi + + if [ -n "$_processor_opt" ] && [ "$_processor_opt" != "generic" ]; then + sed -i -e 's/CONFIG_GENERIC_CPU=y/# CONFIG_GENERIC_CPU is not set/' ./.config + fi + + if [ "$_processor_opt" = "native" ]; then + echo "CONFIG_MNATIVE=y" >> ./.config + elif [ "$_processor_opt" = "k8" ]; then + sed -i -e 's/# CONFIG_MK8 is not set/CONFIG_MK8=y/' ./.config + elif [ "$_processor_opt" = "k8sse3" ]; then + sed -i -e 's/# CONFIG_MK8SSE3 is not set/CONFIG_MK8SSE3=y/' ./.config + elif [ "$_processor_opt" = "k10" ]; then + sed -i -e 's/# CONFIG_MK10 is not set/CONFIG_MK10=y/' ./.config + elif [ "$_processor_opt" = "barcelona" ]; then + sed -i -e 's/# CONFIG_MBARCELONA is not set/CONFIG_MBARCELONA=y/' ./.config + elif [ "$_processor_opt" = "bobcat" ]; then + sed -i -e 's/# CONFIG_MBOBCAT is not set/CONFIG_MBOBCAT=y/' ./.config + elif [ "$_processor_opt" = "jaguar" ]; then + sed -i -e 's/# CONFIG_MJAGUAR is not set/CONFIG_MJAGUAR=y/' ./.config + elif [ "$_processor_opt" = "bulldozer" ]; then + sed -i -e 's/# CONFIG_MBULLDOZER is not set/CONFIG_MBULLDOZER=y/' ./.config + elif [ "$_processor_opt" = "piledriver" ]; then + sed -i -e 's/# CONFIG_MPILEDRIVER is not set/CONFIG_MPILEDRIVER=y/' ./.config + elif [ "$_processor_opt" = "steamroller" ]; then + sed -i -e 's/# CONFIG_MSTEAMROLLER is not set/CONFIG_MSTEAMROLLER=y/' ./.config + elif [ "$_processor_opt" = "excavator" ]; then + sed -i -e 's/# CONFIG_MEXCAVATOR is not set/CONFIG_MEXCAVATOR=y/' ./.config + elif [ "$_processor_opt" = "zen" ]; then + sed -i -e 's/# CONFIG_MZEN is not set/CONFIG_MZEN=y/' ./.config + elif [ "$_processor_opt" = "zen2" ]; then + sed -i -e 's/# CONFIG_MZEN2 is not set/CONFIG_MZEN2=y/' ./.config + elif [ "$_processor_opt" = "mpsc" ]; then + sed -i -e 's/# CONFIG_MPSC is not set/CONFIG_MPSC=y/' ./.config + elif [ "$_processor_opt" = "atom" ]; then + sed -i -e 's/# CONFIG_MATOM is not set/CONFIG_MATOM=y/' ./.config + elif [ "$_processor_opt" = "core2" ]; then + sed -i -e 's/# CONFIG_MCORE2 is not set/CONFIG_MCORE2=y/' ./.config + elif [ "$_processor_opt" = "nehalem" ]; then + sed -i -e 's/# CONFIG_MNEHALEM is not set/CONFIG_MNEHALEM=y/' ./.config + elif [ "$_processor_opt" = "westmere" ]; then + sed -i -e 's/# CONFIG_MWESTMERE is not set/CONFIG_MWESTMERE=y/' ./.config + elif [ "$_processor_opt" = "silvermont" ]; then + sed -i -e 's/# CONFIG_MSILVERMONT is not set/CONFIG_MSILVERMONT=y/' ./.config + elif [ "$_processor_opt" = "sandybridge" ]; then + sed -i -e 's/# CONFIG_MSANDYBRIDGE is not set/CONFIG_MSANDYBRIDGE=y/' ./.config + elif [ "$_processor_opt" = "ivybridge" ]; then + sed -i -e 's/# CONFIG_MIVYBRIDGE is not set/CONFIG_MIVYBRIDGE=y/' ./.config + elif [ "$_processor_opt" = "haswell" ]; then + sed -i -e 's/# CONFIG_MHASWELL is not set/CONFIG_MHASWELL=y/' ./.config + elif [ "$_processor_opt" = "broadwell" ]; then + sed -i -e 's/# CONFIG_MBROADWELL is not set/CONFIG_MBROADWELL=y/' ./.config + elif [ "$_processor_opt" = "skylake" ]; then + sed -i -e 's/# CONFIG_MSKYLAKE is not set/CONFIG_MSKYLAKE=y/' ./.config + elif [ "$_processor_opt" = "skylakex" ]; then + sed -i -e 's/# CONFIG_MSKYLAKEX is not set/CONFIG_MSKYLAKEX=y/' ./.config + elif [ "$_processor_opt" = "cannonlake" ]; then + sed -i -e 's/# CONFIG_MCANNONLAKE is not set/CONFIG_MCANNONLAKE=y/' ./.config + elif [ "$_processor_opt" = "icelake" ]; then + sed -i -e 's/# 
CONFIG_MICELAKE is not set/CONFIG_MICELAKE=y/' ./.config + elif [ "$_processor_opt" = "goldmont" ]; then + sed -i -e 's/# CONFIG_MGOLDMONT is not set/CONFIG_MGOLDMONT=y/' ./.config + elif [ "$_processor_opt" = "goldmontplus" ]; then + sed -i -e 's/# CONFIG_MGOLDMONTPLUS is not set/CONFIG_MGOLDMONTPLUS=y/' ./.config + elif [ "$_processor_opt" = "cascadelake" ]; then + sed -i -e 's/# CONFIG_MCASCADELAKE is not set/CONFIG_MCASCADELAKE=y/' ./.config + elif [ "$_processor_opt" = "cooperlake" ]; then + sed -i -e 's/# CONFIG_MCOOPERLAKE is not set/CONFIG_MCOOPERLAKE=y/' ./.config + elif [ "$_processor_opt" = "tigerlake" ]; then + sed -i -e 's/# CONFIG_MTIGERLAKE is not set/CONFIG_MTIGERLAKE=y/' ./.config + fi + + # irq threading + if [ "$_irq_threading" = "true" ]; then + echo "CONFIG_FORCE_IRQ_THREADING=y" >> ./.config + elif [ "$_irq_threading" = "false" ]; then + echo "# CONFIG_FORCE_IRQ_THREADING is not set" >> ./.config + fi + + # smt nice + if [ "$_smt_nice" = "true" ]; then + echo "CONFIG_SMT_NICE=y" >> ./.config + elif [ "$_smt_nice" = "false" ]; then + echo "# CONFIG_SMT_NICE is not set" >> ./.config + fi + + # random trust cpu + if [ "$_random_trust_cpu" = "true" ]; then + sed -i -e 's/# CONFIG_RANDOM_TRUST_CPU is not set/CONFIG_RANDOM_TRUST_CPU=y/' ./.config + fi + + # rq sharing + if [ "$_runqueue_sharing" = "none" ]; then + echo -e "CONFIG_RQ_NONE=y\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ -z "$_runqueue_sharing" ] || [ "$_runqueue_sharing" = "smt" ]; then + echo -e "# CONFIG_RQ_NONE is not set\nCONFIG_RQ_SMT=y\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" = "mc" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\nCONFIG_RQ_MC=y\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" = "smp" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\nCONFIG_RQ_SMP=y\n# CONFIG_RQ_ALL is not set" >> ./.config + elif [ "$_runqueue_sharing" = "all" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\n# CONFIG_RQ_MC_LLC is not set\n# CONFIG_RQ_SMP is not set\nCONFIG_RQ_ALL=y" >> ./.config + elif [ "$_runqueue_sharing" = "mc-llc" ]; then + echo -e "# CONFIG_RQ_NONE is not set\n# CONFIG_RQ_SMT is not set\n# CONFIG_RQ_MC is not set\nCONFIG_RQ_MC_LLC=y\n# CONFIG_RQ_SMP is not set\n# CONFIG_RQ_ALL is not set" >> ./.config + fi + + # timer freq + if [ -n "$_timer_freq" ] && [ "$_timer_freq" != "300" ]; then + sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config + sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config + if [ "$_timer_freq" = "1000" ]; then + sed -i -e 's/# CONFIG_HZ_1000 is not set/CONFIG_HZ_1000=y/' ./.config + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=1000/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "CONFIG_HZ_1000_NODEF=y" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" = "750" ]; then + sed -i -e 
's/CONFIG_HZ=300/CONFIG_HZ=750/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "CONFIG_HZ_750=y" >> ./.config + echo "CONFIG_HZ_750_NODEF=y" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" = "500" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config + echo "CONFIG_HZ_500=y" >> ./.config + echo "CONFIG_HZ_500_NODEF=y" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + elif [ "$_timer_freq" = "100" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + echo "CONFIG_HZ_100=y" >> ./.config + echo "CONFIG_HZ_100_NODEF=y" >> ./.config + fi + elif [ "${_cpusched}" = "MuQSS" ] && [ -z "$_timer_freq" ]; then + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=100/' ./.config + echo "# CONFIG_HZ_500 is not set" >> ./.config + echo "# CONFIG_HZ_750 is not set" >> ./.config + echo "# CONFIG_HZ_1000_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_750_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_500_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + echo "CONFIG_HZ_100=y" >> ./.config + echo "CONFIG_HZ_100_NODEF=y" >> ./.config + else + sed -i -e 's/CONFIG_HZ_300=y/# CONFIG_HZ_300 is not set/' ./.config + sed -i -e 's/CONFIG_HZ_300_NODEF=y/# CONFIG_HZ_300_NODEF is not set/' ./.config + sed -i -e 's/CONFIG_HZ=300/CONFIG_HZ=500/' ./.config + echo "CONFIG_HZ_500=y" >> ./.config + echo "CONFIG_HZ_500_NODEF=y" >> ./.config + echo "# CONFIG_HZ_250_NODEF is not set" >> ./.config + echo "# CONFIG_HZ_300_NODEF is not set" >> ./.config + fi + + # default cpu gov + if [ "$_default_cpu_gov" = "performance" ]; then + sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config + sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y/' ./.config + elif [ "$_default_cpu_gov" = "ondemand" ]; then + sed -i -e 's/CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y/# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set/' ./.config + sed -i -e 's/# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set/CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y/' ./.config + fi + + # ACPI_CPUFREQ disablement + if [ "$_disable_acpi_cpufreq" = "true" ]; then + sed -i -e 's/CONFIG_X86_ACPI_CPUFREQ=m/# CONFIG_X86_ACPI_CPUFREQ is not set/' ./.config + fi + + # ftrace + if [ -z "$_ftracedisable" ]; then + plain "" + plain "Disable FUNCTION_TRACER/GRAPH_TRACER? Lowers overhead but limits debugging" + plain "and analyzing of kernel functions." 
+ read -rp "`echo $' > N/y : '`" CONDITION2; + fi + if [[ "$CONDITION2" =~ [yY] ]] || [ "$_ftracedisable" = "true" ]; then + sed -i -e 's/CONFIG_FUNCTION_TRACER=y/# CONFIG_FUNCTION_TRACER is not set/' ./.config + sed -i -e 's/CONFIG_FUNCTION_GRAPH_TRACER=y/# CONFIG_FUNCTION_GRAPH_TRACER is not set/' ./.config + fi + + # disable numa + if [ -z "$_numadisable" ]; then + plain "" + plain "Disable NUMA? Lowers overhead, but breaks CUDA/NvEnc on Nvidia if disabled." + plain "https://bbs.archlinux.org/viewtopic.php?id=239174" + read -rp "`echo $' > N/y : '`" CONDITION3; + fi + if [[ "$CONDITION3" =~ [yY] ]] || [ "$_numadisable" = "true" ]; then + # disable NUMA since 99.9% of users do not have multiple CPUs but do have multiple cores in one CPU + sed -i -e 's/CONFIG_NUMA=y/# CONFIG_NUMA is not set/' \ + -i -e '/CONFIG_AMD_NUMA=y/d' \ + -i -e '/CONFIG_X86_64_ACPI_NUMA=y/d' \ + -i -e '/CONFIG_NODES_SPAN_OTHER_NODES=y/d' \ + -i -e '/# CONFIG_NUMA_EMU is not set/d' \ + -i -e '/CONFIG_NODES_SHIFT=6/d' \ + -i -e '/CONFIG_NEED_MULTIPLE_NODES=y/d' \ + -i -e '/CONFIG_USE_PERCPU_NUMA_NODE_ID=y/d' \ + -i -e '/CONFIG_ACPI_NUMA=y/d' ./.config + fi + + # tickless + if [ -z "$_tickless" ]; then + plain "" + plain "Use CattaRappa mode (Tickless/Dynticks) ?" + plain "Can give higher performances in many cases but lower consistency on some hardware." + plain "Just tickless idle can perform better with some platforms (mostly AMD) or CPU schedulers (mostly MuQSS)." + if [ "${_cpusched}" = "MuQSS" ]; then + read -rp "`echo $'\n 0.No, use periodic ticks\n 1.Yes, full tickless baby!\n > 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; + else + read -rp "`echo $'\n 0.No, use periodic ticks\n > 1.Yes, full tickless baby!\n 2.Just tickless idle plz\n [0-2?]: '`" CONDITION4; + fi + fi + if [ "$CONDITION4" = "0" ] || [ "$_tickless" = "0" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/# CONFIG_HZ_PERIODIC is not set/CONFIG_HZ_PERIODIC=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ=y/# CONFIG_NO_HZ is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_COMMON=y/# CONFIG_NO_HZ_COMMON is not set/' ./.config + elif [ "$CONDITION4" = "2" ] || [ "$_tickless" = "2" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + else + if [ "${_cpusched}" = "MuQSS" ]; then + echo "# CONFIG_NO_HZ_FULL_NODEF is not set" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_IDLE is not set/CONFIG_NO_HZ_IDLE=y/' ./.config + sed -i -e 's/CONFIG_NO_HZ_FULL=y/# CONFIG_NO_HZ_FULL is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + else + echo "CONFIG_NO_HZ_FULL_NODEF=y" >> ./.config + sed -i -e 's/CONFIG_HZ_PERIODIC=y/# CONFIG_HZ_PERIODIC is not set/' ./.config + sed -i -e 's/CONFIG_NO_HZ_IDLE=y/# CONFIG_NO_HZ_IDLE is not set/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_FULL is not 
set/CONFIG_NO_HZ_FULL=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ is not set/CONFIG_NO_HZ=y/' ./.config + sed -i -e 's/# CONFIG_NO_HZ_COMMON is not set/CONFIG_NO_HZ_COMMON=y/' ./.config + echo "CONFIG_CONTEXT_TRACKING=y" >> ./.config + echo "# CONFIG_CONTEXT_TRACKING_FORCE is not set" >> ./.config + fi + fi + + # voluntary preempt + if [ -z "$_voluntary_preempt" ]; then + plain "" + plain "Use explicit preemption points?" + plain "It can improve latency on PDS (at the cost of throughput)" + plain "and improve throughput on other schedulers (at the cost of latency)" + read -rp "`echo $' > N/y : '`" CONDITION5; + fi + if [[ "$CONDITION5" =~ [yY] ]] || [ "$_voluntary_preempt" = "true" ]; then + sed -i -e 's/CONFIG_PREEMPT=y/# CONFIG_PREEMPT is not set/' ./.config + sed -i -e 's/CONFIG_PREEMPT_LL=y/# CONFIG_PREEMPT_LL is not set/' ./.config + sed -i -e 's/# CONFIG_PREEMPT_VOLUNTARY is not set/CONFIG_PREEMPT_VOLUNTARY=y/' ./.config + fi + + # Open Firmware support + if [ -z "$_OFenable" ]; then + plain "" + plain "Enable Device Tree and Open Firmware support?" + read -rp "`echo $' > N/y : '`" CONDITION6; + fi + if [[ "$CONDITION6" =~ [yY] ]] || [ "$_OFenable" = "true" ]; then + sed -i -e 's/# CONFIG_OF is not set/CONFIG_OF=y/' ./.config + fi + + # acs override + if [ -z "$_acs_override" ]; then + plain "" + plain "Use ACS override patch?" + plain "https://wiki.archlinux.org/index.php/PCI_passthrough_via_OVMF#Bypassing_the_IOMMU_groups_.28ACS_override_patch.29" + read -rp "`echo $' > N/y : '`" CONDITION7; + fi + if [[ "$CONDITION7" =~ [yY] ]] || [ "$_acs_override" = "true" ]; then + msg2 "Patching ACS override" + patch -Np1 -i "$srcdir"/0006-add-acs-overrides_iommu.patch + fi + + # bcachefs + if [ -z "$_bcachefs" ] && [ "$_basever" != "54" ]; then + plain "" + plain "Add bcachefs filesystem support? You'll have to install bcachefs-tools-git from the AUR for utilities." + plain "https://bcachefs.org/" + read -rp "`echo $' > N/y : '`" CONDITION8; + fi + if [[ "$CONDITION8" =~ [yY] ]] || [ "$_bcachefs" = "true" ]; then + msg2 "Patching bcachefs filesystem support" + patch -Np1 -i "$srcdir"/0008-${_basekernel}-bcachefs.patch + echo "CONFIG_BCACHEFS_FS=m" >> ./.config + echo "CONFIG_BCACHEFS_QUOTA=y" >> ./.config + echo "CONFIG_BCACHEFS_POSIX_ACL=y" >> ./.config + echo "# CONFIG_BCACHEFS_DEBUG is not set" >> ./.config + echo "# CONFIG_BCACHEFS_TESTS is not set" >> ./.config + echo "# CONFIG_DEBUG_CLOSURES is not set" >> ./.config + fi + + # fsync support + if [ -z "$_fsync" ]; then + plain "" + plain "Enable support for fsync, an experimental replacement for esync in Valve Proton 4.11+" + plain "https://steamcommunity.com/games/221410/announcements/detail/2957094910196249305" + read -rp "`echo $' > N/y : '`" CONDITION9; + fi + if [[ "$CONDITION9" =~ [yY] ]] || [ "$_fsync" = "true" ]; then + msg2 "Patching Fsync support" + patch -Np1 -i "$srcdir"/0007-v${_basekernel}-fsync.patch + fi + + # ZFS fix + if [ "$_basever" != "59" ]; then + if [ -z "$_zfsfix" ]; then + plain "" + plain "Add back missing symbol for AES-NI/AVX support on ZFS" + plain "https://github.com/NixOS/nixpkgs/blob/master/pkgs/os-specific/linux/kernel/export_kernel_fpu_functions_5_3.patch" + read -rp "`echo $' > N/y : '`" CONDITION11; + fi + if [[ "$CONDITION11" =~ [yY] ]] || [ "$_zfsfix" = "true" ]; then + msg2 "Patching missing symbol for AES-NI/AVX support on ZFS" + patch -Np1 -i "$srcdir"/0011-ZFS-fix.patch + fi + fi + + # Community patches + if [ -n "$_community_patches" ]; then + if [ ! 
-d "$_where/../../community-patches" ]; then + cd "$_where/../.." && git clone https://github.com/Frogging-Family/community-patches.git && cd "${srcdir}/${_srcpath}" + fi + _community_patches=($_community_patches) + for _p in ${_community_patches[@]}; do + ln -s "$_where"/../../community-patches/linux"$_basever"-tkg/$_p "$_where"/ + done + fi + + # userpatches + if [ "$_user_patches" = "true" ]; then + _userpatch_target="linux-${_basekernel}" + _userpatch_ext="my" + user_patcher + fi + + # Community patches removal + for _p in ${_community_patches[@]}; do + rm -f "$_where"/$_p + done + + if [ "$_distro" = "Arch" ] || [ "$_distro" = "Void" ]; then + # don't run depmod on 'make install'. We'll do this ourselves in packaging + sed -i '2iexit 0' scripts/depmod.sh + + # get kernel version + make prepare ${llvm_opt} + fi + + # modprobed-db + if [ -z "$_modprobeddb" ]; then + plain "" + plain "Use modprobed db to clean config from unneeded modules?" + plain "Speeds up compilation considerably. Requires root." + plain "https://wiki.archlinux.org/index.php/Modprobed-db" + plain "!!!! Make sure to have a well populated db !!!!" + read -rp "`echo $' > N/y : '`" CONDITIONMPDB; + fi + if [[ "$CONDITIONMPDB" =~ [yY] ]] || [ "$_modprobeddb" = "true" ]; then + if [ "$_distro" != "Void" ]; then + sudo modprobed-db recall + fi + yes "" | make localmodconfig ${llvm_opt} + fi + + if [ true = "$_config_fragments" ]; then + local fragments=() + mapfile -d '' -t fragments < <(find "$_where"/ -type f -name "*.myfrag" -print0) + + if [ true = "$_config_fragments_no_confirm" ]; then + printf 'Using config fragment %s\n' "${fragments[@]#$_where/}" + else + for i in "${!fragments[@]}"; do + while true; do + read -r -p 'Found config fragment '"${fragments[$i]#$_where/}"', apply it? [y/N] ' CONDITIONMPDB + CONDITIONMPDB="$(printf '%s' "$CONDITIONMPDB" | tr '[:upper:]' '[:lower:]')" + case "$CONDITIONMPDB" in + y|yes) + break;; + n|no|'') + unset fragments[$i] + break;; + *) + echo 'Please answer with yes or no' + esac + done + done + fi + + if [ 0 -lt "${#fragments[@]}" ]; then + scripts/kconfig/merge_config.sh -m .config "${fragments[@]}" + fi + fi + + # set _menuconfig early for Void + if [ "$_distro" = "Void" ]; then + _menuconfig="Void" + fi + + # menuconfig / nconfig + if [ -z "$_menunconfig" ] && [ "$_distro" != "Void" ]; then + plain "" + plain "*Optional* For advanced users - Do you want to use make menuconfig or nconfig" + plain "to configure the kernel before building it?" + plain "If you do, make sure your terminal is currently" + plain "at least 19 lines by 80 columns large or you'll get an error :D" + read -rp "`echo $' > 0. nope\n 1. menuconfig\n 2. nconfig\n 3. xconfig\n choice[0-3?]: '`" CONDITIONMNC; + _menunconfig="$CONDITIONMNC" + fi + if [ 1 = "$_menunconfig" ]; then + cp .config .config.orig + make menuconfig ${llvm_opt} + elif [ 2 = "$_menunconfig" ]; then + cp .config .config.orig + make nconfig ${llvm_opt} + elif [ 3 = "$_menunconfig" ]; then + cp .config .config.orig + make xconfig ${llvm_opt} + else + # rewrite configuration + yes "" | make config ${llvm_opt} >/dev/null + fi + if [ 1 = "$_menunconfig" ] || [ 2 = "$_menunconfig" ] || [ 3 = "$_menunconfig" ]; then + if [ -z "${_diffconfig}" ]; then + while true; do + read -r -p 'Generate a config fragment from your changes? 
[y/N] ' CONDITIONF + CONDITIONF="$(printf '%s' "$CONDITIONF" | tr '[:upper:]' '[:lower:]')" + case "$CONDITIONF" in + y|yes) + _diffconfig=true + break;; + n|no|'') + _diffconfig=false + break;; + *) + echo 'Please answer with yes or no' + esac + done + fi + if [ true = "$_diffconfig" ]; then + if [ -z "$_diffconfig_name" ]; then + IFS= read -r -p 'Filename for the config fragment [leave empty to not generate fragment]: ' _diffconfig_name + fi + if [ -z "$_diffconfig_name" ]; then + echo 'No file name given, not generating config fragment.' + else ( + prev_pwd="${PWD:-$(pwd)}" + cd "$_where" + "${prev_pwd}/scripts/diffconfig" -m "${prev_pwd}/.config.orig" "${prev_pwd}/.config" > "$_diffconfig_name" + ) fi + fi + rm .config.orig + fi + + if [ "$_distro" = "Arch" ]; then + make -s kernelrelease > version + msg2 "Prepared %s version %s" "$pkgbase" "$( +From: Serge Hallyn +Date: Fri, 31 May 2013 19:12:12 +0100 +Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default + +Signed-off-by: Serge Hallyn +[bwh: Remove unneeded binary sysctl bits] +Signed-off-by: Daniel Micay +--- + kernel/fork.c | 15 +++++++++++++++ + kernel/sysctl.c | 12 ++++++++++++ + kernel/user_namespace.c | 3 +++ + 3 files changed, 30 insertions(+) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 07cc743698d3668e..4011d68a8ff9305c 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -102,6 +102,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
+@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b86520ed3fb60fbf..f7dab3760839f1a1 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -105,6 +105,9 @@ extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index c490f1e4313b998a..dd03bd39d7bf194d 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -24,6 +24,9 @@ + #include + #include + ++/* sysctl */ ++int unprivileged_userns_clone; ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +-- +2.15.1 + +From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Thu, 7 Dec 2017 13:50:48 +0100 +Subject: ZEN: Add CONFIG for unprivileged_userns_clone + +This way our default behavior continues to match the vanilla kernel. +--- + init/Kconfig | 16 ++++++++++++++++ + kernel/user_namespace.c | 4 ++++ + 2 files changed, 20 insertions(+) + +diff --git a/init/Kconfig b/init/Kconfig +index 4592bf7997c0..f3df02990aff 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1004,6 +1004,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. 
++ + config PID_NS + bool "PID Namespaces" + default y +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 6b9dbc257e34..107b17f0d528 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -27,7 +27,11 @@ + #include + + /* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else + int unprivileged_userns_clone; ++#endif + + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux-tkg/linux-tkg-patches/5.4/0002-clear-patches.patch b/linux-tkg/linux-tkg-patches/5.4/0002-clear-patches.patch new file mode 100644 index 0000000..a7c9d4a --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0002-clear-patches.patch @@ -0,0 +1,354 @@ +From 2ac70785613ef4c6b16414986bb18bd7b60d2a13 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 14 Mar 2016 11:10:58 -0600 +Subject: [PATCH] pci pme wakeups + +Reduce wakeups for PME checks, which are a workaround for miswired +boards (sadly, too many of them) in laptops. +--- + drivers/pci/pci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index c25acace7d91..0ddebdad9f5b 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -61,7 +61,7 @@ struct pci_pme_device { + struct pci_dev *dev; + }; + +-#define PME_TIMEOUT 1000 /* How long between PME checks */ ++#define PME_TIMEOUT 4000 /* How long between PME checks */ + + static void pci_dev_d3_sleep(struct pci_dev *dev) + { +-- +2.20.1 + +From 7e7e36c67aa71d6a1ec5676d99d37c1fea389ceb Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sat, 19 Mar 2016 21:32:19 -0400 +Subject: [PATCH] intel_idle: tweak cpuidle cstates + +Increase target_residency in cpuidle cstate + +Tune intel_idle to be a bit less agressive; +Clear linux is cleaner in hygiene (wakupes) than the average linux, +so we can afford changing these in a way that increases +performance while keeping power efficiency +--- + drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- + 1 file changed, 22 insertions(+), 22 deletions(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 8b5d85c91e9d..5e2d813a048d 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -466,7 +466,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -474,7 +474,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, +- .target_residency = 100, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -482,7 +482,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -490,7 +490,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 1500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -498,7 +498,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = 
MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -506,7 +506,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -514,7 +514,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -534,7 +534,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -542,7 +542,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -550,7 +550,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -558,7 +558,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -566,7 +566,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -574,7 +574,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 7000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -582,7 +582,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -603,7 +603,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -611,7 +611,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 70, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -619,7 +619,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 85, +- 
.target_residency = 200, ++ .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -627,7 +627,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x33", + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 124, +- .target_residency = 800, ++ .target_residency = 3000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -635,7 +635,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, +- .target_residency = 800, ++ .target_residency = 3200, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -643,7 +643,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 480, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -651,7 +651,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 890, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -672,7 +672,7 @@ static struct cpuidle_state skx_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +-- +2.20.1 + +From b8211d4f79dd88dfc2d4bd52be46103ea0b70e3e Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Fri, 6 Jan 2017 15:34:09 +0000 +Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little + bigger than default + +--- + net/ipv4/tcp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index cf3c5095c10e..b30d51837b2d 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3897,8 +3897,8 @@ void __init tcp_init(void) + tcp_init_mem(); + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +- max_wshare = min(4UL*1024*1024, limit); +- max_rshare = min(6UL*1024*1024, limit); ++ max_wshare = min(16UL*1024*1024, limit); ++ max_rshare = min(16UL*1024*1024, limit); + + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +-- +2.20.1 + +From 050223869257b87e22636158a80da38d877248ed Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sun, 18 Feb 2018 23:35:41 +0000 +Subject: [PATCH] locking: rwsem: spin faster + +tweak rwsem owner spinning a bit +--- + kernel/locking/rwsem.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index eef04551eae7..1ec5ab4c8ff7 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -720,6 +720,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags, nonspinnable); +@@ -753,7 +754,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + rcu_read_unlock(); + +From b836ea320114643d4354b43acb6ec8bb06ada487 Mon Sep 17 00:00:00 2001 +From: 
Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: [PATCH] drivers: Initialize ata before graphics + +ATA init is the long pole in the boot process, and its asynchronous. +move the graphics init after it so that ata and graphics initialize +in parallel +--- + drivers/Makefile | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index aaef17cc6512..d08f3a394929 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -58,15 +58,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb and intelfb depend on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-$(CONFIG_NVM) += lightnvm/ + obj-y += base/ block/ misc/ mfd/ nfc/ +@@ -79,6 +72,14 @@ obj-$(CONFIG_IDE) += ide/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb and intelfb depend on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ diff --git a/linux-tkg/linux-tkg-patches/5.4/0003-glitched-base.patch b/linux-tkg/linux-tkg-patches/5.4/0003-glitched-base.patch new file mode 100644 index 0000000..4cbf12d --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0003-glitched-base.patch @@ -0,0 +1,4612 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched + +diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h +index 87f1fc9..b3be470 100755 +--- a/scripts/mkcompile_h ++++ b/scripts/mkcompile_h +@@ -50,8 +50,8 @@ else + fi + + UTS_VERSION="#$VERSION" +-CONFIG_FLAGS="" +-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi ++CONFIG_FLAGS="TKG" ++if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi + if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi + UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" + +diff --git a/fs/dcache.c b/fs/dcache.c +index 2acfc69878f5..3f1131431e06 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -69,7 +69,7 @@ + * If no ancestor relationship: + * arbitrary, since it's serialized on rename_lock + */ +-int sysctl_vfs_cache_pressure __read_mostly = 100; ++int sysctl_vfs_cache_pressure __read_mostly = 50; + EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 211890edf37e..37121563407d 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -41,7 +41,7 @@ const_debug unsigned int sysctl_sched_features = + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +-const_debug unsigned int sysctl_sched_nr_migrate = 32; ++const_debug unsigned int sysctl_sched_nr_migrate = 128; + + /* + * period over which we average the RT time consumption, measured +@@ -61,9 +61,9 @@ __read_mostly int scheduler_running; + + /* + * part of the period that we allow rt tasks to run in us. 
+- * default: 0.95s ++ * XanMod default: 0.98s + */ +-int sysctl_sched_rt_runtime = 950000; ++int sysctl_sched_rt_runtime = 980000; + + /* + * __task_rq_lock - lock the rq @p resides on. +diff --git a/lib/Kconfig b/lib/Kconfig +index 5fe577673b98..c44c27cd6e05 100644 +--- a/lib/Kconfig ++++ b/lib/Kconfig +@@ -10,6 +10,16 @@ menu "Library routines" + config RAID6_PQ + tristate + ++config RAID6_USE_PREFER_GEN ++ bool "Use prefered raid6 gen function." ++ default n ++ depends on RAID6_PQ ++ help ++ This option is provided for using prefered raid6 gen function ++ directly instead of calculating the best durning boot-up. ++ The prefered function should be the same as the best one from ++ calculating. ++ + config BITREVERSE + tristate + +diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c +index 5065b1e7e327..1bf3c712a4ca 100644 +--- a/lib/raid6/algos.c ++++ b/lib/raid6/algos.c +@@ -150,6 +150,29 @@ static inline const struct raid6_recov_calls *raid6_choose_recov(void) + return best; + } + ++#ifdef CONFIG_RAID6_USE_PREFER_GEN ++static inline const struct raid6_calls *raid6_choose_prefer_gen(void) ++{ ++ const struct raid6_calls *const *algo; ++ const struct raid6_calls *best; ++ ++ for (best = NULL, algo = raid6_algos; *algo; algo++) { ++ if (!best || (*algo)->prefer >= best->prefer) { ++ if ((*algo)->valid && !(*algo)->valid()) ++ continue; ++ best = *algo; ++ } ++ } ++ ++ if (best) { ++ printk("raid6: using algorithm %s\n", best->name); ++ raid6_call = *best; ++ } else ++ printk("raid6: Yikes! No algorithm found!\n"); ++ ++ return best; ++} ++#else + static inline const struct raid6_calls *raid6_choose_gen( + void *(*const dptrs)[(65536/PAGE_SIZE)+2], const int disks) + { +@@ -221,6 +244,7 @@ static inline const struct raid6_calls *raid6_choose_gen( + + return best; + } ++#endif + + + /* Try to pick the best algorithm */ +@@ -228,10 +252,11 @@ static inline const struct raid6_calls *raid6_choose_gen( + + int __init raid6_select_algo(void) + { +- const int disks = (65536/PAGE_SIZE)+2; +- + const struct raid6_calls *gen_best; + const struct raid6_recov_calls *rec_best; ++#ifndef CONFIG_RAID6_USE_PREFER_GEN ++ const int disks = (65536/PAGE_SIZE)+2; ++ + char *syndromes; + void *dptrs[(65536/PAGE_SIZE)+2]; + int i; +@@ -252,11 +277,16 @@ int __init raid6_select_algo(void) + + /* select raid gen_syndrome function */ + gen_best = raid6_choose_gen(&dptrs, disks); ++#else ++ gen_best = raid6_choose_prefer_gen(); ++#endif + + /* select raid recover functions */ + rec_best = raid6_choose_recov(); + ++#ifndef CONFIG_RAID6_USE_PREFER_GEN + free_pages((unsigned long)syndromes, 1); ++#endif + + return gen_best && rec_best ? 
0 : -EINVAL; + } +diff --git a/mm/zswap.c b/mm/zswap.c +index 61a5c41972db..2674c2806130 100644 +--- a/mm/zswap.c ++++ b/mm/zswap.c +@@ -91,7 +91,7 @@ static struct kernel_param_ops zswap_enabled_param_ops = { + module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644); + + /* Crypto compressor to use */ +-#define ZSWAP_COMPRESSOR_DEFAULT "lzo" ++#define ZSWAP_COMPRESSOR_DEFAULT "lz4" + static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + static int zswap_compressor_param_set(const char *, + const struct kernel_param *); +diff --git a/scripts/setlocalversion b/scripts/setlocalversion +index 71f39410691b..288f9679e883 100755 +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -54,7 +54,7 @@ scm_version() + # If only the short version is requested, don't bother + # running further git commands + if $short; then +- echo "+" ++ # echo "+" + return + fi + # If we are past a tagged commit (like + +From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 +From: Etienne Juvigny +Date: Mon, 3 Sep 2018 17:36:25 +0200 +Subject: Zenify & stuff + + +diff --git a/Documentation/tp_smapi.txt b/Documentation/tp_smapi.txt +new file mode 100644 +index 000000000000..a249678a8866 +--- /dev/null ++++ b/Documentation/tp_smapi.txt +@@ -0,0 +1,275 @@ ++tp_smapi version 0.42 ++IBM ThinkPad hardware functions driver ++ ++Author: Shem Multinymous ++Project: http://sourceforge.net/projects/tpctl ++Wiki: http://thinkwiki.org/wiki/tp_smapi ++List: linux-thinkpad@linux-thinkpad.org ++ (http://mailman.linux-thinkpad.org/mailman/listinfo/linux-thinkpad) ++ ++Description ++----------- ++ ++ThinkPad laptops include a proprietary interface called SMAPI BIOS ++(System Management Application Program Interface) which provides some ++hardware control functionality that is not accessible by other means. ++ ++This driver exposes some features of the SMAPI BIOS through a sysfs ++interface. It is suitable for newer models, on which SMAPI is invoked ++through IO port writes. Older models use a different SMAPI interface; ++for those, try the "thinkpad" module from the "tpctl" package. ++ ++WARNING: ++This driver uses undocumented features and direct hardware access. ++It thus cannot be guaranteed to work, and may cause arbitrary damage ++(especially on models it wasn't tested on). ++ ++ ++Module parameters ++----------------- ++ ++thinkpad_ec module: ++ force_io=1 lets thinkpad_ec load on some recent ThinkPad models ++ (e.g., T400 and T500) whose BIOS's ACPI DSDT reserves the ports we need. ++tp_smapi module: ++ debug=1 enables verbose dmesg output. ++ ++ ++Usage ++----- ++ ++Control of battery charging thresholds (in percents of current full charge ++capacity): ++ ++# echo 40 > /sys/devices/platform/smapi/BAT0/start_charge_thresh ++# echo 70 > /sys/devices/platform/smapi/BAT0/stop_charge_thresh ++# cat /sys/devices/platform/smapi/BAT0/*_charge_thresh ++ ++ (This is useful since Li-Ion batteries wear out much faster at very ++ high or low charge levels. The driver will also keeps the thresholds ++ across suspend-to-disk with AC disconnected; this isn't done ++ automatically by the hardware.) ++ ++Inhibiting battery charging for 17 minutes (overrides thresholds): ++ ++# echo 17 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes ++# echo 0 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes # stop ++# cat /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes ++ ++ (This can be used to control which battery is charged when using an ++ Ultrabay battery.) 
++ ++Forcing battery discharging even if AC power available: ++ ++# echo 1 > /sys/devices/platform/smapi/BAT0/force_discharge # start discharge ++# echo 0 > /sys/devices/platform/smapi/BAT0/force_discharge # stop discharge ++# cat /sys/devices/platform/smapi/BAT0/force_discharge ++ ++ (When AC is connected, forced discharging will automatically stop ++ when battery is fully depleted -- this is useful for calibration. ++ Also, this attribute can be used to control which battery is discharged ++ when both a system battery and an Ultrabay battery are connected.) ++ ++Misc read-only battery status attributes (see note about HDAPS below): ++ ++/sys/devices/platform/smapi/BAT0/installed # 0 or 1 ++/sys/devices/platform/smapi/BAT0/state # idle/charging/discharging ++/sys/devices/platform/smapi/BAT0/cycle_count # integer counter ++/sys/devices/platform/smapi/BAT0/current_now # instantaneous current ++/sys/devices/platform/smapi/BAT0/current_avg # last minute average ++/sys/devices/platform/smapi/BAT0/power_now # instantaneous power ++/sys/devices/platform/smapi/BAT0/power_avg # last minute average ++/sys/devices/platform/smapi/BAT0/last_full_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/remaining_percent # remaining percent of energy (set by calibration) ++/sys/devices/platform/smapi/BAT0/remaining_percent_error # error range of remaing_percent (not reset by calibration) ++/sys/devices/platform/smapi/BAT0/remaining_running_time # in minutes, by last minute average power ++/sys/devices/platform/smapi/BAT0/remaining_running_time_now # in minutes, by instantenous power ++/sys/devices/platform/smapi/BAT0/remaining_charging_time # in minutes ++/sys/devices/platform/smapi/BAT0/remaining_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/design_capacity # in mWh ++/sys/devices/platform/smapi/BAT0/voltage # in mV ++/sys/devices/platform/smapi/BAT0/design_voltage # in mV ++/sys/devices/platform/smapi/BAT0/charging_max_current # max charging current ++/sys/devices/platform/smapi/BAT0/charging_max_voltage # max charging voltage ++/sys/devices/platform/smapi/BAT0/group{0,1,2,3}_voltage # see below ++/sys/devices/platform/smapi/BAT0/manufacturer # string ++/sys/devices/platform/smapi/BAT0/model # string ++/sys/devices/platform/smapi/BAT0/barcoding # string ++/sys/devices/platform/smapi/BAT0/chemistry # string ++/sys/devices/platform/smapi/BAT0/serial # integer ++/sys/devices/platform/smapi/BAT0/manufacture_date # YYYY-MM-DD ++/sys/devices/platform/smapi/BAT0/first_use_date # YYYY-MM-DD ++/sys/devices/platform/smapi/BAT0/temperature # in milli-Celsius ++/sys/devices/platform/smapi/BAT0/dump # see below ++/sys/devices/platform/smapi/ac_connected # 0 or 1 ++ ++The BAT0/group{0,1,2,3}_voltage attribute refers to the separate cell groups ++in each battery. For example, on the ThinkPad 600, X3x, T4x and R5x models, ++the battery contains 3 cell groups in series, where each group consisting of 2 ++or 3 cells connected in parallel. The voltage of each group is given by these ++attributes, and their sum (roughly) equals the "voltage" attribute. ++(The effective performance of the battery is determined by the weakest group, ++i.e., the one those voltage changes most rapidly during dis/charging.) ++ ++The "BAT0/dump" attribute gives a a hex dump of the raw status data, which ++contains additional data now in the above (if you can figure it out). Some ++unused values are autodetected and replaced by "--": ++ ++In all of the above, replace BAT0 with BAT1 to address the 2nd battery (e.g. ++in the UltraBay). 
++ ++ ++Raw SMAPI calls: ++ ++/sys/devices/platform/smapi/smapi_request ++This performs raw SMAPI calls. It uses a bad interface that cannot handle ++multiple simultaneous access. Don't touch it, it's for development only. ++If you did touch it, you would so something like ++# echo '211a 100 0 0' > /sys/devices/platform/smapi/smapi_request ++# cat /sys/devices/platform/smapi/smapi_request ++and notice that in the output "211a 34b b2 0 0 0 'OK'", the "4b" in the 2nd ++value, converted to decimal is 75: the current charge stop threshold. ++ ++ ++Model-specific status ++--------------------- ++ ++Works (at least partially) on the following ThinkPad model: ++* A30 ++* G41 ++* R40, R50p, R51, R52 ++* T23, T40, T40p, T41, T41p, T42, T42p, T43, T43p, T60, T61, T400, T410, T420 (partially) ++* X24, X31, X32, X40, X41, X60, X61, X200, X201, X220 (partially) ++* Z60t, Z61m ++ ++Does not work on: ++* X230 and newer ++* T430 and newer ++* Any ThinkPad Edge ++* Any ThinkPad Yoga ++* Any ThinkPad L series ++* Any ThinkPad P series ++ ++Not all functions are available on all models; for detailed status, see: ++ http://thinkwiki.org/wiki/tp_smapi ++ ++Please report success/failure by e-mail or on the Wiki. ++If you get a "not implemented" or "not supported" message, your laptop ++probably just can't do that (at least not via the SMAPI BIOS). ++For negative reports, follow the bug reporting guidelines below. ++If you send me the necessary technical data (i.e., SMAPI function ++interfaces), I will support additional models. ++ ++ ++Additional HDAPS features ++------------------------- ++ ++The modified hdaps driver has several improvements on the one in mainline ++(beyond resolving the conflict with thinkpad_ec and tp_smapi): ++ ++- Fixes reliability and improves support for recent ThinkPad models ++ (especially *60 and newer). Unlike the mainline driver, the modified hdaps ++ correctly follows the Embedded Controller communication protocol. ++ ++- Extends the "invert" parameter to cover all possible axis orientations. ++ The possible values are as follows. ++ Let X,Y denote the hardware readouts. ++ Let R denote the laptop's roll (tilt left/right). ++ Let P denote the laptop's pitch (tilt forward/backward). ++ invert=0: R= X P= Y (same as mainline) ++ invert=1: R=-X P=-Y (same as mainline) ++ invert=2: R=-X P= Y (new) ++ invert=3: R= X P=-Y (new) ++ invert=4: R= Y P= X (new) ++ invert=5: R=-Y P=-X (new) ++ invert=6: R=-Y P= X (new) ++ invert=7: R= Y P=-X (new) ++ It's probably easiest to just try all 8 possibilities and see which yields ++ correct results (e.g., in the hdaps-gl visualisation). ++ ++- Adds a whitelist which automatically sets the correct axis orientation for ++ some models. If the value for your model is wrong or missing, you can override ++ it using the "invert" parameter. Please also update the tables at ++ http://www.thinkwiki.org/wiki/tp_smapi and ++ http://www.thinkwiki.org/wiki/List_of_DMI_IDs ++ and submit a patch for the whitelist in hdaps.c. ++ ++- Provides new attributes: ++ /sys/devices/platform/hdaps/sampling_rate: ++ This determines the frequency at which the host queries the embedded ++ controller for accelerometer data (and informs the hdaps input devices). ++ Default=50. ++ /sys/devices/platform/hdaps/oversampling_ratio: ++ When set to X, the embedded controller is told to do physical accelerometer ++ measurements at a rate that is X times higher than the rate at which ++ the driver reads those measurements (i.e., X*sampling_rate). 
This ++ makes the readouts from the embedded controller more fresh, and is also ++ useful for the running average filter (see next). Default=5 ++ /sys/devices/platform/hdaps/running_avg_filter_order: ++ When set to X, reported readouts will be the average of the last X physical ++ accelerometer measurements. Current firmware allows 1<=X<=8. Setting to a ++ high value decreases readout fluctuations. The averaging is handled by the ++ embedded controller, so no CPU resources are used. Higher values make the ++ readouts smoother, since it averages out both sensor noise (good) and abrupt ++ changes (bad). Default=2. ++ ++- Provides a second input device, which publishes the raw accelerometer ++ measurements (without the fuzzing needed for joystick emulation). This input ++ device can be matched by a udev rule such as the following (all on one line): ++ KERNEL=="event[0-9]*", ATTRS{phys}=="hdaps/input1", ++ ATTRS{modalias}=="input:b0019v1014p5054e4801-*", ++ SYMLINK+="input/hdaps/accelerometer-event ++ ++A new version of the hdapsd userspace daemon, which uses the input device ++interface instead of polling sysfs, is available seprately. Using this reduces ++the total interrupts per second generated by hdaps+hdapsd (on tickless kernels) ++to 50, down from a value that fluctuates between 50 and 100. Set the ++sampling_rate sysfs attribute to a lower value to further reduce interrupts, ++at the expense of response latency. ++ ++Licensing note: all my changes to the HDAPS driver are licensed under the ++GPL version 2 or, at your option and to the extent allowed by derivation from ++prior works, any later version. My version of hdaps is derived work from the ++mainline version, which at the time of writing is available only under ++GPL version 2. ++ ++Bug reporting ++------------- ++ ++Mail . Please include: ++* Details about your model, ++* Relevant "dmesg" output. Make sure thinkpad_ec and tp_smapi are loaded with ++ the "debug=1" parameter (e.g., use "make load HDAPS=1 DEBUG=1"). ++* Output of "dmidecode | grep -C5 Product" ++* Does the failed functionality works under Windows? ++ ++ ++More about SMAPI ++---------------- ++ ++For hints about what may be possible via the SMAPI BIOS and how, see: ++ ++* IBM Technical Reference Manual for the ThinkPad 770 ++ (http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD) ++* Exported symbols in PWRMGRIF.DLL or TPPWRW32.DLL (e.g., use "objdump -x"). ++* drivers/char/mwave/smapi.c in the Linux kernel tree.* ++* The "thinkpad" SMAPI module (http://tpctl.sourceforge.net). ++* The SMAPI_* constants in tp_smapi.c. ++ ++Note that in the above Technical Reference and in the "thinkpad" module, ++SMAPI is invoked through a function call to some physical address. However, ++the interface used by tp_smapi and the above mwave drive, and apparently ++required by newer ThinkPad, is different: you set the parameters up in the ++CPU's registers and write to ports 0xB2 (the APM control port) and 0x4F; this ++triggers an SMI (System Management Interrupt), causing the CPU to enter ++SMM (System Management Mode) and run the BIOS firmware; the results are ++returned in the CPU's registers. It is not clear what is the relation between ++the two variants of SMAPI, though the assignment of error codes seems to be ++similar. ++ ++In addition, the embedded controller on ThinkPad laptops has a non-standard ++interface at IO ports 0x1600-0x161F (mapped to LCP channel 3 of the H8S chip). 
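
The "invert" table above corresponds to three orientation bits (invert-both = 0x01, invert-X = 0x02, swap = 0x04) that the modified hdaps driver applies in transform_axes() further down in this patch, in that order: swap first, then invert both, then invert X. A standalone sketch of the same mapping, using made-up readouts X=10, Y=20 and printing the resulting R/P for each invert value (illustrative only, not driver code):

/* Illustrative reimplementation of the hdaps axis transform, matching the
 * invert=0..7 table above. */
#include <stdio.h>

#define ORIENT_INVERT_XY 0x01   /* invert both axes */
#define ORIENT_INVERT_X  0x02   /* invert X (applied after the XY inversion) */
#define ORIENT_SWAP      0x04   /* swap axes before any inversion */

static void transform(unsigned int invert, int *x, int *y)
{
    if (invert & ORIENT_SWAP) {
        int tmp = *x;
        *x = *y;
        *y = tmp;
    }
    if (invert & ORIENT_INVERT_XY) {
        *x = -*x;
        *y = -*y;
    }
    if (invert & ORIENT_INVERT_X)
        *x = -*x;
}

int main(void)
{
    unsigned int invert;

    for (invert = 0; invert <= 7; invert++) {
        int r = 10, p = 20;     /* pretend raw readouts X=10, Y=20 */

        transform(invert, &r, &p);
        printf("invert=%u: R=%3d P=%3d\n", invert, r, p);
    }
    return 0;
}
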
++The interface provides various system management services (currently known: ++battery information and accelerometer readouts). For more information see the ++thinkpad_ec module and the H8S hardware documentation: ++http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf +diff --git a/Makefile b/Makefile +index 863f58503bee..f33cf760af6d 100644 +--- a/Makefile ++++ b/Makefile +@@ -682,12 +682,16 @@ ifdef CONFIG_FUNCTION_TRACER + KBUILD_CFLAGS += $(call cc-disable-warning, format-overflow) + KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) + ++ifdef CONFIG_CC_OPTIMIZE_HARDER ++KBUILD_CFLAGS += -O3 $(call cc-disable-warning,maybe-uninitialized,) ++else + ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE + KBUILD_CFLAGS += -O2 + else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 + KBUILD_CFLAGS += -O3 + else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + KBUILD_CFLAGS += -Os + endif ++endif + + ifdef CONFIG_CC_DISABLE_WARN_MAYBE_UNINITIALIZED + KBUILD_CFLAGS += -Wno-maybe-uninitialized + +diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c +index 4f32c4062fb6..c0bf039e1b40 100644 +--- a/drivers/infiniband/core/addr.c ++++ b/drivers/infiniband/core/addr.c +@@ -721,6 +721,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; ++ struct sockaddr_ib _sockaddr_ib; + } sgid_addr, dgid_addr; + int ret; + +diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c +index 55d33500d55e..744e84228a1f 100644 +--- a/drivers/input/mouse/synaptics.c ++++ b/drivers/input/mouse/synaptics.c +@@ -1338,7 +1338,9 @@ static int set_input_params(struct psmouse *psmouse, + if (psmouse_matches_pnp_id(psmouse, topbuttonpad_pnp_ids) && + !SYN_CAP_EXT_BUTTONS_STICK(info->ext_cap_10)) + __set_bit(INPUT_PROP_TOPBUTTONPAD, dev->propbit); +- } ++ } else if (SYN_CAP_CLICKPAD2BTN(info->ext_cap_0c) || ++ SYN_CAP_CLICKPAD2BTN2(info->ext_cap_0c)) ++ __set_bit(INPUT_PROP_BUTTONPAD, dev->propbit); + + return 0; + } +diff --git a/drivers/input/mouse/synaptics.h b/drivers/input/mouse/synaptics.h +index fc00e005c611..4cfbeec3ae4c 100644 +--- a/drivers/input/mouse/synaptics.h ++++ b/drivers/input/mouse/synaptics.h +@@ -86,6 +86,7 @@ + */ + #define SYN_CAP_CLICKPAD(ex0c) ((ex0c) & BIT(20)) /* 1-button ClickPad */ + #define SYN_CAP_CLICKPAD2BTN(ex0c) ((ex0c) & BIT(8)) /* 2-button ClickPad */ ++#define SYN_CAP_CLICKPAD2BTN2(ex0c) ((ex0c) & BIT(21)) /* 2-button ClickPad */ + #define SYN_CAP_MAX_DIMENSIONS(ex0c) ((ex0c) & BIT(17)) + #define SYN_CAP_MIN_DIMENSIONS(ex0c) ((ex0c) & BIT(13)) + #define SYN_CAP_ADV_GESTURE(ex0c) ((ex0c) & BIT(19)) +diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig +index 97a420c11eed..c8621e9b2e4a 100644 +--- a/drivers/macintosh/Kconfig ++++ b/drivers/macintosh/Kconfig +@@ -159,6 +159,13 @@ config INPUT_ADBHID + + If unsure, say Y. + ++config ADB_TRACKPAD_ABSOLUTE ++ bool "Enable absolute mode for adb trackpads" ++ depends on INPUT_ADBHID ++ help ++ Enable absolute mode in adb-base trackpads. This feature adds ++ compatibility with synaptics Xorg / Xfree drivers. 
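
On the one-line infiniband change above: the union in rdma_addr_find_l2_eth_by_grh() serves as generic storage for several address families, and a union is only as large as its largest member, so adding struct sockaddr_ib presumably ensures the storage is big enough when an InfiniBand address is written into it. That rationale is my reading of the hunk, not stated in the patch. A minimal userspace illustration of the sizing point, unrelated to the kernel structures themselves:

/* A union provides storage only for its largest member; omitting the
 * largest address type means writes of that type would overflow it. */
#include <stdio.h>
#include <netinet/in.h>

union small_storage {
    struct sockaddr_in v4;          /* IPv4 only */
};

union full_storage {
    struct sockaddr_in v4;
    struct sockaddr_in6 v6;         /* the larger member sets the size */
};

int main(void)
{
    printf("without the larger member: %zu bytes\n", sizeof(union small_storage));
    printf("with the larger member:    %zu bytes\n", sizeof(union full_storage));
    printf("sockaddr_in6 needs:        %zu bytes\n", sizeof(struct sockaddr_in6));
    return 0;
}
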
++ + config MAC_EMUMOUSEBTN + tristate "Support for mouse button 2+3 emulation" + depends on SYSCTL && INPUT +diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c +index a261892c03b3..a85192de840c 100644 +--- a/drivers/macintosh/adbhid.c ++++ b/drivers/macintosh/adbhid.c +@@ -262,6 +262,15 @@ static struct adb_ids buttons_ids; + #define ADBMOUSE_MS_A3 8 /* Mouse systems A3 trackball (handler 3) */ + #define ADBMOUSE_MACALLY2 9 /* MacAlly 2-button mouse */ + ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++#define ABS_XMIN 310 ++#define ABS_XMAX 1700 ++#define ABS_YMIN 200 ++#define ABS_YMAX 1000 ++#define ABS_ZMIN 0 ++#define ABS_ZMAX 55 ++#endif ++ + static void + adbhid_keyboard_input(unsigned char *data, int nb, int apoll) + { +@@ -405,6 +414,9 @@ static void + adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + { + int id = (data[0] >> 4) & 0x0f; ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ int btn = 0; int x_axis = 0; int y_axis = 0; int z_axis = 0; ++#endif + + if (!adbhid[id]) { + pr_err("ADB HID on ID %d not yet registered\n", id); +@@ -436,6 +448,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + high bits of y-axis motion. XY is additional + high bits of x-axis motion. + ++ For ADB Absolute motion protocol the data array will contain the ++ following values: ++ ++ BITS COMMENTS ++ data[0] = dddd 1100 ADB command: Talk, register 0, for device dddd. ++ data[1] = byyy yyyy Left button and y-axis motion. ++ data[2] = bxxx xxxx Second button and x-axis motion. ++ data[3] = 1yyy 1xxx Half bits of y-axis and x-axis motion. ++ data[4] = 1yyy 1xxx Higher bits of y-axis and x-axis motion. ++ data[5] = 1zzz 1zzz Higher and lower bits of z-pressure. ++ + MacAlly 2-button mouse protocol. + + For MacAlly 2-button mouse protocol the data array will contain the +@@ -458,8 +481,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + switch (adbhid[id]->mouse_kind) + { + case ADBMOUSE_TRACKPAD: ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ x_axis = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) | ++ ((data[4] & 0x07) << 10); ++ y_axis = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) | ++ ((data[4] & 0x70) << 6); ++ z_axis = (data[5] & 0x07) | ((data[5] & 0x70) >> 1); ++ btn = (!(data[1] >> 7)) & 1; ++#else + data[1] = (data[1] & 0x7f) | ((data[1] & data[2]) & 0x80); + data[2] = data[2] | 0x80; ++#endif + break; + case ADBMOUSE_MICROSPEED: + data[1] = (data[1] & 0x7f) | ((data[3] & 0x01) << 7); +@@ -485,17 +517,39 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll) + break; + } + +- input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); +- input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ if ( adbhid[id]->mouse_kind == ADBMOUSE_TRACKPAD ) { + +- if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) +- input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); ++ if(z_axis > 30) input_report_key(adbhid[id]->input, BTN_TOUCH, 1); ++ if(z_axis < 25) input_report_key(adbhid[id]->input, BTN_TOUCH, 0); + +- input_report_rel(adbhid[id]->input, REL_X, +- ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); +- input_report_rel(adbhid[id]->input, REL_Y, +- ((data[1]&0x7f) < 64 ? 
(data[1]&0x7f) : (data[1]&0x7f)-128 )); ++ if(z_axis > 0){ ++ input_report_abs(adbhid[id]->input, ABS_X, x_axis); ++ input_report_abs(adbhid[id]->input, ABS_Y, y_axis); ++ input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 1); ++ input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 5); ++ } else { ++ input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 0); ++ input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 0); ++ } ++ ++ input_report_abs(adbhid[id]->input, ABS_PRESSURE, z_axis); ++ input_report_key(adbhid[id]->input, BTN_LEFT, btn); ++ } else { ++#endif ++ input_report_key(adbhid[id]->input, BTN_LEFT, !((data[1] >> 7) & 1)); ++ input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1)); ++ ++ if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD) ++ input_report_key(adbhid[id]->input, BTN_RIGHT, !((data[3] >> 7) & 1)); + ++ input_report_rel(adbhid[id]->input, REL_X, ++ ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 )); ++ input_report_rel(adbhid[id]->input, REL_Y, ++ ((data[1]&0x7f) < 64 ? (data[1]&0x7f) : (data[1]&0x7f)-128 )); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ } ++#endif + input_sync(adbhid[id]->input); + } + +@@ -849,6 +903,15 @@ adbhid_input_register(int id, int default_id, int original_handler_id, + input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) | + BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT); + input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y); ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ set_bit(EV_ABS, input_dev->evbit); ++ input_set_abs_params(input_dev, ABS_X, ABS_XMIN, ABS_XMAX, 0, 0); ++ input_set_abs_params(input_dev, ABS_Y, ABS_YMIN, ABS_YMAX, 0, 0); ++ input_set_abs_params(input_dev, ABS_PRESSURE, ABS_ZMIN, ABS_ZMAX, 0, 0); ++ set_bit(BTN_TOUCH, input_dev->keybit); ++ set_bit(BTN_TOOL_FINGER, input_dev->keybit); ++ set_bit(ABS_TOOL_WIDTH, input_dev->absbit); ++#endif + break; + + case ADB_MISC: +@@ -1132,7 +1195,11 @@ init_trackpad(int id) + r1_buffer[3], + r1_buffer[4], + r1_buffer[5], ++#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE ++ 0x00, /* Enable absolute mode */ ++#else + 0x03, /*r1_buffer[6],*/ ++#endif + r1_buffer[7]); + + /* Without this flush, the trackpad may be locked up */ +diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig +index ac4d48830415..b272132ac742 100644 +--- a/drivers/platform/x86/Kconfig ++++ b/drivers/platform/x86/Kconfig +@@ -573,9 +573,28 @@ config THINKPAD_ACPI_HOTKEY_POLL + If you are not sure, say Y here. The driver enables polling only if + it is strictly necessary to do so. + ++config THINKPAD_EC ++ tristate ++ ---help--- ++ This is a low-level driver for accessing the ThinkPad H8S embedded ++ controller over the LPC bus (not to be confused with the ACPI Embedded ++ Controller interface). ++ ++config TP_SMAPI ++ tristate "ThinkPad SMAPI Support" ++ select THINKPAD_EC ++ default n ++ help ++ This adds SMAPI support on Lenovo/IBM ThinkPads, for features such ++ as battery charging control. For more information about this driver ++ see . ++ ++ If you have a Lenovo/IBM ThinkPad laptop, say Y or M here. 
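
Backing up to the adbhid.c hunk earlier in this diff: the absolute-motion packets are decoded with the byte layout documented in the added comment (data[1]..data[5] carry button, X/Y position and Z pressure). A standalone sketch of the same bit manipulation on a made-up packet, to make the field extraction easier to follow; the sample bytes are invented, not from real hardware:

/* Decode one hypothetical ADB absolute-mode trackpad packet using the same
 * bit operations the patched adbhid_mouse_input() applies. */
#include <stdio.h>

int main(void)
{
    /* Invented sample: data[0] is the ADB talk header, data[1..5] follow
     * the layout described in the comment added by the patch. */
    unsigned char data[6] = { 0x3c, 0x55, 0x2a, 0x93, 0x81, 0x24 };
    int x, y, z, btn;

    x = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) | ((data[4] & 0x07) << 10);
    y = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) | ((data[4] & 0x70) << 6);
    z = (data[5] & 0x07) | ((data[5] & 0x70) >> 1);
    btn = (!(data[1] >> 7)) & 1;    /* button bit is active-low */

    printf("x=%d y=%d z=%d button=%d\n", x, y, z, btn);
    return 0;
}
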
++ + config SENSORS_HDAPS + tristate "Thinkpad Hard Drive Active Protection System (hdaps)" + depends on INPUT ++ select THINKPAD_EC + select INPUT_POLLDEV + help + This driver provides support for the IBM Hard Drive Active Protection +diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile +index 2ba6cb795338..399f8b88646f 100644 +--- a/drivers/platform/x86/Makefile ++++ b/drivers/platform/x86/Makefile +@@ -35,6 +35,8 @@ obj-$(CONFIG_TC1100_WMI) += tc1100-wmi.o + obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o + obj-$(CONFIG_IDEAPAD_LAPTOP) += ideapad-laptop.o + obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o ++obj-$(CONFIG_THINKPAD_EC) += thinkpad_ec.o ++obj-$(CONFIG_TP_SMAPI) += tp_smapi.o + obj-$(CONFIG_SENSORS_HDAPS) += hdaps.o + obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o + obj-$(CONFIG_FUJITSU_TABLET) += fujitsu-tablet.o +diff --git a/drivers/platform/x86/hdaps.c b/drivers/platform/x86/hdaps.c +index c26baf77938e..1814614f240c 100644 +--- a/drivers/platform/x86/hdaps.c ++++ b/drivers/platform/x86/hdaps.c +@@ -2,7 +2,7 @@ + * hdaps.c - driver for IBM's Hard Drive Active Protection System + * + * Copyright (C) 2005 Robert Love +- * Copyright (C) 2005 Jesper Juhl ++ * Copyright (C) 2005 Jesper Juhl + * + * The HardDisk Active Protection System (hdaps) is present in IBM ThinkPads + * starting with the R40, T41, and X40. It provides a basic two-axis +@@ -30,266 +30,384 @@ + + #include + #include +-#include ++#include + #include +-#include + #include + #include + #include + #include +-#include +- +-#define HDAPS_LOW_PORT 0x1600 /* first port used by hdaps */ +-#define HDAPS_NR_PORTS 0x30 /* number of ports: 0x1600 - 0x162f */ +- +-#define HDAPS_PORT_STATE 0x1611 /* device state */ +-#define HDAPS_PORT_YPOS 0x1612 /* y-axis position */ +-#define HDAPS_PORT_XPOS 0x1614 /* x-axis position */ +-#define HDAPS_PORT_TEMP1 0x1616 /* device temperature, in Celsius */ +-#define HDAPS_PORT_YVAR 0x1617 /* y-axis variance (what is this?) */ +-#define HDAPS_PORT_XVAR 0x1619 /* x-axis variance (what is this?) */ +-#define HDAPS_PORT_TEMP2 0x161b /* device temperature (again?) */ +-#define HDAPS_PORT_UNKNOWN 0x161c /* what is this? */ +-#define HDAPS_PORT_KMACT 0x161d /* keyboard or mouse activity */ +- +-#define STATE_FRESH 0x50 /* accelerometer data is fresh */ ++#include ++#include ++#include ++ ++/* Embedded controller accelerometer read command and its result: */ ++static const struct thinkpad_ec_row ec_accel_args = ++ { .mask = 0x0001, .val = {0x11} }; ++#define EC_ACCEL_IDX_READOUTS 0x1 /* readouts included in this read */ ++ /* First readout, if READOUTS>=1: */ ++#define EC_ACCEL_IDX_YPOS1 0x2 /* y-axis position word */ ++#define EC_ACCEL_IDX_XPOS1 0x4 /* x-axis position word */ ++#define EC_ACCEL_IDX_TEMP1 0x6 /* device temperature in Celsius */ ++ /* Second readout, if READOUTS>=2: */ ++#define EC_ACCEL_IDX_XPOS2 0x7 /* y-axis position word */ ++#define EC_ACCEL_IDX_YPOS2 0x9 /* x-axis position word */ ++#define EC_ACCEL_IDX_TEMP2 0xb /* device temperature in Celsius */ ++#define EC_ACCEL_IDX_QUEUED 0xc /* Number of queued readouts left */ ++#define EC_ACCEL_IDX_KMACT 0xd /* keyboard or mouse activity */ ++#define EC_ACCEL_IDX_RETVAL 0xf /* command return value, good=0x00 */ + + #define KEYBD_MASK 0x20 /* set if keyboard activity */ + #define MOUSE_MASK 0x40 /* set if mouse activity */ +-#define KEYBD_ISSET(n) (!! (n & KEYBD_MASK)) /* keyboard used? */ +-#define MOUSE_ISSET(n) (!! (n & MOUSE_MASK)) /* mouse used? 
*/ + +-#define INIT_TIMEOUT_MSECS 4000 /* wait up to 4s for device init ... */ +-#define INIT_WAIT_MSECS 200 /* ... in 200ms increments */ ++#define READ_TIMEOUT_MSECS 100 /* wait this long for device read */ ++#define RETRY_MSECS 3 /* retry delay */ + +-#define HDAPS_POLL_INTERVAL 50 /* poll for input every 1/20s (50 ms)*/ + #define HDAPS_INPUT_FUZZ 4 /* input event threshold */ + #define HDAPS_INPUT_FLAT 4 +- +-#define HDAPS_X_AXIS (1 << 0) +-#define HDAPS_Y_AXIS (1 << 1) +-#define HDAPS_BOTH_AXES (HDAPS_X_AXIS | HDAPS_Y_AXIS) +- ++#define KMACT_REMEMBER_PERIOD (HZ/10) /* keyboard/mouse persistence */ ++ ++/* Input IDs */ ++#define HDAPS_INPUT_VENDOR PCI_VENDOR_ID_IBM ++#define HDAPS_INPUT_PRODUCT 0x5054 /* "TP", shared with thinkpad_acpi */ ++#define HDAPS_INPUT_JS_VERSION 0x6801 /* Joystick emulation input device */ ++#define HDAPS_INPUT_RAW_VERSION 0x4801 /* Raw accelerometer input device */ ++ ++/* Axis orientation. */ ++/* The unnatural bit-representation of inversions is for backward ++ * compatibility with the"invert=1" module parameter. */ ++#define HDAPS_ORIENT_INVERT_XY 0x01 /* Invert both X and Y axes. */ ++#define HDAPS_ORIENT_INVERT_X 0x02 /* Invert the X axis (uninvert if ++ * already inverted by INVERT_XY). */ ++#define HDAPS_ORIENT_SWAP 0x04 /* Swap the axes. The swap occurs ++ * before inverting X or Y. */ ++#define HDAPS_ORIENT_MAX 0x07 ++#define HDAPS_ORIENT_UNDEFINED 0xFF /* Placeholder during initialization */ ++#define HDAPS_ORIENT_INVERT_Y (HDAPS_ORIENT_INVERT_XY | HDAPS_ORIENT_INVERT_X) ++ ++static struct timer_list hdaps_timer; + static struct platform_device *pdev; +-static struct input_polled_dev *hdaps_idev; +-static unsigned int hdaps_invert; +-static u8 km_activity; +-static int rest_x; +-static int rest_y; +- +-static DEFINE_MUTEX(hdaps_mtx); +- +-/* +- * __get_latch - Get the value from a given port. Callers must hold hdaps_mtx. +- */ +-static inline u8 __get_latch(u16 port) ++static struct input_dev *hdaps_idev; /* joystick-like device with fuzz */ ++static struct input_dev *hdaps_idev_raw; /* raw hdaps sensor readouts */ ++static unsigned int hdaps_invert = HDAPS_ORIENT_UNDEFINED; ++static int needs_calibration; ++ ++/* Configuration: */ ++static int sampling_rate = 50; /* Sampling rate */ ++static int oversampling_ratio = 5; /* Ratio between our sampling rate and ++ * EC accelerometer sampling rate */ ++static int running_avg_filter_order = 2; /* EC running average filter order */ ++ ++/* Latest state readout: */ ++static int pos_x, pos_y; /* position */ ++static int temperature; /* temperature */ ++static int stale_readout = 1; /* last read invalid */ ++static int rest_x, rest_y; /* calibrated rest position */ ++ ++/* Last time we saw keyboard and mouse activity: */ ++static u64 last_keyboard_jiffies = INITIAL_JIFFIES; ++static u64 last_mouse_jiffies = INITIAL_JIFFIES; ++static u64 last_update_jiffies = INITIAL_JIFFIES; ++ ++/* input device use count */ ++static int hdaps_users; ++static DEFINE_MUTEX(hdaps_users_mtx); ++ ++/* Some models require an axis transformation to the standard representation */ ++static void transform_axes(int *x, int *y) + { +- return inb(port) & 0xff; ++ if (hdaps_invert & HDAPS_ORIENT_SWAP) { ++ int z; ++ z = *x; ++ *x = *y; ++ *y = z; ++ } ++ if (hdaps_invert & HDAPS_ORIENT_INVERT_XY) { ++ *x = -*x; ++ *y = -*y; ++ } ++ if (hdaps_invert & HDAPS_ORIENT_INVERT_X) ++ *x = -*x; + } + +-/* +- * __check_latch - Check a port latch for a given value. Returns zero if the +- * port contains the given value. 
Callers must hold hdaps_mtx. ++/** ++ * __hdaps_update - query current state, with locks already acquired ++ * @fast: if nonzero, do one quick attempt without retries. ++ * ++ * Query current accelerometer state and update global state variables. ++ * Also prefetches the next query. Caller must hold controller lock. + */ +-static inline int __check_latch(u16 port, u8 val) ++static int __hdaps_update(int fast) + { +- if (__get_latch(port) == val) +- return 0; +- return -EINVAL; +-} ++ /* Read data: */ ++ struct thinkpad_ec_row data; ++ int ret; + +-/* +- * __wait_latch - Wait up to 100us for a port latch to get a certain value, +- * returning zero if the value is obtained. Callers must hold hdaps_mtx. +- */ +-static int __wait_latch(u16 port, u8 val) +-{ +- unsigned int i; ++ data.mask = (1 << EC_ACCEL_IDX_READOUTS) | (1 << EC_ACCEL_IDX_KMACT) | ++ (3 << EC_ACCEL_IDX_YPOS1) | (3 << EC_ACCEL_IDX_XPOS1) | ++ (1 << EC_ACCEL_IDX_TEMP1) | (1 << EC_ACCEL_IDX_RETVAL); ++ if (fast) ++ ret = thinkpad_ec_try_read_row(&ec_accel_args, &data); ++ else ++ ret = thinkpad_ec_read_row(&ec_accel_args, &data); ++ thinkpad_ec_prefetch_row(&ec_accel_args); /* Prefetch even if error */ ++ if (ret) ++ return ret; + +- for (i = 0; i < 20; i++) { +- if (!__check_latch(port, val)) +- return 0; +- udelay(5); ++ /* Check status: */ ++ if (data.val[EC_ACCEL_IDX_RETVAL] != 0x00) { ++ pr_warn("read RETVAL=0x%02x\n", ++ data.val[EC_ACCEL_IDX_RETVAL]); ++ return -EIO; + } + +- return -EIO; ++ if (data.val[EC_ACCEL_IDX_READOUTS] < 1) ++ return -EBUSY; /* no pending readout, try again later */ ++ ++ /* Parse position data: */ ++ pos_x = *(s16 *)(data.val+EC_ACCEL_IDX_XPOS1); ++ pos_y = *(s16 *)(data.val+EC_ACCEL_IDX_YPOS1); ++ transform_axes(&pos_x, &pos_y); ++ ++ /* Keyboard and mouse activity status is cleared as soon as it's read, ++ * so applications will eat each other's events. Thus we remember any ++ * event for KMACT_REMEMBER_PERIOD jiffies. ++ */ ++ if (data.val[EC_ACCEL_IDX_KMACT] & KEYBD_MASK) ++ last_keyboard_jiffies = get_jiffies_64(); ++ if (data.val[EC_ACCEL_IDX_KMACT] & MOUSE_MASK) ++ last_mouse_jiffies = get_jiffies_64(); ++ ++ temperature = data.val[EC_ACCEL_IDX_TEMP1]; ++ ++ last_update_jiffies = get_jiffies_64(); ++ stale_readout = 0; ++ if (needs_calibration) { ++ rest_x = pos_x; ++ rest_y = pos_y; ++ needs_calibration = 0; ++ } ++ ++ return 0; + } + +-/* +- * __device_refresh - request a refresh from the accelerometer. Does not wait +- * for refresh to complete. Callers must hold hdaps_mtx. ++/** ++ * hdaps_update - acquire locks and query current state ++ * ++ * Query current accelerometer state and update global state variables. ++ * Also prefetches the next query. ++ * Retries until timeout if the accelerometer is not in ready status (common). ++ * Does its own locking. + */ +-static void __device_refresh(void) ++static int hdaps_update(void) + { +- udelay(200); +- if (inb(0x1604) != STATE_FRESH) { +- outb(0x11, 0x1610); +- outb(0x01, 0x161f); ++ u64 age = get_jiffies_64() - last_update_jiffies; ++ int total, ret; ++ ++ if (!stale_readout && age < (9*HZ)/(10*sampling_rate)) ++ return 0; /* already updated recently */ ++ for (total = 0; total < READ_TIMEOUT_MSECS; total += RETRY_MSECS) { ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ret = __hdaps_update(0); ++ thinkpad_ec_unlock(); ++ ++ if (!ret) ++ return 0; ++ if (ret != -EBUSY) ++ break; ++ msleep(RETRY_MSECS); + } ++ return ret; + } + +-/* +- * __device_refresh_sync - request a synchronous refresh from the +- * accelerometer. 
We wait for the refresh to complete. Returns zero if +- * successful and nonzero on error. Callers must hold hdaps_mtx. ++/** ++ * hdaps_set_power - enable or disable power to the accelerometer. ++ * Returns zero on success and negative error code on failure. Can sleep. + */ +-static int __device_refresh_sync(void) ++static int hdaps_set_power(int on) + { +- __device_refresh(); +- return __wait_latch(0x1604, STATE_FRESH); ++ struct thinkpad_ec_row args = ++ { .mask = 0x0003, .val = {0x14, on?0x01:0x00} }; ++ struct thinkpad_ec_row data = { .mask = 0x8000 }; ++ int ret = thinkpad_ec_read_row(&args, &data); ++ if (ret) ++ return ret; ++ if (data.val[0xF] != 0x00) ++ return -EIO; ++ return 0; + } + +-/* +- * __device_complete - indicate to the accelerometer that we are done reading +- * data, and then initiate an async refresh. Callers must hold hdaps_mtx. ++/** ++ * hdaps_set_ec_config - set accelerometer parameters. ++ * @ec_rate: embedded controller sampling rate ++ * @order: embedded controller running average filter order ++ * (Normally we have @ec_rate = sampling_rate * oversampling_ratio.) ++ * Returns zero on success and negative error code on failure. Can sleep. + */ +-static inline void __device_complete(void) ++static int hdaps_set_ec_config(int ec_rate, int order) + { +- inb(0x161f); +- inb(0x1604); +- __device_refresh(); ++ struct thinkpad_ec_row args = { .mask = 0x000F, ++ .val = {0x10, (u8)ec_rate, (u8)(ec_rate>>8), order} }; ++ struct thinkpad_ec_row data = { .mask = 0x8000 }; ++ int ret = thinkpad_ec_read_row(&args, &data); ++ pr_debug("setting ec_rate=%d, filter_order=%d\n", ec_rate, order); ++ if (ret) ++ return ret; ++ if (data.val[0xF] == 0x03) { ++ pr_warn("config param out of range\n"); ++ return -EINVAL; ++ } ++ if (data.val[0xF] == 0x06) { ++ pr_warn("config change already pending\n"); ++ return -EBUSY; ++ } ++ if (data.val[0xF] != 0x00) { ++ pr_warn("config change error, ret=%d\n", ++ data.val[0xF]); ++ return -EIO; ++ } ++ return 0; + } + +-/* +- * hdaps_readb_one - reads a byte from a single I/O port, placing the value in +- * the given pointer. Returns zero on success or a negative error on failure. +- * Can sleep. ++/** ++ * hdaps_get_ec_config - get accelerometer parameters. ++ * @ec_rate: embedded controller sampling rate ++ * @order: embedded controller running average filter order ++ * Returns zero on success and negative error code on failure. Can sleep. + */ +-static int hdaps_readb_one(unsigned int port, u8 *val) ++static int hdaps_get_ec_config(int *ec_rate, int *order) + { +- int ret; +- +- mutex_lock(&hdaps_mtx); +- +- /* do a sync refresh -- we need to be sure that we read fresh data */ +- ret = __device_refresh_sync(); ++ const struct thinkpad_ec_row args = ++ { .mask = 0x0003, .val = {0x17, 0x82} }; ++ struct thinkpad_ec_row data = { .mask = 0x801F }; ++ int ret = thinkpad_ec_read_row(&args, &data); + if (ret) +- goto out; +- +- *val = inb(port); +- __device_complete(); +- +-out: +- mutex_unlock(&hdaps_mtx); +- return ret; ++ return ret; ++ if (data.val[0xF] != 0x00) ++ return -EIO; ++ if (!(data.val[0x1] & 0x01)) ++ return -ENXIO; /* accelerometer polling not enabled */ ++ if (data.val[0x1] & 0x02) ++ return -EBUSY; /* config change in progress, retry later */ ++ *ec_rate = data.val[0x2] | ((int)(data.val[0x3]) << 8); ++ *order = data.val[0x4]; ++ return 0; + } + +-/* __hdaps_read_pair - internal lockless helper for hdaps_read_pair(). 
*/ +-static int __hdaps_read_pair(unsigned int port1, unsigned int port2, +- int *x, int *y) ++/** ++ * hdaps_get_ec_mode - get EC accelerometer mode ++ * Returns zero on success and negative error code on failure. Can sleep. ++ */ ++static int hdaps_get_ec_mode(u8 *mode) + { +- /* do a sync refresh -- we need to be sure that we read fresh data */ +- if (__device_refresh_sync()) ++ const struct thinkpad_ec_row args = ++ { .mask = 0x0001, .val = {0x13} }; ++ struct thinkpad_ec_row data = { .mask = 0x8002 }; ++ int ret = thinkpad_ec_read_row(&args, &data); ++ if (ret) ++ return ret; ++ if (data.val[0xF] != 0x00) { ++ pr_warn("accelerometer not implemented (0x%02x)\n", ++ data.val[0xF]); + return -EIO; +- +- *y = inw(port2); +- *x = inw(port1); +- km_activity = inb(HDAPS_PORT_KMACT); +- __device_complete(); +- +- /* hdaps_invert is a bitvector to negate the axes */ +- if (hdaps_invert & HDAPS_X_AXIS) +- *x = -*x; +- if (hdaps_invert & HDAPS_Y_AXIS) +- *y = -*y; +- ++ } ++ *mode = data.val[0x1]; + return 0; + } + +-/* +- * hdaps_read_pair - reads the values from a pair of ports, placing the values +- * in the given pointers. Returns zero on success. Can sleep. ++/** ++ * hdaps_check_ec - checks something about the EC. ++ * Follows the clean-room spec for HDAPS; we don't know what it means. ++ * Returns zero on success and negative error code on failure. Can sleep. + */ +-static int hdaps_read_pair(unsigned int port1, unsigned int port2, +- int *val1, int *val2) ++static int hdaps_check_ec(void) + { +- int ret; +- +- mutex_lock(&hdaps_mtx); +- ret = __hdaps_read_pair(port1, port2, val1, val2); +- mutex_unlock(&hdaps_mtx); +- +- return ret; ++ const struct thinkpad_ec_row args = ++ { .mask = 0x0003, .val = {0x17, 0x81} }; ++ struct thinkpad_ec_row data = { .mask = 0x800E }; ++ int ret = thinkpad_ec_read_row(&args, &data); ++ if (ret) ++ return ret; ++ if (!((data.val[0x1] == 0x00 && data.val[0x2] == 0x60) || /* cleanroom spec */ ++ (data.val[0x1] == 0x01 && data.val[0x2] == 0x00)) || /* seen on T61 */ ++ data.val[0x3] != 0x00 || data.val[0xF] != 0x00) { ++ pr_warn("hdaps_check_ec: bad response (0x%x,0x%x,0x%x,0x%x)\n", ++ data.val[0x1], data.val[0x2], ++ data.val[0x3], data.val[0xF]); ++ return -EIO; ++ } ++ return 0; + } + +-/* +- * hdaps_device_init - initialize the accelerometer. Returns zero on success +- * and negative error code on failure. Can sleep. ++/** ++ * hdaps_device_init - initialize the accelerometer. ++ * ++ * Call several embedded controller functions to test and initialize the ++ * accelerometer. ++ * Returns zero on success and negative error code on failure. Can sleep. + */ ++#define FAILED_INIT(msg) pr_err("init failed at: %s\n", msg) + static int hdaps_device_init(void) + { +- int total, ret = -ENXIO; ++ int ret; ++ u8 mode; + +- mutex_lock(&hdaps_mtx); ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; + +- outb(0x13, 0x1610); +- outb(0x01, 0x161f); +- if (__wait_latch(0x161f, 0x00)) +- goto out; ++ if (hdaps_get_ec_mode(&mode)) ++ { FAILED_INIT("hdaps_get_ec_mode failed"); goto bad; } + +- /* +- * Most ThinkPads return 0x01. +- * +- * Others--namely the R50p, T41p, and T42p--return 0x03. These laptops +- * have "inverted" axises. +- * +- * The 0x02 value occurs when the chip has been previously initialized. 
+- */ +- if (__check_latch(0x1611, 0x03) && +- __check_latch(0x1611, 0x02) && +- __check_latch(0x1611, 0x01)) +- goto out; ++ pr_debug("initial mode latch is 0x%02x\n", mode); ++ if (mode == 0x00) ++ { FAILED_INIT("accelerometer not available"); goto bad; } + +- printk(KERN_DEBUG "hdaps: initial latch check good (0x%02x)\n", +- __get_latch(0x1611)); ++ if (hdaps_check_ec()) ++ { FAILED_INIT("hdaps_check_ec failed"); goto bad; } + +- outb(0x17, 0x1610); +- outb(0x81, 0x1611); +- outb(0x01, 0x161f); +- if (__wait_latch(0x161f, 0x00)) +- goto out; +- if (__wait_latch(0x1611, 0x00)) +- goto out; +- if (__wait_latch(0x1612, 0x60)) +- goto out; +- if (__wait_latch(0x1613, 0x00)) +- goto out; +- outb(0x14, 0x1610); +- outb(0x01, 0x1611); +- outb(0x01, 0x161f); +- if (__wait_latch(0x161f, 0x00)) +- goto out; +- outb(0x10, 0x1610); +- outb(0xc8, 0x1611); +- outb(0x00, 0x1612); +- outb(0x02, 0x1613); +- outb(0x01, 0x161f); +- if (__wait_latch(0x161f, 0x00)) +- goto out; +- if (__device_refresh_sync()) +- goto out; +- if (__wait_latch(0x1611, 0x00)) +- goto out; +- +- /* we have done our dance, now let's wait for the applause */ +- for (total = INIT_TIMEOUT_MSECS; total > 0; total -= INIT_WAIT_MSECS) { +- int x, y; ++ if (hdaps_set_power(1)) ++ { FAILED_INIT("hdaps_set_power failed"); goto bad; } + +- /* a read of the device helps push it into action */ +- __hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y); +- if (!__wait_latch(0x1611, 0x02)) { +- ret = 0; +- break; +- } ++ if (hdaps_set_ec_config(sampling_rate*oversampling_ratio, ++ running_avg_filter_order)) ++ { FAILED_INIT("hdaps_set_ec_config failed"); goto bad; } + +- msleep(INIT_WAIT_MSECS); +- } ++ thinkpad_ec_invalidate(); ++ udelay(200); + +-out: +- mutex_unlock(&hdaps_mtx); ++ /* Just prefetch instead of reading, to avoid ~1sec delay on load */ ++ ret = thinkpad_ec_prefetch_row(&ec_accel_args); ++ if (ret) ++ { FAILED_INIT("initial prefetch failed"); goto bad; } ++ goto good; ++bad: ++ thinkpad_ec_invalidate(); ++ ret = -ENXIO; ++good: ++ stale_readout = 1; ++ thinkpad_ec_unlock(); + return ret; + } + ++/** ++ * hdaps_device_shutdown - power off the accelerometer ++ * Returns nonzero on failure. Can sleep. ++ */ ++static int hdaps_device_shutdown(void) ++{ ++ int ret; ++ ret = hdaps_set_power(0); ++ if (ret) { ++ pr_warn("cannot power off\n"); ++ return ret; ++ } ++ ret = hdaps_set_ec_config(0, 1); ++ if (ret) ++ pr_warn("cannot stop EC sampling\n"); ++ return ret; ++} + + /* Device model stuff */ + +@@ -306,13 +424,29 @@ static int hdaps_probe(struct platform_device *dev) + } + + #ifdef CONFIG_PM_SLEEP ++static int hdaps_suspend(struct device *dev) ++{ ++ /* Don't do hdaps polls until resume re-initializes the sensor. */ ++ del_timer_sync(&hdaps_timer); ++ hdaps_device_shutdown(); /* ignore errors, effect is negligible */ ++ return 0; ++} ++ + static int hdaps_resume(struct device *dev) + { +- return hdaps_device_init(); ++ int ret = hdaps_device_init(); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&hdaps_users_mtx); ++ if (hdaps_users) ++ mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate); ++ mutex_unlock(&hdaps_users_mtx); ++ return 0; + } + #endif + +-static SIMPLE_DEV_PM_OPS(hdaps_pm, NULL, hdaps_resume); ++static SIMPLE_DEV_PM_OPS(hdaps_pm, hdaps_suspend, hdaps_resume); + + static struct platform_driver hdaps_driver = { + .probe = hdaps_probe, +@@ -322,30 +456,51 @@ static struct platform_driver hdaps_driver = { + }, + }; + +-/* +- * hdaps_calibrate - Set our "resting" values. Callers must hold hdaps_mtx. 
++/** ++ * hdaps_calibrate - set our "resting" values. ++ * Does its own locking. + */ + static void hdaps_calibrate(void) + { +- __hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &rest_x, &rest_y); ++ needs_calibration = 1; ++ hdaps_update(); ++ /* If that fails, the mousedev poll will take care of things later. */ + } + +-static void hdaps_mousedev_poll(struct input_polled_dev *dev) ++/* Timer handler for updating the input device. Runs in softirq context, ++ * so avoid lenghty or blocking operations. ++ */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) ++static void hdaps_mousedev_poll(unsigned long unused) ++#else ++static void hdaps_mousedev_poll(struct timer_list *unused) ++#endif + { +- struct input_dev *input_dev = dev->input; +- int x, y; ++ int ret; + +- mutex_lock(&hdaps_mtx); ++ stale_readout = 1; + +- if (__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y)) +- goto out; ++ /* Cannot sleep. Try nonblockingly. If we fail, try again later. */ ++ if (thinkpad_ec_try_lock()) ++ goto keep_active; + +- input_report_abs(input_dev, ABS_X, x - rest_x); +- input_report_abs(input_dev, ABS_Y, y - rest_y); +- input_sync(input_dev); ++ ret = __hdaps_update(1); /* fast update, we're in softirq context */ ++ thinkpad_ec_unlock(); ++ /* Any of "successful", "not yet ready" and "not prefetched"? */ ++ if (ret != 0 && ret != -EBUSY && ret != -ENODATA) { ++ pr_err("poll failed, disabling updates\n"); ++ return; ++ } + +-out: +- mutex_unlock(&hdaps_mtx); ++keep_active: ++ /* Even if we failed now, pos_x,y may have been updated earlier: */ ++ input_report_abs(hdaps_idev, ABS_X, pos_x - rest_x); ++ input_report_abs(hdaps_idev, ABS_Y, pos_y - rest_y); ++ input_sync(hdaps_idev); ++ input_report_abs(hdaps_idev_raw, ABS_X, pos_x); ++ input_report_abs(hdaps_idev_raw, ABS_Y, pos_y); ++ input_sync(hdaps_idev_raw); ++ mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate); + } + + +@@ -354,65 +509,41 @@ static void hdaps_mousedev_poll(struct input_polled_dev *dev) + static ssize_t hdaps_position_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +- int ret, x, y; +- +- ret = hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y); ++ int ret = hdaps_update(); + if (ret) + return ret; +- +- return sprintf(buf, "(%d,%d)\n", x, y); +-} +- +-static ssize_t hdaps_variance_show(struct device *dev, +- struct device_attribute *attr, char *buf) +-{ +- int ret, x, y; +- +- ret = hdaps_read_pair(HDAPS_PORT_XVAR, HDAPS_PORT_YVAR, &x, &y); +- if (ret) +- return ret; +- +- return sprintf(buf, "(%d,%d)\n", x, y); ++ return sprintf(buf, "(%d,%d)\n", pos_x, pos_y); + } + + static ssize_t hdaps_temp1_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +- u8 uninitialized_var(temp); +- int ret; +- +- ret = hdaps_readb_one(HDAPS_PORT_TEMP1, &temp); +- if (ret) +- return ret; +- +- return sprintf(buf, "%u\n", temp); +-} +- +-static ssize_t hdaps_temp2_show(struct device *dev, +- struct device_attribute *attr, char *buf) +-{ +- u8 uninitialized_var(temp); +- int ret; +- +- ret = hdaps_readb_one(HDAPS_PORT_TEMP2, &temp); ++ int ret = hdaps_update(); + if (ret) + return ret; +- +- return sprintf(buf, "%u\n", temp); ++ return sprintf(buf, "%d\n", temperature); + } + + static ssize_t hdaps_keyboard_activity_show(struct device *dev, + struct device_attribute *attr, + char *buf) + { +- return sprintf(buf, "%u\n", KEYBD_ISSET(km_activity)); ++ int ret = hdaps_update(); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%u\n", ++ get_jiffies_64() < last_keyboard_jiffies + 
KMACT_REMEMBER_PERIOD); + } + + static ssize_t hdaps_mouse_activity_show(struct device *dev, + struct device_attribute *attr, + char *buf) + { +- return sprintf(buf, "%u\n", MOUSE_ISSET(km_activity)); ++ int ret = hdaps_update(); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%u\n", ++ get_jiffies_64() < last_mouse_jiffies + KMACT_REMEMBER_PERIOD); + } + + static ssize_t hdaps_calibrate_show(struct device *dev, +@@ -425,10 +556,7 @@ static ssize_t hdaps_calibrate_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) + { +- mutex_lock(&hdaps_mtx); + hdaps_calibrate(); +- mutex_unlock(&hdaps_mtx); +- + return count; + } + +@@ -445,7 +573,7 @@ static ssize_t hdaps_invert_store(struct device *dev, + int invert; + + if (sscanf(buf, "%d", &invert) != 1 || +- invert < 0 || invert > HDAPS_BOTH_AXES) ++ invert < 0 || invert > HDAPS_ORIENT_MAX) + return -EINVAL; + + hdaps_invert = invert; +@@ -454,24 +582,128 @@ static ssize_t hdaps_invert_store(struct device *dev, + return count; + } + ++static ssize_t hdaps_sampling_rate_show( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%d\n", sampling_rate); ++} ++ ++static ssize_t hdaps_sampling_rate_store( ++ struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int rate, ret; ++ if (sscanf(buf, "%d", &rate) != 1 || rate > HZ || rate <= 0) { ++ pr_warn("must have 0ident); +- return 1; +-} +- + /* hdaps_dmi_match_invert - found an inverted match. */ + static int __init hdaps_dmi_match_invert(const struct dmi_system_id *id) + { +- hdaps_invert = (unsigned long)id->driver_data; +- pr_info("inverting axis (%u) readings\n", hdaps_invert); +- return hdaps_dmi_match(id); ++ unsigned int orient = (kernel_ulong_t) id->driver_data; ++ hdaps_invert = orient; ++ pr_info("%s detected, setting orientation %u\n", id->ident, orient); ++ return 1; /* stop enumeration */ + } + +-#define HDAPS_DMI_MATCH_INVERT(vendor, model, axes) { \ ++#define HDAPS_DMI_MATCH_INVERT(vendor, model, orient) { \ + .ident = vendor " " model, \ + .callback = hdaps_dmi_match_invert, \ +- .driver_data = (void *)axes, \ ++ .driver_data = (void *)(orient), \ + .matches = { \ + DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ + DMI_MATCH(DMI_PRODUCT_VERSION, model) \ + } \ + } + +-#define HDAPS_DMI_MATCH_NORMAL(vendor, model) \ +- HDAPS_DMI_MATCH_INVERT(vendor, model, 0) +- +-/* Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match +- "ThinkPad T42p", so the order of the entries matters. +- If your ThinkPad is not recognized, please update to latest +- BIOS. This is especially the case for some R52 ThinkPads. 
*/ +-static const struct dmi_system_id hdaps_whitelist[] __initconst = { +- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R50"), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R51"), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R52"), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61i", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T41"), +- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T42"), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T43"), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61p", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X40"), +- HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_Y_AXIS), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61s", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad Z60m"), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61m", HDAPS_BOTH_AXES), +- HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61p", HDAPS_BOTH_AXES), ++/* List of models with abnormal axis configuration. ++ Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match ++ "ThinkPad T42p", and enumeration stops after first match, ++ so the order of the entries matters. */ ++const struct dmi_system_id hdaps_whitelist[] __initconst = { ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X40", HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R400", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R500", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60 Tablet", HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60s", HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400s", HDAPS_ORIENT_INVERT_X), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410s", HDAPS_ORIENT_SWAP), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T500", HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T510", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | 
HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W510", HDAPS_ORIENT_MAX), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W520", HDAPS_ORIENT_MAX), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201 Tablet", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X), ++ HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X220", HDAPS_ORIENT_SWAP), + { .ident = NULL } + }; + + static int __init hdaps_init(void) + { +- struct input_dev *idev; + int ret; + +- if (!dmi_check_system(hdaps_whitelist)) { +- pr_warn("supported laptop not found!\n"); +- ret = -ENODEV; +- goto out; +- } +- +- if (!request_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS, "hdaps")) { +- ret = -ENXIO; +- goto out; +- } +- ++ /* Determine axis orientation orientation */ ++ if (hdaps_invert == HDAPS_ORIENT_UNDEFINED) /* set by module param? */ ++ if (dmi_check_system(hdaps_whitelist) < 1) /* in whitelist? */ ++ hdaps_invert = 0; /* default */ ++ ++ /* Init timer before platform_driver_register, in case of suspend */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,15,0) ++ init_timer(&hdaps_timer); ++ hdaps_timer.function = hdaps_mousedev_poll; ++#else ++ timer_setup(&hdaps_timer, hdaps_mousedev_poll, 0); ++#endif + ret = platform_driver_register(&hdaps_driver); + if (ret) +- goto out_region; ++ goto out; + + pdev = platform_device_register_simple("hdaps", -1, NULL, 0); + if (IS_ERR(pdev)) { +@@ -571,47 +801,79 @@ static int __init hdaps_init(void) + if (ret) + goto out_device; + +- hdaps_idev = input_allocate_polled_device(); ++ hdaps_idev = input_allocate_device(); + if (!hdaps_idev) { + ret = -ENOMEM; + goto out_group; + } + +- hdaps_idev->poll = hdaps_mousedev_poll; +- hdaps_idev->poll_interval = HDAPS_POLL_INTERVAL; +- +- /* initial calibrate for the input device */ +- hdaps_calibrate(); ++ hdaps_idev_raw = input_allocate_device(); ++ if (!hdaps_idev_raw) { ++ ret = -ENOMEM; ++ goto out_idev_first; ++ } + +- /* initialize the input class */ +- idev = hdaps_idev->input; +- idev->name = "hdaps"; +- idev->phys = "isa1600/input0"; +- idev->id.bustype = BUS_ISA; +- idev->dev.parent = &pdev->dev; +- idev->evbit[0] = BIT_MASK(EV_ABS); +- input_set_abs_params(idev, ABS_X, ++ /* calibration for the input device (deferred to avoid delay) */ ++ needs_calibration = 1; ++ ++ /* initialize the joystick-like fuzzed input device */ ++ hdaps_idev->name = "ThinkPad HDAPS joystick emulation"; ++ hdaps_idev->phys = "hdaps/input0"; ++ hdaps_idev->id.bustype = BUS_HOST; ++ hdaps_idev->id.vendor = HDAPS_INPUT_VENDOR; ++ hdaps_idev->id.product = HDAPS_INPUT_PRODUCT; ++ hdaps_idev->id.version = HDAPS_INPUT_JS_VERSION; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) ++ hdaps_idev->cdev.dev = &pdev->dev; ++#endif ++ hdaps_idev->evbit[0] = BIT(EV_ABS); ++ hdaps_idev->open = hdaps_mousedev_open; ++ hdaps_idev->close = hdaps_mousedev_close; ++ input_set_abs_params(hdaps_idev, ABS_X, + -256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); +- input_set_abs_params(idev, ABS_Y, ++ input_set_abs_params(hdaps_idev, ABS_Y, + -256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT); + +- ret = input_register_polled_device(hdaps_idev); ++ ret = input_register_device(hdaps_idev); 
+ if (ret) + goto out_idev; + +- pr_info("driver successfully loaded\n"); ++ /* initialize the raw data input device */ ++ hdaps_idev_raw->name = "ThinkPad HDAPS accelerometer data"; ++ hdaps_idev_raw->phys = "hdaps/input1"; ++ hdaps_idev_raw->id.bustype = BUS_HOST; ++ hdaps_idev_raw->id.vendor = HDAPS_INPUT_VENDOR; ++ hdaps_idev_raw->id.product = HDAPS_INPUT_PRODUCT; ++ hdaps_idev_raw->id.version = HDAPS_INPUT_RAW_VERSION; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25) ++ hdaps_idev_raw->cdev.dev = &pdev->dev; ++#endif ++ hdaps_idev_raw->evbit[0] = BIT(EV_ABS); ++ hdaps_idev_raw->open = hdaps_mousedev_open; ++ hdaps_idev_raw->close = hdaps_mousedev_close; ++ input_set_abs_params(hdaps_idev_raw, ABS_X, -32768, 32767, 0, 0); ++ input_set_abs_params(hdaps_idev_raw, ABS_Y, -32768, 32767, 0, 0); ++ ++ ret = input_register_device(hdaps_idev_raw); ++ if (ret) ++ goto out_idev_reg_first; ++ ++ pr_info("driver successfully loaded.\n"); + return 0; + ++out_idev_reg_first: ++ input_unregister_device(hdaps_idev); + out_idev: +- input_free_polled_device(hdaps_idev); ++ input_free_device(hdaps_idev_raw); ++out_idev_first: ++ input_free_device(hdaps_idev); + out_group: + sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group); + out_device: + platform_device_unregister(pdev); + out_driver: + platform_driver_unregister(&hdaps_driver); +-out_region: +- release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS); ++ hdaps_device_shutdown(); + out: + pr_warn("driver init failed (ret=%d)!\n", ret); + return ret; +@@ -619,12 +881,12 @@ static int __init hdaps_init(void) + + static void __exit hdaps_exit(void) + { +- input_unregister_polled_device(hdaps_idev); +- input_free_polled_device(hdaps_idev); ++ input_unregister_device(hdaps_idev_raw); ++ input_unregister_device(hdaps_idev); ++ hdaps_device_shutdown(); /* ignore errors, effect is negligible */ + sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group); + platform_device_unregister(pdev); + platform_driver_unregister(&hdaps_driver); +- release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS); + + pr_info("driver unloaded\n"); + } +@@ -632,9 +894,8 @@ static void __exit hdaps_exit(void) + module_init(hdaps_init); + module_exit(hdaps_exit); + +-module_param_named(invert, hdaps_invert, int, 0); +-MODULE_PARM_DESC(invert, "invert data along each axis. 1 invert x-axis, " +- "2 invert y-axis, 3 invert both axes."); ++module_param_named(invert, hdaps_invert, uint, 0); ++MODULE_PARM_DESC(invert, "axis orientation code"); + + MODULE_AUTHOR("Robert Love"); + MODULE_DESCRIPTION("IBM Hard Drive Active Protection System (HDAPS) driver"); +diff --git a/drivers/platform/x86/thinkpad_ec.c b/drivers/platform/x86/thinkpad_ec.c +new file mode 100644 +index 000000000000..597614bc17e6 +--- /dev/null ++++ b/drivers/platform/x86/thinkpad_ec.c +@@ -0,0 +1,513 @@ ++/* ++ * thinkpad_ec.c - ThinkPad embedded controller LPC3 functions ++ * ++ * The embedded controller on ThinkPad laptops has a non-standard interface, ++ * where LPC channel 3 of the H8S EC chip is hooked up to IO ports ++ * 0x1600-0x161F and implements (a special case of) the H8S LPC protocol. ++ * The EC LPC interface provides various system management services (currently ++ * known: battery information and accelerometer readouts). This driver ++ * provides access and mutual exclusion for the EC interface. 
++* ++ * The LPC protocol and terminology are documented here: ++ * "H8S/2104B Group Hardware Manual", ++ * http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf ++ * ++ * Copyright (C) 2006-2007 Shem Multinymous ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) ++ #include ++#else ++ #include ++#endif ++ ++#define TP_VERSION "0.42" ++ ++MODULE_AUTHOR("Shem Multinymous"); ++MODULE_DESCRIPTION("ThinkPad embedded controller hardware access"); ++MODULE_VERSION(TP_VERSION); ++MODULE_LICENSE("GPL"); ++ ++/* IO ports used by embedded controller LPC channel 3: */ ++#define TPC_BASE_PORT 0x1600 ++#define TPC_NUM_PORTS 0x20 ++#define TPC_STR3_PORT 0x1604 /* Reads H8S EC register STR3 */ ++#define TPC_TWR0_PORT 0x1610 /* Mapped to H8S EC register TWR0MW/SW */ ++#define TPC_TWR15_PORT 0x161F /* Mapped to H8S EC register TWR15. */ ++ /* (and port TPC_TWR0_PORT+i is mapped to H8S reg TWRi for 00x%02x", \ ++ msg, args->val[0x0], args->val[0xF], code) ++ ++/* State of request prefetching: */ ++static u8 prefetch_arg0, prefetch_argF; /* Args of last prefetch */ ++static u64 prefetch_jiffies; /* time of prefetch, or: */ ++#define TPC_PREFETCH_NONE INITIAL_JIFFIES /* No prefetch */ ++#define TPC_PREFETCH_JUNK (INITIAL_JIFFIES+1) /* Ignore prefetch */ ++ ++/* Locking: */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ++static DECLARE_MUTEX(thinkpad_ec_mutex); ++#else ++static DEFINE_SEMAPHORE(thinkpad_ec_mutex); ++#endif ++ ++/* Kludge in case the ACPI DSDT reserves the ports we need. */ ++static bool force_io; /* Willing to do IO to ports we couldn't reserve? */ ++static int reserved_io; /* Successfully reserved the ports? */ ++module_param_named(force_io, force_io, bool, 0600); ++MODULE_PARM_DESC(force_io, "Force IO even if region already reserved (0=off, 1=on)"); ++ ++/** ++ * thinkpad_ec_lock - get lock on the ThinkPad EC ++ * ++ * Get exclusive lock for accesing the ThinkPad embedded controller LPC3 ++ * interface. Returns 0 iff lock acquired. ++ */ ++int thinkpad_ec_lock(void) ++{ ++ int ret; ++ ret = down_interruptible(&thinkpad_ec_mutex); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_lock); ++ ++/** ++ * thinkpad_ec_try_lock - try getting lock on the ThinkPad EC ++ * ++ * Try getting an exclusive lock for accesing the ThinkPad embedded ++ * controller LPC3. Returns immediately if lock is not available; neither ++ * blocks nor sleeps. Returns 0 iff lock acquired . 
++ */ ++int thinkpad_ec_try_lock(void) ++{ ++ return down_trylock(&thinkpad_ec_mutex); ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_try_lock); ++ ++/** ++ * thinkpad_ec_unlock - release lock on ThinkPad EC ++ * ++ * Release a previously acquired exclusive lock on the ThinkPad ebmedded ++ * controller LPC3 interface. ++ */ ++void thinkpad_ec_unlock(void) ++{ ++ up(&thinkpad_ec_mutex); ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_unlock); ++ ++/** ++ * thinkpad_ec_request_row - tell embedded controller to prepare a row ++ * @args Input register arguments ++ * ++ * Requests a data row by writing to H8S LPC registers TRW0 through TWR15 (or ++ * a subset thereof) following the protocol prescribed by the "H8S/2104B Group ++ * Hardware Manual". Does sanity checks via status register STR3. ++ */ ++static int thinkpad_ec_request_row(const struct thinkpad_ec_row *args) ++{ ++ u8 str3; ++ int i; ++ ++ /* EC protocol requires write to TWR0 (function code): */ ++ if (!(args->mask & 0x0001)) { ++ printk(KERN_ERR MSG_FMT("bad args->mask=0x%02x", args->mask)); ++ return -EINVAL; ++ } ++ ++ /* Check initial STR3 status: */ ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_OBF3B) { /* data already pending */ ++ inb(TPC_TWR15_PORT); /* marks end of previous transaction */ ++ if (prefetch_jiffies == TPC_PREFETCH_NONE) ++ printk(KERN_WARNING REQ_FMT( ++ "EC has result from unrequested transaction", ++ str3)); ++ return -EBUSY; /* EC will be ready in a few usecs */ ++ } else if (str3 == H8S_STR3_SWMF) { /* busy with previous request */ ++ if (prefetch_jiffies == TPC_PREFETCH_NONE) ++ printk(KERN_WARNING REQ_FMT( ++ "EC is busy with unrequested transaction", ++ str3)); ++ return -EBUSY; /* data will be pending in a few usecs */ ++ } else if (str3 != 0x00) { /* unexpected status? */ ++ printk(KERN_WARNING REQ_FMT("unexpected initial STR3", str3)); ++ return -EIO; ++ } ++ ++ /* Send TWR0MW: */ ++ outb(args->val[0], TPC_TWR0_PORT); ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 != H8S_STR3_MWMF) { /* not accepted? */ ++ printk(KERN_WARNING REQ_FMT("arg0 rejected", str3)); ++ return -EIO; ++ } ++ ++ /* Send TWR1 through TWR14: */ ++ for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) ++ if ((args->mask>>i)&1) ++ outb(args->val[i], TPC_TWR0_PORT+i); ++ ++ /* Send TWR15 (default to 0x01). This marks end of command. */ ++ outb((args->mask & 0x8000) ? args->val[0xF] : 0x01, TPC_TWR15_PORT); ++ ++ /* Wait until EC starts writing its reply (~60ns on average). ++ * Releasing locks before this happens may cause an EC hang ++ * due to firmware bug! ++ */ ++ for (i = 0; i < TPC_REQUEST_RETRIES; i++) { ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_SWMF) /* EC started replying */ ++ return 0; ++ else if (!(str3 & ~(H8S_STR3_IBF3B|H8S_STR3_MWMF))) ++ /* Normal progress (the EC hasn't seen the request ++ * yet, or is processing it). Wait it out. */ ++ ndelay(TPC_REQUEST_NDELAY); ++ else { /* weird EC status */ ++ printk(KERN_WARNING ++ REQ_FMT("bad end STR3", str3)); ++ return -EIO; ++ } ++ } ++ printk(KERN_WARNING REQ_FMT("EC is mysteriously silent", str3)); ++ return -EIO; ++} ++ ++/** ++ * thinkpad_ec_read_data - read pre-requested row-data from EC ++ * @args Input register arguments of pre-requested rows ++ * @data Output register values ++ * ++ * Reads current row data from the controller, assuming it's already ++ * requested. Follows the H8S spec for register access and status checks. 
++ */ ++static int thinkpad_ec_read_data(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int i; ++ u8 str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ /* Once we make a request, STR3 assumes the sequence of values listed ++ * in the following 'if' as it reads the request and writes its data. ++ * It takes about a few dozen nanosecs total, with very high variance. ++ */ ++ if (str3 == (H8S_STR3_IBF3B|H8S_STR3_MWMF) || ++ str3 == 0x00 || /* the 0x00 is indistinguishable from idle EC! */ ++ str3 == H8S_STR3_SWMF) ++ return -EBUSY; /* not ready yet */ ++ /* Finally, the EC signals output buffer full: */ ++ if (str3 != (H8S_STR3_OBF3B|H8S_STR3_SWMF)) { ++ printk(KERN_WARNING ++ REQ_FMT("bad initial STR3", str3)); ++ return -EIO; ++ } ++ ++ /* Read first byte (signals start of read transactions): */ ++ data->val[0] = inb(TPC_TWR0_PORT); ++ /* Optionally read 14 more bytes: */ ++ for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++) ++ if ((data->mask >> i)&1) ++ data->val[i] = inb(TPC_TWR0_PORT+i); ++ /* Read last byte from 0x161F (signals end of read transaction): */ ++ data->val[0xF] = inb(TPC_TWR15_PORT); ++ ++ /* Readout still pending? */ ++ str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK; ++ if (str3 & H8S_STR3_OBF3B) ++ printk(KERN_WARNING ++ REQ_FMT("OBF3B=1 after read", str3)); ++ /* If port 0x161F returns 0x80 too often, the EC may lock up. Warn: */ ++ if (data->val[0xF] == 0x80) ++ printk(KERN_WARNING ++ REQ_FMT("0x161F reports error", data->val[0xF])); ++ return 0; ++} ++ ++/** ++ * thinkpad_ec_is_row_fetched - is the given row currently prefetched? ++ * ++ * To keep things simple we compare only the first and last args; ++ * this suffices for all known cases. ++ */ ++static int thinkpad_ec_is_row_fetched(const struct thinkpad_ec_row *args) ++{ ++ return (prefetch_jiffies != TPC_PREFETCH_NONE) && ++ (prefetch_jiffies != TPC_PREFETCH_JUNK) && ++ (prefetch_arg0 == args->val[0]) && ++ (prefetch_argF == args->val[0xF]) && ++ (get_jiffies_64() < prefetch_jiffies + TPC_PREFETCH_TIMEOUT); ++} ++ ++/** ++ * thinkpad_ec_read_row - request and read data from ThinkPad EC ++ * @args Input register arguments ++ * @data Output register values ++ * ++ * Read a data row from the ThinkPad embedded controller LPC3 interface. ++ * Does fetching and retrying if needed. The row is specified by an ++ * array of 16 bytes, some of which may be undefined (but the first is ++ * mandatory). These bytes are given in @args->val[], where @args->val[i] is ++ * used iff (@args->mask>>i)&1). The resulting row data is stored in ++ * @data->val[], but is only guaranteed to be valid for indices corresponding ++ * to set bit in @data->mask. That is, if @data->mask&(1<val[i] is undefined. ++ * ++ * Returns -EBUSY on transient error and -EIO on abnormal condition. ++ * Caller must hold controller lock. 
++ */ ++int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int retries, ret; ++ ++ if (thinkpad_ec_is_row_fetched(args)) ++ goto read_row; /* already requested */ ++ ++ /* Request the row */ ++ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { ++ ret = thinkpad_ec_request_row(args); ++ if (!ret) ++ goto read_row; ++ if (ret != -EBUSY) ++ break; ++ ndelay(TPC_READ_NDELAY); ++ } ++ printk(KERN_ERR REQ_FMT("failed requesting row", ret)); ++ goto out; ++ ++read_row: ++ /* Read the row's data */ ++ for (retries = 0; retries < TPC_READ_RETRIES; ++retries) { ++ ret = thinkpad_ec_read_data(args, data); ++ if (!ret) ++ goto out; ++ if (ret != -EBUSY) ++ break; ++ ndelay(TPC_READ_NDELAY); ++ } ++ ++ printk(KERN_ERR REQ_FMT("failed waiting for data", ret)); ++ ++out: ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_read_row); ++ ++/** ++ * thinkpad_ec_try_read_row - try reading prefetched data from ThinkPad EC ++ * @args Input register arguments ++ * @data Output register values ++ * ++ * Try reading a data row from the ThinkPad embedded controller LPC3 ++ * interface, if this raw was recently prefetched using ++ * thinkpad_ec_prefetch_row(). Does not fetch, retry or block. ++ * The parameters have the same meaning as in thinkpad_ec_read_row(). ++ * ++ * Returns -EBUSY is data not ready and -ENODATA if row not prefetched. ++ * Caller must hold controller lock. ++ */ ++int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data) ++{ ++ int ret; ++ if (!thinkpad_ec_is_row_fetched(args)) { ++ ret = -ENODATA; ++ } else { ++ ret = thinkpad_ec_read_data(args, data); ++ if (!ret) ++ prefetch_jiffies = TPC_PREFETCH_NONE; /* eaten up */ ++ } ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_try_read_row); ++ ++/** ++ * thinkpad_ec_prefetch_row - prefetch data from ThinkPad EC ++ * @args Input register arguments ++ * ++ * Prefetch a data row from the ThinkPad embedded controller LCP3 ++ * interface. A subsequent call to thinkpad_ec_read_row() with the ++ * same arguments will be faster, and a subsequent call to ++ * thinkpad_ec_try_read_row() stands a good chance of succeeding if ++ * done neither too soon nor too late. See ++ * thinkpad_ec_read_row() for the meaning of @args. ++ * ++ * Returns -EBUSY on transient error and -EIO on abnormal condition. ++ * Caller must hold controller lock. ++ */ ++int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args) ++{ ++ int ret; ++ ret = thinkpad_ec_request_row(args); ++ if (ret) { ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ } else { ++ prefetch_jiffies = get_jiffies_64(); ++ prefetch_arg0 = args->val[0x0]; ++ prefetch_argF = args->val[0xF]; ++ } ++ return ret; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_prefetch_row); ++ ++/** ++ * thinkpad_ec_invalidate - invalidate prefetched ThinkPad EC data ++ * ++ * Invalidate the data prefetched via thinkpad_ec_prefetch_row() from the ++ * ThinkPad embedded controller LPC3 interface. ++ * Must be called before unlocking by any code that accesses the controller ++ * ports directly. ++ */ ++void thinkpad_ec_invalidate(void) ++{ ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++} ++EXPORT_SYMBOL_GPL(thinkpad_ec_invalidate); ++ ++ ++/*** Checking for EC hardware ***/ ++ ++/** ++ * thinkpad_ec_test - verify the EC is present and follows protocol ++ * ++ * Ensure the EC LPC3 channel really works on this machine by making ++ * an EC request and seeing if the EC follows the documented H8S protocol. 
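/*
 * Putting the row API above together, a hypothetical in-kernel consumer of
 * thinkpad_ec_prefetch_row()/thinkpad_ec_try_read_row() might look like the
 * sketch below. The mask/val encoding (function code 0x01 in val[0],
 * battery number in val[0xF], mask 0x8001) mirrors the battery-status rows
 * used by thinkpad_ec_test() and tp_smapi.c; struct thinkpad_ec_row and
 * TP_CONTROLLER_ROW_LEN come from include/linux/thinkpad_ec.h.
 */
#include <linux/string.h>
#include <linux/thinkpad_ec.h>
#include <linux/types.h>

static int example_read_battery_status(u8 out[TP_CONTROLLER_ROW_LEN])
{
	const struct thinkpad_ec_row args = {
		.mask = 0x8001,			/* val[0] and val[0xF] are used */
		.val  = { [0x0] = 0x01,		/* function code: basic status */
			  [0xF] = 0x00 }	/* battery number 0 */
	};
	struct thinkpad_ec_row data = { .mask = 0xFFFF }; /* want all 16 bytes */
	int ret;

	ret = thinkpad_ec_lock();
	if (ret)
		return ret;

	ret = thinkpad_ec_prefetch_row(&args);	/* ask the EC to prepare the row */
	if (!ret) {
		/* ... other work under the lock could go here ... */
		ret = thinkpad_ec_try_read_row(&args, &data);	/* non-blocking */
	}
	if (ret)	/* not prefetched or not ready: full read with retries */
		ret = thinkpad_ec_read_row(&args, &data);

	thinkpad_ec_unlock();
	if (!ret)
		memcpy(out, data.val, TP_CONTROLLER_ROW_LEN);
	return ret;	/* 0, or negative errno (-EBUSY is transient) */
}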
++ * The requested row just reads battery status, so it should be harmless to ++ * access it (on a correct EC). ++ * This test writes to IO ports, so execute only after checking DMI. ++ */ ++static int __init thinkpad_ec_test(void) ++{ ++ int ret; ++ const struct thinkpad_ec_row args = /* battery 0 basic status */ ++ { .mask = 0x8001, .val = {0x01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x00} }; ++ struct thinkpad_ec_row data = { .mask = 0x0000 }; ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ret = thinkpad_ec_read_row(&args, &data); ++ thinkpad_ec_unlock(); ++ return ret; ++} ++ ++/* Search all DMI device names of a given type for a substring */ ++static int __init dmi_find_substring(int type, const char *substr) ++{ ++ const struct dmi_device *dev = NULL; ++ while ((dev = dmi_find_device(type, NULL, dev))) { ++ if (strstr(dev->name, substr)) ++ return 1; ++ } ++ return 0; ++} ++ ++#define TP_DMI_MATCH(vendor,model) { \ ++ .ident = vendor " " model, \ ++ .matches = { \ ++ DMI_MATCH(DMI_BOARD_VENDOR, vendor), \ ++ DMI_MATCH(DMI_PRODUCT_VERSION, model) \ ++ } \ ++} ++ ++/* Check DMI for existence of ThinkPad embedded controller */ ++static int __init check_dmi_for_ec(void) ++{ ++ /* A few old models that have a good EC but don't report it in DMI */ ++ struct dmi_system_id tp_whitelist[] = { ++ TP_DMI_MATCH("IBM", "ThinkPad A30"), ++ TP_DMI_MATCH("IBM", "ThinkPad T23"), ++ TP_DMI_MATCH("IBM", "ThinkPad X24"), ++ TP_DMI_MATCH("LENOVO", "ThinkPad"), ++ { .ident = NULL } ++ }; ++ return dmi_find_substring(DMI_DEV_TYPE_OEM_STRING, ++ "IBM ThinkPad Embedded Controller") || ++ dmi_check_system(tp_whitelist); ++} ++ ++/*** Init and cleanup ***/ ++ ++static int __init thinkpad_ec_init(void) ++{ ++ if (!check_dmi_for_ec()) { ++ printk(KERN_WARNING ++ "thinkpad_ec: no ThinkPad embedded controller!\n"); ++ return -ENODEV; ++ } ++ ++ if (request_region(TPC_BASE_PORT, TPC_NUM_PORTS, "thinkpad_ec")) { ++ reserved_io = 1; ++ } else { ++ printk(KERN_ERR "thinkpad_ec: cannot claim IO ports %#x-%#x... ", ++ TPC_BASE_PORT, ++ TPC_BASE_PORT + TPC_NUM_PORTS - 1); ++ if (force_io) { ++ printk("forcing use of unreserved IO ports.\n"); ++ } else { ++ printk("consider using force_io=1.\n"); ++ return -ENXIO; ++ } ++ } ++ prefetch_jiffies = TPC_PREFETCH_JUNK; ++ if (thinkpad_ec_test()) { ++ printk(KERN_ERR "thinkpad_ec: initial ec test failed\n"); ++ if (reserved_io) ++ release_region(TPC_BASE_PORT, TPC_NUM_PORTS); ++ return -ENXIO; ++ } ++ printk(KERN_INFO "thinkpad_ec: thinkpad_ec " TP_VERSION " loaded.\n"); ++ return 0; ++} ++ ++static void __exit thinkpad_ec_exit(void) ++{ ++ if (reserved_io) ++ release_region(TPC_BASE_PORT, TPC_NUM_PORTS); ++ printk(KERN_INFO "thinkpad_ec: unloaded.\n"); ++} ++ ++module_init(thinkpad_ec_init); ++module_exit(thinkpad_ec_exit); +diff --git a/drivers/platform/x86/tp_smapi.c b/drivers/platform/x86/tp_smapi.c +new file mode 100644 +index 000000000000..209cb6487e24 +--- /dev/null ++++ b/drivers/platform/x86/tp_smapi.c +@@ -0,0 +1,1493 @@ ++/* ++ * tp_smapi.c - ThinkPad SMAPI support ++ * ++ * This driver exposes some features of the System Management Application ++ * Program Interface (SMAPI) BIOS found on ThinkPad laptops. It works on ++ * models in which the SMAPI BIOS runs in SMM and is invoked by writing ++ * to the APM control port 0xB2. ++ * It also exposes battery status information, obtained from the ThinkPad ++ * embedded controller (via the thinkpad_ec module). ++ * Ancient ThinkPad models use a different interface, supported by the ++ * "thinkpad" module from "tpctl". 
++ * ++ * Many of the battery status values obtained from the EC simply mirror ++ * values provided by the battery's Smart Battery System (SBS) interface, so ++ * their meaning is defined by the Smart Battery Data Specification (see ++ * http://sbs-forum.org/specs/sbdat110.pdf). References to this SBS spec ++ * are given in the code where relevant. ++ * ++ * Copyright (C) 2006 Shem Multinymous . ++ * SMAPI access code based on the mwave driver by Mike Sullivan. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include /* CMOS defines */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define TP_VERSION "0.42" ++#define TP_DESC "ThinkPad SMAPI Support" ++#define TP_DIR "smapi" ++ ++MODULE_AUTHOR("Shem Multinymous"); ++MODULE_DESCRIPTION(TP_DESC); ++MODULE_VERSION(TP_VERSION); ++MODULE_LICENSE("GPL"); ++ ++static struct platform_device *pdev; ++ ++static int tp_debug; ++module_param_named(debug, tp_debug, int, 0600); ++MODULE_PARM_DESC(debug, "Debug level (0=off, 1=on)"); ++ ++/* A few macros for printk()ing: */ ++#define TPRINTK(level, fmt, args...) \ ++ dev_printk(level, &(pdev->dev), "%s: " fmt "\n", __func__, ## args) ++#define DPRINTK(fmt, args...) \ ++ do { if (tp_debug) TPRINTK(KERN_DEBUG, fmt, ## args); } while (0) ++ ++/********************************************************************* ++ * SMAPI interface ++ */ ++ ++/* SMAPI functions (register BX when making the SMM call). */ ++#define SMAPI_GET_INHIBIT_CHARGE 0x2114 ++#define SMAPI_SET_INHIBIT_CHARGE 0x2115 ++#define SMAPI_GET_THRESH_START 0x2116 ++#define SMAPI_SET_THRESH_START 0x2117 ++#define SMAPI_GET_FORCE_DISCHARGE 0x2118 ++#define SMAPI_SET_FORCE_DISCHARGE 0x2119 ++#define SMAPI_GET_THRESH_STOP 0x211a ++#define SMAPI_SET_THRESH_STOP 0x211b ++ ++/* SMAPI error codes (see ThinkPad 770 Technical Reference Manual p.83 at ++ http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD */ ++#define SMAPI_RETCODE_EOF 0xff ++static struct { u8 rc; char *msg; int ret; } smapi_retcode[] = ++{ ++ {0x00, "OK", 0}, ++ {0x53, "SMAPI function is not available", -ENXIO}, ++ {0x81, "Invalid parameter", -EINVAL}, ++ {0x86, "Function is not supported by SMAPI BIOS", -EOPNOTSUPP}, ++ {0x90, "System error", -EIO}, ++ {0x91, "System is invalid", -EIO}, ++ {0x92, "System is busy, -EBUSY"}, ++ {0xa0, "Device error (disk read error)", -EIO}, ++ {0xa1, "Device is busy", -EBUSY}, ++ {0xa2, "Device is not attached", -ENXIO}, ++ {0xa3, "Device is disbled", -EIO}, ++ {0xa4, "Request parameter is out of range", -EINVAL}, ++ {0xa5, "Request parameter is not accepted", -EINVAL}, ++ {0xa6, "Transient error", -EBUSY}, /* ? 
*/ ++ {SMAPI_RETCODE_EOF, "Unknown error code", -EIO} ++}; ++ ++ ++#define SMAPI_MAX_RETRIES 10 ++#define SMAPI_PORT2 0x4F /* fixed port, meaning unclear */ ++static unsigned short smapi_port; /* APM control port, normally 0xB2 */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37) ++static DECLARE_MUTEX(smapi_mutex); ++#else ++static DEFINE_SEMAPHORE(smapi_mutex); ++#endif ++ ++/** ++ * find_smapi_port - read SMAPI port from NVRAM ++ */ ++static int __init find_smapi_port(void) ++{ ++ u16 smapi_id = 0; ++ unsigned short port = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&rtc_lock, flags); ++ smapi_id = CMOS_READ(0x7C); ++ smapi_id |= (CMOS_READ(0x7D) << 8); ++ spin_unlock_irqrestore(&rtc_lock, flags); ++ ++ if (smapi_id != 0x5349) { ++ printk(KERN_ERR "SMAPI not supported (ID=0x%x)\n", smapi_id); ++ return -ENXIO; ++ } ++ spin_lock_irqsave(&rtc_lock, flags); ++ port = CMOS_READ(0x7E); ++ port |= (CMOS_READ(0x7F) << 8); ++ spin_unlock_irqrestore(&rtc_lock, flags); ++ if (port == 0) { ++ printk(KERN_ERR "unable to read SMAPI port number\n"); ++ return -ENXIO; ++ } ++ return port; ++} ++ ++/** ++ * smapi_request - make a SMAPI call ++ * @inEBX, @inECX, @inEDI, @inESI: input registers ++ * @outEBX, @outECX, @outEDX, @outEDI, @outESI: outputs registers ++ * @msg: textual error message ++ * Invokes the SMAPI SMBIOS with the given input and outpu args. ++ * All outputs are optional (can be %NULL). ++ * Returns 0 when successful, and a negative errno constant ++ * (see smapi_retcode above) upon failure. ++ */ ++static int smapi_request(u32 inEBX, u32 inECX, ++ u32 inEDI, u32 inESI, ++ u32 *outEBX, u32 *outECX, u32 *outEDX, ++ u32 *outEDI, u32 *outESI, const char **msg) ++{ ++ int ret = 0; ++ int i; ++ int retries; ++ u8 rc; ++ /* Must use local vars for output regs, due to reg pressure. */ ++ u32 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI; ++ ++ for (retries = 0; retries < SMAPI_MAX_RETRIES; ++retries) { ++ DPRINTK("req_in: BX=%x CX=%x DI=%x SI=%x", ++ inEBX, inECX, inEDI, inESI); ++ ++ /* SMAPI's SMBIOS call and thinkpad_ec end up using use ++ * different interfaces to the same chip, so play it safe. */ ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ++ __asm__ __volatile__( ++ "movl $0x00005380,%%eax\n\t" ++ "movl %6,%%ebx\n\t" ++ "movl %7,%%ecx\n\t" ++ "movl %8,%%edi\n\t" ++ "movl %9,%%esi\n\t" ++ "xorl %%edx,%%edx\n\t" ++ "movw %10,%%dx\n\t" ++ "out %%al,%%dx\n\t" /* trigger SMI to SMBIOS */ ++ "out %%al,$0x4F\n\t" ++ "movl %%eax,%0\n\t" ++ "movl %%ebx,%1\n\t" ++ "movl %%ecx,%2\n\t" ++ "movl %%edx,%3\n\t" ++ "movl %%edi,%4\n\t" ++ "movl %%esi,%5\n\t" ++ :"=m"(tmpEAX), ++ "=m"(tmpEBX), ++ "=m"(tmpECX), ++ "=m"(tmpEDX), ++ "=m"(tmpEDI), ++ "=m"(tmpESI) ++ :"m"(inEBX), "m"(inECX), "m"(inEDI), "m"(inESI), ++ "m"((u16)smapi_port) ++ :"%eax", "%ebx", "%ecx", "%edx", "%edi", ++ "%esi"); ++ ++ thinkpad_ec_invalidate(); ++ thinkpad_ec_unlock(); ++ ++ /* Don't let the next SMAPI access happen too quickly, ++ * may case problems. (We're hold smapi_mutex). 
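/*
 * Register convention used by the SMM call above: EAX is loaded with
 * 0x00005380 to select the SMAPI service, BX carries the SMAPI function
 * code (e.g. SMAPI_GET_THRESH_START), ECX/EDI/ESI carry its arguments, and
 * the OUT instructions to the APM control port (smapi_port, normally 0xB2,
 * read from NVRAM by find_smapi_port()) and to SMAPI_PORT2 (0x4F) trigger
 * the SMI. Results come back in the same registers; the SMAPI return code
 * is taken from AH ((EAX >> 8) & 0xFF) and mapped to an errno through the
 * smapi_retcode table above.
 */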
*/ ++ msleep(50); ++ ++ if (outEBX) *outEBX = tmpEBX; ++ if (outECX) *outECX = tmpECX; ++ if (outEDX) *outEDX = tmpEDX; ++ if (outESI) *outESI = tmpESI; ++ if (outEDI) *outEDI = tmpEDI; ++ ++ /* Look up error code */ ++ rc = (tmpEAX>>8)&0xFF; ++ for (i = 0; smapi_retcode[i].rc != SMAPI_RETCODE_EOF && ++ smapi_retcode[i].rc != rc; ++i) {} ++ ret = smapi_retcode[i].ret; ++ if (msg) ++ *msg = smapi_retcode[i].msg; ++ ++ DPRINTK("req_out: AX=%x BX=%x CX=%x DX=%x DI=%x SI=%x r=%d", ++ tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI, ret); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "SMAPI error: %s (func=%x)", ++ smapi_retcode[i].msg, inEBX); ++ ++ if (ret != -EBUSY) ++ return ret; ++ } ++ return ret; ++} ++ ++/* Convenience wrapper: discard output arguments */ ++static int smapi_write(u32 inEBX, u32 inECX, ++ u32 inEDI, u32 inESI, const char **msg) ++{ ++ return smapi_request(inEBX, inECX, inEDI, inESI, ++ NULL, NULL, NULL, NULL, NULL, msg); ++} ++ ++ ++/********************************************************************* ++ * Specific SMAPI services ++ * All of these functions return 0 upon success, and a negative errno ++ * constant (see smapi_retcode) on failure. ++ */ ++ ++enum thresh_type { ++ THRESH_STOP = 0, /* the code assumes this is 0 for brevity */ ++ THRESH_START ++}; ++#define THRESH_NAME(which) ((which == THRESH_START) ? "start" : "stop") ++ ++/** ++ * __get_real_thresh - read battery charge start/stop threshold from SMAPI ++ * @bat: battery number (0 or 1) ++ * @which: THRESH_START or THRESH_STOP ++ * @thresh: 1..99, 0=default 1..99, 0=default (pass this as-is to SMAPI) ++ * @outEDI: some additional state that needs to be preserved, meaning unknown ++ * @outESI: some additional state that needs to be preserved, meaning unknown ++ */ ++static int __get_real_thresh(int bat, enum thresh_type which, int *thresh, ++ u32 *outEDI, u32 *outESI) ++{ ++ u32 ebx = (which == THRESH_START) ? SMAPI_GET_THRESH_START ++ : SMAPI_GET_THRESH_STOP; ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(ebx, ecx, 0, 0, NULL, ++ &ecx, NULL, outEDI, outESI, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: %s", ++ THRESH_NAME(which), bat, msg); ++ return ret; ++ } ++ if (!(ecx&0x00000100)) { ++ TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: ecx=0%x", ++ THRESH_NAME(which), bat, ecx); ++ return -EIO; ++ } ++ if (thresh) ++ *thresh = ecx&0xFF; ++ return 0; ++} ++ ++/** ++ * get_real_thresh - read battery charge start/stop threshold from SMAPI ++ * @bat: battery number (0 or 1) ++ * @which: THRESH_START or THRESH_STOP ++ * @thresh: 1..99, 0=default (passes as-is to SMAPI) ++ */ ++static int get_real_thresh(int bat, enum thresh_type which, int *thresh) ++{ ++ return __get_real_thresh(bat, which, thresh, NULL, NULL); ++} ++ ++/** ++ * set_real_thresh - write battery start/top charge threshold to SMAPI ++ * @bat: battery number (0 or 1) ++ * @which: THRESH_START or THRESH_STOP ++ * @thresh: 1..99, 0=default (passes as-is to SMAPI) ++ */ ++static int set_real_thresh(int bat, enum thresh_type which, int thresh) ++{ ++ u32 ebx = (which == THRESH_START) ? 
SMAPI_SET_THRESH_START ++ : SMAPI_SET_THRESH_STOP; ++ u32 ecx = ((bat+1)<<8) + thresh; ++ u32 getDI, getSI; ++ const char *msg; ++ int ret; ++ ++ /* verify read before writing */ ++ ret = __get_real_thresh(bat, which, NULL, &getDI, &getSI); ++ if (ret) ++ return ret; ++ ++ ret = smapi_write(ebx, ecx, getDI, getSI, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "set %s to %d for bat=%d failed: %s", ++ THRESH_NAME(which), thresh, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set %s to %d for bat=%d", ++ THRESH_NAME(which), thresh, bat); ++ return ret; ++} ++ ++/** ++ * __get_inhibit_charge_minutes - get inhibit charge period from SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ * @outECX: some additional state that needs to be preserved, meaning unknown ++ * Note that @minutes is the originally set value, it does not count down. ++ */ ++static int __get_inhibit_charge_minutes(int bat, int *minutes, u32 *outECX) ++{ ++ u32 ecx = (bat+1)<<8; ++ u32 esi; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_INHIBIT_CHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, &esi, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ if (!(ecx&0x0100)) { ++ TPRINTK(KERN_NOTICE, "bad ecx=0x%x for bat=%d", ecx, bat); ++ return -EIO; ++ } ++ if (minutes) ++ *minutes = (ecx&0x0001)?esi:0; ++ if (outECX) ++ *outECX = ecx; ++ return 0; ++} ++ ++/** ++ * get_inhibit_charge_minutes - get inhibit charge period from SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ * Note that @minutes is the originally set value, it does not count down. ++ */ ++static int get_inhibit_charge_minutes(int bat, int *minutes) ++{ ++ return __get_inhibit_charge_minutes(bat, minutes, NULL); ++} ++ ++/** ++ * set_inhibit_charge_minutes - write inhibit charge period to SMAPI ++ * @bat: battery number (0 or 1) ++ * @minutes: period in minutes (1..65535 minutes, 0=disabled) ++ */ ++static int set_inhibit_charge_minutes(int bat, int minutes) ++{ ++ u32 ecx; ++ const char *msg; ++ int ret; ++ ++ /* verify read before writing */ ++ ret = __get_inhibit_charge_minutes(bat, NULL, &ecx); ++ if (ret) ++ return ret; ++ ++ ecx = ((bat+1)<<8) | (ecx&0x00FE) | (minutes > 0 ? 
0x0001 : 0x0000); ++ if (minutes > 0xFFFF) ++ minutes = 0xFFFF; ++ ret = smapi_write(SMAPI_SET_INHIBIT_CHARGE, ecx, 0, minutes, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, ++ "set to %d failed for bat=%d: %s", minutes, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set to %d for bat=%d\n", minutes, bat); ++ return ret; ++} ++ ++ ++/** ++ * get_force_discharge - get status of forced discharging from SMAPI ++ * @bat: battery number (0 or 1) ++ * @enabled: 1 if forced discharged is enabled, 0 if not ++ */ ++static int get_force_discharge(int bat, int *enabled) ++{ ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, NULL, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ *enabled = (!(ecx&0x00000100) && (ecx&0x00000001))?1:0; ++ return 0; ++} ++ ++/** ++ * set_force_discharge - write status of forced discharging to SMAPI ++ * @bat: battery number (0 or 1) ++ * @enabled: 1 if forced discharged is enabled, 0 if not ++ */ ++static int set_force_discharge(int bat, int enabled) ++{ ++ u32 ecx = (bat+1)<<8; ++ const char *msg; ++ int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0, ++ NULL, &ecx, NULL, NULL, NULL, &msg); ++ if (ret) { ++ TPRINTK(KERN_NOTICE, "get failed for bat=%d: %s", bat, msg); ++ return ret; ++ } ++ if (ecx&0x00000100) { ++ TPRINTK(KERN_NOTICE, "cannot force discharge bat=%d", bat); ++ return -EIO; ++ } ++ ++ ecx = ((bat+1)<<8) | (ecx&0x000000FA) | (enabled?0x00000001:0); ++ ret = smapi_write(SMAPI_SET_FORCE_DISCHARGE, ecx, 0, 0, &msg); ++ if (ret) ++ TPRINTK(KERN_NOTICE, "set to %d failed for bat=%d: %s", ++ enabled, bat, msg); ++ else ++ TPRINTK(KERN_INFO, "set to %d for bat=%d", enabled, bat); ++ return ret; ++} ++ ++ ++/********************************************************************* ++ * Wrappers to threshold-related SMAPI functions, which handle default ++ * thresholds and related quirks. ++ */ ++ ++/* Minimum, default and minimum difference for battery charging thresholds: */ ++#define MIN_THRESH_DELTA 4 /* Min delta between start and stop thresh */ ++#define MIN_THRESH_START 2 ++#define MAX_THRESH_START (100-MIN_THRESH_DELTA) ++#define MIN_THRESH_STOP (MIN_THRESH_START + MIN_THRESH_DELTA) ++#define MAX_THRESH_STOP 100 ++#define DEFAULT_THRESH_START MAX_THRESH_START ++#define DEFAULT_THRESH_STOP MAX_THRESH_STOP ++ ++/* The GUI of IBM's Battery Maximizer seems to show a start threshold that ++ * is 1 more than the value we set/get via SMAPI. Since the threshold is ++ * maintained across reboot, this can be confusing. So we kludge our ++ * interface for interoperability: */ ++#define BATMAX_FIX 1 ++ ++/* Get charge start/stop threshold (1..100), ++ * substituting default values if needed and applying BATMAT_FIX. */ ++static int get_thresh(int bat, enum thresh_type which, int *thresh) ++{ ++ int ret = get_real_thresh(bat, which, thresh); ++ if (ret) ++ return ret; ++ if (*thresh == 0) ++ *thresh = (which == THRESH_START) ? DEFAULT_THRESH_START ++ : DEFAULT_THRESH_STOP; ++ else if (which == THRESH_START) ++ *thresh += BATMAX_FIX; ++ return 0; ++} ++ ++ ++/* Set charge start/stop threshold (1..100), ++ * substituting default values if needed and applying BATMAT_FIX. 
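/*
 * A user-space model of the threshold mapping implemented by get_thresh()
 * and set_thresh() around this point (illustrative only; the driver code
 * here is authoritative): a raw SMAPI value of 0 means "default" (96 for
 * start, 100 for stop), and the start threshold shown in sysfs is the raw
 * value plus BATMAX_FIX (1) to match IBM's Battery Maximizer GUI.
 */
#include <stdio.h>

#define BATMAX_FIX            1
#define DEFAULT_THRESH_START 96		/* = MAX_THRESH_START = 100 - 4 */
#define DEFAULT_THRESH_STOP  100

static int sysfs_start_from_raw(int raw)	/* models get_thresh(START) */
{
	return raw == 0 ? DEFAULT_THRESH_START : raw + BATMAX_FIX;
}

static int raw_stop_from_sysfs(int thresh)	/* models set_thresh(STOP) */
{
	return thresh == DEFAULT_THRESH_STOP ? 0 : thresh;
}

int main(void)
{
	printf("raw 0    -> start %d\n", sysfs_start_from_raw(0));	/* 96 */
	printf("raw 44   -> start %d\n", sysfs_start_from_raw(44));	/* 45 */
	printf("stop 100 -> raw %d\n", raw_stop_from_sysfs(100));	/* 0  */
	printf("stop 85  -> raw %d\n", raw_stop_from_sysfs(85));	/* 85 */
	return 0;
}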
*/ ++static int set_thresh(int bat, enum thresh_type which, int thresh) ++{ ++ if (which == THRESH_STOP && thresh == DEFAULT_THRESH_STOP) ++ thresh = 0; /* 100 is out of range, but default means 100 */ ++ if (which == THRESH_START) ++ thresh -= BATMAX_FIX; ++ return set_real_thresh(bat, which, thresh); ++} ++ ++/********************************************************************* ++ * ThinkPad embedded controller readout and basic functions ++ */ ++ ++/** ++ * read_tp_ec_row - read data row from the ThinkPad embedded controller ++ * @arg0: EC command code ++ * @bat: battery number, 0 or 1 ++ * @j: the byte value to be used for "junk" (unused) input/outputs ++ * @dataval: result vector ++ */ ++static int read_tp_ec_row(u8 arg0, int bat, u8 j, u8 *dataval) ++{ ++ int ret; ++ const struct thinkpad_ec_row args = { .mask = 0xFFFF, ++ .val = {arg0, j,j,j,j,j,j,j,j,j,j,j,j,j,j, (u8)bat} }; ++ struct thinkpad_ec_row data = { .mask = 0xFFFF }; ++ ++ ret = thinkpad_ec_lock(); ++ if (ret) ++ return ret; ++ ret = thinkpad_ec_read_row(&args, &data); ++ thinkpad_ec_unlock(); ++ memcpy(dataval, &data.val, TP_CONTROLLER_ROW_LEN); ++ return ret; ++} ++ ++/** ++ * power_device_present - check for presence of battery or AC power ++ * @bat: 0 for battery 0, 1 for battery 1, otherwise AC power ++ * Returns 1 if present, 0 if not present, negative if error. ++ */ ++static int power_device_present(int bat) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ u8 test; ++ int ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ switch (bat) { ++ case 0: test = 0x40; break; /* battery 0 */ ++ case 1: test = 0x20; break; /* battery 1 */ ++ default: test = 0x80; /* AC power */ ++ } ++ return (row[0] & test) ? 1 : 0; ++} ++ ++/** ++ * bat_has_status - check if battery can report detailed status ++ * @bat: 0 for battery 0, 1 for battery 1 ++ * Returns 1 if yes, 0 if no, negative if error. ++ */ ++static int bat_has_status(int bat) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ if ((row[0] & (bat?0x20:0x40)) == 0) /* no battery */ ++ return 0; ++ if ((row[1] & (0x60)) == 0) /* no status */ ++ return 0; ++ return 1; ++} ++ ++/** ++ * get_tp_ec_bat_16 - read a 16-bit value from EC battery status data ++ * @arg0: first argument to EC ++ * @off: offset in row returned from EC ++ * @bat: battery (0 or 1) ++ * @val: the 16-bit value obtained ++ * Returns nonzero on error. 
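/*
 * Byte layout assumed by the helpers above for the arg0 = 1 status row:
 * val[0] bit 0x80 = AC present, bit 0x40 = battery 0 present, bit 0x20 =
 * battery 1 present (see power_device_present()); val[1] & 0x60 nonzero
 * means the battery reports detailed status (see bat_has_status()), and
 * the high nibble of val[1] encodes the charge state (0xc idle,
 * 0xd discharging, 0xe charging, as decoded by show_battery_state() below).
 */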
++ */ ++static int get_tp_ec_bat_16(u8 arg0, int offset, int bat, u16 *val) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret; ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ *val = *(u16 *)(row+offset); ++ return 0; ++} ++ ++/********************************************************************* ++ * sysfs attributes for batteries - ++ * definitions and helper functions ++ */ ++ ++/* A custom device attribute struct which holds a battery number */ ++struct bat_device_attribute { ++ struct device_attribute dev_attr; ++ int bat; ++}; ++ ++/** ++ * attr_get_bat - get the battery to which the attribute belongs ++ */ ++static int attr_get_bat(struct device_attribute *attr) ++{ ++ return container_of(attr, struct bat_device_attribute, dev_attr)->bat; ++} ++ ++/** ++ * show_tp_ec_bat_u16 - show an unsigned 16-bit battery attribute ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @mul: correction factor to multiply by ++ * @na_msg: string to output is value not available (0xFFFFFFFF) ++ * @attr: battery attribute ++ * @buf: output buffer ++ * The 16-bit value is read from the EC, treated as unsigned, ++ * transformed as x->mul*x, and printed to the buffer. ++ * If the value is 0xFFFFFFFF and na_msg!=%NULL, na_msg is printed instead. ++ */ ++static ssize_t show_tp_ec_bat_u16(u8 arg0, int offset, int mul, ++ const char *na_msg, ++ struct device_attribute *attr, char *buf) ++{ ++ u16 val; ++ int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); ++ if (ret) ++ return ret; ++ if (na_msg && val == 0xFFFF) ++ return sprintf(buf, "%s\n", na_msg); ++ else ++ return sprintf(buf, "%u\n", mul*(unsigned int)val); ++} ++ ++/** ++ * show_tp_ec_bat_s16 - show an signed 16-bit battery attribute ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @mul: correction factor to multiply by ++ * @add: correction term to add after multiplication ++ * @attr: battery attribute ++ * @buf: output buffer ++ * The 16-bit value is read from the EC, treated as signed, ++ * transformed as x->mul*x+add, and printed to the buffer. 
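/*
 * Worked example of the mul*x + add transform described above, using the
 * parameters that show_battery_temperature() below passes in (mul = 100,
 * add = -273100): the EC reports temperature in units of 0.1 K, so a raw
 * reading of 2982 becomes 2982 * 100 - 273100 = 25100 millicelsius,
 * i.e. 25.1 degrees Celsius. A sketch of the same arithmetic as a
 * standalone helper:
 */
#include <linux/types.h>

static inline int ec_s16_to_attr(s16 raw, int mul, int add)
{
	return mul * (int)raw + add;	/* ec_s16_to_attr(2982, 100, -273100) == 25100 */
}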
++ */ ++static ssize_t show_tp_ec_bat_s16(u8 arg0, int offset, int mul, int add, ++ struct device_attribute *attr, char *buf) ++{ ++ u16 val; ++ int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", mul*(s16)val+add); ++} ++ ++/** ++ * show_tp_ec_bat_str - show a string from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @maxlen: maximum string length ++ * @attr: battery attribute ++ * @buf: output buffer ++ */ ++static ssize_t show_tp_ec_bat_str(u8 arg0, int offset, int maxlen, ++ struct device_attribute *attr, char *buf) ++{ ++ int bat = attr_get_bat(attr); ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int ret; ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ strncpy(buf, (char *)row+offset, maxlen); ++ buf[maxlen] = 0; ++ strcat(buf, "\n"); ++ return strlen(buf); ++} ++ ++/** ++ * show_tp_ec_bat_power - show a power readout from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offV: byte offset of voltage in EC raw data ++ * @offI: byte offset of current in EC raw data ++ * @attr: battery attribute ++ * @buf: output buffer ++ * Computes the power as current*voltage from the two given readout offsets. ++ */ ++static ssize_t show_tp_ec_bat_power(u8 arg0, int offV, int offI, ++ struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ int milliamp, millivolt, ret; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ millivolt = *(u16 *)(row+offV); ++ milliamp = *(s16 *)(row+offI); ++ return sprintf(buf, "%d\n", milliamp*millivolt/1000); /* units: mW */ ++} ++ ++/** ++ * show_tp_ec_bat_date - decode and show a date from EC battery status data ++ * @arg0: specified 1st argument of EC raw to read ++ * @offset: byte offset in EC raw data ++ * @attr: battery attribute ++ * @buf: output buffer ++ */ ++static ssize_t show_tp_ec_bat_date(u8 arg0, int offset, ++ struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ u16 v; ++ int ret; ++ int day, month, year; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return -ENXIO; ++ ret = read_tp_ec_row(arg0, bat, 0, row); ++ if (ret) ++ return ret; ++ ++ /* Decode bit-packed: v = day | (month<<5) | ((year-1980)<<9) */ ++ v = *(u16 *)(row+offset); ++ day = v & 0x1F; ++ month = (v >> 5) & 0xF; ++ year = (v >> 9) + 1980; ++ ++ return sprintf(buf, "%04d-%02d-%02d\n", year, month, day); ++} ++ ++ ++/********************************************************************* ++ * sysfs attribute I/O for batteries - ++ * the actual attribute show/store functions ++ */ ++ ++static ssize_t show_battery_start_charge_thresh(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int thresh; ++ int bat = attr_get_bat(attr); ++ int ret = get_thresh(bat, THRESH_START, &thresh); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", thresh); /* units: percent */ ++} ++ ++static ssize_t show_battery_stop_charge_thresh(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int thresh; ++ int bat = attr_get_bat(attr); ++ int ret = get_thresh(bat, THRESH_STOP, &thresh); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", thresh); /* units: percent */ ++} ++ ++/** ++ * store_battery_start_charge_thresh - store 
battery_start_charge_thresh attr ++ * Since this is a kernel<->user interface, we ensure a valid state for ++ * the hardware. We do this by clamping the requested threshold to the ++ * valid range and, if necessary, moving the other threshold so that ++ * it's MIN_THRESH_DELTA away from this one. ++ */ ++static ssize_t store_battery_start_charge_thresh(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int thresh, other_thresh, ret; ++ int bat = attr_get_bat(attr); ++ ++ if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) ++ return -EINVAL; ++ ++ if (thresh < MIN_THRESH_START) /* clamp up to MIN_THRESH_START */ ++ thresh = MIN_THRESH_START; ++ if (thresh > MAX_THRESH_START) /* clamp down to MAX_THRESH_START */ ++ thresh = MAX_THRESH_START; ++ ++ down(&smapi_mutex); ++ ret = get_thresh(bat, THRESH_STOP, &other_thresh); ++ if (ret != -EOPNOTSUPP && ret != -ENXIO) { ++ if (ret) /* other threshold is set? */ ++ goto out; ++ ret = get_real_thresh(bat, THRESH_START, NULL); ++ if (ret) /* this threshold is set? */ ++ goto out; ++ if (other_thresh < thresh+MIN_THRESH_DELTA) { ++ /* move other thresh to keep it above this one */ ++ ret = set_thresh(bat, THRESH_STOP, ++ thresh+MIN_THRESH_DELTA); ++ if (ret) ++ goto out; ++ } ++ } ++ ret = set_thresh(bat, THRESH_START, thresh); ++out: ++ up(&smapi_mutex); ++ return count; ++ ++} ++ ++/** ++ * store_battery_stop_charge_thresh - store battery_stop_charge_thresh attr ++ * Since this is a kernel<->user interface, we ensure a valid state for ++ * the hardware. We do this by clamping the requested threshold to the ++ * valid range and, if necessary, moving the other threshold so that ++ * it's MIN_THRESH_DELTA away from this one. ++ */ ++static ssize_t store_battery_stop_charge_thresh(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int thresh, other_thresh, ret; ++ int bat = attr_get_bat(attr); ++ ++ if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100) ++ return -EINVAL; ++ ++ if (thresh < MIN_THRESH_STOP) /* clamp up to MIN_THRESH_STOP */ ++ thresh = MIN_THRESH_STOP; ++ ++ down(&smapi_mutex); ++ ret = get_thresh(bat, THRESH_START, &other_thresh); ++ if (ret != -EOPNOTSUPP && ret != -ENXIO) { /* other threshold exists? */ ++ if (ret) ++ goto out; ++ /* this threshold exists? 
*/ ++ ret = get_real_thresh(bat, THRESH_STOP, NULL); ++ if (ret) ++ goto out; ++ if (other_thresh >= thresh-MIN_THRESH_DELTA) { ++ /* move other thresh to be below this one */ ++ ret = set_thresh(bat, THRESH_START, ++ thresh-MIN_THRESH_DELTA); ++ if (ret) ++ goto out; ++ } ++ } ++ ret = set_thresh(bat, THRESH_STOP, thresh); ++out: ++ up(&smapi_mutex); ++ return count; ++} ++ ++static ssize_t show_battery_inhibit_charge_minutes(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int minutes; ++ int bat = attr_get_bat(attr); ++ int ret = get_inhibit_charge_minutes(bat, &minutes); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", minutes); /* units: minutes */ ++} ++ ++static ssize_t store_battery_inhibit_charge_minutes(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int ret; ++ int minutes; ++ int bat = attr_get_bat(attr); ++ if (sscanf(buf, "%d", &minutes) != 1 || minutes < 0) { ++ TPRINTK(KERN_ERR, "inhibit_charge_minutes: " ++ "must be a non-negative integer"); ++ return -EINVAL; ++ } ++ ret = set_inhibit_charge_minutes(bat, minutes); ++ if (ret) ++ return ret; ++ return count; ++} ++ ++static ssize_t show_battery_force_discharge(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int enabled; ++ int bat = attr_get_bat(attr); ++ int ret = get_force_discharge(bat, &enabled); ++ if (ret) ++ return ret; ++ return sprintf(buf, "%d\n", enabled); /* type: boolean */ ++} ++ ++static ssize_t store_battery_force_discharge(struct device *dev, ++ struct device_attribute *attr, const char *buf, size_t count) ++{ ++ int ret; ++ int enabled; ++ int bat = attr_get_bat(attr); ++ if (sscanf(buf, "%d", &enabled) != 1 || enabled < 0 || enabled > 1) ++ return -EINVAL; ++ ret = set_force_discharge(bat, enabled); ++ if (ret) ++ return ret; ++ return count; ++} ++ ++static ssize_t show_battery_installed( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ int bat = attr_get_bat(attr); ++ int ret = power_device_present(bat); ++ if (ret < 0) ++ return ret; ++ return sprintf(buf, "%d\n", ret); /* type: boolean */ ++} ++ ++static ssize_t show_battery_state( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ u8 row[TP_CONTROLLER_ROW_LEN]; ++ const char *txt; ++ int ret; ++ int bat = attr_get_bat(attr); ++ if (bat_has_status(bat) != 1) ++ return sprintf(buf, "none\n"); ++ ret = read_tp_ec_row(1, bat, 0, row); ++ if (ret) ++ return ret; ++ switch (row[1] & 0xf0) { ++ case 0xc0: txt = "idle"; break; ++ case 0xd0: txt = "discharging"; break; ++ case 0xe0: txt = "charging"; break; ++ default: return sprintf(buf, "unknown (0x%x)\n", row[1]); ++ } ++ return sprintf(buf, "%s\n", txt); /* type: string from fixed set */ ++} ++ ++static ssize_t show_battery_manufacturer( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. SBS spec v1.1 p34: ManufacturerName() */ ++ return show_tp_ec_bat_str(4, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_model( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. 
SBS spec v1.1 p34: DeviceName() */ ++ return show_tp_ec_bat_str(5, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_barcoding( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string */ ++ return show_tp_ec_bat_str(7, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf); ++} ++ ++static ssize_t show_battery_chemistry( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: string. SBS spec v1.1 p34-35: DeviceChemistry() */ ++ return show_tp_ec_bat_str(6, 2, 5, attr, buf); ++} ++ ++static ssize_t show_battery_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p24: Voltage() */ ++ return show_tp_ec_bat_u16(1, 6, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_design_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p32: DesignVoltage() */ ++ return show_tp_ec_bat_u16(3, 4, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_charging_max_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV. SBS spec v1.1 p37,39: ChargingVoltage() */ ++ return show_tp_ec_bat_u16(9, 8, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group0_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 12, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group1_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 10, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group2_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 8, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_group3_voltage( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mV */ ++ return show_tp_ec_bat_u16(0xA, 6, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_current_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p24: Current() */ ++ return show_tp_ec_bat_s16(1, 8, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_current_avg( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p24: AverageCurrent() */ ++ return show_tp_ec_bat_s16(1, 10, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_charging_max_current( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mA. SBS spec v1.1 p36,38: ChargingCurrent() */ ++ return show_tp_ec_bat_s16(9, 6, 1, 0, attr, buf); ++} ++ ++static ssize_t show_battery_power_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mW. SBS spec v1.1: Voltage()*Current() */ ++ return show_tp_ec_bat_power(1, 6, 8, attr, buf); ++} ++ ++static ssize_t show_battery_power_avg( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mW. SBS spec v1.1: Voltage()*AverageCurrent() */ ++ return show_tp_ec_bat_power(1, 6, 10, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_percent( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: percent. 
SBS spec v1.1 p25: RelativeStateOfCharge() */ ++ return show_tp_ec_bat_u16(1, 12, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_percent_error( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: percent. SBS spec v1.1 p25: MaxError() */ ++ return show_tp_ec_bat_u16(9, 4, 1, NULL, attr, buf); ++} ++ ++static ssize_t show_battery_remaining_charging_time( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: AverageTimeToFull() */ ++ return show_tp_ec_bat_u16(2, 8, 1, "not_charging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_running_time( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ ++ return show_tp_ec_bat_u16(2, 6, 1, "not_discharging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_running_time_now( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */ ++ return show_tp_ec_bat_u16(2, 4, 1, "not_discharging", attr, buf); ++} ++ ++static ssize_t show_battery_remaining_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p26. */ ++ return show_tp_ec_bat_u16(1, 14, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_last_full_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p26: FullChargeCapacity() */ ++ return show_tp_ec_bat_u16(2, 2, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_design_capacity( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: mWh. SBS spec v1.1 p32: DesignCapacity() */ ++ return show_tp_ec_bat_u16(3, 2, 10, "", attr, buf); ++} ++ ++static ssize_t show_battery_cycle_count( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: ordinal. SBS spec v1.1 p32: CycleCount() */ ++ return show_tp_ec_bat_u16(2, 12, 1, "", attr, buf); ++} ++ ++static ssize_t show_battery_temperature( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* units: millicelsius. SBS spec v1.1: Temperature()*10 */ ++ return show_tp_ec_bat_s16(1, 4, 100, -273100, attr, buf); ++} ++ ++static ssize_t show_battery_serial( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: int. SBS spec v1.1 p34: SerialNumber() */ ++ return show_tp_ec_bat_u16(3, 10, 1, "", attr, buf); ++} ++ ++static ssize_t show_battery_manufacture_date( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: YYYY-MM-DD. SBS spec v1.1 p34: ManufactureDate() */ ++ return show_tp_ec_bat_date(3, 8, attr, buf); ++} ++ ++static ssize_t show_battery_first_use_date( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ /* type: YYYY-MM-DD */ ++ return show_tp_ec_bat_date(8, 2, attr, buf); ++} ++ ++/** ++ * show_battery_dump - show the battery's dump attribute ++ * The dump attribute gives a hex dump of all EC readouts related to a ++ * battery. Some of the enumerated values don't really exist (i.e., the ++ * EC function just leaves them untouched); we use a kludge to detect and ++ * denote these. 
++ */ ++#define MIN_DUMP_ARG0 0x00 ++#define MAX_DUMP_ARG0 0x0a /* 0x0b is useful too but hangs old EC firmware */ ++static ssize_t show_battery_dump( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ int i; ++ char *p = buf; ++ int bat = attr_get_bat(attr); ++ u8 arg0; /* first argument to EC */ ++ u8 rowa[TP_CONTROLLER_ROW_LEN], ++ rowb[TP_CONTROLLER_ROW_LEN]; ++ const u8 junka = 0xAA, ++ junkb = 0x55; /* junk values for testing changes */ ++ int ret; ++ ++ for (arg0 = MIN_DUMP_ARG0; arg0 <= MAX_DUMP_ARG0; ++arg0) { ++ if ((p-buf) > PAGE_SIZE-TP_CONTROLLER_ROW_LEN*5) ++ return -ENOMEM; /* don't overflow sysfs buf */ ++ /* Read raw twice with different junk values, ++ * to detect unused output bytes which are left unchaged: */ ++ ret = read_tp_ec_row(arg0, bat, junka, rowa); ++ if (ret) ++ return ret; ++ ret = read_tp_ec_row(arg0, bat, junkb, rowb); ++ if (ret) ++ return ret; ++ for (i = 0; i < TP_CONTROLLER_ROW_LEN; i++) { ++ if (rowa[i] == junka && rowb[i] == junkb) ++ p += sprintf(p, "-- "); /* unused by EC */ ++ else ++ p += sprintf(p, "%02x ", rowa[i]); ++ } ++ p += sprintf(p, "\n"); ++ } ++ return p-buf; ++} ++ ++ ++/********************************************************************* ++ * sysfs attribute I/O, other than batteries ++ */ ++ ++static ssize_t show_ac_connected( ++ struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ int ret = power_device_present(0xFF); ++ if (ret < 0) ++ return ret; ++ return sprintf(buf, "%d\n", ret); /* type: boolean */ ++} ++ ++/********************************************************************* ++ * The the "smapi_request" sysfs attribute executes a raw SMAPI call. ++ * You write to make a request and read to get the result. The state ++ * is saved globally rather than per fd (sysfs limitation), so ++ * simultaenous requests may get each other's results! So this is for ++ * development and debugging only. ++ */ ++#define MAX_SMAPI_ATTR_ANSWER_LEN 128 ++static char smapi_attr_answer[MAX_SMAPI_ATTR_ANSWER_LEN] = ""; ++ ++static ssize_t show_smapi_request(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ int ret = snprintf(buf, PAGE_SIZE, "%s", smapi_attr_answer); ++ smapi_attr_answer[0] = '\0'; ++ return ret; ++} ++ ++static ssize_t store_smapi_request(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned int inEBX, inECX, inEDI, inESI; ++ u32 outEBX, outECX, outEDX, outEDI, outESI; ++ const char *msg; ++ int ret; ++ if (sscanf(buf, "%x %x %x %x", &inEBX, &inECX, &inEDI, &inESI) != 4) { ++ smapi_attr_answer[0] = '\0'; ++ return -EINVAL; ++ } ++ ret = smapi_request( ++ inEBX, inECX, inEDI, inESI, ++ &outEBX, &outECX, &outEDX, &outEDI, &outESI, &msg); ++ snprintf(smapi_attr_answer, MAX_SMAPI_ATTR_ANSWER_LEN, ++ "%x %x %x %x %x %d '%s'\n", ++ (unsigned int)outEBX, (unsigned int)outECX, ++ (unsigned int)outEDX, (unsigned int)outEDI, ++ (unsigned int)outESI, ret, msg); ++ if (ret) ++ return ret; ++ else ++ return count; ++} ++ ++/********************************************************************* ++ * Power management: the embedded controller forgets the battery ++ * thresholds when the system is suspended to disk and unplugged from ++ * AC and battery, so we restore it upon resume. 
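/*
 * Example use of the smapi_request debug attribute above (a hypothetical
 * user-space sketch; the sysfs path follows the "smapi" platform device
 * registered below, and the request encodes SMAPI_GET_THRESH_START for
 * battery 0 exactly as __get_real_thresh() does: ECX = (bat + 1) << 8).
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char req[] = "2116 100 0 0";	/* BX=0x2116 CX=0x100 DI=0 SI=0 */
	char answer[128];
	ssize_t n;
	int fd;

	fd = open("/sys/devices/platform/smapi/smapi_request", O_WRONLY);
	if (fd < 0 || write(fd, req, strlen(req)) < 0)
		return 1;
	close(fd);

	fd = open("/sys/devices/platform/smapi/smapi_request", O_RDONLY);
	if (fd < 0 || (n = read(fd, answer, sizeof(answer) - 1)) < 0)
		return 1;
	answer[n] = '\0';
	close(fd);

	/* Answer format: "EBX ECX EDX EDI ESI ret 'msg'"; with ret == 0 the
	 * start threshold is in the low byte of the returned ECX. */
	printf("%s", answer);
	return 0;
}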
++ */ ++ ++static int saved_threshs[4] = {-1, -1, -1, -1}; /* -1 = don't know */ ++ ++static int tp_suspend(struct platform_device *dev, pm_message_t state) ++{ ++ int restore = (state.event == PM_EVENT_HIBERNATE || ++ state.event == PM_EVENT_FREEZE); ++ if (!restore || get_real_thresh(0, THRESH_STOP , &saved_threshs[0])) ++ saved_threshs[0] = -1; ++ if (!restore || get_real_thresh(0, THRESH_START, &saved_threshs[1])) ++ saved_threshs[1] = -1; ++ if (!restore || get_real_thresh(1, THRESH_STOP , &saved_threshs[2])) ++ saved_threshs[2] = -1; ++ if (!restore || get_real_thresh(1, THRESH_START, &saved_threshs[3])) ++ saved_threshs[3] = -1; ++ DPRINTK("suspend saved: %d %d %d %d", saved_threshs[0], ++ saved_threshs[1], saved_threshs[2], saved_threshs[3]); ++ return 0; ++} ++ ++static int tp_resume(struct platform_device *dev) ++{ ++ DPRINTK("resume restoring: %d %d %d %d", saved_threshs[0], ++ saved_threshs[1], saved_threshs[2], saved_threshs[3]); ++ if (saved_threshs[0] >= 0) ++ set_real_thresh(0, THRESH_STOP , saved_threshs[0]); ++ if (saved_threshs[1] >= 0) ++ set_real_thresh(0, THRESH_START, saved_threshs[1]); ++ if (saved_threshs[2] >= 0) ++ set_real_thresh(1, THRESH_STOP , saved_threshs[2]); ++ if (saved_threshs[3] >= 0) ++ set_real_thresh(1, THRESH_START, saved_threshs[3]); ++ return 0; ++} ++ ++ ++/********************************************************************* ++ * Driver model ++ */ ++ ++static struct platform_driver tp_driver = { ++ .suspend = tp_suspend, ++ .resume = tp_resume, ++ .driver = { ++ .name = "smapi", ++ .owner = THIS_MODULE ++ }, ++}; ++ ++ ++/********************************************************************* ++ * Sysfs device model ++ */ ++ ++/* Attributes in /sys/devices/platform/smapi/ */ ++ ++static DEVICE_ATTR(ac_connected, 0444, show_ac_connected, NULL); ++static DEVICE_ATTR(smapi_request, 0600, show_smapi_request, ++ store_smapi_request); ++ ++static struct attribute *tp_root_attributes[] = { ++ &dev_attr_ac_connected.attr, ++ &dev_attr_smapi_request.attr, ++ NULL ++}; ++static struct attribute_group tp_root_attribute_group = { ++ .attrs = tp_root_attributes ++}; ++ ++/* Attributes under /sys/devices/platform/smapi/BAT{0,1}/ : ++ * Every attribute needs to be defined (i.e., statically allocated) for ++ * each battery, and then referenced in the attribute list of each battery. ++ * We use preprocessor voodoo to avoid duplicating the list of attributes 4 ++ * times. The preprocessor output is just normal sysfs attributes code. 
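/*
 * For reference, PROVIDE_BAT_ATTRS(0) below expands, for a single
 * read/write attribute, to roughly the following ordinary sysfs code
 * (a sketch of the preprocessor output, not additional source):
 *
 *   static struct bat_device_attribute dev_attr_start_charge_thresh_0 = {
 *           .dev_attr = __ATTR(start_charge_thresh, 0644,
 *                              show_battery_start_charge_thresh,
 *                              store_battery_start_charge_thresh),
 *           .bat = 0
 *   };
 *
 * plus an &dev_attr_start_charge_thresh_0.dev_attr.attr entry in
 * tp_bat0_attributes[], all grouped under the "BAT0" sysfs directory.
 */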
++ */ ++ ++/** ++ * FOREACH_BAT_ATTR - invoke the given macros on all our battery attributes ++ * @_BAT: battery number (0 or 1) ++ * @_ATTR_RW: macro to invoke for each read/write attribute ++ * @_ATTR_R: macro to invoke for each read-only attribute ++ */ ++#define FOREACH_BAT_ATTR(_BAT, _ATTR_RW, _ATTR_R) \ ++ _ATTR_RW(_BAT, start_charge_thresh) \ ++ _ATTR_RW(_BAT, stop_charge_thresh) \ ++ _ATTR_RW(_BAT, inhibit_charge_minutes) \ ++ _ATTR_RW(_BAT, force_discharge) \ ++ _ATTR_R(_BAT, installed) \ ++ _ATTR_R(_BAT, state) \ ++ _ATTR_R(_BAT, manufacturer) \ ++ _ATTR_R(_BAT, model) \ ++ _ATTR_R(_BAT, barcoding) \ ++ _ATTR_R(_BAT, chemistry) \ ++ _ATTR_R(_BAT, voltage) \ ++ _ATTR_R(_BAT, group0_voltage) \ ++ _ATTR_R(_BAT, group1_voltage) \ ++ _ATTR_R(_BAT, group2_voltage) \ ++ _ATTR_R(_BAT, group3_voltage) \ ++ _ATTR_R(_BAT, current_now) \ ++ _ATTR_R(_BAT, current_avg) \ ++ _ATTR_R(_BAT, charging_max_current) \ ++ _ATTR_R(_BAT, power_now) \ ++ _ATTR_R(_BAT, power_avg) \ ++ _ATTR_R(_BAT, remaining_percent) \ ++ _ATTR_R(_BAT, remaining_percent_error) \ ++ _ATTR_R(_BAT, remaining_charging_time) \ ++ _ATTR_R(_BAT, remaining_running_time) \ ++ _ATTR_R(_BAT, remaining_running_time_now) \ ++ _ATTR_R(_BAT, remaining_capacity) \ ++ _ATTR_R(_BAT, last_full_capacity) \ ++ _ATTR_R(_BAT, design_voltage) \ ++ _ATTR_R(_BAT, charging_max_voltage) \ ++ _ATTR_R(_BAT, design_capacity) \ ++ _ATTR_R(_BAT, cycle_count) \ ++ _ATTR_R(_BAT, temperature) \ ++ _ATTR_R(_BAT, serial) \ ++ _ATTR_R(_BAT, manufacture_date) \ ++ _ATTR_R(_BAT, first_use_date) \ ++ _ATTR_R(_BAT, dump) ++ ++/* Define several macros we will feed into FOREACH_BAT_ATTR: */ ++ ++#define DEFINE_BAT_ATTR_RW(_BAT,_NAME) \ ++ static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ ++ .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, \ ++ store_battery_##_NAME), \ ++ .bat = _BAT \ ++ }; ++ ++#define DEFINE_BAT_ATTR_R(_BAT,_NAME) \ ++ static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = { \ ++ .dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME, 0), \ ++ .bat = _BAT \ ++ }; ++ ++#define REF_BAT_ATTR(_BAT,_NAME) \ ++ &dev_attr_##_NAME##_##_BAT.dev_attr.attr, ++ ++/* This provide all attributes for one battery: */ ++ ++#define PROVIDE_BAT_ATTRS(_BAT) \ ++ FOREACH_BAT_ATTR(_BAT, DEFINE_BAT_ATTR_RW, DEFINE_BAT_ATTR_R) \ ++ static struct attribute *tp_bat##_BAT##_attributes[] = { \ ++ FOREACH_BAT_ATTR(_BAT, REF_BAT_ATTR, REF_BAT_ATTR) \ ++ NULL \ ++ }; \ ++ static struct attribute_group tp_bat##_BAT##_attribute_group = { \ ++ .name = "BAT" #_BAT, \ ++ .attrs = tp_bat##_BAT##_attributes \ ++ }; ++ ++/* Finally genereate the attributes: */ ++ ++PROVIDE_BAT_ATTRS(0) ++PROVIDE_BAT_ATTRS(1) ++ ++/* List of attribute groups */ ++ ++static struct attribute_group *attr_groups[] = { ++ &tp_root_attribute_group, ++ &tp_bat0_attribute_group, ++ &tp_bat1_attribute_group, ++ NULL ++}; ++ ++ ++/********************************************************************* ++ * Init and cleanup ++ */ ++ ++static struct attribute_group **next_attr_group; /* next to register */ ++ ++static int __init tp_init(void) ++{ ++ int ret; ++ printk(KERN_INFO "tp_smapi " TP_VERSION " loading...\n"); ++ ++ ret = find_smapi_port(); ++ if (ret < 0) ++ goto err; ++ else ++ smapi_port = ret; ++ ++ if (!request_region(smapi_port, 1, "smapi")) { ++ printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", ++ smapi_port); ++ ret = -ENXIO; ++ goto err; ++ } ++ ++ if (!request_region(SMAPI_PORT2, 1, "smapi")) { ++ printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n", ++ SMAPI_PORT2); 
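/*
 * Once tp_init() has registered the attribute groups, the thresholds can be
 * driven from user space. A minimal sketch (paths follow the "smapi"
 * platform device and the BAT0 group defined above; values are percent,
 * and the driver clamps them and keeps the pair at least MIN_THRESH_DELTA
 * apart):
 */
#include <stdio.h>

static int write_attr(const char *path, int value)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", value);
	return fclose(f);
}

int main(void)
{
	write_attr("/sys/devices/platform/smapi/BAT0/stop_charge_thresh", 80);
	write_attr("/sys/devices/platform/smapi/BAT0/start_charge_thresh", 40);
	return 0;
}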
++ ret = -ENXIO; ++ goto err_port1; ++ } ++ ++ ret = platform_driver_register(&tp_driver); ++ if (ret) ++ goto err_port2; ++ ++ pdev = platform_device_alloc("smapi", -1); ++ if (!pdev) { ++ ret = -ENOMEM; ++ goto err_driver; ++ } ++ ++ ret = platform_device_add(pdev); ++ if (ret) ++ goto err_device_free; ++ ++ for (next_attr_group = attr_groups; *next_attr_group; ++ ++next_attr_group) { ++ ret = sysfs_create_group(&pdev->dev.kobj, *next_attr_group); ++ if (ret) ++ goto err_attr; ++ } ++ ++ printk(KERN_INFO "tp_smapi successfully loaded (smapi_port=0x%x).\n", ++ smapi_port); ++ return 0; ++ ++err_attr: ++ while (--next_attr_group >= attr_groups) ++ sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); ++ platform_device_unregister(pdev); ++err_device_free: ++ platform_device_put(pdev); ++err_driver: ++ platform_driver_unregister(&tp_driver); ++err_port2: ++ release_region(SMAPI_PORT2, 1); ++err_port1: ++ release_region(smapi_port, 1); ++err: ++ printk(KERN_ERR "tp_smapi init failed (ret=%d)!\n", ret); ++ return ret; ++} ++ ++static void __exit tp_exit(void) ++{ ++ while (next_attr_group && --next_attr_group >= attr_groups) ++ sysfs_remove_group(&pdev->dev.kobj, *next_attr_group); ++ platform_device_unregister(pdev); ++ platform_driver_unregister(&tp_driver); ++ release_region(SMAPI_PORT2, 1); ++ if (smapi_port) ++ release_region(smapi_port, 1); ++ ++ printk(KERN_INFO "tp_smapi unloaded.\n"); ++} ++ ++module_init(tp_init); ++module_exit(tp_exit); +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index 0840d27381ea..73aba9a31064 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP + def_bool y + depends on VT_CONSOLE && PM_SLEEP + ++config NR_TTY_DEVICES ++ int "Maximum tty device number" ++ depends on VT ++ range 12 63 ++ default 63 ++ ---help--- ++ This option is used to change the number of tty devices in /dev. ++ The default value is 63. The lowest number you can set is 12, ++ 63 is also the upper limit so we don't overrun the serial ++ consoles. ++ ++ If unsure, say 63. 
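Stepping back to the tp_smapi sysfs interface registered above: the root group exposes ac_connected and smapi_request, and each battery gets a BAT0/BAT1 group holding the attributes listed in FOREACH_BAT_ATTR. The userspace sketch below simply reads a few of those files. It assumes the module loaded successfully and that BAT0 is present, and it is an illustration rather than part of the patch.

/* Illustration only: read a few tp_smapi attributes from userspace.
 * Assumes the module above loaded successfully and BAT0 exists. */
#include <stdio.h>

static void print_attr(const char *path)
{
        char buf[128];
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%s: %s", path, buf);
        fclose(f);
}

int main(void)
{
        print_attr("/sys/devices/platform/smapi/ac_connected");
        print_attr("/sys/devices/platform/smapi/BAT0/remaining_percent");
        print_attr("/sys/devices/platform/smapi/BAT0/stop_charge_thresh");
        return 0;
}

The writable attributes (for example stop_charge_thresh) are declared with mode 0644 in DEFINE_BAT_ATTR_RW above, so adjusting a charge threshold is just a write to the same path with suitable privileges.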
++ + config HW_CONSOLE + bool + depends on VT && !UML +diff --git a/fs/exec.c b/fs/exec.c +index 65eaacaba4f4..1d3b310bd5f0 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -63,6 +63,8 @@ + #include + #include + ++#include ++ + #include + #include + #include +@@ -866,9 +868,12 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) + if (err) + goto exit; + +- if (name->name[0] != '\0') ++ if (name->name[0] != '\0') { + fsnotify_open(file); + ++ trace_open_exec(name->name); ++ } ++ + out: + return file; + +diff --git a/fs/open.c b/fs/open.c +index cb81623a8b09..a92b0f6061ac 100644 +--- a/fs/open.c ++++ b/fs/open.c +@@ -34,6 +34,9 @@ + + #include "internal.h" + ++#define CREATE_TRACE_POINTS ++#include ++ + int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, + struct file *filp) + { +@@ -1068,6 +1071,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) + } else { + fsnotify_open(f); + fd_install(fd, f); ++ trace_do_sys_open(tmp->name, flags, mode); + } + } + putname(tmp); +diff --git a/include/trace/events/fs.h b/include/trace/events/fs.h +new file mode 100644 +index 000000000000..fb634b74adf3 +--- /dev/null ++++ b/include/trace/events/fs.h +@@ -0,0 +1,53 @@ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM fs ++ ++#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_FS_H ++ ++#include ++#include ++ ++TRACE_EVENT(do_sys_open, ++ ++ TP_PROTO(const char *filename, int flags, int mode), ++ ++ TP_ARGS(filename, flags, mode), ++ ++ TP_STRUCT__entry( ++ __string( filename, filename ) ++ __field( int, flags ) ++ __field( int, mode ) ++ ), ++ ++ TP_fast_assign( ++ __assign_str(filename, filename); ++ __entry->flags = flags; ++ __entry->mode = mode; ++ ), ++ ++ TP_printk("\"%s\" %x %o", ++ __get_str(filename), __entry->flags, __entry->mode) ++); ++ ++TRACE_EVENT(open_exec, ++ ++ TP_PROTO(const char *filename), ++ ++ TP_ARGS(filename), ++ ++ TP_STRUCT__entry( ++ __string( filename, filename ) ++ ), ++ ++ TP_fast_assign( ++ __assign_str(filename, filename); ++ ), ++ ++ TP_printk("\"%s\"", ++ __get_str(filename)) ++); ++ ++#endif /* _TRACE_FS_H */ ++ ++/* This part must be outside protection */ ++#include +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 79226ca8f80f..2a30060e7e1d 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -47,7 +47,11 @@ struct blk_queue_stats; + struct blk_stat_callback; + + #define BLKDEV_MIN_RQ 4 ++#ifdef CONFIG_ZENIFY ++#define BLKDEV_MAX_RQ 512 ++#else + #define BLKDEV_MAX_RQ 128 /* Default maximum */ ++#endif + + /* Must be consistent with blk_mq_poll_stats_bkt() */ + #define BLK_MQ_POLL_STATS_BKTS 16 +diff --git a/include/linux/thinkpad_ec.h b/include/linux/thinkpad_ec.h +new file mode 100644 +index 000000000000..1b80d7ee5493 +--- /dev/null ++++ b/include/linux/thinkpad_ec.h +@@ -0,0 +1,47 @@ ++/* ++ * thinkpad_ec.h - interface to ThinkPad embedded controller LPC3 functions ++ * ++ * Copyright (C) 2005 Shem Multinymous ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#ifndef _THINKPAD_EC_H ++#define _THINKPAD_EC_H ++ ++#ifdef __KERNEL__ ++ ++#define TP_CONTROLLER_ROW_LEN 16 ++ ++/* EC transactions input and output (possibly partial) vectors of 16 bytes. */ ++struct thinkpad_ec_row { ++ u16 mask; /* bitmap of which entries of val[] are meaningful */ ++ u8 val[TP_CONTROLLER_ROW_LEN]; ++}; ++ ++extern int __must_check thinkpad_ec_lock(void); ++extern int __must_check thinkpad_ec_try_lock(void); ++extern void thinkpad_ec_unlock(void); ++ ++extern int thinkpad_ec_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *data); ++extern int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args, ++ struct thinkpad_ec_row *mask); ++extern int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args); ++extern void thinkpad_ec_invalidate(void); ++ ++ ++#endif /* __KERNEL */ ++#endif /* _THINKPAD_EC_H */ +diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h +index e9d39c48520a..3bceead8da40 100644 +--- a/include/uapi/linux/vt.h ++++ b/include/uapi/linux/vt.h +@@ -3,12 +3,25 @@ + #define _UAPI_LINUX_VT_H + + ++/* ++ * We will make this definition solely for the purpose of making packages ++ * such as splashutils build, because they can not understand that ++ * NR_TTY_DEVICES is defined in the kernel configuration. ++ */ ++#ifndef CONFIG_NR_TTY_DEVICES ++#define CONFIG_NR_TTY_DEVICES 63 ++#endif ++ + /* + * These constants are also useful for user-level apps (e.g., VC + * resizing). + */ + #define MIN_NR_CONSOLES 1 /* must be at least 1 */ +-#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ ++/* ++ * NR_TTY_DEVICES: ++ * Value MUST be at least 12 and must never be higher then 63 ++ */ ++#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ + /* Note: the ioctl VT_GETSTATE does not work for + consoles 16 and higher (since it returns a short) */ + +diff --git a/init/Kconfig b/init/Kconfig +index 041f3a022122..5ed70eb1ad3a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -45,6 +45,38 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config ZENIFY ++ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience" ++ default y ++ help ++ Tunes the kernel for responsiveness at the cost of throughput and power usage. 
++ ++ --- Virtual Memory Subsystem --------------------------- ++ ++ Mem dirty before bg writeback..: 10 % -> 20 % ++ Mem dirty before sync writeback: 20 % -> 50 % ++ ++ --- Block Layer ---------------------------------------- ++ ++ Queue depth...............: 128 -> 512 ++ Default MQ scheduler......: mq-deadline -> bfq ++ ++ --- CFS CPU Scheduler ---------------------------------- ++ ++ Scheduling latency.............: 6 -> 3 ms ++ Minimal granularity............: 0.75 -> 0.3 ms ++ Wakeup granularity.............: 1 -> 0.5 ms ++ CPU migration cost.............: 0.5 -> 0.25 ms ++ Bandwidth slice size...........: 5 -> 3 ms ++ Ondemand fine upscaling limit..: 95 % -> 85 % ++ ++ --- MuQSS CPU Scheduler -------------------------------- ++ ++ Scheduling interval............: 6 -> 3 ms ++ ISO task max realtime use......: 70 % -> 25 % ++ Ondemand coarse upscaling limit: 80 % -> 45 % ++ Ondemand fine upscaling limit..: 95 % -> 45 % ++ + config BROKEN + bool + +@@ -1026,6 +1058,13 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + with the "-O2" compiler flag for best performance and most + helpful compile-time warnings. + ++config CC_OPTIMIZE_HARDER ++ bool "Optimize harder" ++ help ++ This option will pass "-O3" to your compiler resulting in a ++ larger and faster kernel. The more complex optimizations also ++ increase compilation time and may affect stability. ++ + config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size" + help +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2f0a0be4d344..bada807c7e59 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -37,8 +37,13 @@ + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_latency = 3000000ULL; ++static unsigned int normalized_sysctl_sched_latency = 3000000ULL; ++#else + unsigned int sysctl_sched_latency = 6000000ULL; + static unsigned int normalized_sysctl_sched_latency = 6000000ULL; ++#endif + + /* + * The initial- and re-scaling of tunables is configurable +@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_min_granularity = 300000ULL; ++static unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; ++#else + unsigned int sysctl_sched_min_granularity = 750000ULL; + static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; ++#endif + + /* + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity + */ ++#ifdef CONFIG_ZENIFY ++static unsigned int sched_nr_latency = 10; ++#else + static unsigned int sched_nr_latency = 8; ++#endif + + /* + * After fork, child runs first. 
If set to 0 (default) then +@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_wakeup_granularity = 500000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; ++ ++const_debug unsigned int sysctl_sched_migration_cost = 50000UL; ++#else + unsigned int sysctl_sched_wakeup_granularity = 1000000UL; + static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + #ifdef CONFIG_SMP + /* +@@ -107,8 +128,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + /* + * The margin used when comparing utilization with CPU capacity: +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 337c6afb3345..9315e358f292 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int dirty_background_ratio = 20; ++#else + int dirty_background_ratio = 10; ++#endif + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; + /* + * The generator of dirty data starts writeback at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int vm_dirty_ratio = 50; ++#else + int vm_dirty_ratio = 20; ++#endif + + /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 80dad301361d..42b7fa7d01f8 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -702,6 +702,9 @@ choice + config DEFAULT_VEGAS + bool "Vegas" if TCP_CONG_VEGAS=y + ++ config DEFAULT_YEAH ++ bool "YeAH" if TCP_CONG_YEAH=y ++ + config DEFAULT_VENO + bool "Veno" if TCP_CONG_VENO=y + +@@ -735,6 +738,7 @@ config DEFAULT_TCP_CONG + default "htcp" if DEFAULT_HTCP + default "hybla" if DEFAULT_HYBLA + default "vegas" if DEFAULT_VEGAS ++ default "yeah" if DEFAULT_YEAH + default "westwood" if DEFAULT_WESTWOOD + default "veno" if DEFAULT_VENO + default "reno" if DEFAULT_RENO + +From: Nick Desaulniers +Date: Mon, 24 Dec 2018 13:37:41 +0200 +Subject: include/linux/compiler*.h: define asm_volatile_goto + +asm_volatile_goto should also be defined for other compilers that +support asm goto. + +Fixes commit 815f0dd ("include/linux/compiler*.h: make compiler-*.h +mutually exclusive"). + +Signed-off-by: Nick Desaulniers +Signed-off-by: Miguel Ojeda + +diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h +index ba814f1..e77eeb0 100644 +--- a/include/linux/compiler_types.h ++++ b/include/linux/compiler_types.h +@@ -188,6 +188,10 @@ struct ftrace_likely_data { + #define asm_volatile_goto(x...) asm goto(x) + #endif + ++#ifndef asm_volatile_goto ++#define asm_volatile_goto(x...) asm goto(x) ++#endif ++ + /* Are two types/vars the same type (ignoring qualifiers)? 
*/ + #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) + +From: Andy Lavr +Date: Mon, 24 Dec 2018 14:57:47 +0200 +Subject: avl: Use [defer+madvise] as default khugepaged defrag strategy + +For some reason, the default strategy to respond to THP fault fallbacks +is still just madvise, meaning stall if the program wants transparent +hugepages, but don't trigger a background reclaim / compaction if THP +begins to fail allocations. This creates a snowball affect where we +still use the THP code paths, but we almost always fail once a system +has been active and busy for a while. + +The option "defer" was created for interactive systems where THP can +still improve performance. If we have to fallback to a regular page due +to an allocation failure or anything else, we will trigger a background +reclaim and compaction so future THP attempts succeed and previous +attempts eventually have their smaller pages combined without stalling +running applications. + +We still want madvise to stall applications that explicitely want THP, +so defer+madvise _does_ make a ton of sense. Make it the default for +interactive systems, especially if the kernel maintainer left +transparent hugepages on "always". + +Reasoning and details in the original patch: +https://lwn.net/Articles/711248/ + +Signed-off-by: Andy Lavr + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index e84a10b..21d62b7 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1<hw.mac; + struct e1000_phy_info *phy = &adapter->hw.phy; + struct e1000_ring *tx_ring = adapter->tx_ring; +- u32 dmoff_exit_timeout = 100, tries = 0; + struct e1000_hw *hw = &adapter->hw; ++ u32 link, tctl; +- u32 link, tctl, pcim_state; + + if (test_bit(__E1000_DOWN, &adapter->state)) + return; +@@ -5188,21 +5187,6 @@ static void e1000_watchdog_task(struct work_struct *work) + /* Cancel scheduled suspend requests. */ + pm_runtime_resume(netdev->dev.parent); + +- /* Checking if MAC is in DMoff state*/ +- pcim_state = er32(STATUS); +- while (pcim_state & E1000_STATUS_PCIM_STATE) { +- if (tries++ == dmoff_exit_timeout) { +- e_dbg("Error in exiting dmoff\n"); +- break; +- } +- usleep_range(10000, 20000); +- pcim_state = er32(STATUS); +- +- /* Checking if MAC exited DMoff state */ +- if (!(pcim_state & E1000_STATUS_PCIM_STATE)) +- e1000_phy_hw_reset(&adapter->hw); +- } +- + /* update snapshot of PHY registers on LSC */ + e1000_phy_read_status(adapter); + mac->ops.get_link_up_info(&adapter->hw, +From adb1f9df27f08e6488bcd80b1607987c6114a77a Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 25 Nov 2019 15:13:06 -0300 +Subject: [PATCH] elevator: set default scheduler to bfq for blk-mq + +Signed-off-by: Alexandre Frade +--- + block/elevator.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/block/elevator.c b/block/elevator.c +index 076ba7308e65..81f89095aa77 100644 +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) + } + + /* +- * For single queue devices, default to using mq-deadline. If we have multiple +- * queues or mq-deadline is not available, default to "none". ++ * For single queue devices, default to using bfq. If we have multiple ++ * queues or bfq is not available, default to "none". 
+ */ + static struct elevator_type *elevator_get_default(struct request_queue *q) + { + if (q->nr_hw_queues != 1) + return NULL; + +- return elevator_get(q, "mq-deadline", false); ++ return elevator_get(q, "bfq", false); + } + + /* +From c3ec05777c46e19a8a26d0fc4ca0c0db8a19de97 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 16:45:59 -0300 +Subject: [PATCH] block: set rq_affinity = 2 for full multithreading I/O + requests + +Signed-off-by: Alexandre Frade +--- + include/linux/blkdev.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index f3ea78b0c91c..4dbacc6b073b 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -621,7 +621,8 @@ struct request_queue { + #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ + + #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ +- (1 << QUEUE_FLAG_SAME_COMP)) ++ (1 << QUEUE_FLAG_SAME_COMP) | \ ++ (1 << QUEUE_FLAG_SAME_FORCE)) + + void blk_queue_flag_set(unsigned int flag, struct request_queue *q); + void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); +From 8171d33d0b84a953649863538fdbe4c26c035e4f Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 14:32:50 -0300 +Subject: [PATCH] mm: set 2 megabytes for address_space-level file read-ahead + pages size + +Signed-off-by: Alexandre Frade +--- + include/linux/mm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index a2adf95b3f9c..e804d9f7583a 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2416,7 +2416,7 @@ int __must_check write_one_page(struct page *page); + void task_dirty_inc(struct task_struct *tsk); + + /* readahead.c */ +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) + + int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + pgoff_t offset, unsigned long nr_to_read); diff --git a/linux-tkg/linux-tkg-patches/5.4/0003-glitched-cfs.patch b/linux-tkg/linux-tkg-patches/5.4/0003-glitched-cfs.patch new file mode 100644 index 0000000..06b7f02 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0003-glitched-cfs.patch @@ -0,0 +1,72 @@ +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. 
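For orientation, the timer frequencies in this choice map directly to tick periods via period = 1/HZ:

    100 Hz -> 10 ms     250 Hz -> 4 ms      300 Hz -> ~3.3 ms
    500 Hz -> 2 ms      750 Hz -> ~1.3 ms   1000 Hz -> 1 ms

So the 500 Hz option added above (and the 750 Hz option added in the next hunk) sits between the previous 250 Hz default and the 1000 Hz setting, which matches the help text's framing of a balance between interactivity and CPU/power overhead.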
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + diff --git a/linux-tkg/linux-tkg-patches/5.4/0004-5.4-ck1.patch b/linux-tkg/linux-tkg-patches/5.4/0004-5.4-ck1.patch new file mode 100644 index 0000000..f3fbde8 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0004-5.4-ck1.patch @@ -0,0 +1,17684 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 8dee8f68fe15..e56fb275f607 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4277,6 +4277,14 @@ + Memory area to be used by remote processor image, + managed by CMA. + ++ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. ++ Format: ++ smt -- Share SMT (hyperthread) sibling runqueues ++ mc -- Share MC (multicore) sibling runqueues ++ smp -- Share SMP runqueues ++ none -- So not share any runqueues ++ Default value is mc ++ + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..ff41dfacb34b 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -46,6 +46,7 @@ show up in /proc/sys/kernel: + - hung_task_check_interval_secs + - hung_task_warnings + - hyperv_record_panic_msg ++- iso_cpu + - kexec_load_disabled + - kptr_restrict + - l2cr [ PPC only ] +@@ -82,6 +83,7 @@ show up in /proc/sys/kernel: + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sched_energy_aware +@@ -105,6 +107,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +@@ -438,6 +441,16 @@ When kptr_restrict is set to (2), kernel pointers printed using + %pK will be replaced with 0's regardless of privileges. + + ++iso_cpu: (MuQSS CPU scheduler only) ++=================================== ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++ + l2cr: (PPC only) + ================ + +@@ -905,6 +918,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after + rebooting. ??? 
+ + ++rr_interval: (MuQSS CPU scheduler only) ++======================================= ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++ + rtsig-max & rtsig-nr: + ===================== + +@@ -1175,3 +1202,13 @@ is 10 seconds. + + The softlockup threshold is (2 * watchdog_thresh). Setting this + tunable to zero will disable lockup detection altogether. ++ ++ ++yield_type: (MuQSS CPU scheduler only) ++====================================== ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. ++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. +diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +new file mode 100644 +index 000000000000..c0282002a079 +--- /dev/null ++++ b/Documentation/scheduler/sched-BFS.txt +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. 
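The "single runqueue, O(n) lookup, earliest effective virtual deadline first" summary above can be pictured with a deliberately simplified selection loop. Everything in the sketch (the struct, the field names, the absence of locking, priorities and affinity handling) is invented for illustration; it is not the BFS source.

/* Illustration only: choose the queued task with the earliest virtual
 * deadline from a single global run list, as the summary above describes.
 * The type and field names are invented; the real BFS code also handles
 * jiffies wraparound, static priorities, SCHED_ISO/IDLEPRIO and affinity. */
struct demo_task {
        unsigned long     deadline;   /* virtual deadline, in jiffies */
        struct demo_task *next;
};

static struct demo_task *earliest_deadline_first(struct demo_task *queue)
{
        struct demo_task *t, *best = NULL;

        for (t = queue; t; t = t->next)        /* the O(n) scan */
                if (!best || t->deadline < best->deadline)
                        best = t;
        return best;                           /* task to run next */
}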
++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". 
This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. 
If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. ++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. ++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependant on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that. Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. 
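As a rough worked example of the formulas above, take two otherwise identical CPU-bound tasks one nice level apart, an rr_interval of 6 ms (the uniprocessor default mentioned below), and write p for the prio_ratio at nice level n. Treating the 10% step as a ratio of roughly 1.1 between adjacent nice levels:

    deadline(nice n)   = jiffies + p * 6 ms
    deadline(nice n+1) = jiffies + (1.1 * p) * 6 ms
    CPU proportion     ~ (prio_ratio difference)^2 = 1.1^2 ~ 1.21

so the higher-priority task ends up with roughly a 55%/45% share of the CPU, and the gap widens quickly with larger nice differences because the ratio difference is squared.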
++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. ++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. ++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. Valid values are from 1 to 1000 ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. 
++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks. To avoid this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. 
Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Fri Aug 27 2010 +diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt +new file mode 100644 +index 000000000000..ae28b85c9995 +--- /dev/null ++++ b/Documentation/scheduler/sched-MuQSS.txt +@@ -0,0 +1,373 @@ ++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. ++ ++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with ++one 8 level skiplist per runqueue, and fine grained locking for much more ++scalability. ++ ++ ++Goals. ++ ++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from ++here on (pronounced mux) is to completely do away with the complex designs of ++the past for the cpu process scheduler and instead implement one that is very ++simple in basic design. The main focus of MuQSS is to achieve excellent desktop ++interactivity and responsiveness without heuristics and tuning knobs that are ++difficult to understand, impossible to model and predict the effect of, and when ++tuned to one workload cause massive detriment to another, while still being ++scalable to many CPUs and processes. ++ ++ ++Design summary. ++ ++MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) ++lookup, earliest effective virtual deadline first tickless design, loosely based ++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase ++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. ++Each component shall be described in order to understand the significance of, ++and reasoning for it. ++ ++ ++Design reasoning. ++ ++In BFS, the use of a single runqueue across all CPUs meant that each CPU would ++need to scan the entire runqueue looking for the process with the earliest ++deadline and schedule that next, regardless of which CPU it originally came ++from. 
This made BFS deterministic with respect to latency and provided ++guaranteed latencies dependent on number of processes and CPUs. The single ++runqueue, however, meant that all CPUs would compete for the single lock ++protecting it, which would lead to increasing lock contention as the number of ++CPUs rose and appeared to limit scalability of common workloads beyond 16 ++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously ++increased overhead proportionate to the number of queued proecesses and led to ++cache thrashing while iterating over the linked list. ++ ++MuQSS is an evolution of BFS, designed to maintain the same scheduling ++decision mechanism and be virtually deterministic without relying on the ++constrained design of the single runqueue by splitting out the single runqueue ++to be per-CPU and use skiplists instead of linked lists. ++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks. ++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons. It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++As a further evolution of the design, MuQSS normally configures sharing of ++runqueues in a logical fashion for when CPU resources are shared for improved ++latency and throughput. By default it shares runqueues and locks between ++multicore siblings. Optionally it can be configured to run with sharing of ++SMT siblings only, all SMP packages or no sharing at all. Additionally it can ++be selected at boot time. ++ ++ ++Design details. ++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle. By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. 
Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. Niffies are calculated per-runqueue from the high ++resolution TSC timers, and in order to maintain fairness are synchronised ++between CPUs whenever both runqueues are locked concurrently. ++ ++Virtual deadline: ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in MuQSS is entirely in the virtual deadline mechanism. The one ++tunable in MuQSS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in niffies by this equation: ++ ++ niffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (niffies) is ++constantly moving. ++ ++Task lookup: ++ ++As tasks are already pre-ordered according to anticipated scheduling order in ++the skip lists, lookup for the next suitable task per-runqueue is always a ++matter of simply selecting the first task in the 0th level skip list entry. 
++In order to maintain optimal latency and fairness across CPUs, MuQSS does a ++novel examination of every other runqueue in cache locality order, choosing the ++best task across all runqueues. This provides near-determinism of how long any ++task across the entire system may wait before receiving CPU time. The other ++runqueues are first examine lockless and then trylocked to minimise the ++potential lock contention if they are likely to have a suitable better task. ++Each other runqueue lock is only held for as long as it takes to examine the ++entry for suitability. In "interactive" mode, the default setting, MuQSS will ++look for the best deadline task across all CPUs, while in !interactive mode, ++it will only select a better deadline task from another CPU if it is more ++heavily laden than the current one. ++ ++Lookup is therefore O(k) where k is number of CPUs. ++ ++ ++Latency. ++ ++Through the use of virtual deadlines to govern the scheduling order of normal ++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by ++the rr_interval tunable which is set to 6ms by default. This means that the ++longest a CPU bound task will wait for more CPU is proportional to the number ++of running tasks and in the common case of 0-2 running tasks per CPU, will be ++under the 7ms threshold for human perception of jitter. Additionally, as newly ++woken tasks will have an early deadline from their previous runtime, the very ++tasks that are usually latency sensitive will have the shortest interval for ++activation, usually preempting any existing CPU bound tasks. ++ ++Tickless expiry: ++ ++A feature of MuQSS is that it is not tied to the resolution of the chosen tick ++rate in Hz, instead depending entirely on the high resolution timers where ++possible for sub-millisecond accuracy on timeouts regarless of the underlying ++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates ++such as 100 by default, benefiting from the improved throughput and lower ++power usage it provides. Another advantage of this approach is that in ++combination with the Full No HZ option, which disables ticks on running task ++CPUs instead of just idle CPUs, the tick can be disabled at all times ++regardless of how many tasks are running instead of being limited to just one ++running task. Note that this option is NOT recommended for regular desktop ++users. ++ ++ ++Scalability and balancing. ++ ++Unlike traditional approaches where balancing is a combination of CPU selection ++at task wakeup and intermittent balancing based on a vast array of rules set ++according to architecture, busyness calculations and special case management, ++MuQSS indirectly balances on the fly at task wakeup and next task selection. ++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for ++each logical CPU and uses this to aid task/CPU selection when CPUs are busy. ++Additionally it selects any idle CPUs, if they are available, at any time over ++busy CPUs according to the following preference: ++ ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. 
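The lockless examination and trylock behaviour described under "Task lookup" above can be pictured with the simplified sketch below. The structures, fields and helpers are invented for the example, the "lock" is only a stand-in for a spinlock, and the real code additionally honours interactive mode, affinity and SMT nice, so treat this purely as an illustration of the peek-then-trylock pattern.

/* Illustration only: look at every other runqueue in cache-locality
 * order, peek at the deadline of its head task without taking the lock,
 * and only trylock a runqueue that appears to hold an earlier deadline.
 * All names are invented; the "lock" is a stand-in, not a real spinlock. */
struct demo_rq {
        unsigned long long best_deadline;   /* deadline of its head task */
        int                locked;          /* stand-in for a spinlock   */
};

static int demo_trylock(struct demo_rq *rq)
{
        if (rq->locked)
                return 0;
        rq->locked = 1;
        return 1;
}

static void demo_unlock(struct demo_rq *rq)
{
        rq->locked = 0;
}

static struct demo_rq *pick_best_other_rq(struct demo_rq **ordered, int nr,
                                          unsigned long long local_best)
{
        struct demo_rq *chosen = NULL;
        int i;

        for (i = 0; i < nr; i++) {
                struct demo_rq *rq = ordered[i];       /* locality order   */

                if (rq->best_deadline >= local_best)   /* lockless peek    */
                        continue;
                if (!demo_trylock(rq))                 /* skip contention  */
                        continue;
                if (rq->best_deadline < local_best) {  /* re-check, locked */
                        local_best = rq->best_deadline;
                        chosen = rq;
                }
                demo_unlock(rq);                       /* hold it briefly  */
        }
        return chosen;   /* NULL: the local candidate is already best */
}

Each remote lock is held only long enough to re-check the head entry, which is the property the documentation stresses when explaining why this design avoids the contention of a single global lock.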
++
++MuQSS is therefore SMT, MC and Numa aware without the need for extra
++intermittent balancing to keep CPUs busy and make the most of cache
++coherency.
++
++
++Features
++
++As the initial prime target audience for MuQSS was the average desktop user, it
++was designed to not need tweaking, tuning or have features set to obtain benefit
++from it. Thus the number of knobs and features has been kept to an absolute
++minimum and should not require extra user input for the vast majority of cases.
++There are 3 optional tunables and 2 extra scheduling policies: the rr_interval,
++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
++does _not_ now feature is support for CGROUPS. The average user should neither
++need to know what these are, nor should they need to be using them to have good
++desktop behaviour. However, since some applications refuse to work without
++cgroups, one can enable them with MuQSS as a stub and the filesystem will be
++created, which will allow the applications to work.
++
++rr_interval:
++
++ /proc/sys/kernel/rr_interval
++
++The value is in milliseconds, and the default value is set to 6. Valid values
++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
++decreasing throughput, while increasing it will improve throughput, but at the
++cost of worsening latencies. It is based on the fact that humans can detect
++jitter at approximately 7ms, so aiming for much lower latencies is pointless
++under most circumstances. It is worth noting this fact when comparing the
++latency performance of MuQSS to other schedulers. Worst case latencies being
++higher than 7ms are far worse than average latencies not being in the
++microsecond range.
++
++interactive:
++
++ /proc/sys/kernel/interactive
++
++The value is a simple boolean of 1 for on and 0 for off and is set to on by
++default. Disabling this will disable the near-determinism of MuQSS when
++selecting the next task by not examining all CPUs for the earliest deadline
++task, or which CPU to wake to, instead prioritising CPU balancing for improved
++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
++instead of across the whole system.
++
++Runqueue sharing.
++
++By default MuQSS chooses to share runqueue resources (specifically the skip
++list and locking) between multicore siblings. It is configurable at build time
++to select between None, SMT, MC and SMP, corresponding to no sharing, sharing
++only between simultaneous multithreading siblings, multicore siblings, or
++symmetric multiprocessing physical packages. Additionally it can be set at
++boot time with the use of the rqshare parameter. The reason for configurability
++is that some architectures have CPUs with many multicore siblings (>= 16)
++where it may be detrimental to throughput to share runqueues and another
++sharing option may be desirable. Additionally, more sharing than usual can
++improve latency on a system-wide level at the expense of throughput if desired.
++
++The options are:
++none, smt, mc, smp
++
++eg:
++ rqshare=mc
++
++Isochronous scheduling:
++
++Isochronous scheduling is a unique scheduling policy designed to provide
++near-real-time performance to unprivileged (ie non-root) users without the
++ability to starve the machine indefinitely.
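The three tunables named above (rr_interval, interactive, and iso_cpu, the last of which is described just below) all live under /proc/sys/kernel and can be read like any other sysctl entry. A minimal read-only sketch in C, assuming a MuQSS kernel so that the files actually exist:

    /* Reads the MuQSS sysctl tunables documented here; on a non-MuQSS kernel
     * the files are simply absent and the reads fail gracefully. */
    #include <stdio.h>

    static void show(const char *path)
    {
        char value[64];
        FILE *f = fopen(path, "r");

        if (!f) {
            printf("%s: not available (not a MuQSS kernel?)\n", path);
            return;
        }
        if (fgets(value, sizeof(value), f))
            printf("%s = %s", path, value); /* value keeps its newline */
        fclose(f);
    }

    int main(void)
    {
        show("/proc/sys/kernel/rr_interval"); /* milliseconds, default 6 */
        show("/proc/sys/kernel/interactive"); /* boolean, default 1 */
        show("/proc/sys/kernel/iso_cpu");     /* percent, default 70 */
        return 0;
    }

Writing a new value is the usual root-only echo into the same file, or a sysctl.d entry (kernel.rr_interval = 4, for example).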
++Isochronous tasks (which means "same time") are set using, for example, the
++schedtool application like so:
++
++ schedtool -I -e amarok
++
++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
++is that it has a priority level between true realtime tasks and SCHED_NORMAL
++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
++rate). However if ISO tasks run for more than a tunable finite amount of time,
++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
++time is the percentage of CPU available per CPU, configurable as a percentage in
++the following "resource handling" tunable (as opposed to a scheduler tunable):
++
++iso_cpu:
++
++ /proc/sys/kernel/iso_cpu
++
++and is set to 70% by default. It is calculated over a rolling 5 second average.
++Because it is the total CPU available, it means that on a multi CPU machine, it
++is possible to have an ISO task running as realtime scheduling indefinitely on
++just one CPU, as the other CPUs will be available. Setting this to 100 is the
++equivalent of giving all users SCHED_RR access and setting it to 0 removes the
++ability to run any pseudo-realtime tasks.
++
++A feature of MuQSS is that it detects when an application tries to obtain a
++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
++appropriate privileges to use those policies. When it detects this, it will
++give the task SCHED_ISO policy instead. Thus it is transparent to the user.
++
++
++Idleprio scheduling:
++
++Idleprio scheduling is a scheduling policy designed to give out CPU to a task
++_only_ when the CPU would be otherwise idle. The idea behind this is to allow
++ultra low priority tasks to be run in the background that have virtually no
++effect on the foreground tasks. This is ideally suited to distributed computing
++clients (like setiathome, folding, mprime etc) but can also be used to start a
++video encode or so on without any slowdown of other tasks. To prevent this policy
++from grabbing shared resources and holding them indefinitely, if it detects a
++state where the task is waiting on I/O, the machine is about to suspend to RAM
++and so on, it will transiently schedule the task as SCHED_NORMAL. Once a task has
++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without
++superuser privileges since it is effectively a lower scheduling policy. Tasks
++can be set to start as SCHED_IDLEPRIO with the schedtool command like so:
++
++ schedtool -D -e ./mprime
++
++Subtick accounting:
++
++It is surprisingly difficult to get accurate CPU accounting, and in many cases,
++the accounting is done by simply determining what is happening at the precise
++moment a timer tick fires off. This becomes increasingly inaccurate as the timer
++tick frequency (HZ) is lowered. It is possible to create an application which
++uses almost 100% CPU, yet by being descheduled at the right time, records zero
++CPU usage. While the main problem with this is that there are possible security
++implications, it is also difficult to determine how much CPU a task really does
++use. MuQSS uses sub-tick accounting from the TSC clock to determine real CPU
++usage. Thus, the amount of CPU reported as being used by MuQSS will more
++accurately represent how much CPU the task itself is using (as is shown for
++example by the 'time' application), so the reported values may be quite
++different to other schedulers.
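A practical consequence of this accounting difference is that cross-scheduler comparisons are best made with wall-clock measurements. The sketch below simply puts the two numbers side by side using standard POSIX clocks; nothing in it is MuQSS specific, and the spin loop stands in for real work.

    /* Measures the same workload by wall clock and by reported CPU time. */
    #include <stdio.h>
    #include <time.h>

    static double seconds(struct timespec a, struct timespec b)
    {
        return (b.tv_sec - a.tv_sec) + (b.tv_nsec - a.tv_nsec) / 1e9;
    }

    int main(void)
    {
        struct timespec w0, w1, c0, c1;
        volatile unsigned long spin = 0;

        clock_gettime(CLOCK_MONOTONIC, &w0);
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &c0);

        for (unsigned long i = 0; i < 200000000UL; i++) /* the "work" */
            spin += i;

        clock_gettime(CLOCK_MONOTONIC, &w1);
        clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &c1);

        printf("wall clock: %.3f s\n", seconds(w0, w1));
        printf("cpu time:   %.3f s\n", seconds(c0, c1));
        return 0;
    }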
When comparing throughput of MuQSS to other ++designs, it is important to compare the actual completed work in terms of total ++wall clock time taken and total work done, rather than the reported "cpu usage". ++ ++Symmetric MultiThreading (SMT) aware nice: ++ ++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs. The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the nice task gets ++precisely the same amount of CPU power as the unniced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. ++ ++ ++Con Kolivas Sat, 29th October 2016 +diff --git a/Makefile b/Makefile +index d4d36c61940b..4a9dfe471f1f 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,6 +15,10 @@ NAME = Kleptomaniac Octopus + PHONY := _all + _all: + ++CKVERSION = -ck1 ++CKNAME = MuQSS Powered ++EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) ++ + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. + # +diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig +index ef179033a7c2..14b576a531ad 100644 +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -665,6 +665,8 @@ config HZ + default 1200 if HZ_1200 + default 1024 + ++source "kernel/Kconfig.MuQSS" ++ + config SRM_ENV + tristate "SRM environment through procfs" + depends on PROC_FS +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index 3a138f8c7299..65f44e309a08 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -30,7 +30,7 @@ CONFIG_ARC_PLAT_TB10X=y + CONFIG_ARC_CACHE_LINE_SHIFT=5 + CONFIG_HZ=250 + CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_COMPACTION is not set + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index 8a50efb559f3..d8507d20c258 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -1238,6 +1238,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config HAVE_ARM_SCU + bool + help +diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig +index 519ff58e67b3..b2a05b6f7d80 100644 +--- a/arch/arm/configs/bcm2835_defconfig ++++ b/arch/arm/configs/bcm2835_defconfig +@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_ARCH_MULTI_V6=y + CONFIG_ARCH_BCM=y + CONFIG_ARCH_BCM2835=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_KSM=y + CONFIG_CLEANCACHE=y +diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig +index 0f7381ee0c37..3d747237bfed 100644 +--- a/arch/arm/configs/imx_v6_v7_defconfig ++++ b/arch/arm/configs/imx_v6_v7_defconfig +@@ -45,6 +45,7 @@ CONFIG_PCI_MSI=y + CONFIG_PCI_IMX6=y + CONFIG_SMP=y + CONFIG_ARM_PSCI=y ++CONFIG_PREEMPT=y + CONFIG_HIGHMEM=y + CONFIG_FORCE_MAX_ZONEORDER=14 + CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" +diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig +index 1d923dbb9928..9c1931f1fafd 100644 +--- a/arch/arm/configs/mps2_defconfig ++++ b/arch/arm/configs/mps2_defconfig +@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y + CONFIG_SET_MEM_PARAM=y + CONFIG_DRAM_BASE=0x21000000 + CONFIG_DRAM_SIZE=0x1000000 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_ATAGS is not set + CONFIG_ZBOOT_ROM_TEXT=0x0 + CONFIG_ZBOOT_ROM_BSS=0x0 +diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig +index 2773899c21b3..870866aaa39d 100644 +--- a/arch/arm/configs/mxs_defconfig ++++ b/arch/arm/configs/mxs_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT_VOLUNTARY=n + CONFIG_TASKSTATS=y + CONFIG_TASK_DELAY_ACCT=y + CONFIG_TASK_XACCT=y +@@ -27,6 +27,11 @@ CONFIG_MODVERSIONS=y + CONFIG_BLK_DEV_INTEGRITY=y + # CONFIG_IOSCHED_DEADLINE is not set + # CONFIG_IOSCHED_CFQ is not set ++# CONFIG_ARCH_MULTI_V7 is not set ++CONFIG_ARCH_MXS=y ++# CONFIG_ARM_THUMB is not set ++CONFIG_PREEMPT=y ++CONFIG_AEABI=y + CONFIG_NET=y + CONFIG_PACKET=y + CONFIG_UNIX=y +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 3f047afb982c..d35eae0a5c7d 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -864,6 +864,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config NR_CPUS + int "Maximum number of CPUs (2-4096)" + range 2 4096 +diff --git a/arch/blackfin/configs/BF518F-EZBRD_defconfig b/arch/blackfin/configs/BF518F-EZBRD_defconfig +new file mode 100644 +index 000000000000..39b91dfa55b5 +--- /dev/null ++++ b/arch/blackfin/configs/BF518F-EZBRD_defconfig +@@ -0,0 +1,121 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF518=y ++CONFIG_IRQ_TIMER0=12 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_SDH_BFIN=y ++CONFIG_SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_VFAT_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y 
++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/BF526-EZBRD_defconfig b/arch/blackfin/configs/BF526-EZBRD_defconfig +new file mode 100644 +index 000000000000..675cadb3a0c4 +--- /dev/null ++++ b/arch/blackfin/configs/BF526-EZBRD_defconfig +@@ -0,0 +1,158 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF526=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BFIN526_EZBRD=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y 
++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_STORAGE=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/BF527-EZKIT-V2_defconfig b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig +new file mode 100644 +index 000000000000..4c517c443af5 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig +@@ -0,0 +1,188 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_2=y ++CONFIG_BFIN527_EZKIT_V2=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# 
CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++CONFIG_KEYBOARD_ADP5520=y ++# CONFIG_KEYBOARD_ATKBD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=y ++CONFIG_TOUCHSCREEN_AD7879_I2C=y ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_PMIC_ADP5520=y ++CONFIG_FB=y ++CONFIG_FB_BFIN_LQ035Q1=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_NEW_LEDS=y ++CONFIG_LEDS_CLASS=y ++CONFIG_LEDS_ADP5520=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF527-EZKIT_defconfig b/arch/blackfin/configs/BF527-EZKIT_defconfig +new file mode 100644 +index 000000000000..bf8df3e6cf02 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-EZKIT_defconfig +@@ -0,0 +1,181 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set 
++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_1=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=y ++CONFIG_FB_BFIN_T350MCQB=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_LCD_LTV350QV=m ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_RTC_CLASS=y 
++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF527-TLL6527M_defconfig b/arch/blackfin/configs/BF527-TLL6527M_defconfig +new file mode 100644 +index 000000000000..0220b3b15c53 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-TLL6527M_defconfig +@@ -0,0 +1,178 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_LOCALVERSION="DEV_0-1_pre2010" ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_2=y ++CONFIG_BFIN527_TLL6527M=y ++CONFIG_BF527_UART1_PORTG=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++CONFIG_BOOT_LOAD=0x400000 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_0=0xFFC2 ++CONFIG_BANK_1=0xFFC2 ++CONFIG_BANK_2=0xFFC2 ++CONFIG_BANK_3=0xFFC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_GPIO_ADDR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=m ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_AD714X=y ++CONFIG_INPUT_ADXL34X=y ++# 
CONFIG_SERIO is not set ++CONFIG_BFIN_PPI=m ++CONFIG_BFIN_SIMPLE_TIMER=m ++CONFIG_BFIN_SPORT=m ++# CONFIG_CONSOLE_TRANSLATIONS is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C_CHARDEV=y ++# CONFIG_I2C_HELPER_AUTO is not set ++CONFIG_I2C_SMBUS=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_MEDIA_SUPPORT=y ++CONFIG_VIDEO_DEV=y ++# CONFIG_MEDIA_TUNER_CUSTOMISE is not set ++CONFIG_VIDEO_HELPER_CHIPS_AUTO=y ++CONFIG_VIDEO_BLACKFIN_CAM=m ++CONFIG_OV9655=y ++CONFIG_FB=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_FONTS=y ++CONFIG_FONT_6x11=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=m ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++# CONFIG_RPCSEC_GSS_KRB5 is not set ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC7=m +diff --git a/arch/blackfin/configs/BF533-EZKIT_defconfig b/arch/blackfin/configs/BF533-EZKIT_defconfig +new file mode 100644 +index 000000000000..6023e3fd2c48 +--- /dev/null ++++ b/arch/blackfin/configs/BF533-EZKIT_defconfig +@@ -0,0 +1,114 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BFIN533_EZKIT=y ++CONFIG_TIMER0=11 ++CONFIG_CLKIN_HZ=27000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m 
++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_PLATRAM=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF533-STAMP_defconfig b/arch/blackfin/configs/BF533-STAMP_defconfig +new file mode 100644 +index 000000000000..f5cd0f18b711 +--- /dev/null ++++ b/arch/blackfin/configs/BF533-STAMP_defconfig +@@ -0,0 +1,124 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_TIMER0=11 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m 
++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_GPIO=m ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FIRMWARE_EDID=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++CONFIG_SND_SOC=m ++CONFIG_SND_BF5XX_I2S=m ++CONFIG_SND_BF5XX_SOC_AD73311=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig +new file mode 100644 +index 000000000000..48085fde7f9e +--- /dev/null ++++ b/arch/blackfin/configs/BF537-STAMP_defconfig +@@ -0,0 +1,136 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR1=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y 
++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=m ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FIRMWARE_EDID=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++CONFIG_SND_SOC=m ++CONFIG_SND_BF5XX_I2S=m ++CONFIG_SND_BF5XX_SOC_AD73311=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF538-EZKIT_defconfig b/arch/blackfin/configs/BF538-EZKIT_defconfig +new file mode 100644 +index 000000000000..12deeaaef3cb +--- /dev/null ++++ b/arch/blackfin/configs/BF538-EZKIT_defconfig +@@ -0,0 +1,133 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF538=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_IRQ_TIMER1=12 ++CONFIG_IRQ_TIMER2=12 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is 
not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_DEV=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=m ++CONFIG_MTD_NAND=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_PHYLIB=y ++CONFIG_SMSC_PHY=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMC91X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=y ++CONFIG_TOUCHSCREEN_AD7879_SPI=y ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++CONFIG_SERIAL_BFIN_UART2=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FB_BFIN_LQ035Q1=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig +new file mode 100644 +index 000000000000..6a68ffc55b5a +--- /dev/null ++++ b/arch/blackfin/configs/BF548-EZKIT_defconfig +@@ -0,0 +1,207 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF548_std=y ++CONFIG_IRQ_TIMER0=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_CACHELINE_ALIGNED_L1=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_BFIN_EXTMEM_WRITETHROUGH=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_EBIU_MBSCTLVAL=0x0 ++CONFIG_EBIU_MODEVAL=0x1 
++CONFIG_EBIU_FCTLVAL=0x6 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR3=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_FW_LOADER=m ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=y ++CONFIG_MTD_NAND_BF5XX=y ++# CONFIG_MTD_NAND_BF5XX_HWECC is not set ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_ATA=y ++# CONFIG_SATA_PMP is not set ++CONFIG_PATA_BF54X=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMSC911X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++CONFIG_INPUT_EVBUG=m ++# CONFIG_KEYBOARD_ATKBD is not set ++CONFIG_KEYBOARD_BFIN=y ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7877=m ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=y ++CONFIG_FIRMWARE_EDID=y ++CONFIG_FB_BF54X_LQ043=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_FONTS=y ++CONFIG_FONT_6x11=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_AC97=y ++CONFIG_SND_BF5XX_SOC_AD1980=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_MMC=y ++CONFIG_MMC_BLOCK=m ++CONFIG_SDH_BFIN=y ++CONFIG_SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND=y 
++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_NTFS_FS=m ++CONFIG_NTFS_RW=y ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3=y ++CONFIG_CIFS=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-ACVILON_defconfig b/arch/blackfin/configs/BF561-ACVILON_defconfig +new file mode 100644 +index 000000000000..e9f3ba783a4e +--- /dev/null ++++ b/arch/blackfin/configs/BF561-ACVILON_defconfig +@@ -0,0 +1,149 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_SYSFS_DEPRECATED_V2=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_BF_REV_0_5=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_BFIN561_ACVILON=y ++# CONFIG_BF561_COREB is not set ++CONFIG_CLKIN_HZ=12000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_DMA_UNCACHED_4M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_0=0x99b2 ++CONFIG_BANK_1=0x3350 ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++CONFIG_SYN_COOKIES=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_PLATRAM=y ++CONFIG_MTD_PHRAM=y ++CONFIG_MTD_BLOCK2MTD=y ++CONFIG_MTD_NAND=y ++CONFIG_MTD_NAND_PLATFORM=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_COUNT=2 ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMSC911X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_PIO=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_PCA_PLATFORM=y ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_SPI_SPIDEV=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_GPIO_PCF857X=y ++CONFIG_SENSORS_LM75=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++# CONFIG_SND_DRIVERS is not set ++# 
CONFIG_SND_USB is not set ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SPORT_NUM=1 ++CONFIG_USB=y ++CONFIG_USB_ANNOUNCE_NEW_DEVICES=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_MON=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_SERIAL=y ++CONFIG_USB_SERIAL_FTDI_SIO=y ++CONFIG_USB_SERIAL_PL2303=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_DS1307=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_FAT_DEFAULT_CODEPAGE=866 ++CONFIG_FAT_DEFAULT_IOCHARSET="cp1251" ++CONFIG_NTFS_FS=y ++CONFIG_CONFIGFS_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_JFFS2_COMPRESSION_OPTIONS=y ++# CONFIG_JFFS2_ZLIB is not set ++CONFIG_JFFS2_LZO=y ++# CONFIG_JFFS2_RTIME is not set ++CONFIG_JFFS2_CMODE_FAVOURLZO=y ++CONFIG_CRAMFS=y ++CONFIG_MINIX_FS=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_ROOT_NFS=y ++CONFIG_NLS_DEFAULT="cp1251" ++CONFIG_NLS_CODEPAGE_866=y ++CONFIG_NLS_CODEPAGE_1251=y ++CONFIG_NLS_KOI8_R=y ++CONFIG_NLS_UTF8=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++# CONFIG_DEBUG_BUGVERBOSE is not set ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_CPLB_INFO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +new file mode 100644 +index 000000000000..89b75a6c3fab +--- /dev/null ++++ b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +@@ -0,0 +1,112 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_SMP=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_CLKIN_HZ=30000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set 
++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-EZKIT_defconfig b/arch/blackfin/configs/BF561-EZKIT_defconfig +new file mode 100644 +index 000000000000..67b3d2f419ba +--- /dev/null ++++ b/arch/blackfin/configs/BF561-EZKIT_defconfig +@@ -0,0 +1,114 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_CLKIN_HZ=30000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_BFIN_EXTMEM_WRITETHROUGH=y ++CONFIG_BFIN_L2_DCACHEABLE=y ++CONFIG_BFIN_L2_WRITETHROUGH=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# 
CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF609-EZKIT_defconfig b/arch/blackfin/configs/BF609-EZKIT_defconfig +new file mode 100644 +index 000000000000..8cc75d4218fb +--- /dev/null ++++ b/arch/blackfin/configs/BF609-EZKIT_defconfig +@@ -0,0 +1,154 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF609=y ++CONFIG_PINT1_ASSIGN=0x01010000 ++CONFIG_PINT2_ASSIGN=0x07000101 ++CONFIG_PINT3_ASSIGN=0x02020303 ++CONFIG_IP_CHECKSUM_L1=y ++CONFIG_SYSCALL_TAB_L1=y ++CONFIG_CPLB_SWITCH_TAB_L1=y ++# CONFIG_APP_STACK_L1 is not set ++# CONFIG_BFIN_INS_LOWOVERHEAD is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM_BFIN_WAKE_PE12=y ++CONFIG_PM_BFIN_WAKE_PE12_POL=1 ++CONFIG_CPU_FREQ=y ++CONFIG_CPU_FREQ_GOV_POWERSAVE=y ++CONFIG_CPU_FREQ_GOV_ONDEMAND=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_IP_PNP_BOOTP=y ++CONFIG_IP_PNP_RARP=y ++# CONFIG_IPV6 is not set ++CONFIG_NETFILTER=y ++CONFIG_CAN=y ++CONFIG_CAN_BFIN=y ++CONFIG_IRDA=y ++CONFIG_IRTTY_SIR=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_FW_LOADER=m ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_CFI_STAA=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_MTD_UBI=m ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++CONFIG_STMMAC_ETH=y ++CONFIG_STMMAC_IEEE1588=y ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y 
++CONFIG_INPUT_BFIN_ROTARY=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_SIMPLE_TIMER=m ++# CONFIG_BFIN_CRC is not set ++CONFIG_BFIN_LINKPORT=y ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_ADI_V3=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_PINCTRL_MCP23S08=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++# CONFIG_SND_DRIVERS is not set ++# CONFIG_SND_SPI is not set ++# CONFIG_SND_USB is not set ++CONFIG_SND_SOC=m ++CONFIG_USB=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=m ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_USB_GADGET_MUSB_HDRC=y ++CONFIG_USB_ZERO=y ++CONFIG_MMC=y ++CONFIG_SDH_BFIN=y ++# CONFIG_IOMMU_SUPPORT is not set ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=m ++CONFIG_UBIFS_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ISO8859_1=y ++CONFIG_DEBUG_FS=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++CONFIG_FRAME_POINTER=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO_HMAC=m ++CONFIG_CRYPTO_MD4=m ++CONFIG_CRYPTO_MD5=m ++CONFIG_CRYPTO_ARC4=m ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRYPTO_DEV_BFIN_CRC=m +diff --git a/arch/blackfin/configs/BlackStamp_defconfig b/arch/blackfin/configs/BlackStamp_defconfig +new file mode 100644 +index 000000000000..9faf0ec7007f +--- /dev/null ++++ b/arch/blackfin/configs/BlackStamp_defconfig +@@ -0,0 +1,108 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_SYSFS_DEPRECATED_V2=y ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF532=y ++CONFIG_BF_REV_0_5=y ++CONFIG_BLACKSTAMP=y ++CONFIG_TIMER0=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_ROMKERNEL=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_BINFMT_SHARED_FLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_NBD=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_MISC_DEVICES=y ++CONFIG_EEPROM_AT25=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMC91X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is 
not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_GPIO=m ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_SPI_SPIDEV=m ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_MMC_SPI=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_NFS_V4=y ++CONFIG_SMB_FS=y ++CONFIG_CIFS=y ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_UTF8=y ++CONFIG_SYSCTL_SYSCALL_CHECK=y ++CONFIG_DEBUG_MMRS=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/CM-BF527_defconfig b/arch/blackfin/configs/CM-BF527_defconfig +new file mode 100644 +index 000000000000..4a1ad4fd7bb2 +--- /dev/null ++++ b/arch/blackfin/configs/CM-BF527_defconfig +@@ -0,0 +1,129 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_KERNEL_LZMA=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_RD_GZIP is not set ++CONFIG_RD_LZMA=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_1=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BFIN527_BLUETECHNIX_CM=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xFFC0 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_GPIO_ADDR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y 
++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_USB=m ++CONFIG_USB_ANNOUNCE_NEW_DEVICES=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=m ++CONFIG_USB_MUSB_HDRC=m ++CONFIG_USB_MUSB_PERIPHERAL=y ++CONFIG_USB_GADGET_MUSB_HDRC=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_STORAGE=m ++CONFIG_USB_GADGET=m ++CONFIG_USB_ETH=m ++CONFIG_USB_MASS_STORAGE=m ++CONFIG_USB_G_SERIAL=m ++CONFIG_USB_G_PRINTER=m ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ISO8859_1=y ++CONFIG_DEBUG_FS=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_EARLY_PRINTK=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m ++CONFIG_CRC_ITU_T=y ++CONFIG_CRC7=y +diff --git a/arch/blackfin/configs/PNAV-10_defconfig b/arch/blackfin/configs/PNAV-10_defconfig +new file mode 100644 +index 000000000000..9d787e28bbe8 +--- /dev/null ++++ b/arch/blackfin/configs/PNAV-10_defconfig +@@ -0,0 +1,111 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_LOG_BUF_SHIFT=14 ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_PNAV10=y ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++CONFIG_IP_CHECKSUM_L1=y ++CONFIG_SYSCALL_TAB_L1=y ++CONFIG_CPLB_SWITCH_TAB_L1=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_1=0x33B0 ++CONFIG_BANK_2=0x33B0 ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_UCLINUX=y ++CONFIG_MTD_NAND=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_BFIN_MAC_USE_L1 is not set ++CONFIG_BFIN_TX_DESC_NUM=100 ++CONFIG_BFIN_RX_DESC_NUM=100 ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7877=y ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y 
++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_FB=y ++CONFIG_FIRMWARE_EDID=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_LCD_CLASS_DEVICE=y ++CONFIG_BACKLIGHT_CLASS_DEVICE=y ++CONFIG_SOUND=y ++CONFIG_SND=m ++# CONFIG_SND_SUPPORT_OLD_API is not set ++# CONFIG_SND_VERBOSE_PROCFS is not set ++CONFIG_SOUND_PRIME=y ++# CONFIG_HID is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_DEBUG_HUNT_FOR_ZERO is not set ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++# CONFIG_ACCESS_CHECK is not set ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/SRV1_defconfig b/arch/blackfin/configs/SRV1_defconfig +new file mode 100644 +index 000000000000..225df32dc9a8 +--- /dev/null ++++ b/arch/blackfin/configs/SRV1_defconfig +@@ -0,0 +1,88 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++CONFIG_KALLSYMS_ALL=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BOOT_LOAD=0x400000 ++CONFIG_CLKIN_HZ=22118400 ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_UCLINUX=y ++CONFIG_MTD_NAND=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_MISC_DEVICES=y ++CONFIG_EEPROM_AT25=m ++CONFIG_NETDEVICES=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_HWMON=m ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_HID is not set ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_DEBUG_KERNEL=y ++# CONFIG_DEBUG_BUGVERBOSE is not set ++CONFIG_DEBUG_INFO=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_CPLB_INFO=y +diff --git a/arch/blackfin/configs/TCM-BF518_defconfig b/arch/blackfin/configs/TCM-BF518_defconfig +new file mode 100644 +index 000000000000..425c24e43c34 +--- /dev/null ++++ b/arch/blackfin/configs/TCM-BF518_defconfig +@@ -0,0 +1,131 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_KERNEL_LZMA=y ++CONFIG_SYSVIPC=y 
++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_RD_GZIP is not set ++CONFIG_RD_LZMA=y ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF518=y ++CONFIG_BF_REV_0_1=y ++CONFIG_BFIN518F_TCM=y ++CONFIG_IRQ_TIMER0=12 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_ADV_OPTIONS=y ++CONFIG_MTD_CFI_GEOMETRY=y ++# CONFIG_MTD_MAP_BANK_WIDTH_1 is not set ++# CONFIG_MTD_MAP_BANK_WIDTH_4 is not set ++# CONFIG_MTD_CFI_I2 is not set ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_MMC_DEBUG=y ++CONFIG_MMC_SPI=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_VFAT_FS=m ++# CONFIG_MISC_FILESYSTEMS is not set ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_ROOT_NFS=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig +index 
7a7af706e898..be19bf122fde 100644 +--- a/arch/mips/configs/fuloong2e_defconfig ++++ b/arch/mips/configs/fuloong2e_defconfig +@@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig +index 9085f4d6c698..fb23111d45f6 100644 +--- a/arch/mips/configs/gpr_defconfig ++++ b/arch/mips/configs/gpr_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig +index 21a1168ae301..529a1b1007cf 100644 +--- a/arch/mips/configs/ip22_defconfig ++++ b/arch/mips/configs/ip22_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig +index 0921ef38e9fb..6da05cef46f8 100644 +--- a/arch/mips/configs/ip28_defconfig ++++ b/arch/mips/configs/ip28_defconfig +@@ -1,5 +1,5 @@ + CONFIG_SYSVIPC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig +index 328d4dfeb4cb..e17cb23173ea 100644 +--- a/arch/mips/configs/jazz_defconfig ++++ b/arch/mips/configs/jazz_defconfig +@@ -1,6 +1,6 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig +index 914af125a7fa..76a64290373f 100644 +--- a/arch/mips/configs/mtx1_defconfig ++++ b/arch/mips/configs/mtx1_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig +index 4ecb157e56d4..ea7309283b01 100644 +--- a/arch/mips/configs/nlm_xlr_defconfig ++++ b/arch/mips/configs/nlm_xlr_defconfig +@@ -1,10 +1,10 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_TASKSTATS=y +diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig +index 63fe2da1b37f..7f08ee237345 100644 +--- a/arch/mips/configs/pic32mzda_defconfig ++++ b/arch/mips/configs/pic32mzda_defconfig +@@ -1,7 +1,7 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig +index 24e07180c57d..38582e8f71c4 100644 +--- a/arch/mips/configs/pistachio_defconfig ++++ b/arch/mips/configs/pistachio_defconfig +@@ -1,9 +1,9 @@ 
++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_DEFAULT_HOSTNAME="localhost" + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=m + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=18 +diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig +index 738ba3b1374b..6a3267e8aa0d 100644 +--- a/arch/mips/configs/pnx8335_stb225_defconfig ++++ b/arch/mips/configs/pnx8335_stb225_defconfig +@@ -1,9 +1,9 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + # CONFIG_SWAP is not set + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_LOG_BUF_SHIFT=14 + CONFIG_EXPERT=y + CONFIG_SLAB=y +diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig +index 2c7adea7638f..1c82d62bee72 100644 +--- a/arch/mips/configs/rm200_defconfig ++++ b/arch/mips/configs/rm200_defconfig +@@ -1,6 +1,6 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig +index d3e3d94e90c3..578524f80cc4 100644 +--- a/arch/parisc/configs/712_defconfig ++++ b/arch/parisc/configs/712_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_GSC_LASI=y + # CONFIG_PDC_CHASSIS is not set + CONFIG_BINFMT_MISC=m +diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig +index 64d45a8b6ca0..d1bdfad94048 100644 +--- a/arch/parisc/configs/c3000_defconfig ++++ b/arch/parisc/configs/c3000_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA8X00=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_GSC is not set + CONFIG_PCI=y + CONFIG_PCI_LBA=y +diff --git a/arch/parisc/configs/defconfig b/arch/parisc/configs/defconfig +index 5b877ca34ebf..0d976614934c 100644 +--- a/arch/parisc/configs/defconfig ++++ b/arch/parisc/configs/defconfig +@@ -14,7 +14,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IOMMU_CCIO=y + CONFIG_GSC_LASI=y + CONFIG_GSC_WAX=y +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 3e56c9c2f16e..ecee9c2a0062 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -853,6 +853,8 @@ config SCHED_SMT + when dealing with POWER5 cpus at a cost of slightly increased + overhead in some places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config PPC_DENORMALISATION + bool "PowerPC denormalisation exception handling" + depends on PPC_BOOK3S_64 +diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig +new file mode 100644 +index 000000000000..04fee07ea6c5 +--- /dev/null ++++ b/arch/powerpc/configs/c2k_defconfig +@@ -0,0 +1,389 @@ ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_AUDIT=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_PROFILING=y ++CONFIG_OPROFILE=m ++CONFIG_KPROBES=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODVERSIONS=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++# CONFIG_PPC_CHRP is not set ++# CONFIG_PPC_PMAC is not set ++CONFIG_EMBEDDED6xx=y ++CONFIG_PPC_C2K=y ++CONFIG_CPU_FREQ=y ++CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y ++CONFIG_CPU_FREQ_GOV_PERFORMANCE=y ++CONFIG_CPU_FREQ_GOV_POWERSAVE=m ++CONFIG_CPU_FREQ_GOV_ONDEMAND=m ++CONFIG_GEN_RTC=y ++CONFIG_HIGHMEM=y ++CONFIG_PREEMPT=y ++CONFIG_BINFMT_MISC=y ++CONFIG_PM=y ++CONFIG_PCI_MSI=y ++CONFIG_HOTPLUG_PCI=y ++CONFIG_HOTPLUG_PCI_SHPC=m ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_NET_KEY=m ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m ++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_NETFILTER=y ++# CONFIG_NETFILTER_XT_MATCH_SCTP is not set ++CONFIG_IP_NF_IPTABLES=m ++CONFIG_IP_NF_MATCH_ECN=m ++CONFIG_IP_NF_MATCH_TTL=m ++CONFIG_IP_NF_FILTER=m ++CONFIG_IP_NF_TARGET_REJECT=m ++CONFIG_IP_NF_MANGLE=m ++CONFIG_IP_NF_TARGET_ECN=m ++CONFIG_IP_NF_RAW=m ++CONFIG_IP_NF_ARPTABLES=m ++CONFIG_IP_NF_ARPFILTER=m ++CONFIG_IP_NF_ARP_MANGLE=m ++CONFIG_IP6_NF_IPTABLES=m ++CONFIG_IP6_NF_MATCH_EUI64=m ++CONFIG_IP6_NF_MATCH_FRAG=m ++CONFIG_IP6_NF_MATCH_OPTS=m ++CONFIG_IP6_NF_MATCH_HL=m ++CONFIG_IP6_NF_MATCH_IPV6HEADER=m ++CONFIG_IP6_NF_MATCH_RT=m ++CONFIG_IP6_NF_FILTER=m ++CONFIG_IP6_NF_MANGLE=m ++CONFIG_IP6_NF_RAW=m ++CONFIG_BRIDGE_NF_EBTABLES=m ++CONFIG_BRIDGE_EBT_BROUTE=m ++CONFIG_BRIDGE_EBT_T_FILTER=m ++CONFIG_BRIDGE_EBT_T_NAT=m ++CONFIG_BRIDGE_EBT_802_3=m ++CONFIG_BRIDGE_EBT_AMONG=m ++CONFIG_BRIDGE_EBT_ARP=m ++CONFIG_BRIDGE_EBT_IP=m ++CONFIG_BRIDGE_EBT_LIMIT=m ++CONFIG_BRIDGE_EBT_MARK=m ++CONFIG_BRIDGE_EBT_PKTTYPE=m ++CONFIG_BRIDGE_EBT_STP=m ++CONFIG_BRIDGE_EBT_VLAN=m ++CONFIG_BRIDGE_EBT_ARPREPLY=m ++CONFIG_BRIDGE_EBT_DNAT=m ++CONFIG_BRIDGE_EBT_MARK_T=m ++CONFIG_BRIDGE_EBT_REDIRECT=m ++CONFIG_BRIDGE_EBT_SNAT=m ++CONFIG_BRIDGE_EBT_LOG=m ++CONFIG_IP_SCTP=m ++CONFIG_ATM=m ++CONFIG_ATM_CLIP=m ++CONFIG_ATM_LANE=m ++CONFIG_ATM_BR2684=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_ATM=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m 
++CONFIG_CLS_U32_PERF=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_IND=y ++CONFIG_BT=m ++CONFIG_BT_RFCOMM=m ++CONFIG_BT_RFCOMM_TTY=y ++CONFIG_BT_BNEP=m ++CONFIG_BT_BNEP_MC_FILTER=y ++CONFIG_BT_BNEP_PROTO_FILTER=y ++CONFIG_BT_HIDP=m ++CONFIG_BT_HCIUART=m ++CONFIG_BT_HCIUART_H4=y ++CONFIG_BT_HCIUART_BCSP=y ++CONFIG_BT_HCIBCM203X=m ++CONFIG_BT_HCIBFUSB=m ++CONFIG_BT_HCIVHCI=m ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP_OF=y ++CONFIG_BLK_DEV_LOOP=m ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_NBD=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_SCSI=m ++CONFIG_BLK_DEV_SD=m ++CONFIG_CHR_DEV_ST=m ++CONFIG_CHR_DEV_OSST=m ++CONFIG_BLK_DEV_SR=m ++CONFIG_BLK_DEV_SR_VENDOR=y ++CONFIG_CHR_DEV_SG=m ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_SCSI_ISCSI_ATTRS=m ++CONFIG_BLK_DEV_3W_XXXX_RAID=m ++CONFIG_SCSI_3W_9XXX=m ++CONFIG_SCSI_ACARD=m ++CONFIG_SCSI_AACRAID=m ++CONFIG_SCSI_AIC7XXX=m ++CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 ++CONFIG_AIC7XXX_RESET_DELAY_MS=15000 ++# CONFIG_AIC7XXX_DEBUG_ENABLE is not set ++# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set ++CONFIG_SCSI_AIC79XX=m ++CONFIG_AIC79XX_CMDS_PER_DEVICE=4 ++CONFIG_AIC79XX_RESET_DELAY_MS=15000 ++# CONFIG_AIC79XX_DEBUG_ENABLE is not set ++# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set ++CONFIG_SCSI_ARCMSR=m ++CONFIG_MEGARAID_NEWGEN=y ++CONFIG_MEGARAID_MM=m ++CONFIG_MEGARAID_MAILBOX=m ++CONFIG_MEGARAID_SAS=m ++CONFIG_SCSI_GDTH=m ++CONFIG_SCSI_IPS=m ++CONFIG_SCSI_INITIO=m ++CONFIG_SCSI_SYM53C8XX_2=m ++CONFIG_SCSI_QLOGIC_1280=m ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_NETCONSOLE=m ++CONFIG_TUN=m ++# CONFIG_ATM_DRIVERS is not set ++CONFIG_MV643XX_ETH=y ++CONFIG_VITESSE_PHY=y ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=m ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_SERIAL_NONSTANDARD=y ++CONFIG_SERIAL_MPSC=y ++CONFIG_SERIAL_MPSC_CONSOLE=y ++CONFIG_NVRAM=m ++CONFIG_RAW_DRIVER=y ++CONFIG_MAX_RAW_DEVS=8192 ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_MV64XXX=m ++CONFIG_HWMON=m ++CONFIG_SENSORS_ADM1021=m ++CONFIG_SENSORS_ADM1025=m ++CONFIG_SENSORS_ADM1026=m ++CONFIG_SENSORS_ADM1031=m ++CONFIG_SENSORS_DS1621=m ++CONFIG_SENSORS_GL518SM=m ++CONFIG_SENSORS_MAX1619=m ++CONFIG_SENSORS_LM75=m ++CONFIG_SENSORS_LM77=m ++CONFIG_SENSORS_LM78=m ++CONFIG_SENSORS_LM80=m ++CONFIG_SENSORS_LM83=m ++CONFIG_SENSORS_LM85=m ++CONFIG_SENSORS_LM87=m ++CONFIG_SENSORS_LM90=m ++CONFIG_SENSORS_PCF8591=m ++CONFIG_SENSORS_VIA686A=m ++CONFIG_SENSORS_W83781D=m ++CONFIG_SENSORS_W83L785TS=m ++CONFIG_WATCHDOG=y ++CONFIG_SOFT_WATCHDOG=m ++CONFIG_PCIPCWATCHDOG=m ++CONFIG_WDTPCI=m ++CONFIG_USBPCWATCHDOG=m ++# CONFIG_VGA_CONSOLE is not set ++CONFIG_USB=m ++CONFIG_USB_MON=m ++CONFIG_USB_EHCI_HCD=m ++CONFIG_USB_EHCI_ROOT_HUB_TT=y ++CONFIG_USB_OHCI_HCD=m ++CONFIG_USB_OHCI_HCD_PPC_OF_BE=y ++CONFIG_USB_UHCI_HCD=m ++CONFIG_USB_ACM=m ++CONFIG_USB_PRINTER=m ++CONFIG_USB_STORAGE=m ++CONFIG_USB_STORAGE_DATAFAB=m ++CONFIG_USB_STORAGE_FREECOM=m ++CONFIG_USB_STORAGE_ISD200=m ++CONFIG_USB_STORAGE_SDDR09=m ++CONFIG_USB_STORAGE_SDDR55=m ++CONFIG_USB_STORAGE_JUMPSHOT=m ++CONFIG_USB_MDC800=m ++CONFIG_USB_MICROTEK=m ++CONFIG_USB_SERIAL=m ++CONFIG_USB_SERIAL_GENERIC=y ++CONFIG_USB_SERIAL_BELKIN=m ++CONFIG_USB_SERIAL_WHITEHEAT=m ++CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m 
++CONFIG_USB_SERIAL_EMPEG=m ++CONFIG_USB_SERIAL_FTDI_SIO=m ++CONFIG_USB_SERIAL_VISOR=m ++CONFIG_USB_SERIAL_IPAQ=m ++CONFIG_USB_SERIAL_IR=m ++CONFIG_USB_SERIAL_EDGEPORT=m ++CONFIG_USB_SERIAL_EDGEPORT_TI=m ++CONFIG_USB_SERIAL_KEYSPAN_PDA=m ++CONFIG_USB_SERIAL_KEYSPAN=m ++CONFIG_USB_SERIAL_KLSI=m ++CONFIG_USB_SERIAL_KOBIL_SCT=m ++CONFIG_USB_SERIAL_MCT_U232=m ++CONFIG_USB_SERIAL_PL2303=m ++CONFIG_USB_SERIAL_SAFE=m ++CONFIG_USB_SERIAL_SAFE_PADDED=y ++CONFIG_USB_SERIAL_CYBERJACK=m ++CONFIG_USB_SERIAL_XIRCOM=m ++CONFIG_USB_SERIAL_OMNINET=m ++CONFIG_USB_EMI62=m ++CONFIG_USB_RIO500=m ++CONFIG_USB_LEGOTOWER=m ++CONFIG_USB_LCD=m ++CONFIG_USB_LED=m ++CONFIG_USB_TEST=m ++CONFIG_USB_ATM=m ++CONFIG_USB_SPEEDTOUCH=m ++CONFIG_INFINIBAND=m ++CONFIG_INFINIBAND_USER_MAD=m ++CONFIG_INFINIBAND_USER_ACCESS=m ++CONFIG_INFINIBAND_MTHCA=m ++CONFIG_INFINIBAND_IPOIB=m ++CONFIG_INFINIBAND_IPOIB_CM=y ++CONFIG_INFINIBAND_SRP=m ++CONFIG_DMADEVICES=y ++CONFIG_EXT4_FS=m ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_QUOTA=y ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_HFS_FS=m ++CONFIG_HFSPLUS_FS=m ++CONFIG_JFFS2_FS=y ++CONFIG_CRAMFS=m ++CONFIG_VXFS_FS=m ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=y ++CONFIG_ROOT_NFS=y ++CONFIG_CIFS=m ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_NLS=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_CRC_CCITT=m ++CONFIG_CRC_T10DIF=m ++CONFIG_DEBUG_INFO=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_STACK_USAGE=y ++CONFIG_DEBUG_HIGHMEM=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_SPINLOCK=y ++CONFIG_BOOTX_TEXT=y ++CONFIG_PPC_EARLY_DEBUG=y ++CONFIG_SECURITY=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y ++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m +diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig +index 9dca4cffa623..09d38c3e59a5 100644 +--- a/arch/powerpc/configs/ppc6xx_defconfig ++++ b/arch/powerpc/configs/ppc6xx_defconfig +@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y + CONFIG_MCU_MPC8349EMITX=y + CONFIG_HIGHMEM=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y 
++CONFIG_PREEMPT=y + CONFIG_BINFMT_MISC=y + CONFIG_HIBERNATION=y + CONFIG_PM_DEBUG=y +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/score/configs/spct6600_defconfig b/arch/score/configs/spct6600_defconfig +new file mode 100644 +index 000000000000..46434ca1fa10 +--- /dev/null ++++ b/arch/score/configs/spct6600_defconfig +@@ -0,0 +1,84 @@ ++CONFIG_HZ_100=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y ++# CONFIG_LOCALVERSION_AUTO is not set ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_LOG_BUF_SHIFT=12 ++CONFIG_SYSFS_DEPRECATED_V2=y ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_KALLSYMS is not set ++# CONFIG_HOTPLUG is not set ++CONFIG_SLAB=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_UNIX=y ++CONFIG_NET_KEY=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_ARPD=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_STANDALONE is not set ++# CONFIG_PREVENT_FIRMWARE_BUILD is not set ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_COUNT=1 ++# CONFIG_MISC_DEVICES is not set ++CONFIG_NETDEVICES=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++CONFIG_SERIAL_NONSTANDARD=y ++CONFIG_STALDRV=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_RAW_DRIVER=y ++CONFIG_MAX_RAW_DEVS=8192 ++# CONFIG_HWMON is not set ++# CONFIG_VGA_CONSOLE is not set ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT3_FS=y ++# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_AUTOFS_FS=y ++CONFIG_AUTOFS4_FS=y ++CONFIG_PROC_KCORE=y ++# CONFIG_PROC_PAGE_MONITOR is not set ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=y ++CONFIG_NFSD=y ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++CONFIG_SECURITY=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_CRYPTO_NULL=y ++CONFIG_CRYPTO_CRYPTD=y ++CONFIG_CRYPTO_SEQIV=y ++CONFIG_CRYPTO_MD4=y ++CONFIG_CRYPTO_MICHAEL_MIC=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++# CONFIG_CRYPTO_HW is not set ++CONFIG_CRC_CCITT=y ++CONFIG_CRC16=y ++CONFIG_LIBCRC32C=y +diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig +index 9a527f978106..5895f2cc726e 100644 +--- a/arch/sh/configs/se7712_defconfig ++++ b/arch/sh/configs/se7712_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=66666666 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + 
CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" + CONFIG_NET=y +diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig +index 3b0e1eb6e874..e296a2cd9903 100644 +--- a/arch/sh/configs/se7721_defconfig ++++ b/arch/sh/configs/se7721_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_7721_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=33333333 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" + CONFIG_NET=y +diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig +index 4ec961ace688..a03a1ad670a0 100644 +--- a/arch/sh/configs/titan_defconfig ++++ b/arch/sh/configs/titan_defconfig +@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y + CONFIG_SH_PCLK_FREQ=30000000 + CONFIG_SH_DMA=y + CONFIG_SH_DMA_API=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" + CONFIG_PCI=y +diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig +index 6c325d53a20a..98d4ef3d76cf 100644 +--- a/arch/sparc/configs/sparc64_defconfig ++++ b/arch/sparc/configs/sparc64_defconfig +@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NUMA=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_SUN_LDOMS=y + CONFIG_PCI=y + CONFIG_PCI_MSI=y +diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig +new file mode 100644 +index 000000000000..939c63ba7e6e +--- /dev/null ++++ b/arch/tile/configs/tilegx_defconfig +@@ -0,0 +1,411 @@ ++CONFIG_TILEGX=y ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_FHANDLE=y ++CONFIG_AUDIT=y ++CONFIG_NO_HZ=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BSD_PROCESS_ACCT_V3=y ++CONFIG_TASKSTATS=y ++CONFIG_TASK_DELAY_ACCT=y ++CONFIG_TASK_XACCT=y ++CONFIG_TASK_IO_ACCOUNTING=y ++CONFIG_LOG_BUF_SHIFT=19 ++CONFIG_CGROUPS=y ++CONFIG_CGROUP_DEBUG=y ++CONFIG_CGROUP_DEVICE=y ++CONFIG_CPUSETS=y ++CONFIG_CGROUP_CPUACCT=y ++CONFIG_CGROUP_SCHED=y ++CONFIG_RT_GROUP_SCHED=y ++CONFIG_BLK_CGROUP=y ++CONFIG_NAMESPACES=y ++CONFIG_RELAY=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_RD_XZ=y ++CONFIG_SYSCTL_SYSCALL=y ++CONFIG_EMBEDDED=y ++# CONFIG_COMPAT_BRK is not set ++CONFIG_PROFILING=y ++CONFIG_KPROBES=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_BLK_DEV_INTEGRITY=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_AMIGA_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++CONFIG_KARMA_PARTITION=y ++CONFIG_CFQ_GROUP_IOSCHED=y ++CONFIG_NR_CPUS=100 ++CONFIG_HZ_100=y ++# CONFIG_COMPACTION is not set ++CONFIG_PREEMPT=y ++CONFIG_TILE_PCI_IO=y ++CONFIG_PCI_DEBUG=y ++# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_XFRM_SUB_POLICY=y ++CONFIG_XFRM_STATISTICS=y ++CONFIG_NET_KEY=m ++CONFIG_NET_KEY_MIGRATE=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m 
++CONFIG_INET_XFRM_MODE_TRANSPORT=m ++CONFIG_INET_XFRM_MODE_TUNNEL=m ++CONFIG_INET_XFRM_MODE_BEET=m ++CONFIG_INET_DIAG=m ++CONFIG_TCP_CONG_ADVANCED=y ++CONFIG_TCP_CONG_HSTCP=m ++CONFIG_TCP_CONG_HYBLA=m ++CONFIG_TCP_CONG_SCALABLE=m ++CONFIG_TCP_CONG_LP=m ++CONFIG_TCP_CONG_VENO=m ++CONFIG_TCP_CONG_YEAH=m ++CONFIG_TCP_CONG_ILLINOIS=m ++CONFIG_TCP_MD5SIG=y ++CONFIG_IPV6=y ++CONFIG_IPV6_ROUTER_PREF=y ++CONFIG_IPV6_ROUTE_INFO=y ++CONFIG_IPV6_OPTIMISTIC_DAD=y ++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_MIP6=m ++CONFIG_INET6_XFRM_MODE_TRANSPORT=m ++CONFIG_INET6_XFRM_MODE_TUNNEL=m ++CONFIG_INET6_XFRM_MODE_BEET=m ++CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m ++CONFIG_IPV6_SIT=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_IPV6_MULTIPLE_TABLES=y ++CONFIG_IPV6_MROUTE=y ++CONFIG_IPV6_PIMSM_V2=y ++CONFIG_NETLABEL=y ++CONFIG_RDS=m ++CONFIG_RDS_TCP=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_VLAN_8021Q_GVRP=y ++CONFIG_PHONET=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_MULTIQ=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_SCH_DRR=m ++CONFIG_NET_SCH_INGRESS=m ++CONFIG_NET_CLS_BASIC=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m ++CONFIG_CLS_U32_PERF=y ++CONFIG_CLS_U32_MARK=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_FLOW=m ++CONFIG_NET_CLS_CGROUP=y ++CONFIG_NET_EMATCH=y ++CONFIG_NET_EMATCH_CMP=m ++CONFIG_NET_EMATCH_NBYTE=m ++CONFIG_NET_EMATCH_U32=m ++CONFIG_NET_EMATCH_META=m ++CONFIG_NET_EMATCH_TEXT=m ++CONFIG_NET_CLS_ACT=y ++CONFIG_NET_ACT_POLICE=m ++CONFIG_NET_ACT_GACT=m ++CONFIG_GACT_PROB=y ++CONFIG_NET_ACT_MIRRED=m ++CONFIG_NET_ACT_NAT=m ++CONFIG_NET_ACT_PEDIT=m ++CONFIG_NET_ACT_SIMP=m ++CONFIG_NET_ACT_SKBEDIT=m ++CONFIG_NET_CLS_IND=y ++CONFIG_DCB=y ++CONFIG_DNS_RESOLVER=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++CONFIG_CONNECTOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_SX8=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_ATA_OVER_ETH=m ++CONFIG_RAID_ATTRS=m ++CONFIG_BLK_DEV_SD=y ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_SCSI_SAS_ATA=y ++CONFIG_ISCSI_TCP=m ++CONFIG_SCSI_MVSAS=y ++# CONFIG_SCSI_MVSAS_DEBUG is not set ++CONFIG_SCSI_MVSAS_TASKLET=y ++CONFIG_ATA=y ++CONFIG_SATA_AHCI=y ++CONFIG_SATA_SIL24=y ++# CONFIG_ATA_SFF is not set ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=m ++CONFIG_MD_RAID0=m ++CONFIG_MD_RAID1=m ++CONFIG_MD_RAID10=m ++CONFIG_MD_RAID456=m ++CONFIG_MD_FAULTY=m ++CONFIG_BLK_DEV_DM=m ++CONFIG_DM_DEBUG=y ++CONFIG_DM_CRYPT=m ++CONFIG_DM_SNAPSHOT=m ++CONFIG_DM_MIRROR=m ++CONFIG_DM_LOG_USERSPACE=m ++CONFIG_DM_ZERO=m ++CONFIG_DM_MULTIPATH=m ++CONFIG_DM_MULTIPATH_QL=m ++CONFIG_DM_MULTIPATH_ST=m ++CONFIG_DM_DELAY=m ++CONFIG_DM_UEVENT=y ++CONFIG_TARGET_CORE=m ++CONFIG_TCM_IBLOCK=m ++CONFIG_TCM_FILEIO=m ++CONFIG_TCM_PSCSI=m ++CONFIG_LOOPBACK_TARGET=m ++CONFIG_ISCSI_TARGET=m ++CONFIG_FUSION=y ++CONFIG_FUSION_SAS=y ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_IFB=m ++CONFIG_MACVLAN=m ++CONFIG_MACVTAP=m ++CONFIG_NETCONSOLE=m ++CONFIG_NETCONSOLE_DYNAMIC=y ++CONFIG_TUN=y ++CONFIG_VETH=m ++CONFIG_NET_DSA_MV88E6060=y ++CONFIG_NET_DSA_MV88E6XXX=y ++CONFIG_SKY2=y ++CONFIG_PTP_1588_CLOCK_TILEGX=y ++# 
CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_SERIAL_TILEGX=y ++CONFIG_HW_RANDOM=y ++CONFIG_HW_RANDOM_TIMERIOMEM=m ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_WATCHDOG_NOWAYOUT=y ++# CONFIG_VGA_ARB is not set ++CONFIG_DRM=m ++CONFIG_DRM_TDFX=m ++CONFIG_DRM_R128=m ++CONFIG_DRM_MGA=m ++CONFIG_DRM_VIA=m ++CONFIG_DRM_SAVAGE=m ++CONFIG_USB=y ++CONFIG_USB_EHCI_HCD=y ++CONFIG_USB_OHCI_HCD=y ++CONFIG_USB_STORAGE=y ++CONFIG_EDAC=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_TILE=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++CONFIG_EXT2_FS_XIP=y ++CONFIG_EXT3_FS=y ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_EXT3_FS_SECURITY=y ++CONFIG_EXT4_FS=y ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_XFS_FS=y ++CONFIG_XFS_QUOTA=y ++CONFIG_XFS_POSIX_ACL=y ++CONFIG_GFS2_FS=m ++CONFIG_GFS2_FS_LOCKING_DLM=y ++CONFIG_BTRFS_FS=m ++CONFIG_BTRFS_FS_POSIX_ACL=y ++CONFIG_QUOTA=y ++CONFIG_QUOTA_NETLINK_INTERFACE=y ++# CONFIG_PRINT_QUOTA_WARNING is not set ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_FUSE_FS=y ++CONFIG_CUSE=m ++CONFIG_FSCACHE=m ++CONFIG_FSCACHE_STATS=y ++CONFIG_CACHEFILES=m ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_HUGETLBFS=y ++CONFIG_ECRYPT_FS=m ++CONFIG_CRAMFS=m ++CONFIG_SQUASHFS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=m ++CONFIG_NFS_V4_1=y ++CONFIG_NFS_FSCACHE=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_CIFS_STATS=y ++CONFIG_CIFS_WEAK_PW_HASH=y ++CONFIG_CIFS_UPCALL=y ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_CIFS_DFS_UPCALL=y ++CONFIG_CIFS_FSCACHE=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=m ++CONFIG_DLM=m ++CONFIG_DLM_DEBUG=y ++CONFIG_DYNAMIC_DEBUG=y ++CONFIG_DEBUG_INFO=y ++CONFIG_DEBUG_INFO_REDUCED=y ++# CONFIG_ENABLE_WARN_DEPRECATED is not set ++CONFIG_STRIP_ASM_SYMS=y ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_CHECK=y ++# CONFIG_FRAME_POINTER is not set ++CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y ++CONFIG_DEBUG_VM=y ++CONFIG_DEBUG_MEMORY_INIT=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_LOCKUP_DETECTOR=y ++CONFIG_SCHEDSTATS=y ++CONFIG_TIMER_STATS=y ++CONFIG_DEBUG_LIST=y 
++CONFIG_DEBUG_CREDENTIALS=y ++CONFIG_RCU_CPU_STALL_TIMEOUT=60 ++CONFIG_ASYNC_RAID6_TEST=m ++CONFIG_KGDB=y ++CONFIG_SECURITY=y ++CONFIG_SECURITYFS=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_NETWORK_XFRM=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y ++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_PCRYPT=m ++CONFIG_CRYPTO_CRYPTD=m ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_CCM=m ++CONFIG_CRYPTO_GCM=m ++CONFIG_CRYPTO_CTS=m ++CONFIG_CRYPTO_LRW=m ++CONFIG_CRYPTO_PCBC=m ++CONFIG_CRYPTO_XTS=m ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_XCBC=m ++CONFIG_CRYPTO_VMAC=m ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_RMD128=m ++CONFIG_CRYPTO_RMD160=m ++CONFIG_CRYPTO_RMD256=m ++CONFIG_CRYPTO_RMD320=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAMELLIA=m ++CONFIG_CRYPTO_CAST5=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_FCRYPT=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SEED=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++CONFIG_CRYPTO_LZO=m +diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig +new file mode 100644 +index 000000000000..e8c4003cbd81 +--- /dev/null ++++ b/arch/tile/configs/tilepro_defconfig +@@ -0,0 +1,524 @@ ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_AUDIT=y ++CONFIG_NO_HZ=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BSD_PROCESS_ACCT_V3=y ++CONFIG_TASKSTATS=y ++CONFIG_TASK_DELAY_ACCT=y ++CONFIG_TASK_XACCT=y ++CONFIG_TASK_IO_ACCOUNTING=y ++CONFIG_LOG_BUF_SHIFT=19 ++CONFIG_CGROUPS=y ++CONFIG_CGROUP_DEBUG=y ++CONFIG_CGROUP_DEVICE=y ++CONFIG_CPUSETS=y ++CONFIG_CGROUP_CPUACCT=y ++CONFIG_CGROUP_SCHED=y ++CONFIG_RT_GROUP_SCHED=y ++CONFIG_BLK_CGROUP=y ++CONFIG_NAMESPACES=y ++CONFIG_RELAY=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_RD_XZ=y ++CONFIG_SYSCTL_SYSCALL=y ++CONFIG_EMBEDDED=y ++# CONFIG_COMPAT_BRK is not set ++CONFIG_PROFILING=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_BLK_DEV_INTEGRITY=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_AMIGA_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++CONFIG_KARMA_PARTITION=y ++CONFIG_CFQ_GROUP_IOSCHED=y ++CONFIG_HZ_100=y ++# CONFIG_COMPACTION is not set ++CONFIG_PREEMPT=y ++CONFIG_PCI_DEBUG=y ++# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_XFRM_SUB_POLICY=y ++CONFIG_XFRM_STATISTICS=y ++CONFIG_NET_KEY=m ++CONFIG_NET_KEY_MIGRATE=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m ++CONFIG_INET_XFRM_MODE_TRANSPORT=m ++CONFIG_INET_XFRM_MODE_TUNNEL=m ++CONFIG_INET_XFRM_MODE_BEET=m ++CONFIG_INET_DIAG=m ++CONFIG_TCP_CONG_ADVANCED=y ++CONFIG_TCP_CONG_HSTCP=m ++CONFIG_TCP_CONG_HYBLA=m ++CONFIG_TCP_CONG_SCALABLE=m ++CONFIG_TCP_CONG_LP=m ++CONFIG_TCP_CONG_VENO=m ++CONFIG_TCP_CONG_YEAH=m ++CONFIG_TCP_CONG_ILLINOIS=m ++CONFIG_TCP_MD5SIG=y ++CONFIG_IPV6=y ++CONFIG_IPV6_ROUTER_PREF=y ++CONFIG_IPV6_ROUTE_INFO=y ++CONFIG_IPV6_OPTIMISTIC_DAD=y 
++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_MIP6=m ++CONFIG_INET6_XFRM_MODE_TRANSPORT=m ++CONFIG_INET6_XFRM_MODE_TUNNEL=m ++CONFIG_INET6_XFRM_MODE_BEET=m ++CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m ++CONFIG_IPV6_SIT=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_IPV6_MULTIPLE_TABLES=y ++CONFIG_IPV6_MROUTE=y ++CONFIG_IPV6_PIMSM_V2=y ++CONFIG_NETLABEL=y ++CONFIG_NETFILTER=y ++CONFIG_NF_CONNTRACK=m ++CONFIG_NF_CONNTRACK_SECMARK=y ++CONFIG_NF_CONNTRACK_ZONES=y ++CONFIG_NF_CONNTRACK_EVENTS=y ++CONFIG_NF_CT_PROTO_DCCP=m ++CONFIG_NF_CT_PROTO_UDPLITE=m ++CONFIG_NF_CONNTRACK_AMANDA=m ++CONFIG_NF_CONNTRACK_FTP=m ++CONFIG_NF_CONNTRACK_H323=m ++CONFIG_NF_CONNTRACK_IRC=m ++CONFIG_NF_CONNTRACK_NETBIOS_NS=m ++CONFIG_NF_CONNTRACK_PPTP=m ++CONFIG_NF_CONNTRACK_SANE=m ++CONFIG_NF_CONNTRACK_SIP=m ++CONFIG_NF_CONNTRACK_TFTP=m ++CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m ++CONFIG_NETFILTER_XT_TARGET_CONNMARK=m ++CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m ++CONFIG_NETFILTER_XT_TARGET_DSCP=m ++CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m ++CONFIG_NETFILTER_XT_TARGET_MARK=m ++CONFIG_NETFILTER_XT_TARGET_NFLOG=m ++CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m ++CONFIG_NETFILTER_XT_TARGET_NOTRACK=m ++CONFIG_NETFILTER_XT_TARGET_TEE=m ++CONFIG_NETFILTER_XT_TARGET_TPROXY=m ++CONFIG_NETFILTER_XT_TARGET_TRACE=m ++CONFIG_NETFILTER_XT_TARGET_SECMARK=m ++CONFIG_NETFILTER_XT_TARGET_TCPMSS=m ++CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m ++CONFIG_NETFILTER_XT_MATCH_CLUSTER=m ++CONFIG_NETFILTER_XT_MATCH_COMMENT=m ++CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m ++CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m ++CONFIG_NETFILTER_XT_MATCH_CONNMARK=m ++CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m ++CONFIG_NETFILTER_XT_MATCH_DCCP=m ++CONFIG_NETFILTER_XT_MATCH_DSCP=m ++CONFIG_NETFILTER_XT_MATCH_ESP=m ++CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m ++CONFIG_NETFILTER_XT_MATCH_HELPER=m ++CONFIG_NETFILTER_XT_MATCH_IPRANGE=m ++CONFIG_NETFILTER_XT_MATCH_IPVS=m ++CONFIG_NETFILTER_XT_MATCH_LENGTH=m ++CONFIG_NETFILTER_XT_MATCH_LIMIT=m ++CONFIG_NETFILTER_XT_MATCH_MAC=m ++CONFIG_NETFILTER_XT_MATCH_MARK=m ++CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m ++CONFIG_NETFILTER_XT_MATCH_OSF=m ++CONFIG_NETFILTER_XT_MATCH_OWNER=m ++CONFIG_NETFILTER_XT_MATCH_POLICY=m ++CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m ++CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m ++CONFIG_NETFILTER_XT_MATCH_QUOTA=m ++CONFIG_NETFILTER_XT_MATCH_RATEEST=m ++CONFIG_NETFILTER_XT_MATCH_REALM=m ++CONFIG_NETFILTER_XT_MATCH_RECENT=m ++CONFIG_NETFILTER_XT_MATCH_SOCKET=m ++CONFIG_NETFILTER_XT_MATCH_STATE=m ++CONFIG_NETFILTER_XT_MATCH_STATISTIC=m ++CONFIG_NETFILTER_XT_MATCH_STRING=m ++CONFIG_NETFILTER_XT_MATCH_TCPMSS=m ++CONFIG_NETFILTER_XT_MATCH_TIME=m ++CONFIG_NETFILTER_XT_MATCH_U32=m ++CONFIG_IP_VS=m ++CONFIG_IP_VS_IPV6=y ++CONFIG_IP_VS_PROTO_TCP=y ++CONFIG_IP_VS_PROTO_UDP=y ++CONFIG_IP_VS_PROTO_ESP=y ++CONFIG_IP_VS_PROTO_AH=y ++CONFIG_IP_VS_PROTO_SCTP=y ++CONFIG_IP_VS_RR=m ++CONFIG_IP_VS_WRR=m ++CONFIG_IP_VS_LC=m ++CONFIG_IP_VS_WLC=m ++CONFIG_IP_VS_LBLC=m ++CONFIG_IP_VS_LBLCR=m ++CONFIG_IP_VS_SED=m ++CONFIG_IP_VS_NQ=m ++CONFIG_NF_CONNTRACK_IPV4=m ++# CONFIG_NF_CONNTRACK_PROC_COMPAT is not set ++CONFIG_IP_NF_IPTABLES=y ++CONFIG_IP_NF_MATCH_AH=m ++CONFIG_IP_NF_MATCH_ECN=m ++CONFIG_IP_NF_MATCH_TTL=m ++CONFIG_IP_NF_FILTER=y ++CONFIG_IP_NF_TARGET_REJECT=y ++CONFIG_IP_NF_MANGLE=m ++CONFIG_IP_NF_TARGET_ECN=m ++CONFIG_IP_NF_TARGET_TTL=m ++CONFIG_IP_NF_RAW=m ++CONFIG_IP_NF_SECURITY=m ++CONFIG_IP_NF_ARPTABLES=m ++CONFIG_IP_NF_ARPFILTER=m ++CONFIG_IP_NF_ARP_MANGLE=m ++CONFIG_NF_CONNTRACK_IPV6=m ++CONFIG_IP6_NF_MATCH_AH=m ++CONFIG_IP6_NF_MATCH_EUI64=m 
++CONFIG_IP6_NF_MATCH_FRAG=m ++CONFIG_IP6_NF_MATCH_OPTS=m ++CONFIG_IP6_NF_MATCH_HL=m ++CONFIG_IP6_NF_MATCH_IPV6HEADER=m ++CONFIG_IP6_NF_MATCH_MH=m ++CONFIG_IP6_NF_MATCH_RT=m ++CONFIG_IP6_NF_TARGET_HL=m ++CONFIG_IP6_NF_FILTER=m ++CONFIG_IP6_NF_TARGET_REJECT=m ++CONFIG_IP6_NF_MANGLE=m ++CONFIG_IP6_NF_RAW=m ++CONFIG_IP6_NF_SECURITY=m ++CONFIG_BRIDGE_NF_EBTABLES=m ++CONFIG_BRIDGE_EBT_BROUTE=m ++CONFIG_BRIDGE_EBT_T_FILTER=m ++CONFIG_BRIDGE_EBT_T_NAT=m ++CONFIG_BRIDGE_EBT_802_3=m ++CONFIG_BRIDGE_EBT_AMONG=m ++CONFIG_BRIDGE_EBT_ARP=m ++CONFIG_BRIDGE_EBT_IP=m ++CONFIG_BRIDGE_EBT_IP6=m ++CONFIG_BRIDGE_EBT_LIMIT=m ++CONFIG_BRIDGE_EBT_MARK=m ++CONFIG_BRIDGE_EBT_PKTTYPE=m ++CONFIG_BRIDGE_EBT_STP=m ++CONFIG_BRIDGE_EBT_VLAN=m ++CONFIG_BRIDGE_EBT_ARPREPLY=m ++CONFIG_BRIDGE_EBT_DNAT=m ++CONFIG_BRIDGE_EBT_MARK_T=m ++CONFIG_BRIDGE_EBT_REDIRECT=m ++CONFIG_BRIDGE_EBT_SNAT=m ++CONFIG_BRIDGE_EBT_LOG=m ++CONFIG_BRIDGE_EBT_ULOG=m ++CONFIG_BRIDGE_EBT_NFLOG=m ++CONFIG_RDS=m ++CONFIG_RDS_TCP=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_VLAN_8021Q_GVRP=y ++CONFIG_PHONET=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_MULTIQ=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_SCH_DRR=m ++CONFIG_NET_SCH_INGRESS=m ++CONFIG_NET_CLS_BASIC=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m ++CONFIG_CLS_U32_PERF=y ++CONFIG_CLS_U32_MARK=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_FLOW=m ++CONFIG_NET_CLS_CGROUP=y ++CONFIG_NET_EMATCH=y ++CONFIG_NET_EMATCH_CMP=m ++CONFIG_NET_EMATCH_NBYTE=m ++CONFIG_NET_EMATCH_U32=m ++CONFIG_NET_EMATCH_META=m ++CONFIG_NET_EMATCH_TEXT=m ++CONFIG_NET_CLS_ACT=y ++CONFIG_NET_ACT_POLICE=m ++CONFIG_NET_ACT_GACT=m ++CONFIG_GACT_PROB=y ++CONFIG_NET_ACT_MIRRED=m ++CONFIG_NET_ACT_IPT=m ++CONFIG_NET_ACT_NAT=m ++CONFIG_NET_ACT_PEDIT=m ++CONFIG_NET_ACT_SIMP=m ++CONFIG_NET_ACT_SKBEDIT=m ++CONFIG_NET_CLS_IND=y ++CONFIG_DCB=y ++CONFIG_DNS_RESOLVER=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++CONFIG_CONNECTOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_SX8=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_ATA_OVER_ETH=m ++CONFIG_RAID_ATTRS=m ++CONFIG_BLK_DEV_SD=y ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_ATA=y ++CONFIG_SATA_SIL24=y ++# CONFIG_ATA_SFF is not set ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=m ++CONFIG_MD_RAID0=m ++CONFIG_MD_RAID1=m ++CONFIG_MD_RAID10=m ++CONFIG_MD_RAID456=m ++CONFIG_MD_FAULTY=m ++CONFIG_BLK_DEV_DM=m ++CONFIG_DM_DEBUG=y ++CONFIG_DM_CRYPT=m ++CONFIG_DM_SNAPSHOT=m ++CONFIG_DM_MIRROR=m ++CONFIG_DM_LOG_USERSPACE=m ++CONFIG_DM_ZERO=m ++CONFIG_DM_MULTIPATH=m ++CONFIG_DM_MULTIPATH_QL=m ++CONFIG_DM_MULTIPATH_ST=m ++CONFIG_DM_DELAY=m ++CONFIG_DM_UEVENT=y ++CONFIG_FUSION=y ++CONFIG_FUSION_SAS=y ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_IFB=m ++CONFIG_MACVLAN=m ++CONFIG_MACVTAP=m ++CONFIG_NETCONSOLE=m ++CONFIG_NETCONSOLE_DYNAMIC=y ++CONFIG_TUN=y ++CONFIG_VETH=m ++CONFIG_NET_DSA_MV88E6060=y ++CONFIG_NET_DSA_MV88E6XXX=y ++# CONFIG_NET_VENDOR_3COM is not set ++CONFIG_E1000E=y ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not 
set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_HW_RANDOM_TIMERIOMEM=m ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_WATCHDOG_NOWAYOUT=y ++# CONFIG_VGA_ARB is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_EDAC=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_TILE=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++CONFIG_EXT2_FS_XIP=y ++CONFIG_EXT3_FS=y ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_EXT3_FS_SECURITY=y ++CONFIG_EXT4_FS=y ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_XFS_FS=y ++CONFIG_XFS_QUOTA=y ++CONFIG_XFS_POSIX_ACL=y ++CONFIG_GFS2_FS=m ++CONFIG_GFS2_FS_LOCKING_DLM=y ++CONFIG_BTRFS_FS=m ++CONFIG_BTRFS_FS_POSIX_ACL=y ++CONFIG_QUOTA=y ++CONFIG_QUOTA_NETLINK_INTERFACE=y ++# CONFIG_PRINT_QUOTA_WARNING is not set ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_FUSE_FS=y ++CONFIG_CUSE=m ++CONFIG_FSCACHE=m ++CONFIG_FSCACHE_STATS=y ++CONFIG_CACHEFILES=m ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_HUGETLBFS=y ++CONFIG_CONFIGFS_FS=m ++CONFIG_ECRYPT_FS=m ++CONFIG_CRAMFS=m ++CONFIG_SQUASHFS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=m ++CONFIG_NFS_V4_1=y ++CONFIG_NFS_FSCACHE=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_CIFS_STATS=y ++CONFIG_CIFS_WEAK_PW_HASH=y ++CONFIG_CIFS_UPCALL=y ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_CIFS_DFS_UPCALL=y ++CONFIG_CIFS_FSCACHE=y ++CONFIG_NLS=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=m ++CONFIG_DLM=m ++CONFIG_DLM_DEBUG=y ++CONFIG_DYNAMIC_DEBUG=y ++CONFIG_DEBUG_INFO=y ++CONFIG_DEBUG_INFO_REDUCED=y ++# CONFIG_ENABLE_WARN_DEPRECATED is not set ++CONFIG_FRAME_WARN=2048 ++CONFIG_STRIP_ASM_SYMS=y ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_CHECK=y ++# CONFIG_FRAME_POINTER is not set ++CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_VM=y ++CONFIG_DEBUG_MEMORY_INIT=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_LOCKUP_DETECTOR=y ++CONFIG_SCHEDSTATS=y ++CONFIG_TIMER_STATS=y ++CONFIG_DEBUG_LIST=y ++CONFIG_DEBUG_CREDENTIALS=y ++CONFIG_RCU_CPU_STALL_TIMEOUT=60 ++CONFIG_ASYNC_RAID6_TEST=m ++CONFIG_SECURITY=y ++CONFIG_SECURITYFS=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_NETWORK_XFRM=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y 
++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_PCRYPT=m ++CONFIG_CRYPTO_CRYPTD=m ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_CCM=m ++CONFIG_CRYPTO_GCM=m ++CONFIG_CRYPTO_CTS=m ++CONFIG_CRYPTO_LRW=m ++CONFIG_CRYPTO_PCBC=m ++CONFIG_CRYPTO_XTS=m ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_XCBC=m ++CONFIG_CRYPTO_VMAC=m ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_RMD128=m ++CONFIG_CRYPTO_RMD160=m ++CONFIG_CRYPTO_RMD256=m ++CONFIG_CRYPTO_RMD320=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAMELLIA=m ++CONFIG_CRYPTO_CAST5=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_FCRYPT=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SEED=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++CONFIG_CRYPTO_LZO=m ++CONFIG_CRC_CCITT=m ++CONFIG_CRC7=m +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 8ef85139553f..6f6ecda60d5b 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1034,6 +1034,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +@@ -1064,6 +1080,8 @@ config SCHED_MC_PRIO + + If unsure say Y here. + ++source "kernel/Kconfig.MuQSS" ++ + config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC +@@ -1433,7 +1451,7 @@ config HIGHMEM64G + endchoice + + choice +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + ---help--- +@@ -1453,17 +1471,17 @@ choice + option alone! 
+ + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig +index 59ce9ed58430..f19741b0f43d 100644 +--- a/arch/x86/configs/i386_defconfig ++++ b/arch/x86/configs/i386_defconfig +@@ -29,7 +29,7 @@ CONFIG_SMP=y + CONFIG_X86_GENERIC=y + CONFIG_HPET_TIMER=y + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_X86_REBOOTFIXUPS=y +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index d0a5ffeae8df..63f1fb92590c 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -28,7 +28,7 @@ CONFIG_SMP=y + CONFIG_CALGARY_IOMMU=y + CONFIG_NR_CPUS=64 + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_MICROCODE=y +diff --git a/drivers/block/swim.c b/drivers/block/swim.c +index 4c297f69171d..5bc4f1be2617 100644 +--- a/drivers/block/swim.c ++++ b/drivers/block/swim.c +@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base, + if (swim_readbit(base, MOTOR_ON)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + } else if (action == OFF) { + swim_action(base, MOTOR_OFF); +@@ -347,7 +347,7 @@ static inline void swim_eject(struct swim __iomem *base) + if (!swim_readbit(base, DISK_IN)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + swim_select(base, RELAX); + } +@@ -371,7 +371,7 @@ static inline int swim_step(struct swim __iomem *base) + for (wait = 0; wait < HZ; wait++) { + + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + swim_select(base, RELAX); + if (!swim_readbit(base, STEP)) +diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c +index 2aab80e19ae0..6200dbb3b5ef 100644 +--- a/drivers/char/ipmi/ipmi_msghandler.c ++++ b/drivers/char/ipmi/ipmi_msghandler.c +@@ -3544,7 +3544,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) + /* Current message first, to preserve order */ + while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { + /* Wait for the message to clear out. */ +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + /* No need for locks, the interface is down. */ +diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c +index 22c6a2e61236..c4bccd444cbf 100644 +--- a/drivers/char/ipmi/ipmi_ssif.c ++++ b/drivers/char/ipmi/ipmi_ssif.c +@@ -1289,7 +1289,7 @@ static void shutdown_ssif(void *send_info) + + /* make sure the driver is not looking for flags any more. 
*/ + while (ssif_info->ssif_state != SSIF_NORMAL) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + ssif_info->stopping = true; + del_timer_sync(&ssif_info->watch_timer); +diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c +new file mode 100644 +index 000000000000..5228e78df804 +--- /dev/null ++++ b/drivers/char/snsc.c +@@ -0,0 +1,469 @@ ++/* ++ * SN Platform system controller communication support ++ * ++ * This file is subject to the terms and conditions of the GNU General Public ++ * License. See the file "COPYING" in the main directory of this archive ++ * for more details. ++ * ++ * Copyright (C) 2004, 2006 Silicon Graphics, Inc. All rights reserved. ++ */ ++ ++/* ++ * System controller communication driver ++ * ++ * This driver allows a user process to communicate with the system ++ * controller (a.k.a. "IRouter") network in an SGI SN system. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "snsc.h" ++ ++#define SYSCTL_BASENAME "snsc" ++ ++#define SCDRV_BUFSZ 2048 ++#define SCDRV_TIMEOUT 1000 ++ ++static DEFINE_MUTEX(scdrv_mutex); ++static irqreturn_t ++scdrv_interrupt(int irq, void *subch_data) ++{ ++ struct subch_data_s *sd = subch_data; ++ unsigned long flags; ++ int status; ++ ++ spin_lock_irqsave(&sd->sd_rlock, flags); ++ spin_lock(&sd->sd_wlock); ++ status = ia64_sn_irtr_intr(sd->sd_nasid, sd->sd_subch); ++ ++ if (status > 0) { ++ if (status & SAL_IROUTER_INTR_RECV) { ++ wake_up(&sd->sd_rq); ++ } ++ if (status & SAL_IROUTER_INTR_XMIT) { ++ ia64_sn_irtr_intr_disable ++ (sd->sd_nasid, sd->sd_subch, ++ SAL_IROUTER_INTR_XMIT); ++ wake_up(&sd->sd_wq); ++ } ++ } ++ spin_unlock(&sd->sd_wlock); ++ spin_unlock_irqrestore(&sd->sd_rlock, flags); ++ return IRQ_HANDLED; ++} ++ ++/* ++ * scdrv_open ++ * ++ * Reserve a subchannel for system controller communication. ++ */ ++ ++static int ++scdrv_open(struct inode *inode, struct file *file) ++{ ++ struct sysctl_data_s *scd; ++ struct subch_data_s *sd; ++ int rv; ++ ++ /* look up device info for this device file */ ++ scd = container_of(inode->i_cdev, struct sysctl_data_s, scd_cdev); ++ ++ /* allocate memory for subchannel data */ ++ sd = kzalloc(sizeof (struct subch_data_s), GFP_KERNEL); ++ if (sd == NULL) { ++ printk("%s: couldn't allocate subchannel data\n", ++ __func__); ++ return -ENOMEM; ++ } ++ ++ /* initialize subch_data_s fields */ ++ sd->sd_nasid = scd->scd_nasid; ++ sd->sd_subch = ia64_sn_irtr_open(scd->scd_nasid); ++ ++ if (sd->sd_subch < 0) { ++ kfree(sd); ++ printk("%s: couldn't allocate subchannel\n", __func__); ++ return -EBUSY; ++ } ++ ++ spin_lock_init(&sd->sd_rlock); ++ spin_lock_init(&sd->sd_wlock); ++ init_waitqueue_head(&sd->sd_rq); ++ init_waitqueue_head(&sd->sd_wq); ++ sema_init(&sd->sd_rbs, 1); ++ sema_init(&sd->sd_wbs, 1); ++ ++ file->private_data = sd; ++ ++ /* hook this subchannel up to the system controller interrupt */ ++ mutex_lock(&scdrv_mutex); ++ rv = request_irq(SGI_UART_VECTOR, scdrv_interrupt, ++ IRQF_SHARED, SYSCTL_BASENAME, sd); ++ if (rv) { ++ ia64_sn_irtr_close(sd->sd_nasid, sd->sd_subch); ++ kfree(sd); ++ printk("%s: irq request failed (%d)\n", __func__, rv); ++ mutex_unlock(&scdrv_mutex); ++ return -EBUSY; ++ } ++ mutex_unlock(&scdrv_mutex); ++ return 0; ++} ++ ++/* ++ * scdrv_release ++ * ++ * Release a previously-reserved subchannel. 
++ */ ++ ++static int ++scdrv_release(struct inode *inode, struct file *file) ++{ ++ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; ++ int rv; ++ ++ /* free the interrupt */ ++ free_irq(SGI_UART_VECTOR, sd); ++ ++ /* ask SAL to close the subchannel */ ++ rv = ia64_sn_irtr_close(sd->sd_nasid, sd->sd_subch); ++ ++ kfree(sd); ++ return rv; ++} ++ ++/* ++ * scdrv_read ++ * ++ * Called to read bytes from the open IRouter pipe. ++ * ++ */ ++ ++static inline int ++read_status_check(struct subch_data_s *sd, int *len) ++{ ++ return ia64_sn_irtr_recv(sd->sd_nasid, sd->sd_subch, sd->sd_rb, len); ++} ++ ++static ssize_t ++scdrv_read(struct file *file, char __user *buf, size_t count, loff_t *f_pos) ++{ ++ int status; ++ int len; ++ unsigned long flags; ++ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; ++ ++ /* try to get control of the read buffer */ ++ if (down_trylock(&sd->sd_rbs)) { ++ /* somebody else has it now; ++ * if we're non-blocking, then exit... ++ */ ++ if (file->f_flags & O_NONBLOCK) { ++ return -EAGAIN; ++ } ++ /* ...or if we want to block, then do so here */ ++ if (down_interruptible(&sd->sd_rbs)) { ++ /* something went wrong with wait */ ++ return -ERESTARTSYS; ++ } ++ } ++ ++ /* anything to read? */ ++ len = CHUNKSIZE; ++ spin_lock_irqsave(&sd->sd_rlock, flags); ++ status = read_status_check(sd, &len); ++ ++ /* if not, and we're blocking I/O, loop */ ++ while (status < 0) { ++ DECLARE_WAITQUEUE(wait, current); ++ ++ if (file->f_flags & O_NONBLOCK) { ++ spin_unlock_irqrestore(&sd->sd_rlock, flags); ++ up(&sd->sd_rbs); ++ return -EAGAIN; ++ } ++ ++ len = CHUNKSIZE; ++ set_current_state(TASK_INTERRUPTIBLE); ++ add_wait_queue(&sd->sd_rq, &wait); ++ spin_unlock_irqrestore(&sd->sd_rlock, flags); ++ ++ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); ++ ++ remove_wait_queue(&sd->sd_rq, &wait); ++ if (signal_pending(current)) { ++ /* wait was interrupted */ ++ up(&sd->sd_rbs); ++ return -ERESTARTSYS; ++ } ++ ++ spin_lock_irqsave(&sd->sd_rlock, flags); ++ status = read_status_check(sd, &len); ++ } ++ spin_unlock_irqrestore(&sd->sd_rlock, flags); ++ ++ if (len > 0) { ++ /* we read something in the last read_status_check(); copy ++ * it out to user space ++ */ ++ if (count < len) { ++ pr_debug("%s: only accepting %d of %d bytes\n", ++ __func__, (int) count, len); ++ } ++ len = min((int) count, len); ++ if (copy_to_user(buf, sd->sd_rb, len)) ++ len = -EFAULT; ++ } ++ ++ /* release the read buffer and wake anyone who might be ++ * waiting for it ++ */ ++ up(&sd->sd_rbs); ++ ++ /* return the number of characters read in */ ++ return len; ++} ++ ++/* ++ * scdrv_write ++ * ++ * Writes a chunk of an IRouter packet (or other system controller data) ++ * to the system controller. ++ * ++ */ ++static inline int ++write_status_check(struct subch_data_s *sd, int count) ++{ ++ return ia64_sn_irtr_send(sd->sd_nasid, sd->sd_subch, sd->sd_wb, count); ++} ++ ++static ssize_t ++scdrv_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *f_pos) ++{ ++ unsigned long flags; ++ int status; ++ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; ++ ++ /* try to get control of the write buffer */ ++ if (down_trylock(&sd->sd_wbs)) { ++ /* somebody else has it now; ++ * if we're non-blocking, then exit... 
++ */ ++ if (file->f_flags & O_NONBLOCK) { ++ return -EAGAIN; ++ } ++ /* ...or if we want to block, then do so here */ ++ if (down_interruptible(&sd->sd_wbs)) { ++ /* something went wrong with wait */ ++ return -ERESTARTSYS; ++ } ++ } ++ ++ count = min((int) count, CHUNKSIZE); ++ if (copy_from_user(sd->sd_wb, buf, count)) { ++ up(&sd->sd_wbs); ++ return -EFAULT; ++ } ++ ++ /* try to send the buffer */ ++ spin_lock_irqsave(&sd->sd_wlock, flags); ++ status = write_status_check(sd, count); ++ ++ /* if we failed, and we want to block, then loop */ ++ while (status <= 0) { ++ DECLARE_WAITQUEUE(wait, current); ++ ++ if (file->f_flags & O_NONBLOCK) { ++ spin_unlock_irqrestore(&sd->sd_wlock, flags); ++ up(&sd->sd_wbs); ++ return -EAGAIN; ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ add_wait_queue(&sd->sd_wq, &wait); ++ spin_unlock_irqrestore(&sd->sd_wlock, flags); ++ ++ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); ++ ++ remove_wait_queue(&sd->sd_wq, &wait); ++ if (signal_pending(current)) { ++ /* wait was interrupted */ ++ up(&sd->sd_wbs); ++ return -ERESTARTSYS; ++ } ++ ++ spin_lock_irqsave(&sd->sd_wlock, flags); ++ status = write_status_check(sd, count); ++ } ++ spin_unlock_irqrestore(&sd->sd_wlock, flags); ++ ++ /* release the write buffer and wake anyone who's waiting for it */ ++ up(&sd->sd_wbs); ++ ++ /* return the number of characters accepted (should be the complete ++ * "chunk" as requested) ++ */ ++ if ((status >= 0) && (status < count)) { ++ pr_debug("Didn't accept the full chunk; %d of %d\n", ++ status, (int) count); ++ } ++ return status; ++} ++ ++static __poll_t ++scdrv_poll(struct file *file, struct poll_table_struct *wait) ++{ ++ __poll_t mask = 0; ++ int status = 0; ++ struct subch_data_s *sd = (struct subch_data_s *) file->private_data; ++ unsigned long flags; ++ ++ poll_wait(file, &sd->sd_rq, wait); ++ poll_wait(file, &sd->sd_wq, wait); ++ ++ spin_lock_irqsave(&sd->sd_rlock, flags); ++ spin_lock(&sd->sd_wlock); ++ status = ia64_sn_irtr_intr(sd->sd_nasid, sd->sd_subch); ++ spin_unlock(&sd->sd_wlock); ++ spin_unlock_irqrestore(&sd->sd_rlock, flags); ++ ++ if (status > 0) { ++ if (status & SAL_IROUTER_INTR_RECV) { ++ mask |= EPOLLIN | EPOLLRDNORM; ++ } ++ if (status & SAL_IROUTER_INTR_XMIT) { ++ mask |= EPOLLOUT | EPOLLWRNORM; ++ } ++ } ++ ++ return mask; ++} ++ ++static const struct file_operations scdrv_fops = { ++ .owner = THIS_MODULE, ++ .read = scdrv_read, ++ .write = scdrv_write, ++ .poll = scdrv_poll, ++ .open = scdrv_open, ++ .release = scdrv_release, ++ .llseek = noop_llseek, ++}; ++ ++static struct class *snsc_class; ++ ++/* ++ * scdrv_init ++ * ++ * Called at boot time to initialize the system controller communication ++ * facility. 
++ */ ++int __init ++scdrv_init(void) ++{ ++ geoid_t geoid; ++ cnodeid_t cnode; ++ char devname[32]; ++ char *devnamep; ++ struct sysctl_data_s *scd; ++ void *salbuf; ++ dev_t first_dev, dev; ++ nasid_t event_nasid; ++ ++ if (!ia64_platform_is("sn2")) ++ return -ENODEV; ++ ++ event_nasid = ia64_sn_get_console_nasid(); ++ ++ snsc_class = class_create(THIS_MODULE, SYSCTL_BASENAME); ++ if (IS_ERR(snsc_class)) { ++ printk("%s: failed to allocate class\n", __func__); ++ return PTR_ERR(snsc_class); ++ } ++ ++ if (alloc_chrdev_region(&first_dev, 0, num_cnodes, ++ SYSCTL_BASENAME) < 0) { ++ printk("%s: failed to register SN system controller device\n", ++ __func__); ++ return -ENODEV; ++ } ++ ++ for (cnode = 0; cnode < num_cnodes; cnode++) { ++ geoid = cnodeid_get_geoid(cnode); ++ devnamep = devname; ++ format_module_id(devnamep, geo_module(geoid), ++ MODULE_FORMAT_BRIEF); ++ devnamep = devname + strlen(devname); ++ sprintf(devnamep, "^%d#%d", geo_slot(geoid), ++ geo_slab(geoid)); ++ ++ /* allocate sysctl device data */ ++ scd = kzalloc(sizeof (struct sysctl_data_s), ++ GFP_KERNEL); ++ if (!scd) { ++ printk("%s: failed to allocate device info" ++ "for %s/%s\n", __func__, ++ SYSCTL_BASENAME, devname); ++ continue; ++ } ++ ++ /* initialize sysctl device data fields */ ++ scd->scd_nasid = cnodeid_to_nasid(cnode); ++ if (!(salbuf = kmalloc(SCDRV_BUFSZ, GFP_KERNEL))) { ++ printk("%s: failed to allocate driver buffer" ++ "(%s%s)\n", __func__, ++ SYSCTL_BASENAME, devname); ++ kfree(scd); ++ continue; ++ } ++ ++ if (ia64_sn_irtr_init(scd->scd_nasid, salbuf, ++ SCDRV_BUFSZ) < 0) { ++ printk ++ ("%s: failed to initialize SAL for" ++ " system controller communication" ++ " (%s/%s): outdated PROM?\n", ++ __func__, SYSCTL_BASENAME, devname); ++ kfree(scd); ++ kfree(salbuf); ++ continue; ++ } ++ ++ dev = first_dev + cnode; ++ cdev_init(&scd->scd_cdev, &scdrv_fops); ++ if (cdev_add(&scd->scd_cdev, dev, 1)) { ++ printk("%s: failed to register system" ++ " controller device (%s%s)\n", ++ __func__, SYSCTL_BASENAME, devname); ++ kfree(scd); ++ kfree(salbuf); ++ continue; ++ } ++ ++ device_create(snsc_class, NULL, dev, NULL, ++ "%s", devname); ++ ++ ia64_sn_irtr_intr_enable(scd->scd_nasid, ++ 0 /*ignored */ , ++ SAL_IROUTER_INTR_RECV); ++ ++ /* on the console nasid, prepare to receive ++ * system controller environmental events ++ */ ++ if(scd->scd_nasid == event_nasid) { ++ scdrv_event_init(scd); ++ } ++ } ++ return 0; ++} ++device_initcall(scdrv_init); +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +index e5252ef3812f..6ae6241185ea 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +@@ -237,7 +237,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, + DRM_ERROR("SVGA device lockup.\n"); + break; + } +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + if (interruptible && signal_pending(current)) { + ret = -ERESTARTSYS; + break; +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +index 75f3efee21a4..09b1932ce85b 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +@@ -203,7 +203,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, + break; + } + if (lazy) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + else if ((++count & 0x0F) == 0) { + /** + * FIXME: Use schedule_hr_timeout here for +diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c +index 267eac00a3fb..352af68c6cd7 100644 +--- 
a/drivers/hwmon/fam15h_power.c ++++ b/drivers/hwmon/fam15h_power.c +@@ -225,7 +225,7 @@ static ssize_t power1_average_show(struct device *dev, + prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; + } + +- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); ++ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); + if (leftover) + return 0; + +diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c +index d8c40a83097d..8332baf4961c 100644 +--- a/drivers/iio/light/tsl2563.c ++++ b/drivers/iio/light/tsl2563.c +@@ -269,11 +269,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) + default: + delay = 402; + } +- /* +- * TODO: Make sure that we wait at least required delay but why we +- * have to extend it one tick more? +- */ +- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); ++ schedule_msec_hrtimeout_interruptible(delay + 1); + } + + static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) +diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c +index 39530d43590e..a7caf2eb5771 100644 +--- a/drivers/media/i2c/msp3400-driver.c ++++ b/drivers/media/i2c/msp3400-driver.c +@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) + break; + dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) + break; + dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c +index cf7cfda94107..f63e17489547 100644 +--- a/drivers/media/pci/cx18/cx18-gpio.c ++++ b/drivers/media/pci/cx18/cx18-gpio.c +@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, + + /* Assert */ + gpio_update(cx, mask, ~active_lo); +- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); + + /* Deassert */ + gpio_update(cx, mask, ~active_hi); +- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); + } + + /* +diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c +index 856e7ab7f33e..766a26251337 100644 +--- a/drivers/media/pci/ivtv/ivtv-gpio.c ++++ b/drivers/media/pci/ivtv/ivtv-gpio.c +@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) + curout = (curout & ~0xF) | 1; + write_reg(curout, IVTV_REG_GPIO_OUT); + /* We could use something else for smaller time */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + curout |= 2; + write_reg(curout, IVTV_REG_GPIO_OUT); + curdir &= ~0x80; +@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) + curout = read_reg(IVTV_REG_GPIO_OUT); + curout &= ~(1 << itv->card->xceive_pin); + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + + curout 
|= 1 << itv->card->xceive_pin; + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + return 0; + } + +diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c +index 137853944e46..76830892f373 100644 +--- a/drivers/media/pci/ivtv/ivtv-ioctl.c ++++ b/drivers/media/pci/ivtv/ivtv-ioctl.c +@@ -1137,7 +1137,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) + TASK_UNINTERRUPTIBLE); + if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) + break; +- schedule_timeout(msecs_to_jiffies(25)); ++ schedule_msec_hrtimeout((25)); + } + finish_wait(&itv->vsync_waitq, &wait); + mutex_lock(&itv->serialize_lock); +diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c +index f7de9118f609..f39ad2952c0f 100644 +--- a/drivers/media/pci/ivtv/ivtv-streams.c ++++ b/drivers/media/pci/ivtv/ivtv-streams.c +@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) + while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && + time_before(jiffies, + then + msecs_to_jiffies(2000))) { +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + } + + /* To convert jiffies to ms, we must multiply by 1000 +diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c +index cb0437b4c331..163fffc0e1d4 100644 +--- a/drivers/media/radio/radio-mr800.c ++++ b/drivers/media/radio/radio-mr800.c +@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, + retval = -ENODATA; + break; + } +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + retval = -ERESTARTSYS; + break; + } +diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c +index fb9de7bbcd19..e53cf45e7f3f 100644 +--- a/drivers/media/radio/radio-tea5777.c ++++ b/drivers/media/radio/radio-tea5777.c +@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) + } + + if (wait) { +- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) ++ if (schedule_msec_hrtimeout_interruptible((wait))) + return -ERESTARTSYS; + } + +diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c +index b0303cf00387..0925b5065147 100644 +--- a/drivers/media/radio/tea575x.c ++++ b/drivers/media/radio/tea575x.c +@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, + for (;;) { + if (time_after(jiffies, timeout)) + break; +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + /* some signal arrived, stop search */ + tea->val &= ~TEA575X_BIT_SEARCH; + snd_tea575x_set_freq(tea); +diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c +index b690796d24d4..448b13da62b4 100644 +--- a/drivers/mfd/ucb1x00-core.c ++++ b/drivers/mfd/ucb1x00-core.c +@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) + break; + /* yield to other processes */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + return UCB_ADC_DAT(val); +diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c +index 8e6607fc8a67..b9ab770bbdb5 100644 +--- a/drivers/misc/sgi-xp/xpc_channel.c ++++ b/drivers/misc/sgi-xp/xpc_channel.c +@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) + + 
atomic_inc(&ch->n_on_msg_allocate_wq); + prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); +- ret = schedule_timeout(1); ++ ret = schedule_min_hrtimeout(); + finish_wait(&ch->msg_allocate_wq, &wait); + atomic_dec(&ch->n_on_msg_allocate_wq); + +diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c +index bbb2575d4728..637757144221 100644 +--- a/drivers/net/caif/caif_hsi.c ++++ b/drivers/net/caif/caif_hsi.c +@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) + break; + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + retry--; + } + +diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c +index d2539c95adb6..0c2f31a03ce9 100644 +--- a/drivers/net/can/usb/peak_usb/pcan_usb.c ++++ b/drivers/net/can/usb/peak_usb/pcan_usb.c +@@ -242,7 +242,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) + } else { + /* the PCAN-USB needs time to init */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); ++ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); + } + + return err; +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index f24a1b0b801f..972313b92b0a 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -2676,7 +2676,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) + while (!skb_queue_empty(&dev->rxq) && + !skb_queue_empty(&dev->txq) && + !skb_queue_empty(&dev->done)) { +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + netif_dbg(dev, ifdown, dev->net, + "waited for %d urb completions\n", temp); +diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c +index dde05e2fdc3e..fa6c1581136e 100644 +--- a/drivers/net/usb/usbnet.c ++++ b/drivers/net/usb/usbnet.c +@@ -767,7 +767,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) + spin_lock_irqsave(&q->lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(&q->lock, flags); +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irqsave(&q->lock, flags); + } +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +index 8dfbaff2d1fe..d1d6b9777f47 100644 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +@@ -816,7 +816,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, + * doesn't seem to have as many firmware restart cycles... + * + * As a test, we're sticking in a 1/100s delay here */ +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + return 0; + +@@ -1267,7 +1267,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) + IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); + i = 5000; + do { +- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_uninterruptible((40)); + /* Todo... wait for sync command ... 
*/ + + read_register(priv->net_dev, IPW_REG_INTA, &inta); +diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c +index 90fb73575495..c94048b048a5 100644 +--- a/drivers/parport/ieee1284.c ++++ b/drivers/parport/ieee1284.c +@@ -208,7 +208,7 @@ int parport_wait_peripheral(struct parport *port, + /* parport_wait_event didn't time out, but the + * peripheral wasn't actually ready either. + * Wait for another 10ms. */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + } + +diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c +index 5d41dda6da4e..34705f6b423f 100644 +--- a/drivers/parport/ieee1284_ops.c ++++ b/drivers/parport/ieee1284_ops.c +@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, + /* Yield the port for a while. */ + if (count && dev->port->irq != PARPORT_IRQ_NONE) { + parport_release (dev); +- schedule_timeout_interruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_interruptible((40)); + parport_claim_or_block (dev); + } + else +diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c +index bffe548187ee..c2918ee3e100 100644 +--- a/drivers/platform/x86/intel_ips.c ++++ b/drivers/platform/x86/intel_ips.c +@@ -798,7 +798,7 @@ static int ips_adjust(void *data) + ips_gpu_lower(ips); + + sleep: +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); + } while (!kthread_should_stop()); + + dev_dbg(ips->dev, "ips-adjust thread stopped\n"); +@@ -974,7 +974,7 @@ static int ips_monitor(void *data) + seqno_timestamp = get_jiffies_64(); + + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + + /* Collect an initial average */ + for (i = 0; i < IPS_SAMPLE_COUNT; i++) { +@@ -1001,7 +1001,7 @@ static int ips_monitor(void *data) + mchp_samples[i] = mchp; + } + +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + if (kthread_should_stop()) + break; + } +@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data) + * us to reduce the sample frequency if the CPU and GPU are idle. 
+ */ + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + last_sample_period = IPS_SAMPLE_PERIOD; + + timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); +diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c +index 2018614f258f..fc19b312c345 100644 +--- a/drivers/rtc/rtc-wm8350.c ++++ b/drivers/rtc/rtc-wm8350.c +@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); + + if (!retries) { +@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); + + if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) +@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) + /* Wait until confirmation */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); + + if (rtc_ctrl & WM8350_RTC_ALMSTS) +diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c +index 80608b53897b..84051b538fa8 100644 +--- a/drivers/scsi/fnic/fnic_scsi.c ++++ b/drivers/scsi/fnic/fnic_scsi.c +@@ -216,7 +216,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) + + /* wait for io cmpl */ + while (atomic_read(&fnic->in_flight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); + +@@ -2273,7 +2273,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, + } + } + +- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); ++ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); + + /* walk again to check, if IOs are still pending in fw */ + if (fnic_is_abts_pending(fnic, lr_sc)) +diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c +index 6822cd9ff8f1..ac3ad534be1a 100644 +--- a/drivers/scsi/lpfc/lpfc_scsi.c ++++ b/drivers/scsi/lpfc/lpfc_scsi.c +@@ -5176,7 +5176,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, + tgt_id, lun_id, context); + later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; + while (time_after(later, jiffies) && cnt) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); ++ schedule_msec_hrtimeout_uninterruptible((20)); + cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); + } + if (cnt) { +diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c +index b3650c989ed4..7ed1fb285754 100644 +--- a/drivers/scsi/snic/snic_scsi.c ++++ b/drivers/scsi/snic/snic_scsi.c +@@ -2353,7 +2353,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) + + /* Wait for all the IOs that are entered in Qcmd */ + while (atomic_read(&snic->ios_inflight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + ret = snic_issue_hba_reset(snic, sc); + if (ret) { +diff --git 
a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c +index f98e3ae27bff..0741c8352a6d 100644 +--- a/drivers/staging/comedi/drivers/ni_mio_common.c ++++ b/drivers/staging/comedi/drivers/ni_mio_common.c +@@ -4742,7 +4742,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) + if ((status & NI67XX_CAL_STATUS_BUSY) == 0) + break; + set_current_state(TASK_INTERRUPTIBLE); +- if (schedule_timeout(1)) ++ if (schedule_min_hrtimeout()) + return -EIO; + } + if (i == timeout) { +diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c +new file mode 100644 +index 000000000000..8cca151741b2 +--- /dev/null ++++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c +@@ -0,0 +1,426 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * GPL HEADER START ++ * ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 only, ++ * as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License version 2 for more details (a copy is included ++ * in the LICENSE file that accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License ++ * version 2 along with this program; If not, see ++ * http://www.gnu.org/licenses/gpl-2.0.html ++ * ++ * GPL HEADER END ++ */ ++/* ++ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Use is subject to license terms. ++ * ++ * Copyright (c) 2012, Intel Corporation. ++ */ ++/* ++ * This file is part of Lustre, http://www.lustre.org/ ++ * Lustre is a trademark of Sun Microsystems, Inc. ++ * ++ * lnet/lnet/lib-eq.c ++ * ++ * Library level Event queue management routines ++ */ ++ ++#define DEBUG_SUBSYSTEM S_LNET ++ ++#include ++ ++/** ++ * Create an event queue that has room for \a count number of events. ++ * ++ * The event queue is circular and older events will be overwritten by new ++ * ones if they are not removed in time by the user using the functions ++ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to ++ * determine the appropriate size of the event queue to prevent this loss ++ * of events. Note that when EQ handler is specified in \a callback, no ++ * event loss can happen, since the handler is run for each event deposited ++ * into the EQ. ++ * ++ * \param count The number of events to be stored in the event queue. It ++ * will be rounded up to the next power of two. ++ * \param callback A handler function that runs when an event is deposited ++ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to ++ * indicate that no event handler is desired. ++ * \param handle On successful return, this location will hold a handle for ++ * the newly created EQ. ++ * ++ * \retval 0 On success. ++ * \retval -EINVAL If an parameter is not valid. ++ * \retval -ENOMEM If memory for the EQ can't be allocated. ++ * ++ * \see lnet_eq_handler_t for the discussion on EQ handler semantics. 
++ */ ++int ++LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, ++ struct lnet_handle_eq *handle) ++{ ++ struct lnet_eq *eq; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ /* ++ * We need count to be a power of 2 so that when eq_{enq,deq}_seq ++ * overflow, they don't skip entries, so the queue has the same ++ * apparent capacity at all times ++ */ ++ if (count) ++ count = roundup_pow_of_two(count); ++ ++ if (callback != LNET_EQ_HANDLER_NONE && count) ++ CWARN("EQ callback is guaranteed to get every event, do you still want to set eqcount %d for polling event which will have locking overhead? Please contact with developer to confirm\n", count); ++ ++ /* ++ * count can be 0 if only need callback, we can eliminate ++ * overhead of enqueue event ++ */ ++ if (!count && callback == LNET_EQ_HANDLER_NONE) ++ return -EINVAL; ++ ++ eq = kzalloc(sizeof(*eq), GFP_NOFS); ++ if (!eq) ++ return -ENOMEM; ++ ++ if (count) { ++ eq->eq_events = kvmalloc_array(count, sizeof(struct lnet_event), ++ GFP_KERNEL | __GFP_ZERO); ++ if (!eq->eq_events) ++ goto failed; ++ /* ++ * NB allocator has set all event sequence numbers to 0, ++ * so all them should be earlier than eq_deq_seq ++ */ ++ } ++ ++ eq->eq_deq_seq = 1; ++ eq->eq_enq_seq = 1; ++ eq->eq_size = count; ++ eq->eq_callback = callback; ++ ++ eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), ++ sizeof(*eq->eq_refs[0])); ++ if (!eq->eq_refs) ++ goto failed; ++ ++ /* MUST hold both exclusive lnet_res_lock */ ++ lnet_res_lock(LNET_LOCK_EX); ++ /* ++ * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do ++ * both EQ lookup and poll event with only lnet_eq_wait_lock ++ */ ++ lnet_eq_wait_lock(); ++ ++ lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); ++ list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); ++ ++ lnet_eq_wait_unlock(); ++ lnet_res_unlock(LNET_LOCK_EX); ++ ++ lnet_eq2handle(handle, eq); ++ return 0; ++ ++failed: ++ kvfree(eq->eq_events); ++ ++ if (eq->eq_refs) ++ cfs_percpt_free(eq->eq_refs); ++ ++ kfree(eq); ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(LNetEQAlloc); ++ ++/** ++ * Release the resources associated with an event queue if it's idle; ++ * otherwise do nothing and it's up to the user to try again. ++ * ++ * \param eqh A handle for the event queue to be released. ++ * ++ * \retval 0 If the EQ is not in use and freed. ++ * \retval -ENOENT If \a eqh does not point to a valid EQ. ++ * \retval -EBUSY If the EQ is still in use by some MDs. 
++ */ ++int ++LNetEQFree(struct lnet_handle_eq eqh) ++{ ++ struct lnet_eq *eq; ++ struct lnet_event *events = NULL; ++ int **refs = NULL; ++ int *ref; ++ int rc = 0; ++ int size = 0; ++ int i; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ lnet_res_lock(LNET_LOCK_EX); ++ /* ++ * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do ++ * both EQ lookup and poll event with only lnet_eq_wait_lock ++ */ ++ lnet_eq_wait_lock(); ++ ++ eq = lnet_handle2eq(&eqh); ++ if (!eq) { ++ rc = -ENOENT; ++ goto out; ++ } ++ ++ cfs_percpt_for_each(ref, i, eq->eq_refs) { ++ LASSERT(*ref >= 0); ++ if (!*ref) ++ continue; ++ ++ CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", ++ i, *ref); ++ rc = -EBUSY; ++ goto out; ++ } ++ ++ /* stash for free after lock dropped */ ++ events = eq->eq_events; ++ size = eq->eq_size; ++ refs = eq->eq_refs; ++ ++ lnet_res_lh_invalidate(&eq->eq_lh); ++ list_del(&eq->eq_list); ++ kfree(eq); ++ out: ++ lnet_eq_wait_unlock(); ++ lnet_res_unlock(LNET_LOCK_EX); ++ ++ kvfree(events); ++ if (refs) ++ cfs_percpt_free(refs); ++ ++ return rc; ++} ++EXPORT_SYMBOL(LNetEQFree); ++ ++void ++lnet_eq_enqueue_event(struct lnet_eq *eq, struct lnet_event *ev) ++{ ++ /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ ++ int index; ++ ++ if (!eq->eq_size) { ++ LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); ++ eq->eq_callback(ev); ++ return; ++ } ++ ++ lnet_eq_wait_lock(); ++ ev->sequence = eq->eq_enq_seq++; ++ ++ LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); ++ index = ev->sequence & (eq->eq_size - 1); ++ ++ eq->eq_events[index] = *ev; ++ ++ if (eq->eq_callback != LNET_EQ_HANDLER_NONE) ++ eq->eq_callback(ev); ++ ++ /* Wake anyone waiting in LNetEQPoll() */ ++ if (waitqueue_active(&the_lnet.ln_eq_waitq)) ++ wake_up_all(&the_lnet.ln_eq_waitq); ++ lnet_eq_wait_unlock(); ++} ++ ++static int ++lnet_eq_dequeue_event(struct lnet_eq *eq, struct lnet_event *ev) ++{ ++ int new_index = eq->eq_deq_seq & (eq->eq_size - 1); ++ struct lnet_event *new_event = &eq->eq_events[new_index]; ++ int rc; ++ ++ /* must called with lnet_eq_wait_lock hold */ ++ if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) ++ return 0; ++ ++ /* We've got a new event... */ ++ *ev = *new_event; ++ ++ CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", ++ new_event, eq->eq_deq_seq, eq->eq_size); ++ ++ /* ...but did it overwrite an event we've not seen yet? */ ++ if (eq->eq_deq_seq == new_event->sequence) { ++ rc = 1; ++ } else { ++ /* ++ * don't complain with CERROR: some EQs are sized small ++ * anyway; if it's important, the caller should complain ++ */ ++ CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", ++ eq->eq_deq_seq, new_event->sequence); ++ rc = -EOVERFLOW; ++ } ++ ++ eq->eq_deq_seq = new_event->sequence + 1; ++ return rc; ++} ++ ++/** ++ * A nonblocking function that can be used to get the next event in an EQ. ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully. The event is removed from the queue. ++ * ++ * \param eventq A handle for the event queue. ++ * \param event On successful return (1 or -EOVERFLOW), this location will ++ * hold the next event in the EQ. ++ * ++ * \retval 0 No pending event in the EQ. ++ * \retval 1 Indicates success. ++ * \retval -ENOENT If \a eventq does not point to a valid EQ. 
++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ has been dropped due to limited space in the EQ. ++ */ ++ ++/** ++ * Block the calling process until there is an event in the EQ. ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully. This function returns the next event ++ * in the EQ and removes it from the EQ. ++ * ++ * \param eventq A handle for the event queue. ++ * \param event On successful return (1 or -EOVERFLOW), this location will ++ * hold the next event in the EQ. ++ * ++ * \retval 1 Indicates success. ++ * \retval -ENOENT If \a eventq does not point to a valid EQ. ++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ has been dropped due to limited space in the EQ. ++ */ ++ ++static int ++lnet_eq_wait_locked(int *timeout_ms, long state) ++__must_hold(&the_lnet.ln_eq_wait_lock) ++{ ++ int tms = *timeout_ms; ++ int wait; ++ wait_queue_entry_t wl; ++ unsigned long now; ++ ++ if (!tms) ++ return -ENXIO; /* don't want to wait and no new event */ ++ ++ init_waitqueue_entry(&wl, current); ++ set_current_state(state); ++ add_wait_queue(&the_lnet.ln_eq_waitq, &wl); ++ ++ lnet_eq_wait_unlock(); ++ ++ if (tms < 0) { ++ schedule(); ++ } else { ++ now = jiffies; ++ schedule_msec_hrtimeout((tms)); ++ tms -= jiffies_to_msecs(jiffies - now); ++ if (tms < 0) /* no more wait but may have new event */ ++ tms = 0; ++ } ++ ++ wait = tms; /* might need to call here again */ ++ *timeout_ms = tms; ++ ++ lnet_eq_wait_lock(); ++ remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); ++ ++ return wait; ++} ++ ++/** ++ * Block the calling process until there's an event from a set of EQs or ++ * timeout happens. ++ * ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully, in which case the corresponding event ++ * is consumed. ++ * ++ * LNetEQPoll() provides a timeout to allow applications to poll, block for a ++ * fixed period, or block indefinitely. ++ * ++ * \param eventqs,neq An array of EQ handles, and size of the array. ++ * \param timeout_ms Time in milliseconds to wait for an event to occur on ++ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an ++ * infinite timeout. ++ * \param interruptible, if true, use TASK_INTERRUPTIBLE, else TASK_NOLOAD ++ * \param event,which On successful return (1 or -EOVERFLOW), \a event will ++ * hold the next event in the EQs, and \a which will contain the index of the ++ * EQ from which the event was taken. ++ * ++ * \retval 0 No pending event in the EQs after timeout. ++ * \retval 1 Indicates success. ++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ indicated by \a which has been dropped due to limited space in the EQ. ++ * \retval -ENOENT If there's an invalid handle in \a eventqs. 
++ */ ++int ++LNetEQPoll(struct lnet_handle_eq *eventqs, int neq, int timeout_ms, ++ int interruptible, ++ struct lnet_event *event, int *which) ++{ ++ int wait = 1; ++ int rc; ++ int i; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ if (neq < 1) ++ return -ENOENT; ++ ++ lnet_eq_wait_lock(); ++ ++ for (;;) { ++ for (i = 0; i < neq; i++) { ++ struct lnet_eq *eq = lnet_handle2eq(&eventqs[i]); ++ ++ if (!eq) { ++ lnet_eq_wait_unlock(); ++ return -ENOENT; ++ } ++ ++ rc = lnet_eq_dequeue_event(eq, event); ++ if (rc) { ++ lnet_eq_wait_unlock(); ++ *which = i; ++ return rc; ++ } ++ } ++ ++ if (!wait) ++ break; ++ ++ /* ++ * return value of lnet_eq_wait_locked: ++ * -1 : did nothing and it's sure no new event ++ * 1 : sleep inside and wait until new event ++ * 0 : don't want to wait anymore, but might have new event ++ * so need to call dequeue again ++ */ ++ wait = lnet_eq_wait_locked(&timeout_ms, ++ interruptible ? TASK_INTERRUPTIBLE ++ : TASK_NOLOAD); ++ if (wait < 0) /* no new event */ ++ break; ++ } ++ ++ lnet_eq_wait_unlock(); ++ return 0; ++} +diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c +index fa597953e9a0..685cf842badc 100644 +--- a/drivers/staging/rts5208/rtsx.c ++++ b/drivers/staging/rts5208/rtsx.c +@@ -490,7 +490,7 @@ static int rtsx_polling_thread(void *__dev) + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); ++ schedule_msec_hrtimeout((POLLING_INTERVAL)); + + /* lock the device pointers */ + mutex_lock(&dev->dev_mutex); +diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c +index c94328a5bd4a..6e7d4671aa69 100644 +--- a/drivers/staging/speakup/speakup_acntpc.c ++++ b/drivers/staging/speakup/speakup_acntpc.c +@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c +index 0877b4044c28..627102d048c1 100644 +--- a/drivers/staging/speakup/speakup_apollo.c ++++ b/drivers/staging/speakup/speakup_apollo.c +@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) + if (!synth->io_ops->synth_out(synth, ch)) { + synth->io_ops->tiocmset(0, UART_MCR_RTS); + synth->io_ops->tiocmset(UART_MCR_RTS, 0); +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c +index ddbb7e97d118..f9502addc765 100644 +--- a/drivers/staging/speakup/speakup_decext.c ++++ b/drivers/staging/speakup/speakup_decext.c +@@ -176,7 +176,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ 
schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c +index 798c42dfa16c..d85b41db67a3 100644 +--- a/drivers/staging/speakup/speakup_decpc.c ++++ b/drivers/staging/speakup/speakup_decpc.c +@@ -394,7 +394,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (dt_sendchar(ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c +index dccb4ea29d37..8ecead307d04 100644 +--- a/drivers/staging/speakup/speakup_dectlk.c ++++ b/drivers/staging/speakup/speakup_dectlk.c +@@ -244,7 +244,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c +index dbebed0eeeec..6d83c13ca4a6 100644 +--- a/drivers/staging/speakup/speakup_dtlk.c ++++ b/drivers/staging/speakup/speakup_dtlk.c +@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + jiffy_delta_val = jiffy_delta->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c +index 414827e888fc..cb31c9176daa 100644 +--- a/drivers/staging/speakup/speakup_keypc.c ++++ b/drivers/staging/speakup/speakup_keypc.c +@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c +index 3568bfb89912..0a80b3b098b2 100644 +--- a/drivers/staging/speakup/synth.c ++++ b/drivers/staging/speakup/synth.c +@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (ch == '\n') + ch = synth->procspeech; +- if (unicode) +- ret = synth->io_ops->synth_out_unicode(synth, ch); +- else +- ret = synth->io_ops->synth_out(synth, ch); +- if (!ret) { +- 
schedule_timeout(msecs_to_jiffies(full_time_val)); ++ if (!synth->io_ops->synth_out(synth, ch)) { ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth->io_ops->synth_out(synth, synth->procspeech)) +- schedule_timeout( +- msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + else +- schedule_timeout( +- msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c +index 1d1440d43002..52fe89ae1d9d 100644 +--- a/drivers/staging/unisys/visornic/visornic_main.c ++++ b/drivers/staging/unisys/visornic/visornic_main.c +@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + if (atomic_read(&devdata->usage)) + break; +@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c +index cfe63932f825..71c00ef772a3 100644 +--- a/drivers/video/fbdev/omap/hwa742.c ++++ b/drivers/video/fbdev/omap/hwa742.c +@@ -913,7 +913,7 @@ static void hwa742_resume(void) + if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(5)); ++ schedule_msec_hrtimeout((5)); + } + hwa742_set_update_mode(hwa742.update_mode_before_suspend); + } +diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c +index f70c9f79622e..0b363eaee24f 100644 +--- a/drivers/video/fbdev/pxafb.c ++++ b/drivers/video/fbdev/pxafb.c +@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) + mutex_unlock(&fbi->ctrlr_lock); + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(30)); ++ schedule_msec_hrtimeout((30)); + } + + pr_debug("%s(): task ending\n", __func__); +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index 37345fb6191d..3874c17d1bc5 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -91,7 +91,7 @@ static int caching_kthread(void *data) + btrfs_release_path(path); + root->ino_cache_progress = last; + up_read(&fs_info->commit_root_sem); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + goto again; + } else + continue; +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ 
-477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/freezer.h b/include/linux/freezer.h +index 21f5aa0b217f..ee9b46394fdf 100644 +--- a/include/linux/freezer.h ++++ b/include/linux/freezer.h +@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} + #define wait_event_freezekillable_unsafe(wq, condition) \ + wait_event_killable(wq, condition) + ++#define pm_freezing (false) + #endif /* !CONFIG_FREEZER */ + + #endif /* FREEZER_H_INCLUDED */ +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..73417df5daa2 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index e9bfe6972aed..16ba1c7e5bde 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -53,6 +53,8 @@ enum { + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 67a1d86981a9..95b427fdbb2e 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,9 @@ + #include + #include + #include ++#ifdef CONFIG_SCHED_MUQSS ++#include ++#endif + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -214,13 +217,40 @@ struct task_group; + + extern void scheduler_tick(void); + +-#define MAX_SCHEDULE_TIMEOUT LONG_MAX +- ++#define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern long schedule_timeout(long timeout); + extern long schedule_timeout_interruptible(long timeout); + extern long schedule_timeout_killable(long timeout); + extern long schedule_timeout_uninterruptible(long timeout); + extern long schedule_timeout_idle(long timeout); ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++extern long schedule_msec_hrtimeout(long timeout); ++extern long schedule_min_hrtimeout(void); ++extern long schedule_msec_hrtimeout_interruptible(long timeout); ++extern long schedule_msec_hrtimeout_uninterruptible(long timeout); ++#else ++static inline long schedule_msec_hrtimeout(long timeout) ++{ ++ return schedule_timeout(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_min_hrtimeout(void) ++{ ++ return schedule_timeout(1); ++} ++ ++static inline long schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); ++} ++#endif ++ + asmlinkage void schedule(void); + extern void schedule_preempt_disabled(void); + asmlinkage void preempt_schedule_irq(void); +@@ -644,9 +674,11 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) ++ int on_cpu; ++#endif + #ifdef CONFIG_SMP + struct llist_node wake_entry; +- int 
on_cpu; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -671,10 +703,25 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -839,6 +886,10 @@ struct task_struct { + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; ++#endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; + #endif + u64 gtime; + struct prev_cputime prev_cputime; +@@ -1283,6 +1334,40 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..73d6319a856a 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) + #ifdef CONFIG_SMP + + struct root_domain; ++#ifdef CONFIG_SCHED_MUQSS ++static inline void dl_clear_root_domain(struct root_domain *rd) ++{ ++} ++static inline void dl_add_task_root_domain(struct task_struct *p) ++{ ++} ++#else /* CONFIG_SCHED_MUQSS */ + extern void dl_add_task_root_domain(struct task_struct *p); + extern void dl_clear_root_domain(struct root_domain *rd); ++#endif /* CONFIG_SCHED_MUQSS */ + + #endif /* CONFIG_SMP */ +diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +index 1abe91ff6e4a..20ba383562b0 100644 +--- a/include/linux/sched/nohz.h ++++ b/include/linux/sched/nohz.h +@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); + static inline void nohz_balance_enter_idle(int cpu) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_stop(void); + #else +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..43c9d9e50c09 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO 
(MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..010b2244e0b6 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_MUQSS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 4b1c3b664f51..a9671b48799c 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..d4be84ba273b +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header */ ++} skiplist; ++ ++void skiplist_init(skiplist_node *slnode); ++skiplist *new_skiplist(skiplist_node *slnode); ++void free_skiplist(skiplist *l); ++void skiplist_node_init(skiplist_node *node); ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); ++void skiplist_delete(skiplist *l, skiplist_node *node); ++ ++static inline bool skiplist_node_empty(skiplist_node *node) { ++ return (!node->next[0]); ++} ++#endif /* _LINUX_SKIP_LISTS_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 25b4fa00bad1..c2503cd28025 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -84,9 +84,16 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on MuQSS only */ + #define SCHED_IDLE 5 ++#ifdef CONFIG_SCHED_MUQSS ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#else /* CONFIG_SCHED_MUQSS */ + #define SCHED_DEADLINE 6 ++#endif /* CONFIG_SCHED_MUQSS */ + + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..da90d33ba4b3 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -73,6 +73,18 @@ config THREAD_INFO_IN_TASK + + menu "General 
setup" + ++config SCHED_MUQSS ++ bool "MuQSS cpu scheduler" ++ select HIGH_RES_TIMERS ++ ---help--- ++ The Multiple Queue Skiplist Scheduler for excellent interactivity and ++ responsiveness on the desktop and highly scalable deterministic ++ low latency on any hardware. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -802,6 +814,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_MUQSS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -901,9 +914,13 @@ menuconfig CGROUP_SCHED + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group +- tasks. ++ tasks. In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1032,6 +1049,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +@@ -1150,6 +1168,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..5c2bcbf25add 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -66,9 +66,17 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_MUQSS ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, ++ .time_slice = 1000000, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -78,6 +86,7 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifndef CONFIG_SCHED_MUQSS + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -85,6 +94,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/init/main.c b/init/main.c +index 91f6ebb30ef0..22792032de64 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -1124,6 +1124,8 @@ static int __ref kernel_init(void *unused) + + rcu_end_inkernel_boot(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS +new file mode 100644 +index 000000000000..a6a58781ef91 +--- /dev/null ++++ b/kernel/Kconfig.MuQSS +@@ -0,0 +1,105 @@ ++choice ++ prompt "CPU scheduler runqueue sharing" ++ default RQ_MC if SCHED_MUQSS ++ default RQ_NONE ++ ++config RQ_NONE ++ bool "No sharing" 
++ help ++ This is the default behaviour where the CPU scheduler has one runqueue ++ per CPU, whether it is a physical or logical CPU (hyperthread). ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=none ++ ++ If unsure, say N. ++ ++config RQ_SMT ++ bool "SMT (hyperthread) siblings" ++ depends on SCHED_SMT && SCHED_MUQSS ++ ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by SMT (hyperthread) siblings. As these logical cores share ++ one physical core, sharing the runqueue resource can lead to decreased ++ overhead, lower latency and higher throughput. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smt ++ ++ If unsure, say N. ++ ++config RQ_MC ++ bool "Multicore siblings" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by multicore siblings in addition to any SMT siblings. ++ As these physical cores share caches, sharing the runqueue resource ++ will lead to lower latency, but its effects on overhead and throughput ++ are less predictable. As a general rule, 6 or fewer cores will likely ++ benefit from this, while larger CPUs will only derive a latency ++ benefit. If your workloads are primarily single threaded, this will ++ possibly worsen throughput. If you are only concerned about latency ++ then enable this regardless of how many cores you have. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=mc ++ ++ If unsure, say Y. ++ ++config RQ_MC_LLC ++ bool "Multicore siblings (LLC)" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will behave similarly as ++ with "Multicore siblings". ++ This option takes LLC cache into account when scheduling tasks. ++ Option may benefit CPUs with multiple LLC caches, such as Ryzen ++ and Xeon CPUs. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=llc ++ ++ If unsure, say N. ++ ++config RQ_SMP ++ bool "Symmetric Multi-Processing" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by all physical CPUs unless they are on separate NUMA nodes. ++ As physical CPUs usually do not share resources, sharing the runqueue ++ will normally worsen throughput but improve latency. If you only ++ care about latency enable this. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smp ++ ++ If unsure, say N. ++ ++config RQ_ALL ++ bool "NUMA" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ regardless of the architecture configuration, including across NUMA ++ nodes. This can substantially decrease throughput in NUMA ++ configurations, but light NUMA designs will not be dramatically ++ affected. This option should only be chosen if latency is the prime ++ concern. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=all ++ ++ If unsure, say N. 
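/*
 * Illustrative aside, not part of the patch: the runqueue-sharing choice above
 * boils down to an integer level, 0 (no sharing) through 5 (one runqueue for
 * everything), wired up by the SHARERQ symbol just below and overridable at
 * boot with rqshare=<name>.  A userspace sketch of that name -> level mapping;
 * rqshare_level() is a made-up helper, and the real set_rqshare() further down
 * in MuQSS.c matches prefixes with strncmp() instead:
 */
#include <stdio.h>
#include <string.h>

static int rqshare_level(const char *name)
{
	static const char * const names[] = { "none", "smt", "mc", "llc", "smp", "all" };
	int i;

	for (i = 0; i < 6; i++)
		if (!strcmp(name, names[i]))
			return i;	/* 0..5, matching RQSHARE_NONE..RQSHARE_ALL */
	return -1;			/* unknown name: keep the compiled-in default */
}

int main(void)
{
	printf("rqshare=mc  -> level %d (the default)\n", rqshare_level("mc"));
	printf("rqshare=llc -> level %d\n", rqshare_level("llc"));
	return 0;
}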
++endchoice ++ ++config SHARERQ ++ int ++ default 0 if RQ_NONE ++ default 1 if RQ_SMT ++ default 2 if RQ_MC ++ default 3 if RQ_MC_LLC ++ default 4 if RQ_SMP ++ default 5 if RQ_ALL +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..89ed751ac4e4 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,8 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_100 if SCHED_MUQSS ++ default HZ_250_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -20,11 +21,18 @@ choice + config HZ_100 + bool "100 HZ" + help ++ 100 Hz is a suitable choice in combination with MuQSS which does ++ not rely on ticks for rescheduling interrupts, and is not Hz limited ++ for timeouts and sleeps from both the kernel and userspace. ++ This allows us to benefit from the lower overhead and higher ++ throughput of fewer timer ticks. ++ ++ Non-MuQSS kernels: + 100 Hz is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEF + bool "250 HZ" + help + 250 Hz is a good compromise choice allowing server performance +@@ -32,7 +40,10 @@ choice + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + +- config HZ_300 ++ 250 Hz is the default choice for the mainline scheduler but not ++ advantageous in combination with MuQSS. ++ ++ config HZ_300_NODEF + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance +@@ -40,7 +51,7 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + +- config HZ_1000 ++ config HZ_1000_NODEF + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +@@ -51,9 +62,9 @@ endchoice + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 +- default 300 if HZ_300 +- default 1000 if HZ_1000 ++ default 250 if HZ_250_NODEF ++ default 300 if HZ_300_NODEF ++ default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index deff97217496..883998dd0437 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -2,7 +2,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -18,7 +18,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + depends on !ARCH_NO_PREEMPT + help + This option reduces the latency of the kernel by adding more +@@ -33,7 +33,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). 
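/*
 * Illustrative aside, not part of the patch: the HZ_100 help above relies on
 * the schedule_msec_hrtimeout*() helpers this patch adds to sched.h; without
 * high resolution timers a millisecond timeout is first coarsened to jiffies,
 * so its real resolution is 1000/HZ ms.  Rough userspace arithmetic below;
 * to_jiffies() is a simplified stand-in for the kernel's msecs_to_jiffies()
 * (round-up only, and it assumes HZ divides 1000 evenly):
 */
#include <stdio.h>

static long to_jiffies(long ms, long hz)
{
	long ms_per_jiffy = 1000 / hz;

	return (ms + ms_per_jiffy - 1) / ms_per_jiffy;	/* round up like the kernel */
}

int main(void)
{
	long hz[] = { 100, 250, 1000 };
	int i;

	for (i = 0; i < 3; i++)
		printf("HZ=%4ld: a 5 ms sleep becomes %ld jiffy(ies), i.e. %ld ms\n",
		       hz[i], to_jiffies(5, hz[i]),
		       to_jiffies(5, hz[i]) * (1000 / hz[i]));
	return 0;
}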
+ + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +diff --git a/kernel/Makefile b/kernel/Makefile +index daad787fb795..9bb44fc4ef5b 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ + notifier.o ksysfs.o cred.o reboot.o \ +- async.o range.o smpboot.o ucount.o ++ async.o range.o smpboot.o ucount.o skip_list.o + + obj-$(CONFIG_MODULES) += kmod.o + obj-$(CONFIG_MULTIUSER) += groups.o +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index a46a50d67002..58043176b285 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig +index f92d9a687372..d17db0ff775f 100644 +--- a/kernel/irq/Kconfig ++++ b/kernel/irq/Kconfig +@@ -111,6 +111,23 @@ config GENERIC_IRQ_RESERVATION_MODE + config IRQ_FORCED_THREADING + bool + ++config FORCE_IRQ_THREADING ++ bool "Make IRQ threading compulsory" ++ depends on IRQ_FORCED_THREADING ++ default n ++ ---help--- ++ ++ Make IRQ threading mandatory for any IRQ handlers that support it ++ instead of being optional and requiring the threadirqs kernel ++ parameter. Instead they can be optionally disabled with the ++ nothreadirqs kernel parameter. ++ ++ Enabling this may make some architectures not boot with runqueue ++ sharing and MuQSS. ++ ++ Enable if you are building for a desktop or low latency system, ++ otherwise say N. 
++ + config SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ + ---help--- +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 1753486b440c..f43423737493 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -24,9 +24,20 @@ + #include "internals.h" + + #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) ++#ifdef CONFIG_FORCE_IRQ_THREADING ++__read_mostly bool force_irqthreads = true; ++#else + __read_mostly bool force_irqthreads; ++#endif + EXPORT_SYMBOL_GPL(force_irqthreads); + ++static int __init setup_noforced_irqthreads(char *arg) ++{ ++ force_irqthreads = false; ++ return 0; ++} ++early_param("nothreadirqs", setup_noforced_irqthreads); ++ + static int __init setup_forced_irqthreads(char *arg) + { + force_irqthreads = true; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index b262f47046ca..9797ad652268 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -433,6 +433,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ __do_set_cpus_allowed(p, cpumask_of(cpu)); ++ p->flags |= PF_NO_SETAFFINITY; ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++#else ++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) ++#endif ++ + /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). +@@ -454,7 +482,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + cpu); + if (IS_ERR(p)) + return p; +- kthread_bind(p, cpu); ++ new_kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. */ + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index cdf318d86dd6..304c0c8c2bea 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) + { + static char err_buf[STACK_ERR_BUF_SIZE]; + struct rq *rq; +- struct rq_flags flags; ++ struct rq_flags rf; + int ret; + bool success = false; + +@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) + * functions. If all goes well, switch the task to the target patch + * state. 
+ */ +- rq = task_rq_lock(task, &flags); ++ rq = task_rq_lock(task, &rf); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, +@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) + task->patch_state = klp_target_state; + + done: +- task_rq_unlock(rq, task, &flags); ++ task_rq_unlock(rq, task, &rf); + + /* + * Due to console deadlock issues, pr_debug() can't be used while +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..a04ffebc6b7a 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++ ++obj-$(CONFIG_SMP) += topology.o ++else + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle.o fair.o rt.o deadline.o + obj-y += wait.o wait_bit.o swait.o completion.o + + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +new file mode 100644 +index 000000000000..fafb5a790cf1 +--- /dev/null ++++ b/kernel/sched/MuQSS.c +@@ -0,0 +1,7606 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. 
++ * 2019-08-31 LLC bits by Eduards Bezverhijs ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "../workqueue_internal.h" ++#include "../smpboot.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#include "MuQSS.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) ++#define JIFFY_NS (APPROX_NS_PS / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.196 by Con Kolivas.\n"); ++} ++ ++/* Define RQ share levels */ ++#define RQSHARE_NONE 0 ++#define RQSHARE_SMT 1 ++#define RQSHARE_MC 2 ++#define RQSHARE_MC_LLC 3 ++#define RQSHARE_SMP 4 ++#define RQSHARE_ALL 5 ++ ++/* Define locality levels */ ++#define LOCALITY_SAME 0 ++#define LOCALITY_SMT 1 ++#define LOCALITY_MC_LLC 2 ++#define LOCALITY_MC 3 ++#define LOCALITY_SMP 4 ++#define LOCALITY_DISTANT 5 ++ ++/* ++ * This determines what level of runqueue sharing will be done and is ++ * configurable at boot time with the bootparam rqshare = ++ */ ++static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ ++ ++static int __init set_rqshare(char *str) ++{ ++ if (!strncmp(str, "none", 4)) { ++ rqshare = RQSHARE_NONE; ++ return 0; ++ } ++ if (!strncmp(str, "smt", 3)) { ++ rqshare = RQSHARE_SMT; ++ return 0; ++ } ++ if (!strncmp(str, "mc", 2)) { ++ rqshare = RQSHARE_MC; ++ return 0; ++ } ++ if (!strncmp(str, "llc", 3)) { ++ rqshare = RQSHARE_MC_LLC; ++ return 0; ++ } ++ if (!strncmp(str, "smp", 3)) { ++ rqshare = RQSHARE_SMP; ++ return 0; ++ } ++ if (!strncmp(str, "all", 3)) { ++ rqshare = RQSHARE_ALL; ++ return 0; ++ } ++ return 1; ++} ++__setup("rqshare=", set_rqshare); ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. ++ * Tunable via /proc interface. 
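/*
 * Illustrative aside, not part of the patch: the scale-conversion macros above
 * use power-of-two shifts (<<10 per factor of ~1000) instead of multiplying or
 * dividing by 1000, trading a few percent of error for cheaper arithmetic.
 * A quick userspace check of how far the approximation drifts, using the 6 ms
 * rr_interval default as the sample value:
 */
#include <stdio.h>

int main(void)
{
	long long ms = 6;			/* default rr_interval */
	long long approx_ns = ms << 20;		/* MS_TO_NS() style */
	long long exact_ns = ms * 1000000LL;

	printf("6 ms -> approx %lld ns vs exact %lld ns (%.1f%% high)\n",
	       approx_ns, exact_ns, 100.0 * (approx_ns - exact_ns) / exact_ns);
	printf("APPROX_NS_PS = %lld vs 1000000000 (%.1f%% high)\n",
	       1LL << 30, 100.0 * ((1LL << 30) - 1000000000LL) / 1000000000LL);
	return 0;
}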
++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifdef CONFIG_SMP ++/* ++ * Total number of runqueues. Equals number of CPUs when there is no runqueue ++ * sharing but is usually less with SMT/MC sharing of runqueues. ++ */ ++static int total_runqueues __read_mostly = 1; ++ ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++ ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. ++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static void update_irq_load_avg(struct rq *rq, long delta); ++#else ++static inline void update_irq_load_avg(struct rq *rq, long delta) {} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((&paravirt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if (irq_delta + steal) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta < 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. They're only used by ++ * update_load_avg and time_slice_expired, however deadlines are based on them ++ * across CPUs. Update them whenever we will call one of those functions, and ++ * synchronise them across CPUs whenever we hold both runqueue locks. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff, minndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - rq->last_jiffy; ++ ++ /* Subtract any niffies added by balancing with other rqs */ ++ ndiff -= rq->niffies - rq->last_niffy; ++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; ++ if (minndiff < 0) ++ minndiff = 0; ++ ndiff = max(ndiff, minndiff); ++ rq->niffies += ndiff; ++ rq->last_niffy = rq->niffies; ++ if (jdiff) { ++ rq->last_jiffy += jdiff; ++ rq->last_jiffy_niffies = rq->niffies; ++ } ++} ++ ++/* ++ * Any time we have two runqueues locked we use that as an opportunity to ++ * synchronise niffies to the highest value as idle ticks may have artificially ++ * kept niffies low on one CPU and the truth can only be later. ++ */ ++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) ++{ ++ if (rq1->niffies > rq2->niffies) ++ rq2->niffies = rq1->niffies; ++ else ++ rq1->niffies = rq2->niffies; ++} ++ ++/* ++ * double_rq_lock - safely lock two runqueues ++ * ++ * Note this does not disable interrupts like task_rq_lock, ++ * you need to do so manually before calling. ++ */ ++ ++/* For when we know rq1 != rq2 */ ++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ if (rq1 < rq2) { ++ raw_spin_lock(rq1->lock); ++ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); ++ } else { ++ raw_spin_lock(rq2->lock); ++ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); ++ } ++} ++ ++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ BUG_ON(!irqs_disabled()); ++ if (rq1->lock == rq2->lock) { ++ raw_spin_lock(rq1->lock); ++ __acquire(rq2->lock); /* Fake it out ;) */ ++ } else ++ __double_rq_lock(rq1, rq2); ++ synchronise_niffies(rq1, rq2); ++} ++ ++/* ++ * double_rq_unlock - safely unlock two runqueues ++ * ++ * Note this does not restore interrupts like task_rq_unlock, ++ * you need to do so manually after calling.
++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(rq1->lock); ++ if (rq1->lock != rq2->lock) ++ raw_spin_unlock(rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(rq->lock))) ++ return false; ++ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(&rq->lock->dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. 
++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. 
++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++/* Dodgy workaround till we figure out where the softirqs are going */ ++static inline void do_pending_softirq(struct rq *rq, struct task_struct *next) ++{ ++ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt())) ++ do_softirq_own_stack(); ++} ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies. ++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = rq_load(rq); ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ update_irq_load_avg(rq, 0); ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++/* ++ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds ++ * here so we scale curload to how long it's been since the last update. 
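The IRQ average below reuses the same decay constant. As a sanity check on the arithmetic, the userspace sketch that follows (decay_load() and the sample numbers are made up for the illustration) applies the load -= load * us_interval * 5 / 262144 update repeatedly: over a single ~32768 us interval the old value loses 5 * 32768 / 262144 = 62.5% of itself, which is where the "80/128 ~ 0.63" time-constant remark comes from, and with one runnable task the average creeps towards curload^2 * SCHED_CAPACITY_SCALE rather than jumping there.

#include <stdio.h>

#define SCALE 1024      /* stands in for SCHED_CAPACITY_SCALE */

static long decay_load(long avg, long curload, long us_interval)
{
    avg -= avg * us_interval * 5 / 262144;
    if (avg < 0)
        avg = 0;
    avg += curload * curload * SCALE * us_interval * 5 / 262144;
    return avg;
}

int main(void)
{
    long avg = 0;
    int i;

    /* One runnable task, sampled every 1024 us. */
    for (i = 1; i <= 64; i++) {
        avg = decay_load(avg, 1, 1024);
        if (i % 16 == 0)
            printf("after ~%2d ms: load_avg = %ld\n", i, avg);
    }
    return 0;
}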
++ */ ++static void update_irq_load_avg(struct rq *rq, long delta) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = NS_TO_US(delta) / us_interval; ++ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->irq_load_avg = load; ++ ++ rq->irq_load_update = rq->niffies; ++} ++#endif ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8. ++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node->next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) { ++ sched_info_dequeued(rq, p); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ } ++ rq->nr_running--; ++ if (rt_task(p)) ++ rq->rt_nr_running--; ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. ++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } else ++ rq->rt_nr_running++; ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. 
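The key layout described here is easy to see with a few sample tasks. The sketch below is a deliberately simplified stand-in (sl_key(), ISO_PRIO and LONGEST_DIFF are invented values, and the real code additionally tags prio == IDLE_PRIO entries with a high bit instead of only offsetting them), but it shows RT, normal and idleprio work landing in the intended order.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define ISO_PRIO        99          /* example boundary for FIFO-keyed tasks */
#define LONGEST_DIFF    5000000ULL  /* pretend longest_deadline_diff(), in ns */

static uint64_t sl_key(int prio, uint64_t deadline, bool idleprio)
{
    if (prio <= ISO_PRIO)
        return prio;                    /* RT/ISO: FIFO within a priority */
    if (idleprio)
        return deadline + LONGEST_DIFF; /* always behind normal tasks */
    return deadline;                    /* normal/batch: earliest deadline first */
}

int main(void)
{
    printf("RT prio 10             key %llu\n",
           (unsigned long long)sl_key(10, 123456789, false));
    printf("normal, deadline 2ms   key %llu\n",
           (unsigned long long)sl_key(120, 2000000, false));
    printf("normal, deadline 4ms   key %llu\n",
           (unsigned long long)sl_key(120, 4000000, false));
    printf("idleprio, deadline 2ms key %llu\n",
           (unsigned long long)sl_key(120, 2000000, true));
    return 0;
}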
++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion. ++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) { ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags & ENQUEUE_WAKEUP); ++ } ++ ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node->next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ rq->nr_running++; ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless. */ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. 
*/ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. ++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE_LLC (2) ++#define CPUIDLE_DIFF_CORE (4) ++#define CPUIDLE_CACHE_BUSY (8) ++#define CPUIDLE_DIFF_CPU (16) ++#define CPUIDLE_THREAD_BUSY (32) ++#define CPUIDLE_DIFF_NODE (64) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. 
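Because every locality defect only ORs in a penalty bit, choosing between candidates reduces to picking the smallest combined value, which is exactly what best_mask_cpu() below does. A toy version with a handful of invented candidates:

#include <stdio.h>

#define DIFF_THREAD     (1)
#define DIFF_CORE_LLC   (2)
#define DIFF_CORE       (4)
#define CACHE_BUSY      (8)
#define DIFF_CPU        (16)
#define THREAD_BUSY     (32)
#define DIFF_NODE       (64)

int main(void)
{
    struct { const char *what; int penalty; } cand[] = {
        { "SMT sibling of this CPU", DIFF_THREAD },
        { "other core, shared LLC",  DIFF_CORE_LLC },
        { "other core, busy cache",  DIFF_CORE | CACHE_BUSY },
        { "other CPU, same node",    DIFF_CPU },
        { "other NUMA node",         DIFF_NODE },
    };
    int i, n = sizeof(cand) / sizeof(cand[0]), best = 0;

    for (i = 0; i < n; i++) {
        printf("%-26s penalty %3d\n", cand[i].what, cand[i].penalty);
        if (cand[i].penalty < cand[best].penalty)
            best = i;
    }
    printf("chosen: %s\n", cand[best].what);
    return 0;
}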
++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > LOCALITY_SMP) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == LOCALITY_MC_LLC) ++ ranking |= CPUIDLE_DIFF_CORE_LLC; ++ else if (locality == LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CORE; ++ if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == LOCALITY_SMT) ++ ranking |= CPUIDLE_DIFF_THREAD; ++#endif ++ if (ranking < best_ranking ++#ifdef CONFIG_SCHED_SMT ++ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) ++#endif ++ ) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); ++} ++ ++/* As per resched_curr but only will resched idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t tmpmask; ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(&tmpmask, p->cpus_ptr, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. 
This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ ++ enqueue_task(rq, p, flags); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - If it's running, it's not on the runqueue and we can just ++ * decrement the nr_running. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ ++ psi_dequeue(p, DEQUEUE_SLEEP); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU. */ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, new_cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. */ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off the runqueue and take it to a cpu for it will ++ * become the running task. 
++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved with as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ struct rq_flags rf; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_rq_lock(p, &rf); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &rf); ++ ++ /* ++ * If it changed from the expected state, bail out now. 
++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ if (!(p->flags & PF_KTHREAD)) ++ return false; ++ ++ if (p->nr_cpus_allowed != 1) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. This only happens with the hotplug threads that ++ * bring up the CPUs. 
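For reference, the preemption policy encoded in can_preempt() above can be restated as a few lines of standalone C. The enum, sample priorities and deadlines below are invented, and SCHED_IDLEPRIO is left out because try_preempt() filters it before the question is ever asked.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

enum policy { POL_NORMAL, POL_BATCH };

static bool may_preempt(int p_prio, enum policy p_pol, uint64_t p_deadline,
                        int rq_prio, uint64_t rq_deadline)
{
    if (p_prio < rq_prio)       /* better static priority or policy */
        return true;
    if (p_prio > rq_prio)
        return false;
    if (p_pol == POL_BATCH)     /* batch never preempts at its own prio */
        return false;
    return p_deadline < rq_deadline;    /* deadline_before() */
}

int main(void)
{
    /* Same priority, earlier virtual deadline: the waking task preempts. */
    printf("%d\n", may_preempt(120, POL_NORMAL, 1000, 120, 2000));
    /* Same priority but SCHED_BATCH: it queues instead. */
    printf("%d\n", may_preempt(120, POL_BATCH, 1000, 120, 2000));
    /* Better priority preempts regardless of deadline. */
    printf("%d\n", may_preempt(100, POL_NORMAL, 9999, 120, 1000));
    return 0;
}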
++ */ ++static inline bool sched_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) ++ return false; ++ if (p->nr_cpus_allowed == 1) { ++ cpumask_t valid_mask; ++ ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); ++ if (unlikely(cpumask_empty(&valid_mask))) ++ return false; ++ } ++ return true; ++} ++ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ return true; ++} ++ ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *rq = this_rq->cpu_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) { ++ __schedstat_inc(rq->ttwu_local); ++ } else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(rq->cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ __schedstat_inc(sd->ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. 
++ */ ++ if (wake_flags & WF_SYNC) ++ resched_suitable_idle(p); ++ else ++ try_preempt(p, rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ int en_flags = ENQUEUE_WAKEUP; ++ ++ lockdep_assert_held(rq->lock); ++ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++ if (wake_flags & WF_MIGRATED) ++ en_flags |= ENQUEUE_MIGRATED; ++#endif ++ ++ activate_task(rq, p, en_flags); ++ ttwu_do_wakeup(rq, p, wake_flags); ++} ++ ++/* ++ * Called in case the task @p isn't fully descheduled from its runqueue, ++ * in this case we must do a remote wakeup. Its a 'light' wakeup though, ++ * since all we need to do is flip p->state to TASK_RUNNING, since ++ * the task is still ->on_rq. ++ */ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = __task_rq_lock(p, NULL); ++ if (likely(task_on_rq_queued(p))) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_rq_unlock(rq, NULL); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void) ++{ ++ struct rq *rq = this_rq(); ++ struct llist_node *llist = llist_del_all(&rq->wake_list); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry) ++ ttwu_do_activate(rq, p, 0); ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) ++ return; ++ ++ /* ++ * Not all reschedule IPI handlers call irq_enter/irq_exit, since ++ * traditionally all their work was done from the interrupt return ++ * path. Now that we actually do some work, we need to make sure ++ * we do call them. ++ * ++ * Some archs already do call them, luckily irq_enter/exit nest ++ * properly. ++ * ++ * Arguably we should visit all archs and update all handlers, ++ * however a fair share of IPIs are still resched only so this would ++ * somewhat pessimize the simple resched case. 
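sched_ttwu_pending() above drains a list that remote wakers push onto locklessly, and ttwu_queue_remote() below only sends the IPI handled by scheduler_ipi() when its push finds that list empty. A compact C11 model of that mailbox pattern, with simplified types rather than the kernel's llist API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct wake_entry { struct wake_entry *next; int pid; };

static _Atomic(struct wake_entry *) wake_list;

/* Returns true if the list was empty, i.e. the target CPU must be kicked. */
static bool wake_list_add(struct wake_entry *e)
{
    struct wake_entry *first = atomic_load(&wake_list);

    do {
        e->next = first;
    } while (!atomic_compare_exchange_weak(&wake_list, &first, e));
    return first == NULL;
}

/* Runs on the target CPU: detach the whole list in one go and drain it. */
static void wake_list_flush(void)
{
    struct wake_entry *e = atomic_exchange(&wake_list, (struct wake_entry *)NULL);

    while (e) {
        printf("activate pid %d\n", e->pid);    /* ttwu_do_activate() here */
        e = e->next;
    }
}

int main(void)
{
    struct wake_entry a = { .pid = 101 }, b = { .pid = 202 };

    if (wake_list_add(&a))
        printf("send IPI\n");   /* first entry: kick the remote CPU */
    if (wake_list_add(&b))
        printf("send IPI\n");   /* never printed: a wakeup is already pending */
    wake_list_flush();
    return 0;
}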
++ */ ++ irq_enter(); ++ sched_ttwu_pending(); ++ irq_exit(); ++} ++ ++static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { ++ if (!set_nr_if_polling(rq->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++ } ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ rq_lock_irqsave(rq, &rf); ++ if (likely(is_idle_task(rq->curr))) ++ smp_sched_reschedule(cpu); ++ /* Else cpu is not in idle, do nothing here */ ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ cpumask_t valid_mask; ++ ++ if (p->flags & PF_KTHREAD) ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); ++ else ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); ++ ++ if (unlikely(!cpumask_weight(&valid_mask))) { ++ /* We shouldn't be hitting this any more */ ++ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, ++ p->pid, cpumask_weight(p->cpus_ptr)); ++ return cpumask_any(p->cpus_ptr); ++ } ++ return cpumask_any(&valid_mask); ++} ++ ++/* ++ * For a task that's just being woken up we have a valuable balancing ++ * opportunity so choose the nearest cache most lightly loaded runqueue. ++ * Entered with rq locked and returns with the chosen runqueue locked. ++ */ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ unsigned int idlest = ~0U; ++ struct rq *rq = NULL; ++ int i; ++ ++ if (suitable_idle_cpus(p)) { ++ int cpu = task_cpu(p); ++ ++ if (unlikely(needs_other_cpu(p, cpu))) ++ cpu = valid_task_cpu(p); ++ rq = resched_best_idle(p, cpu); ++ if (likely(rq)) ++ return rq->cpu; ++ } ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *other_rq = task_rq(p)->cpu_order[i]; ++ int entries; ++ ++ if (!other_rq->online) ++ continue; ++ if (needs_other_cpu(p, other_rq->cpu)) ++ continue; ++ entries = rq_load(other_rq); ++ if (entries >= idlest) ++ continue; ++ idlest = entries; ++ rq = other_rq; ++ } ++ if (unlikely(!rq)) ++ return task_cpu(p); ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ return NULL; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++#if defined(CONFIG_SMP) ++ if (!cpus_share_cache(smp_processor_id(), cpu)) { ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ ttwu_queue_remote(p, cpu, wake_flags); ++ return; ++ } ++#endif ++ rq_lock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ rq_unlock(rq); ++} ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. 
The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ cpu = task_cpu(p); ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. 
++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ cpu = select_best_cpu(p); ++ if (task_cpu(p) != cpu) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++ ++#else /* CONFIG_SMP */ ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, cpu, wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p, struct rq *rq); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * The process state is set to the same value of the process executing ++ * do_fork() code. That is running. This guarantees that nobody will ++ * actually run it, and a signal or other external event cannot wake ++ * it up and insert it on the runqueue either. ++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. 
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. 
Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. 
++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(new_rq, p, 0); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
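The release/acquire pairing this comment describes can be reproduced with plain C11 atomics. In the sketch below (thread and variable names invented), switch_out() plays the CPU that publishes prev's final state with a release store of on_cpu = 0, and waker() plays the smp_cond_load_acquire() spin in try_to_wake_up(); once the waker sees on_cpu == 0 it is guaranteed to also see every store made before the release.

#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>

static _Atomic int on_cpu = 1;
static long saved_state;    /* stands in for prev->state and friends */

static void *switch_out(void *arg)
{
    (void)arg;
    saved_state = 42;       /* plain store, published by the release below */
    atomic_store_explicit(&on_cpu, 0, memory_order_release);
    return NULL;
}

static void *waker(void *arg)
{
    (void)arg;
    /* smp_cond_load_acquire(&p->on_cpu, !VAL) */
    while (atomic_load_explicit(&on_cpu, memory_order_acquire))
        ;
    printf("saw saved_state=%ld after on_cpu hit 0\n", saved_state);
    return NULL;
}

int main(void)
{
    pthread_t a, b;

    pthread_create(&a, NULL, waker, NULL);
    pthread_create(&b, NULL, switch_out, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}

Build with -pthread; the point is only the ordering guarantee, not the busy wait.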
++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock->dep_map, 1, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock->owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If prev was marked as migrating to another CPU in return_task, drop ++ * the local runqueue lock but leave interrupts disabled and grab the ++ * remote lock we're migrating it to before enabling them. ++ */ ++ if (unlikely(task_on_rq_migrating(prev))) { ++ sched_info_dequeued(rq, prev); ++ /* ++ * We move the ownership of prev to the new cpu now. ttwu can't ++ * activate prev to the wrong cpu since it has to grab this ++ * runqueue in ttwu_remote. ++ */ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ prev->cpu = prev->wake_cpu; ++#else ++ task_thread_info(prev)->cpu = prev->wake_cpu; ++#endif ++ raw_spin_unlock(rq->lock); ++ ++ raw_spin_lock(&prev->pi_lock); ++ rq = __task_rq_lock(prev, NULL); ++ /* Check that someone else hasn't already queued prev */ ++ if (likely(!task_queued(prev))) { ++ enqueue_task(rq, prev, 0); ++ prev->on_rq = TASK_ON_RQ_QUEUED; ++ /* Wake up the CPU if it's not already running */ ++ resched_if_idle(rq); ++ } ++ raw_spin_unlock(&prev->pi_lock); ++ } ++#endif ++ rq_unlock(rq); ++ ++ do_pending_softirq(rq, current); ++ ++ local_irq_enable(); ++} ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). 
If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static void finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq, prev); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++{ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). 
++ */ ++ ++ finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++static unsigned long nr_uninterruptible(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_uninterruptible; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ if (rq_load(raw_rq()) == 1) ++ return true; ++ else ++ return false; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int cpu; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += cpu_rq(cpu)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpufreq menu ++ * governor are using nonsensical data. Boosting frequency for a CPU that has ++ * IO-wait which might not even end up running the task when it does become ++ * runnable. 
++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long cpu, sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += nr_iowait_cpu(cpu); ++ ++ return sum; ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(unsigned long ticks) ++{ ++ long active; ++ ++ if (time_before(jiffies, READ_ONCE(calc_load_update))) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++/** ++ * fixed_power_int - compute: x^n, in O(log n) time ++ * ++ * @x: base of the power ++ * @frac_bits: fractional bits of @x ++ * @n: power to raise @x to. ++ * ++ * By exploiting the relation between the definition of the natural power ++ * function: x^n := x*x*...*x (x multiplied by itself for n times), and ++ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, ++ * (where: n_i \elem {0, 1}, the binary vector representing n), ++ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is ++ * of course trivially computable in O(log_2 n), the length of our binary ++ * vector. 
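++ *
++ * As a quick illustration, n = 5 (binary 101) proceeds as:
++ *
++ *   bit 0 (set):    result = x,        then x -> x^2
++ *   bit 1 (clear):  result unchanged,  then x -> x^4
++ *   bit 2 (set):    result = x * x^4 = x^5
++ *
++ * The loop consumes n bit by bit, multiplying the current power of x into
++ * the result for each set bit and squaring x as it moves on, with every
++ * intermediate rounded back to frac_bits of fixed-point precision.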
++ */ ++static unsigned long ++fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) ++{ ++ unsigned long result = 1UL << frac_bits; ++ ++ if (n) { ++ for (;;) { ++ if (n & 1) { ++ result *= x; ++ result += 1UL << (frac_bits - 1); ++ result >>= frac_bits; ++ } ++ n >>= 1; ++ if (!n) ++ break; ++ x *= x; ++ x += 1UL << (frac_bits - 1); ++ x >>= frac_bits; ++ } ++ } ++ ++ return result; ++} ++ ++/* ++ * a1 = a0 * e + a * (1 - e) ++ * ++ * a2 = a1 * e + a * (1 - e) ++ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) ++ * = a0 * e^2 + a * (1 - e) * (1 + e) ++ * ++ * a3 = a2 * e + a * (1 - e) ++ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) ++ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) ++ * ++ * ... ++ * ++ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] ++ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) ++ * = a0 * e^n + a * (1 - e^n) ++ * ++ * [1] application of the geometric series: ++ * ++ * n 1 - x^(n+1) ++ * S_n := \Sum x^i = ------------- ++ * i=0 1 - x ++ */ ++unsigned long ++calc_load_n(unsigned long load, unsigned long exp, ++ unsigned long active, unsigned int n) ++{ ++ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++/* ++ * On each tick, add the number of nanoseconds to the unbanked variables and ++ * once one tick's worth has accumulated, account it allowing for accurate ++ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we ++ * deduct nanoseconds. 
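++ *
++ * Concretely, each of the *_ns accumulators below only ever pays out whole
++ * jiffies: NS_TO_JIFFIES() converts the banked nanoseconds into ticks, each
++ * tick is charged to the matching cpustat bucket as TICK_APPROX_NS, and the
++ * sub-jiffy remainder (the "% JIFFY_NS") stays banked for the next update.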
++ */ ++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_ns += ns; ++ if (rq->iowait_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->iowait_ns); ++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->iowait_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->idle_ns += ns; ++ if (rq->idle_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->idle_ns); ++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->idle_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(idle); ++} ++ ++static void pc_system_time(struct rq *rq, struct task_struct *p, ++ int hardirq_offset, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->stime_ns += ns; ++ if (p->stime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->stime_ns); ++ p->stime_ns %= JIFFY_NS; ++ p->stime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_system_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_ns += ns; ++ if (rq->irq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->irq_ns); ++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->irq_ns %= JIFFY_NS; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->system_ns += ns; ++ if (rq->system_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->system_ns); ++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->system_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->utime_ns += ns; ++ if (p->utime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->utime_ns); ++ p->utime_ns %= JIFFY_NS; ++ p->utime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_user_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. ++ */ ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } ++ ++ if (task_nice(p) > 0 || idleprio_task(p)) { ++ rq->nice_ns += ns; ++ if (rq->nice_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->nice_ns); ++ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->nice_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->user_ns += ns; ++ if (rq->user_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->user_ns); ++ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->user_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++/* ++ * This is called on clock ticks. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. 
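++ * The microseconds are the NS_TO_US() conversion applied to time_slice below
++ * (to avoid 32bit overflow); the cpustat and task time banking above stays
++ * in nanoseconds.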
++ */ ++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate tick timekeeping */ ++ if (user_mode(get_irq_regs())) ++ pc_user_time(rq, p, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { ++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); ++ } else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++ ++ p->last_ran = rq->niffies; ++} ++ ++/* ++ * This is called on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate subtick timekeeping */ ++ if (p != idle) ++ pc_user_time(rq, p, account_ns); ++ else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_rq_lock(p) held. ++ */ ++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_clocks(rq); ++ ns = rq->niffies - p->last_ran; ++ } ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ * ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimisation chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_rq_lock(p, &rf); ++ ns = p->sched_time + do_task_delta_exec(p, rq); ++ task_rq_unlock(rq, p, &rf); ++ ++ return ns; ++} ++ ++/* ++ * Functions to test for when SCHED_ISO tasks have used their allocated ++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All ++ * data is modified only by the local runqueue during scheduler_tick with ++ * interrupts disabled. ++ */ ++ ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a ++ * slow division. 
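++ *
++ * iso_ticks behaves roughly as a decaying percentage: every ISO/RT tick adds
++ * 100 while each tick decays the total by 1/ISO_PERIOD, so the refractory
++ * flag goes up once the recent ISO share exceeds sched_iso_cpu percent and
++ * is only cleared again once the share drops below about 90% of that
++ * threshold.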
++ */ ++static inline void iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++ rq->iso_ticks += 100; ++ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { ++ rq->iso_refractory = true; ++ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) ++ rq->iso_ticks = ISO_PERIOD * 100; ++ } ++} ++ ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq, int ticks) ++{ ++ if (rq->iso_ticks > 0 || rq->iso_refractory) { ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; ++ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { ++ rq->iso_refractory = false; ++ if (unlikely(rq->iso_ticks < 0)) ++ rq->iso_ticks = 0; ++ } ++ } ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if (rt_task(p) || task_running_iso(p)) ++ iso_tick(rq); ++ else ++ no_iso_tick(rq, 1); ++ ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->policy == SCHED_FIFO) ++ return; ++ ++ if (iso_task(p)) { ++ if (task_running_iso(p)) { ++ if (rq->iso_refractory) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Force it to reschedule as ++ * SCHED_NORMAL by zeroing its time_slice ++ */ ++ p->time_slice = 0; ++ } ++ } else if (!rq->iso_refractory) { ++ /* Can now run again ISO. Reschedule to pick up prio */ ++ goto out_resched; ++ } ++ } ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ * Dither is used as a backup for when hrexpiry is disabled or high res ++ * timers not configured in. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++out_resched: ++ rq_lock(rq); ++ __set_tsk_resched(p); ++ rq_unlock(rq); ++} ++ ++static inline void task_tick(struct rq *rq) ++{ ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else if (rq->last_jiffy > rq->last_scheduler_tick) ++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * We can stop the timer tick any time highres timers are active since ++ * we rely entirely on highres timeouts for task expiry rescheduling. ++ */ ++static void sched_stop_tick(struct rq *rq, int cpu) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (!tick_nohz_full_enabled()) ++ return; ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. 
*/ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ rq_lock_irq(rq); ++ curr = rq->curr; ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ task_tick(rq); ++ ++out_unlock: ++ rq_unlock_irq(rq, NULL); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ /* There cannot be competing actions, but don't rely on stop-machine. */ ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); ++ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); ++ /* Don't cancel, as this would mess up the state machine. 
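++ * Instead, the work item itself notices TICK_SCHED_REMOTE_OFFLINING on its
++ * next run, steps the state down to OFFLINE via the atomic_fetch_add_unless()
++ * in sched_tick_remote() and does not requeue itself.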
*/ ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_stop_tick(struct rq *rq, int cpu) {} ++static inline void sched_start_tick(struct rq *rq, int cpu) {} ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ task_tick(rq); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ psi_task_tick(rq); ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. Make sure update_clocks has been called recently to update ++ * rq->niffies. 
++ */ ++static void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = rq->niffies + task_deadline_diff(p); ++#ifdef CONFIG_SMT_NICE ++ if (!p->mm) ++ p->smt_bias = 0; ++ else if (rt_task(p)) ++ p->smt_bias = 1 << 30; ++ else if (task_running_iso(p)) ++ p->smt_bias = 1 << 29; ++ else if (idleprio_task(p)) { ++ if (task_running_idle(p)) ++ p->smt_bias = 0; ++ else ++ p->smt_bias = 1; ++ } else if (--p->smt_bias < 1) ++ p->smt_bias = MAX_PRIO - p->static_prio; ++#endif ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p, rq); ++} ++ ++/* ++ * Task selection with skiplists is a simple matter of picking off the first ++ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) ++ * being bound to the number of processors. ++ * ++ * Runqueues are selectively locked based on their unlocked data and then ++ * unlocked if not needed. At most 3 locks will be held at any time and are ++ * released as soon as they're no longer needed. All balancing between CPUs ++ * is thus done here in an extremely simple first come best fit manner. ++ * ++ * This iterates over runqueues in cache locality order. In interactive mode ++ * it iterates over all CPUs and finds the task with the best key/deadline. ++ * In non-interactive mode it will only take a task if it's from the current ++ * runqueue or a runqueue with more tasks than the current one with a better ++ * key/deadline. ++ */ ++#ifdef CONFIG_SMP ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct rq *locked = NULL, *chosen = NULL; ++ struct task_struct *edt = idle; ++ int i, best_entries = 0; ++ u64 best_key = ~0ULL; ++ ++ for (i = 0; i < total_runqueues; i++) { ++ struct rq *other_rq = rq_order(rq, i); ++ skiplist_node *next; ++ int entries; ++ ++ entries = other_rq->sl->entries; ++ /* ++ * Check for queued entres lockless first. The local runqueue ++ * is locked so entries will always be accurate. ++ */ ++ if (!sched_interactive) { ++ /* ++ * Don't reschedule balance across nodes unless the CPU ++ * is idle. ++ */ ++ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) ++ break; ++ if (entries <= best_entries) ++ continue; ++ } else if (!entries) ++ continue; ++ ++ /* if (i) implies other_rq != rq */ ++ if (i) { ++ /* Check for best id queued lockless first */ ++ if (other_rq->best_key >= best_key) ++ continue; ++ ++ if (unlikely(!trylock_rq(rq, other_rq))) ++ continue; ++ ++ /* Need to reevaluate entries after locking */ ++ entries = other_rq->sl->entries; ++ if (unlikely(!entries)) { ++ unlock_rq(other_rq); ++ continue; ++ } ++ } ++ ++ next = other_rq->node; ++ /* ++ * In interactive mode we check beyond the best entry on other ++ * runqueues if we can't get the best for smt or affinity ++ * reasons. 
++ */ ++ while ((next = next->next[0]) != other_rq->node) { ++ struct task_struct *p; ++ u64 key = next->key; ++ ++ /* Reevaluate key after locking */ ++ if (key >= best_key) ++ break; ++ ++ p = next->value; ++ if (!smt_schedule(p, rq)) { ++ if (i && !sched_interactive) ++ break; ++ continue; ++ } ++ ++ if (sched_other_cpu(p, cpu)) { ++ if (sched_interactive || !i) ++ continue; ++ break; ++ } ++ /* Make sure affinity is ok */ ++ if (i) { ++ /* From this point on p is the best so far */ ++ if (locked) ++ unlock_rq(locked); ++ chosen = locked = other_rq; ++ } ++ best_entries = entries; ++ best_key = key; ++ edt = p; ++ break; ++ } ++ /* rq->preempting is a hint only as the state may have changed ++ * since it was set with the resched call but if we have met ++ * the condition we can break out here. */ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node->next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. 
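++ *
++ * set_rq_task() below snapshots the incoming task's deadline, prio and
++ * (under CONFIG_SMT_NICE) its mm and smt_bias into the runqueue, and arms or
++ * clears the hrexpiry timer depending on whether the task can actually run
++ * out of its time slice.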
++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. */ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! 
++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev, preempt); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. ++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq, NULL); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate = true; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. ++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev == idle) { ++ rq->nr_running++; ++ if (rt_task(next)) ++ rq->rt_nr_running++; ++ } else if (!deactivate) ++ resched_suitable_idle(prev); ++ if (unlikely(next == idle)) { ++ rq->nr_running--; ++ if (rt_task(prev)) ++ rq->rt_nr_running--; ++ wake_siblings(rq); ++ } else ++ check_siblings(rq); ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ trace_sched_switch(preempt, prev, next); ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ } else { ++ check_siblings(rq); ++ rq_unlock(rq); ++ do_pending_softirq(rq, next); ++ local_irq_enable(); ++ } ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(). */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & PF_WQ_WORKER) { ++ preempt_disable(); ++ wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static inline void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++ ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != IN_USER, but that will trigger ++ * too frequently to make sense yet. 
++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. 
++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio, oldprio; ++ struct rq *rq; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_rq_lock(p, NULL); ++ update_rq_clock(rq); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. 
++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ oldprio = p->prio; ++ p->prio = prio; ++ if (task_running(rq, p)){ ++ if (prio > oldprio) ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (prio < oldprio) ++ try_preempt(p, rq); ++ } ++out_unlock: ++ __task_rq_unlock(rq, NULL); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static, old_static; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_rq_unlock(rq, p, &rf); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. 
Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * available_idle_cpu - is a given CPU idle for enqueuing work. ++ * @cpu: the CPU in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int available_idle_cpu(int cpu) ++{ ++ if (!idle_cpu(cpu)) ++ return 0; ++ ++ if (vcpu_is_preempted(cpu)) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, const struct sched_attr *attr, ++ bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ /* ++ * If params can't change scheduling class changes aren't allowed ++ * either. ++ */ ++ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) ++ return; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; ++ unsigned long rlim_rtprio = 0; ++ struct rq_flags rf; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ priority = 0; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. 
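++ * The same applies to SCHED_ISO and SCHED_IDLEPRIO: they are not realtime
++ * policies, so the is_rt_policy() check below only accepts a priority of 0
++ * for them.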
++ */ ++ if (priority < 0 || ++ (p->mm && priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (priority > p->rt_priority && ++ priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy != SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag: */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ * ++ * To be able to change p->policy safely, the runqueue lock must be ++ * held. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea: ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ priority == p->rt_priority))) { ++ retval = 0; ++ goto unlock; ++ } ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ __setscheduler(p, rq, policy, priority, attr, pi); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ task_rq_unlock(rq, p, &rf); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ preempt_enable(); ++out: ++ return 0; ++ ++unlock: ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. 
++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) ++ attr.sched_policy = SETPARAM_POLICY; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setattr(p, &attr); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. 
++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. 
Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min(len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. 
++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ return; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ rq_unlock(rq); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ struct task_struct *rq_p; ++ struct rq *rq, *p_rq; ++ unsigned long flags; ++ int yielded = 0; ++ ++ local_irq_save(flags); ++ rq = this_rq(); ++ ++again: ++ p_rq = task_rq(p); ++ /* ++ * If we're the only runnable task on the rq and target rq also ++ * has only one task, there's absolutely no point in yielding. 
++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ struct rq_flags rf; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &rf); ++ time_slice = p->policy == SCHED_FIFO ? 
0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &rf); ++ ++ rcu_read_unlock(); ++ *t = ns_to_timespec64(time_slice); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. 
++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. ++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p, NULL); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq, NULL); ++ } ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! 
*/ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(rq); ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++void nohz_balance_enter_idle(int cpu) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct sched_domain *sd; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu(i, sched_domain_span(sd)) { ++ if (cpu == i) ++ continue; ++ ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { ++ cpu = i; ++ cpu = i; ++ goto unlock; ++ } ++ } ++ } ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. 
*/ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Wake up the specified CPU. If the CPU is going offline, it is the ++ * caller's responsibility to deal with the lost wakeup, for example, ++ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. ++ */ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ bool queued = false, running_wrong = false, kthread; ++ struct cpumask old_mask; ++ unsigned int dest_cpu; ++ struct rq_flags rf; ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ kthread = !!(p->flags & PF_KTHREAD); ++ if (kthread) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ cpumask_copy(&old_mask, p->cpus_ptr); ++ if (cpumask_equal(&old_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (kthread) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(rq, p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ set_task_cpu(p, dest_cpu); ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else { ++ if (queued) { ++ /* ++ * Switch runqueue locks after dequeueing the task ++ * here while still holding the pi_lock to be holding ++ * the correct lock for enqueueing. 
++ */ ++ dequeue_task(rq, p, 0); ++ rq_unlock(rq); ++ ++ rq = cpu_rq(dest_cpu); ++ rq_lock(rq); ++ } ++ set_task_cpu(p, dest_cpu); ++ if (queued) ++ enqueue_task(rq, p, 0); ++ } ++ if (queued) ++ try_preempt(p, rq); ++ if (running_wrong) ++ preempt_disable(); ++out: ++ task_rq_unlock(rq, p, &rf); ++ ++ if (running_wrong) { ++ __schedule(true); ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Run through task list and find tasks affined to the dead cpu, then remove ++ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold ++ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and ++ * pi_lock to change cpus_mask but it's not going to matter here. ++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_mask); ++ atomic_set_cpu(0, &p->cpus_mask); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. */ ++static void unbind_zero(int src_cpu) ++{ ++ int unbound = 0, zerobound = 0; ++ struct task_struct *p, *t; ++ ++ if (src_cpu == 0) ++ return; ++ ++ do_each_thread(t, p) { ++ if (!p->mm) ++ p->zerobound = false; ++ if (p->zerobound) { ++ unbound++; ++ cpumask_set_cpu(src_cpu, &p->cpus_mask); ++ /* Once every CPU affinity has been re-enabled, remove ++ * the zerobound flag */ ++ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { ++ p->zerobound = false; ++ zerobound++; ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (unbound) { ++ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", ++ unbound, src_cpu); ++ } ++ if (zerobound) { ++ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", ++ zerobound); ++ } ++} ++ ++/* ++ * Ensure that the idle task is using init_mm right before its cpu goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++static void unbind_zero(int src_cpu) {} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. 
++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(9); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[8] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header 
*sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ ++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &rf); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_ttwu_pending(); ++ sched_tick_stop(cpu); ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. 
++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++/* MC siblings CPU mask which share the same LLC */ ++static const cpumask_t *llc_core_cpumask(int cpu) ++{ ++ return per_cpu(cpu_llc_shared_map, cpu); ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++void __init sched_init_smp(void) ++{ ++ struct rq *rq, *other_rq, *leader = cpu_rq(0); ++ struct sched_domain *sd; ++ int cpu, other_cpu, i; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. ++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ local_irq_disable(); ++ mutex_lock(&sched_domains_mutex); ++ lock_all_rqs(); ++ ++ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", ++ num_possible_cpus(), num_present_cpus(), num_online_cpus()); ++ ++ /* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. ++ */ ++ for (cpu = num_online_cpus() - 1; cpu >= 0; cpu--) { ++ rq = cpu_rq(cpu); ++ leader = NULL; ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ if (rqshare != RQSHARE_ALL) ++ leader = NULL; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rqshare >= RQSHARE_SMP) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smp_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smp_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMP; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. 
++ */ ++#ifdef CONFIG_SCHED_MC ++ leader = NULL; ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rqshare == RQSHARE_MC || ++ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the mc_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->mc_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { ++ /* this is to get LLC into play even in case LLC sharing is not used */ ++ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) ++ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; ++ else ++ rq->cpu_locality[other_cpu] = LOCALITY_MC; ++ } ++ } ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ leader = NULL; ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) { ++ if (rqshare == RQSHARE_SMT) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smt_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smt_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMT; ++ } ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ unlock_all_rqs(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for_each_online_cpu(other_cpu) { ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smp_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++ ++#ifdef CONFIG_SCHED_MC ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->mc_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_MC */ ++ ++#ifdef CONFIG_SCHED_SMT ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ leader = rq->smt_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_SMT */ ++ ++ 
local_irq_enable(); ++ ++ total_runqueues = 0; ++ for_each_online_cpu(cpu) { ++ int locality, total_rqs = 0, total_cpus = 0; ++ ++ rq = cpu_rq(cpu); ++ if ( ++#ifdef CONFIG_SCHED_MC ++ (rq->mc_leader == rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (rq->smt_leader == rq) && ++#endif ++ (rq->smp_leader == rq)) { ++ total_runqueues++; ++ } ++ ++ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { ++ int selected_cpus[NR_CPUS], selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; ++ int ordered_cpus[NR_CPUS], ordered_cpus_idx; ++ ++ ordered_cpus_idx = -1; ++ selected_cpu_cnt = 0; ++ ++ for_each_online_cpu(test_cpu) { ++ if (cpu < num_online_cpus() / 2) ++ other_cpu = cpu + test_cpu; ++ else ++ other_cpu = cpu - test_cpu; ++ if (other_cpu < 0) ++ other_cpu += num_online_cpus(); ++ else ++ other_cpu %= num_online_cpus(); ++ /* gather CPUs of the same locality */ ++ if (rq->cpu_locality[other_cpu] == locality) { ++ selected_cpus[selected_cpu_cnt] = other_cpu; ++ selected_cpu_cnt++; ++ } ++ } ++ ++ /* reserve first CPU as starting point */ ++ if (selected_cpu_cnt > 0) { ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; ++ selected_cpus[ordered_cpus_idx] = -1; ++ } ++ ++ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ ++ for(test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { ++ /* starting point with worst locality and current CPU */ ++ best_locality = LOCALITY_DISTANT; ++ selected_cpu_idx = test_cpu_idx; ++ ++ /* try to find the best locality within group */ ++ for(cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { ++ /* if CPU has not been used and locality is better */ ++ if (selected_cpus[cpu_idx] > -1) { ++ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); ++ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { ++ /* assign best locality and best CPU idx in array */ ++ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; ++ selected_cpu_idx = cpu_idx; ++ } ++ } ++ } ++ ++ /* add our next best CPU to ordered list */ ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; ++ /* mark this CPU as used */ ++ selected_cpus[selected_cpu_idx] = -1; ++ } ++ ++ /* set up RQ and CPU orders */ ++ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { ++ other_rq = cpu_rq(ordered_cpus[test_cpu]); ++ /* set up cpu orders */ ++ rq->cpu_order[total_cpus++] = other_rq; ++ if ( ++#ifdef CONFIG_SCHED_MC ++ (other_rq->mc_leader == other_rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (other_rq->smt_leader == other_rq) && ++#endif ++ (other_rq->smp_leader == other_rq)) { ++ /* set up RQ orders */ ++ rq->rq_order[total_rqs++] = other_rq; ++ } ++ } ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < total_runqueues; i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < num_online_cpus(); i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); ++ } ++ } ++ ++ switch (rqshare) { ++ case RQSHARE_ALL: ++ /* This should only ever read 1 */ ++ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", ++ 
total_runqueues); ++ break; ++ case RQSHARE_SMP: ++ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC: ++ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC_LLC: ++ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMT: ++ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_NONE: ++ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", ++ total_runqueues); ++ break; ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); ++ skiplist_init(rq->node); ++ rq->sl = new_skiplist(rq->node); ++ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); ++ raw_spin_lock_init(rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->smp_leader = rq; ++#ifdef CONFIG_SCHED_MC ++ rq->mc_leader = rq; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ rq->smt_leader = rq; ++#endif ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. 
++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = LOCALITY_SAME; ++ else ++ rq->cpu_locality[j] = LOCALITY_DISTANT; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq->cpu_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR 
++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct sched_attr attr = {}; ++ struct task_struct *g, *p; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &rf); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); ++ task_rq_unlock(rq, p, &rf); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. 
++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +new file mode 100644 +index 000000000000..5214b158d82f +--- /dev/null ++++ b/kernel/sched/MuQSS.h +@@ -0,0 +1,1010 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpupri.h" ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++struct rq; ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++struct perf_domain { ++ struct em_perf_domain *em_pd; ++ struct perf_domain *next; ++ struct rcu_head rcu; ++}; ++ ++/* Scheduling group status flags */ ++#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ ++#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. ++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* ++ * Indicate pullable load on at least one CPU, e.g: ++ * - More than one runnable task ++ * - Running task is misfit ++ */ ++ int overload; ++ ++ /* Indicate one or more cpus over-utilized (tipping point) */ ++ int overutilized; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). 
++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++ ++ /* ++ * NULL-terminated list of performance domains intersecting with the ++ * CPUs of the rd. Protected by RCU. ++ */ ++ struct perf_domain *pd; ++}; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ raw_spinlock_t *lock; ++ raw_spinlock_t *orig_lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ u64 irq_load_update; /* When we last updated IRQ load */ ++ unsigned long irq_load_avg; /* Rolling IRQ load average */ ++#endif ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++ skiplist_node *node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ ++ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ ++ ++ struct rq *smp_leader; /* First physical CPU per node */ ++#ifdef CONFIG_SCHED_SMT ++ struct rq 
*smt_leader; /* First logical CPU in SMT siblings */ ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ struct rq *mc_leader; /* First logical CPU in MC siblings */ ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ /* Ensure that all clocks are in the same cache line */ ++ u64 clock_task ____cacheline_aligned; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++ int rt_nr_running; /* Number real time tasks running */ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_SMP ++ struct llist_head wake_list; ++#endif ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(rq->lock, rf->flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, 
struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(rq->lock, rf->flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ } ++ return rq; ++} ++ ++static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) ++{ ++ rq_unlock(rq); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ return rq; ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. ++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. 
++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_WAKEUP 0x01 ++#define ENQUEUE_RESTORE 0x02 ++ ++#ifdef CONFIG_SMP ++#define ENQUEUE_MIGRATED 0x40 ++#else ++#define ENQUEUE_MIGRATED 0x00 ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock_task; ++} ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See destroy_sched_domains: call_rcu for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++#define for_each_lower_domain(sd) for (; sd; sd = sd->child) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. ++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. 
++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. ++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long max_capacity; /* Max per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[0]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. (Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. 
++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void sched_ttwu_pending(void); ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_rq_online (struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. 
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++static inline bool sched_stop_runnable(struct rq *rq) ++{ ++ return rq->stop && task_on_rq_queued(rq->stop); ++} ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++static inline bool uclamp_is_used(void) ++{ ++ return false; ++} ++ ++static __always_inline ++unsigned int uclamp_util_with(struct rq __maybe_unused *rq, unsigned int util, ++ struct task_struct __maybe_unused *p) ++{ ++ return util; ++} ++ ++static inline unsigned int uclamp_util(struct rq *rq, unsigned int util) ++{ ++ return util; ++} ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++/* ++ * This should only be called when current == rq->idle. Dodgy workaround for ++ * when softirqs are pending and we are in the idle loop. Setting current to ++ * resched will kick us out of the idle loop and the softirqs will be serviced ++ * on our next pass through schedule(). ++ */ ++static inline bool softirq_pending(int cpu) ++{ ++ if (likely(!local_softirq_pending())) ++ return false; ++ set_tsk_need_resched(current); ++ return true; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ struct rq_flags rf; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern bool sched_can_stop_tick(struct rq *rq); ++extern int __init sched_tick_offload_init(void); ++ ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out of ++ * nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (sched_can_stop_tick(rq)) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++static inline bool rt_rq_is_runnable(struct rq *rt_rq) ++{ ++ return rt_rq->rt_nr_running; ++} ++ ++/** ++ * enum schedutil_type - CPU utilization type ++ * @FREQUENCY_UTIL: Utilization used to select frequency ++ * @ENERGY_UTIL: Utilization used during energy calculation ++ * ++ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time ++ * need to be aggregated differently depending on the usage made of them. This ++ * enum is used within schedutil_freq_util() to differentiate the types of ++ * utilization expected by the callers, and adjust the aggregation accordingly. ++ */ ++enum schedutil_type { ++ FREQUENCY_UTIL, ++ ENERGY_UTIL, ++}; ++ ++#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL ++ ++unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long max, enum schedutil_type type, ++ struct task_struct *p); ++ ++static inline unsigned long cpu_bw_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_cfs(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline unsigned long cpu_util_rt(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->rt_nr_running); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->irq_load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ util *= (max - irq); ++ util /= max; ++ ++ return util; ++ ++} ++#else ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ return util; ++} ++#endif ++#endif ++ ++#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) ++#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) ++ ++DECLARE_STATIC_KEY_FALSE(sched_energy_present); ++ ++static inline bool sched_energy_enabled(void) ++{ ++ return static_branch_unlikely(&sched_energy_present); ++} ++ ++#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ ++ ++#define perf_domain_span(pd) NULL ++static inline bool sched_energy_enabled(void) { return false; } ++ ++#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#endif /* MUQSS_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 86800b4d5453..f3d8dca0538a 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -185,6 +185,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifdef CONFIG_SCHED_MUQSS ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) ++#else ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) ++#endif ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -213,7 +219,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + struct rq *rq = cpu_rq(cpu); + + if (!uclamp_is_used() && +- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { ++ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { + return max; + } + +@@ -658,7 +664,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + struct task_struct *thread; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), ++#ifdef CONFIG_SCHED_MUQSS ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, ++#endif + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, +diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h +index 7dc20a3232e7..e733a0a53b0a 100644 +--- a/kernel/sched/cpupri.h ++++ b/kernel/sched/cpupri.h +@@ -17,9 +17,11 @@ struct cpupri { + int *cpu_to_pri; + }; + ++#ifndef CONFIG_SCHED_MUQSS + #ifdef CONFIG_SMP + int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); + void cpupri_set(struct cpupri *cp, int cpu, int pri); + int cpupri_init(struct cpupri *cp); + void cpupri_cleanup(struct cpupri *cp); + #endif ++#endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 46ed4e1383e2..f077fcd22d2b 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) + return accounted; + } + +-#ifdef CONFIG_64BIT +-static inline u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- return t->se.sum_exec_runtime; +-} +-#else +-static u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- u64 ns; +- struct rq_flags rf; +- struct rq *rq; +- +- rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; +- task_rq_unlock(rq, t, &rf); +- +- return ns; +-} +-#endif +- + /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. 
+@@ -663,7 +643,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f65ef1e2f204..e0aa6c73a5fa 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -225,6 +225,8 @@ static void cpuidle_idle_call(void) + static void do_idle(void) + { + int cpu = smp_processor_id(); ++ bool pending = false; ++ + /* + * If the arch has a polling bit, we maintain an invariant: + * +@@ -235,7 +237,10 @@ static void do_idle(void) + */ + + __current_set_polling(); +- tick_nohz_idle_enter(); ++ if (unlikely(softirq_pending(cpu))) ++ pending = true; ++ else ++ tick_nohz_idle_enter(); + + while (!need_resched()) { + rmb(); +@@ -273,7 +278,8 @@ static void do_idle(void) + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); +- tick_nohz_idle_exit(); ++ if (!pending) ++ tick_nohz_idle_exit(); + __current_clr_polling(); + + /* +@@ -355,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_MUQSS + /* + * idle-task scheduling class. + */ +@@ -479,3 +486,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c8870c5bd7df..add1d74c2e91 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,19 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_MUQSS ++#include "MuQSS.h" ++ ++/* Begin compatibility wrappers for MuQSS/CFS differences */ ++#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->nr_running) ++ ++#else /* CONFIG_SCHED_MUQSS */ ++ ++#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) ++ ++ + #include + + #include +@@ -2496,3 +2509,30 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++ ++/* MuQSS compatibility functions */ ++static inline bool softirq_pending(int cpu) ++{ ++ return false; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return t->se.sum_exec_runtime; ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ u64 ns; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = t->se.sum_exec_runtime; ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 49b835f1305f..0253ea846c0d 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -3,6 +3,7 @@ + * Scheduler topology setup/handling methods + */ + #include "sched.h" ++#include "linux/sched/deadline.h" + + DEFINE_MUTEX(sched_domains_mutex); + +@@ -442,7 +443,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + struct root_domain *old_rd = NULL; + unsigned long flags; + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_lock_irqsave(rq->lock, flags); ++#else + raw_spin_lock_irqsave(&rq->lock, flags); ++#endif + + if (rq->rd) { + old_rd = rq->rd; +@@ -468,7 +473,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + 
set_rq_online(rq); + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_unlock_irqrestore(rq->lock, flags); ++#else + raw_spin_unlock_irqrestore(&rq->lock, flags); ++#endif + + if (old_rd) + call_rcu(&old_rd->rcu, free_rootdomain); +diff --git a/kernel/skip_list.c b/kernel/skip_list.c +new file mode 100644 +index 000000000000..bf5c6e97e139 +--- /dev/null ++++ b/kernel/skip_list.c +@@ -0,0 +1,148 @@ ++/* ++ Copyright (C) 2011,2016 Con Kolivas. ++ ++ Code based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++The routine randomLevel has been hard-coded to generate random ++levels using p=0.25. It can be easily changed. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++Levels start at zero and go up to MaxLevel (which is equal to ++MaxNumberOfLevels-1). ++ ++The routines defined in this file are: ++ ++init: defines slnode ++ ++new_skiplist: returns a new, empty list ++ ++randomLevel: Returns a random level based on a u64 random seed passed to it. ++In MuQSS, the "niffy" time is used for this purpose. ++ ++insert(l,key, value): inserts the binding (key, value) into l. This operation ++occurs in O(log n) time. ++ ++delnode(slnode, l, node): deletes any binding of key from the l based on the ++actual node value. This operation occurs in O(k) time where k is the ++number of levels of the node in question (max 8). The original delete ++function occurred in O(log n) time and involved a search. ++ ++MuQSS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. 
++ ++*/ ++ ++#include ++#include ++ ++#define MaxNumberOfLevels 8 ++#define MaxLevel (MaxNumberOfLevels - 1) ++ ++void skiplist_init(skiplist_node *slnode) ++{ ++ int i; ++ ++ slnode->key = 0xFFFFFFFFFFFFFFFF; ++ slnode->level = 0; ++ slnode->value = NULL; ++ for (i = 0; i < MaxNumberOfLevels; i++) ++ slnode->next[i] = slnode->prev[i] = slnode; ++} ++ ++skiplist *new_skiplist(skiplist_node *slnode) ++{ ++ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); ++ ++ BUG_ON(!l); ++ l->header = slnode; ++ return l; ++} ++ ++void free_skiplist(skiplist *l) ++{ ++ skiplist_node *p, *q; ++ ++ p = l->header; ++ do { ++ q = p->next[0]; ++ p->next[0]->prev[0] = q->prev[0]; ++ skiplist_node_init(p); ++ p = q; ++ } while (p != l->header); ++ kfree(l); ++} ++ ++void skiplist_node_init(skiplist_node *node) ++{ ++ memset(node, 0, sizeof(skiplist_node)); ++} ++ ++static inline unsigned int randomLevel(const long unsigned int randseed) ++{ ++ return find_first_bit(&randseed, MaxLevel) / 2; ++} ++ ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) ++{ ++ skiplist_node *update[MaxNumberOfLevels]; ++ skiplist_node *p, *q; ++ int k = l->level; ++ ++ p = l->header; ++ do { ++ while (q = p->next[k], q->key <= key) ++ p = q; ++ update[k] = p; ++ } while (--k >= 0); ++ ++ ++l->entries; ++ k = randomLevel(randseed); ++ if (k > l->level) { ++ k = ++l->level; ++ update[k] = l->header; ++ } ++ ++ node->level = k; ++ node->key = key; ++ node->value = value; ++ do { ++ p = update[k]; ++ node->next[k] = p->next[k]; ++ p->next[k] = node; ++ node->prev[k] = p; ++ node->next[k]->prev[k] = node; ++ } while (--k >= 0); ++} ++ ++void skiplist_delete(skiplist *l, skiplist_node *node) ++{ ++ int k, m = node->level; ++ ++ for (k = 0; k <= m; k++) { ++ node->prev[k]->next[k] = node->next[k]; ++ node->next[k]->prev[k] = node->prev[k]; ++ } ++ skiplist_node_init(node); ++ if (m == l->level) { ++ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) ++ m--; ++ l->level = m; ++ } ++ l->entries--; ++} +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b6f2f35d0bcf..349f5a249593 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,9 +130,19 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; +-#ifdef CONFIG_PRINTK ++static int __read_mostly one_hundred = 100; ++static int __read_mostly one_thousand = 1000; ++#ifdef CONFIG_SCHED_MUQSS ++static int zero = 0; ++static int one = 1; ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_yield_type; ++#endif ++extern int hrtimer_granularity_us; ++extern int hrtimeout_min_us; ++#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) + static int ten_thousand = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +@@ -300,7 +310,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +327,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_MUQSS + { + .procname = "sched_child_runs_first", + .data = 
&sysctl_sched_child_runs_first, +@@ -498,6 +509,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_MUQSS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1082,62 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_MUQSS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif ++ { ++ .procname = "hrtimer_granularity_us", ++ .data = &hrtimer_granularity_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, ++ { ++ .procname = "hrtimeout_min_us", ++ .data = &hrtimeout_min_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig +index fcc42353f125..46bb16d3c159 100644 +--- a/kernel/time/Kconfig ++++ b/kernel/time/Kconfig +@@ -66,6 +66,9 @@ config NO_HZ_COMMON + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + ++config NO_HZ_FULL ++ bool ++ + choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ +@@ -87,8 +90,9 @@ config NO_HZ_IDLE + + Most of the time you want to say Y here. + +-config NO_HZ_FULL ++config NO_HZ_FULL_NODEF + bool "Full dynticks system (tickless)" ++ select NO_HZ_FULL + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # We need at least one periodic CPU for timekeeping +@@ -114,6 +118,8 @@ config NO_HZ_FULL + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + ++ Not recommended for desktops,laptops, or mobile devices. ++ + Say N. 
+ + endchoice +diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c +index f5490222e134..544c58c29267 100644 +--- a/kernel/time/clockevents.c ++++ b/kernel/time/clockevents.c +@@ -190,8 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev) + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +-/* Limit min_delta to a jiffie */ +-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++int __read_mostly hrtimer_granularity_us = 100; ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 65605530ee34..75e67a12a97b 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2206,3 +2206,113 @@ int __sched schedule_hrtimeout(ktime_t *expires, + return schedule_hrtimeout_range(expires, 0, mode); + } + EXPORT_SYMBOL_GPL(schedule_hrtimeout); ++ ++/* ++ * As per schedule_hrtimeout but taskes a millisecond value and returns how ++ * many milliseconds are left. ++ */ ++long __sched schedule_msec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ int delta, jiffs; ++ ktime_t expires; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ jiffs = msecs_to_jiffies(timeout); ++ /* ++ * If regular timer resolution is adequate or hrtimer resolution is not ++ * (yet) better than Hz, as would occur during startup, use regular ++ * timers. ++ */ ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) ++ return schedule_timeout(jiffs); ++ ++ delta = (timeout % 1000) * NSEC_PER_MSEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_ms(expires); ++ return timeout < 0 ? 0 : timeout; ++} ++ ++EXPORT_SYMBOL(schedule_msec_hrtimeout); ++ ++#define USECS_PER_SEC 1000000 ++extern int hrtimer_granularity_us; ++ ++static inline long schedule_usec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ ktime_t expires; ++ int delta; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ return schedule_timeout(usecs_to_jiffies(timeout)); ++ ++ if (timeout < hrtimer_granularity_us) ++ timeout = hrtimer_granularity_us; ++ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_us(expires); ++ return timeout < 0 ? 
0 : timeout; ++} ++ ++int __read_mostly hrtimeout_min_us = 500; ++ ++long __sched schedule_min_hrtimeout(void) ++{ ++ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); ++} ++ ++EXPORT_SYMBOL(schedule_min_hrtimeout); ++ ++long __sched schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); ++ ++long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..0db83bdf7f39 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -845,7 +845,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 4820823515e9..13034cc7c9a4 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1567,7 +1568,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1585,6 +1586,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1654,7 +1658,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +@@ -1889,6 +1893,18 @@ signed long __sched schedule_timeout(signed long timeout) + + expire = timeout + jiffies; + ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ /* ++ * Special case 1 as being a request for the minimum timeout ++ * and use highres timers to timeout after 1ms to workaround ++ * the granularity of low Hz tick timers. 
++ */ ++ if (!schedule_min_hrtimeout()) ++ return 0; ++ goto out_timeout; ++ } ++#endif + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, 0); +@@ -1897,10 +1913,10 @@ signed long __sched schedule_timeout(signed long timeout) + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); +- ++out_timeout: + timeout = expire - jiffies; + +- out: ++out: + return timeout < 0 ? 0 : timeout; + } + EXPORT_SYMBOL(schedule_timeout); +@@ -2042,7 +2058,19 @@ void __init init_timers(void) + */ + void msleep(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ /* ++ * Use high resolution timers where the resolution of tick based ++ * timers is inadequate. ++ */ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs) ++ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); ++ return; ++ } ++ timeout = jiffs + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -2056,7 +2084,15 @@ EXPORT_SYMBOL(msleep); + */ + unsigned long msleep_interruptible(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs && !signal_pending(current)) ++ msecs = schedule_msec_hrtimeout_interruptible(msecs); ++ return msecs; ++ } ++ timeout = jiffs + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..6edb01f2fd81 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index ee4eecc7e1c2..22c1b0469468 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -164,7 +164,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 33; + /* + * The total number of pages which are beyond the high watermark within all + * zones. 
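For context, rough arithmetic on what the msleep()/msleep_interruptible() rework above buys (the HZ value is an assumption for illustration, not taken from the patch): at HZ=100 a tick is 10 ms, so msleep(2) computes jiffs = msecs_to_jiffies(2) = 1; since 1 < 5 and hrtimer resolution beats the tick, the new path calls schedule_msec_hrtimeout_uninterruptible(2) and wakes after roughly 2 ms, where the old tick-based path (timeout = jiffs + 1 = 2 jiffies) could sleep anywhere from 10 to 20 ms. The driver hunks that follow apply the same idea by hand; the conversion pattern, sketched with an illustrative 10 ms delay:

    /* before: sleep granularity limited by the tick */
    schedule_timeout_uninterruptible(msecs_to_jiffies(10));

    /* after: ~10 ms high-resolution sleep when hrtimers are finer than the tick */
    schedule_msec_hrtimeout_uninterruptible(10);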
+diff --git a/net/core/pktgen.c b/net/core/pktgen.c +index 48b1e429857c..908c866bc9fc 100644 +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); +- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); ++ schedule_msec_hrtimeout_interruptible((msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { +diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c +index 19fa73df0846..46caed9b924d 100644 +--- a/sound/pci/maestro3.c ++++ b/sound/pci/maestro3.c +@@ -2001,7 +2001,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(0, io + GPIO_DATA); + outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); ++ schedule_msec_hrtimeout_uninterruptible((delay1)); + + outw(GPO_PRIMARY_AC97, io + GPIO_DATA); + udelay(5); +@@ -2009,7 +2009,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); + outw(~0, io + GPIO_MASK); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); ++ schedule_msec_hrtimeout_uninterruptible((delay2)); + + if (! snd_m3_try_read_vendor(chip)) + break; +diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c +index f70b9f7e68bb..77b65398ca07 100644 +--- a/sound/soc/codecs/rt5631.c ++++ b/sound/soc/codecs/rt5631.c +@@ -415,7 +415,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + /* config one-bit depop parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); + snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, +@@ -525,7 +525,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + /* config depop sequence parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); +diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c +index fe99584c917f..f1344d532a13 100644 +--- a/sound/soc/codecs/wm8350.c ++++ b/sound/soc/codecs/wm8350.c +@@ -233,10 +233,10 @@ static void wm8350_pga_work(struct work_struct *work) + out2->ramp == WM8350_RAMP_UP) { + /* delay is longer over 0dB as increases are larger */ + if (i >= WM8350_OUTn_0dB) +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (2)); + else +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (1)); + } else + udelay(50); /* doesn't matter if we delay longer */ +@@ -1120,7 +1120,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + (platform->dis_out4 << 6)); + + /* wait for discharge */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + cap_discharge_msecs)); + +@@ -1136,7 +1136,7 @@ static int 
wm8350_set_bias_level(struct snd_soc_component *component, + WM8350_VBUFEN); + + /* wait for vmid */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_charge_msecs)); + +@@ -1187,7 +1187,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_discharge_msecs)); + +@@ -1205,7 +1205,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + pm1 | WM8350_OUTPUT_DRAIN_EN); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform->drain_msecs)); + + pm1 &= ~WM8350_BIASEN; +diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c +index 271235a69c01..3ec90e1b1eb4 100644 +--- a/sound/soc/codecs/wm8900.c ++++ b/sound/soc/codecs/wm8900.c +@@ -1109,7 +1109,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, + /* Need to let things settle before stopping the clock + * to ensure that restart works, see "Stopping the + * master clock" in the datasheet. */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_write(component, WM8900_REG_POWER2, + WM8900_REG_POWER2_SYSCLK_ENA); + break; +diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c +index 6497c1ea6228..08fefeca9d82 100644 +--- a/sound/soc/codecs/wm9713.c ++++ b/sound/soc/codecs/wm9713.c +@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, + + /* Gracefully shut down the voice interface. */ + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); + snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); + +@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, + wm9713->pll_in = freq_in; + + /* wait 10ms AC97 link frames for the link to stabilise */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + return 0; + } + +diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c +index b6378f025836..5f5e58655d32 100644 +--- a/sound/soc/soc-dapm.c ++++ b/sound/soc/soc-dapm.c +@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) + static void pop_wait(u32 pop_time) + { + if (pop_time) +- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); ++ schedule_msec_hrtimeout_uninterruptible((pop_time)); + } + + __printf(3, 4) +diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c +index f70211e6b174..5ae4421225e6 100644 +--- a/sound/usb/line6/pcm.c ++++ b/sound/usb/line6/pcm.c +@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, + if (!alive) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } while (--timeout > 0); + if (alive) + dev_err(line6pcm->line6->ifcdev, diff --git a/linux-tkg/linux-tkg-patches/5.4/0004-glitched-muqss.patch b/linux-tkg/linux-tkg-patches/5.4/0004-glitched-muqss.patch new file mode 100644 index 0000000..2c4837e --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0004-glitched-muqss.patch @@ -0,0 +1,78 @@ +From 
f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - MuQSS + +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +index 84a1d08d68551..57c3036a68952 100644 +--- a/kernel/sched/MuQSS.c ++++ b/kernel/sched/MuQSS.c +@@ -163,7 +167,11 @@ int sched_interactive __read_mostly = 1; + * are allowed to run five seconds as real time tasks. This is the total over + * all online cpus. + */ ++#ifdef CONFIG_ZENIFY ++int sched_iso_cpu __read_mostly = 25; ++#else + int sched_iso_cpu __read_mostly = 70; ++#endif + + /* + * sched_yield_type - Choose what sort of yield sched_yield will perform. + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,7 @@ + choice + prompt "Timer frequency" + default HZ_100 if SCHED_MUQSS +- default HZ_250_NODEF if !SCHED_MUQSS ++ default HZ_500_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -50,6 +50,20 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500_NODEF ++ bool "500 HZ" ++ help ++ 500 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ ++ config HZ_750_NODEF ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000_NODEF + bool "1000 HZ" + help +@@ -63,6 +70,8 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250_NODEF + default 300 if HZ_300_NODEF ++ default 500 if HZ_500_NODEF ++ default 750 if HZ_750_NODEF + default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + +diff --git a/Makefile b/Makefile +index d4d36c61940b..4a9dfe471f1f 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,7 +15,6 @@ NAME = Kleptomaniac Octopus + + CKVERSION = -ck1 + CKNAME = MuQSS Powered +-EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) + + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. 
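In rough numbers, what the timer-frequency choices added above mean for the MuQSS tick (illustrative arithmetic, not part of the patch):

    500 HZ   ->  NSEC_PER_SEC / 500   =  2,000,000 ns   (2.00 ms tick)
    750 HZ   ->  NSEC_PER_SEC / 750   ~  1,333,333 ns   (about 1.33 ms tick)
    1000 HZ  ->  NSEC_PER_SEC / 1000  =  1,000,000 ns   (1.00 ms tick)

Likewise, with CONFIG_ZENIFY set, the sched_iso_cpu default of 25 caps SCHED_ISO tasks at 25% of total CPU time summed over all online CPUs, down from the stock 70%; the value should remain tunable at runtime through MuQSS's usual sysctl (assumed to be /proc/sys/kernel/iso_cpu).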
diff --git a/linux-tkg/linux-tkg-patches/5.4/0004-glitched-ondemand-muqss.patch b/linux-tkg/linux-tkg-patches/5.4/0004-glitched-ondemand-muqss.patch new file mode 100644 index 0000000..02933e4 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0004-glitched-ondemand-muqss.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (45) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (45) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux-tkg/linux-tkg-patches/5.4/0005-glitched-ondemand-pds.patch b/linux-tkg/linux-tkg-patches/5.4/0005-glitched-ondemand-pds.patch new file mode 100644 index 0000000..c1929e8 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0005-glitched-ondemand-pds.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (63) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux-tkg/linux-tkg-patches/5.4/0005-glitched-pds.patch b/linux-tkg/linux-tkg-patches/5.4/0005-glitched-pds.patch new file mode 100644 index 0000000..21f2d69 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0005-glitched-pds.patch @@ -0,0 +1,213 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - PDS + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. + +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +index c2d831b242b6d18a47e0d87a9f5433a7748b52ff..5bc8d7a8f920c21feab69b2706a3328dc8d39f9a 100644 +--- a/kernel/sched/pds.c ++++ b/kernel/sched/pds.c +@@ -409,12 +409,11 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) + * [L] ->on_rq + * RELEASE (rq->lock) + * +- * If we observe the old CPU in task_rq_lock(), the acquire of ++ * If we observe the old CPU in task_rq_lock, the acquire of + * the old rq->lock will fully serialize against the stores. + * +- * If we observe the new CPU in task_rq_lock(), the address +- * dependency headed by '[L] rq = task_rq()' and the acquire +- * will pair with the WMB to ensure we then also see migrating. ++ * If we observe the new CPU in task_rq_lock, the acquire will ++ * pair with the WMB to ensure we must then also see migrating. 
+ */ + if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { + return rq; +@@ -952,9 +953,9 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) + smp_wmb(); + + #ifdef CONFIG_THREAD_INFO_IN_TASK +- WRITE_ONCE(p->cpu, cpu); ++ p->cpu = cpu; + #else +- WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++ task_thread_info(p)->cpu = cpu; + #endif + #endif + } +@@ -1035,7 +1036,7 @@ static void detach_task(struct rq *rq, struct task_struct *p, int target_cpu) + { + lockdep_assert_held(&rq->lock); + +- WRITE_ONCE(p->on_rq ,TASK_ON_RQ_MIGRATING); ++ p->on_rq = TASK_ON_RQ_MIGRATING; + if (task_contributes_to_load(p)) + rq->nr_uninterruptible++; + dequeue_task(p, rq, 0); +diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h +index 20dcf19ea057627d91be07b4ec20f0827c30084c..24fa90ca63d144cc4f45d82d88407ea70d2d2edf 100644 +--- a/kernel/sched/pds_sched.h ++++ b/kernel/sched/pds_sched.h +@@ -56,7 +56,7 @@ static inline int task_on_rq_queued(struct task_struct *p) + + static inline int task_on_rq_migrating(struct task_struct *p) + { +- return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++ return p->on_rq == TASK_ON_RQ_MIGRATING; + } + + enum { + +diff --git a/init/Kconfig b/init/Kconfig +index 11fd9b502d06..e9bc34d3019b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -948,7 +948,6 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" +- depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index b23231bae996..cab4e5c5b38e 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -24,13 +24,13 @@ obj-y += fair.o rt.o deadline.o + obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o +-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + endif + obj-y += loadavg.o clock.o cputime.o + obj-y += idle.o + obj-y += wait.o wait_bit.o swait.o completion.o + obj-$(CONFIG_SMP) += cpupri.o pelt.o + obj-$(CONFIG_SCHEDSTATS) += stats.o ++obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o + +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +index 9281ad164..f09a609cf 100644 +--- a/kernel/sched/pds.c ++++ b/kernel/sched/pds.c +@@ -81,6 +81,18 @@ enum { + NR_CPU_AFFINITY_CHK_LEVEL + }; + ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) 
\ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ + static inline void print_scheduler_version(void) + { + printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen.\n"); +@@ -6353,7 +6365,10 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) + #ifdef CONFIG_SCHED_DEBUG + void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) +-{} ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} + + void proc_sched_set_task(struct task_struct *p) + {} diff --git a/linux-tkg/linux-tkg-patches/5.4/0005-v5.4_undead-pds099o.patch b/linux-tkg/linux-tkg-patches/5.4/0005-v5.4_undead-pds099o.patch new file mode 100644 index 0000000..e6db1ad --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0005-v5.4_undead-pds099o.patch @@ -0,0 +1,8387 @@ +From 89067d28ca90681fc6cf108de79b9aedb93dfa9d Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 25 Nov 2019 21:46:23 +0100 +Subject: PDS 099o, 5.4 rebase + + +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..360a229b0abe 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -82,6 +82,7 @@ show up in /proc/sys/kernel: + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - sched_energy_aware +@@ -105,6 +106,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +diff --git a/Documentation/scheduler/sched-PDS-mq.txt b/Documentation/scheduler/sched-PDS-mq.txt +new file mode 100644 +index 000000000000..709e86f6487e +--- /dev/null ++++ b/Documentation/scheduler/sched-PDS-mq.txt +@@ -0,0 +1,56 @@ ++ Priority and Deadline based Skiplist multiple queue Scheduler ++ ------------------------------------------------------------- ++ ++CONTENT ++======== ++ ++ 0. Development ++ 1. Overview ++ 1.1 Design goal ++ 1.2 Design summary ++ 2. Design Detail ++ 2.1 Skip list implementation ++ 2.2 Task preempt ++ 2.3 Task policy, priority and deadline ++ 2.4 Task selection ++ 2.5 Run queue balance ++ 2.6 Task migration ++ ++ ++0. Development ++============== ++ ++Priority and Deadline based Skiplist multiple queue scheduler, referred to as ++PDS from here on, is developed upon the enhancement patchset VRQ(Variable Run ++Queue) for BFS(Brain Fuck Scheduler by Con Kolivas). PDS inherits the existing ++design from VRQ and inspired by the introduction of skiplist data structure ++to the scheduler by Con Kolivas. However, PDS is different from MuQSS(Multiple ++Queue Skiplist Scheduler, the successor after BFS) in many ways. ++ ++1. Overview ++=========== ++ ++1.1 Design goal ++--------------- ++ ++PDS is designed to make the cpu process scheduler code to be simple, but while ++efficiency and scalable. Be Simple, the scheduler code will be easy to be read ++and the behavious of scheduler will be easy to predict. Be efficiency, the ++scheduler shall be well balance the thoughput performance and task interactivity ++at the same time for different properties the tasks behave. Be scalable, the ++performance of the scheduler should be in good shape with the glowing of ++workload or with the growing of the cpu numbers. ++ ++1.2 Design summary ++------------------ ++ ++PDS is described as a multiple run queues cpu scheduler. Each cpu has its own ++run queue. 
A heavry customized skiplist is used as the backend data structure ++of the cpu run queue. Tasks in run queue is sorted by priority then virtual ++deadline(simplfy to just deadline from here on). In PDS, balance action among ++run queues are kept as less as possible to reduce the migration cost. Cpumask ++data structure is widely used in cpu affinity checking and cpu preemption/ ++selection to make PDS scalable with increasing cpu number. ++ ++ ++To be continued... +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 8ef85139553f..9d44d8d78259 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1034,6 +1034,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_PDS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index b66e81c06a57..a294f8f5fd75 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index dced033875bf..d2cd03766b09 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. 
+ */ +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..1a7987c40c80 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_PDS ++#define INIT_TASK_COMM "PDS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif /* !CONFIG_SCHED_PDS */ + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index 1b6d31da7cbc..dea181bdb1dd 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -171,7 +171,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. + */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 67a1d86981a9..8268cad4b0a2 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -644,9 +645,13 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + struct llist_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_PDS) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -655,6 +660,7 @@ struct task_struct { + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; + ++#ifndef CONFIG_SCHED_PDS + /* + * recent_used_cpu is initially set as the last CPU used by a task + * that wakes affine another task. Waker/wakee relationships can +@@ -663,6 +669,7 @@ struct task_struct { + * used CPU that may be idle. 
+ */ + int recent_used_cpu; ++#endif /* CONFIG_SCHED_PDS */ + int wake_cpu; + #endif + int on_rq; +@@ -672,13 +679,27 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_PDS ++ int time_slice; ++ u64 deadline; ++ /* skip list level */ ++ int sl_level; ++ /* skip list node */ ++ struct skiplist_node sl_node; ++ /* 8bits prio and 56bits deadline for quick processing */ ++ u64 priodl; ++ u64 last_ran; ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* CONFIG_SCHED_PDS */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1283,6 +1304,29 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_PDS ++void cpu_scaling(int cpu); ++void cpu_nonscaling(int cpu); ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++ ++#define task_running_idle(p) ((p)->prio == IDLE_PRIO) ++#else /* CFS */ ++extern int runqueue_is_locked(int cpu); ++static inline void cpu_scaling(int cpu) ++{ ++} ++ ++static inline void cpu_nonscaling(int cpu) ++{ ++} ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++#define iso_task(p) (false) ++#endif /* CONFIG_SCHED_PDS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..a5e5fc2c9170 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,22 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_PDS ++ ++#define __tsk_deadline(p) ((p)->deadline) ++ ++static inline int dl_prio(int prio) ++{ ++ return 1; ++} ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return 1; ++} ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_PDS */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..fba04bb91492 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,7 +20,18 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_PDS ++#define ISO_PRIO (MAX_USER_RT_PRIO) ++ ++#define MAX_RT_PRIO ((MAX_USER_RT_PRIO) + 1) ++ ++#define NORMAL_PRIO (MAX_RT_PRIO) ++#define IDLE_PRIO ((MAX_RT_PRIO) + 1) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* !CONFIG_SCHED_PDS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#endif /* CONFIG_SCHED_PDS */ + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..a96012e6f15e 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_PDS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return 
false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 4b1c3b664f51..f186b8119ad6 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..713fedd8034f +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,177 @@ ++/* ++ Copyright (C) 2016 Alfred Chen. ++ ++ Code based on Con Kolivas's skip list implementation for BFS, and ++ which is based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++ ++This file only provides a infrastructure of skip list. ++ ++skiplist_node is embedded into container data structure, to get rid the ++dependency of kmalloc/kfree operation in scheduler code. ++ ++A customized search function should be defined using DEFINE_SKIPLIST_INSERT ++macro and be used for skip list insert operation. ++ ++Random Level is also not defined in this file, instead, it should be customized ++implemented and set to node->level then pass to the customized skiplist_insert ++function. ++ ++Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) ++ ++NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, ++considering that there will be 256 entries to enable the top level when using ++random level p=0.5, and that number is more than enough for a run queue usage ++in a scheduler usage. And it also help to reduce the memory usage of the ++embedded skip list node in task_struct to about 50%. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++BFS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. 
++*/ ++#ifndef _LINUX_SKIP_LIST_H ++#define _LINUX_SKIP_LIST_H ++ ++#include ++ ++#define NUM_SKIPLIST_LEVEL (8) ++ ++struct skiplist_node { ++ int level; /* Levels in this node */ ++ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; ++ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; ++}; ++ ++#define SKIPLIST_NODE_INIT(name) { 0,\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ } ++ ++static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ /* only level 0 ->next matters in skiplist_empty()*/ ++ WRITE_ONCE(node->next[0], node); ++} ++ ++/** ++ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header ++ * @node: the skip list node to be inited. ++ */ ++static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ int i; ++ ++ node->level = 0; ++ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { ++ WRITE_ONCE(node->next[i], node); ++ node->prev[i] = node; ++ } ++} ++ ++/** ++ * skiplist_empty - test whether a skip list is empty ++ * @head: the skip list to test. ++ */ ++static inline int skiplist_empty(const struct skiplist_node *head) ++{ ++ return READ_ONCE(head->next[0]) == head; ++} ++ ++/** ++ * skiplist_entry - get the struct for this entry ++ * @ptr: the &struct skiplist_node pointer. ++ * @type: the type of the struct this is embedded in. ++ * @member: the name of the skiplist_node within the struct. ++ */ ++#define skiplist_entry(ptr, type, member) \ ++ container_of(ptr, type, member) ++ ++/** ++ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert ++ * function, which takes two parameters, first one is the header node of the ++ * skip list, second one is the skip list node to be inserted ++ * @func_name: the customized skip list insert function name ++ * @search_func: the search function to be used, which takes two parameters, ++ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list ++ * node to be inserted, the function should return true if search should be ++ * continued, otherwise return false. ++ * Returns 1 if @node is inserted as the first item of skip list at level zero, ++ * otherwise 0 ++ */ ++#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ ++static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ ++{\ ++ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ ++ struct skiplist_node *p, *q;\ ++ int k = head->level;\ ++\ ++ p = head;\ ++ do {\ ++ while (q = p->next[k], q != head && search_func(q, node))\ ++ p = q;\ ++ update[k] = p;\ ++ } while (--k >= 0);\ ++\ ++ k = node->level;\ ++ if (unlikely(k > head->level)) {\ ++ node->level = k = ++head->level;\ ++ update[k] = head;\ ++ }\ ++\ ++ do {\ ++ p = update[k];\ ++ q = p->next[k];\ ++ node->next[k] = q;\ ++ p->next[k] = node;\ ++ node->prev[k] = p;\ ++ q->prev[k] = node;\ ++ } while (--k >= 0);\ ++\ ++ return (p == head);\ ++} ++ ++/** ++ * skiplist_del_init -- delete skip list node from a skip list and reset it's ++ * init state ++ * @head: the header node of the skip list to be deleted from. ++ * @node: the skip list node to be deleted, the caller need to ensure @node is ++ * in skip list which @head represent. 
++ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 ++ */ ++static inline int ++skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) ++{ ++ int l, m = node->level; ++ ++ for (l = 0; l <= m; l++) { ++ node->prev[l]->next[l] = node->next[l]; ++ node->next[l]->prev[l] = node->prev[l]; ++ } ++ if (m == head->level && m > 0) { ++ while (head->next[m] == head && m > 0) ++ m--; ++ head->level = m; ++ } ++ INIT_SKIPLIST_NODE(node); ++ ++ return (node->prev[0] == head); ++} ++#endif /* _LINUX_SKIP_LIST_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 25b4fa00bad1..fc0aabdce15f 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -84,7 +84,10 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented in BFS/MuQSSPDS only */ ++#ifdef CONFIG_SCHED_PDS ++#define SCHED_ISO 4 ++#endif + #define SCHED_IDLE 5 + #define SCHED_DEADLINE 6 + +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..ee3b9957cf3b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -73,6 +73,21 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_PDS ++ bool "PDS-mq cpu scheduler" ++ help ++ The Priority and Deadline based Skip list multiple queue CPU ++ Scheduler for excellent interactivity and responsiveness on the ++ desktop and solid scalability on normal hardware and commodity ++ servers. ++ ++ Currently incompatible with the Group CPU scheduler, and RCU TORTURE ++ TEST so these options are disabled. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -802,6 +817,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_PDS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -903,7 +919,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_PDS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1032,6 +1048,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
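To make the skip-list helpers introduced in include/linux/skip_list.h above easier to follow, here is a minimal usage sketch. Every name in it (struct demo_item, demo_item_search, demo_item_insert, demo_head, demo_use) is hypothetical and only for illustration; PDS itself embeds the node in task_struct (sl_node) and keys on the combined priority/deadline word (priodl).

    #include <linux/skip_list.h>

    /* Hypothetical container sorted by a u64 key. */
    struct demo_item {
            u64                     key;
            struct skiplist_node    sl_node;
    };

    /* Keep walking while the iterator's key is <= the key being inserted,
     * so equal keys land after existing entries (FIFO among equals). */
    static inline bool demo_item_search(struct skiplist_node *it,
                                        struct skiplist_node *node)
    {
            return skiplist_entry(it, struct demo_item, sl_node)->key <=
                   skiplist_entry(node, struct demo_item, sl_node)->key;
    }

    /* Expands to: static inline int demo_item_insert(head, node) */
    DEFINE_SKIPLIST_INSERT_FUNC(demo_item_insert, demo_item_search)

    /* The list header is fully initialised once via the static initialiser. */
    static struct skiplist_node demo_head = SKIPLIST_NODE_INIT(demo_head);

    static void demo_use(struct demo_item *item)
    {
            /* A real user assigns a random level in [0, NUM_SKIPLIST_LEVEL - 1];
             * level 0 is always valid and keeps this sketch self-contained. */
            item->sl_node.level = 0;

            /* Returns 1 when the item becomes first at level 0, i.e. the entry
             * an O(1) lookup of demo_head.next[0] would now find. */
            if (demo_item_insert(&demo_head, &item->sl_node))
                    ; /* new front of the list */

            skiplist_del_init(&demo_head, &item->sl_node);
    }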
+@@ -1150,6 +1167,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_PDS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..89787e2feb60 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -58,6 +58,126 @@ struct task_struct init_task + __init_task_data + #endif + = { ++#ifdef CONFIG_SCHED_PDS ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ .thread_info = INIT_THREAD_INFO(init_task), ++ .stack_refcount = ATOMIC_INIT(1), ++#endif ++ .state = 0, ++ .stack = init_stack, ++ .usage = ATOMIC_INIT(2), ++ .flags = PF_KTHREAD, ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, /* PDS only */ ++ .policy = SCHED_NORMAL, ++ .cpus_ptr = &init_task.cpus_mask, ++ .cpus_mask = CPU_MASK_ALL, ++ .nr_cpus_allowed= NR_CPUS, ++ .mm = NULL, ++ .active_mm = &init_mm, ++ .restart_block = { ++ .fn = do_no_restart_syscall, ++ }, ++ .sl_level = 0, /* PDS only */ ++ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), /* PDS only */ ++ .time_slice = HZ, /* PDS only */ ++ .tasks = LIST_HEAD_INIT(init_task.tasks), ++#ifdef CONFIG_SMP ++ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ .sched_task_group = &root_task_group, ++#endif ++ .ptraced = LIST_HEAD_INIT(init_task.ptraced), ++ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), ++ .real_parent = &init_task, ++ .parent = &init_task, ++ .children = LIST_HEAD_INIT(init_task.children), ++ .sibling = LIST_HEAD_INIT(init_task.sibling), ++ .group_leader = &init_task, ++ RCU_POINTER_INITIALIZER(real_cred, &init_cred), ++ RCU_POINTER_INITIALIZER(cred, &init_cred), ++ .comm = INIT_TASK_COMM, ++ .thread = INIT_THREAD, ++ .fs = &init_fs, ++ .files = &init_files, ++ .signal = &init_signals, ++ .sighand = &init_sighand, ++ .nsproxy = &init_nsproxy, ++ .pending = { ++ .list = LIST_HEAD_INIT(init_task.pending.list), ++ .signal = {{0}} ++ }, ++ .blocked = {{0}}, ++ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), ++ .journal_info = NULL, ++ INIT_CPU_TIMERS(init_task) ++ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), ++ .timer_slack_ns = 50000, /* 50 usec default slack */ ++ .thread_pid = &init_struct_pid, ++ .thread_group = LIST_HEAD_INIT(init_task.thread_group), ++ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), ++#ifdef CONFIG_AUDITSYSCALL ++ .loginuid = INVALID_UID, ++ .sessionid = AUDIT_SID_UNSET, ++#endif ++#ifdef CONFIG_PERF_EVENTS ++ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), ++ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), ++#endif ++#ifdef CONFIG_PREEMPT_RCU ++ .rcu_read_lock_nesting = 0, ++ .rcu_read_unlock_special.s = 0, ++ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), ++ .rcu_blocked_node = NULL, ++#endif ++#ifdef CONFIG_TASKS_RCU ++ .rcu_tasks_holdout = false, ++ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), ++ .rcu_tasks_idle_cpu = -1, ++#endif ++#ifdef CONFIG_CPUSETS ++ .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), ++#endif ++#ifdef CONFIG_RT_MUTEXES ++ .pi_waiters = RB_ROOT_CACHED, ++ .pi_top_task = NULL, ++#endif ++ INIT_PREV_CPUTIME(init_task) ++#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN ++ .vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount), ++ .vtime.starttime = 0, ++ .vtime.state = VTIME_SYS, ++#endif ++#ifdef CONFIG_NUMA_BALANCING ++ .numa_preferred_nid = -1, ++ 
.numa_group = NULL, ++ .numa_faults = NULL, ++#endif ++#ifdef CONFIG_KASAN ++ .kasan_depth = 1, ++#endif ++#ifdef CONFIG_TRACE_IRQFLAGS ++ .softirqs_enabled = 1, ++#endif ++#ifdef CONFIG_LOCKDEP ++ .lockdep_recursion = 0, ++#endif ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ .ret_stack = NULL, ++#endif ++#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) ++ .trace_recursion = 0, ++#endif ++#ifdef CONFIG_LIVEPATCH ++ .patch_state = KLP_UNDEFINED, ++#endif ++#ifdef CONFIG_SECURITY ++ .security = NULL, ++#endif ++#else /* CONFIG_SCHED_PDS */ + #ifdef CONFIG_THREAD_INFO_IN_TASK + .thread_info = INIT_THREAD_INFO(init_task), + .stack_refcount = REFCOUNT_INIT(1), +@@ -181,6 +301,7 @@ struct task_struct init_task + #ifdef CONFIG_SECURITY + .security = NULL, + #endif ++#endif /* CONFIG_SCHED_PDS */ + }; + EXPORT_SYMBOL(init_task); + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index c87ee6412b36..4045c8532027 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -632,7 +632,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? +@@ -1007,7 +1007,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_PDS */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index a46a50d67002..58043176b285 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index cdf318d86dd6..baa525865d5c 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_PDS ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 2874bf556162..fad8a279fdfa 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c 
+@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. + */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..8ebe4e33fb5f 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_PDS ++obj-y += pds.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 86800b4d5453..07f278dc3137 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -185,6 +185,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -302,6 +303,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_PDS */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
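A note on the cpufreq_schedutil change just above: because the PDS build has no per-class utilization breakdown, sugov_get_util() simply reports the CPU's full capacity. With util == max, the unmodified get_next_freq() applies its usual ~25% headroom to a 100% utilization, roughly (assuming mainline map_util_freq() behaviour):

    next_f = (f + f/4) * util / max = 1.25 * f, then clamped to the policy maximum

so each frequency update effectively requests the top frequency and schedutil on a PDS kernel behaves much like the performance governor; frequency scaling on such kernels is instead expected to come from the retuned ondemand/conservative governors elsewhere in this patch.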
+@@ -445,7 +453,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_PDS + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -688,6 +698,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -918,6 +929,7 @@ static int __init sugov_register(void) + fs_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_PDS + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -948,4 +960,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_PDS */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 46ed4e1383e2..0a9548ee995c 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,12 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + ++#ifdef CONFIG_SCHED_PDS ++ index = (task_nice(p) > 0 || task_running_idle(p)) ? CPUTIME_NICE : ++ CPUTIME_USER; ++#else + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++#endif + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +151,11 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ ++#ifdef CONFIG_SCHED_PDS ++ if (task_nice(p) > 0 || task_running_idle(p)) { ++#else + if (task_nice(p) > 0) { ++#endif + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +278,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +288,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -663,7 +672,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f65ef1e2f204..454fa7e460e3 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -355,6 +355,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * idle-task scheduling class. 
+ */ +@@ -479,3 +480,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +new file mode 100644 +index 000000000000..aefbd9cebcfb +--- /dev/null ++++ b/kernel/sched/pds.c +@@ -0,0 +1,6566 @@ ++/* ++ * kernel/sched/pds.c, was kernel/sched.c ++ * ++ * PDS-mq Core kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ */ ++#include "pds_sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++ ++#define rt_prio(prio) ((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR || \ ++ (policy) == SCHED_ISO) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define idle_policy(policy) ((policy) == SCHED_IDLE) ++#define idleprio_task(p) unlikely(idle_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) ++#define JIFFY_NS (1000000000 / HZ) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen and kept alive artificially by Tk-Glitch.\n"); ++} ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. ++ * Tunable via /proc interface. 
++ */ ++#define SCHED_DEFAULT_RR (4) ++int rr_interval __read_mostly = SCHED_DEFAULT_RR; ++ ++static int __init rr_interval_set(char *str) ++{ ++ u32 rr; ++ ++ pr_info("rr_interval: "); ++ if (kstrtouint(str, 0, &rr)) { ++ pr_cont("using default of %u, unable to parse %s\n", ++ rr_interval, str); ++ return 1; ++ } ++ ++ rr_interval = rr; ++ pr_cont("%d\n", rr_interval); ++ ++ return 1; ++} ++__setup("rr_interval=", rr_interval_set); ++ ++ ++static const u64 sched_prio2deadline[NICE_WIDTH] = { ++/* -20 */ 6291456, 6920601, 7612661, 8373927, 9211319, ++/* -15 */ 10132450, 11145695, 12260264, 13486290, 14834919, ++/* -10 */ 16318410, 17950251, 19745276, 21719803, 23891783, ++/* -5 */ 26280961, 28909057, 31799962, 34979958, 38477953, ++/* 0 */ 42325748, 46558322, 51214154, 56335569, 61969125, ++/* 5 */ 68166037, 74982640, 82480904, 90728994, 99801893, ++/* 10 */ 109782082, 120760290, 132836319, 146119950, 160731945, ++/* 15 */ 176805139, 194485652, 213934217, 235327638, 258860401 ++}; ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++#ifdef CONFIG_SMP ++enum { ++SCHED_RQ_EMPTY = 0, ++SCHED_RQ_IDLE, ++SCHED_RQ_NORMAL_0, ++SCHED_RQ_NORMAL_1, ++SCHED_RQ_NORMAL_2, ++SCHED_RQ_NORMAL_3, ++SCHED_RQ_NORMAL_4, ++SCHED_RQ_NORMAL_5, ++SCHED_RQ_NORMAL_6, ++SCHED_RQ_NORMAL_7, ++SCHED_RQ_ISO, ++SCHED_RQ_RT, ++NR_SCHED_RQ_QUEUED_LEVEL ++}; ++ ++static cpumask_t sched_rq_queued_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_queued_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++static cpumask_t sched_rq_pending_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_pending_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_chk_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_start_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_chk_end_masks); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_PER_CPU(int, sched_sibling_cpu); ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++ ++static cpumask_t sched_cpu_sg_idle_mask ____cacheline_aligned_in_smp; ++ ++#ifdef CONFIG_SMT_NICE ++/* ++ * Preemptible sibling group mask ++ * Which all sibling cpus are running at PRIO_LIMIT or IDLE_PRIO ++ */ ++static cpumask_t sched_cpu_psg_mask ____cacheline_aligned_in_smp; ++/* ++ * SMT supressed mask ++ * When a cpu is running task with NORMAL/ISO/RT policy, its sibling cpu ++ * will be supressed to run IDLE priority task. ++ */ ++static cpumask_t sched_smt_supressed_mask ____cacheline_aligned_in_smp; ++#endif /* CONFIG_SMT_NICE */ ++#endif ++ ++static int sched_rq_prio[NR_CPUS] ____cacheline_aligned; ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. 
++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. ++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. ++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++static inline void update_task_priodl(struct task_struct *p) ++{ ++ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes CPU fairly amongst tasks of the ++ * same nice value, it proportions CPU according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 task_deadline_diff(const struct task_struct *p) ++{ ++ return sched_prio2deadline[TASK_USER_PRIO(p)]; ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return sched_prio2deadline[USER_PRIO(static_prio)]; ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline for non-rt tasks. 
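Two quick userspace checks of the deadline machinery above (an illustrative sketch, not from the patch; pack_priodl() is a hypothetical stand-in for what update_task_priodl() computes). The first shows that packing prio into the top 8 bits and deadline>>8 into the low 56 bits lets a single u64 comparison order tasks by priority first and virtual deadline second. The second reproduces the "nice 19 gets about 3% of nice 0" figure from the ~1.1x-per-nice-level growth of sched_prio2deadline.

#include <stdint.h>
#include <stdio.h>

static uint64_t pack_priodl(uint64_t prio, uint64_t deadline)
{
        return (prio << 56) | (deadline >> 8);
}

int main(void)
{
        /* same prio: earlier deadline sorts first; lower prio always wins */
        printf("%d %d\n",
               pack_priodl(120, 1000000) < pack_priodl(120, 2000000),
               pack_priodl(100, 9000000) < pack_priodl(120, 1000000)); /* 1 1 */

        /* nice 0 and nice 19 entries of sched_prio2deadline, in ns */
        double nice_0 = 42325748.0, nice_19 = 258860401.0;
        double ratio = nice_19 / nice_0;                /* ~1.1^19 ~= 6.1 */
        printf("nice 19 vs nice 0 CPU share: ~%.1f%%\n",
               100.0 / (ratio * ratio));                /* ~2.7%, i.e. "3%" */
        return 0;
}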
++ */ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ if (p->prio >= NORMAL_PRIO) ++ p->deadline = rq->clock + task_deadline_diff(p); ++ ++ update_task_priodl(p); ++} ++ ++static inline struct task_struct *rq_first_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline struct task_struct *rq_second_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]->next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline int is_second_in_rq(struct task_struct *p, struct rq *rq) ++{ ++ return (p->sl_node.prev[0]->prev[0] == &rq->sl_header); ++} ++ ++static const int task_dl_hash_tbl[] = { ++/* 0 4 8 12 */ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ++/* 16 20 24 28 */ ++ 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7 ++}; ++ ++static inline int ++task_deadline_level(const struct task_struct *p, const struct rq *rq) ++{ ++ u64 delta = (rq->clock + sched_prio2deadline[39] - p->deadline) >> 23; ++ ++ delta = min((size_t)delta, ARRAY_SIZE(task_dl_hash_tbl) - 1); ++ return task_dl_hash_tbl[delta]; ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. 
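A C11-atomics analogue of the fetch_or()/set_nr_if_polling() pattern above (an illustrative sketch, not from the patch; the flag values and set_resched_if_polling() are made up for the example). The point is that the resched bit is only published while the polling bit is still observed, inside a single compare-exchange loop, which is what lets the caller skip sending an IPI.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define FLAG_POLLING (1u << 0)          /* stand-in for _TIF_POLLING_NRFLAG */
#define FLAG_RESCHED (1u << 1)          /* stand-in for _TIF_NEED_RESCHED */

static bool set_resched_if_polling(atomic_uint *flags)
{
        unsigned int val = atomic_load(flags);

        for (;;) {
                if (!(val & FLAG_POLLING))
                        return false;   /* not polling: caller must send an IPI */
                if (val & FLAG_RESCHED)
                        return true;    /* already requested */
                /* try to publish RESCHED; on failure val is reloaded */
                if (atomic_compare_exchange_weak(flags, &val, val | FLAG_RESCHED))
                        return true;
        }
}

int main(void)
{
        atomic_uint flags = FLAG_POLLING;

        printf("%d 0x%x\n", set_resched_if_polling(&flags), (unsigned)flags);
        /* prints "1 0x3": the resched request was set without an IPI */
        return 0;
}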
++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_SMT_NICE ++static void resched_cpu_if_curr_is(int cpu, int priority) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rcu_read_lock(); ++ ++ if (rcu_dereference(rq->curr)->prio != priority) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ if (!do_raw_spin_trylock(&rq->lock)) ++ goto out; ++ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ if (priority == rq->curr->prio) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ ++ spin_release(&rq->lock.dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(&rq->lock); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline bool ++__update_cpumasks_bitmap(int cpu, unsigned long *plevel, unsigned long level, ++ cpumask_t cpumasks[], unsigned long bitmap[]) ++{ ++ if (*plevel == level) ++ return false; ++ ++ cpumask_clear_cpu(cpu, cpumasks + *plevel); ++ if (cpumask_empty(cpumasks + *plevel)) ++ clear_bit(*plevel, bitmap); ++ cpumask_set_cpu(cpu, cpumasks + level); ++ set_bit(level, bitmap); ++ ++ *plevel = level; ++ ++ return true; ++} ++ ++static inline int ++task_running_policy_level(const struct task_struct *p, const struct rq *rq) ++{ ++ int prio = p->prio; ++ ++ if (NORMAL_PRIO == prio) ++ return SCHED_RQ_NORMAL_0 + task_deadline_level(p, rq); ++ ++ if (ISO_PRIO == prio) ++ return SCHED_RQ_ISO; ++ if (prio < MAX_RT_PRIO) ++ return SCHED_RQ_RT; ++ return PRIO_LIMIT - prio; ++} ++ ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) ++{ ++ struct task_struct *p = rq_first_queued_task(rq); ++ ++ if (p->prio != NORMAL_PRIO) ++ return; ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->queued_level, ++ task_running_policy_level(p, rq), ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0]); ++} ++ ++#ifdef CONFIG_SMT_NICE ++static inline void update_sched_cpu_psg_mask(const int cpu) ++{ ++ cpumask_t tmp; ++ ++ cpumask_or(&tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_IDLE]); ++ cpumask_and(&tmp, &tmp, cpu_smt_mask(cpu)); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++ else ++ cpumask_andnot(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++} ++#endif ++ ++static inline void update_sched_rq_queued_masks(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ struct task_struct *p = rq_first_queued_task(rq); ++ unsigned long level; ++#ifdef CONFIG_SCHED_SMT ++ unsigned long last_level = rq->queued_level; ++#endif ++ ++ level = task_running_policy_level(p, rq); ++ sched_rq_prio[cpu] = p->prio; ++ ++ if (!__update_cpumasks_bitmap(cpu, &rq->queued_level, level, ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0])) ++ return; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpu == 
per_cpu(sched_sibling_cpu, cpu)) ++ return; ++ ++ if (SCHED_RQ_EMPTY == last_level) { ++ cpumask_andnot(&sched_cpu_sg_idle_mask, &sched_cpu_sg_idle_mask, ++ cpu_smt_mask(cpu)); ++ } else if (SCHED_RQ_EMPTY == level) { ++ cpumask_t tmp; ++ ++ cpumask_and(&tmp, cpu_smt_mask(cpu), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_cpu_sg_idle_mask); ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (level <= SCHED_RQ_IDLE && last_level > SCHED_RQ_IDLE) { ++ cpumask_clear_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), PRIO_LIMIT); ++ } else if (last_level <= SCHED_RQ_IDLE && level > SCHED_RQ_IDLE) { ++ cpumask_set_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), IDLE_PRIO); ++ } ++#endif /* CONFIG_SMT_NICE */ ++#endif ++} ++ ++static inline void update_sched_rq_pending_masks(struct rq *rq) ++{ ++ unsigned long level; ++ struct task_struct *p = rq_second_queued_task(rq); ++ ++ level = task_running_policy_level(p, rq); ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->pending_level, level, ++ &sched_rq_pending_masks[0], ++ &sched_rq_pending_masks_bitmap[0]); ++} ++ ++#else /* CONFIG_SMP */ ++static inline void update_sched_rq_queued_masks(struct rq *rq) {} ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) {} ++static inline void update_sched_rq_pending_masks(struct rq *rq) {} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Removing from the runqueue. Deleting a task from the skip list is done ++ * via the stored node reference in the task struct and does not require a full ++ * look up. Thus it occurs in O(k) time where k is the "level" of the list the ++ * task was stored at - usually < 4, max 16. ++ * ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running--; ++ ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ sched_info_dequeued(rq, p); ++} ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLE to actually run as ++ * an idle task, we ensure none of the following conditions are met. 
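The bookkeeping idea behind __update_cpumasks_bitmap() above, shrunk to plain unsigned masks (an illustrative sketch, not from the patch; all names below are hypothetical). Each queued level owns a mask of CPUs and a summary bitmap records which levels are non-empty, so the cheapest occupied level can be found with one find-first-bit.

#include <stdio.h>
#include <strings.h>        /* ffs() */

#define NR_LEVELS 4

static unsigned int level_mask[NR_LEVELS];  /* which CPUs sit at each level */
static unsigned int level_bitmap;           /* which levels are non-empty */

static void move_cpu_to_level(int cpu, int *cur_level, int new_level)
{
        if (*cur_level == new_level)
                return;

        level_mask[*cur_level] &= ~(1u << cpu);
        if (!level_mask[*cur_level])
                level_bitmap &= ~(1u << *cur_level);

        level_mask[new_level] |= 1u << cpu;
        level_bitmap |= 1u << new_level;
        *cur_level = new_level;
}

int main(void)
{
        int cpu1_level = 0;

        level_mask[0] = 0x3;        /* CPUs 0 and 1 both start at level 0 */
        level_bitmap = 0x1;

        move_cpu_to_level(1, &cpu1_level, 3);
        printf("lowest occupied level: %d, bitmap: 0x%x\n",
               ffs(level_bitmap) - 1, level_bitmap);    /* 0, 0x9 */
        return 0;
}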
++ */ ++static inline bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!freezing(p) && !signal_pending(p) && ++ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); ++} ++ ++/* ++ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip ++ * list node which is used in PDS run queue. ++ * ++ * In current implementation, based on testing, the first 8 bits in microseconds ++ * of niffies are suitable for random level population. ++ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there ++ * should be platform hardware supported instruction(known as ctz/clz) to speed ++ * up this function. ++ * The skiplist level for a task is populated when task is created and doesn't ++ * change in task's life time. When task is being inserted into run queue, this ++ * skiplist level is set to task's sl_node->level, the skiplist insert function ++ * may change it based on current level of the skip lsit. ++ */ ++static inline int pds_skiplist_random_level(const struct task_struct *p) ++{ ++ long unsigned int randseed; ++ ++ /* ++ * 1. Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as a factor of the random seed for skiplist ++ * insertion. ++ * 2. Use address of task structure pointer as another factor of the ++ * random seed for task burst forking scenario. ++ */ ++ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; ++ ++ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); ++} ++ ++/** ++ * pds_skiplist_task_search -- search function used in PDS run queue skip list ++ * node insert operation. ++ * @it: iterator pointer to the node in the skip list ++ * @node: pointer to the skiplist_node to be inserted ++ * ++ * Returns true if key of @it is less or equal to key value of @node, otherwise ++ * false. ++ */ ++static inline bool ++pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) ++{ ++ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= ++ skiplist_entry(node, struct task_struct, sl_node)->priodl); ++} ++ ++/* ++ * Define the skip list insert function for PDS ++ */ ++DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); ++ ++/* ++ * Adding task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running++; ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. 
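Why find_first_bit() on a random seed gives pds_skiplist_random_level() the distribution a skip list wants (an illustrative sketch, not from the patch; rand() and ffs() stand in for the niffies-derived seed and find_first_bit()). The index of the lowest set bit of a uniformly random word is k with probability 2^-(k+1), so each extra level is half as likely as the one below it, i.e. p = 0.5 per level.

#include <stdio.h>
#include <stdlib.h>
#include <strings.h>        /* ffs() */

int main(void)
{
        enum { LEVELS = 8, SAMPLES = 1 << 20 };
        long counts[LEVELS] = { 0 };

        srand(1);
        for (int i = 0; i < SAMPLES; i++) {
                int r = rand();
                if (r == 0)
                        continue;               /* ffs(0) == 0, skip the rare zero */
                int level = ffs(r) - 1;         /* index of the lowest set bit */
                if (level < LEVELS)
                        counts[level]++;
        }

        for (int k = 0; k < LEVELS; k++)
                printf("level %d: %5.2f%% (expected %5.2f%%)\n", k,
                       100.0 * counts[k] / SAMPLES, 100.0 / (1 << (k + 1)));
        return 0;
}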
++ */ ++ if (p->in_iowait) ++ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ bool b_first, b_second; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); ++ b_second = is_second_in_rq(p, rq); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq) || b_second) ++ update_sched_rq_pending_masks(rq); ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ if (curr->prio == PRIO_LIMIT) ++ resched_curr(rq); ++ ++ if (task_running_idle(p)) ++ return; ++ ++ if (p->priodl < curr->priodl) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * PDS doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ rq->hrtick_csd_pending = 0; ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. 
++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) { ++ __hrtick_restart(rq); ++ } else if (!rq->hrtick_csd_pending) { ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++ rq->hrtick_csd_pending = 1; ++ } ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. ++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd_pending = 0; ++ ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if ((rq->clock - rq->last_tick > HALF_JIFFY_NS) || hrtick_enabled(rq)) ++ return 0; ++ ++ return HALF_JIFFY_NS; ++} ++ ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ return (rq->clock - rq->last_tick > HALF_JIFFY_NS)? 0:HALF_JIFFY_NS; ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ static const int policy_to_prio[] = { ++ NORMAL_PRIO, /* SCHED_NORMAL */ ++ 0, /* SCHED_FIFO */ ++ 0, /* SCHED_RR */ ++ IDLE_PRIO, /* SCHED_BATCH */ ++ ISO_PRIO, /* SCHED_ISO */ ++ IDLE_PRIO /* SCHED_IDLE */ ++ }; ++ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ return policy_to_prio[p->policy]; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = 1; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. 
We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. ++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, &p->cpus_mask)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * detach_task() -- detach the task for the migration specified in @target_cpu ++ */ ++static void detach_task(struct rq *rq, struct task_struct *p, int target_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WRITE_ONCE(p->on_rq ,TASK_ON_RQ_MIGRATING); ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, 0); ++ ++ set_task_cpu(p, target_cpu); ++} ++ ++/* ++ * attach_task() -- attach the task detached by detach_task() to its new rq. ++ */ ++static void attach_task(struct rq *rq, struct task_struct *p) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ BUG_ON(task_rq(p) != rq); ++ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. 
++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ detach_task(rq, p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ attach_task(rq, p); ++ ++ check_preempt_curr(rq, p); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). */ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. ++ */ ++ if (task_rq(p) == rq) ++ if (task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/* Enter with rq lock held. We know p is on the local CPU */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. 
This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_mask is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). 
At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. */ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, &p->cpus_mask) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. 
++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ cpumask_t *mask; ++ ++ if (cpumask_test_cpu(cpu, cpumask)) ++ return cpu; ++ ++ mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ ++ return cpu; ++} ++ ++/* ++ * task_preemptible_rq - return the rq which the given task can preempt on ++ * @p: task wants to preempt CPU ++ * @only_preempt_low_policy: indicate only preempt rq running low policy than @p ++ */ ++static inline int ++task_preemptible_rq_idle(struct task_struct *p, cpumask_t *chk_mask) ++{ ++ cpumask_t tmp; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++ ++#ifdef CONFIG_SMT_NICE ++ /* Only ttwu on cpu which is not smt supressed */ ++ if (cpumask_andnot(&tmp, chk_mask, &sched_smt_supressed_mask)) { ++ cpumask_t t; ++ if (cpumask_and(&t, &tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &t); ++ return best_mask_cpu(task_cpu(p), &tmp); ++ } ++#endif ++ ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++static inline int ++task_preemptible_rq(struct task_struct *p, cpumask_t *chk_mask, ++ int preempt_level) ++{ ++ cpumask_t tmp; ++ int level; ++ ++#ifdef CONFIG_SCHED_SMT ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_psg_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#else ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++#endif ++ ++ level = find_first_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL); ++ ++ while (level < preempt_level) { ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[level])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ level = find_next_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL, ++ level + 1); ++ } ++ ++ if (unlikely(SCHED_RQ_RT == level && ++ level == preempt_level && ++ cpumask_and(&tmp, chk_mask, ++ &sched_rq_queued_masks[SCHED_RQ_RT]))) { ++ unsigned int cpu; ++ ++ for_each_cpu (cpu, &tmp) ++ if (p->prio < sched_rq_prio[cpu]) ++ return cpu; ++ } ++ ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask; ++ ++ if (unlikely(!cpumask_and(&chk_mask, &p->cpus_mask, cpu_online_mask))) ++ return select_fallback_rq(task_cpu(p), p); ++ ++ /* Check IDLE tasks suitable to run normal priority */ ++ if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ p->prio = p->normal_prio; ++ update_task_priodl(p); ++ return task_preemptible_rq_idle(p, &chk_mask); ++ } ++ p->prio = NORMAL_PRIO; ++ update_task_priodl(p); ++ } ++ ++ return task_preemptible_rq(p, &chk_mask, ++ task_running_policy_level(p, this_rq())); ++} ++#else /* CONFIG_SMP */ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, 
int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** PDS ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. 
++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ int cpu, success = 0; ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto stat; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. 
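The BLOCKING guarantee above expressed with C11 atomics (an illustrative sketch, not from the patch; the thread bodies and the task_state field are made up for the example). A release store of on_cpu = 0, as finish_task() does, paired with an acquire spin like the smp_cond_load_acquire() just below, is what lets the waking CPU observe every store the old CPU made while the task was still running there.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_cpu = 1;
static int task_state;              /* plain data protected by the pairing */

static void *old_cpu(void *arg)     /* models finish_task() on the old CPU */
{
        (void)arg;
        task_state = 42;            /* stores done while the task was on_cpu */
        atomic_store_explicit(&on_cpu, 0, memory_order_release);
        return NULL;
}

static void *waker(void *arg)       /* models try_to_wake_up() */
{
        (void)arg;
        while (atomic_load_explicit(&on_cpu, memory_order_acquire))
                ;                   /* like smp_cond_load_acquire(&p->on_cpu, !VAL) */
        printf("saw task_state = %d\n", task_state);    /* always prints 42 */
        return NULL;
}

int main(void)
{
        pthread_t a, b;

        pthread_create(&a, NULL, waker, NULL);
        pthread_create(&b, NULL, old_cpu, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}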
++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if (SCHED_ISO == p->policy && ISO_PRIO != p->prio) { ++ p->prio = ISO_PRIO; ++ p->deadline = 0UL; ++ update_task_priodl(p); ++ } ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif ++ ++ rq = cpu_rq(cpu); ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ ++stat: ++ ttwu_stat(p, cpu, wake_flags); ++out: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* Should be reset in fork.c but done here for ease of PDS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = 0; ++ ++ p->sl_level = pds_skiplist_random_level(p); ++ INIT_SKIPLIST_NODE(&p->sl_node); ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. 
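The fork-time slice split below in isolation (an illustrative sketch, not from the patch): halving the parent's remaining slice and handing the same amount to the child keeps the total outstanding timeslice constant across fork(), which is the fairness argument the comment makes.

#include <stdio.h>

int main(void)
{
        int parent = 4096;          /* remaining slice in us, e.g. MS_TO_US(4) */
        int before = parent, child;

        parent /= 2;                /* rq->curr->time_slice /= 2; */
        child = parent;             /* p->time_slice = rq->curr->time_slice; */

        printf("parent=%d child=%d total=%d (was %d)\n",
               parent, child, parent + child, before);  /* 2048 2048 4096 4096 */
        return 0;
}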
++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, US_TO_NS(rq->curr->time_slice)); ++#endif ++ ++ if (p->time_slice < RESCHED_US) { ++ update_rq_clock(rq); ++ time_slice_expired(p, rq); ++ resched_curr(rq); ++ } else ++ update_task_priodl(p); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. 
++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_mask can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. 
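The preempt notifier hooks above (preempt_notifier_register() and the fire_sched_in/out helpers) are an in-kernel API, with KVM as the main consumer. A rough kernel-side sketch of wiring one up, assuming the usual struct preempt_ops / preempt_notifier_init() interface from <linux/preempt.h>; the demo_* names are placeholders, not anything defined by this patch:

    #include <linux/preempt.h>
    #include <linux/printk.h>
    #include <linux/sched.h>

    /* Placeholder callbacks: run when current is scheduled in / switched out. */
    static void demo_sched_in(struct preempt_notifier *pn, int cpu)
    {
        pr_debug("scheduled in on CPU %d\n", cpu);
    }

    static void demo_sched_out(struct preempt_notifier *pn, struct task_struct *next)
    {
        pr_debug("switched out, next is %s\n", next->comm);
    }

    static struct preempt_ops demo_ops = {
        .sched_in  = demo_sched_in,
        .sched_out = demo_sched_out,
    };

    static struct preempt_notifier demo_notifier;

    static void demo_register(void)
    {
        preempt_notifier_inc();            /* enable the static key checked above */
        preempt_notifier_init(&demo_notifier, &demo_ops);
        preempt_notifier_register(&demo_notifier);   /* attaches to current */
    }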
++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, 1, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. 
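The prepare_task()/finish_task() pair above is a release/acquire handshake on p->on_cpu: finish_task() publishes "no longer running" with smp_store_release(), and try_to_wake_up() waits with smp_cond_load_acquire(), so every store made while the task was still running is visible to the waker. The same ordering idea in portable C11 atomics, as a standalone illustration rather than kernel code (build with -pthread):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_int on_cpu = 1;   /* stands in for p->on_cpu */
    static int task_data;           /* written while "running" */

    static void *prev_cpu(void *arg)
    {
        task_data = 42;
        /* like smp_store_release(&prev->on_cpu, 0) in finish_task() */
        atomic_store_explicit(&on_cpu, 0, memory_order_release);
        return NULL;
    }

    static void *waker(void *arg)
    {
        /* like smp_cond_load_acquire(&p->on_cpu, !VAL) in try_to_wake_up() */
        while (atomic_load_explicit(&on_cpu, memory_order_acquire))
            ;
        printf("waker sees task_data=%d\n", task_data);  /* always 42 */
        return NULL;
    }

    int main(void)
    {
        pthread_t a, b;
        pthread_create(&a, NULL, prev_cpu, NULL);
        pthread_create(&b, NULL, waker, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
    }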
++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. 
++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. 
Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void pds_update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ p->time_slice -= NS_TO_US(ns); ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ pds_update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void pds_scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++ ++ /** ++ * p->time_slice < RESCHED_US. 
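pds_update_curr() and task_sched_runtime() above are what per-thread CPU-clock reads end up calling (the comment's reference to clock_gettime()). From userspace the same accounting is visible through CLOCK_THREAD_CPUTIME_ID; a minimal sketch:

    #include <stdio.h>
    #include <time.h>

    int main(void)
    {
        struct timespec ts;
        volatile unsigned long spin = 0;
        unsigned long i;

        /* Burn a little CPU so the accounted runtime is visibly non-zero. */
        for (i = 0; i < 100000000UL; i++)
            spin += i;

        if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)
            printf("thread CPU time: %ld.%09ld s\n",
                   (long)ts.tv_sec, ts.tv_nsec);
        return 0;
    }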
We will modify task_struct under ++ * rq lock as p is rq->curr ++ */ ++ __set_tsk_resched(p); ++} ++ ++#ifdef CONFIG_SMP ++ ++#ifdef CONFIG_SCHED_SMT ++static int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ int cpu; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* ++ * _something_ may have changed the task, double check again ++ */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ (cpu = cpumask_any_and(&p->cpus_mask, &sched_cpu_sg_idle_mask)) < nr_cpu_ids) ++ rq = __migrate_task(rq, p, cpu); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* pds_sg_balance_trigger - trigger slibing group balance for @cpu */ ++static void pds_sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return; ++ curr = rq->curr; ++ if (!is_idle_task(curr) && ++ cpumask_intersects(&curr->cpus_mask, &sched_cpu_sg_idle_mask)) { ++ int active_balance = 0; ++ ++ if (likely(!rq->active_balance)) { ++ rq->active_balance = 1; ++ active_balance = 1; ++ } ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (likely(active_balance)) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ } else ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++/* ++ * pds_sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void pds_sg_balance_check(const struct rq *rq) ++{ ++ cpumask_t chk; ++ int i; ++ ++ /* Only online cpu will do sg balance checking */ ++ if (unlikely(!rq->online)) ++ return; ++ ++ /* Only cpu in slibing idle group will do the checking */ ++ if (!cpumask_test_cpu(cpu_of(rq), &sched_cpu_sg_idle_mask)) ++ return; ++ ++ /* Find potential cpus which can migrate the currently running task */ ++ if (!cpumask_andnot(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return; ++ ++ for_each_cpu(i, &chk) { ++ /* skip the cpu which has idle slibing cpu */ ++ if (cpumask_test_cpu(per_cpu(sched_sibling_cpu, i), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ continue; ++ pds_sg_balance_trigger(i); ++ } ++} ++#endif /* CONFIG_SCHED_SMT */ ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. 
*/ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. 
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. 
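sched_tick_start()/sched_tick_stop() above only arm the 1 Hz remote tick for CPUs excluded from tick housekeeping, i.e. those listed in the nohz_full= boot parameter on CONFIG_NO_HZ_FULL kernels. The active set is normally exported through sysfs; a small sketch that prints it (the path is the standard sysfs location, assumed here rather than defined by this patch):

    #include <stdio.h>

    int main(void)
    {
        char buf[256];
        FILE *f = fopen("/sys/devices/system/cpu/nohz_full", "r");

        if (!f) {
            perror("nohz_full");    /* typically absent without CONFIG_NO_HZ_FULL */
            return 1;
        }
        if (fgets(buf, sizeof(buf), f))
            printf("nohz_full CPUs: %s", buf);   /* e.g. "1-7" */
        fclose(f);
        return 0;
    }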
SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_US) { ++ time_slice_expired(p, rq); ++ if (SCHED_ISO == p->policy && ISO_PRIO == p->prio) { ++ p->prio = NORMAL_PRIO; ++ p->deadline = rq->clock + task_deadline_diff(p); ++ update_task_priodl(p); ++ } ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) ++ requeue_task(p, rq); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, int filter_prio) ++{ ++ struct task_struct *p; ++ int dest_cpu = cpu_of(dest_rq); ++ int nr_migrated = 0; ++ int nr_tries = min((rq->nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION); ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ while (nr_tries && node != &rq->sl_header) { ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ node = node->next[0]; ++ ++ if (task_running(p)) ++ continue; ++ if (p->prio >= filter_prio) ++ break; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) { ++ detach_task(rq, p, dest_cpu); ++ attach_task(dest_rq, p); ++ nr_migrated++; ++ } ++ nr_tries--; ++ /* make a jump */ ++ if (node == &rq->sl_header) ++ break; ++ node = node->next[0]; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int ++take_queued_task_cpumask(struct rq *rq, cpumask_t *chk_mask, int filter_prio) ++{ ++ int src_cpu; ++ ++ for_each_cpu(src_cpu, chk_mask) { ++ int nr_migrated; ++ struct rq *src_rq = cpu_rq(src_cpu); ++ ++ if (!do_raw_spin_trylock(&src_rq->lock)) { ++ if (PRIO_LIMIT == filter_prio) ++ continue; ++ return 0; ++ } ++ spin_acquire(&src_rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ update_rq_clock(src_rq); ++ nr_migrated = migrate_pending_tasks(src_rq, rq, filter_prio); ++ ++ spin_release(&src_rq->lock.dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated || PRIO_LIMIT != filter_prio) ++ return nr_migrated; ++ } ++ return 0; ++} ++ ++static inline int take_other_rq_task(struct rq *rq, int cpu, int filter_prio) ++{ ++ struct cpumask *affinity_mask, *end; ++ struct cpumask chk; ++ ++ if (PRIO_LIMIT == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++#ifdef CONFIG_SMT_NICE ++ { ++ /* also try to take IDLE priority tasks from smt supressed cpu */ ++ struct cpumask t; ++ if (cpumask_and(&t, &sched_smt_supressed_mask, ++ &sched_rq_queued_masks[SCHED_RQ_IDLE])) ++ cpumask_or(&chk, &chk, &t); ++ } ++#endif ++ } else if (NORMAL_PRIO == filter_prio) { ++ cpumask_or(&chk, &sched_rq_pending_masks[SCHED_RQ_RT], ++ &sched_rq_pending_masks[SCHED_RQ_ISO]); ++ } else if (IDLE_PRIO == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ cpumask_andnot(&chk, &chk, &sched_rq_pending_masks[SCHED_RQ_IDLE]); ++ } else ++ cpumask_copy(&chk, &sched_rq_pending_masks[SCHED_RQ_RT]); ++ ++ if (cpumask_empty(&chk)) ++ return 0; ++ ++ affinity_mask = per_cpu(sched_cpu_llc_start_mask, cpu); ++ end = per_cpu(sched_cpu_affinity_chk_end_masks, cpu); ++ do { ++ 
struct cpumask tmp; ++ ++ if (cpumask_and(&tmp, &chk, affinity_mask) && ++ take_queued_task_cpumask(rq, &tmp, filter_prio)) ++ return 1; ++ } while (++affinity_mask < end); ++ ++ return 0; ++} ++#endif ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next = rq_first_queued_task(rq); ++ ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_test_cpu(cpu, &sched_smt_supressed_mask)) { ++ if (next->prio >= IDLE_PRIO) { ++ if (rq->online && ++ take_other_rq_task(rq, cpu, IDLE_PRIO)) ++ return rq_first_queued_task(rq); ++ return rq->idle; ++ } ++ } ++#endif ++ ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (take_other_rq_task(rq, cpu, next->prio)) { ++ resched_curr(rq); ++ return rq_first_queued_task(rq); ++ } ++#endif ++ return next; ++} ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ p->last_ran = rq->clock_task; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (p != rq->idle) ++ hrtick_start(rq, US_TO_NS(p->time_slice)); ++#endif ++ /* update rq->dither */ ++ rq->dither = rq_dither(rq); ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. 
++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which PDS doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_deadline(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ set_rq_task(rq, next); ++ ++ if (prev != next) { ++ if (next->prio == PRIO_LIMIT) ++ schedstat_inc(rq->sched_goidle); ++ ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ rq->nr_switches++; ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++#ifdef CONFIG_SCHED_SMT ++ pds_sg_balance_check(rq); ++#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state || tsk_is_pi_blocked(tsk) || ++ signal_pending_state(tsk->state, tsk)) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & PF_WQ_WORKER) { ++ preempt_disable(); ++ wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. 
++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. 
++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void ++check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * Trigger changes when task priority/deadline modified. ++ */ ++ if (task_on_rq_queued(p)) { ++ struct task_struct *first; ++ ++ requeue_task(p, rq); ++ ++ /* Resched if first queued task not running and not IDLE */ ++ if ((first = rq_first_queued_task(rq)) != rq->curr && ++ !task_running_idle(first)) ++ resched_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. 
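rt_mutex_setprio() above is where priority inheritance finally reaches the scheduler: the boosted priority comes from the rt_mutex top waiter. From userspace the usual way to get kernel-side PI is a PTHREAD_PRIO_INHERIT mutex, which glibc implements with PI futexes on top of rt_mutex. A minimal sketch (build with -pthread):

    #define _GNU_SOURCE
    #include <pthread.h>
    #include <stdio.h>

    int main(void)
    {
        pthread_mutex_t m;
        pthread_mutexattr_t attr;

        pthread_mutexattr_init(&attr);
        /* Ask for kernel-side priority inheritance (PI futex -> rt_mutex). */
        pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
        pthread_mutex_init(&m, &attr);

        pthread_mutex_lock(&m);
        /* While held, a blocked higher-priority waiter would boost this task. */
        pthread_mutex_unlock(&m);

        pthread_mutex_destroy(&m);
        pthread_mutexattr_destroy(&attr);
        puts("PI mutex exercised");
        return 0;
    }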
There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++ ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ /* rq lock may not held!! */ ++ update_rq_clock(rq); ++ ++ p->static_prio = new_static; ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->deadline -= task_deadline_diff(p); ++ p->deadline += static_deadline_diff(new_static); ++ p->prio = effective_prio(p); ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. 
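set_user_nice() and can_nice() above are what nice(2) and setpriority(2) land in; lowering nice needs CAP_SYS_NICE or a sufficient RLIMIT_NICE, which uses the inverted 1..40 scale noted in the comment (1 corresponds to nice 19, 40 to nice -20). A minimal sketch that only raises its own nice value, which is always permitted:

    #include <errno.h>
    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
        int cur;

        errno = 0;
        cur = getpriority(PRIO_PROCESS, 0);   /* -1 can be a real value, so check errno */
        if (errno) {
            perror("getpriority");
            return 1;
        }
        if (setpriority(PRIO_PROCESS, 0, cur + 5) < 0) {
            perror("setpriority");
            return 1;
        }
        printf("nice: %d -> %d\n", cur, getpriority(PRIO_PROCESS, 0));
        return 0;
    }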
Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int level, prio = p->prio - MAX_RT_PRIO; ++ static const int level_to_nice_prio[] = {39, 33, 26, 20, 14, 7, 0, 0}; ++ ++ /* rt tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ preempt_disable(); ++ level = task_deadline_level(p, this_rq()); ++ preempt_enable(); ++ prio += level_to_nice_prio[level]; ++ if (idleprio_task(p)) ++ prio += NICE_WIDTH; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++#ifdef CONFIG_SMP ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. 
++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif ++ ++static u64 task_init_deadline(const struct task_struct *p) ++{ ++ return task_rq(p)->clock + task_deadline_diff(p); ++} ++ ++u64 (* task_init_deadline_func_tbl[])(const struct task_struct *p) = { ++ task_init_deadline, /* SCHED_NORMAL */ ++ NULL, /* SCHED_FIFO */ ++ NULL, /* SCHED_RR */ ++ task_init_deadline, /* SCHED_BATCH */ ++ NULL, /* SCHED_ISO */ ++ task_init_deadline /* SCHED_IDLE */ ++}; ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int old_policy = p->policy; ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++ ++ if (old_policy != policy) ++ p->deadline = (task_init_deadline_func_tbl[p->policy])? ++ task_init_deadline_func_tbl[p->policy](p):0ULL; ++} ++ ++/* Actually do priority change: must hold rq lock. 
*/ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). ++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ update_task_priodl(p); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int ++__sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * PDS supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. ++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. 
++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. ++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. ++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. 
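The unprivileged-RT checks in __sched_setscheduler() above are what a caller without CAP_SYS_NICE runs into: a SCHED_FIFO/SCHED_RR request is only honoured up to RLIMIT_RTPRIO, and the requested priority must sit in 1..MAX_USER_RT_PRIO-1. A minimal userspace sketch of that calling side (illustrative only, not part of the patch), using the standard glibc wrappers:

#include <sched.h>
#include <stdio.h>

int main(void)
{
	/* Priority 10 is within 1..MAX_USER_RT_PRIO-1, the range the
	 * kernel-side validation above accepts for SCHED_FIFO. */
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
		/* Without CAP_SYS_NICE this fails with EPERM unless
		 * RLIMIT_RTPRIO allows a priority of at least 10. */
		perror("sched_setscheduler");
		return 1;
	}
	return 0;
}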
++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. 
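sched_copy_attr() above is driven by the size field that userspace passes in. glibc does not (at the time of this patch) wrap sched_setattr(), so callers normally declare struct sched_attr themselves and issue the raw syscall; a hypothetical sketch of that, not part of the patch:

#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Older (VER0-sized) layout; the utilization-clamp fields are omitted,
 * which sched_copy_attr() above tolerates via the size field. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.sched_policy = SCHED_OTHER;	/* SCHED_NORMAL in the kernel */
	attr.sched_nice = 5;			/* lowering priority is always allowed */

	if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1) {
		perror("sched_setattr");
		return 1;
	}
	return 0;
}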
++ *
++ * Return: 0 on success. An error code otherwise.
++ */
++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
++{
++	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
++}
++
++/**
++ * sys_sched_setattr - same as above, but with extended sched_attr
++ * @pid: the pid in question.
++ * @uattr: structure containing the extended parameters.
++ */
++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
++			       unsigned int, flags)
++{
++	struct sched_attr attr;
++	struct task_struct *p;
++	int retval;
++
++	if (!uattr || pid < 0 || flags)
++		return -EINVAL;
++
++	retval = sched_copy_attr(uattr, &attr);
++	if (retval)
++		return retval;
++
++	if ((int)attr.sched_policy < 0)
++		return -EINVAL;
++
++	rcu_read_lock();
++	retval = -ESRCH;
++	p = find_process_by_pid(pid);
++	if (p != NULL)
++		retval = sched_setattr(p, &attr);
++	rcu_read_unlock();
++
++	return retval;
++}
++
++/**
++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
++ * @pid: the pid in question.
++ *
++ * Return: On success, the policy of the thread. Otherwise, a negative error
++ * code.
++ */
++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
++{
++	struct task_struct *p;
++	int retval = -EINVAL;
++
++	if (pid < 0)
++		goto out_nounlock;
++
++	retval = -ESRCH;
++	rcu_read_lock();
++	p = find_process_by_pid(pid);
++	if (p) {
++		retval = security_task_getscheduler(p);
++		if (!retval)
++			retval = p->policy;
++	}
++	rcu_read_unlock();
++
++out_nounlock:
++	return retval;
++}
++
++/**
++ * sys_sched_getparam - get the RT priority of a thread
++ * @pid: the pid in question.
++ * @param: structure containing the RT priority.
++ *
++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
++ * code.
++ */
++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
++{
++	struct sched_param lp = { .sched_priority = 0 };
++	struct task_struct *p;
++	int retval = -EINVAL;
++
++	if (!param || pid < 0)
++		goto out_nounlock;
++
++	rcu_read_lock();
++	p = find_process_by_pid(pid);
++	retval = -ESRCH;
++	if (!p)
++		goto out_unlock;
++
++	retval = security_task_getscheduler(p);
++	if (retval)
++		goto out_unlock;
++
++	if (task_has_rt_policy(p))
++		lp.sched_priority = p->rt_priority;
++	rcu_read_unlock();
++
++	/*
++	 * This one might sleep, we cannot do it with a spinlock held ...
++	 */
++	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
++
++out_nounlock:
++	return retval;
++
++out_unlock:
++	rcu_read_unlock();
++	return retval;
++}
++
++/*
++ * Copy the kernel size attribute structure (which might be larger
++ * than what user-space knows about) to user-space.
++ *
++ * Note that all cases are valid: user-space buffer can be larger or
++ * smaller than the kernel-space buffer. The usual case is that both
++ * have the same size.
++ */
++static int
++sched_attr_copy_to_user(struct sched_attr __user *uattr,
++			struct sched_attr *kattr,
++			unsigned int usize)
++{
++	unsigned int ksize = sizeof(*kattr);
++
++	if (!access_ok(uattr, usize))
++		return -EFAULT;
++
++	/*
++	 * sched_getattr() ABI forwards and backwards compatibility:
++	 *
++	 * If usize == ksize then we just copy everything to user-space and all is good.
++	 *
++	 * If usize < ksize then we only copy as much as user-space has space for,
++	 * this keeps ABI compatibility as well. We skip the rest.
++	 *
++	 * If usize > ksize then user-space is using a newer version of the ABI,
++	 * which part the kernel doesn't know about.
Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_mask, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_mask); ++ cpumask_and(new_mask, in_mask, cpus_mask); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_mask); ++ if (!cpumask_subset(new_mask, cpus_mask)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_mask to the ++ * cpuset's cpus_mask ++ */ ++ cpumask_copy(new_mask, cpus_mask); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_mask); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? 
-EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ if (sched_yield_type > 1) { ++ time_slice_expired(current, rq); ++ requeue_task(current, rq); ++ } ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. 
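The sched_setaffinity()/sched_getaffinity() syscalls above are reached from userspace through the glibc wrappers of the same name; a small sketch (illustrative, not part of the patch) that pins the calling thread to CPU 0 and reads the effective mask back:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);

	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return 1;
	}

	/* The kernel intersects the request with the cpuset's mask;
	 * read back what actually took effect. */
	if (sched_getaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_getaffinity");
		return 1;
	}

	printf("CPU 0 in mask: %d\n", CPU_ISSET(0, &set));
	return 0;
}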
++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In PDS, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
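As the yield() comment above says, kernel code that just wants to be polite in a long loop should use cond_resched() rather than yield(). A kernel-side sketch of that pattern (illustrative only, not from the patch):

#include <linux/types.h>
#include <linux/sched.h>

/* Sums a large buffer, voluntarily rescheduling every 64K elements so
 * other runnable tasks are not starved on !CONFIG_PREEMPTION kernels. */
static u64 example_sum_buffer(const u32 *buf, size_t n)
{
	u64 sum = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		sum += buf[i];
		if ((i & 0xffff) == 0)
			cond_resched();
	}
	return sum;
}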
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(MS_TO_NS(rr_interval)); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
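From userspace, the two priority-range syscalls above and sched_rr_get_interval() are available through glibc; a small sketch (not part of the patch) that prints the SCHED_FIFO range and the round-robin interval, which under PDS is simply the fixed rr_interval converted to a timespec:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	int min = sched_get_priority_min(SCHED_FIFO);
	int max = sched_get_priority_max(SCHED_FIFO);

	if (min == -1 || max == -1) {
		perror("sched_get_priority_min/max");
		return 1;
	}

	if (sched_rr_get_interval(0, &ts) == -1) {
		perror("sched_rr_get_interval");
		return 1;
	}

	printf("SCHED_FIFO priority range: %d..%d\n", min, max);
	printf("RR interval: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}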
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* PDS TODO: should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->deadline = rq_clock(rq) + task_deadline_diff(idle); ++ update_task_priodl(idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. 
++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++static bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. 
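wake_q_add()/wake_up_q() above are meant to be used in pairs: wakeups are collected while a lock is held and issued only after it is dropped, so wake_up_process() never runs under the lock. A kernel-side sketch of that pattern (the waiter structure and lock are made up for illustration, not taken from the patch):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

struct example_waiter {
	struct task_struct	*task;
	struct list_head	node;
};

static void example_wake_all(spinlock_t *lock, struct list_head *waiters)
{
	DEFINE_WAKE_Q(wake_q);
	struct example_waiter *w;

	spin_lock(lock);
	list_for_each_entry(w, waiters, node)
		wake_q_add(&wake_q, w->task);	/* grabs a task reference */
	spin_unlock(lock);

	wake_up_q(&wake_q);	/* wakes each task and drops the reference */
}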
++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct cpumask *mask; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_chk_end_masks, cpu); mask++) ++ for_each_cpu(i, mask) ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) ++ return i; ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ struct skiplist_node *node; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ node = &rq->sl_header; ++ while ((node = node->next[0]) != &rq->sl_header) { ++ int dest_cpu; ++ ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ ++ /* skip the running task */ ++ if (task_running(p)) ++ continue; ++ ++ /* ++ * Rules for changing task_struct::cpus_mask are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. 
++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ node = &rq->sl_header; ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_debug_enabled; ++ ++static int __init sched_debug_setup(char *str) ++{ ++ sched_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_debug_setup); ++ ++static inline bool sched_debug(void) ++{ ++ return sched_debug_enabled; ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++static inline bool sched_debug(void) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (!idle_cpu(smp_processor_id()) || need_resched()) ++ return; ++ ++ irq_enter(); ++ irq_exit(); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Topology list, bottom-up. ++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, ++#endif ++ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = ++ default_topology; ++ ++#define for_each_sd_topology(tl) \ ++ for (tl = sched_domain_topology; tl->mask; tl++) ++ ++void set_sched_topology(struct sched_domain_topology_level *tl) ++{ ++ if (WARN_ON_ONCE(sched_smp_initialized)) ++ return; ++ ++ sched_domain_topology = tl; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++int sched_domain_level_max; ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. 
This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ /** ++ * PDS doesn't depend on sched domains, but just keep this api ++ */ ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++/* ++ * sched_numa_find_closest() - given the NUMA topology, find the cpu ++ * closest to @cpu from @cpumask. ++ * cpumask: cpumask to find a cpu from ++ * cpu: cpu to be close to ++ * ++ * returns: cpu, or nr_cpu_ids when nothing found. ++ */ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_start_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[1]); ++ } ++} ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ chk = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ ++#ifdef CONFIG_SCHED_SMT ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { ++ per_cpu(sched_sibling_cpu, cpu) = cpumask_first(chk); ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - smt 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++#endif ++#ifdef CONFIG_SCHED_MC ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) { ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - coregroup 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++ cpumask_complement(chk, cpu_coregroup_mask(cpu)); ++ ++ /** ++ * Set up sd_llc_id per CPU ++ */ ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(cpu_coregroup_mask(cpu)); ++#else ++ per_cpu(sd_llc_id, cpu) = ++ 
cpumask_first(topology_core_cpumask(cpu)); ++ ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++#endif /* NOT CONFIG_SCHED_MC */ ++ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - core 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, topology_core_cpumask(cpu)); ++ ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = chk; ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ cpumask_copy(&sched_rq_queued_masks[SCHED_RQ_EMPTY], cpu_online_mask); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ print_scheduler_version(); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < NR_SCHED_RQ_QUEUED_LEVEL; i++) ++ cpumask_clear(&sched_rq_queued_masks[i]); ++ cpumask_setall(&sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_queued_masks_bitmap); ++ ++ cpumask_setall(&sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_pending_masks_bitmap); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); ++ raw_spin_lock_init(&rq->lock); ++ rq->dither = 0; ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++ rq->queued_level = SCHED_RQ_EMPTY; ++ rq->pending_level = SCHED_RQ_EMPTY; ++#ifdef CONFIG_SCHED_SMT ++ per_cpu(sched_sibling_cpu, i) = i; ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. 
Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef 
CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h +new file mode 100644 +index 000000000000..b3926a8425b2 +--- /dev/null ++++ b/kernel/sched/pds_sched.h +@@ -0,0 +1,474 @@ ++#ifndef PDS_SCHED_H ++#define PDS_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. 
++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ struct skiplist_node sl_header; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++ unsigned long queued_level; ++ unsigned long pending_level; ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 clock_task; ++ int dither; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ int hrtick_csd_pending; ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 
rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : PDS need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. 
++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++ ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) ++{ ++ if (cpu_of(rq) == smp_processor_id()) ++ cpufreq_update_util(rq, flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++#endif /* PDS_SCHED_H */ +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index a96db50d40e0..d3d12baa9036 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -236,6 +236,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * sched_entity: + * +@@ -352,6 +353,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index afff644da065..26d6b47fc156 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_PDS + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + int update_irq_load_avg(struct rq *rq, u64 running); +@@ -17,6 +19,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_PDS + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
+@@ -137,9 +140,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_PDS */ + + #else + ++#ifndef CONFIG_SCHED_PDS + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -157,6 +162,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_irq_load_avg(struct rq *rq, u64 running) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c8870c5bd7df..4fc9f2ead4d2 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_PDS ++#include "pds_sched.h" ++#else ++ + #include + + #include +@@ -2496,3 +2500,4 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++#endif /* !CONFIG_SCHED_PDS */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..45bd43942575 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b6f2f35d0bcf..204933ebc95a 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,8 +130,12 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly one_thousand = 1000; ++#ifdef CONFIG_SCHED_PDS ++extern int rr_interval; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -300,7 +304,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_PDS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +321,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_PDS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -498,6 +503,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_PDS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1076,26 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_PDS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ONE, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof 
(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..71af3cd30ccc 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -796,6 +796,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_PDS + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -803,6 +804,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -830,8 +832,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -845,7 +849,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. 
*/ +@@ -1099,8 +1103,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..3eaa2a21caa4 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_PDS ++ /* No deadline on BFS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux-tkg/linux-tkg-patches/5.4/0006-add-acs-overrides_iommu.patch b/linux-tkg/linux-tkg-patches/5.4/0006-add-acs-overrides_iommu.patch new file mode 100644 index 0000000..d1303a5 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0006-add-acs-overrides_iommu.patch @@ -0,0 +1,193 @@ +From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 +From: Mark Weiman +Date: Sun, 12 Aug 2018 11:36:21 -0400 +Subject: [PATCH] pci: Enable overrides for missing ACS capabilities + +This an updated version of Alex Williamson's patch from: +https://lkml.org/lkml/2013/5/30/513 + +Original commit message follows: + +PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that +allows us to control whether transactions are allowed to be redirected +in various subnodes of a PCIe topology. For instance, if two +endpoints are below a root port or downsteam switch port, the +downstream port may optionally redirect transactions between the +devices, bypassing upstream devices. The same can happen internally +on multifunction devices. The transaction may never be visible to the +upstream devices. + +One upstream device that we particularly care about is the IOMMU. If +a redirection occurs in the topology below the IOMMU, then the IOMMU +cannot provide isolation between devices. This is why the PCIe spec +encourages topologies to include ACS support. Without it, we have to +assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. + +Unfortunately, far too many topologies do not support ACS to make this +a steadfast requirement. Even the latest chipsets from Intel are only +sporadically supporting ACS. We have trouble getting interconnect +vendors to include the PCIe spec required PCIe capability, let alone +suggested features. + +Therefore, we need to add some flexibility. The pcie_acs_override= +boot option lets users opt-in specific devices or sets of devices to +assume ACS support. The "downstream" option assumes full ACS support +on root ports and downstream switch ports. The "multifunction" +option assumes the subset of ACS features available on multifunction +endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" +option enables ACS support on devices matching the provided vendor +and device IDs, allowing more strategic ACS overrides. These options +may be combined in any order. A maximum of 16 id specific overrides +are available. It's suggested to use the most limited set of options +necessary to avoid completely disabling ACS across the topology. +Note to hardware vendors, we have facilities to permanently quirk +specific devices which enforce isolation but not provide an ACS +capability. 
Please contact me to have your devices added and save +your customers the hassle of this boot option. + +Signed-off-by: Mark Weiman +--- + .../admin-guide/kernel-parameters.txt | 9 ++ + drivers/pci/quirks.c | 101 ++++++++++++++++++ + 2 files changed, 110 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index aefd358a5ca3..173b3596fd9e 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -3190,6 +3190,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multifunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specific device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index 4700d24e5d55..8f7a3d7fd9c1 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case 
PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. + * The device will throw a Link Down error on AER-capable systems and +@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, + /* Zhaoxin Root/Downstream Ports */ + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + + diff --git a/linux-tkg/linux-tkg-patches/5.4/0007-v5.4-fsync.patch b/linux-tkg/linux-tkg-patches/5.4/0007-v5.4-fsync.patch new file mode 100644 index 0000000..027116f --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0007-v5.4-fsync.patch @@ -0,0 +1,419 @@ +split the futex key setup from the queue locking and key reading. This +is useful to support the setup of multiple keys at the same time, like +what is done in futex_requeue() and what will be done for the +FUTEX_WAIT_MULTIPLE command. + +Signed-off-by: Gabriel Krisman Bertazi +--- + kernel/futex.c | 71 +++++++++++++++++++++++++++++--------------------- + 1 file changed, 42 insertions(+), 29 deletions(-) + +diff --git a/kernel/futex.c b/kernel/futex.c +index 6d50728ef2e7..91f3db335c57 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2631,6 +2631,39 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + __set_current_state(TASK_RUNNING); + } + ++static int __futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, ++ struct futex_q *q, struct futex_hash_bucket **hb) ++{ ++ ++ u32 uval; ++ int ret; ++ ++retry_private: ++ *hb = queue_lock(q); ++ ++ ret = get_futex_value_locked(&uval, uaddr); ++ ++ if (ret) { ++ queue_unlock(*hb); ++ ++ ret = get_user(uval, uaddr); ++ if (ret) ++ return ret; ++ ++ if (!(flags & FLAGS_SHARED)) ++ goto retry_private; ++ ++ return 1; ++ } ++ ++ if (uval != val) { ++ queue_unlock(*hb); ++ ret = -EWOULDBLOCK; ++ } ++ ++ return ret; ++} ++ + /** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address +@@ -2651,7 +2684,6 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb) + { +- u32 uval; + int ret; + + /* +@@ -2672,38 +2704,19 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. + */ +-retry: +- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, FUTEX_READ); +- if (unlikely(ret != 0)) +- return ret; +- +-retry_private: +- *hb = queue_lock(q); ++ do { ++ ret = get_futex_key(uaddr, flags & FLAGS_SHARED, ++ &q->key, FUTEX_READ); ++ if (unlikely(ret != 0)) ++ return ret; + +- ret = get_futex_value_locked(&uval, uaddr); ++ ret = __futex_wait_setup(uaddr, val, flags, q, hb); + +- if (ret) { +- queue_unlock(*hb); +- +- ret = get_user(uval, uaddr); ++ /* Drop key reference if retry or error. 
*/ + if (ret) +- goto out; ++ put_futex_key(&q->key); ++ } while (ret > 0); + +- if (!(flags & FLAGS_SHARED)) +- goto retry_private; +- +- put_futex_key(&q->key); +- goto retry; +- } +- +- if (uval != val) { +- queue_unlock(*hb); +- ret = -EWOULDBLOCK; +- } +- +-out: +- if (ret) +- put_futex_key(&q->key); + return ret; + } + +-- +2.20.1 + +This is a new futex operation, called FUTEX_WAIT_MULTIPLE, which allows +a thread to wait on several futexes at the same time, and be awoken by +any of them. In a sense, it implements one of the features that was +supported by pooling on the old FUTEX_FD interface. + +My use case for this operation lies in Wine, where we want to implement +a similar interface available in Windows, used mainly for event +handling. The wine folks have an implementation that uses eventfd, but +it suffers from FD exhaustion (I was told they have application that go +to the order of multi-milion FDs), and higher CPU utilization. + +In time, we are also proposing modifications to glibc and libpthread to +make this feature available for Linux native multithreaded applications +using libpthread, which can benefit from the behavior of waiting on any +of a group of futexes. + +In particular, using futexes in our Wine use case reduced the CPU +utilization by 4% for the game Beat Saber and by 1.5% for the game +Shadow of Tomb Raider, both running over Proton (a wine based solution +for Windows emulation), when compared to the eventfd interface. This +implementation also doesn't rely of file descriptors, so it doesn't risk +overflowing the resource. + +Technically, the existing FUTEX_WAIT implementation can be easily +reworked by using do_futex_wait_multiple with a count of one, and I +have a patch showing how it works. I'm not proposing it, since +futex is such a tricky code, that I'd be more confortable to have +FUTEX_WAIT_MULTIPLE running upstream for a couple development cycles, +before considering modifying FUTEX_WAIT. + +From an implementation perspective, the futex list is passed as an array +of (pointer,value,bitset) to the kernel, which will enqueue all of them +and sleep if none was already triggered. It returns a hint of which +futex caused the wake up event to userspace, but the hint doesn't +guarantee that is the only futex triggered. Before calling the syscall +again, userspace should traverse the list, trying to re-acquire any of +the other futexes, to prevent an immediate -EWOULDBLOCK return code from +the kernel. + +This was tested using three mechanisms: + +1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and +running the unmodified tools/testing/selftests/futex and a full linux +distro on top of this kernel. + +2) By an example code that exercises the FUTEX_WAIT_MULTIPLE path on a +multi-threaded, event-handling setup. + +3) By running the Wine fsync implementation and executing multi-threaded +applications, in particular the modern games mentioned above, on top of +this implementation. + +Signed-off-by: Zebediah Figura +Signed-off-by: Steven Noonan +Signed-off-by: Pierre-Loup A. 
Griffais +Signed-off-by: Gabriel Krisman Bertazi +--- + include/uapi/linux/futex.h | 7 ++ + kernel/futex.c | 161 ++++++++++++++++++++++++++++++++++++- + 2 files changed, 164 insertions(+), 4 deletions(-) + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e..2401c4cf5095 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,6 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 ++#define FUTEX_WAIT_MULTIPLE 31 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +@@ -150,4 +151,10 @@ struct robust_list_head { + (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ + | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) + ++struct futex_wait_block { ++ __u32 __user *uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ + #endif /* _UAPI_LINUX_FUTEX_H */ +diff --git a/kernel/futex.c b/kernel/futex.c +index 91f3db335c57..2623e8f152cd 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -183,6 +183,7 @@ static int __read_mostly futex_cmpxchg_enabled; + #endif + #define FLAGS_CLOCKRT 0x02 + #define FLAGS_HAS_TIMEOUT 0x04 ++#define FLAGS_WAKE_MULTIPLE 0x08 + + /* + * Priority Inheritance state: +@@ -2720,6 +2721,150 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + return ret; + } + ++static int do_futex_wait_multiple(struct futex_wait_block *wb, ++ u32 count, unsigned int flags, ++ ktime_t *abs_time) ++{ ++ ++ struct hrtimer_sleeper timeout, *to; ++ struct futex_hash_bucket *hb; ++ struct futex_q *qs = NULL; ++ int ret; ++ int i; ++ ++ qs = kcalloc(count, sizeof(struct futex_q), GFP_KERNEL); ++ if (!qs) ++ return -ENOMEM; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, ++ current->timer_slack_ns); ++ retry: ++ for (i = 0; i < count; i++) { ++ qs[i].key = FUTEX_KEY_INIT; ++ qs[i].bitset = wb[i].bitset; ++ ++ ret = get_futex_key(wb[i].uaddr, flags & FLAGS_SHARED, ++ &qs[i].key, FUTEX_READ); ++ if (unlikely(ret != 0)) { ++ for (--i; i >= 0; i--) ++ put_futex_key(&qs[i].key); ++ goto out; ++ } ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ ret = __futex_wait_setup(wb[i].uaddr, wb[i].val, ++ flags, &qs[i], &hb); ++ if (ret) { ++ /* Drop the failed key directly. keys 0..(i-1) ++ * will be put by unqueue_me. ++ */ ++ put_futex_key(&qs[i].key); ++ ++ /* Undo the partial work we did. */ ++ for (--i; i >= 0; i--) ++ unqueue_me(&qs[i]); ++ ++ __set_current_state(TASK_RUNNING); ++ if (ret > 0) ++ goto retry; ++ goto out; ++ } ++ ++ /* We can't hold to the bucket lock when dealing with ++ * the next futex. Queue ourselves now so we can unlock ++ * it before moving on. ++ */ ++ queue_me(&qs[i], hb); ++ } ++ ++ if (to) ++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ++ ++ /* There is no easy to way to check if we are wake already on ++ * multiple futexes without waking through each one of them. So ++ * just sleep and let the scheduler handle it. ++ */ ++ if (!to || to->task) ++ freezable_schedule(); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ ret = -ETIMEDOUT; ++ /* If we were woken (and unqueued), we succeeded. */ ++ for (i = 0; i < count; i++) ++ if (!unqueue_me(&qs[i])) ++ ret = i; ++ ++ /* Succeed wakeup */ ++ if (ret >= 0) ++ goto out; ++ ++ /* Woken by triggered timeout */ ++ if (to && !to->task) ++ goto out; ++ ++ /* ++ * We expect signal_pending(current), but we might be the ++ * victim of a spurious wakeup as well. 
++ */ ++ if (!signal_pending(current)) ++ goto retry; ++ ++ ret = -ERESTARTSYS; ++ if (!abs_time) ++ goto out; ++ ++ ret = -ERESTART_RESTARTBLOCK; ++ out: ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ ++ kfree(qs); ++ return ret; ++} ++ ++static int futex_wait_multiple(u32 __user *uaddr, unsigned int flags, ++ u32 count, ktime_t *abs_time) ++{ ++ struct futex_wait_block *wb; ++ struct restart_block *restart; ++ int ret; ++ ++ if (!count) ++ return -EINVAL; ++ ++ wb = kcalloc(count, sizeof(struct futex_wait_block), GFP_KERNEL); ++ if (!wb) ++ return -ENOMEM; ++ ++ if (copy_from_user(wb, uaddr, ++ count * sizeof(struct futex_wait_block))) { ++ ret = -EFAULT; ++ goto out; ++ } ++ ++ ret = do_futex_wait_multiple(wb, count, flags, abs_time); ++ ++ if (ret == -ERESTART_RESTARTBLOCK) { ++ restart = ¤t->restart_block; ++ restart->fn = futex_wait_restart; ++ restart->futex.uaddr = uaddr; ++ restart->futex.val = count; ++ restart->futex.time = *abs_time; ++ restart->futex.flags = (flags | FLAGS_HAS_TIMEOUT | ++ FLAGS_WAKE_MULTIPLE); ++ } ++ ++out: ++ kfree(wb); ++ return ret; ++} ++ + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) + { +@@ -2797,6 +2942,10 @@ static long futex_wait_restart(struct restart_block *restart) + } + restart->fn = do_no_restart_syscall; + ++ if (restart->futex.flags & FLAGS_WAKE_MULTIPLE) ++ return (long)futex_wait_multiple(uaddr, restart->futex.flags, ++ restart->futex.val, tp); ++ + return (long)futex_wait(uaddr, restart->futex.flags, + restart->futex.val, tp, restart->futex.bitset); + } +@@ -3680,6 +3829,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + uaddr2); + case FUTEX_CMP_REQUEUE_PI: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); ++ case FUTEX_WAIT_MULTIPLE: ++ return futex_wait_multiple(uaddr, flags, val, timeout); + } + return -ENOSYS; + } +@@ -3696,7 +3847,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; + if (get_timespec64(&ts, utime)) +@@ -3705,7 +3857,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +@@ -3889,14 +4041,15 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (get_old_timespec32(&ts, utime)) + return -EFAULT; + if (!timespec64_valid(&ts)) + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +-- +2.20.1 diff --git a/linux-tkg/linux-tkg-patches/5.4/0009-bmq_v5.4-r2.patch b/linux-tkg/linux-tkg-patches/5.4/0009-bmq_v5.4-r2.patch new file mode 100644 index 0000000..4d86ca6 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0009-bmq_v5.4-r2.patch @@ -0,0 +1,7601 @@ +diff --git a/Documentation/admin-guide/sysctl/kernel.rst 
b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..97ea247cc43a 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -105,6 +105,7 @@ show up in /proc/sys/kernel: + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + +@@ -1175,3 +1176,13 @@ is 10 seconds. + + The softlockup threshold is (2 * watchdog_thresh). Setting this + tunable to zero will disable lockup detection altogether. ++ ++yield_type: ++=========== ++ ++BMQ CPU scheduler only. This determines what type of yield calls to ++sched_yield will perform. ++ ++ 0 - No yield. ++ 1 - Deboost and requeue task. (default) ++ 2 - Set run queue skip task. +diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt +new file mode 100644 +index 000000000000..05c84eec0f31 +--- /dev/null ++++ b/Documentation/scheduler/sched-BMQ.txt +@@ -0,0 +1,110 @@ ++ BitMap queue CPU Scheduler ++ -------------------------- ++ ++CONTENT ++======== ++ ++ Background ++ Design ++ Overview ++ Task policy ++ Priority management ++ BitMap Queue ++ CPU Assignment and Migration ++ ++ ++Background ++========== ++ ++BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution ++of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), ++and inspired by Zircon scheduler. The goal of it is to keep the scheduler code ++simple, while efficiency and scalable for interactive tasks, such as desktop, ++movie playback and gaming etc. ++ ++Design ++====== ++ ++Overview ++-------- ++ ++BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, ++each CPU is responsible for scheduling the tasks that are putting into it's ++run queue. ++ ++The run queue is a set of priority queues. Note that these queues are fifo ++queue for non-rt tasks or priority queue for rt tasks in data structure. See ++BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact ++that most applications are non-rt tasks. No matter the queue is fifo or ++priority, In each queue is an ordered list of runnable tasks awaiting execution ++and the data structures are the same. When it is time for a new task to run, ++the scheduler simply looks the lowest numbered queueue that contains a task, ++and runs the first task from the head of that queue. And per CPU idle task is ++also in the run queue, so the scheduler can always find a task to run on from ++its run queue. ++ ++Each task will assigned the same timeslice(default 4ms) when it is picked to ++start running. Task will be reinserted at the end of the appropriate priority ++queue when it uses its whole timeslice. When the scheduler selects a new task ++from the priority queue it sets the CPU's preemption timer for the remainder of ++the previous timeslice. When that timer fires the scheduler will stop execution ++on that task, select another task and start over again. ++ ++If a task blocks waiting for a shared resource then it's taken out of its ++priority queue and is placed in a wait queue for the shared resource. When it ++is unblocked it will be reinserted in the appropriate priority queue of an ++eligible CPU. ++ ++Task policy ++----------- ++ ++BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the ++mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's ++NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each ++policy. ++ ++DEADLINE ++ It is squashed as priority 0 FIFO task. 
++ ++FIFO/RR ++ All RT tasks share one single priority queue in BMQ run queue designed. The ++complexity of insert operation is O(n). BMQ is not designed for system runs ++with major rt policy tasks. ++ ++NORMAL/BATCH/IDLE ++ BATCH and IDLE tasks are treated as the same policy. They compete CPU with ++NORMAL policy tasks, but they just don't boost. To control the priority of ++NORMAL/BATCH/IDLE tasks, simply use nice level. ++ ++ISO ++ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy ++task instead. ++ ++Priority management ++------------------- ++ ++RT tasks have priority from 0-99. For non-rt tasks, there are three different ++factors used to determine the effective priority of a task. The effective ++priority being what is used to determine which queue it will be in. ++ ++The first factor is simply the task’s static priority. Which is assigned from ++task's nice level, within [-20, 19] in userland's point of view and [0, 39] ++internally. ++ ++The second factor is the priority boost. This is a value bounded between ++[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is ++modified by the following cases: ++ ++*When a thread has used up its entire timeslice, always deboost its boost by ++increasing by one. ++*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, ++and its switch-in time(time after last switch and run) below the thredhold ++based on its priority boost, will boost its boost by decreasing by one buti is ++capped at 0 (won’t go negative). ++ ++The intent in this system is to ensure that interactive threads are serviced ++quickly. These are usually the threads that interact directly with the user ++and cause user-perceivable latency. These threads usually do little work and ++spend most of their time blocked awaiting another user event. So they get the ++priority boost from unblocking while background threads that do most of the ++processing receive the priority penalty for using their entire timeslice. +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
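A rough model of the priority handling described in the sched-BMQ.txt text above: a non-rt task's queue position is its nice-derived static priority (internally in [0, 39]) plus a boost bounded by MAX_PRIORITY_ADJ. This is only an illustration of the documented behaviour, not code taken from bmq.c, and the function name effective_prio_model is invented for the example:

/* Illustrative model of BMQ's documented priority handling; not from bmq.c. */
#define MAX_PRIORITY_ADJ	4

static int effective_prio_model(int nice, int boost_prio)
{
	int static_prio = nice + 20;	/* nice [-20, 19] maps to [0, 39] internally */

	/* the boost is documented to stay within [-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] */
	if (boost_prio < -MAX_PRIORITY_ADJ)
		boost_prio = -MAX_PRIORITY_ADJ;
	if (boost_prio > MAX_PRIORITY_ADJ)
		boost_prio = MAX_PRIORITY_ADJ;

	/* a lower result means an earlier (higher-priority) queue in the bitmap */
	return static_prio + boost_prio;
}

The boost itself only moves one step at a time: a task that burns its whole timeslice is deboosted by one, while a task that gives up the CPU shortly after being scheduled is boosted by one, which is what keeps interactive threads near the front of their queue.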
+diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index b66e81c06a57..a294f8f5fd75 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index dced033875bf..d2cd03766b09 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. + */ +diff --git a/fs/proc/base.c b/fs/proc/base.c +index ebea9501afb8..51c9346a69fe 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -477,7 +477,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h +index 8874f681b056..59eb72bf7d5f 100644 +--- a/include/asm-generic/resource.h ++++ b/include/asm-generic/resource.h +@@ -23,7 +23,7 @@ + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ + [RLIMIT_SIGPENDING] = { 0, 0 }, \ + [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ +- [RLIMIT_NICE] = { 0, 0 }, \ ++ [RLIMIT_NICE] = { 30, 30 }, \ + [RLIMIT_RTPRIO] = { 0, 0 }, \ + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ + } +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index 1b6d31da7cbc..dea181bdb1dd 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -171,7 +171,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. 
+ */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 67a1d86981a9..a38ec88efbad 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -644,13 +644,18 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) + struct llist_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_BMQ) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; + #endif ++#ifndef CONFIG_SCHED_BMQ + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; +@@ -664,6 +669,7 @@ struct task_struct { + */ + int recent_used_cpu; + int wake_cpu; ++#endif /* !CONFIG_SCHED_BMQ */ + #endif + int on_rq; + +@@ -672,13 +678,23 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_BMQ ++ u64 last_ran; ++ s64 time_slice; ++ int boost_prio; ++ int bmq_idx; ++ struct list_head bmq_node; ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* !CONFIG_SCHED_BMQ */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1283,6 +1299,15 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_BMQ ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++#endif /* !CONFIG_SCHED_BMQ */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..02a3c5d34ee4 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,22 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_BMQ ++ ++#define __tsk_deadline(p) (0UL) ++ ++static inline int dl_prio(int prio) ++{ ++ return 0; ++} ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return (SCHED_NORMAL == p->policy); ++} ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_BMQ */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..d9dc5d3ccd2e 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,11 +20,17 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + ++#ifdef CONFIG_SCHED_BMQ ++/* +/- priority levels from the base priority */ ++#define MAX_PRIORITY_ADJ 4 ++#endif ++ + /* + * Convert user-nice values [ -20 ... 0 ... 
19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..6387c8ea9832 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_BMQ + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 4b1c3b664f51..f0f966219695 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -99,7 +99,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..f9faeb82f677 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -717,9 +717,28 @@ config GENERIC_SCHED_CLOCK + + menu "Scheduler features" + ++config SCHED_BMQ ++ bool "BMQ CPU scheduler" ++ help ++ The BitMap Queue CPU scheduler for excellent interactivity and ++ responsiveness on the desktop and solid scalability on normal ++ hardware and commodity servers. ++ ++ Say Y here. ++ default y ++ ++config SCHED_TIMESLICE ++ int "Scheduler Task time slice" ++ depends on SCHED_BMQ ++ help ++ Time slice in ms for BMQ CPU scheduler, default 4 ms. ++ default 2 if PREEMPT ++ default 4 if !PREEMPT ++ + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_BMQ + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -802,6 +821,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_BMQ + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -903,7 +923,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. 
+ +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_BMQ + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1150,6 +1170,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_BMQ + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 9e5cbe5eab7b..c293de91d90f 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -66,9 +66,15 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_BMQ ++ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++ .static_prio = DEFAULT_PRIO, ++ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -78,6 +84,12 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifdef CONFIG_SCHED_BMQ ++ .boost_prio = 0, ++ .bmq_idx = 15, ++ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), ++ .time_slice = HZ, ++#else + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -85,6 +97,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index c87ee6412b36..45fac7b9c940 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -632,7 +632,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_BMQ) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? 
+@@ -1007,7 +1007,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_BMQ */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index a46a50d67002..58043176b285 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -131,7 +131,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index cdf318d86dd6..b3bd1e65c002 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_BMQ ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index 2874bf556162..fad8a279fdfa 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. 
+ */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..cab4e5c5b38e 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,14 +16,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_BMQ ++obj-y += bmq.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +diff --git a/kernel/sched/bmq.c b/kernel/sched/bmq.c +new file mode 100644 +index 000000000000..42a2a5b3d172 +--- /dev/null ++++ b/kernel/sched/bmq.c +@@ -0,0 +1,6102 @@ ++/* ++ * kernel/sched/bmq.c ++ * ++ * BMQ Core kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. ++ */ ++#include "bmq_sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++/* rt_prio(prio) defined in include/linux/sched/rt.h */ ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++#define SCHED_TIMESLICE_NS (CONFIG_SCHED_TIMESLICE * 1000 * 1000) ++ ++/* Reschedule if less than this many μs left */ ++#define RESCHED_NS (100 * 1000) ++ ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) \ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "bmq: BMQ CPU Scheduler 5.4-r2 by Alfred Chen.\n"); ++} ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Deboost and requeue task. (default) ++ * 2: Set rq skip task. 
++ */ ++int sched_yield_type __read_mostly = 1; ++ ++#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) ++#define boost_threshold(p) (SCHED_TIMESLICE_NS >>\ ++ (10 - MAX_PRIORITY_ADJ - (p)->boost_prio)) ++ ++static inline void boost_task(struct task_struct *p) ++{ ++ int limit; ++ ++ switch (p->policy) { ++ case SCHED_NORMAL: ++ limit = -MAX_PRIORITY_ADJ; ++ break; ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ limit = 0; ++ break; ++ default: ++ return; ++ } ++ ++ if (p->boost_prio > limit) ++ p->boost_prio--; ++} ++ ++static inline void deboost_task(struct task_struct *p) ++{ ++ if (p->boost_prio < MAX_PRIORITY_ADJ) ++ p->boost_prio++; ++} ++ ++#ifdef CONFIG_SMP ++static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). ++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++#define IDLE_WM (IDLE_TASK_SCHED_PRIO) ++ ++static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; ++static cpumask_t sched_rq_watermark[bmq_BITS] ____cacheline_aligned_in_smp; ++ ++#if (bmq_BITS <= BITS_PER_LONG) ++#define bmq_find_first_bit(bm) __ffs((bm[0])) ++#define bmq_find_next_bit(bm, start) __ffs(BITMAP_FIRST_WORD_MASK(start) & bm[0]) ++#else ++#define bmq_find_first_bit(bm) find_first_bit((bm), bmq_BITS) ++#define bmq_find_next_bit(bm, start) find_next_bit(bm, bmq_BITS, start) ++#endif ++ ++static inline void update_sched_rq_watermark(struct rq *rq) ++{ ++ unsigned long watermark = bmq_find_first_bit(rq->queue.bitmap); ++ unsigned long last_wm = rq->watermark; ++ unsigned long i; ++ int cpu; ++ ++ if (watermark == last_wm) ++ return; ++ ++ rq->watermark = watermark; ++ cpu = cpu_of(rq); ++ if (watermark < last_wm) { ++ for (i = watermark + 1; i <= last_wm; i++) ++ cpumask_andnot(&sched_rq_watermark[i], ++ &sched_rq_watermark[i], cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == last_wm) ++ cpumask_andnot(&sched_sg_idle_mask, ++ &sched_sg_idle_mask, cpu_smt_mask(cpu)); ++#endif ++ return; ++ } ++ /* last_wm < watermark */ ++ for (i = last_wm + 1; i <= watermark; i++) ++ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == watermark) { ++ cpumask_t tmp; ++ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_sg_idle_mask, 
cpu_smt_mask(cpu), ++ &sched_sg_idle_mask); ++ } ++#endif ++} ++ ++static inline int task_sched_prio(struct task_struct *p) ++{ ++ return (p->prio < MAX_RT_PRIO)? 0:p->prio - MAX_RT_PRIO + p->boost_prio + 1; ++} ++ ++static inline void bmq_init(struct bmq *q) ++{ ++ int i; ++ ++ bitmap_zero(q->bitmap, bmq_BITS); ++ for(i = 0; i < bmq_BITS; i++) ++ INIT_LIST_HEAD(&q->heads[i]); ++} ++ ++static inline void bmq_init_idle(struct bmq *q, struct task_struct *idle) ++{ ++ INIT_LIST_HEAD(&q->heads[IDLE_TASK_SCHED_PRIO]); ++ list_add(&idle->bmq_node, &q->heads[IDLE_TASK_SCHED_PRIO]); ++ set_bit(IDLE_TASK_SCHED_PRIO, q->bitmap); ++} ++ ++static inline void bmq_add_task(struct task_struct *p, struct bmq *q, int idx) ++{ ++ struct list_head *n; ++ ++ if (likely(idx)) { ++ list_add_tail(&p->bmq_node, &q->heads[idx]); ++ return; ++ } ++ ++ list_for_each(n, &q->heads[idx]) ++ if (list_entry(n, struct task_struct, bmq_node)->prio > p->prio) ++ break; ++ __list_add(&p->bmq_node, n->prev, n); ++} ++ ++/* ++ * This routine used in bmq scheduler only which assume the idle task in the bmq ++ */ ++static inline struct task_struct *rq_first_bmq_task(struct rq *rq) ++{ ++ unsigned long idx = bmq_find_first_bit(rq->queue.bitmap); ++ const struct list_head *head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++} ++ ++static inline struct task_struct * ++rq_next_bmq_task(struct task_struct *p, struct rq *rq) ++{ ++ unsigned long idx = p->bmq_idx; ++ struct list_head *head = &rq->queue.heads[idx]; ++ ++ if (list_is_last(&p->bmq_node, head)) { ++ idx = bmq_find_next_bit(rq->queue.bitmap, idx + 1); ++ head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++ } ++ ++ return list_next_entry(p, bmq_node); ++} ++ ++static inline struct task_struct *rq_runnable_task(struct rq *rq) ++{ ++ struct task_struct *next = rq_first_bmq_task(rq); ++ ++ if (unlikely(next == rq->skip)) ++ next = rq_next_bmq_task(next, rq); ++ ++ return next; ++} ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static 
inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. ++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. ++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */
++	if (irq_delta > delta)
++		irq_delta = delta;
++
++	rq->prev_irq_time += irq_delta;
++	delta -= irq_delta;
++#endif
++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
++	if (static_key_false((&paravirt_steal_rq_enabled))) {
++		steal = paravirt_steal_clock(cpu_of(rq));
++		steal -= rq->prev_steal_time_rq;
++
++		if (unlikely(steal > delta))
++			steal = delta;
++
++		rq->prev_steal_time_rq += steal;
++		delta -= steal;
++	}
++#endif
++
++	rq->clock_task += delta;
++
++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
++	if ((irq_delta + steal))
++		update_irq_load_avg(rq, irq_delta + steal);
++#endif
++}
++
++static inline void update_rq_clock(struct rq *rq)
++{
++	s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
++
++	if (unlikely(delta <= 0))
++		return;
++	rq->clock += delta;
++	update_rq_clock_task(rq, delta);
++}
++
++/*
++ * cmpxchg based fetch_or, macro so it works for different integer types
++ */
++#define fetch_or(ptr, mask)						\
++	({								\
++		typeof(ptr) _ptr = (ptr);				\
++		typeof(mask) _mask = (mask);				\
++		typeof(*_ptr) _old, _val = *_ptr;			\
++									\
++		for (;;) {						\
++			_old = cmpxchg(_ptr, _val, _val | _mask);	\
++			if (_old == _val)				\
++				break;					\
++			_val = _old;					\
++		}							\
++	_old;								\
++})
++
++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
++/*
++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
++ * this avoids any races wrt polling state changes and thereby avoids
++ * spurious IPIs.
++ */
++static bool set_nr_and_not_polling(struct task_struct *p)
++{
++	struct thread_info *ti = task_thread_info(p);
++	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
++}
++
++/*
++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
++ *
++ * If this returns true, then the idle task promises to call
++ * sched_ttwu_pending() and reschedule soon.
++ */
++static bool set_nr_if_polling(struct task_struct *p)
++{
++	struct thread_info *ti = task_thread_info(p);
++	typeof(ti->flags) old, val = READ_ONCE(ti->flags);
++
++	for (;;) {
++		if (!(val & _TIF_POLLING_NRFLAG))
++			return false;
++		if (val & _TIF_NEED_RESCHED)
++			return true;
++		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
++		if (old == val)
++			break;
++		val = old;
++	}
++	return true;
++}
++
++#else
++static bool set_nr_and_not_polling(struct task_struct *p)
++{
++	set_tsk_need_resched(p);
++	return true;
++}
++
++#ifdef CONFIG_SMP
++static bool set_nr_if_polling(struct task_struct *p)
++{
++	return false;
++}
++#endif
++#endif
++
++#ifdef CONFIG_NO_HZ_FULL
++/*
++ * Tick may be needed by tasks in the runqueue depending on their policy and
++ * requirements. If tick is needed, lets send the target an IPI to kick it out
++ * of nohz mode if necessary.
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Add/Remove/Requeue task to/from the runqueue routines ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "bmq: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ list_del(&p->bmq_node); ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) { ++ clear_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ } ++ --rq->nr_running; ++#ifdef CONFIG_SMP ++ if (1 == rq->nr_running) ++ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ sched_info_dequeued(rq, p); ++} ++ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "bmq: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->bmq_idx = task_sched_prio(p); ++ bmq_add_task(p, &rq->queue, p->bmq_idx); ++ set_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ ++rq->nr_running; ++#ifdef CONFIG_SMP ++ if (2 == rq->nr_running) ++ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. ++ */ ++ if (p->in_iowait) ++ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ int idx = task_sched_prio(p); ++ ++ lockdep_assert_held(&rq->lock); ++ WARN_ONCE(task_rq(p) != rq, "bmq: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ list_del(&p->bmq_node); ++ bmq_add_task(p, &rq->queue, idx); ++ if (idx != p->bmq_idx) { ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) ++ clear_bit(p->bmq_idx, rq->queue.bitmap); ++ p->bmq_idx = idx; ++ set_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ } ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. 
++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++static inline void check_preempt_curr(struct rq *rq) ++{ ++ if (rq_first_bmq_task(rq) != rq->curr) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * BMQ doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ rq->hrtick_csd_pending = 0; ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. ++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) { ++ __hrtick_restart(rq); ++ } else if (!rq->hrtick_csd_pending) { ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++ rq->hrtick_csd_pending = 1; ++ } ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. 
++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd_pending = 0; ++ ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ ++ return p->static_prio + MAX_PRIORITY_ADJ; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ cpufreq_update_util(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_util(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 
++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. ++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). */ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. 
++ */ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/* Enter with rq lock held. We know p is on the local CPU */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! 
++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_ptr is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. */ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, p->cpus_ptr) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. 
++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ cpumask_t *mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ return cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ return cpumask_test_cpu(cpu, cpumask)? cpu:__best_mask_cpu(cpu, cpumask); ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask, tmp; ++ ++ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) ++ return select_fallback_rq(task_cpu(p), p); ++ ++ if ( ++#ifdef CONFIG_SCHED_SMT ++ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || ++#endif ++ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || ++ cpumask_and(&tmp, &chk_mask, ++ &sched_rq_watermark[task_sched_prio(p) + 1])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ return best_mask_cpu(task_cpu(p), &chk_mask); ++} ++#else /* CONFIG_SMP */ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** BMQ ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). 
++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. ++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). 
++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ cpu = task_cpu(p); ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if(cpu_rq(smp_processor_id())->clock - p->last_ran > SCHED_TIMESLICE_NS) ++ boost_task(p); ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, cpu, wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. 
++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* Should be reset in fork.c but done here for ease of BMQ patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = 0; ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ p->boost_prio = (p->boost_prio < 0) ? ++ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, rq->curr->time_slice); ++#endif ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = SCHED_TIMESLICE_NS; ++ resched_curr(rq); ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). 
++ */ ++ __set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_ptr can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. 
++ */
++	__set_task_cpu(p, cpu_of(rq));
++#endif
++
++	raw_spin_lock(&rq->lock);
++
++	update_rq_clock(rq);
++	activate_task(p, rq);
++	trace_sched_wakeup_new(p);
++	check_preempt_curr(rq);
++
++	raw_spin_unlock(&rq->lock);
++	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
++}
++
++#ifdef CONFIG_PREEMPT_NOTIFIERS
++
++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
++
++void preempt_notifier_inc(void)
++{
++	static_branch_inc(&preempt_notifier_key);
++}
++EXPORT_SYMBOL_GPL(preempt_notifier_inc);
++
++void preempt_notifier_dec(void)
++{
++	static_branch_dec(&preempt_notifier_key);
++}
++EXPORT_SYMBOL_GPL(preempt_notifier_dec);
++
++/**
++ * preempt_notifier_register - tell me when current is being preempted & rescheduled
++ * @notifier: notifier struct to register
++ */
++void preempt_notifier_register(struct preempt_notifier *notifier)
++{
++	if (!static_branch_unlikely(&preempt_notifier_key))
++		WARN(1, "registering preempt_notifier while notifiers disabled\n");
++
++	hlist_add_head(&notifier->link, &current->preempt_notifiers);
++}
++EXPORT_SYMBOL_GPL(preempt_notifier_register);
++
++/**
++ * preempt_notifier_unregister - no longer interested in preemption notifications
++ * @notifier: notifier struct to unregister
++ *
++ * This is *not* safe to call from within a preemption notifier.
++ */
++void preempt_notifier_unregister(struct preempt_notifier *notifier)
++{
++	hlist_del(&notifier->link);
++}
++EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
++
++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
++{
++	struct preempt_notifier *notifier;
++
++	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
++		notifier->ops->sched_in(notifier, raw_smp_processor_id());
++}
++
++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
++{
++	if (static_branch_unlikely(&preempt_notifier_key))
++		__fire_sched_in_preempt_notifiers(curr);
++}
++
++static void
++__fire_sched_out_preempt_notifiers(struct task_struct *curr,
++				   struct task_struct *next)
++{
++	struct preempt_notifier *notifier;
++
++	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
++		notifier->ops->sched_out(notifier, next);
++}
++
++static __always_inline void
++fire_sched_out_preempt_notifiers(struct task_struct *curr,
++				 struct task_struct *next)
++{
++	if (static_branch_unlikely(&preempt_notifier_key))
++		__fire_sched_out_preempt_notifiers(curr, next);
++}
++
++#else /* !CONFIG_PREEMPT_NOTIFIERS */
++
++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
++{
++}
++
++static inline void
++fire_sched_out_preempt_notifiers(struct task_struct *curr,
++				 struct task_struct *next)
++{
++}
++
++#endif /* CONFIG_PREEMPT_NOTIFIERS */
++
++static inline void prepare_task(struct task_struct *next)
++{
++	/*
++	 * Claim the task as running, we do this before switching to it
++	 * such that any running task will have this set.
++	 */
++	next->on_cpu = 1;
++}
++
++static inline void finish_task(struct task_struct *prev)
++{
++#ifdef CONFIG_SMP
++	/*
++	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
++	 * We must ensure this doesn't happen until the switch is completely
++	 * finished.
++	 *
++	 * In particular, the load of prev->state in finish_task_switch() must
++	 * happen before this.
++	 *
++	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, 1, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. 
The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). 
++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. 
++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ p->time_slice -= ns; ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ update_curr(rq, p); ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks have less than RESCHED_NS of time slice left they will be ++ * rescheduled. 
++ */ ++ if (p->time_slice >= RESCHED_NS) ++ return; ++ __set_tsk_resched(p); ++} ++ ++#ifdef CONFIG_SCHED_SMT ++static inline int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ cpumask_t tmp; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* _something_ may have changed the task, double check again */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) ++ rq = move_queued_task(rq, p, __best_mask_cpu(cpu_of(rq), &tmp)); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* sg_balance_trigger - trigger slibing group balance for @cpu */ ++static inline int sg_balance_trigger(const int cpu, struct rq *rq) ++{ ++ unsigned long flags; ++ struct task_struct *curr; ++ int res; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return 0; ++ curr = rq->curr; ++ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ ++ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ ++ (!rq->active_balance); ++ ++ if (res) ++ rq->active_balance = 1; ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (res) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ return res; ++} ++ ++/* ++ * sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void sg_balance_check(struct rq *rq) ++{ ++ cpumask_t chk; ++ int cpu; ++ ++ /* exit when no sg in idle */ ++ if (cpumask_empty(&sched_sg_idle_mask)) ++ return; ++ ++ cpu = cpu_of(rq); ++ /* Only cpu in slibing idle group will do the checking */ ++ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask)) { ++ /* Find potential cpus which can migrate the currently running task */ ++ if (cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && ++ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { ++ int i, tried = 0; ++ ++ for_each_cpu_wrap(i, &chk, cpu) { ++ /* skip the cpu which has idle slibing cpu */ ++ if (cpumask_intersects(cpu_smt_mask(i), ++ &sched_rq_watermark[IDLE_WM])) ++ continue; ++ if (cpumask_intersects(cpu_smt_mask(i), ++ &sched_rq_pending_mask)) ++ continue; ++ if (sg_balance_trigger(i, cpu_rq(i))) ++ return; ++ if (tried) ++ return; ++ tried++; ++ } ++ } ++ return; ++ } ++ ++ if (1 != rq->nr_running) ++ return; ++ ++ if (cpumask_andnot(&chk, cpu_smt_mask(cpu), &sched_rq_pending_mask) && ++ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM]) && ++ cpumask_equal(&chk, cpu_smt_mask(cpu))) ++ sg_balance_trigger(cpu, rq); ++} ++#endif /* CONFIG_SCHED_SMT */ ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ scheduler_task_tick(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. 
*/ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ ++ if (is_idle_task(curr) || cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ scheduler_task_tick(rq); ++ ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. 
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++/* ++ * Timeslices below RESCHED_NS are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. 
++ */ ++static inline void check_curr(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = SCHED_TIMESLICE_NS; ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { ++ if (SCHED_RR != p->policy) ++ deboost_task(p); ++ requeue_task(p, rq); ++ } ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) ++{ ++ struct task_struct *p, *skip = rq->curr; ++ int nr_migrated = 0; ++ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); ++ ++ while (skip != rq->idle && nr_tries && ++ (p = rq_next_bmq_task(skip, rq)) != rq->idle) { ++ skip = rq_next_bmq_task(p, rq); ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, dest_cpu); ++ enqueue_task(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int take_other_rq_tasks(struct rq *rq, int cpu) ++{ ++ struct cpumask *affinity_mask, *end_mask; ++ ++ if (cpumask_empty(&sched_rq_pending_mask)) ++ return 0; ++ ++ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); ++ do { ++ int i; ++ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { ++ int nr_migrated; ++ struct rq *src_rq; ++ ++ src_rq = cpu_rq(i); ++ if (!do_raw_spin_trylock(&src_rq->lock)) ++ continue; ++ spin_acquire(&src_rq->lock.dep_map, ++ SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ nr_migrated = migrate_pending_tasks(src_rq, rq, cpu); ++ ++ spin_release(&src_rq->lock.dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated) { ++ cpufreq_update_util(rq, 0); ++ return 1; ++ } ++ } ++ } while (++affinity_mask < end_mask); ++ ++ return 0; ++} ++#endif ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next; ++ ++ if (unlikely(rq->skip)) { ++ next = rq_runnable_task(rq); ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (next == rq->idle && take_other_rq_tasks(rq, cpu)) ++ next = rq_runnable_task(rq); ++#endif ++ rq->skip = NULL; ++ return next; ++ } ++ ++ next = rq_first_bmq_task(rq); ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (next == rq->idle && take_other_rq_tasks(rq, cpu)) ++ return rq_first_bmq_task(rq); ++#endif ++ return next; ++} ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ if 
(panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ p->last_ran = rq->clock_task; ++ ++ if (unlikely(SCHED_TIMESLICE_NS == p->time_slice)) ++ rq->last_ts_switch = rq->clock; ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (p != rq->idle) ++ hrtick_start(rq, p->time_slice); ++#endif ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which BMQ doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. 
++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ if (rq_switch_time(rq) < boost_threshold(prev)) ++ boost_task(prev); ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_curr(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ set_rq_task(rq, next); ++ ++ if (prev != next) { ++ if (MAX_PRIO == next->prio) ++ schedstat_inc(rq->sched_goidle); ++ ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. ++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ rq->nr_switches++; ++ rq->last_ts_switch = rq->clock; ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++#ifdef CONFIG_SCHED_SMT ++ sg_balance_check(rq); ++#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & PF_WQ_WORKER) { ++ preempt_disable(); ++ wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. 
++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. 
++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* Trigger resched if task sched_prio has been modified. 
*/ ++ if (task_on_rq_queued(p) && task_sched_prio(p) != p->bmq_idx) { ++ requeue_task(p, rq); ++ check_preempt_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. 
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ p->static_prio = NICE_TO_PRIO(nice); ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->prio = effective_prio(p); ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ if (p->prio < MAX_RT_PRIO) ++ return (p->prio - MAX_RT_PRIO); ++ return (p->prio - MAX_RT_PRIO + p->boost_prio); ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++#ifdef CONFIG_SMP ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. 
++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(p->cpus_ptr, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. 
++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). ++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * BMQ supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. 
++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. ++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. ++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. 
++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). 
++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. 
++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. 
++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (p->sched_reset_on_fork) ++ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ if (task_has_rt_policy(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ schedstat_inc(rq->yld_count); ++ ++ if (1 == sched_yield_type) { ++ if (!rt_task(current)) { ++ current->boost_prio = MAX_PRIORITY_ADJ; ++ requeue_task(current, rq); ++ } ++ } else if (2 == sched_yield_type) { ++ if (rq->nr_running > 1) ++ rq->skip = current; ++ } ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
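Illustrative only (not part of the patch): user space normally reaches the affinity syscalls handled above through the glibc wrappers and the cpu_set_t macros, for example:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;
	int cpu;

	CPU_ZERO(&set);
	if (sched_getaffinity(0, sizeof(set), &set) != 0) {   /* pid 0 == calling thread */
		perror("sched_getaffinity");
		return 1;
	}
	for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
		if (CPU_ISSET(cpu, &set))
			printf("runnable on CPU %d\n", cpu);

	/* Pin the calling thread to CPU 0 only. */
	CPU_ZERO(&set);
	CPU_SET(0, &set);
	if (sched_setaffinity(0, sizeof(set), &set) != 0) {
		perror("sched_setaffinity");
		return 1;
	}
	return 0;
}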
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In BMQ, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(SCHED_TIMESLICE_NS); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
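Illustrative only (not part of the patch): the priority-range and timeslice handlers above are reachable through the standard POSIX wrappers, which makes it easy to check what BMQ reports:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	printf("SCHED_FIFO priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));

	/* For BMQ the handler above derives this value from SCHED_TIMESLICE_NS. */
	if (sched_rr_get_interval(0, &ts) == 0)
		printf("round-robin interval: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}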
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* TODO: BMQ should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = MAX_PRIO; ++ ++ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; ++ bmq_init_idle(&rq->queue, idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. 
++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++static bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
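Illustrative only (not part of the patch): a condensed sketch of the usual in-kernel pattern for the wake_q helpers defined above; wakeups are queued while a lock is held and issued only after it is dropped. The my_waitlist/my_waiter types are hypothetical:

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

struct my_waiter {
	struct list_head	node;
	struct task_struct	*task;
};

struct my_waitlist {
	spinlock_t		lock;
	struct list_head	waiters;
};

static void my_waitlist_wake_all(struct my_waitlist *wl)
{
	DEFINE_WAKE_Q(wake_q);
	struct my_waiter *w, *tmp;

	spin_lock(&wl->lock);
	list_for_each_entry_safe(w, tmp, &wl->waiters, node) {
		list_del_init(&w->node);
		/* Only queues the task and takes a reference; no wakeup yet. */
		wake_q_add(&wake_q, w->task);
	}
	spin_unlock(&wl->lock);

	/* Issues the wakeups and drops the references taken by wake_q_add(). */
	wake_up_q(&wake_q);
}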
++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct cpumask *mask; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) ++ for_each_cpu(i, mask) ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) ++ return i; ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ p = rq_first_bmq_task(rq); ++ while (p != rq->idle) { ++ int dest_cpu; ++ ++ /* skip the running task */ ++ if (task_running(p) || 1 == p->nr_cpus_allowed) { ++ p = rq_next_bmq_task(p, rq); ++ continue; ++ } ++ ++ /* ++ * Rules for changing task_struct::cpus_allowed are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. ++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. 
++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ p = rq_next_bmq_task(p, rq); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ p = rq_first_bmq_task(rq); ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_debug_enabled; ++ ++static int __init sched_debug_setup(char *str) ++{ ++ sched_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_debug_setup); ++ ++static inline bool sched_debug(void) ++{ ++ return sched_debug_enabled; ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++static inline bool sched_debug(void) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (!idle_cpu(smp_processor_id()) || need_resched()) ++ return; ++ ++ irq_enter(); ++ irq_exit(); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Topology list, bottom-up. ++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, ++#endif ++ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = ++ default_topology; ++ ++#define for_each_sd_topology(tl) \ ++ for (tl = sched_domain_topology; tl->mask; tl++) ++ ++void set_sched_topology(struct sched_domain_topology_level *tl) ++{ ++ if (WARN_ON_ONCE(sched_smp_initialized)) ++ return; ++ ++ sched_domain_topology = tl; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++int sched_domain_level_max; ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) 
We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ /** ++ * BMQ doesn't depend on sched domains, but just keep this api ++ */ ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++/* ++ * sched_numa_find_closest() - given the NUMA topology, find the cpu ++ * closest to @cpu from @cpumask. ++ * cpumask: cpumask to find a cpu from ++ * cpu: cpu to be close to ++ * ++ * returns: cpu, or nr_cpu_ids when nothing found. ++ */ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. 
++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { ++ static_branch_dec_cpuslocked(&sched_smt_present); ++ if (!static_branch_likely(&sched_smt_present)) ++ cpumask_clear(&sched_sg_idle_mask); ++ } ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); ++ } ++} ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ ++#ifdef CONFIG_SCHED_SMT ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { ++ printk(KERN_INFO "bmq: cpu #%d affinity check mask - smt 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++ cpumask_complement(chk, topology_sibling_cpumask(cpu)); ++#else ++ cpumask_clear_cpu(cpu, chk); ++#endif ++#ifdef CONFIG_SCHED_MC ++ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) ++ printk(KERN_INFO "bmq: cpu #%d affinity check mask - coregroup 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, cpu_coregroup_mask(cpu)); ++ ++ /** ++ * Set up sd_llc_id per CPU ++ */ ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(cpu_coregroup_mask(cpu)); ++#else ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(topology_core_cpumask(cpu)); ++ ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++#endif /* NOT CONFIG_SCHED_MC */ ++ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) ++ printk(KERN_INFO "bmq: cpu #%d affinity check mask - core 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, topology_core_cpumask(cpu)); ++ ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "bmq: cpu #%d affinity check mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; ++ 
} ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ print_scheduler_version(); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < bmq_BITS; i++) ++ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ ++ bmq_init(&rq->queue); ++ rq->watermark = IDLE_WM; ++ rq->skip = NULL; ++ ++ raw_spin_lock_init(&rq->lock); ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++#ifdef CONFIG_SCHED_SMT ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. 
++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. 
Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/bmq_sched.h b/kernel/sched/bmq_sched.h +new file mode 100644 +index 000000000000..ed08dd0b8227 +--- /dev/null ++++ b/kernel/sched/bmq_sched.h +@@ -0,0 +1,472 @@ ++#ifndef BMQ_SCHED_H ++#define BMQ_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* bits: ++ * RT, Low prio adj range, nice width, high prio adj range, cpu idle task */ ++#define bmq_BITS (NICE_WIDTH + 2 * MAX_PRIORITY_ADJ + 2) ++#define IDLE_TASK_SCHED_PRIO (bmq_BITS - 1) ++ ++struct bmq { ++ DECLARE_BITMAP(bitmap, bmq_BITS); ++ struct list_head heads[bmq_BITS]; ++}; ++ ++/* ++ * 
This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct *curr, *idle, *stop, *skip; ++ struct mm_struct *prev_mm; ++ ++ struct bmq queue; ++ unsigned long watermark; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 last_ts_switch; ++ u64 clock_task; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ int hrtick_csd_pending; ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++ ++#ifdef CONFIG_SMP ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held 
rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : BMQ need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. 
++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); ++} ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++#endif /* BMQ_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 86800b4d5453..a816aafa6ba3 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -185,6 +185,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -302,6 +303,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_BMQ */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. +@@ -445,7 +453,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_BMQ + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -688,6 +698,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -918,6 +929,7 @@ static int __init sugov_register(void) + fs_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_BMQ + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -948,4 +960,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_BMQ */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 46ed4e1383e2..51460a446da0 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + +- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. 
*/ + task_group_account_field(p, index, cputime); +@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ +- if (task_nice(p) > 0) { ++ if (task_running_nice(p)) { + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -663,7 +663,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f65ef1e2f204..77bf219444fa 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -355,6 +355,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * idle-task scheduling class. + */ +@@ -479,3 +480,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index a96db50d40e0..22c20e28b613 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -236,6 +236,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_BMQ + /* + * sched_entity: + * +@@ -352,6 +353,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index afff644da065..4da52afaeff8 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_BMQ + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_HAVE_SCHED_AVG_IRQ + int update_irq_load_avg(struct rq *rq, u64 running); +@@ -17,6 +19,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_BMQ + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
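Editor's note: the cpufreq_schedutil.c hunk earlier in this patch replaces sugov_get_util() with a stub that simply returns the CPU's full capacity under CONFIG_SCHED_BMQ, since the PELT utilization signal is compiled out (see the pelt.c/pelt.h hunks). A consequence worth spelling out: schedutil maps utilization to frequency with a 1.25x headroom factor, so a utilization pinned at capacity always resolves to (a clamped) maximum frequency. The sketch below is only a userspace illustration of that arithmetic; map_util_freq() mirrors the mainline helper of the same name, and the policy numbers are invented for the example.

/* Illustration only: how schedutil's frequency mapping behaves when
 * utilization is pinned to full capacity, as the BMQ stub above does.
 * map_util_freq() mirrors the mainline helper:
 *   freq' = (freq + freq/4) * util / cap
 * The max_freq value is a made-up example, not taken from the patch. */
#include <stdio.h>

static unsigned long map_util_freq(unsigned long util,
                                   unsigned long freq,
                                   unsigned long cap)
{
        return (freq + (freq >> 2)) * util / cap;
}

int main(void)
{
        unsigned long max_freq = 3600000;   /* kHz, hypothetical policy max */
        unsigned long cap = 1024;           /* SCHED_CAPACITY_SCALE */

        /* CFS-style input: a half-loaded CPU asks for ~62% of max. */
        printf("util=512  -> %lu kHz\n", map_util_freq(512, max_freq, cap));

        /* BMQ stub: util == cap, so the request is 1.25 * max_freq and is
         * clamped to the policy maximum, i.e. schedutil effectively acts
         * like the performance governor on a BMQ kernel. */
        printf("util=1024 -> %lu kHz\n", map_util_freq(1024, max_freq, cap));
        return 0;
}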
+@@ -137,9 +140,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_BMQ */ + + #else + ++#ifndef CONFIG_SCHED_BMQ + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -157,6 +162,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_irq_load_avg(struct rq *rq, u64 running) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c8870c5bd7df..4bca9838b6f0 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_BMQ ++#include "bmq_sched.h" ++#else ++ + #include + + #include +@@ -2496,3 +2500,9 @@ static inline void membarrier_switch_mm(struct rq *rq, + { + } + #endif ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (task_nice(p) > 0); ++} ++#endif /* !CONFIG_SCHED_BMQ */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..0cc040a28d3f 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_BMQ + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_BMQ + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b6f2f35d0bcf..435440943455 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -132,6 +132,10 @@ static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; + static int one_hundred = 100; + static int one_thousand = 1000; ++#ifdef CONFIG_SCHED_BMQ ++static int __maybe_unused zero = 0; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -300,7 +304,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_BMQ) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -317,6 +321,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_BMQ + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -498,6 +503,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_BMQ */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1070,6 +1076,17 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_BMQ ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/posix-cpu-timers.c 
b/kernel/time/posix-cpu-timers.c +index 42d512fcfda2..70b97fe0ff44 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -226,7 +226,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -796,6 +796,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_BMQ + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -803,6 +804,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -830,8 +832,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_BMQ + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -845,7 +849,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +@@ -1099,8 +1103,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_BMQ + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index 69ee8ef12cee..208788fcbb0e 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_BMQ ++ /* No deadline on BMQ, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux-tkg/linux-tkg-patches/5.4/0009-glitched-bmq.patch b/linux-tkg/linux-tkg-patches/5.4/0009-glitched-bmq.patch new file mode 100644 index 0000000..5e78811 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0009-glitched-bmq.patch @@ -0,0 +1,108 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - BMQ + +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (63) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define 
MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. 
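Editor's note: the glitched-BMQ tweaks above (ondemand up_threshold 63 to 55, sampling_down_factor 1 to 5, the HZ_500/HZ_750 choices, vm_swappiness 60 to 20) all surface as ordinary runtime tunables, which is the easiest way to confirm a built kernel actually carries them. CONFIG_HZ itself is not exported this way, so checking the kernel config remains the route for that. A minimal sketch, assuming the ondemand governor is active on at least one policy (its sysfs directory only exists then); the paths are standard cpufreq/procfs locations, not anything added by this patch.

/* Print the live values of the tunables this patch retunes.
 * Assumes the ondemand governor is in use; otherwise its sysfs
 * directory is absent and those reads are reported as unavailable. */
#include <stdio.h>

static void show(const char *path)
{
        char buf[64];
        FILE *f = fopen(path, "r");

        if (!f) {
                printf("%-58s <not available>\n", path);
                return;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("%-58s %s", path, buf);
        fclose(f);
}

int main(void)
{
        show("/sys/devices/system/cpu/cpufreq/ondemand/up_threshold");
        show("/sys/devices/system/cpu/cpufreq/ondemand/sampling_down_factor");
        show("/proc/sys/vm/swappiness");
        return 0;
}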
diff --git a/linux-tkg/linux-tkg-patches/5.4/0011-ZFS-fix.patch b/linux-tkg/linux-tkg-patches/5.4/0011-ZFS-fix.patch new file mode 100644 index 0000000..af71d04 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0011-ZFS-fix.patch @@ -0,0 +1,43 @@ +From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= +Date: Thu, 2 May 2019 05:28:08 +0100 +Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with + EXPORT_SYMBOL_GPL +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +We need these symbols in zfs as the fpu implementation breaks userspace: + +https://github.com/zfsonlinux/zfs/issues/9346 +Signed-off-by: Jörg Thalheim +--- + arch/x86/kernel/fpu/core.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c +index 12c70840980e..352538b3bb5d 100644 +--- a/arch/x86/kernel/fpu/core.c ++++ b/arch/x86/kernel/fpu/core.c +@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) + } + __cpu_invalidate_fpregs_state(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_begin); ++EXPORT_SYMBOL(kernel_fpu_begin); + + void kernel_fpu_end(void) + { +@@ -111,7 +111,7 @@ void kernel_fpu_end(void) + this_cpu_write(in_kernel_fpu, false); + preempt_enable(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_end); ++EXPORT_SYMBOL(kernel_fpu_end); + + /* + * Save the FPU state (mark it for reload if necessary): +-- +2.23.0 + + diff --git a/linux-tkg/linux-tkg-patches/5.4/0012-linux-hardened.patch b/linux-tkg/linux-tkg-patches/5.4/0012-linux-hardened.patch new file mode 100644 index 0000000..b50ec74 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.4/0012-linux-hardened.patch @@ -0,0 +1,2806 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 5594c8bf1dcd..ac80978f4629 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -505,16 +505,6 @@ + nosocket -- Disable socket memory accounting. + nokmem -- Disable kernel memory accounting. + +- checkreqprot [SELINUX] Set initial checkreqprot flag value. +- Format: { "0" | "1" } +- See security/selinux/Kconfig help text. +- 0 -- check protection applied by kernel (includes +- any implied execute protection). +- 1 -- check protection requested by application. +- Default value is set via a kernel config option. +- Value can be changed at runtime via +- /selinux/checkreqprot. +- + cio_ignore= [S390] + See Documentation/s390/common_io.rst for details. + clk_ignore_unused +@@ -3345,6 +3335,11 @@ + the specified number of seconds. This is to be used if + your oopses keep scrolling off the screen. + ++ extra_latent_entropy ++ Enable a very simple form of latent entropy extraction ++ from the first 4GB of memory as the bootmem allocator ++ passes the memory pages to the buddy allocator. ++ + pcbit= [HW,ISDN] + + pcd. 
[PARIDE] +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 032c7cd3cede..cc3491b05976 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -102,6 +102,7 @@ show up in /proc/sys/kernel: + - sysctl_writes_strict + - tainted ==> Documentation/admin-guide/tainted-kernels.rst + - threads-max ++- tiocsti_restrict + - unknown_nmi_panic + - watchdog + - watchdog_thresh +@@ -1114,6 +1115,25 @@ thread structures would occupy too much (more than 1/8th) of the + available RAM pages threads-max is reduced accordingly. + + ++tiocsti_restrict: ++================= ++ ++This toggle indicates whether unprivileged users are prevented from using the ++TIOCSTI ioctl to inject commands into other processes which share a tty ++session. ++ ++When tiocsti_restrict is set to (0) there are no restrictions(accept the ++default restriction of only being able to injection commands into one's own ++tty). When tiocsti_restrict is set to (1), users must have CAP_SYS_ADMIN to ++use the TIOCSTI ioctl. ++ ++When user namespaces are in use, the check for the capability CAP_SYS_ADMIN is ++done against the user namespace that originally opened the tty. ++ ++The kernel config option CONFIG_SECURITY_TIOCSTI_RESTRICT sets the default ++value of tiocsti_restrict. ++ ++ + unknown_nmi_panic: + ================== + +diff --git a/arch/Kconfig b/arch/Kconfig +index 5f8a5d84dbbe..60103a76d33e 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -653,7 +653,7 @@ config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT +- default ARCH_MMAP_RND_BITS_MIN ++ default ARCH_MMAP_RND_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to +@@ -687,7 +687,7 @@ config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT +- default ARCH_MMAP_RND_COMPAT_BITS_MIN ++ default ARCH_MMAP_RND_COMPAT_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to +@@ -906,6 +906,7 @@ config ARCH_HAS_REFCOUNT + + config REFCOUNT_FULL + bool "Perform full reference count validation at the expense of speed" ++ default y + help + Enabling this switches the refcounting infrastructure from a fast + unchecked atomic_t implementation to a fully state checked +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 6ccd2ed30963..56d39ec3c2c3 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -1139,6 +1139,7 @@ config RODATA_FULL_DEFAULT_ENABLED + + config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" ++ default y + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved +@@ -1538,6 +1539,7 @@ config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select ARM64_MODULE_PLTS if MODULES + select RELOCATABLE ++ default y + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts +diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug 
+index cf09010d825f..dc4083ceff57 100644 +--- a/arch/arm64/Kconfig.debug ++++ b/arch/arm64/Kconfig.debug +@@ -43,6 +43,7 @@ config ARM64_RANDOMIZE_TEXT_OFFSET + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select ARM64_PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. + +diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig +index c9a867ac32d4..5c4d264f6a6e 100644 +--- a/arch/arm64/configs/defconfig ++++ b/arch/arm64/configs/defconfig +@@ -1,4 +1,3 @@ +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ_IDLE=y +diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h +index b618017205a3..0a228dbcad65 100644 +--- a/arch/arm64/include/asm/elf.h ++++ b/arch/arm64/include/asm/elf.h +@@ -103,14 +103,10 @@ + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +-#ifdef CONFIG_ARM64_FORCE_52BIT +-#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) +-#else +-#define ELF_ET_DYN_BASE (2 * DEFAULT_MAP_WINDOW_64 / 3) +-#endif /* CONFIG_ARM64_FORCE_52BIT */ ++#define ELF_ET_DYN_BASE 0x100000000UL + + #ifndef __ASSEMBLY__ + +@@ -164,10 +160,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, + /* 1GB of VA */ + #ifdef CONFIG_COMPAT + #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ +- 0x7ff >> (PAGE_SHIFT - 12) : \ +- 0x3ffff >> (PAGE_SHIFT - 12)) ++ ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ ++ ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #else +-#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) ++#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #endif + + #ifdef __AARCH64EB__ +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 8ef85139553f..e16076b30625 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1219,8 +1219,7 @@ config VM86 + default X86_LEGACY_VM86 + + config X86_16BIT +- bool "Enable support for 16-bit segments" if EXPERT +- default y ++ bool "Enable support for 16-bit segments" + depends on MODIFY_LDT_SYSCALL + ---help--- + This option is required by programs like Wine to run 16-bit +@@ -2365,7 +2364,7 @@ config COMPAT_VDSO + choice + prompt "vsyscall table for legacy applications" + depends on X86_64 +- default LEGACY_VSYSCALL_XONLY ++ default LEGACY_VSYSCALL_NONE + help + Legacy user code that does not know how to find the vDSO expects + to be able to issue three syscalls by calling fixed addresses in +@@ -2461,8 +2460,7 @@ config CMDLINE_OVERRIDE + be set to 'N' under normal conditions. + + config MODIFY_LDT_SYSCALL +- bool "Enable the LDT (local descriptor table)" if EXPERT +- default y ++ bool "Enable the LDT (local descriptor table)" + ---help--- + Linux can allow user programs to install a per-process x86 + Local Descriptor Table (LDT) using the modify_ldt(2) system +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index bf9cd83de777..13ef90f3de52 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -91,6 +91,7 @@ config EFI_PGT_DUMP + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select X86_PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. 
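Editor's note: the arch/Kconfig and arm64 elf.h hunks above stop hard-coding the stack randomization mask and derive it from mmap_rnd_bits (defaulted to the architecture maximum). The effect is easiest to see as arithmetic: the stack top is randomized over roughly (STACK_RND_MASK + 1) pages. A hedged sketch of that calculation follows; the sample bit counts (18 for the stock 0x3ffff mask, 28 and 32 as the usual x86_64 default and maximum) are assumptions about typical configurations, not values defined by this diff.

/* Back-of-the-envelope: how many bytes of stack-top randomization a
 * given mmap_rnd_bits value buys, i.e. (STACK_RND_MASK + 1) pages.
 * The sample bit counts are assumptions about typical configs, not
 * values taken from this patch. */
#include <stdio.h>

#define PAGE_SHIFT 12

static unsigned long long stack_rnd_span(unsigned int bits)
{
        unsigned long long mask = (1ULL << bits) - 1;   /* STACK_RND_MASK */

        return (mask + 1) << PAGE_SHIFT;                /* bytes of span */
}

int main(void)
{
        unsigned int bits[] = { 18, 28, 32 };
        unsigned int i;

        for (i = 0; i < sizeof(bits) / sizeof(bits[0]); i++)
                printf("%2u bits -> %llu GiB of stack ASLR range\n",
                       bits[i], stack_rnd_span(bits[i]) >> 30);
        return 0;
}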
+ +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index d0a5ffeae8df..2a91d4a9f640 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -1,5 +1,4 @@ + # CONFIG_LOCALVERSION_AUTO is not set +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_TASKSTATS=y +diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c +index f5937742b290..6655ce228e25 100644 +--- a/arch/x86/entry/vdso/vma.c ++++ b/arch/x86/entry/vdso/vma.c +@@ -198,55 +198,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) + } + + #ifdef CONFIG_X86_64 +-/* +- * Put the vdso above the (randomized) stack with another randomized +- * offset. This way there is no hole in the middle of address space. +- * To save memory make sure it is still in the same PTE as the stack +- * top. This doesn't give that many random bits. +- * +- * Note that this algorithm is imperfect: the distribution of the vdso +- * start address within a PMD is biased toward the end. +- * +- * Only used for the 64-bit and x32 vdsos. +- */ +-static unsigned long vdso_addr(unsigned long start, unsigned len) +-{ +- unsigned long addr, end; +- unsigned offset; +- +- /* +- * Round up the start address. It can start out unaligned as a result +- * of stack start randomization. +- */ +- start = PAGE_ALIGN(start); +- +- /* Round the lowest possible end address up to a PMD boundary. */ +- end = (start + len + PMD_SIZE - 1) & PMD_MASK; +- if (end >= TASK_SIZE_MAX) +- end = TASK_SIZE_MAX; +- end -= len; +- +- if (end > start) { +- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); +- addr = start + (offset << PAGE_SHIFT); +- } else { +- addr = start; +- } +- +- /* +- * Forcibly align the final address in case we have a hardware +- * issue that requires alignment for performance reasons. +- */ +- addr = align_vdso_addr(addr); +- +- return addr; +-} +- + static int map_vdso_randomized(const struct vdso_image *image) + { +- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); +- +- return map_vdso(image, addr); ++ return map_vdso(image, 0); + } + #endif + +diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h +index 69c0f892e310..f9f7a85bb71e 100644 +--- a/arch/x86/include/asm/elf.h ++++ b/arch/x86/include/asm/elf.h +@@ -248,11 +248,11 @@ extern int force_personality32; + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ + #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ +- (DEFAULT_MAP_WINDOW / 3 * 2)) ++ 0x100000000UL) + + /* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. This could be done in user space, +@@ -312,8 +312,8 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); + + #ifdef CONFIG_X86_32 + +-#define __STACK_RND_MASK(is32bit) (0x7ff) +-#define STACK_RND_MASK (0x7ff) ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) + + #define ARCH_DLINFO ARCH_DLINFO_IA32 + +@@ -322,7 +322,11 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); + #else /* CONFIG_X86_32 */ + + /* 1GB for 64bit, 8MB for 32bit */ +-#define __STACK_RND_MASK(is32bit) ((is32bit) ? 
0x7ff : 0x3fffff) ++#ifdef CONFIG_COMPAT ++#define __STACK_RND_MASK(is32bit) ((is32bit) ? (1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) ++#else ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#endif + #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) + + #define ARCH_DLINFO \ +@@ -380,5 +384,4 @@ struct va_alignment { + } ____cacheline_aligned; + + extern struct va_alignment va_align; +-extern unsigned long align_vdso_addr(unsigned long); + #endif /* _ASM_X86_ELF_H */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 6f66d841262d..b786e7cb395d 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -295,6 +295,7 @@ static inline void cr4_set_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 | mask) != cr4) + __cr4_set(cr4 | mask); + } +@@ -305,6 +306,7 @@ static inline void cr4_clear_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 & ~mask) != cr4) + __cr4_set(cr4 & ~mask); + } +@@ -334,6 +336,7 @@ static inline void cr4_toggle_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + __cr4_set(cr4 ^ mask); + } + +@@ -440,6 +443,7 @@ static inline void __native_flush_tlb_global(void) + raw_local_irq_save(flags); + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index fffe21945374..e9e124eb6ccb 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1854,7 +1854,6 @@ void cpu_init(void) + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + +- x86_configure_nx(); + x2apic_setup(); + + /* +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 5e94c4354d4e..093bd8ad1130 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -42,6 +42,8 @@ + #include + #include + #include ++#include ++#include + + #include "process.h" + +@@ -798,7 +800,10 @@ unsigned long arch_align_stack(unsigned long sp) + + unsigned long arch_randomize_brk(struct mm_struct *mm) + { +- return randomize_page(mm->brk, 0x02000000); ++ if (mmap_is_ia32()) ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; ++ else ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + /* +diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c +index f7476ce23b6e..652169a2b23a 100644 +--- a/arch/x86/kernel/sys_x86_64.c ++++ b/arch/x86/kernel/sys_x86_64.c +@@ -54,13 +54,6 @@ static unsigned long get_align_bits(void) + return va_align.bits & get_align_mask(); + } + +-unsigned long align_vdso_addr(unsigned long addr) +-{ +- unsigned long align_mask = get_align_mask(); +- addr = (addr + align_mask) & ~align_mask; +- return addr | get_align_bits(); +-} +- + static int __init control_va_addr_alignment(char *str) + { + /* guard against enabling this on other CPU families */ +@@ -122,10 +115,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, + } + + *begin = get_mmap_base(1); +- if (in_32bit_syscall()) +- *end = task_size_32bit(); +- else +- *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); ++ *end = get_mmap_base(0); + } + + unsigned long +@@ -210,7 +200,7 @@ 
arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; +- info.low_limit = PAGE_SIZE; ++ info.low_limit = get_mmap_base(1); + info.high_limit = get_mmap_base(0); + + /* +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 0a74407ef92e..5ceff405c81c 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -560,9 +560,9 @@ static void __init pagetable_init(void) + + #define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) + /* Bits supported by the hardware: */ +-pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; ++pteval_t __supported_pte_mask __ro_after_init = DEFAULT_PTE_MASK; + /* Bits allowed in normal kernel mappings: */ +-pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; ++pteval_t __default_kernel_pte_mask __ro_after_init = DEFAULT_PTE_MASK; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ + EXPORT_SYMBOL(__default_kernel_pte_mask); +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c +index b8541d77452c..a231504e0348 100644 +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -97,9 +97,9 @@ DEFINE_ENTRY(pte, pte, init) + */ + + /* Bits supported by the hardware: */ +-pteval_t __supported_pte_mask __read_mostly = ~0; ++pteval_t __supported_pte_mask __ro_after_init = ~0; + /* Bits allowed in normal kernel mappings: */ +-pteval_t __default_kernel_pte_mask __read_mostly = ~0; ++pteval_t __default_kernel_pte_mask __ro_after_init = ~0; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ + EXPORT_SYMBOL(__default_kernel_pte_mask); +diff --git a/block/blk-softirq.c b/block/blk-softirq.c +index 457d9ba3eb20..5f987fc1c0a0 100644 +--- a/block/blk-softirq.c ++++ b/block/blk-softirq.c +@@ -20,7 +20,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. + */ +-static __latent_entropy void blk_done_softirq(struct softirq_action *h) ++static __latent_entropy void blk_done_softirq(void) + { + struct list_head *cpu_list, local_list; + +diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c +index 84b183a6424e..b83bff5e9ab5 100644 +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -5143,7 +5143,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) + struct ata_port *ap; + unsigned int tag; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + ap = qc->ap; + + qc->flags = 0; +@@ -5160,7 +5160,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) + struct ata_port *ap; + struct ata_link *link; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); + ap = qc->ap; + link = qc->dev->link; +diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig +index df0fc997dc3e..bd8eed8de6c1 100644 +--- a/drivers/char/Kconfig ++++ b/drivers/char/Kconfig +@@ -9,7 +9,6 @@ source "drivers/tty/Kconfig" + + config DEVMEM + bool "/dev/mem virtual device support" +- default y + help + Say Y here if you want to support the /dev/mem device. 
+ The /dev/mem device is used to access areas of physical +@@ -514,7 +513,6 @@ config TELCLOCK + config DEVPORT + bool "/dev/port character device" + depends on ISA || PCI +- default y + help + Say Y here if you want to support the /dev/port device. The /dev/port + device is similar to /dev/mem, but for I/O ports. +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index c7623f99ac0f..859c2782c8e2 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -122,7 +122,6 @@ config UNIX98_PTYS + + config LEGACY_PTYS + bool "Legacy (BSD) PTY support" +- default y + ---help--- + A pseudo terminal (PTY) is a software device consisting of two + halves: a master and a slave. The slave device behaves identical to +diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c +index 802c1210558f..0cc320f33cdc 100644 +--- a/drivers/tty/tty_io.c ++++ b/drivers/tty/tty_io.c +@@ -173,6 +173,7 @@ static void free_tty_struct(struct tty_struct *tty) + put_device(tty->dev); + kfree(tty->write_buf); + tty->magic = 0xDEADDEAD; ++ put_user_ns(tty->owner_user_ns); + kfree(tty); + } + +@@ -2180,11 +2181,19 @@ static int tty_fasync(int fd, struct file *filp, int on) + * FIXME: may race normal receive processing + */ + ++int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); ++ + static int tiocsti(struct tty_struct *tty, char __user *p) + { + char ch, mbz = 0; + struct tty_ldisc *ld; + ++ if (tiocsti_restrict && ++ !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { ++ dev_warn_ratelimited(tty->dev, ++ "Denied TIOCSTI ioctl for non-privileged process\n"); ++ return -EPERM; ++ } + if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ch, p)) +@@ -3004,6 +3013,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) + tty->index = idx; + tty_line_name(driver, idx, tty->name); + tty->dev = tty_get_device(tty); ++ tty->owner_user_ns = get_user_ns(current_user_ns()); + + return tty; + } +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c +index 4ac74b354801..7c2cb5b3a449 100644 +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -42,6 +42,8 @@ + #define USB_TP_TRANSMISSION_DELAY 40 /* ns */ + #define USB_TP_TRANSMISSION_DELAY_MAX 65535 /* ns */ + ++extern int deny_new_usb; ++ + /* Protect struct usb_device->state and ->children members + * Note: Both are also protected by ->dev.sem, except that ->state can + * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. 
*/ +@@ -4991,6 +4993,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, + goto done; + return; + } ++ ++ if (deny_new_usb) { ++ dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); ++ goto done; ++ } ++ + if (hub_is_superspeed(hub->hdev)) + unit_load = 150; + else +diff --git a/fs/exec.c b/fs/exec.c +index c27231234764..4038334db213 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -63,6 +63,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -276,6 +277,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) + arch_bprm_mm_init(mm, vma); + up_write(&mm->mmap_sem); + bprm->p = vma->vm_end - sizeof(void *); ++ if (randomize_va_space) ++ bprm->p ^= get_random_int() & ~PAGE_MASK; + return 0; + err: + up_write(&mm->mmap_sem); +diff --git a/fs/namei.c b/fs/namei.c +index 671c3c1a3425..618ef0b5d000 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -877,10 +877,10 @@ static inline void put_link(struct nameidata *nd) + path_put(&last->link); + } + +-int sysctl_protected_symlinks __read_mostly = 0; +-int sysctl_protected_hardlinks __read_mostly = 0; +-int sysctl_protected_fifos __read_mostly; +-int sysctl_protected_regular __read_mostly; ++int sysctl_protected_symlinks __read_mostly = 1; ++int sysctl_protected_hardlinks __read_mostly = 1; ++int sysctl_protected_fifos __read_mostly = 2; ++int sysctl_protected_regular __read_mostly = 2; + + /** + * may_follow_link - Check symlink following for unsafe situations +diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig +index 295a7a21b774..3aed361bc0f9 100644 +--- a/fs/nfs/Kconfig ++++ b/fs/nfs/Kconfig +@@ -195,4 +195,3 @@ config NFS_DEBUG + bool + depends on NFS_FS && SUNRPC_DEBUG + select CRC32 +- default y +diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig +index cb5629bd5fff..bc44606fcc48 100644 +--- a/fs/proc/Kconfig ++++ b/fs/proc/Kconfig +@@ -41,7 +41,6 @@ config PROC_KCORE + config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP +- default y + help + Exports the dump image of crashed kernel in ELF format. 
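Editor's note: the tty_io.c hunk above (together with the sysctl documentation earlier in this patch) gates TIOCSTI behind CAP_SYS_ADMIN in the tty's owning user namespace whenever tiocsti_restrict is set. TIOCSTI is the classic "push characters into a terminal's input queue" primitive, so the check is easiest to understand from the caller's side. A minimal userspace sketch using only the standard ioctl: on a stock kernel this succeeds against one's own tty, while with the restriction enabled an unprivileged caller should see EPERM.

/* Tiny demonstration of the ioctl being restricted above: inject one
 * character into the calling terminal's input queue. With
 * tiocsti_restrict=1 an unprivileged caller should get EPERM. */
#include <stdio.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>

int main(void)
{
        char ch = '#';
        int fd = open("/dev/tty", O_RDWR);

        if (fd < 0) {
                perror("open /dev/tty");
                return 1;
        }
        if (ioctl(fd, TIOCSTI, &ch) < 0)
                fprintf(stderr, "TIOCSTI failed: %s\n", strerror(errno));
        else
                printf("injected '%c' into own tty input queue\n", ch);
        close(fd);
        return 0;
}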
+ +diff --git a/fs/stat.c b/fs/stat.c +index c38e4c2e1221..6135fbaf7298 100644 +--- a/fs/stat.c ++++ b/fs/stat.c +@@ -40,8 +40,13 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); +- stat->atime = inode->i_atime; +- stat->mtime = inode->i_mtime; ++ if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = inode->i_ctime; ++ stat->mtime = inode->i_ctime; ++ } else { ++ stat->atime = inode->i_atime; ++ stat->mtime = inode->i_mtime; ++ } + stat->ctime = inode->i_ctime; + stat->blksize = i_blocksize(inode); + stat->blocks = inode->i_blocks; +@@ -77,9 +82,14 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, + if (IS_AUTOMOUNT(inode)) + stat->attributes |= STATX_ATTR_AUTOMOUNT; + +- if (inode->i_op->getattr) +- return inode->i_op->getattr(path, stat, request_mask, +- query_flags); ++ if (inode->i_op->getattr) { ++ int retval = inode->i_op->getattr(path, stat, request_mask, query_flags); ++ if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = stat->ctime; ++ stat->mtime = stat->ctime; ++ } ++ return retval; ++ } + + generic_fillattr(inode, stat); + return 0; +diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c +index d99d166fd892..7a4f2854feb8 100644 +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -28,7 +28,11 @@ + #include + #include + ++#ifdef CONFIG_USERFAULTFD_UNPRIVILEGED + int sysctl_unprivileged_userfaultfd __read_mostly = 1; ++#else ++int sysctl_unprivileged_userfaultfd __read_mostly; ++#endif + + static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; + +diff --git a/include/linux/cache.h b/include/linux/cache.h +index 750621e41d1c..e7157c18c62c 100644 +--- a/include/linux/cache.h ++++ b/include/linux/cache.h +@@ -31,6 +31,8 @@ + #define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) + #endif + ++#define __read_only __ro_after_init ++ + #ifndef ____cacheline_aligned + #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) + #endif +diff --git a/include/linux/capability.h b/include/linux/capability.h +index ecce0f43c73a..e46306dd4401 100644 +--- a/include/linux/capability.h ++++ b/include/linux/capability.h +@@ -208,6 +208,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); + extern bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap); + extern bool capable(int cap); ++extern bool capable_noaudit(int cap); + extern bool ns_capable(struct user_namespace *ns, int cap); + extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); + extern bool ns_capable_setid(struct user_namespace *ns, int cap); +@@ -234,6 +235,10 @@ static inline bool capable(int cap) + { + return true; + } ++static inline bool capable_noaudit(int cap) ++{ ++ return true; ++} + static inline bool ns_capable(struct user_namespace *ns, int cap) + { + return true; +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 0b4d8fc79e0f..6f318e089249 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3627,4 +3627,15 @@ static inline int inode_drain_writes(struct inode *inode) + return filemap_write_and_wait(inode->i_mapping); + } + ++extern int device_sidechannel_restrict; ++ ++static inline bool is_sidechannel_device(const struct inode *inode) ++{ ++ umode_t mode; ++ if (!device_sidechannel_restrict) ++ return false; ++ mode = inode->i_mode; ++ return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & 
(S_IROTH | S_IWOTH))); ++} ++ + #endif /* _LINUX_FS_H */ +diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h +index a2d5d175d3c1..e91ab06119b0 100644 +--- a/include/linux/fsnotify.h ++++ b/include/linux/fsnotify.h +@@ -233,6 +233,9 @@ static inline void fsnotify_access(struct file *file) + struct inode *inode = file_inode(file); + __u32 mask = FS_ACCESS; + ++ if (is_sidechannel_device(inode)) ++ return; ++ + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + +@@ -249,6 +252,9 @@ static inline void fsnotify_modify(struct file *file) + struct inode *inode = file_inode(file); + __u32 mask = FS_MODIFY; + ++ if (is_sidechannel_device(inode)) ++ return; ++ + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + +diff --git a/include/linux/gfp.h b/include/linux/gfp.h +index 61f2f6ff9467..f9b3e3d675ae 100644 +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -553,9 +553,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); + extern unsigned long get_zeroed_page(gfp_t gfp_mask); + +-void *alloc_pages_exact(size_t size, gfp_t gfp_mask); ++void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __attribute__((alloc_size(1))); + void free_pages_exact(void *virt, size_t size); +-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); ++void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __attribute__((alloc_size(2))); + + #define __get_free_page(gfp_mask) \ + __get_free_pages((gfp_mask), 0) +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index ea5cdbd8c2c3..805b84d6bbca 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -215,6 +215,13 @@ static inline void clear_highpage(struct page *page) + kunmap_atomic(kaddr); + } + ++static inline void verify_zero_highpage(struct page *page) ++{ ++ void *kaddr = kmap_atomic(page); ++ BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); ++ kunmap_atomic(kaddr); ++} ++ + static inline void zero_user_segments(struct page *page, + unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index 89fc59dab57d..5f98e14e9470 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -540,7 +540,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; + + struct softirq_action + { +- void (*action)(struct softirq_action *); ++ void (*action)(void); + }; + + asmlinkage void do_softirq(void); +@@ -555,7 +555,7 @@ static inline void do_softirq_own_stack(void) + } + #endif + +-extern void open_softirq(int nr, void (*action)(struct softirq_action *)); ++extern void __init open_softirq(int nr, void (*action)(void)); + extern void softirq_init(void); + extern void __raise_softirq_irqoff(unsigned int nr); + +diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h +index 069aa2ebef90..cb9e3637a620 100644 +--- a/include/linux/kobject_ns.h ++++ b/include/linux/kobject_ns.h +@@ -45,7 +45,7 @@ struct kobj_ns_type_operations { + void (*drop_ns)(void *); + }; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); + int kobj_ns_type_registered(enum kobj_ns_type type); + const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); + const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); +diff --git a/include/linux/mm.h b/include/linux/mm.h +index b249d2e033aa..a4855777d1fa 
100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -664,7 +664,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) + } + #endif + +-extern void *kvmalloc_node(size_t size, gfp_t flags, int node); ++extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __attribute__((alloc_size(1))); + static inline void *kvmalloc(size_t size, gfp_t flags) + { + return kvmalloc_node(size, flags, NUMA_NO_NODE); +diff --git a/include/linux/percpu.h b/include/linux/percpu.h +index 5e76af742c80..9a6c682ec127 100644 +--- a/include/linux/percpu.h ++++ b/include/linux/percpu.h +@@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + #endif + +-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); + extern bool is_kernel_percpu_address(unsigned long addr); + +@@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); + extern void __init setup_per_cpu_areas(void); + #endif + +-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); +-extern void __percpu *__alloc_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __attribute__((alloc_size(1))); ++extern void __percpu *__alloc_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern void free_percpu(void __percpu *__pdata); + extern phys_addr_t per_cpu_ptr_to_phys(void *addr); + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 68ccc5b1913b..a7565ea44938 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1241,6 +1241,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + ++static inline bool perf_paranoid_any(void) ++{ ++ return sysctl_perf_event_paranoid > 2; ++} ++ + static inline bool perf_paranoid_tracepoint_raw(void) + { + return sysctl_perf_event_paranoid > -1; +diff --git a/include/linux/slab.h b/include/linux/slab.h +index 4d2a2fa55ed5..be3a8234edde 100644 +--- a/include/linux/slab.h ++++ b/include/linux/slab.h +@@ -184,8 +184,8 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); + /* + * Common kmalloc functions provided by all allocators + */ +-void * __must_check __krealloc(const void *, size_t, gfp_t); +-void * __must_check krealloc(const void *, size_t, gfp_t); ++void * __must_check __krealloc(const void *, size_t, gfp_t) __attribute__((alloc_size(2))); ++void * __must_check krealloc(const void *, size_t, gfp_t) __attribute((alloc_size(2))); + void kfree(const void *); + void kzfree(const void *); + size_t __ksize(const void *); +@@ -390,7 +390,7 @@ static __always_inline unsigned int kmalloc_index(size_t size) + } + #endif /* !CONFIG_SLOB */ + +-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; ++void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; + void kmem_cache_free(struct kmem_cache *, void *); + +@@ -414,7 +414,7 @@ static __always_inline void kfree_bulk(size_t size, void **p) + } + + #ifdef 
CONFIG_NUMA +-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; ++void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; + #else + static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) +@@ -539,7 +539,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) + * Try really hard to succeed the allocation but fail + * eventually. + */ +-static __always_inline void *kmalloc(size_t size, gfp_t flags) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc(size_t size, gfp_t flags) + { + if (__builtin_constant_p(size)) { + #ifndef CONFIG_SLOB +@@ -581,7 +581,7 @@ static __always_inline unsigned int kmalloc_size(unsigned int n) + return 0; + } + +-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc_node(size_t size, gfp_t flags, int node) + { + #ifndef CONFIG_SLOB + if (__builtin_constant_p(size) && +diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h +index d2153789bd9f..97da977d6060 100644 +--- a/include/linux/slub_def.h ++++ b/include/linux/slub_def.h +@@ -121,6 +121,11 @@ struct kmem_cache { + unsigned long random; + #endif + ++#ifdef CONFIG_SLAB_CANARY ++ unsigned long random_active; ++ unsigned long random_inactive; ++#endif ++ + #ifdef CONFIG_NUMA + /* + * Defragmentation by allocating from a remote node. +diff --git a/include/linux/string.h b/include/linux/string.h +index b6ccdc2c7f02..6d66b8740f90 100644 +--- a/include/linux/string.h ++++ b/include/linux/string.h +@@ -268,10 +268,16 @@ void __read_overflow2(void) __compiletime_error("detected read beyond size of ob + void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); + void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); + ++#ifdef CONFIG_FORTIFY_SOURCE_STRICT_STRING ++#define __string_size(p) __builtin_object_size(p, 1) ++#else ++#define __string_size(p) __builtin_object_size(p, 0) ++#endif ++ + #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) +@@ -281,7 +287,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + + __FORTIFY_INLINE char *strcat(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (p_size == (size_t)-1) + return __builtin_strcat(p, q); + if (strlcat(p, q, p_size) >= p_size) +@@ -292,7 +298,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) + __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + { + __kernel_size_t ret; +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || +@@ -307,7 +313,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); + __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t 
maxlen) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); +@@ -319,8 +325,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); + __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + { + size_t ret; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __real_strlcpy(p, q, size); + ret = strlen(q); +@@ -340,8 +346,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) + { + size_t p_len, copy_len; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strncat(p, q, count); + p_len = strlen(p); +@@ -454,8 +460,8 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) + /* defined after fortified strlen and memcpy to reuse them */ + __FORTIFY_INLINE char *strcpy(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strcpy(p, q); + memcpy(p, q, strlen(q) + 1); +diff --git a/include/linux/tty.h b/include/linux/tty.h +index bfa4e2ee94a9..3e18d583fc8d 100644 +--- a/include/linux/tty.h ++++ b/include/linux/tty.h +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + + /* +@@ -336,6 +337,7 @@ struct tty_struct { + /* If the tty has a pending do_SAK, queue it here - akpm */ + struct work_struct SAK_work; + struct tty_port *port; ++ struct user_namespace *owner_user_ns; + } __randomize_layout; + + /* Each of a tty's open files has private_data pointing to tty_file_private */ +@@ -345,6 +347,8 @@ struct tty_file_private { + struct list_head list; + }; + ++extern int tiocsti_restrict; ++ + /* tty magic number */ + #define TTY_MAGIC 0x5401 + +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 4e7809408073..0b58a5176a25 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -88,19 +88,19 @@ static inline void vmalloc_init(void) + static inline unsigned long vmalloc_nr_pages(void) { return 0; } + #endif + +-extern void *vmalloc(unsigned long size); +-extern void *vzalloc(unsigned long size); +-extern void *vmalloc_user(unsigned long size); +-extern void *vmalloc_node(unsigned long size, int node); +-extern void *vzalloc_node(unsigned long size, int node); +-extern void *vmalloc_exec(unsigned long size); +-extern void *vmalloc_32(unsigned long size); +-extern void *vmalloc_32_user(unsigned long size); +-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); ++extern void *vmalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vzalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); ++extern void *vzalloc_node(unsigned long size, int node) 
__attribute__((alloc_size(1))); ++extern void *vmalloc_exec(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_32(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_32_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) __attribute__((alloc_size(1))); + extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, +- const void *caller); ++ const void *caller) __attribute__((alloc_size(1))); + #ifndef CONFIG_MMU + extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); + static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..c1016fd960f0 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -381,6 +381,7 @@ config USELIB + config AUDIT + bool "Auditing support" + depends on NET ++ default y + help + Enable auditing infrastructure that can be used with another + kernel subsystem, such as SELinux (which requires this for +@@ -1118,6 +1119,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ depends on USER_NS ++ default n ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say N. ++ + config PID_NS + bool "PID Namespaces" + default y +@@ -1538,8 +1555,7 @@ config SHMEM + which may be appropriate on small systems without swap. + + config AIO +- bool "Enable AIO support" if EXPERT +- default y ++ bool "Enable AIO support" + help + This option enables POSIX asynchronous I/O which may by used + by some high performance threaded applications. Disabling +@@ -1650,6 +1666,23 @@ config USERFAULTFD + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + ++config USERFAULTFD_UNPRIVILEGED ++ bool "Allow unprivileged users to use the userfaultfd syscall" ++ depends on USERFAULTFD ++ default n ++ help ++ When disabled, unprivileged users will not be able to use the userfaultfd ++ syscall. Userfaultfd provide attackers with a way to stall a kernel ++ thread in the middle of memory accesses from userspace by initiating an ++ access on an unmapped page. To avoid various heap grooming and heap ++ spraying techniques for exploiting use-after-free flaws this should be ++ disabled by default. ++ ++ This setting can be overridden at runtime via the ++ vm.unprivileged_userfaultfd sysctl. ++ ++ If unsure, say N. ++ + config ARCH_HAS_MEMBARRIER_CALLBACKS + bool + +@@ -1762,7 +1795,7 @@ config VM_EVENT_COUNTERS + + config SLUB_DEBUG + default y +- bool "Enable SLUB debugging support" if EXPERT ++ bool "Enable SLUB debugging support" + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. 
Disabling these can +@@ -1786,7 +1819,6 @@ config SLUB_MEMCG_SYSFS_ON + + config COMPAT_BRK + bool "Disable heap randomization" +- default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). +@@ -1833,7 +1865,6 @@ endchoice + + config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" +- default y + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. +@@ -1846,9 +1877,9 @@ config SLAB_MERGE_DEFAULT + command line. + + config SLAB_FREELIST_RANDOM +- default n + depends on SLAB || SLUB + bool "SLAB freelist randomization" ++ default y + help + Randomizes the freelist order used on creating new pages. This + security feature reduces the predictability of the kernel slab +@@ -1857,12 +1888,30 @@ config SLAB_FREELIST_RANDOM + config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLUB ++ default y + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This options makes minor performance + sacrifices to harden the kernel slab allocator against common + freelist exploit methods. + ++config SLAB_CANARY ++ depends on SLUB ++ depends on !SLAB_MERGE_DEFAULT ++ bool "SLAB canaries" ++ default y ++ help ++ Place canaries at the end of kernel slab allocations, sacrificing ++ some performance and memory usage for security. ++ ++ Canaries can detect some forms of heap corruption when allocations ++ are freed and as part of the HARDENED_USERCOPY feature. It provides ++ basic use-after-free detection for HARDENED_USERCOPY. ++ ++ Canaries absorb small overflows (rendering them harmless), mitigate ++ non-NUL terminated C string overflows on 64-bit via a guaranteed zero ++ byte and provide basic double-free detection. ++ + config SHUFFLE_PAGE_ALLOCATOR + bool "Page allocator randomization" + default SLAB_FREELIST_RANDOM && ACPI_NUMA +diff --git a/kernel/audit.c b/kernel/audit.c +index da8dc0db5bd3..62dda6867dd9 100644 +--- a/kernel/audit.c ++++ b/kernel/audit.c +@@ -1628,6 +1628,9 @@ static int __init audit_enable(char *str) + + if (audit_default == AUDIT_OFF) + audit_initialized = AUDIT_DISABLED; ++ else if (!audit_ever_enabled) ++ audit_initialized = AUDIT_UNINITIALIZED; ++ + if (audit_set_enabled(audit_default)) + pr_err("audit: error setting audit state (%d)\n", + audit_default); +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index ef0e1e3e66f4..d1ddc8695ab8 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -519,7 +519,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) + #ifdef CONFIG_BPF_JIT + /* All BPF JIT sysctl knobs here. 
*/ + int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +-int bpf_jit_harden __read_mostly; ++int bpf_jit_harden __read_mostly = 2; + int bpf_jit_kallsyms __read_mostly; + long bpf_jit_limit __read_mostly; + +diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c +index ace1cfaa24b6..37e08fc44a6b 100644 +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -39,7 +39,7 @@ static DEFINE_SPINLOCK(prog_idr_lock); + static DEFINE_IDR(map_idr); + static DEFINE_SPINLOCK(map_idr_lock); + +-int sysctl_unprivileged_bpf_disabled __read_mostly; ++int sysctl_unprivileged_bpf_disabled __read_mostly = 1; + + static const struct bpf_map_ops * const bpf_map_types[] = { + #define BPF_PROG_TYPE(_id, _ops) +diff --git a/kernel/capability.c b/kernel/capability.c +index 1444f3954d75..8cc9dd7992f2 100644 +--- a/kernel/capability.c ++++ b/kernel/capability.c +@@ -449,6 +449,12 @@ bool capable(int cap) + return ns_capable(&init_user_ns, cap); + } + EXPORT_SYMBOL(capable); ++ ++bool capable_noaudit(int cap) ++{ ++ return ns_capable_noaudit(&init_user_ns, cap); ++} ++EXPORT_SYMBOL(capable_noaudit); + #endif /* CONFIG_MULTIUSER */ + + /** +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 6c829e22bad3..3063a7239a94 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -398,8 +398,13 @@ static cpumask_var_t perf_online_mask; + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv ++ * 3 - disallow all unpriv perf event use + */ ++#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT ++int sysctl_perf_event_paranoid __read_mostly = 3; ++#else + int sysctl_perf_event_paranoid __read_mostly = 2; ++#endif + + /* Minimum for 512 kiB + 1 user control page */ + int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +@@ -10895,6 +10900,9 @@ SYSCALL_DEFINE5(perf_event_open, + if (flags & ~PERF_FLAG_ALL) + return -EINVAL; + ++ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) ++ return -EACCES; ++ + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; +diff --git a/kernel/fork.c b/kernel/fork.c +index 755d8160e001..ed909f8050b2 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -106,6 +106,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1779,6 +1784,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
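
Taken together, the kernel/events/core.c hunks above raise the default of kernel.perf_event_paranoid to 3 when CONFIG_SECURITY_PERF_EVENTS_RESTRICT is set and make perf_event_open() return -EACCES to any caller without CAP_SYS_ADMIN, before the attribute is even copied from userspace. A minimal userspace sketch of the observable behavior, assuming a kernel built with this patch and the paranoid level left at 3; the program below is illustrative, not part of the diff, and the choice of a user-only software CPU-clock counter is arbitrary:

/* Attempt a self-profiling, user-space-only perf event.
 * Expected result with perf_event_paranoid=3 and no CAP_SYS_ADMIN: EACCES. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/perf_event.h>

int main(void)
{
    struct perf_event_attr attr;

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_SOFTWARE;
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_SW_CPU_CLOCK;
    attr.exclude_kernel = 1;   /* user time only, so only the new level-3 gate applies */
    attr.exclude_hv = 1;

    /* pid=0: calling process, cpu=-1: any CPU, no group, no flags. */
    long fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    if (fd < 0)
        printf("perf_event_open: %s\n", strerror(errno));
    else
        printf("perf_event_open succeeded (fd %ld)\n", fd);
    return 0;
}

With the usual upstream default of 2, this particular call (self-profiling, user space only) is still permitted, which makes the effect of the stricter default easy to observe.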
+@@ -2836,6 +2845,12 @@ int ksys_unshare(unsigned long unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c +index 477b4eb44af5..db28cc3fd301 100644 +--- a/kernel/rcu/tiny.c ++++ b/kernel/rcu/tiny.c +@@ -74,7 +74,7 @@ void rcu_sched_clock_irq(int user) + } + + /* Invoke the RCU callbacks whose grace period has elapsed. */ +-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) ++static __latent_entropy void rcu_process_callbacks(void) + { + struct rcu_head *next, *list; + unsigned long flags; +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index 81105141b6a8..38f04f653d29 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -2381,7 +2381,7 @@ static __latent_entropy void rcu_core(void) + trace_rcu_utilization(TPS("End RCU core")); + } + +-static void rcu_core_si(struct softirq_action *h) ++static void rcu_core_si(void) + { + rcu_core(); + } +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index c87a798d1456..341c384cc597 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9889,7 +9889,7 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ +-static __latent_entropy void run_rebalance_domains(struct softirq_action *h) ++static __latent_entropy void run_rebalance_domains(void) + { + struct rq *this_rq = this_rq(); + enum cpu_idle_type idle = this_rq->idle_balance ? 
+diff --git a/kernel/softirq.c b/kernel/softirq.c +index 0427a86743a4..5e6a9b4ccb41 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -52,7 +52,7 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); + EXPORT_PER_CPU_SYMBOL(irq_stat); + #endif + +-static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; ++static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); + + DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + +@@ -289,7 +289,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); +- h->action(h); ++ h->action(); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", +@@ -452,7 +452,7 @@ void __raise_softirq_irqoff(unsigned int nr) + or_softirq_pending(1UL << nr); + } + +-void open_softirq(int nr, void (*action)(struct softirq_action *)) ++void __init open_softirq(int nr, void (*action)(void)) + { + softirq_vec[nr].action = action; + } +@@ -498,8 +498,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) + } + EXPORT_SYMBOL(__tasklet_hi_schedule); + +-static void tasklet_action_common(struct softirq_action *a, +- struct tasklet_head *tl_head, ++static void tasklet_action_common(struct tasklet_head *tl_head, + unsigned int softirq_nr) + { + struct tasklet_struct *list; +@@ -536,14 +535,14 @@ static void tasklet_action_common(struct softirq_action *a, + } + } + +-static __latent_entropy void tasklet_action(struct softirq_action *a) ++static __latent_entropy void tasklet_action(void) + { +- tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); ++ tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); + } + +-static __latent_entropy void tasklet_hi_action(struct softirq_action *a) ++static __latent_entropy void tasklet_hi_action(void) + { +- tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); ++ tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); + } + + void tasklet_init(struct tasklet_struct *t, +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 70665934d53e..8ea67d08b926 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include "../lib/kstrtox.h" + +@@ -104,12 +105,19 @@ + #if defined(CONFIG_SYSCTL) + + /* External variables not in a header file. 
*/ ++#if IS_ENABLED(CONFIG_USB) ++int deny_new_usb __read_mostly = 0; ++EXPORT_SYMBOL(deny_new_usb); ++#endif + extern int suid_dumpable; + #ifdef CONFIG_COREDUMP + extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -121,32 +129,32 @@ extern int sysctl_nr_trim_pages; + + /* Constants used for minimum and maximum */ + #ifdef CONFIG_LOCKUP_DETECTOR +-static int sixty = 60; ++static int sixty __read_only = 60; + #endif + +-static int __maybe_unused neg_one = -1; +-static int __maybe_unused two = 2; +-static int __maybe_unused four = 4; +-static unsigned long zero_ul; +-static unsigned long one_ul = 1; +-static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __maybe_unused neg_one __read_only = -1; ++static int __maybe_unused two __read_only = 2; ++static int __maybe_unused four __read_only = 4; ++static unsigned long zero_ul __read_only; ++static unsigned long one_ul __read_only = 1; ++static unsigned long long_max __read_only = LONG_MAX; ++static int one_hundred __read_only = 100; ++static int one_thousand __read_only = 1000; + #ifdef CONFIG_PRINTK +-static int ten_thousand = 10000; ++static int ten_thousand __read_only = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +-static int six_hundred_forty_kb = 640 * 1024; ++static int six_hundred_forty_kb __read_only = 640 * 1024; + #endif + + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +-static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; ++static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +-static int maxolduid = 65535; +-static int minolduid; ++static int maxolduid __read_only = 65535; ++static int minolduid __read_only; + +-static int ngroups_max = NGROUPS_MAX; ++static int ngroups_max __read_only = NGROUPS_MAX; + static const int cap_last_cap = CAP_LAST_CAP; + + /* +@@ -154,9 +162,12 @@ static const int cap_last_cap = CAP_LAST_CAP; + * and hung_task_check_interval_secs + */ + #ifdef CONFIG_DETECT_HUNG_TASK +-static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); ++static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ); + #endif + ++int device_sidechannel_restrict __read_mostly = 1; ++EXPORT_SYMBOL(device_sidechannel_restrict); ++ + #ifdef CONFIG_INOTIFY_USER + #include + #endif +@@ -301,19 +312,19 @@ static struct ctl_table sysctl_base_table[] = { + }; + + #ifdef CONFIG_SCHED_DEBUG +-static int min_sched_granularity_ns = 100000; /* 100 usecs */ +-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +-static int min_wakeup_granularity_ns; /* 0 usecs */ +-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ ++static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ ++static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ ++static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ ++static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ + #ifdef CONFIG_SMP +-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; ++static int min_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_NONE; ++static int max_sched_tunable_scaling 
__read_only = SCHED_TUNABLESCALING_END-1; + #endif /* CONFIG_SMP */ + #endif /* CONFIG_SCHED_DEBUG */ + + #ifdef CONFIG_COMPACTION +-static int min_extfrag_threshold; +-static int max_extfrag_threshold = 1000; ++static int min_extfrag_threshold __read_only; ++static int max_extfrag_threshold __read_only = 1000; + #endif + + static struct ctl_table kern_table[] = { +@@ -546,6 +557,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +@@ -901,6 +921,37 @@ static struct ctl_table kern_table[] = { + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, ++#endif ++#if defined CONFIG_TTY ++ { ++ .procname = "tiocsti_restrict", ++ .data = &tiocsti_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++#endif ++ { ++ .procname = "device_sidechannel_restrict", ++ .data = &device_sidechannel_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++#if IS_ENABLED(CONFIG_USB) ++ { ++ .procname = "deny_new_usb", ++ .data = &deny_new_usb, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, + #endif + { + .procname = "ngroups_max", +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 7f31932216a1..9ede224fc81f 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1583,7 +1583,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + } + } + +-static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) ++static __latent_entropy void hrtimer_run_softirq(void) + { + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 4820823515e9..1a61e5aa87ae 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1779,7 +1779,7 @@ static inline void __run_timers(struct timer_base *base) + /* + * This function runs timers and the timer-tq in bottom half context. + */ +-static __latent_entropy void run_timer_softirq(struct softirq_action *h) ++static __latent_entropy void run_timer_softirq(void) + { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 8eadadc478f9..c36ecd19562c 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -21,6 +21,13 @@ + #include + #include + ++/* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else ++int unprivileged_userns_clone; ++#endif ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 93d97f9b0157..fb923cae2120 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -352,6 +352,9 @@ config SECTION_MISMATCH_WARN_ONLY + + If unsure, say Y. 
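
The kern_table entries above expose the new hardening knobs under the kernel.* sysctl namespace: tiocsti_restrict, device_sidechannel_restrict and deny_new_usb are 0/1 toggles handled by proc_dointvec_minmax_sysadmin, while unprivileged_userns_clone uses plain proc_dointvec. As a userspace illustration of the TIOCSTI knob: with kernel.tiocsti_restrict set to 1, queueing bytes into a tty input buffer requires CAP_SYS_ADMIN, so an unprivileged attempt is expected to fail with EPERM. A minimal sketch, assuming a kernel carrying this patch; the program is illustrative and not part of the diff:

/* Try to inject one byte into the controlling terminal's input queue.
 * Expected result with kernel.tiocsti_restrict=1 and no CAP_SYS_ADMIN: EPERM. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
    char c = '#';   /* byte we try to push as if it had been typed */

    if (ioctl(STDIN_FILENO, TIOCSTI, &c) < 0)
        printf("TIOCSTI: %s\n", strerror(errno));
    else
        printf("TIOCSTI succeeded: '%c' was queued as tty input\n", c);
    return 0;
}

With the sysctl at 0 (or on an unpatched kernel), the same ioctl succeeds for any process that can open the tty, which is exactly the command-injection surface the restriction is meant to close.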
+ ++config DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE ++ bool "Enable verbose reporting of writable function pointers" ++ + # + # Select this config option from the architecture Kconfig, if it + # is preferred to always offer frame pointers as a config +@@ -974,6 +977,7 @@ endmenu # "Debug lockups and hangs" + + config PANIC_ON_OOPS + bool "Panic on Oops" ++ default y + help + Say Y here to enable the kernel to panic when it oopses. This + has the same effect as setting oops=panic on the kernel command +@@ -983,7 +987,7 @@ config PANIC_ON_OOPS + anything erroneous after an oops which could result in data + corruption or other issues. + +- Say N if unsure. ++ Say Y if unsure. + + config PANIC_ON_OOPS_VALUE + int +@@ -1352,6 +1356,7 @@ config DEBUG_BUGVERBOSE + config DEBUG_LIST + bool "Debug linked list manipulation" + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION ++ default y + help + Enable this to turn on extended checks in the linked-list + walking routines. +@@ -2073,6 +2078,7 @@ config MEMTEST + config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST ++ default y + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked +@@ -2112,6 +2118,7 @@ config STRICT_DEVMEM + config IO_STRICT_DEVMEM + bool "Filter I/O access to /dev/mem" + depends on STRICT_DEVMEM ++ default y + ---help--- + If this option is disabled, you allow userspace (root) access to all + io-memory regardless of whether a driver is actively using that +diff --git a/lib/irq_poll.c b/lib/irq_poll.c +index 2f17b488d58e..b6e7996a0058 100644 +--- a/lib/irq_poll.c ++++ b/lib/irq_poll.c +@@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) + } + EXPORT_SYMBOL(irq_poll_complete); + +-static void __latent_entropy irq_poll_softirq(struct softirq_action *h) ++static void __latent_entropy irq_poll_softirq(void) + { + struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); + int rearm = 0, budget = irq_poll_budget; +diff --git a/lib/kobject.c b/lib/kobject.c +index 83198cb37d8d..4a053b7aef42 100644 +--- a/lib/kobject.c ++++ b/lib/kobject.c +@@ -1009,9 +1009,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); + + + static DEFINE_SPINLOCK(kobj_ns_type_lock); +-static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; ++static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) + { + enum kobj_ns_type type = ops->type; + int error; +diff --git a/lib/nlattr.c b/lib/nlattr.c +index cace9b307781..39ba1387045d 100644 +--- a/lib/nlattr.c ++++ b/lib/nlattr.c +@@ -571,6 +571,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) + { + int minlen = min_t(int, count, nla_len(src)); + ++ BUG_ON(minlen < 0); ++ + memcpy(dest, nla_data(src), minlen); + if (count > minlen) + memset(dest + minlen, 0, count - minlen); +diff --git a/lib/vsprintf.c b/lib/vsprintf.c +index e78017a3e1bd..ac5a5b5a439b 100644 +--- a/lib/vsprintf.c ++++ b/lib/vsprintf.c +@@ -771,7 +771,7 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, + return pointer_string(buf, end, (const void *)hashval, spec); + } + +-int kptr_restrict __read_mostly; ++int kptr_restrict __read_mostly = 2; + + static noinline_for_stack + char *restricted_pointer(char *buf, char *end, const void *ptr, +diff --git a/mm/Kconfig b/mm/Kconfig 
+index a5dae9a7eb51..0a3070c5a125 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -303,7 +303,8 @@ config KSM + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + depends on MMU +- default 4096 ++ default 32768 if ARM || (ARM64 && COMPAT) ++ default 65536 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages +diff --git a/mm/mmap.c b/mm/mmap.c +index 4390dbea4aa5..076fd46af68c 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -230,6 +230,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) + + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); ++ /* properly handle unaligned min_brk as an empty heap */ ++ if (min_brk & ~PAGE_MASK) { ++ if (brk == min_brk) ++ newbrk -= PAGE_SIZE; ++ if (mm->brk == min_brk) ++ oldbrk -= PAGE_SIZE; ++ } + if (oldbrk == newbrk) { + mm->brk = brk; + goto success; +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 45e39131a716..78b4865f8a1c 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -106,6 +107,15 @@ struct pcpu_drain { + DEFINE_MUTEX(pcpu_drain_mutex); + DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); + ++bool __meminitdata extra_latent_entropy; ++ ++static int __init setup_extra_latent_entropy(char *str) ++{ ++ extra_latent_entropy = true; ++ return 0; ++} ++early_param("extra_latent_entropy", setup_extra_latent_entropy); ++ + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY + volatile unsigned long latent_entropy __latent_entropy; + EXPORT_SYMBOL(latent_entropy); +@@ -1427,6 +1437,25 @@ static void __free_pages_ok(struct page *page, unsigned int order) + local_irq_restore(flags); + } + ++static void __init __gather_extra_latent_entropy(struct page *page, ++ unsigned int nr_pages) ++{ ++ if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { ++ unsigned long hash = 0; ++ size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; ++ const unsigned long *data = lowmem_page_address(page); ++ ++ for (index = 0; index < end; index++) ++ hash ^= hash + data[index]; ++#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY ++ latent_entropy ^= hash; ++ add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); ++#else ++ add_device_randomness((const void *)&hash, sizeof(hash)); ++#endif ++ } ++} ++ + void __free_pages_core(struct page *page, unsigned int order) + { + unsigned int nr_pages = 1 << order; +@@ -1441,7 +1470,6 @@ void __free_pages_core(struct page *page, unsigned int order) + } + __ClearPageReserved(p); + set_page_count(p, 0); +- + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); + set_page_refcounted(page); + __free_pages(page, order); +@@ -1492,6 +1520,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, + { + if (early_page_uninitialised(pfn)) + return; ++ __gather_extra_latent_entropy(page, 1 << order); + __free_pages_core(page, order); + } + +@@ -1582,6 +1611,7 @@ static void __init deferred_free_range(unsigned long pfn, + if (nr_pages == pageblock_nr_pages && + (pfn & (pageblock_nr_pages - 1)) == 0) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); ++ __gather_extra_latent_entropy(page, 1 << pageblock_order); + __free_pages_core(page, pageblock_order); + return; + } +@@ -1589,6 +1619,7 @@ static void __init deferred_free_range(unsigned long pfn, + for (i = 0; i < nr_pages; i++, page++, pfn++) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) + 
set_pageblock_migratetype(page, MIGRATE_MOVABLE); ++ __gather_extra_latent_entropy(page, 1); + __free_pages_core(page, 0); + } + } +@@ -2156,6 +2187,12 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags + { + post_alloc_hook(page, order, gfp_flags); + ++ if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY) && want_init_on_free()) { ++ int i; ++ for (i = 0; i < (1 << order); i++) ++ verify_zero_highpage(page + i); ++ } ++ + if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) + kernel_init_free_pages(page, 1 << order); + +diff --git a/mm/slab.h b/mm/slab.h +index b2b01694dc43..b531661095a2 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -470,9 +470,13 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) + struct page *page; + + page = virt_to_head_page(obj); ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(!PageSlab(page)); ++#else + if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + __func__)) + return NULL; ++#endif + return page->slab_cache; + } + +@@ -518,9 +522,14 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) + return s; + + cachep = virt_to_cache(x); +- WARN_ONCE(cachep && !slab_equal_or_root(cachep, s), +- "%s: Wrong slab cache. %s but object is from %s\n", +- __func__, s->name, cachep->name); ++ if (cachep && !slab_equal_or_root(cachep, s)) { ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG(); ++#else ++ WARN_ONCE(1, "%s: Wrong slab cache. %s but object is from %s\n", ++ __func__, s->name, cachep->name); ++#endif ++ } + return cachep; + } + +@@ -545,7 +554,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) + * back there or track user information then we can + * only use the space before that information. + */ +- if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) ++ if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation +@@ -674,8 +683,10 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } + static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) + { + if (static_branch_unlikely(&init_on_alloc)) { ++#ifndef CONFIG_SLUB + if (c->ctor) + return false; ++#endif + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; +@@ -685,9 +696,15 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) + + static inline bool slab_want_init_on_free(struct kmem_cache *c) + { +- if (static_branch_unlikely(&init_on_free)) +- return !(c->ctor || +- (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); ++ if (static_branch_unlikely(&init_on_free)) { ++#ifndef CONFIG_SLUB ++ if (c->ctor) ++ return false; ++#endif ++ if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ++ return false; ++ return true; ++ } + return false; + } + +diff --git a/mm/slab_common.c b/mm/slab_common.c +index ade6c257d4b4..f8f9ebd51296 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -28,10 +28,10 @@ + + #include "slab.h" + +-enum slab_state slab_state; ++enum slab_state slab_state __ro_after_init; + LIST_HEAD(slab_caches); + DEFINE_MUTEX(slab_mutex); +-struct kmem_cache *kmem_cache; ++struct kmem_cache *kmem_cache __ro_after_init; + + #ifdef CONFIG_HARDENED_USERCOPY + bool usercopy_fallback __ro_after_init = +@@ -59,7 +59,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, + /* + * Merge control. If this is set then no merging of slab caches will occur. 
+ */ +-static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); ++static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); + + static int __init setup_slab_nomerge(char *str) + { +diff --git a/mm/slub.c b/mm/slub.c +index 20d72cb20515..6690bce322a4 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -125,6 +125,12 @@ static inline int kmem_cache_debug(struct kmem_cache *s) + #endif + } + ++static inline bool has_sanitize_verify(struct kmem_cache *s) ++{ ++ return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && ++ slab_want_init_on_free(s); ++} ++ + void *fixup_red_left(struct kmem_cache *s, void *p) + { + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) +@@ -309,6 +315,35 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); + } + ++#ifdef CONFIG_SLAB_CANARY ++static inline unsigned long *get_canary(struct kmem_cache *s, void *object) ++{ ++ if (s->offset) ++ return object + s->offset + sizeof(void *); ++ return object + s->inuse; ++} ++ ++static inline unsigned long get_canary_value(const void *canary, unsigned long value) ++{ ++ return (value ^ (unsigned long)canary) & CANARY_MASK; ++} ++ ++static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ *canary = get_canary_value(canary, value); ++} ++ ++static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ BUG_ON(*canary != get_canary_value(canary, value)); ++} ++#else ++#define set_canary(s, object, value) ++#define check_canary(s, object, value) ++#endif ++ + /* Loop over all objects in a slab */ + #define for_each_object(__p, __s, __addr, __objects) \ + for (__p = fixup_red_left(__s, __addr); \ +@@ -476,13 +511,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) + * Debug settings: + */ + #if defined(CONFIG_SLUB_DEBUG_ON) +-static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; ++static slab_flags_t slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; + #else +-static slab_flags_t slub_debug; ++static slab_flags_t slub_debug __ro_after_init; + #endif + +-static char *slub_debug_slabs; +-static int disable_higher_order_debug; ++static char *slub_debug_slabs __ro_after_init; ++static int disable_higher_order_debug __ro_after_init; + + /* + * slub is about to manipulate internal object metadata. This memory lies +@@ -543,6 +578,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, + else + p = object + s->inuse; + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ p = (void *)p + sizeof(void *); ++ + return p + alloc; + } + +@@ -673,6 +711,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) + else + off = s->inuse; + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + off += 2 * sizeof(struct track); + +@@ -802,6 +843,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) + /* Freepointer is placed after the object. 
*/ + off += sizeof(void *); + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); +@@ -1441,6 +1485,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, + object = next; + next = get_freepointer(s, object); + ++ check_canary(s, object, s->random_active); ++ + if (slab_want_init_on_free(s)) { + /* + * Clear the object and the metadata, but don't touch +@@ -1451,8 +1497,12 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, + : 0; + memset((char *)object + s->inuse, 0, + s->size - s->inuse - rsize); +- ++ if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) ++ s->ctor(object); + } ++ ++ set_canary(s, object, s->random_inactive); ++ + /* If object's reuse doesn't have to be delayed */ + if (!slab_free_hook(s, object)) { + /* Move object to the new freelist */ +@@ -1460,6 +1510,17 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, + *head = object; + if (!*tail) + *tail = object; ++ } else if (slab_want_init_on_free(s) && s->ctor) { ++ /* Objects that are put into quarantine by KASAN will ++ * still undergo free_consistency_checks() and thus ++ * need to show a valid freepointer to check_object(). ++ * ++ * Note that doing this for all caches (not just ctor ++ * ones, which have s->offset != NULL)) causes a GPF, ++ * due to KASAN poisoning and the way set_freepointer() ++ * eventually dereferences the freepointer. ++ */ ++ set_freepointer(s, object, NULL); + } + } while (object != old_tail); + +@@ -1473,8 +1534,9 @@ static void *setup_object(struct kmem_cache *s, struct page *page, + void *object) + { + setup_object_debug(s, page, object); ++ set_canary(s, object, s->random_inactive); + object = kasan_init_slab_obj(s, object); +- if (unlikely(s->ctor)) { ++ if (unlikely(s->ctor) && !has_sanitize_verify(s)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); +@@ -2752,8 +2814,28 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, + + maybe_wipe_obj_freeptr(s, object); + +- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) ++ if (has_sanitize_verify(s) && object) { ++ /* KASAN hasn't unpoisoned the object yet (this is done in the ++ * post-alloc hook), so let's do it temporarily. 
++ */ ++ kasan_unpoison_object_data(s, object); ++ BUG_ON(memchr_inv(object, 0, s->object_size)); ++ if (s->ctor) ++ s->ctor(object); ++ kasan_poison_object_data(s, object); ++ } else if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) { + memset(object, 0, s->object_size); ++ if (s->ctor) { ++ kasan_unpoison_object_data(s, object); ++ s->ctor(object); ++ kasan_poison_object_data(s, object); ++ } ++ } ++ ++ if (object) { ++ check_canary(s, object, s->random_inactive); ++ set_canary(s, object, s->random_active); ++ } + + slab_post_alloc_hook(s, gfpflags, 1, &object); + +@@ -3136,7 +3218,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) + { + struct kmem_cache_cpu *c; +- int i; ++ int i, k; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); +@@ -3176,11 +3258,35 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ +- if (unlikely(slab_want_init_on_alloc(flags, s))) { ++ if (has_sanitize_verify(s)) { + int j; + +- for (j = 0; j < i; j++) ++ for (j = 0; j < i; j++) { ++ /* KASAN hasn't unpoisoned the object yet (this is done ++ * in the post-alloc hook), so let's do it temporarily. ++ */ ++ kasan_unpoison_object_data(s, p[j]); ++ BUG_ON(memchr_inv(p[j], 0, s->object_size)); ++ if (s->ctor) ++ s->ctor(p[j]); ++ kasan_poison_object_data(s, p[j]); ++ } ++ } else if (unlikely(slab_want_init_on_alloc(flags, s))) { ++ int j; ++ ++ for (j = 0; j < i; j++) { + memset(p[j], 0, s->object_size); ++ if (s->ctor) { ++ kasan_unpoison_object_data(s, p[j]); ++ s->ctor(p[j]); ++ kasan_poison_object_data(s, p[j]); ++ } ++ } ++ } ++ ++ for (k = 0; k < i; k++) { ++ check_canary(s, p[k], s->random_inactive); ++ set_canary(s, p[k], s->random_active); + } + + /* memcg and kmem_cache debug support */ +@@ -3214,9 +3320,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); + * and increases the number of allocations possible without having to + * take the list_lock. + */ +-static unsigned int slub_min_order; +-static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +-static unsigned int slub_min_objects; ++static unsigned int slub_min_order __ro_after_init; ++static unsigned int slub_max_order __ro_after_init = PAGE_ALLOC_COSTLY_ORDER; ++static unsigned int slub_min_objects __ro_after_init; + + /* + * Calculate the order of allocation given an slab object size. 
+@@ -3384,6 +3490,7 @@ static void early_kmem_cache_node_alloc(int node) + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); + #endif ++ set_canary(kmem_cache_node, n, kmem_cache_node->random_active); + n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); + page->freelist = get_freepointer(kmem_cache_node, n); +@@ -3544,6 +3651,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) + size += sizeof(void *); + } + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ size += sizeof(void *); ++ + #ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_STORE_USER) + /* +@@ -3616,6 +3726,10 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) + #ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); + #endif ++#ifdef CONFIG_SLAB_CANARY ++ s->random_active = get_random_long(); ++ s->random_inactive = get_random_long(); ++#endif + + if (!calculate_sizes(s, -1)) + goto error; +@@ -3891,6 +4005,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, + offset -= s->red_left_pad; + } + ++ check_canary(s, (void *)ptr - offset, s->random_active); ++ + /* Allow address range falling entirely within usercopy region. */ + if (offset >= s->useroffset && + offset - s->useroffset <= s->usersize && +@@ -3924,7 +4040,11 @@ size_t __ksize(const void *object) + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) { ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(!PageCompound(page)); ++#else + WARN_ON(!PageCompound(page)); ++#endif + return page_size(page); + } + +@@ -4769,7 +4889,7 @@ enum slab_stat_type { + #define SO_TOTAL (1 << SL_TOTAL) + + #ifdef CONFIG_MEMCG +-static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); ++static bool memcg_sysfs_enabled __ro_after_init = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); + + static int __init setup_slub_memcg_sysfs(char *str) + { +diff --git a/mm/swap.c b/mm/swap.c +index 38c3fa4308e2..0534c2e348c2 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -94,6 +94,13 @@ static void __put_compound_page(struct page *page) + if (!PageHuge(page)) + __page_cache_release(page); + dtor = get_compound_page_dtor(page); ++ if (!PageHuge(page)) ++ BUG_ON(dtor != free_compound_page ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ && dtor != free_transhuge_page ++#endif ++ ); ++ + (*dtor)(page); + } + +diff --git a/mm/util.c b/mm/util.c +index 3ad6db9a722e..80209685f67c 100644 +--- a/mm/util.c ++++ b/mm/util.c +@@ -325,9 +325,9 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) + { + /* Is the current task 32bit ? 
*/ + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) +- return randomize_page(mm->brk, SZ_32M); ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; + +- return randomize_page(mm->brk, SZ_1G); ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + unsigned long arch_mmap_rnd(void) +diff --git a/net/core/dev.c b/net/core/dev.c +index 3098c90d60e2..08de516adfd5 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4492,7 +4492,7 @@ int netif_rx_ni(struct sk_buff *skb) + } + EXPORT_SYMBOL(netif_rx_ni); + +-static __latent_entropy void net_tx_action(struct softirq_action *h) ++static __latent_entropy void net_tx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + +@@ -6353,7 +6353,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) + return work; + } + +-static __latent_entropy void net_rx_action(struct softirq_action *h) ++static __latent_entropy void net_rx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 03381f3e12ba..8ea409f37436 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -267,6 +267,7 @@ config IP_PIMSM_V2 + + config SYN_COOKIES + bool "IP: TCP syncookie support" ++ default y + ---help--- + Normal TCP/IP networking is open to an attack known as "SYN + flooding". This denial-of-service attack prevents legitimate remote +diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost +index 952fff485546..59ffccdb1be4 100644 +--- a/scripts/Makefile.modpost ++++ b/scripts/Makefile.modpost +@@ -54,6 +54,7 @@ MODPOST = scripts/mod/modpost \ + $(if $(KBUILD_EXTMOD),$(addprefix -e ,$(KBUILD_EXTRA_SYMBOLS))) \ + $(if $(KBUILD_EXTMOD),-o $(modulesymfile)) \ + $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ ++ $(if $(CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE),-f) \ + $(if $(KBUILD_MODPOST_WARN),-w) \ + $(if $(filter nsdeps,$(MAKECMDGOALS)),-d) + +diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig +index e3569543bdac..55cc439b3bc6 100644 +--- a/scripts/gcc-plugins/Kconfig ++++ b/scripts/gcc-plugins/Kconfig +@@ -61,6 +61,11 @@ config GCC_PLUGIN_LATENT_ENTROPY + is some slowdown of the boot process (about 0.5%) and fork and + irq processing. + ++ When extra_latent_entropy is passed on the kernel command line, ++ entropy will be extracted from up to the first 4GB of RAM while the ++ runtime memory allocator is being initialized. This costs even more ++ slowdown of the boot process. ++ + Note that entropy extracted this way is not cryptographically + secure! 
+ +diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c +index d2a30a7b3f07..ff57a5fe8029 100644 +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -36,6 +36,8 @@ static int warn_unresolved = 0; + /* How a symbol is exported */ + static int sec_mismatch_count = 0; + static int sec_mismatch_fatal = 0; ++static int writable_fptr_count = 0; ++static int writable_fptr_verbose = 0; + /* ignore missing files */ + static int ignore_missing_files; + /* write namespace dependencies */ +@@ -1019,6 +1021,7 @@ enum mismatch { + ANY_EXIT_TO_ANY_INIT, + EXPORT_TO_INIT_EXIT, + EXTABLE_TO_NON_TEXT, ++ DATA_TO_TEXT + }; + + /** +@@ -1145,6 +1148,12 @@ static const struct sectioncheck sectioncheck[] = { + .good_tosec = {ALL_TEXT_SECTIONS , NULL}, + .mismatch = EXTABLE_TO_NON_TEXT, + .handler = extable_mismatch_handler, ++}, ++/* Do not reference code from writable data */ ++{ ++ .fromsec = { DATA_SECTIONS, NULL }, ++ .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, ++ .mismatch = DATA_TO_TEXT + } + }; + +@@ -1332,10 +1341,10 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr, + continue; + if (!is_valid_name(elf, sym)) + continue; +- if (sym->st_value == addr) +- return sym; + /* Find a symbol nearby - addr are maybe negative */ + d = sym->st_value - addr; ++ if (d == 0) ++ return sym; + if (d < 0) + d = addr - sym->st_value; + if (d < distance) { +@@ -1470,7 +1479,13 @@ static void report_sec_mismatch(const char *modname, + char *prl_from; + char *prl_to; + +- sec_mismatch_count++; ++ if (mismatch->mismatch == DATA_TO_TEXT) { ++ writable_fptr_count++; ++ if (!writable_fptr_verbose) ++ return; ++ } else { ++ sec_mismatch_count++; ++ } + + get_pretty_name(from_is_func, &from, &from_p); + get_pretty_name(to_is_func, &to, &to_p); +@@ -1592,6 +1607,12 @@ static void report_sec_mismatch(const char *modname, + fatal("There's a special handler for this mismatch type, " + "we should never get here."); + break; ++ case DATA_TO_TEXT: ++ fprintf(stderr, ++ "The %s %s:%s references\n" ++ "the %s %s:%s%s\n", ++ from, fromsec, fromsym, to, tosec, tosym, to_p); ++ break; + } + fprintf(stderr, "\n"); + } +@@ -2569,7 +2590,7 @@ int main(int argc, char **argv) + struct ext_sym_list *extsym_iter; + struct ext_sym_list *extsym_start = NULL; + +- while ((opt = getopt(argc, argv, "i:I:e:mnsT:o:awEd")) != -1) { ++ while ((opt = getopt(argc, argv, "i:I:e:fmnsT:o:awEd")) != -1) { + switch (opt) { + case 'i': + kernel_read = optarg; +@@ -2586,6 +2607,9 @@ int main(int argc, char **argv) + extsym_iter->file = optarg; + extsym_start = extsym_iter; + break; ++ case 'f': ++ writable_fptr_verbose = 1; ++ break; + case 'm': + modversions = 1; + break; +@@ -2692,6 +2716,11 @@ int main(int argc, char **argv) + } + + free(buf.p); ++ if (writable_fptr_count && !writable_fptr_verbose) ++ warn("modpost: Found %d writable function pointer%s.\n" ++ "To see full details build your kernel with:\n" ++ "'make CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE=y'\n", ++ writable_fptr_count, (writable_fptr_count == 1 ? "" : "s")); + + return err; + } +diff --git a/security/Kconfig b/security/Kconfig +index 2a1a2d396228..3b7a71410f88 100644 +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -9,7 +9,7 @@ source "security/keys/Kconfig" + + config SECURITY_DMESG_RESTRICT + bool "Restrict unprivileged access to the kernel syslog" +- default n ++ default y + help + This enforces restrictions on unprivileged users reading the kernel + syslog via dmesg(8). 
+@@ -19,10 +19,34 @@ config SECURITY_DMESG_RESTRICT + + If you are unsure how to answer this question, answer N. + ++config SECURITY_PERF_EVENTS_RESTRICT ++ bool "Restrict unprivileged use of performance events" ++ depends on PERF_EVENTS ++ default y ++ help ++ If you say Y here, the kernel.perf_event_paranoid sysctl ++ will be set to 3 by default, and no unprivileged use of the ++ perf_event_open syscall will be permitted unless it is ++ changed. ++ ++config SECURITY_TIOCSTI_RESTRICT ++ bool "Restrict unprivileged use of tiocsti command injection" ++ default y ++ help ++ This enforces restrictions on unprivileged users injecting commands ++ into other processes which share a tty session using the TIOCSTI ++ ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN. ++ ++ If this option is not selected, no restrictions will be enforced ++ unless the tiocsti_restrict sysctl is explicitly set to (1). ++ ++ If you are unsure how to answer this question, answer N. ++ + config SECURITY + bool "Enable different security models" + depends on SYSFS + depends on MULTIUSER ++ default y + help + This allows you to choose different security modules to be + configured into your kernel. +@@ -48,6 +72,7 @@ config SECURITYFS + config SECURITY_NETWORK + bool "Socket and Networking Security Hooks" + depends on SECURITY ++ default y + help + This enables the socket and networking security hooks. + If enabled, a security module can use these hooks to +@@ -154,6 +179,7 @@ config HARDENED_USERCOPY + bool "Harden memory copies between kernel and userspace" + depends on HAVE_HARDENED_USERCOPY_ALLOCATOR + imply STRICT_DEVMEM ++ default y + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and +@@ -166,7 +192,6 @@ config HARDENED_USERCOPY + config HARDENED_USERCOPY_FALLBACK + bool "Allow usercopy whitelist violations to fallback to object size" + depends on HARDENED_USERCOPY +- default y + help + This is a temporary option that allows missing usercopy whitelists + to be discovered via a WARN() to the kernel log, instead of +@@ -191,10 +216,21 @@ config HARDENED_USERCOPY_PAGESPAN + config FORTIFY_SOURCE + bool "Harden common str/mem functions against buffer overflows" + depends on ARCH_HAS_FORTIFY_SOURCE ++ default y + help + Detect overflows of buffers in common string and memory functions + where the compiler can determine and validate the buffer sizes. + ++config FORTIFY_SOURCE_STRICT_STRING ++ bool "Harden common functions against buffer overflows" ++ depends on FORTIFY_SOURCE ++ depends on EXPERT ++ help ++ Perform stricter overflow checks catching overflows within objects ++ for common C string functions rather than only between objects. ++ ++ This is not yet intended for production use, only bug finding. ++ + config STATIC_USERMODEHELPER + bool "Force all usermode helper calls through a single binary" + help +diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening +index af4c979b38ee..473e40bb8537 100644 +--- a/security/Kconfig.hardening ++++ b/security/Kconfig.hardening +@@ -169,6 +169,7 @@ config STACKLEAK_RUNTIME_DISABLE + + config INIT_ON_ALLOC_DEFAULT_ON + bool "Enable heap memory zeroing on allocation by default" ++ default yes + help + This has the effect of setting "init_on_alloc=1" on the kernel + command line. This can be disabled with "init_on_alloc=0". 
+@@ -181,6 +182,7 @@ config INIT_ON_ALLOC_DEFAULT_ON + + config INIT_ON_FREE_DEFAULT_ON + bool "Enable heap memory zeroing on free by default" ++ default yes + help + This has the effect of setting "init_on_free=1" on the kernel + command line. This can be disabled with "init_on_free=0". +@@ -196,6 +198,20 @@ config INIT_ON_FREE_DEFAULT_ON + touching "cold" memory areas. Most cases see 3-5% impact. Some + synthetic workloads have measured as high as 8%. + ++config PAGE_SANITIZE_VERIFY ++ bool "Verify sanitized pages" ++ default y ++ help ++ When init_on_free is enabled, verify that newly allocated pages ++ are zeroed to detect write-after-free bugs. ++ ++config SLAB_SANITIZE_VERIFY ++ default y ++ bool "Verify sanitized SLAB allocations" ++ help ++ When init_on_free is enabled, verify that newly allocated slab ++ objects are zeroed to detect write-after-free bugs. ++ + endmenu + + endmenu +diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig +index 5711689deb6a..fab0cb896907 100644 +--- a/security/selinux/Kconfig ++++ b/security/selinux/Kconfig +@@ -3,7 +3,7 @@ config SECURITY_SELINUX + bool "NSA SELinux Support" + depends on SECURITY_NETWORK && AUDIT && NET && INET + select NETWORK_SECMARK +- default n ++ default y + help + This selects NSA Security-Enhanced Linux (SELinux). + You will also need a policy configuration and a labeled filesystem. +@@ -65,23 +65,3 @@ config SECURITY_SELINUX_AVC_STATS + This option collects access vector cache statistics to + /selinux/avc/cache_stats, which may be monitored via + tools such as avcstat. +- +-config SECURITY_SELINUX_CHECKREQPROT_VALUE +- int "NSA SELinux checkreqprot default value" +- depends on SECURITY_SELINUX +- range 0 1 +- default 0 +- help +- This option sets the default value for the 'checkreqprot' flag +- that determines whether SELinux checks the protection requested +- by the application or the protection that will be applied by the +- kernel (including any implied execute for read-implies-exec) for +- mmap and mprotect calls. If this option is set to 0 (zero), +- SELinux will default to checking the protection that will be applied +- by the kernel. If this option is set to 1 (one), SELinux will +- default to checking the protection requested by the application. +- The checkreqprot flag may be changed from the default via the +- 'checkreqprot=' boot parameter. It may also be changed at runtime +- via /selinux/checkreqprot if authorized by policy. +- +- If you are unsure how to answer this question, answer 0. +diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c +index 9625b99e677f..daa40da7a8f9 100644 +--- a/security/selinux/hooks.c ++++ b/security/selinux/hooks.c +@@ -135,18 +135,7 @@ static int __init selinux_enabled_setup(char *str) + __setup("selinux=", selinux_enabled_setup); + #endif + +-static unsigned int selinux_checkreqprot_boot = +- CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; +- +-static int __init checkreqprot_setup(char *str) +-{ +- unsigned long checkreqprot; +- +- if (!kstrtoul(str, 0, &checkreqprot)) +- selinux_checkreqprot_boot = checkreqprot ? 
1 : 0; +- return 1; +-} +-__setup("checkreqprot=", checkreqprot_setup); ++static const unsigned int selinux_checkreqprot_boot; + + /** + * selinux_secmark_enabled - Check to see if SECMARK is currently enabled +diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c +index e6c7643c3fc0..0e8217f72c5a 100644 +--- a/security/selinux/selinuxfs.c ++++ b/security/selinux/selinuxfs.c +@@ -639,7 +639,6 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, + static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { +- struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; + char *page; + ssize_t length; + unsigned int new_value; +@@ -663,10 +662,9 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + return PTR_ERR(page); + + length = -EINVAL; +- if (sscanf(page, "%u", &new_value) != 1) ++ if (sscanf(page, "%u", &new_value) != 1 || new_value) + goto out; + +- fsi->state->checkreqprot = new_value ? 1 : 0; + length = count; + out: + kfree(page); +diff --git a/security/yama/Kconfig b/security/yama/Kconfig +index a810304123ca..b809050b25d2 100644 +--- a/security/yama/Kconfig ++++ b/security/yama/Kconfig +@@ -2,7 +2,7 @@ + config SECURITY_YAMA + bool "Yama support" + depends on SECURITY +- default n ++ default y + help + This selects Yama, which extends DAC support with additional + system-wide security settings beyond regular Linux discretionary diff --git a/linux-tkg/linux-tkg-patches/5.7/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch b/linux-tkg/linux-tkg-patches/5.7/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch new file mode 100644 index 0000000..3cef558 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch @@ -0,0 +1,156 @@ +From 5ec2dd3a095442ec1a21d86042a4994f2ba24e63 Mon Sep 17 00:00:00 2001 +Message-Id: <5ec2dd3a095442ec1a21d86042a4994f2ba24e63.1512651251.git.jan.steffens@gmail.com> +From: Serge Hallyn +Date: Fri, 31 May 2013 19:12:12 +0100 +Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default + +Signed-off-by: Serge Hallyn +[bwh: Remove unneeded binary sysctl bits] +Signed-off-by: Daniel Micay +--- + kernel/fork.c | 15 +++++++++++++++ + kernel/sysctl.c | 12 ++++++++++++ + kernel/user_namespace.c | 3 +++ + 3 files changed, 30 insertions(+) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 07cc743698d3668e..4011d68a8ff9305c 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -102,6 +102,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
+@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b86520ed3fb60fbf..f7dab3760839f1a1 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -105,6 +105,9 @@ extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index c490f1e4313b998a..dd03bd39d7bf194d 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -24,6 +24,9 @@ + #include + #include + ++/* sysctl */ ++int unprivileged_userns_clone; ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +-- +2.15.1 + +From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Thu, 7 Dec 2017 13:50:48 +0100 +Subject: ZEN: Add CONFIG for unprivileged_userns_clone + +This way our default behavior continues to match the vanilla kernel. +--- + init/Kconfig | 16 ++++++++++++++++ + kernel/user_namespace.c | 4 ++++ + 2 files changed, 20 insertions(+) + +diff --git a/init/Kconfig b/init/Kconfig +index 4592bf7997c0..f3df02990aff 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1004,6 +1004,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. 
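
The CLONE_NEWUSER gating above is visible directly from userspace. A minimal sketch (illustrative, not part of the patch): with kernel.unprivileged_userns_clone set to 0, an unprivileged unshare(CLONE_NEWUSER) is expected to fail with EPERM; setting the sysctl back to 1 (for example with sysctl -w kernel.unprivileged_userns_clone=1) allows it again.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
        if (unshare(CLONE_NEWUSER) != 0) {
                /* expected when the sysctl is 0 and we lack CAP_SYS_ADMIN */
                printf("user namespace denied: %s\n", strerror(errno));
                return 1;
        }
        printf("user namespace created\n");
        return 0;
}
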
++ + config PID_NS + bool "PID Namespaces" + default y +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 6b9dbc257e34..107b17f0d528 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -27,7 +27,11 @@ + #include + + /* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else + int unprivileged_userns_clone; ++#endif + + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux-tkg/linux-tkg-patches/5.7/0002-clear-patches.patch b/linux-tkg/linux-tkg-patches/5.7/0002-clear-patches.patch new file mode 100644 index 0000000..a7c9d4a --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0002-clear-patches.patch @@ -0,0 +1,354 @@ +From 2ac70785613ef4c6b16414986bb18bd7b60d2a13 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 14 Mar 2016 11:10:58 -0600 +Subject: [PATCH] pci pme wakeups + +Reduce wakeups for PME checks, which are a workaround for miswired +boards (sadly, too many of them) in laptops. +--- + drivers/pci/pci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index c25acace7d91..0ddebdad9f5b 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -61,7 +61,7 @@ struct pci_pme_device { + struct pci_dev *dev; + }; + +-#define PME_TIMEOUT 1000 /* How long between PME checks */ ++#define PME_TIMEOUT 4000 /* How long between PME checks */ + + static void pci_dev_d3_sleep(struct pci_dev *dev) + { +-- +2.20.1 + +From 7e7e36c67aa71d6a1ec5676d99d37c1fea389ceb Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sat, 19 Mar 2016 21:32:19 -0400 +Subject: [PATCH] intel_idle: tweak cpuidle cstates + +Increase target_residency in cpuidle cstate + +Tune intel_idle to be a bit less agressive; +Clear linux is cleaner in hygiene (wakupes) than the average linux, +so we can afford changing these in a way that increases +performance while keeping power efficiency +--- + drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- + 1 file changed, 22 insertions(+), 22 deletions(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index 8b5d85c91e9d..5e2d813a048d 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -466,7 +466,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -474,7 +474,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, +- .target_residency = 100, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -482,7 +482,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -490,7 +490,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 1500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -498,7 +498,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = 
MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -506,7 +506,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -514,7 +514,7 @@ static struct cpuidle_state hsw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -534,7 +534,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -542,7 +542,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -550,7 +550,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -558,7 +558,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -566,7 +566,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -574,7 +574,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 7000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -582,7 +582,7 @@ static struct cpuidle_state bdw_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -603,7 +603,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -611,7 +611,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 70, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -619,7 +619,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 85, +- 
.target_residency = 200, ++ .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -627,7 +627,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x33", + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 124, +- .target_residency = 800, ++ .target_residency = 3000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -635,7 +635,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, +- .target_residency = 800, ++ .target_residency = 3200, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -643,7 +643,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 480, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -651,7 +651,7 @@ static struct cpuidle_state skl_cstates[] = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 890, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -672,7 +672,7 @@ static struct cpuidle_state skx_cstates[] = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01), + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +-- +2.20.1 + +From b8211d4f79dd88dfc2d4bd52be46103ea0b70e3e Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Fri, 6 Jan 2017 15:34:09 +0000 +Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little + bigger than default + +--- + net/ipv4/tcp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index cf3c5095c10e..b30d51837b2d 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3897,8 +3897,8 @@ void __init tcp_init(void) + tcp_init_mem(); + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +- max_wshare = min(4UL*1024*1024, limit); +- max_rshare = min(6UL*1024*1024, limit); ++ max_wshare = min(16UL*1024*1024, limit); ++ max_rshare = min(16UL*1024*1024, limit); + + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +-- +2.20.1 + +From 050223869257b87e22636158a80da38d877248ed Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sun, 18 Feb 2018 23:35:41 +0000 +Subject: [PATCH] locking: rwsem: spin faster + +tweak rwsem owner spinning a bit +--- + kernel/locking/rwsem.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index eef04551eae7..1ec5ab4c8ff7 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -720,6 +720,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + owner = rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags, nonspinnable); +@@ -753,7 +754,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + rcu_read_unlock(); + +From b836ea320114643d4354b43acb6ec8bb06ada487 Mon Sep 17 00:00:00 2001 +From: 
Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: [PATCH] drivers: Initialize ata before graphics + +ATA init is the long pole in the boot process, and its asynchronous. +move the graphics init after it so that ata and graphics initialize +in parallel +--- + drivers/Makefile | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index aaef17cc6512..d08f3a394929 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -58,15 +58,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb and intelfb depend on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-$(CONFIG_NVM) += lightnvm/ + obj-y += base/ block/ misc/ mfd/ nfc/ +@@ -79,6 +72,14 @@ obj-$(CONFIG_IDE) += ide/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb and intelfb depend on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ diff --git a/linux-tkg/linux-tkg-patches/5.7/0003-glitched-base.patch b/linux-tkg/linux-tkg-patches/5.7/0003-glitched-base.patch new file mode 100644 index 0000000..0cd2ef0 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0003-glitched-base.patch @@ -0,0 +1,545 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched + +diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h +index 87f1fc9..b3be470 100755 +--- a/scripts/mkcompile_h ++++ b/scripts/mkcompile_h +@@ -50,8 +50,8 @@ else + fi + + UTS_VERSION="#$VERSION" +-CONFIG_FLAGS="" +-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi ++CONFIG_FLAGS="TKG" ++if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi + if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi + UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP" + +diff --git a/fs/dcache.c b/fs/dcache.c +index 2acfc69878f5..3f1131431e06 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -69,7 +69,7 @@ + * If no ancestor relationship: + * arbitrary, since it's serialized on rename_lock + */ +-int sysctl_vfs_cache_pressure __read_mostly = 100; ++int sysctl_vfs_cache_pressure __read_mostly = 50; + EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 211890edf37e..37121563407d 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -41,7 +41,7 @@ const_debug unsigned int sysctl_sched_features = + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +-const_debug unsigned int sysctl_sched_nr_migrate = 32; ++const_debug unsigned int sysctl_sched_nr_migrate = 128; + + /* + * period over which we average the RT time consumption, measured +@@ -61,9 +61,9 @@ __read_mostly int scheduler_running; + + /* + * part of the period that we allow rt tasks to run in us. 
+- * default: 0.95s ++ * XanMod default: 0.98s + */ +-int sysctl_sched_rt_runtime = 950000; ++int sysctl_sched_rt_runtime = 980000; + + /* + * __task_rq_lock - lock the rq @p resides on. +diff --git a/scripts/setlocalversion b/scripts/setlocalversion +index 71f39410691b..288f9679e883 100755 +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -54,7 +54,7 @@ scm_version() + # If only the short version is requested, don't bother + # running further git commands + if $short; then +- echo "+" ++ # echo "+" + return + fi + # If we are past a tagged commit (like + +From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 +From: Etienne Juvigny +Date: Mon, 3 Sep 2018 17:36:25 +0200 +Subject: Zenify & stuff + + +diff --git a/init/Kconfig b/init/Kconfig +index b4daad2bac23..c1e59dc04209 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1244,7 +1244,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + + config CC_OPTIMIZE_FOR_PERFORMANCE_O3 + bool "Optimize more for performance (-O3)" +- depends on ARC + help + Choosing this option will pass "-O3" to your compiler to optimize + the kernel yet more for performance. +diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c +index 4f32c4062fb6..c0bf039e1b40 100644 +--- a/drivers/infiniband/core/addr.c ++++ b/drivers/infiniband/core/addr.c +@@ -721,6 +721,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; ++ struct sockaddr_ib _sockaddr_ib; + } sgid_addr, dgid_addr; + int ret; + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 79226ca8f80f..2a30060e7e1d 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -47,7 +47,11 @@ struct blk_queue_stats; + struct blk_stat_callback; + + #define BLKDEV_MIN_RQ 4 ++#ifdef CONFIG_ZENIFY ++#define BLKDEV_MAX_RQ 512 ++#else + #define BLKDEV_MAX_RQ 128 /* Default maximum */ ++#endif + + /* Must be consistent with blk_mq_poll_stats_bkt() */ + #define BLK_MQ_POLL_STATS_BKTS 16 +diff --git a/init/Kconfig b/init/Kconfig +index 041f3a022122..5ed70eb1ad3a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -45,6 +45,38 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config ZENIFY ++ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience" ++ default y ++ help ++ Tunes the kernel for responsiveness at the cost of throughput and power usage. 
++ ++ --- Virtual Memory Subsystem --------------------------- ++ ++ Mem dirty before bg writeback..: 10 % -> 20 % ++ Mem dirty before sync writeback: 20 % -> 50 % ++ ++ --- Block Layer ---------------------------------------- ++ ++ Queue depth...............: 128 -> 512 ++ Default MQ scheduler......: mq-deadline -> bfq ++ ++ --- CFS CPU Scheduler ---------------------------------- ++ ++ Scheduling latency.............: 6 -> 3 ms ++ Minimal granularity............: 0.75 -> 0.3 ms ++ Wakeup granularity.............: 1 -> 0.5 ms ++ CPU migration cost.............: 0.5 -> 0.25 ms ++ Bandwidth slice size...........: 5 -> 3 ms ++ Ondemand fine upscaling limit..: 95 % -> 85 % ++ ++ --- MuQSS CPU Scheduler -------------------------------- ++ ++ Scheduling interval............: 6 -> 3 ms ++ ISO task max realtime use......: 70 % -> 25 % ++ Ondemand coarse upscaling limit: 80 % -> 45 % ++ Ondemand fine upscaling limit..: 95 % -> 45 % ++ + config BROKEN + bool + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2f0a0be4d344..bada807c7e59 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -37,8 +37,13 @@ + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_latency = 3000000ULL; ++static unsigned int normalized_sysctl_sched_latency = 3000000ULL; ++#else + unsigned int sysctl_sched_latency = 6000000ULL; + static unsigned int normalized_sysctl_sched_latency = 6000000ULL; ++#endif + + /* + * The initial- and re-scaling of tunables is configurable +@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_min_granularity = 300000ULL; ++static unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; ++#else + unsigned int sysctl_sched_min_granularity = 750000ULL; + static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; ++#endif + + /* + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity + */ ++#ifdef CONFIG_ZENIFY ++static unsigned int sched_nr_latency = 10; ++#else + static unsigned int sched_nr_latency = 8; ++#endif + + /* + * After fork, child runs first. 
If set to 0 (default) then +@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_wakeup_granularity = 500000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; ++ ++const_debug unsigned int sysctl_sched_migration_cost = 50000UL; ++#else + unsigned int sysctl_sched_wakeup_granularity = 1000000UL; + static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + #ifdef CONFIG_SMP + /* +@@ -107,8 +128,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + /* + * The margin used when comparing utilization with CPU capacity: +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 337c6afb3345..9315e358f292 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int dirty_background_ratio = 20; ++#else + int dirty_background_ratio = 10; ++#endif + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; + /* + * The generator of dirty data starts writeback at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int vm_dirty_ratio = 50; ++#else + int vm_dirty_ratio = 20; ++#endif + + /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 80dad301361d..42b7fa7d01f8 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -702,6 +702,9 @@ choice + config DEFAULT_VEGAS + bool "Vegas" if TCP_CONG_VEGAS=y + ++ config DEFAULT_YEAH ++ bool "YeAH" if TCP_CONG_YEAH=y ++ + config DEFAULT_VENO + bool "Veno" if TCP_CONG_VENO=y + +@@ -735,6 +738,7 @@ config DEFAULT_TCP_CONG + default "htcp" if DEFAULT_HTCP + default "hybla" if DEFAULT_HYBLA + default "vegas" if DEFAULT_VEGAS ++ default "yeah" if DEFAULT_YEAH + default "westwood" if DEFAULT_WESTWOOD + default "veno" if DEFAULT_VENO + default "reno" if DEFAULT_RENO + +From: Nick Desaulniers +Date: Mon, 24 Dec 2018 13:37:41 +0200 +Subject: include/linux/compiler*.h: define asm_volatile_goto + +asm_volatile_goto should also be defined for other compilers that +support asm goto. + +Fixes commit 815f0dd ("include/linux/compiler*.h: make compiler-*.h +mutually exclusive"). + +Signed-off-by: Nick Desaulniers +Signed-off-by: Miguel Ojeda + +diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h +index ba814f1..e77eeb0 100644 +--- a/include/linux/compiler_types.h ++++ b/include/linux/compiler_types.h +@@ -188,6 +188,10 @@ struct ftrace_likely_data { + #define asm_volatile_goto(x...) asm goto(x) + #endif + ++#ifndef asm_volatile_goto ++#define asm_volatile_goto(x...) asm goto(x) ++#endif ++ + /* Are two types/vars the same type (ignoring qualifiers)? 
*/ + #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) + +From: Andy Lavr +Date: Mon, 24 Dec 2018 14:57:47 +0200 +Subject: avl: Use [defer+madvise] as default khugepaged defrag strategy + +For some reason, the default strategy to respond to THP fault fallbacks +is still just madvise, meaning stall if the program wants transparent +hugepages, but don't trigger a background reclaim / compaction if THP +begins to fail allocations. This creates a snowball affect where we +still use the THP code paths, but we almost always fail once a system +has been active and busy for a while. + +The option "defer" was created for interactive systems where THP can +still improve performance. If we have to fallback to a regular page due +to an allocation failure or anything else, we will trigger a background +reclaim and compaction so future THP attempts succeed and previous +attempts eventually have their smaller pages combined without stalling +running applications. + +We still want madvise to stall applications that explicitely want THP, +so defer+madvise _does_ make a ton of sense. Make it the default for +interactive systems, especially if the kernel maintainer left +transparent hugepages on "always". + +Reasoning and details in the original patch: +https://lwn.net/Articles/711248/ + +Signed-off-by: Andy Lavr + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index e84a10b..21d62b7 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1< +Date: Mon, 25 Nov 2019 15:13:06 -0300 +Subject: [PATCH] elevator: set default scheduler to bfq for blk-mq + +Signed-off-by: Alexandre Frade +--- + block/elevator.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/block/elevator.c b/block/elevator.c +index 076ba7308e65..81f89095aa77 100644 +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) + } + + /* +- * For single queue devices, default to using mq-deadline. If we have multiple +- * queues or mq-deadline is not available, default to "none". ++ * For single queue devices, default to using bfq. If we have multiple ++ * queues or bfq is not available, default to "none". 
+ */ + static struct elevator_type *elevator_get_default(struct request_queue *q) + { + if (q->nr_hw_queues != 1) + return NULL; + +- return elevator_get(q, "mq-deadline", false); ++ return elevator_get(q, "bfq", false); + } + + /* +From c3ec05777c46e19a8a26d0fc4ca0c0db8a19de97 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 16:45:59 -0300 +Subject: [PATCH] block: set rq_affinity = 2 for full multithreading I/O + requests + +Signed-off-by: Alexandre Frade +--- + include/linux/blkdev.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index f3ea78b0c91c..4dbacc6b073b 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -621,7 +621,8 @@ struct request_queue { + #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ + + #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ +- (1 << QUEUE_FLAG_SAME_COMP)) ++ (1 << QUEUE_FLAG_SAME_COMP) | \ ++ (1 << QUEUE_FLAG_SAME_FORCE)) + + void blk_queue_flag_set(unsigned int flag, struct request_queue *q); + void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); +From 8171d33d0b84a953649863538fdbe4c26c035e4f Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 14:32:50 -0300 +Subject: [PATCH] mm: set 2 megabytes for address_space-level file read-ahead + pages size + +Signed-off-by: Alexandre Frade +--- + include/linux/mm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index a2adf95b3f9c..e804d9f7583a 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -2416,7 +2416,7 @@ int __must_check write_one_page(struct page *page); + void task_dirty_inc(struct task_struct *tsk); + + /* readahead.c */ +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) + + int force_page_cache_readahead(struct address_space *mapping, struct file *filp, + pgoff_t offset, unsigned long nr_to_read); +From de7119e3db9fdb4c704355854a02a7e9fad931d4 Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Wed, 15 Jan 2020 20:43:56 -0600 +Subject: [PATCH] ZEN: intel-pstate: Implement "enable" parameter + +If intel-pstate is compiled into the kernel, it will preempt the loading +of acpi-cpufreq so you can take advantage of hardware p-states without +any friction. + +However, intel-pstate is not completely superior to cpufreq's ondemand +for one reason. There's no concept of an up_threshold property. + +In ondemand, up_threshold essentially reduces the maximum utilization to +compare against, allowing you to hit max frequencies and turbo boost +from a much lower core utilization. + +With intel-pstate, you have the concept of minimum and maximum +performance, but no tunable that lets you define, maximum frequency +means 50% core utilization. For just this oversight, there's reasons +you may want ondemand. + +Lets support setting "enable" in kernel boot parameters. This lets +kernel maintainers include "intel_pstate=disable" statically in the +static boot parameters, but let users of the kernel override this +selection. 
+--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + drivers/cpufreq/intel_pstate.c | 2 ++ + 2 files changed, 5 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index ade4e6ec23e03..0b613370d28d8 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -1765,6 +1765,9 @@ + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors ++ enable ++ Enable intel_pstate in-case "disable" was passed ++ previously in the kernel boot parameters + passive + Use intel_pstate as a scaling driver, but configure it + to work with generic cpufreq governors (instead of +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index d2fa3e9ccd97c..bd10cb02fc0ff 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -2826,6 +2826,8 @@ static int __init intel_pstate_setup(char *str) + pr_info("HWP disabled\n"); + no_hwp = 1; + } ++ if (!strcmp(str, "enable")) ++ no_load = 0; + if (!strcmp(str, "force")) + force_load = 1; + if (!strcmp(str, "hwp_only")) diff --git a/linux-tkg/linux-tkg-patches/5.7/0003-glitched-cfs.patch b/linux-tkg/linux-tkg-patches/5.7/0003-glitched-cfs.patch new file mode 100644 index 0000000..06b7f02 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0003-glitched-cfs.patch @@ -0,0 +1,72 @@ +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + diff --git a/linux-tkg/linux-tkg-patches/5.7/0004-5.7-ck1.patch b/linux-tkg/linux-tkg-patches/5.7/0004-5.7-ck1.patch new file mode 100644 index 0000000..ee1d1c8 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0004-5.7-ck1.patch @@ -0,0 +1,13147 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 7bc83f3d9bdf..2f9e8cdf5fec 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4429,6 +4429,14 @@ + Memory area to be used by remote processor image, + managed by CMA. + ++ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. ++ Format: ++ smt -- Share SMT (hyperthread) sibling runqueues ++ mc -- Share MC (multicore) sibling runqueues ++ smp -- Share SMP runqueues ++ none -- So not share any runqueues ++ Default value is mc ++ + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 0d427fd10941..5b3406a3d76f 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -344,6 +344,16 @@ Controls whether the panic kmsg data should be reported to Hyper-V. + = ========================================================= + + ++iso_cpu: (MuQSS CPU scheduler only) ++=================================== ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++ + kexec_load_disabled + =================== + +@@ -922,6 +932,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after + rebooting. ??? + + ++rr_interval: (MuQSS CPU scheduler only) ++======================================= ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++ + sched_energy_aware + ================== + +@@ -1230,3 +1254,13 @@ is 10 seconds. + + The softlockup threshold is (``2 * watchdog_thresh``). Setting this + tunable to zero will disable lockup detection altogether. ++ ++ ++yield_type: (MuQSS CPU scheduler only) ++====================================== ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. ++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. +diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +new file mode 100644 +index 000000000000..c0282002a079 +--- /dev/null ++++ b/Documentation/scheduler/sched-BFS.txt +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. 
++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. ++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. 
Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. 
Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. 
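
A rough sketch of the deadline arithmetic described above, using simplified made-up names rather than the scheduler's actual symbols: the priority ratio is taken as 1.0 at the nice -20 baseline and grows by roughly 10% per nice level, and the virtual deadline is offset from the current jiffies value by prio_ratio * rr_interval.

#include <stdio.h>

/* illustrative only: names and scaling are simplified */
static unsigned long prio_ratio(int nice)
{
        unsigned long ratio = 100;      /* baseline at nice -20 */

        /* each nice level adds roughly 10% more deadline distance */
        for (int level = -20; level < nice; level++)
                ratio = ratio * 11 / 10;
        return ratio;
}

static unsigned long virtual_deadline(unsigned long jiffies_now,
                                      unsigned long rr_interval_jiffies,
                                      int nice)
{
        return jiffies_now + prio_ratio(nice) * rr_interval_jiffies / 100;
}

int main(void)
{
        /* e.g. a 6ms rr_interval at HZ=1000 is 6 jiffies */
        printf("nice   0 deadline offset: %lu jiffies\n",
               virtual_deadline(0, 6, 0));
        printf("nice +19 deadline offset: %lu jiffies\n",
               virtual_deadline(0, 6, 19));
        return 0;
}

Because the deadline only advances when a time_slice expires, this offset is also what produces the roughly squared CPU distribution between nice levels described above.
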
++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. ++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependant on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that. Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. ++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. ++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. 
++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. Valid values are from 1 to 1000 ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. ++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). 
++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
++time is the percentage of _total CPU_ available across the machine, configurable
++as a percentage in the following "resource handling" tunable (as opposed to a
++scheduler tunable):
++
++ /proc/sys/kernel/iso_cpu
++
++and is set to 70% by default. It is calculated over a rolling 5 second average.
++Because it is the total CPU available, it means that on a multi CPU machine, it
++is possible to have an ISO task running with realtime scheduling indefinitely on
++just one CPU, as the other CPUs will be available. Setting this to 100 is the
++equivalent of giving all users SCHED_RR access and setting it to 0 removes the
++ability to run any pseudo-realtime tasks.
++
++A feature of BFS is that it detects when an application tries to obtain a
++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
++appropriate privileges to use those policies. When it detects this, it will
++give the task SCHED_ISO policy instead. Thus it is transparent to the user.
++Because some applications constantly set their policy as well as their nice
++level, there is potential for them to undo an override to SCHED_ISO that the
++user specified on the command line. To counter this, once
++a task has been set to SCHED_ISO policy, it needs superuser privileges to set
++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
++processes and threads will also inherit the ISO policy.
++
++Idleprio scheduling.
++
++Idleprio scheduling is a scheduling policy designed to give out CPU to a task
++_only_ when the CPU would be otherwise idle. The idea behind this is to allow
++ultra low priority tasks to be run in the background that have virtually no
++effect on the foreground tasks. This is ideally suited to distributed computing
++clients (like setiathome, folding, mprime etc) but can also be used to start
++a video encode or so on without any slowdown of other tasks. To prevent this
++policy from grabbing shared resources and holding them indefinitely, if it
++detects a state where the task is waiting on I/O, the machine is about to
++suspend to RAM and so on, it will transiently schedule them as SCHED_NORMAL. As
++per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
++be set to start as SCHED_IDLEPRIO with the schedtool command like so:
++
++ schedtool -D -e ./mprime
++
++Subtick accounting.
++
++It is surprisingly difficult to get accurate CPU accounting, and in many cases,
++the accounting is done by simply determining what is happening at the precise
++moment a timer tick fires off. This becomes increasingly inaccurate as the
++timer tick frequency (HZ) is lowered. It is possible to create an application
++which uses almost 100% CPU, yet by being descheduled at the right time, records
++zero CPU usage. While the main problem with this is that there are possible
++security implications, it is also difficult to determine how much CPU a task
++really does use. BFS tries to use the sub-tick accounting from the TSC clock,
++where possible, to determine real CPU usage. This is not entirely reliable, but
++is far more likely to produce accurate CPU usage data than the existing designs
++and will not show tasks as consuming no CPU usage when they actually are. Thus,
++the amount of CPU reported as being used by BFS will more accurately represent
++how much CPU the task itself is using (as is shown for example by the 'time'
++application), so the reported values may be quite different to other schedulers.
++Values reported as the 'load' are more prone to problems with this design, but
++per process values are closer to real usage. When comparing throughput of BFS
++to other designs, it is important to compare the actual completed work in terms
++of total wall clock time taken and total work done, rather than the reported
++"cpu usage".
++
++
++Con Kolivas Fri Aug 27 2010
+diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt
+new file mode 100644
+index 000000000000..ae28b85c9995
+--- /dev/null
++++ b/Documentation/scheduler/sched-MuQSS.txt
+@@ -0,0 +1,373 @@
++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas.
++
++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with
++one 8 level skiplist per runqueue, and fine grained locking for much more
++scalability.
++
++
++Goals.
++
++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from
++here on (pronounced mux), is to completely do away with the complex designs of
++the past for the cpu process scheduler and instead implement one that is very
++simple in basic design. The main focus of MuQSS is to achieve excellent desktop
++interactivity and responsiveness without heuristics and tuning knobs that are
++difficult to understand, impossible to model and predict the effect of, and when
++tuned to one workload cause massive detriment to another, while still being
++scalable to many CPUs and processes.
++
++
++Design summary.
++
++MuQSS is best described as a per-cpu multiple runqueue, O(log n) insertion, O(1)
++lookup, earliest effective virtual deadline first tickless design, loosely based
++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase
++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler.
++Each component shall be described in order to understand the significance of,
++and reasoning for, each.
++
++
++Design reasoning.
++
++In BFS, the use of a single runqueue across all CPUs meant that each CPU would
++need to scan the entire runqueue looking for the process with the earliest
++deadline and schedule that next, regardless of which CPU it originally came
++from. This made BFS deterministic with respect to latency and provided
++guaranteed latencies dependent on number of processes and CPUs. The single
++runqueue, however, meant that all CPUs would compete for the single lock
++protecting it, which would lead to increasing lock contention as the number of
++CPUs rose and appeared to limit scalability of common workloads beyond 16
++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously
++increased overhead proportionate to the number of queued processes and led to
++cache thrashing while iterating over the linked list.
++
++MuQSS is an evolution of BFS, designed to maintain the same scheduling
++decision mechanism and be virtually deterministic without relying on the
++constrained design of the single runqueue by splitting out the single runqueue
++to be per-CPU and using skiplists instead of linked lists.
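++
++To make the difference concrete, the following simplified sketch (not the
++actual BFS or MuQSS code; the structure and function names are invented purely
++for illustration, and all locking is omitted) contrasts the two lookups. With
++a single shared list, every CPU must walk all queued tasks to find the
++earliest virtual deadline; with a priority ordered per-CPU queue, the best
++local candidate is simply the first node:
++
++  struct demo_task {
++      unsigned long long deadline;   /* virtual deadline in niffies */
++      struct demo_task *next;
++  };
++
++  /* BFS-like: O(n) walk of one shared run list under a single global lock */
++  static struct demo_task *pick_from_shared_list(struct demo_task *head)
++  {
++      struct demo_task *p, *best = head;
++
++      for (p = head; p; p = p->next)
++          if (p->deadline < best->deadline)
++              best = p;
++      return best;
++  }
++
++  /* MuQSS-like: the per-CPU queue is kept priority ordered on insertion,
++   * so the local lookup is just the head of that CPU's queue, O(1). */
++  static struct demo_task *pick_from_ordered_queue(struct demo_task *head)
++  {
++      return head;
++  }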
++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks. ++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons. It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++As a further evolution of the design, MuQSS normally configures sharing of ++runqueues in a logical fashion for when CPU resources are shared for improved ++latency and throughput. By default it shares runqueues and locks between ++multicore siblings. Optionally it can be configured to run with sharing of ++SMT siblings only, all SMP packages or no sharing at all. Additionally it can ++be selected at boot time. ++ ++ ++Design details. ++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle. By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. 
Niffies are calculated per-runqueue from the high
++resolution TSC timers, and in order to maintain fairness are synchronised
++between CPUs whenever both runqueues are locked concurrently.
++
++Virtual deadline:
++
++The key to achieving low latency, scheduling fairness, and "nice level"
++distribution in MuQSS is entirely in the virtual deadline mechanism. The one
++tunable in MuQSS is the rr_interval, or "round robin interval". This is the
++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
++tasks of the same nice level will be running for, or looking at it the other
++way around, the longest duration two tasks of the same nice level will be
++delayed for. When a task requests cpu time, it is given a quota (time_slice)
++equal to the rr_interval and a virtual deadline. The virtual deadline is
++offset from the current time in niffies by this equation:
++
++ niffies + (prio_ratio * rr_interval)
++
++The prio_ratio is determined as a ratio compared to the baseline of nice -20
++and increases by 10% per nice level. The deadline is a virtual one only in that
++no guarantee is placed that a task will actually be scheduled by this time, but
++it is used to compare which task should go next. There are three components to
++how a task is next chosen. First is time_slice expiration. If a task runs out
++of its time_slice, it is descheduled, the time_slice is refilled, and the
++deadline reset to that formula above. Second is sleep, where a task is no
++longer requesting CPU for whatever reason. The time_slice and deadline are _not_
++adjusted in this case and are just carried over for when the task is next
++scheduled. Third is preemption, and that is when a newly waking task is deemed
++higher priority than a currently running task on any cpu by virtue of the fact
++that it has an earlier virtual deadline than the currently running task. The
++earlier deadline is the key to which task is next chosen for the first and
++second cases.
++
++The CPU proportion of different nice tasks works out to be approximately the
++
++ (prio_ratio difference)^2
++
++The reason it is squared is that a task's deadline does not change while it is
++running unless it runs out of time_slice. Thus, even if the time actually
++passes the deadline of another task that is queued, it will not get CPU time
++unless the current running task deschedules, and the time "base" (niffies) is
++constantly moving.
++
++Task lookup:
++
++As tasks are already pre-ordered according to anticipated scheduling order in
++the skip lists, lookup for the next suitable task per-runqueue is always a
++matter of simply selecting the first task in the 0th level skip list entry.
++In order to maintain optimal latency and fairness across CPUs, MuQSS does a
++novel examination of every other runqueue in cache locality order, choosing the
++best task across all runqueues. This provides near-determinism of how long any
++task across the entire system may wait before receiving CPU time. The other
++runqueues are first examined locklessly and then trylocked to minimise the
++potential lock contention if they are likely to have a suitable better task.
++Each other runqueue lock is only held for as long as it takes to examine the
++entry for suitability. In "interactive" mode, the default setting, MuQSS will
++look for the best deadline task across all CPUs, while in !interactive mode,
++it will only select a better deadline task from another CPU if it is more
++heavily laden than the current one.
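++
++The shape of that selection can be sketched as follows (an illustrative
++fragment rather than the real MuQSS source: it reuses the demo_task structure
++from the earlier sketch, and both the lockless pre-check and the trylocking
++described above are omitted):
++
++  struct demo_rq {
++      struct demo_task *head;    /* first node of the skip list */
++  };
++
++  /* Glance only at the head of each per-CPU queue; the real scheduler
++   * visits them in cache locality order.  The earliest virtual deadline
++   * found among the heads wins. */
++  static struct demo_task *pick_next(struct demo_rq *rq, int nr_cpus)
++  {
++      struct demo_task *best = NULL;
++      int cpu;
++
++      for (cpu = 0; cpu < nr_cpus; cpu++) {
++          struct demo_task *t = rq[cpu].head;
++
++          if (t && (!best || t->deadline < best->deadline))
++              best = t;
++      }
++      return best;
++  }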
++
++Lookup is therefore O(k) where k is the number of CPUs.
++
++
++Latency.
++
++Through the use of virtual deadlines to govern the scheduling order of normal
++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by
++the rr_interval tunable, which is set to 6ms by default. This means that the
++longest a CPU bound task will wait for more CPU is proportional to the number
++of running tasks and in the common case of 0-2 running tasks per CPU, will be
++under the 7ms threshold for human perception of jitter. Additionally, as newly
++woken tasks will have an early deadline from their previous runtime, the very
++tasks that are usually latency sensitive will have the shortest interval for
++activation, usually preempting any existing CPU bound tasks.
++
++Tickless expiry:
++
++A feature of MuQSS is that it is not tied to the resolution of the chosen tick
++rate in Hz, instead depending entirely on the high resolution timers where
++possible for sub-millisecond accuracy on timeouts regardless of the underlying
++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates
++such as 100 by default, benefiting from the improved throughput and lower
++power usage it provides. Another advantage of this approach is that in
++combination with the Full No HZ option, which disables ticks on running task
++CPUs instead of just idle CPUs, the tick can be disabled at all times
++regardless of how many tasks are running instead of being limited to just one
++running task. Note that this option is NOT recommended for regular desktop
++users.
++
++
++Scalability and balancing.
++
++Unlike traditional approaches where balancing is a combination of CPU selection
++at task wakeup and intermittent balancing based on a vast array of rules set
++according to architecture, busyness calculations and special case management,
++MuQSS indirectly balances on the fly at task wakeup and next task selection.
++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for
++each logical CPU and uses this to aid task/CPU selection when CPUs are busy.
++Additionally it selects any idle CPUs, if they are available, at any time over
++busy CPUs according to the following preference:
++
++ * Same thread, idle or busy cache, idle or busy threads.
++ * Other core, same cache, idle or busy cache, idle threads.
++ * Same node, other CPU, idle cache, idle threads.
++ * Same node, other CPU, busy cache, idle threads.
++ * Other core, same cache, busy threads.
++ * Same node, other CPU, busy threads.
++ * Other node, other CPU, idle cache, idle threads.
++ * Other node, other CPU, busy cache, idle threads.
++ * Other node, other CPU, busy threads.
++
++Mux is therefore SMT, MC and NUMA aware without the need for extra
++intermittent balancing to keep CPUs busy and make the most of cache
++coherency.
++
++
++Features
++
++As the initial prime target audience for MuQSS was the average desktop user, it
++was designed to not need tweaking, tuning or have features set to obtain benefit
++from it. Thus the number of knobs and features has been kept to an absolute
++minimum and should not require extra user input for the vast majority of cases.
++There are 3 optional tunables, and 2 extra scheduling policies: the rr_interval,
++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
++does _not_ now feature is support for CGROUPS. The average user should neither
++need to know what these are, nor should they need to be using them to have good
++desktop behaviour. However, since some applications refuse to work without
++cgroups, one can enable them with MuQSS as a stub, and the filesystem will be
++created, which will allow the applications to work.
++
++rr_interval:
++
++ /proc/sys/kernel/rr_interval
++
++The value is in milliseconds, and the default value is set to 6. Valid values
++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
++decreasing throughput, while increasing it will improve throughput, but at the
++cost of worsening latencies. It is based on the fact that humans can detect
++jitter at approximately 7ms, so aiming for much lower latencies is pointless
++under most circumstances. It is worth noting this fact when comparing the
++latency performance of MuQSS to other schedulers. Worst case latencies being
++higher than 7ms are far worse than average latencies not being in the
++microsecond range.
++
++interactive:
++
++ /proc/sys/kernel/interactive
++
++The value is a simple boolean of 1 for on and 0 for off and is set to on by
++default. Disabling this will disable the near-determinism of MuQSS when
++selecting the next task by not examining all CPUs for the earliest deadline
++task, or which CPU to wake to, instead prioritising CPU balancing for improved
++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
++instead of across the whole system.
++
++Runqueue sharing.
++
++By default MuQSS chooses to share runqueue resources (specifically the skip
++list and locking) between multicore siblings. It is configurable at build time
++to select between None, SMT, MC and SMP, corresponding to no sharing, sharing
++only between simultaneous multithreading siblings, multicore siblings, or
++symmetric multiprocessing physical packages. Additionally it can be set at
++boot time with the use of the rqshare parameter. The reason for configurability
++is that some architectures have CPUs with many multicore siblings (>= 16)
++where it may be detrimental to throughput to share runqueues and another
++sharing option may be desirable. Additionally, more sharing than usual can
++improve latency on a system-wide level at the expense of throughput if desired.
++
++The options are:
++none, smt, mc, smp
++
++eg:
++ rqshare=mc
++
++Isochronous scheduling:
++
++Isochronous scheduling is a unique scheduling policy designed to provide
++near-real-time performance to unprivileged (ie non-root) users without the
++ability to starve the machine indefinitely. Isochronous tasks (which means
++"same time") are set using, for example, the schedtool application like so:
++
++ schedtool -I -e amarok
++
++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
++is that it has a priority level between true realtime tasks and SCHED_NORMAL
++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
++rate). However if ISO tasks run for more than a tunable finite amount of time,
++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
++time is the percentage of CPU available per CPU, configurable as a percentage in
++the following "resource handling" tunable (as opposed to a scheduler tunable):
++
++iso_cpu:
++
++ /proc/sys/kernel/iso_cpu
++
++and is set to 70% by default. It is calculated over a rolling 5 second average.
++Because it is the total CPU available, it means that on a multi CPU machine, it
++is possible to have an ISO task running with realtime scheduling indefinitely on
++just one CPU, as the other CPUs will be available. Setting this to 100 is the
++equivalent of giving all users SCHED_RR access and setting it to 0 removes the
++ability to run any pseudo-realtime tasks.
++
++A feature of MuQSS is that it detects when an application tries to obtain a
++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
++appropriate privileges to use those policies. When it detects this, it will
++give the task SCHED_ISO policy instead. Thus it is transparent to the user.
++
++
++Idleprio scheduling:
++
++Idleprio scheduling is a scheduling policy designed to give out CPU to a task
++_only_ when the CPU would be otherwise idle. The idea behind this is to allow
++ultra low priority tasks to be run in the background that have virtually no
++effect on the foreground tasks. This is ideally suited to distributed computing
++clients (like setiathome, folding, mprime etc) but can also be used to start a
++video encode or so on without any slowdown of other tasks. To prevent this policy
++from grabbing shared resources and holding them indefinitely, if it detects a
++state where the task is waiting on I/O, the machine is about to suspend to RAM
++and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has
++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without
++superuser privileges since it is effectively a lower scheduling policy. Tasks
++can be set to start as SCHED_IDLEPRIO with the schedtool command like so:
++
++schedtool -D -e ./mprime
++
++Subtick accounting:
++
++It is surprisingly difficult to get accurate CPU accounting, and in many cases,
++the accounting is done by simply determining what is happening at the precise
++moment a timer tick fires off. This becomes increasingly inaccurate as the timer
++tick frequency (HZ) is lowered. It is possible to create an application which
++uses almost 100% CPU, yet by being descheduled at the right time, records zero
++CPU usage. While the main problem with this is that there are possible security
++implications, it is also difficult to determine how much CPU a task really does
++use. Mux uses sub-tick accounting from the TSC clock to determine real CPU
++usage. Thus, the amount of CPU reported as being used by MuQSS will more
++accurately represent how much CPU the task itself is using (as is shown for
++example by the 'time' application), so the reported values may be quite
++different to other schedulers. When comparing throughput of MuQSS to other
++designs, it is important to compare the actual completed work in terms of total
++wall clock time taken and total work done, rather than the reported "cpu usage".
++
++Symmetric MultiThreading (SMT) aware nice:
++
++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the
++logical CPU count rises by adding thread units to each CPU core, allowing more
++than one task to be run simultaneously on the same core, the disadvantage of it
++is that the CPU power is shared between the tasks, not summing to the power
++of two CPUs. The practical upshot of this is that two tasks running on
++separate threads of the same core run significantly slower than if they had one
++core each to run on.
While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the nice task gets ++precisely the same amount of CPU power as the unniced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. ++ ++ ++Con Kolivas Sat, 29th October 2016 +diff --git a/Makefile b/Makefile +index b668725a2a62..73a4381d3ea9 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,6 +15,10 @@ NAME = Kleptomaniac Octopus + PHONY := _all + _all: + ++CKVERSION = -ck1 ++CKNAME = MuQSS Powered ++EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) ++ + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. + # +diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig +index ef179033a7c2..14b576a531ad 100644 +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -665,6 +665,8 @@ config HZ + default 1200 if HZ_1200 + default 1024 + ++source "kernel/Kconfig.MuQSS" ++ + config SRM_ENV + tristate "SRM environment through procfs" + depends on PROC_FS +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index a12656ec0072..b46b6ddc7636 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -29,7 +29,7 @@ CONFIG_ARC_PLAT_TB10X=y + CONFIG_ARC_CACHE_LINE_SHIFT=5 + CONFIG_HZ=250 + CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_COMPACTION is not set + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index c77c93c485a0..c16a89549ff2 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -1237,6 +1237,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config HAVE_ARM_SCU + bool + help +diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig +index 8e7a3ed2a4df..8a1ec6d2c3fb 100644 +--- a/arch/arm/configs/bcm2835_defconfig ++++ b/arch/arm/configs/bcm2835_defconfig +@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_ARCH_MULTI_V6=y + CONFIG_ARCH_BCM=y + CONFIG_ARCH_BCM2835=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_KSM=y + CONFIG_CLEANCACHE=y +diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig +index 5a20d12d62bd..fb76e6ff18a0 100644 +--- a/arch/arm/configs/imx_v6_v7_defconfig ++++ b/arch/arm/configs/imx_v6_v7_defconfig +@@ -45,6 +45,7 @@ CONFIG_PCI_MSI=y + CONFIG_PCI_IMX6=y + CONFIG_SMP=y + CONFIG_ARM_PSCI=y ++CONFIG_PREEMPT=y + CONFIG_HIGHMEM=y + CONFIG_FORCE_MAX_ZONEORDER=14 + CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" +diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig +index 1d923dbb9928..9c1931f1fafd 100644 +--- a/arch/arm/configs/mps2_defconfig ++++ b/arch/arm/configs/mps2_defconfig +@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y + CONFIG_SET_MEM_PARAM=y + CONFIG_DRAM_BASE=0x21000000 + CONFIG_DRAM_SIZE=0x1000000 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_ATAGS is not set + CONFIG_ZBOOT_ROM_TEXT=0x0 + CONFIG_ZBOOT_ROM_BSS=0x0 +diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig +index a9c6f32a9b1c..870866aaa39d 100644 +--- a/arch/arm/configs/mxs_defconfig ++++ b/arch/arm/configs/mxs_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT_VOLUNTARY=n + CONFIG_TASKSTATS=y + CONFIG_TASK_DELAY_ACCT=y + CONFIG_TASK_XACCT=y +@@ -25,6 +25,13 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_MODVERSIONS=y + CONFIG_BLK_DEV_INTEGRITY=y ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++# CONFIG_ARCH_MULTI_V7 is not set ++CONFIG_ARCH_MXS=y ++# CONFIG_ARM_THUMB is not set ++CONFIG_PREEMPT=y ++CONFIG_AEABI=y + CONFIG_NET=y + CONFIG_PACKET=y + CONFIG_UNIX=y +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 5d513f461957..7cb8456280be 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -942,6 +942,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config NR_CPUS + int "Maximum number of CPUs (2-4096)" + range 2 4096 +diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig +index 6466e83067b4..776d8783fc2a 100644 +--- a/arch/mips/configs/fuloong2e_defconfig ++++ b/arch/mips/configs/fuloong2e_defconfig +@@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig +index 9085f4d6c698..fb23111d45f6 100644 +--- a/arch/mips/configs/gpr_defconfig ++++ b/arch/mips/configs/gpr_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig +index 21a1168ae301..529a1b1007cf 100644 +--- a/arch/mips/configs/ip22_defconfig ++++ b/arch/mips/configs/ip22_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig +index 0921ef38e9fb..6da05cef46f8 100644 +--- a/arch/mips/configs/ip28_defconfig ++++ b/arch/mips/configs/ip28_defconfig +@@ -1,5 +1,5 @@ + CONFIG_SYSVIPC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig +index 8c223035921f..a3bf87450343 100644 +--- a/arch/mips/configs/jazz_defconfig ++++ b/arch/mips/configs/jazz_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_LOG_BUF_SHIFT=14 + CONFIG_RELAY=y +diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig +index 914af125a7fa..76a64290373f 100644 +--- a/arch/mips/configs/mtx1_defconfig ++++ b/arch/mips/configs/mtx1_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig +index 4ecb157e56d4..ea7309283b01 100644 +--- a/arch/mips/configs/nlm_xlr_defconfig ++++ b/arch/mips/configs/nlm_xlr_defconfig +@@ -1,10 +1,10 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_TASKSTATS=y +diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig +index 63fe2da1b37f..7f08ee237345 100644 +--- a/arch/mips/configs/pic32mzda_defconfig ++++ b/arch/mips/configs/pic32mzda_defconfig +@@ -1,7 +1,7 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + 
CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig +index 24e07180c57d..38582e8f71c4 100644 +--- a/arch/mips/configs/pistachio_defconfig ++++ b/arch/mips/configs/pistachio_defconfig +@@ -1,9 +1,9 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_DEFAULT_HOSTNAME="localhost" + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=m + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=18 +diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig +index d06db6b87959..fb2cd3234d95 100644 +--- a/arch/mips/configs/pnx8335_stb225_defconfig ++++ b/arch/mips/configs/pnx8335_stb225_defconfig +@@ -1,9 +1,9 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + # CONFIG_SWAP is not set + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_LOG_BUF_SHIFT=14 + CONFIG_EXPERT=y + CONFIG_SLAB=y +diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig +index 30d7c3db884e..9e68acfa0d0e 100644 +--- a/arch/mips/configs/rm200_defconfig ++++ b/arch/mips/configs/rm200_defconfig +@@ -1,6 +1,6 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig +new file mode 100644 +index 000000000000..578524f80cc4 +--- /dev/null ++++ b/arch/parisc/configs/712_defconfig +@@ -0,0 +1,181 @@ ++# CONFIG_LOCALVERSION_AUTO is not set ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=16 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_SLAB=y ++CONFIG_PROFILING=y ++CONFIG_OPROFILE=m ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++CONFIG_PA7100LC=y ++CONFIG_PREEMPT=y ++CONFIG_GSC_LASI=y ++# CONFIG_PDC_CHASSIS is not set ++CONFIG_BINFMT_MISC=m ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=m ++CONFIG_NET_KEY=m ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_IP_PNP_BOOTP=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_DIAG=m ++# CONFIG_IPV6 is not set ++CONFIG_NETFILTER=y ++CONFIG_LLC2=m ++CONFIG_NET_PKTGEN=m ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++# CONFIG_STANDALONE is not set ++# CONFIG_PREVENT_FIRMWARE_BUILD is not set ++CONFIG_PARPORT=y ++CONFIG_PARPORT_PC=m ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=6144 ++CONFIG_ATA_OVER_ETH=m ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++CONFIG_CHR_DEV_ST=y ++CONFIG_BLK_DEV_SR=y ++CONFIG_CHR_DEV_SG=y ++CONFIG_SCSI_ISCSI_ATTRS=m ++CONFIG_SCSI_LASI700=y ++CONFIG_SCSI_DEBUG=m ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=m ++CONFIG_MD_LINEAR=m ++CONFIG_MD_RAID0=m ++CONFIG_MD_RAID1=m ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_TUN=m ++CONFIG_LASI_82596=y ++CONFIG_PPP=m ++CONFIG_PPP_BSDCOMP=m ++CONFIG_PPP_DEFLATE=m ++CONFIG_PPP_MPPE=m ++CONFIG_PPPOE=m ++CONFIG_PPP_ASYNC=m ++CONFIG_PPP_SYNC_TTY=m ++# CONFIG_KEYBOARD_HIL_OLD is not set ++CONFIG_MOUSE_SERIAL=m ++CONFIG_LEGACY_PTY_COUNT=64 ++CONFIG_SERIAL_8250=y ++CONFIG_SERIAL_8250_CONSOLE=y ++CONFIG_SERIAL_8250_NR_UARTS=17 ++CONFIG_SERIAL_8250_EXTENDED=y ++CONFIG_SERIAL_8250_MANY_PORTS=y ++CONFIG_SERIAL_8250_SHARE_IRQ=y ++# CONFIG_SERIAL_MUX is not set 
++CONFIG_PDC_CONSOLE=y ++CONFIG_PRINTER=m ++CONFIG_PPDEV=m ++# CONFIG_HW_RANDOM is not set ++CONFIG_RAW_DRIVER=y ++# CONFIG_HWMON is not set ++CONFIG_FB=y ++CONFIG_FB_MODE_HELPERS=y ++CONFIG_FB_TILEBLITTING=y ++CONFIG_DUMMY_CONSOLE_COLUMNS=128 ++CONFIG_DUMMY_CONSOLE_ROWS=48 ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SEQUENCER=y ++CONFIG_SND_HARMONY=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT3_FS=y ++CONFIG_JFS_FS=m ++CONFIG_XFS_FS=m ++CONFIG_AUTOFS4_FS=y ++CONFIG_ISO9660_FS=y ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_UFS_FS=m ++CONFIG_NFS_FS=y ++CONFIG_NFS_V4=y ++CONFIG_ROOT_NFS=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_FS=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_MUTEXES=y ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++CONFIG_CRYPTO_DEFLATE=m ++# CONFIG_CRYPTO_HW is not set ++CONFIG_FONTS=y ++CONFIG_FONT_8x8=y ++CONFIG_FONT_8x16=y +diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig +new file mode 100644 +index 000000000000..d1bdfad94048 +--- /dev/null ++++ b/arch/parisc/configs/c3000_defconfig +@@ -0,0 +1,151 @@ ++# CONFIG_LOCALVERSION_AUTO is not set ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=16 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_SLAB=y ++CONFIG_PROFILING=y ++CONFIG_OPROFILE=m ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++CONFIG_PA8X00=y ++CONFIG_PREEMPT=y ++# CONFIG_GSC is not set ++CONFIG_PCI=y ++CONFIG_PCI_LBA=y ++# CONFIG_PDC_CHASSIS is not set ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=m ++CONFIG_NET_KEY=m ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_BOOTP=y ++# CONFIG_INET_DIAG is not set ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_NETFILTER=y ++CONFIG_NET_PKTGEN=m ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++# CONFIG_STANDALONE is not set ++# CONFIG_PREVENT_FIRMWARE_BUILD is not set ++CONFIG_BLK_DEV_UMEM=m 
++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_IDE=y ++CONFIG_BLK_DEV_IDECD=y ++CONFIG_BLK_DEV_NS87415=y ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++CONFIG_CHR_DEV_ST=y ++CONFIG_BLK_DEV_SR=y ++CONFIG_CHR_DEV_SG=y ++CONFIG_SCSI_ISCSI_ATTRS=m ++CONFIG_SCSI_SYM53C8XX_2=y ++CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0 ++CONFIG_SCSI_DEBUG=m ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=y ++CONFIG_MD_RAID0=y ++CONFIG_MD_RAID1=y ++CONFIG_BLK_DEV_DM=m ++CONFIG_DM_CRYPT=m ++CONFIG_DM_SNAPSHOT=m ++CONFIG_DM_MIRROR=m ++CONFIG_DM_ZERO=m ++CONFIG_DM_MULTIPATH=m ++CONFIG_FUSION=y ++CONFIG_FUSION_SPI=m ++CONFIG_FUSION_CTL=m ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_TUN=m ++CONFIG_ACENIC=m ++CONFIG_TIGON3=m ++CONFIG_NET_TULIP=y ++CONFIG_DE2104X=m ++CONFIG_TULIP=y ++CONFIG_TULIP_MMIO=y ++CONFIG_E100=m ++CONFIG_E1000=m ++CONFIG_PPP=m ++CONFIG_PPP_BSDCOMP=m ++CONFIG_PPP_DEFLATE=m ++CONFIG_PPPOE=m ++CONFIG_PPP_ASYNC=m ++CONFIG_PPP_SYNC_TTY=m ++# CONFIG_KEYBOARD_ATKBD is not set ++# CONFIG_MOUSE_PS2 is not set ++CONFIG_SERIO=m ++CONFIG_SERIO_LIBPS2=m ++CONFIG_SERIAL_8250=y ++CONFIG_SERIAL_8250_CONSOLE=y ++CONFIG_SERIAL_8250_NR_UARTS=13 ++CONFIG_SERIAL_8250_EXTENDED=y ++CONFIG_SERIAL_8250_MANY_PORTS=y ++CONFIG_SERIAL_8250_SHARE_IRQ=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_RAW_DRIVER=y ++# CONFIG_HWMON is not set ++CONFIG_FB=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SEQUENCER=y ++CONFIG_SND_AD1889=y ++CONFIG_USB_HIDDEV=y ++CONFIG_USB=y ++CONFIG_USB_OHCI_HCD=y ++CONFIG_USB_PRINTER=m ++CONFIG_USB_STORAGE=m ++CONFIG_USB_STORAGE_USBAT=m ++CONFIG_USB_STORAGE_SDDR09=m ++CONFIG_USB_STORAGE_SDDR55=m ++CONFIG_USB_STORAGE_JUMPSHOT=m ++CONFIG_USB_MDC800=m ++CONFIG_USB_MICROTEK=m ++CONFIG_USB_LEGOTOWER=m ++CONFIG_EXT2_FS=y ++CONFIG_EXT3_FS=y ++CONFIG_XFS_FS=m ++CONFIG_AUTOFS4_FS=y ++CONFIG_ISO9660_FS=y ++CONFIG_JOLIET=y ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_NFS_FS=y ++CONFIG_ROOT_NFS=y ++CONFIG_NFSD=y ++CONFIG_NFSD_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_ASCII=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_INSTALL=y ++CONFIG_HEADERS_CHECK=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_MUTEXES=y ++# CONFIG_DEBUG_BUGVERBOSE is not set ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_MD5=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_DES=m ++# CONFIG_CRYPTO_HW is not set +diff --git a/arch/parisc/configs/defconfig b/arch/parisc/configs/defconfig +new file mode 100644 +index 000000000000..0d976614934c +--- /dev/null ++++ b/arch/parisc/configs/defconfig +@@ -0,0 +1,206 @@ ++# CONFIG_LOCALVERSION_AUTO is not set ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=16 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_SLAB=y ++CONFIG_PROFILING=y ++CONFIG_OPROFILE=m ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++CONFIG_PA7100LC=y ++CONFIG_PREEMPT=y ++CONFIG_IOMMU_CCIO=y ++CONFIG_GSC_LASI=y ++CONFIG_GSC_WAX=y ++CONFIG_EISA=y ++CONFIG_PCI=y ++CONFIG_GSC_DINO=y ++CONFIG_PCI_LBA=y ++CONFIG_PCCARD=y ++CONFIG_YENTA=y ++CONFIG_PD6729=y ++CONFIG_I82092=y ++CONFIG_BINFMT_MISC=m ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=m ++CONFIG_NET_KEY=m 
++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_IP_PNP_BOOTP=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_DIAG=m ++CONFIG_INET6_AH=y ++CONFIG_INET6_ESP=y ++CONFIG_INET6_IPCOMP=y ++CONFIG_LLC2=m ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++# CONFIG_STANDALONE is not set ++# CONFIG_PREVENT_FIRMWARE_BUILD is not set ++CONFIG_PARPORT=y ++CONFIG_PARPORT_PC=m ++CONFIG_PARPORT_PC_PCMCIA=m ++CONFIG_PARPORT_1284=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=6144 ++CONFIG_IDE=y ++CONFIG_BLK_DEV_IDECS=y ++CONFIG_BLK_DEV_IDECD=y ++CONFIG_BLK_DEV_GENERIC=y ++CONFIG_BLK_DEV_NS87415=y ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++CONFIG_CHR_DEV_ST=y ++CONFIG_BLK_DEV_SR=y ++CONFIG_CHR_DEV_SG=y ++CONFIG_SCSI_LASI700=y ++CONFIG_SCSI_SYM53C8XX_2=y ++CONFIG_SCSI_ZALON=y ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=y ++CONFIG_MD_RAID0=y ++CONFIG_MD_RAID1=y ++CONFIG_MD_RAID10=y ++CONFIG_BLK_DEV_DM=y ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_TUN=m ++CONFIG_ACENIC=y ++CONFIG_TIGON3=y ++CONFIG_NET_TULIP=y ++CONFIG_TULIP=y ++CONFIG_LASI_82596=y ++CONFIG_PPP=m ++CONFIG_PPP_BSDCOMP=m ++CONFIG_PPP_DEFLATE=m ++CONFIG_PPPOE=m ++CONFIG_PPP_ASYNC=m ++CONFIG_PPP_SYNC_TTY=m ++# CONFIG_KEYBOARD_HIL_OLD is not set ++CONFIG_MOUSE_SERIAL=y ++CONFIG_LEGACY_PTY_COUNT=64 ++CONFIG_SERIAL_8250=y ++CONFIG_SERIAL_8250_CONSOLE=y ++CONFIG_SERIAL_8250_CS=y ++CONFIG_SERIAL_8250_NR_UARTS=17 ++CONFIG_SERIAL_8250_EXTENDED=y ++CONFIG_SERIAL_8250_MANY_PORTS=y ++CONFIG_SERIAL_8250_SHARE_IRQ=y ++CONFIG_PRINTER=m ++CONFIG_PPDEV=m ++# CONFIG_HW_RANDOM is not set ++# CONFIG_HWMON is not set ++CONFIG_FB=y ++CONFIG_FB_MODE_HELPERS=y ++CONFIG_FB_TILEBLITTING=y ++CONFIG_DUMMY_CONSOLE_COLUMNS=128 ++CONFIG_DUMMY_CONSOLE_ROWS=48 ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_DYNAMIC_MINORS=y ++CONFIG_SND_SEQUENCER=y ++CONFIG_SND_AD1889=y ++CONFIG_SND_HARMONY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_NTRIG=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_HID_TOPSEED=y ++CONFIG_USB=y ++CONFIG_USB_MON=y ++CONFIG_USB_OHCI_HCD=y ++CONFIG_USB_UHCI_HCD=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT3_FS=y ++CONFIG_ISO9660_FS=y ++CONFIG_JOLIET=y ++CONFIG_VFAT_FS=y ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_NFS_FS=y ++CONFIG_ROOT_NFS=y ++CONFIG_NFSD=y ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=y ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=m ++CONFIG_NLS_ISO8859_1=y ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m 
++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=y ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_INSTALL=y ++CONFIG_HEADERS_CHECK=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_MUTEXES=y ++CONFIG_KEYS=y ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++# CONFIG_CRYPTO_HW is not set ++CONFIG_LIBCRC32C=m ++CONFIG_FONTS=y +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index b29d7cb38368..3af947541fdc 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -879,6 +879,8 @@ config SCHED_SMT + when dealing with POWER5 cpus at a cost of slightly increased + overhead in some places. If unsure say N here. + ++source "kernel/Kconfig.MuQSS" ++ + config PPC_DENORMALISATION + bool "PowerPC denormalisation exception handling" + depends on PPC_BOOK3S_64 +diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig +index feb5d47d8d1e..6ce1ce306381 100644 +--- a/arch/powerpc/configs/ppc6xx_defconfig ++++ b/arch/powerpc/configs/ppc6xx_defconfig +@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y + CONFIG_MCU_MPC8349EMITX=y + CONFIG_HIGHMEM=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BINFMT_MISC=y + CONFIG_HIBERNATION=y + CONFIG_PM_DEBUG=y +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
+diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig +index 9a527f978106..5895f2cc726e 100644 +--- a/arch/sh/configs/se7712_defconfig ++++ b/arch/sh/configs/se7712_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=66666666 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" + CONFIG_NET=y +diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig +index 3b0e1eb6e874..e296a2cd9903 100644 +--- a/arch/sh/configs/se7721_defconfig ++++ b/arch/sh/configs/se7721_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_7721_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=33333333 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" + CONFIG_NET=y +diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig +index 4ec961ace688..a03a1ad670a0 100644 +--- a/arch/sh/configs/titan_defconfig ++++ b/arch/sh/configs/titan_defconfig +@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y + CONFIG_SH_PCLK_FREQ=30000000 + CONFIG_SH_DMA=y + CONFIG_SH_DMA_API=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" + CONFIG_PCI=y +diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig +index bde4d21a8ac8..c054ec82d91b 100644 +--- a/arch/sparc/configs/sparc64_defconfig ++++ b/arch/sparc/configs/sparc64_defconfig +@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NUMA=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_SUN_LDOMS=y + CONFIG_PCI=y + CONFIG_PCI_MSI=y +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 2d3f963fd6f1..4df276a5781b 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1006,6 +1006,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +@@ -1036,6 +1052,8 @@ config SCHED_MC_PRIO + + If unsure say Y here. + ++source "kernel/Kconfig.MuQSS" ++ + config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC +@@ -1423,7 +1441,7 @@ config HIGHMEM64G + endchoice + + choice +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + ---help--- +@@ -1443,17 +1461,17 @@ choice + option alone! 
+ + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig +index 550904591e94..b5e80947326e 100644 +--- a/arch/x86/configs/i386_defconfig ++++ b/arch/x86/configs/i386_defconfig +@@ -29,7 +29,7 @@ CONFIG_SMP=y + CONFIG_X86_GENERIC=y + CONFIG_HPET_TIMER=y + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_X86_REBOOTFIXUPS=y +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index 614961009075..05802ec44d19 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -27,7 +27,7 @@ CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_SMP=y + CONFIG_NR_CPUS=64 + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_MICROCODE=y +diff --git a/drivers/block/swim.c b/drivers/block/swim.c +index 4c297f69171d..5bc4f1be2617 100644 +--- a/drivers/block/swim.c ++++ b/drivers/block/swim.c +@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base, + if (swim_readbit(base, MOTOR_ON)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + } else if (action == OFF) { + swim_action(base, MOTOR_OFF); +@@ -347,7 +347,7 @@ static inline void swim_eject(struct swim __iomem *base) + if (!swim_readbit(base, DISK_IN)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + swim_select(base, RELAX); + } +@@ -371,7 +371,7 @@ static inline int swim_step(struct swim __iomem *base) + for (wait = 0; wait < HZ; wait++) { + + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + swim_select(base, RELAX); + if (!swim_readbit(base, STEP)) +diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c +index c48d8f086382..8a6e399936c7 100644 +--- a/drivers/char/ipmi/ipmi_msghandler.c ++++ b/drivers/char/ipmi/ipmi_msghandler.c +@@ -3543,7 +3543,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) + /* Current message first, to preserve order */ + while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { + /* Wait for the message to clear out. */ +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + /* No need for locks, the interface is down. */ +diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c +index 2704470e021d..49504b7f3aa9 100644 +--- a/drivers/char/ipmi/ipmi_ssif.c ++++ b/drivers/char/ipmi/ipmi_ssif.c +@@ -1295,7 +1295,7 @@ static void shutdown_ssif(void *send_info) + + /* make sure the driver is not looking for flags any more. 
*/ + while (ssif_info->ssif_state != SSIF_NORMAL) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + ssif_info->stopping = true; + del_timer_sync(&ssif_info->watch_timer); +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +index 6941689085ed..ec5a24e95401 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +@@ -235,7 +235,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, + DRM_ERROR("SVGA device lockup.\n"); + break; + } +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + if (interruptible && signal_pending(current)) { + ret = -ERESTARTSYS; + break; +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +index 75f3efee21a4..09b1932ce85b 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +@@ -203,7 +203,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, + break; + } + if (lazy) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + else if ((++count & 0x0F) == 0) { + /** + * FIXME: Use schedule_hr_timeout here for +diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c +index 267eac00a3fb..352af68c6cd7 100644 +--- a/drivers/hwmon/fam15h_power.c ++++ b/drivers/hwmon/fam15h_power.c +@@ -225,7 +225,7 @@ static ssize_t power1_average_show(struct device *dev, + prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; + } + +- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); ++ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); + if (leftover) + return 0; + +diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c +index d8c40a83097d..8332baf4961c 100644 +--- a/drivers/iio/light/tsl2563.c ++++ b/drivers/iio/light/tsl2563.c +@@ -269,11 +269,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) + default: + delay = 402; + } +- /* +- * TODO: Make sure that we wait at least required delay but why we +- * have to extend it one tick more? 
+- */ +- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); ++ schedule_msec_hrtimeout_interruptible(delay + 1); + } + + static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) +diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c +index 39530d43590e..a7caf2eb5771 100644 +--- a/drivers/media/i2c/msp3400-driver.c ++++ b/drivers/media/i2c/msp3400-driver.c +@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) + break; + dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) + break; + dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c +index cf7cfda94107..f63e17489547 100644 +--- a/drivers/media/pci/cx18/cx18-gpio.c ++++ b/drivers/media/pci/cx18/cx18-gpio.c +@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, + + /* Assert */ + gpio_update(cx, mask, ~active_lo); +- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); + + /* Deassert */ + gpio_update(cx, mask, ~active_hi); +- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); + } + + /* +diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c +index 856e7ab7f33e..766a26251337 100644 +--- a/drivers/media/pci/ivtv/ivtv-gpio.c ++++ b/drivers/media/pci/ivtv/ivtv-gpio.c +@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) + curout = (curout & ~0xF) | 1; + write_reg(curout, IVTV_REG_GPIO_OUT); + /* We could use something else for smaller time */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + curout |= 2; + write_reg(curout, IVTV_REG_GPIO_OUT); + curdir &= ~0x80; +@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) + curout = read_reg(IVTV_REG_GPIO_OUT); + curout &= ~(1 << itv->card->xceive_pin); + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + + curout |= 1 << itv->card->xceive_pin; + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + return 0; + } + +diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c +index 137853944e46..76830892f373 100644 +--- a/drivers/media/pci/ivtv/ivtv-ioctl.c ++++ b/drivers/media/pci/ivtv/ivtv-ioctl.c +@@ -1137,7 +1137,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) + TASK_UNINTERRUPTIBLE); + if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) + break; +- schedule_timeout(msecs_to_jiffies(25)); ++ schedule_msec_hrtimeout((25)); + } + finish_wait(&itv->vsync_waitq, &wait); + mutex_lock(&itv->serialize_lock); +diff --git a/drivers/media/pci/ivtv/ivtv-streams.c 
b/drivers/media/pci/ivtv/ivtv-streams.c +index f04ee84bab5f..c4469b4b8f99 100644 +--- a/drivers/media/pci/ivtv/ivtv-streams.c ++++ b/drivers/media/pci/ivtv/ivtv-streams.c +@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) + while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && + time_before(jiffies, + then + msecs_to_jiffies(2000))) { +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + } + + /* To convert jiffies to ms, we must multiply by 1000 +diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c +index cb0437b4c331..163fffc0e1d4 100644 +--- a/drivers/media/radio/radio-mr800.c ++++ b/drivers/media/radio/radio-mr800.c +@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, + retval = -ENODATA; + break; + } +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + retval = -ERESTARTSYS; + break; + } +diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c +index fb9de7bbcd19..e53cf45e7f3f 100644 +--- a/drivers/media/radio/radio-tea5777.c ++++ b/drivers/media/radio/radio-tea5777.c +@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) + } + + if (wait) { +- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) ++ if (schedule_msec_hrtimeout_interruptible((wait))) + return -ERESTARTSYS; + } + +diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c +index b0303cf00387..0925b5065147 100644 +--- a/drivers/media/radio/tea575x.c ++++ b/drivers/media/radio/tea575x.c +@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, + for (;;) { + if (time_after(jiffies, timeout)) + break; +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + /* some signal arrived, stop search */ + tea->val &= ~TEA575X_BIT_SEARCH; + snd_tea575x_set_freq(tea); +diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c +index b690796d24d4..448b13da62b4 100644 +--- a/drivers/mfd/ucb1x00-core.c ++++ b/drivers/mfd/ucb1x00-core.c +@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) + break; + /* yield to other processes */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + return UCB_ADC_DAT(val); +diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c +index 8e6607fc8a67..b9ab770bbdb5 100644 +--- a/drivers/misc/sgi-xp/xpc_channel.c ++++ b/drivers/misc/sgi-xp/xpc_channel.c +@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) + + atomic_inc(&ch->n_on_msg_allocate_wq); + prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); +- ret = schedule_timeout(1); ++ ret = schedule_min_hrtimeout(); + finish_wait(&ch->msg_allocate_wq, &wait); + atomic_dec(&ch->n_on_msg_allocate_wq); + +diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c +index bbb2575d4728..637757144221 100644 +--- a/drivers/net/caif/caif_hsi.c ++++ b/drivers/net/caif/caif_hsi.c +@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) + break; + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + retry--; + } + +diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c +index d2539c95adb6..0c2f31a03ce9 100644 +--- 
a/drivers/net/can/usb/peak_usb/pcan_usb.c ++++ b/drivers/net/can/usb/peak_usb/pcan_usb.c +@@ -242,7 +242,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) + } else { + /* the PCAN-USB needs time to init */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); ++ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); + } + + return err; +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index eccbf4cd7149..03d285f022b0 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -2670,7 +2670,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) + while (!skb_queue_empty(&dev->rxq) && + !skb_queue_empty(&dev->txq) && + !skb_queue_empty(&dev->done)) { +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + netif_dbg(dev, ifdown, dev->net, + "waited for %d urb completions\n", temp); +diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c +index 5ec97def3513..9e2bf55bbccd 100644 +--- a/drivers/net/usb/usbnet.c ++++ b/drivers/net/usb/usbnet.c +@@ -767,7 +767,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) + spin_lock_irqsave(&q->lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(&q->lock, flags); +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irqsave(&q->lock, flags); + } +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +index 97ea6e2035e6..1c693729bbd3 100644 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +@@ -816,7 +816,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, + * doesn't seem to have as many firmware restart cycles... + * + * As a test, we're sticking in a 1/100s delay here */ +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + return 0; + +@@ -1267,7 +1267,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) + IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); + i = 5000; + do { +- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_uninterruptible((40)); + /* Todo... wait for sync command ... */ + + read_register(priv->net_dev, IPW_REG_INTA, &inta); +diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c +index 90fb73575495..c94048b048a5 100644 +--- a/drivers/parport/ieee1284.c ++++ b/drivers/parport/ieee1284.c +@@ -208,7 +208,7 @@ int parport_wait_peripheral(struct parport *port, + /* parport_wait_event didn't time out, but the + * peripheral wasn't actually ready either. + * Wait for another 10ms. */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + } + +diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c +index 5d41dda6da4e..34705f6b423f 100644 +--- a/drivers/parport/ieee1284_ops.c ++++ b/drivers/parport/ieee1284_ops.c +@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, + /* Yield the port for a while. 
*/ + if (count && dev->port->irq != PARPORT_IRQ_NONE) { + parport_release (dev); +- schedule_timeout_interruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_interruptible((40)); + parport_claim_or_block (dev); + } + else +diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c +index bffe548187ee..c2918ee3e100 100644 +--- a/drivers/platform/x86/intel_ips.c ++++ b/drivers/platform/x86/intel_ips.c +@@ -798,7 +798,7 @@ static int ips_adjust(void *data) + ips_gpu_lower(ips); + + sleep: +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); + } while (!kthread_should_stop()); + + dev_dbg(ips->dev, "ips-adjust thread stopped\n"); +@@ -974,7 +974,7 @@ static int ips_monitor(void *data) + seqno_timestamp = get_jiffies_64(); + + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + + /* Collect an initial average */ + for (i = 0; i < IPS_SAMPLE_COUNT; i++) { +@@ -1001,7 +1001,7 @@ static int ips_monitor(void *data) + mchp_samples[i] = mchp; + } + +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + if (kthread_should_stop()) + break; + } +@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data) + * us to reduce the sample frequency if the CPU and GPU are idle. + */ + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + last_sample_period = IPS_SAMPLE_PERIOD; + + timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); +diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c +index 2018614f258f..fc19b312c345 100644 +--- a/drivers/rtc/rtc-wm8350.c ++++ b/drivers/rtc/rtc-wm8350.c +@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); + + if (!retries) { +@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); + + if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) +@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) + /* Wait until confirmation */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); + + if (rtc_ctrl & WM8350_RTC_ALMSTS) +diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c +index b60795893994..d2d05691dbd2 100644 +--- a/drivers/scsi/fnic/fnic_scsi.c ++++ b/drivers/scsi/fnic/fnic_scsi.c +@@ -216,7 +216,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) + + /* wait for io cmpl */ + while (atomic_read(&fnic->in_flight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + spin_lock_irqsave(&fnic->wq_copy_lock[0], 
flags); + +@@ -2277,7 +2277,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, + } + } + +- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); ++ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); + + /* walk again to check, if IOs are still pending in fw */ + if (fnic_is_abts_pending(fnic, lr_sc)) +diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c +index ad62fb3f3a54..a84d4c99d7d7 100644 +--- a/drivers/scsi/lpfc/lpfc_scsi.c ++++ b/drivers/scsi/lpfc/lpfc_scsi.c +@@ -5191,7 +5191,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, + tgt_id, lun_id, context); + later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; + while (time_after(later, jiffies) && cnt) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); ++ schedule_msec_hrtimeout_uninterruptible((20)); + cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); + } + if (cnt) { +diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c +index b3650c989ed4..7ed1fb285754 100644 +--- a/drivers/scsi/snic/snic_scsi.c ++++ b/drivers/scsi/snic/snic_scsi.c +@@ -2353,7 +2353,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) + + /* Wait for all the IOs that are entered in Qcmd */ + while (atomic_read(&snic->ios_inflight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + ret = snic_issue_hba_reset(snic, sc); + if (ret) { +diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c +index d99f4065b96d..15f870d4e95f 100644 +--- a/drivers/staging/comedi/drivers/ni_mio_common.c ++++ b/drivers/staging/comedi/drivers/ni_mio_common.c +@@ -4748,7 +4748,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) + if ((status & NI67XX_CAL_STATUS_BUSY) == 0) + break; + set_current_state(TASK_INTERRUPTIBLE); +- if (schedule_timeout(1)) ++ if (schedule_min_hrtimeout()) + return -EIO; + } + if (i == timeout) { +diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c +index be0053c795b7..cc2e18c733e1 100644 +--- a/drivers/staging/rts5208/rtsx.c ++++ b/drivers/staging/rts5208/rtsx.c +@@ -490,7 +490,7 @@ static int rtsx_polling_thread(void *__dev) + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); ++ schedule_msec_hrtimeout((POLLING_INTERVAL)); + + /* lock the device pointers */ + mutex_lock(&dev->dev_mutex); +diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c +index c94328a5bd4a..6e7d4671aa69 100644 +--- a/drivers/staging/speakup/speakup_acntpc.c ++++ b/drivers/staging/speakup/speakup_acntpc.c +@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c +index 0877b4044c28..627102d048c1 100644 +--- 
a/drivers/staging/speakup/speakup_apollo.c ++++ b/drivers/staging/speakup/speakup_apollo.c +@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) + if (!synth->io_ops->synth_out(synth, ch)) { + synth->io_ops->tiocmset(0, UART_MCR_RTS); + synth->io_ops->tiocmset(UART_MCR_RTS, 0); +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c +index ddbb7e97d118..f9502addc765 100644 +--- a/drivers/staging/speakup/speakup_decext.c ++++ b/drivers/staging/speakup/speakup_decext.c +@@ -176,7 +176,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c +index 798c42dfa16c..d85b41db67a3 100644 +--- a/drivers/staging/speakup/speakup_decpc.c ++++ b/drivers/staging/speakup/speakup_decpc.c +@@ -394,7 +394,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (dt_sendchar(ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c +index dccb4ea29d37..8ecead307d04 100644 +--- a/drivers/staging/speakup/speakup_dectlk.c ++++ b/drivers/staging/speakup/speakup_dectlk.c +@@ -244,7 +244,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c +index dbebed0eeeec..6d83c13ca4a6 100644 +--- a/drivers/staging/speakup/speakup_dtlk.c ++++ b/drivers/staging/speakup/speakup_dtlk.c +@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + jiffy_delta_val = jiffy_delta->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c +index 414827e888fc..cb31c9176daa 100644 +--- a/drivers/staging/speakup/speakup_keypc.c ++++ b/drivers/staging/speakup/speakup_keypc.c +@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + 
set_current_state(TASK_RUNNING); +@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c +index 3568bfb89912..0a80b3b098b2 100644 +--- a/drivers/staging/speakup/synth.c ++++ b/drivers/staging/speakup/synth.c +@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (ch == '\n') + ch = synth->procspeech; +- if (unicode) +- ret = synth->io_ops->synth_out_unicode(synth, ch); +- else +- ret = synth->io_ops->synth_out(synth, ch); +- if (!ret) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ if (!synth->io_ops->synth_out(synth, ch)) { ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth->io_ops->synth_out(synth, synth->procspeech)) +- schedule_timeout( +- msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + else +- schedule_timeout( +- msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c +index 0433536930a9..d8726f28843f 100644 +--- a/drivers/staging/unisys/visornic/visornic_main.c ++++ b/drivers/staging/unisys/visornic/visornic_main.c +@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + if (atomic_read(&devdata->usage)) + break; +@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c +index cfe63932f825..71c00ef772a3 100644 +--- a/drivers/video/fbdev/omap/hwa742.c ++++ b/drivers/video/fbdev/omap/hwa742.c +@@ -913,7 +913,7 @@ static void hwa742_resume(void) + if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(5)); ++ schedule_msec_hrtimeout((5)); + } + hwa742_set_update_mode(hwa742.update_mode_before_suspend); + } +diff --git a/drivers/video/fbdev/pxafb.c 
b/drivers/video/fbdev/pxafb.c +index 00b96a78676e..37fc1c2d4cb9 100644 +--- a/drivers/video/fbdev/pxafb.c ++++ b/drivers/video/fbdev/pxafb.c +@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) + mutex_unlock(&fbi->ctrlr_lock); + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(30)); ++ schedule_msec_hrtimeout((30)); + } + + pr_debug("%s(): task ending\n", __func__); +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index 6009e0e939b5..43868e6a85dc 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -91,7 +91,7 @@ static int caching_kthread(void *data) + btrfs_release_path(path); + root->ino_cache_progress = last; + up_read(&fs_info->commit_root_sem); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + goto again; + } else + continue; +diff --git a/fs/proc/base.c b/fs/proc/base.c +index eb2255e95f62..62b8cedbccb6 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/freezer.h b/include/linux/freezer.h +index 21f5aa0b217f..ee9b46394fdf 100644 +--- a/include/linux/freezer.h ++++ b/include/linux/freezer.h +@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} + #define wait_event_freezekillable_unsafe(wq, condition) \ + wait_event_killable(wq, condition) + ++#define pm_freezing (false) + #endif /* !CONFIG_FREEZER */ + + #endif /* FREEZER_H_INCLUDED */ +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..73417df5daa2 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index e9bfe6972aed..16ba1c7e5bde 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -53,6 +53,8 @@ enum { + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4418f5cb8324..71e3063c06b3 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,9 @@ + #include + #include + #include ++#ifdef CONFIG_SCHED_MUQSS ++#include ++#endif + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -214,13 +217,40 @@ struct task_group; + + extern void scheduler_tick(void); + +-#define MAX_SCHEDULE_TIMEOUT LONG_MAX +- ++#define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern long schedule_timeout(long timeout); + extern long schedule_timeout_interruptible(long timeout); + extern long schedule_timeout_killable(long timeout); + extern long schedule_timeout_uninterruptible(long timeout); + extern long schedule_timeout_idle(long timeout); ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++extern long schedule_msec_hrtimeout(long timeout); ++extern long schedule_min_hrtimeout(void); ++extern long schedule_msec_hrtimeout_interruptible(long timeout); ++extern long 
schedule_msec_hrtimeout_uninterruptible(long timeout); ++#else ++static inline long schedule_msec_hrtimeout(long timeout) ++{ ++ return schedule_timeout(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_min_hrtimeout(void) ++{ ++ return schedule_timeout(1); ++} ++ ++static inline long schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); ++} ++#endif ++ + asmlinkage void schedule(void); + extern void schedule_preempt_disabled(void); + asmlinkage void preempt_schedule_irq(void); +@@ -652,9 +682,11 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) ++ int on_cpu; ++#endif + #ifdef CONFIG_SMP + struct llist_node wake_entry; +- int on_cpu; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -679,10 +711,25 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -850,6 +897,10 @@ struct task_struct { + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; ++#endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; + #endif + u64 gtime; + struct prev_cputime prev_cputime; +@@ -1306,6 +1357,40 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..73d6319a856a 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) + #ifdef CONFIG_SMP + + struct root_domain; ++#ifdef CONFIG_SCHED_MUQSS ++static inline void dl_clear_root_domain(struct root_domain *rd) ++{ ++} ++static inline void dl_add_task_root_domain(struct task_struct *p) ++{ ++} ++#else /* CONFIG_SCHED_MUQSS */ + extern void dl_add_task_root_domain(struct 
task_struct *p); + extern void dl_clear_root_domain(struct root_domain *rd); ++#endif /* CONFIG_SCHED_MUQSS */ + + #endif /* CONFIG_SMP */ +diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +index 6d67e9a5af6b..101fe470aa8f 100644 +--- a/include/linux/sched/nohz.h ++++ b/include/linux/sched/nohz.h +@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); + static inline void nohz_balance_enter_idle(int cpu) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_remote(struct rq *rq); + void calc_load_nohz_stop(void); +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..43c9d9e50c09 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..010b2244e0b6 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_MUQSS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 38359071236a..e2ebedb6512c 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -106,7 +106,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..d4be84ba273b +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header */ ++} skiplist; ++ ++void skiplist_init(skiplist_node *slnode); ++skiplist *new_skiplist(skiplist_node *slnode); ++void free_skiplist(skiplist *l); ++void skiplist_node_init(skiplist_node *node); ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); ++void skiplist_delete(skiplist *l, skiplist_node *node); ++ ++static inline bool skiplist_node_empty(skiplist_node *node) { ++ return (!node->next[0]); ++} ++#endif /* _LINUX_SKIP_LISTS_H */ +diff --git 
a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..f48c5c5da651 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -115,9 +115,16 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on MuQSS only */ + #define SCHED_IDLE 5 ++#ifdef CONFIG_SCHED_MUQSS ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#else /* CONFIG_SCHED_MUQSS */ + #define SCHED_DEADLINE 6 ++#endif /* CONFIG_SCHED_MUQSS */ + + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/Kconfig b/init/Kconfig +index 74a5ac65644f..44bba84664f3 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -61,6 +61,18 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_MUQSS ++ bool "MuQSS cpu scheduler" ++ select HIGH_RES_TIMERS ++ ---help--- ++ The Multiple Queue Skiplist Scheduler for excellent interactivity and ++ responsiveness on the desktop and highly scalable deterministic ++ low latency on any hardware. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -440,7 +452,7 @@ config HAVE_SCHED_AVG_IRQ + + config SCHED_THERMAL_PRESSURE + bool "Enable periodic averaging of thermal pressure" +- depends on SMP ++ depends on SMP && !SCHED_MUQSS + + config BSD_PROCESS_ACCT + bool "BSD Process Accounting" +@@ -777,6 +789,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_MUQSS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -876,9 +889,13 @@ menuconfig CGROUP_SCHED + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group +- tasks. ++ tasks. In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1007,6 +1024,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
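All of the driver hunks above apply a single conversion: polling sleeps written as schedule_timeout(msecs_to_jiffies(ms)), schedule_timeout_interruptible(...) or schedule_timeout(1) become the millisecond-accurate schedule_msec_hrtimeout*() and schedule_min_hrtimeout() helpers declared in the include/linux/sched.h hunk, which fall back to the old jiffy-based calls when CONFIG_HIGH_RES_TIMERS is off. The following is an illustrative sketch of that idiom on a made-up driver wait loop, not part of the patch; example_wait_ready() and ready() are hypothetical names:

#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static int example_wait_ready(bool (*ready)(void))
{
        int retries = 100;

        while (!ready() && retries--) {
                /*
                 * Old style: schedule_timeout_interruptible(msecs_to_jiffies(1))
                 * sleeps at least one full jiffy (10 ms at HZ=100).
                 * New style: roughly 1 ms when CONFIG_HIGH_RES_TIMERS is set,
                 * identical jiffy behaviour otherwise via the sched.h fallback.
                 */
                if (schedule_msec_hrtimeout_interruptible(1))
                        return -ERESTARTSYS;    /* woken early, e.g. by a signal */
        }

        /* Shortest sensible sleep; this replaces schedule_timeout(1). */
        set_current_state(TASK_INTERRUPTIBLE);
        schedule_min_hrtimeout();

        return ready() ? 0 : -ETIMEDOUT;
}

This matters most on a MuQSS build, since SCHED_MUQSS selects HIGH_RES_TIMERS and the Kconfig.hz change below defaults it to HZ=100, where the old jiffy-granularity sleeps would otherwise stretch to 10 ms.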
+@@ -1134,6 +1152,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index bd403ed3e418..5df65b2578eb 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -67,9 +67,17 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_MUQSS ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, ++ .time_slice = 1000000, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -79,6 +87,7 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifndef CONFIG_SCHED_MUQSS + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -86,6 +95,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/init/main.c b/init/main.c +index 03371976d387..63243a24de9b 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -1411,6 +1411,8 @@ static int __ref kernel_init(void *unused) + + rcu_end_inkernel_boot(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS +new file mode 100644 +index 000000000000..a6a58781ef91 +--- /dev/null ++++ b/kernel/Kconfig.MuQSS +@@ -0,0 +1,105 @@ ++choice ++ prompt "CPU scheduler runqueue sharing" ++ default RQ_MC if SCHED_MUQSS ++ default RQ_NONE ++ ++config RQ_NONE ++ bool "No sharing" ++ help ++ This is the default behaviour where the CPU scheduler has one runqueue ++ per CPU, whether it is a physical or logical CPU (hyperthread). ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=none ++ ++ If unsure, say N. ++ ++config RQ_SMT ++ bool "SMT (hyperthread) siblings" ++ depends on SCHED_SMT && SCHED_MUQSS ++ ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by SMT (hyperthread) siblings. As these logical cores share ++ one physical core, sharing the runqueue resource can lead to decreased ++ overhead, lower latency and higher throughput. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smt ++ ++ If unsure, say N. ++ ++config RQ_MC ++ bool "Multicore siblings" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by multicore siblings in addition to any SMT siblings. ++ As these physical cores share caches, sharing the runqueue resource ++ will lead to lower latency, but its effects on overhead and throughput ++ are less predictable. As a general rule, 6 or fewer cores will likely ++ benefit from this, while larger CPUs will only derive a latency ++ benefit. If your workloads are primarily single threaded, this will ++ possibly worsen throughput. If you are only concerned about latency ++ then enable this regardless of how many cores you have. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=mc ++ ++ If unsure, say Y. 
++ ++config RQ_MC_LLC ++ bool "Multicore siblings (LLC)" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will behave similarly as ++ with "Multicore siblings". ++ This option takes LLC cache into account when scheduling tasks. ++ Option may benefit CPUs with multiple LLC caches, such as Ryzen ++ and Xeon CPUs. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=llc ++ ++ If unsure, say N. ++ ++config RQ_SMP ++ bool "Symmetric Multi-Processing" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by all physical CPUs unless they are on separate NUMA nodes. ++ As physical CPUs usually do not share resources, sharing the runqueue ++ will normally worsen throughput but improve latency. If you only ++ care about latency enable this. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smp ++ ++ If unsure, say N. ++ ++config RQ_ALL ++ bool "NUMA" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ regardless of the architecture configuration, including across NUMA ++ nodes. This can substantially decrease throughput in NUMA ++ configurations, but light NUMA designs will not be dramatically ++ affected. This option should only be chosen if latency is the prime ++ concern. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=all ++ ++ If unsure, say N. ++endchoice ++ ++config SHARERQ ++ int ++ default 0 if RQ_NONE ++ default 1 if RQ_SMT ++ default 2 if RQ_MC ++ default 3 if RQ_MC_LLC ++ default 4 if RQ_SMP ++ default 5 if RQ_ALL +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..89ed751ac4e4 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,8 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_100 if SCHED_MUQSS ++ default HZ_250_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -20,11 +21,18 @@ choice + config HZ_100 + bool "100 HZ" + help ++ 100 Hz is a suitable choice in combination with MuQSS which does ++ not rely on ticks for rescheduling interrupts, and is not Hz limited ++ for timeouts and sleeps from both the kernel and userspace. ++ This allows us to benefit from the lower overhead and higher ++ throughput of fewer timer ticks. ++ ++ Non-MuQSS kernels: + 100 Hz is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEF + bool "250 HZ" + help + 250 Hz is a good compromise choice allowing server performance +@@ -32,7 +40,10 @@ choice + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + +- config HZ_300 ++ 250 Hz is the default choice for the mainline scheduler but not ++ advantageous in combination with MuQSS. ++ ++ config HZ_300_NODEF + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance +@@ -40,7 +51,7 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. 
+ +- config HZ_1000 ++ config HZ_1000_NODEF + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +@@ -51,9 +62,9 @@ endchoice + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 +- default 300 if HZ_300 +- default 1000 if HZ_1000 ++ default 250 if HZ_250_NODEF ++ default 300 if HZ_300_NODEF ++ default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index bf82259cff96..d9438eb6f91c 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -2,7 +2,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -18,7 +18,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + depends on !ARCH_NO_PREEMPT + help + This option reduces the latency of the kernel by adding more +@@ -33,7 +33,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). + + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +diff --git a/kernel/Makefile b/kernel/Makefile +index 4cb4130ced32..b11afae9eea8 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -10,7 +10,7 @@ obj-y = fork.o exec_domain.o panic.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ + notifier.o ksysfs.o cred.o reboot.o \ +- async.o range.o smpboot.o ucount.o ++ async.o range.o smpboot.o ucount.o skip_list.o + + obj-$(CONFIG_MODULES) += kmod.o + obj-$(CONFIG_MULTIUSER) += groups.o +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index ce2a75bc0ade..f0f864bc1ab9 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig +index 20d501af4f2e..f92cabe495bd 100644 +--- a/kernel/irq/Kconfig ++++ b/kernel/irq/Kconfig +@@ -115,6 +115,23 @@ config GENERIC_IRQ_RESERVATION_MODE + config IRQ_FORCED_THREADING + bool + ++config FORCE_IRQ_THREADING ++ bool "Make IRQ threading compulsory" ++ depends on IRQ_FORCED_THREADING ++ default n ++ ---help--- ++ ++ Make IRQ threading mandatory for any IRQ handlers that support it ++ instead of being optional and 
requiring the threadirqs kernel ++ parameter. Instead they can be optionally disabled with the ++ nothreadirqs kernel parameter. ++ ++ Enabling this may make some architectures not boot with runqueue ++ sharing and MuQSS. ++ ++ Enable if you are building for a desktop or low latency system, ++ otherwise say N. ++ + config SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ + ---help--- +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 453a8a0f4804..2f14a31d8efd 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -25,9 +25,20 @@ + #include "internals.h" + + #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) ++#ifdef CONFIG_FORCE_IRQ_THREADING ++__read_mostly bool force_irqthreads = true; ++#else + __read_mostly bool force_irqthreads; ++#endif + EXPORT_SYMBOL_GPL(force_irqthreads); + ++static int __init setup_noforced_irqthreads(char *arg) ++{ ++ force_irqthreads = false; ++ return 0; ++} ++early_param("nothreadirqs", setup_noforced_irqthreads); ++ + static int __init setup_forced_irqthreads(char *arg) + { + force_irqthreads = true; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index bfbfa481be3a..f5942fb29ba8 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -446,6 +446,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ __do_set_cpus_allowed(p, cpumask_of(cpu)); ++ p->flags |= PF_NO_SETAFFINITY; ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++#else ++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) ++#endif ++ + /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). +@@ -467,7 +495,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + cpu); + if (IS_ERR(p)) + return p; +- kthread_bind(p, cpu); ++ new_kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. */ + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..825f9b8e228f 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) + { + static char err_buf[STACK_ERR_BUF_SIZE]; + struct rq *rq; +- struct rq_flags flags; ++ struct rq_flags rf; + int ret; + bool success = false; + +@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) + * functions. If all goes well, switch the task to the target patch + * state. 
+ */ +- rq = task_rq_lock(task, &flags); ++ rq = task_rq_lock(task, &rf); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, +@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) + task->patch_state = klp_target_state; + + done: +- task_rq_unlock(rq, task, &flags); ++ task_rq_unlock(rq, task, &rf); + + /* + * Due to console deadlock issues, pr_debug() can't be used while +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..a04ffebc6b7a 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++ ++obj-$(CONFIG_SMP) += topology.o ++else + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle.o fair.o rt.o deadline.o + obj-y += wait.o wait_bit.o swait.o completion.o + + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +new file mode 100644 +index 000000000000..18a9b4a23e44 +--- /dev/null ++++ b/kernel/sched/MuQSS.c +@@ -0,0 +1,7624 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. 
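MuQSS.c below builds on the skip list added in include/linux/skip_list.h earlier in this patch: task_struct gains an embedded skiplist_node, and the runqueues keep runnable tasks sorted in such lists. As an illustrative sketch only (every demo_* name is hypothetical, and the semantics assumed here are inferred from the header, not guaranteed), the API is used roughly like this:

#include <linux/skip_list.h>
#include <linux/errno.h>
#include <linux/types.h>

struct demo_task {
        u64 deadline;
        skiplist_node node;             /* embedded, like task_struct::node above */
};

static skiplist_node demo_sentinel;     /* backs the list header */
static skiplist *demo_list;

static int demo_init(void)
{
        skiplist_init(&demo_sentinel);
        demo_list = new_skiplist(&demo_sentinel);
        return demo_list ? 0 : -ENOMEM;
}

static void demo_enqueue(struct demo_task *t, unsigned int randseed)
{
        skiplist_node_init(&t->node);
        /* Sorted insert keyed by deadline; randseed picks the node's level. */
        skiplist_insert(demo_list, &t->node, t->deadline, t, randseed);
}

static struct demo_task *demo_peek_first(void)
{
        if (!demo_list->entries)
                return NULL;
        /* Level-0 links run in key order, so the header's next[0] should be
         * the entry with the smallest key. */
        return demo_list->header->next[0]->value;
}

static void demo_dequeue(struct demo_task *t)
{
        if (!skiplist_node_empty(&t->node))     /* only if still queued */
                skiplist_delete(demo_list, &t->node);
}

Each node carries doubly linked next/prev arrays capped at 8 levels, which is presumably why skiplist_delete() needs only the node itself and no search.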
++ * 2019-08-31 LLC bits by Eduards Bezverhijs ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#include "MuQSS.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) ++#define JIFFY_NS (APPROX_NS_PS / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.202 by Con Kolivas.\n"); ++} ++ ++/* Define RQ share levels */ ++#define RQSHARE_NONE 0 ++#define RQSHARE_SMT 1 ++#define RQSHARE_MC 2 ++#define RQSHARE_MC_LLC 3 ++#define RQSHARE_SMP 4 ++#define RQSHARE_ALL 5 ++ ++/* Define locality levels */ ++#define LOCALITY_SAME 0 ++#define LOCALITY_SMT 1 ++#define LOCALITY_MC_LLC 2 ++#define LOCALITY_MC 3 ++#define LOCALITY_SMP 4 ++#define LOCALITY_DISTANT 5 ++ ++/* ++ * This determines what level of runqueue sharing will be done and is ++ * configurable at boot time with the bootparam rqshare = ++ */ ++static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ ++ ++static int __init set_rqshare(char *str) ++{ ++ if (!strncmp(str, "none", 4)) { ++ rqshare = RQSHARE_NONE; ++ return 0; ++ } ++ if (!strncmp(str, "smt", 3)) { ++ rqshare = RQSHARE_SMT; ++ return 0; ++ } ++ if (!strncmp(str, "mc", 2)) { ++ rqshare = RQSHARE_MC; ++ return 0; ++ } ++ if (!strncmp(str, "llc", 3)) { ++ rqshare = RQSHARE_MC_LLC; ++ return 0; ++ } ++ if (!strncmp(str, "smp", 3)) { ++ rqshare = RQSHARE_SMP; ++ return 0; ++ } ++ if (!strncmp(str, "all", 3)) { ++ rqshare = RQSHARE_ALL; ++ return 0; ++ } ++ return 1; ++} ++__setup("rqshare=", set_rqshare); ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. 
++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifdef CONFIG_SMP ++/* ++ * Total number of runqueues. Equals number of CPUs when there is no runqueue ++ * sharing but is usually less with SMT/MC sharing of runqueues. ++ */ ++static int total_runqueues __read_mostly = 1; ++ ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++ ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. ++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static void update_irq_load_avg(struct rq *rq, long delta); ++#else ++static inline void update_irq_load_avg(struct rq *rq, long delta) {} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
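++ * For example (numbers purely illustrative): if 2ms of irq time is observed
++ * against a 1.5ms clock delta, only 1.5ms is charged now; prev_irq_time
++ * advances by that clamped amount, so the remaining 0.5ms surfaces as
++ * irq_delta on the next update.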
++ */
++ if (irq_delta > delta)
++ irq_delta = delta;
++
++ rq->prev_irq_time += irq_delta;
++ delta -= irq_delta;
++#endif
++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
++ if (static_key_false((&paravirt_steal_rq_enabled))) {
++ steal = paravirt_steal_clock(cpu_of(rq));
++ steal -= rq->prev_steal_time_rq;
++
++ if (unlikely(steal > delta))
++ steal = delta;
++
++ rq->prev_steal_time_rq += steal;
++ delta -= steal;
++ }
++#endif
++ rq->clock_task += delta;
++
++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
++ if (irq_delta + steal)
++ update_irq_load_avg(rq, irq_delta + steal);
++#endif
++}
++
++static inline void update_rq_clock(struct rq *rq)
++{
++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
++
++ if (unlikely(delta < 0))
++ return;
++ rq->clock += delta;
++ update_rq_clock_task(rq, delta);
++}
++
++/*
++ * Niffies are a globally increasing nanosecond counter. They're only used by
++ * update_load_avg and time_slice_expired, however deadlines are based on them
++ * across CPUs. Update them whenever we will call one of those functions, and
++ * synchronise them across CPUs whenever we hold both runqueue locks.
++ */
++static inline void update_clocks(struct rq *rq)
++{
++ s64 ndiff, minndiff;
++ long jdiff;
++
++ update_rq_clock(rq);
++ ndiff = rq->clock - rq->old_clock;
++ rq->old_clock = rq->clock;
++ jdiff = jiffies - rq->last_jiffy;
++
++ /* Subtract any niffies added by balancing with other rqs */
++ ndiff -= rq->niffies - rq->last_niffy;
++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies;
++ if (minndiff < 0)
++ minndiff = 0;
++ ndiff = max(ndiff, minndiff);
++ rq->niffies += ndiff;
++ rq->last_niffy = rq->niffies;
++ if (jdiff) {
++ rq->last_jiffy += jdiff;
++ rq->last_jiffy_niffies = rq->niffies;
++ }
++}
++
++/*
++ * Any time we have two runqueues locked we use that as an opportunity to
++ * synchronise niffies to the highest value as idle ticks may have artificially
++ * kept niffies low on one CPU and the truth can only be later.
++ */
++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2)
++{
++ if (rq1->niffies > rq2->niffies)
++ rq2->niffies = rq1->niffies;
++ else
++ rq1->niffies = rq2->niffies;
++}
++
++/*
++ * double_rq_lock - safely lock two runqueues
++ *
++ * Note this does not disable interrupts like task_rq_lock,
++ * you need to do so manually before calling.
++ */
++
++/* For when we know rq1 != rq2 */
++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2)
++ __acquires(rq1->lock)
++ __acquires(rq2->lock)
++{
++ if (rq1 < rq2) {
++ raw_spin_lock(rq1->lock);
++ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING);
++ } else {
++ raw_spin_lock(rq2->lock);
++ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING);
++ }
++}
++
++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
++ __acquires(rq1->lock)
++ __acquires(rq2->lock)
++{
++ BUG_ON(!irqs_disabled());
++ if (rq1->lock == rq2->lock) {
++ raw_spin_lock(rq1->lock);
++ __acquire(rq2->lock); /* Fake it out ;) */
++ } else
++ __double_rq_lock(rq1, rq2);
++ synchronise_niffies(rq1, rq2);
++}
++
++/*
++ * double_rq_unlock - safely unlock two runqueues
++ *
++ * Note this does not restore interrupts like task_rq_unlock,
++ * you need to do so manually after calling.
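++ *
++ * A minimal illustrative calling pattern, assuming the caller manages the
++ * interrupt state itself (sketch only, not lifted from a real call site):
++ *
++ *   local_irq_save(flags);
++ *   double_rq_lock(rq1, rq2);
++ *   ...operate on both runqueues...
++ *   double_rq_unlock(rq1, rq2);
++ *   local_irq_restore(flags);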
++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(rq1->lock); ++ if (rq1->lock != rq2->lock) ++ raw_spin_unlock(rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(rq->lock))) ++ return false; ++ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(&rq->lock->dep_map, _RET_IP_); ++ do_raw_spin_unlock(rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. 
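++ * Once added, the node sits on a singly linked list running from
++ * head->first to WAKE_Q_TAIL, which wake_up_q() below walks and drains.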
++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. 
++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies. ++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ long us_interval, load; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += rq_load(rq) * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ update_irq_load_avg(rq, 0); ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++/* ++ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds ++ * here so we scale curload to how long it's been since the last update. ++ */ ++static void update_irq_load_avg(struct rq *rq, long delta) ++{ ++ long us_interval, load; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += NS_TO_US(delta) * SCHED_CAPACITY_SCALE * 5 / 262144; ++ rq->irq_load_avg = load; ++ ++ rq->irq_load_update = rq->niffies; ++} ++#endif ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8. 
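++ * Broadly, a node's level is derived from the random seed supplied at
++ * insertion and falls off geometrically, which is why k stays small in
++ * practice.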
++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node->next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) { ++ sched_info_dequeued(rq, p); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ } ++ rq->nr_running--; ++ if (rt_task(p)) ++ rq->rt_nr_running--; ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. ++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } else ++ rq->rt_nr_running++; ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. ++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion. ++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) { ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags & ENQUEUE_WAKEUP); ++ } ++ ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node->next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ rq->nr_running++; ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. 
CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless. */ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. */ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. 
++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. ++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE_LLC (2) ++#define CPUIDLE_DIFF_CORE (4) ++#define CPUIDLE_CACHE_BUSY (8) ++#define CPUIDLE_DIFF_CPU (16) ++#define CPUIDLE_THREAD_BUSY (32) ++#define CPUIDLE_DIFF_NODE (64) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. 
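++ *
++ * Illustrative tally of the flag weights above (assuming CONFIG_SCHED_SMT,
++ * CONFIG_SCHED_MC and CONFIG_NUMA are all enabled): an SMT sibling adds
++ * CPUIDLE_DIFF_THREAD = 1, another core sharing the LLC adds
++ * CPUIDLE_DIFF_CORE_LLC = 2, other cores on the package add
++ * CPUIDLE_DIFF_CORE = 4 (plus CPUIDLE_CACHE_BUSY = 8 when the candidate's
++ * cache is busy), a different package on the same node adds
++ * CPUIDLE_DIFF_CPU = 16, and another NUMA node adds CPUIDLE_DIFF_NODE = 64;
++ * best_mask_cpu() below keeps the candidate with the lowest total.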
++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > LOCALITY_SMP) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == LOCALITY_MC_LLC) ++ ranking |= CPUIDLE_DIFF_CORE_LLC; ++ else if (locality == LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CORE; ++ if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == LOCALITY_SMT) ++ ranking |= CPUIDLE_DIFF_THREAD; ++#endif ++ if (ranking < best_ranking ++#ifdef CONFIG_SCHED_SMT ++ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) ++#endif ++ ) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); ++} ++ ++/* As per resched_curr but only will resched idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++DEFINE_PER_CPU(cpumask_t, idlemask); ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t *tmpmask = &(per_cpu(idlemask, cpu)); ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(tmpmask, p->cpus_ptr, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. 
the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ ++ enqueue_task(rq, p, flags); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - If it's running, it's not on the runqueue and we can just ++ * decrement the nr_running. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ ++ psi_dequeue(p, DEQUEUE_SLEEP); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU. */ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, new_cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. 
*/ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off the runqueue and take it to a cpu for it will ++ * become the running task. ++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved with as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ struct rq_flags rf; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. 
++ */ ++ rq = task_rq_lock(p, &rf); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &rf); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. This only happens with the hotplug threads that ++ * bring up the CPUs. 
++ */ ++static inline bool sched_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) ++ return false; ++ if (p->nr_cpus_allowed == 1) { ++ cpumask_t valid_mask; ++ ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); ++ if (unlikely(cpumask_empty(&valid_mask))) ++ return false; ++ } ++ return true; ++} ++ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ return true; ++} ++ ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *rq = this_rq->cpu_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) { ++ __schedstat_inc(rq->ttwu_local); ++ } else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(rq->cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ __schedstat_inc(sd->ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. 
++ */ ++ if (wake_flags & WF_SYNC) ++ resched_suitable_idle(p); ++ else ++ try_preempt(p, rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ int en_flags = ENQUEUE_WAKEUP; ++ ++ lockdep_assert_held(rq->lock); ++ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++ if (wake_flags & WF_MIGRATED) ++ en_flags |= ENQUEUE_MIGRATED; ++#endif ++ ++ activate_task(rq, p, en_flags); ++ ttwu_do_wakeup(rq, p, wake_flags); ++} ++ ++/* ++ * Called in case the task @p isn't fully descheduled from its runqueue, ++ * in this case we must do a remote wakeup. Its a 'light' wakeup though, ++ * since all we need to do is flip p->state to TASK_RUNNING, since ++ * the task is still ->on_rq. ++ */ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = __task_rq_lock(p, NULL); ++ if (likely(task_on_rq_queued(p))) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_rq_unlock(rq, NULL); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void) ++{ ++ struct rq *rq = this_rq(); ++ struct llist_node *llist = llist_del_all(&rq->wake_list); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry) ++ ttwu_do_activate(rq, p, 0); ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) ++ return; ++ ++ /* ++ * Not all reschedule IPI handlers call irq_enter/irq_exit, since ++ * traditionally all their work was done from the interrupt return ++ * path. Now that we actually do some work, we need to make sure ++ * we do call them. ++ * ++ * Some archs already do call them, luckily irq_enter/exit nest ++ * properly. ++ * ++ * Arguably we should visit all archs and update all handlers, ++ * however a fair share of IPIs are still resched only so this would ++ * somewhat pessimize the simple resched case. 
++ */ ++ irq_enter(); ++ sched_ttwu_pending(); ++ irq_exit(); ++} ++ ++static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { ++ if (!set_nr_if_polling(rq->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++ } ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ rq_lock_irqsave(rq, &rf); ++ if (likely(is_idle_task(rq->curr))) ++ smp_sched_reschedule(cpu); ++ /* Else cpu is not in idle, do nothing here */ ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ cpumask_t valid_mask; ++ ++ if (p->flags & PF_KTHREAD) ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); ++ else ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); ++ ++ if (unlikely(!cpumask_weight(&valid_mask))) { ++ /* We shouldn't be hitting this any more */ ++ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, ++ p->pid, cpumask_weight(p->cpus_ptr)); ++ return cpumask_any(p->cpus_ptr); ++ } ++ return cpumask_any(&valid_mask); ++} ++ ++/* ++ * For a task that's just being woken up we have a valuable balancing ++ * opportunity so choose the nearest cache most lightly loaded runqueue. ++ * Entered with rq locked and returns with the chosen runqueue locked. ++ */ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ unsigned int idlest = ~0U; ++ struct rq *rq = NULL; ++ int i; ++ ++ if (suitable_idle_cpus(p)) { ++ int cpu = task_cpu(p); ++ ++ if (unlikely(needs_other_cpu(p, cpu))) ++ cpu = valid_task_cpu(p); ++ rq = resched_best_idle(p, cpu); ++ if (likely(rq)) ++ return rq->cpu; ++ } ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *other_rq = task_rq(p)->cpu_order[i]; ++ int entries; ++ ++ if (!other_rq->online) ++ continue; ++ if (needs_other_cpu(p, other_rq->cpu)) ++ continue; ++ entries = rq_load(other_rq); ++ if (entries >= idlest) ++ continue; ++ idlest = entries; ++ rq = other_rq; ++ } ++ if (unlikely(!rq)) ++ return task_cpu(p); ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ return NULL; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++#if defined(CONFIG_SMP) ++ if (!cpus_share_cache(smp_processor_id(), cpu)) { ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ ttwu_queue_remote(p, cpu, wake_flags); ++ return; ++ } ++#endif ++ rq_lock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ rq_unlock(rq); ++} ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. 
The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ cpu = task_cpu(p); ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. 
++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ cpu = select_best_cpu(p); ++ if (task_cpu(p) != cpu) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++ ++#else /* CONFIG_SMP */ ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, cpu, wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p, struct rq *rq); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * The process state is set to the same value of the process executing ++ * do_fork() code. That is running. This guarantees that nobody will ++ * actually run it, and a signal or other external event cannot wake ++ * it up and insert it on the runqueue either. ++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || p-> policy == SCHED_ISO) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. 
++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. 
Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. 
++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(new_rq, p, 0); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
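The release/acquire pairing described just above (the smp_store_release() of ->on_cpu in finish_task() against the smp_cond_load_acquire() in try_to_wake_up()) can be sketched with plain C11 atomics. This is an illustrative userspace analogue only, not kernel code; the struct, field and thread names are invented for the example.

/* Illustration only: release/acquire handoff of an "on_cpu" flag. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct task_sim {
    atomic_int on_cpu;   /* stands in for p->on_cpu */
    int        state;    /* data published by the release store */
};

/* "Previous CPU": finish the switch, then release on_cpu. Everything written
 * before the release store (here ->state) is visible to whoever observes
 * on_cpu == 0 with an acquire load, mirroring finish_task(). */
static void *switcher(void *arg)
{
    struct task_sim *p = arg;

    p->state = 42;
    atomic_store_explicit(&p->on_cpu, 0, memory_order_release);
    return NULL;
}

/* "Waker": spin until on_cpu is clear, like smp_cond_load_acquire(&p->on_cpu, !VAL),
 * after which the task could safely be migrated or inspected. */
static void *waker(void *arg)
{
    struct task_sim *p = arg;

    while (atomic_load_explicit(&p->on_cpu, memory_order_acquire))
        ;   /* spin-wait */
    printf("saw state %d after on_cpu was cleared\n", p->state);
    return NULL;
}

int main(void)
{
    struct task_sim p = { .on_cpu = 1, .state = 0 };
    pthread_t a, b;

    pthread_create(&a, NULL, waker, &p);
    pthread_create(&b, NULL, switcher, &p);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;   /* build with: cc -pthread */
}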
++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock->dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock->owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If prev was marked as migrating to another CPU in return_task, drop ++ * the local runqueue lock but leave interrupts disabled and grab the ++ * remote lock we're migrating it to before enabling them. ++ */ ++ if (unlikely(task_on_rq_migrating(prev))) { ++ sched_info_dequeued(rq, prev); ++ /* ++ * We move the ownership of prev to the new cpu now. ttwu can't ++ * activate prev to the wrong cpu since it has to grab this ++ * runqueue in ttwu_remote. ++ */ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ prev->cpu = prev->wake_cpu; ++#else ++ task_thread_info(prev)->cpu = prev->wake_cpu; ++#endif ++ raw_spin_unlock(rq->lock); ++ ++ raw_spin_lock(&prev->pi_lock); ++ rq = __task_rq_lock(prev, NULL); ++ /* Check that someone else hasn't already queued prev */ ++ if (likely(!task_queued(prev))) { ++ enqueue_task(rq, prev, 0); ++ prev->on_rq = TASK_ON_RQ_QUEUED; ++ /* Wake up the CPU if it's not already running */ ++ resched_if_idle(rq); ++ } ++ raw_spin_unlock(&prev->pi_lock); ++ } ++#endif ++ rq_unlock(rq); ++ local_irq_enable(); ++} ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). 
If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static void finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq, prev); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++{ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). 
++ */ ++ ++ finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++static unsigned long nr_uninterruptible(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_uninterruptible; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ if (rq_load(raw_rq()) == 1) ++ return true; ++ else ++ return false; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int cpu; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += cpu_rq(cpu)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpufreq menu ++ * governor are using nonsensical data. Boosting frequency for a CPU that has ++ * IO-wait which might not even end up running the task when it does become ++ * runnable. 
++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long cpu, sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += nr_iowait_cpu(cpu); ++ ++ return sum; ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(unsigned long ticks) ++{ ++ long active; ++ ++ if (time_before(jiffies, READ_ONCE(calc_load_update))) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++/** ++ * fixed_power_int - compute: x^n, in O(log n) time ++ * ++ * @x: base of the power ++ * @frac_bits: fractional bits of @x ++ * @n: power to raise @x to. ++ * ++ * By exploiting the relation between the definition of the natural power ++ * function: x^n := x*x*...*x (x multiplied by itself for n times), and ++ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, ++ * (where: n_i \elem {0, 1}, the binary vector representing n), ++ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is ++ * of course trivially computable in O(log_2 n), the length of our binary ++ * vector. 
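The binary-exponentiation trick described above is small enough to demo on its own. A minimal userspace sketch, assuming the same 11 fractional bits as the kernel's load-average fixed point (so 1.0 == 2048); the per-step rounding mirrors the fixed_power_int() implementation that follows in this hunk.

/* Illustration only: O(log n) fixed-point exponentiation. */
#include <stdio.h>

#define FRAC_BITS 11u                    /* FSHIFT used by avenrun */
#define FIXED_ONE (1UL << FRAC_BITS)     /* 1.0 == 2048 */

/* x is a fraction in units of 1/2^FRAC_BITS; returns x^n with rounding. */
static unsigned long fixed_pow(unsigned long x, unsigned int n)
{
    unsigned long result = FIXED_ONE;    /* x^0 == 1.0 */

    while (n) {
        if (n & 1) {                     /* this bit of n contributes x^(2^i) */
            result *= x;
            result += 1UL << (FRAC_BITS - 1);
            result >>= FRAC_BITS;
        }
        x *= x;                          /* square for the next bit */
        x += 1UL << (FRAC_BITS - 1);
        x >>= FRAC_BITS;
        n >>= 1;
    }
    return result;
}

int main(void)
{
    /* 1884/2048 is EXP_1, the per-5s decay of the 1-minute average; twelve
     * steps (one minute) decay by roughly 1/e, i.e. about 752/2048. */
    printf("0.92^12 ~= %lu/2048\n", fixed_pow(1884, 12));
    printf("0.5^3   ~= %lu/2048 (expect 256)\n", fixed_pow(1024, 3));
    return 0;
}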
++ */ ++static unsigned long ++fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) ++{ ++ unsigned long result = 1UL << frac_bits; ++ ++ if (n) { ++ for (;;) { ++ if (n & 1) { ++ result *= x; ++ result += 1UL << (frac_bits - 1); ++ result >>= frac_bits; ++ } ++ n >>= 1; ++ if (!n) ++ break; ++ x *= x; ++ x += 1UL << (frac_bits - 1); ++ x >>= frac_bits; ++ } ++ } ++ ++ return result; ++} ++ ++/* ++ * a1 = a0 * e + a * (1 - e) ++ * ++ * a2 = a1 * e + a * (1 - e) ++ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) ++ * = a0 * e^2 + a * (1 - e) * (1 + e) ++ * ++ * a3 = a2 * e + a * (1 - e) ++ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) ++ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) ++ * ++ * ... ++ * ++ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] ++ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) ++ * = a0 * e^n + a * (1 - e^n) ++ * ++ * [1] application of the geometric series: ++ * ++ * n 1 - x^(n+1) ++ * S_n := \Sum x^i = ------------- ++ * i=0 1 - x ++ */ ++unsigned long ++calc_load_n(unsigned long load, unsigned long exp, ++ unsigned long active, unsigned int n) ++{ ++ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++/* ++ * On each tick, add the number of nanoseconds to the unbanked variables and ++ * once one tick's worth has accumulated, account it allowing for accurate ++ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we ++ * deduct nanoseconds. 
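The "bank nanoseconds until a whole tick has accumulated" pattern described above, and used by the pc_idle_time()/pc_system_time()/pc_user_time() helpers that follow, is just an accumulator that carries a sub-tick remainder. A minimal sketch with an illustrative 4 ms jiffy (HZ=250), not a value taken from the patch.

/* Illustration only: banking sub-tick nanoseconds into whole-tick charges. */
#include <stdio.h>

#define JIFFY_NS 4000000ULL          /* one tick at HZ=250, illustrative */

struct bank {
    unsigned long long unbanked_ns;  /* not yet accounted */
    unsigned long long ticks;        /* whole ticks charged to cpustat */
};

/* Add ns, convert any whole ticks, keep the remainder so nothing is lost. */
static void account_ns(struct bank *b, unsigned long long ns)
{
    b->unbanked_ns += ns;
    if (b->unbanked_ns >= JIFFY_NS) {
        b->ticks       += b->unbanked_ns / JIFFY_NS;
        b->unbanked_ns %= JIFFY_NS;
    }
}

int main(void)
{
    struct bank b = { 0, 0 };

    account_ns(&b, 2500000);   /* 2.5 ms: nothing charged yet */
    account_ns(&b, 2500000);   /* 5.0 ms total: one tick, 1 ms carried over */
    printf("ticks %llu, remainder %llu ns\n", b.ticks, b.unbanked_ns);
    return 0;
}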
++ */ ++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_ns += ns; ++ if (rq->iowait_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->iowait_ns); ++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->iowait_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->idle_ns += ns; ++ if (rq->idle_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->idle_ns); ++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->idle_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(idle); ++} ++ ++static void pc_system_time(struct rq *rq, struct task_struct *p, ++ int hardirq_offset, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->stime_ns += ns; ++ if (p->stime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->stime_ns); ++ p->stime_ns %= JIFFY_NS; ++ p->stime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_system_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_ns += ns; ++ if (rq->irq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->irq_ns); ++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->irq_ns %= JIFFY_NS; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->system_ns += ns; ++ if (rq->system_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->system_ns); ++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->system_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->utime_ns += ns; ++ if (p->utime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->utime_ns); ++ p->utime_ns %= JIFFY_NS; ++ p->utime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_user_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. ++ */ ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } ++ ++ if (task_nice(p) > 0 || idleprio_task(p)) { ++ rq->nice_ns += ns; ++ if (rq->nice_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->nice_ns); ++ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->nice_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->user_ns += ns; ++ if (rq->user_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->user_ns); ++ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->user_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++/* ++ * This is called on clock ticks. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. 
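To see the closed form a_n = a_0*e^n + a*(1 - e^n) derived for calc_load_n() a little earlier in this hunk, the sketch below lets a load average of 1.00 decay over one idle minute (twelve 5-second LOAD_FREQ intervals). It uses the well-known avenrun constants (FIXED_1 = 2048, EXP_1 = 1884) and a simplified calc_load() that drops the kernel's round-up-while-rising tweak, so the exact output is approximate.

/* Illustration only: one minute of idle decay of the 1-minute load average. */
#include <stdio.h>

#define FIXED_1 2048UL     /* 1.0 in load fixed point */
#define EXP_1   1884UL     /* per-5s decay factor */

/* Simplified: new = old*e + active*(1-e), truncating instead of rounding up. */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
    return (load * exp + active * (FIXED_1 - exp)) / FIXED_1;
}

int main(void)
{
    unsigned long load = FIXED_1;   /* start at 1.00 */
    int n;

    for (n = 0; n < 12; n++)        /* 12 x 5 s with nothing runnable */
        load = calc_load(load, EXP_1, 0);

    /* Closed form predicts 2048 * (1884/2048)^12 ~= 752 (about 1/e);
     * the truncating division here lands a few counts lower. */
    printf("after 60s idle: %lu/2048 ~= %.2f\n", load, (double)load / FIXED_1);
    return 0;
}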
++ */ ++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate tick timekeeping */ ++ if (user_mode(get_irq_regs())) ++ pc_user_time(rq, p, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { ++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); ++ } else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++ ++ p->last_ran = rq->niffies; ++} ++ ++/* ++ * This is called on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate subtick timekeeping */ ++ if (p != idle) ++ pc_user_time(rq, p, account_ns); ++ else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_rq_lock(p) held. ++ */ ++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_clocks(rq); ++ ns = rq->niffies - p->last_ran; ++ } ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimisation chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_rq_lock(p, &rf); ++ ns = p->sched_time + do_task_delta_exec(p, rq); ++ task_rq_unlock(rq, p, &rf); ++ ++ return ns; ++} ++ ++/* ++ * Functions to test for when SCHED_ISO tasks have used their allocated ++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All ++ * data is modified only by the local runqueue during scheduler_tick with ++ * interrupts disabled. ++ */ ++ ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a ++ * slow division. 
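The "115/128 is ~90/100" remark above is a shift-based approximation of 90 percent. A quick check with arbitrary percentages; neither the real sched_iso_cpu tunable nor ISO_PERIOD is needed for the arithmetic.

/* Illustration only: (x * 115) >> 7 as a cheap ~90% of x. */
#include <stdio.h>

int main(void)
{
    int pct;

    for (pct = 10; pct <= 100; pct += 30) {
        int exact  = pct * 90 / 100;     /* the slow division */
        int approx = (pct * 115) >> 7;   /* 115/128 ~= 0.898 */

        printf("%3d%%: 90%% = %2d, (x*115)>>7 = %2d\n", pct, exact, approx);
    }
    return 0;
}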
++ */ ++static inline void iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++ rq->iso_ticks += 100; ++ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { ++ rq->iso_refractory = true; ++ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) ++ rq->iso_ticks = ISO_PERIOD * 100; ++ } ++} ++ ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq, int ticks) ++{ ++ if (rq->iso_ticks > 0 || rq->iso_refractory) { ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; ++ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { ++ rq->iso_refractory = false; ++ if (unlikely(rq->iso_ticks < 0)) ++ rq->iso_ticks = 0; ++ } ++ } ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if (rt_task(p) || task_running_iso(p)) ++ iso_tick(rq); ++ else ++ no_iso_tick(rq, 1); ++ ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->policy == SCHED_FIFO) ++ return; ++ ++ if (iso_task(p)) { ++ if (task_running_iso(p)) { ++ if (rq->iso_refractory) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Force it to reschedule as ++ * SCHED_NORMAL by zeroing its time_slice ++ */ ++ p->time_slice = 0; ++ } ++ } else if (!rq->iso_refractory) { ++ /* Can now run again ISO. Reschedule to pick up prio */ ++ goto out_resched; ++ } ++ } ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ * Dither is used as a backup for when hrexpiry is disabled or high res ++ * timers not configured in. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++out_resched: ++ rq_lock(rq); ++ __set_tsk_resched(p); ++ rq_unlock(rq); ++} ++ ++static inline void task_tick(struct rq *rq) ++{ ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else if (rq->last_jiffy > rq->last_scheduler_tick) ++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * We can stop the timer tick any time highres timers are active since ++ * we rely entirely on highres timeouts for task expiry rescheduling. ++ */ ++static void sched_stop_tick(struct rq *rq, int cpu) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (!tick_nohz_full_enabled()) ++ return; ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. 
*/ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (!tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ rq_lock_irq(rq); ++ if (cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ curr = rq->curr; ++ update_rq_clock(rq); ++ ++ if (!is_idle_task(curr)) { ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ delta = rq_clock_task(rq) - curr->last_ran; ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ } ++ task_tick(rq); ++ ++out_unlock: ++ rq_unlock_irq(rq, NULL); ++ ++out_requeue: ++ ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ /* There cannot be competing actions, but don't rely on stop-machine. */ ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); ++ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); ++ /* Don't cancel, as this would mess up the state machine. 
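The OFFLINE/OFFLINING/RUNNING diagram above can be exercised in userspace with C11 atomics. The sketch below is illustrative only: atomic_fetch_add_unless() has no C11 counterpart, so it is emulated with a compare-exchange loop, and the "queue work" steps are reduced to printf().

/* Illustration only: the remote-tick state machine. */
#include <stdatomic.h>
#include <stdio.h>

enum { REMOTE_OFFLINE, REMOTE_OFFLINING, REMOTE_RUNNING };

/* Userspace stand-in for atomic_fetch_add_unless(): add @add unless the
 * current value equals @unless; return the old value either way. */
static int fetch_add_unless(atomic_int *v, int add, int unless)
{
    int old = atomic_load(v);

    while (old != unless &&
           !atomic_compare_exchange_weak(v, &old, old + add))
        ;
    return old;
}

static void tick_start(atomic_int *state)
{
    if (atomic_exchange(state, REMOTE_RUNNING) == REMOTE_OFFLINE)
        printf("start: queue the delayed work\n");
    /* If it was OFFLINING, the still-pending work simply keeps running. */
}

static void tick_stop(atomic_int *state)
{
    printf("stop: was %d (expect RUNNING)\n",
           atomic_exchange(state, REMOTE_OFFLINING));
    /* The work notices OFFLINING on its next pass and steps to OFFLINE. */
}

static void tick_remote(atomic_int *state)
{
    /* Step OFFLINING -> OFFLINE; leave RUNNING alone and requeue. */
    int old = fetch_add_unless(state, -1, REMOTE_RUNNING);

    printf("remote: saw %d, %s\n", old,
           old == REMOTE_RUNNING ? "requeue" : "going offline");
}

int main(void)
{
    atomic_int state = REMOTE_OFFLINE;

    tick_start(&state);    /* OFFLINE -> RUNNING, work queued */
    tick_remote(&state);   /* RUNNING: requeue */
    tick_stop(&state);     /* RUNNING -> OFFLINING */
    tick_remote(&state);   /* OFFLINING -> OFFLINE, no requeue */
    return 0;
}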
*/ ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_stop_tick(struct rq *rq, int cpu) {} ++static inline void sched_start_tick(struct rq *rq, int cpu) {} ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++DEFINE_PER_CPU(unsigned long, thermal_pressure); ++ ++void arch_set_thermal_pressure(struct cpumask *cpus, ++ unsigned long th_pressure) ++{ ++ int cpu; ++ ++ for_each_cpu(cpu, cpus) ++ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); ++} ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ arch_scale_freq_tick(); ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ task_tick(rq); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ psi_task_tick(rq); ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. Make sure update_clocks has been called recently to update ++ * rq->niffies. 
++ */ ++static void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = rq->niffies + task_deadline_diff(p); ++#ifdef CONFIG_SMT_NICE ++ if (!p->mm) ++ p->smt_bias = 0; ++ else if (rt_task(p)) ++ p->smt_bias = 1 << 30; ++ else if (task_running_iso(p)) ++ p->smt_bias = 1 << 29; ++ else if (idleprio_task(p)) { ++ if (task_running_idle(p)) ++ p->smt_bias = 0; ++ else ++ p->smt_bias = 1; ++ } else if (--p->smt_bias < 1) ++ p->smt_bias = MAX_PRIO - p->static_prio; ++#endif ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p, rq); ++} ++ ++/* ++ * Task selection with skiplists is a simple matter of picking off the first ++ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) ++ * being bound to the number of processors. ++ * ++ * Runqueues are selectively locked based on their unlocked data and then ++ * unlocked if not needed. At most 3 locks will be held at any time and are ++ * released as soon as they're no longer needed. All balancing between CPUs ++ * is thus done here in an extremely simple first come best fit manner. ++ * ++ * This iterates over runqueues in cache locality order. In interactive mode ++ * it iterates over all CPUs and finds the task with the best key/deadline. ++ * In non-interactive mode it will only take a task if it's from the current ++ * runqueue or a runqueue with more tasks than the current one with a better ++ * key/deadline. ++ */ ++#ifdef CONFIG_SMP ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct rq *locked = NULL, *chosen = NULL; ++ struct task_struct *edt = idle; ++ int i, best_entries = 0; ++ u64 best_key = ~0ULL; ++ ++ for (i = 0; i < total_runqueues; i++) { ++ struct rq *other_rq = rq_order(rq, i); ++ skiplist_node *next; ++ int entries; ++ ++ entries = other_rq->sl->entries; ++ /* ++ * Check for queued entres lockless first. The local runqueue ++ * is locked so entries will always be accurate. ++ */ ++ if (!sched_interactive) { ++ /* ++ * Don't reschedule balance across nodes unless the CPU ++ * is idle. ++ */ ++ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP) ++ break; ++ if (entries <= best_entries) ++ continue; ++ } else if (!entries) ++ continue; ++ ++ /* if (i) implies other_rq != rq */ ++ if (i) { ++ /* Check for best id queued lockless first */ ++ if (other_rq->best_key >= best_key) ++ continue; ++ ++ if (unlikely(!trylock_rq(rq, other_rq))) ++ continue; ++ ++ /* Need to reevaluate entries after locking */ ++ entries = other_rq->sl->entries; ++ if (unlikely(!entries)) { ++ unlock_rq(other_rq); ++ continue; ++ } ++ } ++ ++ next = other_rq->node; ++ /* ++ * In interactive mode we check beyond the best entry on other ++ * runqueues if we can't get the best for smt or affinity ++ * reasons. 
++ */ ++ while ((next = next->next[0]) != other_rq->node) { ++ struct task_struct *p; ++ u64 key = next->key; ++ ++ /* Reevaluate key after locking */ ++ if (key >= best_key) ++ break; ++ ++ p = next->value; ++ if (!smt_schedule(p, rq)) { ++ if (i && !sched_interactive) ++ break; ++ continue; ++ } ++ ++ if (sched_other_cpu(p, cpu)) { ++ if (sched_interactive || !i) ++ continue; ++ break; ++ } ++ /* Make sure affinity is ok */ ++ if (i) { ++ /* From this point on p is the best so far */ ++ if (locked) ++ unlock_rq(locked); ++ chosen = locked = other_rq; ++ } ++ best_entries = entries; ++ best_key = key; ++ edt = p; ++ break; ++ } ++ /* rq->preempting is a hint only as the state may have changed ++ * since it was set with the resched call but if we have met ++ * the condition we can break out here. */ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node->next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. 
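The earliest_deadline_task() selection above boils down to "run the queued task with the smallest virtual deadline". A toy sketch of that rule; the linear scan is only to keep the example short, whereas the patch keeps runnable tasks in a deadline-sorted skiplist so the pick is the O(1) head.

/* Illustration only: earliest-virtual-deadline-first selection. */
#include <stdio.h>

struct toy_task {
    const char        *comm;
    unsigned long long deadline;   /* virtual deadline, e.g. in niffies */
};

static const struct toy_task *pick_edt(const struct toy_task *t, int n)
{
    const struct toy_task *best = &t[0];
    int i;

    for (i = 1; i < n; i++)
        if (t[i].deadline < best->deadline)
            best = &t[i];
    return best;
}

int main(void)
{
    /* Niced batch work gets later deadlines, so the interactive task wins. */
    const struct toy_task rq[] = {
        { "make -j16 (nice 19)", 1000900 },
        { "terminal",            1000250 },
        { "backup (nice 10)",    1000600 },
    };

    printf("next: %s\n", pick_edt(rq, 3)->comm);
    return 0;
}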
++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. */ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! 
++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev, preempt); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. ++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq, NULL); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate = true; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. ++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev == idle) { ++ rq->nr_running++; ++ if (rt_task(next)) ++ rq->rt_nr_running++; ++ } else if (!deactivate) ++ resched_suitable_idle(prev); ++ if (unlikely(next == idle)) { ++ rq->nr_running--; ++ if (rt_task(prev)) ++ rq->rt_nr_running--; ++ wake_siblings(rq); ++ } else ++ check_siblings(rq); ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ ++ trace_sched_switch(preempt, prev, next); ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ } else { ++ check_siblings(rq); ++ rq_unlock(rq); ++ local_irq_enable(); ++ } ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(). */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker and because wq_worker_sleeping() ++ * requires it. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static inline void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++ ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. 
Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != IN_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. 
++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio, oldprio; ++ struct rq *rq; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_rq_lock(p, NULL); ++ update_rq_clock(rq); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). 
We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ oldprio = p->prio; ++ p->prio = prio; ++ if (task_running(rq, p)){ ++ if (prio > oldprio) ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (prio < oldprio) ++ try_preempt(p, rq); ++ } ++out_unlock: ++ __task_rq_unlock(rq, NULL); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static, old_static; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_rq_unlock(rq, p, &rf); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. 
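
The nice(2) plumbing just added (set_user_nice(), can_nice(), sys_nice()) is reachable from userspace through the ordinary libc nice() and getpriority() wrappers, which are not part of this diff. A minimal sketch, assuming those wrappers:

    /* Illustrative sketch only, not from the patch: exercises the
     * sys_nice() -> set_user_nice() path added above. */
    #include <stdio.h>
    #include <errno.h>
    #include <unistd.h>
    #include <sys/resource.h>

    int main(void)
    {
        errno = 0;
        if (nice(5) == -1 && errno)     /* drop our priority by 5 */
            perror("nice");
        printf("nice is now %d\n", getpriority(PRIO_PROCESS, 0));
        return 0;
    }

A negative increment (raising priority) is what the can_nice() check above gates: it fails with EPERM unless the caller has CAP_SYS_NICE or a permissive RLIMIT_NICE.
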
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * available_idle_cpu - is a given CPU idle for enqueuing work. ++ * @cpu: the CPU in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int available_idle_cpu(int cpu) ++{ ++ if (!idle_cpu(cpu)) ++ return 0; ++ ++ if (vcpu_is_preempted(cpu)) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, const struct sched_attr *attr, ++ bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ /* ++ * If params can't change scheduling class changes aren't allowed ++ * either. ++ */ ++ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) ++ return; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; ++ unsigned long rlim_rtprio = 0; ++ struct rq_flags rf; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ priority = 0; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. 
++ */ ++ if (priority < 0 || ++ (p->mm && priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (priority > p->rt_priority && ++ priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy != SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag: */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ * ++ * To be able to change p->policy safely, the runqueue lock must be ++ * held. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea: ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ priority == p->rt_priority))) { ++ retval = 0; ++ goto unlock; ++ } ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ __setscheduler(p, rq, policy, priority, attr, pi); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ task_rq_unlock(rq, p, &rf); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ preempt_enable(); ++out: ++ return 0; ++ ++unlock: ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. 
++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. 
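
The syscall wired up just below is what the POSIX sched_setscheduler() wrapper ends up calling. A minimal userspace sketch, not part of this diff; it assumes the standard glibc wrappers and a caller that passes the CAP_SYS_NICE/RLIMIT_RTPRIO checks in __sched_setscheduler() above:

    /* Illustrative sketch only, not from the patch. */
    #include <stdio.h>
    #include <sched.h>

    int main(void)
    {
        struct sched_param sp = {
            .sched_priority = sched_get_priority_min(SCHED_RR),
        };

        /* pid 0 means "the calling thread" */
        if (sched_setscheduler(0, SCHED_RR, &sp) == -1) {
            perror("sched_setscheduler");
            return 1;
        }
        return 0;
    }

The MuQSS-specific SCHED_ISO and SCHED_IDLEPRIO policies handled above go through the same entry point; their constants are expected to come from the patched kernel headers rather than glibc's <sched.h>.
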
++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) ++ attr.sched_policy = SETPARAM_POLICY; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setattr(p, &attr); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. 
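
The read side added above, sys_sched_getscheduler() and sys_sched_getparam(), has matching libc wrappers. A small sketch (not part of this diff) that reports the calling thread's policy and RT priority:

    /* Illustrative sketch only, not from the patch. */
    #include <stdio.h>
    #include <sched.h>

    int main(void)
    {
        struct sched_param sp;
        int policy = sched_getscheduler(0);     /* 0 == calling thread */

        if (policy == -1 || sched_getparam(0, &sp) == -1) {
            perror("sched_get*");
            return 1;
        }
        printf("policy=%d rt_priority=%d\n", policy, sp.sched_priority);
        return 0;
    }
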
++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. 
Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min(len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. 
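
The affinity syscalls just above are reached from userspace through glibc's sched_setaffinity()/sched_getaffinity() wrappers and the CPU_* macros, none of which appear in this diff. A minimal sketch, assuming those wrappers:

    /* Illustrative sketch only, not from the patch. */
    #define _GNU_SOURCE
    #include <stdio.h>
    #include <sched.h>

    int main(void)
    {
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);                       /* pin ourselves to CPU 0 */
        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
            perror("sched_setaffinity");
            return 1;
        }
        if (sched_getaffinity(0, sizeof(set), &set) == -1) {
            perror("sched_getaffinity");
            return 1;
        }
        printf("allowed CPUs: %d\n", CPU_COUNT(&set));
        return 0;
    }

The length checks in sys_sched_getaffinity() above are why the user buffer has to cover nr_cpu_ids bits and be a multiple of sizeof(unsigned long).
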
++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ return; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ rq_unlock(rq); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ struct task_struct *rq_p; ++ struct rq *rq, *p_rq; ++ unsigned long flags; ++ int yielded = 0; ++ ++ local_irq_save(flags); ++ rq = this_rq(); ++ ++again: ++ p_rq = task_rq(p); ++ /* ++ * If we're the only runnable task on the rq and target rq also ++ * has only one task, there's absolutely no point in yielding. 
++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ struct rq_flags rf; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &rf); ++ time_slice = p->policy == SCHED_FIFO ? 
0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &rf); ++ ++ rcu_read_unlock(); ++ *t = ns_to_timespec64(time_slice); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. 
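
sys_sched_rr_get_interval() earlier in this hunk reports MS_TO_NS(task_timeslice(p)) for everything except SCHED_FIFO. A short userspace sketch (not part of this diff, assuming the glibc sched_rr_get_interval() wrapper) to read that value back:

    /* Illustrative sketch only, not from the patch. */
    #include <stdio.h>
    #include <sched.h>
    #include <time.h>

    int main(void)
    {
        struct timespec ts;

        if (sched_rr_get_interval(0, &ts) == -1) {      /* 0 == calling thread */
            perror("sched_rr_get_interval");
            return 1;
        }
        printf("timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
        return 0;
    }
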
++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. ++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p, NULL); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq, NULL); ++ } ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->flags |= PF_IDLE; ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! 
*/ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(rq); ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++void nohz_balance_enter_idle(int cpu) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(), default_cpu = -1; ++ struct sched_domain *sd; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { ++ if (!idle_cpu(cpu)) ++ return cpu; ++ default_cpu = cpu; ++ } ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu_and(i, sched_domain_span(sd), ++ housekeeping_cpumask(HK_FLAG_TIMER)) { ++ if (cpu == i) ++ continue; ++ ++ if (!idle_cpu(i)) { ++ cpu = i; ++ goto unlock; ++ } ++ } ++ } ++ ++ if (default_cpu == -1) ++ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ cpu = default_cpu; ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. 
*/ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Wake up the specified CPU. If the CPU is going offline, it is the ++ * caller's responsibility to deal with the lost wakeup, for example, ++ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. ++ */ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ bool queued = false, running_wrong = false, kthread; ++ unsigned int dest_cpu; ++ struct rq_flags rf; ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ kthread = !!(p->flags & PF_KTHREAD); ++ if (kthread) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(p->cpus_ptr, new_mask)) ++ goto out; ++ ++ /* ++ * Picking a ~random cpu helps in cases where we are changing affinity ++ * for groups of tasks (ie. cpuset), so that load balancing is not ++ * immediately required to distribute the tasks within their new mask. ++ */ ++ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (kthread) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(rq, p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ set_task_cpu(p, dest_cpu); ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else { ++ if (queued) { ++ /* ++ * Switch runqueue locks after dequeueing the task ++ * here while still holding the pi_lock to be holding ++ * the correct lock for enqueueing. 
++ */ ++ dequeue_task(rq, p, 0); ++ rq_unlock(rq); ++ ++ rq = cpu_rq(dest_cpu); ++ rq_lock(rq); ++ } ++ set_task_cpu(p, dest_cpu); ++ if (queued) ++ enqueue_task(rq, p, 0); ++ } ++ if (queued) ++ try_preempt(p, rq); ++ if (running_wrong) ++ preempt_disable(); ++out: ++ task_rq_unlock(rq, p, &rf); ++ ++ if (running_wrong) { ++ __schedule(true); ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Run through task list and find tasks affined to the dead cpu, then remove ++ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold ++ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and ++ * pi_lock to change cpus_mask but it's not going to matter here. ++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_mask); ++ atomic_set_cpu(0, &p->cpus_mask); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. */ ++static void unbind_zero(int src_cpu) ++{ ++ int unbound = 0, zerobound = 0; ++ struct task_struct *p, *t; ++ ++ if (src_cpu == 0) ++ return; ++ ++ do_each_thread(t, p) { ++ if (!p->mm) ++ p->zerobound = false; ++ if (p->zerobound) { ++ unbound++; ++ cpumask_set_cpu(src_cpu, &p->cpus_mask); ++ /* Once every CPU affinity has been re-enabled, remove ++ * the zerobound flag */ ++ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { ++ p->zerobound = false; ++ zerobound++; ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (unbound) { ++ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", ++ unbound, src_cpu); ++ } ++ if (zerobound) { ++ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", ++ zerobound); ++ } ++} ++ ++/* ++ * Ensure that the idle task is using init_mm right before its cpu goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++static void unbind_zero(int src_cpu) {} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. 
++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(9); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[8] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header 
*sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ ++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &rf); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_ttwu_pending(); ++ sched_tick_stop(cpu); ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. 
++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++/* MC siblings CPU mask which share the same LLC */ ++static const cpumask_t *llc_core_cpumask(int cpu) ++{ ++#ifdef CONFIG_X86 ++ return per_cpu(cpu_llc_shared_map, cpu); ++#else ++ return topology_core_cpumask(cpu); ++#endif ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++/* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. ++ */ ++static void __init select_leaders(void) ++{ ++ struct rq *rq, *other_rq, *leader; ++ struct sched_domain *sd; ++ int cpu, other_cpu; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ ++ for (cpu = 0; cpu < num_online_cpus(); cpu++) { ++ rq = cpu_rq(cpu); ++ leader = NULL; ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ if (rqshare != RQSHARE_ALL) ++ leader = NULL; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rqshare >= RQSHARE_SMP) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smp_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->smp_leader) ++ other_rq->smp_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMP; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. 
++ */ ++#ifdef CONFIG_SCHED_MC ++ leader = NULL; ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rqshare == RQSHARE_MC || ++ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the mc_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->mc_leader) ++ other_rq->mc_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { ++ /* this is to get LLC into play even in case LLC sharing is not used */ ++ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) ++ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; ++ else ++ rq->cpu_locality[other_cpu] = LOCALITY_MC; ++ } ++ } ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ leader = NULL; ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) { ++ if (rqshare == RQSHARE_SMT) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smt_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->smt_leader) ++ other_rq->smt_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMT; ++ } ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for_each_online_cpu(other_cpu) { ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++} ++ ++/* FIXME freeing locked spinlock */ ++static void __init share_and_free_rq(struct rq *leader, struct rq *rq) ++{ ++ WARN_ON(rq->nr_running > 0); ++ ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ rq->is_leader = false; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++} ++ ++static void __init share_rqs(void) ++{ ++ struct rq *rq, *leader; ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smp_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++ ++#ifdef CONFIG_SCHED_MC ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->mc_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_MC */ ++ ++#ifdef CONFIG_SCHED_SMT ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smt_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_SMT */ ++} ++ ++static void __init setup_rq_orders(void) ++{ ++ int *selected_cpus, *ordered_cpus; ++ struct rq *rq, 
*other_rq; ++ int cpu, other_cpu, i; ++ ++ selected_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); ++ ordered_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); ++ ++ total_runqueues = 0; ++ for_each_online_cpu(cpu) { ++ int locality, total_rqs = 0, total_cpus = 0; ++ ++ rq = cpu_rq(cpu); ++ if (rq->is_leader) ++ total_runqueues++; ++ ++ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { ++ int selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; ++ int ordered_cpus_idx; ++ ++ ordered_cpus_idx = -1; ++ selected_cpu_cnt = 0; ++ ++ for_each_online_cpu(test_cpu) { ++ if (cpu < num_online_cpus() / 2) ++ other_cpu = cpu + test_cpu; ++ else ++ other_cpu = cpu - test_cpu; ++ if (other_cpu < 0) ++ other_cpu += num_online_cpus(); ++ else ++ other_cpu %= num_online_cpus(); ++ /* gather CPUs of the same locality */ ++ if (rq->cpu_locality[other_cpu] == locality) { ++ selected_cpus[selected_cpu_cnt] = other_cpu; ++ selected_cpu_cnt++; ++ } ++ } ++ ++ /* reserve first CPU as starting point */ ++ if (selected_cpu_cnt > 0) { ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; ++ selected_cpus[ordered_cpus_idx] = -1; ++ } ++ ++ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ ++ for(test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { ++ /* starting point with worst locality and current CPU */ ++ best_locality = LOCALITY_DISTANT; ++ selected_cpu_idx = test_cpu_idx; ++ ++ /* try to find the best locality within group */ ++ for(cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { ++ /* if CPU has not been used and locality is better */ ++ if (selected_cpus[cpu_idx] > -1) { ++ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); ++ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { ++ /* assign best locality and best CPU idx in array */ ++ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; ++ selected_cpu_idx = cpu_idx; ++ } ++ } ++ } ++ ++ /* add our next best CPU to ordered list */ ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; ++ /* mark this CPU as used */ ++ selected_cpus[selected_cpu_idx] = -1; ++ } ++ ++ /* set up RQ and CPU orders */ ++ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { ++ other_rq = cpu_rq(ordered_cpus[test_cpu]); ++ /* set up cpu orders */ ++ rq->cpu_order[total_cpus++] = other_rq; ++ if (other_rq->is_leader) { ++ /* set up RQ orders */ ++ rq->rq_order[total_rqs++] = other_rq; ++ } ++ } ++ } ++ } ++ ++ kfree(selected_cpus); ++ kfree(ordered_cpus); ++ ++#ifdef CONFIG_X86 ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < total_runqueues; i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < num_online_cpus(); i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); ++ } ++ } ++#endif ++} ++ ++void __init sched_init_smp(void) ++{ ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. 
++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ local_irq_disable(); ++ mutex_lock(&sched_domains_mutex); ++ lock_all_rqs(); ++ ++ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", ++ num_possible_cpus(), num_present_cpus(), num_online_cpus()); ++ ++ select_leaders(); ++ ++ unlock_all_rqs(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ share_rqs(); ++ ++ local_irq_enable(); ++ ++ setup_rq_orders(); ++ ++ switch (rqshare) { ++ case RQSHARE_ALL: ++ /* This should only ever read 1 */ ++ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMP: ++ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC: ++ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC_LLC: ++ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMT: ++ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_NONE: ++ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", ++ total_runqueues); ++ break; ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. 
++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); ++ skiplist_init(rq->node); ++ rq->sl = new_skiplist(rq->node); ++ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); ++ raw_spin_lock_init(rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->is_leader = true; ++ rq->smp_leader = NULL; ++#ifdef CONFIG_SCHED_MC ++ rq->mc_leader = NULL; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ rq->smt_leader = NULL; ++#endif ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. ++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = LOCALITY_SAME; ++ else ++ rq->cpu_locality[j] = LOCALITY_DISTANT; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq->cpu_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". 
++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct sched_attr attr = {}; ++ struct task_struct *g, *p; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & 
PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &rf); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); ++ task_rq_unlock(rq, p, &rf); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPUs synchronised, and interrupts disabled, and ++ * the caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs to complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ?
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +new file mode 100644 +index 000000000000..b34f2797e44f +--- /dev/null ++++ b/kernel/sched/MuQSS.h +@@ -0,0 +1,1056 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpupri.h" ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++struct rq; ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ 
return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++struct perf_domain { ++ struct em_perf_domain *em_pd; ++ struct perf_domain *next; ++ struct rcu_head rcu; ++}; ++ ++/* Scheduling group status flags */ ++#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ ++#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. ++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* ++ * Indicate pullable load on at least one CPU, e.g: ++ * - More than one runnable task ++ * - Running task is misfit ++ */ ++ int overload; ++ ++ /* Indicate one or more cpus over-utilized (tipping point) */ ++ int overutilized; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). ++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++ ++ /* ++ * NULL-terminated list of performance domains intersecting with the ++ * CPUs of the rd. Protected by RCU. ++ */ ++ struct perf_domain *pd; ++}; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ raw_spinlock_t *lock; ++ raw_spinlock_t *orig_lock; ++ ++ struct task_struct __rcu *curr; ++ struct task_struct *idle; ++ struct task_struct *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. 
Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ u64 irq_load_update; /* When we last updated IRQ load */ ++ unsigned long irq_load_avg; /* Rolling IRQ load average */ ++#endif ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++ skiplist_node *node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ ++ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ ++ ++ bool is_leader; ++ struct rq *smp_leader; /* First physical CPU per node */ ++#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++ struct sched_avg avg_thermal; ++#endif /* CONFIG_SCHED_THERMAL_PRESSURE */ ++#ifdef CONFIG_SCHED_SMT ++ struct rq *smt_leader; /* First logical CPU in SMT siblings */ ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ struct rq *mc_leader; /* First logical CPU in MC siblings */ ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ /* Ensure that all clocks are in the same cache line */ ++ u64 clock_task ____cacheline_aligned; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++ int rt_nr_running; /* Number real time tasks running */ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_SMP ++ struct llist_head wake_list; ++#endif ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock_task; ++} ++ ++/** ++ * By default the decay is the default pelt decay period. ++ * The decay shift can change the decay period in ++ * multiples of 32. ++ * Decay shift Decay period(ms) ++ * 0 32 ++ * 1 64 ++ * 2 128 ++ * 3 256 ++ * 4 512 ++ */ ++extern int sched_thermal_decay_shift; ++ ++static inline u64 rq_clock_thermal(struct rq *rq) ++{ ++ return rq_clock_task(rq) >> sched_thermal_decay_shift; ++} ++ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(rq->lock, rf->flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(rq->lock, rf->flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ } ++ return rq; ++} ++ ++static 
inline void task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) ++{ ++ rq_unlock(rq); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ return rq; ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. ++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. ++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_WAKEUP 0x01 ++#define ENQUEUE_RESTORE 0x02 ++ ++#ifdef CONFIG_SMP ++#define ENQUEUE_MIGRATED 0x40 ++#else ++#define ENQUEUE_MIGRATED 0x00 ++#endif ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See destroy_sched_domains: call_rcu for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. 
++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. ++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. ++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long max_capacity; /* Max per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[0]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. (Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. 
++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void sched_ttwu_pending(void); ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_rq_online (struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime subtracted ++ * and never move forward.
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++static inline bool sched_stop_runnable(struct rq *rq) ++{ ++ return rq->stop && task_on_rq_queued(rq->stop); ++} ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++static __always_inline ++unsigned int uclamp_rq_util_with(struct rq __maybe_unused *rq, unsigned int util, ++ struct task_struct __maybe_unused *p) ++{ ++ return util; ++} ++ ++static inline bool uclamp_is_used(void) ++{ ++ return false; ++} ++ ++#ifndef arch_scale_freq_tick ++static __always_inline ++void arch_scale_freq_tick(void) ++{ ++} ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ struct rq_flags rf; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern bool sched_can_stop_tick(struct rq *rq); ++extern int __init sched_tick_offload_init(void); ++ ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out of ++ * nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (sched_can_stop_tick(rq)) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++static inline bool rt_rq_is_runnable(struct rq *rt_rq) ++{ ++ return rt_rq->rt_nr_running; ++} ++ ++/** ++ * enum schedutil_type - CPU utilization type ++ * @FREQUENCY_UTIL: Utilization used to select frequency ++ * @ENERGY_UTIL: Utilization used during energy calculation ++ * ++ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time ++ * need to be aggregated differently depending on the usage made of them. This ++ * enum is used within schedutil_freq_util() to differentiate the types of ++ * utilization expected by the callers, and adjust the aggregation accordingly. ++ */ ++enum schedutil_type { ++ FREQUENCY_UTIL, ++ ENERGY_UTIL, ++}; ++ ++#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL ++ ++unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long max, enum schedutil_type type, ++ struct task_struct *p); ++ ++static inline unsigned long cpu_bw_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_cfs(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline unsigned long cpu_util_rt(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->rt_nr_running); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->irq_load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ util *= (max - irq); ++ util /= max; ++ ++ return util; ++ ++} ++#else ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ return util; ++} ++#endif ++#endif ++ ++#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) ++#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) ++ ++DECLARE_STATIC_KEY_FALSE(sched_energy_present); ++ ++static inline bool sched_energy_enabled(void) ++{ ++ return static_branch_unlikely(&sched_energy_present); ++} ++ ++#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ ++ ++#define perf_domain_span(pd) NULL ++static inline bool sched_energy_enabled(void) { return false; } ++ ++#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ if (!(p->flags & PF_KTHREAD)) ++ return false; ++ ++ if (p->nr_cpus_allowed != 1) ++ return false; ++ ++ return true; ++} ++#endif ++ ++void swake_up_all_locked(struct swait_queue_head *q); ++void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++/* pelt.h compat CONFIG_SCHED_THERMAL_PRESSURE impossible with MUQSS */ ++static inline int ++update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) ++{ ++ return 0; ++} ++ ++static inline u64 thermal_load_avg(struct rq *rq) ++{ ++ return 0; ++} ++ ++#endif /* MUQSS_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 7fbaee24c824..15d274af9b1c 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifdef CONFIG_SCHED_MUQSS ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) ++#else ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) ++#endif ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. 
+@@ -211,7 +217,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + struct rq *rq = cpu_rq(cpu); + + if (!uclamp_is_used() && +- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { ++ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { + return max; + } + +@@ -656,7 +662,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + struct task_struct *thread; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), ++#ifdef CONFIG_SCHED_MUQSS ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, ++#endif + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, +diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h +index efbb492bb94c..f0288c32ab17 100644 +--- a/kernel/sched/cpupri.h ++++ b/kernel/sched/cpupri.h +@@ -17,6 +17,7 @@ struct cpupri { + int *cpu_to_pri; + }; + ++#ifndef CONFIG_SCHED_MUQSS + #ifdef CONFIG_SMP + int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask); +@@ -27,3 +28,4 @@ void cpupri_set(struct cpupri *cp, int cpu, int pri); + int cpupri_init(struct cpupri *cp); + void cpupri_cleanup(struct cpupri *cp); + #endif ++#endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index ff9435dee1df..d7bd67204d65 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) + return accounted; + } + +-#ifdef CONFIG_64BIT +-static inline u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- return t->se.sum_exec_runtime; +-} +-#else +-static u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- u64 ns; +- struct rq_flags rf; +- struct rq *rq; +- +- rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; +- task_rq_unlock(rq, t, &rf); +- +- return ns; +-} +-#endif +- + /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. +@@ -658,7 +638,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index b743bf38f08f..c769795d726b 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -361,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_MUQSS + /* + * idle-task scheduling class. 
+ */ +@@ -481,3 +482,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index db3a57675ccf..1f11cefe8d20 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,19 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_MUQSS ++#include "MuQSS.h" ++ ++/* Begin compatibility wrappers for MuQSS/CFS differences */ ++#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->nr_running) ++ ++#else /* CONFIG_SCHED_MUQSS */ ++ ++#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) ++ ++ + #include + + #include +@@ -2546,3 +2559,25 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) + + void swake_up_all_locked(struct swait_queue_head *q); + void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++/* MuQSS compatibility functions */ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return t->se.sum_exec_runtime; ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ u64 ns; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = t->se.sum_exec_runtime; ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 8344757bba6e..d819af35a770 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -450,7 +450,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + struct root_domain *old_rd = NULL; + unsigned long flags; + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_lock_irqsave(rq->lock, flags); ++#else + raw_spin_lock_irqsave(&rq->lock, flags); ++#endif + + if (rq->rd) { + old_rd = rq->rd; +@@ -476,7 +480,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_unlock_irqrestore(rq->lock, flags); ++#else + raw_spin_unlock_irqrestore(&rq->lock, flags); ++#endif + + if (old_rd) + call_rcu(&old_rd->rcu, free_rootdomain); +diff --git a/kernel/skip_list.c b/kernel/skip_list.c +new file mode 100644 +index 000000000000..bf5c6e97e139 +--- /dev/null ++++ b/kernel/skip_list.c +@@ -0,0 +1,148 @@ ++/* ++ Copyright (C) 2011,2016 Con Kolivas. ++ ++ Code based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++The routine randomLevel has been hard-coded to generate random ++levels using p=0.25. It can be easily changed. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++Levels start at zero and go up to MaxLevel (which is equal to ++MaxNumberOfLevels-1). ++ ++The routines defined in this file are: ++ ++init: defines slnode ++ ++new_skiplist: returns a new, empty list ++ ++randomLevel: Returns a random level based on a u64 random seed passed to it. ++In MuQSS, the "niffy" time is used for this purpose. 
++ ++insert(l,key, value): inserts the binding (key, value) into l. This operation ++occurs in O(log n) time. ++ ++delnode(slnode, l, node): deletes any binding of key from the l based on the ++actual node value. This operation occurs in O(k) time where k is the ++number of levels of the node in question (max 8). The original delete ++function occurred in O(log n) time and involved a search. ++ ++MuQSS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. ++ ++*/ ++ ++#include ++#include ++ ++#define MaxNumberOfLevels 8 ++#define MaxLevel (MaxNumberOfLevels - 1) ++ ++void skiplist_init(skiplist_node *slnode) ++{ ++ int i; ++ ++ slnode->key = 0xFFFFFFFFFFFFFFFF; ++ slnode->level = 0; ++ slnode->value = NULL; ++ for (i = 0; i < MaxNumberOfLevels; i++) ++ slnode->next[i] = slnode->prev[i] = slnode; ++} ++ ++skiplist *new_skiplist(skiplist_node *slnode) ++{ ++ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); ++ ++ BUG_ON(!l); ++ l->header = slnode; ++ return l; ++} ++ ++void free_skiplist(skiplist *l) ++{ ++ skiplist_node *p, *q; ++ ++ p = l->header; ++ do { ++ q = p->next[0]; ++ p->next[0]->prev[0] = q->prev[0]; ++ skiplist_node_init(p); ++ p = q; ++ } while (p != l->header); ++ kfree(l); ++} ++ ++void skiplist_node_init(skiplist_node *node) ++{ ++ memset(node, 0, sizeof(skiplist_node)); ++} ++ ++static inline unsigned int randomLevel(const long unsigned int randseed) ++{ ++ return find_first_bit(&randseed, MaxLevel) / 2; ++} ++ ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) ++{ ++ skiplist_node *update[MaxNumberOfLevels]; ++ skiplist_node *p, *q; ++ int k = l->level; ++ ++ p = l->header; ++ do { ++ while (q = p->next[k], q->key <= key) ++ p = q; ++ update[k] = p; ++ } while (--k >= 0); ++ ++ ++l->entries; ++ k = randomLevel(randseed); ++ if (k > l->level) { ++ k = ++l->level; ++ update[k] = l->header; ++ } ++ ++ node->level = k; ++ node->key = key; ++ node->value = value; ++ do { ++ p = update[k]; ++ node->next[k] = p->next[k]; ++ p->next[k] = node; ++ node->prev[k] = p; ++ node->next[k]->prev[k] = node; ++ } while (--k >= 0); ++} ++ ++void skiplist_delete(skiplist *l, skiplist_node *node) ++{ ++ int k, m = node->level; ++ ++ for (k = 0; k <= m; k++) { ++ node->prev[k]->next[k] = node->next[k]; ++ node->next[k]->prev[k] = node->prev[k]; ++ } ++ skiplist_node_init(node); ++ if (m == l->level) { ++ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) ++ m--; ++ l->level = m; ++ } ++ l->entries--; ++} +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 8a176d8727a3..808473f947ee 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,9 +130,19 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; +-#ifdef CONFIG_PRINTK 
++static int __read_mostly one_hundred = 100; ++static int __read_mostly one_thousand = 1000; ++static int zero = 0; ++static int one = 1; ++#ifdef CONFIG_SCHED_MUQSS ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_yield_type; ++#endif ++extern int hrtimer_granularity_us; ++extern int hrtimeout_min_us; ++#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) + static int ten_thousand = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +@@ -288,7 +298,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -305,6 +315,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_MUQSS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -486,6 +497,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_MUQSS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1049,6 +1061,62 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_MUQSS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif ++ { ++ .procname = "hrtimer_granularity_us", ++ .data = &hrtimer_granularity_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, ++ { ++ .procname = "hrtimeout_min_us", ++ .data = &hrtimeout_min_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig +index fcc42353f125..2960cace6719 100644 +--- a/kernel/time/Kconfig ++++ b/kernel/time/Kconfig +@@ -66,6 +66,9 @@ config NO_HZ_COMMON + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + ++config NO_HZ_FULL ++ bool ++ + choice + prompt "Timer tick handling" + default NO_HZ_IDLE if NO_HZ +@@ -87,8 +90,9 @@ config NO_HZ_IDLE + + Most of the time you want to say Y here. + +-config NO_HZ_FULL ++config NO_HZ_FULL_NODEF + bool "Full dynticks system (tickless)" ++ select NO_HZ_FULL + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # We need at least one periodic CPU for timekeeping +@@ -114,6 +118,8 @@ config NO_HZ_FULL + transitions: syscalls, exceptions and interrupts. 
Even when it's + dynamically off. + ++ Not recommended for desktops,laptops, or mobile devices. ++ + Say N. + + endchoice +@@ -123,7 +129,7 @@ config CONTEXT_TRACKING + + config CONTEXT_TRACKING_FORCE + bool "Force context tracking" +- depends on CONTEXT_TRACKING ++ depends on CONTEXT_TRACKING && !SCHED_MUQSS + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to +diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c +index f5490222e134..544c58c29267 100644 +--- a/kernel/time/clockevents.c ++++ b/kernel/time/clockevents.c +@@ -190,8 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev) + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +-/* Limit min_delta to a jiffie */ +-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++int __read_mostly hrtimer_granularity_us = 100; ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index d89da1c7e005..e4f5b4c483a0 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2216,3 +2216,113 @@ int __sched schedule_hrtimeout(ktime_t *expires, + return schedule_hrtimeout_range(expires, 0, mode); + } + EXPORT_SYMBOL_GPL(schedule_hrtimeout); ++ ++/* ++ * As per schedule_hrtimeout but taskes a millisecond value and returns how ++ * many milliseconds are left. ++ */ ++long __sched schedule_msec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ int delta, jiffs; ++ ktime_t expires; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ jiffs = msecs_to_jiffies(timeout); ++ /* ++ * If regular timer resolution is adequate or hrtimer resolution is not ++ * (yet) better than Hz, as would occur during startup, use regular ++ * timers. ++ */ ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) ++ return schedule_timeout(jiffs); ++ ++ delta = (timeout % 1000) * NSEC_PER_MSEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_ms(expires); ++ return timeout < 0 ? 
0 : timeout; ++} ++ ++EXPORT_SYMBOL(schedule_msec_hrtimeout); ++ ++#define USECS_PER_SEC 1000000 ++extern int hrtimer_granularity_us; ++ ++static inline long schedule_usec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ ktime_t expires; ++ int delta; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ return schedule_timeout(usecs_to_jiffies(timeout)); ++ ++ if (timeout < hrtimer_granularity_us) ++ timeout = hrtimer_granularity_us; ++ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_us(expires); ++ return timeout < 0 ? 0 : timeout; ++} ++ ++int __read_mostly hrtimeout_min_us = 500; ++ ++long __sched schedule_min_hrtimeout(void) ++{ ++ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); ++} ++ ++EXPORT_SYMBOL(schedule_min_hrtimeout); ++ ++long __sched schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); ++ ++long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 2fd3b3fa68bf..1202d7fe5d8e 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -236,7 +236,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -855,7 +855,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. 
*/ +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index a5221abb4594..9a9287cb2a37 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1568,7 +1569,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1586,6 +1587,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1655,7 +1659,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +@@ -1892,6 +1896,18 @@ signed long __sched schedule_timeout(signed long timeout) + + expire = timeout + jiffies; + ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ /* ++ * Special case 1 as being a request for the minimum timeout ++ * and use highres timers to timeout after 1ms to workaround ++ * the granularity of low Hz tick timers. ++ */ ++ if (!schedule_min_hrtimeout()) ++ return 0; ++ goto out_timeout; ++ } ++#endif + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); +@@ -1900,10 +1916,10 @@ signed long __sched schedule_timeout(signed long timeout) + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); +- ++out_timeout: + timeout = expire - jiffies; + +- out: ++out: + return timeout < 0 ? 0 : timeout; + } + EXPORT_SYMBOL(schedule_timeout); +@@ -2045,7 +2061,19 @@ void __init init_timers(void) + */ + void msleep(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ /* ++ * Use high resolution timers where the resolution of tick based ++ * timers is inadequate. 
++ */ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs) ++ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); ++ return; ++ } ++ timeout = jiffs + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -2059,7 +2087,15 @@ EXPORT_SYMBOL(msleep); + */ + unsigned long msleep_interruptible(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs && !signal_pending(current)) ++ msecs = schedule_msec_hrtimeout_interruptible(msecs); ++ return msecs; ++ } ++ timeout = jiffs + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index b5e3496cf803..68930e7f4d28 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index a37c87b5aee2..7b1d19e17af9 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -163,7 +163,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 33; + /* + * The total number of pages which are beyond the high watermark within all + * zones. +diff --git a/net/core/pktgen.c b/net/core/pktgen.c +index 08e2811b5274..955fcfdd3c3c 100644 +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); +- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); ++ schedule_msec_hrtimeout_interruptible((msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { +diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c +index 40232a278b1a..d87fae1113aa 100644 +--- a/sound/pci/maestro3.c ++++ b/sound/pci/maestro3.c +@@ -1995,7 +1995,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(0, io + GPIO_DATA); + outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); ++ schedule_msec_hrtimeout_uninterruptible((delay1)); + + outw(GPO_PRIMARY_AC97, io + GPIO_DATA); + udelay(5); +@@ -2003,7 +2003,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); + outw(~0, io + GPIO_MASK); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); ++ schedule_msec_hrtimeout_uninterruptible((delay2)); + + if (! 
snd_m3_try_read_vendor(chip)) + break; +diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c +index f70b9f7e68bb..77b65398ca07 100644 +--- a/sound/soc/codecs/rt5631.c ++++ b/sound/soc/codecs/rt5631.c +@@ -415,7 +415,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + /* config one-bit depop parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); + snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, +@@ -525,7 +525,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + /* config depop sequence parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); +diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c +index fe99584c917f..f1344d532a13 100644 +--- a/sound/soc/codecs/wm8350.c ++++ b/sound/soc/codecs/wm8350.c +@@ -233,10 +233,10 @@ static void wm8350_pga_work(struct work_struct *work) + out2->ramp == WM8350_RAMP_UP) { + /* delay is longer over 0dB as increases are larger */ + if (i >= WM8350_OUTn_0dB) +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (2)); + else +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (1)); + } else + udelay(50); /* doesn't matter if we delay longer */ +@@ -1120,7 +1120,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + (platform->dis_out4 << 6)); + + /* wait for discharge */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + cap_discharge_msecs)); + +@@ -1136,7 +1136,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + WM8350_VBUFEN); + + /* wait for vmid */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_charge_msecs)); + +@@ -1187,7 +1187,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_discharge_msecs)); + +@@ -1205,7 +1205,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + pm1 | WM8350_OUTPUT_DRAIN_EN); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform->drain_msecs)); + + pm1 &= ~WM8350_BIASEN; +diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c +index 271235a69c01..3ec90e1b1eb4 100644 +--- a/sound/soc/codecs/wm8900.c ++++ b/sound/soc/codecs/wm8900.c +@@ -1109,7 +1109,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, + /* Need to let things settle before stopping the clock + * to ensure that restart works, see "Stopping the + * master clock" in the datasheet. 
*/ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_write(component, WM8900_REG_POWER2, + WM8900_REG_POWER2_SYSCLK_ENA); + break; +diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c +index 6497c1ea6228..08fefeca9d82 100644 +--- a/sound/soc/codecs/wm9713.c ++++ b/sound/soc/codecs/wm9713.c +@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, + + /* Gracefully shut down the voice interface. */ + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); + snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); + +@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, + wm9713->pll_in = freq_in; + + /* wait 10ms AC97 link frames for the link to stabilise */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + return 0; + } + +diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c +index e2632841b321..7a445c1a2167 100644 +--- a/sound/soc/soc-dapm.c ++++ b/sound/soc/soc-dapm.c +@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) + static void pop_wait(u32 pop_time) + { + if (pop_time) +- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); ++ schedule_msec_hrtimeout_uninterruptible((pop_time)); + } + + __printf(3, 4) +diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c +index fdbdfb7bce92..fa8e8faf3eb3 100644 +--- a/sound/usb/line6/pcm.c ++++ b/sound/usb/line6/pcm.c +@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, + if (!alive) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } while (--timeout > 0); + if (alive) + dev_err(line6pcm->line6->ifcdev, +diff --git a/kernel/cpu.c b/kernel/cpu.c +index 244d305443773..90b77028233b0 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -1565,7 +1565,11 @@ static struct cpuhp_step cpuhp_hp_states[] = { + [CPUHP_BRINGUP_CPU] = { + .name = "cpu:bringup", + .startup.single = bringup_cpu, ++#ifdef CONFIG_SCHED_MUQSS ++ .teardown.single = NULL, ++#else + .teardown.single = finish_cpu, ++#endif + .cant_stop = true, + }, + /* Final state before CPU kills itself */ diff --git a/linux-tkg/linux-tkg-patches/5.7/0004-glitched-muqss.patch b/linux-tkg/linux-tkg-patches/5.7/0004-glitched-muqss.patch new file mode 100644 index 0000000..2c4837e --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0004-glitched-muqss.patch @@ -0,0 +1,78 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - MuQSS + +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +index 84a1d08d68551..57c3036a68952 100644 +--- a/kernel/sched/MuQSS.c ++++ b/kernel/sched/MuQSS.c +@@ -163,7 +167,11 @@ int sched_interactive __read_mostly = 1; + * are allowed to run five seconds as real time tasks. This is the total over + * all online cpus. + */ ++#ifdef CONFIG_ZENIFY ++int sched_iso_cpu __read_mostly = 25; ++#else + int sched_iso_cpu __read_mostly = 70; ++#endif + + /* + * sched_yield_type - Choose what sort of yield sched_yield will perform. 
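The schedule_msec_hrtimeout_*() conversions in the sound and networking hunks above all share one motivation: schedule_timeout() works in whole jiffies, so short millisecond delays get rounded up to the tick length, which is coarse at low HZ. A rough user-space model of that rounding (an illustration, not kernel code):

#include <stdio.h>

/* Simplified model of msecs_to_jiffies(): round the request up to whole ticks. */
static unsigned long msecs_to_ticks(unsigned int msecs, unsigned int hz)
{
	return ((unsigned long)msecs * hz + 999) / 1000;
}

int main(void)
{
	unsigned int hz[] = { 100, 250, 1000 };
	unsigned int i;

	for (i = 0; i < sizeof(hz) / sizeof(hz[0]); i++)
		printf("HZ=%-4u: a 1 ms sleep needs %lu tick(s), i.e. at least %u ms\n",
		       hz[i], msecs_to_ticks(1, hz[i]), 1000 / hz[i]);
	return 0;
}

The hrtimeout helpers added earlier in this patch sleep for roughly the requested duration regardless of HZ, and fall back to regular tick-based timers once the delay exceeds a few jiffies, where tick granularity no longer matters.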
+ +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,7 @@ + choice + prompt "Timer frequency" + default HZ_100 if SCHED_MUQSS +- default HZ_250_NODEF if !SCHED_MUQSS ++ default HZ_500_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -50,6 +50,20 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500_NODEF ++ bool "500 HZ" ++ help ++ 500 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ ++ config HZ_750_NODEF ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000_NODEF + bool "1000 HZ" + help +@@ -63,6 +70,8 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250_NODEF + default 300 if HZ_300_NODEF ++ default 500 if HZ_500_NODEF ++ default 750 if HZ_750_NODEF + default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + +diff --git a/Makefile b/Makefile +index d4d36c61940b..4a9dfe471f1f 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,7 +15,6 @@ NAME = Kleptomaniac Octopus + + CKVERSION = -ck1 + CKNAME = MuQSS Powered +-EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) + + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. diff --git a/linux-tkg/linux-tkg-patches/5.7/0004-glitched-ondemand-muqss.patch b/linux-tkg/linux-tkg-patches/5.7/0004-glitched-ondemand-muqss.patch new file mode 100644 index 0000000..02933e4 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0004-glitched-ondemand-muqss.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (45) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (45) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux-tkg/linux-tkg-patches/5.7/0005-glitched-ondemand-pds.patch b/linux-tkg/linux-tkg-patches/5.7/0005-glitched-ondemand-pds.patch new file mode 100644 index 0000000..c1929e8 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0005-glitched-ondemand-pds.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (63) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define 
MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux-tkg/linux-tkg-patches/5.7/0005-glitched-pds.patch b/linux-tkg/linux-tkg-patches/5.7/0005-glitched-pds.patch new file mode 100644 index 0000000..23271f5 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0005-glitched-pds.patch @@ -0,0 +1,166 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - PDS + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. + +diff --git a/init/Kconfig b/init/Kconfig +index 11fd9b502d06..e9bc34d3019b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -715,6 +715,7 @@ menu "Scheduler features" + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_PDS + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -948,7 +948,6 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" +- depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
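The two ondemand tweaks above (the MuQSS and PDS variants) lower DEF_FREQUENCY_UP_THRESHOLD and raise DEF_SAMPLING_DOWN_FACTOR, so the governor jumps to the maximum frequency at a lower load and then holds it longer before re-evaluating downwards. A simplified sketch of how those two macros feed the governor's decision (a model for illustration, not the actual cpufreq_ondemand.c logic):

/* up_threshold and sampling_down_factor in a simplified on-demand policy:
 * jump to max when load crosses the threshold, and once at max, skip
 * (factor - 1) evaluation periods before considering a lower speed. */
struct od_model {
	unsigned int up_threshold;		/* e.g. 55 with the PDS tweak */
	unsigned int sampling_down_factor;	/* e.g. 5 */
	unsigned int skip;			/* periods left to hold max */
};

static unsigned int od_next_freq(struct od_model *m, unsigned int load,
				 unsigned int min_khz, unsigned int max_khz)
{
	if (load > m->up_threshold) {
		m->skip = m->sampling_down_factor - 1;
		return max_khz;
	}
	if (m->skip) {			/* keep holding the maximum frequency */
		m->skip--;
		return max_khz;
	}
	/* otherwise scale roughly proportionally to load */
	return min_khz + (max_khz - min_khz) * load / 100;
}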
+diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index b23231bae996..cab4e5c5b38e 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -24,13 +24,13 @@ obj-y += fair.o rt.o deadline.o + obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o +-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + endif + obj-y += loadavg.o clock.o cputime.o + obj-y += idle.o + obj-y += wait.o wait_bit.o swait.o completion.o + obj-$(CONFIG_SMP) += cpupri.o pelt.o + obj-$(CONFIG_SCHEDSTATS) += stats.o ++obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o + +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +index 9281ad164..f09a609cf 100644 +--- a/kernel/sched/pds.c ++++ b/kernel/sched/pds.c +@@ -81,6 +81,18 @@ enum { + NR_CPU_AFFINITY_CHK_LEVEL + }; + ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) \ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ + static inline void print_scheduler_version(void) + { + printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen.\n"); +@@ -6353,7 +6365,10 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) + #ifdef CONFIG_SCHED_DEBUG + void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) +-{} ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} + + void proc_sched_set_task(struct task_struct *p) + {} diff --git a/linux-tkg/linux-tkg-patches/5.7/0005-v5.7_undead-pds099o.patch b/linux-tkg/linux-tkg-patches/5.7/0005-v5.7_undead-pds099o.patch new file mode 100644 index 0000000..59c8d8d --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0005-v5.7_undead-pds099o.patch @@ -0,0 +1,8400 @@ +From 68f1a9541ef3185b1021e8e54d2712c7039418d7 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Tue, 2 Jun 2020 18:55:09 +0200 +Subject: PDS 099o, 5.7 rebase (release/v2) + + +diff --git a/Documentation/scheduler/sched-PDS-mq.txt b/Documentation/scheduler/sched-PDS-mq.txt +new file mode 100644 +index 000000000000..709e86f6487e +--- /dev/null ++++ b/Documentation/scheduler/sched-PDS-mq.txt +@@ -0,0 +1,56 @@ ++ Priority and Deadline based Skiplist multiple queue Scheduler ++ ------------------------------------------------------------- ++ ++CONTENT ++======== ++ ++ 0. Development ++ 1. Overview ++ 1.1 Design goal ++ 1.2 Design summary ++ 2. Design Detail ++ 2.1 Skip list implementation ++ 2.2 Task preempt ++ 2.3 Task policy, priority and deadline ++ 2.4 Task selection ++ 2.5 Run queue balance ++ 2.6 Task migration ++ ++ ++0. Development ++============== ++ ++Priority and Deadline based Skiplist multiple queue scheduler, referred to as ++PDS from here on, is developed upon the enhancement patchset VRQ(Variable Run ++Queue) for BFS(Brain Fuck Scheduler by Con Kolivas). PDS inherits the existing ++design from VRQ and inspired by the introduction of skiplist data structure ++to the scheduler by Con Kolivas. However, PDS is different from MuQSS(Multiple ++Queue Skiplist Scheduler, the successor after BFS) in many ways. ++ ++1. Overview ++=========== ++ ++1.1 Design goal ++--------------- ++ ++PDS is designed to make the cpu process scheduler code to be simple, but while ++efficiency and scalable. 
Be Simple, the scheduler code will be easy to be read ++and the behavious of scheduler will be easy to predict. Be efficiency, the ++scheduler shall be well balance the thoughput performance and task interactivity ++at the same time for different properties the tasks behave. Be scalable, the ++performance of the scheduler should be in good shape with the glowing of ++workload or with the growing of the cpu numbers. ++ ++1.2 Design summary ++------------------ ++ ++PDS is described as a multiple run queues cpu scheduler. Each cpu has its own ++run queue. A heavry customized skiplist is used as the backend data structure ++of the cpu run queue. Tasks in run queue is sorted by priority then virtual ++deadline(simplfy to just deadline from here on). In PDS, balance action among ++run queues are kept as less as possible to reduce the migration cost. Cpumask ++data structure is widely used in cpu affinity checking and cpu preemption/ ++selection to make PDS scalable with increasing cpu number. ++ ++ ++To be continued... +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 2d3f963fd6f1..5f41ead019b1 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1006,6 +1006,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_PDS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. 
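The PDS design notes above describe run queue entries as sorted by priority and then by virtual deadline, and later in this patch task_struct gains a priodl field commented as "8bits prio and 56bits deadline for quick processing". A sketch of what such a packed key makes possible; the bit layout here is assumed purely for illustration, the real encoding is whatever kernel/sched/pds.c uses:

#include <stdint.h>

/* Illustrative packing: priority in the top 8 bits, a truncated deadline in
 * the low 56 bits, so "runs first" reduces to comparing a single u64. */
static inline uint64_t make_priodl(uint8_t prio, uint64_t deadline)
{
	return ((uint64_t)prio << 56) | (deadline & ((1ULL << 56) - 1));
}

static inline int runs_before(uint64_t a_priodl, uint64_t b_priodl)
{
	return a_priodl < b_priodl;	/* lower prio value wins, then earlier deadline */
}

Ordering by one integer is what lets the skip list search boil down to a single comparison per visited node.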
++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index 737ff3b9c2c0..b5bc5a1b6de7 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 82a4d37ddecb..1130e0f5db72 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. + */ +diff --git a/fs/proc/base.c b/fs/proc/base.c +index eb2255e95f62..62b8cedbccb6 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..1a7987c40c80 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_PDS ++#define INIT_TASK_COMM "PDS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif /* !CONFIG_SCHED_PDS */ + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index fed6ba96c527..f03a5ee419a1 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -169,7 +169,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. 
+ */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4418f5cb8324..2b51afac5b06 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -652,9 +653,13 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + struct llist_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_PDS) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -663,6 +668,7 @@ struct task_struct { + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; + ++#ifndef CONFIG_SCHED_PDS + /* + * recent_used_cpu is initially set as the last CPU used by a task + * that wakes affine another task. Waker/wakee relationships can +@@ -671,6 +677,7 @@ struct task_struct { + * used CPU that may be idle. + */ + int recent_used_cpu; ++#endif /* CONFIG_SCHED_PDS */ + int wake_cpu; + #endif + int on_rq; +@@ -680,13 +687,27 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_PDS ++ int time_slice; ++ u64 deadline; ++ /* skip list level */ ++ int sl_level; ++ /* skip list node */ ++ struct skiplist_node sl_node; ++ /* 8bits prio and 56bits deadline for quick processing */ ++ u64 priodl; ++ u64 last_ran; ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* CONFIG_SCHED_PDS */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1306,6 +1327,29 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_PDS ++void cpu_scaling(int cpu); ++void cpu_nonscaling(int cpu); ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++ ++#define task_running_idle(p) ((p)->prio == IDLE_PRIO) ++#else /* CFS */ ++extern int runqueue_is_locked(int cpu); ++static inline void cpu_scaling(int cpu) ++{ ++} ++ ++static inline void cpu_nonscaling(int cpu) ++{ ++} ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++#define iso_task(p) (false) ++#endif /* CONFIG_SCHED_PDS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..a5e5fc2c9170 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,22 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_PDS ++ ++#define __tsk_deadline(p) ((p)->deadline) ++ ++static inline int dl_prio(int prio) ++{ ++ return 1; ++} ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return 1; ++} ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +36,7 @@ static inline 
int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_PDS */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..fba04bb91492 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,7 +20,18 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_PDS ++#define ISO_PRIO (MAX_USER_RT_PRIO) ++ ++#define MAX_RT_PRIO ((MAX_USER_RT_PRIO) + 1) ++ ++#define NORMAL_PRIO (MAX_RT_PRIO) ++#define IDLE_PRIO ((MAX_RT_PRIO) + 1) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* !CONFIG_SCHED_PDS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#endif /* CONFIG_SCHED_PDS */ + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..a96012e6f15e 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_PDS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 38359071236a..90328ccd527f 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -106,7 +106,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..713fedd8034f +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,177 @@ ++/* ++ Copyright (C) 2016 Alfred Chen. ++ ++ Code based on Con Kolivas's skip list implementation for BFS, and ++ which is based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++ ++This file only provides a infrastructure of skip list. ++ ++skiplist_node is embedded into container data structure, to get rid the ++dependency of kmalloc/kfree operation in scheduler code. ++ ++A customized search function should be defined using DEFINE_SKIPLIST_INSERT ++macro and be used for skip list insert operation. ++ ++Random Level is also not defined in this file, instead, it should be customized ++implemented and set to node->level then pass to the customized skiplist_insert ++function. ++ ++Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) ++ ++NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, ++considering that there will be 256 entries to enable the top level when using ++random level p=0.5, and that number is more than enough for a run queue usage ++in a scheduler usage. And it also help to reduce the memory usage of the ++embedded skip list node in task_struct to about 50%. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. 
++ ++BFS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. ++*/ ++#ifndef _LINUX_SKIP_LIST_H ++#define _LINUX_SKIP_LIST_H ++ ++#include ++ ++#define NUM_SKIPLIST_LEVEL (8) ++ ++struct skiplist_node { ++ int level; /* Levels in this node */ ++ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; ++ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; ++}; ++ ++#define SKIPLIST_NODE_INIT(name) { 0,\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ } ++ ++static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ /* only level 0 ->next matters in skiplist_empty()*/ ++ WRITE_ONCE(node->next[0], node); ++} ++ ++/** ++ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header ++ * @node: the skip list node to be inited. ++ */ ++static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ int i; ++ ++ node->level = 0; ++ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { ++ WRITE_ONCE(node->next[i], node); ++ node->prev[i] = node; ++ } ++} ++ ++/** ++ * skiplist_empty - test whether a skip list is empty ++ * @head: the skip list to test. ++ */ ++static inline int skiplist_empty(const struct skiplist_node *head) ++{ ++ return READ_ONCE(head->next[0]) == head; ++} ++ ++/** ++ * skiplist_entry - get the struct for this entry ++ * @ptr: the &struct skiplist_node pointer. ++ * @type: the type of the struct this is embedded in. ++ * @member: the name of the skiplist_node within the struct. ++ */ ++#define skiplist_entry(ptr, type, member) \ ++ container_of(ptr, type, member) ++ ++/** ++ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert ++ * function, which takes two parameters, first one is the header node of the ++ * skip list, second one is the skip list node to be inserted ++ * @func_name: the customized skip list insert function name ++ * @search_func: the search function to be used, which takes two parameters, ++ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list ++ * node to be inserted, the function should return true if search should be ++ * continued, otherwise return false. 
++ * Returns 1 if @node is inserted as the first item of skip list at level zero, ++ * otherwise 0 ++ */ ++#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ ++static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ ++{\ ++ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ ++ struct skiplist_node *p, *q;\ ++ int k = head->level;\ ++\ ++ p = head;\ ++ do {\ ++ while (q = p->next[k], q != head && search_func(q, node))\ ++ p = q;\ ++ update[k] = p;\ ++ } while (--k >= 0);\ ++\ ++ k = node->level;\ ++ if (unlikely(k > head->level)) {\ ++ node->level = k = ++head->level;\ ++ update[k] = head;\ ++ }\ ++\ ++ do {\ ++ p = update[k];\ ++ q = p->next[k];\ ++ node->next[k] = q;\ ++ p->next[k] = node;\ ++ node->prev[k] = p;\ ++ q->prev[k] = node;\ ++ } while (--k >= 0);\ ++\ ++ return (p == head);\ ++} ++ ++/** ++ * skiplist_del_init -- delete skip list node from a skip list and reset it's ++ * init state ++ * @head: the header node of the skip list to be deleted from. ++ * @node: the skip list node to be deleted, the caller need to ensure @node is ++ * in skip list which @head represent. ++ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 ++ */ ++static inline int ++skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) ++{ ++ int l, m = node->level; ++ ++ for (l = 0; l <= m; l++) { ++ node->prev[l]->next[l] = node->next[l]; ++ node->next[l]->prev[l] = node->prev[l]; ++ } ++ if (m == head->level && m > 0) { ++ while (head->next[m] == head && m > 0) ++ m--; ++ head->level = m; ++ } ++ INIT_SKIPLIST_NODE(node); ++ ++ return (node->prev[0] == head); ++} ++#endif /* _LINUX_SKIP_LIST_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..d6d384ddb57d 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -115,7 +115,10 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented in BFS/MuQSSPDS only */ ++ ++#define SCHED_ISO 4 ++ + #define SCHED_IDLE 5 + #define SCHED_DEADLINE 6 + +diff --git a/init/Kconfig b/init/Kconfig +index 74a5ac65644f..e4fd406b58dd 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -61,6 +61,21 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_PDS ++ bool "PDS-mq cpu scheduler" ++ help ++ The Priority and Deadline based Skip list multiple queue CPU ++ Scheduler for excellent interactivity and responsiveness on the ++ desktop and solid scalability on normal hardware and commodity ++ servers. ++ ++ Currently incompatible with the Group CPU scheduler, and RCU TORTURE ++ TEST so these options are disabled. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -777,6 +792,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_PDS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -878,7 +894,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. 
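DEFINE_SKIPLIST_INSERT_FUNC above leaves both the ordering and the random level up to the caller. A sketch of how a user of this header is expected to instantiate it; the names are hypothetical, and the scheduler's real search function lives in kernel/sched/pds.c where it orders nodes by the task's packed priority/deadline key:

#include <linux/skip_list.h>
#include <linux/types.h>

struct demo_item {
	u64 key;
	struct skiplist_node sl_node;
};

/* Keep searching while the node under the iterator should stay in front of
 * the node being inserted. */
static inline bool demo_search_fn(struct skiplist_node *it,
				  struct skiplist_node *node)
{
	return skiplist_entry(it, struct demo_item, sl_node)->key <=
	       skiplist_entry(node, struct demo_item, sl_node)->key;
}

DEFINE_SKIPLIST_INSERT_FUNC(demo_skiplist_insert, demo_search_fn)

/*
 * Usage: set item->sl_node.level to a random level first, then
 *     int was_first = demo_skiplist_insert(&head, &item->sl_node);
 * was_first == 1 means the node became the first entry at level zero.
 */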
+ +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_PDS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1007,6 +1023,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +@@ -1134,6 +1151,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_PDS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index bd403ed3e418..162d3deddd45 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -59,6 +59,126 @@ struct task_struct init_task + __init_task_data + #endif + = { ++#ifdef CONFIG_SCHED_PDS ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ .thread_info = INIT_THREAD_INFO(init_task), ++ .stack_refcount = ATOMIC_INIT(1), ++#endif ++ .state = 0, ++ .stack = init_stack, ++ .usage = ATOMIC_INIT(2), ++ .flags = PF_KTHREAD, ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, /* PDS only */ ++ .policy = SCHED_NORMAL, ++ .cpus_ptr = &init_task.cpus_mask, ++ .cpus_mask = CPU_MASK_ALL, ++ .nr_cpus_allowed= NR_CPUS, ++ .mm = NULL, ++ .active_mm = &init_mm, ++ .restart_block = { ++ .fn = do_no_restart_syscall, ++ }, ++ .sl_level = 0, /* PDS only */ ++ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), /* PDS only */ ++ .time_slice = HZ, /* PDS only */ ++ .tasks = LIST_HEAD_INIT(init_task.tasks), ++#ifdef CONFIG_SMP ++ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ .sched_task_group = &root_task_group, ++#endif ++ .ptraced = LIST_HEAD_INIT(init_task.ptraced), ++ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), ++ .real_parent = &init_task, ++ .parent = &init_task, ++ .children = LIST_HEAD_INIT(init_task.children), ++ .sibling = LIST_HEAD_INIT(init_task.sibling), ++ .group_leader = &init_task, ++ RCU_POINTER_INITIALIZER(real_cred, &init_cred), ++ RCU_POINTER_INITIALIZER(cred, &init_cred), ++ .comm = INIT_TASK_COMM, ++ .thread = INIT_THREAD, ++ .fs = &init_fs, ++ .files = &init_files, ++ .signal = &init_signals, ++ .sighand = &init_sighand, ++ .nsproxy = &init_nsproxy, ++ .pending = { ++ .list = LIST_HEAD_INIT(init_task.pending.list), ++ .signal = {{0}} ++ }, ++ .blocked = {{0}}, ++ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), ++ .journal_info = NULL, ++ INIT_CPU_TIMERS(init_task) ++ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), ++ .timer_slack_ns = 50000, /* 50 usec default slack */ ++ .thread_pid = &init_struct_pid, ++ .thread_group = LIST_HEAD_INIT(init_task.thread_group), ++ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), ++#ifdef CONFIG_AUDITSYSCALL ++ .loginuid = INVALID_UID, ++ .sessionid = AUDIT_SID_UNSET, ++#endif ++#ifdef CONFIG_PERF_EVENTS ++ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), ++ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), ++#endif ++#ifdef CONFIG_PREEMPT_RCU ++ .rcu_read_lock_nesting = 0, ++ .rcu_read_unlock_special.s = 0, ++ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), ++ .rcu_blocked_node = NULL, ++#endif ++#ifdef CONFIG_TASKS_RCU ++ .rcu_tasks_holdout = false, ++ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), ++ .rcu_tasks_idle_cpu = -1, ++#endif ++#ifdef CONFIG_CPUSETS ++ .mems_allowed_seq = 
SEQCNT_ZERO(init_task.mems_allowed_seq), ++#endif ++#ifdef CONFIG_RT_MUTEXES ++ .pi_waiters = RB_ROOT_CACHED, ++ .pi_top_task = NULL, ++#endif ++ INIT_PREV_CPUTIME(init_task) ++#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN ++ .vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount), ++ .vtime.starttime = 0, ++ .vtime.state = VTIME_SYS, ++#endif ++#ifdef CONFIG_NUMA_BALANCING ++ .numa_preferred_nid = -1, ++ .numa_group = NULL, ++ .numa_faults = NULL, ++#endif ++#ifdef CONFIG_KASAN ++ .kasan_depth = 1, ++#endif ++#ifdef CONFIG_TRACE_IRQFLAGS ++ .softirqs_enabled = 1, ++#endif ++#ifdef CONFIG_LOCKDEP ++ .lockdep_recursion = 0, ++#endif ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ .ret_stack = NULL, ++#endif ++#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) ++ .trace_recursion = 0, ++#endif ++#ifdef CONFIG_LIVEPATCH ++ .patch_state = KLP_UNDEFINED, ++#endif ++#ifdef CONFIG_SECURITY ++ .security = NULL, ++#endif ++#else /* CONFIG_SCHED_PDS */ + #ifdef CONFIG_THREAD_INFO_IN_TASK + .thread_info = INIT_THREAD_INFO(init_task), + .stack_refcount = REFCOUNT_INIT(1), +@@ -182,6 +302,7 @@ struct task_struct init_task + #ifdef CONFIG_SECURITY + .security = NULL, + #endif ++#endif /* CONFIG_SCHED_PDS */ + }; + EXPORT_SYMBOL(init_task); + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 729d3a5c772e..10a7c52b90d5 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? +@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_PDS */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index ce2a75bc0ade..f0f864bc1ab9 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..b5de980c7d4e 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef 
CONFIG_SCHED_PDS ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index c9f090d64f00..063d15a1ab8b 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. + */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..8ebe4e33fb5f 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_PDS ++obj-y += pds.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 7fbaee24c824..28377ad56248 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_PDS */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
+@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_PDS + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -916,6 +927,7 @@ static int __init sugov_register(void) + core_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_PDS + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -946,4 +958,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_PDS */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index ff9435dee1df..1377ea3d1b76 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,12 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + ++#ifdef CONFIG_SCHED_PDS ++ index = (task_nice(p) > 0 || task_running_idle(p)) ? CPUTIME_NICE : ++ CPUTIME_USER; ++#else + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++#endif + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +151,11 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ ++#ifdef CONFIG_SCHED_PDS ++ if (task_nice(p) > 0 || task_running_idle(p)) { ++#else + if (task_nice(p) > 0) { ++#endif + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +278,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +288,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -658,7 +667,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index b743bf38f08f..16e5754af1cf 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -361,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * idle-task scheduling class. 
+ */ +@@ -481,3 +482,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +new file mode 100644 +index 000000000000..02d7d5a67c77 +--- /dev/null ++++ b/kernel/sched/pds.c +@@ -0,0 +1,6554 @@ ++/* ++ * kernel/sched/pds.c, was kernel/sched.c ++ * ++ * PDS-mq Core kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ */ ++#include "pds_sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++ ++#define rt_prio(prio) ((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR || \ ++ (policy) == SCHED_ISO) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define idle_policy(policy) ((policy) == SCHED_IDLE) ++#define idleprio_task(p) unlikely(idle_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) ++#define JIFFY_NS (1000000000 / HZ) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen and kept alive artificially by Tk-Glitch.\n"); ++} ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. ++ * Tunable via /proc interface. 
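The *_TO_NS/*_TO_US helpers above deliberately use shifts instead of exact factors, so a "millisecond" is really 2^20 ns (about 4.9% long) and a "microsecond" 2^10 ns (about 2.4% long). A standalone check of what that means for the default rr_interval defined just below and for the 100 us RESCHED_US window (illustrative only):

#include <stdio.h>

#define MS_TO_NS(t)	((t) << 20)	/* ~10^6, actually 1,048,576 */
#define NS_TO_MS(t)	((t) >> 20)
#define US_TO_NS(t)	((t) << 10)	/* ~10^3, actually 1,024 */

int main(void)
{
	printf("MS_TO_NS(4)   = %d ns\n", MS_TO_NS(4));			/* 4,194,304 */
	printf("round trip    = %d ms\n", NS_TO_MS(MS_TO_NS(4)));	/* 4 */
	printf("US_TO_NS(100) = %d ns\n", US_TO_NS(100));		/* 102,400 */
	return 0;
}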
++ */ ++#define SCHED_DEFAULT_RR (4) ++int rr_interval __read_mostly = SCHED_DEFAULT_RR; ++ ++static int __init rr_interval_set(char *str) ++{ ++ u32 rr; ++ ++ pr_info("rr_interval: "); ++ if (kstrtouint(str, 0, &rr)) { ++ pr_cont("using default of %u, unable to parse %s\n", ++ rr_interval, str); ++ return 1; ++ } ++ ++ rr_interval = rr; ++ pr_cont("%d\n", rr_interval); ++ ++ return 1; ++} ++__setup("rr_interval=", rr_interval_set); ++ ++ ++static const u64 sched_prio2deadline[NICE_WIDTH] = { ++/* -20 */ 6291456, 6920601, 7612661, 8373927, 9211319, ++/* -15 */ 10132450, 11145695, 12260264, 13486290, 14834919, ++/* -10 */ 16318410, 17950251, 19745276, 21719803, 23891783, ++/* -5 */ 26280961, 28909057, 31799962, 34979958, 38477953, ++/* 0 */ 42325748, 46558322, 51214154, 56335569, 61969125, ++/* 5 */ 68166037, 74982640, 82480904, 90728994, 99801893, ++/* 10 */ 109782082, 120760290, 132836319, 146119950, 160731945, ++/* 15 */ 176805139, 194485652, 213934217, 235327638, 258860401 ++}; ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++#ifdef CONFIG_SMP ++enum { ++SCHED_RQ_EMPTY = 0, ++SCHED_RQ_IDLE, ++SCHED_RQ_NORMAL_0, ++SCHED_RQ_NORMAL_1, ++SCHED_RQ_NORMAL_2, ++SCHED_RQ_NORMAL_3, ++SCHED_RQ_NORMAL_4, ++SCHED_RQ_NORMAL_5, ++SCHED_RQ_NORMAL_6, ++SCHED_RQ_NORMAL_7, ++SCHED_RQ_ISO, ++SCHED_RQ_RT, ++NR_SCHED_RQ_QUEUED_LEVEL ++}; ++ ++static cpumask_t sched_rq_queued_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_queued_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++static cpumask_t sched_rq_pending_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_pending_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_chk_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_start_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_chk_end_masks); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_PER_CPU(int, sched_sibling_cpu); ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++ ++static cpumask_t sched_cpu_sg_idle_mask ____cacheline_aligned_in_smp; ++ ++#ifdef CONFIG_SMT_NICE ++/* ++ * Preemptible sibling group mask ++ * Which all sibling cpus are running at PRIO_LIMIT or IDLE_PRIO ++ */ ++static cpumask_t sched_cpu_psg_mask ____cacheline_aligned_in_smp; ++/* ++ * SMT supressed mask ++ * When a cpu is running task with NORMAL/ISO/RT policy, its sibling cpu ++ * will be supressed to run IDLE priority task. ++ */ ++static cpumask_t sched_smt_supressed_mask ____cacheline_aligned_in_smp; ++#endif /* CONFIG_SMT_NICE */ ++#endif ++ ++static int sched_rq_prio[NR_CPUS] ____cacheline_aligned; ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
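The sched_prio2deadline[] table above appears to start from MS_TO_NS(6) at nice -20 and grow by roughly 10% per nice level, which is what later yields the "nice 19 gets ~3% relative to nice 0" behaviour described in the deadline comment further down. A standalone sketch that regenerates the values (integer arithmetic; treat it as an approximation of how the table was produced, not a definitive derivation):

#include <stdio.h>

int main(void)
{
	unsigned long long dl = 6ULL << 20;	/* MS_TO_NS(6): the nice -20 entry, 6291456 ns */
	int nice;

	for (nice = -20; nice <= 19; nice++) {
		printf("nice %3d: %llu ns\n", nice, dl);
		dl += dl / 10;			/* ~1.1x per nice level */
	}
	return 0;
}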
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. 
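__task_access_lock() and task_access_lock_irqsave() above follow the same lock-then-revalidate retry pattern as __task_rq_lock()/task_rq_lock(): take the lock the task currently maps to, confirm it still maps there, otherwise drop the lock and try again. A generic userspace sketch of that shape (pthreads; the bucket/item naming is invented purely for illustration):

#include <pthread.h>
#include <stdio.h>

struct bucket { pthread_mutex_t lock; };
struct item   { struct bucket *home; };		/* may be re-pointed by other threads */

static struct bucket *lock_item_home(struct item *it)
{
	for (;;) {
		struct bucket *b = __atomic_load_n(&it->home, __ATOMIC_ACQUIRE);

		pthread_mutex_lock(&b->lock);
		if (b == __atomic_load_n(&it->home, __ATOMIC_RELAXED))
			return b;		/* still the owner: keep the lock held */
		pthread_mutex_unlock(&b->lock);	/* moved underneath us: retry */
	}
}

int main(void)
{
	struct bucket b = { PTHREAD_MUTEX_INITIALIZER };
	struct item it = { &b };
	struct bucket *locked = lock_item_home(&it);

	puts(locked == &b ? "locked the home bucket" : "bug");
	pthread_mutex_unlock(&locked->lock);
	return 0;
}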
++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. ++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. ++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++static inline void update_task_priodl(struct task_struct *p) ++{ ++ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes CPU fairly amongst tasks of the ++ * same nice value, it proportions CPU according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 task_deadline_diff(const struct task_struct *p) ++{ ++ return sched_prio2deadline[TASK_USER_PRIO(p)]; ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return sched_prio2deadline[USER_PRIO(static_prio)]; ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline for non-rt tasks. 
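update_task_priodl() above packs the task's prio into the top byte of a 64-bit key and the deadline, shifted right by 8, below it: a single unsigned comparison then orders tasks by priority first and virtual deadline second (deadlines within 256 ns of each other compare equal because the low byte is shifted away). A quick standalone illustration:

#include <stdio.h>
#include <stdint.h>

static uint64_t priodl(uint64_t prio, uint64_t deadline)
{
	return (prio << 56) | (deadline >> 8);
}

int main(void)
{
	/* A numerically lower prio wins regardless of how the deadlines compare. */
	printf("%d\n", priodl(120, 1ULL << 40) < priodl(121, 0));	/* prints 1 */
	/* Within one prio, the earlier deadline wins. */
	printf("%d\n", priodl(120, 1000) < priodl(120, 2000));		/* prints 1 */
	return 0;
}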
++ */ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ if (p->prio >= NORMAL_PRIO) ++ p->deadline = rq->clock + task_deadline_diff(p); ++ ++ update_task_priodl(p); ++} ++ ++static inline struct task_struct *rq_first_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline struct task_struct *rq_second_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]->next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline int is_second_in_rq(struct task_struct *p, struct rq *rq) ++{ ++ return (p->sl_node.prev[0]->prev[0] == &rq->sl_header); ++} ++ ++static const int task_dl_hash_tbl[] = { ++/* 0 4 8 12 */ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ++/* 16 20 24 28 */ ++ 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7 ++}; ++ ++static inline int ++task_deadline_level(const struct task_struct *p, const struct rq *rq) ++{ ++ u64 delta = (rq->clock + sched_prio2deadline[39] - p->deadline) >> 23; ++ ++ delta = min((size_t)delta, ARRAY_SIZE(task_dl_hash_tbl) - 1); ++ return task_dl_hash_tbl[delta]; ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. 
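The fetch_or() macro above is the standard cmpxchg read-modify-write loop, kept as a macro so it works for any integer type. For reference, the same loop expressed with C11 atomics (illustrative only):

#include <stdatomic.h>
#include <stdio.h>

static unsigned long fetch_or_ulong(_Atomic unsigned long *p, unsigned long mask)
{
	unsigned long old = atomic_load(p);

	/* On failure the current value is stored back into 'old', so the loop
	 * retries with fresh data, exactly like the cmpxchg loop above. */
	while (!atomic_compare_exchange_weak(p, &old, old | mask))
		;
	return old;				/* previous value, as fetch_or() returns */
}

int main(void)
{
	_Atomic unsigned long flags = 0x1;
	unsigned long prev = fetch_or_ulong(&flags, 0x4);

	printf("prev=%#lx now=%#lx\n", prev, (unsigned long)atomic_load(&flags));
	return 0;
}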
++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_SMT_NICE ++static void resched_cpu_if_curr_is(int cpu, int priority) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rcu_read_lock(); ++ ++ if (rcu_dereference(rq->curr)->prio != priority) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ if (!do_raw_spin_trylock(&rq->lock)) ++ goto out; ++ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ if (priority == rq->curr->prio) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ ++ spin_release(&rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&rq->lock); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline bool ++__update_cpumasks_bitmap(int cpu, unsigned long *plevel, unsigned long level, ++ cpumask_t cpumasks[], unsigned long bitmap[]) ++{ ++ if (*plevel == level) ++ return false; ++ ++ cpumask_clear_cpu(cpu, cpumasks + *plevel); ++ if (cpumask_empty(cpumasks + *plevel)) ++ clear_bit(*plevel, bitmap); ++ cpumask_set_cpu(cpu, cpumasks + level); ++ set_bit(level, bitmap); ++ ++ *plevel = level; ++ ++ return true; ++} ++ ++static inline int ++task_running_policy_level(const struct task_struct *p, const struct rq *rq) ++{ ++ int prio = p->prio; ++ ++ if (NORMAL_PRIO == prio) ++ return SCHED_RQ_NORMAL_0 + task_deadline_level(p, rq); ++ ++ if (ISO_PRIO == prio) ++ return SCHED_RQ_ISO; ++ if (prio < MAX_RT_PRIO) ++ return SCHED_RQ_RT; ++ return PRIO_LIMIT - prio; ++} ++ ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) ++{ ++ struct task_struct *p = rq_first_queued_task(rq); ++ ++ if (p->prio != NORMAL_PRIO) ++ return; ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->queued_level, ++ task_running_policy_level(p, rq), ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0]); ++} ++ ++#ifdef CONFIG_SMT_NICE ++static inline void update_sched_cpu_psg_mask(const int cpu) ++{ ++ cpumask_t tmp; ++ ++ cpumask_or(&tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_IDLE]); ++ cpumask_and(&tmp, &tmp, cpu_smt_mask(cpu)); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++ else ++ cpumask_andnot(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++} ++#endif ++ ++static inline void update_sched_rq_queued_masks(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ struct task_struct *p = rq_first_queued_task(rq); ++ unsigned long level; ++#ifdef CONFIG_SCHED_SMT ++ unsigned long last_level = rq->queued_level; ++#endif ++ ++ level = task_running_policy_level(p, rq); ++ sched_rq_prio[cpu] = p->prio; ++ ++ if (!__update_cpumasks_bitmap(cpu, &rq->queued_level, level, ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0])) ++ return; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpu == 
per_cpu(sched_sibling_cpu, cpu)) ++ return; ++ ++ if (SCHED_RQ_EMPTY == last_level) { ++ cpumask_andnot(&sched_cpu_sg_idle_mask, &sched_cpu_sg_idle_mask, ++ cpu_smt_mask(cpu)); ++ } else if (SCHED_RQ_EMPTY == level) { ++ cpumask_t tmp; ++ ++ cpumask_and(&tmp, cpu_smt_mask(cpu), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_cpu_sg_idle_mask); ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (level <= SCHED_RQ_IDLE && last_level > SCHED_RQ_IDLE) { ++ cpumask_clear_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), PRIO_LIMIT); ++ } else if (last_level <= SCHED_RQ_IDLE && level > SCHED_RQ_IDLE) { ++ cpumask_set_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), IDLE_PRIO); ++ } ++#endif /* CONFIG_SMT_NICE */ ++#endif ++} ++ ++static inline void update_sched_rq_pending_masks(struct rq *rq) ++{ ++ unsigned long level; ++ struct task_struct *p = rq_second_queued_task(rq); ++ ++ level = task_running_policy_level(p, rq); ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->pending_level, level, ++ &sched_rq_pending_masks[0], ++ &sched_rq_pending_masks_bitmap[0]); ++} ++ ++#else /* CONFIG_SMP */ ++static inline void update_sched_rq_queued_masks(struct rq *rq) {} ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) {} ++static inline void update_sched_rq_pending_masks(struct rq *rq) {} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Removing from the runqueue. Deleting a task from the skip list is done ++ * via the stored node reference in the task struct and does not require a full ++ * look up. Thus it occurs in O(k) time where k is the "level" of the list the ++ * task was stored at - usually < 4, max 16. ++ * ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running--; ++ ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ sched_info_dequeued(rq, p); ++} ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLE to actually run as ++ * an idle task, we ensure none of the following conditions are met. 
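__update_cpumasks_bitmap() and the update_sched_rq_*_masks() helpers above keep every CPU in exactly one per-level cpumask and maintain a summary bitmap of the non-empty levels, so the CPU-selection code further down only has to scan levels that actually contain runqueues. A toy model of that bookkeeping (plain integers stand in for cpumask_t and the level bitmap; 4 levels, up to 32 CPUs):

#include <stdio.h>

#define NR_LEVELS 4

static unsigned int level_mask[NR_LEVELS];	/* one bit per CPU */
static unsigned int level_bitmap;		/* one bit per non-empty level */

static void move_cpu_to_level(int cpu, int *cur_level, int new_level)
{
	if (*cur_level == new_level)
		return;

	level_mask[*cur_level] &= ~(1u << cpu);
	if (!level_mask[*cur_level])
		level_bitmap &= ~(1u << *cur_level);	/* level just became empty */

	level_mask[new_level] |= 1u << cpu;
	level_bitmap |= 1u << new_level;

	*cur_level = new_level;
}

int main(void)
{
	int cpu0_level = 0;

	level_mask[0] = 1u << 0;	/* cpu0 starts out at level 0 */
	level_bitmap = 1u << 0;

	move_cpu_to_level(0, &cpu0_level, 3);
	printf("level bitmap=%#x, level 3 mask=%#x\n", level_bitmap, level_mask[3]);
	return 0;
}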
++ */ ++static inline bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!freezing(p) && !signal_pending(p) && ++ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); ++} ++ ++/* ++ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip ++ * list node which is used in PDS run queue. ++ * ++ * In current implementation, based on testing, the first 8 bits in microseconds ++ * of niffies are suitable for random level population. ++ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there ++ * should be platform hardware supported instruction(known as ctz/clz) to speed ++ * up this function. ++ * The skiplist level for a task is populated when task is created and doesn't ++ * change in task's life time. When task is being inserted into run queue, this ++ * skiplist level is set to task's sl_node->level, the skiplist insert function ++ * may change it based on current level of the skip lsit. ++ */ ++static inline int pds_skiplist_random_level(const struct task_struct *p) ++{ ++ long unsigned int randseed; ++ ++ /* ++ * 1. Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as a factor of the random seed for skiplist ++ * insertion. ++ * 2. Use address of task structure pointer as another factor of the ++ * random seed for task burst forking scenario. ++ */ ++ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; ++ ++ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); ++} ++ ++/** ++ * pds_skiplist_task_search -- search function used in PDS run queue skip list ++ * node insert operation. ++ * @it: iterator pointer to the node in the skip list ++ * @node: pointer to the skiplist_node to be inserted ++ * ++ * Returns true if key of @it is less or equal to key value of @node, otherwise ++ * false. ++ */ ++static inline bool ++pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) ++{ ++ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= ++ skiplist_entry(node, struct task_struct, sl_node)->priodl); ++} ++ ++/* ++ * Define the skip list insert function for PDS ++ */ ++DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); ++ ++/* ++ * Adding task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running++; ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. 
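pds_skiplist_random_level() above uses find_first_bit() over a pseudo-random seed, which picks level k with probability roughly 2^-(k+1): about half of all tasks stay at level 0 and each further level is half as likely again. A standalone sketch of that distribution; NUM_SKIPLIST_LEVEL is assumed to be 16 here to match the "max 16" note in the dequeue comment above (the real constant lives in include/linux/skip_list.h), and __builtin_ffsl stands in for find_first_bit():

#include <stdio.h>
#include <stdlib.h>

#define NUM_SKIPLIST_LEVEL 16	/* assumed for this sketch, see note above */

static int random_level(unsigned long randseed)
{
	int level = __builtin_ffsl(randseed) - 1;	/* first set bit, or -1 if none */

	if (level < 0 || level > NUM_SKIPLIST_LEVEL - 1)
		level = NUM_SKIPLIST_LEVEL - 1;		/* find_first_bit() saturates too */
	return level;
}

int main(void)
{
	long hist[NUM_SKIPLIST_LEVEL] = { 0 };
	int i, l;

	for (i = 0; i < 1 << 20; i++)
		hist[random_level(random())]++;
	for (l = 0; l < NUM_SKIPLIST_LEVEL; l++)
		printf("level %2d: %ld\n", l, hist[l]);	/* counts roughly halve per level */
	return 0;
}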
++ */ ++ if (p->in_iowait) ++ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ bool b_first, b_second; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); ++ b_second = is_second_in_rq(p, rq); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq) || b_second) ++ update_sched_rq_pending_masks(rq); ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ if (curr->prio == PRIO_LIMIT) ++ resched_curr(rq); ++ ++ if (task_running_idle(p)) ++ return; ++ ++ if (p->priodl < curr->priodl) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * PDS doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. 
++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) ++ __hrtick_restart(rq); ++ else ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. ++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if ((rq->clock - rq->last_tick > HALF_JIFFY_NS) || hrtick_enabled(rq)) ++ return 0; ++ ++ return HALF_JIFFY_NS; ++} ++ ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ return (rq->clock - rq->last_tick > HALF_JIFFY_NS)? 0:HALF_JIFFY_NS; ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ static const int policy_to_prio[] = { ++ NORMAL_PRIO, /* SCHED_NORMAL */ ++ 0, /* SCHED_FIFO */ ++ 0, /* SCHED_RR */ ++ IDLE_PRIO, /* SCHED_BATCH */ ++ ISO_PRIO, /* SCHED_ISO */ ++ IDLE_PRIO /* SCHED_IDLE */ ++ }; ++ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ return policy_to_prio[p->policy]; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = 1; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. 
++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. ++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, &p->cpus_mask)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. ++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ p->on_rq = TASK_ON_RQ_MIGRATING; ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq, p); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). 
*/ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. ++ */ ++ if (task_rq(p) == rq) ++ if (task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/* Enter with rq lock held. We know p is on the local CPU */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! 
We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_mask is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. 
*/ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, &p->cpus_mask) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. ++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ cpumask_t *mask; ++ ++ if (cpumask_test_cpu(cpu, cpumask)) ++ return cpu; ++ ++ mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ ++ return cpu; ++} ++ ++/* ++ * task_preemptible_rq - return the rq which the given task can preempt on ++ * @p: task wants to preempt CPU ++ * @only_preempt_low_policy: indicate only preempt rq running low policy than @p ++ */ ++static inline int ++task_preemptible_rq_idle(struct task_struct *p, cpumask_t *chk_mask) ++{ ++ cpumask_t tmp; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++ ++#ifdef CONFIG_SMT_NICE ++ /* Only ttwu on cpu which is not smt supressed */ ++ if (cpumask_andnot(&tmp, chk_mask, &sched_smt_supressed_mask)) { ++ cpumask_t t; ++ if (cpumask_and(&t, &tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &t); ++ return best_mask_cpu(task_cpu(p), &tmp); ++ } ++#endif ++ ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++static inline int ++task_preemptible_rq(struct task_struct *p, cpumask_t *chk_mask, ++ int preempt_level) ++{ ++ cpumask_t tmp; ++ int level; ++ ++#ifdef CONFIG_SCHED_SMT ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_psg_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#else ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++#endif ++ ++ level = find_first_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL); ++ ++ while (level < preempt_level) { ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[level])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ level = find_next_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL, ++ level + 1); ++ } ++ ++ if (unlikely(SCHED_RQ_RT == level && ++ level == preempt_level && ++ cpumask_and(&tmp, chk_mask, ++ &sched_rq_queued_masks[SCHED_RQ_RT]))) { ++ unsigned int cpu; ++ ++ for_each_cpu (cpu, &tmp) ++ if (p->prio < sched_rq_prio[cpu]) ++ return cpu; ++ } ++ ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask; ++ ++ if (unlikely(!cpumask_and(&chk_mask, &p->cpus_mask, cpu_online_mask))) 
++ return select_fallback_rq(task_cpu(p), p); ++ ++ /* Check IDLE tasks suitable to run normal priority */ ++ if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ p->prio = p->normal_prio; ++ update_task_priodl(p); ++ return task_preemptible_rq_idle(p, &chk_mask); ++ } ++ p->prio = NORMAL_PRIO; ++ update_task_priodl(p); ++ } ++ ++ return task_preemptible_rq(p, &chk_mask, ++ task_running_policy_level(p, this_rq())); ++} ++#else /* CONFIG_SMP */ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** PDS ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. 
Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. ++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ int cpu, success = 0; ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto stat; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. 
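/*
 * Illustrative sketch, not part of the patch: the blocking-wakeup ordering
 * described in the comment above, modelled with C11 atomics and pthreads.
 * The scheduling-out side publishes its last stores with a release store of
 * on_cpu = 0; the waking side spins with an acquire load (the
 * smp_cond_load_acquire() analogue), so everything written before the release
 * is visible once the spin ends.  Thread names and the payload variable are
 * invented for the example.  Build with -pthread.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int on_cpu = 1;
static int task_state;			/* stands in for state saved before sleeping */

static void *cpu_scheduling_out(void *arg)
{
	(void)arg;
	task_state = 42;		/* plain store, ordered by the release below */
	atomic_store_explicit(&on_cpu, 0, memory_order_release);
	return NULL;
}

static void *cpu_waking_up(void *arg)
{
	(void)arg;
	/* smp_cond_load_acquire(&X->on_cpu, !VAL) analogue */
	while (atomic_load_explicit(&on_cpu, memory_order_acquire))
		;
	printf("observed task_state = %d\n", task_state);	/* guaranteed 42 */
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, cpu_scheduling_out, NULL);
	pthread_create(&b, NULL, cpu_waking_up, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}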
++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if (SCHED_ISO == p->policy && ISO_PRIO != p->prio) { ++ p->prio = ISO_PRIO; ++ p->deadline = 0UL; ++ update_task_priodl(p); ++ } ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif ++ ++ rq = cpu_rq(cpu); ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ ++stat: ++ ttwu_stat(p, cpu, wake_flags); ++out: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* Should be reset in fork.c but done here for ease of PDS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = 0; ++ ++ p->sl_level = pds_skiplist_random_level(p); ++ INIT_SKIPLIST_NODE(&p->sl_node); ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. 
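/*
 * Illustrative sketch, not part of the patch: sched_fork() above seeds the new
 * task's skip-list node with pds_skiplist_random_level().  Skip lists keep
 * O(log n) insert/lookup by giving each node a random height, with level k
 * chosen with probability 2^-(k+1).  This is the textbook geometric chooser;
 * how the real PDS helper sources its random bits is not shown here, so the
 * rand()-based coin flips and the 8-level cap are assumptions.
 */
#include <stdio.h>
#include <stdlib.h>

#define SKIPLIST_MAX_LEVEL 8	/* illustrative cap on node height */

static int skiplist_random_level(void)
{
	int level = 0;

	/* flip coins: each extra level is taken with probability 1/2 */
	while (level < SKIPLIST_MAX_LEVEL - 1 && (rand() & 1))
		level++;
	return level;
}

int main(void)
{
	int histogram[SKIPLIST_MAX_LEVEL] = { 0 };

	for (int i = 0; i < 1 << 16; i++)
		histogram[skiplist_random_level()]++;

	/* counts roughly halve at each level: ~32768, ~16384, ~8192, ... */
	for (int i = 0; i < SKIPLIST_MAX_LEVEL; i++)
		printf("level %d: %d nodes\n", i, histogram[i]);
	return 0;
}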
++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, US_TO_NS(rq->curr->time_slice)); ++#endif ++ ++ if (p->time_slice < RESCHED_US) { ++ update_rq_clock(rq); ++ time_slice_expired(p, rq); ++ resched_curr(rq); ++ } else ++ update_task_priodl(p); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. 
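/*
 * Illustrative sketch, not part of the patch: the fork-time slice split done
 * in sched_fork() above.  The parent's remaining slice is halved and the child
 * receives the same half, so the total pending slice in the system is
 * unchanged; a child left with less than RESCHED_US microseconds is treated as
 * already expired (the real code then calls time_slice_expired() and
 * reschedules).  The struct layout and RESCHED_US value here are stand-ins.
 */
#include <stdbool.h>
#include <stdio.h>

#define RESCHED_US 128		/* illustrative threshold, in microseconds */

struct toy_task {
	unsigned int time_slice;	/* microseconds, as in PDS */
	bool need_resched;
};

static void fork_split_timeslice(struct toy_task *parent, struct toy_task *child)
{
	parent->time_slice /= 2;
	child->time_slice = parent->time_slice;

	/* mirror of: if (p->time_slice < RESCHED_US) time_slice_expired(p, rq) */
	if (child->time_slice < RESCHED_US) {
		child->time_slice = 0;
		child->need_resched = true;
	}
}

int main(void)
{
	struct toy_task parent = { .time_slice = 4000 }, child = { 0 };

	fork_split_timeslice(&parent, &child);
	printf("parent %uus, child %uus (total unchanged)\n",
	       parent.time_slice, child.time_slice);
	return 0;
}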
++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_mask can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. 
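/*
 * Illustrative sketch, not part of the patch: the two-phase handling of the
 * "schedstats=" boot parameter above.  The string is parsed before the
 * static-key machinery is usable, so the result is parked in a plain bool and
 * only applied later by init_schedstats().  The plain bool below is a
 * stand-in for the real jump-label static branch.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool sched_schedstats_key;	/* stand-in for the static branch */
static bool early_schedstats;		/* parked value, like __sched_schedstats */

static int setup_schedstats(const char *str)
{
	if (!str)
		return 0;
	if (!strcmp(str, "enable")) {
		early_schedstats = true;
		return 1;
	}
	if (!strcmp(str, "disable")) {
		early_schedstats = false;
		return 1;
	}
	fprintf(stderr, "Unable to parse schedstats=%s\n", str);
	return 0;
}

static void init_schedstats(void)
{
	/* now that "jump labels" work, flip the real switch */
	sched_schedstats_key = early_schedstats;
}

int main(void)
{
	setup_schedstats("enable");	/* early boot: only records the wish */
	init_schedstats();		/* later: actually enables the key */
	printf("schedstats %s\n", sched_schedstats_key ? "on" : "off");
	return 0;
}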
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. 
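/*
 * Illustrative sketch, not part of the patch: the preempt-notifier pattern
 * used above.  Interested parties register a node carrying sched_in/sched_out
 * callbacks; the scheduler walks the list and fires them around every context
 * switch (sched_out from prepare_task_switch(), sched_in from
 * finish_task_switch()).  A plain singly linked list stands in for the
 * kernel's hlist, and all names are invented for the example.
 */
#include <stdio.h>

struct toy_notifier {
	struct toy_notifier *next;
	void (*sched_in)(struct toy_notifier *n, int cpu);
	void (*sched_out)(struct toy_notifier *n);
};

static struct toy_notifier *notifier_head;

static void notifier_register(struct toy_notifier *n)
{
	n->next = notifier_head;
	notifier_head = n;
}

static void fire_sched_out(void)
{
	for (struct toy_notifier *n = notifier_head; n; n = n->next)
		n->sched_out(n);
}

static void fire_sched_in(int cpu)
{
	for (struct toy_notifier *n = notifier_head; n; n = n->next)
		n->sched_in(n, cpu);
}

static void my_in(struct toy_notifier *n, int cpu)
{
	(void)n;
	printf("scheduled in on cpu %d\n", cpu);
}

static void my_out(struct toy_notifier *n)
{
	(void)n;
	printf("scheduled out\n");
}

int main(void)
{
	struct toy_notifier n = { .sched_in = my_in, .sched_out = my_out };

	notifier_register(&n);
	fire_sched_out();
	fire_sched_in(0);
	return 0;
}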
++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. 
++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. 
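/*
 * Illustrative sketch, not part of the patch: the four mm hand-over cases that
 * context_switch() above spells out.  A kernel thread (mm == NULL) borrows the
 * previous task's active_mm and pins it with a grab; switching from a kernel
 * thread back to a user task defers the matching drop to
 * finish_task_switch().  The refcounting below is a toy model; TLB and
 * membarrier details are deliberately left out.
 */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct toy_mm   { int refcount; };
struct toy_task { struct toy_mm *mm, *active_mm; };

static struct toy_mm *deferred_drop;	/* stands in for rq->prev_mm */

static void context_switch_mm(struct toy_task *prev, struct toy_task *next)
{
	if (!next->mm) {			/* to kernel thread */
		next->active_mm = prev->active_mm;
		if (prev->mm)			/* from user: pin the borrowed mm */
			prev->active_mm->refcount++;
		else
			prev->active_mm = NULL;
	} else {				/* to user task */
		/* real code: switch_mm_irqs_off(prev->active_mm, next->mm, next) */
		if (!prev->mm) {		/* from kernel: unpin later */
			deferred_drop = prev->active_mm;
			prev->active_mm = NULL;
		}
	}
}

int main(void)
{
	struct toy_mm mm = { .refcount = 1 };
	struct toy_task user = { .mm = &mm, .active_mm = &mm };
	struct toy_task kthread = { .mm = NULL, .active_mm = NULL };

	context_switch_mm(&user, &kthread);	/* user -> kernel: grab */
	assert(mm.refcount == 2 && kthread.active_mm == &mm);

	context_switch_mm(&kthread, &user);	/* kernel -> user: drop deferred */
	assert(deferred_drop == &mm);
	deferred_drop->refcount--;		/* what finish_task_switch() does */
	printf("mm refcount back to %d\n", mm.refcount);
	return 0;
}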
The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void pds_update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ p->time_slice -= NS_TO_US(ns); ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. 
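/*
 * Illustrative sketch, not part of the patch: the run-time accounting done by
 * pds_update_curr() above.  The delta since last_ran is accumulated in
 * nanoseconds for sched_time, but the remaining time_slice is kept in
 * microseconds so it fits comfortably in 32 bits, as the comment in the patch
 * notes.  NS_TO_US and the field widths below are stand-ins.
 */
#include <inttypes.h>
#include <stdio.h>

#define NS_TO_US(ns)	((ns) / 1000)

struct toy_task {
	uint64_t sched_time;	/* total CPU time, ns */
	uint64_t last_ran;	/* rq->clock_task at last update, ns */
	int32_t  time_slice;	/* remaining slice, us (32-bit on purpose) */
};

static void update_curr(struct toy_task *p, uint64_t clock_task)
{
	uint64_t ns = clock_task - p->last_ran;

	p->sched_time += ns;
	p->time_slice -= (int32_t)NS_TO_US(ns);
	p->last_ran = clock_task;
}

int main(void)
{
	struct toy_task p = { .last_ran = 0, .time_slice = 4000 };

	update_curr(&p, 2500000);	/* ran for 2.5 ms */
	printf("sched_time=%" PRIu64 "ns, time_slice=%dus left\n",
	       p.sched_time, p.time_slice);	/* 2500000ns, 1500us */
	return 0;
}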
++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ pds_update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void pds_scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++ ++ /** ++ * p->time_slice < RESCHED_US. We will modify task_struct under ++ * rq lock as p is rq->curr ++ */ ++ __set_tsk_resched(p); ++} ++ ++#ifdef CONFIG_SMP ++ ++#ifdef CONFIG_SCHED_SMT ++static int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ int cpu; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* ++ * _something_ may have changed the task, double check again ++ */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ (cpu = cpumask_any_and(&p->cpus_mask, &sched_cpu_sg_idle_mask)) < nr_cpu_ids) ++ rq = __migrate_task(rq, p, cpu); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* pds_sg_balance_trigger - trigger slibing group balance for @cpu */ ++static void pds_sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return; ++ curr = rq->curr; ++ if (!is_idle_task(curr) && ++ cpumask_intersects(&curr->cpus_mask, &sched_cpu_sg_idle_mask)) { ++ int active_balance = 0; ++ ++ if (likely(!rq->active_balance)) { ++ rq->active_balance = 1; ++ active_balance = 1; ++ } ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (likely(active_balance)) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ } else ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++/* ++ * pds_sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void pds_sg_balance_check(const struct rq *rq) ++{ ++ cpumask_t chk; ++ int i; ++ ++ /* Only online cpu will do sg balance checking */ ++ if (unlikely(!rq->online)) ++ return; ++ ++ /* Only cpu in slibing idle group will do the checking */ ++ if (!cpumask_test_cpu(cpu_of(rq), &sched_cpu_sg_idle_mask)) ++ return; ++ ++ /* Find potential cpus which can migrate the 
currently running task */ ++ if (!cpumask_andnot(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return; ++ ++ for_each_cpu(i, &chk) { ++ /* skip the cpu which has idle slibing cpu */ ++ if (cpumask_test_cpu(per_cpu(sched_sibling_cpu, i), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ continue; ++ pds_sg_balance_trigger(i); ++ } ++} ++DEFINE_PER_CPU(unsigned long, thermal_pressure); ++ ++void arch_set_thermal_pressure(struct cpumask *cpus, ++ unsigned long th_pressure) ++{ ++ int cpu; ++ ++ for_each_cpu(cpu, cpus) ++ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); ++} ++#endif /* CONFIG_SCHED_SMT */ ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ arch_scale_freq_tick(); ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (!tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ if (cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ if (!is_idle_task(curr)) { ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ delta = rq_clock_task(rq) - curr->last_ran; ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ } ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ calc_load_nohz_remote(rq); ++ ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. 
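/*
 * Illustrative sketch, not part of the patch: the NO_HZ_FULL remote-tick state
 * machine drawn above, modelled with C11 atomics.  sched_tick_start() flips
 * the state to RUNNING with an exchange; each remote tick decrements the state
 * unless it is still RUNNING, so a tick that observes OFFLINING completes the
 * transition to OFFLINE and stops requeueing itself.  The kernel's
 * atomic_fetch_add_unless() is open-coded here with a CAS loop.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { REMOTE_OFFLINE, REMOTE_OFFLINING, REMOTE_RUNNING };

static atomic_int tick_state = REMOTE_OFFLINE;

/* add @delta unless the current value equals @unless; return the old value */
static int fetch_add_unless(atomic_int *v, int delta, int unless)
{
	int old = atomic_load(v);

	while (old != unless &&
	       !atomic_compare_exchange_weak(v, &old, old + delta))
		;
	return old;
}

static void sched_tick_start(void)
{
	atomic_exchange(&tick_state, REMOTE_RUNNING);
}

static bool sched_tick_remote(void)	/* returns "requeue myself?" */
{
	int os = fetch_add_unless(&tick_state, -1, REMOTE_RUNNING);

	return os == REMOTE_RUNNING;
}

int main(void)
{
	sched_tick_start();
	printf("tick requeued: %d\n", sched_tick_remote());	/* 1: still running */

	atomic_store(&tick_state, REMOTE_OFFLINING);		/* CPU going down */
	printf("tick requeued: %d\n", sched_tick_remote());	/* 0: now offline */
	printf("final state: %d\n", atomic_load(&tick_state));	/* 0 == OFFLINE */
	return 0;
}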
But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. 
SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_US) { ++ time_slice_expired(p, rq); ++ if (SCHED_ISO == p->policy && ISO_PRIO == p->prio) { ++ p->prio = NORMAL_PRIO; ++ p->deadline = rq->clock + task_deadline_diff(p); ++ update_task_priodl(p); ++ } ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) ++ requeue_task(p, rq); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, int filter_prio) ++{ ++ struct task_struct *p; ++ int dest_cpu = cpu_of(dest_rq); ++ int nr_migrated = 0; ++ int nr_tries = min((rq->nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION); ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ while (nr_tries && node != &rq->sl_header) { ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ node = node->next[0]; ++ ++ if (task_running(p)) ++ continue; ++ if (p->prio >= filter_prio) ++ break; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) { ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, dest_cpu); ++ enqueue_task(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ /* make a jump */ ++ if (node == &rq->sl_header) ++ break; ++ node = node->next[0]; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int ++take_queued_task_cpumask(struct rq *rq, cpumask_t *chk_mask, int filter_prio) ++{ ++ int src_cpu; ++ ++ for_each_cpu(src_cpu, chk_mask) { ++ int nr_migrated; ++ struct rq *src_rq = cpu_rq(src_cpu); ++ ++ if (!do_raw_spin_trylock(&src_rq->lock)) { ++ if (PRIO_LIMIT == filter_prio) ++ continue; ++ return 0; ++ } ++ spin_acquire(&src_rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ update_rq_clock(src_rq); ++ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, filter_prio))) ++ cpufreq_update_this_cpu(rq, 0); ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated || PRIO_LIMIT != filter_prio) ++ return nr_migrated; ++ } ++ return 0; ++} ++ ++static inline int take_other_rq_task(struct rq *rq, int cpu, int filter_prio) ++{ ++ struct cpumask *affinity_mask, *end; ++ struct cpumask chk; ++ ++ if (PRIO_LIMIT == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++#ifdef CONFIG_SMT_NICE ++ { ++ /* also try to take IDLE priority tasks from smt supressed cpu */ ++ struct cpumask t; ++ if (cpumask_and(&t, &sched_smt_supressed_mask, ++ &sched_rq_queued_masks[SCHED_RQ_IDLE])) ++ cpumask_or(&chk, &chk, &t); ++ } ++#endif ++ } else if (NORMAL_PRIO == filter_prio) { ++ cpumask_or(&chk, &sched_rq_pending_masks[SCHED_RQ_RT], ++ &sched_rq_pending_masks[SCHED_RQ_ISO]); ++ } else if (IDLE_PRIO == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ cpumask_andnot(&chk, &chk, &sched_rq_pending_masks[SCHED_RQ_IDLE]); ++ } else ++ cpumask_copy(&chk, &sched_rq_pending_masks[SCHED_RQ_RT]); ++ ++ if (cpumask_empty(&chk)) ++ return 0; ++ ++ affinity_mask = per_cpu(sched_cpu_llc_start_mask, cpu); ++ 
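/*
 * Illustrative sketch, not part of the patch: the shape of
 * migrate_pending_tasks() above.  The walk visits queued tasks in order with a
 * bounded try budget -- half of nr_running, capped at 32 -- skips the task
 * that is currently running, stops at the priority filter, and moves only
 * tasks whose affinity allows the destination CPU.  A plain singly linked list
 * stands in for the PDS skip list, and the original's double-step "jump" is
 * omitted for brevity.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SCHED_RQ_NR_MIGRATION 32u

struct toy_task {
	struct toy_task *next;
	int prio;
	bool running;
	uint64_t cpus_mask;
};

static unsigned int migrate_pending(struct toy_task **src, struct toy_task **dst,
				    unsigned int nr_running, int dest_cpu,
				    int filter_prio)
{
	unsigned int nr_tries = (nr_running + 1) / 2;
	unsigned int nr_migrated = 0;

	if (nr_tries > SCHED_RQ_NR_MIGRATION)
		nr_tries = SCHED_RQ_NR_MIGRATION;

	for (struct toy_task **pp = src; *pp && nr_tries; ) {
		struct toy_task *p = *pp;

		if (p->running) {		/* cannot move the running task */
			pp = &p->next;
			continue;
		}
		if (p->prio >= filter_prio)	/* the rest are lower priority */
			break;
		nr_tries--;
		if (p->cpus_mask & (1ull << dest_cpu)) {
			*pp = p->next;		/* dequeue from source ... */
			p->next = *dst;		/* ... enqueue on destination */
			*dst = p;
			nr_migrated++;
			continue;
		}
		pp = &p->next;
	}
	return nr_migrated;
}

int main(void)
{
	struct toy_task c = { NULL, 60, false, 0x1 };	/* pinned to CPU 0 */
	struct toy_task b = { &c,   50, false, 0x3 };
	struct toy_task a = { &b,   40, true,  0x3 };	/* currently running */
	struct toy_task *src = &a, *dst = NULL;

	printf("migrated %u task(s) to CPU 1\n",
	       migrate_pending(&src, &dst, 3, 1, 100));	/* moves only b */
	return 0;
}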
end = per_cpu(sched_cpu_affinity_chk_end_masks, cpu); ++ do { ++ struct cpumask tmp; ++ ++ if (cpumask_and(&tmp, &chk, affinity_mask) && ++ take_queued_task_cpumask(rq, &tmp, filter_prio)) ++ return 1; ++ } while (++affinity_mask < end); ++ ++ return 0; ++} ++#endif ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next = rq_first_queued_task(rq); ++ ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_test_cpu(cpu, &sched_smt_supressed_mask)) { ++ if (next->prio >= IDLE_PRIO) { ++ if (rq->online && ++ take_other_rq_task(rq, cpu, IDLE_PRIO)) ++ return rq_first_queued_task(rq); ++ return rq->idle; ++ } ++ } ++#endif ++ ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (take_other_rq_task(rq, cpu, next->prio)) { ++ resched_curr(rq); ++ return rq_first_queued_task(rq); ++ } ++#endif ++ return next; ++} ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ p->last_ran = rq->clock_task; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (p != rq->idle) ++ hrtick_start(rq, US_TO_NS(p->time_slice)); ++#endif ++ /* update rq->dither */ ++ rq->dither = rq_dither(rq); ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. 
Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which PDS doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_deadline(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ set_rq_task(rq, next); ++ ++ if (prev != next) { ++ if (next->prio == PRIO_LIMIT) ++ schedstat_inc(rq->sched_goidle); ++ ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ rq->nr_switches++; ++ ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++#ifdef CONFIG_SCHED_SMT ++ pds_sg_balance_check(rq); ++#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state || tsk_is_pi_blocked(tsk) || ++ signal_pending_state(tsk->state, tsk)) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. 
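/*
 * Illustrative sketch, not part of the patch: how __schedule() above decides
 * which context-switch counter to bump.  The pointer defaults to the
 * involuntary counter (nivcsw); only a non-preempt pass through __schedule()
 * with a non-RUNNING previous state switches it to the voluntary counter
 * (nvcsw), whether or not a pending signal puts the task straight back to
 * TASK_RUNNING.  Names and the signal flag are simplified for the example.
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_task {
	int state;			/* 0 == TASK_RUNNING */
	unsigned long nvcsw, nivcsw;	/* voluntary / involuntary switches */
};

static void account_switch(struct toy_task *prev, bool preempt,
			   bool signal_pending)
{
	unsigned long *switch_count = &prev->nivcsw;

	if (!preempt && prev->state) {
		if (signal_pending)
			prev->state = 0;	/* signal arrived: stay runnable */
		/* else: the real code deactivates (dequeues) the task here */
		switch_count = &prev->nvcsw;	/* counted as voluntary */
	}
	++*switch_count;
}

int main(void)
{
	struct toy_task t = { 0 };

	account_switch(&t, true, false);	/* tick preemption */
	t.state = 1;				/* TASK_INTERRUPTIBLE, say */
	account_switch(&t, false, false);	/* blocking on a wait queue */
	printf("voluntary=%lu involuntary=%lu\n", t.nvcsw, t.nivcsw);
	return 0;
}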
++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. 
++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void ++check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * Trigger changes when task priority/deadline modified. ++ */ ++ if (task_on_rq_queued(p)) { ++ struct task_struct *first; ++ ++ requeue_task(p, rq); ++ ++ /* Resched if first queued task not running and not IDLE */ ++ if ((first = rq_first_queued_task(rq)) != rq->curr && ++ !task_running_idle(first)) ++ resched_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. 
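/*
 * Illustrative sketch, not part of the patch: the boosting rule that
 * rt_mutex_setprio(), documented further below, applies via
 * __rt_effective_prio().  A lock holder's effective priority is the minimum
 * (numerically lower == more important) of its own normal priority and the
 * priority of the top waiter donating to it; when the donor goes away the task
 * drops back to its normal priority.  The priority values are illustrative.
 */
#include <stdio.h>

struct toy_task {
	int normal_prio;	/* lower number == higher priority */
	int prio;		/* effective (possibly boosted) priority */
};

static int rt_effective_prio(const struct toy_task *donor, int prio)
{
	if (donor && donor->prio < prio)
		return donor->prio;
	return prio;
}

static void rt_mutex_setprio(struct toy_task *p, const struct toy_task *donor)
{
	int prio = rt_effective_prio(donor, p->normal_prio);

	if (prio == p->prio)	/* nothing changed: bail early, as in the patch */
		return;
	p->prio = prio;		/* the real code also requeues and reschedules */
}

int main(void)
{
	struct toy_task holder = { .normal_prio = 120, .prio = 120 };
	struct toy_task rt_waiter = { .normal_prio = 10, .prio = 10 };

	rt_mutex_setprio(&holder, &rt_waiter);		/* boost while RT task waits */
	printf("boosted prio: %d\n", holder.prio);	/* 10 */

	rt_mutex_setprio(&holder, NULL);		/* lock released: de-boost */
	printf("restored prio: %d\n", holder.prio);	/* 120 */
	return 0;
}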
++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++ ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ /* rq lock may not held!! 
*/ ++ update_rq_clock(rq); ++ ++ p->static_prio = new_static; ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->deadline -= task_deadline_diff(p); ++ p->deadline += static_deadline_diff(new_static); ++ p->prio = effective_prio(p); ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int level, prio = p->prio - MAX_RT_PRIO; ++ static const int level_to_nice_prio[] = {39, 33, 26, 20, 14, 7, 0, 0}; ++ ++ /* rt tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ preempt_disable(); ++ level = task_deadline_level(p, this_rq()); ++ preempt_enable(); ++ prio += level_to_nice_prio[level]; ++ if (idleprio_task(p)) ++ prio += NICE_WIDTH; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? 
find_task_by_vpid(pid) : current; ++} ++ ++#ifdef CONFIG_SMP ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. 
++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif ++ ++static u64 task_init_deadline(const struct task_struct *p) ++{ ++ return task_rq(p)->clock + task_deadline_diff(p); ++} ++ ++u64 (* task_init_deadline_func_tbl[])(const struct task_struct *p) = { ++ task_init_deadline, /* SCHED_NORMAL */ ++ NULL, /* SCHED_FIFO */ ++ NULL, /* SCHED_RR */ ++ task_init_deadline, /* SCHED_BATCH */ ++ NULL, /* SCHED_ISO */ ++ task_init_deadline /* SCHED_IDLE */ ++}; ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int old_policy = p->policy; ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++ ++ if (old_policy != policy) ++ p->deadline = (task_init_deadline_func_tbl[p->policy])? ++ task_init_deadline_func_tbl[p->policy](p):0ULL; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ update_task_priodl(p); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int ++__sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * PDS supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. ++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. ++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. 
++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. ++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. 
++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. 
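/*
 * Editor's note -- illustrative userspace sketch, not part of this patch:
 * exercises the sched_setscheduler() family of syscalls implemented in this
 * area through the glibc wrappers.  The priority value 10 is an arbitrary
 * example; an RT policy such as SCHED_FIFO needs CAP_SYS_NICE (or a
 * sufficient RLIMIT_RTPRIO), as enforced in __sched_setscheduler() above.
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp = { .sched_priority = 10 };

        if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1) {
                perror("sched_setscheduler");
                return 1;
        }
        printf("now SCHED_FIFO, rt_priority=%d\n", sp.sched_priority);
        return 0;
}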
++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. 
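/*
 * Editor's note -- illustrative userspace sketch, not part of this patch:
 * drives sched_setattr()/sched_getattr() through syscall(2), since glibc has
 * no wrappers.  Only the SCHED_ATTR_SIZE_VER0 fields are declared here; the
 * size handling in sched_copy_attr()/sched_attr_copy_to_user() above accepts
 * the smaller structure.  The struct layout and the SYS_sched_setattr /
 * SYS_sched_getattr constants are assumptions about the mainline uapi, not
 * something defined by this patch.
 */
#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr_v0 {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
};

int main(void)
{
        struct sched_attr_v0 attr = {
                .size         = sizeof(attr),
                .sched_policy = 0,      /* SCHED_NORMAL */
                .sched_nice   = 5,      /* arbitrary example nice value */
        };

        if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1)
                perror("sched_setattr");
        if (syscall(SYS_sched_getattr, 0, &attr, sizeof(attr), 0) == 0)
                printf("policy=%u nice=%d\n", attr.sched_policy, attr.sched_nice);
        return 0;
}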
++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_mask, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_mask); ++ cpumask_and(new_mask, in_mask, cpus_mask); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_mask); ++ if (!cpumask_subset(new_mask, cpus_mask)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_mask to the ++ * cpuset's cpus_mask ++ */ ++ cpumask_copy(new_mask, cpus_mask); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_mask); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
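/*
 * Editor's note -- illustrative userspace sketch, not part of this patch:
 * pins the calling thread to CPU 0 through the sched_setaffinity() path
 * above (cpuset limits and a racing cpuset update are handled by the retry
 * loop in the kernel code).  CPU 0 is just an example target.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);

        if (sched_setaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_setaffinity");
                return 1;
        }
        printf("now restricted to CPU 0\n");
        return 0;
}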
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ if (sched_yield_type > 1) { ++ time_slice_expired(current, rq); ++ requeue_task(current, rq); ++ } ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
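/*
 * Editor's note -- illustrative userspace sketch, not part of this patch:
 * reads the affinity mask back through sys_sched_getaffinity() and yields
 * once via sys_sched_yield().  Under PDS the yield behaviour depends on the
 * sched_yield_type tunable checked in do_sched_yield() above.
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        if (sched_getaffinity(0, sizeof(set), &set) == -1) {
                perror("sched_getaffinity");
                return 1;
        }
        printf("allowed CPUs: %d\n", CPU_COUNT(&set));

        sched_yield();
        return 0;
}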
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In PDS, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(MS_TO_NS(rr_interval)); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
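/*
 * Editor's note -- illustrative userspace sketch, not part of this patch:
 * queries the static priority range and the round-robin interval reported
 * by the syscalls above.  With PDS, sched_rr_get_interval() returns the
 * fixed rr_interval rather than a per-task timeslice.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        printf("SCHED_FIFO priority range: %d..%d\n",
               sched_get_priority_min(SCHED_FIFO),
               sched_get_priority_max(SCHED_FIFO));

        if (sched_rr_get_interval(0, &ts) == 0)
                printf("rr interval: %ld.%09ld s\n",
                       (long)ts.tv_sec, (long)ts.tv_nsec);
        return 0;
}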
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* PDS TODO: should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->deadline = rq_clock(rq) + task_deadline_diff(idle); ++ update_task_priodl(idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. 
++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++static bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. 
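/*
 * Editor's note -- minimal kernel-context sketch, not part of this patch and
 * not buildable on its own: the usual pattern the wake_q helpers above
 * support.  Wakeups are collected while a lock is held and issued after it
 * is dropped; wake_q_add() takes a task reference and wake_up_q() drops it.
 * The waiter structure and list are hypothetical; DEFINE_WAKE_Q() is assumed
 * from include/linux/sched/wake_q.h.
 */
struct my_waiter {
        struct task_struct      *task;
        struct list_head        node;
};

static void wake_all_waiters(spinlock_t *lock, struct list_head *waiters)
{
        DEFINE_WAKE_Q(wake_q);
        struct my_waiter *w;

        spin_lock(lock);
        list_for_each_entry(w, waiters, node)
                wake_q_add(&wake_q, w->task);
        spin_unlock(lock);

        wake_up_q(&wake_q);
}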
++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(), default_cpu = -1; ++ struct cpumask *mask; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { ++ if (!idle_cpu(cpu)) ++ return cpu; ++ default_cpu = cpu; ++ } ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_chk_end_masks, cpu); mask++) ++ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) ++ if (!idle_cpu(i)) ++ return i; ++ ++ if (default_cpu == -1) ++ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ cpu = default_cpu; ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(current != this_rq()->idle); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ finish_arch_post_lock_switch(); ++ } ++ ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ struct skiplist_node *node; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ node = &rq->sl_header; ++ while ((node = node->next[0]) != &rq->sl_header) { ++ int dest_cpu; ++ ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ ++ /* skip the running task */ ++ if (task_running(p)) ++ continue; ++ ++ /* ++ * Rules for changing task_struct::cpus_mask are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. 
++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ node = &rq->sl_header; ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_debug_enabled; ++ ++static int __init sched_debug_setup(char *str) ++{ ++ sched_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_debug_setup); ++ ++static inline bool sched_debug(void) ++{ ++ return sched_debug_enabled; ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++static inline bool sched_debug(void) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++#ifdef CONFIG_SMP ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (!idle_cpu(smp_processor_id()) || need_resched()) ++ return; ++ ++ irq_enter(); ++ irq_exit(); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Topology list, bottom-up. ++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, ++#endif ++ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = ++ default_topology; ++ ++#define for_each_sd_topology(tl) \ ++ for (tl = sched_domain_topology; tl->mask; tl++) ++ ++void set_sched_topology(struct sched_domain_topology_level *tl) ++{ ++ if (WARN_ON_ONCE(sched_smp_initialized)) ++ return; ++ ++ sched_domain_topology = tl; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++int sched_domain_level_max; ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. 
This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ /** ++ * PDS doesn't depend on sched domains, but just keep this api ++ */ ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++/* ++ * sched_numa_find_closest() - given the NUMA topology, find the cpu ++ * closest to @cpu from @cpumask. ++ * cpumask: cpumask to find a cpu from ++ * cpu: cpu to be close to ++ * ++ * returns: cpu, or nr_cpu_ids when nothing found. ++ */ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_start_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[1]); ++ } ++} ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ chk = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ ++#ifdef CONFIG_SCHED_SMT ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { ++ per_cpu(sched_sibling_cpu, cpu) = cpumask_first(chk); ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - smt 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++#endif ++#ifdef CONFIG_SCHED_MC ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) { ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - coregroup 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++ cpumask_complement(chk, cpu_coregroup_mask(cpu)); ++ ++ /** ++ * Set up sd_llc_id per CPU ++ */ ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(cpu_coregroup_mask(cpu)); ++#else ++ per_cpu(sd_llc_id, cpu) = ++ 
cpumask_first(topology_core_cpumask(cpu)); ++ ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++#endif /* NOT CONFIG_SCHED_MC */ ++ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - core 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, topology_core_cpumask(cpu)); ++ ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = chk; ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ cpumask_copy(&sched_rq_queued_masks[SCHED_RQ_EMPTY], cpu_online_mask); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ print_scheduler_version(); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < NR_SCHED_RQ_QUEUED_LEVEL; i++) ++ cpumask_clear(&sched_rq_queued_masks[i]); ++ cpumask_setall(&sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_queued_masks_bitmap); ++ ++ cpumask_setall(&sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_pending_masks_bitmap); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); ++ raw_spin_lock_init(&rq->lock); ++ rq->dither = 0; ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++ rq->queued_level = SCHED_RQ_EMPTY; ++ rq->pending_level = SCHED_RQ_EMPTY; ++#ifdef CONFIG_SCHED_SMT ++ per_cpu(sched_sibling_cpu, i) = i; ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. 
Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef 
CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h +new file mode 100644 +index 000000000000..6c3361f06087 +--- /dev/null ++++ b/kernel/sched/pds_sched.h +@@ -0,0 +1,518 @@ ++#ifndef PDS_SCHED_H ++#define PDS_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. 
++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct __rcu *curr; ++ struct task_struct *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ struct skiplist_node sl_header; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++ struct sched_avg avg_thermal; ++#endif ++ ++ unsigned long queued_level; ++ unsigned long pending_level; ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 clock_task; ++ int dither; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_tick ++static __always_inline ++void arch_scale_freq_tick(void) ++{ ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static 
inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/** ++ * By default the decay is the default pelt decay period. ++ * The decay shift can change the decay period in ++ * multiples of 32. ++ * Decay shift Decay period(ms) ++ * 0 32 ++ * 1 64 ++ * 2 128 ++ * 3 256 ++ * 4 512 ++ */ ++extern int sched_thermal_decay_shift; ++ ++static inline u64 rq_clock_thermal(struct rq *rq) ++{ ++ return rq_clock_task(rq) >> sched_thermal_decay_shift; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : PDS need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. 
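A quick worked example of the decay table above, using only the numbers already given there: since rq_clock_thermal() simply right-shifts rq_clock_task() by sched_thermal_decay_shift, the effective decay period is the 32 ms baseline scaled by a power of two.

    sched_thermal_decay_shift = 2  =>  decay period = 32 ms << 2 = 128 ms
    (every 4 ms of rq_clock_task() advances the thermal clock by only 1 ms)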
++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++ ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) ++{ ++ if (cpu_of(rq) == smp_processor_id()) ++ cpufreq_update_util(rq, flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++void swake_up_all_locked(struct swait_queue_head *q); ++void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++#endif /* PDS_SCHED_H */ +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index b647d04d9c8b..05b6cfd91842 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -250,6 +250,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * sched_entity: + * +@@ -367,6 +368,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_SCHED_THERMAL_PRESSURE + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index eb034d9f024d..a074572f2976 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_PDS + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_SCHED_THERMAL_PRESSURE + int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); +@@ -37,6 +39,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_PDS + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
+@@ -157,9 +160,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_PDS */ + + #else + ++#ifndef CONFIG_SCHED_PDS + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -188,6 +193,7 @@ static inline u64 thermal_load_avg(struct rq *rq) + { + return 0; + } ++#endif + + static inline int + update_irq_load_avg(struct rq *rq, u64 running) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index db3a57675ccf..5a8060bd2343 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_PDS ++#include "pds_sched.h" ++#else ++ + #include + + #include +@@ -2546,3 +2550,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) + + void swake_up_all_locked(struct swait_queue_head *q); + void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++#endif /* !CONFIG_SCHED_PDS */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..45bd43942575 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 8a176d8727a3..b9dde576b576 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,8 +130,12 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly one_thousand = 1000; ++#ifdef CONFIG_SCHED_PDS ++extern int rr_interval; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -288,7 +292,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_PDS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -305,6 +309,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_PDS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -486,6 +491,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_PDS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1049,6 +1055,26 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_PDS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = 
SYSCTL_ONE, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 2fd3b3fa68bf..6f3b08afdd4c 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -236,7 +236,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -806,6 +806,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_PDS + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -813,6 +814,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -840,8 +842,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -855,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. 
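Backing up to the sysctl hunk just above: the two new kern_table entries surface the PDS knobs as /proc/sys/kernel/rr_interval (writes clamped to 1..1000 by the extra1/extra2 bounds) and /proc/sys/kernel/yield_type (clamped to 0..2). The short sketch below is only an illustration of reading them, not part of the patch; the /proc paths follow from kern_table registration, and any meaning beyond the clamps (timeslice length, sched_yield() behaviour) is inferred from the PDS variable names rather than stated in this hunk.

/* Illustration only -- not part of the patch: read the PDS sysctl knobs
 * exported by the kern_table entries above. */
#include <stdio.h>

static long read_knob(const char *path)
{
	FILE *f = fopen(path, "r");
	long val = -1;

	if (f) {
		if (fscanf(f, "%ld", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;
}

int main(void)
{
	/* round-robin interval used by PDS; the table clamps writes to 1..1000 */
	printf("rr_interval = %ld\n", read_knob("/proc/sys/kernel/rr_interval"));
	/* sched_yield() behaviour selector; the table clamps writes to 0..2 */
	printf("yield_type  = %ld\n", read_knob("/proc/sys/kernel/yield_type"));
	return 0;
}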
*/ +@@ -1091,8 +1095,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index b5e3496cf803..0816db0b9c16 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_PDS ++ /* No deadline on BFS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux-tkg/linux-tkg-patches/5.7/0006-add-acs-overrides_iommu.patch b/linux-tkg/linux-tkg-patches/5.7/0006-add-acs-overrides_iommu.patch new file mode 100644 index 0000000..d1303a5 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0006-add-acs-overrides_iommu.patch @@ -0,0 +1,193 @@ +From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 +From: Mark Weiman +Date: Sun, 12 Aug 2018 11:36:21 -0400 +Subject: [PATCH] pci: Enable overrides for missing ACS capabilities + +This an updated version of Alex Williamson's patch from: +https://lkml.org/lkml/2013/5/30/513 + +Original commit message follows: + +PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that +allows us to control whether transactions are allowed to be redirected +in various subnodes of a PCIe topology. For instance, if two +endpoints are below a root port or downsteam switch port, the +downstream port may optionally redirect transactions between the +devices, bypassing upstream devices. The same can happen internally +on multifunction devices. The transaction may never be visible to the +upstream devices. + +One upstream device that we particularly care about is the IOMMU. If +a redirection occurs in the topology below the IOMMU, then the IOMMU +cannot provide isolation between devices. This is why the PCIe spec +encourages topologies to include ACS support. Without it, we have to +assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. + +Unfortunately, far too many topologies do not support ACS to make this +a steadfast requirement. Even the latest chipsets from Intel are only +sporadically supporting ACS. We have trouble getting interconnect +vendors to include the PCIe spec required PCIe capability, let alone +suggested features. + +Therefore, we need to add some flexibility. The pcie_acs_override= +boot option lets users opt-in specific devices or sets of devices to +assume ACS support. The "downstream" option assumes full ACS support +on root ports and downstream switch ports. The "multifunction" +option assumes the subset of ACS features available on multifunction +endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" +option enables ACS support on devices matching the provided vendor +and device IDs, allowing more strategic ACS overrides. These options +may be combined in any order. A maximum of 16 id specific overrides +are available. It's suggested to use the most limited set of options +necessary to avoid completely disabling ACS across the topology. +Note to hardware vendors, we have facilities to permanently quirk +specific devices which enforce isolation but not provide an ACS +capability. 
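To make that syntax concrete, a hypothetical command line that enables every override class at once could look like the following (the vendor/device pair is an invented example, and per the advice above the most limited set of options should be preferred):

    pcie_acs_override=downstream,multifunction,id:10de:1b81

Options are comma separated and may appear in any order; the parsing lives in pcie_acs_override_setup() in the drivers/pci/quirks.c hunk further down.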
Please contact me to have your devices added and save +your customers the hassle of this boot option. + +Signed-off-by: Mark Weiman +--- + .../admin-guide/kernel-parameters.txt | 9 ++ + drivers/pci/quirks.c | 101 ++++++++++++++++++ + 2 files changed, 110 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index aefd358a5ca3..173b3596fd9e 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -3190,6 +3190,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multifunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specific device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index 4700d24e5d55..8f7a3d7fd9c1 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case 
PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. + * The device will throw a Link Down error on AER-capable systems and +@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, + /* Zhaoxin Root/Downstream Ports */ + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + + diff --git a/linux-tkg/linux-tkg-patches/5.7/0007-v5.7-fsync.patch b/linux-tkg/linux-tkg-patches/5.7/0007-v5.7-fsync.patch new file mode 100644 index 0000000..01c86d8 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0007-v5.7-fsync.patch @@ -0,0 +1,908 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 20 Apr 2020 14:09:11 +0200 +Subject: Import Fsync v3 patchset - Squashed from https://gitlab.collabora.com/tonyk/linux/-/commits/futex-proton-v3 + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e2ee527be1e3e11b1117ff5bf94b4..580001e89c6caed57dd8b3cb491d65dce846caff 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,6 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 ++#define FUTEX_WAIT_MULTIPLE 13 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +@@ -40,6 +41,8 @@ + FUTEX_PRIVATE_FLAG) + #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) ++#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ ++ FUTEX_PRIVATE_FLAG) + + /* + * Support for robust futexes: the kernel cleans up held futexes at +@@ -150,4 +153,21 @@ struct robust_list_head { + (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ + | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) + ++/* ++ * Maximum number of multiple futexes to wait for ++ */ ++#define FUTEX_MULTIPLE_MAX_COUNT 128 ++ ++/** ++ * struct futex_wait_block - Block of futexes to be waited for ++ * @uaddr: User address of the futex ++ * @val: Futex value expected by userspace ++ * @bitset: Bitset for the optional bitmasked wakeup ++ */ ++struct futex_wait_block { ++ __u32 __user *uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ + #endif /* _UAPI_LINUX_FUTEX_H */ +diff --git a/kernel/futex.c b/kernel/futex.c +index 0cf84c8664f207c574325b899ef2e57f01295a94..58cf9eb2b851b4858e29b5ef4114a29a92e676ba 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -215,6 +215,8 @@ struct futex_pi_state { + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup ++ * @uaddr: userspace address of futex ++ * @uval: expected futex's value + * + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so + * we can wake only the relevant ones (hashed queues may be shared). 
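Because the hunks in this patch only show the kernel side, here is a minimal userspace sketch of driving the new FUTEX_WAIT_MULTIPLE operation. It is an illustration, not part of the patch: the fallback opcode and struct layout are copied from the include/uapi/linux/futex.h hunk above (note that a later hunk in this same patch renumbers the opcode from 13 to 31, so the value must match the kernel actually built), the raw syscall argument order mirrors the futextest.h wrapper added further down, and an x86_64 build against a patched kernel is assumed.

/*
 * Illustration only -- wait on two futex words at once and report which
 * one was woken.  A FUTEX_WAKE on either word from another process or
 * thread ends the wait; the return value is the index of the awoken
 * futex, as documented in the futex_wait_multiple() kernel-doc added
 * later in this file.
 */
#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

#ifndef FUTEX_WAIT_MULTIPLE
#define FUTEX_WAIT_MULTIPLE 13		/* 31 after the renumbering hunk */
struct futex_wait_block {
	uint32_t *uaddr;
	uint32_t val;
	uint32_t bitset;
};
#endif

static uint32_t f1, f2;			/* two futex words, both start at 0 */

int main(void)
{
	struct futex_wait_block blocks[2] = {
		{ &f1, 0, ~0u },	/* sleep while *uaddr == val */
		{ &f2, 0, ~0u },	/* ~0u: wake on any FUTEX_WAKE bitset */
	};
	/* treated as a relative timeout once the final futex.c hunk applies */
	struct timespec to = { .tv_sec = 1, .tv_nsec = 0 };

	/* uaddr = block array, val = number of blocks, utime = timeout */
	long ret = syscall(SYS_futex, blocks,
			   FUTEX_WAIT_MULTIPLE | FUTEX_PRIVATE_FLAG,
			   2, &to, NULL, 0);
	if (ret >= 0)
		printf("futex index %ld was woken\n", ret);
	else
		printf("wait ended: %s\n", strerror(errno));
	return 0;
}

On an unpatched kernel the call simply returns -1 with ENOSYS, so the sketch is safe to try anywhere.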
+@@ -237,6 +239,8 @@ struct futex_q { + struct rt_mutex_waiter *rt_waiter; + union futex_key *requeue_pi_key; + u32 bitset; ++ u32 __user *uaddr; ++ u32 uval; + } __randomize_layout; + + static const struct futex_q futex_q_init = { +@@ -2420,6 +2424,29 @@ static int unqueue_me(struct futex_q *q) + return ret; + } + ++/** ++ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket ++ * @q: The list of futexes to unqueue ++ * @count: Number of futexes in the list ++ * ++ * Helper to unqueue a list of futexes. This can't fail. ++ * ++ * Return: ++ * - >=0 - Index of the last futex that was awoken; ++ * - -1 - If no futex was awoken ++ */ ++static int unqueue_multiple(struct futex_q *q, int count) ++{ ++ int ret = -1; ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ if (!unqueue_me(&q[i])) ++ ret = i; ++ } ++ return ret; ++} ++ + /* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry +@@ -2783,6 +2810,211 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + return ret; + } + ++/** ++ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes ++ * @qs: The corresponding futex list ++ * @count: The size of the lists ++ * @flags: Futex flags (FLAGS_SHARED, etc.) ++ * @awaken: Index of the last awoken futex ++ * ++ * Prepare multiple futexes in a single step and enqueue them. This may fail if ++ * the futex list is invalid or if any futex was already awoken. On success the ++ * task is ready to interruptible sleep. ++ * ++ * Return: ++ * - 1 - One of the futexes was awaken by another thread ++ * - 0 - Success ++ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL ++ */ ++static int futex_wait_multiple_setup(struct futex_q *qs, int count, ++ unsigned int flags, int *awaken) ++{ ++ struct futex_hash_bucket *hb; ++ int ret, i; ++ u32 uval; ++ ++ /* ++ * Enqueuing multiple futexes is tricky, because we need to ++ * enqueue each futex in the list before dealing with the next ++ * one to avoid deadlocking on the hash bucket. But, before ++ * enqueuing, we need to make sure that current->state is ++ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which ++ * cannot be done before the get_futex_key of the next key, ++ * because it calls get_user_pages, which can sleep. Thus, we ++ * fetch the list of futexes keys in two steps, by first pinning ++ * all the memory keys in the futex key, and only then we read ++ * each key and queue the corresponding futex. ++ */ ++retry: ++ for (i = 0; i < count; i++) { ++ qs[i].key = FUTEX_KEY_INIT; ++ ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, ++ &qs[i].key, FUTEX_READ); ++ if (unlikely(ret)) { ++ for (--i; i >= 0; i--) ++ put_futex_key(&qs[i].key); ++ return ret; ++ } ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ struct futex_q *q = &qs[i]; ++ ++ hb = queue_lock(q); ++ ++ ret = get_futex_value_locked(&uval, q->uaddr); ++ if (ret) { ++ /* ++ * We need to try to handle the fault, which ++ * cannot be done without sleep, so we need to ++ * undo all the work already done, to make sure ++ * we don't miss any wake ups. Therefore, clean ++ * up, handle the fault and retry from the ++ * beginning. ++ */ ++ queue_unlock(hb); ++ ++ /* ++ * Keys 0..(i-1) are implicitly put ++ * on unqueue_multiple. 
++ */ ++ put_futex_key(&q->key); ++ ++ *awaken = unqueue_multiple(qs, i); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * On a real fault, prioritize the error even if ++ * some other futex was awoken. Userspace gave ++ * us a bad address, -EFAULT them. ++ */ ++ ret = get_user(uval, q->uaddr); ++ if (ret) ++ return ret; ++ ++ /* ++ * Even if the page fault was handled, If ++ * something was already awaken, we can safely ++ * give up and succeed to give a hint for userspace to ++ * acquire the right futex faster. ++ */ ++ if (*awaken >= 0) ++ return 1; ++ ++ goto retry; ++ } ++ ++ if (uval != q->uval) { ++ queue_unlock(hb); ++ ++ put_futex_key(&qs[i].key); ++ ++ /* ++ * If something was already awaken, we can ++ * safely ignore the error and succeed. ++ */ ++ *awaken = unqueue_multiple(qs, i); ++ __set_current_state(TASK_RUNNING); ++ if (*awaken >= 0) ++ return 1; ++ ++ return -EWOULDBLOCK; ++ } ++ ++ /* ++ * The bucket lock can't be held while dealing with the ++ * next futex. Queue each futex at this moment so hb can ++ * be unlocked. ++ */ ++ queue_me(&qs[i], hb); ++ } ++ return 0; ++} ++ ++/** ++ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes ++ * @qs: The list of futexes to wait on ++ * @op: Operation code from futex's syscall ++ * @count: The number of objects ++ * @abs_time: Timeout before giving up and returning to userspace ++ * ++ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function ++ * sleeps on a group of futexes and returns on the first futex that ++ * triggered, or after the timeout has elapsed. ++ * ++ * Return: ++ * - >=0 - Hint to the futex that was awoken ++ * - <0 - On error ++ */ ++static int futex_wait_multiple(struct futex_q *qs, int op, ++ u32 count, ktime_t *abs_time) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ int ret, flags = 0, hint = 0; ++ unsigned int i; ++ ++ if (!(op & FUTEX_PRIVATE_FLAG)) ++ flags |= FLAGS_SHARED; ++ ++ if (op & FUTEX_CLOCK_REALTIME) ++ flags |= FLAGS_CLOCKRT; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, 0); ++ while (1) { ++ ret = futex_wait_multiple_setup(qs, count, flags, &hint); ++ if (ret) { ++ if (ret > 0) { ++ /* A futex was awaken during setup */ ++ ret = hint; ++ } ++ break; ++ } ++ ++ if (to) ++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ++ ++ /* ++ * Avoid sleeping if another thread already tried to ++ * wake us. ++ */ ++ for (i = 0; i < count; i++) { ++ if (plist_node_empty(&qs[i].list)) ++ break; ++ } ++ ++ if (i == count && (!to || to->task)) ++ freezable_schedule(); ++ ++ ret = unqueue_multiple(qs, count); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (ret >= 0) ++ break; ++ if (to && !to->task) { ++ ret = -ETIMEDOUT; ++ break; ++ } else if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ break; ++ } ++ /* ++ * The final case is a spurious wakeup, for ++ * which just retry. 
++ */ ++ } ++ ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ ++ return ret; ++} ++ + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) + { +@@ -3907,6 +4139,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + return -ENOSYS; + } + ++/** ++ * futex_read_wait_block - Read an array of futex_wait_block from userspace ++ * @uaddr: Userspace address of the block ++ * @count: Number of blocks to be read ++ * ++ * This function creates and allocate an array of futex_q (we zero it to ++ * initialize the fields) and then, for each futex_wait_block element from ++ * userspace, fill a futex_q element with proper values. ++ */ ++inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) ++{ ++ unsigned int i; ++ struct futex_q *qs; ++ struct futex_wait_block fwb; ++ struct futex_wait_block __user *entry = ++ (struct futex_wait_block __user *)uaddr; ++ ++ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) ++ return ERR_PTR(-EINVAL); ++ ++ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); ++ if (!qs) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { ++ kfree(qs); ++ return ERR_PTR(-EFAULT); ++ } ++ ++ qs[i].uaddr = fwb.uaddr; ++ qs[i].uval = fwb.val; ++ qs[i].bitset = fwb.bitset; ++ } ++ ++ return qs; ++} + + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, +@@ -3919,7 +4188,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; + if (get_timespec64(&ts, utime)) +@@ -3940,6 +4210,25 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (u32) (unsigned long) utime; + ++ if (cmd == FUTEX_WAIT_MULTIPLE) { ++ int ret; ++ struct futex_q *qs; ++ ++#ifdef CONFIG_X86_X32 ++ if (unlikely(in_x32_syscall())) ++ return -ENOSYS; ++#endif ++ qs = futex_read_wait_block(uaddr, val); ++ ++ if (IS_ERR(qs)) ++ return PTR_ERR(qs); ++ ++ ret = futex_wait_multiple(qs, op, val, tp); ++ kfree(qs); ++ ++ return ret; ++ } ++ + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + } + +@@ -4102,6 +4391,57 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + #endif /* CONFIG_COMPAT */ + + #ifdef CONFIG_COMPAT_32BIT_TIME ++/** ++ * struct compat_futex_wait_block - Block of futexes to be waited for ++ * @uaddr: User address of the futex (compatible pointer) ++ * @val: Futex value expected by userspace ++ * @bitset: Bitset for the optional bitmasked wakeup ++ */ ++struct compat_futex_wait_block { ++ compat_uptr_t uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ ++/** ++ * compat_futex_read_wait_block - Read an array of futex_wait_block from ++ * userspace ++ * @uaddr: Userspace address of the block ++ * @count: Number of blocks to be read ++ * ++ * This function does the same as futex_read_wait_block(), except that it ++ * converts the pointer to the futex from the compat version to the regular one. 
++ */ ++inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, ++ u32 count) ++{ ++ unsigned int i; ++ struct futex_q *qs; ++ struct compat_futex_wait_block fwb; ++ struct compat_futex_wait_block __user *entry = ++ (struct compat_futex_wait_block __user *)uaddr; ++ ++ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) ++ return ERR_PTR(-EINVAL); ++ ++ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); ++ if (!qs) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { ++ kfree(qs); ++ return ERR_PTR(-EFAULT); ++ } ++ ++ qs[i].uaddr = compat_ptr(fwb.uaddr); ++ qs[i].uval = fwb.val; ++ qs[i].bitset = fwb.bitset; ++ } ++ ++ return qs; ++} ++ + SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + struct old_timespec32 __user *, utime, u32 __user *, uaddr2, + u32, val3) +@@ -4113,7 +4453,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (get_old_timespec32(&ts, utime)) + return -EFAULT; + if (!timespec64_valid(&ts)) +@@ -4128,6 +4469,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (int) (unsigned long) utime; + ++ if (cmd == FUTEX_WAIT_MULTIPLE) { ++ int ret; ++ struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); ++ ++ if (IS_ERR(qs)) ++ return PTR_ERR(qs); ++ ++ ret = futex_wait_multiple(qs, op, val, tp); ++ kfree(qs); ++ ++ return ret; ++ } ++ + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + } + #endif /* CONFIG_COMPAT_32BIT_TIME */ +diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c +index ee55e6d389a3f053194435342c4e471dc7cf8786..2a63e1c2cfb6407a5988233217cff2e52787bc66 100644 +--- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c ++++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c +@@ -11,6 +11,7 @@ + * + * HISTORY + * 2009-Nov-6: Initial version by Darren Hart ++ * 2019-Dec-13: Add WAIT_MULTIPLE test by Krisman + * + *****************************************************************************/ + +@@ -41,6 +42,8 @@ int main(int argc, char *argv[]) + { + futex_t f1 = FUTEX_INITIALIZER; + struct timespec to; ++ time_t secs; ++ struct futex_wait_block fwb = {&f1, f1, 0}; + int res, ret = RET_PASS; + int c; + +@@ -65,7 +68,7 @@ int main(int argc, char *argv[]) + } + + ksft_print_header(); +- ksft_set_plan(1); ++ ksft_set_plan(2); + ksft_print_msg("%s: Block on a futex and wait for timeout\n", + basename(argv[0])); + ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); +@@ -79,8 +82,39 @@ int main(int argc, char *argv[]) + if (!res || errno != ETIMEDOUT) { + fail("futex_wait returned %d\n", ret < 0 ? 
errno : ret); + ret = RET_FAIL; ++ } else ++ ksft_test_result_pass("futex_wait timeout succeeds\n"); ++ ++ info("Calling futex_wait_multiple on f1: %u @ %p\n", f1, &f1); ++ ++ /* Setup absolute time */ ++ ret = clock_gettime(CLOCK_REALTIME, &to); ++ secs = (to.tv_nsec + timeout_ns) / 1000000000; ++ to.tv_nsec = ((int64_t)to.tv_nsec + timeout_ns) % 1000000000; ++ to.tv_sec += secs; ++ info("to.tv_sec = %ld\n", to.tv_sec); ++ info("to.tv_nsec = %ld\n", to.tv_nsec); ++ ++ res = futex_wait_multiple(&fwb, 1, &to, ++ FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME); ++ ++#ifdef __ILP32__ ++ if (res == -1 && errno == ENOSYS) { ++ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); ++ } else { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; + } ++#else ++ if (!res || errno != ETIMEDOUT) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; ++ } else ++ ksft_test_result_pass("futex_wait_multiple timeout succeeds\n"); ++#endif /* __ILP32__ */ + +- print_result(TEST_NAME, ret); ++ ksft_print_cnts(); + return ret; + } +diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h +index ddbcfc9b7bac4aebb5bac2f249e26ecfd948aa84..bb103bef4557012ef9a389ca74c868e4476a8a31 100644 +--- a/tools/testing/selftests/futex/include/futextest.h ++++ b/tools/testing/selftests/futex/include/futextest.h +@@ -38,6 +38,14 @@ typedef volatile u_int32_t futex_t; + #ifndef FUTEX_CMP_REQUEUE_PI + #define FUTEX_CMP_REQUEUE_PI 12 + #endif ++#ifndef FUTEX_WAIT_MULTIPLE ++#define FUTEX_WAIT_MULTIPLE 13 ++struct futex_wait_block { ++ futex_t *uaddr; ++ futex_t val; ++ __u32 bitset; ++}; ++#endif + #ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE + #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) +@@ -80,6 +88,20 @@ futex_wait(futex_t *uaddr, futex_t val, struct timespec *timeout, int opflags) + return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); + } + ++/** ++ * futex_wait_multiple() - block on several futexes with optional timeout ++ * @fwb: wait block user space address ++ * @count: number of entities at fwb ++ * @timeout: absolute timeout ++ */ ++static inline int ++futex_wait_multiple(struct futex_wait_block *fwb, int count, ++ struct timespec *timeout, int opflags) ++{ ++ return futex(fwb, FUTEX_WAIT_MULTIPLE, count, timeout, NULL, 0, ++ opflags); ++} ++ + /** + * futex_wake() - wake one or more tasks blocked on uaddr + * @nr_wake: wake up to this many tasks +diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +index 0ae390ff816449c88d0bb655a26eb014382c2b4f..bcbac042992d447e0bc9ef5fefe94e875de310f2 100644 +--- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c ++++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +@@ -12,6 +12,7 @@ + * + * HISTORY + * 2009-Nov-14: Initial version by Gowrishankar ++ * 2019-Dec-13: Add WAIT_MULTIPLE test by Krisman + * + *****************************************************************************/ + +@@ -40,6 +41,7 @@ int main(int argc, char *argv[]) + { + struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; + futex_t f1 = FUTEX_INITIALIZER; ++ struct futex_wait_block fwb = {&f1, f1+1, 0}; + int res, ret = RET_PASS; + int c; + +@@ -61,7 +63,7 @@ int main(int argc, char *argv[]) + } + + ksft_print_header(); +- ksft_set_plan(1); ++ 
ksft_set_plan(2); + ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", + basename(argv[0])); + +@@ -71,8 +73,30 @@ int main(int argc, char *argv[]) + fail("futex_wait returned: %d %s\n", + res ? errno : res, res ? strerror(errno) : ""); + ret = RET_FAIL; ++ } else ++ ksft_test_result_pass("futex_wait wouldblock succeeds\n"); ++ ++ info("Calling futex_wait_multiple on f1: %u @ %p with val=%u\n", ++ f1, &f1, f1+1); ++ res = futex_wait_multiple(&fwb, 1, NULL, FUTEX_PRIVATE_FLAG); ++ ++#ifdef __ILP32__ ++ if (res != -1 || errno != ENOSYS) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); ++ } ++#else ++ if (!res || errno != EWOULDBLOCK) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; + } ++ ksft_test_result_pass("futex_wait_multiple wouldblock succeeds\n"); ++#endif /* __ILP32__ */ + +- print_result(TEST_NAME, ret); ++ ksft_print_cnts(); + return ret; + } +diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore +index a09f570619023750f558c84004aff166b4337d72..4660128a545edb04a17cc6bd9760931c1386122f 100644 +--- a/tools/testing/selftests/futex/functional/.gitignore ++++ b/tools/testing/selftests/futex/functional/.gitignore +@@ -5,3 +5,4 @@ futex_wait_private_mapped_file + futex_wait_timeout + futex_wait_uninitialized_heap + futex_wait_wouldblock ++futex_wait_multiple +diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile +index 30996306cabcfe89a47977643e529b122893bb7e..75f9fface11fa3c90c1bdb9a49b3ea51291afd58 100644 +--- a/tools/testing/selftests/futex/functional/Makefile ++++ b/tools/testing/selftests/futex/functional/Makefile +@@ -14,7 +14,8 @@ TEST_GEN_FILES := \ + futex_requeue_pi_signal_restart \ + futex_requeue_pi_mismatched_ops \ + futex_wait_uninitialized_heap \ +- futex_wait_private_mapped_file ++ futex_wait_private_mapped_file \ ++ futex_wait_multiple + + TEST_PROGS := run.sh + +diff --git a/tools/testing/selftests/futex/functional/futex_wait_multiple.c b/tools/testing/selftests/futex/functional/futex_wait_multiple.c +new file mode 100644 +index 0000000000000000000000000000000000000000..b48422e79f42edba1653bb0bd2a4c4fd98d2d48d +--- /dev/null ++++ b/tools/testing/selftests/futex/functional/futex_wait_multiple.c +@@ -0,0 +1,173 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/****************************************************************************** ++ * ++ * Copyright © Collabora, Ltd., 2019 ++ * ++ * DESCRIPTION ++ * Test basic semantics of FUTEX_WAIT_MULTIPLE ++ * ++ * AUTHOR ++ * Gabriel Krisman Bertazi ++ * ++ * HISTORY ++ * 2019-Dec-13: Initial version by Krisman ++ * ++ *****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "futextest.h" ++#include "logging.h" ++ ++#define TEST_NAME "futex-wait-multiple" ++#define timeout_ns 100000 ++#define MAX_COUNT 128 ++#define WAKE_WAIT_US 3000000 ++ ++int ret = RET_PASS; ++char *progname; ++futex_t f[MAX_COUNT] = {0}; ++struct futex_wait_block fwb[MAX_COUNT]; ++ ++void usage(char *prog) ++{ ++ printf("Usage: %s\n", prog); ++ printf(" -c Use color\n"); ++ printf(" -h Display this help message\n"); ++ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", ++ VQUIET, 
VCRITICAL, VINFO); ++} ++ ++void test_count_overflow(void) ++{ ++ futex_t f = FUTEX_INITIALIZER; ++ struct futex_wait_block fwb[MAX_COUNT+1]; ++ int res, i; ++ ++ ksft_print_msg("%s: Test a too big number of futexes\n", progname); ++ ++ for (i = 0; i < MAX_COUNT+1; i++) { ++ fwb[i].uaddr = &f; ++ fwb[i].val = f; ++ fwb[i].bitset = 0; ++ } ++ ++ res = futex_wait_multiple(fwb, MAX_COUNT+1, NULL, FUTEX_PRIVATE_FLAG); ++ ++#ifdef __ILP32__ ++ if (res != -1 || errno != ENOSYS) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); ++ } ++#else ++ if (res != -1 || errno != EINVAL) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_wait_multiple count overflow succeed\n"); ++ } ++ ++#endif /* __ILP32__ */ ++} ++ ++void *waiterfn(void *arg) ++{ ++ int res; ++ ++ res = futex_wait_multiple(fwb, MAX_COUNT, NULL, FUTEX_PRIVATE_FLAG); ++ ++#ifdef __ILP32__ ++ if (res != -1 || errno != ENOSYS) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); ++ } ++#else ++ if (res < 0) ++ ksft_print_msg("waiter failed %d\n", res); ++ ++ info("futex_wait_multiple: Got hint futex %d was freed\n", res); ++#endif /* __ILP32__ */ ++ ++ return NULL; ++} ++ ++void test_fwb_wakeup(void) ++{ ++ int res, i; ++ pthread_t waiter; ++ ++ ksft_print_msg("%s: Test wake up in a list of futex\n", progname); ++ ++ for (i = 0; i < MAX_COUNT; i++) { ++ fwb[i].uaddr = &f[i]; ++ fwb[i].val = f[i]; ++ fwb[i].bitset = 0xffffffff; ++ } ++ ++ res = pthread_create(&waiter, NULL, waiterfn, NULL); ++ if (res) { ++ ksft_test_result_fail("Creating waiting thread failed"); ++ ksft_exit_fail(); ++ } ++ ++ usleep(WAKE_WAIT_US); ++ res = futex_wake(&(f[MAX_COUNT-1]), 1, FUTEX_PRIVATE_FLAG); ++ if (res != 1) { ++ ksft_test_result_fail("Failed to wake thread res=%d\n", res); ++ ksft_exit_fail(); ++ } ++ ++ pthread_join(waiter, NULL); ++ ksft_test_result_pass("%s succeed\n", __func__); ++} ++ ++int main(int argc, char *argv[]) ++{ ++ int c; ++ ++ while ((c = getopt(argc, argv, "cht:v:")) != -1) { ++ switch (c) { ++ case 'c': ++ log_color(1); ++ break; ++ case 'h': ++ usage(basename(argv[0])); ++ exit(0); ++ case 'v': ++ log_verbosity(atoi(optarg)); ++ break; ++ default: ++ usage(basename(argv[0])); ++ exit(1); ++ } ++ } ++ ++ progname = basename(argv[0]); ++ ++ ksft_print_header(); ++ ksft_set_plan(2); ++ ++ test_count_overflow(); ++ ++#ifdef __ILP32__ ++ // if it's a 32x binary, there's no futex to wakeup ++ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); ++#else ++ test_fwb_wakeup(); ++#endif /* __ILP32__ */ ++ ++ ksft_print_cnts(); ++ return ret; ++} +diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh +index 1acb6ace1680e8f3d6b3ee2dc528c19ddfdb018e..a8be94f28ff78b4879d2d19bca5d9b0fcb26c1f8 100755 +--- a/tools/testing/selftests/futex/functional/run.sh ++++ b/tools/testing/selftests/futex/functional/run.sh +@@ -73,3 +73,6 @@ echo + echo + ./futex_wait_uninitialized_heap $COLOR + ./futex_wait_private_mapped_file $COLOR ++ ++echo ++./futex_wait_multiple $COLOR +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index 
580001e89c6caed57dd8b3cb491d65dce846caff..a3e760886b8e7e74285fdcf2caaaa6f66ad16675 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,7 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 +-#define FUTEX_WAIT_MULTIPLE 13 ++#define FUTEX_WAIT_MULTIPLE 31 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +diff --git a/kernel/futex.c b/kernel/futex.c +index 58cf9eb2b851b4858e29b5ef4114a29a92e676ba..e0bb628a5e1988dcc9ae5442a4259edc229d578d 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -4198,7 +4198,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +@@ -4399,6 +4399,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + */ + struct compat_futex_wait_block { + compat_uptr_t uaddr; ++ __u32 pad; + __u32 val; + __u32 bitset; + }; +@@ -4461,7 +4462,7 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } diff --git a/linux-tkg/linux-tkg-patches/5.7/0008-5.7-bcachefs.patch b/linux-tkg/linux-tkg-patches/5.7/0008-5.7-bcachefs.patch new file mode 100644 index 0000000..4ca0a38 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0008-5.7-bcachefs.patch @@ -0,0 +1,71085 @@ +diff --git a/block/bio.c b/block/bio.c +index 21cbaa6a1c20..8d236b819612 100644 +--- a/block/bio.c ++++ b/block/bio.c +@@ -1049,6 +1049,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) + bio_set_flag(bio, BIO_NO_PAGE_REF); + return bio->bi_vcnt ? 0 : ret; + } ++EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); + + static void submit_bio_wait_endio(struct bio *bio) + { +@@ -1243,6 +1244,7 @@ void bio_set_pages_dirty(struct bio *bio) + set_page_dirty_lock(bvec->bv_page); + } + } ++EXPORT_SYMBOL_GPL(bio_set_pages_dirty); + + /* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. +@@ -1302,6 +1304,7 @@ void bio_check_pages_dirty(struct bio *bio) + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); + } ++EXPORT_SYMBOL_GPL(bio_check_pages_dirty); + + void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) + { +diff --git a/block/blk-core.c b/block/blk-core.c +index 9bfaee050c82..60a1a2907abf 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -210,18 +210,23 @@ int blk_status_to_errno(blk_status_t status) + } + EXPORT_SYMBOL_GPL(blk_status_to_errno); + +-static void print_req_error(struct request *req, blk_status_t status, +- const char *caller) ++const char *blk_status_to_str(blk_status_t status) + { + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) +- return; ++ return "(invalid error)"; ++ return blk_errors[idx].name; ++} ++EXPORT_SYMBOL_GPL(blk_status_to_str); + ++static void print_req_error(struct request *req, blk_status_t status, ++ const char *caller) ++{ + printk_ratelimited(KERN_ERR + "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "phys_seg %u prio class %u\n", +- caller, blk_errors[idx].name, ++ caller, blk_status_to_str(status), + req->rq_disk ? 
req->rq_disk->disk_name : "?", + blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), + req->cmd_flags & ~REQ_OP_MASK, +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index 6dfa653d30db..6b256291b924 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -3,6 +3,7 @@ + config BCACHE + tristate "Block device as cache" + select CRC64 ++ select CLOSURES + help + Allows a block device to be used as cache for other devices; uses + a btree for indexing and the layout is optimized for SSDs. +@@ -17,12 +18,3 @@ config BCACHE_DEBUG + + Enables extra debugging tools, allows expensive runtime checks to be + turned on. +- +-config BCACHE_CLOSURES_DEBUG +- bool "Debug closures" +- depends on BCACHE +- select DEBUG_FS +- help +- Keeps all active closures in a linked list and provides a debugfs +- interface to list them, which makes it possible to see asynchronous +- operations that get stuck. +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index fd714628da6a..0fb1b6009da3 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -2,6 +2,6 @@ + + obj-$(CONFIG_BCACHE) += bcache.o + +-bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ +- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ ++bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ ++ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 74a9849ea164..e03597696920 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -180,6 +180,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -192,7 +193,6 @@ + + #include "bset.h" + #include "util.h" +-#include "closure.h" + + struct bucket { + atomic_t pin; +diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c +deleted file mode 100644 +index 0164a1fe94a9..000000000000 +--- a/drivers/md/bcache/closure.c ++++ /dev/null +@@ -1,217 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * Asynchronous refcounty things +- * +- * Copyright 2010, 2011 Kent Overstreet +- * Copyright 2012 Google, Inc. 
+- */ +- +-#include +-#include +-#include +-#include +- +-#include "closure.h" +- +-static inline void closure_put_after_sub(struct closure *cl, int flags) +-{ +- int r = flags & CLOSURE_REMAINING_MASK; +- +- BUG_ON(flags & CLOSURE_GUARD_MASK); +- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); +- +- if (!r) { +- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { +- atomic_set(&cl->remaining, +- CLOSURE_REMAINING_INITIALIZER); +- closure_queue(cl); +- } else { +- struct closure *parent = cl->parent; +- closure_fn *destructor = cl->fn; +- +- closure_debug_destroy(cl); +- +- if (destructor) +- destructor(cl); +- +- if (parent) +- closure_put(parent); +- } +- } +-} +- +-/* For clearing flags with the same atomic op as a put */ +-void closure_sub(struct closure *cl, int v) +-{ +- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); +-} +- +-/* +- * closure_put - decrement a closure's refcount +- */ +-void closure_put(struct closure *cl) +-{ +- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); +-} +- +-/* +- * closure_wake_up - wake up all closures on a wait list, without memory barrier +- */ +-void __closure_wake_up(struct closure_waitlist *wait_list) +-{ +- struct llist_node *list; +- struct closure *cl, *t; +- struct llist_node *reverse = NULL; +- +- list = llist_del_all(&wait_list->list); +- +- /* We first reverse the list to preserve FIFO ordering and fairness */ +- reverse = llist_reverse_order(list); +- +- /* Then do the wakeups */ +- llist_for_each_entry_safe(cl, t, reverse, list) { +- closure_set_waiting(cl, 0); +- closure_sub(cl, CLOSURE_WAITING + 1); +- } +-} +- +-/** +- * closure_wait - add a closure to a waitlist +- * @waitlist: will own a ref on @cl, which will be released when +- * closure_wake_up() is called on @waitlist. +- * @cl: closure pointer. 
+- * +- */ +-bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +-{ +- if (atomic_read(&cl->remaining) & CLOSURE_WAITING) +- return false; +- +- closure_set_waiting(cl, _RET_IP_); +- atomic_add(CLOSURE_WAITING + 1, &cl->remaining); +- llist_add(&cl->list, &waitlist->list); +- +- return true; +-} +- +-struct closure_syncer { +- struct task_struct *task; +- int done; +-}; +- +-static void closure_sync_fn(struct closure *cl) +-{ +- struct closure_syncer *s = cl->s; +- struct task_struct *p; +- +- rcu_read_lock(); +- p = READ_ONCE(s->task); +- s->done = 1; +- wake_up_process(p); +- rcu_read_unlock(); +-} +- +-void __sched __closure_sync(struct closure *cl) +-{ +- struct closure_syncer s = { .task = current }; +- +- cl->s = &s; +- continue_at(cl, closure_sync_fn, NULL); +- +- while (1) { +- set_current_state(TASK_UNINTERRUPTIBLE); +- if (s.done) +- break; +- schedule(); +- } +- +- __set_current_state(TASK_RUNNING); +-} +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- +-static LIST_HEAD(closure_list); +-static DEFINE_SPINLOCK(closure_list_lock); +- +-void closure_debug_create(struct closure *cl) +-{ +- unsigned long flags; +- +- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); +- cl->magic = CLOSURE_MAGIC_ALIVE; +- +- spin_lock_irqsave(&closure_list_lock, flags); +- list_add(&cl->all, &closure_list); +- spin_unlock_irqrestore(&closure_list_lock, flags); +-} +- +-void closure_debug_destroy(struct closure *cl) +-{ +- unsigned long flags; +- +- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); +- cl->magic = CLOSURE_MAGIC_DEAD; +- +- spin_lock_irqsave(&closure_list_lock, flags); +- list_del(&cl->all); +- spin_unlock_irqrestore(&closure_list_lock, flags); +-} +- +-static struct dentry *closure_debug; +- +-static int debug_seq_show(struct seq_file *f, void *data) +-{ +- struct closure *cl; +- +- spin_lock_irq(&closure_list_lock); +- +- list_for_each_entry(cl, &closure_list, all) { +- int r = atomic_read(&cl->remaining); +- +- seq_printf(f, "%p: %pS -> %pS p %p r %i ", +- cl, (void *) cl->ip, cl->fn, cl->parent, +- r & CLOSURE_REMAINING_MASK); +- +- seq_printf(f, "%s%s\n", +- test_bit(WORK_STRUCT_PENDING_BIT, +- work_data_bits(&cl->work)) ? "Q" : "", +- r & CLOSURE_RUNNING ? "R" : ""); +- +- if (r & CLOSURE_WAITING) +- seq_printf(f, " W %pS\n", +- (void *) cl->waiting_on); +- +- seq_printf(f, "\n"); +- } +- +- spin_unlock_irq(&closure_list_lock); +- return 0; +-} +- +-static int debug_seq_open(struct inode *inode, struct file *file) +-{ +- return single_open(file, debug_seq_show, NULL); +-} +- +-static const struct file_operations debug_ops = { +- .owner = THIS_MODULE, +- .open = debug_seq_open, +- .read = seq_read, +- .release = single_release +-}; +- +-void __init closure_debug_init(void) +-{ +- if (!IS_ERR_OR_NULL(bcache_debug)) +- /* +- * it is unnecessary to check return value of +- * debugfs_create_file(), we should not care +- * about this. 
+- */ +- closure_debug = debugfs_create_file( +- "closures", 0400, bcache_debug, NULL, &debug_ops); +-} +-#endif +- +-MODULE_AUTHOR("Kent Overstreet "); +-MODULE_LICENSE("GPL"); +diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h +deleted file mode 100644 +index c88cdc4ae4ec..000000000000 +--- a/drivers/md/bcache/closure.h ++++ /dev/null +@@ -1,378 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _LINUX_CLOSURE_H +-#define _LINUX_CLOSURE_H +- +-#include +-#include +-#include +-#include +- +-/* +- * Closure is perhaps the most overused and abused term in computer science, but +- * since I've been unable to come up with anything better you're stuck with it +- * again. +- * +- * What are closures? +- * +- * They embed a refcount. The basic idea is they count "things that are in +- * progress" - in flight bios, some other thread that's doing something else - +- * anything you might want to wait on. +- * +- * The refcount may be manipulated with closure_get() and closure_put(). +- * closure_put() is where many of the interesting things happen, when it causes +- * the refcount to go to 0. +- * +- * Closures can be used to wait on things both synchronously and asynchronously, +- * and synchronous and asynchronous use can be mixed without restriction. To +- * wait synchronously, use closure_sync() - you will sleep until your closure's +- * refcount hits 1. +- * +- * To wait asynchronously, use +- * continue_at(cl, next_function, workqueue); +- * +- * passing it, as you might expect, the function to run when nothing is pending +- * and the workqueue to run that function out of. +- * +- * continue_at() also, critically, requires a 'return' immediately following the +- * location where this macro is referenced, to return to the calling function. +- * There's good reason for this. +- * +- * To use safely closures asynchronously, they must always have a refcount while +- * they are running owned by the thread that is running them. Otherwise, suppose +- * you submit some bios and wish to have a function run when they all complete: +- * +- * foo_endio(struct bio *bio) +- * { +- * closure_put(cl); +- * } +- * +- * closure_init(cl); +- * +- * do_stuff(); +- * closure_get(cl); +- * bio1->bi_endio = foo_endio; +- * bio_submit(bio1); +- * +- * do_more_stuff(); +- * closure_get(cl); +- * bio2->bi_endio = foo_endio; +- * bio_submit(bio2); +- * +- * continue_at(cl, complete_some_read, system_wq); +- * +- * If closure's refcount started at 0, complete_some_read() could run before the +- * second bio was submitted - which is almost always not what you want! More +- * importantly, it wouldn't be possible to say whether the original thread or +- * complete_some_read()'s thread owned the closure - and whatever state it was +- * associated with! +- * +- * So, closure_init() initializes a closure's refcount to 1 - and when a +- * closure_fn is run, the refcount will be reset to 1 first. +- * +- * Then, the rule is - if you got the refcount with closure_get(), release it +- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount +- * on a closure because you called closure_init() or you were run out of a +- * closure - _always_ use continue_at(). Doing so consistently will help +- * eliminate an entire class of particularly pernicious races. +- * +- * Lastly, you might have a wait list dedicated to a specific event, and have no +- * need for specifying the condition - you just want to wait until someone runs +- * closure_wake_up() on the appropriate wait list. 
In that case, just use +- * closure_wait(). It will return either true or false, depending on whether the +- * closure was already on a wait list or not - a closure can only be on one wait +- * list at a time. +- * +- * Parents: +- * +- * closure_init() takes two arguments - it takes the closure to initialize, and +- * a (possibly null) parent. +- * +- * If parent is non null, the new closure will have a refcount for its lifetime; +- * a closure is considered to be "finished" when its refcount hits 0 and the +- * function to run is null. Hence +- * +- * continue_at(cl, NULL, NULL); +- * +- * returns up the (spaghetti) stack of closures, precisely like normal return +- * returns up the C stack. continue_at() with non null fn is better thought of +- * as doing a tail call. +- * +- * All this implies that a closure should typically be embedded in a particular +- * struct (which its refcount will normally control the lifetime of), and that +- * struct can very much be thought of as a stack frame. +- */ +- +-struct closure; +-struct closure_syncer; +-typedef void (closure_fn) (struct closure *); +-extern struct dentry *bcache_debug; +- +-struct closure_waitlist { +- struct llist_head list; +-}; +- +-enum closure_state { +- /* +- * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by +- * the thread that owns the closure, and cleared by the thread that's +- * waking up the closure. +- * +- * The rest are for debugging and don't affect behaviour: +- * +- * CLOSURE_RUNNING: Set when a closure is running (i.e. by +- * closure_init() and when closure_put() runs then next function), and +- * must be cleared before remaining hits 0. Primarily to help guard +- * against incorrect usage and accidentally transferring references. +- * continue_at() and closure_return() clear it for you, if you're doing +- * something unusual you can use closure_set_dead() which also helps +- * annotate where references are being transferred. +- */ +- +- CLOSURE_BITS_START = (1U << 26), +- CLOSURE_DESTRUCTOR = (1U << 26), +- CLOSURE_WAITING = (1U << 28), +- CLOSURE_RUNNING = (1U << 30), +-}; +- +-#define CLOSURE_GUARD_MASK \ +- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) +- +-#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +-#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) +- +-struct closure { +- union { +- struct { +- struct workqueue_struct *wq; +- struct closure_syncer *s; +- struct llist_node list; +- closure_fn *fn; +- }; +- struct work_struct work; +- }; +- +- struct closure *parent; +- +- atomic_t remaining; +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +-#define CLOSURE_MAGIC_DEAD 0xc054dead +-#define CLOSURE_MAGIC_ALIVE 0xc054a11e +- +- unsigned int magic; +- struct list_head all; +- unsigned long ip; +- unsigned long waiting_on; +-#endif +-}; +- +-void closure_sub(struct closure *cl, int v); +-void closure_put(struct closure *cl); +-void __closure_wake_up(struct closure_waitlist *list); +-bool closure_wait(struct closure_waitlist *list, struct closure *cl); +-void __closure_sync(struct closure *cl); +- +-/** +- * closure_sync - sleep until a closure a closure has nothing left to wait on +- * +- * Sleeps until the refcount hits 1 - the thread that's running the closure owns +- * the last refcount. 
+- */ +-static inline void closure_sync(struct closure *cl) +-{ +- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) +- __closure_sync(cl); +-} +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- +-void closure_debug_init(void); +-void closure_debug_create(struct closure *cl); +-void closure_debug_destroy(struct closure *cl); +- +-#else +- +-static inline void closure_debug_init(void) {} +-static inline void closure_debug_create(struct closure *cl) {} +-static inline void closure_debug_destroy(struct closure *cl) {} +- +-#endif +- +-static inline void closure_set_ip(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->ip = _THIS_IP_; +-#endif +-} +- +-static inline void closure_set_ret_ip(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->ip = _RET_IP_; +-#endif +-} +- +-static inline void closure_set_waiting(struct closure *cl, unsigned long f) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->waiting_on = f; +-#endif +-} +- +-static inline void closure_set_stopped(struct closure *cl) +-{ +- atomic_sub(CLOSURE_RUNNING, &cl->remaining); +-} +- +-static inline void set_closure_fn(struct closure *cl, closure_fn *fn, +- struct workqueue_struct *wq) +-{ +- closure_set_ip(cl); +- cl->fn = fn; +- cl->wq = wq; +- /* between atomic_dec() in closure_put() */ +- smp_mb__before_atomic(); +-} +- +-static inline void closure_queue(struct closure *cl) +-{ +- struct workqueue_struct *wq = cl->wq; +- /** +- * Changes made to closure, work_struct, or a couple of other structs +- * may cause work.func not pointing to the right location. +- */ +- BUILD_BUG_ON(offsetof(struct closure, fn) +- != offsetof(struct work_struct, func)); +- if (wq) { +- INIT_WORK(&cl->work, cl->work.func); +- BUG_ON(!queue_work(wq, &cl->work)); +- } else +- cl->fn(cl); +-} +- +-/** +- * closure_get - increment a closure's refcount +- */ +-static inline void closure_get(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- BUG_ON((atomic_inc_return(&cl->remaining) & +- CLOSURE_REMAINING_MASK) <= 1); +-#else +- atomic_inc(&cl->remaining); +-#endif +-} +- +-/** +- * closure_init - Initialize a closure, setting the refcount to 1 +- * @cl: closure to initialize +- * @parent: parent of the new closure. cl will take a refcount on it for its +- * lifetime; may be NULL. +- */ +-static inline void closure_init(struct closure *cl, struct closure *parent) +-{ +- memset(cl, 0, sizeof(struct closure)); +- cl->parent = parent; +- if (parent) +- closure_get(parent); +- +- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +- +- closure_debug_create(cl); +- closure_set_ip(cl); +-} +- +-static inline void closure_init_stack(struct closure *cl) +-{ +- memset(cl, 0, sizeof(struct closure)); +- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +-} +- +-/** +- * closure_wake_up - wake up all closures on a wait list, +- * with memory barrier +- */ +-static inline void closure_wake_up(struct closure_waitlist *list) +-{ +- /* Memory barrier for the wait list */ +- smp_mb(); +- __closure_wake_up(list); +-} +- +-/** +- * continue_at - jump to another function with barrier +- * +- * After @cl is no longer waiting on anything (i.e. all outstanding refs have +- * been dropped with closure_put()), it will resume execution at @fn running out +- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). 
+- * +- * This is because after calling continue_at() you no longer have a ref on @cl, +- * and whatever @cl owns may be freed out from under you - a running closure fn +- * has a ref on its own closure which continue_at() drops. +- * +- * Note you are expected to immediately return after using this macro. +- */ +-#define continue_at(_cl, _fn, _wq) \ +-do { \ +- set_closure_fn(_cl, _fn, _wq); \ +- closure_sub(_cl, CLOSURE_RUNNING + 1); \ +-} while (0) +- +-/** +- * closure_return - finish execution of a closure +- * +- * This is used to indicate that @cl is finished: when all outstanding refs on +- * @cl have been dropped @cl's ref on its parent closure (as passed to +- * closure_init()) will be dropped, if one was specified - thus this can be +- * thought of as returning to the parent closure. +- */ +-#define closure_return(_cl) continue_at((_cl), NULL, NULL) +- +-/** +- * continue_at_nobarrier - jump to another function without barrier +- * +- * Causes @fn to be executed out of @cl, in @wq context (or called directly if +- * @wq is NULL). +- * +- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, +- * thus it's not safe to touch anything protected by @cl after a +- * continue_at_nobarrier(). +- */ +-#define continue_at_nobarrier(_cl, _fn, _wq) \ +-do { \ +- set_closure_fn(_cl, _fn, _wq); \ +- closure_queue(_cl); \ +-} while (0) +- +-/** +- * closure_return_with_destructor - finish execution of a closure, +- * with destructor +- * +- * Works like closure_return(), except @destructor will be called when all +- * outstanding refs on @cl have been dropped; @destructor may be used to safely +- * free the memory occupied by @cl, and it is called with the ref on the parent +- * closure still held - so @destructor could safely return an item to a +- * freelist protected by @cl's parent. +- */ +-#define closure_return_with_destructor(_cl, _destructor) \ +-do { \ +- set_closure_fn(_cl, _destructor, NULL); \ +- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ +-} while (0) +- +-/** +- * closure_call - execute @fn out of a new, uninitialized closure +- * +- * Typically used when running out of one closure, and we want to run @fn +- * asynchronously out of a new closure - @parent will then wait for @cl to +- * finish. 
+- */ +-static inline void closure_call(struct closure *cl, closure_fn fn, +- struct workqueue_struct *wq, +- struct closure *parent) +-{ +- closure_init(cl, parent); +- continue_at_nobarrier(cl, fn, wq); +-} +- +-#endif /* _LINUX_CLOSURE_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index d98354fa28e3..9f3e769b5a67 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2696,7 +2696,6 @@ static int __init bcache_init(void) + goto err; + + bch_debug_init(); +- closure_debug_init(); + + bcache_is_reboot = false; + +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index c029f7443190..59093f9f1793 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -4,6 +4,7 @@ + #define _BCACHE_UTIL_H + + #include ++#include + #include + #include + #include +@@ -13,8 +14,6 @@ + #include + #include + +-#include "closure.h" +- + #define PAGE_SECTORS (PAGE_SIZE / 512) + + struct closure; +diff --git a/fs/Kconfig b/fs/Kconfig +index f08fbbfafd9a..8502f8b7d8a7 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" + source "fs/btrfs/Kconfig" + source "fs/nilfs2/Kconfig" + source "fs/f2fs/Kconfig" ++source "fs/bcachefs/Kconfig" + source "fs/zonefs/Kconfig" + + config FS_DAX +diff --git a/fs/Makefile b/fs/Makefile +index 2ce5112b02c8..8e926e6bf48f 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -130,6 +130,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ + obj-$(CONFIG_BTRFS_FS) += btrfs/ + obj-$(CONFIG_GFS2_FS) += gfs2/ + obj-$(CONFIG_F2FS_FS) += f2fs/ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ + obj-$(CONFIG_EFIVAR_FS) += efivarfs/ +diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +new file mode 100644 +index 000000000000..10abddae6a80 +--- /dev/null ++++ b/fs/bcachefs/Kconfig +@@ -0,0 +1,50 @@ ++ ++config BCACHEFS_FS ++ tristate "bcachefs filesystem support" ++ depends on BLOCK ++ select EXPORTFS ++ select CLOSURES ++ select LIBCRC32C ++ select CRC64 ++ select FS_POSIX_ACL ++ select LZ4_COMPRESS ++ select LZ4_DECOMPRESS ++ select ZLIB_DEFLATE ++ select ZLIB_INFLATE ++ select ZSTD_COMPRESS ++ select ZSTD_DECOMPRESS ++ select CRYPTO_SHA256 ++ select CRYPTO_CHACHA20 ++ select CRYPTO_POLY1305 ++ select KEYS ++ select SIXLOCKS ++ select RAID6_PQ ++ select XOR_BLOCKS ++ ---help--- ++ The bcachefs filesystem - a modern, copy on write filesystem, with ++ support for multiple devices, compression, checksumming, etc. ++ ++config BCACHEFS_QUOTA ++ bool "bcachefs quota support" ++ depends on BCACHEFS_FS ++ select QUOTACTL ++ ++config BCACHEFS_POSIX_ACL ++ bool "bcachefs POSIX ACL support" ++ depends on BCACHEFS_FS ++ select FS_POSIX_ACL ++ ++config BCACHEFS_DEBUG ++ bool "bcachefs debugging" ++ depends on BCACHEFS_FS ++ ---help--- ++ Enables many extra debugging checks and assertions. ++ ++ The resulting code will be significantly slower than normal; you ++ probably shouldn't select this option unless you're a developer. 
++ ++config BCACHEFS_TESTS ++ bool "bcachefs unit and performance tests" ++ depends on BCACHEFS_FS ++ ---help--- ++ Include some unit and performance tests for the core btree code +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +new file mode 100644 +index 000000000000..d85ced62c0dd +--- /dev/null ++++ b/fs/bcachefs/Makefile +@@ -0,0 +1,59 @@ ++ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o ++ ++bcachefs-y := \ ++ acl.o \ ++ alloc_background.o \ ++ alloc_foreground.o \ ++ bkey.o \ ++ bkey_methods.o \ ++ bkey_sort.o \ ++ bset.o \ ++ btree_cache.o \ ++ btree_gc.o \ ++ btree_io.o \ ++ btree_iter.o \ ++ btree_key_cache.o \ ++ btree_update_interior.o \ ++ btree_update_leaf.o \ ++ buckets.o \ ++ chardev.o \ ++ checksum.o \ ++ clock.o \ ++ compress.o \ ++ debug.o \ ++ dirent.o \ ++ disk_groups.o \ ++ ec.o \ ++ error.o \ ++ extents.o \ ++ extent_update.o \ ++ fs.o \ ++ fs-common.o \ ++ fs-ioctl.o \ ++ fs-io.o \ ++ fsck.o \ ++ inode.o \ ++ io.o \ ++ journal.o \ ++ journal_io.o \ ++ journal_reclaim.o \ ++ journal_seq_blacklist.o \ ++ keylist.o \ ++ migrate.o \ ++ move.o \ ++ movinggc.o \ ++ opts.o \ ++ quota.o \ ++ rebalance.o \ ++ recovery.o \ ++ reflink.o \ ++ replicas.o \ ++ siphash.o \ ++ super.o \ ++ super-io.o \ ++ sysfs.o \ ++ tests.o \ ++ trace.o \ ++ util.o \ ++ xattr.o +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +new file mode 100644 +index 000000000000..76c98ddbf628 +--- /dev/null ++++ b/fs/bcachefs/acl.c +@@ -0,0 +1,388 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#include "bcachefs.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "acl.h" ++#include "fs.h" ++#include "xattr.h" ++ ++static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) ++{ ++ return sizeof(bch_acl_header) + ++ sizeof(bch_acl_entry_short) * nr_short + ++ sizeof(bch_acl_entry) * nr_long; ++} ++ ++static inline int acl_to_xattr_type(int type) ++{ ++ switch (type) { ++ case ACL_TYPE_ACCESS: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; ++ case ACL_TYPE_DEFAULT: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Convert from filesystem to in-memory representation. 
++ */ ++static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) ++{ ++ const void *p, *end = value + size; ++ struct posix_acl *acl; ++ struct posix_acl_entry *out; ++ unsigned count = 0; ++ ++ if (!value) ++ return NULL; ++ if (size < sizeof(bch_acl_header)) ++ goto invalid; ++ if (((bch_acl_header *)value)->a_version != ++ cpu_to_le32(BCH_ACL_VERSION)) ++ goto invalid; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *entry = p; ++ ++ if (p + sizeof(bch_acl_entry_short) > end) ++ goto invalid; ++ ++ switch (le16_to_cpu(entry->e_tag)) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ case ACL_GROUP: ++ p += sizeof(bch_acl_entry); ++ break; ++ default: ++ goto invalid; ++ } ++ ++ count++; ++ } ++ ++ if (p > end) ++ goto invalid; ++ ++ if (!count) ++ return NULL; ++ ++ acl = posix_acl_alloc(count, GFP_KERNEL); ++ if (!acl) ++ return ERR_PTR(-ENOMEM); ++ ++ out = acl->a_entries; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *in = p; ++ ++ out->e_tag = le16_to_cpu(in->e_tag); ++ out->e_perm = le16_to_cpu(in->e_perm); ++ ++ switch (out->e_tag) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ out->e_uid = make_kuid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ out->e_gid = make_kgid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ } ++ ++ out++; ++ } ++ ++ BUG_ON(out != acl->a_entries + acl->a_count); ++ ++ return acl; ++invalid: ++ pr_err("invalid acl entry"); ++ return ERR_PTR(-EINVAL); ++} ++ ++#define acl_for_each_entry(acl, acl_e) \ ++ for (acl_e = acl->a_entries; \ ++ acl_e < acl->a_entries + acl->a_count; \ ++ acl_e++) ++ ++/* ++ * Convert from in-memory to filesystem representation. 
++ */ ++static struct bkey_i_xattr * ++bch2_acl_to_xattr(struct btree_trans *trans, ++ const struct posix_acl *acl, ++ int type) ++{ ++ struct bkey_i_xattr *xattr; ++ bch_acl_header *acl_header; ++ const struct posix_acl_entry *acl_e; ++ void *outptr; ++ unsigned nr_short = 0, nr_long = 0, acl_len, u64s; ++ ++ acl_for_each_entry(acl, acl_e) { ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ case ACL_GROUP: ++ nr_long++; ++ break; ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ nr_short++; ++ break; ++ default: ++ return ERR_PTR(-EINVAL); ++ } ++ } ++ ++ acl_len = bch2_acl_size(nr_short, nr_long); ++ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); ++ ++ if (u64s > U8_MAX) ++ return ERR_PTR(-E2BIG); ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return xattr; ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = acl_to_xattr_type(type); ++ xattr->v.x_name_len = 0, ++ xattr->v.x_val_len = cpu_to_le16(acl_len); ++ ++ acl_header = xattr_val(&xattr->v); ++ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); ++ ++ outptr = (void *) acl_header + sizeof(*acl_header); ++ ++ acl_for_each_entry(acl, acl_e) { ++ bch_acl_entry *entry = outptr; ++ ++ entry->e_tag = cpu_to_le16(acl_e->e_tag); ++ entry->e_perm = cpu_to_le16(acl_e->e_perm); ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ entry->e_id = cpu_to_le32( ++ from_kuid(&init_user_ns, acl_e->e_uid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ entry->e_id = cpu_to_le32( ++ from_kgid(&init_user_ns, acl_e->e_gid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ outptr += sizeof(bch_acl_entry_short); ++ break; ++ } ++ } ++ ++ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); ++ ++ return xattr; ++} ++ ++struct posix_acl *bch2_get_acl(struct inode *vinode, int type) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ struct posix_acl *acl = NULL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(acl_to_xattr_type(type), "", 0), ++ 0); ++ if (IS_ERR(iter)) { ++ if (PTR_ERR(iter) == -EINTR) ++ goto retry; ++ ++ if (PTR_ERR(iter) != -ENOENT) ++ acl = ERR_CAST(iter); ++ goto out; ++ } ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ ++ if (!IS_ERR(acl)) ++ set_cached_acl(&inode->v, type, acl); ++out: ++ bch2_trans_exit(&trans); ++ return acl; ++} ++ ++int bch2_set_acl_trans(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ const struct bch_hash_info *hash_info, ++ struct posix_acl *acl, int type) ++{ ++ int ret; ++ ++ if (type == ACL_TYPE_DEFAULT && ++ !S_ISDIR(inode_u->bi_mode)) ++ return acl ? 
-EACCES : 0; ++ ++ if (acl) { ++ struct bkey_i_xattr *xattr = ++ bch2_acl_to_xattr(trans, acl, type); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, ++ inode_u->bi_inum, &xattr->k_i, 0); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(acl_to_xattr_type(type), "", 0); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, ++ inode_u->bi_inum, &search); ++ } ++ ++ return ret == -ENOENT ? 0 : ret; ++} ++ ++int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl; ++ umode_t mode; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ acl = _acl; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto btree_err; ++ ++ mode = inode_u.bi_mode; ++ ++ if (type == ACL_TYPE_ACCESS) { ++ ret = posix_acl_update_mode(&inode->v, &mode, &acl); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_set_acl_trans(&trans, &inode_u, ++ &inode->ei_str_hash, ++ acl, type); ++ if (ret) ++ goto btree_err; ++ ++ inode_u.bi_ctime = bch2_current_time(c); ++ inode_u.bi_mode = mode; ++ ++ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++btree_err: ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err; ++ ++ bch2_inode_update_after_write(c, inode, &inode_u, ++ ATTR_CTIME|ATTR_MODE); ++ ++ set_cached_acl(&inode->v, type, acl); ++err: ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++int bch2_acl_chmod(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ struct bkey_i_xattr *new; ++ struct posix_acl *acl; ++ int ret = 0; ++ ++ iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter) != -ENOENT ? 
PTR_ERR(iter) : 0; ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ if (IS_ERR_OR_NULL(acl)) ++ return PTR_ERR(acl); ++ ++ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); ++ if (ret) ++ goto err; ++ ++ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); ++ if (IS_ERR(new)) { ++ ret = PTR_ERR(new); ++ goto err; ++ } ++ ++ new->k.p = iter->pos; ++ bch2_trans_update(trans, iter, &new->k_i, 0); ++ *new_acl = acl; ++ acl = NULL; ++err: ++ kfree(acl); ++ return ret; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ +diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h +new file mode 100644 +index 000000000000..cb62d502a7ff +--- /dev/null ++++ b/fs/bcachefs/acl.h +@@ -0,0 +1,59 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ACL_H ++#define _BCACHEFS_ACL_H ++ ++struct bch_inode_unpacked; ++struct bch_hash_info; ++struct bch_inode_info; ++struct posix_acl; ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#define BCH_ACL_VERSION 0x0001 ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++ __le32 e_id; ++} bch_acl_entry; ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++} bch_acl_entry_short; ++ ++typedef struct { ++ __le32 a_version; ++} bch_acl_header; ++ ++struct posix_acl *bch2_get_acl(struct inode *, int); ++ ++int bch2_set_acl_trans(struct btree_trans *, ++ struct bch_inode_unpacked *, ++ const struct bch_hash_info *, ++ struct posix_acl *, int); ++int bch2_set_acl(struct inode *, struct posix_acl *, int); ++int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, ++ umode_t, struct posix_acl **); ++ ++#else ++ ++static inline int bch2_set_acl_trans(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ const struct bch_hash_info *hash_info, ++ struct posix_acl *acl, int type) ++{ ++ return 0; ++} ++ ++static inline int bch2_acl_chmod(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ ++ ++#endif /* _BCACHEFS_ACL_H */ +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +new file mode 100644 +index 000000000000..cb720ee04b86 +--- /dev/null ++++ b/fs/bcachefs/alloc_background.c +@@ -0,0 +1,1434 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "recovery.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static const char * const bch2_alloc_field_names[] = { ++#define x(name, bytes) #name, ++ BCH_ALLOC_FIELDS() ++#undef x ++ NULL ++}; ++ ++static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); ++ ++/* Ratelimiting/PD controllers */ ++ ++static void pd_controllers_update(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(to_delayed_work(work), ++ struct bch_fs, ++ pd_controllers_update); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_member_device(ca, c, i) { ++ struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); ++ ++ u64 free = bucket_to_sector(ca, ++ __dev_buckets_free(ca, stats)) << 9; ++ /* ++ * Bytes of internal fragmentation, which can be ++ * 
reclaimed by copy GC ++ */ ++ s64 fragmented = (bucket_to_sector(ca, ++ stats.buckets[BCH_DATA_USER] + ++ stats.buckets[BCH_DATA_CACHED]) - ++ (stats.sectors[BCH_DATA_USER] + ++ stats.sectors[BCH_DATA_CACHED])) << 9; ++ ++ fragmented = max(0LL, fragmented); ++ ++ bch2_pd_controller_update(&ca->copygc_pd, ++ free, fragmented, -1); ++ } ++ ++ schedule_delayed_work(&c->pd_controllers_update, ++ c->pd_controllers_update_seconds * HZ); ++} ++ ++/* Persistent alloc info: */ ++ ++static inline u64 get_alloc_field(const struct bch_alloc *a, ++ const void **p, unsigned field) ++{ ++ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; ++ u64 v; ++ ++ if (!(a->fields & (1 << field))) ++ return 0; ++ ++ switch (bytes) { ++ case 1: ++ v = *((const u8 *) *p); ++ break; ++ case 2: ++ v = le16_to_cpup(*p); ++ break; ++ case 4: ++ v = le32_to_cpup(*p); ++ break; ++ case 8: ++ v = le64_to_cpup(*p); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++ return v; ++} ++ ++static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, ++ unsigned field, u64 v) ++{ ++ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; ++ ++ if (!v) ++ return; ++ ++ a->v.fields |= 1 << field; ++ ++ switch (bytes) { ++ case 1: ++ *((u8 *) *p) = v; ++ break; ++ case 2: ++ *((__le16 *) *p) = cpu_to_le16(v); ++ break; ++ case 4: ++ *((__le32 *) *p) = cpu_to_le32(v); ++ break; ++ case 8: ++ *((__le64 *) *p) = cpu_to_le64(v); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++} ++ ++struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++{ ++ struct bkey_alloc_unpacked ret = { .gen = 0 }; ++ ++ if (k.k->type == KEY_TYPE_alloc) { ++ const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; ++ const void *d = a->data; ++ unsigned idx = 0; ++ ++ ret.gen = a->gen; ++ ++#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); ++ BCH_ALLOC_FIELDS() ++#undef x ++ } ++ return ret; ++} ++ ++void bch2_alloc_pack(struct bkey_i_alloc *dst, ++ const struct bkey_alloc_unpacked src) ++{ ++ unsigned idx = 0; ++ void *d = dst->v.data; ++ unsigned bytes; ++ ++ dst->v.fields = 0; ++ dst->v.gen = src.gen; ++ ++#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); ++ BCH_ALLOC_FIELDS() ++#undef x ++ ++ bytes = (void *) d - (void *) &dst->v; ++ set_bkey_val_bytes(&dst->k, bytes); ++ memset_u64s_tail(&dst->v, 0, bytes); ++} ++ ++static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) ++{ ++ unsigned i, bytes = offsetof(struct bch_alloc, data); ++ ++ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) ++ if (a->fields & (1 << i)) ++ bytes += BCH_ALLOC_FIELD_BYTES[i]; ++ ++ return DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ ++ if (k.k->p.inode >= c->sb.nr_devices || ++ !c->devs[k.k->p.inode]) ++ return "invalid device"; ++ ++ /* allow for unknown fields */ ++ if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ const void *d = a.v->data; ++ unsigned i; ++ ++ pr_buf(out, "gen %u", a.v->gen); ++ ++ for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) ++ if (a.v->fields & (1 << i)) ++ pr_buf(out, " %s %llu", ++ bch2_alloc_field_names[i], ++ get_alloc_field(a.v, &d, i)); ++} ++ ++static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k) ++{ ++ if (!level) 
++ bch2_mark_key(c, k, 0, 0, NULL, 0, ++ BTREE_TRIGGER_ALLOC_READ| ++ BTREE_TRIGGER_NOATOMIC); ++ ++ return 0; ++} ++ ++int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, ++ NULL, bch2_alloc_read_fn); ++ if (ret) { ++ bch_err(c, "error reading alloc info: %i", ret); ++ return ret; ++ } ++ ++ percpu_down_write(&c->mark_lock); ++ bch2_dev_usage_from_buckets(c); ++ percpu_up_write(&c->mark_lock); ++ ++ mutex_lock(&c->bucket_clock[READ].lock); ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ bch2_recalc_oldest_io(c, ca, READ); ++ up_read(&ca->bucket_lock); ++ } ++ mutex_unlock(&c->bucket_clock[READ].lock); ++ ++ mutex_lock(&c->bucket_clock[WRITE].lock); ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ bch2_recalc_oldest_io(c, ca, WRITE); ++ up_read(&ca->bucket_lock); ++ } ++ mutex_unlock(&c->bucket_clock[WRITE].lock); ++ ++ return 0; ++} ++ ++enum alloc_write_ret { ++ ALLOC_WROTE, ++ ALLOC_NOWROTE, ++ ALLOC_END, ++}; ++ ++static int bch2_alloc_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ struct bucket_array *ba; ++ struct bucket *g; ++ struct bucket_mark m; ++ struct bkey_alloc_unpacked old_u, new_u; ++ __BKEY_PADDED(k, 8) alloc_key; /* hack: */ ++ struct bkey_i_alloc *a; ++ int ret; ++retry: ++ bch2_trans_begin(trans); ++ ++ ret = bch2_btree_key_cache_flush(trans, ++ BTREE_ID_ALLOC, iter->pos); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ old_u = bch2_alloc_unpack(k); ++ ++ if (iter->pos.inode >= c->sb.nr_devices || ++ !c->devs[iter->pos.inode]) ++ return ALLOC_END; ++ ++ percpu_down_read(&c->mark_lock); ++ ca = bch_dev_bkey_exists(c, iter->pos.inode); ++ ba = bucket_array(ca); ++ ++ if (iter->pos.offset >= ba->nbuckets) { ++ percpu_up_read(&c->mark_lock); ++ return ALLOC_END; ++ } ++ ++ g = &ba->b[iter->pos.offset]; ++ m = READ_ONCE(g->mark); ++ new_u = alloc_mem_to_key(g, m); ++ percpu_up_read(&c->mark_lock); ++ ++ if (!bkey_alloc_unpacked_cmp(old_u, new_u)) ++ return ALLOC_NOWROTE; ++ ++ a = bkey_alloc_init(&alloc_key.k); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, new_u); ++ ++ bch2_trans_update(trans, iter, &a->k_i, ++ BTREE_TRIGGER_NORUN); ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ flags); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ return ret; ++} ++ ++int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ for_each_rw_member(ca, c, i) { ++ unsigned first_bucket; ++ ++ percpu_down_read(&c->mark_lock); ++ first_bucket = bucket_array(ca)->first_bucket; ++ percpu_up_read(&c->mark_lock); ++ ++ bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); ++ ++ while (1) { ++ ret = bch2_alloc_write_key(&trans, iter, flags); ++ if (ret < 0 || ret == ALLOC_END) ++ break; ++ if (ret == ALLOC_WROTE) ++ *wrote = true; ++ bch2_btree_iter_next_slot(iter); ++ } ++ ++ if (ret < 0) { ++ percpu_ref_put(&ca->io_ref); ++ break; 
++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret < 0 ? ret : 0; ++} ++ ++/* Bucket IO clocks: */ ++ ++static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket *g; ++ u16 max_last_io = 0; ++ unsigned i; ++ ++ lockdep_assert_held(&c->bucket_clock[rw].lock); ++ ++ /* Recalculate max_last_io for this device: */ ++ for_each_bucket(g, buckets) ++ max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); ++ ++ ca->max_last_bucket_io[rw] = max_last_io; ++ ++ /* Recalculate global max_last_io: */ ++ max_last_io = 0; ++ ++ for_each_member_device(ca, c, i) ++ max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); ++ ++ clock->max_last_io = max_last_io; ++} ++ ++static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ struct bucket_array *buckets; ++ struct bch_dev *ca; ++ struct bucket *g; ++ unsigned i; ++ ++ trace_rescale_prios(c); ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->io_time[rw] = clock->hand - ++ bucket_last_io(c, g, rw) / 2; ++ ++ bch2_recalc_oldest_io(c, ca, rw); ++ ++ up_read(&ca->bucket_lock); ++ } ++} ++ ++static inline u64 bucket_clock_freq(u64 capacity) ++{ ++ return max(capacity >> 10, 2028ULL); ++} ++ ++static void bch2_inc_clock_hand(struct io_timer *timer) ++{ ++ struct bucket_clock *clock = container_of(timer, ++ struct bucket_clock, rescale); ++ struct bch_fs *c = container_of(clock, ++ struct bch_fs, bucket_clock[clock->rw]); ++ struct bch_dev *ca; ++ u64 capacity; ++ unsigned i; ++ ++ mutex_lock(&clock->lock); ++ ++ /* if clock cannot be advanced more, rescale prio */ ++ if (clock->max_last_io >= U16_MAX - 2) ++ bch2_rescale_bucket_io_times(c, clock->rw); ++ ++ BUG_ON(clock->max_last_io >= U16_MAX - 2); ++ ++ for_each_member_device(ca, c, i) ++ ca->max_last_bucket_io[clock->rw]++; ++ clock->max_last_io++; ++ clock->hand++; ++ ++ mutex_unlock(&clock->lock); ++ ++ capacity = READ_ONCE(c->capacity); ++ ++ if (!capacity) ++ return; ++ ++ /* ++ * we only increment when 0.1% of the filesystem capacity has been read ++ * or written too, this determines if it's time ++ * ++ * XXX: we shouldn't really be going off of the capacity of devices in ++ * RW mode (that will be 0 when we're RO, yet we can still service ++ * reads) ++ */ ++ timer->expire += bucket_clock_freq(capacity); ++ ++ bch2_io_timer_add(&c->io_clock[clock->rw], timer); ++} ++ ++static void bch2_bucket_clock_init(struct bch_fs *c, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ ++ clock->hand = 1; ++ clock->rw = rw; ++ clock->rescale.fn = bch2_inc_clock_hand; ++ clock->rescale.expire = bucket_clock_freq(c->capacity); ++ mutex_init(&clock->lock); ++} ++ ++/* Background allocator thread: */ ++ ++/* ++ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens ++ * (marking them as invalidated on disk), then optionally issues discard ++ * commands to the newly free buckets, then puts them on the various freelists. ++ */ ++ ++#define BUCKET_GC_GEN_MAX 96U ++ ++/** ++ * wait_buckets_available - wait on reclaimable buckets ++ * ++ * If there aren't enough available buckets to fill up free_inc, wait until ++ * there are. 
++ */ ++static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned long gc_count = c->gc_count; ++ u64 available; ++ int ret = 0; ++ ++ ca->allocator_state = ALLOCATOR_BLOCKED; ++ closure_wake_up(&c->freelist_wait); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread_should_stop()) { ++ ret = 1; ++ break; ++ } ++ ++ if (gc_count != c->gc_count) ++ ca->inc_gen_really_needs_gc = 0; ++ ++ available = max_t(s64, 0, dev_buckets_available(c, ca) - ++ ca->inc_gen_really_needs_gc); ++ ++ if (available > fifo_free(&ca->free_inc) || ++ (available && !fifo_full(&ca->free[RESERVE_BTREE]))) ++ break; ++ ++ up_read(&c->gc_lock); ++ schedule(); ++ try_to_freeze(); ++ down_read(&c->gc_lock); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ closure_wake_up(&c->freelist_wait); ++ ++ return ret; ++} ++ ++static bool bch2_can_invalidate_bucket(struct bch_dev *ca, ++ size_t bucket, ++ struct bucket_mark mark) ++{ ++ u8 gc_gen; ++ ++ if (!is_available_bucket(mark)) ++ return false; ++ ++ if (ca->buckets_nouse && ++ test_bit(bucket, ca->buckets_nouse)) ++ return false; ++ ++ gc_gen = bucket_gc_gen(ca, bucket); ++ ++ if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ++ ca->inc_gen_needs_gc++; ++ ++ if (gc_gen >= BUCKET_GC_GEN_MAX) ++ ca->inc_gen_really_needs_gc++; ++ ++ return gc_gen < BUCKET_GC_GEN_MAX; ++} ++ ++/* ++ * Determines what order we're going to reuse buckets, smallest bucket_key() ++ * first. ++ * ++ * ++ * - We take into account the read prio of the bucket, which gives us an ++ * indication of how hot the data is -- we scale the prio so that the prio ++ * farthest from the clock is worth 1/8th of the closest. ++ * ++ * - The number of sectors of cached data in the bucket, which gives us an ++ * indication of the cost in cache misses this eviction will cause. ++ * ++ * - If hotness * sectors used compares equal, we pick the bucket with the ++ * smallest bucket_gc_gen() - since incrementing the same bucket's generation ++ * number repeatedly forces us to run mark and sweep gc to avoid generation ++ * number wraparound. 
++ */ ++ ++static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark m) ++{ ++ unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); ++ unsigned max_last_io = ca->max_last_bucket_io[READ]; ++ ++ /* ++ * Time since last read, scaled to [0, 8) where larger value indicates ++ * more recently read data: ++ */ ++ unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; ++ ++ /* How much we want to keep the data in this bucket: */ ++ unsigned long data_wantness = ++ (hotness + 1) * bucket_sectors_used(m); ++ ++ unsigned long needs_journal_commit = ++ bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); ++ ++ return (data_wantness << 9) | ++ (needs_journal_commit << 8) | ++ (bucket_gc_gen(ca, b) / 16); ++} ++ ++static inline int bucket_alloc_cmp(alloc_heap *h, ++ struct alloc_heap_entry l, ++ struct alloc_heap_entry r) ++{ ++ return cmp_int(l.key, r.key) ?: ++ cmp_int(r.nr, l.nr) ?: ++ cmp_int(l.bucket, r.bucket); ++} ++ ++static inline int bucket_idx_cmp(const void *_l, const void *_r) ++{ ++ const struct alloc_heap_entry *l = _l, *r = _r; ++ ++ return cmp_int(l->bucket, r->bucket); ++} ++ ++static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ struct alloc_heap_entry e = { 0 }; ++ size_t b, i, nr = 0; ++ ++ ca->alloc_heap.used = 0; ++ ++ mutex_lock(&c->bucket_clock[READ].lock); ++ down_read(&ca->bucket_lock); ++ ++ buckets = bucket_array(ca); ++ ++ bch2_recalc_oldest_io(c, ca, READ); ++ ++ /* ++ * Find buckets with lowest read priority, by building a maxheap sorted ++ * by read priority and repeatedly replacing the maximum element until ++ * all buckets have been visited. ++ */ ++ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ unsigned long key = bucket_sort_key(c, ca, b, m); ++ ++ if (!bch2_can_invalidate_bucket(ca, b, m)) ++ continue; ++ ++ if (e.nr && e.bucket + e.nr == b && e.key == key) { ++ e.nr++; ++ } else { ++ if (e.nr) ++ heap_add_or_replace(&ca->alloc_heap, e, ++ -bucket_alloc_cmp, NULL); ++ ++ e = (struct alloc_heap_entry) { ++ .bucket = b, ++ .nr = 1, ++ .key = key, ++ }; ++ } ++ ++ cond_resched(); ++ } ++ ++ if (e.nr) ++ heap_add_or_replace(&ca->alloc_heap, e, ++ -bucket_alloc_cmp, NULL); ++ ++ for (i = 0; i < ca->alloc_heap.used; i++) ++ nr += ca->alloc_heap.data[i].nr; ++ ++ while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { ++ nr -= ca->alloc_heap.data[0].nr; ++ heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); ++ } ++ ++ up_read(&ca->bucket_lock); ++ mutex_unlock(&c->bucket_clock[READ].lock); ++} ++ ++static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket_mark m; ++ size_t b, start; ++ ++ if (ca->fifo_last_bucket < ca->mi.first_bucket || ++ ca->fifo_last_bucket >= ca->mi.nbuckets) ++ ca->fifo_last_bucket = ca->mi.first_bucket; ++ ++ start = ca->fifo_last_bucket; ++ ++ do { ++ ca->fifo_last_bucket++; ++ if (ca->fifo_last_bucket == ca->mi.nbuckets) ++ ca->fifo_last_bucket = ca->mi.first_bucket; ++ ++ b = ca->fifo_last_bucket; ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (bch2_can_invalidate_bucket(ca, b, m)) { ++ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; ++ ++ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ if (heap_full(&ca->alloc_heap)) ++ break; ++ } ++ ++ cond_resched(); ++ } while (ca->fifo_last_bucket != start); ++} ++ ++static 
void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket_mark m; ++ size_t checked, i; ++ ++ for (checked = 0; ++ checked < ca->mi.nbuckets / 2; ++ checked++) { ++ size_t b = bch2_rand_range(ca->mi.nbuckets - ++ ca->mi.first_bucket) + ++ ca->mi.first_bucket; ++ ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (bch2_can_invalidate_bucket(ca, b, m)) { ++ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; ++ ++ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ if (heap_full(&ca->alloc_heap)) ++ break; ++ } ++ ++ cond_resched(); ++ } ++ ++ sort(ca->alloc_heap.data, ++ ca->alloc_heap.used, ++ sizeof(ca->alloc_heap.data[0]), ++ bucket_idx_cmp, NULL); ++ ++ /* remove duplicates: */ ++ for (i = 0; i + 1 < ca->alloc_heap.used; i++) ++ if (ca->alloc_heap.data[i].bucket == ++ ca->alloc_heap.data[i + 1].bucket) ++ ca->alloc_heap.data[i].nr = 0; ++} ++ ++static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ size_t i, nr = 0; ++ ++ ca->inc_gen_needs_gc = 0; ++ ++ switch (ca->mi.replacement) { ++ case CACHE_REPLACEMENT_LRU: ++ find_reclaimable_buckets_lru(c, ca); ++ break; ++ case CACHE_REPLACEMENT_FIFO: ++ find_reclaimable_buckets_fifo(c, ca); ++ break; ++ case CACHE_REPLACEMENT_RANDOM: ++ find_reclaimable_buckets_random(c, ca); ++ break; ++ } ++ ++ heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); ++ ++ for (i = 0; i < ca->alloc_heap.used; i++) ++ nr += ca->alloc_heap.data[i].nr; ++ ++ return nr; ++} ++ ++static inline long next_alloc_bucket(struct bch_dev *ca) ++{ ++ struct alloc_heap_entry e, *top = ca->alloc_heap.data; ++ ++ while (ca->alloc_heap.used) { ++ if (top->nr) { ++ size_t b = top->bucket; ++ ++ top->bucket++; ++ top->nr--; ++ return b; ++ } ++ ++ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ } ++ ++ return -1; ++} ++ ++/* ++ * returns sequence number of most recent journal entry that updated this ++ * bucket: ++ */ ++static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) ++{ ++ if (m.journal_seq_valid) { ++ u64 journal_seq = atomic64_read(&c->journal.seq); ++ u64 bucket_seq = journal_seq; ++ ++ bucket_seq &= ~((u64) U16_MAX); ++ bucket_seq |= m.journal_seq; ++ ++ if (bucket_seq > journal_seq) ++ bucket_seq -= 1 << 16; ++ ++ return bucket_seq; ++ } else { ++ return 0; ++ } ++} ++ ++static int bch2_invalidate_one_bucket2(struct btree_trans *trans, ++ struct bch_dev *ca, ++ struct btree_iter *iter, ++ u64 *journal_seq, unsigned flags) ++{ ++#if 0 ++ __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; ++#else ++ /* hack: */ ++ __BKEY_PADDED(k, 8) alloc_key; ++#endif ++ struct bch_fs *c = trans->c; ++ struct bkey_i_alloc *a; ++ struct bkey_alloc_unpacked u; ++ struct bucket *g; ++ struct bucket_mark m; ++ bool invalidating_cached_data; ++ size_t b; ++ int ret = 0; ++ ++ BUG_ON(!ca->alloc_heap.used || ++ !ca->alloc_heap.data[0].nr); ++ b = ca->alloc_heap.data[0].bucket; ++ ++ /* first, put on free_inc and mark as owned by allocator: */ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->freelist_lock); ++ ++ verify_not_on_freelist(c, ca, b); ++ ++ BUG_ON(!fifo_push(&ca->free_inc, b)); ++ ++ g = bucket(ca, b); ++ m = READ_ONCE(g->mark); ++ ++ invalidating_cached_data = m.cached_sectors != 0; ++ ++ /* ++ * If we're not invalidating cached data, we only increment the bucket ++ * gen in memory here, the incremented gen will be updated in the btree ++ * by bch2_trans_mark_pointer(): ++ */ ++ ++ if (!invalidating_cached_data) ++ 
bch2_invalidate_bucket(c, ca, b, &m); ++ else ++ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); ++ ++ spin_unlock(&c->freelist_lock); ++ percpu_up_read(&c->mark_lock); ++ ++ if (!invalidating_cached_data) ++ goto out; ++ ++ /* ++ * If the read-only path is trying to shut down, we can't be generating ++ * new btree updates: ++ */ ++ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { ++ ret = 1; ++ goto out; ++ } ++ ++ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); ++ ++ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); ++retry: ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, iter->pos.offset); ++ m = READ_ONCE(g->mark); ++ u = alloc_mem_to_key(g, m); ++ ++ percpu_up_read(&c->mark_lock); ++ ++ invalidating_cached_data = u.cached_sectors != 0; ++ ++ u.gen++; ++ u.data_type = 0; ++ u.dirty_sectors = 0; ++ u.cached_sectors = 0; ++ u.read_time = c->bucket_clock[READ].hand; ++ u.write_time = c->bucket_clock[WRITE].hand; ++ ++ a = bkey_alloc_init(&alloc_key.k); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, u); ++ ++ bch2_trans_update(trans, iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE); ++ ++ /* ++ * XXX: ++ * when using deferred btree updates, we have journal reclaim doing ++ * btree updates and thus requiring the allocator to make forward ++ * progress, and here the allocator is requiring space in the journal - ++ * so we need a journal pre-reservation: ++ */ ++ ret = bch2_trans_commit(trans, NULL, ++ invalidating_cached_data ? journal_seq : NULL, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ flags); ++ if (ret == -EINTR) ++ goto retry; ++out: ++ if (!ret) { ++ /* remove from alloc_heap: */ ++ struct alloc_heap_entry e, *top = ca->alloc_heap.data; ++ ++ top->bucket++; ++ top->nr--; ++ ++ if (!top->nr) ++ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ ++ /* ++ * Make sure we flush the last journal entry that updated this ++ * bucket (i.e. deleting the last reference) before writing to ++ * this bucket again: ++ */ ++ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); ++ } else { ++ size_t b2; ++ ++ /* remove from free_inc: */ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->freelist_lock); ++ ++ bch2_mark_alloc_bucket(c, ca, b, false, ++ gc_pos_alloc(c, NULL), 0); ++ ++ BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); ++ BUG_ON(b != b2); ++ ++ spin_unlock(&c->freelist_lock); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ return ret < 0 ? ret : 0; ++} ++ ++/* ++ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: ++ */ ++static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ u64 journal_seq = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, 0), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ++ /* Only use nowait if we've already invalidated at least one bucket: */ ++ while (!ret && ++ !fifo_full(&ca->free_inc) && ++ ca->alloc_heap.used) ++ ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, ++ BTREE_INSERT_GC_LOCK_HELD| ++ (!fifo_empty(&ca->free_inc) ++ ? 
BTREE_INSERT_NOWAIT : 0)); ++ ++ bch2_trans_exit(&trans); ++ ++ /* If we used NOWAIT, don't return the error: */ ++ if (!fifo_empty(&ca->free_inc)) ++ ret = 0; ++ if (ret) { ++ bch_err(ca, "error invalidating buckets: %i", ret); ++ return ret; ++ } ++ ++ if (journal_seq) ++ ret = bch2_journal_flush_seq(&c->journal, journal_seq); ++ if (ret) { ++ bch_err(ca, "journal error: %i", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) ++{ ++ unsigned i; ++ int ret = 0; ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) { ++ ++ /* ++ * Don't strand buckets on the copygc freelist until ++ * after recovery is finished: ++ */ ++ if (!test_bit(BCH_FS_STARTED, &c->flags) && ++ i == RESERVE_MOVINGGC) ++ continue; ++ ++ if (fifo_push(&ca->free[i], bucket)) { ++ fifo_pop(&ca->free_inc, bucket); ++ ++ closure_wake_up(&c->freelist_wait); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ ++ spin_unlock(&c->freelist_lock); ++ goto out; ++ } ++ } ++ ++ if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { ++ ca->allocator_state = ALLOCATOR_BLOCKED_FULL; ++ closure_wake_up(&c->freelist_wait); ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ if ((current->flags & PF_KTHREAD) && ++ kthread_should_stop()) { ++ ret = 1; ++ break; ++ } ++ ++ schedule(); ++ try_to_freeze(); ++ } ++out: ++ __set_current_state(TASK_RUNNING); ++ return ret; ++} ++ ++/* ++ * Pulls buckets off free_inc, discards them (if enabled), then adds them to ++ * freelists, waiting until there's room if necessary: ++ */ ++static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ while (!fifo_empty(&ca->free_inc)) { ++ size_t bucket = fifo_peek(&ca->free_inc); ++ ++ if (ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, bucket), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++ ++ if (push_invalidated_bucket(c, ca, bucket)) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/** ++ * bch_allocator_thread - move buckets from free_inc to reserves ++ * ++ * The free_inc FIFO is populated by find_reclaimable_buckets(), and ++ * the reserves are depleted by bucket allocation. When we run out ++ * of free_inc, try to invalidate some buckets and write out ++ * prios and gens. 
++ */ ++static int bch2_allocator_thread(void *arg) ++{ ++ struct bch_dev *ca = arg; ++ struct bch_fs *c = ca->fs; ++ size_t nr; ++ int ret; ++ ++ set_freezable(); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ ++ while (1) { ++ cond_resched(); ++ if (kthread_should_stop()) ++ break; ++ ++ pr_debug("discarding %zu invalidated buckets", ++ fifo_used(&ca->free_inc)); ++ ++ ret = discard_invalidated_buckets(c, ca); ++ if (ret) ++ goto stop; ++ ++ down_read(&c->gc_lock); ++ ++ ret = bch2_invalidate_buckets(c, ca); ++ if (ret) { ++ up_read(&c->gc_lock); ++ goto stop; ++ } ++ ++ if (!fifo_empty(&ca->free_inc)) { ++ up_read(&c->gc_lock); ++ continue; ++ } ++ ++ pr_debug("free_inc now empty"); ++ ++ do { ++ /* ++ * Find some buckets that we can invalidate, either ++ * they're completely unused, or only contain clean data ++ * that's been written back to the backing device or ++ * another cache tier ++ */ ++ ++ pr_debug("scanning for reclaimable buckets"); ++ ++ nr = find_reclaimable_buckets(c, ca); ++ ++ pr_debug("found %zu buckets", nr); ++ ++ trace_alloc_batch(ca, nr, ca->alloc_heap.size); ++ ++ if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || ++ ca->inc_gen_really_needs_gc) && ++ c->gc_thread) { ++ atomic_inc(&c->kick_gc); ++ wake_up_process(c->gc_thread); ++ } ++ ++ /* ++ * If we found any buckets, we have to invalidate them ++ * before we scan for more - but if we didn't find very ++ * many we may want to wait on more buckets being ++ * available so we don't spin: ++ */ ++ if (!nr || ++ (nr < ALLOC_SCAN_BATCH(ca) && ++ !fifo_empty(&ca->free[RESERVE_NONE]))) { ++ ret = wait_buckets_available(c, ca); ++ if (ret) { ++ up_read(&c->gc_lock); ++ goto stop; ++ } ++ } ++ } while (!nr); ++ ++ up_read(&c->gc_lock); ++ ++ pr_debug("%zu buckets to invalidate", nr); ++ ++ /* ++ * alloc_heap is now full of newly-invalidated buckets: next, ++ * write out the new bucket gens: ++ */ ++ } ++ ++stop: ++ pr_debug("alloc thread stopping (ret %i)", ret); ++ ca->allocator_state = ALLOCATOR_STOPPED; ++ closure_wake_up(&c->freelist_wait); ++ return 0; ++} ++ ++/* Startup/shutdown (ro/rw): */ ++ ++void bch2_recalc_capacity(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ u64 capacity = 0, reserved_sectors = 0, gc_reserve; ++ unsigned bucket_size_max = 0; ++ unsigned long ra_pages = 0; ++ unsigned i, j; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ for_each_online_member(ca, c, i) { ++ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; ++ ++ ra_pages += bdi->ra_pages; ++ } ++ ++ bch2_set_ra_pages(c, ra_pages); ++ ++ for_each_rw_member(ca, c, i) { ++ u64 dev_reserve = 0; ++ ++ /* ++ * We need to reserve buckets (from the number ++ * of currently available buckets) against ++ * foreground writes so that mainly copygc can ++ * make forward progress. ++ * ++ * We need enough to refill the various reserves ++ * from scratch - copygc will use its entire ++ * reserve all at once, then run against when ++ * its reserve is refilled (from the formerly ++ * available buckets). ++ * ++ * This reserve is just used when considering if ++ * allocations for foreground writes must wait - ++ * not -ENOSPC calculations. 
++ */ ++ for (j = 0; j < RESERVE_NONE; j++) ++ dev_reserve += ca->free[j].size; ++ ++ dev_reserve += 1; /* btree write point */ ++ dev_reserve += 1; /* copygc write point */ ++ dev_reserve += 1; /* rebalance write point */ ++ ++ dev_reserve *= ca->mi.bucket_size; ++ ++ ca->copygc_threshold = dev_reserve; ++ ++ capacity += bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket); ++ ++ reserved_sectors += dev_reserve * 2; ++ ++ bucket_size_max = max_t(unsigned, bucket_size_max, ++ ca->mi.bucket_size); ++ } ++ ++ gc_reserve = c->opts.gc_reserve_bytes ++ ? c->opts.gc_reserve_bytes >> 9 ++ : div64_u64(capacity * c->opts.gc_reserve_percent, 100); ++ ++ reserved_sectors = max(gc_reserve, reserved_sectors); ++ ++ reserved_sectors = min(reserved_sectors, capacity); ++ ++ c->capacity = capacity - reserved_sectors; ++ ++ c->bucket_size_max = bucket_size_max; ++ ++ if (c->capacity) { ++ bch2_io_timer_add(&c->io_clock[READ], ++ &c->bucket_clock[READ].rescale); ++ bch2_io_timer_add(&c->io_clock[WRITE], ++ &c->bucket_clock[WRITE].rescale); ++ } else { ++ bch2_io_timer_del(&c->io_clock[READ], ++ &c->bucket_clock[READ].rescale); ++ bch2_io_timer_del(&c->io_clock[WRITE], ++ &c->bucket_clock[WRITE].rescale); ++ } ++ ++ /* Wake up case someone was waiting for buckets */ ++ closure_wake_up(&c->freelist_wait); ++} ++ ++static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct open_bucket *ob; ++ bool ret = false; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list && ++ ob->ptr.dev == ca->dev_idx) ++ ret = true; ++ spin_unlock(&ob->lock); ++ } ++ ++ return ret; ++} ++ ++/* device goes ro: */ ++void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ BUG_ON(ca->alloc_thread); ++ ++ /* First, remove device from allocation groups: */ ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ clear_bit(ca->dev_idx, c->rw_devs[i].d); ++ ++ /* ++ * Capacity is calculated based off of devices in allocation groups: ++ */ ++ bch2_recalc_capacity(c); ++ ++ /* Next, close write points that point to this device... 
*/ ++ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) ++ bch2_writepoint_stop(c, ca, &c->write_points[i]); ++ ++ bch2_writepoint_stop(c, ca, &ca->copygc_write_point); ++ bch2_writepoint_stop(c, ca, &c->rebalance_write_point); ++ bch2_writepoint_stop(c, ca, &c->btree_write_point); ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ while (c->btree_reserve_cache_nr) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ bch2_open_buckets_put(c, &a->ob); ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++ while (1) { ++ struct open_bucket *ob; ++ ++ spin_lock(&c->freelist_lock); ++ if (!ca->open_buckets_partial_nr) { ++ spin_unlock(&c->freelist_lock); ++ break; ++ } ++ ob = c->open_buckets + ++ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; ++ ob->on_partial_list = false; ++ spin_unlock(&c->freelist_lock); ++ ++ bch2_open_bucket_put(c, ob); ++ } ++ ++ bch2_ec_stop_dev(c, ca); ++ ++ /* ++ * Wake up threads that were blocked on allocation, so they can notice ++ * the device can no longer be removed and the capacity has changed: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ ++ /* ++ * journal_res_get() can block waiting for free space in the journal - ++ * it needs to notice there may not be devices to allocate from anymore: ++ */ ++ wake_up(&c->journal.wait); ++ ++ /* Now wait for any in flight writes: */ ++ ++ closure_wait_event(&c->open_buckets_wait, ++ !bch2_dev_has_open_write_point(c, ca)); ++} ++ ++/* device goes rw: */ ++void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ if (ca->mi.data_allowed & (1 << i)) ++ set_bit(ca->dev_idx, c->rw_devs[i].d); ++} ++ ++void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) ++{ ++ if (ca->alloc_thread) ++ closure_wait_event(&c->freelist_wait, ++ ca->allocator_state != ALLOCATOR_RUNNING); ++} ++ ++/* stop allocator thread: */ ++void bch2_dev_allocator_stop(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ p = rcu_dereference_protected(ca->alloc_thread, 1); ++ ca->alloc_thread = NULL; ++ ++ /* ++ * We need an rcu barrier between setting ca->alloc_thread = NULL and ++ * the thread shutting down to avoid bch2_wake_allocator() racing: ++ * ++ * XXX: it would be better to have the rcu barrier be asynchronous ++ * instead of blocking us here ++ */ ++ synchronize_rcu(); ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++/* start allocator thread: */ ++int bch2_dev_allocator_start(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ /* ++ * allocator thread already started? 
++ */ ++ if (ca->alloc_thread) ++ return 0; ++ ++ p = kthread_create(bch2_allocator_thread, ca, ++ "bch_alloc[%s]", ca->name); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ rcu_assign_pointer(ca->alloc_thread, p); ++ wake_up_process(p); ++ return 0; ++} ++ ++void bch2_fs_allocator_background_init(struct bch_fs *c) ++{ ++ spin_lock_init(&c->freelist_lock); ++ bch2_bucket_clock_init(c, READ); ++ bch2_bucket_clock_init(c, WRITE); ++ ++ c->pd_controllers_update_seconds = 5; ++ INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); ++} +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +new file mode 100644 +index 000000000000..f6b9f27f0713 +--- /dev/null ++++ b/fs/bcachefs/alloc_background.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_BACKGROUND_H ++#define _BCACHEFS_ALLOC_BACKGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++#include "debug.h" ++ ++struct bkey_alloc_unpacked { ++ u8 gen; ++#define x(_name, _bits) u##_bits _name; ++ BCH_ALLOC_FIELDS() ++#undef x ++}; ++ ++/* returns true if not equal */ ++static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, ++ struct bkey_alloc_unpacked r) ++{ ++ return l.gen != r.gen ++#define x(_name, _bits) || l._name != r._name ++ BCH_ALLOC_FIELDS() ++#undef x ++ ; ++} ++ ++struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); ++void bch2_alloc_pack(struct bkey_i_alloc *, ++ const struct bkey_alloc_unpacked); ++ ++static inline struct bkey_alloc_unpacked ++alloc_mem_to_key(struct bucket *g, struct bucket_mark m) ++{ ++ return (struct bkey_alloc_unpacked) { ++ .gen = m.gen, ++ .oldest_gen = g->oldest_gen, ++ .data_type = m.data_type, ++ .dirty_sectors = m.dirty_sectors, ++ .cached_sectors = m.cached_sectors, ++ .read_time = g->io_time[READ], ++ .write_time = g->io_time[WRITE], ++ }; ++} ++ ++#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) ++ ++const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_alloc (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++} ++ ++struct journal_keys; ++int bch2_alloc_read(struct bch_fs *, struct journal_keys *); ++ ++static inline void bch2_wake_allocator(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = rcu_dereference(ca->alloc_thread); ++ if (p) ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, ++ size_t bucket) ++{ ++ if (expensive_debug_checks(c)) { ++ size_t iter; ++ long i; ++ unsigned j; ++ ++ for (j = 0; j < RESERVE_NR; j++) ++ fifo_for_each_entry(i, &ca->free[j], iter) ++ BUG_ON(i == bucket); ++ fifo_for_each_entry(i, &ca->free_inc, iter) ++ BUG_ON(i == bucket); ++ } ++} ++ ++void bch2_recalc_capacity(struct bch_fs *); ++ ++void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); ++void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); ++ ++void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); ++void bch2_dev_allocator_stop(struct bch_dev *); ++int bch2_dev_allocator_start(struct bch_dev *); ++ ++int bch2_alloc_write(struct bch_fs *, unsigned, bool *); ++void bch2_fs_allocator_background_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/alloc_foreground.c 
b/fs/bcachefs/alloc_foreground.c +new file mode 100644 +index 000000000000..979aba30bc9d +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.c +@@ -0,0 +1,1044 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Primary bucket allocation code ++ * ++ * Copyright 2012 Google, Inc. ++ * ++ * Allocation in bcache is done in terms of buckets: ++ * ++ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in ++ * btree pointers - they must match for the pointer to be considered valid. ++ * ++ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a ++ * bucket simply by incrementing its gen. ++ * ++ * The gens (along with the priorities; it's really the gens are important but ++ * the code is named as if it's the priorities) are written in an arbitrary list ++ * of buckets on disk, with a pointer to them in the journal header. ++ * ++ * When we invalidate a bucket, we have to write its new gen to disk and wait ++ * for that write to complete before we use it - otherwise after a crash we ++ * could have pointers that appeared to be good but pointed to data that had ++ * been overwritten. ++ * ++ * Since the gens and priorities are all stored contiguously on disk, we can ++ * batch this up: We fill up the free_inc list with freshly invalidated buckets, ++ * call prio_write(), and when prio_write() finishes we pull buckets off the ++ * free_inc list and optionally discard them. ++ * ++ * free_inc isn't the only freelist - if it was, we'd often have to sleep while ++ * priorities and gens were being written before we could allocate. c->free is a ++ * smaller freelist, and buckets on that list are always ready to be used. ++ * ++ * If we've got discards enabled, that happens when a bucket moves from the ++ * free_inc list to the free list. ++ * ++ * It's important to ensure that gens don't wrap around - with respect to ++ * either the oldest gen in the btree or the gen on disk. This is quite ++ * difficult to do in practice, but we explicitly guard against it anyways - if ++ * a bucket is in danger of wrapping around we simply skip invalidating it that ++ * time around, and we garbage collect or rewrite the priorities sooner than we ++ * would have otherwise. ++ * ++ * bch2_bucket_alloc() allocates a single bucket from a specific device. ++ * ++ * bch2_bucket_alloc_set() allocates one or more buckets from different devices ++ * in a given filesystem. ++ * ++ * invalidate_buckets() drives all the processes described above. It's called ++ * from bch2_bucket_alloc() and a few other places that need to make sure free ++ * buckets are ready. ++ * ++ * invalidate_buckets_(lru|fifo)() find buckets that are available to be ++ * invalidated, and then invalidate them and stick them on the free_inc list - ++ * in either lru or fifo order. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "io.h" ++ ++#include ++#include ++#include ++#include ++ ++enum bucket_alloc_ret { ++ ALLOC_SUCCESS, ++ OPEN_BUCKETS_EMPTY, ++ FREELIST_EMPTY, /* Allocator thread not keeping up */ ++}; ++ ++/* ++ * Open buckets represent a bucket that's currently being allocated from. 
They ++ * serve two purposes: ++ * ++ * - They track buckets that have been partially allocated, allowing for ++ * sub-bucket sized allocations - they're used by the sector allocator below ++ * ++ * - They provide a reference to the buckets they own that mark and sweep GC ++ * can find, until the new allocation has a pointer to it inserted into the ++ * btree ++ * ++ * When allocating some space with the sector allocator, the allocation comes ++ * with a reference to an open bucket - the caller is required to put that ++ * reference _after_ doing the index update that makes its allocation reachable. ++ */ ++ ++void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ if (ob->ec) { ++ bch2_ec_bucket_written(c, ob); ++ return; ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&ob->lock); ++ ++ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), ++ false, gc_pos_alloc(c, ob), 0); ++ ob->valid = false; ++ ob->type = 0; ++ ++ spin_unlock(&ob->lock); ++ percpu_up_read(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ c->open_buckets_nr_free++; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *c, ++ struct open_buckets *obs, ++ unsigned dev) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->ptr.dev == dev && ++ ob->ec) ++ bch2_ec_bucket_cancel(c, ob); ++} ++ ++static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); ++ ++ ob = c->open_buckets + c->open_buckets_freelist; ++ c->open_buckets_freelist = ob->freelist; ++ atomic_set(&ob->pin, 1); ++ ob->type = 0; ++ ++ c->open_buckets_nr_free--; ++ return ob; ++} ++ ++static void open_bucket_free_unused(struct bch_fs *c, ++ struct open_bucket *ob, ++ bool may_realloc) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ BUG_ON(ca->open_buckets_partial_nr >= ++ ARRAY_SIZE(ca->open_buckets_partial)); ++ ++ if (ca->open_buckets_partial_nr < ++ ARRAY_SIZE(ca->open_buckets_partial) && ++ may_realloc) { ++ spin_lock(&c->freelist_lock); ++ ob->on_partial_list = true; ++ ca->open_buckets_partial[ca->open_buckets_partial_nr++] = ++ ob - c->open_buckets; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++ closure_wake_up(&c->freelist_wait); ++ } else { ++ bch2_open_bucket_put(c, ob); ++ } ++} ++ ++static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ BUG_ON(ptr_stale(ca, &ob->ptr)); ++ } ++#endif ++} ++ ++/* _only_ for allocating the journal on a new device: */ ++long bch2_bucket_alloc_new_fs(struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ ssize_t b; ++ ++ rcu_read_lock(); ++ buckets = bucket_array(ca); ++ ++ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) ++ if (is_available_bucket(buckets->b[b].mark)) ++ goto success; ++ b = -1; ++success: ++ rcu_read_unlock(); ++ return b; ++} ++ ++static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) ++{ ++ switch (reserve) { ++ case RESERVE_ALLOC: ++ return 0; ++ case RESERVE_BTREE: ++ 
return OPEN_BUCKETS_COUNT / 4; ++ default: ++ return OPEN_BUCKETS_COUNT / 2; ++ } ++} ++ ++/** ++ * bch_bucket_alloc - allocate a single bucket from a specific device ++ * ++ * Returns index of bucket on success, 0 on failure ++ * */ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct bucket_array *buckets; ++ struct open_bucket *ob; ++ long bucket = 0; ++ ++ spin_lock(&c->freelist_lock); ++ ++ if (may_alloc_partial && ++ ca->open_buckets_partial_nr) { ++ ob = c->open_buckets + ++ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; ++ ob->on_partial_list = false; ++ spin_unlock(&c->freelist_lock); ++ return ob; ++ } ++ ++ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { ++ if (cl) ++ closure_wait(&c->open_buckets_wait, cl); ++ ++ if (!c->blocked_allocate_open_bucket) ++ c->blocked_allocate_open_bucket = local_clock(); ++ ++ spin_unlock(&c->freelist_lock); ++ trace_open_bucket_alloc_fail(ca, reserve); ++ return ERR_PTR(-OPEN_BUCKETS_EMPTY); ++ } ++ ++ if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) ++ goto out; ++ ++ switch (reserve) { ++ case RESERVE_ALLOC: ++ if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) ++ goto out; ++ break; ++ case RESERVE_BTREE: ++ if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= ++ ca->free[RESERVE_BTREE].size && ++ fifo_pop(&ca->free[RESERVE_BTREE], bucket)) ++ goto out; ++ break; ++ case RESERVE_MOVINGGC: ++ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) ++ goto out; ++ break; ++ default: ++ break; ++ } ++ ++ if (cl) ++ closure_wait(&c->freelist_wait, cl); ++ ++ if (!c->blocked_allocate) ++ c->blocked_allocate = local_clock(); ++ ++ spin_unlock(&c->freelist_lock); ++ ++ trace_bucket_alloc_fail(ca, reserve); ++ return ERR_PTR(-FREELIST_EMPTY); ++out: ++ verify_not_on_freelist(c, ca, bucket); ++ ++ ob = bch2_open_bucket_alloc(c); ++ ++ spin_lock(&ob->lock); ++ buckets = bucket_array(ca); ++ ++ ob->valid = true; ++ ob->sectors_free = ca->mi.bucket_size; ++ ob->ptr = (struct bch_extent_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_ptr, ++ .gen = buckets->b[bucket].mark.gen, ++ .offset = bucket_to_sector(ca, bucket), ++ .dev = ca->dev_idx, ++ }; ++ ++ bucket_io_clock_reset(c, ca, bucket, READ); ++ bucket_io_clock_reset(c, ca, bucket, WRITE); ++ spin_unlock(&ob->lock); ++ ++ if (c->blocked_allocate_open_bucket) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate_open_bucket], ++ c->blocked_allocate_open_bucket); ++ c->blocked_allocate_open_bucket = 0; ++ } ++ ++ if (c->blocked_allocate) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate], ++ c->blocked_allocate); ++ c->blocked_allocate = 0; ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ bch2_wake_allocator(ca); ++ ++ trace_bucket_alloc(ca, reserve); ++ return ob; ++} ++ ++static int __dev_stripe_cmp(struct dev_stripe_state *stripe, ++ unsigned l, unsigned r) ++{ ++ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - ++ (stripe->next_alloc[l] < stripe->next_alloc[r])); ++} ++ ++#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs) ++{ ++ struct dev_alloc_list ret = { .nr = 0 }; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_member_device_rcu(ca, c, i, devs) ++ ret.devs[ret.nr++] = i; ++ ++ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); ++ return ret; ++} ++ ++void 
bch2_dev_stripe_increment(struct bch_fs *c, struct bch_dev *ca, ++ struct dev_stripe_state *stripe) ++{ ++ u64 *v = stripe->next_alloc + ca->dev_idx; ++ u64 free_space = dev_buckets_free(c, ca); ++ u64 free_space_inv = free_space ++ ? div64_u64(1ULL << 48, free_space) ++ : 1ULL << 48; ++ u64 scale = *v / 4; ++ ++ if (*v + free_space_inv >= *v) ++ *v += free_space_inv; ++ else ++ *v = U64_MAX; ++ ++ for (v = stripe->next_alloc; ++ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) ++ *v = *v < scale ? 0 : *v - scale; ++} ++ ++#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) ++#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) ++ ++static void add_new_bucket(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ struct open_bucket *ob) ++{ ++ unsigned durability = ++ bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; ++ ++ __clear_bit(ob->ptr.dev, devs_may_alloc->d); ++ *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) ++ ? durability : 1; ++ *have_cache |= !durability; ++ ++ ob_push(c, ptrs, ob); ++} ++ ++static int bch2_bucket_alloc_set(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct dev_alloc_list devs_sorted = ++ bch2_dev_alloc_list(c, stripe, devs_may_alloc); ++ struct bch_dev *ca; ++ bool alloc_failure = false; ++ unsigned i; ++ ++ BUG_ON(*nr_effective >= nr_replicas); ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ struct open_bucket *ob; ++ ++ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ if (!ca) ++ continue; ++ ++ if (!ca->mi.durability && *have_cache) ++ continue; ++ ++ ob = bch2_bucket_alloc(c, ca, reserve, ++ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ if (IS_ERR(ob)) { ++ enum bucket_alloc_ret ret = -PTR_ERR(ob); ++ ++ WARN_ON(reserve == RESERVE_MOVINGGC && ++ ret != OPEN_BUCKETS_EMPTY); ++ ++ if (cl) ++ return -EAGAIN; ++ if (ret == OPEN_BUCKETS_EMPTY) ++ return -ENOSPC; ++ alloc_failure = true; ++ continue; ++ } ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, flags, ob); ++ ++ bch2_dev_stripe_increment(c, ca, stripe); ++ ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ ++ return alloc_failure ? 
-ENOSPC : -EROFS; ++} ++ ++/* Allocate from stripes: */ ++ ++/* ++ * XXX: use a higher watermark for allocating open buckets here: ++ */ ++static int ec_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ unsigned i, nr_have = 0, nr_data = ++ min_t(unsigned, h->nr_active_devs, ++ EC_STRIPE_MAX) - h->redundancy; ++ bool have_cache = true; ++ int ret = 0; ++ ++ BUG_ON(h->blocks.nr > nr_data); ++ BUG_ON(h->parity.nr > h->redundancy); ++ ++ devs = h->devs; ++ ++ open_bucket_for_each(c, &h->parity, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ open_bucket_for_each(c, &h->blocks, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ ++ percpu_down_read(&c->mark_lock); ++ rcu_read_lock(); ++ ++ if (h->parity.nr < h->redundancy) { ++ nr_have = h->parity.nr; ++ ++ ret = bch2_bucket_alloc_set(c, &h->parity, ++ &h->parity_stripe, ++ &devs, ++ h->redundancy, ++ &nr_have, ++ &have_cache, ++ RESERVE_NONE, ++ 0, ++ NULL); ++ if (ret) ++ goto err; ++ } ++ ++ if (h->blocks.nr < nr_data) { ++ nr_have = h->blocks.nr; ++ ++ ret = bch2_bucket_alloc_set(c, &h->blocks, ++ &h->block_stripe, ++ &devs, ++ nr_data, ++ &nr_have, ++ &have_cache, ++ RESERVE_NONE, ++ 0, ++ NULL); ++ if (ret) ++ goto err; ++ } ++ ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ ++ return bch2_ec_stripe_new_alloc(c, h); ++err: ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ return -1; ++} ++ ++/* ++ * if we can't allocate a new stripe because there are already too many ++ * partially filled stripes, force allocating from an existing stripe even when ++ * it's to a device we don't want: ++ */ ++ ++static void bucket_alloc_from_stripe(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags) ++{ ++ struct dev_alloc_list devs_sorted; ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ struct bch_dev *ca; ++ unsigned i, ec_idx; ++ ++ if (!erasure_code) ++ return; ++ ++ if (nr_replicas < 2) ++ return; ++ ++ if (ec_open_bucket(c, ptrs)) ++ return; ++ ++ h = bch2_ec_stripe_head_get(c, target, erasure_code, nr_replicas - 1); ++ if (!h) ++ return; ++ ++ if (!h->s && ec_stripe_alloc(c, h)) ++ goto out_put_head; ++ ++ rcu_read_lock(); ++ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); ++ rcu_read_unlock(); ++ ++ for (i = 0; i < devs_sorted.nr; i++) ++ open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) ++ if (ob->ptr.dev == devs_sorted.devs[i] && ++ !test_and_set_bit(ec_idx, h->s->blocks_allocated)) ++ goto got_bucket; ++ goto out_put_head; ++got_bucket: ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ ob->ec_idx = ec_idx; ++ ob->ec = h->s; ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, flags, ob); ++ atomic_inc(&h->s->pin); ++out_put_head: ++ bch2_ec_stripe_head_put(h); ++} ++ ++/* Sector allocator */ ++ ++static void get_buckets_from_writepoint(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ bool need_ec) ++{ ++ struct open_buckets ptrs_skip = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ if (*nr_effective < nr_replicas && ++ 
test_bit(ob->ptr.dev, devs_may_alloc->d) && ++ (ca->mi.durability || ++ (wp->type == BCH_DATA_USER && !*have_cache)) && ++ (ob->ec || !need_ec)) { ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, ++ flags, ob); ++ } else { ++ ob_push(c, &ptrs_skip, ob); ++ } ++ } ++ wp->ptrs = ptrs_skip; ++} ++ ++static int open_bucket_add_buckets(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_list *devs_have, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *_cl) ++{ ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ struct closure *cl = NULL; ++ unsigned i; ++ int ret; ++ ++ rcu_read_lock(); ++ devs = target_rw_devs(c, wp->type, target); ++ rcu_read_unlock(); ++ ++ /* Don't allocate from devices we already have pointers to: */ ++ for (i = 0; i < devs_have->nr; i++) ++ __clear_bit(devs_have->devs[i], devs.d); ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ ++ if (erasure_code) { ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, true); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ ++ bucket_alloc_from_stripe(c, ptrs, wp, &devs, ++ target, erasure_code, ++ nr_replicas, nr_effective, ++ have_cache, flags); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, false); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ ++ percpu_down_read(&c->mark_lock); ++ rcu_read_lock(); ++ ++retry_blocking: ++ /* ++ * Try nonblocking first, so that if one device is full we'll try from ++ * other devices: ++ */ ++ ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, ++ nr_replicas, nr_effective, have_cache, ++ reserve, flags, cl); ++ if (ret && ret != -EROFS && !cl && _cl) { ++ cl = _cl; ++ goto retry_blocking; ++ } ++ ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, ++ struct open_buckets *obs) ++{ ++ struct open_buckets ptrs = { .nr = 0 }; ++ struct open_bucket *ob, *ob2; ++ unsigned i, j; ++ ++ open_bucket_for_each(c, obs, ob, i) { ++ bool drop = !ca || ob->ptr.dev == ca->dev_idx; ++ ++ if (!drop && ob->ec) { ++ mutex_lock(&ob->ec->lock); ++ open_bucket_for_each(c, &ob->ec->blocks, ob2, j) ++ drop |= ob2->ptr.dev == ca->dev_idx; ++ open_bucket_for_each(c, &ob->ec->parity, ob2, j) ++ drop |= ob2->ptr.dev == ca->dev_idx; ++ mutex_unlock(&ob->ec->lock); ++ } ++ ++ if (drop) ++ bch2_open_bucket_put(c, ob); ++ else ++ ob_push(c, &ptrs, ob); ++ } ++ ++ *obs = ptrs; ++} ++ ++void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, ++ struct write_point *wp) ++{ ++ mutex_lock(&wp->lock); ++ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); ++ mutex_unlock(&wp->lock); ++} ++ ++static inline struct hlist_head *writepoint_hash(struct bch_fs *c, ++ unsigned long write_point) ++{ ++ unsigned hash = ++ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); ++ ++ return &c->write_points_hash[hash]; ++} ++ ++static struct write_point *__writepoint_find(struct hlist_head *head, ++ unsigned long write_point) ++{ ++ struct write_point *wp; ++ ++ hlist_for_each_entry_rcu(wp, head, node) ++ if (wp->write_point == write_point) ++ return wp; ++ ++ return NULL; ++} ++ ++static inline bool too_many_writepoints(struct 
bch_fs *c, unsigned factor) ++{ ++ u64 stranded = c->write_points_nr * c->bucket_size_max; ++ u64 free = bch2_fs_usage_read_short(c).free; ++ ++ return stranded * factor > free; ++} ++ ++static bool try_increase_writepoints(struct bch_fs *c) ++{ ++ struct write_point *wp; ++ ++ if (c->write_points_nr == ARRAY_SIZE(c->write_points) || ++ too_many_writepoints(c, 32)) ++ return false; ++ ++ wp = c->write_points + c->write_points_nr++; ++ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); ++ return true; ++} ++ ++static bool try_decrease_writepoints(struct bch_fs *c, ++ unsigned old_nr) ++{ ++ struct write_point *wp; ++ ++ mutex_lock(&c->write_points_hash_lock); ++ if (c->write_points_nr < old_nr) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return true; ++ } ++ ++ if (c->write_points_nr == 1 || ++ !too_many_writepoints(c, 8)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return false; ++ } ++ ++ wp = c->write_points + --c->write_points_nr; ++ ++ hlist_del_rcu(&wp->node); ++ mutex_unlock(&c->write_points_hash_lock); ++ ++ bch2_writepoint_stop(c, NULL, wp); ++ return true; ++} ++ ++static struct write_point *writepoint_find(struct bch_fs *c, ++ unsigned long write_point) ++{ ++ struct write_point *wp, *oldest; ++ struct hlist_head *head; ++ ++ if (!(write_point & 1UL)) { ++ wp = (struct write_point *) write_point; ++ mutex_lock(&wp->lock); ++ return wp; ++ } ++ ++ head = writepoint_hash(c, write_point); ++restart_find: ++ wp = __writepoint_find(head, write_point); ++ if (wp) { ++lock_wp: ++ mutex_lock(&wp->lock); ++ if (wp->write_point == write_point) ++ goto out; ++ mutex_unlock(&wp->lock); ++ goto restart_find; ++ } ++restart_find_oldest: ++ oldest = NULL; ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) ++ if (!oldest || time_before64(wp->last_used, oldest->last_used)) ++ oldest = wp; ++ ++ mutex_lock(&oldest->lock); ++ mutex_lock(&c->write_points_hash_lock); ++ if (oldest >= c->write_points + c->write_points_nr || ++ try_increase_writepoints(c)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto restart_find_oldest; ++ } ++ ++ wp = __writepoint_find(head, write_point); ++ if (wp && wp != oldest) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto lock_wp; ++ } ++ ++ wp = oldest; ++ hlist_del_rcu(&wp->node); ++ wp->write_point = write_point; ++ hlist_add_head_rcu(&wp->node, head); ++ mutex_unlock(&c->write_points_hash_lock); ++out: ++ wp->last_used = sched_clock(); ++ return wp; ++} ++ ++/* ++ * Get us an open_bucket we can allocate from, return with it locked: ++ */ ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct write_point *wp; ++ struct open_bucket *ob; ++ struct open_buckets ptrs; ++ unsigned nr_effective, write_points_nr; ++ unsigned ob_flags = 0; ++ bool have_cache; ++ int ret, i; ++ ++ if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) ++ ob_flags |= BUCKET_ALLOC_USE_DURABILITY; ++ ++ BUG_ON(!nr_replicas || !nr_replicas_required); ++retry: ++ ptrs.nr = 0; ++ nr_effective = 0; ++ write_points_nr = c->write_points_nr; ++ have_cache = false; ++ ++ wp = writepoint_find(c, write_point.v); ++ ++ if (wp->type == BCH_DATA_USER) ++ ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; ++ ++ /* 
metadata may not allocate on cache devices: */ ++ if (wp->type != BCH_DATA_USER) ++ have_cache = true; ++ ++ if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } else { ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, NULL); ++ if (!ret) ++ goto alloc_done; ++ ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ 0, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } ++alloc_done: ++ BUG_ON(!ret && nr_effective < nr_replicas); ++ ++ if (erasure_code && !ec_open_bucket(c, &ptrs)) ++ pr_debug("failed to get ec bucket: ret %u", ret); ++ ++ if (ret == -EROFS && ++ nr_effective >= nr_replicas_required) ++ ret = 0; ++ ++ if (ret) ++ goto err; ++ ++ /* Free buckets we didn't use: */ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ open_bucket_free_unused(c, ob, wp->type == BCH_DATA_USER); ++ ++ wp->ptrs = ptrs; ++ ++ wp->sectors_free = UINT_MAX; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ wp->sectors_free = min(wp->sectors_free, ob->sectors_free); ++ ++ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); ++ ++ verify_not_stale(c, &wp->ptrs); ++ ++ return wp; ++err: ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ++ ob_push(c, &ptrs, ob); ++ else ++ open_bucket_free_unused(c, ob, ++ wp->type == BCH_DATA_USER); ++ wp->ptrs = ptrs; ++ ++ mutex_unlock(&wp->lock); ++ ++ if (ret == -ENOSPC && ++ try_decrease_writepoints(c, write_points_nr)) ++ goto retry; ++ ++ return ERR_PTR(ret); ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, ++ struct bkey_i *k, unsigned sectors) ++ ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ BUG_ON(sectors > wp->sectors_free); ++ wp->sectors_free -= sectors; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_extent_ptr tmp = ob->ptr; ++ ++ tmp.cached = !ca->mi.durability && ++ wp->type == BCH_DATA_USER; ++ ++ tmp.offset += ca->mi.bucket_size - ob->sectors_free; ++ bch2_bkey_append_ptr(k, tmp); ++ ++ BUG_ON(sectors > ob->sectors_free); ++ ob->sectors_free -= sectors; ++ } ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ ob_push(c, !ob->sectors_free ? 
&ptrs : &keep, ob); ++ wp->ptrs = keep; ++ ++ mutex_unlock(&wp->lock); ++ ++ bch2_open_buckets_put(c, &ptrs); ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ struct write_point *wp; ++ ++ mutex_init(&c->write_points_hash_lock); ++ c->write_points_nr = ARRAY_SIZE(c->write_points); ++ ++ /* open bucket 0 is a sentinal NULL: */ ++ spin_lock_init(&c->open_buckets[0].lock); ++ ++ for (ob = c->open_buckets + 1; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { ++ spin_lock_init(&ob->lock); ++ c->open_buckets_nr_free++; ++ ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ } ++ ++ writepoint_init(&c->btree_write_point, BCH_DATA_BTREE); ++ writepoint_init(&c->rebalance_write_point, BCH_DATA_USER); ++ ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) { ++ writepoint_init(wp, BCH_DATA_USER); ++ ++ wp->last_used = sched_clock(); ++ wp->write_point = (unsigned long) wp; ++ hlist_add_head_rcu(&wp->node, ++ writepoint_hash(c, wp->write_point)); ++ } ++} +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +new file mode 100644 +index 000000000000..687f973e4b3a +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.h +@@ -0,0 +1,133 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_FOREGROUND_H ++#define _BCACHEFS_ALLOC_FOREGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++ ++#include ++ ++struct bkey; ++struct bch_dev; ++struct bch_fs; ++struct bch_devs_List; ++ ++struct dev_alloc_list { ++ unsigned nr; ++ u8 devs[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, ++ struct dev_stripe_state *, ++ struct bch_devs_mask *); ++void bch2_dev_stripe_increment(struct bch_fs *, struct bch_dev *, ++ struct dev_stripe_state *); ++ ++long bch2_bucket_alloc_new_fs(struct bch_dev *); ++ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, ++ enum alloc_reserve, bool, ++ struct closure *); ++ ++static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, ++ struct open_bucket *ob) ++{ ++ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); ++ ++ obs->v[obs->nr++] = ob - c->open_buckets; ++} ++ ++#define open_bucket_for_each(_c, _obs, _ob, _i) \ ++ for ((_i) = 0; \ ++ (_i) < (_obs)->nr && \ ++ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ ++ (_i)++) ++ ++static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, ++ struct open_buckets *obs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->ec) ++ return ob; ++ ++ return NULL; ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *, ++ struct open_buckets *, unsigned); ++ ++void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); ++ ++static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ if (atomic_dec_and_test(&ob->pin)) ++ __bch2_open_bucket_put(c, ob); ++} ++ ++static inline void bch2_open_buckets_put(struct bch_fs *c, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ bch2_open_bucket_put(c, ob); ++ ptrs->nr = 0; ++} ++ ++static inline void bch2_open_bucket_get(struct bch_fs *c, ++ struct write_point *wp, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ ob->type = wp->type; ++ atomic_inc(&ob->pin); ++ ob_push(c, ptrs, ob); ++ } ++} ++ ++struct 
write_point *bch2_alloc_sectors_start(struct bch_fs *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *); ++ ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, ++ struct bkey_i *, unsigned); ++void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, ++ struct open_buckets *); ++ ++void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, ++ struct write_point *); ++ ++static inline struct write_point_specifier writepoint_hashed(unsigned long v) ++{ ++ return (struct write_point_specifier) { .v = v | 1 }; ++} ++ ++static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) ++{ ++ return (struct write_point_specifier) { .v = (unsigned long) wp }; ++} ++ ++static inline void writepoint_init(struct write_point *wp, ++ enum bch_data_type type) ++{ ++ mutex_init(&wp->lock); ++ wp->type = type; ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +new file mode 100644 +index 000000000000..4f1465077994 +--- /dev/null ++++ b/fs/bcachefs/alloc_types.h +@@ -0,0 +1,112 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_TYPES_H ++#define _BCACHEFS_ALLOC_TYPES_H ++ ++#include ++#include ++ ++#include "clock_types.h" ++#include "fifo.h" ++ ++struct ec_bucket_buf; ++ ++/* There's two of these clocks, one for reads and one for writes: */ ++struct bucket_clock { ++ /* ++ * "now" in (read/write) IO time - incremented whenever we do X amount ++ * of reads or writes. ++ * ++ * Goes with the bucket read/write prios: when we read or write to a ++ * bucket we reset the bucket's prio to the current hand; thus hand - ++ * prio = time since bucket was last read/written. ++ * ++ * The units are some amount (bytes/sectors) of data read/written, and ++ * the units can change on the fly if we need to rescale to fit ++ * everything in a u16 - your only guarantee is that the units are ++ * consistent. 
++ */ ++ u16 hand; ++ u16 max_last_io; ++ ++ int rw; ++ ++ struct io_timer rescale; ++ struct mutex lock; ++}; ++ ++/* There is one reserve for each type of btree, one for prios and gens ++ * and one for moving GC */ ++enum alloc_reserve { ++ RESERVE_ALLOC = -1, ++ RESERVE_BTREE = 0, ++ RESERVE_MOVINGGC = 1, ++ RESERVE_NONE = 2, ++ RESERVE_NR = 3, ++}; ++ ++typedef FIFO(long) alloc_fifo; ++ ++#define OPEN_BUCKETS_COUNT 1024 ++ ++#define WRITE_POINT_HASH_NR 32 ++#define WRITE_POINT_MAX 32 ++ ++typedef u16 open_bucket_idx_t; ++ ++struct open_bucket { ++ spinlock_t lock; ++ atomic_t pin; ++ open_bucket_idx_t freelist; ++ ++ /* ++ * When an open bucket has an ec_stripe attached, this is the index of ++ * the block in the stripe this open_bucket corresponds to: ++ */ ++ u8 ec_idx; ++ u8 type; ++ unsigned valid:1; ++ unsigned on_partial_list:1; ++ unsigned sectors_free; ++ struct bch_extent_ptr ptr; ++ struct ec_stripe_new *ec; ++}; ++ ++#define OPEN_BUCKET_LIST_MAX 15 ++ ++struct open_buckets { ++ open_bucket_idx_t nr; ++ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; ++}; ++ ++struct dev_stripe_state { ++ u64 next_alloc[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct write_point { ++ struct hlist_node node; ++ struct mutex lock; ++ u64 last_used; ++ unsigned long write_point; ++ enum bch_data_type type; ++ bool is_ec; ++ ++ /* calculated based on how many pointers we're actually going to use: */ ++ unsigned sectors_free; ++ ++ struct open_buckets ptrs; ++ struct dev_stripe_state stripe; ++}; ++ ++struct write_point_specifier { ++ unsigned long v; ++}; ++ ++struct alloc_heap_entry { ++ size_t bucket; ++ size_t nr; ++ unsigned long key; ++}; ++ ++typedef HEAP(struct alloc_heap_entry) alloc_heap; ++ ++#endif /* _BCACHEFS_ALLOC_TYPES_H */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +new file mode 100644 +index 000000000000..893c89dbee60 +--- /dev/null ++++ b/fs/bcachefs/bcachefs.h +@@ -0,0 +1,878 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_H ++#define _BCACHEFS_H ++ ++/* ++ * SOME HIGH LEVEL CODE DOCUMENTATION: ++ * ++ * Bcache mostly works with cache sets, cache devices, and backing devices. ++ * ++ * Support for multiple cache devices hasn't quite been finished off yet, but ++ * it's about 95% plumbed through. A cache set and its cache devices is sort of ++ * like a md raid array and its component devices. Most of the code doesn't care ++ * about individual cache devices, the main abstraction is the cache set. ++ * ++ * Multiple cache devices is intended to give us the ability to mirror dirty ++ * cached data and metadata, without mirroring clean cached data. ++ * ++ * Backing devices are different, in that they have a lifetime independent of a ++ * cache set. When you register a newly formatted backing device it'll come up ++ * in passthrough mode, and then you can attach and detach a backing device from ++ * a cache set at runtime - while it's mounted and in use. Detaching implicitly ++ * invalidates any cached data for that backing device. ++ * ++ * A cache set can have multiple (many) backing devices attached to it. ++ * ++ * There's also flash only volumes - this is the reason for the distinction ++ * between struct cached_dev and struct bcache_device. A flash only volume ++ * works much like a bcache device that has a backing device, except the ++ * "cached" data is always dirty. The end result is that we get thin ++ * provisioning with very little additional code. 
++ * ++ * Flash only volumes work but they're not production ready because the moving ++ * garbage collector needs more work. More on that later. ++ * ++ * BUCKETS/ALLOCATION: ++ * ++ * Bcache is primarily designed for caching, which means that in normal ++ * operation all of our available space will be allocated. Thus, we need an ++ * efficient way of deleting things from the cache so we can write new things to ++ * it. ++ * ++ * To do this, we first divide the cache device up into buckets. A bucket is the ++ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ ++ * works efficiently. ++ * ++ * Each bucket has a 16 bit priority, and an 8 bit generation associated with ++ * it. The gens and priorities for all the buckets are stored contiguously and ++ * packed on disk (in a linked list of buckets - aside from the superblock, all ++ * of bcache's metadata is stored in buckets). ++ * ++ * The priority is used to implement an LRU. We reset a bucket's priority when ++ * we allocate it or on cache it, and every so often we decrement the priority ++ * of each bucket. It could be used to implement something more sophisticated, ++ * if anyone ever gets around to it. ++ * ++ * The generation is used for invalidating buckets. Each pointer also has an 8 ++ * bit generation embedded in it; for a pointer to be considered valid, its gen ++ * must match the gen of the bucket it points into. Thus, to reuse a bucket all ++ * we have to do is increment its gen (and write its new gen to disk; we batch ++ * this up). ++ * ++ * Bcache is entirely COW - we never write twice to a bucket, even buckets that ++ * contain metadata (including btree nodes). ++ * ++ * THE BTREE: ++ * ++ * Bcache is in large part design around the btree. ++ * ++ * At a high level, the btree is just an index of key -> ptr tuples. ++ * ++ * Keys represent extents, and thus have a size field. Keys also have a variable ++ * number of pointers attached to them (potentially zero, which is handy for ++ * invalidating the cache). ++ * ++ * The key itself is an inode:offset pair. The inode number corresponds to a ++ * backing device or a flash only volume. The offset is the ending offset of the ++ * extent within the inode - not the starting offset; this makes lookups ++ * slightly more convenient. ++ * ++ * Pointers contain the cache device id, the offset on that device, and an 8 bit ++ * generation number. More on the gen later. ++ * ++ * Index lookups are not fully abstracted - cache lookups in particular are ++ * still somewhat mixed in with the btree code, but things are headed in that ++ * direction. ++ * ++ * Updates are fairly well abstracted, though. There are two different ways of ++ * updating the btree; insert and replace. ++ * ++ * BTREE_INSERT will just take a list of keys and insert them into the btree - ++ * overwriting (possibly only partially) any extents they overlap with. This is ++ * used to update the index after a write. ++ * ++ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is ++ * overwriting a key that matches another given key. This is used for inserting ++ * data into the cache after a cache miss, and for background writeback, and for ++ * the moving garbage collector. ++ * ++ * There is no "delete" operation; deleting things from the index is ++ * accomplished by either by invalidating pointers (by incrementing a bucket's ++ * gen) or by inserting a key with 0 pointers - which will overwrite anything ++ * previously present at that location in the index. 
++ *
++ * This means that there are always stale/invalid keys in the btree. They're
++ * filtered out by the code that iterates through a btree node, and removed when
++ * a btree node is rewritten.
++ *
++ * BTREE NODES:
++ *
++ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and
++ * free smaller than a bucket - so, that's how big our btree nodes are.
++ *
++ * (If buckets are really big we'll only use part of the bucket for a btree node
++ * - no less than 1/4th - but a bucket still contains no more than a single
++ * btree node. I'd actually like to change this, but for now we rely on the
++ * bucket's gen for deleting btree nodes when we rewrite/split a node.)
++ *
++ * Anyways, btree nodes are big - big enough to be inefficient with a textbook
++ * btree implementation.
++ *
++ * The way this is solved is that btree nodes are internally log structured; we
++ * can append new keys to an existing btree node without rewriting it. This
++ * means each set of keys we write is sorted, but the node is not.
++ *
++ * We maintain this log structure in memory - keeping 1Mb of keys sorted would
++ * be expensive, and we have to distinguish between the keys we have written and
++ * the keys we haven't. So to do a lookup in a btree node, we have to search
++ * each sorted set. But we do merge written sets together lazily, so the cost of
++ * these extra searches is quite low (normally most of the keys in a btree node
++ * will be in one big set, and then there'll be one or two sets that are much
++ * smaller).
++ *
++ * This log structure makes bcache's btree more of a hybrid between a
++ * conventional btree and a compacting data structure, with some of the
++ * advantages of both.
++ *
++ * GARBAGE COLLECTION:
++ *
++ * We can't just invalidate any bucket - it might contain dirty data or
++ * metadata. If it once contained dirty data, other writes might overwrite it
++ * later, leaving no valid pointers into that bucket in the index.
++ *
++ * Thus, the primary purpose of garbage collection is to find buckets to reuse.
++ * It also counts how much valid data each bucket currently contains, so that
++ * allocation can reuse buckets sooner when they've been mostly overwritten.
++ *
++ * It also does some things that are really internal to the btree
++ * implementation. If a btree node contains pointers that are stale by more than
++ * some threshold, it rewrites the btree node to avoid the bucket's generation
++ * wrapping around. It also merges adjacent btree nodes if they're empty enough.
++ *
++ * THE JOURNAL:
++ *
++ * Bcache's journal is not necessary for consistency; we always strictly
++ * order metadata writes so that the btree and everything else is consistent on
++ * disk in the event of an unclean shutdown, and in fact bcache had writeback
++ * caching (with recovery from unclean shutdown) before journalling was
++ * implemented.
++ *
++ * Rather, the journal is purely a performance optimization; we can't complete a
++ * write until we've updated the index on disk, otherwise the cache would be
++ * inconsistent in the event of an unclean shutdown. This means that without the
++ * journal, on random write workloads we constantly have to update all the leaf
++ * nodes in the btree, and those writes will be mostly empty (appending at most
++ * a few keys each) - highly inefficient in terms of amount of metadata writes,
++ * and it puts more strain on the various btree resorting/compacting code.
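The "search each sorted set" lookup described under BTREE NODES above can be modelled in a few lines. This toy version uses plain ints where the real code searches packed bkeys with per-bset lookup tables, and every name is invented; it only shows the shape of the algorithm: each sorted set in the node is consulted, newest first.

    #include <stdbool.h>
    #include <stddef.h>

    /* One btree node = several independently sorted "sets" of keys, newest
     * set written last. A lookup must consult every set; newer sets override
     * older ones, mirroring the log-structured layout described above. */
    struct toy_set  { const int *keys; size_t nr; };
    struct toy_node { struct toy_set sets[4]; size_t nr_sets; };

    static bool set_contains(const struct toy_set *s, int key)
    {
            size_t lo = 0, hi = s->nr;

            while (lo < hi) {               /* plain binary search */
                    size_t mid = lo + (hi - lo) / 2;
                    if (s->keys[mid] < key)
                            lo = mid + 1;
                    else
                            hi = mid;
            }
            return lo < s->nr && s->keys[lo] == key;
    }

    static bool node_contains(const struct toy_node *n, int key)
    {
            for (size_t i = n->nr_sets; i-- > 0; )  /* newest set first */
                    if (set_contains(&n->sets[i], key))
                            return true;
            return false;
    }

    int main(void)
    {
            static const int s0[] = { 1, 4, 9 }, s1[] = { 2, 4, 16 };
            struct toy_node n = { { { s0, 3 }, { s1, 3 } }, 2 };

            return !(node_contains(&n, 16) && !node_contains(&n, 3));
    }

Merging written sets together lazily keeps the number of sets per node small, which is what keeps these extra binary searches cheap in practice.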
++ * ++ * The journal is just a log of keys we've inserted; on startup we just reinsert ++ * all the keys in the open journal entries. That means that when we're updating ++ * a node in the btree, we can wait until a 4k block of keys fills up before ++ * writing them out. ++ * ++ * For simplicity, we only journal updates to leaf nodes; updates to parent ++ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth ++ * the complexity to deal with journalling them (in particular, journal replay) ++ * - updates to non leaf nodes just happen synchronously (see btree_split()). ++ */ ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "bcachefs_format.h" ++#include "fifo.h" ++#include "opts.h" ++#include "util.h" ++ ++#include ++ ++#define bch2_fs_init_fault(name) \ ++ dynamic_fault("bcachefs:bch_fs_init:" name) ++#define bch2_meta_read_fault(name) \ ++ dynamic_fault("bcachefs:meta:read:" name) ++#define bch2_meta_write_fault(name) \ ++ dynamic_fault("bcachefs:meta:write:" name) ++ ++#ifdef __KERNEL__ ++#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) ++#else ++#define bch2_fmt(_c, fmt) fmt "\n" ++#endif ++ ++#define bch_info(c, fmt, ...) \ ++ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_notice(c, fmt, ...) \ ++ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn(c, fmt, ...) \ ++ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err(c, fmt, ...) \ ++ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++ ++#define bch_verbose(c, fmt, ...) \ ++do { \ ++ if ((c)->opts.verbose) \ ++ bch_info(c, fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define pr_verbose_init(opts, fmt, ...) 
\ ++do { \ ++ if (opt_get(opts, verbose)) \ ++ pr_info(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++/* Parameters that are useful for debugging, but should always be compiled in: */ ++#define BCH_DEBUG_PARAMS_ALWAYS() \ ++ BCH_DEBUG_PARAM(key_merging_disabled, \ ++ "Disables merging of extents") \ ++ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ ++ "Causes mark and sweep to compact and rewrite every " \ ++ "btree node it traverses") \ ++ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ ++ "Disables rewriting of btree nodes during mark and sweep")\ ++ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ ++ "Disables the shrinker callback for the btree node cache") ++ ++/* Parameters that should only be compiled in in debug mode: */ ++#define BCH_DEBUG_PARAMS_DEBUG() \ ++ BCH_DEBUG_PARAM(expensive_debug_checks, \ ++ "Enables various runtime debugging checks that " \ ++ "significantly affect performance") \ ++ BCH_DEBUG_PARAM(debug_check_iterators, \ ++ "Enables extra verification for btree iterators") \ ++ BCH_DEBUG_PARAM(debug_check_bkeys, \ ++ "Run bkey_debugcheck (primarily checking GC/allocation "\ ++ "information) when iterating over keys") \ ++ BCH_DEBUG_PARAM(verify_btree_ondisk, \ ++ "Reread btree nodes at various points to verify the " \ ++ "mergesort in the read path against modifications " \ ++ "done in memory") \ ++ BCH_DEBUG_PARAM(journal_seq_verify, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(inject_invalid_keys, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(test_alloc_startup, \ ++ "Force allocator startup to use the slowpath where it" \ ++ "can't find enough free buckets without invalidating" \ ++ "cached data") \ ++ BCH_DEBUG_PARAM(force_reconstruct_read, \ ++ "Force reads to use the reconstruct path, when reading" \ ++ "from erasure coded extents") \ ++ BCH_DEBUG_PARAM(test_restart_gc, \ ++ "Test restarting mark and sweep gc when bucket gens change") ++ ++#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() ++#else ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() ++#endif ++ ++#define BCH_TIME_STATS() \ ++ x(btree_node_mem_alloc) \ ++ x(btree_node_split) \ ++ x(btree_node_sort) \ ++ x(btree_node_read) \ ++ x(btree_gc) \ ++ x(btree_lock_contended_read) \ ++ x(btree_lock_contended_intent) \ ++ x(btree_lock_contended_write) \ ++ x(data_write) \ ++ x(data_read) \ ++ x(data_promote) \ ++ x(journal_write) \ ++ x(journal_delay) \ ++ x(journal_flush_seq) \ ++ x(blocked_journal) \ ++ x(blocked_allocate) \ ++ x(blocked_allocate_open_bucket) ++ ++enum bch_time_stats { ++#define x(name) BCH_TIME_##name, ++ BCH_TIME_STATS() ++#undef x ++ BCH_TIME_STAT_NR ++}; ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "clock_types.h" ++#include "ec_types.h" ++#include "journal_types.h" ++#include "keylist_types.h" ++#include "quota_types.h" ++#include "rebalance_types.h" ++#include "replicas_types.h" ++#include "super_types.h" ++ ++/* Number of nodes btree coalesce will try to coalesce at once */ ++#define GC_MERGE_NODES 4U ++ ++/* Maximum number of nodes we might need to allocate atomically: */ ++#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) ++ 
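BCH_DEBUG_PARAMS() and BCH_TIME_STATS() above are X-macros: a single list that gets expanded several times with different definitions of x(), so the enum, any name tables, and any struct fields generated from it can never drift out of sync. A minimal standalone illustration of the same pattern (the list and names here are invented for the example):

    #include <stdio.h>

    /* One central list of names... */
    #define TOY_STATS()             \
            x(btree_node_split)     \
            x(data_read)            \
            x(journal_write)

    /* ...expanded once to build an enum... */
    enum toy_stat {
    #define x(name) TOY_##name,
            TOY_STATS()
    #undef x
            TOY_NR
    };

    /* ...and again to build a matching name table. */
    static const char * const toy_stat_names[] = {
    #define x(name) #name,
            TOY_STATS()
    #undef x
    };

    int main(void)
    {
            for (int i = 0; i < TOY_NR; i++)
                    printf("%d: %s\n", i, toy_stat_names[i]);
            return 0;
    }

This header uses exactly that trick to generate enum bch_time_stats above, and later to embed one bool per BCH_DEBUG_PARAM directly in struct bch_fs.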
++/* Size of the freelist we allocate btree nodes from: */ ++#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) ++ ++#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) ++ ++struct btree; ++ ++enum gc_phase { ++ GC_PHASE_NOT_RUNNING, ++ GC_PHASE_START, ++ GC_PHASE_SB, ++ ++ GC_PHASE_BTREE_EC, ++ GC_PHASE_BTREE_EXTENTS, ++ GC_PHASE_BTREE_INODES, ++ GC_PHASE_BTREE_DIRENTS, ++ GC_PHASE_BTREE_XATTRS, ++ GC_PHASE_BTREE_ALLOC, ++ GC_PHASE_BTREE_QUOTAS, ++ GC_PHASE_BTREE_REFLINK, ++ ++ GC_PHASE_PENDING_DELETE, ++ GC_PHASE_ALLOC, ++}; ++ ++struct gc_pos { ++ enum gc_phase phase; ++ struct bpos pos; ++ unsigned level; ++}; ++ ++struct io_count { ++ u64 sectors[2][BCH_DATA_NR]; ++}; ++ ++struct bch_dev { ++ struct kobject kobj; ++ struct percpu_ref ref; ++ struct completion ref_completion; ++ struct percpu_ref io_ref; ++ struct completion io_ref_completion; ++ ++ struct bch_fs *fs; ++ ++ u8 dev_idx; ++ /* ++ * Cached version of this device's member info from superblock ++ * Committed by bch2_write_super() -> bch_fs_mi_update() ++ */ ++ struct bch_member_cpu mi; ++ uuid_le uuid; ++ char name[BDEVNAME_SIZE]; ++ ++ struct bch_sb_handle disk_sb; ++ struct bch_sb *sb_read_scratch; ++ int sb_write_error; ++ ++ struct bch_devs_mask self; ++ ++ /* biosets used in cloned bios for writing multiple replicas */ ++ struct bio_set replica_set; ++ ++ /* ++ * Buckets: ++ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and ++ * gc_lock, for device resize - holding any is sufficient for access: ++ * Or rcu_read_lock(), but only for ptr_stale(): ++ */ ++ struct bucket_array __rcu *buckets[2]; ++ unsigned long *buckets_nouse; ++ struct rw_semaphore bucket_lock; ++ ++ struct bch_dev_usage __percpu *usage[2]; ++ ++ /* Allocator: */ ++ struct task_struct __rcu *alloc_thread; ++ ++ /* ++ * free: Buckets that are ready to be used ++ * ++ * free_inc: Incoming buckets - these are buckets that currently have ++ * cached data in them, and we can't reuse them until after we write ++ * their new gen to disk. 
After prio_write() finishes writing the new ++ * gens/prios, they'll be moved to the free list (and possibly discarded ++ * in the process) ++ */ ++ alloc_fifo free[RESERVE_NR]; ++ alloc_fifo free_inc; ++ ++ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_partial_nr; ++ ++ size_t fifo_last_bucket; ++ ++ /* last calculated minimum prio */ ++ u16 max_last_bucket_io[2]; ++ ++ size_t inc_gen_needs_gc; ++ size_t inc_gen_really_needs_gc; ++ ++ /* ++ * XXX: this should be an enum for allocator state, so as to include ++ * error state ++ */ ++ enum { ++ ALLOCATOR_STOPPED, ++ ALLOCATOR_RUNNING, ++ ALLOCATOR_BLOCKED, ++ ALLOCATOR_BLOCKED_FULL, ++ } allocator_state; ++ ++ alloc_heap alloc_heap; ++ ++ /* Copying GC: */ ++ struct task_struct *copygc_thread; ++ copygc_heap copygc_heap; ++ struct bch_pd_controller copygc_pd; ++ struct write_point copygc_write_point; ++ u64 copygc_threshold; ++ ++ atomic64_t rebalance_work; ++ ++ struct journal_device journal; ++ ++ struct work_struct io_error_work; ++ ++ /* The rest of this all shows up in sysfs */ ++ atomic64_t cur_latency[2]; ++ struct time_stats io_latency[2]; ++ ++#define CONGESTED_MAX 1024 ++ atomic_t congested; ++ u64 congested_last; ++ ++ struct io_count __percpu *io_done; ++}; ++ ++enum { ++ /* startup: */ ++ BCH_FS_ALLOC_READ_DONE, ++ BCH_FS_ALLOC_CLEAN, ++ BCH_FS_ALLOCATOR_RUNNING, ++ BCH_FS_ALLOCATOR_STOPPING, ++ BCH_FS_INITIAL_GC_DONE, ++ BCH_FS_BTREE_INTERIOR_REPLAY_DONE, ++ BCH_FS_FSCK_DONE, ++ BCH_FS_STARTED, ++ BCH_FS_RW, ++ ++ /* shutdown: */ ++ BCH_FS_STOPPING, ++ BCH_FS_EMERGENCY_RO, ++ BCH_FS_WRITE_DISABLE_COMPLETE, ++ ++ /* errors: */ ++ BCH_FS_ERROR, ++ BCH_FS_ERRORS_FIXED, ++ ++ /* misc: */ ++ BCH_FS_BDEV_MOUNTED, ++ BCH_FS_FIXED_GENS, ++ BCH_FS_ALLOC_WRITTEN, ++ BCH_FS_REBUILD_REPLICAS, ++ BCH_FS_HOLD_BTREE_WRITES, ++}; ++ ++struct btree_debug { ++ unsigned id; ++ struct dentry *btree; ++ struct dentry *btree_format; ++ struct dentry *failed; ++}; ++ ++struct bch_fs_pcpu { ++ u64 sectors_available; ++}; ++ ++struct journal_seq_blacklist_table { ++ size_t nr; ++ struct journal_seq_blacklist_table_entry { ++ u64 start; ++ u64 end; ++ bool dirty; ++ } entries[0]; ++}; ++ ++struct journal_keys { ++ struct journal_key { ++ enum btree_id btree_id:8; ++ unsigned level:8; ++ struct bkey_i *k; ++ u32 journal_seq; ++ u32 journal_offset; ++ } *d; ++ size_t nr; ++ u64 journal_seq_base; ++}; ++ ++struct bch_fs { ++ struct closure cl; ++ ++ struct list_head list; ++ struct kobject kobj; ++ struct kobject internal; ++ struct kobject opts_dir; ++ struct kobject time_stats; ++ unsigned long flags; ++ ++ int minor; ++ struct device *chardev; ++ struct super_block *vfs_sb; ++ char name[40]; ++ ++ /* ro/rw, add/remove/resize devices: */ ++ struct rw_semaphore state_lock; ++ ++ /* Counts outstanding writes, for clean transition to read-only */ ++ struct percpu_ref writes; ++ struct work_struct read_only_work; ++ ++ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; ++ ++ struct bch_replicas_cpu replicas; ++ struct bch_replicas_cpu replicas_gc; ++ struct mutex replicas_gc_lock; ++ ++ struct journal_entry_res replicas_journal_res; ++ ++ struct bch_disk_groups_cpu __rcu *disk_groups; ++ ++ struct bch_opts opts; ++ ++ /* Updated by bch2_sb_update():*/ ++ struct { ++ uuid_le uuid; ++ uuid_le user_uuid; ++ ++ u16 version; ++ u16 encoded_extent_max; ++ ++ u8 nr_devices; ++ u8 clean; ++ ++ u8 encryption_type; ++ ++ u64 time_base_lo; ++ u32 time_base_hi; ++ u32 time_precision; ++ u64 features; ++ u64 compat; ++ } sb; 
++ ++ struct bch_sb_handle disk_sb; ++ ++ unsigned short block_bits; /* ilog2(block_size) */ ++ ++ u16 btree_foreground_merge_threshold; ++ ++ struct closure sb_write; ++ struct mutex sb_lock; ++ ++ /* BTREE CACHE */ ++ struct bio_set btree_bio; ++ ++ struct btree_root btree_roots[BTREE_ID_NR]; ++ struct mutex btree_root_lock; ++ ++ struct btree_cache btree_cache; ++ ++ /* ++ * Cache of allocated btree nodes - if we allocate a btree node and ++ * don't use it, if we free it that space can't be reused until going ++ * _all_ the way through the allocator (which exposes us to a livelock ++ * when allocating btree reserves fail halfway through) - instead, we ++ * can stick them here: ++ */ ++ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; ++ unsigned btree_reserve_cache_nr; ++ struct mutex btree_reserve_cache_lock; ++ ++ mempool_t btree_interior_update_pool; ++ struct list_head btree_interior_update_list; ++ struct list_head btree_interior_updates_unwritten; ++ struct mutex btree_interior_update_lock; ++ struct closure_waitlist btree_interior_update_wait; ++ ++ struct workqueue_struct *btree_interior_update_worker; ++ struct work_struct btree_interior_update_work; ++ ++ /* btree_iter.c: */ ++ struct mutex btree_trans_lock; ++ struct list_head btree_trans_list; ++ mempool_t btree_iters_pool; ++ ++ struct btree_key_cache btree_key_cache; ++ ++ struct workqueue_struct *wq; ++ /* copygc needs its own workqueue for index updates.. */ ++ struct workqueue_struct *copygc_wq; ++ struct workqueue_struct *journal_reclaim_wq; ++ ++ /* ALLOCATION */ ++ struct delayed_work pd_controllers_update; ++ unsigned pd_controllers_update_seconds; ++ ++ struct bch_devs_mask rw_devs[BCH_DATA_NR]; ++ ++ u64 capacity; /* sectors */ ++ ++ /* ++ * When capacity _decreases_ (due to a disk being removed), we ++ * increment capacity_gen - this invalidates outstanding reservations ++ * and forces them to be revalidated ++ */ ++ u32 capacity_gen; ++ unsigned bucket_size_max; ++ ++ atomic64_t sectors_available; ++ ++ struct bch_fs_pcpu __percpu *pcpu; ++ ++ struct percpu_rw_semaphore mark_lock; ++ ++ seqcount_t usage_lock; ++ struct bch_fs_usage *usage_base; ++ struct bch_fs_usage __percpu *usage[2]; ++ struct bch_fs_usage __percpu *usage_gc; ++ ++ /* single element mempool: */ ++ struct mutex usage_scratch_lock; ++ struct bch_fs_usage *usage_scratch; ++ ++ /* ++ * When we invalidate buckets, we use both the priority and the amount ++ * of good data to determine which buckets to reuse first - to weight ++ * those together consistently we keep track of the smallest nonzero ++ * priority of any bucket. 
++ */ ++ struct bucket_clock bucket_clock[2]; ++ ++ struct io_clock io_clock[2]; ++ ++ /* JOURNAL SEQ BLACKLIST */ ++ struct journal_seq_blacklist_table * ++ journal_seq_blacklist_table; ++ struct work_struct journal_seq_blacklist_gc_work; ++ ++ /* ALLOCATOR */ ++ spinlock_t freelist_lock; ++ struct closure_waitlist freelist_wait; ++ u64 blocked_allocate; ++ u64 blocked_allocate_open_bucket; ++ open_bucket_idx_t open_buckets_freelist; ++ open_bucket_idx_t open_buckets_nr_free; ++ struct closure_waitlist open_buckets_wait; ++ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; ++ ++ struct write_point btree_write_point; ++ struct write_point rebalance_write_point; ++ ++ struct write_point write_points[WRITE_POINT_MAX]; ++ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; ++ struct mutex write_points_hash_lock; ++ unsigned write_points_nr; ++ ++ /* GARBAGE COLLECTION */ ++ struct task_struct *gc_thread; ++ atomic_t kick_gc; ++ unsigned long gc_count; ++ ++ /* ++ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] ++ * has been marked by GC. ++ * ++ * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) ++ * ++ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread ++ * can read without a lock. ++ */ ++ seqcount_t gc_pos_lock; ++ struct gc_pos gc_pos; ++ ++ /* ++ * The allocation code needs gc_mark in struct bucket to be correct, but ++ * it's not while a gc is in progress. ++ */ ++ struct rw_semaphore gc_lock; ++ ++ /* IO PATH */ ++ struct semaphore io_in_flight; ++ struct bio_set bio_read; ++ struct bio_set bio_read_split; ++ struct bio_set bio_write; ++ struct mutex bio_bounce_pages_lock; ++ mempool_t bio_bounce_pages; ++ struct rhashtable promote_table; ++ ++ mempool_t compression_bounce[2]; ++ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; ++ mempool_t decompress_workspace; ++ ZSTD_parameters zstd_params; ++ ++ struct crypto_shash *sha256; ++ struct crypto_sync_skcipher *chacha20; ++ struct crypto_shash *poly1305; ++ ++ atomic64_t key_version; ++ ++ mempool_t large_bkey_pool; ++ ++ /* REBALANCE */ ++ struct bch_fs_rebalance rebalance; ++ ++ /* STRIPES: */ ++ GENRADIX(struct stripe) stripes[2]; ++ struct mutex ec_stripe_create_lock; ++ ++ ec_stripes_heap ec_stripes_heap; ++ spinlock_t ec_stripes_heap_lock; ++ ++ /* ERASURE CODING */ ++ struct list_head ec_new_stripe_list; ++ struct mutex ec_new_stripe_lock; ++ u64 ec_stripe_hint; ++ ++ struct bio_set ec_bioset; ++ ++ struct work_struct ec_stripe_delete_work; ++ struct llist_head ec_stripe_delete_list; ++ ++ /* REFLINK */ ++ u64 reflink_hint; ++ ++ /* VFS IO PATH - fs-io.c */ ++ struct bio_set writepage_bioset; ++ struct bio_set dio_write_bioset; ++ struct bio_set dio_read_bioset; ++ ++ struct bio_list btree_write_error_list; ++ struct work_struct btree_write_error_work; ++ spinlock_t btree_write_error_lock; ++ ++ /* ERRORS */ ++ struct list_head fsck_errors; ++ struct mutex fsck_error_lock; ++ bool fsck_alloc_err; ++ ++ /* QUOTAS */ ++ struct bch_memquota_type quotas[QTYP_NR]; ++ ++ /* DEBUG JUNK */ ++ struct dentry *debug; ++ struct btree_debug btree_debug[BTREE_ID_NR]; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree *verify_data; ++ struct btree_node *verify_ondisk; ++ struct mutex verify_lock; ++#endif ++ ++ u64 unused_inode_hint; ++ ++ /* ++ * A btree node on disk could have too many bsets for an iterator to fit ++ * on the stack - have to dynamically allocate them ++ */ ++ mempool_t fill_iter; ++ ++ mempool_t btree_bounce_pool; ++ ++ struct journal journal; ++ struct 
list_head journal_entries; ++ struct journal_keys journal_keys; ++ ++ u64 last_bucket_seq_cleanup; ++ ++ /* The rest of this all shows up in sysfs */ ++ atomic_long_t read_realloc_races; ++ atomic_long_t extent_migrate_done; ++ atomic_long_t extent_migrate_raced; ++ ++ unsigned btree_gc_periodic:1; ++ unsigned copy_gc_enabled:1; ++ bool promote_whole_extents; ++ ++#define BCH_DEBUG_PARAM(name, description) bool name; ++ BCH_DEBUG_PARAMS_ALL() ++#undef BCH_DEBUG_PARAM ++ ++ struct time_stats times[BCH_TIME_STAT_NR]; ++}; ++ ++static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) ++{ ++#ifndef NO_BCACHEFS_FS ++ if (c->vfs_sb) ++ c->vfs_sb->s_bdi->ra_pages = ra_pages; ++#endif ++} ++ ++static inline unsigned bucket_bytes(const struct bch_dev *ca) ++{ ++ return ca->mi.bucket_size << 9; ++} ++ ++static inline unsigned block_bytes(const struct bch_fs *c) ++{ ++ return c->opts.block_size << 9; ++} ++ ++static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) ++{ ++ return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); ++} ++ ++static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) ++{ ++ s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; ++ ++ if (c->sb.time_precision == 1) ++ return ns; ++ ++ return div_s64(ns, c->sb.time_precision); ++} ++ ++static inline s64 bch2_current_time(struct bch_fs *c) ++{ ++ struct timespec64 now; ++ ++ ktime_get_coarse_real_ts64(&now); ++ return timespec_to_bch2_time(c, now); ++} ++ ++static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) ++{ ++ return dev < c->sb.nr_devices && c->devs[dev]; ++} ++ ++#endif /* _BCACHEFS_H */ +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +new file mode 100644 +index 000000000000..f808e63a713d +--- /dev/null ++++ b/fs/bcachefs/bcachefs_format.h +@@ -0,0 +1,1666 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FORMAT_H ++#define _BCACHEFS_FORMAT_H ++ ++/* ++ * bcachefs on disk data structures ++ * ++ * OVERVIEW: ++ * ++ * There are three main types of on disk data structures in bcachefs (this is ++ * reduced from 5 in bcache) ++ * ++ * - superblock ++ * - journal ++ * - btree ++ * ++ * The btree is the primary structure; most metadata exists as keys in the ++ * various btrees. There are only a small number of btrees, they're not ++ * sharded - we have one btree for extents, another for inodes, et cetera. ++ * ++ * SUPERBLOCK: ++ * ++ * The superblock contains the location of the journal, the list of devices in ++ * the filesystem, and in general any metadata we need in order to decide ++ * whether we can start a filesystem or prior to reading the journal/btree ++ * roots. ++ * ++ * The superblock is extensible, and most of the contents of the superblock are ++ * in variable length, type tagged fields; see struct bch_sb_field. ++ * ++ * Backup superblocks do not reside in a fixed location; also, superblocks do ++ * not have a fixed size. To locate backup superblocks we have struct ++ * bch_sb_layout; we store a copy of this inside every superblock, and also ++ * before the first superblock. ++ * ++ * JOURNAL: ++ * ++ * The journal primarily records btree updates in the order they occurred; ++ * journal replay consists of just iterating over all the keys in the open ++ * journal entries and re-inserting them into the btrees. ++ * ++ * The journal also contains entry types for the btree roots, and blacklisted ++ * journal sequence numbers (see journal_seq_blacklist.c). 
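The bch2_time_to_timespec()/timespec_to_bch2_time() helpers shown a little earlier store timestamps relative to the superblock's time_base_lo, scaled by time_precision. A userspace model of that encoding follows; the struct and function names are invented, and the real helpers of course operate on struct bch_fs and timespec64 rather than raw integers.

    #include <stdint.h>
    #include <stdio.h>

    /* On-disk time = (ns_since_epoch - time_base_lo) / time_precision,
     * which is what the two helpers above imply. */
    struct toy_sb { uint64_t time_base_lo; uint32_t time_precision; };

    static int64_t toy_ns_to_disk(const struct toy_sb *sb, int64_t ns)
    {
            return (ns - (int64_t)sb->time_base_lo) / sb->time_precision;
    }

    static int64_t toy_disk_to_ns(const struct toy_sb *sb, int64_t t)
    {
            return t * sb->time_precision + (int64_t)sb->time_base_lo;
    }

    int main(void)
    {
            struct toy_sb sb = { .time_base_lo = 1000000000ULL, .time_precision = 1 };
            int64_t ns = 1234567890123LL;

            /* Round-trips exactly when time_precision == 1. */
            printf("%lld\n", (long long)toy_disk_to_ns(&sb, toy_ns_to_disk(&sb, ns)));
            return 0;
    }

With time_precision == 1 the round trip is exact; a coarser precision simply loses the sub-precision remainder.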
++ * ++ * BTREE: ++ * ++ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically ++ * 128k-256k) and log structured. We use struct btree_node for writing the first ++ * entry in a given node (offset 0), and struct btree_node_entry for all ++ * subsequent writes. ++ * ++ * After the header, btree node entries contain a list of keys in sorted order. ++ * Values are stored inline with the keys; since values are variable length (and ++ * keys effectively are variable length too, due to packing) we can't do random ++ * access without building up additional in memory tables in the btree node read ++ * path. ++ * ++ * BTREE KEYS (struct bkey): ++ * ++ * The various btrees share a common format for the key - so as to avoid ++ * switching in fastpath lookup/comparison code - but define their own ++ * structures for the key values. ++ * ++ * The size of a key/value pair is stored as a u8 in units of u64s, so the max ++ * size is just under 2k. The common part also contains a type tag for the ++ * value, and a format field indicating whether the key is packed or not (and ++ * also meant to allow adding new key fields in the future, if desired). ++ * ++ * bkeys, when stored within a btree node, may also be packed. In that case, the ++ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can ++ * be generous with field sizes in the common part of the key format (64 bit ++ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define LE_BITMASK(_bits, name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (__le##_bits##_to_cpu(k->field) >> offset) & \ ++ ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ __u##_bits new = __le##_bits##_to_cpu(k->field); \ ++ \ ++ new &= ~(~(~0ULL << (end - offset)) << offset); \ ++ new |= (v & ~(~0ULL << (end - offset))) << offset; \ ++ k->field = __cpu_to_le##_bits(new); \ ++} ++ ++#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) ++#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) ++#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) ++ ++struct bkey_format { ++ __u8 key_u64s; ++ __u8 nr_fields; ++ /* One unused slot for now: */ ++ __u8 bits_per_field[6]; ++ __le64 field_offset[6]; ++}; ++ ++/* Btree keys - all units are in sectors */ ++ ++struct bpos { ++ /* ++ * Word order matches machine byte order - btree code treats a bpos as a ++ * single large integer, for search/comparison purposes ++ * ++ * Note that wherever a bpos is embedded in another on disk data ++ * structure, it has to be byte swabbed when reading in metadata that ++ * wasn't written in native endian order: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u32 snapshot; ++ __u64 offset; ++ __u64 inode; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u64 inode; ++ __u64 offset; /* Points to end of extent - sectors */ ++ __u32 snapshot; ++#else ++#error edit for your odd byteorder. 
++#endif ++} __attribute__((packed, aligned(4))); ++ ++#define KEY_INODE_MAX ((__u64)~0ULL) ++#define KEY_OFFSET_MAX ((__u64)~0ULL) ++#define KEY_SNAPSHOT_MAX ((__u32)~0U) ++#define KEY_SIZE_MAX ((__u32)~0U) ++ ++static inline struct bpos POS(__u64 inode, __u64 offset) ++{ ++ struct bpos ret; ++ ++ ret.inode = inode; ++ ret.offset = offset; ++ ret.snapshot = 0; ++ ++ return ret; ++} ++ ++#define POS_MIN POS(0, 0) ++#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) ++ ++/* Empty placeholder struct, for container_of() */ ++struct bch_val { ++ __u64 __nothing[0]; ++}; ++ ++struct bversion { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u64 lo; ++ __u32 hi; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u32 hi; ++ __u64 lo; ++#endif ++} __attribute__((packed, aligned(4))); ++ ++struct bkey { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#else ++#error edit for your odd byteorder. ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u8 pad[1]; ++ ++ struct bversion version; ++ __u32 size; /* extent size, in sectors */ ++ struct bpos p; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ struct bpos p; ++ __u32 size; /* extent size, in sectors */ ++ struct bversion version; ++ ++ __u8 pad[1]; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bkey_packed { ++ __u64 _data[0]; ++ ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++ ++ /* ++ * XXX: next incompat on disk format change, switch format and ++ * needs_whiteout - bkey_packed() will be cheaper if format is the high ++ * bits of the bitfield ++ */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ __u8 key_start[0]; ++ ++ /* ++ * We copy bkeys with struct assignment in various places, and while ++ * that shouldn't be done with packed bkeys we can't disallow it in C, ++ * and it's legal to cast a bkey to a bkey_packed - so padding it out ++ * to the same size as struct bkey should hopefully be safest. 
++ */ ++ __u8 pad[sizeof(struct bkey) - 3]; ++} __attribute__((packed, aligned(8))); ++ ++#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) ++#define BKEY_U64s_MAX U8_MAX ++#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) ++ ++#define KEY_PACKED_BITS_START 24 ++ ++#define KEY_FORMAT_LOCAL_BTREE 0 ++#define KEY_FORMAT_CURRENT 1 ++ ++enum bch_bkey_fields { ++ BKEY_FIELD_INODE, ++ BKEY_FIELD_OFFSET, ++ BKEY_FIELD_SNAPSHOT, ++ BKEY_FIELD_SIZE, ++ BKEY_FIELD_VERSION_HI, ++ BKEY_FIELD_VERSION_LO, ++ BKEY_NR_FIELDS, ++}; ++ ++#define bkey_format_field(name, field) \ ++ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) ++ ++#define BKEY_FORMAT_CURRENT \ ++((struct bkey_format) { \ ++ .key_u64s = BKEY_U64s, \ ++ .nr_fields = BKEY_NR_FIELDS, \ ++ .bits_per_field = { \ ++ bkey_format_field(INODE, p.inode), \ ++ bkey_format_field(OFFSET, p.offset), \ ++ bkey_format_field(SNAPSHOT, p.snapshot), \ ++ bkey_format_field(SIZE, size), \ ++ bkey_format_field(VERSION_HI, version.hi), \ ++ bkey_format_field(VERSION_LO, version.lo), \ ++ }, \ ++}) ++ ++/* bkey with inline value */ ++struct bkey_i { ++ __u64 _data[0]; ++ ++ union { ++ struct { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ }; ++ struct { ++ struct bkey k; ++ struct bch_val v; ++ }; ++ }; ++}; ++ ++#define KEY(_inode, _offset, _size) \ ++((struct bkey) { \ ++ .u64s = BKEY_U64s, \ ++ .format = KEY_FORMAT_CURRENT, \ ++ .p = POS(_inode, _offset), \ ++ .size = _size, \ ++}) ++ ++static inline void bkey_init(struct bkey *k) ++{ ++ *k = KEY(0, 0, 0); ++} ++ ++#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) ++ ++#define __BKEY_PADDED(key, pad) \ ++ struct { struct bkey_i key; __u64 key ## _pad[pad]; } ++ ++/* ++ * - DELETED keys are used internally to mark keys that should be ignored but ++ * override keys in composition order. Their version number is ignored. ++ * ++ * - DISCARDED keys indicate that the data is all 0s because it has been ++ * discarded. DISCARDs may have a version; if the version is nonzero the key ++ * will be persistent, otherwise the key will be dropped whenever the btree ++ * node is rewritten (like DELETED keys). ++ * ++ * - ERROR: any read of the data returns a read error, as the data was lost due ++ * to a failing device. Like DISCARDED keys, they can be removed (overridden) ++ * by new writes or cluster-wide GC. Node repair can also overwrite them with ++ * the same or a more recent version number, but not with an older version ++ * number. ++ * ++ * - WHITEOUT: for hash table btrees ++*/ ++#define BCH_BKEY_TYPES() \ ++ x(deleted, 0) \ ++ x(discard, 1) \ ++ x(error, 2) \ ++ x(cookie, 3) \ ++ x(whiteout, 4) \ ++ x(btree_ptr, 5) \ ++ x(extent, 6) \ ++ x(reservation, 7) \ ++ x(inode, 8) \ ++ x(inode_generation, 9) \ ++ x(dirent, 10) \ ++ x(xattr, 11) \ ++ x(alloc, 12) \ ++ x(quota, 13) \ ++ x(stripe, 14) \ ++ x(reflink_p, 15) \ ++ x(reflink_v, 16) \ ++ x(inline_data, 17) \ ++ x(btree_ptr_v2, 18) ++ ++enum bch_bkey_type { ++#define x(name, nr) KEY_TYPE_##name = nr, ++ BCH_BKEY_TYPES() ++#undef x ++ KEY_TYPE_MAX, ++}; ++ ++struct bch_cookie { ++ struct bch_val v; ++ __le64 cookie; ++}; ++ ++/* Extents */ ++ ++/* ++ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally ++ * preceded by checksum/compression information (bch_extent_crc32 or ++ * bch_extent_crc64). 
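One consequence of the u64s convention in struct bkey/bkey_i above is easy to overlook: u64s counts the key and its inline value together, in 8 byte units, so the largest possible key/value pair is U8_MAX * 8 = 2040 bytes - the "just under 2k" mentioned earlier. A small sketch of that arithmetic, assuming BKEY_U64s works out to 5 for the 40 byte struct bkey defined above (toy names, not the patch's macros):

    #include <stdint.h>
    #include <stdio.h>

    /* u64s in struct bkey counts key + inline value together, in u64 units. */
    #define TOY_BKEY_U64S   5       /* assumed: sizeof(struct bkey) / sizeof(__u64) */
    #define TOY_U8_MAX      255

    static unsigned toy_val_bytes(uint8_t u64s)
    {
            /* bytes of inline value that follow the fixed-size key */
            return (u64s - TOY_BKEY_U64S) * 8;
    }

    int main(void)
    {
            printf("largest key+value: %u bytes, largest value alone: %u bytes\n",
                   TOY_U8_MAX * 8, toy_val_bytes(TOY_U8_MAX));
            return 0;
    }

bkey_bytes() and BKEY_VAL_U64s_MAX above express the same arithmetic over the real types.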
++ * ++ * One major determining factor in the format of extents is how we handle and ++ * represent extents that have been partially overwritten and thus trimmed: ++ * ++ * If an extent is not checksummed or compressed, when the extent is trimmed we ++ * don't have to remember the extent we originally allocated and wrote: we can ++ * merely adjust ptr->offset to point to the start of the data that is currently ++ * live. The size field in struct bkey records the current (live) size of the ++ * extent, and is also used to mean "size of region on disk that we point to" in ++ * this case. ++ * ++ * Thus an extent that is not checksummed or compressed will consist only of a ++ * list of bch_extent_ptrs, with none of the fields in ++ * bch_extent_crc32/bch_extent_crc64. ++ * ++ * When an extent is checksummed or compressed, it's not possible to read only ++ * the data that is currently live: we have to read the entire extent that was ++ * originally written, and then return only the part of the extent that is ++ * currently live. ++ * ++ * Thus, in addition to the current size of the extent in struct bkey, we need ++ * to store the size of the originally allocated space - this is the ++ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, ++ * when the extent is trimmed, instead of modifying the offset field of the ++ * pointer, we keep a second smaller offset field - "offset into the original ++ * extent of the currently live region". ++ * ++ * The other major determining factor is replication and data migration: ++ * ++ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated ++ * write, we will initially write all the replicas in the same format, with the ++ * same checksum type and compression format - however, when copygc runs later (or ++ * tiering/cache promotion, anything that moves data), it is not in general ++ * going to rewrite all the pointers at once - one of the replicas may be in a ++ * bucket on one device that has very little fragmentation while another lives ++ * in a bucket that has become heavily fragmented, and thus is being rewritten ++ * sooner than the rest. ++ * ++ * Thus it will only move a subset of the pointers (or in the case of ++ * tiering/cache promotion perhaps add a single pointer without dropping any ++ * current pointers), and if the extent has been partially overwritten it must ++ * write only the currently live portion (or copygc would not be able to reduce ++ * fragmentation!) - which necessitates a different bch_extent_crc format for ++ * the new pointer. ++ * ++ * But in the interests of space efficiency, we don't want to store one ++ * bch_extent_crc for each pointer if we don't have to. ++ * ++ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and ++ * bch_extent_ptrs appended arbitrarily one after the other. We determine the ++ * type of a given entry with a scheme similar to utf8 (except we're encoding a ++ * type, not a size), encoding the type in the position of the first set bit: ++ * ++ * bch_extent_crc32 - 0b1 ++ * bch_extent_ptr - 0b10 ++ * bch_extent_crc64 - 0b100 ++ * ++ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and ++ * bch_extent_crc64 is the least constrained). ++ * ++ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, ++ * until the next bch_extent_crc32/64. ++ * ++ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer ++ * is neither checksummed nor compressed. 
++ */ ++ ++/* 128 bits, sufficient for cryptographic MACs: */ ++struct bch_csum { ++ __le64 lo; ++ __le64 hi; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_EXTENT_ENTRY_TYPES() \ ++ x(ptr, 0) \ ++ x(crc32, 1) \ ++ x(crc64, 2) \ ++ x(crc128, 3) \ ++ x(stripe_ptr, 4) ++#define BCH_EXTENT_ENTRY_MAX 5 ++ ++enum bch_extent_entry_type { ++#define x(f, n) BCH_EXTENT_ENTRY_##f = n, ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++/* Compressed/uncompressed size are stored biased by 1: */ ++struct bch_extent_crc32 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u32 type:2, ++ _compressed_size:7, ++ _uncompressed_size:7, ++ offset:7, ++ _unused:1, ++ csum_type:4, ++ compression_type:4; ++ __u32 csum; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u32 csum; ++ __u32 compression_type:4, ++ csum_type:4, ++ _unused:1, ++ offset:7, ++ _uncompressed_size:7, ++ _compressed_size:7, ++ type:2; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++#define CRC32_SIZE_MAX (1U << 7) ++#define CRC32_NONCE_MAX 0 ++ ++struct bch_extent_crc64 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:3, ++ _compressed_size:9, ++ _uncompressed_size:9, ++ offset:9, ++ nonce:10, ++ csum_type:4, ++ compression_type:4, ++ csum_hi:16; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 csum_hi:16, ++ compression_type:4, ++ csum_type:4, ++ nonce:10, ++ offset:9, ++ _uncompressed_size:9, ++ _compressed_size:9, ++ type:3; ++#endif ++ __u64 csum_lo; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC64_SIZE_MAX (1U << 9) ++#define CRC64_NONCE_MAX ((1U << 10) - 1) ++ ++struct bch_extent_crc128 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:4, ++ _compressed_size:13, ++ _uncompressed_size:13, ++ offset:13, ++ nonce:13, ++ csum_type:4, ++ compression_type:4; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 compression_type:4, ++ csum_type:4, ++ nonce:13, ++ offset:13, ++ _uncompressed_size:13, ++ _compressed_size:13, ++ type:4; ++#endif ++ struct bch_csum csum; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC128_SIZE_MAX (1U << 13) ++#define CRC128_NONCE_MAX ((1U << 13) - 1) ++ ++/* ++ * @reservation - pointer hasn't been written to, just reserved ++ */ ++struct bch_extent_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:1, ++ cached:1, ++ unused:1, ++ reservation:1, ++ offset:44, /* 8 petabytes */ ++ dev:8, ++ gen:8; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 gen:8, ++ dev:8, ++ offset:44, ++ reservation:1, ++ unused:1, ++ cached:1, ++ type:1; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bch_extent_stripe_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:5, ++ block:8, ++ idx:51; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 idx:51, ++ block:8, ++ type:5; ++#endif ++}; ++ ++struct bch_extent_reservation { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:6, ++ unused:22, ++ replicas:4, ++ generation:32; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 generation:32, ++ replicas:4, ++ unused:22, ++ type:6; ++#endif ++}; ++ ++union bch_extent_entry { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 ++ unsigned long type; ++#elif __BITS_PER_LONG == 32 ++ struct { ++ unsigned long pad; ++ unsigned long type; ++ }; ++#else ++#error edit for your odd byteorder. 
++#endif ++ ++#define x(f, n) struct bch_extent_##f f; ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++struct bch_btree_ptr { ++ struct bch_val v; ++ ++ struct bch_extent_ptr start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_btree_ptr_v2 { ++ struct bch_val v; ++ ++ __u64 mem_ptr; ++ __le64 seq; ++ __le16 sectors_written; ++ /* In case we ever decide to do variable size btree nodes: */ ++ __le16 sectors; ++ struct bpos min_key; ++ struct bch_extent_ptr start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_extent { ++ struct bch_val v; ++ ++ union bch_extent_entry start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_reservation { ++ struct bch_val v; ++ ++ __le32 generation; ++ __u8 nr_replicas; ++ __u8 pad[3]; ++} __attribute__((packed, aligned(8))); ++ ++/* Maximum size (in u64s) a single pointer could be: */ ++#define BKEY_EXTENT_PTR_U64s_MAX\ ++ ((sizeof(struct bch_extent_crc128) + \ ++ sizeof(struct bch_extent_ptr)) / sizeof(u64)) ++ ++/* Maximum possible size of an entire extent value: */ ++#define BKEY_EXTENT_VAL_U64s_MAX \ ++ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) ++ ++#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) ++ ++/* * Maximum possible size of an entire extent, key + value: */ ++#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) ++ ++/* Btree pointers don't carry around checksums: */ ++#define BKEY_BTREE_PTR_VAL_U64s_MAX \ ++ ((sizeof(struct bch_btree_ptr_v2) + \ ++ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) ++#define BKEY_BTREE_PTR_U64s_MAX \ ++ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) ++ ++/* Inodes */ ++ ++#define BLOCKDEV_INODE_MAX 4096 ++ ++#define BCACHEFS_ROOT_INO 4096 ++ ++struct bch_inode { ++ struct bch_val v; ++ ++ __le64 bi_hash_seed; ++ __le32 bi_flags; ++ __le16 bi_mode; ++ __u8 fields[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_inode_generation { ++ struct bch_val v; ++ ++ __le32 bi_generation; ++ __le32 pad; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_INODE_FIELDS() \ ++ x(bi_atime, 64) \ ++ x(bi_ctime, 64) \ ++ x(bi_mtime, 64) \ ++ x(bi_otime, 64) \ ++ x(bi_size, 64) \ ++ x(bi_sectors, 64) \ ++ x(bi_uid, 32) \ ++ x(bi_gid, 32) \ ++ x(bi_nlink, 32) \ ++ x(bi_generation, 32) \ ++ x(bi_dev, 32) \ ++ x(bi_data_checksum, 8) \ ++ x(bi_compression, 8) \ ++ x(bi_project, 32) \ ++ x(bi_background_compression, 8) \ ++ x(bi_data_replicas, 8) \ ++ x(bi_promote_target, 16) \ ++ x(bi_foreground_target, 16) \ ++ x(bi_background_target, 16) \ ++ x(bi_erasure_code, 16) \ ++ x(bi_fields_set, 16) ++ ++/* subset of BCH_INODE_FIELDS */ ++#define BCH_INODE_OPTS() \ ++ x(data_checksum, 8) \ ++ x(compression, 8) \ ++ x(project, 32) \ ++ x(background_compression, 8) \ ++ x(data_replicas, 8) \ ++ x(promote_target, 16) \ ++ x(foreground_target, 16) \ ++ x(background_target, 16) \ ++ x(erasure_code, 16) ++ ++enum inode_opt_id { ++#define x(name, ...) 
\ ++ Inode_opt_##name, ++ BCH_INODE_OPTS() ++#undef x ++ Inode_opt_nr, ++}; ++ ++enum { ++ /* ++ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL ++ * flags) ++ */ ++ __BCH_INODE_SYNC = 0, ++ __BCH_INODE_IMMUTABLE = 1, ++ __BCH_INODE_APPEND = 2, ++ __BCH_INODE_NODUMP = 3, ++ __BCH_INODE_NOATIME = 4, ++ ++ __BCH_INODE_I_SIZE_DIRTY= 5, ++ __BCH_INODE_I_SECTORS_DIRTY= 6, ++ __BCH_INODE_UNLINKED = 7, ++ ++ /* bits 20+ reserved for packed fields below: */ ++}; ++ ++#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) ++#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) ++#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) ++#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) ++#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) ++#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) ++#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) ++#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) ++ ++LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); ++LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); ++ ++/* Dirents */ ++ ++/* ++ * Dirents (and xattrs) have to implement string lookups; since our b-tree ++ * doesn't support arbitrary length strings for the key, we instead index by a ++ * 64 bit hash (currently truncated sha1) of the string, stored in the offset ++ * field of the key - using linear probing to resolve hash collisions. This also ++ * provides us with the readdir cookie posix requires. ++ * ++ * Linear probing requires us to use whiteouts for deletions, in the event of a ++ * collision: ++ */ ++ ++struct bch_dirent { ++ struct bch_val v; ++ ++ /* Target inode number: */ ++ __le64 d_inum; ++ ++ /* ++ * Copy of mode bits 12-15 from the target inode - so userspace can get ++ * the filetype without having to do a stat() ++ */ ++ __u8 d_type; ++ ++ __u8 d_name[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ ++ sizeof(struct bkey) - \ ++ offsetof(struct bch_dirent, d_name)) ++ ++ ++/* Xattrs */ ++ ++#define KEY_TYPE_XATTR_INDEX_USER 0 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 ++#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 ++#define KEY_TYPE_XATTR_INDEX_SECURITY 4 ++ ++struct bch_xattr { ++ struct bch_val v; ++ __u8 x_type; ++ __u8 x_name_len; ++ __le16 x_val_len; ++ __u8 x_name[]; ++} __attribute__((packed, aligned(8))); ++ ++/* Bucket/allocation information: */ ++ ++struct bch_alloc { ++ struct bch_val v; ++ __u8 fields; ++ __u8 gen; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_FIELDS() \ ++ x(read_time, 16) \ ++ x(write_time, 16) \ ++ x(data_type, 8) \ ++ x(dirty_sectors, 16) \ ++ x(cached_sectors, 16) \ ++ x(oldest_gen, 8) ++ ++enum { ++#define x(name, bytes) BCH_ALLOC_FIELD_##name, ++ BCH_ALLOC_FIELDS() ++#undef x ++ BCH_ALLOC_FIELD_NR ++}; ++ ++static const unsigned BCH_ALLOC_FIELD_BYTES[] = { ++#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, ++ BCH_ALLOC_FIELDS() ++#undef x ++}; ++ ++#define x(name, bits) + (bits / 8) ++static const unsigned BKEY_ALLOC_VAL_U64s_MAX = ++ DIV_ROUND_UP(offsetof(struct bch_alloc, data) ++ BCH_ALLOC_FIELDS(), sizeof(u64)); ++#undef x ++ ++#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) ++ ++/* Quotas: */ ++ ++enum quota_types { ++ QTYP_USR = 0, ++ QTYP_GRP = 1, ++ QTYP_PRJ = 2, ++ QTYP_NR = 3, ++}; ++ ++enum quota_counters { ++ Q_SPC = 0, ++ Q_INO = 1, ++ Q_COUNTERS = 2, ++}; ++ ++struct bch_quota_counter { ++ __le64 
hardlimit; ++ __le64 softlimit; ++}; ++ ++struct bch_quota { ++ struct bch_val v; ++ struct bch_quota_counter c[Q_COUNTERS]; ++} __attribute__((packed, aligned(8))); ++ ++/* Erasure coding */ ++ ++struct bch_stripe { ++ struct bch_val v; ++ __le16 sectors; ++ __u8 algorithm; ++ __u8 nr_blocks; ++ __u8 nr_redundant; ++ ++ __u8 csum_granularity_bits; ++ __u8 csum_type; ++ __u8 pad; ++ ++ struct bch_extent_ptr ptrs[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* Reflink: */ ++ ++struct bch_reflink_p { ++ struct bch_val v; ++ __le64 idx; ++ ++ __le32 reservation_generation; ++ __u8 nr_replicas; ++ __u8 pad[3]; ++}; ++ ++struct bch_reflink_v { ++ struct bch_val v; ++ __le64 refcount; ++ union bch_extent_entry start[0]; ++ __u64 _data[0]; ++}; ++ ++/* Inline data */ ++ ++struct bch_inline_data { ++ struct bch_val v; ++ u8 data[0]; ++}; ++ ++/* Optional/variable size superblock sections: */ ++ ++struct bch_sb_field { ++ __u64 _data[0]; ++ __le32 u64s; ++ __le32 type; ++}; ++ ++#define BCH_SB_FIELDS() \ ++ x(journal, 0) \ ++ x(members, 1) \ ++ x(crypt, 2) \ ++ x(replicas_v0, 3) \ ++ x(quota, 4) \ ++ x(disk_groups, 5) \ ++ x(clean, 6) \ ++ x(replicas, 7) \ ++ x(journal_seq_blacklist, 8) ++ ++enum bch_sb_field_type { ++#define x(f, nr) BCH_SB_FIELD_##f = nr, ++ BCH_SB_FIELDS() ++#undef x ++ BCH_SB_FIELD_NR ++}; ++ ++/* BCH_SB_FIELD_journal: */ ++ ++struct bch_sb_field_journal { ++ struct bch_sb_field field; ++ __le64 buckets[0]; ++}; ++ ++/* BCH_SB_FIELD_members: */ ++ ++#define BCH_MIN_NR_NBUCKETS (1 << 6) ++ ++struct bch_member { ++ uuid_le uuid; ++ __le64 nbuckets; /* device size */ ++ __le16 first_bucket; /* index of first bucket used */ ++ __le16 bucket_size; /* sectors */ ++ __le32 pad; ++ __le64 last_mount; /* time_t */ ++ ++ __le64 flags[2]; ++}; ++ ++LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) ++/* 4-10 unused, was TIER, HAS_(META)DATA */ ++LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) ++LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) ++LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) ++LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) ++LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) ++ ++#define BCH_TIER_MAX 4U ++ ++#if 0 ++LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); ++LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); ++#endif ++ ++enum bch_member_state { ++ BCH_MEMBER_STATE_RW = 0, ++ BCH_MEMBER_STATE_RO = 1, ++ BCH_MEMBER_STATE_FAILED = 2, ++ BCH_MEMBER_STATE_SPARE = 3, ++ BCH_MEMBER_STATE_NR = 4, ++}; ++ ++enum cache_replacement { ++ CACHE_REPLACEMENT_LRU = 0, ++ CACHE_REPLACEMENT_FIFO = 1, ++ CACHE_REPLACEMENT_RANDOM = 2, ++ CACHE_REPLACEMENT_NR = 3, ++}; ++ ++struct bch_sb_field_members { ++ struct bch_sb_field field; ++ struct bch_member members[0]; ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++struct nonce { ++ __le32 d[4]; ++}; ++ ++struct bch_key { ++ __le64 key[4]; ++}; ++ ++#define BCH_KEY_MAGIC \ ++ (((u64) 'b' << 0)|((u64) 'c' << 8)| \ ++ ((u64) 'h' << 16)|((u64) '*' << 24)| \ ++ ((u64) '*' << 32)|((u64) 'k' << 40)| \ ++ ((u64) 'e' << 48)|((u64) 'y' << 56)) ++ ++struct bch_encrypted_key { ++ __le64 magic; ++ struct bch_key key; ++}; ++ ++/* ++ * If this field is present in the superblock, it stores an encryption key which ++ * is used encrypt all other data/metadata. 
The key will normally be encrypted ++ * with the key userspace provides, but if encryption has been turned off we'll ++ * just store the master key unencrypted in the superblock so we can access the ++ * previously encrypted data. ++ */ ++struct bch_sb_field_crypt { ++ struct bch_sb_field field; ++ ++ __le64 flags; ++ __le64 kdf_flags; ++ struct bch_encrypted_key key; ++}; ++ ++LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); ++ ++enum bch_kdf_types { ++ BCH_KDF_SCRYPT = 0, ++ BCH_KDF_NR = 1, ++}; ++ ++/* stored as base 2 log of scrypt params: */ ++LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); ++LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); ++LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); ++ ++/* BCH_SB_FIELD_replicas: */ ++ ++enum bch_data_type { ++ BCH_DATA_NONE = 0, ++ BCH_DATA_SB = 1, ++ BCH_DATA_JOURNAL = 2, ++ BCH_DATA_BTREE = 3, ++ BCH_DATA_USER = 4, ++ BCH_DATA_CACHED = 5, ++ BCH_DATA_NR = 6, ++}; ++ ++struct bch_replicas_entry_v0 { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 devs[0]; ++} __attribute__((packed)); ++ ++struct bch_sb_field_replicas_v0 { ++ struct bch_sb_field field; ++ struct bch_replicas_entry_v0 entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_entry { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 nr_required; ++ __u8 devs[0]; ++} __attribute__((packed)); ++ ++#define replicas_entry_bytes(_i) \ ++ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) ++ ++struct bch_sb_field_replicas { ++ struct bch_sb_field field; ++ struct bch_replicas_entry entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_quota: */ ++ ++struct bch_sb_quota_counter { ++ __le32 timelimit; ++ __le32 warnlimit; ++}; ++ ++struct bch_sb_quota_type { ++ __le64 flags; ++ struct bch_sb_quota_counter c[Q_COUNTERS]; ++}; ++ ++struct bch_sb_field_quota { ++ struct bch_sb_field field; ++ struct bch_sb_quota_type q[QTYP_NR]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_disk_groups: */ ++ ++#define BCH_SB_LABEL_SIZE 32 ++ ++struct bch_disk_group { ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 flags[2]; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) ++LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) ++LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) ++ ++struct bch_sb_field_disk_groups { ++ struct bch_sb_field field; ++ struct bch_disk_group entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* ++ * On clean shutdown, store btree roots and current journal sequence number in ++ * the superblock: ++ */ ++struct jset_entry { ++ __le16 u64s; ++ __u8 btree_id; ++ __u8 level; ++ __u8 type; /* designates what this jset holds */ ++ __u8 pad[3]; ++ ++ union { ++ struct bkey_i start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct bch_sb_field_clean { ++ struct bch_sb_field field; ++ ++ __le32 flags; ++ __le16 read_clock; ++ __le16 write_clock; ++ __le64 journal_seq; ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct journal_seq_blacklist_entry { ++ __le64 start; ++ __le64 end; ++}; ++ ++struct bch_sb_field_journal_seq_blacklist { ++ struct bch_sb_field field; ++ ++ union { ++ struct journal_seq_blacklist_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++/* Superblock: */ ++ ++/* ++ * New versioning scheme: ++ * One common version number for all on disk data structures - superblock, btree 
++ * nodes, journal entries ++ */ ++#define BCH_JSET_VERSION_OLD 2 ++#define BCH_BSET_VERSION_OLD 3 ++ ++enum bcachefs_metadata_version { ++ bcachefs_metadata_version_min = 9, ++ bcachefs_metadata_version_new_versioning = 10, ++ bcachefs_metadata_version_bkey_renumber = 10, ++ bcachefs_metadata_version_inode_btree_change = 11, ++ bcachefs_metadata_version_max = 12, ++}; ++ ++#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) ++ ++#define BCH_SB_SECTOR 8 ++#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ ++ ++struct bch_sb_layout { ++ uuid_le magic; /* bcachefs superblock UUID */ ++ __u8 layout_type; ++ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ ++ __u8 nr_superblocks; ++ __u8 pad[5]; ++ __le64 sb_offset[61]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_SB_LAYOUT_SECTOR 7 ++ ++/* ++ * @offset - sector where this sb was written ++ * @version - on disk format version ++ * @version_min - Oldest metadata version this filesystem contains; so we can ++ * safely drop compatibility code and refuse to mount filesystems ++ * we'd need it for ++ * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) ++ * @seq - incremented each time superblock is written ++ * @uuid - used for generating various magic numbers and identifying ++ * member devices, never changes ++ * @user_uuid - user visible UUID, may be changed ++ * @label - filesystem label ++ * @seq - identifies most recent superblock, incremented each time ++ * superblock is written ++ * @features - enabled incompatible features ++ */ ++struct bch_sb { ++ struct bch_csum csum; ++ __le16 version; ++ __le16 version_min; ++ __le16 pad[2]; ++ uuid_le magic; ++ uuid_le uuid; ++ uuid_le user_uuid; ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 offset; ++ __le64 seq; ++ ++ __le16 block_size; ++ __u8 dev_idx; ++ __u8 nr_devices; ++ __le32 u64s; ++ ++ __le64 time_base_lo; ++ __le32 time_base_hi; ++ __le32 time_precision; ++ ++ __le64 flags[8]; ++ __le64 features[2]; ++ __le64 compat[2]; ++ ++ struct bch_sb_layout layout; ++ ++ union { ++ struct bch_sb_field start[0]; ++ __le64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++/* ++ * Flags: ++ * BCH_SB_INITALIZED - set on first mount ++ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect ++ * behaviour of mount/recovery path: ++ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits ++ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 ++ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides ++ * DATA/META_CSUM_TYPE. 
Also indicates encryption ++ * algorithm in use, if/when we get more than one ++ */ ++ ++LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); ++ ++LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); ++LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); ++LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); ++LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); ++ ++LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); ++ ++LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); ++LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); ++ ++LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); ++LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); ++ ++LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); ++LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); ++LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); ++LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); ++ ++LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); ++ ++LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); ++ ++/* 61-64 unused */ ++ ++LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); ++LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); ++LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); ++ ++LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); ++LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); ++ ++/* ++ * Max size of an extent that may require bouncing to read or write ++ * (checksummed, compressed): 64k ++ */ ++LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, ++ struct bch_sb, flags[1], 14, 20); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); ++ ++LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); ++LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); ++LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); ++ ++LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, ++ struct bch_sb, flags[2], 0, 4); ++LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); ++ ++LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); ++ ++/* ++ * Features: ++ * ++ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist ++ * reflink: gates KEY_TYPE_reflink ++ * inline_data: gates KEY_TYPE_inline_data ++ * new_siphash: gates BCH_STR_HASH_SIPHASH ++ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE ++ */ ++#define BCH_SB_FEATURES() \ ++ x(lz4, 0) \ ++ x(gzip, 1) \ ++ x(zstd, 2) \ ++ x(atomic_nlink, 3) \ ++ x(ec, 4) \ ++ x(journal_seq_blacklist_v3, 5) \ ++ x(reflink, 6) \ ++ x(new_siphash, 7) \ ++ x(inline_data, 8) \ ++ x(new_extent_overwrite, 9) \ ++ x(incompressible, 10) \ ++ x(btree_ptr_v2, 11) \ ++ x(extents_above_btree_updates, 12) \ ++ x(btree_updates_journalled, 13) ++ ++#define BCH_SB_FEATURES_ALL \ ++ ((1ULL << BCH_FEATURE_new_siphash)| \ ++ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ ++ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ ++ (1ULL << BCH_FEATURE_extents_above_btree_updates)) ++ ++enum bch_sb_feature { ++#define x(f, n) BCH_FEATURE_##f, ++ BCH_SB_FEATURES() ++#undef x ++ BCH_FEATURE_NR, ++}; 
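Every LE64_BITMASK(BCH_SB_*) line above generates a pair of accessors: a getter named after the field and a SET_ variant, both operating on one bit range of a little-endian flags word. A simplified, host-endian model of the same pattern (the TOY_* names are invented; the real macro also handles __le64 byte swapping):

    #include <stdint.h>
    #include <stdio.h>

    /* Host-endian model of LE_BITMASK(): one getter plus one SET_ helper
     * per declared bit range [offset, end) of a single flags word. */
    #define TOY_BITMASK(name, type, field, offset, end)                     \
    static uint64_t name(const type *k)                                     \
    {                                                                       \
            return (k->field >> (offset)) &                                 \
                    ~(~0ULL << ((end) - (offset)));                         \
    }                                                                       \
    static void SET_##name(type *k, uint64_t v)                             \
    {                                                                       \
            k->field &= ~(~(~0ULL << ((end) - (offset))) << (offset));      \
            k->field |= (v & ~(~0ULL << ((end) - (offset)))) << (offset);   \
    }

    struct toy_sb { uint64_t flags; };

    TOY_BITMASK(TOY_SB_CSUM_TYPE, struct toy_sb, flags, 2, 8)
    TOY_BITMASK(TOY_SB_POSIX_ACL, struct toy_sb, flags, 56, 57)

    int main(void)
    {
            struct toy_sb sb = { 0 };

            SET_TOY_SB_CSUM_TYPE(&sb, 5);
            SET_TOY_SB_POSIX_ACL(&sb, 1);
            printf("csum_type=%llu posix_acl=%llu\n",
                   (unsigned long long)TOY_SB_CSUM_TYPE(&sb),
                   (unsigned long long)TOY_SB_POSIX_ACL(&sb));
            return 0;
    }

Packing options this way keeps struct bch_sb fixed in layout while new options can be added by claiming unused bit ranges, which is why the header keeps notes on which ranges are still free.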
++ ++enum bch_sb_compat { ++ BCH_COMPAT_FEAT_ALLOC_INFO = 0, ++ BCH_COMPAT_FEAT_ALLOC_METADATA = 1, ++}; ++ ++/* options: */ ++ ++#define BCH_REPLICAS_MAX 4U ++ ++enum bch_error_actions { ++ BCH_ON_ERROR_CONTINUE = 0, ++ BCH_ON_ERROR_RO = 1, ++ BCH_ON_ERROR_PANIC = 2, ++ BCH_NR_ERROR_ACTIONS = 3, ++}; ++ ++enum bch_str_hash_type { ++ BCH_STR_HASH_CRC32C = 0, ++ BCH_STR_HASH_CRC64 = 1, ++ BCH_STR_HASH_SIPHASH_OLD = 2, ++ BCH_STR_HASH_SIPHASH = 3, ++ BCH_STR_HASH_NR = 4, ++}; ++ ++enum bch_str_hash_opts { ++ BCH_STR_HASH_OPT_CRC32C = 0, ++ BCH_STR_HASH_OPT_CRC64 = 1, ++ BCH_STR_HASH_OPT_SIPHASH = 2, ++ BCH_STR_HASH_OPT_NR = 3, ++}; ++ ++enum bch_csum_type { ++ BCH_CSUM_NONE = 0, ++ BCH_CSUM_CRC32C_NONZERO = 1, ++ BCH_CSUM_CRC64_NONZERO = 2, ++ BCH_CSUM_CHACHA20_POLY1305_80 = 3, ++ BCH_CSUM_CHACHA20_POLY1305_128 = 4, ++ BCH_CSUM_CRC32C = 5, ++ BCH_CSUM_CRC64 = 6, ++ BCH_CSUM_NR = 7, ++}; ++ ++static const unsigned bch_crc_bytes[] = { ++ [BCH_CSUM_NONE] = 0, ++ [BCH_CSUM_CRC32C_NONZERO] = 4, ++ [BCH_CSUM_CRC32C] = 4, ++ [BCH_CSUM_CRC64_NONZERO] = 8, ++ [BCH_CSUM_CRC64] = 8, ++ [BCH_CSUM_CHACHA20_POLY1305_80] = 10, ++ [BCH_CSUM_CHACHA20_POLY1305_128] = 16, ++}; ++ ++static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) ++{ ++ switch (type) { ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++enum bch_csum_opts { ++ BCH_CSUM_OPT_NONE = 0, ++ BCH_CSUM_OPT_CRC32C = 1, ++ BCH_CSUM_OPT_CRC64 = 2, ++ BCH_CSUM_OPT_NR = 3, ++}; ++ ++#define BCH_COMPRESSION_TYPES() \ ++ x(none, 0) \ ++ x(lz4_old, 1) \ ++ x(gzip, 2) \ ++ x(lz4, 3) \ ++ x(zstd, 4) \ ++ x(incompressible, 5) ++ ++enum bch_compression_type { ++#define x(t, n) BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_TYPES() ++#undef x ++ BCH_COMPRESSION_TYPE_NR ++}; ++ ++#define BCH_COMPRESSION_OPTS() \ ++ x(none, 0) \ ++ x(lz4, 1) \ ++ x(gzip, 2) \ ++ x(zstd, 3) ++ ++enum bch_compression_opts { ++#define x(t, n) BCH_COMPRESSION_OPT_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++ BCH_COMPRESSION_OPT_NR ++}; ++ ++/* ++ * Magic numbers ++ * ++ * The various other data structures have their own magic numbers, which are ++ * xored with the first part of the cache set's UUID ++ */ ++ ++#define BCACHE_MAGIC \ ++ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ ++ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) ++ ++#define BCACHEFS_STATFS_MAGIC 0xca451a4e ++ ++#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) ++#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) ++ ++static inline __le64 __bch2_sb_magic(struct bch_sb *sb) ++{ ++ __le64 ret; ++ memcpy(&ret, &sb->uuid, sizeof(ret)); ++ return ret; ++} ++ ++static inline __u64 __jset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); ++} ++ ++static inline __u64 __bset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); ++} ++ ++/* Journal */ ++ ++#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) ++ ++#define BCH_JSET_ENTRY_TYPES() \ ++ x(btree_keys, 0) \ ++ x(btree_root, 1) \ ++ x(prio_ptrs, 2) \ ++ x(blacklist, 3) \ ++ x(blacklist_v2, 4) \ ++ x(usage, 5) \ ++ x(data_usage, 6) ++ ++enum { ++#define x(f, nr) BCH_JSET_ENTRY_##f = nr, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++ BCH_JSET_ENTRY_NR ++}; ++ ++/* ++ * Journal sequence numbers can be blacklisted: bsets record the max sequence ++ * number of all the journal entries they contain updates for, so that on ++ * recovery we can ignore those bsets that contain index updates 
newer that what ++ * made it into the journal. ++ * ++ * This means that we can't reuse that journal_seq - we have to skip it, and ++ * then record that we skipped it so that the next time we crash and recover we ++ * don't think there was a missing journal entry. ++ */ ++struct jset_entry_blacklist { ++ struct jset_entry entry; ++ __le64 seq; ++}; ++ ++struct jset_entry_blacklist_v2 { ++ struct jset_entry entry; ++ __le64 start; ++ __le64 end; ++}; ++ ++enum { ++ FS_USAGE_RESERVED = 0, ++ FS_USAGE_INODES = 1, ++ FS_USAGE_KEY_VERSION = 2, ++ FS_USAGE_NR = 3 ++}; ++ ++struct jset_entry_usage { ++ struct jset_entry entry; ++ __le64 v; ++} __attribute__((packed)); ++ ++struct jset_entry_data_usage { ++ struct jset_entry entry; ++ __le64 v; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++/* ++ * On disk format for a journal entry: ++ * seq is monotonically increasing; every journal entry has its own unique ++ * sequence number. ++ * ++ * last_seq is the oldest journal entry that still has keys the btree hasn't ++ * flushed to disk yet. ++ * ++ * version is for on disk format changes. ++ */ ++struct jset { ++ struct bch_csum csum; ++ ++ __le64 magic; ++ __le64 seq; ++ __le32 version; ++ __le32 flags; ++ ++ __le32 u64s; /* size of d[] in u64s */ ++ ++ __u8 encrypted_start[0]; ++ ++ __le16 read_clock; ++ __le16 write_clock; ++ ++ /* Sequence number of oldest dirty journal entry */ ++ __le64 last_seq; ++ ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); ++LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); ++ ++#define BCH_JOURNAL_BUCKETS_MIN 8 ++ ++/* Btree: */ ++ ++#define BCH_BTREE_IDS() \ ++ x(EXTENTS, 0, "extents") \ ++ x(INODES, 1, "inodes") \ ++ x(DIRENTS, 2, "dirents") \ ++ x(XATTRS, 3, "xattrs") \ ++ x(ALLOC, 4, "alloc") \ ++ x(QUOTAS, 5, "quotas") \ ++ x(EC, 6, "stripes") \ ++ x(REFLINK, 7, "reflink") ++ ++enum btree_id { ++#define x(kwd, val, name) BTREE_ID_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BTREE_ID_NR ++}; ++ ++#define BTREE_MAX_DEPTH 4U ++ ++/* Btree nodes */ ++ ++/* ++ * Btree nodes ++ * ++ * On disk a btree node is a list/log of these; within each set the keys are ++ * sorted ++ */ ++struct bset { ++ __le64 seq; ++ ++ /* ++ * Highest journal entry this bset contains keys for. ++ * If on recovery we don't see that journal entry, this bset is ignored: ++ * this allows us to preserve the order of all index updates after a ++ * crash, since the journal records a total order of all index updates ++ * and anything that didn't make it to the journal doesn't get used. 
++ */ ++ __le64 journal_seq; ++ ++ __le32 flags; ++ __le16 version; ++ __le16 u64s; /* count of d[] in u64s */ ++ ++ union { ++ struct bkey_packed start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); ++ ++LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); ++LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, ++ struct bset, flags, 5, 6); ++ ++struct btree_node { ++ struct bch_csum csum; ++ __le64 magic; ++ ++ /* this flags field is encrypted, unlike bset->flags: */ ++ __le64 flags; ++ ++ /* Closed interval: */ ++ struct bpos min_key; ++ struct bpos max_key; ++ struct bch_extent_ptr ptr; ++ struct bkey_format format; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); ++LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); ++LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, ++ struct btree_node, flags, 8, 9); ++/* 9-32 unused */ ++LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); ++ ++struct btree_node_entry { ++ struct bch_csum csum; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++#endif /* _BCACHEFS_FORMAT_H */ +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +new file mode 100644 +index 000000000000..d71157a3e073 +--- /dev/null ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -0,0 +1,332 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IOCTL_H ++#define _BCACHEFS_IOCTL_H ++ ++#include ++#include ++#include "bcachefs_format.h" ++ ++/* ++ * Flags common to multiple ioctls: ++ */ ++#define BCH_FORCE_IF_DATA_LOST (1 << 0) ++#define BCH_FORCE_IF_METADATA_LOST (1 << 1) ++#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) ++#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) ++ ++#define BCH_FORCE_IF_DEGRADED \ ++ (BCH_FORCE_IF_DATA_DEGRADED| \ ++ BCH_FORCE_IF_METADATA_DEGRADED) ++ ++/* ++ * If cleared, ioctl that refer to a device pass it as a pointer to a pathname ++ * (e.g. 
/dev/sda1); if set, the dev field is the device's index within the ++ * filesystem: ++ */ ++#define BCH_BY_INDEX (1 << 4) ++ ++/* ++ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem ++ * wide superblock: ++ */ ++#define BCH_READ_DEV (1 << 5) ++ ++/* global control dev: */ ++ ++/* These are currently broken, and probably unnecessary: */ ++#if 0 ++#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) ++#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) ++ ++struct bch_ioctl_assemble { ++ __u32 flags; ++ __u32 nr_devs; ++ __u64 pad; ++ __u64 devs[]; ++}; ++ ++struct bch_ioctl_incremental { ++ __u32 flags; ++ __u64 pad; ++ __u64 dev; ++}; ++#endif ++ ++/* filesystem ioctls: */ ++ ++#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) ++ ++/* These only make sense when we also have incremental assembly */ ++#if 0 ++#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) ++#define BCH_IOCTL_STOP _IO(0xbc, 3) ++#endif ++ ++#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) ++#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) ++#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) ++#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) ++#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) ++#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) ++#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) ++ ++/* ioctls below act on a particular file, not the filesystem as a whole: */ ++ ++#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) ++ ++/* ++ * BCH_IOCTL_QUERY_UUID: get filesystem UUID ++ * ++ * Returns user visible UUID, not internal UUID (which may not ever be changed); ++ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with ++ * this UUID. ++ */ ++struct bch_ioctl_query_uuid { ++ uuid_le uuid; ++}; ++ ++#if 0 ++struct bch_ioctl_start { ++ __u32 flags; ++ __u32 pad; ++}; ++#endif ++ ++/* ++ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem ++ * ++ * The specified device must not be open or in use. On success, the new device ++ * will be an online member of the filesystem just like any other member. ++ * ++ * The device must first be prepared by userspace by formatting with a bcachefs ++ * superblock, which is only used for passing in superblock options/parameters ++ * for that device (in struct bch_member). The new device's superblock should ++ * not claim to be a member of any existing filesystem - UUIDs on it will be ++ * ignored. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem ++ * ++ * Any data present on @dev will be permanently deleted, and @dev will be ++ * removed from its slot in the filesystem's list of member devices. The device ++ * may be either offline or online. ++ * ++ * Will fail if removing @dev would leave us with insufficient read write devices ++ * or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are ++ * set. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem ++ * but is not open (e.g.
because we started in degraded mode), bring it online ++ * ++ * All existing data on @dev will be available once the device is online, ++ * exactly as if @dev was present when the filesystem was first mounted ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that ++ * block device, without removing it from the filesystem (so it can be brought ++ * back online later) ++ * ++ * Data present on @dev will be unavailable while @dev is offline (unless ++ * replicated), but will still be intact and untouched if @dev is brought back ++ * online ++ * ++ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would ++ * leave us with insufficient read write devices or degraded/unavailable data, ++ * unless the appropriate BCH_FORCE_IF_* flags are set. ++ */ ++ ++struct bch_ioctl_disk { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem ++ * ++ * @new_state - one of the bch_member_state states (rw, ro, failed, ++ * spare) ++ * ++ * Will refuse to change member state if we would then have insufficient devices ++ * to write to, or if it would result in degraded data (when @new_state is ++ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. ++ */ ++struct bch_ioctl_disk_set_state { ++ __u32 flags; ++ __u8 new_state; ++ __u8 pad[3]; ++ __u64 dev; ++}; ++ ++enum bch_data_ops { ++ BCH_DATA_OP_SCRUB = 0, ++ BCH_DATA_OP_REREPLICATE = 1, ++ BCH_DATA_OP_MIGRATE = 2, ++ BCH_DATA_OP_NR = 3, ++}; ++ ++/* ++ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. ++ * scrub, rereplicate, migrate). ++ * ++ * This ioctl kicks off a job in the background, and returns a file descriptor. ++ * Reading from the file descriptor returns a struct bch_ioctl_data_event, ++ * indicating current progress, and closing the file descriptor will stop the ++ * job. The file descriptor is O_CLOEXEC. ++ */ ++struct bch_ioctl_data { ++ __u32 op; ++ __u32 flags; ++ ++ struct bpos start; ++ struct bpos end; ++ ++ union { ++ struct { ++ __u32 dev; ++ __u32 pad; ++ } migrate; ++ struct { ++ __u64 pad[8]; ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++enum bch_data_event { ++ BCH_DATA_EVENT_PROGRESS = 0, ++ /* XXX: add an event for reporting errors */ ++ BCH_DATA_EVENT_NR = 1, ++}; ++ ++struct bch_ioctl_data_progress { ++ __u8 data_type; ++ __u8 btree_id; ++ __u8 pad[2]; ++ struct bpos pos; ++ ++ __u64 sectors_done; ++ __u64 sectors_total; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_ioctl_data_event { ++ __u8 type; ++ __u8 pad[7]; ++ union { ++ struct bch_ioctl_data_progress p; ++ __u64 pad2[15]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_usage { ++ __u64 sectors; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++static inline struct bch_replicas_usage * ++replicas_usage_next(struct bch_replicas_usage *u) ++{ ++ return (void *) u + replicas_entry_bytes(&u->r) + 8; ++} ++ ++/* ++ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage ++ * ++ * Returns disk space usage broken out by data type, number of replicas, and ++ * by component device ++ * ++ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries ++ * ++ * On success, @replica_entries_bytes will be changed to indicate the number of ++ * bytes actually used.
++ * ++ * Returns -ERANGE if @replica_entries_bytes was too small ++ */ ++struct bch_ioctl_fs_usage { ++ __u64 capacity; ++ __u64 used; ++ __u64 online_reserved; ++ __u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ ++ __u32 replica_entries_bytes; ++ __u32 pad; ++ ++ struct bch_replicas_usage replicas[0]; ++}; ++ ++/* ++ * BCH_IOCTL_DEV_USAGE: query device disk space usage ++ * ++ * Returns disk space usage broken out by data type - both by buckets and ++ * sectors. ++ */ ++struct bch_ioctl_dev_usage { ++ __u64 dev; ++ __u32 flags; ++ __u8 state; ++ __u8 pad[7]; ++ ++ __u32 bucket_size; ++ __u64 nr_buckets; ++ __u64 available_buckets; ++ ++ __u64 buckets[BCH_DATA_NR]; ++ __u64 sectors[BCH_DATA_NR]; ++ ++ __u64 ec_buckets; ++ __u64 ec_sectors; ++}; ++ ++/* ++ * BCH_IOCTL_READ_SUPER: read filesystem superblock ++ * ++ * Equivalent to reading the superblock directly from the block device, except ++ * avoids racing with the kernel writing the superblock or having to figure out ++ * which block device to read ++ * ++ * @sb - buffer to read into ++ * @size - size of userspace allocated buffer ++ * @dev - device to read superblock for, if BCH_READ_DEV flag is ++ * specified ++ * ++ * Returns -ERANGE if buffer provided is too small ++ */ ++struct bch_ioctl_read_super { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 size; ++ __u64 sb; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to ++ * determine if disk is a (online) member - if so, returns device's index ++ * ++ * Returns -ENOENT if not found ++ */ ++struct bch_ioctl_disk_get_idx { ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device ++ * ++ * @dev - member to resize ++ * @nbuckets - new number of buckets ++ */ ++struct bch_ioctl_disk_resize { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 nbuckets; ++}; ++ ++#endif /* _BCACHEFS_IOCTL_H */ +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +new file mode 100644 +index 000000000000..4d0c9129cd4a +--- /dev/null ++++ b/fs/bcachefs/bkey.c +@@ -0,0 +1,1154 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "bset.h" ++#include "util.h" ++ ++#undef EBUG_ON ++ ++#ifdef DEBUG_BKEYS ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) ++#endif ++ ++const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) ++{ ++ unsigned bit = high_bit_offset, done = 0; ++ ++ while (1) { ++ while (bit < 64) { ++ if (done && !(done % 8)) ++ *out++ = ' '; ++ *out++ = *p & (1ULL << (63 - bit)) ? 
'1' : '0'; ++ bit++; ++ done++; ++ if (done == nr_bits) { ++ *out++ = '\0'; ++ return; ++ } ++ } ++ ++ p = next_word(p); ++ bit = 0; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) ++{ ++ struct bkey tmp; ++ ++ BUG_ON(bkeyp_val_u64s(format, packed) != ++ bkey_val_u64s(unpacked)); ++ ++ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); ++ ++ tmp = __bch2_bkey_unpack_key(format, packed); ++ ++ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { ++ char buf1[160], buf2[160]; ++ char buf3[160], buf4[160]; ++ ++ bch2_bkey_to_text(&PBUF(buf1), unpacked); ++ bch2_bkey_to_text(&PBUF(buf2), &tmp); ++ bch2_to_binary(buf3, (void *) unpacked, 80); ++ bch2_to_binary(buf4, high_word(format, packed), 80); ++ ++ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", ++ format->key_u64s, ++ format->bits_per_field[0], ++ format->bits_per_field[1], ++ format->bits_per_field[2], ++ format->bits_per_field[3], ++ format->bits_per_field[4], ++ buf1, buf2, buf3, buf4); ++ } ++} ++ ++#else ++static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) {} ++#endif ++ ++struct pack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct pack_state pack_state_init(const struct bkey_format *format, ++ struct bkey_packed *k) ++{ ++ u64 *p = high_word(format, k); ++ ++ return (struct pack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = 0, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static void pack_state_finish(struct pack_state *state, ++ struct bkey_packed *k) ++{ ++ EBUG_ON(state->p < k->_data); ++ EBUG_ON(state->p >= k->_data + state->format->key_u64s); ++ ++ *state->p = state->w; ++} ++ ++struct unpack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ const u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct unpack_state unpack_state_init(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(format, k); ++ ++ return (struct unpack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = *p << high_bit_offset, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static u64 get_inc_field(struct unpack_state *state, unsigned field) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (bits >= state->bits) { ++ v = state->w >> (64 - bits); ++ bits -= state->bits; ++ ++ state->p = next_word(state->p); ++ state->w = *state->p; ++ state->bits = 64; ++ } ++ ++ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ ++ v |= (state->w >> 1) >> (63 - bits); ++ state->w <<= bits; ++ state->bits -= bits; ++ ++ return v + offset; ++} ++ ++__always_inline ++static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (v < offset) ++ return false; ++ ++ v -= offset; ++ ++ if (fls64(v) > bits) ++ return false; ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ /* avoid shift by 64 if bits is 0 - bits is never 64 here: 
*/ ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return true; ++} ++ ++/* ++ * Note: does NOT set out->format (we don't know what it should be here!) ++ * ++ * Also: doesn't work on extents - it doesn't preserve the invariant that ++ * if k is packed bkey_start_pos(k) will successfully pack ++ */ ++static bool bch2_bkey_transform_key(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ struct pack_state out_s = pack_state_init(out_f, out); ++ struct unpack_state in_s = unpack_state_init(in_f, in); ++ unsigned i; ++ ++ out->_data[0] = 0; ++ ++ for (i = 0; i < BKEY_NR_FIELDS; i++) ++ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) ++ return false; ++ ++ /* Can't happen because the val would be too big to unpack: */ ++ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); ++ ++ pack_state_finish(&out_s, out); ++ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ return true; ++} ++ ++bool bch2_bkey_transform(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ if (!bch2_bkey_transform_key(out_f, out, in_f, in)) ++ return false; ++ ++ memcpy_u64s((u64 *) out + out_f->key_u64s, ++ (u64 *) in + in_f->key_u64s, ++ (in->u64s - in_f->key_u64s)); ++ return true; ++} ++ ++#define bkey_fields() \ ++ x(BKEY_FIELD_INODE, p.inode) \ ++ x(BKEY_FIELD_OFFSET, p.offset) \ ++ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ ++ x(BKEY_FIELD_SIZE, size) \ ++ x(BKEY_FIELD_VERSION_HI, version.hi) \ ++ x(BKEY_FIELD_VERSION_LO, version.lo) ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bkey out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); ++ ++ out.u64s = BKEY_U64s + in->u64s - format->key_u64s; ++ out.format = KEY_FORMAT_CURRENT; ++ out.needs_whiteout = in->needs_whiteout; ++ out.type = in->type; ++ out.pad[0] = 0; ++ ++#define x(id, field) out.field = get_inc_field(&state, id); ++ bkey_fields() ++#undef x ++ ++ return out; ++} ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bpos out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ ++ out.inode = get_inc_field(&state, BKEY_FIELD_INODE); ++ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); ++ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); ++ ++ return out; ++} ++#endif ++ ++/** ++ * bch2_bkey_pack_key -- pack just the key, not the value ++ */ ++bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, ++ const struct bkey_format *format) ++{ ++ struct pack_state state = pack_state_init(format, out); ++ ++ EBUG_ON((void *) in == (void *) out); ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->format != KEY_FORMAT_CURRENT); ++ ++ out->_data[0] = 0; ++ 
++#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; ++ bkey_fields() ++#undef x ++ ++ /* ++ * Extents - we have to guarantee that if an extent is packed, a trimmed ++ * version will also pack: ++ */ ++ if (bkey_start_offset(in) < ++ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) ++ return false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = format->key_u64s + in->u64s - BKEY_U64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ bch2_bkey_pack_verify(out, in, format); ++ return true; ++} ++ ++/** ++ * bch2_bkey_unpack -- unpack the key and the value ++ */ ++void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, ++ const struct bkey_packed *src) ++{ ++ __bkey_unpack_key(b, &dst->k, src); ++ ++ memcpy_u64s(&dst->v, ++ bkeyp_val(&b->format, src), ++ bkeyp_val_u64s(&b->format, src)); ++} ++ ++/** ++ * bch2_bkey_pack -- pack the key and the value ++ */ ++bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, ++ const struct bkey_format *format) ++{ ++ struct bkey_packed tmp; ++ ++ if (!bch2_bkey_pack_key(&tmp, &in->k, format)) ++ return false; ++ ++ memmove_u64s((u64 *) out + format->key_u64s, ++ &in->v, ++ bkey_val_u64s(&in->k)); ++ memcpy_u64s(out, &tmp, format->key_u64s); ++ ++ return true; ++} ++ ++__always_inline ++static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ bool ret = true; ++ ++ EBUG_ON(v < offset); ++ v -= offset; ++ ++ if (fls64(v) > bits) { ++ v = ~(~0ULL << bits); ++ ret = false; ++ } ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static bool bkey_packed_successor(struct bkey_packed *out, ++ const struct btree *b, ++ struct bkey_packed k) ++{ ++ const struct bkey_format *f = &b->format; ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned first_bit, offset; ++ u64 *p; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ if (!nr_key_bits) ++ return false; ++ ++ *out = k; ++ ++ first_bit = high_bit_offset + nr_key_bits - 1; ++ p = nth_word(high_word(f, out), first_bit >> 6); ++ offset = 63 - (first_bit & 63); ++ ++ while (nr_key_bits) { ++ unsigned bits = min(64 - offset, nr_key_bits); ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if ((*p & mask) != mask) { ++ *p += 1ULL << offset; ++ EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); ++ return true; ++ } ++ ++ *p &= ~mask; ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ offset = 0; ++ } ++ ++ return false; ++} ++#endif ++ ++/* ++ * Returns a packed key that compares <= in ++ * ++ * This is used in bset_search_tree(), where we need a packed pos in order to be ++ * able to compare against the keys in the auxiliary search tree - and it's ++ * legal to use a packed pos that isn't equivalent to the original pos, ++ * _provided_ it compares <= to the original pos. 
++ */ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, ++ struct bpos in, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct pack_state state = pack_state_init(f, out); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos orig = in; ++#endif ++ bool exact = true; ++ ++ out->_data[0] = 0; ++ ++ if (unlikely(in.snapshot < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { ++ if (!in.offset-- && ++ !in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.offset < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { ++ if (!in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.inode < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) ++ return BKEY_PACK_POS_FAIL; ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) ++ exact = false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = f->key_u64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->type = KEY_TYPE_deleted; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ if (exact) { ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); ++ } else { ++ struct bkey_packed successor; ++ ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); ++ BUG_ON(bkey_packed_successor(&successor, b, *out) && ++ bkey_cmp_left_packed(b, &successor, &orig) < 0); ++ } ++#endif ++ ++ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; ++} ++ ++void bch2_bkey_format_init(struct bkey_format_state *s) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) ++ s->field_min[i] = U64_MAX; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_max); i++) ++ s->field_max[i] = 0; ++ ++ /* Make sure we can store a size of 0: */ ++ s->field_min[BKEY_FIELD_SIZE] = 0; ++} ++ ++static void __bkey_format_add(struct bkey_format_state *s, ++ unsigned field, u64 v) ++{ ++ s->field_min[field] = min(s->field_min[field], v); ++ s->field_max[field] = max(s->field_max[field], v); ++} ++ ++/* ++ * Changes @format so that @k can be successfully packed with @format ++ */ ++void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) ++{ ++#define x(id, field) __bkey_format_add(s, id, k->field); ++ bkey_fields() ++#undef x ++ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); ++} ++ ++void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) ++{ ++ unsigned field = 0; ++ ++ __bkey_format_add(s, field++, p.inode); ++ __bkey_format_add(s, field++, p.offset); ++ __bkey_format_add(s, field++, p.snapshot); ++} ++ ++/* ++ * We don't want it to be possible for the packed format to represent fields ++ * bigger than a u64... that will cause confusion and issues (like with ++ * bkey_packed_successor()) ++ */ ++static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, ++ unsigned bits, u64 offset) ++{ ++ offset = bits == 64 ? 
0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); ++ ++ f->bits_per_field[i] = bits; ++ f->field_offset[i] = cpu_to_le64(offset); ++} ++ ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ struct bkey_format ret = { ++ .nr_fields = BKEY_NR_FIELDS, ++ }; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { ++ s->field_min[i] = min(s->field_min[i], s->field_max[i]); ++ ++ set_format_field(&ret, i, ++ fls64(s->field_max[i] - s->field_min[i]), ++ s->field_min[i]); ++ ++ bits += ret.bits_per_field[i]; ++ } ++ ++ /* allow for extent merging: */ ++ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { ++ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; ++ bits += 4; ++ } ++ ++ ret.key_u64s = DIV_ROUND_UP(bits, 64); ++ ++ /* if we have enough spare bits, round fields up to nearest byte */ ++ bits = ret.key_u64s * 64 - bits; ++ ++ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { ++ unsigned r = round_up(ret.bits_per_field[i], 8) - ++ ret.bits_per_field[i]; ++ ++ if (r <= bits) { ++ set_format_field(&ret, i, ++ ret.bits_per_field[i] + r, ++ le64_to_cpu(ret.field_offset[i])); ++ bits -= r; ++ } ++ } ++ ++ EBUG_ON(bch2_bkey_format_validate(&ret)); ++ return ret; ++} ++ ++const char *bch2_bkey_format_validate(struct bkey_format *f) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ ++ if (f->nr_fields != BKEY_NR_FIELDS) ++ return "incorrect number of fields"; ++ ++ for (i = 0; i < f->nr_fields; i++) { ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (f->bits_per_field[i] > 64) ++ return "field too large"; ++ ++ if (field_offset && ++ (f->bits_per_field[i] == 64 || ++ (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < ++ field_offset))) ++ return "offset + bits overflow"; ++ ++ bits += f->bits_per_field[i]; ++ } ++ ++ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) ++ return "incorrect key_u64s"; ++ ++ return NULL; ++} ++ ++/* ++ * Most significant differing bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, ++ const struct bkey_packed *l_k, ++ const struct bkey_packed *r_k) ++{ ++ const u64 *l = high_word(&b->format, l_k); ++ const u64 *r = high_word(&b->format, r_k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned word_bits = 64 - high_bit_offset; ++ u64 l_v, r_v; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ /* for big endian, skip past header */ ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (nr_key_bits) { ++ if (nr_key_bits < word_bits) { ++ l_v >>= word_bits - nr_key_bits; ++ r_v >>= word_bits - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= word_bits; ++ } ++ ++ if (l_v != r_v) ++ return fls64(l_v ^ r_v) - 1 + nr_key_bits; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ word_bits = 64; ++ } ++ ++ return 0; ++} ++ ++/* ++ * First set bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(&b->format, k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned ret = 0, offset; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ offset = nr_key_bits; ++ while (offset > 64) { ++ p = next_word(p); ++ offset -= 64; ++ } ++ ++ offset = 64 - offset; ++ ++ while (nr_key_bits) { ++ unsigned bits = nr_key_bits + offset < 64 ++ ? 
nr_key_bits ++ : 64 - offset; ++ ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if (*p & mask) ++ return ret + __ffs64(*p & mask) - offset; ++ ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ ret += bits; ++ offset = 0; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_X86_64 ++ ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ long d0, d1, d2, d3; ++ int cmp; ++ ++ /* we shouldn't need asm for this, but gcc is being retarded: */ ++ ++ asm(".intel_syntax noprefix;" ++ "xor eax, eax;" ++ "xor edx, edx;" ++ "1:;" ++ "mov r8, [rdi];" ++ "mov r9, [rsi];" ++ "sub ecx, 64;" ++ "jl 2f;" ++ ++ "cmp r8, r9;" ++ "jnz 3f;" ++ ++ "lea rdi, [rdi - 8];" ++ "lea rsi, [rsi - 8];" ++ "jmp 1b;" ++ ++ "2:;" ++ "not ecx;" ++ "shr r8, 1;" ++ "shr r9, 1;" ++ "shr r8, cl;" ++ "shr r9, cl;" ++ "cmp r8, r9;" ++ ++ "3:\n" ++ "seta al;" ++ "setb dl;" ++ "sub eax, edx;" ++ ".att_syntax prefix;" ++ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) ++ : "0" (l), "1" (r), "3" (nr_key_bits) ++ : "r8", "r9", "cc", "memory"); ++ ++ return cmp; ++} ++ ++#define I(_x) (*(out)++ = (_x)) ++#define I1(i0) I(i0) ++#define I2(i0, i1) (I1(i0), I(i1)) ++#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) ++#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) ++#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) ++ ++static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, ++ enum bch_bkey_fields field, ++ unsigned dst_offset, unsigned dst_size, ++ bool *eax_zeroed) ++{ ++ unsigned bits = format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(format->field_offset[field]); ++ unsigned i, byte, bit_offset, align, shl, shr; ++ ++ if (!bits && !offset) { ++ if (!*eax_zeroed) { ++ /* xor eax, eax */ ++ I2(0x31, 0xc0); ++ } ++ ++ *eax_zeroed = true; ++ goto set_field; ++ } ++ ++ if (!bits) { ++ /* just return offset: */ ++ ++ switch (dst_size) { ++ case 8: ++ if (offset > S32_MAX) { ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ ++ I3(0xc7, 0x47, dst_offset + 4); ++ memcpy(out, (void *) &offset + 4, 4); ++ out += 4; ++ } else { ++ /* mov [rdi + dst_offset], offset */ ++ /* sign extended */ ++ I4(0x48, 0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++ } ++ ++ bit_offset = format->key_u64s * 64; ++ for (i = 0; i <= field; i++) ++ bit_offset -= format->bits_per_field[i]; ++ ++ byte = bit_offset / 8; ++ bit_offset -= byte * 8; ++ ++ *eax_zeroed = false; ++ ++ if (bit_offset == 0 && bits == 8) { ++ /* movzx eax, BYTE PTR [rsi + imm8] */ ++ I4(0x0f, 0xb6, 0x46, byte); ++ } else if (bit_offset == 0 && bits == 16) { ++ /* movzx eax, WORD PTR [rsi + imm8] */ ++ I4(0x0f, 0xb7, 0x46, byte); ++ } else if (bit_offset + bits <= 32) { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 32); ++ ++ /* mov eax, [rsi + imm8] */ ++ I3(0x8b, 0x46, byte); ++ ++ if (bit_offset) { ++ /* shr eax, imm8 */ ++ I3(0xc1, 0xe8, bit_offset); ++ } ++ ++ if (bit_offset + bits < 32) { ++ unsigned mask = ~0U >> (32 - bits); ++ ++ /* and eax, imm32 */ ++ I1(0x25); ++ memcpy(out, &mask, 4); ++ out += 4; ++ } ++ } else if (bit_offset + bits <= 64) { ++ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); ++ byte -= align; ++ 
bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 64); ++ ++ /* mov rax, [rsi + imm8] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ shl = 64 - bit_offset - bits; ++ shr = bit_offset + shl; ++ ++ if (shl) { ++ /* shl rax, imm8 */ ++ I4(0x48, 0xc1, 0xe0, shl); ++ } ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } else { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 96); ++ ++ /* mov rax, [rsi + byte] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ /* mov edx, [rsi + byte + 8] */ ++ I3(0x8b, 0x56, byte + 8); ++ ++ /* bits from next word: */ ++ shr = bit_offset + bits - 64; ++ BUG_ON(shr > bit_offset); ++ ++ /* shr rax, bit_offset */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ ++ /* shl rdx, imm8 */ ++ I4(0x48, 0xc1, 0xe2, 64 - shr); ++ ++ /* or rax, rdx */ ++ I3(0x48, 0x09, 0xd0); ++ ++ shr = bit_offset - shr; ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } ++ ++ /* rax += offset: */ ++ if (offset > S32_MAX) { ++ /* mov rdx, imm64 */ ++ I2(0x48, 0xba); ++ memcpy(out, &offset, 8); ++ out += 8; ++ /* add %rdx, %rax */ ++ I3(0x48, 0x01, 0xd0); ++ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { ++ /* add rax, imm32 */ ++ I2(0x48, 0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } else if (offset) { ++ /* add eax, imm32 */ ++ I1(0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++set_field: ++ switch (dst_size) { ++ case 8: ++ /* mov [rdi + dst_offset], rax */ ++ I4(0x48, 0x89, 0x47, dst_offset); ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], eax */ ++ I3(0x89, 0x47, dst_offset); ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++} ++ ++int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) ++{ ++ bool eax_zeroed = false; ++ u8 *out = _out; ++ ++ /* ++ * rdi: dst - unpacked key ++ * rsi: src - packed key ++ */ ++ ++ /* k->u64s, k->format, k->type */ ++ ++ /* mov eax, [rsi] */ ++ I2(0x8b, 0x06); ++ ++ /* add eax, BKEY_U64s - format->key_u64s */ ++ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); ++ ++ /* and eax, imm32: mask out k->pad: */ ++ I5(0x25, 0xff, 0xff, 0xff, 0); ++ ++ /* mov [rdi], eax */ ++ I2(0x89, 0x07); ++ ++#define x(id, field) \ ++ out = compile_bkey_field(format, out, id, \ ++ offsetof(struct bkey, field), \ ++ sizeof(((struct bkey *) NULL)->field), \ ++ &eax_zeroed); ++ bkey_fields() ++#undef x ++ ++ /* retq */ ++ I1(0xc3); ++ ++ return (void *) out - _out; ++} ++ ++#else ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ u64 l_v, r_v; ++ ++ if (!nr_key_bits) ++ return 0; ++ ++ /* for big endian, skip past header */ ++ nr_key_bits += high_bit_offset; ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (1) { ++ if (nr_key_bits < 64) { ++ l_v >>= 64 - nr_key_bits; ++ r_v >>= 64 - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= 64; ++ } ++ ++ if (!nr_key_bits || l_v != r_v) ++ break; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ } ++ ++ return cmp_int(l_v, r_v); ++} ++#endif ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ int ret; ++ ++ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ ret = __bkey_cmp_bits(high_word(f, l), ++ high_word(f, r), ++ 
b->nr_key_bits); ++ ++ EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), ++ bkey_unpack_pos(b, r))); ++ return ret; ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_packed(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ struct bkey unpacked; ++ ++ if (likely(bkey_packed(l) && bkey_packed(r))) ++ return __bch2_bkey_cmp_packed_format_checked(l, r, b); ++ ++ if (bkey_packed(l)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, l); ++ l = (void*) &unpacked; ++ } else if (bkey_packed(r)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, r); ++ r = (void*) &unpacked; ++ } ++ ++ return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ const struct bkey *l_unpacked; ++ ++ return unlikely(l_unpacked = packed_to_bkey_c(l)) ++ ? bkey_cmp(l_unpacked->p, *r) ++ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++void bch2_bpos_swab(struct bpos *p) ++{ ++ u8 *l = (u8 *) p; ++ u8 *h = ((u8 *) &p[1]) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) ++{ ++ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; ++ u8 *l = k->key_start; ++ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void) ++{ ++ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); ++ struct bkey_packed p; ++ ++ struct bkey_format test_format = { ++ .key_u64s = 2, ++ .nr_fields = BKEY_NR_FIELDS, ++ .bits_per_field = { ++ 13, ++ 64, ++ }, ++ }; ++ ++ struct unpack_state in_s = ++ unpack_state_init(&bch2_bkey_format_current, (void *) &t); ++ struct pack_state out_s = pack_state_init(&test_format, &p); ++ unsigned i; ++ ++ for (i = 0; i < out_s.format->nr_fields; i++) { ++ u64 a, v = get_inc_field(&in_s, i); ++ ++ switch (i) { ++#define x(id, field) case id: a = t.field; break; ++ bkey_fields() ++#undef x ++ default: ++ BUG(); ++ } ++ ++ if (a != v) ++ panic("got %llu actual %llu i %u\n", v, a, i); ++ ++ if (!set_inc_field(&out_s, i, v)) ++ panic("failed at %u\n", i); ++ } ++ ++ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); ++} ++#endif +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +new file mode 100644 +index 000000000000..cbcfbd26bc58 +--- /dev/null ++++ b/fs/bcachefs/bkey.h +@@ -0,0 +1,605 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_H ++#define _BCACHEFS_BKEY_H ++ ++#include ++#include "bcachefs_format.h" ++ ++#include "util.h" ++#include "vstructs.h" ++ ++#ifdef CONFIG_X86_64 ++#define HAVE_BCACHEFS_COMPILED_UNPACK 1 ++#endif ++ ++void bch2_to_binary(char *, const u64 *, unsigned); ++ ++/* bkey with split value, const */ ++struct bkey_s_c { ++ const struct bkey *k; ++ const struct bch_val *v; ++}; ++ ++/* bkey with split value */ ++struct bkey_s { ++ union { ++ struct { ++ struct bkey *k; ++ struct bch_val *v; ++ }; ++ struct bkey_s_c s_c; ++ }; ++}; ++ ++#define bkey_next(_k) vstruct_next(_k) ++ ++static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, ++ struct bkey_packed *end) ++{ ++ k = bkey_next(k); 
++ ++ while (k != end && !k->u64s) ++ k = (void *) ((u64 *) k + 1); ++ return k; ++} ++ ++#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) ++ ++static inline size_t bkey_val_bytes(const struct bkey *k) ++{ ++ return bkey_val_u64s(k) * sizeof(u64); ++} ++ ++static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) ++{ ++ k->u64s = BKEY_U64s + val_u64s; ++} ++ ++static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) ++{ ++ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) ++ ++#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) ++ ++#define bkey_whiteout(_k) \ ++ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) ++ ++#define bkey_packed_typecheck(_k) \ ++({ \ ++ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ ++ !type_is(_k, struct bkey_packed *)); \ ++ type_is(_k, struct bkey_packed *); \ ++}) ++ ++enum bkey_lr_packed { ++ BKEY_PACKED_BOTH, ++ BKEY_PACKED_RIGHT, ++ BKEY_PACKED_LEFT, ++ BKEY_PACKED_NONE, ++}; ++ ++#define bkey_lr_packed_typecheck(_l, _r) \ ++ (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) ++ ++#define bkey_lr_packed(_l, _r) \ ++ ((_l)->format + ((_r)->format << 1)) ++ ++#define bkey_copy(_dst, _src) \ ++do { \ ++ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ ++ !type_is(_dst, struct bkey_packed *)); \ ++ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ ++ !type_is(_src, struct bkey_packed *)); \ ++ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ ++ (u64 *) (_dst) < (u64 *) (_src) + \ ++ ((struct bkey *) (_src))->u64s); \ ++ \ ++ memcpy_u64s_small((_dst), (_src), \ ++ ((struct bkey *) (_src))->u64s); \ ++} while (0) ++ ++struct btree; ++ ++struct bkey_format_state { ++ u64 field_min[BKEY_NR_FIELDS]; ++ u64 field_max[BKEY_NR_FIELDS]; ++}; ++ ++void bch2_bkey_format_init(struct bkey_format_state *); ++void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); ++void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); ++const char *bch2_bkey_format_validate(struct bkey_format *); ++ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++__pure ++unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, ++ const struct bkey_packed *, ++ const struct btree *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++__pure ++int __bch2_bkey_cmp_packed(const struct bkey_packed *, ++ const struct bkey_packed *, ++ const struct btree *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++static inline __pure ++int bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, const struct bpos *r) ++{ ++ return __bch2_bkey_cmp_left_packed(b, l, r); ++} ++ ++/* ++ * we prefer to pass bpos by ref, but it's often enough terribly convenient to ++ * pass it by by val... 
as much as I hate c++, const ref would be nice here: ++ */ ++__pure __flatten ++static inline int bkey_cmp_left_packed_byval(const struct btree *b, ++ const struct bkey_packed *l, ++ struct bpos r) ++{ ++ return bkey_cmp_left_packed(b, l, &r); ++} ++ ++/* ++ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to ++ * skip dispatching on k->format: ++ */ ++#define bkey_cmp_packed(_b, _l, _r) \ ++({ \ ++ int _cmp; \ ++ \ ++ switch (bkey_lr_packed_typecheck(_l, _r)) { \ ++ case BKEY_PACKED_NONE: \ ++ _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ ++ ((struct bkey *) (_r))->p); \ ++ break; \ ++ case BKEY_PACKED_LEFT: \ ++ _cmp = bkey_cmp_left_packed((_b), \ ++ (struct bkey_packed *) (_l), \ ++ &((struct bkey *) (_r))->p); \ ++ break; \ ++ case BKEY_PACKED_RIGHT: \ ++ _cmp = -bkey_cmp_left_packed((_b), \ ++ (struct bkey_packed *) (_r), \ ++ &((struct bkey *) (_l))->p); \ ++ break; \ ++ case BKEY_PACKED_BOTH: \ ++ _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ ++ (void *) (_r), (_b)); \ ++ break; \ ++ } \ ++ _cmp; \ ++}) ++ ++#if 1 ++static __always_inline int bkey_cmp(struct bpos l, struct bpos r) ++{ ++ if (l.inode != r.inode) ++ return l.inode < r.inode ? -1 : 1; ++ if (l.offset != r.offset) ++ return l.offset < r.offset ? -1 : 1; ++ if (l.snapshot != r.snapshot) ++ return l.snapshot < r.snapshot ? -1 : 1; ++ return 0; ++} ++#else ++int bkey_cmp(struct bpos l, struct bpos r); ++#endif ++ ++static inline struct bpos bpos_min(struct bpos l, struct bpos r) ++{ ++ return bkey_cmp(l, r) < 0 ? l : r; ++} ++ ++void bch2_bpos_swab(struct bpos *); ++void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); ++ ++static __always_inline int bversion_cmp(struct bversion l, struct bversion r) ++{ ++ return cmp_int(l.hi, r.hi) ?: ++ cmp_int(l.lo, r.lo); ++} ++ ++#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) ++#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) ++ ++static __always_inline int bversion_zero(struct bversion v) ++{ ++ return !bversion_cmp(v, ZERO_VERSION); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++/* statement expressions confusing unlikely()? */ ++#define bkey_packed(_k) \ ++ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ ++ (_k)->format != KEY_FORMAT_CURRENT; }) ++#else ++#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) ++#endif ++ ++/* ++ * It's safe to treat an unpacked bkey as a packed one, but not the reverse ++ */ ++static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) ++{ ++ return (struct bkey_packed *) k; ++} ++ ++static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) ++{ ++ return (const struct bkey_packed *) k; ++} ++ ++static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? NULL : (struct bkey_i *) k; ++} ++ ++static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? 
NULL : (const struct bkey *) k; ++} ++ ++static inline unsigned bkey_format_key_bits(const struct bkey_format *format) ++{ ++ return format->bits_per_field[BKEY_FIELD_INODE] + ++ format->bits_per_field[BKEY_FIELD_OFFSET] + ++ format->bits_per_field[BKEY_FIELD_SNAPSHOT]; ++} ++ ++static inline struct bpos bkey_successor(struct bpos p) ++{ ++ struct bpos ret = p; ++ ++ if (!++ret.offset) ++ BUG_ON(!++ret.inode); ++ ++ return ret; ++} ++ ++static inline struct bpos bkey_predecessor(struct bpos p) ++{ ++ struct bpos ret = p; ++ ++ if (!ret.offset--) ++ BUG_ON(!ret.inode--); ++ ++ return ret; ++} ++ ++static inline u64 bkey_start_offset(const struct bkey *k) ++{ ++ return k->p.offset - k->size; ++} ++ ++static inline struct bpos bkey_start_pos(const struct bkey *k) ++{ ++ return (struct bpos) { ++ .inode = k->p.inode, ++ .offset = bkey_start_offset(k), ++ .snapshot = k->p.snapshot, ++ }; ++} ++ ++/* Packed helpers */ ++ ++static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; ++ ++ EBUG_ON(k->u64s < ret); ++ return ret; ++} ++ ++static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_key_u64s(format, k) * sizeof(u64); ++} ++ ++static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return k->u64s - bkeyp_key_u64s(format, k); ++} ++ ++static inline size_t bkeyp_val_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_val_u64s(format, k) * sizeof(u64); ++} ++ ++static inline void set_bkeyp_val_u64s(const struct bkey_format *format, ++ struct bkey_packed *k, unsigned val_u64s) ++{ ++ k->u64s = bkeyp_key_u64s(format, k) + val_u64s; ++} ++ ++#define bkeyp_val(_format, _k) \ ++ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) ++ ++extern const struct bkey_format bch2_bkey_format_current; ++ ++bool bch2_bkey_transform(const struct bkey_format *, ++ struct bkey_packed *, ++ const struct bkey_format *, ++ const struct bkey_packed *); ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *, ++ const struct bkey_packed *); ++#endif ++ ++bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, ++ const struct bkey_format *); ++ ++enum bkey_pack_pos_ret { ++ BKEY_PACK_POS_EXACT, ++ BKEY_PACK_POS_SMALLER, ++ BKEY_PACK_POS_FAIL, ++}; ++ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, ++ const struct btree *); ++ ++static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, ++ const struct btree *b) ++{ ++ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; ++} ++ ++void bch2_bkey_unpack(const struct btree *, struct bkey_i *, ++ const struct bkey_packed *); ++bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, ++ const struct bkey_format *); ++ ++static inline u64 bkey_field_max(const struct bkey_format *f, ++ enum bch_bkey_fields nr) ++{ ++ return f->bits_per_field[nr] < 64 ++ ? 
(le64_to_cpu(f->field_offset[nr]) + ++ ~(~0ULL << f->bits_per_field[nr])) ++ : U64_MAX; ++} ++ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ ++int bch2_compile_bkey_format(const struct bkey_format *, void *); ++ ++#else ++ ++static inline int bch2_compile_bkey_format(const struct bkey_format *format, ++ void *out) { return 0; } ++ ++#endif ++ ++static inline void bkey_reassemble(struct bkey_i *dst, ++ struct bkey_s_c src) ++{ ++ dst->k = *src.k; ++ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); ++} ++ ++#define bkey_s_null ((struct bkey_s) { .k = NULL }) ++#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) ++ ++#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) ++#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) ++ ++static inline struct bkey_s bkey_to_s(struct bkey *k) ++{ ++ return (struct bkey_s) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) ++{ ++ return (struct bkey_s_c) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) ++{ ++ return (struct bkey_s) { .k = &k->k, .v = &k->v }; ++} ++ ++static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) ++{ ++ return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; ++} ++ ++/* ++ * For a given type of value (e.g. struct bch_extent), generates the types for ++ * bkey + bch_extent - inline, split, split const - and also all the conversion ++ * functions, which also check that the value is of the correct type. ++ * ++ * We use anonymous unions for upcasting - e.g. converting from e.g. a ++ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion ++ * functions. ++ */ ++#define BKEY_VAL_ACCESSORS(name) \ ++struct bkey_i_##name { \ ++ union { \ ++ struct bkey k; \ ++ struct bkey_i k_i; \ ++ }; \ ++ struct bch_##name v; \ ++}; \ ++ \ ++struct bkey_s_c_##name { \ ++ union { \ ++ struct { \ ++ const struct bkey *k; \ ++ const struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++struct bkey_s_##name { \ ++ union { \ ++ struct { \ ++ struct bkey *k; \ ++ struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c_##name c; \ ++ struct bkey_s s; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline const struct bkey_i_##name * \ ++bkey_i_to_##name##_c(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ ++{ \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++name##_i_to_s_c(const struct bkey_i_##name *k) \ ++{ \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name 
bkey_i_to_s_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++bkey_i_to_s_c_##name(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ ++{ \ ++ struct bkey_i_##name *k = \ ++ container_of(&_k->k, struct bkey_i_##name, k); \ ++ \ ++ bkey_init(&k->k); \ ++ memset(&k->v, 0, sizeof(k->v)); \ ++ k->k.type = KEY_TYPE_##name; \ ++ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ ++ \ ++ return k; \ ++} ++ ++BKEY_VAL_ACCESSORS(cookie); ++BKEY_VAL_ACCESSORS(btree_ptr); ++BKEY_VAL_ACCESSORS(extent); ++BKEY_VAL_ACCESSORS(reservation); ++BKEY_VAL_ACCESSORS(inode); ++BKEY_VAL_ACCESSORS(inode_generation); ++BKEY_VAL_ACCESSORS(dirent); ++BKEY_VAL_ACCESSORS(xattr); ++BKEY_VAL_ACCESSORS(alloc); ++BKEY_VAL_ACCESSORS(quota); ++BKEY_VAL_ACCESSORS(stripe); ++BKEY_VAL_ACCESSORS(reflink_p); ++BKEY_VAL_ACCESSORS(reflink_v); ++BKEY_VAL_ACCESSORS(inline_data); ++BKEY_VAL_ACCESSORS(btree_ptr_v2); ++ ++/* byte order helpers */ ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return f->key_u64s - 1; ++} ++ ++#define high_bit_offset 0 ++#define nth_word(p, n) ((p) - (n)) ++ ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return 0; ++} ++ ++#define high_bit_offset KEY_PACKED_BITS_START ++#define nth_word(p, n) ((p) + (n)) ++ ++#else ++#error edit for your odd byteorder. 
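/*
 * Illustrative sketch (editor's addition, not part of the patch): a minimal,
 * self-contained model of the anonymous-union upcasting that the
 * BKEY_VAL_ACCESSORS() macro above relies on.  The typed key embeds the
 * generic key in an anonymous union, so the typed struct can be handed to
 * generic code with no cast and no conversion call, while the generated
 * bkey_i_to_<name>() helpers add an EBUG_ON() type check for the unsafe
 * downcast direction.  All struct and function names below are invented
 * stand-ins, not bcachefs types.
 */
#include <assert.h>
#include <stdio.h>

struct key     { unsigned type; };  /* stands in for struct bkey   */
struct key_i   { struct key k; };   /* stands in for struct bkey_i */
struct val_foo { int data; };       /* stands in for a value type  */

struct key_i_foo {                  /* the shape BKEY_VAL_ACCESSORS(foo) would generate */
    union {
        struct key   k;
        struct key_i k_i;
    };
    struct val_foo v;
};

static void takes_generic(const struct key_i *k)
{
    printf("type %u\n", k->k.type);
}

int main(void)
{
    struct key_i_foo foo = { .k = { .type = 7 }, .v = { .data = 42 } };

    /* upcast via the anonymous union: always safe, no accessor needed */
    takes_generic(&foo.k_i);

    /* the generic key and the typed wrapper share the same address */
    assert((void *) &foo == (void *) &foo.k);
    return 0;
}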
++#endif ++ ++#define high_word(f, k) ((k)->_data + high_word_offset(f)) ++#define next_word(p) nth_word(p, 1) ++#define prev_word(p) nth_word(p, -1) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void); ++#else ++static inline void bch2_bkey_pack_test(void) {} ++#endif ++ ++#endif /* _BCACHEFS_BKEY_H */ +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +new file mode 100644 +index 000000000000..36e0c5152b47 +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.c +@@ -0,0 +1,353 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "alloc_background.h" ++#include "dirent.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "quota.h" ++#include "reflink.h" ++#include "xattr.h" ++ ++const char * const bch2_bkey_types[] = { ++#define x(name, nr) #name, ++ BCH_BKEY_TYPES() ++#undef x ++ NULL ++}; ++ ++static const char *deleted_key_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_deleted (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++#define bch2_bkey_ops_discard (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k)) ++ return "value size should be zero"; ++ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_error (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static const char *key_type_cookie_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_cookie (struct bkey_ops) { \ ++ .key_invalid = key_type_cookie_invalid, \ ++} ++ ++#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static const char *key_type_inline_data_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ return NULL; ++} ++ ++static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); ++} ++ ++#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ ++ .key_invalid = key_type_inline_data_invalid, \ ++ .val_to_text = key_type_inline_data_to_text, \ ++} ++ ++static const struct bkey_ops bch2_bkey_ops[] = { ++#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, ++ BCH_BKEY_TYPES() ++#undef x ++}; ++ ++const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (k.k->type >= KEY_TYPE_MAX) ++ return "invalid type"; ++ ++ return bch2_bkey_ops[k.k->type].key_invalid(c, k); ++} ++ ++const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type) ++{ ++ if (k.k->u64s < BKEY_U64s) ++ return "u64s too small"; ++ ++ if (type == BKEY_TYPE_BTREE && ++ bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) ++ return "value too big"; ++ ++ if (btree_node_type_is_extents(type)) { ++ if ((k.k->size == 0) != bkey_deleted(k.k)) ++ return "bad size field"; ++ ++ if (k.k->size > k.k->p.offset) ++ return "size greater than offset"; ++ } else { ++ if (k.k->size) ++ return "nonzero size field"; ++ } ++ ++ if (k.k->p.snapshot) ++ return "nonzero snapshot"; ++ ++ if (type != BKEY_TYPE_BTREE && ++ !bkey_cmp(k.k->p, POS_MAX)) ++ return "POS_MAX key"; ++ ++ return NULL; ++} ++ ++const char *bch2_bkey_invalid(struct 
bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type) ++{ ++ return __bch2_bkey_invalid(c, k, type) ?: ++ bch2_bkey_val_invalid(c, k); ++} ++ ++const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) ++{ ++ if (bkey_cmp(k.k->p, b->data->min_key) < 0) ++ return "key before start of btree node"; ++ ++ if (bkey_cmp(k.k->p, b->data->max_key) > 0) ++ return "key past end of btree node"; ++ ++ return NULL; ++} ++ ++void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ const char *invalid; ++ ++ BUG_ON(!k.k->u64s); ++ ++ invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, k); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); ++ return; ++ } ++ ++ if (ops->key_debugcheck) ++ ops->key_debugcheck(c, k); ++} ++ ++void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) ++{ ++ if (!bkey_cmp(pos, POS_MIN)) ++ pr_buf(out, "POS_MIN"); ++ else if (!bkey_cmp(pos, POS_MAX)) ++ pr_buf(out, "POS_MAX"); ++ else ++ pr_buf(out, "%llu:%llu", pos.inode, pos.offset); ++} ++ ++void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) ++{ ++ if (k) { ++ pr_buf(out, "u64s %u type %s ", k->u64s, ++ bch2_bkey_types[k->type]); ++ ++ bch2_bpos_to_text(out, k->p); ++ ++ pr_buf(out, " snap %u len %u ver %llu", ++ k->p.snapshot, k->size, k->version.lo); ++ } else { ++ pr_buf(out, "(null)"); ++ } ++} ++ ++void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (likely(ops->val_to_text)) ++ ops->val_to_text(out, c, k); ++} ++ ++void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_to_text(out, k.k); ++ ++ if (k.k) { ++ pr_buf(out, ": "); ++ bch2_val_to_text(out, c, k); ++ } ++} ++ ++void bch2_bkey_swab_val(struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (ops->swab) ++ ops->swab(k); ++} ++ ++bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ return ops->key_normalize ++ ? 
ops->key_normalize(c, k) ++ : false; ++} ++ ++enum merge_result bch2_bkey_merge(struct bch_fs *c, ++ struct bkey_s l, struct bkey_s r) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; ++ enum merge_result ret; ++ ++ if (key_merging_disabled(c) || ++ !ops->key_merge || ++ l.k->type != r.k->type || ++ bversion_cmp(l.k->version, r.k->version) || ++ bkey_cmp(l.k->p, bkey_start_pos(r.k))) ++ return BCH_MERGE_NOMERGE; ++ ++ ret = ops->key_merge(c, l, r); ++ ++ if (ret != BCH_MERGE_NOMERGE) ++ l.k->needs_whiteout |= r.k->needs_whiteout; ++ return ret; ++} ++ ++static const struct old_bkey_type { ++ u8 btree_node_type; ++ u8 old; ++ u8 new; ++} bkey_renumber_table[] = { ++ {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, ++ {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, ++ {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, ++ {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, ++ {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, ++ {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, ++ {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, ++ {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, ++ {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, ++ {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, ++ {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, ++ {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, ++}; ++ ++void bch2_bkey_renumber(enum btree_node_type btree_node_type, ++ struct bkey_packed *k, ++ int write) ++{ ++ const struct old_bkey_type *i; ++ ++ for (i = bkey_renumber_table; ++ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); ++ i++) ++ if (btree_node_type == i->btree_node_type && ++ k->type == (write ? i->new : i->old)) { ++ k->type = write ? i->old : i->new; ++ break; ++ } ++} ++ ++void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ const struct bkey_ops *ops; ++ struct bkey uk; ++ struct bkey_s u; ++ int i; ++ ++ /* ++ * Do these operations in reverse order in the write path: ++ */ ++ ++ for (i = 0; i < 4; i++) ++ switch (!write ? 
i : 3 - i) { ++ case 0: ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_key(f, k); ++ break; ++ case 1: ++ if (version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); ++ break; ++ case 2: ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) { ++ if (!bkey_packed(k)) { ++ struct bkey_i *u = packed_to_bkey(k); ++ swap(u->k.p.inode, u->k.p.offset); ++ } else if (f->bits_per_field[BKEY_FIELD_INODE] && ++ f->bits_per_field[BKEY_FIELD_OFFSET]) { ++ struct bkey_format tmp = *f, *in = f, *out = &tmp; ++ ++ swap(tmp.bits_per_field[BKEY_FIELD_INODE], ++ tmp.bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(tmp.field_offset[BKEY_FIELD_INODE], ++ tmp.field_offset[BKEY_FIELD_OFFSET]); ++ ++ if (!write) ++ swap(in, out); ++ ++ uk = __bch2_bkey_unpack_key(in, k); ++ swap(uk.p.inode, uk.p.offset); ++ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); ++ } ++ } ++ break; ++ case 3: ++ if (!bkey_packed(k)) { ++ u = bkey_i_to_s(packed_to_bkey(k)); ++ } else { ++ uk = __bch2_bkey_unpack_key(f, k); ++ u.k = &uk; ++ u.v = bkeyp_val(f, k); ++ } ++ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_val(u); ++ ++ ops = &bch2_bkey_ops[k->type]; ++ ++ if (ops->compat) ++ ops->compat(btree_id, version, big_endian, write, u); ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +new file mode 100644 +index 000000000000..0bca725ae3b8 +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.h +@@ -0,0 +1,82 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_METHODS_H ++#define _BCACHEFS_BKEY_METHODS_H ++ ++#include "bkey.h" ++ ++struct bch_fs; ++struct btree; ++struct bkey; ++enum btree_node_type; ++ ++extern const char * const bch2_bkey_types[]; ++ ++enum merge_result { ++ BCH_MERGE_NOMERGE, ++ ++ /* ++ * The keys were mergeable, but would have overflowed size - so instead ++ * l was changed to the maximum size, and both keys were modified: ++ */ ++ BCH_MERGE_PARTIAL, ++ BCH_MERGE_MERGE, ++}; ++ ++struct bkey_ops { ++ /* Returns reason for being invalid if invalid, else NULL: */ ++ const char * (*key_invalid)(const struct bch_fs *, ++ struct bkey_s_c); ++ void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); ++ void (*val_to_text)(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ void (*swab)(struct bkey_s); ++ bool (*key_normalize)(struct bch_fs *, struct bkey_s); ++ enum merge_result (*key_merge)(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ void (*compat)(enum btree_id id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s); ++}; ++ ++const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); ++const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type); ++const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type); ++const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); ++ ++void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); ++ ++void bch2_bpos_to_text(struct printbuf *, struct bpos); ++void bch2_bkey_to_text(struct printbuf *, const struct bkey *); ++void bch2_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++void bch2_bkey_swab_val(struct bkey_s); ++ ++bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); ++ ++enum merge_result bch2_bkey_merge(struct bch_fs *, ++ struct bkey_s, struct 
bkey_s); ++ ++void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); ++ ++void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, ++ int, struct bkey_format *, struct bkey_packed *); ++ ++static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ if (version < bcachefs_metadata_version_current || ++ big_endian != CPU_BIG_ENDIAN) ++ __bch2_bkey_compat(level, btree_id, version, ++ big_endian, write, f, k); ++ ++} ++ ++#endif /* _BCACHEFS_BKEY_METHODS_H */ +diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h +new file mode 100644 +index 000000000000..f607a0cb37ed +--- /dev/null ++++ b/fs/bcachefs/bkey_on_stack.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_ON_STACK_H ++#define _BCACHEFS_BKEY_ON_STACK_H ++ ++#include "bcachefs.h" ++ ++struct bkey_on_stack { ++ struct bkey_i *k; ++ u64 onstack[12]; ++}; ++ ++static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, ++ struct bch_fs *c, unsigned u64s) ++{ ++ if (s->k == (void *) s->onstack && ++ u64s > ARRAY_SIZE(s->onstack)) { ++ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); ++ memcpy(s->k, s->onstack, sizeof(s->onstack)); ++ } ++} ++ ++static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, ++ struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bkey_on_stack_realloc(s, c, k.k->u64s); ++ bkey_reassemble(s->k, k); ++} ++ ++static inline void bkey_on_stack_init(struct bkey_on_stack *s) ++{ ++ s->k = (void *) s->onstack; ++} ++ ++static inline void bkey_on_stack_exit(struct bkey_on_stack *s, ++ struct bch_fs *c) ++{ ++ if (s->k != (void *) s->onstack) ++ mempool_free(s->k, &c->large_bkey_pool); ++ s->k = NULL; ++} ++ ++#endif /* _BCACHEFS_BKEY_ON_STACK_H */ +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +new file mode 100644 +index 000000000000..839e78d1dc35 +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.c +@@ -0,0 +1,515 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "bkey_sort.h" ++#include "bset.h" ++#include "extents.h" ++ ++typedef int (*sort_cmp_fn)(struct btree *, ++ struct bkey_packed *, ++ struct bkey_packed *); ++ ++static inline bool sort_iter_end(struct sort_iter *iter) ++{ ++ return !iter->used; ++} ++ ++static inline void __sort_iter_sift(struct sort_iter *iter, ++ unsigned from, ++ sort_cmp_fn cmp) ++{ ++ unsigned i; ++ ++ for (i = from; ++ i + 1 < iter->used && ++ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; ++ i++) ++ swap(iter->data[i], iter->data[i + 1]); ++} ++ ++static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ ++ __sort_iter_sift(iter, 0, cmp); ++} ++ ++static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ unsigned i = iter->used; ++ ++ while (i--) ++ __sort_iter_sift(iter, i, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) ++{ ++ return !sort_iter_end(iter) ? 
iter->data->k : NULL; ++} ++ ++static inline void __sort_iter_advance(struct sort_iter *iter, ++ unsigned idx, sort_cmp_fn cmp) ++{ ++ struct sort_iter_set *i = iter->data + idx; ++ ++ BUG_ON(idx >= iter->used); ++ ++ i->k = bkey_next_skip_noops(i->k, i->end); ++ ++ BUG_ON(i->k > i->end); ++ ++ if (i->k == i->end) ++ array_remove_item(iter->data, iter->used, idx); ++ else ++ __sort_iter_sift(iter, idx, cmp); ++} ++ ++static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ __sort_iter_advance(iter, 0, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, ++ sort_cmp_fn cmp) ++{ ++ struct bkey_packed *ret = sort_iter_peek(iter); ++ ++ if (ret) ++ sort_iter_advance(iter, cmp); ++ ++ return ret; ++} ++ ++/* ++ * If keys compare equal, compare by pointer order: ++ */ ++static inline int key_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ cmp_int((unsigned long) l, (unsigned long) r); ++} ++ ++static inline bool should_drop_next_key(struct sort_iter *iter) ++{ ++ /* ++ * key_sort_cmp() ensures that when keys compare equal the older key ++ * comes first; so if l->k compares equal to r->k then l->k is older ++ * and should be dropped. ++ */ ++ return iter->used >= 2 && ++ !bkey_cmp_packed(iter->b, ++ iter->data[0].k, ++ iter->data[1].k); ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) ++{ ++ struct bkey_packed *out = dst->start; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ sort_iter_sort(iter, key_sort_fix_overlapping_cmp); ++ ++ while ((k = sort_iter_peek(iter))) { ++ if (!bkey_whiteout(k) && ++ !should_drop_next_key(iter)) { ++ bkey_copy(out, k); ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ sort_iter_advance(iter, key_sort_fix_overlapping_cmp); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++static void extent_sort_append(struct bch_fs *c, ++ struct bkey_format *f, ++ struct btree_nr_keys *nr, ++ struct bkey_packed **out, ++ struct bkey_s k) ++{ ++ if (!bkey_whiteout(k.k)) { ++ if (!bch2_bkey_pack_key(*out, k.k, f)) ++ memcpy_u64s_small(*out, k.k, BKEY_U64s); ++ ++ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); ++ ++ btree_keys_account_key_add(nr, 0, *out); ++ *out = bkey_next(*out); ++ } ++} ++ ++/* Sort + repack in a new format: */ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *dst, struct btree *src, ++ struct btree_node_iter *src_iter, ++ struct bkey_format *out_f, ++ bool filter_whiteouts) ++{ ++ struct bkey_format *in_f = &src->format; ++ struct bkey_packed *in, *out = vstruct_last(dst); ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { ++ if (filter_whiteouts && bkey_whiteout(in)) ++ continue; ++ ++ if (bch2_bkey_transform(out_f, out, bkey_packed(in) ++ ? 
in_f : &bch2_bkey_format_current, in)) ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ else ++ bch2_bkey_unpack(src, (void *) out, in); ++ ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ ++struct btree_nr_keys ++bch2_sort_repack_merge(struct bch_fs *c, ++ struct bset *dst, struct btree *src, ++ struct btree_node_iter *iter, ++ struct bkey_format *out_f, ++ bool filter_whiteouts) ++{ ++ struct bkey_packed *out = vstruct_last(dst), *k_packed; ++ struct bkey_on_stack k; ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ bkey_on_stack_init(&k); ++ ++ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { ++ if (filter_whiteouts && bkey_whiteout(k_packed)) ++ continue; ++ ++ /* ++ * NOTE: ++ * bch2_bkey_normalize may modify the key we pass it (dropping ++ * stale pointers) and we don't have a write lock on the src ++ * node; we have to make a copy of the entire key before calling ++ * normalize ++ */ ++ bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); ++ bch2_bkey_unpack(src, k.k, k_packed); ++ ++ if (filter_whiteouts && ++ bch2_bkey_normalize(c, bkey_i_to_s(k.k))) ++ continue; ++ ++ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ bkey_on_stack_exit(&k, c); ++ return nr; ++} ++ ++static inline int sort_keys_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: ++ (int) l->needs_whiteout - (int) r->needs_whiteout; ++} ++ ++unsigned bch2_sort_keys(struct bkey_packed *dst, ++ struct sort_iter *iter, ++ bool filter_whiteouts) ++{ ++ const struct bkey_format *f = &iter->b->format; ++ struct bkey_packed *in, *next, *out = dst; ++ ++ sort_iter_sort(iter, sort_keys_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_keys_cmp))) { ++ bool needs_whiteout = false; ++ ++ if (bkey_whiteout(in) && ++ (filter_whiteouts || !in->needs_whiteout)) ++ continue; ++ ++ while ((next = sort_iter_peek(iter)) && ++ !bkey_cmp_packed(iter->b, in, next)) { ++ BUG_ON(in->needs_whiteout && ++ next->needs_whiteout); ++ needs_whiteout |= in->needs_whiteout; ++ in = sort_iter_next(iter, sort_keys_cmp); ++ } ++ ++ if (bkey_whiteout(in)) { ++ memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); ++ set_bkeyp_val_u64s(f, out, 0); ++ } else { ++ bkey_copy(out, in); ++ } ++ out->needs_whiteout |= needs_whiteout; ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} ++ ++/* Compat code for btree_node_old_extent_overwrite: */ ++ ++/* ++ * If keys compare equal, compare by pointer order: ++ * ++ * Necessary for sort_fix_overlapping() - if there are multiple keys that ++ * compare equal in different sets, we have to process them newest to oldest. 
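/*
 * Illustrative sketch (editor's addition, not part of the patch): the
 * "newest copy wins" rule that key_sort_fix_overlapping_cmp() and
 * should_drop_next_key() implement earlier in this file, and that the
 * comment above restates for the extent case.  Equal keys may exist in
 * several bsets; the bsets are laid out oldest to newest, so the copy at
 * the higher address is the newer one and older duplicates are dropped.
 * This is a two-array simplification over plain ints; the real code merges
 * up to MAX_BSETS + 1 sets through a sort_iter.
 */
#include <stdio.h>

/* merge a[] (older) and b[] (newer); on equal keys keep only b's copy */
static size_t merge_newest_wins(const int *a, size_t na,
                                const int *b, size_t nb, int *out)
{
    size_t i = 0, j = 0, n = 0;

    while (i < na && j < nb) {
        if (a[i] < b[j])
            out[n++] = a[i++];
        else if (b[j] < a[i])
            out[n++] = b[j++];
        else {                  /* duplicate: drop the older copy */
            out[n++] = b[j++];
            i++;
        }
    }
    while (i < na)
        out[n++] = a[i++];
    while (j < nb)
        out[n++] = b[j++];
    return n;
}

int main(void)
{
    const int older[] = { 1, 3, 5 };
    const int newer[] = { 3, 4 };
    int out[8];
    size_t n = merge_newest_wins(older, 3, newer, 2, out);

    for (size_t k = 0; k < n; k++)
        printf("%d ", out[k]);  /* prints: 1 3 4 5 - the newer 3 survives */
    printf("\n");
    return 0;
}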
++ */ ++static inline int extent_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ struct bkey ul = bkey_unpack_key(b, l); ++ struct bkey ur = bkey_unpack_key(b, r); ++ ++ return bkey_cmp(bkey_start_pos(&ul), ++ bkey_start_pos(&ur)) ?: ++ cmp_int((unsigned long) r, (unsigned long) l); ++} ++ ++/* ++ * The algorithm in extent_sort_fix_overlapping() relies on keys in the same ++ * bset being ordered by start offset - but 0 size whiteouts (which are always ++ * KEY_TYPE_deleted) break this ordering, so we need to skip over them: ++ */ ++static void extent_iter_advance(struct sort_iter *iter, unsigned idx) ++{ ++ struct sort_iter_set *i = iter->data + idx; ++ ++ do { ++ i->k = bkey_next_skip_noops(i->k, i->end); ++ } while (i->k != i->end && bkey_deleted(i->k)); ++ ++ if (i->k == i->end) ++ array_remove_item(iter->data, iter->used, idx); ++ else ++ __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); ++} ++ ++struct btree_nr_keys ++bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) ++{ ++ struct btree *b = iter->b; ++ struct bkey_format *f = &b->format; ++ struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; ++ struct bkey_packed *out = dst->start; ++ struct bkey l_unpacked, r_unpacked; ++ struct bkey_s l, r; ++ struct btree_nr_keys nr; ++ struct bkey_on_stack split; ++ unsigned i; ++ ++ memset(&nr, 0, sizeof(nr)); ++ bkey_on_stack_init(&split); ++ ++ sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); ++ for (i = 0; i < iter->used;) { ++ if (bkey_deleted(iter->data[i].k)) ++ __sort_iter_advance(iter, i, ++ extent_sort_fix_overlapping_cmp); ++ else ++ i++; ++ } ++ ++ while (!sort_iter_end(iter)) { ++ l = __bkey_disassemble(b, _l->k, &l_unpacked); ++ ++ if (iter->used == 1) { ++ extent_sort_append(c, f, &nr, &out, l); ++ extent_iter_advance(iter, 0); ++ continue; ++ } ++ ++ r = __bkey_disassemble(b, _r->k, &r_unpacked); ++ ++ /* If current key and next key don't overlap, just append */ ++ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { ++ extent_sort_append(c, f, &nr, &out, l); ++ extent_iter_advance(iter, 0); ++ continue; ++ } ++ ++ /* Skip 0 size keys */ ++ if (!r.k->size) { ++ extent_iter_advance(iter, 1); ++ continue; ++ } ++ ++ /* ++ * overlap: keep the newer key and trim the older key so they ++ * don't overlap. comparing pointers tells us which one is ++ * newer, since the bsets are appended one after the other. 
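/*
 * Illustrative sketch (editor's addition, not part of the patch): the
 * overlap cases that the code just below, in
 * bch2_extent_sort_fix_overlapping(), resolves when an older extent
 * overlaps a newer one - drop the older extent if it is fully covered,
 * trim its front or back, or split it in two when the newer extent lands
 * in its middle.  Extents are modelled here as half-open [start, end)
 * ranges over plain integers; the real code does this with
 * bch2_cut_front_s()/bch2_cut_back_s() and a bkey_on_stack for the split.
 */
#include <stdio.h>

struct range { unsigned long start, end; };  /* stand-in for an extent */

/*
 * Trim "older" against "newer".  Returns how many pieces of the older
 * extent survive (0, 1 or 2) and writes them to out[].
 */
static int trim_older(struct range older, struct range newer,
                      struct range *out)
{
    int n = 0;

    if (older.start < newer.start)      /* piece before the newer extent */
        out[n++] = (struct range) {
            older.start,
            newer.start < older.end ? newer.start : older.end,
        };
    if (older.end > newer.end)          /* piece after the newer extent  */
        out[n++] = (struct range) {
            newer.end > older.start ? newer.end : older.start,
            older.end,
        };
    return n;   /* 0: fully covered, 1: trimmed, 2: split */
}

int main(void)
{
    struct range out[2];
    int n = trim_older((struct range) { 0, 10 },   /* older            */
                       (struct range) { 3, 6 },    /* newer, in middle */
                       out);

    for (int i = 0; i < n; i++)         /* prints [0,3) [6,10) */
        printf("[%lu,%lu) ", out[i].start, out[i].end);
    printf("\n");
    return 0;
}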
++ */ ++ ++ /* can't happen because of comparison func */ ++ BUG_ON(_l->k < _r->k && ++ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); ++ ++ if (_l->k > _r->k) { ++ /* l wins, trim r */ ++ if (bkey_cmp(l.k->p, r.k->p) >= 0) { ++ extent_iter_advance(iter, 1); ++ } else { ++ bch2_cut_front_s(l.k->p, r); ++ extent_save(b, _r->k, r.k); ++ __sort_iter_sift(iter, 1, ++ extent_sort_fix_overlapping_cmp); ++ } ++ } else if (bkey_cmp(l.k->p, r.k->p) > 0) { ++ ++ /* ++ * r wins, but it overlaps in the middle of l - split l: ++ */ ++ bkey_on_stack_reassemble(&split, c, l.s_c); ++ bch2_cut_back(bkey_start_pos(r.k), split.k); ++ ++ bch2_cut_front_s(r.k->p, l); ++ extent_save(b, _l->k, l.k); ++ ++ __sort_iter_sift(iter, 0, ++ extent_sort_fix_overlapping_cmp); ++ ++ extent_sort_append(c, f, &nr, &out, ++ bkey_i_to_s(split.k)); ++ } else { ++ bch2_cut_back_s(bkey_start_pos(r.k), l); ++ extent_save(b, _l->k, l.k); ++ } ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ ++ bkey_on_stack_exit(&split, c); ++ return nr; ++} ++ ++static inline int sort_extents_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ (int) bkey_deleted(l) - (int) bkey_deleted(r); ++} ++ ++unsigned bch2_sort_extents(struct bkey_packed *dst, ++ struct sort_iter *iter, ++ bool filter_whiteouts) ++{ ++ struct bkey_packed *in, *out = dst; ++ ++ sort_iter_sort(iter, sort_extents_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_extents_cmp))) { ++ if (bkey_deleted(in)) ++ continue; ++ ++ if (bkey_whiteout(in) && ++ (filter_whiteouts || !in->needs_whiteout)) ++ continue; ++ ++ bkey_copy(out, in); ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} ++ ++static inline int sort_extent_whiteouts_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ struct bkey ul = bkey_unpack_key(b, l); ++ struct bkey ur = bkey_unpack_key(b, r); ++ ++ return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); ++} ++ ++unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, ++ struct sort_iter *iter) ++{ ++ const struct bkey_format *f = &iter->b->format; ++ struct bkey_packed *in, *out = dst; ++ struct bkey_i l, r; ++ bool prev = false, l_packed = false; ++ u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); ++ u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); ++ u64 new_size; ++ ++ max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); ++ ++ sort_iter_sort(iter, sort_extent_whiteouts_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { ++ if (bkey_deleted(in)) ++ continue; ++ ++ EBUG_ON(bkeyp_val_u64s(f, in)); ++ EBUG_ON(in->type != KEY_TYPE_discard); ++ ++ r.k = bkey_unpack_key(iter->b, in); ++ ++ if (prev && ++ bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { ++ if (bkey_cmp(l.k.p, r.k.p) >= 0) ++ continue; ++ ++ new_size = l_packed ++ ? 
min(max_packed_size, max_packed_offset - ++ bkey_start_offset(&l.k)) ++ : KEY_SIZE_MAX; ++ ++ new_size = min(new_size, r.k.p.offset - ++ bkey_start_offset(&l.k)); ++ ++ BUG_ON(new_size < l.k.size); ++ ++ bch2_key_resize(&l.k, new_size); ++ ++ if (bkey_cmp(l.k.p, r.k.p) >= 0) ++ continue; ++ ++ bch2_cut_front(l.k.p, &r); ++ } ++ ++ if (prev) { ++ if (!bch2_bkey_pack(out, &l, f)) { ++ BUG_ON(l_packed); ++ bkey_copy(out, &l); ++ } ++ out = bkey_next(out); ++ } ++ ++ l = r; ++ prev = true; ++ l_packed = bkey_packed(in); ++ } ++ ++ if (prev) { ++ if (!bch2_bkey_pack(out, &l, f)) { ++ BUG_ON(l_packed); ++ bkey_copy(out, &l); ++ } ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} +diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h +new file mode 100644 +index 000000000000..458a051fdac5 +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_SORT_H ++#define _BCACHEFS_BKEY_SORT_H ++ ++struct sort_iter { ++ struct btree *b; ++ unsigned used; ++ unsigned size; ++ ++ struct sort_iter_set { ++ struct bkey_packed *k, *end; ++ } data[MAX_BSETS + 1]; ++}; ++ ++static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) ++{ ++ iter->b = b; ++ iter->used = 0; ++ iter->size = ARRAY_SIZE(iter->data); ++} ++ ++static inline void sort_iter_add(struct sort_iter *iter, ++ struct bkey_packed *k, ++ struct bkey_packed *end) ++{ ++ BUG_ON(iter->used >= iter->size); ++ ++ if (k != end) ++ iter->data[iter->used++] = (struct sort_iter_set) { k, end }; ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, ++ struct sort_iter *); ++struct btree_nr_keys ++bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, ++ struct sort_iter *); ++ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *, struct btree *, ++ struct btree_node_iter *, ++ struct bkey_format *, bool); ++struct btree_nr_keys ++bch2_sort_repack_merge(struct bch_fs *, ++ struct bset *, struct btree *, ++ struct btree_node_iter *, ++ struct bkey_format *, bool); ++ ++unsigned bch2_sort_keys(struct bkey_packed *, ++ struct sort_iter *, bool); ++unsigned bch2_sort_extents(struct bkey_packed *, ++ struct sort_iter *, bool); ++ ++unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, ++ struct sort_iter *); ++ ++#endif /* _BCACHEFS_BKEY_SORT_H */ +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +new file mode 100644 +index 000000000000..6fc91e6a35e8 +--- /dev/null ++++ b/fs/bcachefs/bset.c +@@ -0,0 +1,1803 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for working with individual keys, and sorted sets of keys with in a ++ * btree node ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "bset.h" ++#include "eytzinger.h" ++#include "util.h" ++ ++#include ++#include ++#include ++#include ++ ++/* hack.. 
*/ ++#include "alloc_types.h" ++#include ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, ++ struct btree *); ++ ++static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) ++{ ++ unsigned n = ARRAY_SIZE(iter->data); ++ ++ while (n && __btree_node_iter_set_end(iter, n - 1)) ++ --n; ++ ++ return n; ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) ++{ ++ unsigned offset = __btree_node_key_to_offset(b, k); ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (offset <= t->end_offset) { ++ EBUG_ON(offset < btree_bkey_first_offset(t)); ++ return t; ++ } ++ ++ BUG(); ++} ++ ++/* ++ * There are never duplicate live keys in the btree - but including keys that ++ * have been flagged as deleted (and will be cleaned up later) we _will_ see ++ * duplicates. ++ * ++ * Thus the sort order is: usual key comparison first, but for keys that compare ++ * equal the deleted key(s) come first, and the (at most one) live version comes ++ * last. ++ * ++ * The main reason for this is insertion: to handle overwrites, we first iterate ++ * over keys that compare equal to our insert key, and then insert immediately ++ * prior to the first key greater than the key we're inserting - our insert ++ * position will be after all keys that compare equal to our insert key, which ++ * by the time we actually do the insert will all be deleted. ++ */ ++ ++void bch2_dump_bset(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned set) ++{ ++ struct bkey_packed *_k, *_n; ++ struct bkey uk, n; ++ struct bkey_s_c k; ++ char buf[200]; ++ ++ if (!i->u64s) ++ return; ++ ++ for (_k = i->start; ++ _k < vstruct_last(i); ++ _k = _n) { ++ _n = bkey_next_skip_noops(_k, vstruct_last(i)); ++ ++ k = bkey_disassemble(b, _k, &uk); ++ if (c) ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ else ++ bch2_bkey_to_text(&PBUF(buf), k.k); ++ printk(KERN_ERR "block %u key %5zu: %s\n", set, ++ _k->_data - i->_data, buf); ++ ++ if (_n == vstruct_last(i)) ++ continue; ++ ++ n = bkey_unpack_key(b, _n); ++ ++ if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { ++ printk(KERN_ERR "Key skipped backwards\n"); ++ continue; ++ } ++ ++ if (!bkey_deleted(k.k) && ++ !bkey_cmp(n.p, k.k->p)) ++ printk(KERN_ERR "Duplicate keys\n"); ++ } ++} ++ ++void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ console_lock(); ++ for_each_bset(b, t) ++ bch2_dump_bset(c, b, bset(b, t), t - b->set); ++ console_unlock(); ++} ++ ++void bch2_dump_btree_node_iter(struct btree *b, ++ struct btree_node_iter *iter) ++{ ++ struct btree_node_iter_set *set; ++ ++ printk(KERN_ERR "btree node iter with %u/%u sets:\n", ++ __btree_node_iter_used(iter), b->nsets); ++ ++ btree_node_iter_for_each(iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk = bkey_unpack_key(b, k); ++ char buf[100]; ++ ++ bch2_bkey_to_text(&PBUF(buf), &uk); ++ printk(KERN_ERR "set %zu key %u: %s\n", ++ t - b->set, set->k, buf); ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ struct bset_tree *t; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr = { 0 }; ++ ++ for_each_bset(b, t) ++ bset_tree_for_each_key(b, t, k) ++ if (!bkey_whiteout(k)) ++ btree_keys_account_key_add(&nr, t - b->set, k); ++ ++ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); ++} ++ ++static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, ++ struct btree *b) ++{ ++ 
struct btree_node_iter iter = *_iter; ++ const struct bkey_packed *k, *n; ++ ++ k = bch2_btree_node_iter_peek_all(&iter, b); ++ __bch2_btree_node_iter_advance(&iter, b); ++ n = bch2_btree_node_iter_peek_all(&iter, b); ++ ++ bkey_unpack_key(b, k); ++ ++ if (n && ++ bkey_iter_cmp(b, k, n) > 0) { ++ struct btree_node_iter_set *set; ++ struct bkey ku = bkey_unpack_key(b, k); ++ struct bkey nu = bkey_unpack_key(b, n); ++ char buf1[80], buf2[80]; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&PBUF(buf1), &ku); ++ bch2_bkey_to_text(&PBUF(buf2), &nu); ++ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", ++ buf1, buf2); ++ printk(KERN_ERR "iter was:"); ++ ++ btree_node_iter_for_each(_iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ printk(" [%zi %zi]", t - b->set, ++ k->_data - bset(b, t)->_data); ++ } ++ panic("\n"); ++ } ++} ++ ++void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct btree_node_iter_set *set, *s2; ++ struct bkey_packed *k, *p; ++ struct bset_tree *t; ++ ++ if (bch2_btree_node_iter_end(iter)) ++ return; ++ ++ /* Verify no duplicates: */ ++ btree_node_iter_for_each(iter, set) ++ btree_node_iter_for_each(iter, s2) ++ BUG_ON(set != s2 && set->end == s2->end); ++ ++ /* Verify that set->end is correct: */ ++ btree_node_iter_for_each(iter, set) { ++ for_each_bset(b, t) ++ if (set->end == t->end_offset) ++ goto found; ++ BUG(); ++found: ++ BUG_ON(set->k < btree_bkey_first_offset(t) || ++ set->k >= t->end_offset); ++ } ++ ++ /* Verify iterator is sorted: */ ++ btree_node_iter_for_each(iter, set) ++ BUG_ON(set != iter->data && ++ btree_node_iter_cmp(b, set[-1], set[0]) > 0); ++ ++ k = bch2_btree_node_iter_peek_all(iter, b); ++ ++ for_each_bset(b, t) { ++ if (iter->data[0].end == t->end_offset) ++ continue; ++ ++ p = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ ++ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); ++ } ++} ++ ++void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, ++ struct bkey_packed *insert, unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); ++ struct bkey_packed *next = (void *) (where->_data + clobber_u64s); ++#if 0 ++ BUG_ON(prev && ++ bkey_iter_cmp(b, prev, insert) > 0); ++#else ++ if (prev && ++ bkey_iter_cmp(b, prev, insert) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, prev); ++ struct bkey k2 = bkey_unpack_key(b, insert); ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&PBUF(buf1), &k1); ++ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ ++ panic("prev > insert:\n" ++ "prev key %s\n" ++ "insert key %s\n", ++ buf1, buf2); ++ } ++#endif ++#if 0 ++ BUG_ON(next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0); ++#else ++ if (next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, insert); ++ struct bkey k2 = bkey_unpack_key(b, next); ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&PBUF(buf1), &k1); ++ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ ++ panic("insert > next:\n" ++ "insert key %s\n" ++ "next key %s\n", ++ buf1, buf2); ++ } ++#endif ++} ++ ++#else ++ ++static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, ++ struct btree *b) {} ++ ++#endif ++ ++/* Auxiliary search trees */ ++ ++#define 
BFLOAT_FAILED_UNPACKED U8_MAX ++#define BFLOAT_FAILED U8_MAX ++ ++struct bkey_float { ++ u8 exponent; ++ u8 key_offset; ++ u16 mantissa; ++}; ++#define BKEY_MANTISSA_BITS 16 ++ ++static unsigned bkey_float_byte_offset(unsigned idx) ++{ ++ return idx * sizeof(struct bkey_float); ++} ++ ++struct ro_aux_tree { ++ struct bkey_float f[0]; ++}; ++ ++struct rw_aux_tree { ++ u16 offset; ++ struct bpos k; ++}; ++ ++/* ++ * BSET_CACHELINE was originally intended to match the hardware cacheline size - ++ * it used to be 64, but I realized the lookup code would touch slightly less ++ * memory if it was 128. ++ * ++ * It definites the number of bytes (in struct bset) per struct bkey_float in ++ * the auxiliar search tree - when we're done searching the bset_float tree we ++ * have this many bytes left that we do a linear search over. ++ * ++ * Since (after level 5) every level of the bset_tree is on a new cacheline, ++ * we're touching one fewer cacheline in the bset tree in exchange for one more ++ * cacheline in the linear search - but the linear search might stop before it ++ * gets to the second cacheline. ++ */ ++ ++#define BSET_CACHELINE 128 ++ ++/* Space required for the btree node keys */ ++static inline size_t btree_keys_bytes(struct btree *b) ++{ ++ return PAGE_SIZE << b->page_order; ++} ++ ++static inline size_t btree_keys_cachelines(struct btree *b) ++{ ++ return btree_keys_bytes(b) / BSET_CACHELINE; ++} ++ ++static inline size_t btree_aux_data_bytes(struct btree *b) ++{ ++ return btree_keys_cachelines(b) * 8; ++} ++ ++static inline size_t btree_aux_data_u64s(struct btree *b) ++{ ++ return btree_aux_data_bytes(b) / sizeof(u64); ++} ++ ++static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) ++{ ++ BUG_ON(t->aux_data_offset == U16_MAX); ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return t->aux_data_offset; ++ case BSET_RO_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + ++ t->size * sizeof(u8), 8); ++ case BSET_RW_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned bset_aux_tree_buf_start(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return t == b->set ++ ? 
DIV_ROUND_UP(b->unpack_fn_len, 8) ++ : bset_aux_tree_buf_end(t - 1); ++} ++ ++static void *__aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return b->aux_data + t->aux_data_offset * 8; ++} ++ ++static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++static u8 *ro_aux_tree_prev(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); ++} ++ ++static struct bkey_float *bkey_float(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned idx) ++{ ++ return ro_aux_tree_base(b, t)->f + idx; ++} ++ ++static void bset_aux_tree_verify(struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ if (t->aux_data_offset == U16_MAX) ++ continue; ++ ++ BUG_ON(t != b->set && ++ t[-1].aux_data_offset == U16_MAX); ++ ++ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); ++ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); ++ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); ++ } ++#endif ++} ++ ++/* Memory allocation */ ++ ++void bch2_btree_keys_free(struct btree *b) ++{ ++ vfree(b->aux_data); ++ b->aux_data = NULL; ++} ++ ++#ifndef PAGE_KERNEL_EXEC ++# define PAGE_KERNEL_EXEC PAGE_KERNEL ++#endif ++ ++int bch2_btree_keys_alloc(struct btree *b, unsigned page_order, gfp_t gfp) ++{ ++ b->page_order = page_order; ++ b->aux_data = __vmalloc(btree_aux_data_bytes(b), gfp, ++ PAGE_KERNEL_EXEC); ++ if (!b->aux_data) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) ++{ ++ unsigned i; ++ ++ b->nsets = 0; ++ memset(&b->nr, 0, sizeof(b->nr)); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ b->expensive_debug_checks = expensive_debug_checks; ++#endif ++ for (i = 0; i < MAX_BSETS; i++) ++ b->set[i].data_offset = U16_MAX; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++/* Binary tree stuff for auxiliary search trees */ ++ ++/* ++ * Cacheline/offset <-> bkey pointer arithmetic: ++ * ++ * t->tree is a binary search tree in an array; each node corresponds to a key ++ * in one cacheline in t->set (BSET_CACHELINE bytes). ++ * ++ * This means we don't have to store the full index of the key that a node in ++ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and ++ * then bkey_float->m gives us the offset within that cacheline, in units of 8 ++ * bytes. ++ * ++ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to ++ * make this work. ++ * ++ * To construct the bfloat for an arbitrary key we need to know what the key ++ * immediately preceding it is: we have to check if the two keys differ in the ++ * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size ++ * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
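/*
 * Illustrative sketch (editor's addition, not part of the patch): searching
 * an implicit binary tree stored in eytzinger (breadth-first, 1-indexed)
 * order, the array layout the surrounding comment describes for the
 * read-only aux tree - node j's children live at 2j and 2j+1, so the
 * descent is pure index arithmetic.  Simplifications: full u64 keys are
 * compared instead of 16-bit bfloat mantissas, the node count is chosen so
 * no inorder <-> eytzinger correction (t->extra) is needed, and the final
 * per-cacheline linear search is omitted.
 */
#include <stdint.h>
#include <stdio.h>

/* fill tree[1..n] with the eytzinger ordering of sorted[0..n-1] */
static void eytzinger_build(const uint64_t *sorted, uint64_t *tree,
                            unsigned j, unsigned n, unsigned *in)
{
    if (j > n)
        return;
    eytzinger_build(sorted, tree, 2 * j, n, in);     /* left subtree  */
    tree[j] = sorted[(*in)++];                       /* this node     */
    eytzinger_build(sorted, tree, 2 * j + 1, n, in); /* right subtree */
}

/* eytzinger index (1-based) of the first node >= search, 0 if none */
static unsigned eytzinger_search(const uint64_t *tree, unsigned n,
                                 uint64_t search)
{
    unsigned j = 1, found = 0;

    while (j <= n) {
        if (tree[j] >= search) {
            found = j;
            j = 2 * j;          /* look left for a smaller match */
        } else {
            j = 2 * j + 1;      /* look right */
        }
    }
    return found;
}

int main(void)
{
    const uint64_t sorted[7] = { 10, 20, 30, 40, 50, 60, 70 };
    uint64_t tree[8] = { 0 };   /* index 0 unused: eytzinger1 is 1-indexed */
    unsigned in = 0;

    eytzinger_build(sorted, tree, 1, 7, &in);
    printf("first node >= 35 holds %llu\n",
           (unsigned long long) tree[eytzinger_search(tree, 7, 35)]);
    return 0;
}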
++ */ ++ ++static inline void *bset_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline) ++{ ++ return (void *) round_down((unsigned long) btree_bkey_first(b, t), ++ L1_CACHE_BYTES) + ++ cacheline * BSET_CACHELINE; ++} ++ ++static struct bkey_packed *cacheline_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ unsigned offset) ++{ ++ return bset_cacheline(b, t, cacheline) + offset * 8; ++} ++ ++static unsigned bkey_to_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ const struct bkey_packed *k) ++{ ++ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; ++} ++ ++static ssize_t __bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); ++} ++ ++static unsigned bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); ++ ++ EBUG_ON(m > U8_MAX); ++ return m; ++} ++ ++static inline struct bkey_packed *tree_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ return cacheline_to_bkey(b, t, ++ __eytzinger1_to_inorder(j, t->size, t->extra), ++ bkey_float(b, t, j)->key_offset); ++} ++ ++static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; ++ ++ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); ++} ++ ++static struct rw_aux_tree *rw_aux_tree(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++/* ++ * For the write set - the one we're currently inserting keys into - we don't ++ * maintain a full search tree, we just keep a simple lookup table in t->prev. ++ */ ++static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, ++ struct bset_tree *t, ++ unsigned j) ++{ ++ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); ++} ++ ++static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, ++ unsigned j, struct bkey_packed *k) ++{ ++ EBUG_ON(k >= btree_bkey_last(b, t)); ++ ++ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { ++ .offset = __btree_node_key_to_offset(b, k), ++ .k = bkey_unpack_pos(b, k), ++ }; ++} ++ ++static void bch2_bset_verify_rw_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ struct bkey_packed *k = btree_bkey_first(b, t); ++ unsigned j = 0; ++ ++ if (!btree_keys_expensive_checks(b)) ++ return; ++ ++ BUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ BUG_ON(t->size < 1); ++ BUG_ON(rw_aux_to_bkey(b, t, j) != k); ++ ++ goto start; ++ while (1) { ++ if (rw_aux_to_bkey(b, t, j) == k) { ++ BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, ++ bkey_unpack_pos(b, k))); ++start: ++ if (++j == t->size) ++ break; ++ ++ BUG_ON(rw_aux_tree(b, t)[j].offset <= ++ rw_aux_tree(b, t)[j - 1].offset); ++ } ++ ++ k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ BUG_ON(k >= btree_bkey_last(b, t)); ++ } ++} ++ ++/* returns idx of first entry >= offset: */ ++static unsigned rw_aux_tree_bsearch(struct btree *b, ++ struct bset_tree *t, ++ unsigned offset) ++{ ++ unsigned bset_offs = offset - btree_bkey_first_offset(t); ++ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); ++ unsigned idx = bset_u64s ? 
bset_offs * t->size / bset_u64s : 0; ++ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ EBUG_ON(!t->size); ++ EBUG_ON(idx > t->size); ++ ++ while (idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset) ++ idx++; ++ ++ while (idx && ++ rw_aux_tree(b, t)[idx - 1].offset >= offset) ++ idx--; ++ ++ EBUG_ON(idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset); ++ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); ++ EBUG_ON(idx + 1 < t->size && ++ rw_aux_tree(b, t)[idx].offset == ++ rw_aux_tree(b, t)[idx + 1].offset); ++ ++ return idx; ++} ++ ++static inline unsigned bkey_mantissa(const struct bkey_packed *k, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++ u64 v; ++ ++ EBUG_ON(!bkey_packed(k)); ++ ++ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); ++ ++ /* ++ * In little endian, we're shifting off low bits (and then the bits we ++ * want are at the low end), in big endian we're shifting off high bits ++ * (and then the bits we want are at the high end, so we shift them ++ * back down): ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ v >>= f->exponent & 7; ++#else ++ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; ++#endif ++ return (u16) v; ++} ++ ++static void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) ++{ ++ struct bkey_float *f = bkey_float(b, t, j); ++ struct bkey_packed *m = tree_to_bkey(b, t, j); ++ struct bkey_packed *l, *r; ++ unsigned mantissa; ++ int shift, exponent, high_bit; ++ ++ if (is_power_of_2(j)) { ++ l = min_key; ++ ++ if (!l->u64s) { ++ if (!bkey_pack_pos(l, b->data->min_key, b)) { ++ struct bkey_i tmp; ++ ++ bkey_init(&tmp.k); ++ tmp.k.p = b->data->min_key; ++ bkey_copy(l, &tmp); ++ } ++ } ++ } else { ++ l = tree_to_prev_bkey(b, t, j >> ffs(j)); ++ ++ EBUG_ON(m < l); ++ } ++ ++ if (is_power_of_2(j + 1)) { ++ r = max_key; ++ ++ if (!r->u64s) { ++ if (!bkey_pack_pos(r, t->max_key, b)) { ++ struct bkey_i tmp; ++ ++ bkey_init(&tmp.k); ++ tmp.k.p = t->max_key; ++ bkey_copy(r, &tmp); ++ } ++ } ++ } else { ++ r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); ++ ++ EBUG_ON(m > r); ++ } ++ ++ /* ++ * for failed bfloats, the lookup code falls back to comparing against ++ * the original key. 
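/*
 * Illustrative sketch (editor's addition, not part of the patch): the idea
 * behind the bkey_float mantissa that make_bfloat() and bkey_mantissa()
 * above compute.  Instead of comparing full keys during the tree descent,
 * each node stores a 16-bit window of its key starting at the greatest bit
 * in which the node's two boundary keys differ; comparing these windows
 * agrees with comparing full keys whenever the keys differ inside the
 * window, and otherwise the lookup falls back to the original key (the
 * failed-bfloat case described above).  Simplifications: bare u64 keys
 * rather than packed bkeys, and bits shifted in from below the key are not
 * forced to 1s the way the real code does to keep the approximation from
 * comparing smaller than the key.
 */
#include <stdint.h>
#include <stdio.h>

#define MANTISSA_BITS 16

/* bit position of the highest bit in which a and b differ (requires a != b) */
static unsigned greatest_differing_bit(uint64_t a, uint64_t b)
{
    return 63 - __builtin_clzll(a ^ b);
}

/* 16-bit window of k whose top bit sits at position high_bit */
static uint16_t mantissa(uint64_t k, unsigned high_bit)
{
    int shift = (int) high_bit - (MANTISSA_BITS - 1);

    return shift >= 0 ? (uint16_t) (k >> shift) : (uint16_t) (k << -shift);
}

int main(void)
{
    uint64_t l = 0x1000, r = 0x2000;    /* boundary keys around the node */
    uint64_t m = 0x1800;                /* the node's own key            */
    uint64_t search = 0x1234;
    unsigned high_bit = greatest_differing_bit(l, r);

    printf("high bit %u, mantissa(m) %#x, mantissa(search) %#x\n",
           high_bit, (unsigned) mantissa(m, high_bit),
           (unsigned) mantissa(search, high_bit));
    /* both comparisons agree: the search key differs inside the window */
    printf("full cmp %d, mantissa cmp %d\n",
           search < m, mantissa(search, high_bit) < mantissa(m, high_bit));
    return 0;
}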
++ */ ++ ++ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || ++ !b->nr_key_bits) { ++ f->exponent = BFLOAT_FAILED_UNPACKED; ++ return; ++ } ++ ++ /* ++ * The greatest differing bit of l and r is the first bit we must ++ * include in the bfloat mantissa we're creating in order to do ++ * comparisons - that bit always becomes the high bit of ++ * bfloat->mantissa, and thus the exponent we're calculating here is ++ * the position of what will become the low bit in bfloat->mantissa: ++ * ++ * Note that this may be negative - we may be running off the low end ++ * of the key: we handle this later: ++ */ ++ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), ++ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); ++ exponent = high_bit - (BKEY_MANTISSA_BITS - 1); ++ ++ /* ++ * Then we calculate the actual shift value, from the start of the key ++ * (k->_data), to get the key bits starting at exponent: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; ++ ++ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); ++#else ++ shift = high_bit_offset + ++ b->nr_key_bits - ++ exponent - ++ BKEY_MANTISSA_BITS; ++ ++ EBUG_ON(shift < KEY_PACKED_BITS_START); ++#endif ++ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); ++ ++ f->exponent = shift; ++ mantissa = bkey_mantissa(m, f, j); ++ ++ /* ++ * If we've got garbage bits, set them to all 1s - it's legal for the ++ * bfloat to compare larger than the original key, but not smaller: ++ */ ++ if (exponent < 0) ++ mantissa |= ~(~0U << -exponent); ++ ++ f->mantissa = mantissa; ++} ++ ++/* bytes remaining - only valid for last bset: */ ++static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ bset_aux_tree_verify(b); ++ ++ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); ++} ++ ++static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / ++ (sizeof(struct bkey_float) + sizeof(u8)); ++} ++ ++static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); ++} ++ ++static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *k; ++ ++ t->size = 1; ++ t->extra = BSET_RW_AUX_TREE_VAL; ++ rw_aux_tree(b, t)[0].offset = ++ __btree_node_key_to_offset(b, btree_bkey_first(b, t)); ++ ++ bset_tree_for_each_key(b, t, k) { ++ if (t->size == bset_rw_tree_capacity(b, t)) ++ break; ++ ++ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > ++ L1_CACHE_BYTES) ++ rw_aux_tree_set(b, t, t->size++, k); ++ } ++} ++ ++static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); ++ struct bkey_packed min_key, max_key; ++ unsigned j, cacheline = 1; ++ ++ /* signal to make_bfloat() that they're uninitialized: */ ++ min_key.u64s = max_key.u64s = 0; ++ ++ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), ++ bset_ro_tree_capacity(b, t)); ++retry: ++ if (t->size < 2) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ return; ++ } ++ ++ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; ++ ++ /* First we figure out where the first key in each cacheline is */ ++ eytzinger1_for_each(j, t->size) { ++ while (bkey_to_cacheline(b, t, k) < cacheline) ++ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ ++ if (k >= btree_bkey_last(b, t)) { ++ /* XXX: 
this path sucks */ ++ t->size--; ++ goto retry; ++ } ++ ++ ro_aux_tree_prev(b, t)[j] = prev->u64s; ++ bkey_float(b, t, j)->key_offset = ++ bkey_to_cacheline_offset(b, t, cacheline++, k); ++ ++ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); ++ EBUG_ON(tree_to_bkey(b, t, j) != k); ++ } ++ ++ while (k != btree_bkey_last(b, t)) ++ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ ++ t->max_key = bkey_unpack_pos(b, prev); ++ ++ /* Then we build the tree */ ++ eytzinger1_for_each(j, t->size) ++ make_bfloat(b, t, j, &min_key, &max_key); ++} ++ ++static void bset_alloc_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bset_tree *i; ++ ++ for (i = b->set; i != t; i++) ++ BUG_ON(bset_has_rw_aux_tree(i)); ++ ++ bch2_bset_set_no_aux_tree(b, t); ++ ++ /* round up to next cacheline: */ ++ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), ++ SMP_CACHE_BYTES / sizeof(u64)); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, ++ bool writeable) ++{ ++ if (writeable ++ ? bset_has_rw_aux_tree(t) ++ : bset_has_ro_aux_tree(t)) ++ return; ++ ++ bset_alloc_tree(b, t); ++ ++ if (!__bset_tree_capacity(b, t)) ++ return; ++ ++ if (writeable) ++ __build_rw_aux_tree(b, t); ++ else ++ __build_ro_aux_tree(b, t); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_init_first(struct btree *b, struct bset *i) ++{ ++ struct bset_tree *t; ++ ++ BUG_ON(b->nsets); ++ ++ memset(i, 0, sizeof(*i)); ++ get_random_bytes(&i->seq, sizeof(i->seq)); ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++void bch2_bset_init_next(struct bch_fs *c, struct btree *b, ++ struct btree_node_entry *bne) ++{ ++ struct bset *i = &bne->keys; ++ struct bset_tree *t; ++ ++ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); ++ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); ++ BUG_ON(b->nsets >= MAX_BSETS); ++ ++ memset(i, 0, sizeof(*i)); ++ i->seq = btree_bset_first(b)->seq; ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++/* ++ * find _some_ key in the same bset as @k that precedes @k - not necessarily the ++ * immediate predecessor: ++ */ ++static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct bkey_packed *p; ++ unsigned offset; ++ int j; ++ ++ EBUG_ON(k < btree_bkey_first(b, t) || ++ k > btree_bkey_last(b, t)); ++ ++ if (k == btree_bkey_first(b, t)) ++ return NULL; ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ p = btree_bkey_first(b, t); ++ break; ++ case BSET_RO_AUX_TREE: ++ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); ++ ++ do { ++ p = j ? tree_to_bkey(b, t, ++ __inorder_to_eytzinger1(j--, ++ t->size, t->extra)) ++ : btree_bkey_first(b, t); ++ } while (p >= k); ++ break; ++ case BSET_RW_AUX_TREE: ++ offset = __btree_node_key_to_offset(b, k); ++ j = rw_aux_tree_bsearch(b, t, offset); ++ p = j ? 
rw_aux_to_bkey(b, t, j - 1) ++ : btree_bkey_first(b, t); ++ break; ++ } ++ ++ return p; ++} ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k, ++ unsigned min_key_type) ++{ ++ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; ++ ++ while ((p = __bkey_prev(b, t, k)) && !ret) { ++ for (i = p; i != k; i = bkey_next_skip_noops(i, k)) ++ if (i->type >= min_key_type) ++ ret = i; ++ ++ k = p; ++ } ++ ++ if (btree_keys_expensive_checks(b)) { ++ BUG_ON(ret >= orig_k); ++ ++ for (i = ret ++ ? bkey_next_skip_noops(ret, orig_k) ++ : btree_bkey_first(b, t); ++ i != orig_k; ++ i = bkey_next_skip_noops(i, orig_k)) ++ BUG_ON(i->type >= min_key_type); ++ } ++ ++ return ret; ++} ++ ++/* Insert */ ++ ++static void rw_aux_tree_fix_invalidated_key(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ unsigned offset = __btree_node_key_to_offset(b, k); ++ unsigned j = rw_aux_tree_bsearch(b, t, offset); ++ ++ if (j < t->size && ++ rw_aux_tree(b, t)[j].offset == offset) ++ rw_aux_tree_set(b, t, j, k); ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++} ++ ++static void ro_aux_tree_fix_invalidated_key(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct bkey_packed min_key, max_key; ++ unsigned inorder, j; ++ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ /* signal to make_bfloat() that they're uninitialized: */ ++ min_key.u64s = max_key.u64s = 0; ++ ++ if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { ++ t->max_key = bkey_unpack_pos(b, k); ++ ++ for (j = 1; j < t->size; j = j * 2 + 1) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ ++ inorder = bkey_to_cacheline(b, t, k); ++ ++ if (inorder && ++ inorder < t->size) { ++ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ ++ if (k == tree_to_bkey(b, t, j)) { ++ /* Fix the node this key corresponds to */ ++ make_bfloat(b, t, j, &min_key, &max_key); ++ ++ /* Children for which this key is the right boundary */ ++ for (j = eytzinger1_left_child(j); ++ j < t->size; ++ j = eytzinger1_right_child(j)) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ } ++ ++ if (inorder + 1 < t->size) { ++ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); ++ ++ if (k == tree_to_prev_bkey(b, t, j)) { ++ make_bfloat(b, t, j, &min_key, &max_key); ++ ++ /* Children for which this key is the left boundary */ ++ for (j = eytzinger1_right_child(j); ++ j < t->size; ++ j = eytzinger1_left_child(j)) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ } ++} ++ ++/** ++ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been ++ * modified, fix any auxiliary search tree by remaking all the nodes in the ++ * auxiliary search tree that @k corresponds to ++ */ ++void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ break; ++ case BSET_RO_AUX_TREE: ++ ro_aux_tree_fix_invalidated_key(b, t, k); ++ break; ++ case BSET_RW_AUX_TREE: ++ rw_aux_tree_fix_invalidated_key(b, t, k); ++ break; ++ } ++} ++ ++static void bch2_bset_fix_lookup_table(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *_where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ int shift = new_u64s - clobber_u64s; ++ unsigned l, j, where = __btree_node_key_to_offset(b, _where); ++ ++ EBUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ /* returns first 
entry >= where */ ++ l = rw_aux_tree_bsearch(b, t, where); ++ ++ if (!l) /* never delete first entry */ ++ l++; ++ else if (l < t->size && ++ where < t->end_offset && ++ rw_aux_tree(b, t)[l].offset == where) ++ rw_aux_tree_set(b, t, l++, _where); ++ ++ /* l now > where */ ++ ++ for (j = l; ++ j < t->size && ++ rw_aux_tree(b, t)[j].offset < where + clobber_u64s; ++ j++) ++ ; ++ ++ if (j < t->size && ++ rw_aux_tree(b, t)[j].offset + shift == ++ rw_aux_tree(b, t)[l - 1].offset) ++ j++; ++ ++ memmove(&rw_aux_tree(b, t)[l], ++ &rw_aux_tree(b, t)[j], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[j]); ++ t->size -= j - l; ++ ++ for (j = l; j < t->size; j++) ++ rw_aux_tree(b, t)[j].offset += shift; ++ ++ EBUG_ON(l < t->size && ++ rw_aux_tree(b, t)[l].offset == ++ rw_aux_tree(b, t)[l - 1].offset); ++ ++ if (t->size < bset_rw_tree_capacity(b, t) && ++ (l < t->size ++ ? rw_aux_tree(b, t)[l].offset ++ : t->end_offset) - ++ rw_aux_tree(b, t)[l - 1].offset > ++ L1_CACHE_BYTES / sizeof(u64)) { ++ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); ++ struct bkey_packed *end = l < t->size ++ ? rw_aux_to_bkey(b, t, l) ++ : btree_bkey_last(b, t); ++ struct bkey_packed *k = start; ++ ++ while (1) { ++ k = bkey_next_skip_noops(k, end); ++ if (k == end) ++ break; ++ ++ if ((void *) k - (void *) start >= L1_CACHE_BYTES) { ++ memmove(&rw_aux_tree(b, t)[l + 1], ++ &rw_aux_tree(b, t)[l], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[l]); ++ t->size++; ++ rw_aux_tree_set(b, t, l, k); ++ break; ++ } ++ } ++ } ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_insert(struct btree *b, ++ struct btree_node_iter *iter, ++ struct bkey_packed *where, ++ struct bkey_i *insert, ++ unsigned clobber_u64s) ++{ ++ struct bkey_format *f = &b->format; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bkey_packed packed, *src = bkey_to_packed(insert); ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); ++ ++ if (bch2_bkey_pack_key(&packed, &insert->k, f)) ++ src = &packed; ++ ++ if (!bkey_whiteout(&insert->k)) ++ btree_keys_account_key_add(&b->nr, t - b->set, src); ++ ++ if (src->u64s != clobber_u64s) { ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data + src->u64s; ++ ++ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < ++ (int) clobber_u64s - src->u64s); ++ ++ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); ++ set_btree_bset_end(b, t); ++ } ++ ++ memcpy_u64s(where, src, ++ bkeyp_key_u64s(f, src)); ++ memcpy_u64s(bkeyp_val(f, where), &insert->v, ++ bkeyp_val_u64s(f, src)); ++ ++ if (src->u64s != clobber_u64s) ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_bset_delete(struct btree *b, ++ struct bkey_packed *where, ++ unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data; ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ ++ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); ++ ++ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); ++ set_btree_bset_end(b, t); ++ ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); ++} ++ ++/* Lookup */ ++ ++__flatten ++static struct bkey_packed *bset_search_write_set(const struct btree *b, ++ 
struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *packed_search) ++{ ++ unsigned l = 0, r = t->size; ++ ++ while (l + 1 != r) { ++ unsigned m = (l + r) >> 1; ++ ++ if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) ++ l = m; ++ else ++ r = m; ++ } ++ ++ return rw_aux_to_bkey(b, t, l); ++} ++ ++static inline void prefetch_four_cachelines(void *p) ++{ ++#ifdef CONFIG_X86_64 ++ asm(".intel_syntax noprefix;" ++ "prefetcht0 [%0 - 127 + 64 * 0];" ++ "prefetcht0 [%0 - 127 + 64 * 1];" ++ "prefetcht0 [%0 - 127 + 64 * 2];" ++ "prefetcht0 [%0 - 127 + 64 * 3];" ++ ".att_syntax prefix;" ++ : ++ : "r" (p + 127)); ++#else ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ prefetch(p + L1_CACHE_BYTES * 3); ++#endif ++} ++ ++static inline bool bkey_mantissa_bits_dropped(const struct btree *b, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; ++ ++ return f->exponent > key_bits_start; ++#else ++ unsigned key_bits_end = high_bit_offset + b->nr_key_bits; ++ ++ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; ++#endif ++} ++ ++__flatten ++static struct bkey_packed *bset_search_tree(const struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *packed_search) ++{ ++ struct ro_aux_tree *base = ro_aux_tree_base(b, t); ++ struct bkey_float *f; ++ struct bkey_packed *k; ++ unsigned inorder, n = 1, l, r; ++ int cmp; ++ ++ do { ++ if (likely(n << 4 < t->size)) ++ prefetch(&base->f[n << 4]); ++ ++ f = &base->f[n]; ++ ++ if (!unlikely(packed_search)) ++ goto slowpath; ++ if (unlikely(f->exponent >= BFLOAT_FAILED)) ++ goto slowpath; ++ ++ l = f->mantissa; ++ r = bkey_mantissa(packed_search, f, n); ++ ++ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) ++ goto slowpath; ++ ++ n = n * 2 + (l < r); ++ continue; ++slowpath: ++ k = tree_to_bkey(b, t, n); ++ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); ++ if (!cmp) ++ return k; ++ ++ n = n * 2 + (cmp < 0); ++ } while (n < t->size); ++ ++ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); ++ ++ /* ++ * n would have been the node we recursed to - the low bit tells us if ++ * we recursed left or recursed right. ++ */ ++ if (likely(!(n & 1))) { ++ --inorder; ++ if (unlikely(!inorder)) ++ return btree_bkey_first(b, t); ++ ++ f = &base->f[eytzinger1_prev(n >> 1, t->size)]; ++ } ++ ++ return cacheline_to_bkey(b, t, inorder, f->key_offset); ++} ++ ++static __always_inline __flatten ++struct bkey_packed *__bch2_bset_search(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *lossy_packed_search) ++{ ++ ++ /* ++ * First, we search for a cacheline, then lastly we do a linear search ++ * within that cacheline. ++ * ++ * To search for the cacheline, there's three different possibilities: ++ * * The set is too small to have a search tree, so we just do a linear ++ * search over the whole set. ++ * * The set is the one we're currently inserting into; keeping a full ++ * auxiliary search tree up to date would be too expensive, so we ++ * use a much simpler lookup table to do a binary search - ++ * bset_search_write_set(). 
++ * * Or we use the auxiliary search tree we constructed earlier - ++ * bset_search_tree() ++ */ ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return btree_bkey_first(b, t); ++ case BSET_RW_AUX_TREE: ++ return bset_search_write_set(b, t, search, lossy_packed_search); ++ case BSET_RO_AUX_TREE: ++ /* ++ * Each node in the auxiliary search tree covers a certain range ++ * of bits, and keys above and below the set it covers might ++ * differ outside those bits - so we have to special case the ++ * start and end - handle that here: ++ */ ++ ++ if (bkey_cmp(*search, t->max_key) > 0) ++ return btree_bkey_last(b, t); ++ ++ return bset_search_tree(b, t, search, lossy_packed_search); ++ default: ++ unreachable(); ++ } ++} ++ ++static __always_inline __flatten ++struct bkey_packed *bch2_bset_search_linear(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search, ++ struct bkey_packed *m) ++{ ++ if (lossy_packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_cmp_p_or_unp(b, m, ++ lossy_packed_search, search) < 0) ++ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); ++ ++ if (!packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_pos_cmp(b, m, search) < 0) ++ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); ++ ++ if (btree_keys_expensive_checks(b)) { ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); ++ ++ BUG_ON(prev && ++ bkey_iter_cmp_p_or_unp(b, prev, ++ packed_search, search) >= 0); ++ } ++ ++ return m; ++} ++ ++/* ++ * Returns the first key greater than or equal to @search ++ */ ++static __always_inline __flatten ++struct bkey_packed *bch2_bset_search(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search) ++{ ++ struct bkey_packed *m = __bch2_bset_search(b, t, search, ++ lossy_packed_search); ++ ++ return bch2_bset_search_linear(b, t, search, ++ packed_search, lossy_packed_search, m); ++} ++ ++/* Btree node iterator */ ++ ++static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ if (k != end) { ++ struct btree_node_iter_set *pos; ++ ++ btree_node_iter_for_each(iter, pos) ++ ; ++ ++ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); ++ *pos = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++} ++ ++void bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ __bch2_btree_node_iter_push(iter, b, k, end); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++noinline __flatten __attribute__((cold)) ++static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bset_tree *t; ++ ++ trace_bkey_pack_pos_fail(search); ++ ++ for_each_bset(b, t) ++ __bch2_btree_node_iter_push(iter, b, ++ bch2_bset_search(b, t, search, NULL, NULL), ++ btree_bkey_last(b, t)); ++ ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++/** ++ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a ++ * given position ++ * ++ * Main entry point to the lookup code for individual btree nodes: ++ * ++ * NOTE: ++ * ++ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate ++ * keys. 
This doesn't matter for most code, but it does matter for lookups. ++ * ++ * Some adjacent keys with a string of equal keys: ++ * i j k k k k l m ++ * ++ * If you search for k, the lookup code isn't guaranteed to return you any ++ * specific k. The lookup code is conceptually doing a binary search and ++ * iterating backwards is very expensive so if the pivot happens to land at the ++ * last k that's what you'll get. ++ * ++ * This works out ok, but it's something to be aware of: ++ * ++ * - For non extents, we guarantee that the live key comes last - see ++ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't ++ * see will only be deleted keys you don't care about. ++ * ++ * - For extents, deleted keys sort last (see the comment at the top of this ++ * file). But when you're searching for extents, you actually want the first ++ * key strictly greater than your search key - an extent that compares equal ++ * to the search key is going to have 0 sectors after the search key. ++ * ++ * But this does mean that we can't just search for ++ * bkey_successor(start_of_range) to get the first extent that overlaps with ++ * the range we want - if we're unlucky and there's an extent that ends ++ * exactly where we searched, then there could be a deleted key at the same ++ * position and we'd get that when we search instead of the preceding extent ++ * we needed. ++ * ++ * So we've got to search for start_of_range, then after the lookup iterate ++ * past any extents that compare equal to the position we searched for. ++ */ ++__flatten ++void bch2_btree_node_iter_init(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bkey_packed p, *packed_search = NULL; ++ struct btree_node_iter_set *pos = iter->data; ++ struct bkey_packed *k[MAX_BSETS]; ++ unsigned i; ++ ++ EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); ++ bset_aux_tree_verify(b); ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { ++ case BKEY_PACK_POS_EXACT: ++ packed_search = &p; ++ break; ++ case BKEY_PACK_POS_SMALLER: ++ packed_search = NULL; ++ break; ++ case BKEY_PACK_POS_FAIL: ++ btree_node_iter_init_pack_failed(iter, b, search); ++ return; ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ k[i] = __bch2_bset_search(b, b->set + i, search, &p); ++ prefetch_four_cachelines(k[i]); ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ struct bset_tree *t = b->set + i; ++ struct bkey_packed *end = btree_bkey_last(b, t); ++ ++ k[i] = bch2_bset_search_linear(b, t, search, ++ packed_search, &p, k[i]); ++ if (k[i] != end) ++ *pos++ = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k[i]), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++ ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ for_each_bset(b, t) ++ __bch2_btree_node_iter_push(iter, b, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) ++ return __btree_node_offset_to_key(b, set->k); ++ ++ return btree_bkey_last(b, t); ++} ++ ++static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned first) 
++{ ++ bool ret; ++ ++ if ((ret = (btree_node_iter_cmp(b, ++ iter->data[first], ++ iter->data[first + 1]) > 0))) ++ swap(iter->data[first], iter->data[first + 1]); ++ return ret; ++} ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ /* unrolled bubble sort: */ ++ ++ if (!__btree_node_iter_set_end(iter, 2)) { ++ btree_node_iter_sort_two(iter, b, 0); ++ btree_node_iter_sort_two(iter, b, 1); ++ } ++ ++ if (!__btree_node_iter_set_end(iter, 1)) ++ btree_node_iter_sort_two(iter, b, 0); ++} ++ ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, ++ struct btree_node_iter_set *set) ++{ ++ struct btree_node_iter_set *last = ++ iter->data + ARRAY_SIZE(iter->data) - 1; ++ ++ memmove(&set[0], &set[1], (void *) last - (void *) set); ++ *last = (struct btree_node_iter_set) { 0, 0 }; ++} ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; ++ ++ EBUG_ON(iter->data->k > iter->data->end); ++ ++ while (!__btree_node_iter_set_end(iter, 0) && ++ !__bch2_btree_node_iter_peek_all(iter, b)->u64s) ++ iter->data->k++; ++ ++ if (unlikely(__btree_node_iter_set_end(iter, 0))) { ++ bch2_btree_node_iter_set_drop(iter, iter->data); ++ return; ++ } ++ ++ if (__btree_node_iter_set_end(iter, 1)) ++ return; ++ ++ if (!btree_node_iter_sort_two(iter, b, 0)) ++ return; ++ ++ if (__btree_node_iter_set_end(iter, 2)) ++ return; ++ ++ btree_node_iter_sort_two(iter, b, 1); ++} ++ ++void bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ if (btree_keys_expensive_checks(b)) { ++ bch2_btree_node_iter_verify(iter, b); ++ bch2_btree_node_iter_next_check(iter, b); ++ } ++ ++ __bch2_btree_node_iter_advance(iter, b); ++} ++ ++/* ++ * Expensive: ++ */ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bkey_packed *k, *prev = NULL; ++ struct btree_node_iter_set *set; ++ struct bset_tree *t; ++ unsigned end = 0; ++ ++ if (btree_keys_expensive_checks(b)) ++ bch2_btree_node_iter_verify(iter, b); ++ ++ for_each_bset(b, t) { ++ k = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ if (k && ++ (!prev || bkey_iter_cmp(b, k, prev) > 0)) { ++ prev = k; ++ end = t->end_offset; ++ } ++ } ++ ++ if (!prev) ++ return NULL; ++ ++ /* ++ * We're manually memmoving instead of just calling sort() to ensure the ++ * prev we picked ends up in slot 0 - sort won't necessarily put it ++ * there because of duplicate deleted keys: ++ */ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == end) ++ goto found; ++ ++ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); ++found: ++ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); ++ ++ memmove(&iter->data[1], ++ &iter->data[0], ++ (void *) set - (void *) &iter->data[0]); ++ ++ iter->data[0].k = __btree_node_key_to_offset(b, prev); ++ iter->data[0].end = end; ++ ++ if (btree_keys_expensive_checks(b)) ++ bch2_btree_node_iter_verify(iter, b); ++ return prev; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned min_key_type) ++{ ++ struct bkey_packed *prev; ++ ++ do { ++ prev = bch2_btree_node_iter_prev_all(iter, b); ++ } while (prev && prev->type < min_key_type); ++ ++ return prev; ++} ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bkey *u) ++{ ++ struct bkey_packed *k = 
bch2_btree_node_iter_peek(iter, b); ++ ++ return k ? bkey_disassemble(b, k, u) : bkey_s_c_null; ++} ++ ++/* Mergesort */ ++ ++void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ enum bset_aux_tree_type type = bset_aux_tree_type(t); ++ size_t j; ++ ++ stats->sets[type].nr++; ++ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * ++ sizeof(u64); ++ ++ if (bset_has_ro_aux_tree(t)) { ++ stats->floats += t->size - 1; ++ ++ for (j = 1; j < t->size; j++) ++ stats->failed += ++ bkey_float(b, t, j)->exponent == ++ BFLOAT_FAILED; ++ } ++ } ++} ++ ++void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, ++ struct bkey_packed *k) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk; ++ unsigned j, inorder; ++ ++ if (out->pos != out->end) ++ *out->pos = '\0'; ++ ++ if (!bset_has_ro_aux_tree(t)) ++ return; ++ ++ inorder = bkey_to_cacheline(b, t, k); ++ if (!inorder || inorder >= t->size) ++ return; ++ ++ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ if (k != tree_to_bkey(b, t, j)) ++ return; ++ ++ switch (bkey_float(b, t, j)->exponent) { ++ case BFLOAT_FAILED: ++ uk = bkey_unpack_key(b, k); ++ pr_buf(out, ++ " failed unpacked at depth %u\n" ++ "\t%llu:%llu\n", ++ ilog2(j), ++ uk.p.inode, uk.p.offset); ++ break; ++ } ++} +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +new file mode 100644 +index 000000000000..652ffed4adfb +--- /dev/null ++++ b/fs/bcachefs/bset.h +@@ -0,0 +1,631 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BSET_H ++#define _BCACHEFS_BSET_H ++ ++#include ++#include ++ ++#include "bcachefs_format.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "util.h" /* for time_stats */ ++#include "vstructs.h" ++ ++/* ++ * BKEYS: ++ * ++ * A bkey contains a key, a size field, a variable number of pointers, and some ++ * ancillary flag bits. ++ * ++ * We use two different functions for validating bkeys, bkey_invalid and ++ * bkey_deleted(). ++ * ++ * The one exception to the rule that ptr_invalid() filters out invalid keys is ++ * that it also filters out keys of size 0 - these are keys that have been ++ * completely overwritten. It'd be safe to delete these in memory while leaving ++ * them on disk, just unnecessary work - so we filter them out when resorting ++ * instead. ++ * ++ * We can't filter out stale keys when we're resorting, because garbage ++ * collection needs to find them to ensure bucket gens don't wrap around - ++ * unless we're rewriting the btree node those stale keys still exist on disk. ++ * ++ * We also implement functions here for removing some number of sectors from the ++ * front or the back of a bkey - this is mainly used for fixing overlapping ++ * extents, by removing the overlapping sectors from the older key. ++ * ++ * BSETS: ++ * ++ * A bset is an array of bkeys laid out contiguously in memory in sorted order, ++ * along with a header. A btree node is made up of a number of these, written at ++ * different times. ++ * ++ * There could be many of them on disk, but we never allow there to be more than ++ * 4 in memory - we lazily resort as needed. ++ * ++ * We implement code here for creating and maintaining auxiliary search trees ++ * (described below) for searching an individial bset, and on top of that we ++ * implement a btree iterator. 
++ * ++ * BTREE ITERATOR: ++ * ++ * Most of the code in bcache doesn't care about an individual bset - it needs ++ * to search entire btree nodes and iterate over them in sorted order. ++ * ++ * The btree iterator code serves both functions; it iterates through the keys ++ * in a btree node in sorted order, starting from either keys after a specific ++ * point (if you pass it a search key) or the start of the btree node. ++ * ++ * AUXILIARY SEARCH TREES: ++ * ++ * Since keys are variable length, we can't use a binary search on a bset - we ++ * wouldn't be able to find the start of the next key. But binary searches are ++ * slow anyways, due to terrible cache behaviour; bcache originally used binary ++ * searches and that code topped out at under 50k lookups/second. ++ * ++ * So we need to construct some sort of lookup table. Since we only insert keys ++ * into the last (unwritten) set, most of the keys within a given btree node are ++ * usually in sets that are mostly constant. We use two different types of ++ * lookup tables to take advantage of this. ++ * ++ * Both lookup tables share in common that they don't index every key in the ++ * set; they index one key every BSET_CACHELINE bytes, and then a linear search ++ * is used for the rest. ++ * ++ * For sets that have been written to disk and are no longer being inserted ++ * into, we construct a binary search tree in an array - traversing a binary ++ * search tree in an array gives excellent locality of reference and is very ++ * fast, since both children of any node are adjacent to each other in memory ++ * (and their grandchildren, and great grandchildren...) - this means ++ * prefetching can be used to great effect. ++ * ++ * It's quite useful performance wise to keep these nodes small - not just ++ * because they're more likely to be in L2, but also because we can prefetch ++ * more nodes on a single cacheline and thus prefetch more iterations in advance ++ * when traversing this tree. ++ * ++ * Nodes in the auxiliary search tree must contain both a key to compare against ++ * (we don't want to fetch the key from the set, that would defeat the purpose), ++ * and a pointer to the key. We use a few tricks to compress both of these. ++ * ++ * To compress the pointer, we take advantage of the fact that one node in the ++ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have ++ * a function (to_inorder()) that takes the index of a node in a binary tree and ++ * returns what its index would be in an inorder traversal, so we only have to ++ * store the low bits of the offset. ++ * ++ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To ++ * compress that, we take advantage of the fact that when we're traversing the ++ * search tree at every iteration we know that both our search key and the key ++ * we're looking for lie within some range - bounded by our previous ++ * comparisons. (We special case the start of a search so that this is true even ++ * at the root of the tree). ++ * ++ * So we know the key we're looking for is between a and b, and a and b don't ++ * differ higher than bit 50, we don't need to check anything higher than bit ++ * 50. ++ * ++ * We don't usually need the rest of the bits, either; we only need enough bits ++ * to partition the key range we're currently checking. Consider key n - the ++ * key our auxiliary search tree node corresponds to, and key p, the key ++ * immediately preceding n. 
The lowest bit we need to store in the auxiliary ++ * search tree is the highest bit that differs between n and p. ++ * ++ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the ++ * comparison. But we'd really like our nodes in the auxiliary search tree to be ++ * of fixed size. ++ * ++ * The solution is to make them fixed size, and when we're constructing a node ++ * check if p and n differed in the bits we needed them to. If they don't, we ++ * flag that node, and when doing lookups we fall back to comparing against the ++ * real key. As long as this doesn't happen too often (and it seems to reliably ++ * happen a bit less than 1% of the time), we win - even on failures, that key ++ * is then more likely to be in cache than if we were doing binary searches all ++ * the way, since we're touching so much less memory. ++ * ++ * The keys in the auxiliary search tree are stored in (software) floating ++ * point, with an exponent and a mantissa. The exponent needs to be big enough ++ * to address all the bits in the original key, but the number of bits in the ++ * mantissa is somewhat arbitrary; more bits just gets us fewer failures. ++ * ++ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys ++ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. ++ * We need one node per 128 bytes in the btree node, which means the auxiliary ++ * search trees take up 3% as much memory as the btree itself. ++ * ++ * Constructing these auxiliary search trees is moderately expensive, and we ++ * don't want to be constantly rebuilding the search tree for the last set ++ * whenever we insert another key into it. For the unwritten set, we use a much ++ * simpler lookup table - it's just a flat array, so index i in the lookup table ++ * corresponds to the i'th range of BSET_CACHELINE bytes in the set. Indexing ++ * within each byte range works the same as with the auxiliary search trees. ++ * ++ * These are much easier to keep up to date when we insert a key - we do it ++ * somewhat lazily; when we shift a key up we usually just increment the pointer ++ * to it; only when it would overflow do we go to the trouble of finding the ++ * first key in that range of bytes again.
++ */ ++ ++extern bool bch2_expensive_debug_checks; ++ ++static inline bool btree_keys_expensive_checks(const struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ return bch2_expensive_debug_checks || *b->expensive_debug_checks; ++#else ++ return false; ++#endif ++} ++ ++enum bset_aux_tree_type { ++ BSET_NO_AUX_TREE, ++ BSET_RO_AUX_TREE, ++ BSET_RW_AUX_TREE, ++}; ++ ++#define BSET_TREE_NR_TYPES 3 ++ ++#define BSET_NO_AUX_TREE_VAL (U16_MAX) ++#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) ++ ++static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) ++{ ++ switch (t->extra) { ++ case BSET_NO_AUX_TREE_VAL: ++ EBUG_ON(t->size); ++ return BSET_NO_AUX_TREE; ++ case BSET_RW_AUX_TREE_VAL: ++ EBUG_ON(!t->size); ++ return BSET_RW_AUX_TREE; ++ default: ++ EBUG_ON(!t->size); ++ return BSET_RO_AUX_TREE; ++ } ++} ++ ++typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); ++ ++static inline void ++__bkey_unpack_key_format_checked(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ { ++ compiled_unpack_fn unpack_fn = b->aux_data; ++ unpack_fn(dst, src); ++ ++ if (btree_keys_expensive_checks(b)) { ++ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); ++ ++ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); ++ } ++ } ++#else ++ *dst = __bch2_bkey_unpack_key(&b->format, src); ++#endif ++} ++ ++static inline struct bkey ++bkey_unpack_key_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ struct bkey dst; ++ ++ __bkey_unpack_key_format_checked(b, &dst, src); ++ return dst; ++} ++ ++static inline void __bkey_unpack_key(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++ if (likely(bkey_packed(src))) ++ __bkey_unpack_key_format_checked(b, dst, src); ++ else ++ *dst = *packed_to_bkey_c(src); ++} ++ ++/** ++ * bkey_unpack_key -- unpack just the key, not the value ++ */ ++static inline struct bkey bkey_unpack_key(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? bkey_unpack_key_format_checked(b, src) ++ : *packed_to_bkey_c(src); ++} ++ ++static inline struct bpos ++bkey_unpack_pos_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ return bkey_unpack_key_format_checked(b, src).p; ++#else ++ return __bkey_unpack_pos(&b->format, src); ++#endif ++} ++ ++static inline struct bpos bkey_unpack_pos(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? 
bkey_unpack_pos_format_checked(b, src) ++ : packed_to_bkey_c(src)->p; ++} ++ ++/* Disassembled bkeys */ ++ ++static inline struct bkey_s_c bkey_disassemble(struct btree *b, ++ const struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; ++} ++ ++/* non const version: */ ++static inline struct bkey_s __bkey_disassemble(struct btree *b, ++ struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; ++} ++ ++#define for_each_bset(_b, _t) \ ++ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) ++ ++#define bset_tree_for_each_key(_b, _t, _k) \ ++ for (_k = btree_bkey_first(_b, _t); \ ++ _k != btree_bkey_last(_b, _t); \ ++ _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) ++ ++static inline bool bset_has_ro_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; ++} ++ ++static inline bool bset_has_rw_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; ++} ++ ++static inline void bch2_bset_set_no_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ BUG_ON(t < b->set); ++ ++ for (; t < b->set + ARRAY_SIZE(b->set); t++) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ t->aux_data_offset = U16_MAX; ++ } ++} ++ ++static inline void btree_node_set_format(struct btree *b, ++ struct bkey_format f) ++{ ++ int len; ++ ++ b->format = f; ++ b->nr_key_bits = bkey_format_key_bits(&f); ++ ++ len = bch2_compile_bkey_format(&b->format, b->aux_data); ++ BUG_ON(len < 0 || len > U8_MAX); ++ ++ b->unpack_fn_len = len; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++static inline struct bset *bset_next_set(struct btree *b, ++ unsigned block_bytes) ++{ ++ struct bset *i = btree_bset_last(b); ++ ++ EBUG_ON(!is_power_of_2(block_bytes)); ++ ++ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); ++} ++ ++void bch2_btree_keys_free(struct btree *); ++int bch2_btree_keys_alloc(struct btree *, unsigned, gfp_t); ++void bch2_btree_keys_init(struct btree *, bool *); ++ ++void bch2_bset_init_first(struct btree *, struct bset *); ++void bch2_bset_init_next(struct bch_fs *, struct btree *, ++ struct btree_node_entry *); ++void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); ++void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); ++ ++void bch2_bset_insert(struct btree *, struct btree_node_iter *, ++ struct bkey_packed *, struct bkey_i *, unsigned); ++void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); ++ ++/* Bkey utility code */ ++ ++/* packed or unpacked */ ++static inline int bkey_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) ++{ ++ EBUG_ON(r_packed && !bkey_packed(r_packed)); ++ ++ if (unlikely(!bkey_packed(l))) ++ return bkey_cmp(packed_to_bkey_c(l)->p, *r); ++ ++ if (likely(r_packed)) ++ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); ++ ++ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, ++ struct bkey_packed *, unsigned); ++ ++static inline struct bkey_packed * ++bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ return bch2_bkey_prev_filter(b, t, k, 0); ++} ++ ++static inline 
struct bkey_packed * ++bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); ++} ++ ++enum bch_extent_overlap { ++ BCH_EXTENT_OVERLAP_ALL = 0, ++ BCH_EXTENT_OVERLAP_BACK = 1, ++ BCH_EXTENT_OVERLAP_FRONT = 2, ++ BCH_EXTENT_OVERLAP_MIDDLE = 3, ++}; ++ ++/* Returns how k overlaps with m */ ++static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, ++ const struct bkey *m) ++{ ++ int cmp1 = bkey_cmp(k->p, m->p) < 0; ++ int cmp2 = bkey_cmp(bkey_start_pos(k), ++ bkey_start_pos(m)) > 0; ++ ++ return (cmp1 << 1) + cmp2; ++} ++ ++/* Btree key iteration */ ++ ++void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, ++ struct bpos *); ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, ++ struct btree *, ++ struct bset_tree *); ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *, ++ struct btree_node_iter_set *); ++void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); ++ ++#define btree_node_iter_for_each(_iter, _set) \ ++ for (_set = (_iter)->data; \ ++ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ ++ (_set)->k != (_set)->end; \ ++ _set++) ++ ++static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, ++ unsigned i) ++{ ++ return iter->data[i].k == iter->data[i].end; ++} ++ ++static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) ++{ ++ return __btree_node_iter_set_end(iter, 0); ++} ++ ++/* ++ * When keys compare equal, deleted keys compare first: ++ * ++ * XXX: only need to compare pointers for keys that are both within a ++ * btree_node_iterator - we need to break ties for prev() to work correctly ++ */ ++static inline int bkey_iter_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ++ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ++ ?: cmp_int(l, r); ++} ++ ++static inline int btree_node_iter_cmp(const struct btree *b, ++ struct btree_node_iter_set l, ++ struct btree_node_iter_set r) ++{ ++ return bkey_iter_cmp(b, ++ __btree_node_offset_to_key(b, l.k), ++ __btree_node_offset_to_key(b, r.k)); ++} ++ ++/* These assume r (the search key) is not a deleted key: */ ++static inline int bkey_iter_pos_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bkey_cmp_left_packed(b, l, r) ++ ?: -((int) bkey_deleted(l)); ++} ++ ++static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) ++{ ++ return bkey_cmp_p_or_unp(b, l, r_packed, r) ++ ?: -((int) bkey_deleted(l)); ++} ++ ++static inline struct bkey_packed * ++__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ return __btree_node_offset_to_key(b, iter->data->k); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned min_key_type) ++{ ++ while (!bch2_btree_node_iter_end(iter)) { ++ struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); ++ ++ if (k->type >= min_key_type) ++ 
return k; ++ ++ bch2_btree_node_iter_advance(iter, b); ++ } ++ ++ return NULL; ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ return bch2_btree_node_iter_peek_filter(iter, b, 0); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) ++{ ++ return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) ++{ ++ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); ++ ++ if (ret) ++ bch2_btree_node_iter_advance(iter, b); ++ ++ return ret; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, ++ struct btree *, unsigned); ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) ++{ ++ return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); ++} ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, ++ struct btree *, ++ struct bkey *); ++ ++#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ ++ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ ++ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ ++ bch2_btree_node_iter_advance(iter, b)) ++ ++/* Accounting: */ ++ ++static inline void btree_keys_account_key(struct btree_nr_keys *n, ++ unsigned bset, ++ struct bkey_packed *k, ++ int sign) ++{ ++ n->live_u64s += k->u64s * sign; ++ n->bset_u64s[bset] += k->u64s * sign; ++ ++ if (bkey_packed(k)) ++ n->packed_keys += sign; ++ else ++ n->unpacked_keys += sign; ++} ++ ++static inline void btree_keys_account_val_delta(struct btree *b, ++ struct bkey_packed *k, ++ int delta) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ ++ b->nr.live_u64s += delta; ++ b->nr.bset_u64s[t - b->set] += delta; ++} ++ ++#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, 1) ++#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, -1) ++ ++#define btree_account_key_add(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) ++#define btree_account_key_drop(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) ++ ++struct bset_stats { ++ struct { ++ size_t nr, bytes; ++ } sets[BSET_TREE_NR_TYPES]; ++ ++ size_t floats; ++ size_t failed; ++}; ++ ++void bch2_btree_keys_stats(struct btree *, struct bset_stats *); ++void bch2_bfloat_to_text(struct printbuf *, struct btree *, ++ struct bkey_packed *); ++ ++/* Debug stuff */ ++ ++void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); ++void bch2_dump_btree_node(struct bch_fs *, struct btree *); ++void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *); ++void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); ++void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, ++ struct bkey_packed *, unsigned); ++ ++#else ++ ++static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} ++static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) {} 
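++/*
++ * Editor's note: the helper below is an illustrative sketch and is not part
++ * of the original patch.  It shows how the node iterator API declared above
++ * is typically used: for_each_btree_node_key_unpack() initializes the
++ * iterator at the start of the node, then peeks and advances until the
++ * iterator is empty (deleted keys/whiteouts are skipped by
++ * bch2_btree_node_iter_peek()).  The helper's name is made up for this
++ * example.
++ */
++static inline unsigned bch2_btree_node_count_keys_sketch(struct btree *b)
++{
++	struct btree_node_iter iter;
++	struct bkey unpacked;
++	struct bkey_s_c k;
++	unsigned nr = 0;
++
++	/* count every key the iterator yields, in sorted order */
++	for_each_btree_node_key_unpack(b, k, &iter, &unpacked)
++		nr++;
++
++	return nr;
++}
++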
++static inline void bch2_verify_insert_pos(struct btree *b, ++ struct bkey_packed *where, ++ struct bkey_packed *insert, ++ unsigned clobber_u64s) {} ++#endif ++ ++static inline void bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ if (btree_keys_expensive_checks(b)) ++ __bch2_verify_btree_nr_keys(b); ++} ++ ++#endif /* _BCACHEFS_BSET_H */ +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +new file mode 100644 +index 000000000000..d3addd3a8964 +--- /dev/null ++++ b/fs/bcachefs/btree_cache.c +@@ -0,0 +1,1054 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "debug.h" ++ ++#include ++#include ++#include ++ ++const char * const bch2_btree_ids[] = { ++#define x(kwd, val, name) name, ++ BCH_BTREE_IDS() ++#undef x ++ NULL ++}; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *c) ++{ ++ unsigned i, reserve = 16; ++ ++ if (!c->btree_roots[0].b) ++ reserve += 8; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ reserve += min_t(unsigned, 1, ++ c->btree_roots[i].b->c.level) * 8; ++ ++ c->btree_cache.reserve = reserve; ++} ++ ++static inline unsigned btree_cache_can_free(struct btree_cache *bc) ++{ ++ return max_t(int, 0, bc->used - bc->reserve); ++} ++ ++static void __btree_node_data_free(struct bch_fs *c, struct btree *b) ++{ ++ EBUG_ON(btree_node_write_in_flight(b)); ++ ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ bch2_btree_keys_free(b); ++} ++ ++static void btree_node_data_free(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ __btree_node_data_free(c, b); ++ bc->used--; ++ list_move(&b->list, &bc->freed); ++} ++ ++static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct btree *b = obj; ++ const u64 *v = arg->key; ++ ++ return b->hash_val == *v ? 0 : 1; ++} ++ ++static const struct rhashtable_params bch_btree_cache_params = { ++ .head_offset = offsetof(struct btree, hash), ++ .key_offset = offsetof(struct btree, hash_val), ++ .key_len = sizeof(u64), ++ .obj_cmpfn = bch2_btree_cache_cmp_fn, ++}; ++ ++static int __btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++{ ++ BUG_ON(b->data || b->aux_data); ++ ++ b->data = kvpmalloc(btree_bytes(c), gfp); ++ if (!b->data) ++ return -ENOMEM; ++ ++ if (bch2_btree_keys_alloc(b, btree_page_order(c), gfp)) { ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static void btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (!__btree_node_data_alloc(c, b, gfp)) { ++ bc->used++; ++ list_move(&b->list, &bc->freeable); ++ } else { ++ list_move(&b->list, &bc->freed); ++ } ++} ++ ++static struct btree *btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) ++{ ++ struct btree *b = kzalloc(sizeof(struct btree), gfp); ++ if (!b) ++ return NULL; ++ ++ bkey_btree_ptr_init(&b->key); ++ six_lock_init(&b->c.lock); ++ INIT_LIST_HEAD(&b->list); ++ INIT_LIST_HEAD(&b->write_blocked); ++ ++ btree_node_data_alloc(c, b, gfp); ++ return b->data ? 
b : NULL; ++} ++ ++/* Btree in memory cache - hash table */ ++ ++void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) ++{ ++ rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); ++ ++ /* Cause future lookups for this node to fail: */ ++ b->hash_val = 0; ++} ++ ++int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) ++{ ++ BUG_ON(b->hash_val); ++ b->hash_val = btree_ptr_hash_val(&b->key); ++ ++ return rhashtable_lookup_insert_fast(&bc->table, &b->hash, ++ bch_btree_cache_params); ++} ++ ++int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, ++ unsigned level, enum btree_id id) ++{ ++ int ret; ++ ++ b->c.level = level; ++ b->c.btree_id = id; ++ ++ mutex_lock(&bc->lock); ++ ret = __bch2_btree_node_hash_insert(bc, b); ++ if (!ret) ++ list_add(&b->list, &bc->live); ++ mutex_unlock(&bc->lock); ++ ++ return ret; ++} ++ ++__flatten ++static inline struct btree *btree_cache_find(struct btree_cache *bc, ++ const struct bkey_i *k) ++{ ++ u64 v = btree_ptr_hash_val(k); ++ ++ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); ++} ++ ++/* ++ * this version is for btree nodes that have already been freed (we're not ++ * reaping a real btree node) ++ */ ++static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ int ret = 0; ++ ++ lockdep_assert_held(&bc->lock); ++ ++ if (!six_trylock_intent(&b->c.lock)) ++ return -ENOMEM; ++ ++ if (!six_trylock_write(&b->c.lock)) ++ goto out_unlock_intent; ++ ++ if (btree_node_noevict(b)) ++ goto out_unlock; ++ ++ if (!btree_node_may_write(b)) ++ goto out_unlock; ++ ++ if (btree_node_dirty(b) && ++ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ goto out_unlock; ++ ++ if (btree_node_dirty(b) || ++ btree_node_write_in_flight(b) || ++ btree_node_read_in_flight(b)) { ++ if (!flush) ++ goto out_unlock; ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ /* ++ * Using the underscore version because we don't want to compact ++ * bsets after the write, since this node is about to be evicted ++ * - unless btree verify mode is enabled, since it runs out of ++ * the post write cleanup: ++ */ ++ if (verify_btree_ondisk(c)) ++ bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ else ++ __bch2_btree_node_write(c, b, SIX_LOCK_read); ++ ++ /* wait for any in flight btree write */ ++ btree_node_wait_on_io(b); ++ } ++out: ++ if (b->hash_val && !ret) ++ trace_btree_node_reap(c, b); ++ return ret; ++out_unlock: ++ six_unlock_write(&b->c.lock); ++out_unlock_intent: ++ six_unlock_intent(&b->c.lock); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int btree_node_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, false); ++} ++ ++static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, true); ++} ++ ++static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b, *t; ++ unsigned long nr = sc->nr_to_scan; ++ unsigned long can_free; ++ unsigned long touched = 0; ++ unsigned long freed = 0; ++ unsigned i; ++ ++ if (btree_shrinker_disabled(c)) ++ return SHRINK_STOP; ++ ++ /* Return -1 if we can't do anything right now */ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ 
return -1; ++ ++ /* ++ * It's _really_ critical that we don't free too many btree nodes - we ++ * have to always leave ourselves a reserve. The reserve is how we ++ * guarantee that allocating memory for a new btree node can always ++ * succeed, so that inserting keys into the btree can always succeed and ++ * IO can always make forward progress: ++ */ ++ nr /= btree_pages(c); ++ can_free = btree_cache_can_free(bc); ++ nr = min_t(unsigned long, nr, can_free); ++ ++ i = 0; ++ list_for_each_entry_safe(b, t, &bc->freeable, list) { ++ touched++; ++ ++ if (freed >= nr) ++ break; ++ ++ if (++i > 3 && ++ !btree_node_reclaim(c, b)) { ++ btree_node_data_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ freed++; ++ } ++ } ++restart: ++ list_for_each_entry_safe(b, t, &bc->live, list) { ++ touched++; ++ ++ if (freed >= nr) { ++ /* Save position */ ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ break; ++ } ++ ++ if (!btree_node_accessed(b) && ++ !btree_node_reclaim(c, b)) { ++ /* can't call bch2_btree_node_hash_remove under lock */ ++ freed++; ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ ++ btree_node_data_free(c, b); ++ mutex_unlock(&bc->lock); ++ ++ bch2_btree_node_hash_remove(bc, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ if (freed >= nr) ++ goto out; ++ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ goto out; ++ goto restart; ++ } else ++ clear_btree_node_accessed(b); ++ } ++ ++ mutex_unlock(&bc->lock); ++out: ++ return (unsigned long) freed * btree_pages(c); ++} ++ ++static unsigned long bch2_btree_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (btree_shrinker_disabled(c)) ++ return 0; ++ ++ return btree_cache_can_free(bc) * btree_pages(c); ++} ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ unsigned i; ++ ++ if (bc->shrink.list.next) ++ unregister_shrinker(&bc->shrink); ++ ++ mutex_lock(&bc->lock); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ if (c->verify_data) ++ list_move(&c->verify_data->list, &bc->live); ++ ++ kvpfree(c->verify_ondisk, btree_bytes(c)); ++#endif ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ list_add(&c->btree_roots[i].b->list, &bc->live); ++ ++ list_splice(&bc->freeable, &bc->live); ++ ++ while (!list_empty(&bc->live)) { ++ b = list_first_entry(&bc->live, struct btree, list); ++ ++ BUG_ON(btree_node_read_in_flight(b) || ++ btree_node_write_in_flight(b)); ++ ++ if (btree_node_dirty(b)) ++ bch2_btree_complete_write(c, b, btree_current_write(b)); ++ clear_btree_node_dirty(b); ++ ++ btree_node_data_free(c, b); ++ } ++ ++ while (!list_empty(&bc->freed)) { ++ b = list_first_entry(&bc->freed, struct btree, list); ++ list_del(&b->list); ++ kfree(b); ++ } ++ ++ mutex_unlock(&bc->lock); ++ ++ if (bc->table_init_done) ++ rhashtable_destroy(&bc->table); ++} ++ ++int bch2_fs_btree_cache_init(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ unsigned i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ ret = rhashtable_init(&bc->table, &bch_btree_cache_params); ++ if (ret) ++ goto out; ++ ++ bc->table_init_done = true; ++ ++ bch2_recalc_btree_reserve(c); ++ ++ for (i = 0; i < bc->reserve; i++) ++ if (!btree_node_mem_alloc(c, GFP_KERNEL)) { ++ 
ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_splice_init(&bc->live, &bc->freeable); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ mutex_init(&c->verify_lock); ++ ++ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); ++ if (!c->verify_ondisk) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ c->verify_data = btree_node_mem_alloc(c, GFP_KERNEL); ++ if (!c->verify_data) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_del_init(&c->verify_data->list); ++#endif ++ ++ bc->shrink.count_objects = bch2_btree_cache_count; ++ bc->shrink.scan_objects = bch2_btree_cache_scan; ++ bc->shrink.seeks = 4; ++ bc->shrink.batch = btree_pages(c) * 2; ++ register_shrinker(&bc->shrink); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++void bch2_fs_btree_cache_init_early(struct btree_cache *bc) ++{ ++ mutex_init(&bc->lock); ++ INIT_LIST_HEAD(&bc->live); ++ INIT_LIST_HEAD(&bc->freeable); ++ INIT_LIST_HEAD(&bc->freed); ++} ++ ++/* ++ * We can only have one thread cannibalizing other cached btree nodes at a time, ++ * or we'll deadlock. We use an open coded mutex to ensure that, which a ++ * cannibalize_bucket() will take. This means every time we unlock the root of ++ * the btree, we need to release this lock if we have it held. ++ */ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (bc->alloc_lock == current) { ++ trace_btree_node_cannibalize_unlock(c); ++ bc->alloc_lock = NULL; ++ closure_wake_up(&bc->alloc_wait); ++ } ++} ++ ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct task_struct *old; ++ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) ++ goto success; ++ ++ if (!cl) { ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -ENOMEM; ++ } ++ ++ closure_wait(&bc->alloc_wait, cl); ++ ++ /* Try again, after adding ourselves to waitlist */ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) { ++ /* We raced */ ++ closure_wake_up(&bc->alloc_wait); ++ goto success; ++ } ++ ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -EAGAIN; ++ ++success: ++ trace_btree_node_cannibalize_lock(c); ++ return 0; ++} ++ ++static struct btree *btree_node_cannibalize(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_reclaim(c, b)) ++ return b; ++ ++ while (1) { ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_write_and_reclaim(c, b)) ++ return b; ++ ++ /* ++ * Rare case: all nodes were intent-locked. ++ * Just busy-wait. ++ */ ++ WARN_ONCE(1, "btree cache cannibalize failed\n"); ++ cond_resched(); ++ } ++} ++ ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ u64 start_time = local_clock(); ++ unsigned flags; ++ ++ flags = memalloc_nofs_save(); ++ mutex_lock(&bc->lock); ++ ++ /* ++ * btree_free() doesn't free memory; it sticks the node on the end of ++ * the list. Check if there's any freed nodes there: ++ */ ++ list_for_each_entry(b, &bc->freeable, list) ++ if (!btree_node_reclaim(c, b)) ++ goto got_node; ++ ++ /* ++ * We never free struct btree itself, just the memory that holds the on ++ * disk node. 
Check the freed list before allocating a new one: ++ */ ++ list_for_each_entry(b, &bc->freed, list) ++ if (!btree_node_reclaim(c, b)) ++ goto got_node; ++ ++ b = NULL; ++got_node: ++ if (b) ++ list_del_init(&b->list); ++ mutex_unlock(&bc->lock); ++ ++ if (!b) { ++ b = kzalloc(sizeof(struct btree), GFP_KERNEL); ++ if (!b) ++ goto err; ++ ++ bkey_btree_ptr_init(&b->key); ++ six_lock_init(&b->c.lock); ++ INIT_LIST_HEAD(&b->list); ++ INIT_LIST_HEAD(&b->write_blocked); ++ ++ BUG_ON(!six_trylock_intent(&b->c.lock)); ++ BUG_ON(!six_trylock_write(&b->c.lock)); ++ } ++ ++ if (!b->data) { ++ if (__btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) ++ goto err; ++ ++ mutex_lock(&bc->lock); ++ bc->used++; ++ mutex_unlock(&bc->lock); ++ } ++ ++ BUG_ON(btree_node_hashed(b)); ++ BUG_ON(btree_node_write_in_flight(b)); ++out: ++ b->flags = 0; ++ b->written = 0; ++ b->nsets = 0; ++ b->sib_u64s[0] = 0; ++ b->sib_u64s[1] = 0; ++ b->whiteout_u64s = 0; ++ bch2_btree_keys_init(b, &c->expensive_debug_checks); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], ++ start_time); ++ ++ memalloc_nofs_restore(flags); ++ return b; ++err: ++ mutex_lock(&bc->lock); ++ ++ if (b) { ++ list_add(&b->list, &bc->freed); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ /* Try to cannibalize another cached btree node: */ ++ if (bc->alloc_lock == current) { ++ b = btree_node_cannibalize(c); ++ list_del_init(&b->list); ++ mutex_unlock(&bc->lock); ++ ++ bch2_btree_node_hash_remove(bc, b); ++ ++ trace_btree_node_cannibalize(c); ++ goto out; ++ } ++ ++ mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); ++ return ERR_PTR(-ENOMEM); ++} ++ ++/* Slowpath, don't want it inlined into btree_iter_traverse() */ ++static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, ++ struct btree_iter *iter, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level, ++ enum six_lock_type lock_type, ++ bool sync) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ BUG_ON(level + 1 >= BTREE_MAX_DEPTH); ++ /* ++ * Parent node must be locked, else we could read in a btree node that's ++ * been freed: ++ */ ++ if (iter && !bch2_btree_node_relock(iter, level + 1)) ++ return ERR_PTR(-EINTR); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ if (IS_ERR(b)) ++ return b; ++ ++ bkey_copy(&b->key, k); ++ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { ++ /* raced with another fill: */ ++ ++ /* mark as unhashed... */ ++ b->hash_val = 0; ++ ++ mutex_lock(&bc->lock); ++ list_add(&b->list, &bc->freeable); ++ mutex_unlock(&bc->lock); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ return NULL; ++ } ++ ++ /* ++ * Unlock before doing IO: ++ * ++ * XXX: ideally should be dropping all btree node locks here ++ */ ++ if (iter && btree_node_read_locked(iter, level + 1)) ++ btree_node_unlock(iter, level + 1); ++ ++ bch2_btree_node_read(c, b, sync); ++ ++ six_unlock_write(&b->c.lock); ++ ++ if (!sync) { ++ six_unlock_intent(&b->c.lock); ++ return NULL; ++ } ++ ++ if (lock_type == SIX_LOCK_read) ++ six_lock_downgrade(&b->c.lock); ++ ++ return b; ++} ++ ++static int lock_node_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, c.lock); ++ const struct bkey_i *k = p; ++ ++ return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; ++} ++ ++/** ++ * bch_btree_node_get - find a btree node in the cache and lock it, reading it ++ * in from disk if necessary. 
++ * ++ * If IO is necessary and running under generic_make_request, returns -EAGAIN. ++ * ++ * The btree node will have either a read or a write lock held, depending on ++ * the @write parameter. ++ */ ++struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, ++ const struct bkey_i *k, unsigned level, ++ enum six_lock_type lock_type) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ /* ++ * We must have the parent locked to call bch2_btree_node_fill(), ++ * else we could read in a btree node from disk that's been ++ * freed: ++ */ ++ b = bch2_btree_node_fill(c, iter, k, iter->btree_id, ++ level, lock_type, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ return b; ++ } else { ++lock_node: ++ /* ++ * There's a potential deadlock with splits and insertions into ++ * interior nodes we have to avoid: ++ * ++ * The other thread might be holding an intent lock on the node ++ * we want, and they want to update its parent node so they're ++ * going to upgrade their intent lock on the parent node to a ++ * write lock. ++ * ++ * But if we're holding a read lock on the parent, and we're ++ * trying to get the intent lock they're holding, we deadlock. ++ * ++ * So to avoid this we drop the read locks on parent nodes when ++ * we're starting to take intent locks - and handle the race. ++ * ++ * The race is that they might be about to free the node we ++ * want, and dropping our read lock on the parent node lets them ++ * update the parent marking the node we want as freed, and then ++ * free it: ++ * ++ * To guard against this, btree nodes are evicted from the cache ++ * when they're freed - and b->hash_val is zeroed out, which we ++ * check for after we lock the node. 
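++ * (That check is lock_node_check_fn(), passed to btree_node_lock() below.)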
++ * ++ * Then, bch2_btree_node_relock() on the parent will fail - because ++ * the parent was modified, when the pointer to the node we want ++ * was removed - and we'll bail out: ++ */ ++ if (btree_node_read_locked(iter, level + 1)) ++ btree_node_unlock(iter, level + 1); ++ ++ if (!btree_node_lock(b, k->k.p, level, iter, lock_type, ++ lock_node_check_fn, (void *) k)) { ++ if (b->hash_val != btree_ptr_hash_val(k)) ++ goto retry; ++ return ERR_PTR(-EINTR); ++ } ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->c.level != level || ++ race_fault())) { ++ six_unlock_type(&b->c.lock, lock_type); ++ if (bch2_btree_node_relock(iter, level + 1)) ++ goto retry; ++ ++ trace_trans_restart_btree_node_reused(iter->trans->ip); ++ return ERR_PTR(-EINTR); ++ } ++ } ++ ++ /* XXX: waiting on IO with btree locks held: */ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_type(&b->c.lock, lock_type); ++ return ERR_PTR(-EIO); ++ } ++ ++ EBUG_ON(b->c.btree_id != iter->btree_id || ++ BTREE_NODE_LEVEL(b->data) != level || ++ bkey_cmp(b->data->max_key, k->k.p)); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ int ret; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ b = bch2_btree_node_fill(c, NULL, k, btree_id, ++ level, SIX_LOCK_read, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ return b; ++ } else { ++lock_node: ++ ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); ++ if (ret) ++ goto retry; ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->c.btree_id != btree_id || ++ b->c.level != level)) { ++ six_unlock_read(&b->c.lock); ++ goto retry; ++ } ++ } ++ ++ /* XXX: waiting on IO with btree locks held: */ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_read(&b->c.lock); ++ return ERR_PTR(-EIO); ++ } ++ ++ EBUG_ON(b->c.btree_id != btree_id || ++ BTREE_NODE_LEVEL(b->data) != level || ++ bkey_cmp(b->data->max_key, k->k.p)); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, ++ struct btree_iter *iter, ++ struct btree *b, ++ enum btree_node_sibling sib) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *parent; ++ struct btree_node_iter node_iter; ++ struct bkey_packed *k; ++ BKEY_PADDED(k) tmp; ++ struct btree *ret = NULL; ++ unsigned level = b->c.level; ++ ++ parent = 
btree_iter_node(iter, level + 1); ++ if (!parent) ++ return NULL; ++ ++ /* ++ * There's a corner case where a btree_iter might have a node locked ++ * that is just outside its current pos - when ++ * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. ++ * ++ * But the lock ordering checks in __bch2_btree_node_lock() go off of ++ * iter->pos, not the node's key: so if the iterator is marked as ++ * needing to be traversed, we risk deadlock if we don't bail out here: ++ */ ++ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) ++ return ERR_PTR(-EINTR); ++ ++ if (!bch2_btree_node_relock(iter, level + 1)) { ++ ret = ERR_PTR(-EINTR); ++ goto out; ++ } ++ ++ node_iter = iter->l[parent->c.level].iter; ++ ++ k = bch2_btree_node_iter_peek_all(&node_iter, parent); ++ BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); ++ ++ k = sib == btree_prev_sib ++ ? bch2_btree_node_iter_prev(&node_iter, parent) ++ : (bch2_btree_node_iter_advance(&node_iter, parent), ++ bch2_btree_node_iter_peek(&node_iter, parent)); ++ if (!k) ++ goto out; ++ ++ bch2_bkey_unpack(parent, &tmp.k, k); ++ ++ ret = bch2_btree_node_get(c, iter, &tmp.k, level, ++ SIX_LOCK_intent); ++ ++ if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { ++ struct btree_iter *linked; ++ ++ if (!bch2_btree_node_relock(iter, level + 1)) ++ goto out; ++ ++ /* ++ * We might have got -EINTR because trylock failed, and we're ++ * holding other locks that would cause us to deadlock: ++ */ ++ trans_for_each_iter(trans, linked) ++ if (btree_iter_cmp(iter, linked) < 0) ++ __bch2_btree_iter_unlock(linked); ++ ++ if (sib == btree_prev_sib) ++ btree_node_unlock(iter, level); ++ ++ ret = bch2_btree_node_get(c, iter, &tmp.k, level, ++ SIX_LOCK_intent); ++ ++ /* ++ * before btree_iter_relock() calls btree_iter_verify_locks(): ++ */ ++ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, level + 1); ++ ++ if (!bch2_btree_node_relock(iter, level)) { ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ if (!IS_ERR(ret)) { ++ six_unlock_intent(&ret->c.lock); ++ ret = ERR_PTR(-EINTR); ++ } ++ } ++ ++ bch2_trans_relock(trans); ++ } ++out: ++ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, level + 1); ++ ++ if (PTR_ERR_OR_ZERO(ret) == -EINTR) ++ bch2_btree_iter_upgrade(iter, level + 2); ++ ++ BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); ++ ++ if (!IS_ERR_OR_NULL(ret)) { ++ struct btree *n1 = ret, *n2 = b; ++ ++ if (sib != btree_prev_sib) ++ swap(n1, n2); ++ ++ BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), ++ n2->data->min_key)); ++ } ++ ++ bch2_btree_trans_verify_locks(trans); ++ ++ return ret; ++} ++ ++void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, ++ const struct bkey_i *k, unsigned level) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ BUG_ON(!btree_node_locked(iter, level + 1)); ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_cache_find(bc, k); ++ if (b) ++ return; ++ ++ bch2_btree_node_fill(c, iter, k, iter->btree_id, ++ level, SIX_LOCK_read, false); ++} ++ ++void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bset_stats stats; ++ ++ memset(&stats, 0, sizeof(stats)); ++ ++ bch2_btree_keys_stats(b, &stats); ++ ++ pr_buf(out, ++ "l %u %llu:%llu - %llu:%llu:\n" ++ " ptrs: ", ++ b->c.level, ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ b->data->max_key.inode, ++ b->data->max_key.offset); ++ 
bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ pr_buf(out, "\n" ++ " format: u64s %u fields %u %u %u %u %u\n" ++ " unpack fn len: %u\n" ++ " bytes used %zu/%zu (%zu%% full)\n" ++ " sib u64s: %u, %u (merge threshold %zu)\n" ++ " nr packed keys %u\n" ++ " nr unpacked keys %u\n" ++ " floats %zu\n" ++ " failed unpacked %zu\n", ++ f->key_u64s, ++ f->bits_per_field[0], ++ f->bits_per_field[1], ++ f->bits_per_field[2], ++ f->bits_per_field[3], ++ f->bits_per_field[4], ++ b->unpack_fn_len, ++ b->nr.live_u64s * sizeof(u64), ++ btree_bytes(c) - sizeof(struct btree_node), ++ b->nr.live_u64s * 100 / btree_max_u64s(c), ++ b->sib_u64s[0], ++ b->sib_u64s[1], ++ BTREE_FOREGROUND_MERGE_THRESHOLD(c), ++ b->nr.packed_keys, ++ b->nr.unpacked_keys, ++ stats.floats, ++ stats.failed); ++} +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +new file mode 100644 +index 000000000000..2160012c734f +--- /dev/null ++++ b/fs/bcachefs/btree_cache.h +@@ -0,0 +1,109 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_CACHE_H ++#define _BCACHEFS_BTREE_CACHE_H ++ ++#include "bcachefs.h" ++#include "btree_types.h" ++ ++struct btree_iter; ++ ++extern const char * const bch2_btree_ids[]; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *); ++ ++void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); ++int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); ++int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, ++ unsigned, enum btree_id); ++ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); ++ ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); ++ ++struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, ++ const struct bkey_i *, unsigned, ++ enum six_lock_type); ++ ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, ++ enum btree_id, unsigned); ++ ++struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, ++ struct btree *, enum btree_node_sibling); ++ ++void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, ++ const struct bkey_i *, unsigned); ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *); ++int bch2_fs_btree_cache_init(struct bch_fs *); ++void bch2_fs_btree_cache_init_early(struct btree_cache *); ++ ++static inline u64 btree_ptr_hash_val(const struct bkey_i *k) ++{ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); ++ case KEY_TYPE_btree_ptr_v2: ++ return bkey_i_to_btree_ptr_v2_c(k)->v.seq; ++ default: ++ return 0; ++ } ++} ++ ++static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) ++{ ++ return k->k.type == KEY_TYPE_btree_ptr_v2 ++ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr ++ : NULL; ++} ++ ++/* is btree node in hash table? 
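++ * b->hash_val is zeroed when the node is removed from the hash table,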
*/ ++static inline bool btree_node_hashed(struct btree *b) ++{ ++ return b->hash_val != 0; ++} ++ ++#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ ++ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ ++ &(_c)->btree_cache.table), \ ++ _iter = 0; _iter < (_tbl)->size; _iter++) \ ++ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) ++ ++static inline size_t btree_bytes(struct bch_fs *c) ++{ ++ return c->opts.btree_node_size << 9; ++} ++ ++static inline size_t btree_max_u64s(struct bch_fs *c) ++{ ++ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); ++} ++ ++static inline size_t btree_page_order(struct bch_fs *c) ++{ ++ return get_order(btree_bytes(c)); ++} ++ ++static inline size_t btree_pages(struct bch_fs *c) ++{ ++ return 1 << btree_page_order(c); ++} ++ ++static inline unsigned btree_blocks(struct bch_fs *c) ++{ ++ return c->opts.btree_node_size >> c->block_bits; ++} ++ ++#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) ++ ++#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) ++#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) ++ ++#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) ++ ++void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, ++ struct btree *); ++ ++#endif /* _BCACHEFS_BTREE_CACHE_H */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +new file mode 100644 +index 000000000000..8771ef1f07cc +--- /dev/null ++++ b/fs/bcachefs/btree_gc.c +@@ -0,0 +1,1388 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * Copyright (C) 2014 Datera Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_locking.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ write_seqcount_begin(&c->gc_pos_lock); ++ c->gc_pos = new_pos; ++ write_seqcount_end(&c->gc_pos_lock); ++} ++ ++static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); ++ __gc_pos_set(c, new_pos); ++} ++ ++static int bch2_gc_check_topology(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bpos *expected_start, ++ struct bpos expected_end, ++ bool is_last) ++{ ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, ++ "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", ++ bp.v->min_key.inode, ++ bp.v->min_key.offset, ++ expected_start->inode, ++ expected_start->offset)) { ++ BUG(); ++ } ++ } ++ ++ *expected_start = bkey_cmp(k.k->p, POS_MAX) ++ ? 
bkey_successor(k.k->p) ++ : k.k->p; ++ ++ if (fsck_err_on(is_last && ++ bkey_cmp(k.k->p, expected_end), c, ++ "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", ++ k.k->p.inode, ++ k.k->p.offset, ++ expected_end.inode, ++ expected_end.offset)) { ++ BUG(); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* marking of btree keys/nodes: */ ++ ++static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, ++ u8 *max_stale, bool initial) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ unsigned flags = ++ BTREE_TRIGGER_GC| ++ (initial ? BTREE_TRIGGER_NOATOMIC : 0); ++ int ret = 0; ++ ++ if (initial) { ++ BUG_ON(journal_seq_verify(c) && ++ k.k->version.lo > journal_cur_seq(&c->journal)); ++ ++ /* XXX change to fsck check */ ++ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, ++ "key version number higher than recorded: %llu > %llu", ++ k.k->version.lo, ++ atomic64_read(&c->key_version))) ++ atomic64_set(&c->key_version, k.k->version.lo); ++ ++ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err_on(!bch2_bkey_replicas_marked(c, k, false), c, ++ "superblock not marked as containing replicas (type %u)", ++ k.k->type)) { ++ ret = bch2_mark_bkey_replicas(c, k); ++ if (ret) ++ return ret; ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g2 = PTR_BUCKET(ca, ptr, false); ++ ++ if (mustfix_fsck_err_on(!g->gen_valid, c, ++ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k.k, ptr)], ++ ptr->gen)) { ++ g2->_mark.gen = g->_mark.gen = ptr->gen; ++ g2->gen_valid = g->gen_valid = true; ++ } ++ ++ if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, ++ "bucket %u:%zu data type %s ptr gen in the future: %u > %u", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k.k, ptr)], ++ ptr->gen, g->mark.gen)) { ++ g2->_mark.gen = g->_mark.gen = ptr->gen; ++ g2->gen_valid = g->gen_valid = true; ++ g2->_mark.data_type = 0; ++ g2->_mark.dirty_sectors = 0; ++ g2->_mark.cached_sectors = 0; ++ set_bit(BCH_FS_FIXED_GENS, &c->flags); ++ } ++ } ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ ++ if (gen_after(g->oldest_gen, ptr->gen)) ++ g->oldest_gen = ptr->gen; ++ ++ *max_stale = max(*max_stale, ptr_stale(ca, ptr)); ++ } ++ ++ bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); ++fsck_err: ++ return ret; ++} ++ ++static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, ++ bool initial) ++{ ++ struct bpos next_node_start = b->data->min_key; ++ struct btree_node_iter iter; ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ *max_stale = 0; ++ ++ if (!btree_node_type_needs_gc(btree_node_type(b))) ++ return 0; ++ ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ ++ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ++ bch2_bkey_debugcheck(c, b, k); ++ ++ ret = bch2_gc_mark_key(c, k, max_stale, initial); ++ if (ret) ++ break; ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (b->c.level) { ++ ret = bch2_gc_check_topology(c, k, ++ &next_node_start, ++ b->data->max_key, ++ bch2_btree_node_iter_end(&iter)); ++ if (ret) ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, ++ bool 
initial, bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned depth = metadata_only ? 1 ++ : expensive_debug_checks(c) ? 0 ++ : !btree_node_type_needs_gc(btree_id) ? 1 ++ : 0; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); ++ ++ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, ++ 0, depth, BTREE_ITER_PREFETCH, b) { ++ bch2_verify_btree_nr_keys(b); ++ ++ gc_pos_set(c, gc_pos_btree_node(b)); ++ ++ ret = btree_gc_mark_node(c, b, &max_stale, initial); ++ if (ret) ++ break; ++ ++ if (!initial) { ++ if (max_stale > 64) ++ bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_GC_LOCK_HELD); ++ else if (!btree_gc_rewrite_disabled(c) && ++ (btree_gc_always_rewrite(c) || max_stale > 16)) ++ bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_GC_LOCK_HELD); ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->btree_root_lock); ++ b = c->btree_roots[btree_id].b; ++ if (!btree_node_fake(b)) ++ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), ++ &max_stale, initial); ++ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); ++ mutex_unlock(&c->btree_root_lock); ++ ++ return ret; ++} ++ ++static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, ++ struct journal_keys *journal_keys, ++ unsigned target_depth) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ struct bpos next_node_start = b->data->min_key; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ bch2_bkey_debugcheck(c, b, k); ++ ++ BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); ++ ++ ret = bch2_gc_mark_key(c, k, &max_stale, true); ++ if (ret) ++ break; ++ ++ if (b->c.level) { ++ struct btree *child; ++ BKEY_PADDED(k) tmp; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ ret = bch2_gc_check_topology(c, k, ++ &next_node_start, ++ b->data->max_key, ++ !bch2_btree_and_journal_iter_peek(&iter).k); ++ if (ret) ++ break; ++ ++ if (b->c.level > target_depth) { ++ child = bch2_btree_node_get_noiter(c, &tmp.k, ++ b->c.btree_id, b->c.level - 1); ++ ret = PTR_ERR_OR_ZERO(child); ++ if (ret) ++ break; ++ ++ ret = bch2_gc_btree_init_recurse(c, child, ++ journal_keys, target_depth); ++ six_unlock_read(&child->c.lock); ++ ++ if (ret) ++ break; ++ } ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ } ++ ++ return ret; ++} ++ ++static int bch2_gc_btree_init(struct bch_fs *c, ++ struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ bool metadata_only) ++{ ++ struct btree *b; ++ unsigned target_depth = metadata_only ? 1 ++ : expensive_debug_checks(c) ? 0 ++ : !btree_node_type_needs_gc(btree_id) ? 
1 ++ : 0; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ b = c->btree_roots[btree_id].b; ++ ++ if (btree_node_fake(b)) ++ return 0; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, ++ "btree root with incorrect min_key: %llu:%llu", ++ b->data->min_key.inode, ++ b->data->min_key.offset)) { ++ BUG(); ++ } ++ ++ if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, ++ "btree root with incorrect min_key: %llu:%llu", ++ b->data->max_key.inode, ++ b->data->max_key.offset)) { ++ BUG(); ++ } ++ ++ if (b->c.level >= target_depth) ++ ret = bch2_gc_btree_init_recurse(c, b, ++ journal_keys, target_depth); ++ ++ if (!ret) ++ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), ++ &max_stale, true); ++fsck_err: ++ six_unlock_read(&b->c.lock); ++ ++ return ret; ++} ++ ++static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) ++{ ++ return (int) btree_id_to_gc_phase(l) - ++ (int) btree_id_to_gc_phase(r); ++} ++ ++static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, ++ bool initial, bool metadata_only) ++{ ++ enum btree_id ids[BTREE_ID_NR]; ++ unsigned i; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ ids[i] = i; ++ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ enum btree_id id = ids[i]; ++ int ret = initial ++ ? bch2_gc_btree_init(c, journal_keys, ++ id, metadata_only) ++ : bch2_gc_btree(c, id, initial, metadata_only); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, ++ u64 start, u64 end, ++ enum bch_data_type type, ++ unsigned flags) ++{ ++ u64 b = sector_to_bucket(ca, start); ++ ++ do { ++ unsigned sectors = ++ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; ++ ++ bch2_mark_metadata_bucket(c, ca, b, type, sectors, ++ gc_phase(GC_PHASE_SB), flags); ++ b++; ++ start += sectors; ++ } while (start < end); ++} ++ ++void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ++ unsigned flags) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ unsigned i; ++ u64 b; ++ ++ /* ++ * This conditional is kind of gross, but we may be called from the ++ * device add path, before the new device has actually been added to the ++ * running filesystem: ++ */ ++ if (c) { ++ lockdep_assert_held(&c->sb_lock); ++ percpu_down_read(&c->mark_lock); ++ } ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset == BCH_SB_SECTOR) ++ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, ++ BCH_DATA_SB, flags); ++ ++ mark_metadata_sectors(c, ca, offset, ++ offset + (1 << layout->sb_max_size_bits), ++ BCH_DATA_SB, flags); ++ } ++ ++ for (i = 0; i < ca->journal.nr; i++) { ++ b = ca->journal.buckets[i]; ++ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_JOURNAL, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), flags); ++ } ++ ++ if (c) ++ percpu_up_read(&c->mark_lock); ++} ++ ++static void bch2_mark_superblocks(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&c->sb_lock); ++ gc_pos_set(c, gc_phase(GC_PHASE_SB)); ++ ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); ++ mutex_unlock(&c->sb_lock); ++} ++ ++#if 0 ++/* Also see bch2_pending_btree_node_free_insert_done() */ ++static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) ++{ ++ struct btree_update *as; ++ struct pending_btree_node_free *d; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ gc_pos_set(c, 
gc_phase(GC_PHASE_PENDING_DELETE)); ++ ++ for_each_pending_btree_node_free(c, as, d) ++ if (d->index_update_done) ++ bch2_mark_key(c, bkey_i_to_s_c(&d->key), ++ 0, 0, NULL, 0, ++ BTREE_TRIGGER_GC); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++#endif ++ ++static void bch2_mark_allocator_buckets(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct open_bucket *ob; ++ size_t i, j, iter; ++ unsigned ci; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ gc_pos_set(c, gc_pos_alloc(c, NULL)); ++ ++ for_each_member_device(ca, c, ci) { ++ fifo_for_each_entry(i, &ca->free_inc, iter) ++ bch2_mark_alloc_bucket(c, ca, i, true, ++ gc_pos_alloc(c, NULL), ++ BTREE_TRIGGER_GC); ++ ++ ++ ++ for (j = 0; j < RESERVE_NR; j++) ++ fifo_for_each_entry(i, &ca->free[j], iter) ++ bch2_mark_alloc_bucket(c, ca, i, true, ++ gc_pos_alloc(c, NULL), ++ BTREE_TRIGGER_GC); ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid) { ++ gc_pos_set(c, gc_pos_alloc(c, ob)); ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, ++ gc_pos_alloc(c, ob), ++ BTREE_TRIGGER_GC); ++ } ++ spin_unlock(&ob->lock); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++} ++ ++static void bch2_gc_free(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ genradix_free(&c->stripes[1]); ++ ++ for_each_member_device(ca, c, i) { ++ kvpfree(rcu_dereference_protected(ca->buckets[1], 1), ++ sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket)); ++ ca->buckets[1] = NULL; ++ ++ free_percpu(ca->usage[1]); ++ ca->usage[1] = NULL; ++ } ++ ++ free_percpu(c->usage_gc); ++ c->usage_gc = NULL; ++} ++ ++static int bch2_gc_done(struct bch_fs *c, ++ bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ bool verify = !metadata_only && ++ (!initial || ++ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); ++ unsigned i; ++ int ret = 0; ++ ++#define copy_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f) { \ ++ if (verify) \ ++ fsck_err(c, _msg ": got %llu, should be %llu" \ ++ , ##__VA_ARGS__, dst->_f, src->_f); \ ++ dst->_f = src->_f; \ ++ } ++#define copy_stripe_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f) { \ ++ if (verify) \ ++ fsck_err(c, "stripe %zu has wrong "_msg \ ++ ": got %u, should be %u", \ ++ dst_iter.pos, ##__VA_ARGS__, \ ++ dst->_f, src->_f); \ ++ dst->_f = src->_f; \ ++ dst->dirty = true; \ ++ } ++#define copy_bucket_field(_f) \ ++ if (dst->b[b].mark._f != src->b[b].mark._f) { \ ++ if (verify) \ ++ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ ++ ": got %u, should be %u", i, b, \ ++ dst->b[b].mark.gen, \ ++ bch2_data_types[dst->b[b].mark.data_type],\ ++ dst->b[b].mark._f, src->b[b].mark._f); \ ++ dst->b[b]._mark._f = src->b[b].mark._f; \ ++ } ++#define copy_dev_field(_f, _msg, ...) \ ++ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) ++#define copy_fs_field(_f, _msg, ...) 
\ ++ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) ++ ++ if (!metadata_only) { ++ struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); ++ struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); ++ struct stripe *dst, *src; ++ unsigned i; ++ ++ c->ec_stripes_heap.used = 0; ++ ++ while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && ++ (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { ++ BUG_ON(src_iter.pos != dst_iter.pos); ++ ++ copy_stripe_field(alive, "alive"); ++ copy_stripe_field(sectors, "sectors"); ++ copy_stripe_field(algorithm, "algorithm"); ++ copy_stripe_field(nr_blocks, "nr_blocks"); ++ copy_stripe_field(nr_redundant, "nr_redundant"); ++ copy_stripe_field(blocks_nonempty, ++ "blocks_nonempty"); ++ ++ for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) ++ copy_stripe_field(block_sectors[i], ++ "block_sectors[%u]", i); ++ ++ if (dst->alive) ++ bch2_stripes_heap_insert(c, dst, dst_iter.pos); ++ ++ genradix_iter_advance(&dst_iter, &c->stripes[0]); ++ genradix_iter_advance(&src_iter, &c->stripes[1]); ++ } ++ } ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *dst = __bucket_array(ca, 0); ++ struct bucket_array *src = __bucket_array(ca, 1); ++ size_t b; ++ ++ for (b = 0; b < src->nbuckets; b++) { ++ copy_bucket_field(gen); ++ copy_bucket_field(data_type); ++ copy_bucket_field(owned_by_allocator); ++ copy_bucket_field(stripe); ++ copy_bucket_field(dirty_sectors); ++ copy_bucket_field(cached_sectors); ++ ++ dst->b[b].oldest_gen = src->b[b].oldest_gen; ++ } ++ }; ++ ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ ++ bch2_dev_usage_from_buckets(c); ++ ++ { ++ unsigned nr = fs_usage_u64s(c); ++ struct bch_fs_usage *dst = c->usage_base; ++ struct bch_fs_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) c->usage_gc, nr); ++ ++ copy_fs_field(hidden, "hidden"); ++ copy_fs_field(btree, "btree"); ++ ++ if (!metadata_only) { ++ copy_fs_field(data, "data"); ++ copy_fs_field(cached, "cached"); ++ copy_fs_field(reserved, "reserved"); ++ copy_fs_field(nr_inodes,"nr_inodes"); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ copy_fs_field(persistent_reserved[i], ++ "persistent_reserved[%i]", i); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ char buf[80]; ++ ++ if (metadata_only && ++ (e->data_type == BCH_DATA_USER || ++ e->data_type == BCH_DATA_CACHED)) ++ continue; ++ ++ bch2_replicas_entry_to_text(&PBUF(buf), e); ++ ++ copy_fs_field(replicas[i], "%s", buf); ++ } ++ } ++ ++#undef copy_fs_field ++#undef copy_dev_field ++#undef copy_bucket_field ++#undef copy_stripe_field ++#undef copy_field ++fsck_err: ++ return ret; ++} ++ ++static int bch2_gc_start(struct bch_fs *c, ++ bool metadata_only) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ BUG_ON(c->usage_gc); ++ ++ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), ++ sizeof(u64), GFP_KERNEL); ++ if (!c->usage_gc) { ++ bch_err(c, "error allocating c->usage_gc"); ++ return -ENOMEM; ++ } ++ ++ for_each_member_device(ca, c, i) { ++ BUG_ON(ca->buckets[1]); ++ BUG_ON(ca->usage[1]); ++ ++ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO); ++ if (!ca->buckets[1]) { ++ percpu_ref_put(&ca->ref); ++ bch_err(c, "error allocating ca->buckets[gc]"); ++ return -ENOMEM; ++ } ++ ++ ca->usage[1] = alloc_percpu(struct bch_dev_usage); ++ if (!ca->usage[1]) { ++ bch_err(c, "error allocating 
ca->usage[gc]"); ++ percpu_ref_put(&ca->ref); ++ return -ENOMEM; ++ } ++ } ++ ++ ret = bch2_ec_mem_alloc(c, true); ++ if (ret) { ++ bch_err(c, "error allocating ec gc mem"); ++ return ret; ++ } ++ ++ percpu_down_write(&c->mark_lock); ++ ++ /* ++ * indicate to stripe code that we need to allocate for the gc stripes ++ * radix tree, too ++ */ ++ gc_pos_set(c, gc_phase(GC_PHASE_START)); ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *dst = __bucket_array(ca, 1); ++ struct bucket_array *src = __bucket_array(ca, 0); ++ size_t b; ++ ++ dst->first_bucket = src->first_bucket; ++ dst->nbuckets = src->nbuckets; ++ ++ for (b = 0; b < src->nbuckets; b++) { ++ struct bucket *d = &dst->b[b]; ++ struct bucket *s = &src->b[b]; ++ ++ d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; ++ d->gen_valid = s->gen_valid; ++ ++ if (metadata_only && ++ (s->mark.data_type == BCH_DATA_USER || ++ s->mark.data_type == BCH_DATA_CACHED)) { ++ d->_mark = s->mark; ++ d->_mark.owned_by_allocator = 0; ++ } ++ } ++ }; ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return 0; ++} ++ ++/** ++ * bch2_gc - walk _all_ references to buckets, and recompute them: ++ * ++ * Order matters here: ++ * - Concurrent GC relies on the fact that we have a total ordering for ++ * everything that GC walks - see gc_will_visit_node(), ++ * gc_will_visit_root() ++ * ++ * - also, references move around in the course of index updates and ++ * various other crap: everything needs to agree on the ordering ++ * references are allowed to move around in - e.g., we're allowed to ++ * start with a reference owned by an open_bucket (the allocator) and ++ * move it to the btree, but not the reverse. ++ * ++ * This is necessary to ensure that gc doesn't miss references that ++ * move around - if references move backwards in the ordering GC ++ * uses, GC could skip past them ++ */ ++int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, ++ bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ u64 start_time = local_clock(); ++ unsigned i, iter = 0; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ trace_gc_start(c); ++ ++ down_write(&c->gc_lock); ++ ++ /* flush interior btree updates: */ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++again: ++ ret = bch2_gc_start(c, metadata_only); ++ if (ret) ++ goto out; ++ ++ bch2_mark_superblocks(c); ++ ++ ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); ++ if (ret) ++ goto out; ++ ++#if 0 ++ bch2_mark_pending_btree_node_frees(c); ++#endif ++ bch2_mark_allocator_buckets(c); ++ ++ c->gc_count++; ++out: ++ if (!ret && ++ (test_bit(BCH_FS_FIXED_GENS, &c->flags) || ++ (!iter && test_restart_gc(c)))) { ++ /* ++ * XXX: make sure gens we fixed got saved ++ */ ++ if (iter++ <= 2) { ++ bch_info(c, "Fixed gens, restarting mark and sweep:"); ++ clear_bit(BCH_FS_FIXED_GENS, &c->flags); ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ percpu_down_write(&c->mark_lock); ++ bch2_gc_free(c); ++ percpu_up_write(&c->mark_lock); ++ /* flush fsck errors, reset counters */ ++ bch2_flush_fsck_errs(c); ++ ++ goto again; ++ } ++ ++ bch_info(c, "Unable to fix bucket gens, looping"); ++ ret = -EINVAL; ++ } ++ ++ if (!ret) { ++ bch2_journal_block(&c->journal); ++ ++ percpu_down_write(&c->mark_lock); ++ ret = bch2_gc_done(c, initial, metadata_only); ++ ++ bch2_journal_unblock(&c->journal); ++ } else { ++ percpu_down_write(&c->mark_lock); ++ } ++ ++ /* Indicates that gc is no longer in progress: */ ++ __gc_pos_set(c, 
gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ bch2_gc_free(c); ++ percpu_up_write(&c->mark_lock); ++ ++ up_write(&c->gc_lock); ++ ++ trace_gc_end(c); ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); ++ ++ /* ++ * Wake up allocator in case it was waiting for buckets ++ * because of not being able to inc gens ++ */ ++ for_each_member_device(ca, c, i) ++ bch2_wake_allocator(ca); ++ ++ /* ++ * At startup, allocations can happen directly instead of via the ++ * allocator thread - issue wakeup in case they blocked on gc_lock: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ return ret; ++} ++ ++/* ++ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree ++ * node pointers currently never have cached pointers that can become stale: ++ */ ++static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ percpu_down_read(&c->mark_lock); ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ ++ if (gen_after(g->gc_gen, ptr->gen)) ++ g->gc_gen = ptr->gen; ++ ++ if (gen_after(g->mark.gen, ptr->gen) > 32) { ++ /* rewrite btree node */ ++ ++ } ++ } ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_gc_gens(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ unsigned i; ++ int ret; ++ ++ /* ++ * Ideally we would be using state_lock and not gc_lock here, but that ++ * introduces a deadlock in the RO path - we currently take the state ++ * lock at the start of going RO, thus the gc thread may get stuck: ++ */ ++ down_read(&c->gc_lock); ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->gc_gen = g->mark.gen; ++ up_read(&ca->bucket_lock); ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (btree_node_type_needs_gc(i)) { ++ ret = bch2_gc_btree_gens(c, i); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->oldest_gen = g->gc_gen; ++ up_read(&ca->bucket_lock); ++ } ++err: ++ up_read(&c->gc_lock); ++ return ret; ++} ++ ++/* Btree coalescing */ ++ ++static void recalc_packed_keys(struct btree *b) ++{ ++ struct bset *i = btree_bset_first(b); ++ struct bkey_packed *k; ++ ++ memset(&b->nr, 0, sizeof(b->nr)); ++ ++ BUG_ON(b->nsets != 1); ++ ++ vstruct_for_each(i, k) ++ btree_keys_account_key_add(&b->nr, 0, k); ++} ++ ++static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *old_nodes[GC_MERGE_NODES]) ++{ ++ struct btree *parent = btree_node_parent(iter, old_nodes[0]); ++ unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; ++ unsigned blocks = btree_blocks(c) * 2 / 3; ++ struct btree *new_nodes[GC_MERGE_NODES]; ++ struct btree_update *as; ++ struct keylist keylist; ++ struct bkey_format_state format_state; ++ struct bkey_format new_format; ++ ++ memset(new_nodes, 0, sizeof(new_nodes)); ++ bch2_keylist_init(&keylist, NULL); ++ ++ /* Count keys that are not deleted */ ++ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) ++ u64s += 
old_nodes[i]->nr.live_u64s; ++ ++ nr_old_nodes = nr_new_nodes = i; ++ ++ /* Check if all keys in @old_nodes could fit in one fewer node */ ++ if (nr_old_nodes <= 1 || ++ __vstruct_blocks(struct btree_node, c->block_bits, ++ DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) ++ return; ++ ++ /* Find a format that all keys in @old_nodes can pack into */ ++ bch2_bkey_format_init(&format_state); ++ ++ for (i = 0; i < nr_old_nodes; i++) ++ __bch2_btree_calc_format(&format_state, old_nodes[i]); ++ ++ new_format = bch2_bkey_format_done(&format_state); ++ ++ /* Check if repacking would make any nodes too big to fit */ ++ for (i = 0; i < nr_old_nodes; i++) ++ if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_FORMAT_FITS); ++ return; ++ } ++ ++ if (bch2_keylist_realloc(&keylist, NULL, 0, ++ (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); ++ return; ++ } ++ ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ btree_update_reserve_required(c, parent) + nr_old_nodes, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE, ++ NULL); ++ if (IS_ERR(as)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_RESERVE_GET); ++ bch2_keylist_free(&keylist, NULL); ++ return; ++ } ++ ++ trace_btree_gc_coalesce(c, old_nodes[0]); ++ ++ for (i = 0; i < nr_old_nodes; i++) ++ bch2_btree_interior_update_will_free_node(as, old_nodes[i]); ++ ++ /* Repack everything with @new_format and sort down to one bset */ ++ for (i = 0; i < nr_old_nodes; i++) ++ new_nodes[i] = ++ __bch2_btree_node_alloc_replacement(as, old_nodes[i], ++ new_format); ++ ++ /* ++ * Conceptually we concatenate the nodes together and slice them ++ * up at different boundaries. 
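++ * Keys are moved between adjacent new nodes, and any node left empty is ++ * freed and dropped from new_nodes[].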
++ */ ++ for (i = nr_new_nodes - 1; i > 0; --i) { ++ struct btree *n1 = new_nodes[i]; ++ struct btree *n2 = new_nodes[i - 1]; ++ ++ struct bset *s1 = btree_bset_first(n1); ++ struct bset *s2 = btree_bset_first(n2); ++ struct bkey_packed *k, *last = NULL; ++ ++ /* Calculate how many keys from @n2 we could fit inside @n1 */ ++ u64s = 0; ++ ++ for (k = s2->start; ++ k < vstruct_last(s2) && ++ vstruct_blocks_plus(n1->data, c->block_bits, ++ u64s + k->u64s) <= blocks; ++ k = bkey_next_skip_noops(k, vstruct_last(s2))) { ++ last = k; ++ u64s += k->u64s; ++ } ++ ++ if (u64s == le16_to_cpu(s2->u64s)) { ++ /* n2 fits entirely in n1 */ ++ n1->key.k.p = n1->data->max_key = n2->data->max_key; ++ ++ memcpy_u64s(vstruct_last(s1), ++ s2->start, ++ le16_to_cpu(s2->u64s)); ++ le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); ++ ++ set_btree_bset_end(n1, n1->set); ++ ++ six_unlock_write(&n2->c.lock); ++ bch2_btree_node_free_never_inserted(c, n2); ++ six_unlock_intent(&n2->c.lock); ++ ++ memmove(new_nodes + i - 1, ++ new_nodes + i, ++ sizeof(new_nodes[0]) * (nr_new_nodes - i)); ++ new_nodes[--nr_new_nodes] = NULL; ++ } else if (u64s) { ++ /* move part of n2 into n1 */ ++ n1->key.k.p = n1->data->max_key = ++ bkey_unpack_pos(n1, last); ++ ++ n2->data->min_key = bkey_successor(n1->data->max_key); ++ ++ memcpy_u64s(vstruct_last(s1), ++ s2->start, u64s); ++ le16_add_cpu(&s1->u64s, u64s); ++ ++ memmove(s2->start, ++ vstruct_idx(s2, u64s), ++ (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); ++ s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); ++ ++ set_btree_bset_end(n1, n1->set); ++ set_btree_bset_end(n2, n2->set); ++ } ++ } ++ ++ for (i = 0; i < nr_new_nodes; i++) { ++ struct btree *n = new_nodes[i]; ++ ++ recalc_packed_keys(n); ++ btree_node_reset_sib_u64s(n); ++ ++ bch2_btree_build_aux_trees(n); ++ ++ bch2_btree_update_add_new_node(as, n); ++ six_unlock_write(&n->c.lock); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ } ++ ++ /* ++ * The keys for the old nodes get deleted. We don't want to insert keys ++ * that compare equal to the keys for the new nodes we'll also be ++ * inserting - we can't because keys on a keylist must be strictly ++ * greater than the previous keys, and we also don't need to since the ++ * key for the new node will serve the same purpose (overwriting the key ++ * for the old node). 
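++ * So the loop below only queues a deletion key for an old node whose ++ * position doesn't match any of the new nodes' keys.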
++ */ ++ for (i = 0; i < nr_old_nodes; i++) { ++ struct bkey_i delete; ++ unsigned j; ++ ++ for (j = 0; j < nr_new_nodes; j++) ++ if (!bkey_cmp(old_nodes[i]->key.k.p, ++ new_nodes[j]->key.k.p)) ++ goto next; ++ ++ bkey_init(&delete.k); ++ delete.k.p = old_nodes[i]->key.k.p; ++ bch2_keylist_add_in_order(&keylist, &delete); ++next: ++ i = i; ++ } ++ ++ /* ++ * Keys for the new nodes get inserted: bch2_btree_insert_keys() only ++ * does the lookup once and thus expects the keys to be in sorted order ++ * so we have to make sure the new keys are correctly ordered with ++ * respect to the deleted keys added in the previous loop ++ */ ++ for (i = 0; i < nr_new_nodes; i++) ++ bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); ++ ++ /* Insert the newly coalesced nodes */ ++ bch2_btree_insert_node(as, parent, iter, &keylist, 0); ++ ++ BUG_ON(!bch2_keylist_empty(&keylist)); ++ ++ BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); ++ ++ bch2_btree_iter_node_replace(iter, new_nodes[0]); ++ ++ for (i = 0; i < nr_new_nodes; i++) ++ bch2_btree_update_get_open_buckets(as, new_nodes[i]); ++ ++ /* Free the old nodes and update our sliding window */ ++ for (i = 0; i < nr_old_nodes; i++) { ++ bch2_btree_node_free_inmem(c, old_nodes[i], iter); ++ ++ /* ++ * the index update might have triggered a split, in which case ++ * the nodes we coalesced - the new nodes we just created - ++ * might not be sibling nodes anymore - don't add them to the ++ * sliding window (except the first): ++ */ ++ if (!i) { ++ old_nodes[i] = new_nodes[i]; ++ } else { ++ old_nodes[i] = NULL; ++ } ++ } ++ ++ for (i = 0; i < nr_new_nodes; i++) ++ six_unlock_intent(&new_nodes[i]->c.lock); ++ ++ bch2_btree_update_done(as); ++ bch2_keylist_free(&keylist, NULL); ++} ++ ++static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ unsigned i; ++ ++ /* Sliding window of adjacent btree nodes */ ++ struct btree *merge[GC_MERGE_NODES]; ++ u32 lock_seq[GC_MERGE_NODES]; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * XXX: We don't have a good way of positively matching on sibling nodes ++ * that have the same parent - this code works by handling the cases ++ * where they might not have the same parent, and is thus fragile. Ugh. ++ * ++ * Perhaps redo this to use multiple linked iterators? 
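++ * For now we keep a sliding window, merge[], of up to GC_MERGE_NODES ++ * nodes, re-validating their locks against the saved lock_seq[] values.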
++ */ ++ memset(merge, 0, sizeof(merge)); ++ ++ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, ++ BTREE_MAX_DEPTH, 0, ++ BTREE_ITER_PREFETCH, b) { ++ memmove(merge + 1, merge, ++ sizeof(merge) - sizeof(merge[0])); ++ memmove(lock_seq + 1, lock_seq, ++ sizeof(lock_seq) - sizeof(lock_seq[0])); ++ ++ merge[0] = b; ++ ++ for (i = 1; i < GC_MERGE_NODES; i++) { ++ if (!merge[i] || ++ !six_relock_intent(&merge[i]->c.lock, lock_seq[i])) ++ break; ++ ++ if (merge[i]->c.level != merge[0]->c.level) { ++ six_unlock_intent(&merge[i]->c.lock); ++ break; ++ } ++ } ++ memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); ++ ++ bch2_coalesce_nodes(c, iter, merge); ++ ++ for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { ++ lock_seq[i] = merge[i]->c.lock.state.seq; ++ six_unlock_intent(&merge[i]->c.lock); ++ } ++ ++ lock_seq[0] = merge[0]->c.lock.state.seq; ++ ++ if (kthread && kthread_should_stop()) { ++ bch2_trans_exit(&trans); ++ return -ESHUTDOWN; ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ ++ /* ++ * If the parent node wasn't relocked, it might have been split ++ * and the nodes in our sliding window might not have the same ++ * parent anymore - blow away the sliding window: ++ */ ++ if (btree_iter_node(iter, iter->level + 1) && ++ !btree_node_intent_locked(iter, iter->level + 1)) ++ memset(merge + 1, 0, ++ (GC_MERGE_NODES - 1) * sizeof(merge[0])); ++ } ++ return bch2_trans_exit(&trans); ++} ++ ++/** ++ * bch_coalesce - coalesce adjacent nodes with low occupancy ++ */ ++void bch2_coalesce(struct bch_fs *c) ++{ ++ enum btree_id id; ++ ++ down_read(&c->gc_lock); ++ trace_gc_coalesce_start(c); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ int ret = c->btree_roots[id].b ++ ? bch2_coalesce_btree(c, id) ++ : 0; ++ ++ if (ret) { ++ if (ret != -ESHUTDOWN) ++ bch_err(c, "btree coalescing failed: %d", ret); ++ return; ++ } ++ } ++ ++ trace_gc_coalesce_end(c); ++ up_read(&c->gc_lock); ++} ++ ++static int bch2_gc_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ unsigned long last = atomic_long_read(&clock->now); ++ unsigned last_kick = atomic_read(&c->kick_gc); ++ int ret; ++ ++ set_freezable(); ++ ++ while (1) { ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ if (kthread_should_stop()) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (atomic_read(&c->kick_gc) != last_kick) ++ break; ++ ++ if (c->btree_gc_periodic) { ++ unsigned long next = last + c->capacity / 16; ++ ++ if (atomic_long_read(&clock->now) >= next) ++ break; ++ ++ bch2_io_clock_schedule_timeout(clock, next); ++ } else { ++ schedule(); ++ } ++ ++ try_to_freeze(); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ last = atomic_long_read(&clock->now); ++ last_kick = atomic_read(&c->kick_gc); ++ ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 ++ ret = bch2_gc(c, NULL, false, false); ++#else ++ ret = bch2_gc_gens(c); ++#endif ++ if (ret) ++ bch_err(c, "btree gc failed: %i", ret); ++ ++ debug_check_no_locks_held(); ++ } ++ ++ return 0; ++} ++ ++void bch2_gc_thread_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ p = c->gc_thread; ++ c->gc_thread = NULL; ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_gc_thread_start(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ BUG_ON(c->gc_thread); ++ ++ p = kthread_create(bch2_gc_thread, c, "bch_gc"); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ c->gc_thread = p; ++ wake_up_process(p); ++ return 0; ++} +diff 
--git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +new file mode 100644 +index 000000000000..3694a3df62a8 +--- /dev/null ++++ b/fs/bcachefs/btree_gc.h +@@ -0,0 +1,121 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_GC_H ++#define _BCACHEFS_BTREE_GC_H ++ ++#include "btree_types.h" ++ ++void bch2_coalesce(struct bch_fs *); ++ ++struct journal_keys; ++int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); ++int bch2_gc_gens(struct bch_fs *); ++void bch2_gc_thread_stop(struct bch_fs *); ++int bch2_gc_thread_start(struct bch_fs *); ++void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); ++ ++/* ++ * For concurrent mark and sweep (with other index updates), we define a total ++ * ordering of _all_ references GC walks: ++ * ++ * Note that some references will have the same GC position as others - e.g. ++ * everything within the same btree node; in those cases we're relying on ++ * whatever locking exists for where those references live, i.e. the write lock ++ * on a btree node. ++ * ++ * That locking is also required to ensure GC doesn't pass the updater in ++ * between the updater adding/removing the reference and updating the GC marks; ++ * without that, we would at best double count sometimes. ++ * ++ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ ++ * be held that prevents GC from passing the position the updater is at. ++ * ++ * (What about the start of gc, when we're clearing all the marks? GC clears the ++ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc ++ * position inside its cmpxchg loop, so crap magically works). ++ */ ++ ++/* Position of (the start of) a gc phase: */ ++static inline struct gc_pos gc_phase(enum gc_phase phase) ++{ ++ return (struct gc_pos) { ++ .phase = phase, ++ .pos = POS_MIN, ++ .level = 0, ++ }; ++} ++ ++static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) ++{ ++ if (l.phase != r.phase) ++ return l.phase < r.phase ? -1 : 1; ++ if (bkey_cmp(l.pos, r.pos)) ++ return bkey_cmp(l.pos, r.pos); ++ if (l.level != r.level) ++ return l.level < r.level ? -1 : 1; ++ return 0; ++} ++ ++static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) ++{ ++ switch (id) { ++#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; ++ BCH_BTREE_IDS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct gc_pos gc_pos_btree(enum btree_id id, ++ struct bpos pos, unsigned level) ++{ ++ return (struct gc_pos) { ++ .phase = btree_id_to_gc_phase(id), ++ .pos = pos, ++ .level = level, ++ }; ++} ++ ++/* ++ * GC position of the pointers within a btree node: note, _not_ for &b->key ++ * itself, that lives in the parent node: ++ */ ++static inline struct gc_pos gc_pos_btree_node(struct btree *b) ++{ ++ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); ++} ++ ++/* ++ * GC position of the pointer to a btree root: we don't use ++ * gc_pos_pointer_to_btree_node() here to avoid a potential race with ++ * btree_split() increasing the tree depth - the new root will have level > the ++ * old root and thus have a greater gc position than the old root, but that ++ * would be incorrect since once gc has marked the root it's not coming back. ++ */ ++static inline struct gc_pos gc_pos_btree_root(enum btree_id id) ++{ ++ return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); ++} ++ ++static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) ++{ ++ return (struct gc_pos) { ++ .phase = GC_PHASE_ALLOC, ++ .pos = POS(ob ? 
ob - c->open_buckets : 0, 0), ++ }; ++} ++ ++static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) ++{ ++ unsigned seq; ++ bool ret; ++ ++ do { ++ seq = read_seqcount_begin(&c->gc_pos_lock); ++ ret = gc_pos_cmp(pos, c->gc_pos) <= 0; ++ } while (read_seqcount_retry(&c->gc_pos_lock, seq)); ++ ++ return ret; ++} ++ ++#endif /* _BCACHEFS_BTREE_GC_H */ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +new file mode 100644 +index 000000000000..d2c28eb75bde +--- /dev/null ++++ b/fs/bcachefs/btree_io.c +@@ -0,0 +1,1868 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static void verify_no_dups(struct btree *b, ++ struct bkey_packed *start, ++ struct bkey_packed *end, ++ bool extents) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bkey_packed *k, *p; ++ ++ if (start == end) ++ return; ++ ++ for (p = start, k = bkey_next_skip_noops(start, end); ++ k != end; ++ p = k, k = bkey_next_skip_noops(k, end)) { ++ struct bkey l = bkey_unpack_key(b, p); ++ struct bkey r = bkey_unpack_key(b, k); ++ ++ BUG_ON(extents ++ ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 ++ : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); ++ //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); ++ } ++#endif ++} ++ ++static void set_needs_whiteout(struct bset *i, int v) ++{ ++ struct bkey_packed *k; ++ ++ for (k = i->start; ++ k != vstruct_last(i); ++ k = bkey_next_skip_noops(k, vstruct_last(i))) ++ k->needs_whiteout = v; ++} ++ ++static void btree_bounce_free(struct bch_fs *c, unsigned order, ++ bool used_mempool, void *p) ++{ ++ if (used_mempool) ++ mempool_free(p, &c->btree_bounce_pool); ++ else ++ vpfree(p, PAGE_SIZE << order); ++} ++ ++static void *btree_bounce_alloc(struct bch_fs *c, unsigned order, ++ bool *used_mempool) ++{ ++ unsigned flags = memalloc_nofs_save(); ++ void *p; ++ ++ BUG_ON(order > btree_page_order(c)); ++ ++ *used_mempool = false; ++ p = (void *) __get_free_pages(__GFP_NOWARN|GFP_NOWAIT, order); ++ if (!p) { ++ *used_mempool = true; ++ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); ++ } ++ memalloc_nofs_restore(flags); ++ return p; ++} ++ ++static void sort_bkey_ptrs(const struct btree *bt, ++ struct bkey_packed **ptrs, unsigned nr) ++{ ++ unsigned n = nr, a = nr / 2, b, c, d; ++ ++ if (!a) ++ return; ++ ++ /* Heap sort: see lib/sort.c: */ ++ while (1) { ++ if (a) ++ a--; ++ else if (--n) ++ swap(ptrs[0], ptrs[n]); ++ else ++ break; ++ ++ for (b = a; c = 2 * b + 1, (d = c + 1) < n;) ++ b = bkey_cmp_packed(bt, ++ ptrs[c], ++ ptrs[d]) >= 0 ? 
c : d; ++ if (d == n) ++ b = c; ++ ++ while (b != a && ++ bkey_cmp_packed(bt, ++ ptrs[a], ++ ptrs[b]) >= 0) ++ b = (b - 1) / 2; ++ c = b; ++ while (b != a) { ++ b = (b - 1) / 2; ++ swap(ptrs[b], ptrs[c]); ++ } ++ } ++} ++ ++static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; ++ bool used_mempool = false; ++ unsigned order; ++ ++ if (!b->whiteout_u64s) ++ return; ++ ++ order = get_order(b->whiteout_u64s * sizeof(u64)); ++ ++ new_whiteouts = btree_bounce_alloc(c, order, &used_mempool); ++ ++ ptrs = ptrs_end = ((void *) new_whiteouts + (PAGE_SIZE << order)); ++ ++ for (k = unwritten_whiteouts_start(c, b); ++ k != unwritten_whiteouts_end(c, b); ++ k = bkey_next(k)) ++ *--ptrs = k; ++ ++ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); ++ ++ k = new_whiteouts; ++ ++ while (ptrs != ptrs_end) { ++ bkey_copy(k, *ptrs); ++ k = bkey_next(k); ++ ptrs++; ++ } ++ ++ verify_no_dups(b, new_whiteouts, ++ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), ++ btree_node_old_extent_overwrite(b)); ++ ++ memcpy_u64s(unwritten_whiteouts_start(c, b), ++ new_whiteouts, b->whiteout_u64s); ++ ++ btree_bounce_free(c, order, used_mempool, new_whiteouts); ++} ++ ++static bool should_compact_bset(struct btree *b, struct bset_tree *t, ++ bool compacting, enum compact_mode mode) ++{ ++ if (!bset_dead_u64s(b, t)) ++ return false; ++ ++ switch (mode) { ++ case COMPACT_LAZY: ++ return should_compact_bset_lazy(b, t) || ++ (compacting && !bset_written(b, bset(b, t))); ++ case COMPACT_ALL: ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_compact_extent_whiteouts(struct bch_fs *c, ++ struct btree *b, ++ enum compact_mode mode) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bset_tree *t; ++ struct bkey_packed *whiteouts = NULL; ++ struct bkey_packed *u_start, *u_pos; ++ struct sort_iter sort_iter; ++ unsigned order, whiteout_u64s = 0, u64s; ++ bool used_mempool, compacting = false; ++ ++ BUG_ON(!btree_node_is_extents(b)); ++ ++ for_each_bset(b, t) ++ if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) ++ whiteout_u64s += bset_dead_u64s(b, t); ++ ++ if (!whiteout_u64s) ++ return false; ++ ++ bch2_sort_whiteouts(c, b); ++ ++ sort_iter_init(&sort_iter, b); ++ ++ whiteout_u64s += b->whiteout_u64s; ++ order = get_order(whiteout_u64s * sizeof(u64)); ++ ++ whiteouts = btree_bounce_alloc(c, order, &used_mempool); ++ u_start = u_pos = whiteouts; ++ ++ memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), ++ b->whiteout_u64s); ++ u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); ++ ++ sort_iter_add(&sort_iter, u_start, u_pos); ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k, *n, *out, *start, *end; ++ struct btree_node_entry *src = NULL, *dst = NULL; ++ ++ if (t != b->set && !bset_written(b, i)) { ++ src = container_of(i, struct btree_node_entry, keys); ++ dst = max(write_block(b), ++ (void *) btree_bkey_last(b, t - 1)); ++ } ++ ++ if (src != dst) ++ compacting = true; ++ ++ if (!should_compact_bset(b, t, compacting, mode)) { ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src) + ++ le16_to_cpu(src->keys.u64s) * ++ sizeof(u64)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ continue; ++ } ++ ++ compacting = true; ++ u_start = u_pos; ++ start = i->start; ++ end = vstruct_last(i); ++ ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ ++ out = i->start; ++ ++ for (k = start; k != end; k = n) { ++ n = 
bkey_next_skip_noops(k, end); ++ ++ if (bkey_deleted(k)) ++ continue; ++ ++ BUG_ON(bkey_whiteout(k) && ++ k->needs_whiteout && ++ bkey_written(b, k)); ++ ++ if (bkey_whiteout(k) && !k->needs_whiteout) ++ continue; ++ ++ if (bkey_whiteout(k)) { ++ memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); ++ set_bkeyp_val_u64s(f, u_pos, 0); ++ u_pos = bkey_next(u_pos); ++ } else { ++ bkey_copy(out, k); ++ out = bkey_next(out); ++ } ++ } ++ ++ sort_iter_add(&sort_iter, u_start, u_pos); ++ ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ } ++ ++ b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; ++ ++ BUG_ON((void *) unwritten_whiteouts_start(c, b) < ++ (void *) btree_bkey_last(b, bset_tree_last(b))); ++ ++ u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), ++ &sort_iter); ++ ++ BUG_ON(u64s > b->whiteout_u64s); ++ BUG_ON(u_pos != whiteouts && !u64s); ++ ++ if (u64s != b->whiteout_u64s) { ++ void *src = unwritten_whiteouts_start(c, b); ++ ++ b->whiteout_u64s = u64s; ++ memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); ++ } ++ ++ verify_no_dups(b, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b), ++ true); ++ ++ btree_bounce_free(c, order, used_mempool, whiteouts); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ bch_btree_keys_u64s_remaining(c, b); ++ bch2_verify_btree_nr_keys(b); ++ ++ return true; ++} ++ ++static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) ++{ ++ struct bset_tree *t; ++ bool ret = false; ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k, *n, *out, *start, *end; ++ struct btree_node_entry *src = NULL, *dst = NULL; ++ ++ if (t != b->set && !bset_written(b, i)) { ++ src = container_of(i, struct btree_node_entry, keys); ++ dst = max(write_block(b), ++ (void *) btree_bkey_last(b, t - 1)); ++ } ++ ++ if (src != dst) ++ ret = true; ++ ++ if (!should_compact_bset(b, t, ret, mode)) { ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src) + ++ le16_to_cpu(src->keys.u64s) * ++ sizeof(u64)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ continue; ++ } ++ ++ start = btree_bkey_first(b, t); ++ end = btree_bkey_last(b, t); ++ ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ ++ out = i->start; ++ ++ for (k = start; k != end; k = n) { ++ n = bkey_next_skip_noops(k, end); ++ ++ if (!bkey_whiteout(k)) { ++ bkey_copy(out, k); ++ out = bkey_next(out); ++ } else { ++ BUG_ON(k->needs_whiteout); ++ } ++ } ++ ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ ret = true; ++ } ++ ++ bch2_verify_btree_nr_keys(b); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return ret; ++} ++ ++bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, ++ enum compact_mode mode) ++{ ++ return !btree_node_old_extent_overwrite(b) ++ ? 
bch2_drop_whiteouts(b, mode) ++ : bch2_compact_extent_whiteouts(c, b, mode); ++} ++ ++static void btree_node_sort(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter, ++ unsigned start_idx, ++ unsigned end_idx, ++ bool filter_whiteouts) ++{ ++ struct btree_node *out; ++ struct sort_iter sort_iter; ++ struct bset_tree *t; ++ struct bset *start_bset = bset(b, &b->set[start_idx]); ++ bool used_mempool = false; ++ u64 start_time, seq = 0; ++ unsigned i, u64s = 0, order, shift = end_idx - start_idx - 1; ++ bool sorting_entire_node = start_idx == 0 && ++ end_idx == b->nsets; ++ ++ sort_iter_init(&sort_iter, b); ++ ++ for (t = b->set + start_idx; ++ t < b->set + end_idx; ++ t++) { ++ u64s += le16_to_cpu(bset(b, t)->u64s); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ } ++ ++ order = sorting_entire_node ++ ? btree_page_order(c) ++ : get_order(__vstruct_bytes(struct btree_node, u64s)); ++ ++ out = btree_bounce_alloc(c, order, &used_mempool); ++ ++ start_time = local_clock(); ++ ++ if (btree_node_old_extent_overwrite(b)) ++ filter_whiteouts = bset_written(b, start_bset); ++ ++ u64s = (btree_node_old_extent_overwrite(b) ++ ? bch2_sort_extents ++ : bch2_sort_keys)(out->keys.start, ++ &sort_iter, ++ filter_whiteouts); ++ ++ out->keys.u64s = cpu_to_le16(u64s); ++ ++ BUG_ON(vstruct_end(&out->keys) > (void *) out + (PAGE_SIZE << order)); ++ ++ if (sorting_entire_node) ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ /* Make sure we preserve bset journal_seq: */ ++ for (t = b->set + start_idx; t < b->set + end_idx; t++) ++ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); ++ start_bset->journal_seq = cpu_to_le64(seq); ++ ++ if (sorting_entire_node) { ++ unsigned u64s = le16_to_cpu(out->keys.u64s); ++ ++ BUG_ON(order != btree_page_order(c)); ++ ++ /* ++ * Our temporary buffer is the same size as the btree node's ++ * buffer, we can just swap buffers instead of doing a big ++ * memcpy() ++ */ ++ *out = *b->data; ++ out->keys.u64s = cpu_to_le16(u64s); ++ swap(out, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ } else { ++ start_bset->u64s = out->keys.u64s; ++ memcpy_u64s(start_bset->start, ++ out->keys.start, ++ le16_to_cpu(out->keys.u64s)); ++ } ++ ++ for (i = start_idx + 1; i < end_idx; i++) ++ b->nr.bset_u64s[start_idx] += ++ b->nr.bset_u64s[i]; ++ ++ b->nsets -= shift; ++ ++ for (i = start_idx + 1; i < b->nsets; i++) { ++ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; ++ b->set[i] = b->set[i + shift]; ++ } ++ ++ for (i = b->nsets; i < MAX_BSETS; i++) ++ b->nr.bset_u64s[i] = 0; ++ ++ set_btree_bset_end(b, &b->set[start_idx]); ++ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); ++ ++ btree_bounce_free(c, order, used_mempool, out); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_btree_sort_into(struct bch_fs *c, ++ struct btree *dst, ++ struct btree *src) ++{ ++ struct btree_nr_keys nr; ++ struct btree_node_iter src_iter; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(dst->nsets != 1); ++ ++ bch2_bset_set_no_aux_tree(dst, dst->set); ++ ++ bch2_btree_node_iter_init_from_start(&src_iter, src); ++ ++ if (btree_node_is_extents(src)) ++ nr = bch2_sort_repack_merge(c, btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); ++ else ++ nr = bch2_sort_repack(btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ set_btree_bset_end(dst, dst->set); ++ ++ dst->nr.live_u64s += nr.live_u64s; ++ 
dst->nr.bset_u64s[0] += nr.bset_u64s[0]; ++ dst->nr.packed_keys += nr.packed_keys; ++ dst->nr.unpacked_keys += nr.unpacked_keys; ++ ++ bch2_verify_btree_nr_keys(dst); ++} ++ ++#define SORT_CRIT (4096 / sizeof(u64)) ++ ++/* ++ * We're about to add another bset to the btree node, so if there's currently ++ * too many bsets - sort some of them together: ++ */ ++static bool btree_node_compact(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ unsigned unwritten_idx; ++ bool ret = false; ++ ++ for (unwritten_idx = 0; ++ unwritten_idx < b->nsets; ++ unwritten_idx++) ++ if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) ++ break; ++ ++ if (b->nsets - unwritten_idx > 1) { ++ btree_node_sort(c, b, iter, unwritten_idx, ++ b->nsets, false); ++ ret = true; ++ } ++ ++ if (unwritten_idx > 1) { ++ btree_node_sort(c, b, iter, 0, unwritten_idx, false); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++void bch2_btree_build_aux_trees(struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ bch2_bset_build_aux_tree(b, t, ++ !bset_written(b, bset(b, t)) && ++ t == bset_tree_last(b)); ++} ++ ++/* ++ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be ++ * inserted into ++ * ++ * Safe to call if there already is an unwritten bset - will only add a new bset ++ * if @b doesn't already have one. ++ * ++ * Returns true if we sorted (i.e. invalidated iterators ++ */ ++void bch2_btree_init_next(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct btree_node_entry *bne; ++ bool did_sort; ++ ++ EBUG_ON(!(b->c.lock.state.seq & 1)); ++ EBUG_ON(iter && iter->l[b->c.level].b != b); ++ ++ did_sort = btree_node_compact(c, b, iter); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ if (iter && did_sort) ++ bch2_btree_iter_reinit_node(iter, b); ++} ++ ++static struct nonce btree_nonce(struct bset *i, unsigned offset) ++{ ++ return (struct nonce) {{ ++ [0] = cpu_to_le32(offset), ++ [1] = ((__le32 *) &i->seq)[0], ++ [2] = ((__le32 *) &i->seq)[1], ++ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, ++ }}; ++} ++ ++static void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) ++{ ++ struct nonce nonce = btree_nonce(i, offset); ++ ++ if (!offset) { ++ struct btree_node *bn = container_of(i, struct btree_node, keys); ++ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; ++ ++ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, ++ bytes); ++ ++ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); ++ } ++ ++ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, ++ vstruct_end(i) - (void *) i->_data); ++} ++ ++static void btree_err_msg(struct printbuf *out, struct bch_fs *c, ++ struct btree *b, struct bset *i, ++ unsigned offset, int write) ++{ ++ pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" ++ "pos ", ++ write ? "before write " : "", ++ b->c.btree_id, b->c.level, ++ c->btree_roots[b->c.btree_id].level); ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ ++ pr_buf(out, " node offset %u", b->written); ++ if (i) ++ pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); ++} ++ ++enum btree_err_type { ++ BTREE_ERR_FIXABLE, ++ BTREE_ERR_WANT_RETRY, ++ BTREE_ERR_MUST_RETRY, ++ BTREE_ERR_FATAL, ++}; ++ ++enum btree_validate_ret { ++ BTREE_RETRY_READ = 64, ++}; ++ ++#define btree_err(type, c, b, i, msg, ...) 
\ ++({ \ ++ __label__ out; \ ++ char _buf[300]; \ ++ struct printbuf out = PBUF(_buf); \ ++ \ ++ btree_err_msg(&out, c, b, i, b->written, write); \ ++ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ ++ \ ++ if (type == BTREE_ERR_FIXABLE && \ ++ write == READ && \ ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ ++ mustfix_fsck_err(c, "%s", _buf); \ ++ goto out; \ ++ } \ ++ \ ++ switch (write) { \ ++ case READ: \ ++ bch_err(c, "%s", _buf); \ ++ \ ++ switch (type) { \ ++ case BTREE_ERR_FIXABLE: \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ case BTREE_ERR_WANT_RETRY: \ ++ if (have_retry) { \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case BTREE_ERR_MUST_RETRY: \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ case BTREE_ERR_FATAL: \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write: %s", _buf); \ ++ \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++out: \ ++ true; \ ++}) ++ ++#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) ++ ++static int validate_bset(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned sectors, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ const char *err; ++ int ret = 0; ++ ++ btree_err_on((version != BCH_BSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, ++ BTREE_ERR_FATAL, c, b, i, ++ "unsupported bset version"); ++ ++ if (btree_err_on(b->written + sectors > c->opts.btree_node_size, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "bset past end of btree node")) { ++ i->u64s = 0; ++ return 0; ++ } ++ ++ btree_err_on(b->written && !i->u64s, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "empty bset"); ++ ++ if (!b->written) { ++ struct btree_node *bn = ++ container_of(i, struct btree_node, keys); ++ /* These indicate that we read the wrong btree node: */ ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ /* XXX endianness */ ++ btree_err_on(bp->seq != bn->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "incorrect sequence number (wrong btree node)"); ++ } ++ ++ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect btree id"); ++ ++ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect level"); ++ ++ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { ++ u64 *p = (u64 *) &bn->ptr; ++ ++ *p = swab64(*p); ++ } ++ ++ if (!write) ++ compat_btree_node(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "incorrect min_key: got %llu:%llu should be %llu:%llu", ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ bp->min_key.inode, ++ bp->min_key.offset); ++ } ++ ++ btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect max key"); ++ ++ if (write) ++ compat_btree_node(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ ++ /* XXX: ideally we would be validating min_key too */ ++#if 0 ++ /* ++ * not correct anymore, due to btree node write error ++ * handling ++ * ++ * need to add bn->seq to btree keys and 
verify ++ * against that ++ */ ++ btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), ++ bn->ptr), ++ BTREE_ERR_FATAL, c, b, i, ++ "incorrect backpointer"); ++#endif ++ err = bch2_bkey_format_validate(&bn->format); ++ btree_err_on(err, ++ BTREE_ERR_FATAL, c, b, i, ++ "invalid bkey format: %s", err); ++ ++ compat_bformat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &bn->format); ++ } ++fsck_err: ++ return ret; ++} ++ ++static int validate_bset_keys(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned *whiteout_u64s, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ struct bkey_packed *k, *prev = NULL; ++ bool seen_non_whiteout = false; ++ int ret = 0; ++ ++ if (!BSET_SEPARATE_WHITEOUTS(i)) { ++ seen_non_whiteout = true; ++ *whiteout_u64s = 0; ++ } ++ ++ for (k = i->start; ++ k != vstruct_last(i);) { ++ struct bkey_s u; ++ struct bkey tmp; ++ const char *invalid; ++ ++ if (btree_err_on(bkey_next(k) > vstruct_last(i), ++ BTREE_ERR_FIXABLE, c, b, i, ++ "key extends past end of bset")) { ++ i->u64s = cpu_to_le16((u64 *) k - i->_data); ++ break; ++ } ++ ++ if (btree_err_on(k->format > KEY_FORMAT_CURRENT, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey format %u", k->format)) { ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ /* XXX: validate k->u64s */ ++ if (!write) ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); ++ ++ u = __bkey_disassemble(b, k, &tmp); ++ ++ invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, u.s_c) ?: ++ (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ btree_err(BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey:\n%s\n%s", invalid, buf); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ if (write) ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); ++ ++ /* ++ * with the separate whiteouts thing (used for extents), the ++ * second set of keys actually can have whiteouts too, so we ++ * can't solely go off bkey_whiteout()... 
++ */ ++ ++ if (!seen_non_whiteout && ++ (!bkey_whiteout(k) || ++ (prev && bkey_iter_cmp(b, prev, k) > 0))) { ++ *whiteout_u64s = k->_data - i->_data; ++ seen_non_whiteout = true; ++ } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { ++ char buf1[80]; ++ char buf2[80]; ++ struct bkey up = bkey_unpack_key(b, prev); ++ ++ bch2_bkey_to_text(&PBUF(buf1), &up); ++ bch2_bkey_to_text(&PBUF(buf2), u.k); ++ ++ bch2_dump_bset(c, b, i, 0); ++ btree_err(BTREE_ERR_FATAL, c, b, i, ++ "keys out of order: %s > %s", ++ buf1, buf2); ++ /* XXX: repair this */ ++ } ++ ++ prev = k; ++ k = bkey_next_skip_noops(k, vstruct_last(i)); ++ } ++fsck_err: ++ return ret; ++} ++ ++int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) ++{ ++ struct btree_node_entry *bne; ++ struct sort_iter *iter; ++ struct btree_node *sorted; ++ struct bkey_packed *k; ++ struct bch_extent_ptr *ptr; ++ struct bset *i; ++ bool used_mempool, blacklisted; ++ unsigned u64s; ++ int ret, retry_read = 0, write = READ; ++ ++ iter = mempool_alloc(&c->fill_iter, GFP_NOIO); ++ sort_iter_init(iter, b); ++ iter->size = (btree_blocks(c) + 1) * 2; ++ ++ if (bch2_meta_read_fault("btree")) ++ btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "dynamic fault"); ++ ++ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "bad magic"); ++ ++ btree_err_on(!b->data->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "bad btree header"); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ btree_err_on(b->data->keys.seq != bp->seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "got wrong btree node (seq %llx want %llx)", ++ b->data->keys.seq, bp->seq); ++ } ++ ++ while (b->written < c->opts.btree_node_size) { ++ unsigned sectors, whiteout_u64s = 0; ++ struct nonce nonce; ++ struct bch_csum csum; ++ bool first = !b->written; ++ ++ if (!b->written) { ++ i = &b->data->keys; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "unknown checksum type"); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); ++ ++ btree_err_on(bch2_crc_cmp(csum, b->data->csum), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "invalid checksum"); ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ if (btree_node_is_extents(b) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { ++ set_btree_node_old_extent_overwrite(b); ++ set_btree_node_need_rewrite(b); ++ } ++ ++ sectors = vstruct_sectors(b->data, c->block_bits); ++ } else { ++ bne = write_block(b); ++ i = &bne->keys; ++ ++ if (i->seq != b->data->keys.seq) ++ break; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "unknown checksum type"); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ btree_err_on(bch2_crc_cmp(csum, bne->csum), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "invalid checksum"); ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ sectors = vstruct_sectors(bne, c->block_bits); ++ } ++ ++ ret = validate_bset(c, b, i, sectors, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ if (!b->written) ++ btree_node_set_format(b, b->data->format); ++ ++ ret = validate_bset_keys(c, b, i, &whiteout_u64s, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ b->written += sectors; ++ ++ blacklisted = bch2_journal_seq_is_blacklisted(c, ++ 
le64_to_cpu(i->journal_seq), ++ true); ++ ++ btree_err_on(blacklisted && first, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "first btree node bset has blacklisted journal seq"); ++ if (blacklisted && !first) ++ continue; ++ ++ sort_iter_add(iter, i->start, ++ vstruct_idx(i, whiteout_u64s)); ++ ++ sort_iter_add(iter, ++ vstruct_idx(i, whiteout_u64s), ++ vstruct_last(i)); ++ } ++ ++ for (bne = write_block(b); ++ bset_byte_offset(b, bne) < btree_bytes(c); ++ bne = (void *) bne + block_bytes(c)) ++ btree_err_on(bne->keys.seq == b->data->keys.seq, ++ BTREE_ERR_WANT_RETRY, c, b, NULL, ++ "found bset signature after last bset"); ++ ++ sorted = btree_bounce_alloc(c, btree_page_order(c), &used_mempool); ++ sorted->keys.u64s = 0; ++ ++ set_btree_bset(b, b->set, &b->data->keys); ++ ++ b->nr = (btree_node_old_extent_overwrite(b) ++ ? bch2_extent_sort_fix_overlapping ++ : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); ++ ++ u64s = le16_to_cpu(sorted->keys.u64s); ++ *sorted = *b->data; ++ sorted->keys.u64s = cpu_to_le16(u64s); ++ swap(sorted, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ b->nsets = 1; ++ ++ BUG_ON(b->nr.live_u64s != u64s); ++ ++ btree_bounce_free(c, btree_page_order(c), used_mempool, sorted); ++ ++ i = &b->data->keys; ++ for (k = i->start; k != vstruct_last(i);) { ++ struct bkey tmp; ++ struct bkey_s u = __bkey_disassemble(b, k, &tmp); ++ const char *invalid = bch2_bkey_val_invalid(c, u.s_c); ++ ++ if (invalid || ++ (inject_invalid_keys(c) && ++ !bversion_cmp(u.k->version, MAX_VERSION))) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ btree_err(BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey %s: %s", buf, invalid); ++ ++ btree_keys_account_key_drop(&b->nr, 0, k); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ set_btree_bset_end(b, b->set); ++ continue; ++ } ++ ++ if (u.k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); ++ ++ bp.v->mem_ptr = 0; ++ } ++ ++ k = bkey_next_skip_noops(k, vstruct_last(i)); ++ } ++ ++ bch2_bset_build_aux_tree(b, b->set, false); ++ ++ set_needs_whiteout(btree_bset_first(b), true); ++ ++ btree_node_reset_sib_u64s(b); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_RW) ++ set_btree_node_need_rewrite(b); ++ } ++out: ++ mempool_free(iter, &c->fill_iter); ++ return retry_read; ++fsck_err: ++ if (ret == BTREE_RETRY_READ) { ++ retry_read = 1; ++ } else { ++ bch2_inconsistent_error(c); ++ set_btree_node_read_error(b); ++ } ++ goto out; ++} ++ ++static void btree_node_read_work(struct work_struct *work) ++{ ++ struct btree_read_bio *rb = ++ container_of(work, struct btree_read_bio, work); ++ struct bch_fs *c = rb->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ struct btree *b = rb->bio.bi_private; ++ struct bio *bio = &rb->bio; ++ struct bch_io_failures failed = { .nr = 0 }; ++ bool can_retry; ++ ++ goto start; ++ while (1) { ++ bch_info(c, "retrying read"); ++ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ bio_reset(bio); ++ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; ++ bio->bi_iter.bi_sector = rb->pick.ptr.offset; ++ bio->bi_iter.bi_size = btree_bytes(c); ++ ++ if (rb->have_ioref) { ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ submit_bio_wait(bio); ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ 
} ++start: ++ bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", ++ blk_status_to_str(bio->bi_status)); ++ if (rb->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ rb->have_ioref = false; ++ ++ bch2_mark_io_failure(&failed, &rb->pick); ++ ++ can_retry = bch2_bkey_pick_read_device(c, ++ bkey_i_to_s_c(&b->key), ++ &failed, &rb->pick) > 0; ++ ++ if (!bio->bi_status && ++ !bch2_btree_node_read_done(c, b, can_retry)) ++ break; ++ ++ if (!can_retry) { ++ set_btree_node_read_error(b); ++ break; ++ } ++ } ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], ++ rb->start_time); ++ bio_put(&rb->bio); ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++} ++ ++static void btree_node_read_endio(struct bio *bio) ++{ ++ struct btree_read_bio *rb = ++ container_of(bio, struct btree_read_bio, bio); ++ struct bch_fs *c = rb->c; ++ ++ if (rb->have_ioref) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ bch2_latency_acct(ca, rb->start_time, READ); ++ } ++ ++ queue_work(system_unbound_wq, &rb->work); ++} ++ ++void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ++ bool sync) ++{ ++ struct extent_ptr_decoded pick; ++ struct btree_read_bio *rb; ++ struct bch_dev *ca; ++ struct bio *bio; ++ int ret; ++ ++ trace_btree_read(c, b); ++ ++ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ++ NULL, &pick); ++ if (bch2_fs_fatal_err_on(ret <= 0, c, ++ "btree node read error: no device to read from")) { ++ set_btree_node_read_error(b); ++ return; ++ } ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, ++ btree_bytes(c)), ++ &c->btree_bio); ++ rb = container_of(bio, struct btree_read_bio, bio); ++ rb->c = c; ++ rb->start_time = local_clock(); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ rb->pick = pick; ++ INIT_WORK(&rb->work, btree_node_read_work); ++ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bio->bi_end_io = btree_node_read_endio; ++ bio->bi_private = b; ++ bch2_bio_map(bio, b->data, btree_bytes(c)); ++ ++ set_btree_node_read_in_flight(b); ++ ++ if (rb->have_ioref) { ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_BTREE], ++ bio_sectors(bio)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ ++ if (sync) { ++ submit_bio_wait(bio); ++ ++ bio->bi_private = b; ++ btree_node_read_work(&rb->work); ++ } else { ++ submit_bio(bio); ++ } ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ ++ if (sync) ++ btree_node_read_work(&rb->work); ++ else ++ queue_work(system_unbound_wq, &rb->work); ++ ++ } ++} ++ ++int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, ++ const struct bkey_i *k, unsigned level) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ BUG_ON(IS_ERR(b)); ++ ++ bkey_copy(&b->key, k); ++ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); ++ ++ bch2_btree_node_read(c, b, true); ++ ++ if (btree_node_read_error(b)) { ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_btree_set_root_for_read(c, b); ++err: ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ return ret; 
++} ++ ++void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, ++ struct btree_write *w) ++{ ++ unsigned long old, new, v = READ_ONCE(b->will_make_reachable); ++ ++ do { ++ old = new = v; ++ if (!(old & 1)) ++ break; ++ ++ new &= ~1UL; ++ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); ++ ++ if (old & 1) ++ closure_put(&((struct btree_update *) new)->cl); ++ ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++} ++ ++static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_write *w = btree_prev_write(b); ++ ++ bch2_btree_complete_write(c, b, w); ++ btree_node_io_unlock(b); ++} ++ ++static void bch2_btree_node_write_error(struct bch_fs *c, ++ struct btree_write_bio *wbio) ++{ ++ struct btree *b = wbio->wbio.bio.bi_private; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ struct bch_extent_ptr *ptr; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, ++ BTREE_MAX_DEPTH, b->c.level, 0); ++retry: ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto err; ++ ++ /* has node been freed? */ ++ if (iter->l[b->c.level].b != b) { ++ /* node has been freed: */ ++ BUG_ON(!btree_node_dying(b)); ++ goto out; ++ } ++ ++ BUG_ON(!btree_node_hashed(b)); ++ ++ bkey_copy(&tmp.k, &b->key); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, ++ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); ++ ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) ++ goto err; ++ ++ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); ++ if (ret == -EINTR) ++ goto retry; ++ if (ret) ++ goto err; ++out: ++ bch2_trans_exit(&trans); ++ bio_put(&wbio->wbio.bio); ++ btree_node_write_done(c, b); ++ return; ++err: ++ set_btree_node_noevict(b); ++ bch2_fs_fatal_error(c, "fatal error writing btree node"); ++ goto out; ++} ++ ++void bch2_btree_write_error_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ btree_write_error_work); ++ struct bio *bio; ++ ++ while (1) { ++ spin_lock_irq(&c->btree_write_error_lock); ++ bio = bio_list_pop(&c->btree_write_error_list); ++ spin_unlock_irq(&c->btree_write_error_lock); ++ ++ if (!bio) ++ break; ++ ++ bch2_btree_node_write_error(c, ++ container_of(bio, struct btree_write_bio, wbio.bio)); ++ } ++} ++ ++static void btree_node_write_work(struct work_struct *work) ++{ ++ struct btree_write_bio *wbio = ++ container_of(work, struct btree_write_bio, work); ++ struct bch_fs *c = wbio->wbio.c; ++ struct btree *b = wbio->wbio.bio.bi_private; ++ ++ btree_bounce_free(c, ++ wbio->wbio.order, ++ wbio->wbio.used_mempool, ++ wbio->data); ++ ++ if (wbio->wbio.failed.nr) { ++ unsigned long flags; ++ ++ spin_lock_irqsave(&c->btree_write_error_lock, flags); ++ bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); ++ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); ++ ++ queue_work(c->wq, &c->btree_write_error_work); ++ return; ++ } ++ ++ bio_put(&wbio->wbio.bio); ++ btree_node_write_done(c, b); ++} ++ ++static void btree_node_write_endio(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; ++ struct bch_write_bio *orig = parent ?: wbio; ++ struct bch_fs *c = wbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ unsigned long flags; ++ ++ if (wbio->have_ioref) ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", ++ blk_status_to_str(bio->bi_status)) || ++ bch2_meta_write_fault("btree")) { ++ spin_lock_irqsave(&c->btree_write_error_lock, flags); ++ bch2_dev_list_add_dev(&orig->failed, wbio->dev); ++ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); ++ } ++ ++ if (wbio->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ ++ if (parent) { ++ bio_put(bio); ++ bio_endio(&parent->bio); ++ } else { ++ struct btree_write_bio *wb = ++ container_of(orig, struct btree_write_bio, wbio); ++ ++ INIT_WORK(&wb->work, btree_node_write_work); ++ queue_work(system_unbound_wq, &wb->work); ++ } ++} ++ ++static int validate_bset_for_write(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned sectors) ++{ ++ unsigned whiteout_u64s = 0; ++ int ret; ++ ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) ++ return -1; ++ ++ ret = validate_bset(c, b, i, sectors, WRITE, false) ?: ++ validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); ++ if (ret) ++ bch2_inconsistent_error(c); ++ ++ return ret; ++} ++ ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_type_held) ++{ ++ struct btree_write_bio *wbio; ++ struct bset_tree *t; ++ struct bset *i; ++ struct btree_node *bn = NULL; ++ struct btree_node_entry *bne = NULL; ++ BKEY_PADDED(key) k; ++ struct bch_extent_ptr *ptr; ++ struct sort_iter sort_iter; ++ struct nonce nonce; ++ unsigned bytes_to_write, sectors_to_write, order, bytes, u64s; ++ u64 seq = 0; ++ bool used_mempool; ++ unsigned long old, new; ++ bool validate_before_checksum = false; ++ void *data; ++ ++ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ return; ++ ++ /* ++ * We may only have a read lock on the btree node - the dirty bit is our ++ * "lock" against racing with other threads that may be trying to start ++ * a write, we do a write iff we clear the dirty bit. Since setting the ++ * dirty bit requires a write lock, we can't race with other threads ++ * redirtying it: ++ */ ++ do { ++ old = new = READ_ONCE(b->flags); ++ ++ if (!(old & (1 << BTREE_NODE_dirty))) ++ return; ++ ++ if (!btree_node_may_write(b)) ++ return; ++ ++ if (old & (1 << BTREE_NODE_write_in_flight)) { ++ btree_node_wait_on_io(b); ++ continue; ++ } ++ ++ new &= ~(1 << BTREE_NODE_dirty); ++ new &= ~(1 << BTREE_NODE_need_write); ++ new |= (1 << BTREE_NODE_write_in_flight); ++ new |= (1 << BTREE_NODE_just_written); ++ new ^= (1 << BTREE_NODE_write_idx); ++ } while (cmpxchg_acquire(&b->flags, old, new) != old); ++ ++ BUG_ON(btree_node_fake(b)); ++ BUG_ON((b->will_make_reachable != 0) != !b->written); ++ ++ BUG_ON(b->written >= c->opts.btree_node_size); ++ BUG_ON(b->written & (c->opts.block_size - 1)); ++ BUG_ON(bset_written(b, btree_bset_last(b))); ++ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); ++ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); ++ ++ bch2_sort_whiteouts(c, b); ++ ++ sort_iter_init(&sort_iter, b); ++ ++ bytes = !b->written ++ ? 
sizeof(struct btree_node) ++ : sizeof(struct btree_node_entry); ++ ++ bytes += b->whiteout_u64s * sizeof(u64); ++ ++ for_each_bset(b, t) { ++ i = bset(b, t); ++ ++ if (bset_written(b, i)) ++ continue; ++ ++ bytes += le16_to_cpu(i->u64s) * sizeof(u64); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ seq = max(seq, le64_to_cpu(i->journal_seq)); ++ } ++ ++ order = get_order(bytes); ++ data = btree_bounce_alloc(c, order, &used_mempool); ++ ++ if (!b->written) { ++ bn = data; ++ *bn = *b->data; ++ i = &bn->keys; ++ } else { ++ bne = data; ++ bne->keys = b->data->keys; ++ i = &bne->keys; ++ } ++ ++ i->journal_seq = cpu_to_le64(seq); ++ i->u64s = 0; ++ ++ if (!btree_node_old_extent_overwrite(b)) { ++ sort_iter_add(&sort_iter, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b)); ++ SET_BSET_SEPARATE_WHITEOUTS(i, false); ++ } else { ++ memcpy_u64s(i->start, ++ unwritten_whiteouts_start(c, b), ++ b->whiteout_u64s); ++ i->u64s = cpu_to_le16(b->whiteout_u64s); ++ SET_BSET_SEPARATE_WHITEOUTS(i, true); ++ } ++ ++ b->whiteout_u64s = 0; ++ ++ u64s = btree_node_old_extent_overwrite(b) ++ ? bch2_sort_extents(vstruct_last(i), &sort_iter, false) ++ : bch2_sort_keys(i->start, &sort_iter, false); ++ le16_add_cpu(&i->u64s, u64s); ++ ++ set_needs_whiteout(i, false); ++ ++ /* do we have data to write? */ ++ if (b->written && !i->u64s) ++ goto nowrite; ++ ++ bytes_to_write = vstruct_end(i) - data; ++ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; ++ ++ memset(data + bytes_to_write, 0, ++ (sectors_to_write << 9) - bytes_to_write); ++ ++ BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); ++ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); ++ BUG_ON(i->seq != b->data->keys.seq); ++ ++ i->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ ? cpu_to_le16(BCH_BSET_VERSION_OLD) ++ : cpu_to_le16(c->sb.version); ++ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); ++ ++ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) ++ validate_before_checksum = true; ++ ++ /* validate_bset will be modifying: */ ++ if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) ++ validate_before_checksum = true; ++ ++ /* if we're going to be encrypting, check metadata validity first: */ ++ if (validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ ++ if (bn) ++ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); ++ else ++ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ /* if we're not encrypting, check metadata after checksumming: */ ++ if (!validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ /* ++ * We handle btree write errors by immediately halting the journal - ++ * after we've done that, we can't issue any subsequent btree writes ++ * because they might have pointers to new nodes that failed to write. 
++ * ++ * Furthermore, there's no point in doing any more btree writes because ++ * with the journal stopped, we're never going to update the journal to ++ * reflect that those writes were done and the data flushed from the ++ * journal: ++ * ++ * Also on journal error, the pending write may have updates that were ++ * never journalled (interior nodes, see btree_update_nodes_written()) - ++ * it's critical that we don't do the write in that case otherwise we ++ * will have updates visible that weren't in the journal: ++ * ++ * Make sure to update b->written so bch2_btree_init_next() doesn't ++ * break: ++ */ ++ if (bch2_journal_error(&c->journal) || ++ c->opts.nochanges) ++ goto err; ++ ++ trace_btree_write(b, bytes_to_write, sectors_to_write); ++ ++ wbio = container_of(bio_alloc_bioset(GFP_NOIO, ++ buf_pages(data, sectors_to_write << 9), ++ &c->btree_bio), ++ struct btree_write_bio, wbio.bio); ++ wbio_init(&wbio->wbio.bio); ++ wbio->data = data; ++ wbio->wbio.order = order; ++ wbio->wbio.used_mempool = used_mempool; ++ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; ++ wbio->wbio.bio.bi_end_io = btree_node_write_endio; ++ wbio->wbio.bio.bi_private = b; ++ ++ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); ++ ++ /* ++ * If we're appending to a leaf node, we don't technically need FUA - ++ * this write just needs to be persisted before the next journal write, ++ * which will be marked FLUSH|FUA. ++ * ++ * Similarly if we're writing a new btree root - the pointer is going to ++ * be in the next journal entry. ++ * ++ * But if we're writing a new btree node (that isn't a root) or ++ * appending to a non leaf btree node, we need either FUA or a flush ++ * when we write the parent with the new pointer. FUA is cheaper than a ++ * flush, and writes appending to leaf nodes aren't blocking anything so ++ * just make all btree node writes FUA to keep things sane. ++ */ ++ ++ bkey_copy(&k.key, &b->key); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) ++ ptr->offset += b->written; ++ ++ b->written += sectors_to_write; ++ ++ /* XXX: submitting IO with btree locks held: */ ++ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_BTREE, &k.key); ++ return; ++err: ++ set_btree_node_noevict(b); ++ b->written += sectors_to_write; ++nowrite: ++ btree_bounce_free(c, order, used_mempool, data); ++ btree_node_write_done(c, b); ++} ++ ++/* ++ * Work that must be done with write lock held: ++ */ ++bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) ++{ ++ bool invalidated_iter = false; ++ struct btree_node_entry *bne; ++ struct bset_tree *t; ++ ++ if (!btree_node_just_written(b)) ++ return false; ++ ++ BUG_ON(b->whiteout_u64s); ++ ++ clear_btree_node_just_written(b); ++ ++ /* ++ * Note: immediately after write, bset_written() doesn't work - the ++ * amount of data we had to write after compaction might have been ++ * smaller than the offset of the last bset. 
++ * ++ * However, we know that all bsets have been written here, as long as ++ * we're still holding the write lock: ++ */ ++ ++ /* ++ * XXX: decide if we really want to unconditionally sort down to a ++ * single bset: ++ */ ++ if (b->nsets > 1) { ++ btree_node_sort(c, b, NULL, 0, b->nsets, true); ++ invalidated_iter = true; ++ } else { ++ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); ++ } ++ ++ for_each_bset(b, t) ++ set_needs_whiteout(bset(b, t), true); ++ ++ bch2_btree_verify(c, b); ++ ++ /* ++ * If later we don't unconditionally sort down to a single bset, we have ++ * to ensure this is still true: ++ */ ++ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return invalidated_iter; ++} ++ ++/* ++ * Use this one if the node is intent locked: ++ */ ++void bch2_btree_node_write(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_type_held) ++{ ++ BUG_ON(lock_type_held == SIX_LOCK_write); ++ ++ if (lock_type_held == SIX_LOCK_intent || ++ six_lock_tryupgrade(&b->c.lock)) { ++ __bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ ++ /* don't cycle lock unnecessarily: */ ++ if (btree_node_just_written(b) && ++ six_trylock_write(&b->c.lock)) { ++ bch2_btree_post_write_cleanup(c, b); ++ six_unlock_write(&b->c.lock); ++ } ++ ++ if (lock_type_held == SIX_LOCK_read) ++ six_lock_downgrade(&b->c.lock); ++ } else { ++ __bch2_btree_node_write(c, b, SIX_LOCK_read); ++ } ++} ++ ++static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++restart: ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) ++ if (test_bit(flag, &b->flags)) { ++ rcu_read_unlock(); ++ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); ++ goto restart; ++ ++ } ++ rcu_read_unlock(); ++} ++ ++void bch2_btree_flush_all_reads(struct bch_fs *c) ++{ ++ __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); ++} ++ ++void bch2_btree_flush_all_writes(struct bch_fs *c) ++{ ++ __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); ++} ++ ++void bch2_btree_verify_flushed(struct bch_fs *c) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++ ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) { ++ unsigned long flags = READ_ONCE(b->flags); ++ ++ BUG_ON((flags & (1 << BTREE_NODE_dirty)) || ++ (flags & (1 << BTREE_NODE_write_in_flight))); ++ } ++ rcu_read_unlock(); ++} ++ ++ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *c, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++ ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) { ++ unsigned long flags = READ_ONCE(b->flags); ++ ++ if (!(flags & (1 << BTREE_NODE_dirty))) ++ continue; ++ ++ pr_buf(&out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", ++ b, ++ (flags & (1 << BTREE_NODE_dirty)) != 0, ++ (flags & (1 << BTREE_NODE_need_write)) != 0, ++ b->c.level, ++ b->written, ++ !list_empty_careful(&b->write_blocked), ++ b->will_make_reachable != 0, ++ b->will_make_reachable & 1); ++ } ++ rcu_read_unlock(); ++ ++ return out.pos - buf; ++} +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +new file mode 100644 +index 000000000000..f3d7ec749b61 +--- /dev/null ++++ b/fs/bcachefs/btree_io.h +@@ -0,0 +1,190 @@ ++/* SPDX-License-Identifier: GPL-2.0 
*/ ++#ifndef _BCACHEFS_BTREE_IO_H ++#define _BCACHEFS_BTREE_IO_H ++ ++#include "bkey_methods.h" ++#include "bset.h" ++#include "btree_locking.h" ++#include "extents.h" ++#include "io_types.h" ++ ++struct bch_fs; ++struct btree_write; ++struct btree; ++struct btree_iter; ++ ++struct btree_read_bio { ++ struct bch_fs *c; ++ u64 start_time; ++ unsigned have_ioref:1; ++ struct extent_ptr_decoded pick; ++ struct work_struct work; ++ struct bio bio; ++}; ++ ++struct btree_write_bio { ++ void *data; ++ struct work_struct work; ++ struct bch_write_bio wbio; ++}; ++ ++static inline void btree_node_io_unlock(struct btree *b) ++{ ++ EBUG_ON(!btree_node_write_in_flight(b)); ++ clear_btree_node_write_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} ++ ++static inline void btree_node_io_lock(struct btree *b) ++{ ++ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++static inline void btree_node_wait_on_io(struct btree *b) ++{ ++ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++static inline bool btree_node_may_write(struct btree *b) ++{ ++ return list_empty_careful(&b->write_blocked) && ++ (!b->written || !b->will_make_reachable); ++} ++ ++enum compact_mode { ++ COMPACT_LAZY, ++ COMPACT_ALL, ++}; ++ ++bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, ++ enum compact_mode); ++ ++static inline bool should_compact_bset_lazy(struct btree *b, ++ struct bset_tree *t) ++{ ++ unsigned total_u64s = bset_u64s(t); ++ unsigned dead_u64s = bset_dead_u64s(b, t); ++ ++ return dead_u64s > 64 && dead_u64s * 3 > total_u64s; ++} ++ ++static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (should_compact_bset_lazy(b, t)) ++ return bch2_compact_whiteouts(c, b, COMPACT_LAZY); ++ ++ return false; ++} ++ ++void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); ++ ++void bch2_btree_build_aux_trees(struct btree *); ++void bch2_btree_init_next(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++ ++int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); ++void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); ++int bch2_btree_root_read(struct bch_fs *, enum btree_id, ++ const struct bkey_i *, unsigned); ++ ++void bch2_btree_complete_write(struct bch_fs *, struct btree *, ++ struct btree_write *); ++void bch2_btree_write_error_work(struct work_struct *); ++ ++void __bch2_btree_node_write(struct bch_fs *, struct btree *, ++ enum six_lock_type); ++bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); ++ ++void bch2_btree_node_write(struct bch_fs *, struct btree *, ++ enum six_lock_type); ++ ++static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_held) ++{ ++ while (b->written && ++ btree_node_need_write(b) && ++ btree_node_may_write(b)) { ++ if (!btree_node_write_in_flight(b)) { ++ bch2_btree_node_write(c, b, lock_held); ++ break; ++ } ++ ++ six_unlock_type(&b->c.lock, lock_held); ++ btree_node_wait_on_io(b); ++ btree_node_lock_type(c, b, lock_held); ++ } ++} ++ ++#define bch2_btree_node_write_cond(_c, _b, cond) \ ++do { \ ++ unsigned long old, new, v = READ_ONCE((_b)->flags); \ ++ \ ++ do { \ ++ old = new = v; \ ++ \ ++ if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ ++ break; \ ++ \ ++ new |= (1 << BTREE_NODE_need_write); \ ++ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ ++ \ ++ 
btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ ++} while (0) ++ ++void bch2_btree_flush_all_reads(struct bch_fs *); ++void bch2_btree_flush_all_writes(struct bch_fs *); ++void bch2_btree_verify_flushed(struct bch_fs *); ++ssize_t bch2_dirty_btree_nodes_print(struct bch_fs *, char *); ++ ++static inline void compat_bformat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bkey_format *f) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) { ++ swap(f->bits_per_field[BKEY_FIELD_INODE], ++ f->bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(f->field_offset[BKEY_FIELD_INODE], ++ f->field_offset[BKEY_FIELD_OFFSET]); ++ } ++} ++ ++static inline void compat_bpos(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bpos *p) ++{ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bpos_swab(p); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) ++ swap(p->inode, p->offset); ++} ++ ++static inline void compat_btree_node(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct btree_node *bn) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bn->min_key, POS_MIN) && ++ write) ++ bn->min_key = bkey_predecessor(bn->min_key); ++ ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bn->min_key, POS_MIN) && ++ !write) ++ bn->min_key = bkey_successor(bn->min_key); ++} ++ ++#endif /* _BCACHEFS_BTREE_IO_H */ +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +new file mode 100644 +index 000000000000..6fab76c3220c +--- /dev/null ++++ b/fs/bcachefs/btree_iter.c +@@ -0,0 +1,2445 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "debug.h" ++#include "extents.h" ++#include "journal.h" ++ ++#include ++#include ++ ++static inline bool is_btree_node(struct btree_iter *iter, unsigned l) ++{ ++ return l < BTREE_MAX_DEPTH && ++ (unsigned long) iter->l[l].b >= 128; ++} ++ ++static inline struct bpos btree_iter_search_key(struct btree_iter *iter) ++{ ++ struct bpos pos = iter->pos; ++ ++ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ bkey_cmp(pos, POS_MAX)) ++ pos = bkey_successor(pos); ++ return pos; ++} ++ ++static inline bool btree_iter_pos_before_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; ++} ++ ++static inline bool btree_iter_pos_after_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; ++} ++ ++static inline bool btree_iter_pos_in_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return iter->btree_id == b->c.btree_id && ++ !btree_iter_pos_before_node(iter, b) && ++ !btree_iter_pos_after_node(iter, b); ++} ++ ++/* Btree node locking: */ ++ ++void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) ++{ ++ bch2_btree_node_unlock_write_inlined(b, iter); ++} ++ ++void __bch2_btree_node_lock_write(struct btree *b, 
struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ unsigned readers = 0; ++ ++ EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked->l[b->c.level].b == b && ++ btree_node_read_locked(linked, b->c.level)) ++ readers++; ++ ++ /* ++ * Must drop our read locks before calling six_lock_write() - ++ * six_unlock() won't do wakeups until the reader count ++ * goes to 0, and it's safe because we have the node intent ++ * locked: ++ */ ++ atomic64_sub(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); ++ atomic64_add(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++} ++ ++bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) ++{ ++ struct btree *b = btree_iter_node(iter, level); ++ int want = __btree_lock_want(iter, level); ++ ++ if (!is_btree_node(iter, level)) ++ return false; ++ ++ if (race_fault()) ++ return false; ++ ++ if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || ++ (btree_node_lock_seq_matches(iter, b, level) && ++ btree_node_lock_increment(iter->trans, b, level, want))) { ++ mark_btree_node_locked(iter, level, want); ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) ++{ ++ struct btree *b = iter->l[level].b; ++ ++ EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); ++ ++ if (!is_btree_node(iter, level)) ++ return false; ++ ++ if (btree_node_intent_locked(iter, level)) ++ return true; ++ ++ if (race_fault()) ++ return false; ++ ++ if (btree_node_locked(iter, level) ++ ? six_lock_tryupgrade(&b->c.lock) ++ : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) ++ goto success; ++ ++ if (btree_node_lock_seq_matches(iter, b, level) && ++ btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { ++ btree_node_unlock(iter, level); ++ goto success; ++ } ++ ++ return false; ++success: ++ mark_btree_node_intent_locked(iter, level); ++ return true; ++} ++ ++static inline bool btree_iter_get_locks(struct btree_iter *iter, ++ bool upgrade, bool trace) ++{ ++ unsigned l = iter->level; ++ int fail_idx = -1; ++ ++ do { ++ if (!btree_iter_node(iter, l)) ++ break; ++ ++ if (!(upgrade ++ ? bch2_btree_node_upgrade(iter, l) ++ : bch2_btree_node_relock(iter, l))) { ++ if (trace) ++ (upgrade ++ ? trace_node_upgrade_fail ++ : trace_node_relock_fail)(l, iter->l[l].lock_seq, ++ is_btree_node(iter, l) ++ ? 0 ++ : (unsigned long) iter->l[l].b, ++ is_btree_node(iter, l) ++ ? iter->l[l].b->c.lock.state.seq ++ : 0); ++ ++ fail_idx = l; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ } ++ ++ l++; ++ } while (l < iter->locks_want); ++ ++ /* ++ * When we fail to get a lock, we have to ensure that any child nodes ++ * can't be relocked so bch2_btree_iter_traverse has to walk back up to ++ * the node that we failed to relock: ++ */ ++ while (fail_idx >= 0) { ++ btree_node_unlock(iter, fail_idx); ++ iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ --fail_idx; ++ } ++ ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ return iter->uptodate < BTREE_ITER_NEED_RELOCK; ++} ++ ++static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, ++ enum btree_iter_type type) ++{ ++ return type != BTREE_ITER_CACHED ++ ? 
container_of(_b, struct btree, c)->key.k.p ++ : container_of(_b, struct bkey_cached, c)->key.pos; ++} ++ ++/* Slowpath: */ ++bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, ++ unsigned level, struct btree_iter *iter, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, ++ void *p) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_iter *linked; ++ u64 start_time = local_clock(); ++ bool ret = true; ++ ++ /* Check if it's safe to block: */ ++ trans_for_each_iter(trans, linked) { ++ if (!linked->nodes_locked) ++ continue; ++ ++ /* ++ * Can't block taking an intent lock if we have _any_ nodes read ++ * locked: ++ * ++ * - Our read lock blocks another thread with an intent lock on ++ * the same node from getting a write lock, and thus from ++ * dropping its intent lock ++ * ++ * - And the other thread may have multiple nodes intent locked: ++ * both the node we want to intent lock, and the node we ++ * already have read locked - deadlock: ++ */ ++ if (type == SIX_LOCK_intent && ++ linked->nodes_locked != linked->nodes_intent_locked) { ++ if (!(trans->nounlock)) { ++ linked->locks_want = max_t(unsigned, ++ linked->locks_want, ++ __fls(linked->nodes_locked) + 1); ++ if (!btree_iter_get_locks(linked, true, false)) ++ ret = false; ++ } else { ++ ret = false; ++ } ++ } ++ ++ /* ++ * Interior nodes must be locked before their descendants: if ++ * another iterator has possible descendants locked of the node ++ * we're about to lock, it must have the ancestors locked too: ++ */ ++ if (linked->btree_id == iter->btree_id && ++ level > __fls(linked->nodes_locked)) { ++ if (!(trans->nounlock)) { ++ linked->locks_want = ++ max(level + 1, max_t(unsigned, ++ linked->locks_want, ++ iter->locks_want)); ++ if (!btree_iter_get_locks(linked, true, false)) ++ ret = false; ++ } else { ++ ret = false; ++ } ++ } ++ ++ /* Must lock btree nodes in key order: */ ++ if ((cmp_int(iter->btree_id, linked->btree_id) ?: ++ -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) ++ ret = false; ++ ++ if (iter->btree_id == linked->btree_id && ++ btree_node_locked(linked, level) && ++ bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, ++ btree_iter_type(linked))) <= 0) ++ ret = false; ++ ++ /* ++ * Recheck if this is a node we already have locked - since one ++ * of the get_locks() calls might've successfully ++ * upgraded/relocked it: ++ */ ++ if (linked->l[level].b == b && ++ btree_node_locked_type(linked, level) >= type) { ++ six_lock_increment(&b->c.lock, type); ++ return true; ++ } ++ } ++ ++ if (unlikely(!ret)) { ++ trace_trans_restart_would_deadlock(iter->trans->ip); ++ return false; ++ } ++ ++ if (six_trylock_type(&b->c.lock, type)) ++ return true; ++ ++ if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) ++ return false; ++ ++ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], ++ start_time); ++ return true; ++} ++ ++/* Btree iterator locking: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static void bch2_btree_iter_verify_locks(struct btree_iter *iter) ++{ ++ unsigned l; ++ ++ if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { ++ BUG_ON(iter->nodes_locked); ++ return; ++ } ++ ++ for (l = 0; is_btree_node(iter, l); l++) { ++ if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && ++ !btree_node_locked(iter, l)) ++ continue; ++ ++ BUG_ON(btree_lock_want(iter, l) != ++ btree_node_locked_type(iter, l)); ++ } ++} ++ ++void bch2_btree_trans_verify_locks(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter_all(trans, iter) 
++ bch2_btree_iter_verify_locks(iter); ++} ++#else ++static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} ++#endif ++ ++__flatten ++bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) ++{ ++ return btree_iter_get_locks(iter, false, trace); ++} ++ ++bool __bch2_btree_iter_upgrade(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ struct btree_iter *linked; ++ ++ EBUG_ON(iter->locks_want >= new_locks_want); ++ ++ iter->locks_want = new_locks_want; ++ ++ if (btree_iter_get_locks(iter, true, true)) ++ return true; ++ ++ /* ++ * Ancestor nodes must be locked before child nodes, so set locks_want ++ * on iterators that might lock ancestors before us to avoid getting ++ * -EINTR later: ++ */ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked != iter && ++ linked->btree_id == iter->btree_id && ++ linked->locks_want < new_locks_want) { ++ linked->locks_want = new_locks_want; ++ btree_iter_get_locks(linked, true, false); ++ } ++ ++ return false; ++} ++ ++bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ unsigned l = iter->level; ++ ++ EBUG_ON(iter->locks_want >= new_locks_want); ++ ++ iter->locks_want = new_locks_want; ++ ++ do { ++ if (!btree_iter_node(iter, l)) ++ break; ++ ++ if (!bch2_btree_node_upgrade(iter, l)) { ++ iter->locks_want = l; ++ return false; ++ } ++ ++ l++; ++ } while (l < iter->locks_want); ++ ++ return true; ++} ++ ++void __bch2_btree_iter_downgrade(struct btree_iter *iter, ++ unsigned downgrade_to) ++{ ++ unsigned l, new_locks_want = downgrade_to ?: ++ (iter->flags & BTREE_ITER_INTENT ? 1 : 0); ++ ++ if (iter->locks_want < downgrade_to) { ++ iter->locks_want = new_locks_want; ++ ++ while (iter->nodes_locked && ++ (l = __fls(iter->nodes_locked)) >= iter->locks_want) { ++ if (l > iter->level) { ++ btree_node_unlock(iter, l); ++ } else { ++ if (btree_node_intent_locked(iter, l)) { ++ six_lock_downgrade(&iter->l[l].b->c.lock); ++ iter->nodes_intent_locked ^= 1 << l; ++ } ++ break; ++ } ++ } ++ } ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++} ++ ++void bch2_trans_downgrade(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ bch2_btree_iter_downgrade(iter); ++} ++ ++/* Btree transaction locking: */ ++ ++bool bch2_trans_relock(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ bool ret = true; ++ ++ trans_for_each_iter(trans, iter) ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) ++ ret &= bch2_btree_iter_relock(iter, true); ++ ++ return ret; ++} ++ ++void bch2_trans_unlock(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ __bch2_btree_iter_unlock(iter); ++} ++ ++/* Btree iterator: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_btree_iter_verify_cached(struct btree_iter *iter) ++{ ++ struct bkey_cached *ck; ++ bool locked = btree_node_locked(iter, 0); ++ ++ if (!bch2_btree_node_relock(iter, 0)) ++ return; ++ ++ ck = (void *) iter->l[0].b; ++ BUG_ON(ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)); ++ ++ if (!locked) ++ btree_node_unlock(iter, 0); ++} ++ ++static void bch2_btree_iter_verify_level(struct btree_iter *iter, ++ unsigned level) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct btree_iter_level *l = &iter->l[level]; ++ struct btree_node_iter tmp = l->iter; ++ bool locked = btree_node_locked(iter, level); ++ struct bkey_packed *p, *k; ++ char buf1[100], buf2[100]; ++ const char *msg; ++ ++ if 
(!debug_check_iterators(iter->trans->c)) ++ return; ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ if (!level) ++ bch2_btree_iter_verify_cached(iter); ++ return; ++ } ++ ++ BUG_ON(iter->level < iter->min_depth); ++ ++ if (!btree_iter_node(iter, level)) ++ return; ++ ++ if (!bch2_btree_node_relock(iter, level)) ++ return; ++ ++ /* ++ * Ideally this invariant would always be true, and hopefully in the ++ * future it will be, but for now set_pos_same_leaf() breaks it: ++ */ ++ BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && ++ !btree_iter_pos_in_node(iter, l->b)); ++ ++ /* ++ * node iterators don't use leaf node iterator: ++ */ ++ if (btree_iter_type(iter) == BTREE_ITER_NODES && ++ level <= iter->min_depth) ++ goto unlock; ++ ++ bch2_btree_node_iter_verify(&l->iter, l->b); ++ ++ /* ++ * For interior nodes, the iterator will have skipped past ++ * deleted keys: ++ * ++ * For extents, the iterator may have skipped past deleted keys (but not ++ * whiteouts) ++ */ ++ p = level || btree_node_type_is_extents(iter->btree_id) ++ ? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) ++ : bch2_btree_node_iter_prev_all(&tmp, l->b); ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ ++ if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { ++ msg = "before"; ++ goto err; ++ } ++ ++ if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { ++ msg = "after"; ++ goto err; ++ } ++unlock: ++ if (!locked) ++ btree_node_unlock(iter, level); ++ return; ++err: ++ strcpy(buf1, "(none)"); ++ strcpy(buf2, "(none)"); ++ ++ if (p) { ++ struct bkey uk = bkey_unpack_key(l->b, p); ++ bch2_bkey_to_text(&PBUF(buf1), &uk); ++ } ++ ++ if (k) { ++ struct bkey uk = bkey_unpack_key(l->b, k); ++ bch2_bkey_to_text(&PBUF(buf2), &uk); ++ } ++ ++ panic("iterator should be %s key at level %u:\n" ++ "iter pos %s %llu:%llu\n" ++ "prev key %s\n" ++ "cur key %s\n", ++ msg, level, ++ iter->flags & BTREE_ITER_IS_EXTENTS ? 
">" : "=>", ++ iter->pos.inode, iter->pos.offset, ++ buf1, buf2); ++} ++ ++static void bch2_btree_iter_verify(struct btree_iter *iter) ++{ ++ unsigned i; ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ bch2_btree_iter_verify_level(iter, i); ++} ++ ++void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) ++{ ++ struct btree_iter *iter; ++ ++ if (!debug_check_iterators(trans->c)) ++ return; ++ ++ trans_for_each_iter_with_node(trans, b, iter) ++ bch2_btree_iter_verify_level(iter, b->c.level); ++} ++ ++#else ++ ++static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} ++static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} ++ ++#endif ++ ++static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) { ++ set->k = __btree_node_key_to_offset(b, k); ++ bch2_btree_node_iter_sort(iter, b); ++ return; ++ } ++ ++ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); ++} ++ ++static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_iter_level *l = &iter->l[b->c.level]; ++ struct bpos pos = btree_iter_search_key(iter); ++ ++ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) ++ return; ++ ++ if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) { ++ __bch2_btree_iter_fix_key_modified(linked, b, where); ++ bch2_btree_iter_verify_level(linked, b->c.level); ++ } ++} ++ ++static void __bch2_btree_node_iter_fix(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bset_tree *t, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ const struct bkey_packed *end = btree_bkey_last(b, t); ++ struct btree_node_iter_set *set; ++ unsigned offset = __btree_node_key_to_offset(b, where); ++ int shift = new_u64s - clobber_u64s; ++ unsigned old_end = t->end_offset - shift; ++ unsigned orig_iter_pos = node_iter->data[0].k; ++ bool iter_current_key_modified = ++ orig_iter_pos >= offset && ++ orig_iter_pos <= offset + clobber_u64s; ++ struct bpos iter_pos = btree_iter_search_key(iter); ++ ++ btree_node_iter_for_each(node_iter, set) ++ if (set->end == old_end) ++ goto found; ++ ++ /* didn't find the bset in the iterator - might have to readd it: */ ++ if (new_u64s && ++ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { ++ bch2_btree_node_iter_push(node_iter, b, where, end); ++ goto fixup_done; ++ } else { ++ /* Iterator is after key that changed */ ++ return; ++ } ++found: ++ set->end = t->end_offset; ++ ++ /* Iterator hasn't gotten to the key that changed yet: */ ++ if (set->k < offset) ++ return; ++ ++ if (new_u64s && ++ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { ++ set->k = offset; ++ } else if (set->k < offset + clobber_u64s) { ++ set->k = offset + new_u64s; ++ if (set->k == set->end) ++ bch2_btree_node_iter_set_drop(node_iter, set); ++ } else { ++ /* Iterator is after key that changed */ ++ set->k = (int) set->k + shift; ++ return; 
++ } ++ ++ bch2_btree_node_iter_sort(node_iter, b); ++fixup_done: ++ if (node_iter->data[0].k != orig_iter_pos) ++ iter_current_key_modified = true; ++ ++ /* ++ * When a new key is added, and the node iterator now points to that ++ * key, the iterator might have skipped past deleted keys that should ++ * come after the key the iterator now points to. We have to rewind to ++ * before those deleted keys - otherwise ++ * bch2_btree_node_iter_prev_all() breaks: ++ */ ++ if (!bch2_btree_node_iter_end(node_iter) && ++ iter_current_key_modified && ++ (b->c.level || ++ btree_node_type_is_extents(iter->btree_id))) { ++ struct bset_tree *t; ++ struct bkey_packed *k, *k2, *p; ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ ++ for_each_bset(b, t) { ++ bool set_pos = false; ++ ++ if (node_iter->data[0].end == t->end_offset) ++ continue; ++ ++ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); ++ ++ while ((p = bch2_bkey_prev_all(b, t, k2)) && ++ bkey_iter_cmp(b, k, p) < 0) { ++ k2 = p; ++ set_pos = true; ++ } ++ ++ if (set_pos) ++ btree_node_iter_set_set_pos(node_iter, ++ b, t, k2); ++ } ++ } ++ ++ if (!b->c.level && ++ node_iter == &iter->l[0].iter && ++ iter_current_key_modified) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++void bch2_btree_node_iter_fix(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct btree_iter *linked; ++ ++ if (node_iter != &iter->l[b->c.level].iter) { ++ __bch2_btree_node_iter_fix(iter, b, node_iter, t, ++ where, clobber_u64s, new_u64s); ++ ++ if (debug_check_iterators(iter->trans->c)) ++ bch2_btree_node_iter_verify(node_iter, b); ++ } ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) { ++ __bch2_btree_node_iter_fix(linked, b, ++ &linked->l[b->c.level].iter, t, ++ where, clobber_u64s, new_u64s); ++ bch2_btree_iter_verify_level(linked, b->c.level); ++ } ++} ++ ++static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ struct bkey *u, ++ struct bkey_packed *k) ++{ ++ struct bkey_s_c ret; ++ ++ if (unlikely(!k)) { ++ /* ++ * signal to bch2_btree_iter_peek_slot() that we're currently at ++ * a hole ++ */ ++ u->type = KEY_TYPE_deleted; ++ return bkey_s_c_null; ++ } ++ ++ ret = bkey_disassemble(l->b, k, u); ++ ++ if (debug_check_bkeys(iter->trans->c)) ++ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); ++ ++ return ret; ++} ++ ++/* peek_all() doesn't skip deleted keys */ ++static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ struct bkey *u) ++{ ++ return __btree_iter_unpack(iter, l, u, ++ bch2_btree_node_iter_peek_all(&l->iter, l->b)); ++} ++ ++static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, ++ struct btree_iter_level *l) ++{ ++ return __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++} ++ ++static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, ++ struct btree_iter_level *l) ++{ ++ return __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_prev(&l->iter, l->b)); ++} ++ ++static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ int max_advance) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct bkey_packed *k; ++ int nr_advanced = 0; ++ ++ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && ++ 
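
The offset arithmetic in __bch2_btree_node_iter_fix() above is easier to see on a plain array: when a splice at offset off replaces clobber elements with new_n elements, a saved cursor either stays put (it was before the splice), is moved just past the new data (it pointed into the clobbered range), or is shifted by the net size change (it was after the splice). A rough standalone sketch under those assumptions (fix_cursor is an illustrative name; it leaves out the case where the cursor is re-pointed at the newly inserted key):

	#include <stddef.h>

	/*
	 * A splice at @off replaced @clobber elements with @new_n elements.
	 * Return the adjusted value of a saved cursor position @k.
	 */
	static size_t fix_cursor(size_t k, size_t off, size_t clobber, size_t new_n)
	{
		if (k < off)
			return k;		/* before the splice: untouched */
		if (k < off + clobber)
			return off + new_n;	/* inside it: skip past the new data */
		return k + new_n - clobber;	/* after it: shift by the size delta */
	}
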
bkey_iter_pos_cmp(l->b, k, &pos) < 0) { ++ if (max_advance > 0 && nr_advanced >= max_advance) ++ return false; ++ ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ nr_advanced++; ++ } ++ ++ return true; ++} ++ ++/* ++ * Verify that iterator for parent node points to child node: ++ */ ++static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter_level *l; ++ unsigned plevel; ++ bool parent_locked; ++ struct bkey_packed *k; ++ ++ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ return; ++ ++ plevel = b->c.level + 1; ++ if (!btree_iter_node(iter, plevel)) ++ return; ++ ++ parent_locked = btree_node_locked(iter, plevel); ++ ++ if (!bch2_btree_node_relock(iter, plevel)) ++ return; ++ ++ l = &iter->l[plevel]; ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ if (!k || ++ bkey_deleted(k) || ++ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { ++ char buf[100]; ++ struct bkey uk = bkey_unpack_key(b, k); ++ ++ bch2_bkey_to_text(&PBUF(buf), &uk); ++ panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", ++ buf, b->key.k.p.inode, b->key.k.p.offset); ++ } ++ ++ if (!parent_locked) ++ btree_node_unlock(iter, b->c.level + 1); ++} ++ ++static inline void __btree_iter_init(struct btree_iter *iter, ++ unsigned level) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct btree_iter_level *l = &iter->l[level]; ++ ++ bch2_btree_node_iter_init(&l->iter, l->b, &pos); ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++static inline void btree_iter_node_set(struct btree_iter *iter, ++ struct btree *b) ++{ ++ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); ++ ++ btree_iter_verify_new_node(iter, b); ++ ++ EBUG_ON(!btree_iter_pos_in_node(iter, b)); ++ EBUG_ON(b->c.lock.state.seq & 1); ++ ++ iter->l[b->c.level].lock_seq = b->c.lock.state.seq; ++ iter->l[b->c.level].b = b; ++ __btree_iter_init(iter, b->c.level); ++} ++ ++/* ++ * A btree node is being replaced - update the iterator to point to the new ++ * node: ++ */ ++void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) ++{ ++ enum btree_node_locked_type t; ++ struct btree_iter *linked; ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (btree_iter_type(linked) != BTREE_ITER_CACHED && ++ btree_iter_pos_in_node(linked, b)) { ++ /* ++ * bch2_btree_iter_node_drop() has already been called - ++ * the old node we're replacing has already been ++ * unlocked and the pointer invalidated ++ */ ++ BUG_ON(btree_node_locked(linked, b->c.level)); ++ ++ t = btree_lock_want(linked, b->c.level); ++ if (t != BTREE_NODE_UNLOCKED) { ++ six_lock_increment(&b->c.lock, t); ++ mark_btree_node_locked(linked, b->c.level, t); ++ } ++ ++ btree_iter_node_set(linked, b); ++ } ++} ++ ++void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter *linked; ++ unsigned level = b->c.level; ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked->l[level].b == b) { ++ __btree_node_unlock(linked, level); ++ linked->l[level].b = BTREE_ITER_NO_NODE_DROP; ++ } ++} ++ ++/* ++ * A btree node has been modified in such a way as to invalidate iterators - fix ++ * them: ++ */ ++void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ __btree_iter_init(linked, b->c.level); ++} ++ ++static int lock_root_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, c.lock); ++ struct btree **rootp = p; ++ ++ return b 
== *rootp ? 0 : -1; ++} ++ ++static inline int btree_iter_lock_root(struct btree_iter *iter, ++ unsigned depth_want) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; ++ enum six_lock_type lock_type; ++ unsigned i; ++ ++ EBUG_ON(iter->nodes_locked); ++ ++ while (1) { ++ b = READ_ONCE(*rootp); ++ iter->level = READ_ONCE(b->c.level); ++ ++ if (unlikely(iter->level < depth_want)) { ++ /* ++ * the root is at a lower depth than the depth we want: ++ * got to the end of the btree, or we're walking nodes ++ * greater than some depth and there are no nodes >= ++ * that depth ++ */ ++ iter->level = depth_want; ++ for (i = iter->level; i < BTREE_MAX_DEPTH; i++) ++ iter->l[i].b = NULL; ++ return 1; ++ } ++ ++ lock_type = __btree_lock_want(iter, iter->level); ++ if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, ++ iter, lock_type, ++ lock_root_check_fn, rootp))) ++ return -EINTR; ++ ++ if (likely(b == READ_ONCE(*rootp) && ++ b->c.level == iter->level && ++ !race_fault())) { ++ for (i = 0; i < iter->level; i++) ++ iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; ++ iter->l[iter->level].b = b; ++ for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) ++ iter->l[i].b = NULL; ++ ++ mark_btree_node_locked(iter, iter->level, lock_type); ++ btree_iter_node_set(iter, b); ++ return 0; ++ } ++ ++ six_unlock_type(&b->c.lock, lock_type); ++ } ++} ++ ++noinline ++static void btree_iter_prefetch(struct btree_iter *iter) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree_iter_level *l = &iter->l[iter->level]; ++ struct btree_node_iter node_iter = l->iter; ++ struct bkey_packed *k; ++ BKEY_PADDED(k) tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? (iter->level > 1 ? 0 : 2) ++ : (iter->level > 1 ? 1 : 16); ++ bool was_locked = btree_node_locked(iter, iter->level); ++ ++ while (nr) { ++ if (!bch2_btree_node_relock(iter, iter->level)) ++ return; ++ ++ bch2_btree_node_iter_advance(&node_iter, l->b); ++ k = bch2_btree_node_iter_peek(&node_iter, l->b); ++ if (!k) ++ break; ++ ++ bch2_bkey_unpack(l->b, &tmp.k, k); ++ bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(iter, iter->level); ++} ++ ++static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, ++ unsigned plevel, struct btree *b) ++{ ++ struct btree_iter_level *l = &iter->l[plevel]; ++ bool locked = btree_node_locked(iter, plevel); ++ struct bkey_packed *k; ++ struct bch_btree_ptr_v2 *bp; ++ ++ if (!bch2_btree_node_relock(iter, plevel)) ++ return; ++ ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); ++ ++ bp = (void *) bkeyp_val(&l->b->format, k); ++ bp->mem_ptr = (unsigned long)b; ++ ++ if (!locked) ++ btree_node_unlock(iter, plevel); ++} ++ ++static __always_inline int btree_iter_down(struct btree_iter *iter) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree_iter_level *l = &iter->l[iter->level]; ++ struct btree *b; ++ unsigned level = iter->level - 1; ++ enum six_lock_type lock_type = __btree_lock_want(iter, level); ++ BKEY_PADDED(k) tmp; ++ ++ EBUG_ON(!btree_node_locked(iter, iter->level)); ++ ++ bch2_bkey_unpack(l->b, &tmp.k, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); ++ if (unlikely(IS_ERR(b))) ++ return PTR_ERR(b); ++ ++ mark_btree_node_locked(iter, level, lock_type); ++ btree_iter_node_set(iter, b); ++ ++ if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && ++ unlikely(b != 
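
btree_iter_lock_root() above follows a common optimistic pattern: sample the root pointer without holding any lock, take the lock on the node you saw, then re-check that it is still the root (and at the expected level) before trusting it. Stripped of the six-lock and race_fault() details, the shape is roughly as follows (toy names, a pthread mutex standing in for the node lock, and current_root assumed to be initialised elsewhere):

	#include <stdatomic.h>
	#include <pthread.h>

	struct toy_root {
		pthread_mutex_t lock;
	};

	/* Published root pointer, updated by whoever replaces the root. */
	static _Atomic(struct toy_root *) current_root;

	/*
	 * Lock whatever the root pointer says, then confirm the pointer
	 * still refers to the node we locked; if the root was replaced
	 * while we slept on the lock, drop it and try again.
	 */
	static struct toy_root *lock_root(void)
	{
		for (;;) {
			struct toy_root *r = atomic_load(&current_root);

			pthread_mutex_lock(&r->lock);
			if (r == atomic_load(&current_root))
				return r;	/* still the root: caller owns it */
			pthread_mutex_unlock(&r->lock);	/* stale: retry */
		}
	}

The retry loop only spins when the root actually changed underneath us, so the common case costs one lock acquisition plus one pointer re-read.
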
btree_node_mem_ptr(&tmp.k))) ++ btree_node_mem_ptr_set(iter, level + 1, b); ++ ++ if (iter->flags & BTREE_ITER_PREFETCH) ++ btree_iter_prefetch(iter); ++ ++ iter->level = level; ++ ++ return 0; ++} ++ ++static void btree_iter_up(struct btree_iter *iter) ++{ ++ btree_node_unlock(iter, iter->level++); ++} ++ ++static int btree_iter_traverse_one(struct btree_iter *); ++ ++static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ u8 sorted[BTREE_ITER_MAX]; ++ unsigned i, nr_sorted = 0; ++ ++ if (trans->in_traverse_all) ++ return -EINTR; ++ ++ trans->in_traverse_all = true; ++retry_all: ++ nr_sorted = 0; ++ ++ trans_for_each_iter(trans, iter) ++ sorted[nr_sorted++] = iter->idx; ++ ++#define btree_iter_cmp_by_idx(_l, _r) \ ++ btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) ++ ++ bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); ++#undef btree_iter_cmp_by_idx ++ bch2_trans_unlock(trans); ++ ++ if (unlikely(ret == -ENOMEM)) { ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ } ++ ++ if (unlikely(ret == -EIO)) { ++ trans->error = true; ++ goto out; ++ } ++ ++ BUG_ON(ret && ret != -EINTR); ++ ++ /* Now, redo traversals in correct order: */ ++ for (i = 0; i < nr_sorted; i++) { ++ unsigned idx = sorted[i]; ++ ++ /* ++ * sucessfully traversing one iterator can cause another to be ++ * unlinked, in btree_key_cache_fill() ++ */ ++ if (!(trans->iters_linked & (1ULL << idx))) ++ continue; ++ ++ ret = btree_iter_traverse_one(&trans->iters[idx]); ++ if (ret) ++ goto retry_all; ++ } ++ ++ if (hweight64(trans->iters_live) > 1) ++ ret = -EINTR; ++ else ++ trans_for_each_iter(trans, iter) ++ if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { ++ ret = -EINTR; ++ break; ++ } ++out: ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ trans->in_traverse_all = false; ++ return ret; ++} ++ ++int bch2_btree_iter_traverse_all(struct btree_trans *trans) ++{ ++ return __btree_iter_traverse_all(trans, 0); ++} ++ ++static inline bool btree_iter_good_node(struct btree_iter *iter, ++ unsigned l, int check_pos) ++{ ++ if (!is_btree_node(iter, l) || ++ !bch2_btree_node_relock(iter, l)) ++ return false; ++ ++ if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) ++ return false; ++ if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) ++ return false; ++ return true; ++} ++ ++static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, ++ int check_pos) ++{ ++ unsigned l = iter->level; ++ ++ while (btree_iter_node(iter, l) && ++ !btree_iter_good_node(iter, l, check_pos)) { ++ btree_node_unlock(iter, l); ++ iter->l[l].b = BTREE_ITER_NO_NODE_UP; ++ l++; ++ } ++ ++ return l; ++} ++ ++/* ++ * This is the main state machine for walking down the btree - walks down to a ++ * specified depth ++ * ++ * Returns 0 on success, -EIO on error (error reading in a btree node). ++ * ++ * On error, caller (peek_node()/peek_key()) must return NULL; the error is ++ * stashed in the iterator and returned from bch2_trans_exit(). 
++ */ ++static int btree_iter_traverse_one(struct btree_iter *iter) ++{ ++ unsigned depth_want = iter->level; ++ ++ /* ++ * if we need interior nodes locked, call btree_iter_relock() to make ++ * sure we walk back up enough that we lock them: ++ */ ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK || ++ iter->locks_want > 1) ++ bch2_btree_iter_relock(iter, false); ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) ++ return bch2_btree_iter_traverse_cached(iter); ++ ++ if (iter->uptodate < BTREE_ITER_NEED_RELOCK) ++ return 0; ++ ++ if (unlikely(iter->level >= BTREE_MAX_DEPTH)) ++ return 0; ++ ++ /* ++ * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos ++ * here unnecessary ++ */ ++ iter->level = btree_iter_up_until_good_node(iter, 0); ++ ++ /* ++ * If we've got a btree node locked (i.e. we aren't about to relock the ++ * root) - advance its node iterator if necessary: ++ * ++ * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary ++ */ ++ if (is_btree_node(iter, iter->level)) { ++ BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); ++ ++ btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); ++ } ++ ++ /* ++ * Note: iter->nodes[iter->level] may be temporarily NULL here - that ++ * would indicate to other code that we got to the end of the btree, ++ * here it indicates that relocking the root failed - it's critical that ++ * btree_iter_lock_root() comes next and that it can't fail ++ */ ++ while (iter->level > depth_want) { ++ int ret = btree_iter_node(iter, iter->level) ++ ? btree_iter_down(iter) ++ : btree_iter_lock_root(iter, depth_want); ++ if (unlikely(ret)) { ++ if (ret == 1) ++ return 0; ++ ++ iter->level = depth_want; ++ ++ if (ret == -EIO) { ++ iter->flags |= BTREE_ITER_ERROR; ++ iter->l[iter->level].b = ++ BTREE_ITER_NO_NODE_ERROR; ++ } else { ++ iter->l[iter->level].b = ++ BTREE_ITER_NO_NODE_DOWN; ++ } ++ return ret; ++ } ++ } ++ ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ ++ bch2_btree_iter_verify(iter); ++ return 0; ++} ++ ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ int ret; ++ ++ ret = bch2_trans_cond_resched(trans) ?: ++ btree_iter_traverse_one(iter); ++ if (unlikely(ret)) ++ ret = __btree_iter_traverse_all(trans, ret); ++ ++ return ret; ++} ++ ++static inline void bch2_btree_iter_checks(struct btree_iter *iter) ++{ ++ enum btree_iter_type type = btree_iter_type(iter); ++ ++ EBUG_ON(iter->btree_id >= BTREE_ID_NR); ++ ++ BUG_ON((type == BTREE_ITER_KEYS || ++ type == BTREE_ITER_CACHED) && ++ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || ++ bkey_cmp(iter->pos, iter->k.p) > 0)); ++ ++ bch2_btree_iter_verify_locks(iter); ++ bch2_btree_iter_verify_level(iter, iter->level); ++} ++ ++/* Iterate across nodes (leaf and interior nodes) */ ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) ++{ ++ struct btree *b; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return iter->l[iter->level].b; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ b = btree_iter_node(iter, iter->level); ++ if (!b) ++ return NULL; ++ ++ BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); ++ ++ iter->pos = b->key.k.p; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify(iter); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) ++{ ++ struct btree *b; ++ int ret; ++ ++ 
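
When a would-be deadlock forces a restart, __btree_iter_traverse_all() above re-walks every iterator, but only after sorting them so locks are reacquired in the same global order the deadlock check expects. The sorting step in isolation, on plain data and ordered by (btree id, position) only, ignoring the iterator-type tiebreak for brevity (toy_iter and sort_by_lock_order are illustrative names):

	#include <stddef.h>

	struct toy_iter {
		int			btree_id;
		unsigned long long	pos;
	};

	/* Sort an array of indices so iterators are visited in lock order. */
	static void sort_by_lock_order(unsigned char *idx, size_t nr,
				       const struct toy_iter *iters)
	{
		for (size_t i = 1; i < nr; i++) {
			unsigned char v = idx[i];
			size_t j = i;

			while (j > 0 &&
			       (iters[idx[j - 1]].btree_id > iters[v].btree_id ||
				(iters[idx[j - 1]].btree_id == iters[v].btree_id &&
				 iters[idx[j - 1]].pos > iters[v].pos))) {
				idx[j] = idx[j - 1];
				j--;
			}
			idx[j] = v;
		}
	}
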
EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); ++ bch2_btree_iter_checks(iter); ++ ++ /* already got to end? */ ++ if (!btree_iter_node(iter, iter->level)) ++ return NULL; ++ ++ bch2_trans_cond_resched(iter->trans); ++ ++ btree_iter_up(iter); ++ ++ if (!bch2_btree_node_relock(iter, iter->level)) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ /* got to end? */ ++ b = btree_iter_node(iter, iter->level); ++ if (!b) ++ return NULL; ++ ++ if (bkey_cmp(iter->pos, b->key.k.p) < 0) { ++ /* ++ * Haven't gotten to the end of the parent node: go back down to ++ * the next child node ++ */ ++ ++ /* ++ * We don't really want to be unlocking here except we can't ++ * directly tell btree_iter_traverse() "traverse to this level" ++ * except by setting iter->level, so we have to unlock so we ++ * don't screw up our lock invariants: ++ */ ++ if (btree_node_read_locked(iter, iter->level)) ++ btree_node_unlock(iter, iter->level); ++ ++ iter->pos = bkey_successor(iter->pos); ++ iter->level = iter->min_depth; ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ b = iter->l[iter->level].b; ++ } ++ ++ iter->pos = b->key.k.p; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify(iter); ++ ++ return b; ++} ++ ++/* Iterate across keys (in leaf nodes only) */ ++ ++void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ ++ EBUG_ON(iter->level != 0); ++ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); ++ EBUG_ON(!btree_node_locked(iter, 0)); ++ EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++ ++ btree_iter_advance_to_pos(iter, l, -1); ++ ++ /* ++ * XXX: ++ * keeping a node locked that's outside (even just outside) iter->pos ++ * breaks __bch2_btree_node_lock(). This seems to only affect ++ * bch2_btree_node_get_sibling so for now it's fixed there, but we ++ * should try to get rid of this corner case. ++ * ++ * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) ++ */ ++ ++ if (bch2_btree_node_iter_end(&l->iter) && ++ btree_iter_pos_after_node(iter, l->b)) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++} ++ ++static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) ++{ ++ unsigned l = iter->level; ++ ++ if (!cmp) ++ goto out; ++ ++ if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { ++ btree_node_unlock(iter, 0); ++ iter->l[0].b = BTREE_ITER_NO_NODE_UP; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ return; ++ } ++ ++ l = btree_iter_up_until_good_node(iter, cmp); ++ ++ if (btree_iter_node(iter, l)) { ++ /* ++ * We might have to skip over many keys, or just a few: try ++ * advancing the node iterator, and if we have to skip over too ++ * many keys just reinit it (or if we're rewinding, since that ++ * is expensive). 
++ */ ++ if (cmp < 0 || ++ !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) ++ __btree_iter_init(iter, l); ++ ++ /* Don't leave it locked if we're not supposed to: */ ++ if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, l); ++ } ++out: ++ if (l != iter->level) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ else ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, ++ bool strictly_greater) ++{ ++ struct bpos old = btree_iter_search_key(iter); ++ int cmp; ++ ++ iter->flags &= ~BTREE_ITER_IS_EXTENTS; ++ iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0; ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; ++ ++ cmp = bkey_cmp(btree_iter_search_key(iter), old); ++ ++ btree_iter_pos_changed(iter, cmp); ++} ++ ++void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ int cmp = bkey_cmp(new_pos, iter->pos); ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; ++ ++ btree_iter_pos_changed(iter, cmp); ++} ++ ++static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ bool ret; ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = l->b->key.k.p; ++ ++ ret = bkey_cmp(iter->pos, POS_MAX) != 0; ++ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ iter->k.p = iter->pos = bkey_successor(iter->pos); ++ ++ btree_iter_pos_changed(iter, 1); ++ return ret; ++} ++ ++static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ bool ret; ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = l->b->data->min_key; ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ ++ ret = bkey_cmp(iter->pos, POS_MIN) != 0; ++ if (ret) { ++ iter->k.p = iter->pos = bkey_predecessor(iter->pos); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ iter->k.p = iter->pos = bkey_predecessor(iter->pos); ++ } ++ ++ btree_iter_pos_changed(iter, -1); ++ return ret; ++} ++ ++/** ++ * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key ++ * it currently points to ++ */ ++static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c ret = { .k = &iter->k }; ++ ++ if (!bkey_deleted(&iter->k)) { ++ struct bkey_packed *_k = ++ __bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ ++ ret.v = bkeyp_val(&l->b->format, _k); ++ ++ if (debug_check_iterators(iter->trans->c)) { ++ struct bkey k = bkey_unpack_key(l->b, _k); ++ ++ BUG_ON(memcmp(&k, &iter->k, sizeof(k))); ++ } ++ ++ if (debug_check_bkeys(iter->trans->c)) ++ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); ++ } ++ ++ return ret; ++} ++ ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE && ++ !bkey_deleted(&iter->k)) ++ return btree_iter_peek_uptodate(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __btree_iter_peek(iter, l); ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_next_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ /* ++ * 
iter->pos should always be equal to the key we just ++ * returned - except extents can straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || ++ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter->pos = bkey_start_pos(k.k); ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify_level(iter, 0); ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_next: returns first key greater than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); ++ ++ return bch2_btree_iter_peek(iter); ++} ++ ++static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct btree_trans *trans = iter->trans; ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update2(trans, i) ++ if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: ++ bkey_cmp(pos, i->k->k.p)) <= 0) ++ break; ++ ++ return i < trans->updates2 + trans->nr_updates2 && ++ iter->btree_id == i->iter->btree_id ++ ? bkey_i_to_s_c(i->k) ++ : bkey_s_c_null; ++} ++ ++static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k = __btree_iter_peek(iter, l); ++ struct bkey_s_c u = __btree_trans_updates_peek(iter); ++ ++ if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) ++ return k; ++ if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { ++ iter->k = *u.k; ++ return u; ++ } ++ return bkey_s_c_null; ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __bch2_btree_iter_peek_with_updates(iter); ++ ++ if (k.k && bkey_deleted(k.k)) { ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); ++ continue; ++ } ++ ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_next_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ /* ++ * iter->pos should always be equal to the key we just ++ * returned - except extents can straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || ++ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter->pos = bkey_start_pos(k.k); ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ return k; ++} ++ ++struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? 
iter->k.p ++ : bkey_successor(iter->k.p)); ++ ++ return bch2_btree_iter_peek_with_updates(iter); ++} ++ ++/** ++ * bch2_btree_iter_peek_prev: returns first key less than or equal to ++ * iterator's current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) ++{ ++ struct bpos pos = iter->pos; ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE && ++ !bkey_deleted(&iter->k)) ++ return btree_iter_peek_uptodate(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __btree_iter_peek(iter, l); ++ if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) ++ k = __btree_iter_prev(iter, l); ++ ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_prev_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); ++ iter->pos = bkey_start_pos(k.k); ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_prev: returns first key less than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) ++{ ++ struct bpos pos = bkey_start_pos(&iter->k); ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (unlikely(!bkey_cmp(pos, POS_MIN))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); ++ ++ return bch2_btree_iter_peek_prev(iter); ++} ++ ++static inline struct bkey_s_c ++__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct btree_node_iter node_iter; ++ struct bkey_s_c k; ++ struct bkey n; ++ int ret; ++ ++ /* keys & holes can't span inode numbers: */ ++ if (iter->pos.offset == KEY_OFFSET_MAX) { ++ if (iter->pos.inode == KEY_INODE_MAX) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ } ++ ++ /* ++ * iterator is now at the correct position for inserting at iter->pos, ++ * but we need to keep iterating until we find the first non whiteout so ++ * we know how big a hole we have, if any: ++ */ ++ ++ node_iter = l->iter; ++ k = __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_peek(&node_iter, l->b)); ++ ++ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { ++ /* ++ * We're not setting iter->uptodate because the node iterator ++ * doesn't necessarily point at the key we're returning: ++ */ ++ ++ EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); ++ bch2_btree_iter_verify_level(iter, 0); ++ return k; ++ } ++ ++ /* hole */ ++ ++ if (!k.k) ++ k.k = &l->b->key.k; ++ ++ bkey_init(&n); ++ n.p = iter->pos; ++ bch2_key_resize(&n, ++ min_t(u64, KEY_SIZE_MAX, ++ (k.k->p.inode == n.p.inode ++ ? 
bkey_start_offset(k.k) ++ : KEY_OFFSET_MAX) - ++ n.p.offset)); ++ ++ EBUG_ON(!n.size); ++ ++ iter->k = n; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify_level(iter, 0); ++ return (struct bkey_s_c) { &iter->k, NULL }; ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return btree_iter_peek_uptodate(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return __bch2_btree_iter_peek_slot_extents(iter); ++ ++ k = __btree_iter_peek_all(iter, l, &iter->k); ++ ++ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); ++ ++ if (!k.k || bkey_cmp(iter->pos, k.k->p)) { ++ /* hole */ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos; ++ k = (struct bkey_s_c) { &iter->k, NULL }; ++ } ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ bch2_btree_iter_verify_level(iter, 0); ++ return k; ++} ++ ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) ++{ ++ struct bkey_cached *ck; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); ++ bch2_btree_iter_checks(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ ck = (void *) iter->l[0].b; ++ ++ EBUG_ON(iter->btree_id != ck->key.btree_id || ++ bkey_cmp(iter->pos, ck->key.pos)); ++ BUG_ON(!ck->valid); ++ ++ return bkey_i_to_s_c(ck->k); ++} ++ ++static inline void bch2_btree_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned i; ++ ++ if (btree_node_type_is_extents(btree_id) && ++ !(flags & BTREE_ITER_NODES)) ++ flags |= BTREE_ITER_IS_EXTENTS; ++ ++ iter->trans = trans; ++ iter->pos = pos; ++ bkey_init(&iter->k); ++ iter->k.p = pos; ++ iter->flags = flags; ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ iter->btree_id = btree_id; ++ iter->level = 0; ++ iter->min_depth = 0; ++ iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; ++ iter->nodes_locked = 0; ++ iter->nodes_intent_locked = 0; ++ for (i = 0; i < ARRAY_SIZE(iter->l); i++) ++ iter->l[i].b = BTREE_ITER_NO_NODE_INIT; ++ ++ prefetch(c->btree_roots[btree_id].b); ++} ++ ++/* new transactional stuff: */ ++ ++static inline void __bch2_trans_iter_free(struct btree_trans *trans, ++ unsigned idx) ++{ ++ __bch2_btree_iter_unlock(&trans->iters[idx]); ++ trans->iters_linked &= ~(1ULL << idx); ++ trans->iters_live &= ~(1ULL << idx); ++ trans->iters_touched &= ~(1ULL << idx); ++} ++ ++int bch2_trans_iter_put(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ int ret; ++ ++ if (IS_ERR_OR_NULL(iter)) ++ return 0; ++ ++ BUG_ON(trans->iters + iter->idx != iter); ++ ++ ret = btree_iter_err(iter); ++ ++ if (!(trans->iters_touched & (1ULL << iter->idx)) && ++ !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) ++ __bch2_trans_iter_free(trans, iter->idx); ++ ++ trans->iters_live &= ~(1ULL << iter->idx); ++ return ret; ++} ++ ++int bch2_trans_iter_free(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ if (IS_ERR_OR_NULL(iter)) ++ return 0; ++ ++ trans->iters_touched &= ~(1ULL << iter->idx); ++ ++ return bch2_trans_iter_put(trans, iter); ++} ++ ++static int bch2_trans_realloc_iters(struct btree_trans *trans, ++ unsigned new_size) ++{ ++ void *p, *new_iters, *new_updates, *new_updates2; ++ size_t iters_bytes; ++ size_t updates_bytes; ++ ++ new_size = roundup_pow_of_two(new_size); ++ ++ BUG_ON(new_size > BTREE_ITER_MAX); ++ ++ if (new_size <= trans->size) ++ return 0; ++ ++ BUG_ON(trans->used_mempool); ++ ++ bch2_trans_unlock(trans); ++ ++ iters_bytes = sizeof(struct btree_iter) * new_size; ++ updates_bytes = sizeof(struct btree_insert_entry) * new_size; ++ ++ p = kmalloc(iters_bytes + ++ updates_bytes + ++ updates_bytes, GFP_NOFS); ++ if (p) ++ goto success; ++ ++ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); ++ new_size = BTREE_ITER_MAX; ++ ++ trans->used_mempool = true; ++success: ++ new_iters = p; p += iters_bytes; ++ new_updates = p; p += updates_bytes; ++ new_updates2 = p; p += updates_bytes; ++ ++ memcpy(new_iters, trans->iters, ++ sizeof(struct btree_iter) * trans->nr_iters); ++ memcpy(new_updates, trans->updates, ++ sizeof(struct btree_insert_entry) * trans->nr_updates); ++ memcpy(new_updates2, trans->updates2, ++ sizeof(struct btree_insert_entry) * trans->nr_updates2); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ memset(trans->iters, POISON_FREE, ++ sizeof(struct btree_iter) * trans->nr_iters + ++ sizeof(struct btree_insert_entry) * trans->nr_iters); ++ ++ if (trans->iters != trans->iters_onstack) ++ kfree(trans->iters); ++ ++ trans->iters = new_iters; ++ trans->updates = new_updates; ++ trans->updates2 = new_updates2; ++ trans->size = new_size; ++ ++ if (trans->iters_live) { ++ trace_trans_restart_iters_realloced(trans->ip, trans->size); ++ return -EINTR; ++ } ++ ++ return 0; ++} ++ ++static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) ++{ ++ unsigned idx = __ffs64(~trans->iters_linked); ++ ++ if (idx < trans->nr_iters) ++ goto got_slot; ++ ++ if (trans->nr_iters == trans->size) { ++ int ret; ++ ++ if (trans->nr_iters >= BTREE_ITER_MAX) { ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) { ++ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", ++ bch2_btree_ids[iter->btree_id], ++ iter->pos.inode, ++ iter->pos.offset, ++ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", ++ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", ++ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
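
btree_trans_iter_alloc() and __bch2_trans_iter_free() above manage iterator slots with nothing more than 64-bit bitmasks: a set bit in iters_linked means the slot is in use, and __ffs64 on the complement finds the first free one. The same mechanism in miniature, using the GCC/Clang builtin as a stand-in for the kernel's __ffs64 (toy_trans, slot_alloc and slot_free are illustrative names):

	#include <stdbool.h>
	#include <stdint.h>

	struct toy_trans {
		uint64_t slots_linked;	/* bit i set: slot i is in use */
	};

	/* Returns a free slot index, or -1 if all 64 are taken. */
	static int slot_alloc(struct toy_trans *t)
	{
		uint64_t free_slots = ~t->slots_linked;

		if (!free_slots)
			return -1;

		int idx = __builtin_ctzll(free_slots);	/* lowest clear bit */
		t->slots_linked |= UINT64_C(1) << idx;
		return idx;
	}

	static void slot_free(struct toy_trans *t, int idx)
	{
		t->slots_linked &= ~(UINT64_C(1) << idx);
	}
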
" keep" : "", ++ (void *) iter->ip_allocated); ++ } ++ ++ panic("trans iter oveflow\n"); ++ } ++ ++ ret = bch2_trans_realloc_iters(trans, trans->size * 2); ++ if (ret) ++ return ERR_PTR(ret); ++ } ++ ++ idx = trans->nr_iters++; ++ BUG_ON(trans->nr_iters > trans->size); ++ ++ trans->iters[idx].idx = idx; ++got_slot: ++ BUG_ON(trans->iters_linked & (1ULL << idx)); ++ trans->iters_linked |= 1ULL << idx; ++ trans->iters[idx].flags = 0; ++ return &trans->iters[idx]; ++} ++ ++static inline void btree_iter_copy(struct btree_iter *dst, ++ struct btree_iter *src) ++{ ++ unsigned i, idx = dst->idx; ++ ++ *dst = *src; ++ dst->idx = idx; ++ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ if (btree_node_locked(dst, i)) ++ six_lock_increment(&dst->l[i].b->c.lock, ++ __btree_lock_want(dst, i)); ++ ++ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; ++} ++ ++static inline struct bpos bpos_diff(struct bpos l, struct bpos r) ++{ ++ if (bkey_cmp(l, r) > 0) ++ swap(l, r); ++ ++ return POS(r.inode - l.inode, r.offset - l.offset); ++} ++ ++static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, ++ unsigned btree_id, struct bpos pos, ++ unsigned flags) ++{ ++ struct btree_iter *iter, *best = NULL; ++ ++ BUG_ON(trans->nr_iters > BTREE_ITER_MAX); ++ ++ trans_for_each_iter(trans, iter) { ++ if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) ++ continue; ++ ++ if (iter->btree_id != btree_id) ++ continue; ++ ++ if (best && ++ bkey_cmp(bpos_diff(best->pos, pos), ++ bpos_diff(iter->pos, pos)) < 0) ++ continue; ++ ++ best = iter; ++ } ++ ++ if (!best) { ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ bch2_btree_iter_init(trans, iter, btree_id, pos, flags); ++ } else if ((trans->iters_live & (1ULL << best->idx)) || ++ (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ btree_iter_copy(iter, best); ++ } else { ++ iter = best; ++ } ++ ++ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ iter->flags &= ~BTREE_ITER_USER_FLAGS; ++ iter->flags |= flags & BTREE_ITER_USER_FLAGS; ++ ++ if (iter->flags & BTREE_ITER_INTENT) ++ bch2_btree_iter_upgrade(iter, 1); ++ else ++ bch2_btree_iter_downgrade(iter); ++ ++ BUG_ON(iter->btree_id != btree_id); ++ BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); ++ BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); ++ BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); ++ BUG_ON(trans->iters_live & (1ULL << iter->idx)); ++ ++ trans->iters_live |= 1ULL << iter->idx; ++ trans->iters_touched |= 1ULL << iter->idx; ++ ++ return iter; ++} ++ ++struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct btree_iter *iter = ++ __btree_trans_get_iter(trans, btree_id, pos, flags); ++ ++ if (!IS_ERR(iter)) ++ __bch2_btree_iter_set_pos(iter, pos, ++ btree_node_type_is_extents(btree_id)); ++ return iter; ++} ++ ++struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, ++ unsigned flags) ++{ ++ struct btree_iter *iter = ++ __btree_trans_get_iter(trans, btree_id, pos, ++ flags|BTREE_ITER_NODES); ++ unsigned i; ++ ++ BUG_ON(IS_ERR(iter)); ++ BUG_ON(bkey_cmp(iter->pos, pos)); ++ ++ iter->locks_want = locks_want; ++ iter->level = depth; ++ iter->min_depth = depth; ++ ++ for (i = 0; i < 
ARRAY_SIZE(iter->l); i++) ++ iter->l[i].b = NULL; ++ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; ++ ++ return iter; ++} ++ ++struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, ++ struct btree_iter *src) ++{ ++ struct btree_iter *iter; ++ ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ btree_iter_copy(iter, src); ++ ++ trans->iters_live |= 1ULL << iter->idx; ++ /* ++ * We don't need to preserve this iter since it's cheap to copy it ++ * again - this will cause trans_iter_put() to free it right away: ++ */ ++ trans->iters_touched &= ~(1ULL << iter->idx); ++ ++ return iter; ++} ++ ++static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) ++{ ++ if (size > trans->mem_bytes) { ++ size_t old_bytes = trans->mem_bytes; ++ size_t new_bytes = roundup_pow_of_two(size); ++ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); ++ ++ if (!new_mem) ++ return -ENOMEM; ++ ++ trans->mem = new_mem; ++ trans->mem_bytes = new_bytes; ++ ++ if (old_bytes) { ++ trace_trans_restart_mem_realloced(trans->ip, new_bytes); ++ return -EINTR; ++ } ++ } ++ ++ return 0; ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) ++{ ++ void *p; ++ int ret; ++ ++ ret = bch2_trans_preload_mem(trans, trans->mem_top + size); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ p = trans->mem + trans->mem_top; ++ trans->mem_top += size; ++ return p; ++} ++ ++inline void bch2_trans_unlink_iters(struct btree_trans *trans) ++{ ++ u64 iters = trans->iters_linked & ++ ~trans->iters_touched & ++ ~trans->iters_live; ++ ++ while (iters) { ++ unsigned idx = __ffs64(iters); ++ ++ iters &= ~(1ULL << idx); ++ __bch2_trans_iter_free(trans, idx); ++ } ++} ++ ++void bch2_trans_reset(struct btree_trans *trans, unsigned flags) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| ++ BTREE_ITER_SET_POS_AFTER_COMMIT); ++ ++ bch2_trans_unlink_iters(trans); ++ ++ trans->iters_touched &= trans->iters_live; ++ ++ trans->need_reset = 0; ++ trans->nr_updates = 0; ++ trans->nr_updates2 = 0; ++ trans->mem_top = 0; ++ ++ trans->extra_journal_entries = NULL; ++ trans->extra_journal_entry_u64s = 0; ++ ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset(&trans->fs_usage_deltas->memset_start, 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } ++ ++ if (!(flags & TRANS_RESET_NOTRAVERSE)) ++ bch2_btree_iter_traverse_all(trans); ++} ++ ++void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, ++ unsigned expected_nr_iters, ++ size_t expected_mem_bytes) ++{ ++ memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); ++ ++ /* ++ * reallocating iterators currently completely breaks ++ * bch2_trans_iter_put(): ++ */ ++ expected_nr_iters = BTREE_ITER_MAX; ++ ++ trans->c = c; ++ trans->ip = _RET_IP_; ++ trans->size = ARRAY_SIZE(trans->iters_onstack); ++ trans->iters = trans->iters_onstack; ++ trans->updates = trans->updates_onstack; ++ trans->updates2 = trans->updates2_onstack; ++ trans->fs_usage_deltas = NULL; ++ ++ if (expected_nr_iters > trans->size) ++ bch2_trans_realloc_iters(trans, expected_nr_iters); ++ ++ if (expected_mem_bytes) ++ bch2_trans_preload_mem(trans, expected_mem_bytes); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->pid = current->pid; ++ mutex_lock(&c->btree_trans_lock); ++ list_add(&trans->list, &c->btree_trans_list); ++ mutex_unlock(&c->btree_trans_lock); ++#endif ++} ++ ++int 
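
bch2_trans_kmalloc() above is a bump allocator over a single buffer that grows by power-of-two reallocation; the catch, visible in bch2_trans_preload_mem(), is that growing an already-used buffer hands back -EINTR, because the reallocation may have moved earlier allocations and the transaction must restart. The core allocation path, minus the restart plumbing (toy_mem and bump_alloc are illustrative names):

	#include <stdlib.h>

	struct toy_mem {
		char	*buf;
		size_t	top;	/* bytes handed out so far */
		size_t	cap;	/* current buffer size */
	};

	/* Round up to the next power of two. */
	static size_t roundup_pow2(size_t n)
	{
		size_t r = 1;

		while (r < n)
			r <<= 1;
		return r;
	}

	static void *bump_alloc(struct toy_mem *m, size_t size)
	{
		if (m->top + size > m->cap) {
			size_t new_cap = roundup_pow2(m->top + size);
			char *p = realloc(m->buf, new_cap);

			if (!p)
				return NULL;
			/*
			 * This is where the real code asks for a restart:
			 * realloc may have moved everything already handed out.
			 */
			m->buf = p;
			m->cap = new_cap;
		}

		void *p = m->buf + m->top;

		m->top += size;
		return p;
	}
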
bch2_trans_exit(struct btree_trans *trans) ++{ ++ bch2_trans_unlock(trans); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ mutex_lock(&trans->c->btree_trans_lock); ++ list_del(&trans->list); ++ mutex_unlock(&trans->c->btree_trans_lock); ++#endif ++ ++ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ ++ kfree(trans->fs_usage_deltas); ++ kfree(trans->mem); ++ if (trans->used_mempool) ++ mempool_free(trans->iters, &trans->c->btree_iters_pool); ++ else if (trans->iters != trans->iters_onstack) ++ kfree(trans->iters); ++ trans->mem = (void *) 0x1; ++ trans->iters = (void *) 0x1; ++ ++ return trans->error ? -EIO : 0; ++} ++ ++static void bch2_btree_iter_node_to_text(struct printbuf *out, ++ struct btree_bkey_cached_common *_b, ++ enum btree_iter_type type) ++{ ++ pr_buf(out, " %px l=%u %s:", ++ _b, _b->level, bch2_btree_ids[_b->btree_id]); ++ bch2_bpos_to_text(out, btree_node_pos(_b, type)); ++} ++ ++void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree_trans *trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned l; ++ ++ mutex_lock(&c->btree_trans_lock); ++ list_for_each_entry(trans, &c->btree_trans_list, list) { ++ pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); ++ ++ trans_for_each_iter(trans, iter) { ++ if (!iter->nodes_locked) ++ continue; ++ ++ pr_buf(out, " iter %u %s:", ++ iter->idx, ++ bch2_btree_ids[iter->btree_id]); ++ bch2_bpos_to_text(out, iter->pos); ++ pr_buf(out, "\n"); ++ ++ for (l = 0; l < BTREE_MAX_DEPTH; l++) { ++ if (btree_node_locked(iter, l)) { ++ pr_buf(out, " %s l=%u ", ++ btree_node_intent_locked(iter, l) ? "i" : "r", l); ++ bch2_btree_iter_node_to_text(out, ++ (void *) iter->l[l].b, ++ btree_iter_type(iter)); ++ pr_buf(out, "\n"); ++ } ++ } ++ } ++ ++ b = READ_ONCE(trans->locking); ++ if (b) { ++ pr_buf(out, " locking iter %u l=%u %s:", ++ trans->locking_iter_idx, ++ trans->locking_level, ++ bch2_btree_ids[trans->locking_btree_id]); ++ bch2_bpos_to_text(out, trans->locking_pos); ++ ++ ++ pr_buf(out, " node "); ++ bch2_btree_iter_node_to_text(out, ++ (void *) b, ++ btree_iter_type(&trans->iters[trans->locking_iter_idx])); ++ pr_buf(out, "\n"); ++ } ++ } ++ mutex_unlock(&c->btree_trans_lock); ++#endif ++} ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *c) ++{ ++ mempool_exit(&c->btree_iters_pool); ++} ++ ++int bch2_fs_btree_iter_init(struct bch_fs *c) ++{ ++ unsigned nr = BTREE_ITER_MAX; ++ ++ INIT_LIST_HEAD(&c->btree_trans_list); ++ mutex_init(&c->btree_trans_lock); ++ ++ return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, ++ sizeof(struct btree_iter) * nr + ++ sizeof(struct btree_insert_entry) * nr + ++ sizeof(struct btree_insert_entry) * nr); ++} +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +new file mode 100644 +index 000000000000..bd9ec3ec9a92 +--- /dev/null ++++ b/fs/bcachefs/btree_iter.h +@@ -0,0 +1,314 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_ITER_H ++#define _BCACHEFS_BTREE_ITER_H ++ ++#include "bset.h" ++#include "btree_types.h" ++ ++static inline void btree_iter_set_dirty(struct btree_iter *iter, ++ enum btree_iter_uptodate u) ++{ ++ iter->uptodate = max_t(unsigned, iter->uptodate, u); ++} ++ ++static inline struct btree *btree_iter_node(struct btree_iter *iter, ++ unsigned level) ++{ ++ return level < BTREE_MAX_DEPTH ? 
iter->l[level].b : NULL; ++} ++ ++static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, ++ const struct btree *b, unsigned level) ++{ ++ /* ++ * We don't compare the low bits of the lock sequence numbers because ++ * @iter might have taken a write lock on @b, and we don't want to skip ++ * the linked iterator if the sequence numbers were equal before taking ++ * that write lock. The lock sequence number is incremented by taking ++ * and releasing write locks and is even when unlocked: ++ */ ++ return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; ++} ++ ++static inline struct btree *btree_node_parent(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return btree_iter_node(iter, b->c.level + 1); ++} ++ ++static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) ++{ ++ return hweight64(trans->iters_linked) > 1; ++} ++ ++static inline int btree_iter_err(const struct btree_iter *iter) ++{ ++ return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; ++} ++ ++/* Iterate over iters within a transaction: */ ++ ++#define trans_for_each_iter_all(_trans, _iter) \ ++ for (_iter = (_trans)->iters; \ ++ _iter < (_trans)->iters + (_trans)->nr_iters; \ ++ _iter++) ++ ++static inline struct btree_iter * ++__trans_next_iter(struct btree_trans *trans, unsigned idx) ++{ ++ EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); ++ ++ for (; idx < trans->nr_iters; idx++) ++ if (trans->iters_linked & (1ULL << idx)) ++ return &trans->iters[idx]; ++ ++ return NULL; ++} ++ ++#define trans_for_each_iter(_trans, _iter) \ ++ for (_iter = __trans_next_iter((_trans), 0); \ ++ (_iter); \ ++ _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) ++ ++static inline bool __iter_has_node(const struct btree_iter *iter, ++ const struct btree *b) ++{ ++ return iter->l[b->c.level].b == b && ++ btree_node_lock_seq_matches(iter, b, b->c.level); ++} ++ ++static inline struct btree_iter * ++__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, ++ unsigned idx) ++{ ++ struct btree_iter *iter = __trans_next_iter(trans, idx); ++ ++ while (iter && !__iter_has_node(iter, b)) ++ iter = __trans_next_iter(trans, iter->idx + 1); ++ ++ return iter; ++} ++ ++#define trans_for_each_iter_with_node(_trans, _b, _iter) \ ++ for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ ++ (_iter); \ ++ _iter = __trans_next_iter_with_node((_trans), (_b), \ ++ (_iter)->idx + 1)) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); ++void bch2_btree_trans_verify_locks(struct btree_trans *); ++#else ++static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, ++ struct btree *b) {} ++static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} ++#endif ++ ++void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, ++ struct bkey_packed *); ++void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, ++ struct btree_node_iter *, struct bkey_packed *, ++ unsigned, unsigned); ++ ++bool bch2_btree_iter_relock(struct btree_iter *, bool); ++bool bch2_trans_relock(struct btree_trans *); ++void bch2_trans_unlock(struct btree_trans *); ++ ++bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); ++bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); ++ ++static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); ++ ++ return iter->locks_want < 
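
btree_node_lock_seq_matches() above leans on the property spelled out in its comment: the lock sequence number is bumped on write lock and again on write unlock, so it is even whenever the node is not write locked, and discarding the low bit lets an iterator that itself took the write lock still recognise its own node. A tiny self-contained version of that check (seq_matches is an illustrative name):

	#include <stdbool.h>

	/*
	 * seq increments on write lock and on write unlock:
	 *   even -> not write locked, odd -> currently write locked.
	 * Comparing seq >> 1 ignores a write lock we may hold ourselves,
	 * while still catching any completed write by someone else.
	 */
	static bool seq_matches(unsigned int seq_seen, unsigned int seq_now)
	{
		return (seq_seen >> 1) == (seq_now >> 1);
	}
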
new_locks_want ++ ? (!iter->trans->nounlock ++ ? __bch2_btree_iter_upgrade(iter, new_locks_want) ++ : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) ++ : iter->uptodate <= BTREE_ITER_NEED_PEEK; ++} ++ ++void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); ++ ++static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) ++{ ++ if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) ++ __bch2_btree_iter_downgrade(iter, 0); ++} ++ ++void bch2_trans_downgrade(struct btree_trans *); ++ ++void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); ++void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); ++ ++void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); ++ ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *); ++ ++static inline int __must_check ++bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ return iter->uptodate >= BTREE_ITER_NEED_RELOCK ++ ? __bch2_btree_iter_traverse(iter) ++ : 0; ++} ++ ++int bch2_btree_iter_traverse_all(struct btree_trans *); ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *); ++struct btree *bch2_btree_iter_next_node(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); ++ ++void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); ++void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); ++void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); ++ ++static inline int btree_iter_cmp(const struct btree_iter *l, ++ const struct btree_iter *r) ++{ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: ++ bkey_cmp(l->pos, r->pos); ++} ++ ++/* ++ * Unlocks before scheduling ++ * Note: does not revalidate iterator ++ */ ++static inline int bch2_trans_cond_resched(struct btree_trans *trans) ++{ ++ if (need_resched() || race_fault()) { ++ bch2_trans_unlock(trans); ++ schedule(); ++ return bch2_trans_relock(trans) ? 0 : -EINTR; ++ } else { ++ return 0; ++ } ++} ++ ++#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _locks_want, _depth, _flags, _b) \ ++ for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ ++ _start, _locks_want, _depth, _flags), \ ++ _b = bch2_btree_iter_peek_node(_iter); \ ++ (_b); \ ++ (_b) = bch2_btree_iter_next_node(_iter)) ++ ++#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _flags, _b) \ ++ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ 0, 0, _flags, _b) ++ ++static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, ++ unsigned flags) ++{ ++ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) ++ return bch2_btree_iter_peek_cached(iter); ++ else ++ return flags & BTREE_ITER_SLOTS ++ ? bch2_btree_iter_peek_slot(iter) ++ : bch2_btree_iter_peek(iter); ++} ++ ++static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, ++ unsigned flags) ++{ ++ return flags & BTREE_ITER_SLOTS ++ ? 
bch2_btree_iter_next_slot(iter) ++ : bch2_btree_iter_next(iter); ++} ++ ++static inline int bkey_err(struct bkey_s_c k) ++{ ++ return PTR_ERR_OR_ZERO(k.k); ++} ++ ++#define for_each_btree_key(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _ret) \ ++ for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ ++ bch2_trans_get_iter((_trans), (_btree_id), \ ++ (_start), (_flags))) ?: \ ++ PTR_ERR_OR_ZERO(((_k) = \ ++ __bch2_btree_iter_peek(_iter, _flags)).k); \ ++ !_ret && (_k).k; \ ++ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ ++ __bch2_btree_iter_next(_iter, _flags)).k)) ++ ++#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ ++ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ (_k) = __bch2_btree_iter_next(_iter, _flags)) ++ ++/* new multiple iterator interface: */ ++ ++int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); ++int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); ++ ++void bch2_trans_unlink_iters(struct btree_trans *); ++ ++struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, ++ struct bpos, unsigned); ++ ++static inline struct btree_iter * ++bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct btree_iter *iter = ++ __bch2_trans_get_iter(trans, btree_id, pos, flags); ++ ++ if (!IS_ERR(iter)) ++ iter->ip_allocated = _THIS_IP_; ++ return iter; ++} ++ ++struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, ++ struct btree_iter *); ++static inline struct btree_iter * ++bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) ++{ ++ struct btree_iter *iter = ++ __bch2_trans_copy_iter(trans, src); ++ ++ if (!IS_ERR(iter)) ++ iter->ip_allocated = _THIS_IP_; ++ return iter; ++ ++} ++ ++struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, ++ enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned); ++ ++#define TRANS_RESET_NOTRAVERSE (1 << 0) ++ ++void bch2_trans_reset(struct btree_trans *, unsigned); ++ ++static inline void bch2_trans_begin(struct btree_trans *trans) ++{ ++ return bch2_trans_reset(trans, 0); ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *, size_t); ++void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); ++int bch2_trans_exit(struct btree_trans *); ++ ++void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *); ++int bch2_fs_btree_iter_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_ITER_H */ +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +new file mode 100644 +index 000000000000..d73cc8ddadac +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.c +@@ -0,0 +1,519 @@ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++ ++#include ++ ++static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct bkey_cached *ck = obj; ++ const struct bkey_cached_key *key = arg->key; ++ ++ return cmp_int(ck->key.btree_id, key->btree_id) ?: ++ bkey_cmp(ck->key.pos, key->pos); ++} ++ ++static const struct rhashtable_params bch2_btree_key_cache_params = { ++ .head_offset = offsetof(struct bkey_cached, hash), ++ .key_offset = offsetof(struct bkey_cached, key), ++ .key_len = sizeof(struct bkey_cached_key), ++ .obj_cmpfn = 
bch2_btree_key_cache_cmp_fn, ++}; ++ ++__flatten ++static inline struct bkey_cached * ++btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) ++{ ++ struct bkey_cached_key key = { ++ .btree_id = btree_id, ++ .pos = pos, ++ }; ++ ++ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, ++ bch2_btree_key_cache_params); ++} ++ ++static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) ++{ ++ if (!six_trylock_intent(&ck->c.lock)) ++ return false; ++ ++ if (!six_trylock_write(&ck->c.lock)) { ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void bkey_cached_evict(struct btree_key_cache *c, ++ struct bkey_cached *ck) ++{ ++ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, ++ bch2_btree_key_cache_params)); ++ memset(&ck->key, ~0, sizeof(ck->key)); ++} ++ ++static void bkey_cached_free(struct btree_key_cache *c, ++ struct bkey_cached *ck) ++{ ++ list_move(&ck->list, &c->freed); ++ ++ kfree(ck->k); ++ ck->k = NULL; ++ ck->u64s = 0; ++ ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++} ++ ++static struct bkey_cached * ++bkey_cached_alloc(struct btree_key_cache *c) ++{ ++ struct bkey_cached *ck; ++ ++ list_for_each_entry(ck, &c->freed, list) ++ if (bkey_cached_lock_for_evict(ck)) ++ return ck; ++ ++ list_for_each_entry(ck, &c->clean, list) ++ if (bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(c, ck); ++ return ck; ++ } ++ ++ ck = kzalloc(sizeof(*ck), GFP_NOFS); ++ if (!ck) ++ return NULL; ++ ++ INIT_LIST_HEAD(&ck->list); ++ six_lock_init(&ck->c.lock); ++ BUG_ON(!six_trylock_intent(&ck->c.lock)); ++ BUG_ON(!six_trylock_write(&ck->c.lock)); ++ ++ return ck; ++} ++ ++static struct bkey_cached * ++btree_key_cache_create(struct btree_key_cache *c, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct bkey_cached *ck; ++ ++ ck = bkey_cached_alloc(c); ++ if (!ck) ++ return ERR_PTR(-ENOMEM); ++ ++ ck->c.level = 0; ++ ck->c.btree_id = btree_id; ++ ck->key.btree_id = btree_id; ++ ck->key.pos = pos; ++ ck->valid = false; ++ ++ BUG_ON(ck->flags); ++ ++ if (rhashtable_lookup_insert_fast(&c->table, ++ &ck->hash, ++ bch2_btree_key_cache_params)) { ++ /* We raced with another fill: */ ++ bkey_cached_free(c, ck); ++ return NULL; ++ } ++ ++ list_move(&ck->list, &c->clean); ++ six_unlock_write(&ck->c.lock); ++ ++ return ck; ++} ++ ++static int btree_key_cache_fill(struct btree_trans *trans, ++ struct btree_iter *ck_iter, ++ struct bkey_cached *ck) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ unsigned new_u64s = 0; ++ struct bkey_i *new_k = NULL; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, ck->key.btree_id, ++ ck->key.pos, BTREE_ITER_SLOTS); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++ } ++ ++ if (!bch2_btree_node_relock(ck_iter, 0)) { ++ bch2_trans_iter_put(trans, iter); ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ return -EINTR; ++ } ++ ++ if (k.k->u64s > ck->u64s) { ++ new_u64s = roundup_pow_of_two(k.k->u64s); ++ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) { ++ bch2_trans_iter_put(trans, iter); ++ return -ENOMEM; ++ } ++ } ++ ++ bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); ++ if (new_k) { ++ kfree(ck->k); ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ } ++ ++ 
bkey_reassemble(ck->k, k); ++ ck->valid = true; ++ bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); ++ ++ /* We're not likely to need this iterator again: */ ++ bch2_trans_iter_free(trans, iter); ++ ++ return 0; ++} ++ ++static int bkey_cached_check_fn(struct six_lock *lock, void *p) ++{ ++ struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); ++ const struct btree_iter *iter = p; ++ ++ return ck->key.btree_id == iter->btree_id && ++ !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; ++} ++ ++int bch2_btree_iter_traverse_cached(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck; ++ int ret = 0; ++ ++ BUG_ON(iter->level); ++ ++ if (btree_node_locked(iter, 0)) { ++ ck = (void *) iter->l[0].b; ++ goto fill; ++ } ++retry: ++ ck = btree_key_cache_find(c, iter->btree_id, iter->pos); ++ if (!ck) { ++ if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { ++ iter->l[0].b = NULL; ++ return 0; ++ } ++ ++ mutex_lock(&c->btree_key_cache.lock); ++ ck = btree_key_cache_create(&c->btree_key_cache, ++ iter->btree_id, iter->pos); ++ mutex_unlock(&c->btree_key_cache.lock); ++ ++ ret = PTR_ERR_OR_ZERO(ck); ++ if (ret) ++ goto err; ++ if (!ck) ++ goto retry; ++ ++ mark_btree_node_locked(iter, 0, SIX_LOCK_intent); ++ iter->locks_want = 1; ++ } else { ++ enum six_lock_type lock_want = __btree_lock_want(iter, 0); ++ ++ if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, ++ bkey_cached_check_fn, iter)) { ++ if (ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)) { ++ goto retry; ++ } ++ ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ ret = -EINTR; ++ goto err; ++ } ++ ++ if (ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)) { ++ six_unlock_type(&ck->c.lock, lock_want); ++ goto retry; ++ } ++ ++ mark_btree_node_locked(iter, 0, lock_want); ++ } ++ ++ iter->l[0].lock_seq = ck->c.lock.state.seq; ++ iter->l[0].b = (void *) ck; ++fill: ++ if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { ++ if (!btree_node_intent_locked(iter, 0)) ++ bch2_btree_iter_upgrade(iter, 1); ++ if (!btree_node_intent_locked(iter, 0)) { ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ ret = -EINTR; ++ goto err; ++ } ++ ++ ret = btree_key_cache_fill(trans, iter, ck); ++ if (ret) ++ goto err; ++ } ++ ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ bch2_btree_iter_downgrade(iter); ++ return ret; ++err: ++ if (ret != -EINTR) { ++ btree_node_unlock(iter, 0); ++ iter->flags |= BTREE_ITER_ERROR; ++ iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; ++ } ++ return ret; ++} ++ ++static int btree_key_cache_flush_pos(struct btree_trans *trans, ++ struct bkey_cached_key key, ++ u64 journal_seq, ++ bool evict) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct btree_iter *c_iter = NULL, *b_iter = NULL; ++ struct bkey_cached *ck; ++ int ret; ++ ++ b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(b_iter); ++ if (ret) ++ goto out; ++ ++ c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_CACHED_NOCREATE| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(c_iter); ++ if (ret) ++ goto out; ++retry: ++ ret = bch2_btree_iter_traverse(c_iter); ++ if (ret) ++ goto err; ++ ++ ck = (void *) c_iter->l[0].b; ++ if (!ck || ++ (journal_seq && ck->journal.seq != journal_seq)) ++ goto out; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, 
&ck->flags)) { ++ if (!evict) ++ goto out; ++ goto evict; ++ } ++ ++ ret = bch2_btree_iter_traverse(b_iter) ?: ++ bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_JOURNAL_RESERVED| ++ BTREE_INSERT_JOURNAL_RECLAIM); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ BUG_ON(ret && !bch2_journal_error(j)); ++ ++ if (ret) ++ goto out; ++ ++ bch2_journal_pin_drop(j, &ck->journal); ++ bch2_journal_preres_put(j, &ck->res); ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ ++ if (!evict) { ++ mutex_lock(&c->btree_key_cache.lock); ++ list_move_tail(&ck->list, &c->btree_key_cache.clean); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } else { ++evict: ++ BUG_ON(!btree_node_intent_locked(c_iter, 0)); ++ ++ mark_btree_node_unlocked(c_iter, 0); ++ c_iter->l[0].b = NULL; ++ ++ six_lock_write(&ck->c.lock, NULL, NULL); ++ ++ mutex_lock(&c->btree_key_cache.lock); ++ bkey_cached_evict(&c->btree_key_cache, ck); ++ bkey_cached_free(&c->btree_key_cache, ck); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } ++out: ++ bch2_trans_iter_put(trans, b_iter); ++ bch2_trans_iter_put(trans, c_iter); ++ return ret; ++} ++ ++static void btree_key_cache_journal_flush(struct journal *j, ++ struct journal_entry_pin *pin, ++ u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bkey_cached *ck = ++ container_of(pin, struct bkey_cached, journal); ++ struct bkey_cached_key key; ++ struct btree_trans trans; ++ ++ six_lock_read(&ck->c.lock, NULL, NULL); ++ key = READ_ONCE(ck->key); ++ ++ if (ck->journal.seq != seq || ++ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_read(&ck->c.lock); ++ return; ++ } ++ six_unlock_read(&ck->c.lock); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ btree_key_cache_flush_pos(&trans, key, seq, false); ++ bch2_trans_exit(&trans); ++} ++ ++/* ++ * Flush and evict a key from the key cache: ++ */ ++int bch2_btree_key_cache_flush(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached_key key = { id, pos }; ++ ++ /* Fastpath - assume it won't be found: */ ++ if (!btree_key_cache_find(c, id, pos)) ++ return 0; ++ ++ return btree_key_cache_flush_pos(trans, key, 0, true); ++} ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ ++ BUG_ON(insert->u64s > ck->u64s); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ int difference; ++ ++ BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); ++ ++ difference = jset_u64s(insert->u64s) - ck->res.u64s; ++ if (difference > 0) { ++ trans->journal_preres.u64s -= difference; ++ ck->res.u64s += difference; ++ } ++ } ++ ++ bkey_copy(ck->k, insert); ++ ck->valid = true; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ mutex_lock(&c->btree_key_cache.lock); ++ list_del_init(&ck->list); ++ ++ set_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } ++ ++ bch2_journal_pin_update(&c->journal, trans->journal_res.seq, ++ &ck->journal, btree_key_cache_journal_flush); ++ return true; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) ++{ ++ 
BUG_ON(btree_key_cache_find(trans->c, id, pos)); ++} ++#endif ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) ++{ ++ struct bkey_cached *ck, *n; ++ ++ mutex_lock(&c->lock); ++ list_for_each_entry_safe(ck, n, &c->clean, list) { ++ kfree(ck->k); ++ kfree(ck); ++ } ++ list_for_each_entry_safe(ck, n, &c->freed, list) ++ kfree(ck); ++ mutex_unlock(&c->lock); ++ ++ rhashtable_destroy(&c->table); ++} ++ ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) ++{ ++ mutex_init(&c->lock); ++ INIT_LIST_HEAD(&c->freed); ++ INIT_LIST_HEAD(&c->clean); ++} ++ ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) ++{ ++ return rhashtable_init(&c->table, &bch2_btree_key_cache_params); ++} ++ ++void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) ++{ ++ struct bucket_table *tbl; ++ struct bkey_cached *ck; ++ struct rhash_head *pos; ++ size_t i; ++ ++ mutex_lock(&c->lock); ++ tbl = rht_dereference_rcu(c->table.tbl, &c->table); ++ ++ for (i = 0; i < tbl->size; i++) { ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ pr_buf(out, "%s:", ++ bch2_btree_ids[ck->key.btree_id]); ++ bch2_bpos_to_text(out, ck->key.pos); ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) ++ pr_buf(out, " journal seq %llu", ck->journal.seq); ++ pr_buf(out, "\n"); ++ } ++ } ++ mutex_unlock(&c->lock); ++} +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +new file mode 100644 +index 000000000000..b1756c6c622c +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.h +@@ -0,0 +1,25 @@ ++#ifndef _BCACHEFS_BTREE_KEY_CACHE_H ++#define _BCACHEFS_BTREE_KEY_CACHE_H ++ ++int bch2_btree_iter_traverse_cached(struct btree_iter *); ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *, ++ struct btree_iter *, struct bkey_i *); ++int bch2_btree_key_cache_flush(struct btree_trans *, ++ enum btree_id, struct bpos); ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_key_cache_verify_clean(struct btree_trans *, ++ enum btree_id, struct bpos); ++#else ++static inline void ++bch2_btree_key_cache_verify_clean(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) {} ++#endif ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *); ++ ++void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); ++ ++#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +new file mode 100644 +index 000000000000..81fbf3e18647 +--- /dev/null ++++ b/fs/bcachefs/btree_locking.h +@@ -0,0 +1,257 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_LOCKING_H ++#define _BCACHEFS_BTREE_LOCKING_H ++ ++/* ++ * Only for internal btree use: ++ * ++ * The btree iterator tracks what locks it wants to take, and what locks it ++ * currently has - here we have wrappers for locking/unlocking btree nodes and ++ * updating the iterator state ++ */ ++ ++#include ++ ++#include "btree_iter.h" ++ ++/* matches six lock types */ ++enum btree_node_locked_type { ++ BTREE_NODE_UNLOCKED = -1, ++ BTREE_NODE_READ_LOCKED = SIX_LOCK_read, ++ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, ++}; ++ ++static inline int btree_node_locked_type(struct btree_iter *iter, ++ unsigned level) ++{ ++ /* ++ * We're relying on the fact that if nodes_intent_locked is set ++ * nodes_locked must be set as well, so that we can compute without ++ * branches: ++ */ ++ return BTREE_NODE_UNLOCKED + 
++ ((iter->nodes_locked >> level) & 1) + ++ ((iter->nodes_intent_locked >> level) & 1); ++} ++ ++static inline bool btree_node_intent_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; ++} ++ ++static inline bool btree_node_read_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; ++} ++ ++static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) ++{ ++ return iter->nodes_locked & (1 << level); ++} ++ ++static inline void mark_btree_node_unlocked(struct btree_iter *iter, ++ unsigned level) ++{ ++ iter->nodes_locked &= ~(1 << level); ++ iter->nodes_intent_locked &= ~(1 << level); ++} ++ ++static inline void mark_btree_node_locked(struct btree_iter *iter, ++ unsigned level, ++ enum six_lock_type type) ++{ ++ /* relying on this to avoid a branch */ ++ BUILD_BUG_ON(SIX_LOCK_read != 0); ++ BUILD_BUG_ON(SIX_LOCK_intent != 1); ++ ++ iter->nodes_locked |= 1 << level; ++ iter->nodes_intent_locked |= type << level; ++} ++ ++static inline void mark_btree_node_intent_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ mark_btree_node_locked(iter, level, SIX_LOCK_intent); ++} ++ ++static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) ++{ ++ return level < iter->locks_want ++ ? SIX_LOCK_intent ++ : SIX_LOCK_read; ++} ++ ++static inline enum btree_node_locked_type ++btree_lock_want(struct btree_iter *iter, int level) ++{ ++ if (level < iter->level) ++ return BTREE_NODE_UNLOCKED; ++ if (level < iter->locks_want) ++ return BTREE_NODE_INTENT_LOCKED; ++ if (level == iter->level) ++ return BTREE_NODE_READ_LOCKED; ++ return BTREE_NODE_UNLOCKED; ++} ++ ++static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) ++{ ++ int lock_type = btree_node_locked_type(iter, level); ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ if (lock_type != BTREE_NODE_UNLOCKED) ++ six_unlock_type(&iter->l[level].b->c.lock, lock_type); ++ mark_btree_node_unlocked(iter, level); ++} ++ ++static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) ++{ ++ EBUG_ON(!level && iter->trans->nounlock); ++ ++ __btree_node_unlock(iter, level); ++} ++ ++static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) ++{ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ while (iter->nodes_locked) ++ btree_node_unlock(iter, __ffs(iter->nodes_locked)); ++} ++ ++static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) ++{ ++ switch (type) { ++ case SIX_LOCK_read: ++ return BCH_TIME_btree_lock_contended_read; ++ case SIX_LOCK_intent: ++ return BCH_TIME_btree_lock_contended_intent; ++ case SIX_LOCK_write: ++ return BCH_TIME_btree_lock_contended_write; ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * wrapper around six locks that just traces lock contended time ++ */ ++static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, ++ enum six_lock_type type) ++{ ++ u64 start_time = local_clock(); ++ ++ six_lock_type(&b->c.lock, type, NULL, NULL); ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++} ++ ++static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, ++ enum six_lock_type type) ++{ ++ if (!six_trylock_type(&b->c.lock, type)) ++ __btree_node_lock_type(c, b, type); ++} ++ ++/* ++ * Lock a btree node if we already have it locked on one of our linked ++ * iterators: ++ */ ++static inline bool 
btree_node_lock_increment(struct btree_trans *trans, ++ struct btree *b, unsigned level, ++ enum btree_node_locked_type want) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ if (iter->l[level].b == b && ++ btree_node_locked_type(iter, level) >= want) { ++ six_lock_increment(&b->c.lock, want); ++ return true; ++ } ++ ++ return false; ++} ++ ++bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, ++ struct btree_iter *, enum six_lock_type, ++ six_lock_should_sleep_fn, void *); ++ ++static inline bool btree_node_lock(struct btree *b, ++ struct bpos pos, unsigned level, ++ struct btree_iter *iter, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ struct btree_trans *trans = iter->trans; ++ bool ret; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->locking = b; ++ trans->locking_iter_idx = iter->idx; ++ trans->locking_pos = pos; ++ trans->locking_btree_id = iter->btree_id; ++ trans->locking_level = level; ++#endif ++ ret = likely(six_trylock_type(&b->c.lock, type)) || ++ btree_node_lock_increment(trans, b, level, type) || ++ __bch2_btree_node_lock(b, pos, level, iter, type, ++ should_sleep_fn, p); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->locking = NULL; ++#endif ++ return ret; ++} ++ ++bool __bch2_btree_node_relock(struct btree_iter *, unsigned); ++ ++static inline bool bch2_btree_node_relock(struct btree_iter *iter, ++ unsigned level) ++{ ++ EBUG_ON(btree_node_locked(iter, level) && ++ btree_node_locked_type(iter, level) != ++ __btree_lock_want(iter, level)); ++ ++ return likely(btree_node_locked(iter, level)) || ++ __bch2_btree_node_relock(iter, level); ++} ++ ++/* ++ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will ++ * succeed: ++ */ ++static inline void ++bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ ++ EBUG_ON(iter->l[b->c.level].b != b); ++ EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ linked->l[b->c.level].lock_seq += 2; ++ ++ six_unlock_write(&b->c.lock); ++} ++ ++void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); ++ ++void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); ++ ++static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) ++{ ++ EBUG_ON(iter->l[b->c.level].b != b); ++ EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); ++ ++ if (unlikely(!six_trylock_write(&b->c.lock))) ++ __bch2_btree_node_lock_write(b, iter); ++} ++ ++#endif /* _BCACHEFS_BTREE_LOCKING_H */ ++ ++ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +new file mode 100644 +index 000000000000..98611b1da1ed +--- /dev/null ++++ b/fs/bcachefs/btree_types.h +@@ -0,0 +1,666 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_TYPES_H ++#define _BCACHEFS_BTREE_TYPES_H ++ ++#include ++#include ++#include ++ ++#include "bkey_methods.h" ++#include "buckets_types.h" ++#include "journal_types.h" ++ ++struct open_bucket; ++struct btree_update; ++struct btree_trans; ++ ++#define MAX_BSETS 3U ++ ++struct btree_nr_keys { ++ ++ /* ++ * Amount of live metadata (i.e. 
size of node after a compaction) in ++ * units of u64s ++ */ ++ u16 live_u64s; ++ u16 bset_u64s[MAX_BSETS]; ++ ++ /* live keys only: */ ++ u16 packed_keys; ++ u16 unpacked_keys; ++}; ++ ++struct bset_tree { ++ /* ++ * We construct a binary tree in an array as if the array ++ * started at 1, so that things line up on the same cachelines ++ * better: see comments in bset.c at cacheline_to_bkey() for ++ * details ++ */ ++ ++ /* size of the binary tree and prev array */ ++ u16 size; ++ ++ /* function of size - precalculated for to_inorder() */ ++ u16 extra; ++ ++ u16 data_offset; ++ u16 aux_data_offset; ++ u16 end_offset; ++ ++ struct bpos max_key; ++}; ++ ++struct btree_write { ++ struct journal_entry_pin journal; ++}; ++ ++struct btree_alloc { ++ struct open_buckets ob; ++ BKEY_PADDED(k); ++}; ++ ++struct btree_bkey_cached_common { ++ struct six_lock lock; ++ u8 level; ++ u8 btree_id; ++}; ++ ++struct btree { ++ struct btree_bkey_cached_common c; ++ ++ struct rhash_head hash; ++ u64 hash_val; ++ ++ unsigned long flags; ++ u16 written; ++ u8 nsets; ++ u8 nr_key_bits; ++ ++ struct bkey_format format; ++ ++ struct btree_node *data; ++ void *aux_data; ++ ++ /* ++ * Sets of sorted keys - the real btree node - plus a binary search tree ++ * ++ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point ++ * to the memory we have allocated for this btree node. Additionally, ++ * set[0]->data points to the entire btree node as it exists on disk. ++ */ ++ struct bset_tree set[MAX_BSETS]; ++ ++ struct btree_nr_keys nr; ++ u16 sib_u64s[2]; ++ u16 whiteout_u64s; ++ u8 page_order; ++ u8 unpack_fn_len; ++ ++ /* ++ * XXX: add a delete sequence number, so when bch2_btree_node_relock() ++ * fails because the lock sequence number has changed - i.e. the ++ * contents were modified - we can still relock the node if it's still ++ * the one we want, without redoing the traversal ++ */ ++ ++ /* ++ * For asynchronous splits/interior node updates: ++ * When we do a split, we allocate new child nodes and update the parent ++ * node to point to them: we update the parent in memory immediately, ++ * but then we must wait until the children have been written out before ++ * the update to the parent can be written - this is a list of the ++ * btree_updates that are blocking this node from being ++ * written: ++ */ ++ struct list_head write_blocked; ++ ++ /* ++ * Also for asynchronous splits/interior node updates: ++ * If a btree node isn't reachable yet, we don't want to kick off ++ * another write - because that write also won't yet be reachable and ++ * marking it as completed before it's reachable would be incorrect: ++ */ ++ unsigned long will_make_reachable; ++ ++ struct open_buckets ob; ++ ++ /* lru list */ ++ struct list_head list; ++ ++ struct btree_write writes[2]; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ bool *expensive_debug_checks; ++#endif ++ ++ /* Key/pointer for this btree node */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++}; ++ ++struct btree_cache { ++ struct rhashtable table; ++ bool table_init_done; ++ /* ++ * We never free a struct btree, except on shutdown - we just put it on ++ * the btree_cache_freed list and reuse it later. This simplifies the ++ * code, and it doesn't cost us much memory as the memory usage is ++ * dominated by buffers that hold the actual btree node data and those ++ * can be freed - and the number of struct btrees allocated is ++ * effectively bounded. 
++ * ++ * btree_cache_freeable effectively is a small cache - we use it because ++ * high order page allocations can be rather expensive, and it's quite ++ * common to delete and allocate btree nodes in quick succession. It ++ * should never grow past ~2-3 nodes in practice. ++ */ ++ struct mutex lock; ++ struct list_head live; ++ struct list_head freeable; ++ struct list_head freed; ++ ++ /* Number of elements in live + freeable lists */ ++ unsigned used; ++ unsigned reserve; ++ struct shrinker shrink; ++ ++ /* ++ * If we need to allocate memory for a new btree node and that ++ * allocation fails, we can cannibalize another node in the btree cache ++ * to satisfy the allocation - lock to guarantee only one thread does ++ * this at a time: ++ */ ++ struct task_struct *alloc_lock; ++ struct closure_waitlist alloc_wait; ++}; ++ ++struct btree_node_iter { ++ struct btree_node_iter_set { ++ u16 k, end; ++ } data[MAX_BSETS]; ++}; ++ ++enum btree_iter_type { ++ BTREE_ITER_KEYS, ++ BTREE_ITER_NODES, ++ BTREE_ITER_CACHED, ++}; ++ ++#define BTREE_ITER_TYPE ((1 << 2) - 1) ++ ++/* ++ * Iterate over all possible positions, synthesizing deleted keys for holes: ++ */ ++#define BTREE_ITER_SLOTS (1 << 2) ++/* ++ * Indicates that intent locks should be taken on leaf nodes, because we expect ++ * to be doing updates: ++ */ ++#define BTREE_ITER_INTENT (1 << 3) ++/* ++ * Causes the btree iterator code to prefetch additional btree nodes from disk: ++ */ ++#define BTREE_ITER_PREFETCH (1 << 4) ++/* ++ * Indicates that this iterator should not be reused until transaction commit, ++ * either because a pending update references it or because the update depends ++ * on that particular key being locked (e.g. by the str_hash code, for hash ++ * table consistency) ++ */ ++#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) ++/* ++ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for ++ * @pos or the first key strictly greater than @pos ++ */ ++#define BTREE_ITER_IS_EXTENTS (1 << 6) ++#define BTREE_ITER_ERROR (1 << 7) ++#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) ++#define BTREE_ITER_CACHED_NOFILL (1 << 9) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 10) ++ ++#define BTREE_ITER_USER_FLAGS \ ++ (BTREE_ITER_SLOTS \ ++ |BTREE_ITER_INTENT \ ++ |BTREE_ITER_PREFETCH \ ++ |BTREE_ITER_CACHED_NOFILL \ ++ |BTREE_ITER_CACHED_NOCREATE) ++ ++enum btree_iter_uptodate { ++ BTREE_ITER_UPTODATE = 0, ++ BTREE_ITER_NEED_PEEK = 1, ++ BTREE_ITER_NEED_RELOCK = 2, ++ BTREE_ITER_NEED_TRAVERSE = 3, ++}; ++ ++#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) ++#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) ++#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) ++#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) ++#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) ++#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) ++#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) ++ ++/* ++ * @pos - iterator's current position ++ * @level - current btree depth ++ * @locks_want - btree level below which we start taking intent locks ++ * @nodes_locked - bitmask indicating which nodes in @nodes are locked ++ * @nodes_intent_locked - bitmask indicating which locks are intent locks ++ */ ++struct btree_iter { ++ struct btree_trans *trans; ++ struct bpos pos; ++ struct bpos pos_after_commit; ++ ++ u16 flags; ++ u8 idx; ++ ++ enum btree_id btree_id:4; ++ enum btree_iter_uptodate uptodate:4; ++ unsigned level:4, ++ min_depth:4, ++ locks_want:4, ++ nodes_locked:4, ++ nodes_intent_locked:4; ++ ++ struct btree_iter_level 
{ ++ struct btree *b; ++ struct btree_node_iter iter; ++ u32 lock_seq; ++ } l[BTREE_MAX_DEPTH]; ++ ++ /* ++ * Current unpacked key - so that bch2_btree_iter_next()/ ++ * bch2_btree_iter_next_slot() can correctly advance pos. ++ */ ++ struct bkey k; ++ unsigned long ip_allocated; ++}; ++ ++static inline enum btree_iter_type ++btree_iter_type(const struct btree_iter *iter) ++{ ++ return iter->flags & BTREE_ITER_TYPE; ++} ++ ++static inline struct btree_iter_level *iter_l(struct btree_iter *iter) ++{ ++ return iter->l + iter->level; ++} ++ ++struct btree_key_cache { ++ struct mutex lock; ++ struct rhashtable table; ++ struct list_head freed; ++ struct list_head clean; ++}; ++ ++struct bkey_cached_key { ++ u32 btree_id; ++ struct bpos pos; ++} __attribute__((packed, aligned(4))); ++ ++#define BKEY_CACHED_DIRTY 0 ++ ++struct bkey_cached { ++ struct btree_bkey_cached_common c; ++ ++ unsigned long flags; ++ u8 u64s; ++ bool valid; ++ struct bkey_cached_key key; ++ ++ struct rhash_head hash; ++ struct list_head list; ++ ++ struct journal_preres res; ++ struct journal_entry_pin journal; ++ ++ struct bkey_i *k; ++}; ++ ++struct btree_insert_entry { ++ unsigned trigger_flags; ++ unsigned trans_triggers_run:1; ++ struct bkey_i *k; ++ struct btree_iter *iter; ++}; ++ ++#ifndef CONFIG_LOCKDEP ++#define BTREE_ITER_MAX 64 ++#else ++#define BTREE_ITER_MAX 32 ++#endif ++ ++struct btree_trans { ++ struct bch_fs *c; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct list_head list; ++ struct btree *locking; ++ unsigned locking_iter_idx; ++ struct bpos locking_pos; ++ u8 locking_btree_id; ++ u8 locking_level; ++ pid_t pid; ++#endif ++ unsigned long ip; ++ ++ u64 iters_linked; ++ u64 iters_live; ++ u64 iters_touched; ++ ++ u8 nr_iters; ++ u8 nr_updates; ++ u8 nr_updates2; ++ u8 size; ++ unsigned used_mempool:1; ++ unsigned error:1; ++ unsigned nounlock:1; ++ unsigned need_reset:1; ++ unsigned in_traverse_all:1; ++ ++ unsigned mem_top; ++ unsigned mem_bytes; ++ void *mem; ++ ++ struct btree_iter *iters; ++ struct btree_insert_entry *updates; ++ struct btree_insert_entry *updates2; ++ ++ /* update path: */ ++ struct jset_entry *extra_journal_entries; ++ unsigned extra_journal_entry_u64s; ++ struct journal_entry_pin *journal_pin; ++ ++ struct journal_res journal_res; ++ struct journal_preres journal_preres; ++ u64 *journal_seq; ++ struct disk_reservation *disk_res; ++ unsigned flags; ++ unsigned journal_u64s; ++ unsigned journal_preres_u64s; ++ struct replicas_delta_list *fs_usage_deltas; ++ ++ struct btree_iter iters_onstack[2]; ++ struct btree_insert_entry updates_onstack[2]; ++ struct btree_insert_entry updates2_onstack[2]; ++}; ++ ++#define BTREE_FLAG(flag) \ ++static inline bool btree_node_ ## flag(struct btree *b) \ ++{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void set_btree_node_ ## flag(struct btree *b) \ ++{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void clear_btree_node_ ## flag(struct btree *b) \ ++{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } ++ ++enum btree_flags { ++ BTREE_NODE_read_in_flight, ++ BTREE_NODE_read_error, ++ BTREE_NODE_dirty, ++ BTREE_NODE_need_write, ++ BTREE_NODE_noevict, ++ BTREE_NODE_write_idx, ++ BTREE_NODE_accessed, ++ BTREE_NODE_write_in_flight, ++ BTREE_NODE_just_written, ++ BTREE_NODE_dying, ++ BTREE_NODE_fake, ++ BTREE_NODE_old_extent_overwrite, ++ BTREE_NODE_need_rewrite, ++}; ++ ++BTREE_FLAG(read_in_flight); ++BTREE_FLAG(read_error); ++BTREE_FLAG(dirty); ++BTREE_FLAG(need_write); ++BTREE_FLAG(noevict); ++BTREE_FLAG(write_idx); 
++BTREE_FLAG(accessed); ++BTREE_FLAG(write_in_flight); ++BTREE_FLAG(just_written); ++BTREE_FLAG(dying); ++BTREE_FLAG(fake); ++BTREE_FLAG(old_extent_overwrite); ++BTREE_FLAG(need_rewrite); ++ ++static inline struct btree_write *btree_current_write(struct btree *b) ++{ ++ return b->writes + btree_node_write_idx(b); ++} ++ ++static inline struct btree_write *btree_prev_write(struct btree *b) ++{ ++ return b->writes + (btree_node_write_idx(b) ^ 1); ++} ++ ++static inline struct bset_tree *bset_tree_last(struct btree *b) ++{ ++ EBUG_ON(!b->nsets); ++ return b->set + b->nsets - 1; ++} ++ ++static inline void * ++__btree_node_offset_to_ptr(const struct btree *b, u16 offset) ++{ ++ return (void *) ((u64 *) b->data + 1 + offset); ++} ++ ++static inline u16 ++__btree_node_ptr_to_offset(const struct btree *b, const void *p) ++{ ++ u16 ret = (u64 *) p - 1 - (u64 *) b->data; ++ ++ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); ++ return ret; ++} ++ ++static inline struct bset *bset(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return __btree_node_offset_to_ptr(b, t->data_offset); ++} ++ ++static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) ++{ ++ t->end_offset = ++ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); ++} ++ ++static inline void set_btree_bset(struct btree *b, struct bset_tree *t, ++ const struct bset *i) ++{ ++ t->data_offset = __btree_node_ptr_to_offset(b, i); ++ set_btree_bset_end(b, t); ++} ++ ++static inline struct bset *btree_bset_first(struct btree *b) ++{ ++ return bset(b, b->set); ++} ++ ++static inline struct bset *btree_bset_last(struct btree *b) ++{ ++ return bset(b, bset_tree_last(b)); ++} ++ ++static inline u16 ++__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) ++{ ++ return __btree_node_ptr_to_offset(b, k); ++} ++ ++static inline struct bkey_packed * ++__btree_node_offset_to_key(const struct btree *b, u16 k) ++{ ++ return __btree_node_offset_to_ptr(b, k); ++} ++ ++static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) ++{ ++ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); ++} ++ ++#define btree_bkey_first(_b, _t) \ ++({ \ ++ EBUG_ON(bset(_b, _t)->start != \ ++ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ ++ \ ++ bset(_b, _t)->start; \ ++}) ++ ++#define btree_bkey_last(_b, _t) \ ++({ \ ++ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ ++ vstruct_last(bset(_b, _t))); \ ++ \ ++ __btree_node_offset_to_key(_b, (_t)->end_offset); \ ++}) ++ ++static inline unsigned bset_u64s(struct bset_tree *t) ++{ ++ return t->end_offset - t->data_offset - ++ sizeof(struct bset) / sizeof(u64); ++} ++ ++static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) ++{ ++ return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; ++} ++ ++static inline unsigned bset_byte_offset(struct btree *b, void *i) ++{ ++ return i - (void *) b->data; ++} ++ ++enum btree_node_type { ++#define x(kwd, val, name) BKEY_TYPE_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BKEY_TYPE_BTREE, ++}; ++ ++/* Type of a key in btree @id at level @level: */ ++static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) ++{ ++ return level ? 
BKEY_TYPE_BTREE : (enum btree_node_type) id; ++} ++ ++/* Type of keys @b contains: */ ++static inline enum btree_node_type btree_node_type(struct btree *b) ++{ ++ return __btree_node_type(b->c.level, b->c.btree_id); ++} ++ ++static inline bool btree_node_type_is_extents(enum btree_node_type type) ++{ ++ switch (type) { ++ case BKEY_TYPE_EXTENTS: ++ case BKEY_TYPE_REFLINK: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool btree_node_is_extents(struct btree *b) ++{ ++ return btree_node_type_is_extents(btree_node_type(b)); ++} ++ ++static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) ++{ ++ return __btree_node_type(iter->level, iter->btree_id); ++} ++ ++static inline bool btree_iter_is_extents(struct btree_iter *iter) ++{ ++ return btree_node_type_is_extents(btree_iter_key_type(iter)); ++} ++ ++#define BTREE_NODE_TYPE_HAS_TRIGGERS \ ++ ((1U << BKEY_TYPE_EXTENTS)| \ ++ (1U << BKEY_TYPE_ALLOC)| \ ++ (1U << BKEY_TYPE_INODES)| \ ++ (1U << BKEY_TYPE_REFLINK)| \ ++ (1U << BKEY_TYPE_EC)| \ ++ (1U << BKEY_TYPE_BTREE)) ++ ++#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ++ ((1U << BKEY_TYPE_EXTENTS)| \ ++ (1U << BKEY_TYPE_INODES)| \ ++ (1U << BKEY_TYPE_REFLINK)) ++ ++enum btree_trigger_flags { ++ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ ++ __BTREE_TRIGGER_NOOVERWRITES, /* Don't run triggers on overwrites */ ++ ++ __BTREE_TRIGGER_INSERT, ++ __BTREE_TRIGGER_OVERWRITE, ++ __BTREE_TRIGGER_OVERWRITE_SPLIT, ++ ++ __BTREE_TRIGGER_GC, ++ __BTREE_TRIGGER_BUCKET_INVALIDATE, ++ __BTREE_TRIGGER_ALLOC_READ, ++ __BTREE_TRIGGER_NOATOMIC, ++}; ++ ++#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) ++#define BTREE_TRIGGER_NOOVERWRITES (1U << __BTREE_TRIGGER_NOOVERWRITES) ++ ++#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) ++#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) ++#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) ++ ++#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) ++#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) ++#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) ++#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) ++ ++static inline bool btree_node_type_needs_gc(enum btree_node_type type) ++{ ++ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++} ++ ++struct btree_root { ++ struct btree *b; ++ ++ /* On disk root - see async splits: */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ u8 level; ++ u8 alive; ++ s8 error; ++}; ++ ++/* ++ * Optional hook that will be called just prior to a btree node update, when ++ * we're holding the write lock and we know what key is about to be overwritten: ++ */ ++ ++enum btree_insert_ret { ++ BTREE_INSERT_OK, ++ /* leaf node needs to be split */ ++ BTREE_INSERT_BTREE_NODE_FULL, ++ BTREE_INSERT_ENOSPC, ++ BTREE_INSERT_NEED_MARK_REPLICAS, ++ BTREE_INSERT_NEED_JOURNAL_RES, ++}; ++ ++enum btree_gc_coalesce_fail_reason { ++ BTREE_GC_COALESCE_FAIL_RESERVE_GET, ++ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, ++ BTREE_GC_COALESCE_FAIL_FORMAT_FITS, ++}; ++ ++enum btree_node_sibling { ++ btree_prev_sib, ++ btree_next_sib, ++}; ++ ++typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, ++ struct btree *, ++ struct btree_node_iter *); ++ ++#endif /* _BCACHEFS_BTREE_TYPES_H */ +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +new file mode 100644 +index 000000000000..e0b1bde37484 +--- /dev/null ++++ b/fs/bcachefs/btree_update.h +@@ -0,0 +1,144 
@@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_H ++#define _BCACHEFS_BTREE_UPDATE_H ++ ++#include "btree_iter.h" ++#include "journal.h" ++ ++struct bch_fs; ++struct btree; ++ ++void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, ++ struct btree_node_iter *, struct bkey_i *); ++void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); ++ ++enum btree_insert_flags { ++ __BTREE_INSERT_NOUNLOCK, ++ __BTREE_INSERT_NOFAIL, ++ __BTREE_INSERT_NOCHECK_RW, ++ __BTREE_INSERT_LAZY_RW, ++ __BTREE_INSERT_USE_RESERVE, ++ __BTREE_INSERT_USE_ALLOC_RESERVE, ++ __BTREE_INSERT_JOURNAL_REPLAY, ++ __BTREE_INSERT_JOURNAL_RESERVED, ++ __BTREE_INSERT_JOURNAL_RECLAIM, ++ __BTREE_INSERT_NOWAIT, ++ __BTREE_INSERT_GC_LOCK_HELD, ++ __BCH_HASH_SET_MUST_CREATE, ++ __BCH_HASH_SET_MUST_REPLACE, ++}; ++ ++/* ++ * Don't drop locks _after_ successfully updating btree: ++ */ ++#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) ++ ++/* Don't check for -ENOSPC: */ ++#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) ++ ++#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) ++#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) ++ ++/* for copygc, or when merging btree nodes */ ++#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) ++#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) ++ ++/* Insert is for journal replay - don't get journal reservations: */ ++#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) ++ ++/* Indicates that we have pre-reserved space in the journal: */ ++#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) ++ ++/* Insert is being called from journal reclaim path: */ ++#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) ++ ++/* Don't block on allocation failure (for new btree nodes: */ ++#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) ++#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) ++ ++#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) ++#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) ++ ++int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); ++ ++int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); ++int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, ++ struct disk_reservation *, u64 *, int flags); ++ ++int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64 *); ++int bch2_btree_delete_range(struct bch_fs *, enum btree_id, ++ struct bpos, struct bpos, u64 *); ++ ++int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, ++ __le64, unsigned); ++int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, ++ struct btree *, struct bkey_i *); ++ ++int bch2_trans_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_trigger_flags); ++int __bch2_trans_commit(struct btree_trans *); ++ ++/** ++ * bch2_trans_commit - insert keys at given iterator positions ++ * ++ * This is main entry point for btree updates. ++ * ++ * Return values: ++ * -EINTR: locking changed, this function should be called again. 
++ * -EROFS: filesystem read only ++ * -EIO: journal or btree node IO error ++ */ ++static inline int bch2_trans_commit(struct btree_trans *trans, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ unsigned flags) ++{ ++ trans->disk_res = disk_res; ++ trans->journal_seq = journal_seq; ++ trans->flags = flags; ++ ++ return __bch2_trans_commit(trans); ++} ++ ++#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++({ \ ++ int _ret; \ ++ \ ++ while (1) { \ ++ _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ ++ (_journal_seq), (_flags)); \ ++ if (_ret != -EINTR) \ ++ break; \ ++ bch2_trans_reset(_trans, 0); \ ++ } \ ++ \ ++ _ret; \ ++}) ++ ++#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret, _ret2; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ ++ _do); \ ++ _ret2 = bch2_trans_exit(&trans); \ ++ \ ++ _ret ?: _ret2; \ ++}) ++ ++#define trans_for_each_update(_trans, _i) \ ++ for ((_i) = (_trans)->updates; \ ++ (_i) < (_trans)->updates + (_trans)->nr_updates; \ ++ (_i)++) ++ ++#define trans_for_each_update2(_trans, _i) \ ++ for ((_i) = (_trans)->updates2; \ ++ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ ++ (_i)++) ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_H */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +new file mode 100644 +index 000000000000..b41916f93c9b +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.c +@@ -0,0 +1,2076 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "extents.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++/* Debug code: */ ++ ++/* ++ * Verify that child nodes correctly span parent node's range: ++ */ ++static void btree_node_interior_verify(struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos next_node = b->data->min_key; ++ struct btree_node_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_btree_ptr_v2 bp; ++ struct bkey unpacked; ++ ++ BUG_ON(!b->c.level); ++ ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ ++ while (1) { ++ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); ++ if (k.k->type != KEY_TYPE_btree_ptr_v2) ++ break; ++ bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ BUG_ON(bkey_cmp(next_node, bp.v->min_key)); ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (bch2_btree_node_iter_end(&iter)) { ++ BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); ++ break; ++ } ++ ++ next_node = bkey_successor(k.k->p); ++ } ++#endif ++} ++ ++/* Calculate ideal packed bkey format for new btree nodes: */ ++ ++void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) ++{ ++ struct bkey_packed *k; ++ struct bset_tree *t; ++ struct bkey uk; ++ ++ bch2_bkey_format_add_pos(s, b->data->min_key); ++ ++ for_each_bset(b, t) ++ bset_tree_for_each_key(b, t, k) ++ if (!bkey_whiteout(k)) { ++ uk = bkey_unpack_key(b, k); ++ bch2_bkey_format_add_key(s, &uk); ++ } ++} ++ ++static struct bkey_format bch2_btree_calc_format(struct btree *b) ++{ ++ struct bkey_format_state s; ++ ++ bch2_bkey_format_init(&s); ++ 
__bch2_btree_calc_format(&s, b); ++ ++ return bch2_bkey_format_done(&s); ++} ++ ++static size_t btree_node_u64s_with_format(struct btree *b, ++ struct bkey_format *new_f) ++{ ++ struct bkey_format *old_f = &b->format; ++ ++ /* stupid integer promotion rules */ ++ ssize_t delta = ++ (((int) new_f->key_u64s - old_f->key_u64s) * ++ (int) b->nr.packed_keys) + ++ (((int) new_f->key_u64s - BKEY_U64s) * ++ (int) b->nr.unpacked_keys); ++ ++ BUG_ON(delta + b->nr.live_u64s < 0); ++ ++ return b->nr.live_u64s + delta; ++} ++ ++/** ++ * btree_node_format_fits - check if we could rewrite node with a new format ++ * ++ * This assumes all keys can pack with the new format -- it just checks if ++ * the re-packed keys would fit inside the node itself. ++ */ ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, ++ struct bkey_format *new_f) ++{ ++ size_t u64s = btree_node_u64s_with_format(b, new_f); ++ ++ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); ++} ++ ++/* Btree node freeing/allocation: */ ++ ++static void __btree_node_free(struct bch_fs *c, struct btree *b) ++{ ++ trace_btree_node_free(c, b); ++ ++ BUG_ON(btree_node_dirty(b)); ++ BUG_ON(btree_node_need_write(b)); ++ BUG_ON(b == btree_node_root(c, b)); ++ BUG_ON(b->ob.nr); ++ BUG_ON(!list_empty(&b->write_blocked)); ++ BUG_ON(b->will_make_reachable); ++ ++ clear_btree_node_noevict(b); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ six_lock_wakeup_all(&b->c.lock); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++} ++ ++void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) ++{ ++ struct open_buckets ob = b->ob; ++ ++ b->ob.nr = 0; ++ ++ clear_btree_node_dirty(b); ++ ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ ++ bch2_open_buckets_put(c, &ob); ++} ++ ++void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter(iter->trans, linked) ++ BUG_ON(linked->l[b->c.level].b == b); ++ ++ six_lock_write(&b->c.lock, NULL, NULL); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, ++ struct disk_reservation *res, ++ struct closure *cl, ++ unsigned flags) ++{ ++ struct write_point *wp; ++ struct btree *b; ++ BKEY_PADDED(k) tmp; ++ struct open_buckets ob = { .nr = 0 }; ++ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; ++ unsigned nr_reserve; ++ enum alloc_reserve alloc_reserve; ++ ++ if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { ++ nr_reserve = 0; ++ alloc_reserve = RESERVE_ALLOC; ++ } else if (flags & BTREE_INSERT_USE_RESERVE) { ++ nr_reserve = BTREE_NODE_RESERVE / 2; ++ alloc_reserve = RESERVE_BTREE; ++ } else { ++ nr_reserve = BTREE_NODE_RESERVE; ++ alloc_reserve = RESERVE_NONE; ++ } ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ if (c->btree_reserve_cache_nr > nr_reserve) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ ob = a->ob; ++ bkey_copy(&tmp.k, &a->k); ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ goto mem_alloc; ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++retry: ++ wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, ++ writepoint_ptr(&c->btree_write_point), ++ &devs_have, ++ res->nr_replicas, ++ c->opts.metadata_replicas_required, ++ 
alloc_reserve, 0, cl); ++ if (IS_ERR(wp)) ++ return ERR_CAST(wp); ++ ++ if (wp->sectors_free < c->opts.btree_node_size) { ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ob->sectors_free < c->opts.btree_node_size) ++ ob->sectors_free = 0; ++ ++ bch2_alloc_sectors_done(c, wp); ++ goto retry; ++ } ++ ++ if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) ++ bkey_btree_ptr_v2_init(&tmp.k); ++ else ++ bkey_btree_ptr_init(&tmp.k); ++ ++ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); ++ ++ bch2_open_bucket_get(c, wp, &ob); ++ bch2_alloc_sectors_done(c, wp); ++mem_alloc: ++ b = bch2_btree_node_mem_alloc(c); ++ ++ /* we hold cannibalize_lock: */ ++ BUG_ON(IS_ERR(b)); ++ BUG_ON(b->ob.nr); ++ ++ bkey_copy(&b->key, &tmp.k); ++ b->ob = ob; ++ ++ return b; ++} ++ ++static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ int ret; ++ ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ BUG_ON(!as->nr_prealloc_nodes); ++ ++ b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ ++ set_btree_node_accessed(b); ++ set_btree_node_dirty(b); ++ set_btree_node_need_write(b); ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ b->c.level = level; ++ b->c.btree_id = as->btree_id; ++ ++ memset(&b->nr, 0, sizeof(b->nr)); ++ b->data->magic = cpu_to_le64(bset_magic(c)); ++ b->data->flags = 0; ++ SET_BTREE_NODE_ID(b->data, as->btree_id); ++ SET_BTREE_NODE_LEVEL(b->data, level); ++ b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); ++ ++ bp->v.mem_ptr = 0; ++ bp->v.seq = b->data->keys.seq; ++ bp->v.sectors_written = 0; ++ bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); ++ } ++ ++ if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) ++ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); ++ ++ if (btree_node_is_extents(b) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { ++ set_btree_node_old_extent_overwrite(b); ++ set_btree_node_need_rewrite(b); ++ } ++ ++ bch2_btree_build_aux_trees(b); ++ ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); ++ BUG_ON(ret); ++ ++ trace_btree_node_alloc(c, b); ++ return b; ++} ++ ++static void btree_set_min(struct btree *b, struct bpos pos) ++{ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; ++ b->data->min_key = pos; ++} ++ ++static void btree_set_max(struct btree *b, struct bpos pos) ++{ ++ b->key.k.p = pos; ++ b->data->max_key = pos; ++} ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b, ++ struct bkey_format format) ++{ ++ struct btree *n; ++ ++ n = bch2_btree_node_alloc(as, b->c.level); ++ ++ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); ++ ++ btree_set_min(n, b->data->min_key); ++ btree_set_max(n, b->data->max_key); ++ ++ n->data->format = format; ++ btree_node_set_format(n, format); ++ ++ bch2_btree_sort_into(as->c, n, b); ++ ++ btree_node_reset_sib_u64s(n); ++ ++ n->key.k.p = b->key.k.p; ++ return n; ++} ++ ++static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bkey_format new_f = bch2_btree_calc_format(b); ++ ++ /* ++ * The keys might expand with the new format - if they wouldn't fit in ++ * the btree node anymore, use the old format for now: ++ */ ++ if 
(!bch2_btree_node_format_fits(as->c, b, &new_f)) ++ new_f = b->format; ++ ++ return __bch2_btree_node_alloc_replacement(as, b, new_f); ++} ++ ++static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) ++{ ++ struct btree *b = bch2_btree_node_alloc(as, level); ++ ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, POS_MAX); ++ b->data->format = bch2_btree_calc_format(b); ++ ++ btree_node_set_format(b, b->data->format); ++ bch2_btree_build_aux_trees(b); ++ ++ bch2_btree_update_add_new_node(as, b); ++ six_unlock_write(&b->c.lock); ++ ++ return b; ++} ++ ++static void bch2_btree_reserve_put(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ ++ while (as->nr_prealloc_nodes) { ++ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ ++ six_unlock_write(&b->c.lock); ++ ++ if (c->btree_reserve_cache_nr < ++ ARRAY_SIZE(c->btree_reserve_cache)) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; ++ ++ a->ob = b->ob; ++ b->ob.nr = 0; ++ bkey_copy(&a->k, &b->key); ++ } else { ++ bch2_open_buckets_put(c, &b->ob); ++ } ++ ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ mutex_unlock(&c->btree_reserve_cache_lock); ++} ++ ++static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, ++ unsigned flags, struct closure *cl) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ int ret; ++ ++ BUG_ON(nr_nodes > BTREE_RESERVE_MAX); ++ ++ /* ++ * Protects reaping from the btree node cache and using the btree node ++ * open bucket reserve: ++ */ ++ ret = bch2_btree_cache_cannibalize_lock(c, cl); ++ if (ret) ++ return ret; ++ ++ while (as->nr_prealloc_nodes < nr_nodes) { ++ b = __bch2_btree_node_alloc(c, &as->disk_res, ++ flags & BTREE_INSERT_NOWAIT ++ ? 
NULL : cl, flags); ++ if (IS_ERR(b)) { ++ ret = PTR_ERR(b); ++ goto err_free; ++ } ++ ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); ++ if (ret) ++ goto err_free; ++ ++ as->prealloc_nodes[as->nr_prealloc_nodes++] = b; ++ } ++ ++ bch2_btree_cache_cannibalize_unlock(c); ++ return 0; ++err_free: ++ bch2_btree_cache_cannibalize_unlock(c); ++ trace_btree_reserve_get_fail(c, nr_nodes, cl); ++ return ret; ++} ++ ++/* Asynchronous interior node update machinery */ ++ ++static void bch2_btree_update_free(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ bch2_journal_pin_flush(&c->journal, &as->journal); ++ bch2_disk_reservation_put(c, &as->disk_res); ++ bch2_btree_reserve_put(as); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_del(&as->unwritten_list); ++ list_del(&as->list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ closure_debug_destroy(&as->cl); ++ mempool_free(as, &c->btree_interior_update_pool); ++ ++ closure_wake_up(&c->btree_interior_update_wait); ++} ++ ++static void btree_update_will_delete_key(struct btree_update *as, ++ struct bkey_i *k) ++{ ++ BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > ++ ARRAY_SIZE(as->_old_keys)); ++ bch2_keylist_add(&as->old_keys, k); ++} ++ ++static void btree_update_will_add_key(struct btree_update *as, ++ struct bkey_i *k) ++{ ++ BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > ++ ARRAY_SIZE(as->_new_keys)); ++ bch2_keylist_add(&as->new_keys, k); ++} ++ ++/* ++ * The transactional part of an interior btree node update, where we journal the ++ * update we did to the interior node and update alloc info: ++ */ ++static int btree_update_nodes_written_trans(struct btree_trans *trans, ++ struct btree_update *as) ++{ ++ struct bkey_i *k; ++ int ret; ++ ++ trans->extra_journal_entries = (void *) &as->journal_entries[0]; ++ trans->extra_journal_entry_u64s = as->journal_u64s; ++ trans->journal_pin = &as->journal; ++ ++ for_each_keylist_key(&as->new_keys, k) { ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), ++ 0, 0, BTREE_TRIGGER_INSERT); ++ if (ret) ++ return ret; ++ } ++ ++ for_each_keylist_key(&as->old_keys, k) { ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), ++ 0, 0, BTREE_TRIGGER_OVERWRITE); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void btree_update_nodes_written(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b = as->b; ++ u64 journal_seq = 0; ++ unsigned i; ++ int ret; ++ ++ /* ++ * We did an update to a parent node where the pointers we added pointed ++ * to child nodes that weren't written yet: now, the child nodes have ++ * been written so we can write out the update to the interior node. ++ */ ++ ++ /* ++ * We can't call into journal reclaim here: we'd block on the journal ++ * reclaim lock, but we may need to release the open buckets we have ++ * pinned in order for other btree updates to make forward progress, and ++ * journal reclaim does btree updates when flushing bkey_cached entries, ++ * which may require allocations as well. 
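++	 *
++	 * (Hence BTREE_INSERT_JOURNAL_RECLAIM and BTREE_INSERT_JOURNAL_RESERVED
++	 * in the flags passed to the commit below.)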
++ */ ++ ret = bch2_trans_do(c, &as->disk_res, &journal_seq, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_JOURNAL_RECLAIM| ++ BTREE_INSERT_JOURNAL_RESERVED, ++ btree_update_nodes_written_trans(&trans, as)); ++ BUG_ON(ret && !bch2_journal_error(&c->journal)); ++ ++ if (b) { ++ /* ++ * @b is the node we did the final insert into: ++ * ++ * On failure to get a journal reservation, we still have to ++ * unblock the write and allow most of the write path to happen ++ * so that shutdown works, but the i->journal_seq mechanism ++ * won't work to prevent the btree write from being visible (we ++ * didn't get a journal sequence number) - instead ++ * __bch2_btree_node_write() doesn't do the actual write if ++ * we're in journal error state: ++ */ ++ ++ btree_node_lock_type(c, b, SIX_LOCK_intent); ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ list_del(&as->write_blocked_list); ++ ++ if (!ret && as->b == b) { ++ struct bset *i = btree_bset_last(b); ++ ++ BUG_ON(!b->c.level); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ i->journal_seq = cpu_to_le64( ++ max(journal_seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, journal_seq); ++ } ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ six_unlock_write(&b->c.lock); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_intent); ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; ++ ++ BUG_ON(b->will_make_reachable != (unsigned long) as); ++ b->will_make_reachable = 0; ++ } ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; ++ ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ btree_node_write_if_need(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->c.lock); ++ } ++ ++ for (i = 0; i < as->nr_open_buckets; i++) ++ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); ++ ++ bch2_btree_update_free(as); ++} ++ ++static void btree_interior_update_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, btree_interior_update_work); ++ struct btree_update *as; ++ ++ while (1) { ++ mutex_lock(&c->btree_interior_update_lock); ++ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, ++ struct btree_update, unwritten_list); ++ if (as && !as->nodes_written) ++ as = NULL; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (!as) ++ break; ++ ++ btree_update_nodes_written(as); ++ } ++} ++ ++static void btree_update_set_nodes_written(struct closure *cl) ++{ ++ struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ as->nodes_written = true; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); ++} ++ ++/* ++ * We're updating @b with pointers to nodes that haven't finished writing yet: ++ * block @b from being written until @as completes ++ */ ++static void btree_update_updated_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ ++ 
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_NODE; ++ as->b = b; ++ list_add(&as->write_blocked_list, &b->write_blocked); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void btree_update_reparent(struct btree_update *as, ++ struct btree_update *child) ++{ ++ struct bch_fs *c = as->c; ++ ++ lockdep_assert_held(&c->btree_interior_update_lock); ++ ++ child->b = NULL; ++ child->mode = BTREE_INTERIOR_UPDATING_AS; ++ ++ /* ++ * When we write a new btree root, we have to drop our journal pin ++ * _before_ the new nodes are technically reachable; see ++ * btree_update_nodes_written(). ++ * ++ * This goes for journal pins that are recursively blocked on us - so, ++ * just transfer the journal pin to the new interior update so ++ * btree_update_nodes_written() can drop it. ++ */ ++ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &child->journal); ++} ++ ++static void btree_update_updated_root(struct btree_update *as, struct btree *b) ++{ ++ struct bkey_i *insert = &b->key; ++ struct bch_fs *c = as->c; ++ ++ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ insert, insert->k.u64s); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_ROOT; ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++/* ++ * bch2_btree_update_add_new_node: ++ * ++ * This causes @as to wait on @b to be written, before it gets to ++ * bch2_btree_update_nodes_written ++ * ++ * Additionally, it sets b->will_make_reachable to prevent any additional writes ++ * to @b from happening besides the first until @b is reachable on disk ++ * ++ * And it adds @b to the list of @as's new nodes, so that we can update sector ++ * counts in bch2_btree_update_nodes_written: ++ */ ++void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ closure_get(&as->cl); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); ++ BUG_ON(b->will_make_reachable); ++ ++ as->new_nodes[as->nr_new_nodes++] = b; ++ b->will_make_reachable = 1UL|(unsigned long) as; ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ btree_update_will_add_key(as, &b->key); ++} ++ ++/* ++ * returns true if @b was a new node ++ */ ++static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_update *as; ++ unsigned long v; ++ unsigned i; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ /* ++ * When b->will_make_reachable != 0, it owns a ref on as->cl that's ++ * dropped when it gets written by bch2_btree_complete_write - the ++ * xchg() is for synchronization with bch2_btree_complete_write: ++ */ ++ v = xchg(&b->will_make_reachable, 0); ++ as = (struct btree_update *) (v & ~1UL); ++ ++ if (!as) { ++ mutex_unlock(&c->btree_interior_update_lock); ++ return; ++ } ++ ++ for (i = 0; i < as->nr_new_nodes; i++) ++ if (as->new_nodes[i] == b) ++ goto found; ++ ++ BUG(); ++found: ++ array_remove_item(as->new_nodes, as->nr_new_nodes, i); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (v & 1) ++ 
closure_put(&as->cl); ++} ++ ++void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) ++{ ++ while (b->ob.nr) ++ as->open_buckets[as->nr_open_buckets++] = ++ b->ob.v[--b->ob.nr]; ++} ++ ++/* ++ * @b is being split/rewritten: it may have pointers to not-yet-written btree ++ * nodes and thus outstanding btree_updates - redirect @b's ++ * btree_updates to point to this btree_update: ++ */ ++void bch2_btree_interior_update_will_free_node(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct btree_update *p, *n; ++ struct btree_write *w; ++ ++ set_btree_node_dying(b); ++ ++ if (btree_node_fake(b)) ++ return; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ /* ++ * Does this node have any btree_update operations preventing ++ * it from being written? ++ * ++ * If so, redirect them to point to this btree_update: we can ++ * write out our new nodes, but we won't make them visible until those ++ * operations complete ++ */ ++ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { ++ list_del_init(&p->write_blocked_list); ++ btree_update_reparent(as, p); ++ ++ /* ++ * for flush_held_btree_writes() waiting on updates to flush or ++ * nodes to be writeable: ++ */ ++ closure_wake_up(&c->btree_interior_update_wait); ++ } ++ ++ clear_btree_node_dirty(b); ++ clear_btree_node_need_write(b); ++ ++ /* ++ * Does this node have unwritten data that has a pin on the journal? ++ * ++ * If so, transfer that pin to the btree_update operation - ++ * note that if we're freeing multiple nodes, we only need to keep the ++ * oldest pin of any of the nodes we're freeing. We'll release the pin ++ * when the new nodes are persistent and reachable on disk: ++ */ ++ w = btree_current_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ w = btree_prev_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * Is this a node that isn't reachable on disk yet? ++ * ++ * Nodes that aren't reachable yet have writes blocked until they're ++ * reachable - now that we've cancelled any pending writes and moved ++ * things waiting on that write to wait on this update, we can drop this ++ * node from the list of nodes that the other update is making ++ * reachable, prior to freeing it: ++ */ ++ btree_update_drop_new_node(c, b); ++ ++ btree_update_will_delete_key(as, &b->key); ++} ++ ++void bch2_btree_update_done(struct btree_update *as) ++{ ++ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); ++ ++ bch2_btree_reserve_put(as); ++ ++ continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); ++} ++ ++struct btree_update * ++bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, ++ unsigned nr_nodes, unsigned flags, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_update *as; ++ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ++ ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) ++ ? 
JOURNAL_RES_GET_RECLAIM : 0; ++ int ret = 0; ++ ++ /* ++ * This check isn't necessary for correctness - it's just to potentially ++ * prevent us from doing a lot of work that'll end up being wasted: ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); ++ memset(as, 0, sizeof(*as)); ++ closure_init(&as->cl, NULL); ++ as->c = c; ++ as->mode = BTREE_INTERIOR_NO_UPDATE; ++ as->btree_id = id; ++ INIT_LIST_HEAD(&as->list); ++ INIT_LIST_HEAD(&as->unwritten_list); ++ INIT_LIST_HEAD(&as->write_blocked_list); ++ bch2_keylist_init(&as->old_keys, as->_old_keys); ++ bch2_keylist_init(&as->new_keys, as->_new_keys); ++ bch2_keylist_init(&as->parent_keys, as->inline_keys); ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags|JOURNAL_RES_GET_NONBLOCK); ++ if (ret == -EAGAIN) { ++ if (flags & BTREE_INSERT_NOUNLOCK) ++ return ERR_PTR(-EINTR); ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ if (!bch2_trans_relock(trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ ret = bch2_disk_reservation_get(c, &as->disk_res, ++ nr_nodes * c->opts.btree_node_size, ++ c->opts.metadata_replicas, ++ disk_res_flags); ++ if (ret) ++ goto err; ++ ++ ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->list, &c->btree_interior_update_list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return as; ++err: ++ bch2_btree_update_free(as); ++ return ERR_PTR(ret); ++} ++ ++/* Btree root updates: */ ++ ++static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) ++{ ++ /* Root nodes cannot be reaped */ ++ mutex_lock(&c->btree_cache.lock); ++ list_del_init(&b->list); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ mutex_lock(&c->btree_root_lock); ++ BUG_ON(btree_node_root(c, b) && ++ (b->c.level < btree_node_root(c, b)->c.level || ++ !btree_node_dying(btree_node_root(c, b)))); ++ ++ btree_node_root(c, b) = b; ++ mutex_unlock(&c->btree_root_lock); ++ ++ bch2_recalc_btree_reserve(c); ++} ++ ++/** ++ * bch_btree_set_root - update the root in memory and on disk ++ * ++ * To ensure forward progress, the current task must not be holding any ++ * btree node write locks. However, you must hold an intent lock on the ++ * old root. ++ * ++ * Note: This allocates a journal entry but doesn't add any keys to ++ * it. All the btree roots are part of every journal write, so there ++ * is nothing new to be done. This just guarantees that there is a ++ * journal write. ++ */ ++static void bch2_btree_set_root(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *old; ++ ++ trace_btree_set_root(c, b); ++ BUG_ON(!b->written && ++ !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); ++ ++ old = btree_node_root(c, b); ++ ++ /* ++ * Ensure no one is using the old root while we switch to the ++ * new root: ++ */ ++ bch2_btree_node_lock_write(old, iter); ++ ++ bch2_btree_set_root_inmem(c, b); ++ ++ btree_update_updated_root(as, b); ++ ++ /* ++ * Unlock old root after new root is visible: ++ * ++ * The new root isn't persistent, but that's ok: we still have ++ * an intent lock on the new root, and any updates that would ++ * depend on the new root would have to update the new root. 
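++	 * (btree_update_updated_root() above has already added the new root to
++	 * this update's journal entries, so it will be journalled when @as
++	 * completes).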
++ */ ++ bch2_btree_node_unlock_write(old, iter); ++} ++ ++/* Interior node updates: */ ++ ++static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct btree_node_iter *node_iter) ++{ ++ struct bkey_packed *k; ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_keys, ++ b->c.btree_id, b->c.level, ++ insert, insert->k.u64s); ++ ++ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && ++ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) ++ bch2_btree_node_iter_advance(node_iter, b); ++ ++ bch2_btree_bset_insert_key(iter, b, node_iter, insert); ++ set_btree_node_dirty(b); ++ set_btree_node_need_write(b); ++} ++ ++/* ++ * Move keys from n1 (original replacement node, now lower node) to n2 (higher ++ * node) ++ */ ++static struct btree *__btree_split_node(struct btree_update *as, ++ struct btree *n1, ++ struct btree_iter *iter) ++{ ++ size_t nr_packed = 0, nr_unpacked = 0; ++ struct btree *n2; ++ struct bset *set1, *set2; ++ struct bkey_packed *k, *prev = NULL; ++ ++ n2 = bch2_btree_node_alloc(as, n1->c.level); ++ bch2_btree_update_add_new_node(as, n2); ++ ++ n2->data->max_key = n1->data->max_key; ++ n2->data->format = n1->format; ++ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); ++ n2->key.k.p = n1->key.k.p; ++ ++ btree_node_set_format(n2, n2->data->format); ++ ++ set1 = btree_bset_first(n1); ++ set2 = btree_bset_first(n2); ++ ++ /* ++ * Has to be a linear search because we don't have an auxiliary ++ * search tree yet ++ */ ++ k = set1->start; ++ while (1) { ++ struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); ++ ++ if (n == vstruct_last(set1)) ++ break; ++ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) ++ break; ++ ++ if (bkey_packed(k)) ++ nr_packed++; ++ else ++ nr_unpacked++; ++ ++ prev = k; ++ k = n; ++ } ++ ++ BUG_ON(!prev); ++ ++ btree_set_max(n1, bkey_unpack_pos(n1, prev)); ++ btree_set_min(n2, bkey_successor(n1->key.k.p)); ++ ++ set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); ++ set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); ++ ++ set_btree_bset_end(n1, n1->set); ++ set_btree_bset_end(n2, n2->set); ++ ++ n2->nr.live_u64s = le16_to_cpu(set2->u64s); ++ n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); ++ n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; ++ n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; ++ ++ n1->nr.live_u64s = le16_to_cpu(set1->u64s); ++ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); ++ n1->nr.packed_keys = nr_packed; ++ n1->nr.unpacked_keys = nr_unpacked; ++ ++ BUG_ON(!set1->u64s); ++ BUG_ON(!set2->u64s); ++ ++ memcpy_u64s(set2->start, ++ vstruct_end(set1), ++ le16_to_cpu(set2->u64s)); ++ ++ btree_node_reset_sib_u64s(n1); ++ btree_node_reset_sib_u64s(n2); ++ ++ bch2_verify_btree_nr_keys(n1); ++ bch2_verify_btree_nr_keys(n2); ++ ++ if (n1->c.level) { ++ btree_node_interior_verify(n1); ++ btree_node_interior_verify(n2); ++ } ++ ++ return n2; ++} ++ ++/* ++ * For updates to interior nodes, we've got to do the insert before we split ++ * because the stuff we're inserting has to be inserted atomically. Post split, ++ * the keys might have to go in different nodes and the split would no longer be ++ * atomic. 
++ * ++ * Worse, if the insert is from btree node coalescing, if we do the insert after ++ * we do the split (and pick the pivot) - the pivot we pick might be between ++ * nodes that were coalesced, and thus in the middle of a child node post ++ * coalescing: ++ */ ++static void btree_split_insert_keys(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, ++ struct keylist *keys) ++{ ++ struct btree_node_iter node_iter; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct bkey_packed *src, *dst, *n; ++ struct bset *i; ++ ++ BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); ++ ++ bch2_btree_node_iter_init(&node_iter, b, &k->k.p); ++ ++ while (!bch2_keylist_empty(keys)) { ++ k = bch2_keylist_front(keys); ++ ++ bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); ++ bch2_keylist_pop_front(keys); ++ } ++ ++ /* ++ * We can't tolerate whiteouts here - with whiteouts there can be ++ * duplicate keys, and it would be rather bad if we picked a duplicate ++ * for the pivot: ++ */ ++ i = btree_bset_first(b); ++ src = dst = i->start; ++ while (src != vstruct_last(i)) { ++ n = bkey_next_skip_noops(src, vstruct_last(i)); ++ if (!bkey_deleted(src)) { ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ src = n; ++ } ++ ++ i->u64s = cpu_to_le16((u64 *) dst - i->_data); ++ set_btree_bset_end(b, b->set); ++ ++ BUG_ON(b->nsets != 1 || ++ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); ++ ++ btree_node_interior_verify(b); ++} ++ ++static void btree_split(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys, ++ unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *parent = btree_node_parent(iter, b); ++ struct btree *n1, *n2 = NULL, *n3 = NULL; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(!parent && (b != btree_node_root(c, b))); ++ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n1 = bch2_btree_node_alloc_replacement(as, b); ++ bch2_btree_update_add_new_node(as, n1); ++ ++ if (keys) ++ btree_split_insert_keys(as, n1, iter, keys); ++ ++ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { ++ trace_btree_split(c, b); ++ ++ n2 = __btree_split_node(as, n1, iter); ++ ++ bch2_btree_build_aux_trees(n2); ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n2->c.lock); ++ six_unlock_write(&n1->c.lock); ++ ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent); ++ ++ /* ++ * Note that on recursive parent_keys == keys, so we ++ * can't start adding new keys to parent_keys before emptying it ++ * out (which we did with btree_split_insert_keys() above) ++ */ ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ bch2_keylist_add(&as->parent_keys, &n2->key); ++ ++ if (!parent) { ++ /* Depth increases, make a new root */ ++ n3 = __btree_root_alloc(as, b->c.level + 1); ++ ++ n3->sib_u64s[0] = U16_MAX; ++ n3->sib_u64s[1] = U16_MAX; ++ ++ btree_split_insert_keys(as, n3, iter, &as->parent_keys); ++ ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent); ++ } ++ } else { ++ trace_btree_compact(c, b); ++ ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n1->c.lock); ++ ++ if (parent) ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ } ++ ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent); ++ ++ /* New nodes all written, now make them visible: */ ++ ++ if (parent) { ++ /* Split a non root node */ ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ } else if (n3) { ++ bch2_btree_set_root(as, n3, iter); ++ } else { ++ 
/* Root filled up but didn't need to be split */ ++ bch2_btree_set_root(as, n1, iter); ++ } ++ ++ bch2_btree_update_get_open_buckets(as, n1); ++ if (n2) ++ bch2_btree_update_get_open_buckets(as, n2); ++ if (n3) ++ bch2_btree_update_get_open_buckets(as, n3); ++ ++ /* Successful split, update the iterator to point to the new nodes: */ ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ if (n3) ++ bch2_btree_iter_node_replace(iter, n3); ++ if (n2) ++ bch2_btree_iter_node_replace(iter, n2); ++ bch2_btree_iter_node_replace(iter, n1); ++ ++ /* ++ * The old node must be freed (in memory) _before_ unlocking the new ++ * nodes - else another thread could re-acquire a read lock on the old ++ * node after another thread has locked and updated the new node, thus ++ * seeing stale data: ++ */ ++ bch2_btree_node_free_inmem(c, b, iter); ++ ++ if (n3) ++ six_unlock_intent(&n3->c.lock); ++ if (n2) ++ six_unlock_intent(&n2->c.lock); ++ six_unlock_intent(&n1->c.lock); ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], ++ start_time); ++} ++ ++static void ++bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys) ++{ ++ struct btree_iter *linked; ++ struct btree_node_iter node_iter; ++ struct bkey_i *insert = bch2_keylist_front(keys); ++ struct bkey_packed *k; ++ ++ /* Don't screw up @iter's position: */ ++ node_iter = iter->l[b->c.level].iter; ++ ++ /* ++ * btree_split(), btree_gc_coalesce() will insert keys before ++ * the iterator's current position - they know the keys go in ++ * the node the iterator points to: ++ */ ++ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && ++ (bkey_cmp_packed(b, k, &insert->k) >= 0)) ++ ; ++ ++ for_each_keylist_key(keys, insert) ++ bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); ++ ++ btree_update_updated_node(as, b); ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); ++ ++ bch2_btree_trans_verify_iters(iter->trans, b); ++} ++ ++/** ++ * bch_btree_insert_node - insert bkeys into a given btree node ++ * ++ * @iter: btree iterator ++ * @keys: list of keys to insert ++ * @hook: insert callback ++ * @persistent: if not null, @persistent will wait on journal write ++ * ++ * Inserts as many keys as it can into a given btree node, splitting it if full. ++ * If a split occurred, this function will return early. This can only happen ++ * for leaf nodes -- inserts into interior nodes have to be atomic. 
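++ *
++ * Note that @b itself must be an interior node here (the function BUG_ONs
++ * on leaf nodes) - leaf inserts go through the transaction commit path in
++ * btree_update_leaf.c instead.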
++ */ ++void bch2_btree_insert_node(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys, ++ unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); ++ BUG_ON(!b->c.level); ++ BUG_ON(!as || as->b); ++ bch2_verify_keylist_sorted(keys); ++ ++ if (as->must_rewrite) ++ goto split; ++ ++ bch2_btree_node_lock_for_insert(c, b, iter); ++ ++ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { ++ bch2_btree_node_unlock_write(b, iter); ++ goto split; ++ } ++ ++ bch2_btree_insert_keys_interior(as, b, iter, keys); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ bch2_btree_node_unlock_write(b, iter); ++ ++ btree_node_interior_verify(b); ++ ++ /* ++ * when called from the btree_split path the new nodes aren't added to ++ * the btree iterator yet, so the merge path's unlock/wait/relock dance ++ * won't work: ++ */ ++ bch2_foreground_maybe_merge(c, iter, b->c.level, ++ flags|BTREE_INSERT_NOUNLOCK); ++ return; ++split: ++ btree_split(as, b, iter, keys, flags); ++} ++ ++int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, ++ unsigned flags) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b = iter_l(iter)->b; ++ struct btree_update *as; ++ struct closure cl; ++ int ret = 0; ++ struct btree_insert_entry *i; ++ ++ /* ++ * We already have a disk reservation and open buckets pinned; this ++ * allocation must not block: ++ */ ++ trans_for_each_update(trans, i) ++ if (btree_node_type_needs_gc(i->iter->btree_id)) ++ flags |= BTREE_INSERT_USE_RESERVE; ++ ++ closure_init_stack(&cl); ++ ++ /* Hack, because gc and splitting nodes doesn't mix yet: */ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && ++ !down_read_trylock(&c->gc_lock)) { ++ if (flags & BTREE_INSERT_NOUNLOCK) { ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ return -EINTR; ++ } ++ ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(trans)) ++ ret = -EINTR; ++ } ++ ++ /* ++ * XXX: figure out how far we might need to split, ++ * instead of locking/reserving all the way to the root: ++ */ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ++ trace_trans_restart_iter_upgrade(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ ++ as = bch2_btree_update_start(trans, iter->btree_id, ++ btree_update_reserve_required(c, b), flags, ++ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ if (ret == -EAGAIN) { ++ BUG_ON(flags & BTREE_INSERT_NOUNLOCK); ++ bch2_trans_unlock(trans); ++ ret = -EINTR; ++ ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ } ++ goto out; ++ } ++ ++ btree_split(as, b, iter, NULL, flags); ++ bch2_btree_update_done(as); ++ ++ /* ++ * We haven't successfully inserted yet, so don't downgrade all the way ++ * back to read locks; ++ */ ++ __bch2_btree_iter_downgrade(iter, 1); ++out: ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ return ret; ++} ++ ++void __bch2_foreground_maybe_merge(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, ++ unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_update *as; ++ struct bkey_format_state new_s; ++ struct bkey_format new_f; ++ struct bkey_i delete; ++ struct btree *b, *m, *n, *prev, *next, *parent; ++ struct closure cl; ++ size_t sib_u64s; ++ int ret = 0; ++ ++ BUG_ON(!btree_node_locked(iter, level)); ++ ++ closure_init_stack(&cl); ++retry: ++ BUG_ON(!btree_node_locked(iter, level)); ++ ++ b = iter->l[level].b; ++ ++ parent = btree_node_parent(iter, b); ++ if (!parent) ++ goto out; ++ ++ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) ++ goto out; ++ ++ /* XXX: can't be holding read locks */ ++ m = bch2_btree_node_get_sibling(c, iter, b, sib); ++ if (IS_ERR(m)) { ++ ret = PTR_ERR(m); ++ goto err; ++ } ++ ++ /* NULL means no sibling: */ ++ if (!m) { ++ b->sib_u64s[sib] = U16_MAX; ++ goto out; ++ } ++ ++ if (sib == btree_prev_sib) { ++ prev = m; ++ next = b; ++ } else { ++ prev = b; ++ next = m; ++ } ++ ++ bch2_bkey_format_init(&new_s); ++ __bch2_btree_calc_format(&new_s, b); ++ __bch2_btree_calc_format(&new_s, m); ++ new_f = bch2_bkey_format_done(&new_s); ++ ++ sib_u64s = btree_node_u64s_with_format(b, &new_f) + ++ btree_node_u64s_with_format(m, &new_f); ++ ++ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { ++ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ sib_u64s /= 2; ++ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ } ++ ++ sib_u64s = min(sib_u64s, btree_max_u64s(c)); ++ b->sib_u64s[sib] = sib_u64s; ++ ++ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { ++ six_unlock_intent(&m->c.lock); ++ goto out; ++ } ++ ++ /* We're changing btree topology, doesn't mix with gc: */ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && ++ !down_read_trylock(&c->gc_lock)) ++ goto err_cycle_gc_lock; ++ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ++ ret = -EINTR; ++ goto err_unlock; ++ } ++ ++ as = bch2_btree_update_start(trans, iter->btree_id, ++ btree_update_reserve_required(c, parent) + 1, ++ flags| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE, ++ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ goto err_unlock; ++ } ++ ++ trace_btree_merge(c, b); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ bch2_btree_interior_update_will_free_node(as, m); ++ ++ n = bch2_btree_node_alloc(as, b->c.level); ++ bch2_btree_update_add_new_node(as, n); ++ ++ btree_set_min(n, prev->data->min_key); ++ btree_set_max(n, next->data->max_key); ++ n->data->format = new_f; ++ ++ btree_node_set_format(n, new_f); ++ ++ bch2_btree_sort_into(c, n, prev); ++ bch2_btree_sort_into(c, n, next); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->c.lock); ++ ++ bkey_init(&delete.k); ++ delete.k.p = prev->key.k.p; ++ bch2_keylist_add(&as->parent_keys, &delete); ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ ++ bch2_btree_update_get_open_buckets(as, n); ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ bch2_btree_iter_node_drop(iter, m); ++ ++ bch2_btree_iter_node_replace(iter, n); ++ ++ bch2_btree_trans_verify_iters(trans, n); ++ ++ bch2_btree_node_free_inmem(c, b, iter); ++ bch2_btree_node_free_inmem(c, m, iter); ++ ++ six_unlock_intent(&n->c.lock); ++ ++ bch2_btree_update_done(as); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++out: ++ bch2_btree_trans_verify_locks(trans); ++ ++ /* ++ * Don't downgrade locks here: we're called after successful insert, ++ * and the caller will downgrade locks after a successful insert ++ * anyways (in case e.g. a split was required first) ++ * ++ * And we're also called when inserting into interior nodes in the ++ * split path, and downgrading to read locks in there is potentially ++ * confusing: ++ */ ++ closure_sync(&cl); ++ return; ++ ++err_cycle_gc_lock: ++ six_unlock_intent(&m->c.lock); ++ ++ if (flags & BTREE_INSERT_NOUNLOCK) ++ goto out; ++ ++ bch2_trans_unlock(trans); ++ ++ down_read(&c->gc_lock); ++ up_read(&c->gc_lock); ++ ret = -EINTR; ++ goto err; ++ ++err_unlock: ++ six_unlock_intent(&m->c.lock); ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++err: ++ BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); ++ ++ if ((ret == -EAGAIN || ret == -EINTR) && ++ !(flags & BTREE_INSERT_NOUNLOCK)) { ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto out; ++ ++ goto retry; ++ } ++ ++ goto out; ++} ++ ++static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *b, unsigned flags, ++ struct closure *cl) ++{ ++ struct btree *n, *parent = btree_node_parent(iter, b); ++ struct btree_update *as; ++ ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ (parent ++ ? 
btree_update_reserve_required(c, parent) ++ : 0) + 1, ++ flags, cl); ++ if (IS_ERR(as)) { ++ trace_btree_gc_rewrite_node_fail(c, b); ++ return PTR_ERR(as); ++ } ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n = bch2_btree_node_alloc_replacement(as, b); ++ bch2_btree_update_add_new_node(as, n); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->c.lock); ++ ++ trace_btree_gc_rewrite_node(c, b); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ ++ if (parent) { ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ } else { ++ bch2_btree_set_root(as, n, iter); ++ } ++ ++ bch2_btree_update_get_open_buckets(as, n); ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ bch2_btree_iter_node_replace(iter, n); ++ bch2_btree_node_free_inmem(c, b, iter); ++ six_unlock_intent(&n->c.lock); ++ ++ bch2_btree_update_done(as); ++ return 0; ++} ++ ++/** ++ * bch_btree_node_rewrite - Rewrite/move a btree node ++ * ++ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. ++ * btree_check_reserve() has to wait) ++ */ ++int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ++ __le64 seq, unsigned flags) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ flags |= BTREE_INSERT_NOFAIL; ++ ++ closure_init_stack(&cl); ++ ++ bch2_btree_iter_upgrade(iter, U8_MAX); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { ++ if (!down_read_trylock(&c->gc_lock)) { ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ } ++ } ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ break; ++ ++ b = bch2_btree_iter_peek_node(iter); ++ if (!b || b->data->keys.seq != seq) ++ break; ++ ++ ret = __btree_node_rewrite(c, iter, b, flags, &cl); ++ if (ret != -EAGAIN && ++ ret != -EINTR) ++ break; ++ ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ } ++ ++ bch2_btree_iter_downgrade(iter); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++ ++ closure_sync(&cl); ++ return ret; ++} ++ ++static void __bch2_btree_node_update_key(struct bch_fs *c, ++ struct btree_update *as, ++ struct btree_iter *iter, ++ struct btree *b, struct btree *new_hash, ++ struct bkey_i *new_key) ++{ ++ struct btree *parent; ++ int ret; ++ ++ btree_update_will_delete_key(as, &b->key); ++ btree_update_will_add_key(as, new_key); ++ ++ parent = btree_node_parent(iter, b); ++ if (parent) { ++ if (new_hash) { ++ bkey_copy(&new_hash->key, new_key); ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, ++ new_hash, b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ } ++ ++ bch2_keylist_add(&as->parent_keys, new_key); ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); ++ ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new_key); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ } else { ++ bkey_copy(&b->key, new_key); ++ } ++ } else { ++ BUG_ON(btree_node_root(c, b) != b); ++ ++ bch2_btree_node_lock_write(b, iter); ++ bkey_copy(&b->key, new_key); ++ ++ if (btree_ptr_hash_val(&b->key) != b->hash_val) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ 
mutex_unlock(&c->btree_cache.lock); ++ } ++ ++ btree_update_updated_root(as, b); ++ bch2_btree_node_unlock_write(b, iter); ++ } ++ ++ bch2_btree_update_done(as); ++} ++ ++int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_i *new_key) ++{ ++ struct btree *parent = btree_node_parent(iter, b); ++ struct btree_update *as = NULL; ++ struct btree *new_hash = NULL; ++ struct closure cl; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) ++ return -EINTR; ++ ++ if (!down_read_trylock(&c->gc_lock)) { ++ bch2_trans_unlock(iter->trans); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(iter->trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ /* ++ * check btree_ptr_hash_val() after @b is locked by ++ * btree_iter_traverse(): ++ */ ++ if (btree_ptr_hash_val(new_key) != b->hash_val) { ++ /* bch2_btree_reserve_get will unlock */ ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ if (ret) { ++ bch2_trans_unlock(iter->trans); ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(iter->trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ new_hash = bch2_btree_node_mem_alloc(c); ++ } ++ ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ parent ? btree_update_reserve_required(c, parent) : 0, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE, ++ &cl); ++ ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ if (ret == -EAGAIN) ++ ret = -EINTR; ++ ++ if (ret != -EINTR) ++ goto err; ++ ++ bch2_trans_unlock(iter->trans); ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(iter->trans)) ++ goto err; ++ } ++ ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); ++ if (ret) ++ goto err_free_update; ++ ++ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); ++ ++ bch2_btree_iter_downgrade(iter); ++err: ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&new_hash->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ six_unlock_write(&new_hash->c.lock); ++ six_unlock_intent(&new_hash->c.lock); ++ } ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ return ret; ++err_free_update: ++ bch2_btree_update_free(as); ++ goto err; ++} ++ ++/* Init code: */ ++ ++/* ++ * Only for filesystem bringup, when first reading the btree roots or allocating ++ * btree roots when initializing a new filesystem: ++ */ ++void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) ++{ ++ BUG_ON(btree_node_root(c, b)); ++ ++ bch2_btree_set_root_inmem(c, b); ++} ++ ++void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ set_btree_node_fake(b); ++ set_btree_node_need_rewrite(b); ++ b->c.level = 0; ++ b->c.btree_id = id; ++ ++ bkey_btree_ptr_init(&b->key); ++ b->key.k.p = POS_MAX; ++ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ bch2_btree_build_aux_trees(b); ++ ++ b->data->flags = 0; ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, POS_MAX); ++ b->data->format = bch2_btree_calc_format(b); ++ btree_node_set_format(b, b->data->format); ++ ++ ret = 
bch2_btree_node_hash_insert(&c->btree_cache, b, ++ b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ ++ bch2_btree_set_root_inmem(c, b); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++ssize_t bch2_btree_updates_print(struct bch_fs *c, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct btree_update *as; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_for_each_entry(as, &c->btree_interior_update_list, list) ++ pr_buf(&out, "%p m %u w %u r %u j %llu\n", ++ as, ++ as->mode, ++ as->nodes_written, ++ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, ++ as->journal.seq); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return out.pos - buf; ++} ++ ++size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) ++{ ++ size_t ret = 0; ++ struct list_head *i; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_for_each(i, &c->btree_interior_update_list) ++ ret++; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return ret; ++} ++ ++void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) ++{ ++ struct btree_root *r; ++ struct jset_entry *entry; ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ vstruct_for_each(jset, entry) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) { ++ r = &c->btree_roots[entry->btree_id]; ++ r->level = entry->level; ++ r->alive = true; ++ bkey_copy(&r->key, &entry->start[0]); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++} ++ ++struct jset_entry * ++bch2_btree_roots_to_journal_entries(struct bch_fs *c, ++ struct jset_entry *start, ++ struct jset_entry *end) ++{ ++ struct jset_entry *entry; ++ unsigned long have = 0; ++ unsigned i; ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) ++ __set_bit(entry->btree_id, &have); ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].alive && !test_bit(i, &have)) { ++ journal_entry_set(end, ++ BCH_JSET_ENTRY_btree_root, ++ i, c->btree_roots[i].level, ++ &c->btree_roots[i].key, ++ c->btree_roots[i].key.u64s); ++ end = vstruct_next(end); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++ ++ return end; ++} ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *c) ++{ ++ if (c->btree_interior_update_worker) ++ destroy_workqueue(c->btree_interior_update_worker); ++ mempool_exit(&c->btree_interior_update_pool); ++} ++ ++int bch2_fs_btree_interior_update_init(struct bch_fs *c) ++{ ++ mutex_init(&c->btree_reserve_cache_lock); ++ INIT_LIST_HEAD(&c->btree_interior_update_list); ++ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); ++ mutex_init(&c->btree_interior_update_lock); ++ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); ++ ++ c->btree_interior_update_worker = ++ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); ++ if (!c->btree_interior_update_worker) ++ return -ENOMEM; ++ ++ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, ++ sizeof(struct btree_update)); ++} +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +new file mode 100644 +index 000000000000..4a5b9dcfbdd0 +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.h +@@ -0,0 +1,331 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++ ++#include "btree_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++ ++void __bch2_btree_calc_format(struct bkey_format_state *, struct 
btree *); ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, ++ struct bkey_format *); ++ ++#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) ++ ++#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) ++ ++/* ++ * Tracks an in progress split/rewrite of a btree node and the update to the ++ * parent node: ++ * ++ * When we split/rewrite a node, we do all the updates in memory without ++ * waiting for any writes to complete - we allocate the new node(s) and update ++ * the parent node, possibly recursively up to the root. ++ * ++ * The end result is that we have one or more new nodes being written - ++ * possibly several, if there were multiple splits - and then a write (updating ++ * an interior node) which will make all these new nodes visible. ++ * ++ * Additionally, as we split/rewrite nodes we free the old nodes - but the old ++ * nodes can't be freed (their space on disk can't be reclaimed) until the ++ * update to the interior node that makes the new node visible completes - ++ * until then, the old nodes are still reachable on disk. ++ * ++ */ ++struct btree_update { ++ struct closure cl; ++ struct bch_fs *c; ++ ++ struct list_head list; ++ struct list_head unwritten_list; ++ ++ /* What kind of update are we doing? */ ++ enum { ++ BTREE_INTERIOR_NO_UPDATE, ++ BTREE_INTERIOR_UPDATING_NODE, ++ BTREE_INTERIOR_UPDATING_ROOT, ++ BTREE_INTERIOR_UPDATING_AS, ++ } mode; ++ ++ unsigned must_rewrite:1; ++ unsigned nodes_written:1; ++ ++ enum btree_id btree_id; ++ ++ struct disk_reservation disk_res; ++ struct journal_preres journal_preres; ++ ++ /* ++ * BTREE_INTERIOR_UPDATING_NODE: ++ * The update that made the new nodes visible was a regular update to an ++ * existing interior node - @b. 
We can't write out the update to @b ++ * until the new nodes we created are finished writing, so we block @b ++ * from writing by putting this btree_interior update on the ++ * @b->write_blocked list with @write_blocked_list: ++ */ ++ struct btree *b; ++ struct list_head write_blocked_list; ++ ++ /* ++ * We may be freeing nodes that were dirty, and thus had journal entries ++ * pinned: we need to transfer the oldest of those pins to the ++ * btree_update operation, and release it when the new node(s) ++ * are all persistent and reachable: ++ */ ++ struct journal_entry_pin journal; ++ ++ /* Preallocated nodes we reserve when we start the update: */ ++ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_prealloc_nodes; ++ ++ /* Nodes being freed: */ ++ struct keylist old_keys; ++ u64 _old_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ ++ /* Nodes being added: */ ++ struct keylist new_keys; ++ u64 _new_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ ++ /* New nodes, that will be made reachable by this update: */ ++ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_new_nodes; ++ ++ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * ++ BCH_REPLICAS_MAX]; ++ open_bucket_idx_t nr_open_buckets; ++ ++ unsigned journal_u64s; ++ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; ++ ++ /* Only here to reduce stack usage on recursive splits: */ ++ struct keylist parent_keys; ++ /* ++ * Enough room for btree_split's keys without realloc - btree node ++ * pointers never have crc/compression info, so we only need to acount ++ * for the pointers for three keys ++ */ ++ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; ++}; ++ ++void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); ++ ++void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, ++ struct btree *, ++ struct bkey_format); ++ ++void bch2_btree_update_done(struct btree_update *); ++struct btree_update * ++bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, ++ unsigned, struct closure *); ++ ++void bch2_btree_interior_update_will_free_node(struct btree_update *, ++ struct btree *); ++void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); ++ ++void bch2_btree_insert_node(struct btree_update *, struct btree *, ++ struct btree_iter *, struct keylist *, ++ unsigned); ++int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); ++ ++void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, ++ unsigned, unsigned, enum btree_node_sibling); ++ ++static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct btree *b; ++ ++ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) ++ return; ++ ++ if (!bch2_btree_node_relock(iter, level)) ++ return; ++ ++ b = iter->l[level].b; ++ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) ++ return; ++ ++ __bch2_foreground_maybe_merge(c, iter, level, flags, sib); ++} ++ ++static inline void bch2_foreground_maybe_merge(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, ++ unsigned flags) ++{ ++ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ btree_prev_sib); ++ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ 
btree_next_sib); ++} ++ ++void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); ++void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); ++ ++static inline unsigned btree_update_reserve_required(struct bch_fs *c, ++ struct btree *b) ++{ ++ unsigned depth = btree_node_root(c, b)->c.level + 1; ++ ++ /* ++ * Number of nodes we might have to allocate in a worst case btree ++ * split operation - we split all the way up to the root, then allocate ++ * a new root, unless we're already at max depth: ++ */ ++ if (depth < BTREE_MAX_DEPTH) ++ return (depth - b->c.level) * 2 + 1; ++ else ++ return (depth - b->c.level) * 2 - 1; ++} ++ ++static inline void btree_node_reset_sib_u64s(struct btree *b) ++{ ++ b->sib_u64s[0] = b->nr.live_u64s; ++ b->sib_u64s[1] = b->nr.live_u64s; ++} ++ ++static inline void *btree_data_end(struct bch_fs *c, struct btree *b) ++{ ++ return (void *) b->data + btree_bytes(c); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, ++ struct btree *b) ++{ ++ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, ++ struct btree *b) ++{ ++ return btree_data_end(c, b); ++} ++ ++static inline void *write_block(struct btree *b) ++{ ++ return (void *) b->data + (b->written << 9); ++} ++ ++static inline bool __btree_addr_written(struct btree *b, void *p) ++{ ++ return p < write_block(b); ++} ++ ++static inline bool bset_written(struct btree *b, struct bset *i) ++{ ++ return __btree_addr_written(b, i); ++} ++ ++static inline bool bkey_written(struct btree *b, struct bkey_packed *k) ++{ ++ return __btree_addr_written(b, k); ++} ++ ++static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, ++ struct btree *b, ++ void *end) ++{ ++ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + ++ b->whiteout_u64s; ++ ssize_t total = c->opts.btree_node_size << 6; ++ ++ return total - used; ++} ++ ++static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, ++ struct btree *b) ++{ ++ ssize_t remaining = __bch_btree_u64s_remaining(c, b, ++ btree_bkey_last(b, bset_tree_last(b))); ++ ++ BUG_ON(remaining < 0); ++ ++ if (bset_written(b, btree_bset_last(b))) ++ return 0; ++ ++ return remaining; ++} ++ ++static inline unsigned btree_write_set_buffer(struct btree *b) ++{ ++ /* ++ * Could buffer up larger amounts of keys for btrees with larger keys, ++ * pending benchmarking: ++ */ ++ return 4 << 10; ++} ++ ++static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, ++ struct btree *b) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ struct btree_node_entry *bne = max(write_block(b), ++ (void *) btree_bkey_last(b, bset_tree_last(b))); ++ ssize_t remaining_space = ++ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); ++ ++ if (unlikely(bset_written(b, bset(b, t)))) { ++ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) ++ return bne; ++ } else { ++ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && ++ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) ++ return bne; ++ } ++ ++ return NULL; ++} ++ ++static inline void push_whiteout(struct bch_fs *c, struct btree *b, ++ struct bpos pos) ++{ ++ struct bkey_packed k; ++ ++ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); ++ ++ if (!bkey_pack_pos(&k, pos, b)) { ++ struct bkey *u = (void *) &k; ++ ++ bkey_init(u); ++ u->p = pos; ++ } ++ ++ k.needs_whiteout = true; ++ ++ b->whiteout_u64s += k.u64s; ++ bkey_copy(unwritten_whiteouts_start(c, 
b), &k); ++} ++ ++/* ++ * write lock must be held on @b (else the dirty bset that we were going to ++ * insert into could be written out from under us) ++ */ ++static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, ++ struct btree *b, unsigned u64s) ++{ ++ if (unlikely(btree_node_fake(b))) ++ return false; ++ ++ return u64s <= bch_btree_keys_u64s_remaining(c, b); ++} ++ ++ssize_t bch2_btree_updates_print(struct bch_fs *, char *); ++ ++size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); ++ ++void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); ++struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, ++ struct jset_entry *, struct jset_entry *); ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *); ++int bch2_fs_btree_interior_update_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +new file mode 100644 +index 000000000000..cf4105e83eda +--- /dev/null ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -0,0 +1,1174 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "debug.h" ++#include "error.h" ++#include "extent_update.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "replicas.h" ++ ++#include ++#include ++#include ++ ++static inline bool same_leaf_as_prev(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i != trans->updates2 && ++ iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; ++} ++ ++inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ bch2_btree_node_lock_write(b, iter); ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) ++ return; ++ ++ if (unlikely(btree_node_just_written(b)) && ++ bch2_btree_post_write_cleanup(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ /* ++ * If the last bset has been written, or if it's gotten too big - start ++ * a new bset to insert into: ++ */ ++ if (want_new_bset(c, b)) ++ bch2_btree_init_next(c, b, iter); ++} ++ ++/* Inserting into a given leaf node (last stage of insert): */ ++ ++/* Handle overwrites and do insert, for non extents: */ ++bool bch2_btree_bset_insert_key(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) ++{ ++ struct bkey_packed *k; ++ unsigned clobber_u64s = 0, new_u64s = 0; ++ ++ EBUG_ON(btree_node_just_written(b)); ++ EBUG_ON(bset_written(b, btree_bset_last(b))); ++ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); ++ EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && ++ bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_predecessor(b->data->min_key)) < 0); ++ EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); ++ EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); ++ EBUG_ON(insert->k.u64s > ++ bch_btree_keys_u64s_remaining(iter->trans->c, b)); ++ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ if (k && bkey_cmp_packed(b, k, &insert->k)) ++ k = NULL; ++ ++ /* @k is the key being overwritten/deleted, if any: */ ++ EBUG_ON(k && bkey_whiteout(k)); ++ ++ /* Deleting, but not found? 
nothing to do: */ ++ if (bkey_whiteout(&insert->k) && !k) ++ return false; ++ ++ if (bkey_whiteout(&insert->k)) { ++ /* Deleting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ if (k->needs_whiteout) ++ push_whiteout(iter->trans->c, b, insert->k.p); ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ bch2_bset_delete(b, k, clobber_u64s); ++ goto fix_iter; ++ } else { ++ bch2_btree_iter_fix_key_modified(iter, b, k); ++ } ++ ++ return true; ++ } ++ ++ if (k) { ++ /* Overwriting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ insert->k.needs_whiteout = k->needs_whiteout; ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ goto overwrite; ++ } else { ++ bch2_btree_iter_fix_key_modified(iter, b, k); ++ } ++ } ++ ++ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); ++overwrite: ++ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); ++ new_u64s = k->u64s; ++fix_iter: ++ if (clobber_u64s != new_u64s) ++ bch2_btree_node_iter_fix(iter, b, node_iter, k, ++ clobber_u64s, new_u64s); ++ return true; ++} ++ ++static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, ++ unsigned i, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct btree_write *w = container_of(pin, struct btree_write, journal); ++ struct btree *b = container_of(w, struct btree, writes[i]); ++ ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ bch2_btree_node_write_cond(c, b, ++ (btree_current_write(b) == w && w->journal.seq == seq)); ++ six_unlock_read(&b->c.lock); ++} ++ ++static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 0, seq); ++} ++ ++static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 1, seq); ++} ++ ++inline void bch2_btree_add_journal_pin(struct bch_fs *c, ++ struct btree *b, u64 seq) ++{ ++ struct btree_write *w = btree_current_write(b); ++ ++ bch2_journal_pin_add(&c->journal, seq, &w->journal, ++ btree_node_write_idx(b) == 0 ++ ? 
btree_node_flush0 ++ : btree_node_flush1); ++} ++ ++/** ++ * btree_insert_key - insert a key one key into a leaf node ++ */ ++static bool btree_insert_key_leaf(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = iter_l(iter)->b; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bset *i = bset(b, t); ++ int old_u64s = bset_u64s(t); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ EBUG_ON(!iter->level && ++ !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); ++ ++ if (unlikely(!bch2_btree_bset_insert_key(iter, b, ++ &iter_l(iter)->iter, insert))) ++ return false; ++ ++ i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); ++ ++ if (unlikely(!btree_node_dirty(b))) ++ set_btree_node_dirty(b); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) bset_u64s(t) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ trace_btree_insert_key(c, b, insert); ++ return true; ++} ++ ++/* Cached btree updates: */ ++ ++/* Normal update interface: */ ++ ++static inline void btree_insert_entry_checks(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ ++ BUG_ON(bkey_cmp(insert->k.p, iter->pos)); ++ BUG_ON(debug_check_bkeys(c) && ++ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), ++ __btree_node_type(iter->level, iter->btree_id))); ++} ++ ++static noinline int ++bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, ++ &trans->journal_preres, u64s, 0); ++ if (ret) ++ return ret; ++ ++ if (!bch2_trans_relock(trans)) { ++ trace_trans_restart_journal_preres_get(trans->ip); ++ return -EINTR; ++ } ++ ++ return 0; ++} ++ ++static inline int bch2_trans_journal_res_get(struct btree_trans *trans, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) ++ flags |= JOURNAL_RES_GET_RESERVED; ++ ++ ret = bch2_journal_res_get(&c->journal, &trans->journal_res, ++ trans->journal_u64s, flags); ++ ++ return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; ++} ++ ++static enum btree_insert_ret ++btree_key_can_insert(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = iter_l(iter)->b; ++ ++ if (unlikely(btree_node_need_rewrite(b)) || ++ unlikely(u64s > bch_btree_keys_u64s_remaining(c, b))) ++ return BTREE_INSERT_BTREE_NODE_FULL; ++ ++ return BTREE_INSERT_OK; ++} ++ ++static enum btree_insert_ret ++btree_key_can_insert_cached(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ unsigned u64s) ++{ ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ unsigned new_u64s; ++ struct bkey_i *new_k; ++ ++ BUG_ON(iter->level); ++ ++ if (u64s <= ck->u64s) ++ return BTREE_INSERT_OK; ++ ++ new_u64s = roundup_pow_of_two(u64s); ++ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) ++ return -ENOMEM; ++ ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ return BTREE_INSERT_OK; ++} ++ ++static inline void do_btree_insert_one(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ bool did_work; ++ ++ EBUG_ON(trans->journal_res.ref != ++ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); ++ ++ insert->k.needs_whiteout = false; ++ ++ did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) ++ ? btree_insert_key_leaf(trans, iter, insert) ++ : bch2_btree_insert_key_cached(trans, iter, insert); ++ if (!did_work) ++ return; ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ bch2_journal_add_keys(j, &trans->journal_res, ++ iter->btree_id, insert); ++ ++ bch2_journal_set_has_inode(j, &trans->journal_res, ++ insert->k.p.inode); ++ ++ if (trans->journal_seq) ++ *trans->journal_seq = trans->journal_res.seq; ++ } ++} ++ ++static inline bool iter_has_trans_triggers(struct btree_iter *iter) ++{ ++ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); ++} ++ ++static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) ++{ ++ return (BTREE_NODE_TYPE_HAS_TRIGGERS & ++ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & ++ (1U << iter->btree_id); ++} ++ ++static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) ++{ ++ __bch2_btree_iter_unlock(iter); ++} ++ ++static noinline void bch2_trans_mark_gc(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) { ++ /* ++ * XXX: synchronization of cached update triggers with gc ++ */ ++ BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); ++ ++ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) ++ bch2_mark_update(trans, i->iter, i->k, NULL, ++ i->trigger_flags|BTREE_TRIGGER_GC); ++ } ++} ++ ++static inline int ++bch2_trans_commit_write_locked(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_fs_usage *fs_usage = NULL; ++ struct btree_insert_entry *i; ++ unsigned u64s = 0; ++ bool marking = false; ++ int ret; ++ ++ if (race_fault()) { ++ trace_trans_restart_fault_inject(trans->ip); ++ return -EINTR; ++ } ++ ++ /* ++ * Check if the insert will fit in the leaf node with the write lock ++ * held, otherwise another thread could write the node changing the ++ * amount of space available: ++ */ ++ ++ prefetch(&trans->c->journal.flags); ++ ++ trans_for_each_update2(trans, i) { ++ /* Multiple inserts might go to same leaf: */ ++ if (!same_leaf_as_prev(trans, 
i)) ++ u64s = 0; ++ ++ u64s += i->k->k.u64s; ++ ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED ++ ? btree_key_can_insert(trans, i->iter, i->k, u64s) ++ : btree_key_can_insert_cached(trans, i->iter, i->k, u64s); ++ if (ret) { ++ *stopped_at = i; ++ return ret; ++ } ++ ++ if (btree_node_type_needs_gc(i->iter->btree_id)) ++ marking = true; ++ } ++ ++ if (marking) { ++ percpu_down_read(&c->mark_lock); ++ fs_usage = bch2_fs_usage_scratch_get(c); ++ } ++ ++ /* ++ * Don't get journal reservation until after we know insert will ++ * succeed: ++ */ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ ret = bch2_trans_journal_res_get(trans, ++ JOURNAL_RES_GET_NONBLOCK); ++ if (ret) ++ goto err; ++ } else { ++ trans->journal_res.seq = c->journal.replay_journal_seq; ++ } ++ ++ if (unlikely(trans->extra_journal_entry_u64s)) { ++ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), ++ trans->extra_journal_entries, ++ trans->extra_journal_entry_u64s); ++ ++ trans->journal_res.offset += trans->extra_journal_entry_u64s; ++ trans->journal_res.u64s -= trans->extra_journal_entry_u64s; ++ } ++ ++ /* ++ * Not allowed to fail after we've gotten our journal reservation - we ++ * have to use it: ++ */ ++ ++ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { ++ if (journal_seq_verify(c)) ++ trans_for_each_update2(trans, i) ++ i->k->k.version.lo = trans->journal_res.seq; ++ else if (inject_invalid_keys(c)) ++ trans_for_each_update2(trans, i) ++ i->k->k.version = MAX_VERSION; ++ } ++ ++ /* Must be called under mark_lock: */ ++ if (marking && trans->fs_usage_deltas && ++ bch2_replicas_delta_list_apply(c, fs_usage, ++ trans->fs_usage_deltas)) { ++ ret = BTREE_INSERT_NEED_MARK_REPLICAS; ++ goto err; ++ } ++ ++ trans_for_each_update(trans, i) ++ if (iter_has_nontrans_triggers(i->iter)) ++ bch2_mark_update(trans, i->iter, i->k, ++ fs_usage, i->trigger_flags); ++ ++ if (marking) ++ bch2_trans_fs_usage_apply(trans, fs_usage); ++ ++ if (unlikely(c->gc_pos.phase)) ++ bch2_trans_mark_gc(trans); ++ ++ trans_for_each_update2(trans, i) ++ do_btree_insert_one(trans, i->iter, i->k); ++err: ++ if (marking) { ++ bch2_fs_usage_scratch_put(c, fs_usage); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ return ret; ++} ++ ++/* ++ * Get journal reservation, take write locks, and attempt to do btree update(s): ++ */ ++static inline int do_bch2_trans_commit(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at) ++{ ++ struct btree_insert_entry *i; ++ struct btree_iter *iter; ++ int ret; ++ ++ trans_for_each_update2(trans, i) ++ BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); ++ ++ ret = bch2_journal_preres_get(&trans->c->journal, ++ &trans->journal_preres, trans->journal_preres_u64s, ++ JOURNAL_RES_GET_NONBLOCK| ++ ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) ++ ? 
JOURNAL_RES_GET_RECLAIM : 0)); ++ if (unlikely(ret == -EAGAIN)) ++ ret = bch2_trans_journal_preres_get_cold(trans, ++ trans->journal_preres_u64s); ++ if (unlikely(ret)) ++ return ret; ++ ++ /* ++ * Can't be holding any read locks when we go to take write locks: ++ * ++ * note - this must be done after bch2_trans_journal_preres_get_cold() ++ * or anything else that might call bch2_trans_relock(), since that ++ * would just retake the read locks: ++ */ ++ trans_for_each_iter(trans, iter) { ++ if (iter->nodes_locked != iter->nodes_intent_locked) { ++ EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); ++ EBUG_ON(trans->iters_live & (1ULL << iter->idx)); ++ bch2_btree_iter_unlock_noinline(iter); ++ } ++ } ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ trans_for_each_update2(trans, i) ++ btree_insert_entry_checks(trans, i->iter, i->k); ++ bch2_btree_trans_verify_locks(trans); ++ ++ trans_for_each_update2(trans, i) ++ if (!same_leaf_as_prev(trans, i)) ++ bch2_btree_node_lock_for_insert(trans->c, ++ iter_l(i->iter)->b, i->iter); ++ ++ ret = bch2_trans_commit_write_locked(trans, stopped_at); ++ ++ trans_for_each_update2(trans, i) ++ if (!same_leaf_as_prev(trans, i)) ++ bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, ++ i->iter); ++ ++ if (!ret && trans->journal_pin) ++ bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, ++ trans->journal_pin, NULL); ++ ++ /* ++ * Drop journal reservation after dropping write locks, since dropping ++ * the journal reservation may kick off a journal write: ++ */ ++ bch2_journal_res_put(&trans->c->journal, &trans->journal_res); ++ ++ if (unlikely(ret)) ++ return ret; ++ ++ if (trans->flags & BTREE_INSERT_NOUNLOCK) ++ trans->nounlock = true; ++ ++ trans_for_each_update2(trans, i) ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && ++ !same_leaf_as_prev(trans, i)) ++ bch2_foreground_maybe_merge(trans->c, i->iter, ++ 0, trans->flags); ++ ++ trans->nounlock = false; ++ ++ bch2_trans_downgrade(trans); ++ ++ return 0; ++} ++ ++static noinline ++int bch2_trans_commit_error(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ int ret) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned flags = trans->flags; ++ ++ /* ++ * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree ++ * update; if we haven't done anything yet it doesn't apply ++ */ ++ flags &= ~BTREE_INSERT_NOUNLOCK; ++ ++ switch (ret) { ++ case BTREE_INSERT_BTREE_NODE_FULL: ++ ret = bch2_btree_split_leaf(c, i->iter, flags); ++ ++ /* ++ * if the split succeeded without dropping locks the insert will ++ * still be atomic (what the caller peeked() and is overwriting ++ * won't have changed) ++ */ ++#if 0 ++ /* ++ * XXX: ++ * split -> btree node merging (of parent node) might still drop ++ * locks when we're not passing it BTREE_INSERT_NOUNLOCK ++ * ++ * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that ++ * will inhibit merging - but we don't have a reliable way yet ++ * (do we?) 
of checking if we dropped locks in this path ++ */ ++ if (!ret) ++ goto retry; ++#endif ++ ++ /* ++ * don't care if we got ENOSPC because we told split it ++ * couldn't block: ++ */ ++ if (!ret || ++ ret == -EINTR || ++ (flags & BTREE_INSERT_NOUNLOCK)) { ++ trace_trans_restart_btree_node_split(trans->ip); ++ ret = -EINTR; ++ } ++ break; ++ case BTREE_INSERT_ENOSPC: ++ ret = -ENOSPC; ++ break; ++ case BTREE_INSERT_NEED_MARK_REPLICAS: ++ bch2_trans_unlock(trans); ++ ++ trans_for_each_update(trans, i) { ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); ++ if (ret) ++ return ret; ++ } ++ ++ if (bch2_trans_relock(trans)) ++ return 0; ++ ++ trace_trans_restart_mark_replicas(trans->ip); ++ ret = -EINTR; ++ break; ++ case BTREE_INSERT_NEED_JOURNAL_RES: ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); ++ if (ret) ++ return ret; ++ ++ if (bch2_trans_relock(trans)) ++ return 0; ++ ++ trace_trans_restart_journal_res_get(trans->ip); ++ ret = -EINTR; ++ break; ++ default: ++ BUG_ON(ret >= 0); ++ break; ++ } ++ ++ if (ret == -EINTR) { ++ int ret2 = bch2_btree_iter_traverse_all(trans); ++ ++ if (ret2) { ++ trace_trans_restart_traverse(trans->ip); ++ return ret2; ++ } ++ ++ trace_trans_restart_atomic(trans->ip); ++ } ++ ++ return ret; ++} ++ ++static noinline int ++bch2_trans_commit_get_rw_cold(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) ++ return -EROFS; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_fs_read_write_early(c); ++ if (ret) ++ return ret; ++ ++ percpu_ref_get(&c->writes); ++ return 0; ++} ++ ++static void bch2_trans_update2(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct btree_insert_entry *i, n = (struct btree_insert_entry) { ++ .iter = iter, .k = insert ++ }; ++ ++ btree_insert_entry_checks(trans, n.iter, n.k); ++ ++ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ ++ EBUG_ON(trans->nr_updates2 >= trans->nr_iters); ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ trans_for_each_update2(trans, i) { ++ if (btree_iter_cmp(n.iter, i->iter) == 0) { ++ *i = n; ++ return; ++ } ++ ++ if (btree_iter_cmp(n.iter, i->iter) <= 0) ++ break; ++ } ++ ++ array_insert_item(trans->updates2, trans->nr_updates2, ++ i - trans->updates2, n); ++} ++ ++static int extent_update_to_keys(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ ret = bch2_extent_can_insert(trans, orig_iter, insert); ++ if (ret) ++ return ret; ++ ++ if (bkey_deleted(&insert->k)) ++ return 0; ++ ++ iter = bch2_trans_copy_iter(trans, orig_iter); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ iter->flags |= BTREE_ITER_INTENT; ++ __bch2_btree_iter_set_pos(iter, insert->k.p, false); ++ bch2_trans_update2(trans, iter, insert); ++ bch2_trans_iter_put(trans, iter); ++ return 0; ++} ++ ++static int extent_handle_overwrites(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos start, struct bpos end) ++{ ++ struct btree_iter *iter = NULL, *update_iter; ++ struct bkey_i *update; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_with_updates(iter); ++ ++ while (k.k && !(ret = bkey_err(k))) { ++ if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) 
{ ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ bch2_cut_back(start, update); ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } ++ ++ if (bkey_cmp(k.k->p, end) > 0) { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ bch2_cut_front(end, update); ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } else { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ update->k = *k.k; ++ set_bkey_val_u64s(&update->k, 0); ++ update->k.type = KEY_TYPE_deleted; ++ update->k.size = 0; ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } ++ ++ k = bch2_btree_iter_next_with_updates(iter); ++ } ++err: ++ if (!IS_ERR_OR_NULL(iter)) ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int __bch2_trans_commit(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL; ++ struct btree_iter *iter; ++ bool trans_trigger_run; ++ unsigned u64s; ++ int ret = 0; ++ ++ BUG_ON(trans->need_reset); ++ ++ if (!trans->nr_updates) ++ goto out_noupdates; ++ ++ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&trans->c->gc_lock); ++ ++ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); ++ ++ trans->journal_u64s = trans->extra_journal_entry_u64s; ++ trans->journal_preres_u64s = 0; ++ ++ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && ++ unlikely(!percpu_ref_tryget(&trans->c->writes))) { ++ ret = bch2_trans_commit_get_rw_cold(trans); ++ if (ret) ++ return ret; ++ } ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && ++ !(i->trigger_flags & BTREE_TRIGGER_NORUN)) ++ bch2_btree_key_cache_verify_clean(trans, ++ i->iter->btree_id, i->iter->pos); ++#endif ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ trans_for_each_update(trans, i) { ++ if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && ++ (ret = bch2_btree_iter_traverse(i->iter)))) { ++ trace_trans_restart_traverse(trans->ip); ++ goto out; ++ } ++ ++ /* ++ * We're not using bch2_btree_iter_upgrade here because ++ * we know trans->nounlock can't be set: ++ */ ++ if (unlikely(i->iter->locks_want < 1 && ++ !__bch2_btree_iter_upgrade(i->iter, 1))) { ++ trace_trans_restart_upgrade(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ ++ if (iter_has_trans_triggers(i->iter) && ++ !i->trans_triggers_run) { ++ i->trans_triggers_run = true; ++ trans_trigger_run = true; ++ ++ ret = bch2_trans_mark_update(trans, i->iter, i->k, ++ i->trigger_flags); ++ if (unlikely(ret)) { ++ if (ret == -EINTR) ++ trace_trans_restart_mark(trans->ip); 
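++					/*
++					 * Any error aborts this commit attempt;
++					 * -EINTR in particular asks the caller
++					 * to restart the transaction.
++					 */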
++ goto out; ++ } ++ } ++ } ++ } while (trans_trigger_run); ++ ++ /* Turn extents updates into keys: */ ++ trans_for_each_update(trans, i) ++ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ++ struct bpos start = bkey_start_pos(&i->k->k); ++ ++ while (i + 1 < trans->updates + trans->nr_updates && ++ i[0].iter->btree_id == i[1].iter->btree_id && ++ !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) ++ i++; ++ ++ ret = extent_handle_overwrites(trans, i->iter->btree_id, ++ start, i->k->k.p); ++ if (ret) ++ goto out; ++ } ++ ++ trans_for_each_update(trans, i) { ++ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ++ ret = extent_update_to_keys(trans, i->iter, i->k); ++ if (ret) ++ goto out; ++ } else { ++ bch2_trans_update2(trans, i->iter, i->k); ++ } ++ } ++ ++ trans_for_each_update2(trans, i) { ++ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); ++ BUG_ON(i->iter->locks_want < 1); ++ ++ u64s = jset_u64s(i->k->k.u64s); ++ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && ++ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) ++ trans->journal_preres_u64s += u64s; ++ trans->journal_u64s += u64s; ++ } ++retry: ++ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ++ ++ ret = do_bch2_trans_commit(trans, &i); ++ ++ /* make sure we didn't drop or screw up locks: */ ++ bch2_btree_trans_verify_locks(trans); ++ ++ if (ret) ++ goto err; ++ ++ trans_for_each_iter(trans, iter) ++ if ((trans->iters_live & (1ULL << iter->idx)) && ++ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { ++ if (trans->flags & BTREE_INSERT_NOUNLOCK) ++ bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); ++ else ++ bch2_btree_iter_set_pos(iter, iter->pos_after_commit); ++ } ++out: ++ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) ++ percpu_ref_put(&trans->c->writes); ++out_noupdates: ++ bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); ++ ++ return ret; ++err: ++ ret = bch2_trans_commit_error(trans, i, ret); ++ if (ret) ++ goto out; ++ ++ goto retry; ++} ++ ++int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_trigger_flags flags) ++{ ++ struct btree_insert_entry *i, n = (struct btree_insert_entry) { ++ .trigger_flags = flags, .iter = iter, .k = k ++ }; ++ ++ EBUG_ON(bkey_cmp(iter->pos, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? 
bkey_start_pos(&k->k) ++ : k->k.p)); ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ if (btree_node_type_is_extents(iter->btree_id)) { ++ iter->pos_after_commit = k->k.p; ++ iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; ++ } ++ ++ /* ++ * Pending updates are kept sorted: first, find position of new update: ++ */ ++ trans_for_each_update(trans, i) ++ if (btree_iter_cmp(iter, i->iter) <= 0) ++ break; ++ ++ /* ++ * Now delete/trim any updates the new update overwrites: ++ */ ++ if (i > trans->updates && ++ i[-1].iter->btree_id == iter->btree_id && ++ bkey_cmp(iter->pos, i[-1].k->k.p) < 0) ++ bch2_cut_back(n.iter->pos, i[-1].k); ++ ++ while (i < trans->updates + trans->nr_updates && ++ iter->btree_id == i->iter->btree_id && ++ bkey_cmp(n.k->k.p, i->k->k.p) >= 0) ++ array_remove_item(trans->updates, trans->nr_updates, ++ i - trans->updates); ++ ++ if (i < trans->updates + trans->nr_updates && ++ iter->btree_id == i->iter->btree_id && ++ bkey_cmp(n.k->k.p, i->iter->pos) > 0) { ++ /* ++ * When we have an extent that overwrites the start of another ++ * update, trimming that extent will mean the iterator's ++ * position has to change since the iterator position has to ++ * match the extent's start pos - but we don't want to change ++ * the iterator pos if some other code is using it, so we may ++ * need to clone it: ++ */ ++ if (trans->iters_live & (1ULL << i->iter->idx)) { ++ i->iter = bch2_trans_copy_iter(trans, i->iter); ++ if (IS_ERR(i->iter)) { ++ trans->need_reset = true; ++ return PTR_ERR(i->iter); ++ } ++ ++ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ bch2_trans_iter_put(trans, i->iter); ++ } ++ ++ bch2_cut_front(n.k->k.p, i->k); ++ bch2_btree_iter_set_pos(i->iter, n.k->k.p); ++ } ++ ++ EBUG_ON(trans->nr_updates >= trans->nr_iters); ++ ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, n); ++ return 0; ++} ++ ++int __bch2_btree_insert(struct btree_trans *trans, ++ enum btree_id id, struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, k, 0); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++/** ++ * bch2_btree_insert - insert keys into the extent btree ++ * @c: pointer to struct bch_fs ++ * @id: btree to insert into ++ * @insert_keys: list of keys to insert ++ * @hook: insert callback ++ */ ++int bch2_btree_insert(struct bch_fs *c, enum btree_id id, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, int flags) ++{ ++ return bch2_trans_do(c, disk_res, journal_seq, flags, ++ __bch2_btree_insert(&trans, id, k)); ++} ++ ++int bch2_btree_delete_at_range(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos end, ++ u64 *journal_seq) ++{ ++ struct bkey_s_c k; ++ int ret = 0; ++retry: ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(iter->pos, end) < 0) { ++ struct bkey_i delete; ++ ++ bch2_trans_begin(trans); ++ ++ bkey_init(&delete.k); ++ ++ /* ++ * For extents, iter.pos won't necessarily be the same as ++ * bkey_start_pos(k.k) (for non extents they always will be the ++ * same). It's important that we delete starting from iter.pos ++ * because the range we want to delete could start in the middle ++ * of k. ++ * ++ * (bch2_btree_iter_peek() does guarantee that iter.pos >= ++ * bkey_start_pos(k.k)). 
++ */ ++ delete.k.p = iter->pos; ++ ++ if (btree_node_type_is_extents(iter->btree_id)) { ++ unsigned max_sectors = ++ KEY_SIZE_MAX & (~0 << trans->c->block_bits); ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete); ++ ++ ret = bch2_extent_trim_atomic(&delete, iter); ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_update(trans, iter, &delete, 0); ++ ret = bch2_trans_commit(trans, NULL, journal_seq, ++ BTREE_INSERT_NOFAIL); ++ if (ret) ++ break; ++ ++ bch2_trans_cond_resched(trans); ++ } ++ ++ if (ret == -EINTR) { ++ ret = 0; ++ goto retry; ++ } ++ ++ return ret; ++ ++} ++ ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned flags) ++{ ++ struct bkey_i k; ++ ++ bkey_init(&k.k); ++ k.k.p = iter->pos; ++ ++ bch2_trans_update(trans, iter, &k, 0); ++ return bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE|flags); ++} ++ ++/* ++ * bch_btree_delete_range - delete everything within a given range ++ * ++ * Range is a half open interval - [start, end) ++ */ ++int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, ++ struct bpos start, struct bpos end, ++ u64 *journal_seq) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret = 0; ++ ++ /* ++ * XXX: whether we need mem/more iters depends on whether this btree id ++ * has triggers ++ */ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); ++ ++ iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ BUG_ON(ret == -EINTR); ++ return ret; ++} +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +new file mode 100644 +index 000000000000..0ec194b93c71 +--- /dev/null ++++ b/fs/bcachefs/buckets.c +@@ -0,0 +1,2126 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. ++ * ++ * Bucket states: ++ * - free bucket: mark == 0 ++ * The bucket contains no data and will not be read ++ * ++ * - allocator bucket: owned_by_allocator == 1 ++ * The bucket is on a free list, or it is an open bucket ++ * ++ * - cached bucket: owned_by_allocator == 0 && ++ * dirty_sectors == 0 && ++ * cached_sectors > 0 ++ * The bucket contains data but may be safely discarded as there are ++ * enough replicas of the data on other cache devices, or it has been ++ * written back to the backing device ++ * ++ * - dirty bucket: owned_by_allocator == 0 && ++ * dirty_sectors > 0 ++ * The bucket contains data that we must not discard (either only copy, ++ * or one of the 'main copies' for data requiring multiple replicas) ++ * ++ * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 ++ * This is a btree node, journal or gen/prio bucket ++ * ++ * Lifecycle: ++ * ++ * bucket invalidated => bucket on freelist => open bucket => ++ * [dirty bucket =>] cached bucket => bucket invalidated => ... ++ * ++ * Note that cache promotion can skip the dirty bucket step, as data ++ * is copied from a deeper tier to a shallower tier, onto a cached ++ * bucket. ++ * Note also that a cached bucket can spontaneously become dirty -- ++ * see below. ++ * ++ * Only a traversal of the key space can determine whether a bucket is ++ * truly dirty or cached. 
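++ *
++ * As an illustrative walk-through of the lifecycle above: an
++ * invalidated bucket goes on a freelist and is owned by the
++ * allocator, a write fills the resulting open bucket (allocator =>
++ * dirty), its dirty sectors are later copied to a deeper tier
++ * (dirty => cached), and invalidating it (cached => allocator)
++ * starts the cycle again.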
++ * ++ * Transitions: ++ * ++ * - free => allocator: bucket was invalidated ++ * - cached => allocator: bucket was invalidated ++ * ++ * - allocator => dirty: open bucket was filled up ++ * - allocator => cached: open bucket was filled up ++ * - allocator => metadata: metadata was allocated ++ * ++ * - dirty => cached: dirty sectors were copied to a deeper tier ++ * - dirty => free: dirty sectors were overwritten or moved (copy gc) ++ * - cached => free: cached sectors were overwritten ++ * ++ * - metadata => free: metadata was freed ++ * ++ * Oddities: ++ * - cached => dirty: a device was removed so formerly replicated data ++ * is no longer sufficiently replicated ++ * - free => cached: cannot happen ++ * - free => dirty: cannot happen ++ * - free => metadata: cannot happen ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "ec.h" ++#include "error.h" ++#include "movinggc.h" ++#include "replicas.h" ++ ++#include ++#include ++ ++/* ++ * Clear journal_seq_valid for buckets for which it's not needed, to prevent ++ * wraparound: ++ */ ++void bch2_bucket_seq_cleanup(struct bch_fs *c) ++{ ++ u64 journal_seq = atomic64_read(&c->journal.seq); ++ u16 last_seq_ondisk = c->journal.last_seq_ondisk; ++ struct bch_dev *ca; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ struct bucket_mark m; ++ unsigned i; ++ ++ if (journal_seq - c->last_bucket_seq_cleanup < ++ (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) ++ return; ++ ++ c->last_bucket_seq_cleanup = journal_seq; ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) { ++ bucket_cmpxchg(g, m, ({ ++ if (!m.journal_seq_valid || ++ bucket_needs_journal_commit(m, last_seq_ondisk)) ++ break; ++ ++ m.journal_seq_valid = 0; ++ })); ++ } ++ up_read(&ca->bucket_lock); ++ } ++} ++ ++void bch2_fs_usage_initialize(struct bch_fs *c) ++{ ++ struct bch_fs_usage *usage; ++ unsigned i; ++ ++ percpu_down_write(&c->mark_lock); ++ usage = c->usage_base; ++ ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ usage->reserved += usage->persistent_reserved[i]; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ switch (e->data_type) { ++ case BCH_DATA_BTREE: ++ usage->btree += usage->replicas[i]; ++ break; ++ case BCH_DATA_USER: ++ usage->data += usage->replicas[i]; ++ break; ++ case BCH_DATA_CACHED: ++ usage->cached += usage->replicas[i]; ++ break; ++ } ++ } ++ ++ percpu_up_write(&c->mark_lock); ++} ++ ++void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) ++{ ++ if (fs_usage == c->usage_scratch) ++ mutex_unlock(&c->usage_scratch_lock); ++ else ++ kfree(fs_usage); ++} ++ ++struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) ++{ ++ struct bch_fs_usage *ret; ++ unsigned bytes = fs_usage_u64s(c) * sizeof(u64); ++ ++ ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); ++ if (ret) ++ return ret; ++ ++ if (mutex_trylock(&c->usage_scratch_lock)) ++ goto out_pool; ++ ++ ret = kzalloc(bytes, GFP_NOFS); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->usage_scratch_lock); ++out_pool: ++ ret = c->usage_scratch; ++ memset(ret, 0, bytes); ++ return ret; ++} ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_dev_usage ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ 
acc_u64s_percpu((u64 *) &ret, ++ (u64 __percpu *) ca->usage[0], ++ sizeof(ret) / sizeof(u64)); ++ ++ return ret; ++} ++ ++static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, ++ unsigned journal_seq, ++ bool gc) ++{ ++ return this_cpu_ptr(gc ++ ? c->usage_gc ++ : c->usage[journal_seq & 1]); ++} ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) ++{ ++ ssize_t offset = v - (u64 *) c->usage_base; ++ unsigned seq; ++ u64 ret; ++ ++ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ ret = *v + ++ percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + ++ percpu_u64_get((u64 __percpu *) c->usage[1] + offset); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) ++{ ++ struct bch_fs_usage *ret; ++ unsigned seq, v, u64s = fs_usage_u64s(c); ++retry: ++ ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); ++ if (unlikely(!ret)) ++ return NULL; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ v = fs_usage_u64s(c); ++ if (unlikely(u64s != v)) { ++ u64s = v; ++ percpu_up_read(&c->mark_lock); ++ kfree(ret); ++ goto retry; ++ } ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ memcpy(ret, c->usage_base, u64s * sizeof(u64)); ++ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); ++ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) ++{ ++ unsigned u64s = fs_usage_u64s(c); ++ ++ BUG_ON(idx >= 2); ++ ++ write_seqcount_begin(&c->usage_lock); ++ ++ acc_u64s_percpu((u64 *) c->usage_base, ++ (u64 __percpu *) c->usage[idx], u64s); ++ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); ++ ++ write_seqcount_end(&c->usage_lock); ++} ++ ++void bch2_fs_usage_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_fs_usage *fs_usage) ++{ ++ unsigned i; ++ ++ pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); ++ ++ pr_buf(out, "hidden:\t\t\t\t%llu\n", ++ fs_usage->hidden); ++ pr_buf(out, "data:\t\t\t\t%llu\n", ++ fs_usage->data); ++ pr_buf(out, "cached:\t\t\t\t%llu\n", ++ fs_usage->cached); ++ pr_buf(out, "reserved:\t\t\t%llu\n", ++ fs_usage->reserved); ++ pr_buf(out, "nr_inodes:\t\t\t%llu\n", ++ fs_usage->nr_inodes); ++ pr_buf(out, "online reserved:\t\t%llu\n", ++ fs_usage->online_reserved); ++ ++ for (i = 0; ++ i < ARRAY_SIZE(fs_usage->persistent_reserved); ++ i++) { ++ pr_buf(out, "%u replicas:\n", i + 1); ++ pr_buf(out, "\treserved:\t\t%llu\n", ++ fs_usage->persistent_reserved[i]); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ pr_buf(out, "\t"); ++ bch2_replicas_entry_to_text(out, e); ++ pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); ++ } ++} ++ ++#define RESERVE_FACTOR 6 ++ ++static u64 reserve_factor(u64 r) ++{ ++ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); ++} ++ ++static u64 avail_factor(u64 r) ++{ ++ return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); ++} ++ ++u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) ++{ ++ return min(fs_usage->hidden + ++ fs_usage->btree + ++ fs_usage->data + ++ reserve_factor(fs_usage->reserved + ++ fs_usage->online_reserved), ++ c->capacity); ++} ++ ++static struct bch_fs_usage_short ++__bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct 
bch_fs_usage_short ret; ++ u64 data, reserved; ++ ++ ret.capacity = c->capacity - ++ bch2_fs_usage_read_one(c, &c->usage_base->hidden); ++ ++ data = bch2_fs_usage_read_one(c, &c->usage_base->data) + ++ bch2_fs_usage_read_one(c, &c->usage_base->btree); ++ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + ++ bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); ++ ++ ret.used = min(ret.capacity, data + reserve_factor(reserved)); ++ ret.free = ret.capacity - ret.used; ++ ++ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); ++ ++ return ret; ++} ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct bch_fs_usage_short ret; ++ ++ percpu_down_read(&c->mark_lock); ++ ret = __bch2_fs_usage_read_short(c); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++static inline int is_unavailable_bucket(struct bucket_mark m) ++{ ++ return !is_available_bucket(m); ++} ++ ++static inline int is_fragmented_bucket(struct bucket_mark m, ++ struct bch_dev *ca) ++{ ++ if (!m.owned_by_allocator && ++ m.data_type == BCH_DATA_USER && ++ bucket_sectors_used(m)) ++ return max_t(int, 0, (int) ca->mi.bucket_size - ++ bucket_sectors_used(m)); ++ return 0; ++} ++ ++static inline int bucket_stripe_sectors(struct bucket_mark m) ++{ ++ return m.stripe ? m.dirty_sectors : 0; ++} ++ ++static inline enum bch_data_type bucket_type(struct bucket_mark m) ++{ ++ return m.cached_sectors && !m.dirty_sectors ++ ? BCH_DATA_CACHED ++ : m.data_type; ++} ++ ++static bool bucket_became_unavailable(struct bucket_mark old, ++ struct bucket_mark new) ++{ ++ return is_available_bucket(old) && ++ !is_available_bucket(new); ++} ++ ++int bch2_fs_usage_apply(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct disk_reservation *disk_res, ++ unsigned journal_seq) ++{ ++ s64 added = fs_usage->data + fs_usage->reserved; ++ s64 should_not_have_added; ++ int ret = 0; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ /* ++ * Not allowed to reduce sectors_available except by getting a ++ * reservation: ++ */ ++ should_not_have_added = added - (s64) (disk_res ? 
disk_res->sectors : 0); ++ if (WARN_ONCE(should_not_have_added > 0, ++ "disk usage increased by %lli without a reservation", ++ should_not_have_added)) { ++ atomic64_sub(should_not_have_added, &c->sectors_available); ++ added -= should_not_have_added; ++ ret = -1; ++ } ++ ++ if (added > 0) { ++ disk_res->sectors -= added; ++ fs_usage->online_reserved -= added; ++ } ++ ++ preempt_disable(); ++ acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), ++ (u64 *) fs_usage, fs_usage_u64s(c)); ++ preempt_enable(); ++ ++ return ret; ++} ++ ++static inline void account_bucket(struct bch_fs_usage *fs_usage, ++ struct bch_dev_usage *dev_usage, ++ enum bch_data_type type, ++ int nr, s64 size) ++{ ++ if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL) ++ fs_usage->hidden += size; ++ ++ dev_usage->buckets[type] += nr; ++} ++ ++static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ++ struct bch_fs_usage *fs_usage, ++ struct bucket_mark old, struct bucket_mark new, ++ bool gc) ++{ ++ struct bch_dev_usage *u; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ preempt_disable(); ++ u = this_cpu_ptr(ca->usage[gc]); ++ ++ if (bucket_type(old)) ++ account_bucket(fs_usage, u, bucket_type(old), ++ -1, -ca->mi.bucket_size); ++ ++ if (bucket_type(new)) ++ account_bucket(fs_usage, u, bucket_type(new), ++ 1, ca->mi.bucket_size); ++ ++ u->buckets_alloc += ++ (int) new.owned_by_allocator - (int) old.owned_by_allocator; ++ u->buckets_unavailable += ++ is_unavailable_bucket(new) - is_unavailable_bucket(old); ++ ++ u->buckets_ec += (int) new.stripe - (int) old.stripe; ++ u->sectors_ec += bucket_stripe_sectors(new) - ++ bucket_stripe_sectors(old); ++ ++ u->sectors[old.data_type] -= old.dirty_sectors; ++ u->sectors[new.data_type] += new.dirty_sectors; ++ u->sectors[BCH_DATA_CACHED] += ++ (int) new.cached_sectors - (int) old.cached_sectors; ++ u->sectors_fragmented += ++ is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); ++ preempt_enable(); ++ ++ if (!is_available_bucket(old) && is_available_bucket(new)) ++ bch2_wake_allocator(ca); ++} ++ ++void bch2_dev_usage_from_buckets(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct bucket_mark old = { .v.counter = 0 }; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ unsigned i; ++ int cpu; ++ ++ c->usage_base->hidden = 0; ++ ++ for_each_member_device(ca, c, i) { ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(ca->usage[0], cpu), 0, ++ sizeof(*ca->usage[0])); ++ ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ bch2_dev_usage_update(c, ca, c->usage_base, ++ old, g->mark, false); ++ } ++} ++ ++static inline int update_replicas(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ int idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) ++ return -1; ++ ++ if (!fs_usage) ++ return 0; ++ ++ switch (r->data_type) { ++ case BCH_DATA_BTREE: ++ fs_usage->btree += sectors; ++ break; ++ case BCH_DATA_USER: ++ fs_usage->data += sectors; ++ break; ++ case BCH_DATA_CACHED: ++ fs_usage->cached += sectors; ++ break; ++ } ++ fs_usage->replicas[idx] += sectors; ++ return 0; ++} ++ ++static inline void update_cached_sectors(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ unsigned dev, s64 sectors) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ update_replicas(c, fs_usage, &r.e, sectors); ++} ++ ++static struct replicas_delta_list * ++replicas_deltas_realloc(struct btree_trans *trans, unsigned more) ++{ ++ struct replicas_delta_list 
*d = trans->fs_usage_deltas; ++ unsigned new_size = d ? (d->size + more) * 2 : 128; ++ ++ if (!d || d->used + more > d->size) { ++ d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); ++ BUG_ON(!d); ++ ++ d->size = new_size; ++ trans->fs_usage_deltas = d; ++ } ++ return d; ++} ++ ++static inline void update_replicas_list(struct btree_trans *trans, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ struct replicas_delta_list *d; ++ struct replicas_delta *n; ++ unsigned b; ++ ++ if (!sectors) ++ return; ++ ++ b = replicas_entry_bytes(r) + 8; ++ d = replicas_deltas_realloc(trans, b); ++ ++ n = (void *) d->d + d->used; ++ n->delta = sectors; ++ memcpy(&n->r, r, replicas_entry_bytes(r)); ++ d->used += b; ++} ++ ++static inline void update_cached_sectors_list(struct btree_trans *trans, ++ unsigned dev, s64 sectors) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ update_replicas_list(trans, &r.e, sectors); ++} ++ ++static inline struct replicas_delta * ++replicas_delta_next(struct replicas_delta *d) ++{ ++ return (void *) d + replicas_entry_bytes(&d->r) + 8; ++} ++ ++int bch2_replicas_delta_list_apply(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct replicas_delta_list *r) ++{ ++ struct replicas_delta *d = r->d; ++ struct replicas_delta *top = (void *) r->d + r->used; ++ unsigned i; ++ ++ for (d = r->d; d != top; d = replicas_delta_next(d)) ++ if (update_replicas(c, fs_usage, &d->r, d->delta)) { ++ top = d; ++ goto unwind; ++ } ++ ++ if (!fs_usage) ++ return 0; ++ ++ fs_usage->nr_inodes += r->nr_inodes; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ fs_usage->reserved += r->persistent_reserved[i]; ++ fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; ++ } ++ ++ return 0; ++unwind: ++ for (d = r->d; d != top; d = replicas_delta_next(d)) ++ update_replicas(c, fs_usage, &d->r, -d->delta); ++ return -1; ++} ++ ++#define do_mark_fn(fn, c, pos, flags, ...) 
\ ++({ \ ++ int gc, ret = 0; \ ++ \ ++ percpu_rwsem_assert_held(&c->mark_lock); \ ++ \ ++ for (gc = 0; gc < 2 && !ret; gc++) \ ++ if (!gc == !(flags & BTREE_TRIGGER_GC) || \ ++ (gc && gc_visited(c, pos))) \ ++ ret = fn(c, __VA_ARGS__, gc); \ ++ ret; \ ++}) ++ ++static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark *ret, ++ bool gc) ++{ ++ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ BUG_ON(!is_available_bucket(new)); ++ ++ new.owned_by_allocator = true; ++ new.data_type = 0; ++ new.cached_sectors = 0; ++ new.dirty_sectors = 0; ++ new.gen++; ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ if (old.cached_sectors) ++ update_cached_sectors(c, fs_usage, ca->dev_idx, ++ -((s64) old.cached_sectors)); ++ ++ if (!gc) ++ *ret = old; ++ return 0; ++} ++ ++void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark *old) ++{ ++ do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, ++ ca, b, old); ++ ++ if (!old->owned_by_allocator && old->cached_sectors) ++ trace_invalidate(ca, bucket_to_sector(ca, b), ++ old->cached_sectors); ++} ++ ++static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, bool owned_by_allocator, ++ bool gc) ++{ ++ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.owned_by_allocator = owned_by_allocator; ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ BUG_ON(!gc && ++ !owned_by_allocator && !old.owned_by_allocator); ++ ++ return 0; ++} ++ ++void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, bool owned_by_allocator, ++ struct gc_pos pos, unsigned flags) ++{ ++ preempt_disable(); ++ ++ do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, ++ ca, b, owned_by_allocator); ++ ++ preempt_enable(); ++} ++ ++static int bch2_mark_alloc(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct bkey_alloc_unpacked u; ++ struct bch_dev *ca; ++ struct bucket *g; ++ struct bucket_mark old, m; ++ ++ /* ++ * alloc btree is read in by bch2_alloc_read, not gc: ++ */ ++ if ((flags & BTREE_TRIGGER_GC) && ++ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ ++ if (k.k->p.offset >= ca->mi.nbuckets) ++ return 0; ++ ++ g = __bucket(ca, k.k->p.offset, gc); ++ u = bch2_alloc_unpack(k); ++ ++ old = bucket_cmpxchg(g, m, ({ ++ m.gen = u.gen; ++ m.data_type = u.data_type; ++ m.dirty_sectors = u.dirty_sectors; ++ m.cached_sectors = u.cached_sectors; ++ ++ if (journal_seq) { ++ m.journal_seq_valid = 1; ++ m.journal_seq = journal_seq; ++ } ++ })); ++ ++ if (!(flags & BTREE_TRIGGER_ALLOC_READ)) ++ bch2_dev_usage_update(c, ca, fs_usage, old, m, gc); ++ ++ g->io_time[READ] = u.read_time; ++ g->io_time[WRITE] = u.write_time; ++ g->oldest_gen = u.oldest_gen; ++ g->gen_valid = 1; ++ ++ /* ++ * need to know if we're getting called from the invalidate path or ++ * not: ++ */ ++ ++ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && ++ old.cached_sectors) { ++ update_cached_sectors(c, fs_usage, ca->dev_idx, ++ -old.cached_sectors); ++ trace_invalidate(ca, bucket_to_sector(ca, k.k->p.offset), ++ 
old.cached_sectors); ++ } ++ ++ return 0; ++} ++ ++#define checked_add(a, b) \ ++({ \ ++ unsigned _res = (unsigned) (a) + (b); \ ++ bool overflow = _res > U16_MAX; \ ++ if (overflow) \ ++ _res = U16_MAX; \ ++ (a) = _res; \ ++ overflow; \ ++}) ++ ++static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type data_type, ++ unsigned sectors, bool gc) ++{ ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ bool overflow; ++ ++ BUG_ON(data_type != BCH_DATA_SB && ++ data_type != BCH_DATA_JOURNAL); ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.data_type = data_type; ++ overflow = checked_add(new.dirty_sectors, sectors); ++ })); ++ ++ bch2_fs_inconsistent_on(old.data_type && ++ old.data_type != data_type, c, ++ "different types of data in same bucket: %s, %s", ++ bch2_data_types[old.data_type], ++ bch2_data_types[data_type]); ++ ++ bch2_fs_inconsistent_on(overflow, c, ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", ++ ca->dev_idx, b, new.gen, ++ bch2_data_types[old.data_type ?: data_type], ++ old.dirty_sectors, sectors); ++ ++ if (c) ++ bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), ++ old, new, gc); ++ ++ return 0; ++} ++ ++void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type type, ++ unsigned sectors, struct gc_pos pos, ++ unsigned flags) ++{ ++ BUG_ON(type != BCH_DATA_SB && ++ type != BCH_DATA_JOURNAL); ++ ++ preempt_disable(); ++ ++ if (likely(c)) { ++ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, ++ ca, b, type, sectors); ++ } else { ++ __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); ++ } ++ ++ preempt_enable(); ++} ++ ++static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) ++{ ++ return DIV_ROUND_UP(sectors * n, d); ++} ++ ++static s64 __ptr_disk_sectors_delta(unsigned old_size, ++ unsigned offset, s64 delta, ++ unsigned flags, ++ unsigned n, unsigned d) ++{ ++ BUG_ON(!n || !d); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { ++ BUG_ON(offset + -delta > old_size); ++ ++ return -disk_sectors_scaled(n, d, old_size) + ++ disk_sectors_scaled(n, d, offset) + ++ disk_sectors_scaled(n, d, old_size - offset + delta); ++ } else if (flags & BTREE_TRIGGER_OVERWRITE) { ++ BUG_ON(offset + -delta > old_size); ++ ++ return -disk_sectors_scaled(n, d, old_size) + ++ disk_sectors_scaled(n, d, old_size + delta); ++ } else { ++ return disk_sectors_scaled(n, d, delta); ++ } ++} ++ ++static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, ++ unsigned offset, s64 delta, ++ unsigned flags) ++{ ++ return __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, delta, flags, ++ p.crc.compressed_size, ++ p.crc.uncompressed_size); ++} ++ ++static void bucket_set_stripe(struct bch_fs *c, ++ const struct bch_stripe *v, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, ++ unsigned flags) ++{ ++ bool enabled = !(flags & BTREE_TRIGGER_OVERWRITE); ++ bool gc = flags & BTREE_TRIGGER_GC; ++ unsigned i; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ const struct bch_extent_ptr *ptr = v->ptrs + i; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, gc); ++ struct bucket_mark new, old; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.stripe = enabled; ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; ++ } ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ /* ++ * XXX write repair code for these, flag stripe as possibly bad ++ 
*/ ++ if (old.gen != ptr->gen) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "stripe with stale pointer"); ++#if 0 ++ /* ++ * We'd like to check for these, but these checks don't work ++ * yet: ++ */ ++ if (old.stripe && enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "multiple stripes using same bucket"); ++ ++ if (!old.stripe && !enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "deleting stripe but bucket not marked as stripe bucket"); ++#endif ++ } ++} ++ ++static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 bucket_gen, u8 *bucket_data_type, ++ u16 *dirty_sectors, u16 *cached_sectors) ++{ ++ u16 *dst_sectors = !p.ptr.cached ++ ? dirty_sectors ++ : cached_sectors; ++ u16 orig_sectors = *dst_sectors; ++ char buf[200]; ++ ++ if (gen_after(p.ptr.gen, bucket_gen)) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (gen_cmp(bucket_gen, p.ptr.gen) >= 96U) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (bucket_gen != p.ptr.gen && !p.ptr.cached) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (bucket_gen != p.ptr.gen) ++ return 1; ++ ++ if (*bucket_data_type && *bucket_data_type != ptr_data_type) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type], ++ bch2_data_types[ptr_data_type], ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (checked_add(*dst_sectors, sectors)) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ orig_sectors, sectors, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ *bucket_data_type = *dirty_sectors || *cached_sectors ++ ? 
ptr_data_type : 0; ++ return 0; ++} ++ ++static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct bucket_mark old, new; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); ++ u8 bucket_data_type; ++ u64 v; ++ int ret; ++ ++ v = atomic64_read(&g->_mark.v); ++ do { ++ new.v.counter = old.v.counter = v; ++ bucket_data_type = new.data_type; ++ ++ ret = __mark_pointer(c, k, p, sectors, data_type, new.gen, ++ &bucket_data_type, ++ &new.dirty_sectors, ++ &new.cached_sectors); ++ if (ret) ++ return ret; ++ ++ new.data_type = bucket_data_type; ++ ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; ++ } ++ ++ if (flags & BTREE_TRIGGER_NOATOMIC) { ++ g->_mark = new; ++ break; ++ } ++ } while ((v = atomic64_cmpxchg(&g->_mark.v, ++ old.v.counter, ++ new.v.counter)) != old.v.counter); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ BUG_ON(!gc && bucket_became_unavailable(old, new)); ++ ++ return 0; ++} ++ ++static int bch2_mark_stripe_ptr(struct bch_fs *c, ++ struct bch_extent_stripe_ptr p, ++ enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ s64 sectors, unsigned flags, ++ struct bch_replicas_padded *r, ++ unsigned *nr_data, ++ unsigned *nr_parity) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct stripe *m; ++ unsigned old, new; ++ int blocks_nonempty_delta; ++ ++ m = genradix_ptr(&c->stripes[gc], p.idx); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ if (!m || !m->alive) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ return -EIO; ++ } ++ ++ BUG_ON(m->r.e.data_type != data_type); ++ ++ *nr_data = m->nr_blocks - m->nr_redundant; ++ *nr_parity = m->nr_redundant; ++ *r = m->r; ++ ++ old = m->block_sectors[p.block]; ++ m->block_sectors[p.block] += sectors; ++ new = m->block_sectors[p.block]; ++ ++ blocks_nonempty_delta = (int) !!new - (int) !!old; ++ if (blocks_nonempty_delta) { ++ m->blocks_nonempty += blocks_nonempty_delta; ++ ++ if (!gc) ++ bch2_stripes_heap_update(c, m, p.idx); ++ } ++ ++ m->dirty = true; ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ return 0; ++} ++ ++static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k, ++ unsigned offset, s64 sectors, ++ enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ unsigned journal_seq, unsigned flags) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ BUG_ON(!sectors); ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = data_type == BCH_DATA_BTREE ++ ? 
sectors ++ : ptr_disk_sectors_delta(p, offset, sectors, flags); ++ ++ ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, ++ fs_usage, journal_seq, flags); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) ++ update_cached_sectors(c, fs_usage, p.ptr.dev, ++ disk_sectors); ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ struct bch_replicas_padded ec_r; ++ unsigned nr_data, nr_parity; ++ s64 parity_sectors; ++ ++ ret = bch2_mark_stripe_ptr(c, p.ec, data_type, ++ fs_usage, disk_sectors, flags, ++ &ec_r, &nr_data, &nr_parity); ++ if (ret) ++ return ret; ++ ++ parity_sectors = ++ __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, sectors, flags, ++ p.crc.compressed_size * nr_parity, ++ p.crc.uncompressed_size * nr_data); ++ ++ update_replicas(c, fs_usage, &ec_r.e, ++ disk_sectors + parity_sectors); ++ ++ /* ++ * There may be other dirty pointers in this extent, but ++ * if so they're not required for mounting if we have an ++ * erasure coded pointer in this extent: ++ */ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) ++ update_replicas(c, fs_usage, &r.e, dirty_sectors); ++ ++ return 0; ++} ++ ++static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ size_t idx = s.k->p.offset; ++ struct stripe *m = genradix_ptr(&c->stripes[gc], idx); ++ unsigned i; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ if (!m || ((flags & BTREE_TRIGGER_OVERWRITE) && !m->alive)) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ bch_err_ratelimited(c, "error marking nonexistent stripe %zu", ++ idx); ++ return -1; ++ } ++ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) { ++ m->sectors = le16_to_cpu(s.v->sectors); ++ m->algorithm = s.v->algorithm; ++ m->nr_blocks = s.v->nr_blocks; ++ m->nr_redundant = s.v->nr_redundant; ++ ++ bch2_bkey_to_replicas(&m->r.e, k); ++ ++ /* ++ * XXX: account for stripes somehow here ++ */ ++#if 0 ++ update_replicas(c, fs_usage, &m->r.e, stripe_sectors); ++#endif ++ ++ /* gc recalculates these fields: */ ++ if (!(flags & BTREE_TRIGGER_GC)) { ++ for (i = 0; i < s.v->nr_blocks; i++) { ++ m->block_sectors[i] = ++ stripe_blockcount_get(s.v, i); ++ m->blocks_nonempty += !!m->block_sectors[i]; ++ } ++ } ++ ++ if (!gc) ++ bch2_stripes_heap_update(c, m, idx); ++ m->alive = true; ++ } else { ++ if (!gc) ++ bch2_stripes_heap_del(c, m, idx); ++ memset(m, 0, sizeof(*m)); ++ } ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ bucket_set_stripe(c, s.v, fs_usage, 0, flags); ++ return 0; ++} ++ ++static int bch2_mark_key_locked(struct bch_fs *c, ++ struct bkey_s_c k, ++ unsigned offset, s64 sectors, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ int ret = 0; ++ ++ preempt_disable(); ++ ++ if (!fs_usage || (flags & BTREE_TRIGGER_GC)) ++ fs_usage = fs_usage_ptr(c, journal_seq, ++ flags & BTREE_TRIGGER_GC); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_alloc: ++ ret = bch2_mark_alloc(c, k, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ++ ? 
c->opts.btree_node_size ++ : -c->opts.btree_node_size; ++ ++ ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_BTREE, ++ fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ ret = bch2_mark_extent(c, k, offset, sectors, BCH_DATA_USER, ++ fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_stripe: ++ ret = bch2_mark_stripe(c, k, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_inode: ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) ++ fs_usage->nr_inodes++; ++ else ++ fs_usage->nr_inodes--; ++ break; ++ case KEY_TYPE_reservation: { ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ ++ sectors *= replicas; ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(fs_usage->persistent_reserved)); ++ ++ fs_usage->reserved += sectors; ++ fs_usage->persistent_reserved[replicas - 1] += sectors; ++ break; ++ } ++ } ++ ++ preempt_enable(); ++ ++ return ret; ++} ++ ++int bch2_mark_key(struct bch_fs *c, struct bkey_s_c k, ++ unsigned offset, s64 sectors, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ int ret; ++ ++ percpu_down_read(&c->mark_lock); ++ ret = bch2_mark_key_locked(c, k, offset, sectors, ++ fs_usage, journal_seq, flags); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++inline int bch2_mark_overwrite(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c old, ++ struct bkey_i *new, ++ struct bch_fs_usage *fs_usage, ++ unsigned flags, ++ bool is_extents) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned offset = 0; ++ s64 sectors = -((s64) old.k->size); ++ ++ flags |= BTREE_TRIGGER_OVERWRITE; ++ ++ if (is_extents ++ ? bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0 ++ : bkey_cmp(new->k.p, old.k->p)) ++ return 0; ++ ++ if (is_extents) { ++ switch (bch2_extent_overlap(&new->k, old.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ offset = 0; ++ sectors = -((s64) old.k->size); ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = bkey_start_offset(&new->k) - ++ old.k->p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_FRONT: ++ offset = 0; ++ sectors = bkey_start_offset(old.k) - ++ new->k.p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = -((s64) new->k.size); ++ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; ++ break; ++ } ++ ++ BUG_ON(sectors >= 0); ++ } ++ ++ return bch2_mark_key_locked(c, old, offset, sectors, fs_usage, ++ trans->journal_res.seq, flags) ?: 1; ++} ++ ++int bch2_mark_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bch_fs_usage *fs_usage, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = iter_l(iter)->b; ++ struct btree_node_iter node_iter = iter_l(iter)->iter; ++ struct bkey_packed *_k; ++ int ret = 0; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(iter->btree_id)) ++ return 0; ++ ++ bch2_mark_key_locked(c, bkey_i_to_s_c(insert), ++ 0, insert->k.size, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|flags); ++ ++ if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) ++ return 0; ++ ++ /* ++ * For non extents, we only mark the new key, not the key being ++ * overwritten - unless we're actually deleting: ++ */ ++ if ((iter->btree_id == BTREE_ID_ALLOC || ++ iter->btree_id == BTREE_ID_EC) && ++ !bkey_deleted(&insert->k)) ++ return 0; ++ ++ while ((_k = 
bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ++ ++ ret = bch2_mark_overwrite(trans, iter, k, insert, ++ fs_usage, flags, ++ btree_node_type_is_extents(iter->btree_id)); ++ if (ret <= 0) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return ret; ++} ++ ++void bch2_trans_fs_usage_apply(struct btree_trans *trans, ++ struct bch_fs_usage *fs_usage) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ static int warned_disk_usage = 0; ++ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; ++ char buf[200]; ++ ++ if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, ++ trans->journal_res.seq) || ++ warned_disk_usage || ++ xchg(&warned_disk_usage, 1)) ++ return; ++ ++ bch_err(c, "disk usage increased more than %llu sectors reserved", ++ disk_res_sectors); ++ ++ trans_for_each_update(trans, i) { ++ pr_err("while inserting"); ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); ++ pr_err("%s", buf); ++ pr_err("overlapping with"); ++ ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { ++ struct btree *b = iter_l(i->iter)->b; ++ struct btree_node_iter node_iter = iter_l(i->iter)->iter; ++ struct bkey_packed *_k; ++ ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ ++ pr_info("_k %px format %u", _k, _k->format); ++ k = bkey_disassemble(b, _k, &unpacked); ++ ++ if (btree_node_is_extents(b) ++ ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 ++ : bkey_cmp(i->k->k.p, k.k->p)) ++ break; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ pr_err("%s", buf); ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ } else { ++ struct bkey_cached *ck = (void *) i->iter->l[0].b; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); ++ pr_err("%s", buf); ++ } ++ } ++} ++ ++/* trans_mark: */ ++ ++static struct btree_iter *trans_get_update(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ struct bkey_s_c *k) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ if (i->iter->btree_id == btree_id && ++ (btree_node_type_is_extents(btree_id) ++ ? bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && ++ bkey_cmp(pos, i->k->k.p) < 0 ++ : !bkey_cmp(pos, i->iter->pos))) { ++ *k = bkey_i_to_s_c(i->k); ++ return i->iter; ++ } ++ ++ return NULL; ++} ++ ++static int trans_get_key(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ struct btree_iter **iter, ++ struct bkey_s_c *k) ++{ ++ unsigned flags = btree_id != BTREE_ID_ALLOC ++ ? 
BTREE_ITER_SLOTS ++ : BTREE_ITER_CACHED; ++ int ret; ++ ++ *iter = trans_get_update(trans, btree_id, pos, k); ++ if (*iter) ++ return 1; ++ ++ *iter = bch2_trans_get_iter(trans, btree_id, pos, ++ flags|BTREE_ITER_INTENT); ++ if (IS_ERR(*iter)) ++ return PTR_ERR(*iter); ++ ++ *k = __bch2_btree_iter_peek(*iter, flags); ++ ret = bkey_err(*k); ++ if (ret) ++ bch2_trans_iter_put(trans, *iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)); ++ struct btree_iter *iter; ++ struct bkey_s_c k_a; ++ struct bkey_alloc_unpacked u; ++ struct bkey_i_alloc *a; ++ struct bucket *g; ++ int ret; ++ ++ iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a); ++ if (iter) { ++ u = bch2_alloc_unpack(k_a); ++ } else { ++ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto out; ++ ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, pos.offset); ++ u = alloc_mem_to_key(g, READ_ONCE(g->mark)); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, ++ &u.dirty_sectors, &u.cached_sectors); ++ if (ret) ++ goto out; ++ ++ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ bkey_alloc_init(&a->k_i); ++ a->k.p = pos; ++ bch2_alloc_pack(a, u); ++ bch2_trans_update(trans, iter, &a->k_i, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, ++ struct bch_extent_stripe_ptr p, ++ s64 sectors, enum bch_data_type data_type, ++ struct bch_replicas_padded *r, ++ unsigned *nr_data, ++ unsigned *nr_parity) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_i_stripe *s; ++ int ret = 0; ++ ++ ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); ++ if (ret < 0) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_stripe) { ++ bch2_fs_inconsistent(c, ++ "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ ret = -EIO; ++ goto out; ++ } ++ ++ s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ goto out; ++ ++ bkey_reassemble(&s->k_i, k); ++ ++ stripe_blockcount_set(&s->v, p.block, ++ stripe_blockcount_get(&s->v, p.block) + ++ sectors); ++ ++ *nr_data = s->v.nr_blocks - s->v.nr_redundant; ++ *nr_parity = s->v.nr_redundant; ++ bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); ++ bch2_trans_update(trans, iter, &s->k_i, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c k, unsigned offset, ++ s64 sectors, unsigned flags, ++ enum bch_data_type data_type) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ BUG_ON(!sectors); ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = data_type == 
BCH_DATA_BTREE ++ ? sectors ++ : ptr_disk_sectors_delta(p, offset, sectors, flags); ++ ++ ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, ++ data_type); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) ++ update_cached_sectors_list(trans, p.ptr.dev, ++ disk_sectors); ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ struct bch_replicas_padded ec_r; ++ unsigned nr_data, nr_parity; ++ s64 parity_sectors; ++ ++ ret = bch2_trans_mark_stripe_ptr(trans, p.ec, ++ disk_sectors, data_type, ++ &ec_r, &nr_data, &nr_parity); ++ if (ret) ++ return ret; ++ ++ parity_sectors = ++ __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, sectors, flags, ++ p.crc.compressed_size * nr_parity, ++ p.crc.uncompressed_size * nr_data); ++ ++ update_replicas_list(trans, &ec_r.e, ++ disk_sectors + parity_sectors); ++ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) ++ update_replicas_list(trans, &r.e, dirty_sectors); ++ ++ return 0; ++} ++ ++static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 idx, unsigned sectors, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_i_reflink_v *r_v; ++ s64 ret; ++ ++ ret = trans_get_key(trans, BTREE_ID_REFLINK, ++ POS(0, idx), &iter, &k); ++ if (ret < 0) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_reflink_v) { ++ bch2_fs_inconsistent(c, ++ "%llu:%llu len %u points to nonexistent indirect extent %llu", ++ p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if ((flags & BTREE_TRIGGER_OVERWRITE) && ++ (bkey_start_offset(k.k) < idx || ++ k.k->p.offset > idx + sectors)) ++ goto out; ++ ++ sectors = k.k->p.offset - idx; ++ ++ r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(r_v); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&r_v->k_i, k); ++ ++ le64_add_cpu(&r_v->v.refcount, ++ !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); ++ ++ if (!r_v->v.refcount) { ++ r_v->k.type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(&r_v->k, 0); ++ } ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); ++ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ ++ bch2_trans_update(trans, iter, &r_v->k_i, 0); ++out: ++ ret = sectors; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, unsigned offset, ++ s64 sectors, unsigned flags) ++{ ++ u64 idx = le64_to_cpu(p.v->idx) + offset; ++ s64 ret = 0; ++ ++ sectors = abs(sectors); ++ BUG_ON(offset + sectors > p.k->size); ++ ++ while (sectors) { ++ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); ++ if (ret < 0) ++ break; ++ ++ idx += ret; ++ sectors = max_t(s64, 0LL, sectors - ret); ++ ret = 0; ++ } ++ ++ return ret; ++} ++ ++int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, ++ unsigned offset, s64 sectors, unsigned flags) ++{ ++ struct replicas_delta_list *d; ++ struct bch_fs *c = trans->c; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ++ ? 
c->opts.btree_node_size ++ : -c->opts.btree_node_size; ++ ++ return bch2_trans_mark_extent(trans, k, offset, sectors, ++ flags, BCH_DATA_BTREE); ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return bch2_trans_mark_extent(trans, k, offset, sectors, ++ flags, BCH_DATA_USER); ++ case KEY_TYPE_inode: ++ d = replicas_deltas_realloc(trans, 0); ++ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) ++ d->nr_inodes++; ++ else ++ d->nr_inodes--; ++ return 0; ++ case KEY_TYPE_reservation: { ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ ++ d = replicas_deltas_realloc(trans, 0); ++ ++ sectors *= replicas; ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(d->persistent_reserved)); ++ ++ d->persistent_reserved[replicas - 1] += sectors; ++ return 0; ++ } ++ case KEY_TYPE_reflink_p: ++ return bch2_trans_mark_reflink_p(trans, ++ bkey_s_c_to_reflink_p(k), ++ offset, sectors, flags); ++ default: ++ return 0; ++ } ++} ++ ++int bch2_trans_mark_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ unsigned flags) ++{ ++ struct btree *b = iter_l(iter)->b; ++ struct btree_node_iter node_iter = iter_l(iter)->iter; ++ struct bkey_packed *_k; ++ int ret; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(iter->btree_id)) ++ return 0; ++ ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), ++ 0, insert->k.size, BTREE_TRIGGER_INSERT); ++ if (ret) ++ return ret; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NOOVERWRITES)) ++ return 0; ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ ++ return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), ++ 0, 0, BTREE_TRIGGER_OVERWRITE); ++ } ++ ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ unsigned offset = 0; ++ s64 sectors = 0; ++ unsigned flags = BTREE_TRIGGER_OVERWRITE; ++ ++ k = bkey_disassemble(b, _k, &unpacked); ++ ++ if (btree_node_is_extents(b) ++ ? 
bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 ++ : bkey_cmp(insert->k.p, k.k->p)) ++ break; ++ ++ if (btree_node_is_extents(b)) { ++ switch (bch2_extent_overlap(&insert->k, k.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ offset = 0; ++ sectors = -((s64) k.k->size); ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ sectors = bkey_start_offset(&insert->k) - ++ k.k->p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_FRONT: ++ offset = 0; ++ sectors = bkey_start_offset(k.k) - ++ insert->k.p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ sectors = -((s64) insert->k.size); ++ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; ++ break; ++ } ++ ++ BUG_ON(sectors >= 0); ++ } ++ ++ ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); ++ if (ret) ++ return ret; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return 0; ++} ++ ++/* Disk reservations: */ ++ ++static u64 bch2_recalc_sectors_available(struct bch_fs *c) ++{ ++ percpu_u64_set(&c->pcpu->sectors_available, 0); ++ ++ return avail_factor(__bch2_fs_usage_read_short(c).free); ++} ++ ++void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) ++{ ++ percpu_down_read(&c->mark_lock); ++ this_cpu_sub(c->usage[0]->online_reserved, ++ res->sectors); ++ percpu_up_read(&c->mark_lock); ++ ++ res->sectors = 0; ++} ++ ++#define SECTORS_CACHE 1024 ++ ++int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, ++ unsigned sectors, int flags) ++{ ++ struct bch_fs_pcpu *pcpu; ++ u64 old, v, get; ++ s64 sectors_available; ++ int ret; ++ ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ pcpu = this_cpu_ptr(c->pcpu); ++ ++ if (sectors <= pcpu->sectors_available) ++ goto out; ++ ++ v = atomic64_read(&c->sectors_available); ++ do { ++ old = v; ++ get = min((u64) sectors + SECTORS_CACHE, old); ++ ++ if (get < sectors) { ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ goto recalculate; ++ } ++ } while ((v = atomic64_cmpxchg(&c->sectors_available, ++ old, old - get)) != old); ++ ++ pcpu->sectors_available += get; ++ ++out: ++ pcpu->sectors_available -= sectors; ++ this_cpu_add(c->usage[0]->online_reserved, sectors); ++ res->sectors += sectors; ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ return 0; ++ ++recalculate: ++ percpu_down_write(&c->mark_lock); ++ ++ sectors_available = bch2_recalc_sectors_available(c); ++ ++ if (sectors <= sectors_available || ++ (flags & BCH_DISK_RESERVATION_NOFAIL)) { ++ atomic64_set(&c->sectors_available, ++ max_t(s64, 0, sectors_available - sectors)); ++ this_cpu_add(c->usage[0]->online_reserved, sectors); ++ res->sectors += sectors; ++ ret = 0; ++ } else { ++ atomic64_set(&c->sectors_available, sectors_available); ++ ret = -ENOSPC; ++ } ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return ret; ++} ++ ++/* Startup/shutdown: */ ++ ++static void buckets_free_rcu(struct rcu_head *rcu) ++{ ++ struct bucket_array *buckets = ++ container_of(rcu, struct bucket_array, rcu); ++ ++ kvpfree(buckets, ++ sizeof(struct bucket_array) + ++ buckets->nbuckets * sizeof(struct bucket)); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bucket_array *buckets = NULL, *old_buckets = NULL; ++ unsigned long *buckets_nouse = NULL; ++ alloc_fifo free[RESERVE_NR]; ++ alloc_fifo free_inc; ++ alloc_heap alloc_heap; ++ copygc_heap copygc_heap; ++ ++ size_t btree_reserve = 
DIV_ROUND_UP(BTREE_NODE_RESERVE, ++ ca->mi.bucket_size / c->opts.btree_node_size); ++ /* XXX: these should be tunable */ ++ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); ++ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); ++ size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), ++ btree_reserve * 2); ++ bool resize = ca->buckets[0] != NULL, ++ start_copygc = ca->copygc_thread != NULL; ++ int ret = -ENOMEM; ++ unsigned i; ++ ++ memset(&free, 0, sizeof(free)); ++ memset(&free_inc, 0, sizeof(free_inc)); ++ memset(&alloc_heap, 0, sizeof(alloc_heap)); ++ memset(&copygc_heap, 0, sizeof(copygc_heap)); ++ ++ if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + ++ nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO)) || ++ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * ++ sizeof(unsigned long), ++ GFP_KERNEL|__GFP_ZERO)) || ++ !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || ++ !init_fifo(&free[RESERVE_MOVINGGC], ++ copygc_reserve, GFP_KERNEL) || ++ !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || ++ !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || ++ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL) || ++ !init_heap(&copygc_heap, copygc_reserve, GFP_KERNEL)) ++ goto err; ++ ++ buckets->first_bucket = ca->mi.first_bucket; ++ buckets->nbuckets = nbuckets; ++ ++ bch2_copygc_stop(ca); ++ ++ if (resize) { ++ down_write(&c->gc_lock); ++ down_write(&ca->bucket_lock); ++ percpu_down_write(&c->mark_lock); ++ } ++ ++ old_buckets = bucket_array(ca); ++ ++ if (resize) { ++ size_t n = min(buckets->nbuckets, old_buckets->nbuckets); ++ ++ memcpy(buckets->b, ++ old_buckets->b, ++ n * sizeof(struct bucket)); ++ memcpy(buckets_nouse, ++ ca->buckets_nouse, ++ BITS_TO_LONGS(n) * sizeof(unsigned long)); ++ } ++ ++ rcu_assign_pointer(ca->buckets[0], buckets); ++ buckets = old_buckets; ++ ++ swap(ca->buckets_nouse, buckets_nouse); ++ ++ if (resize) { ++ percpu_up_write(&c->mark_lock); ++ up_write(&c->gc_lock); ++ } ++ ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) { ++ fifo_move(&free[i], &ca->free[i]); ++ swap(ca->free[i], free[i]); ++ } ++ fifo_move(&free_inc, &ca->free_inc); ++ swap(ca->free_inc, free_inc); ++ spin_unlock(&c->freelist_lock); ++ ++ /* with gc lock held, alloc_heap can't be in use: */ ++ swap(ca->alloc_heap, alloc_heap); ++ ++ /* and we shut down copygc: */ ++ swap(ca->copygc_heap, copygc_heap); ++ ++ nbuckets = ca->mi.nbuckets; ++ ++ if (resize) ++ up_write(&ca->bucket_lock); ++ ++ if (start_copygc && ++ bch2_copygc_start(c, ca)) ++ bch_err(ca, "error restarting copygc thread"); ++ ++ ret = 0; ++err: ++ free_heap(&copygc_heap); ++ free_heap(&alloc_heap); ++ free_fifo(&free_inc); ++ for (i = 0; i < RESERVE_NR; i++) ++ free_fifo(&free[i]); ++ kvpfree(buckets_nouse, ++ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); ++ if (buckets) ++ call_rcu(&old_buckets->rcu, buckets_free_rcu); ++ ++ return ret; ++} ++ ++void bch2_dev_buckets_free(struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ free_heap(&ca->copygc_heap); ++ free_heap(&ca->alloc_heap); ++ free_fifo(&ca->free_inc); ++ for (i = 0; i < RESERVE_NR; i++) ++ free_fifo(&ca->free[i]); ++ kvpfree(ca->buckets_nouse, ++ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); ++ kvpfree(rcu_dereference_protected(ca->buckets[0], 1), ++ sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket)); ++ ++ free_percpu(ca->usage[0]); ++} ++ ++int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ if (!(ca->usage[0] = alloc_percpu(struct 
bch_dev_usage))) ++ return -ENOMEM; ++ ++ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; ++} +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +new file mode 100644 +index 000000000000..97265fe90e96 +--- /dev/null ++++ b/fs/bcachefs/buckets.h +@@ -0,0 +1,327 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. ++ */ ++ ++#ifndef _BUCKETS_H ++#define _BUCKETS_H ++ ++#include "buckets_types.h" ++#include "super.h" ++ ++#define for_each_bucket(_b, _buckets) \ ++ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ ++ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) ++ ++#define bucket_cmpxchg(g, new, expr) \ ++({ \ ++ struct bucket *_g = g; \ ++ u64 _v = atomic64_read(&(g)->_mark.v); \ ++ struct bucket_mark _old; \ ++ \ ++ do { \ ++ (new).v.counter = _old.v.counter = _v; \ ++ expr; \ ++ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ ++ _old.v.counter, \ ++ (new).v.counter)) != _old.v.counter);\ ++ _old; \ ++}) ++ ++static inline struct bucket_array *__bucket_array(struct bch_dev *ca, ++ bool gc) ++{ ++ return rcu_dereference_check(ca->buckets[gc], ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ lockdep_is_held(&ca->bucket_lock)); ++} ++ ++static inline struct bucket_array *bucket_array(struct bch_dev *ca) ++{ ++ return __bucket_array(ca, false); ++} ++ ++static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) ++{ ++ struct bucket_array *buckets = __bucket_array(ca, gc); ++ ++ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); ++ return buckets->b + b; ++} ++ ++static inline struct bucket *bucket(struct bch_dev *ca, size_t b) ++{ ++ return __bucket(ca, b, false); ++} ++ ++static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, int rw) ++{ ++ bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand; ++} ++ ++static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) ++{ ++ return c->bucket_clock[rw].hand - g->io_time[rw]; ++} ++ ++/* ++ * bucket_gc_gen() returns the difference between the bucket's current gen and ++ * the oldest gen of any pointer into that bucket in the btree. ++ */ ++ ++static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) ++{ ++ struct bucket *g = bucket(ca, b); ++ ++ return g->mark.gen - g->oldest_gen; ++} ++ ++static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return sector_to_bucket(ca, ptr->offset); ++} ++ ++static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr, ++ bool gc) ++{ ++ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); ++} ++ ++static inline enum bch_data_type ptr_data_type(const struct bkey *k, ++ const struct bch_extent_ptr *ptr) ++{ ++ if (k->type == KEY_TYPE_btree_ptr || ++ k->type == KEY_TYPE_btree_ptr_v2) ++ return BCH_DATA_BTREE; ++ ++ return ptr->cached ? BCH_DATA_CACHED : BCH_DATA_USER; ++} ++ ++static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ struct bucket_mark m; ++ ++ rcu_read_lock(); ++ m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); ++ rcu_read_unlock(); ++ ++ return m; ++} ++ ++static inline int gen_cmp(u8 a, u8 b) ++{ ++ return (s8) (a - b); ++} ++ ++static inline int gen_after(u8 a, u8 b) ++{ ++ int r = gen_cmp(a, b); ++ ++ return r > 0 ? 
r : 0; ++} ++ ++/** ++ * ptr_stale() - check if a pointer points into a bucket that has been ++ * invalidated. ++ */ ++static inline u8 ptr_stale(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); ++} ++ ++static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, ++ unsigned live_size) ++{ ++ return live_size && p.crc.compression_type ++ ? max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, ++ p.crc.uncompressed_size)) ++ : live_size; ++} ++ ++static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) ++{ ++ return __ptr_disk_sectors(p, p.crc.live_size); ++} ++ ++/* bucket gc marks */ ++ ++static inline unsigned bucket_sectors_used(struct bucket_mark mark) ++{ ++ return mark.dirty_sectors + mark.cached_sectors; ++} ++ ++static inline bool bucket_unused(struct bucket_mark mark) ++{ ++ return !mark.owned_by_allocator && ++ !mark.data_type && ++ !bucket_sectors_used(mark); ++} ++ ++static inline bool is_available_bucket(struct bucket_mark mark) ++{ ++ return (!mark.owned_by_allocator && ++ !mark.dirty_sectors && ++ !mark.stripe); ++} ++ ++static inline bool bucket_needs_journal_commit(struct bucket_mark m, ++ u16 last_seq_ondisk) ++{ ++ return m.journal_seq_valid && ++ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); ++} ++ ++/* Device usage: */ ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *, struct bch_dev *); ++ ++void bch2_dev_usage_from_buckets(struct bch_fs *); ++ ++static inline u64 __dev_buckets_available(struct bch_dev *ca, ++ struct bch_dev_usage stats) ++{ ++ u64 total = ca->mi.nbuckets - ca->mi.first_bucket; ++ ++ if (WARN_ONCE(stats.buckets_unavailable > total, ++ "buckets_unavailable overflow (%llu > %llu)\n", ++ stats.buckets_unavailable, total)) ++ return 0; ++ ++ return total - stats.buckets_unavailable; ++} ++ ++/* ++ * Number of reclaimable buckets - only for use by the allocator thread: ++ */ ++static inline u64 dev_buckets_available(struct bch_fs *c, struct bch_dev *ca) ++{ ++ return __dev_buckets_available(ca, bch2_dev_usage_read(c, ca)); ++} ++ ++static inline u64 __dev_buckets_free(struct bch_dev *ca, ++ struct bch_dev_usage stats) ++{ ++ return __dev_buckets_available(ca, stats) + ++ fifo_used(&ca->free[RESERVE_NONE]) + ++ fifo_used(&ca->free_inc); ++} ++ ++static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca) ++{ ++ return __dev_buckets_free(ca, bch2_dev_usage_read(c, ca)); ++} ++ ++/* Filesystem usage: */ ++ ++static inline unsigned fs_usage_u64s(struct bch_fs *c) ++{ ++ ++ return sizeof(struct bch_fs_usage) / sizeof(u64) + ++ READ_ONCE(c->replicas.nr); ++} ++ ++void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); ++struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); ++ ++struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); ++ ++void bch2_fs_usage_to_text(struct printbuf *, ++ struct bch_fs *, struct bch_fs_usage *); ++ ++u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *); ++ ++/* key/bucket marking: */ ++ ++void bch2_bucket_seq_cleanup(struct bch_fs *); ++void bch2_fs_usage_initialize(struct bch_fs *); ++ ++void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, struct bucket_mark *); ++void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, bool, struct gc_pos, 
unsigned); ++void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, enum bch_data_type, unsigned, ++ struct gc_pos, unsigned); ++ ++int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, s64, ++ struct bch_fs_usage *, u64, unsigned); ++int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, ++ struct disk_reservation *, unsigned); ++ ++int bch2_mark_overwrite(struct btree_trans *, struct btree_iter *, ++ struct bkey_s_c, struct bkey_i *, ++ struct bch_fs_usage *, unsigned, bool); ++int bch2_mark_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct bch_fs_usage *, unsigned); ++ ++int bch2_replicas_delta_list_apply(struct bch_fs *, ++ struct bch_fs_usage *, ++ struct replicas_delta_list *); ++int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, ++ unsigned, s64, unsigned); ++int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, ++ struct bkey_i *insert, unsigned); ++void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); ++ ++/* disk reservations: */ ++ ++void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); ++ ++static inline void bch2_disk_reservation_put(struct bch_fs *c, ++ struct disk_reservation *res) ++{ ++ if (res->sectors) ++ __bch2_disk_reservation_put(c, res); ++} ++ ++#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) ++ ++int bch2_disk_reservation_add(struct bch_fs *, ++ struct disk_reservation *, ++ unsigned, int); ++ ++static inline struct disk_reservation ++bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) ++{ ++ return (struct disk_reservation) { ++ .sectors = 0, ++#if 0 ++ /* not used yet: */ ++ .gen = c->capacity_gen, ++#endif ++ .nr_replicas = nr_replicas, ++ }; ++} ++ ++static inline int bch2_disk_reservation_get(struct bch_fs *c, ++ struct disk_reservation *res, ++ unsigned sectors, ++ unsigned nr_replicas, ++ int flags) ++{ ++ *res = bch2_disk_reservation_init(c, nr_replicas); ++ ++ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); ++void bch2_dev_buckets_free(struct bch_dev *); ++int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); ++ ++#endif /* _BUCKETS_H */ +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +new file mode 100644 +index 000000000000..53f22726893d +--- /dev/null ++++ b/fs/bcachefs/buckets_types.h +@@ -0,0 +1,133 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_TYPES_H ++#define _BUCKETS_TYPES_H ++ ++#include "bcachefs_format.h" ++#include "util.h" ++ ++#define BUCKET_JOURNAL_SEQ_BITS 16 ++ ++struct bucket_mark { ++ union { ++ atomic64_t v; ++ ++ struct { ++ u8 gen; ++ u8 data_type:3, ++ owned_by_allocator:1, ++ journal_seq_valid:1, ++ stripe:1; ++ u16 dirty_sectors; ++ u16 cached_sectors; ++ ++ /* ++ * low bits of journal sequence number when this bucket was most ++ * recently modified: if journal_seq_valid is set, this bucket can't be ++ * reused until the journal sequence number written to disk is >= the ++ * bucket's journal sequence number: ++ */ ++ u16 journal_seq; ++ }; ++ }; ++}; ++ ++struct bucket { ++ union { ++ struct bucket_mark _mark; ++ const struct bucket_mark mark; ++ }; ++ ++ u16 io_time[2]; ++ u8 oldest_gen; ++ u8 gc_gen; ++ unsigned gen_valid:1; ++}; ++ ++struct bucket_array { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ struct bucket b[]; ++}; ++ ++struct bch_dev_usage { ++ u64 buckets[BCH_DATA_NR]; ++ u64 buckets_alloc; 
++ u64 buckets_unavailable; ++ ++ /* _compressed_ sectors: */ ++ u64 sectors[BCH_DATA_NR]; ++ u64 sectors_fragmented; ++ ++ u64 buckets_ec; ++ u64 sectors_ec; ++}; ++ ++struct bch_fs_usage { ++ /* all fields are in units of 512 byte sectors: */ ++ ++ u64 online_reserved; ++ ++ /* fields after online_reserved are cleared/recalculated by gc: */ ++ u64 gc_start[0]; ++ ++ u64 hidden; ++ u64 btree; ++ u64 data; ++ u64 cached; ++ u64 reserved; ++ u64 nr_inodes; ++ ++ /* XXX: add stats for compression ratio */ ++#if 0 ++ u64 uncompressed; ++ u64 compressed; ++#endif ++ ++ /* broken out: */ ++ ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ u64 replicas[]; ++}; ++ ++struct bch_fs_usage_short { ++ u64 capacity; ++ u64 used; ++ u64 free; ++ u64 nr_inodes; ++}; ++ ++struct replicas_delta { ++ s64 delta; ++ struct bch_replicas_entry r; ++} __packed; ++ ++struct replicas_delta_list { ++ unsigned size; ++ unsigned used; ++ ++ struct {} memset_start; ++ u64 nr_inodes; ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ struct {} memset_end; ++ struct replicas_delta d[0]; ++}; ++ ++/* ++ * A reservation for space on disk: ++ */ ++struct disk_reservation { ++ u64 sectors; ++ u32 gen; ++ unsigned nr_replicas; ++}; ++ ++struct copygc_heap_entry { ++ u8 gen; ++ u32 sectors; ++ u64 offset; ++}; ++ ++typedef HEAP(struct copygc_heap_entry) copygc_heap; ++ ++#endif /* _BUCKETS_TYPES_H */ +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +new file mode 100644 +index 000000000000..3af521947502 +--- /dev/null ++++ b/fs/bcachefs/chardev.c +@@ -0,0 +1,704 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_CHARDEV ++ ++#include "bcachefs.h" ++#include "bcachefs_ioctl.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "move.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* returns with ref on ca->ref */ ++static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, ++ unsigned flags) ++{ ++ struct bch_dev *ca; ++ ++ if (flags & BCH_BY_INDEX) { ++ if (dev >= c->sb.nr_devices) ++ return ERR_PTR(-EINVAL); ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ if (!ca) ++ return ERR_PTR(-EINVAL); ++ } else { ++ char *path; ++ ++ path = strndup_user((const char __user *) ++ (unsigned long) dev, PATH_MAX); ++ if (IS_ERR(path)) ++ return ERR_CAST(path); ++ ++ ca = bch2_dev_lookup(c, path); ++ kfree(path); ++ } ++ ++ return ca; ++} ++ ++#if 0 ++static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) ++{ ++ struct bch_ioctl_assemble arg; ++ struct bch_fs *c; ++ u64 *user_devs = NULL; ++ char **devs = NULL; ++ unsigned i; ++ int ret = -EFAULT; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); ++ if (!user_devs) ++ return -ENOMEM; ++ ++ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); ++ ++ if (copy_from_user(user_devs, user_arg->devs, ++ sizeof(u64) * arg.nr_devs)) ++ goto err; ++ ++ for (i = 0; i < arg.nr_devs; i++) { ++ devs[i] = strndup_user((const char __user *)(unsigned long) ++ user_devs[i], ++ PATH_MAX); ++ if (!devs[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); ++ ret = PTR_ERR_OR_ZERO(c); ++ if (!ret) ++ closure_put(&c->cl); 
++err: ++ if (devs) ++ for (i = 0; i < arg.nr_devs; i++) ++ kfree(devs[i]); ++ kfree(devs); ++ return ret; ++} ++ ++static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) ++{ ++ struct bch_ioctl_incremental arg; ++ const char *err; ++ char *path; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ err = bch2_fs_open_incremental(path); ++ kfree(path); ++ ++ if (err) { ++ pr_err("Could not register bcachefs devices: %s", err); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++#endif ++ ++static long bch2_global_ioctl(unsigned cmd, void __user *arg) ++{ ++ switch (cmd) { ++#if 0 ++ case BCH_IOCTL_ASSEMBLE: ++ return bch2_ioctl_assemble(arg); ++ case BCH_IOCTL_INCREMENTAL: ++ return bch2_ioctl_incremental(arg); ++#endif ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static long bch2_ioctl_query_uuid(struct bch_fs *c, ++ struct bch_ioctl_query_uuid __user *user_arg) ++{ ++ return copy_to_user(&user_arg->uuid, ++ &c->sb.user_uuid, ++ sizeof(c->sb.user_uuid)); ++} ++ ++#if 0 ++static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) ++{ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ return bch2_fs_start(c); ++} ++ ++static long bch2_ioctl_stop(struct bch_fs *c) ++{ ++ bch2_fs_stop(c); ++ return 0; ++} ++#endif ++ ++static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ char *path; ++ int ret; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_add(c, path); ++ kfree(path); ++ ++ return ret; ++} ++ ++static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ return bch2_dev_remove(c, ca, arg.flags); ++} ++ ++static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ char *path; ++ int ret; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_online(c, path); ++ kfree(path); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_offline(c, ca, arg.flags); ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_set_state(struct bch_fs *c, ++ struct bch_ioctl_disk_set_state arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad[0] || arg.pad[1] || arg.pad[2]) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_set_state(c, ca, arg.new_state, 
arg.flags); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++struct bch_data_ctx { ++ struct bch_fs *c; ++ struct bch_ioctl_data arg; ++ struct bch_move_stats stats; ++ ++ int ret; ++ ++ struct task_struct *thread; ++}; ++ ++static int bch2_data_thread(void *arg) ++{ ++ struct bch_data_ctx *ctx = arg; ++ ++ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); ++ ++ ctx->stats.data_type = U8_MAX; ++ return 0; ++} ++ ++static int bch2_data_job_release(struct inode *inode, struct file *file) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ ++ kthread_stop(ctx->thread); ++ put_task_struct(ctx->thread); ++ kfree(ctx); ++ return 0; ++} ++ ++static ssize_t bch2_data_job_read(struct file *file, char __user *buf, ++ size_t len, loff_t *ppos) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ struct bch_fs *c = ctx->c; ++ struct bch_ioctl_data_event e = { ++ .type = BCH_DATA_EVENT_PROGRESS, ++ .p.data_type = ctx->stats.data_type, ++ .p.btree_id = ctx->stats.btree_id, ++ .p.pos = ctx->stats.pos, ++ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), ++ .p.sectors_total = bch2_fs_usage_read_short(c).used, ++ }; ++ ++ if (len < sizeof(e)) ++ return -EINVAL; ++ ++ return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); ++} ++ ++static const struct file_operations bcachefs_data_ops = { ++ .release = bch2_data_job_release, ++ .read = bch2_data_job_read, ++ .llseek = no_llseek, ++}; ++ ++static long bch2_ioctl_data(struct bch_fs *c, ++ struct bch_ioctl_data arg) ++{ ++ struct bch_data_ctx *ctx = NULL; ++ struct file *file = NULL; ++ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; ++ int ret, fd = -1; ++ ++ if (arg.op >= BCH_DATA_OP_NR || arg.flags) ++ return -EINVAL; ++ ++ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); ++ if (!ctx) ++ return -ENOMEM; ++ ++ ctx->c = c; ++ ctx->arg = arg; ++ ++ ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); ++ if (IS_ERR(ctx->thread)) { ++ ret = PTR_ERR(ctx->thread); ++ goto err; ++ } ++ ++ ret = get_unused_fd_flags(flags); ++ if (ret < 0) ++ goto err; ++ fd = ret; ++ ++ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); ++ if (IS_ERR(file)) { ++ ret = PTR_ERR(file); ++ goto err; ++ } ++ ++ fd_install(fd, file); ++ ++ get_task_struct(ctx->thread); ++ wake_up_process(ctx->thread); ++ ++ return fd; ++err: ++ if (fd >= 0) ++ put_unused_fd(fd); ++ if (!IS_ERR_OR_NULL(ctx->thread)) ++ kthread_stop(ctx->thread); ++ kfree(ctx); ++ return ret; ++} ++ ++static long bch2_ioctl_fs_usage(struct bch_fs *c, ++ struct bch_ioctl_fs_usage __user *user_arg) ++{ ++ struct bch_ioctl_fs_usage *arg = NULL; ++ struct bch_replicas_usage *dst_e, *dst_end; ++ struct bch_fs_usage *src; ++ u32 replica_entries_bytes; ++ unsigned i; ++ int ret = 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) ++ return -EFAULT; ++ ++ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); ++ if (!arg) ++ return -ENOMEM; ++ ++ src = bch2_fs_usage_read(c); ++ if (!src) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ arg->capacity = c->capacity; ++ arg->used = bch2_fs_sectors_used(c, src); ++ arg->online_reserved = src->online_reserved; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ arg->persistent_reserved[i] = src->persistent_reserved[i]; ++ ++ dst_e = arg->replicas; ++ dst_end = (void *) arg->replicas + replica_entries_bytes; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *src_e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ 
if (replicas_usage_next(dst_e) > dst_end) { ++ ret = -ERANGE; ++ break; ++ } ++ ++ dst_e->sectors = src->replicas[i]; ++ dst_e->r = *src_e; ++ ++ /* recheck after setting nr_devs: */ ++ if (replicas_usage_next(dst_e) > dst_end) { ++ ret = -ERANGE; ++ break; ++ } ++ ++ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); ++ ++ dst_e = replicas_usage_next(dst_e); ++ } ++ ++ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; ++ ++ percpu_up_read(&c->mark_lock); ++ kfree(src); ++ ++ if (!ret) ++ ret = copy_to_user(user_arg, arg, ++ sizeof(*arg) + arg->replica_entries_bytes); ++err: ++ kfree(arg); ++ return ret; ++} ++ ++static long bch2_ioctl_dev_usage(struct bch_fs *c, ++ struct bch_ioctl_dev_usage __user *user_arg) ++{ ++ struct bch_ioctl_dev_usage arg; ++ struct bch_dev_usage src; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad[0] || ++ arg.pad[1] || ++ arg.pad[2]) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ src = bch2_dev_usage_read(c, ca); ++ ++ arg.state = ca->mi.state; ++ arg.bucket_size = ca->mi.bucket_size; ++ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; ++ arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; ++ arg.ec_buckets = src.buckets_ec; ++ arg.ec_sectors = src.sectors_ec; ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ arg.buckets[i] = src.buckets[i]; ++ arg.sectors[i] = src.sectors[i]; ++ } ++ ++ percpu_ref_put(&ca->ref); ++ ++ return copy_to_user(user_arg, &arg, sizeof(arg)); ++} ++ ++static long bch2_ioctl_read_super(struct bch_fs *c, ++ struct bch_ioctl_read_super arg) ++{ ++ struct bch_dev *ca = NULL; ++ struct bch_sb *sb; ++ int ret = 0; ++ ++ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || ++ arg.pad) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (arg.flags & BCH_READ_DEV) { ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ ++ if (IS_ERR(ca)) { ++ ret = PTR_ERR(ca); ++ goto err; ++ } ++ ++ sb = ca->disk_sb.sb; ++ } else { ++ sb = c->disk_sb.sb; ++ } ++ ++ if (vstruct_bytes(sb) > arg.size) { ++ ret = -ERANGE; ++ goto err; ++ } ++ ++ ret = copy_to_user((void __user *)(unsigned long)arg.sb, ++ sb, vstruct_bytes(sb)); ++err: ++ if (ca) ++ percpu_ref_put(&ca->ref); ++ mutex_unlock(&c->sb_lock); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_get_idx(struct bch_fs *c, ++ struct bch_ioctl_disk_get_idx arg) ++{ ++ dev_t dev = huge_decode_dev(arg.dev); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ if (ca->disk_sb.bdev->bd_dev == dev) { ++ percpu_ref_put(&ca->io_ref); ++ return i; ++ } ++ ++ return -ENOENT; ++} ++ ++static long bch2_ioctl_disk_resize(struct bch_fs *c, ++ struct bch_ioctl_disk_resize arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_resize(c, ca, arg.nbuckets); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++#define BCH_IOCTL(_name, _argtype) \ ++do { \ ++ _argtype i; \ ++ \ ++ if (copy_from_user(&i, arg, sizeof(i))) \ ++ return -EFAULT; \ ++ return bch2_ioctl_##_name(c, i); \ ++} while (0) ++ ++long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) ++{ ++ /* ioctls that don't require admin cap: */ ++ switch (cmd) 
{ ++ case BCH_IOCTL_QUERY_UUID: ++ return bch2_ioctl_query_uuid(c, arg); ++ case BCH_IOCTL_FS_USAGE: ++ return bch2_ioctl_fs_usage(c, arg); ++ case BCH_IOCTL_DEV_USAGE: ++ return bch2_ioctl_dev_usage(c, arg); ++ } ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ switch (cmd) { ++#if 0 ++ case BCH_IOCTL_START: ++ BCH_IOCTL(start, struct bch_ioctl_start); ++ case BCH_IOCTL_STOP: ++ return bch2_ioctl_stop(c); ++#endif ++ case BCH_IOCTL_READ_SUPER: ++ BCH_IOCTL(read_super, struct bch_ioctl_read_super); ++ case BCH_IOCTL_DISK_GET_IDX: ++ BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); ++ } ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ /* ioctls that do require admin cap: */ ++ switch (cmd) { ++ case BCH_IOCTL_DISK_ADD: ++ BCH_IOCTL(disk_add, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_REMOVE: ++ BCH_IOCTL(disk_remove, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_ONLINE: ++ BCH_IOCTL(disk_online, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_OFFLINE: ++ BCH_IOCTL(disk_offline, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_SET_STATE: ++ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); ++ case BCH_IOCTL_DATA: ++ BCH_IOCTL(data, struct bch_ioctl_data); ++ case BCH_IOCTL_DISK_RESIZE: ++ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); ++ ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static DEFINE_IDR(bch_chardev_minor); ++ ++static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) ++{ ++ unsigned minor = iminor(file_inode(filp)); ++ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; ++ void __user *arg = (void __user *) v; ++ ++ return c ++ ? bch2_fs_ioctl(c, cmd, arg) ++ : bch2_global_ioctl(cmd, arg); ++} ++ ++static const struct file_operations bch_chardev_fops = { ++ .owner = THIS_MODULE, ++ .unlocked_ioctl = bch2_chardev_ioctl, ++ .open = nonseekable_open, ++}; ++ ++static int bch_chardev_major; ++static struct class *bch_chardev_class; ++static struct device *bch_chardev; ++ ++void bch2_fs_chardev_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->chardev)) ++ device_unregister(c->chardev); ++ if (c->minor >= 0) ++ idr_remove(&bch_chardev_minor, c->minor); ++} ++ ++int bch2_fs_chardev_init(struct bch_fs *c) ++{ ++ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); ++ if (c->minor < 0) ++ return c->minor; ++ ++ c->chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, c->minor), c, ++ "bcachefs%u-ctl", c->minor); ++ if (IS_ERR(c->chardev)) ++ return PTR_ERR(c->chardev); ++ ++ return 0; ++} ++ ++void bch2_chardev_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ device_destroy(bch_chardev_class, ++ MKDEV(bch_chardev_major, U8_MAX)); ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ class_destroy(bch_chardev_class); ++ if (bch_chardev_major > 0) ++ unregister_chrdev(bch_chardev_major, "bcachefs"); ++} ++ ++int __init bch2_chardev_init(void) ++{ ++ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); ++ if (bch_chardev_major < 0) ++ return bch_chardev_major; ++ ++ bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); ++ if (IS_ERR(bch_chardev_class)) ++ return PTR_ERR(bch_chardev_class); ++ ++ bch_chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, U8_MAX), ++ NULL, "bcachefs-ctl"); ++ if (IS_ERR(bch_chardev)) ++ return PTR_ERR(bch_chardev); ++ ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_CHARDEV */ +diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h +new file mode 100644 +index 
000000000000..3a4890d39ff9 +--- /dev/null ++++ b/fs/bcachefs/chardev.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHARDEV_H ++#define _BCACHEFS_CHARDEV_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); ++ ++void bch2_fs_chardev_exit(struct bch_fs *); ++int bch2_fs_chardev_init(struct bch_fs *); ++ ++void bch2_chardev_exit(void); ++int __init bch2_chardev_init(void); ++ ++#else ++ ++static inline long bch2_fs_ioctl(struct bch_fs *c, ++ unsigned cmd, void __user * arg) ++{ ++ return -ENOSYS; ++} ++ ++static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} ++static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } ++ ++static inline void bch2_chardev_exit(void) {} ++static inline int __init bch2_chardev_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_CHARDEV_H */ +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +new file mode 100644 +index 000000000000..3d88719ba86c +--- /dev/null ++++ b/fs/bcachefs/checksum.c +@@ -0,0 +1,618 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static u64 bch2_checksum_init(unsigned type) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ return U32_MAX; ++ case BCH_CSUM_CRC64_NONZERO: ++ return U64_MAX; ++ case BCH_CSUM_CRC32C: ++ return 0; ++ case BCH_CSUM_CRC64: ++ return 0; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 bch2_checksum_final(unsigned type, u64 crc) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ return crc ^ U32_MAX; ++ case BCH_CSUM_CRC64_NONZERO: ++ return crc ^ U64_MAX; ++ case BCH_CSUM_CRC32C: ++ return crc; ++ case BCH_CSUM_CRC64: ++ return crc; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC32C: ++ return crc32c(crc, data, len); ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC64: ++ return crc64_be(crc, data, len); ++ default: ++ BUG(); ++ } ++} ++ ++static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ struct scatterlist *sg, size_t len) ++{ ++ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); ++ int ret; ++ ++ skcipher_request_set_sync_tfm(req, tfm); ++ skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ++ ++ ret = crypto_skcipher_encrypt(req); ++ BUG_ON(ret); ++} ++ ++static inline void do_encrypt(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ void *buf, size_t len) ++{ ++ struct scatterlist sg; ++ ++ sg_init_one(&sg, buf, len); ++ do_encrypt_sg(tfm, nonce, &sg, len); ++} ++ ++int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, ++ void *buf, size_t len) ++{ ++ struct crypto_sync_skcipher *chacha20 = ++ crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ int ret; ++ ++ if (!chacha20) { ++ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); ++ return PTR_ERR(chacha20); ++ } ++ ++ ret = crypto_skcipher_setkey(&chacha20->base, ++ (void *) key, sizeof(*key)); ++ if (ret) { ++ pr_err("crypto_skcipher_setkey() error: %i", ret); ++ goto err; ++ } ++ ++ do_encrypt(chacha20, nonce, buf, len); ++err: ++ 
crypto_free_sync_skcipher(chacha20); ++ return ret; ++} ++ ++static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, ++ struct nonce nonce) ++{ ++ u8 key[POLY1305_KEY_SIZE]; ++ ++ nonce.d[3] ^= BCH_NONCE_POLY; ++ ++ memset(key, 0, sizeof(key)); ++ do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ ++ desc->tfm = c->poly1305; ++ crypto_shash_init(desc); ++ crypto_shash_update(desc, key, sizeof(key)); ++} ++ ++struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, ++ struct nonce nonce, const void *data, size_t len) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: { ++ u64 crc = bch2_checksum_init(type); ++ ++ crc = bch2_checksum_update(type, crc, data, len); ++ crc = bch2_checksum_final(type, crc); ++ ++ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; ++ } ++ ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++ crypto_shash_update(desc, data, len); ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_encrypt(struct bch_fs *c, unsigned type, ++ struct nonce nonce, void *data, size_t len) ++{ ++ if (!bch2_csum_type_is_encryption(type)) ++ return; ++ ++ do_encrypt(c->chacha20, nonce, data, len); ++} ++ ++static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio, ++ struct bvec_iter *iter) ++{ ++ struct bio_vec bv; ++ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return (struct bch_csum) { 0 }; ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: { ++ u64 crc = bch2_checksum_init(type); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ crc = bch2_checksum_update(type, ++ crc, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ crc = bch2_checksum_update(type, crc, ++ page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ crc = bch2_checksum_final(type, crc); ++ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; ++ } ++ ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ ++ crypto_shash_update(desc, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ crypto_shash_update(desc, ++ page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ ++ return __bch2_checksum_bio(c, type, nonce, bio, &iter); ++} ++ ++void bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ struct scatterlist sgl[16], *sg 
= sgl; ++ size_t bytes = 0; ++ ++ if (!bch2_csum_type_is_encryption(type)) ++ return; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ ++ bio_for_each_segment(bv, bio, iter) { ++ if (sg == sgl + ARRAY_SIZE(sgl)) { ++ sg_mark_end(sg - 1); ++ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ ++ nonce = nonce_add(nonce, bytes); ++ bytes = 0; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ sg = sgl; ++ } ++ ++ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); ++ bytes += bv.bv_len; ++ } ++ ++ sg_mark_end(sg - 1); ++ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++} ++ ++struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, ++ struct bch_csum b, size_t b_len) ++{ ++ BUG_ON(!bch2_checksum_mergeable(type)); ++ ++ while (b_len) { ++ unsigned b = min_t(unsigned, b_len, PAGE_SIZE); ++ ++ a.lo = bch2_checksum_update(type, a.lo, ++ page_address(ZERO_PAGE(0)), b); ++ b_len -= b; ++ } ++ ++ a.lo ^= b.lo; ++ a.hi ^= b.hi; ++ return a; ++} ++ ++int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc_old, ++ struct bch_extent_crc_unpacked *crc_a, ++ struct bch_extent_crc_unpacked *crc_b, ++ unsigned len_a, unsigned len_b, ++ unsigned new_csum_type) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ struct nonce nonce = extent_nonce(version, crc_old); ++ struct bch_csum merged = { 0 }; ++ struct crc_split { ++ struct bch_extent_crc_unpacked *crc; ++ unsigned len; ++ unsigned csum_type; ++ struct bch_csum csum; ++ } splits[3] = { ++ { crc_a, len_a, new_csum_type }, ++ { crc_b, len_b, new_csum_type }, ++ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, ++ }, *i; ++ bool mergeable = crc_old.csum_type == new_csum_type && ++ bch2_checksum_mergeable(new_csum_type); ++ unsigned crc_nonce = crc_old.nonce; ++ ++ BUG_ON(len_a + len_b > bio_sectors(bio)); ++ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); ++ BUG_ON(crc_is_compressed(crc_old)); ++ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)); ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ iter.bi_size = i->len << 9; ++ if (mergeable || i->crc) ++ i->csum = __bch2_checksum_bio(c, i->csum_type, ++ nonce, bio, &iter); ++ else ++ bio_advance_iter(bio, &iter, i->len << 9); ++ nonce = nonce_add(nonce, i->len << 9); ++ } ++ ++ if (mergeable) ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) ++ merged = bch2_checksum_merge(new_csum_type, merged, ++ i->csum, i->len << 9); ++ else ++ merged = bch2_checksum_bio(c, crc_old.csum_type, ++ extent_nonce(version, crc_old), bio); ++ ++ if (bch2_crc_cmp(merged, crc_old.csum)) ++ return -EIO; ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ if (i->crc) ++ *i->crc = (struct bch_extent_crc_unpacked) { ++ .csum_type = i->csum_type, ++ .compression_type = crc_old.compression_type, ++ .compressed_size = i->len, ++ .uncompressed_size = i->len, ++ .offset = 0, ++ .live_size = i->len, ++ .nonce = crc_nonce, ++ .csum = i->csum, ++ }; ++ ++ if (bch2_csum_type_is_encryption(new_csum_type)) ++ crc_nonce += i->len; ++ } ++ ++ return 0; ++} ++ ++#ifdef __KERNEL__ ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ char key_description[60]; ++ struct key *keyring_key; ++ const struct user_key_payload *ukp; ++ int ret; ++ ++ snprintf(key_description, sizeof(key_description), ++ "bcachefs:%pUb", &sb->user_uuid); ++ ++ keyring_key = request_key(&key_type_logon, key_description, NULL); ++ if (IS_ERR(keyring_key)) ++ return 
PTR_ERR(keyring_key); ++ ++ down_read(&keyring_key->sem); ++ ukp = dereference_key_locked(keyring_key); ++ if (ukp->datalen == sizeof(*key)) { ++ memcpy(key, ukp->data, ukp->datalen); ++ ret = 0; ++ } else { ++ ret = -EINVAL; ++ } ++ up_read(&keyring_key->sem); ++ key_put(keyring_key); ++ ++ return ret; ++} ++#else ++#include ++#include ++ ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ key_serial_t key_id; ++ char key_description[60]; ++ char uuid[40]; ++ ++ uuid_unparse_lower(sb->user_uuid.b, uuid); ++ sprintf(key_description, "bcachefs:%s", uuid); ++ ++ key_id = request_key("user", key_description, NULL, ++ KEY_SPEC_USER_KEYRING); ++ if (key_id < 0) ++ return -errno; ++ ++ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) ++ return -1; ++ ++ return 0; ++} ++#endif ++ ++int bch2_decrypt_sb_key(struct bch_fs *c, ++ struct bch_sb_field_crypt *crypt, ++ struct bch_key *key) ++{ ++ struct bch_encrypted_key sb_key = crypt->key; ++ struct bch_key user_key; ++ int ret = 0; ++ ++ /* is key encrypted? */ ++ if (!bch2_key_is_encrypted(&sb_key)) ++ goto out; ++ ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %i", ret); ++ goto err; ++ } ++ ++ /* decrypt real key: */ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &sb_key, sizeof(sb_key)); ++ if (ret) ++ goto err; ++ ++ if (bch2_key_is_encrypted(&sb_key)) { ++ bch_err(c, "incorrect encryption key"); ++ ret = -EINVAL; ++ goto err; ++ } ++out: ++ *key = sb_key.key; ++err: ++ memzero_explicit(&sb_key, sizeof(sb_key)); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ return ret; ++} ++ ++static int bch2_alloc_ciphers(struct bch_fs *c) ++{ ++ if (!c->chacha20) ++ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ if (IS_ERR(c->chacha20)) { ++ bch_err(c, "error requesting chacha20 module: %li", ++ PTR_ERR(c->chacha20)); ++ return PTR_ERR(c->chacha20); ++ } ++ ++ if (!c->poly1305) ++ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); ++ if (IS_ERR(c->poly1305)) { ++ bch_err(c, "error requesting poly1305 module: %li", ++ PTR_ERR(c->poly1305)); ++ return PTR_ERR(c->poly1305); ++ } ++ ++ return 0; ++} ++ ++int bch2_disable_encryption(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ /* is key encrypted? */ ++ ret = 0; ++ if (bch2_key_is_encrypted(&crypt->key)) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ crypt->key.magic = BCH_KEY_MAGIC; ++ crypt->key.key = key; ++ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_enable_encryption(struct bch_fs *c, bool keyed) ++{ ++ struct bch_encrypted_key key; ++ struct bch_key user_key; ++ struct bch_sb_field_crypt *crypt; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ /* Do we already have an encryption key? 
*/ ++ if (bch2_sb_get_crypt(c->disk_sb.sb)) ++ goto err; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto err; ++ ++ key.magic = BCH_KEY_MAGIC; ++ get_random_bytes(&key.key, sizeof(key.key)); ++ ++ if (keyed) { ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %i", ret); ++ goto err; ++ } ++ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &key, sizeof(key)); ++ if (ret) ++ goto err; ++ } ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto err; ++ ++ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); ++ if (!crypt) { ++ ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ ++ goto err; ++ } ++ ++ crypt->key = key; ++ ++ /* write superblock */ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); ++ bch2_write_super(c); ++err: ++ mutex_unlock(&c->sb_lock); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ memzero_explicit(&key, sizeof(key)); ++ return ret; ++} ++ ++void bch2_fs_encryption_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->poly1305)) ++ crypto_free_shash(c->poly1305); ++ if (!IS_ERR_OR_NULL(c->chacha20)) ++ crypto_free_sync_skcipher(c->chacha20); ++ if (!IS_ERR_OR_NULL(c->sha256)) ++ crypto_free_shash(c->sha256); ++} ++ ++int bch2_fs_encryption_init(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->sha256 = crypto_alloc_shash("sha256", 0, 0); ++ if (IS_ERR(c->sha256)) { ++ bch_err(c, "error requesting sha256 module"); ++ ret = PTR_ERR(c->sha256); ++ goto out; ++ } ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto out; ++out: ++ memzero_explicit(&key, sizeof(key)); ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +new file mode 100644 +index 000000000000..24dee8039d57 +--- /dev/null ++++ b/fs/bcachefs/checksum.h +@@ -0,0 +1,202 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHECKSUM_H ++#define _BCACHEFS_CHECKSUM_H ++ ++#include "bcachefs.h" ++#include "extents_types.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static inline bool bch2_checksum_mergeable(unsigned type) ++{ ++ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, ++ struct bch_csum, size_t); ++ ++#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) ++#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) ++#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) ++#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) ++#define BCH_NONCE_POLY cpu_to_le32(1 << 31) ++ ++struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, ++ const void *, size_t); ++ ++/* ++ * This is used for various on disk data structures - bch_sb, prio_set, bset, ++ * jset: The checksum is _always_ the first field of these structs ++ */ ++#define csum_vstruct(_c, _type, _nonce, _i) \ ++({ \ ++ const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ ++ const void *end = vstruct_end(_i); \ ++ \ ++ bch2_checksum(_c, _type, _nonce, start, end - start); \ ++}) 
++ ++int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); ++int bch2_request_key(struct bch_sb *, struct bch_key *); ++ ++void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, ++ void *data, size_t); ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, ++ struct bch_extent_crc_unpacked, ++ struct bch_extent_crc_unpacked *, ++ struct bch_extent_crc_unpacked *, ++ unsigned, unsigned, unsigned); ++ ++void bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, ++ struct bch_key *); ++ ++int bch2_disable_encryption(struct bch_fs *); ++int bch2_enable_encryption(struct bch_fs *, bool); ++ ++void bch2_fs_encryption_exit(struct bch_fs *); ++int bch2_fs_encryption_init(struct bch_fs *); ++ ++static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, ++ bool data) ++{ ++ switch (type) { ++ case BCH_CSUM_OPT_NONE: ++ return BCH_CSUM_NONE; ++ case BCH_CSUM_OPT_CRC32C: ++ return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; ++ case BCH_CSUM_OPT_CRC64: ++ return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; ++ default: ++ BUG(); ++ } ++} ++ ++static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, ++ unsigned opt) ++{ ++ if (c->sb.encryption_type) ++ return c->opts.wide_macs ++ ? BCH_CSUM_CHACHA20_POLY1305_128 ++ : BCH_CSUM_CHACHA20_POLY1305_80; ++ ++ return bch2_csum_opt_to_type(opt, true); ++} ++ ++static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) ++{ ++ if (c->sb.encryption_type) ++ return BCH_CSUM_CHACHA20_POLY1305_128; ++ ++ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); ++} ++ ++static const unsigned bch2_compression_opt_to_type[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++static inline bool bch2_checksum_type_valid(const struct bch_fs *c, ++ unsigned type) ++{ ++ if (type >= BCH_CSUM_NR) ++ return false; ++ ++ if (bch2_csum_type_is_encryption(type) && !c->chacha20) ++ return false; ++ ++ return true; ++} ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) ++{ ++ /* ++ * XXX: need some way of preventing the compiler from optimizing this ++ * into a form that isn't constant time.. ++ */ ++ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; ++} ++ ++/* for skipping ahead and encrypting/decrypting at an offset: */ ++static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) ++{ ++ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); ++ ++ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); ++ return nonce; ++} ++ ++static inline struct nonce null_nonce(void) ++{ ++ struct nonce ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ return ret; ++} ++ ++static inline struct nonce extent_nonce(struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ unsigned compression_type = crc_is_compressed(crc) ++ ? crc.compression_type ++ : 0; ++ unsigned size = compression_type ? 
crc.uncompressed_size : 0; ++ struct nonce nonce = (struct nonce) {{ ++ [0] = cpu_to_le32(size << 22), ++ [1] = cpu_to_le32(version.lo), ++ [2] = cpu_to_le32(version.lo >> 32), ++ [3] = cpu_to_le32(version.hi| ++ (compression_type << 24))^BCH_NONCE_EXTENT, ++ }}; ++ ++ return nonce_add(nonce, crc.nonce << 9); ++} ++ ++static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) ++{ ++ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; ++} ++ ++static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) ++{ ++ __le64 magic = __bch2_sb_magic(sb); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) ++{ ++ __le64 magic = bch2_sb_magic(c); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++#endif /* _BCACHEFS_CHECKSUM_H */ +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +new file mode 100644 +index 000000000000..a9f5d5696622 +--- /dev/null ++++ b/fs/bcachefs/clock.c +@@ -0,0 +1,194 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "clock.h" ++ ++#include ++#include ++#include ++ ++static inline long io_timer_cmp(io_timer_heap *h, ++ struct io_timer *l, ++ struct io_timer *r) ++{ ++ return l->expire - r->expire; ++} ++ ++void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (time_after_eq((unsigned long) atomic_long_read(&clock->now), ++ timer->expire)) { ++ spin_unlock(&clock->timer_lock); ++ timer->fn(timer); ++ return; ++ } ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) ++ goto out; ++ ++ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); ++out: ++ spin_unlock(&clock->timer_lock); ++} ++ ++void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) { ++ heap_del(&clock->timers, i, io_timer_cmp, NULL); ++ break; ++ } ++ ++ spin_unlock(&clock->timer_lock); ++} ++ ++struct io_clock_wait { ++ struct io_timer io_timer; ++ struct timer_list cpu_timer; ++ struct task_struct *task; ++ int expired; ++}; ++ ++static void io_clock_wait_fn(struct io_timer *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, io_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++static void io_clock_cpu_timeout(struct timer_list *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, cpu_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) ++{ ++ struct io_clock_wait wait; ++ ++ /* XXX: calculate sleep time rigorously */ ++ wait.io_timer.expire = until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ bch2_io_timer_add(clock, &wait.io_timer); ++ ++ schedule(); ++ ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++void bch2_kthread_io_clock_wait(struct io_clock *clock, ++ unsigned long io_until, ++ unsigned long cpu_timeout) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct io_clock_wait wait; ++ ++ wait.io_timer.expire = io_until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ 
bch2_io_timer_add(clock, &wait.io_timer); ++ ++ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); ++ ++ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) ++ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ if (wait.expired) ++ break; ++ ++ schedule(); ++ try_to_freeze(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ del_singleshot_timer_sync(&wait.cpu_timer); ++ destroy_timer_on_stack(&wait.cpu_timer); ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++static struct io_timer *get_expired_timer(struct io_clock *clock, ++ unsigned long now) ++{ ++ struct io_timer *ret = NULL; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (clock->timers.used && ++ time_after_eq(now, clock->timers.data[0]->expire)) ++ heap_pop(&clock->timers, ret, io_timer_cmp, NULL); ++ ++ spin_unlock(&clock->timer_lock); ++ ++ return ret; ++} ++ ++void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) ++{ ++ struct io_timer *timer; ++ unsigned long now = atomic_long_add_return(sectors, &clock->now); ++ ++ while ((timer = get_expired_timer(clock, now))) ++ timer->fn(timer); ++} ++ ++ssize_t bch2_io_timers_show(struct io_clock *clock, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ unsigned long now; ++ unsigned i; ++ ++ spin_lock(&clock->timer_lock); ++ now = atomic_long_read(&clock->now); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ pr_buf(&out, "%ps:\t%li\n", ++ clock->timers.data[i]->fn, ++ clock->timers.data[i]->expire - now); ++ spin_unlock(&clock->timer_lock); ++ ++ return out.pos - buf; ++} ++ ++void bch2_io_clock_exit(struct io_clock *clock) ++{ ++ free_heap(&clock->timers); ++ free_percpu(clock->pcpu_buf); ++} ++ ++int bch2_io_clock_init(struct io_clock *clock) ++{ ++ atomic_long_set(&clock->now, 0); ++ spin_lock_init(&clock->timer_lock); ++ ++ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); ++ ++ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); ++ if (!clock->pcpu_buf) ++ return -ENOMEM; ++ ++ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h +new file mode 100644 +index 000000000000..da50afe206cc +--- /dev/null ++++ b/fs/bcachefs/clock.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_H ++#define _BCACHEFS_CLOCK_H ++ ++void bch2_io_timer_add(struct io_clock *, struct io_timer *); ++void bch2_io_timer_del(struct io_clock *, struct io_timer *); ++void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, ++ unsigned long); ++ ++void __bch2_increment_clock(struct io_clock *, unsigned); ++ ++static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, ++ int rw) ++{ ++ struct io_clock *clock = &c->io_clock[rw]; ++ ++ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= ++ IO_CLOCK_PCPU_SECTORS)) ++ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); ++ ++#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ ++({ \ ++ long __ret = timeout; \ ++ might_sleep(); \ ++ if (!___wait_cond_timeout(condition)) \ ++ __ret = __wait_event_timeout(wq, condition, timeout); \ ++ __ret; \ ++}) ++ ++ssize_t bch2_io_timers_show(struct io_clock *, char *); ++ ++void bch2_io_clock_exit(struct io_clock *); ++int bch2_io_clock_init(struct io_clock *); ++ ++#endif /* 
_BCACHEFS_CLOCK_H */ +diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h +new file mode 100644 +index 000000000000..92c740a47565 +--- /dev/null ++++ b/fs/bcachefs/clock_types.h +@@ -0,0 +1,37 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_TYPES_H ++#define _BCACHEFS_CLOCK_TYPES_H ++ ++#include "util.h" ++ ++#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) ++ ++/* ++ * Clocks/timers in units of sectors of IO: ++ * ++ * Note - they use percpu batching, so they're only approximate. ++ */ ++ ++struct io_timer; ++typedef void (*io_timer_fn)(struct io_timer *); ++ ++struct io_timer { ++ io_timer_fn fn; ++ unsigned long expire; ++}; ++ ++/* Amount to buffer up on a percpu counter */ ++#define IO_CLOCK_PCPU_SECTORS 128 ++ ++typedef HEAP(struct io_timer *) io_timer_heap; ++ ++struct io_clock { ++ atomic_long_t now; ++ u16 __percpu *pcpu_buf; ++ unsigned max_slop; ++ ++ spinlock_t timer_lock; ++ io_timer_heap timers; ++}; ++ ++#endif /* _BCACHEFS_CLOCK_TYPES_H */ +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +new file mode 100644 +index 000000000000..3d75527d2d81 +--- /dev/null ++++ b/fs/bcachefs/compress.c +@@ -0,0 +1,633 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "compress.h" ++#include "extents.h" ++#include "io.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++ ++/* Bounce buffer: */ ++struct bbuf { ++ void *b; ++ enum { ++ BB_NONE, ++ BB_VMAP, ++ BB_KMALLOC, ++ BB_MEMPOOL, ++ } type; ++ int rw; ++}; ++ ++static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) ++{ ++ void *b; ++ ++ BUG_ON(size > c->sb.encoded_extent_max << 9); ++ ++ b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; ++ ++ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; ++ ++ BUG(); ++} ++ ++static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ void *expected_start = NULL; ++ ++ __bio_for_each_bvec(bv, bio, iter, start) { ++ if (expected_start && ++ expected_start != page_address(bv.bv_page) + bv.bv_offset) ++ return false; ++ ++ expected_start = page_address(bv.bv_page) + ++ bv.bv_offset + bv.bv_len; ++ } ++ ++ return true; ++} ++ ++static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, ++ struct bvec_iter start, int rw) ++{ ++ struct bbuf ret; ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ unsigned nr_pages = 0, flags; ++ struct page *stack_pages[16]; ++ struct page **pages = NULL; ++ void *data; ++ ++ BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); ++ ++ if (!IS_ENABLED(CONFIG_HIGHMEM) && ++ bio_phys_contig(bio, start)) ++ return (struct bbuf) { ++ .b = page_address(bio_iter_page(bio, start)) + ++ bio_iter_offset(bio, start), ++ .type = BB_NONE, .rw = rw ++ }; ++ ++ /* check if we can map the pages contiguously: */ ++ __bio_for_each_segment(bv, bio, iter, start) { ++ if (iter.bi_size != start.bi_size && ++ bv.bv_offset) ++ goto bounce; ++ ++ if (bv.bv_len < iter.bi_size && ++ bv.bv_offset + bv.bv_len < PAGE_SIZE) ++ goto bounce; ++ ++ nr_pages++; ++ } ++ ++ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); ++ ++ pages = nr_pages > ARRAY_SIZE(stack_pages) ++ ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) ++ : stack_pages; ++ if (!pages) ++ goto bounce; ++ ++ nr_pages = 0; ++ __bio_for_each_segment(bv, bio, iter, start) ++ pages[nr_pages++] = bv.bv_page; ++ ++ flags = memalloc_nofs_save(); ++ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); ++ memalloc_nofs_restore(flags); ++ ++ if (pages != stack_pages) ++ kfree(pages); ++ ++ if (data) ++ return (struct bbuf) { ++ .b = data + bio_iter_offset(bio, start), ++ .type = BB_VMAP, .rw = rw ++ }; ++bounce: ++ ret = __bounce_alloc(c, start.bi_size, rw); ++ ++ if (rw == READ) ++ memcpy_from_bio(ret.b, bio, start); ++ ++ return ret; ++} ++ ++static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) ++{ ++ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); ++} ++ ++static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) ++{ ++ switch (buf.type) { ++ case BB_NONE: ++ break; ++ case BB_VMAP: ++ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); ++ break; ++ case BB_KMALLOC: ++ kfree(buf.b); ++ break; ++ case BB_MEMPOOL: ++ mempool_free(buf.b, &c->compression_bounce[buf.rw]); ++ break; ++ } ++} ++ ++static inline void zlib_set_workspace(z_stream *strm, void *workspace) ++{ ++#ifdef __KERNEL__ ++ strm->workspace = workspace; ++#endif ++} ++ ++static int __bio_uncompress(struct bch_fs *c, struct bio *src, ++ void *dst_data, struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf src_data = { NULL }; ++ size_t src_len = src->bi_iter.bi_size; ++ size_t dst_len = crc.uncompressed_size << 9; ++ void *workspace; ++ int ret; ++ ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ switch (crc.compression_type) { ++ case BCH_COMPRESSION_TYPE_lz4_old: ++ case BCH_COMPRESSION_TYPE_lz4: ++ ret = LZ4_decompress_safe_partial(src_data.b, dst_data, ++ src_len, dst_len, dst_len); ++ if (ret != dst_len) ++ goto err; ++ break; ++ case BCH_COMPRESSION_TYPE_gzip: { ++ z_stream strm = { ++ .next_in = src_data.b, ++ .avail_in = src_len, ++ .next_out = dst_data, ++ .avail_out = dst_len, ++ }; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_inflateInit2(&strm, -MAX_WBITS); ++ ret = zlib_inflate(&strm, Z_FINISH); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != Z_STREAM_END) ++ goto err; ++ break; ++ } ++ case BCH_COMPRESSION_TYPE_zstd: { ++ ZSTD_DCtx *ctx; ++ size_t real_src_len = le32_to_cpup(src_data.b); ++ ++ if (real_src_len > src_len - 4) ++ goto err; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound()); ++ ++ ret = ZSTD_decompressDCtx(ctx, ++ dst_data, dst_len, ++ src_data.b + 4, real_src_len); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != dst_len) ++ goto err; ++ break; ++ } ++ default: ++ BUG(); ++ } ++ ret = 0; ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ return ret; ++err: ++ ret = -EIO; ++ goto out; ++} ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, ++ struct bch_extent_crc_unpacked *crc) ++{ ++ struct bbuf data = { NULL }; ++ size_t dst_len = crc->uncompressed_size << 9; ++ ++ /* bio must own its pages: */ ++ BUG_ON(!bio->bi_vcnt); ++ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); ++ ++ if (crc->uncompressed_size > c->sb.encoded_extent_max || ++ crc->compressed_size > c->sb.encoded_extent_max) { ++ bch_err(c, "error rewriting existing data: extent too big"); ++ return -EIO; ++ } ++ ++ data = __bounce_alloc(c, dst_len, 
WRITE); ++ ++ if (__bio_uncompress(c, bio, data.b, *crc)) { ++ bch_err(c, "error rewriting existing data: decompression error"); ++ bio_unmap_or_unbounce(c, data); ++ return -EIO; ++ } ++ ++ /* ++ * XXX: don't have a good way to assert that the bio was allocated with ++ * enough space, we depend on bch2_move_extent doing the right thing ++ */ ++ bio->bi_iter.bi_size = crc->live_size << 9; ++ ++ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); ++ ++ crc->csum_type = 0; ++ crc->compression_type = 0; ++ crc->compressed_size = crc->live_size; ++ crc->uncompressed_size = crc->live_size; ++ crc->offset = 0; ++ crc->csum = (struct bch_csum) { 0, 0 }; ++ ++ bio_unmap_or_unbounce(c, data); ++ return 0; ++} ++ ++int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, ++ struct bio *dst, struct bvec_iter dst_iter, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf dst_data = { NULL }; ++ size_t dst_len = crc.uncompressed_size << 9; ++ int ret = -ENOMEM; ++ ++ if (crc.uncompressed_size > c->sb.encoded_extent_max || ++ crc.compressed_size > c->sb.encoded_extent_max) ++ return -EIO; ++ ++ dst_data = dst_len == dst_iter.bi_size ++ ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) ++ : __bounce_alloc(c, dst_len, WRITE); ++ ++ ret = __bio_uncompress(c, src, dst_data.b, crc); ++ if (ret) ++ goto err; ++ ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) ++ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); ++err: ++ bio_unmap_or_unbounce(c, dst_data); ++ return ret; ++} ++ ++static int attempt_compress(struct bch_fs *c, ++ void *workspace, ++ void *dst, size_t dst_len, ++ void *src, size_t src_len, ++ enum bch_compression_type compression_type) ++{ ++ switch (compression_type) { ++ case BCH_COMPRESSION_TYPE_lz4: { ++ int len = src_len; ++ int ret = LZ4_compress_destSize( ++ src, dst, ++ &len, dst_len, ++ workspace); ++ ++ if (len < src_len) ++ return -len; ++ ++ return ret; ++ } ++ case BCH_COMPRESSION_TYPE_gzip: { ++ z_stream strm = { ++ .next_in = src, ++ .avail_in = src_len, ++ .next_out = dst, ++ .avail_out = dst_len, ++ }; ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, ++ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, ++ Z_DEFAULT_STRATEGY); ++ ++ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) ++ return 0; ++ ++ if (zlib_deflateEnd(&strm) != Z_OK) ++ return 0; ++ ++ return strm.total_out; ++ } ++ case BCH_COMPRESSION_TYPE_zstd: { ++ ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, ++ ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); ++ ++ size_t len = ZSTD_compressCCtx(ctx, ++ dst + 4, dst_len - 4, ++ src, src_len, ++ c->zstd_params); ++ if (ZSTD_isError(len)) ++ return 0; ++ ++ *((__le32 *) dst) = cpu_to_le32(len); ++ return len + 4; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned __bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ enum bch_compression_type compression_type) ++{ ++ struct bbuf src_data = { NULL }, dst_data = { NULL }; ++ void *workspace; ++ unsigned pad; ++ int ret = 0; ++ ++ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); ++ BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); ++ ++ /* If it's only one block, don't bother trying to compress: */ ++ if (bio_sectors(src) <= c->opts.block_size) ++ return 0; ++ ++ dst_data = bio_map_or_bounce(c, dst, WRITE); ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); ++ ++ *src_len = 
src->bi_iter.bi_size; ++ *dst_len = dst->bi_iter.bi_size; ++ ++ /* ++ * XXX: this algorithm sucks when the compression code doesn't tell us ++ * how much would fit, like LZ4 does: ++ */ ++ while (1) { ++ if (*src_len <= block_bytes(c)) { ++ ret = -1; ++ break; ++ } ++ ++ ret = attempt_compress(c, workspace, ++ dst_data.b, *dst_len, ++ src_data.b, *src_len, ++ compression_type); ++ if (ret > 0) { ++ *dst_len = ret; ++ ret = 0; ++ break; ++ } ++ ++ /* Didn't fit: should we retry with a smaller amount? */ ++ if (*src_len <= *dst_len) { ++ ret = -1; ++ break; ++ } ++ ++ /* ++ * If ret is negative, it's a hint as to how much data would fit ++ */ ++ BUG_ON(-ret >= *src_len); ++ ++ if (ret < 0) ++ *src_len = -ret; ++ else ++ *src_len -= (*src_len - *dst_len) / 2; ++ *src_len = round_down(*src_len, block_bytes(c)); ++ } ++ ++ mempool_free(workspace, &c->compress_workspace[compression_type]); ++ ++ if (ret) ++ goto err; ++ ++ /* Didn't get smaller: */ ++ if (round_up(*dst_len, block_bytes(c)) >= *src_len) ++ goto err; ++ ++ pad = round_up(*dst_len, block_bytes(c)) - *dst_len; ++ ++ memset(dst_data.b + *dst_len, 0, pad); ++ *dst_len += pad; ++ ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) ++ memcpy_to_bio(dst, dst->bi_iter, dst_data.b); ++ ++ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); ++ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); ++ BUG_ON(*dst_len & (block_bytes(c) - 1)); ++ BUG_ON(*src_len & (block_bytes(c) - 1)); ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ bio_unmap_or_unbounce(c, dst_data); ++ return compression_type; ++err: ++ compression_type = BCH_COMPRESSION_TYPE_incompressible; ++ goto out; ++} ++ ++unsigned bch2_bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ unsigned compression_type) ++{ ++ unsigned orig_dst = dst->bi_iter.bi_size; ++ unsigned orig_src = src->bi_iter.bi_size; ++ ++ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ ++ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, ++ c->sb.encoded_extent_max << 9); ++ /* Don't generate a bigger output than input: */ ++ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ ++ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) ++ compression_type = BCH_COMPRESSION_TYPE_lz4; ++ ++ compression_type = ++ __bio_compress(c, dst, dst_len, src, src_len, compression_type); ++ ++ dst->bi_iter.bi_size = orig_dst; ++ src->bi_iter.bi_size = orig_src; ++ return compression_type; ++} ++ ++static int __bch2_fs_compress_init(struct bch_fs *, u64); ++ ++#define BCH_FEATURE_none 0 ++ ++static const unsigned bch2_compression_opt_to_feature[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++#undef BCH_FEATURE_none ++ ++static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) ++{ ++ int ret = 0; ++ ++ if ((c->sb.features & f) == f) ++ return 0; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if ((c->sb.features & f) == f) { ++ mutex_unlock(&c->sb_lock); ++ return 0; ++ } ++ ++ ret = __bch2_fs_compress_init(c, c->sb.features|f); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ret; ++ } ++ ++ c->disk_sb.sb->features[0] |= cpu_to_le64(f); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *c, ++ unsigned compression_type) ++{ ++ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); ++ ++ return compression_type ++ ? 
__bch2_check_set_has_compressed_data(c, ++ 1ULL << bch2_compression_opt_to_feature[compression_type]) ++ : 0; ++} ++ ++void bch2_fs_compress_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ mempool_exit(&c->decompress_workspace); ++ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) ++ mempool_exit(&c->compress_workspace[i]); ++ mempool_exit(&c->compression_bounce[WRITE]); ++ mempool_exit(&c->compression_bounce[READ]); ++} ++ ++static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ++{ ++ size_t max_extent = c->sb.encoded_extent_max << 9; ++ size_t decompress_workspace_size = 0; ++ bool decompress_workspace_needed; ++ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0); ++ struct { ++ unsigned feature; ++ unsigned type; ++ size_t compress_workspace; ++ size_t decompress_workspace; ++ } compression_types[] = { ++ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, ++ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, ++ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), ++ zlib_inflate_workspacesize(), }, ++ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, ++ ZSTD_CCtxWorkspaceBound(params.cParams), ++ ZSTD_DCtxWorkspaceBound() }, ++ }, *i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->zstd_params = params; ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) ++ if (features & (1 << i->feature)) ++ goto have_compressed; ++ ++ goto out; ++have_compressed: ++ ++ if (!mempool_initialized(&c->compression_bounce[READ])) { ++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], ++ 1, max_extent); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->compression_bounce[WRITE])) { ++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], ++ 1, max_extent); ++ if (ret) ++ goto out; ++ } ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) { ++ decompress_workspace_size = ++ max(decompress_workspace_size, i->decompress_workspace); ++ ++ if (!(features & (1 << i->feature))) ++ continue; ++ ++ if (i->decompress_workspace) ++ decompress_workspace_needed = true; ++ ++ if (mempool_initialized(&c->compress_workspace[i->type])) ++ continue; ++ ++ ret = mempool_init_kvpmalloc_pool( ++ &c->compress_workspace[i->type], ++ 1, i->compress_workspace); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->decompress_workspace)) { ++ ret = mempool_init_kvpmalloc_pool( ++ &c->decompress_workspace, ++ 1, decompress_workspace_size); ++ if (ret) ++ goto out; ++ } ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++int bch2_fs_compress_init(struct bch_fs *c) ++{ ++ u64 f = c->sb.features; ++ ++ if (c->opts.compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; ++ ++ if (c->opts.background_compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; ++ ++ return __bch2_fs_compress_init(c, f); ++ ++} +diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h +new file mode 100644 +index 000000000000..4bab1f61b3b5 +--- /dev/null ++++ b/fs/bcachefs/compress.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_COMPRESS_H ++#define _BCACHEFS_COMPRESS_H ++ ++#include "extents_types.h" ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, ++ struct bch_extent_crc_unpacked *); ++int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, ++ struct bvec_iter, struct bch_extent_crc_unpacked); ++unsigned 
bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, ++ struct bio *, size_t *, unsigned); ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); ++void bch2_fs_compress_exit(struct bch_fs *); ++int bch2_fs_compress_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_COMPRESS_H */ +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +new file mode 100644 +index 000000000000..aa10591a3b1a +--- /dev/null ++++ b/fs/bcachefs/debug.c +@@ -0,0 +1,432 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Assorted bcachefs debug code ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "super.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++static struct dentry *bch_debug; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ struct btree *v = c->verify_data; ++ struct btree_node *n_ondisk, *n_sorted, *n_inmemory; ++ struct bset *sorted, *inmemory; ++ struct extent_ptr_decoded pick; ++ struct bch_dev *ca; ++ struct bio *bio; ++ ++ if (c->opts.nochanges) ++ return; ++ ++ btree_node_io_lock(b); ++ mutex_lock(&c->verify_lock); ++ ++ n_ondisk = c->verify_ondisk; ++ n_sorted = c->verify_data->data; ++ n_inmemory = b->data; ++ ++ bkey_copy(&v->key, &b->key); ++ v->written = 0; ++ v->c.level = b->c.level; ++ v->c.btree_id = b->c.btree_id; ++ bch2_btree_keys_init(v, &c->expensive_debug_checks); ++ ++ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ++ NULL, &pick) <= 0) ++ return; ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ if (!bch2_dev_get_ioref(ca, READ)) ++ return; ++ ++ bio = bio_alloc_bioset(GFP_NOIO, ++ buf_pages(n_sorted, btree_bytes(c)), ++ &c->btree_bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_opf = REQ_OP_READ|REQ_META; ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bch2_bio_map(bio, n_sorted, btree_bytes(c)); ++ ++ submit_bio_wait(bio); ++ ++ bio_put(bio); ++ percpu_ref_put(&ca->io_ref); ++ ++ memcpy(n_ondisk, n_sorted, btree_bytes(c)); ++ ++ if (bch2_btree_node_read_done(c, v, false)) ++ goto out; ++ ++ n_sorted = c->verify_data->data; ++ sorted = &n_sorted->keys; ++ inmemory = &n_inmemory->keys; ++ ++ if (inmemory->u64s != sorted->u64s || ++ memcmp(inmemory->start, ++ sorted->start, ++ vstruct_end(inmemory) - (void *) inmemory->start)) { ++ unsigned offset = 0, sectors; ++ struct bset *i; ++ unsigned j; ++ ++ console_lock(); ++ ++ printk(KERN_ERR "*** in memory:\n"); ++ bch2_dump_bset(c, b, inmemory, 0); ++ ++ printk(KERN_ERR "*** read back in:\n"); ++ bch2_dump_bset(c, v, sorted, 0); ++ ++ while (offset < b->written) { ++ if (!offset ) { ++ i = &n_ondisk->keys; ++ sectors = vstruct_blocks(n_ondisk, c->block_bits) << ++ c->block_bits; ++ } else { ++ struct btree_node_entry *bne = ++ (void *) n_ondisk + (offset << 9); ++ i = &bne->keys; ++ ++ sectors = vstruct_blocks(bne, c->block_bits) << ++ c->block_bits; ++ } ++ ++ printk(KERN_ERR "*** on disk block %u:\n", offset); ++ bch2_dump_bset(c, b, i, offset); ++ ++ offset += sectors; ++ } ++ ++ printk(KERN_ERR "*** block %u/%u not written\n", ++ offset >> c->block_bits, btree_blocks(c)); ++ ++ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) ++ if (inmemory->_data[j] != sorted->_data[j]) 
++ break; ++ ++ printk(KERN_ERR "b->written %u\n", b->written); ++ ++ console_unlock(); ++ panic("verify failed at %u\n", j); ++ } ++out: ++ mutex_unlock(&c->verify_lock); ++ btree_node_io_unlock(b); ++} ++ ++#endif ++ ++#ifdef CONFIG_DEBUG_FS ++ ++/* XXX: bch_fs refcounting */ ++ ++struct dump_iter { ++ struct bpos from; ++ struct bch_fs *c; ++ enum btree_id id; ++ ++ char buf[PAGE_SIZE]; ++ size_t bytes; /* what's currently in buf */ ++ ++ char __user *ubuf; /* destination user buffer */ ++ size_t size; /* size of requested read */ ++ ssize_t ret; /* bytes read so far */ ++}; ++ ++static int flush_buf(struct dump_iter *i) ++{ ++ if (i->bytes) { ++ size_t bytes = min(i->bytes, i->size); ++ int err = copy_to_user(i->ubuf, i->buf, bytes); ++ ++ if (err) ++ return err; ++ ++ i->ret += bytes; ++ i->ubuf += bytes; ++ i->size -= bytes; ++ i->bytes -= bytes; ++ memmove(i->buf, i->buf + bytes, i->bytes); ++ } ++ ++ return 0; ++} ++ ++static int bch2_dump_open(struct inode *inode, struct file *file) ++{ ++ struct btree_debug *bd = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ if (!i) ++ return -ENOMEM; ++ ++ file->private_data = i; ++ i->from = POS_MIN; ++ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); ++ i->id = bd->id; ++ ++ return 0; ++} ++ ++static int bch2_dump_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static ssize_t bch2_read_btree(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); ++ k = bch2_btree_iter_peek(iter); ++ ++ while (k.k && !(err = bkey_err(k))) { ++ bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); ++ i->bytes = strlen(i->buf); ++ BUG_ON(i->bytes >= PAGE_SIZE); ++ i->buf[i->bytes] = '\n'; ++ i->bytes++; ++ ++ k = bch2_btree_iter_next(iter); ++ i->from = iter->pos; ++ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? err : i->ret; ++} ++ ++static const struct file_operations btree_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree, ++}; ++ ++static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size || !bkey_cmp(POS_MAX, i->from)) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { ++ bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ /* ++ * can't easily correctly restart a btree node traversal across ++ * all nodes, meh ++ */ ++ i->from = bkey_cmp(POS_MAX, b->key.k.p) ++ ? bkey_successor(b->key.k.p) ++ : b->key.k.p; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? 
err : i->ret; ++} ++ ++static const struct file_operations btree_format_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree_formats, ++}; ++ ++static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct btree *prev_node = NULL; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(err = bkey_err(k))) { ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_packed *_k = ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++ ++ if (l->b != prev_node) { ++ bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ } ++ prev_node = l->b; ++ ++ bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ bch2_btree_iter_next(iter); ++ i->from = iter->pos; ++ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? err : i->ret; ++} ++ ++static const struct file_operations bfloat_failed_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_bfloat_failed, ++}; ++ ++void bch2_fs_debug_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->debug)) ++ debugfs_remove_recursive(c->debug); ++} ++ ++void bch2_fs_debug_init(struct bch_fs *c) ++{ ++ struct btree_debug *bd; ++ char name[100]; ++ ++ if (IS_ERR_OR_NULL(bch_debug)) ++ return; ++ ++ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); ++ c->debug = debugfs_create_dir(name, bch_debug); ++ if (IS_ERR_OR_NULL(c->debug)) ++ return; ++ ++ for (bd = c->btree_debug; ++ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); ++ bd++) { ++ bd->id = bd - c->btree_debug; ++ bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], ++ 0400, c->debug, bd, ++ &btree_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-formats", ++ bch2_btree_ids[bd->id]); ++ ++ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, ++ &btree_format_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-bfloat-failed", ++ bch2_btree_ids[bd->id]); ++ ++ bd->failed = debugfs_create_file(name, 0400, c->debug, bd, ++ &bfloat_failed_debug_ops); ++ } ++} ++ ++#endif ++ ++void bch2_debug_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_debug)) ++ debugfs_remove_recursive(bch_debug); ++} ++ ++int __init bch2_debug_init(void) ++{ ++ int ret = 0; ++ ++ bch_debug = debugfs_create_dir("bcachefs", NULL); ++ return ret; ++} +diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h +new file mode 100644 +index 000000000000..56c2d1ab5f63 +--- /dev/null ++++ b/fs/bcachefs/debug.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DEBUG_H ++#define _BCACHEFS_DEBUG_H ++ ++#include "bcachefs.h" ++ ++struct bio; ++struct btree; ++struct bch_fs; ++ ++#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) \ ++ { return bch2_##name 
|| c->name; } ++BCH_DEBUG_PARAMS_ALWAYS() ++#undef BCH_DEBUG_PARAM ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) \ ++ { return bch2_##name || c->name; } ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++ ++void __bch2_btree_verify(struct bch_fs *, struct btree *); ++ ++#define bypass_torture_test(d) ((d)->bypass_torture_test) ++ ++#else /* DEBUG */ ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) { return false; } ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++ ++static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} ++ ++#define bypass_torture_test(d) 0 ++ ++#endif ++ ++static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ if (verify_btree_ondisk(c)) ++ __bch2_btree_verify(c, b); ++} ++ ++#ifdef CONFIG_DEBUG_FS ++void bch2_fs_debug_exit(struct bch_fs *); ++void bch2_fs_debug_init(struct bch_fs *); ++#else ++static inline void bch2_fs_debug_exit(struct bch_fs *c) {} ++static inline void bch2_fs_debug_init(struct bch_fs *c) {} ++#endif ++ ++void bch2_debug_exit(void); ++int bch2_debug_init(void); ++ ++#endif /* _BCACHEFS_DEBUG_H */ +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +new file mode 100644 +index 000000000000..f34bfda8ab0d +--- /dev/null ++++ b/fs/bcachefs/dirent.c +@@ -0,0 +1,385 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "dirent.h" ++#include "fs.h" ++#include "keylist.h" ++#include "str_hash.h" ++ ++#include ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) ++{ ++ unsigned len = bkey_val_bytes(d.k) - ++ offsetof(struct bch_dirent, d_name); ++ ++ return strnlen(d.v->d_name, len); ++} ++ ++static u64 bch2_dirent_hash(const struct bch_hash_info *info, ++ const struct qstr *name) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, name->name, name->len); ++ ++ /* [0,2) reserved for dots */ ++ return max_t(u64, bch2_str_hash_end(&ctx, info), 2); ++} ++ ++static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_dirent_hash(info, key); ++} ++ ++static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); ++ ++ return bch2_dirent_hash(info, &name); ++} ++ ++static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ int len = bch2_dirent_name_bytes(l); ++ const struct qstr *r = _r; ++ ++ return len - r->len ?: memcmp(l.v->d_name, r->name, len); ++} ++ ++static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); ++ int l_len = bch2_dirent_name_bytes(l); ++ int r_len = bch2_dirent_name_bytes(r); ++ ++ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); ++} ++ ++const struct bch_hash_desc bch2_dirent_hash_desc = { ++ .btree_id = BTREE_ID_DIRENTS, ++ .key_type = KEY_TYPE_dirent, ++ .hash_key = dirent_hash_key, ++ .hash_bkey = dirent_hash_bkey, ++ .cmp_key = dirent_cmp_key, ++ .cmp_bkey = dirent_cmp_bkey, ++}; ++ ++const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = 
bkey_s_c_to_dirent(k); ++ unsigned len; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) ++ return "value too small"; ++ ++ len = bch2_dirent_name_bytes(d); ++ if (!len) ++ return "empty name"; ++ ++ /* ++ * older versions of bcachefs were buggy and creating dirent ++ * keys that were bigger than necessary: ++ */ ++ if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7)) ++ return "value too big"; ++ ++ if (len > BCH_NAME_MAX) ++ return "dirent name too big"; ++ ++ return NULL; ++} ++ ++void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ ++ bch_scnmemcpy(out, d.v->d_name, ++ bch2_dirent_name_bytes(d)); ++ pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); ++} ++ ++static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, ++ u8 type, const struct qstr *name, u64 dst) ++{ ++ struct bkey_i_dirent *dirent; ++ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); ++ ++ if (name->len > BCH_NAME_MAX) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ BUG_ON(u64s > U8_MAX); ++ ++ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(dirent)) ++ return dirent; ++ ++ bkey_dirent_init(&dirent->k_i); ++ dirent->k.u64s = u64s; ++ dirent->v.d_inum = cpu_to_le64(dst); ++ dirent->v.d_type = type; ++ ++ memcpy(dirent->v.d_name, name->name, name->len); ++ memset(dirent->v.d_name + name->len, 0, ++ bkey_val_bytes(&dirent->k) - ++ offsetof(struct bch_dirent, d_name) - ++ name->len); ++ ++ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); ++ ++ return dirent; ++} ++ ++int bch2_dirent_create(struct btree_trans *trans, ++ u64 dir_inum, const struct bch_hash_info *hash_info, ++ u8 type, const struct qstr *name, u64 dst_inum, ++ int flags) ++{ ++ struct bkey_i_dirent *dirent; ++ int ret; ++ ++ dirent = dirent_create_key(trans, type, name, dst_inum); ++ ret = PTR_ERR_OR_ZERO(dirent); ++ if (ret) ++ return ret; ++ ++ return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, ++ dir_inum, &dirent->k_i, flags); ++} ++ ++static void dirent_copy_target(struct bkey_i_dirent *dst, ++ struct bkey_s_c_dirent src) ++{ ++ dst->v.d_inum = src.v->d_inum; ++ dst->v.d_type = src.v->d_type; ++} ++ ++int bch2_dirent_rename(struct btree_trans *trans, ++ u64 src_dir, struct bch_hash_info *src_hash, ++ u64 dst_dir, struct bch_hash_info *dst_hash, ++ const struct qstr *src_name, u64 *src_inum, ++ const struct qstr *dst_name, u64 *dst_inum, ++ enum bch_rename_mode mode) ++{ ++ struct btree_iter *src_iter = NULL, *dst_iter = NULL; ++ struct bkey_s_c old_src, old_dst; ++ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; ++ struct bpos dst_pos = ++ POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); ++ int ret = 0; ++ ++ *src_inum = *dst_inum = 0; ++ ++ /* ++ * Lookup dst: ++ * ++ * Note that in BCH_RENAME mode, we're _not_ checking if ++ * the target already exists - we're relying on the VFS ++ * to do that check for us for correctness: ++ */ ++ dst_iter = mode == BCH_RENAME ++ ? 
bch2_hash_hole(trans, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name) ++ : bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dst_iter); ++ if (ret) ++ goto out; ++ ++ old_dst = bch2_btree_iter_peek_slot(dst_iter); ++ ++ if (mode != BCH_RENAME) ++ *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); ++ ++ /* Lookup src: */ ++ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ src_hash, src_dir, src_name, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(src_iter); ++ if (ret) ++ goto out; ++ ++ old_src = bch2_btree_iter_peek_slot(src_iter); ++ *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); ++ ++ /* Create new dst key: */ ++ new_dst = dirent_create_key(trans, 0, dst_name, 0); ++ ret = PTR_ERR_OR_ZERO(new_dst); ++ if (ret) ++ goto out; ++ ++ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); ++ new_dst->k.p = dst_iter->pos; ++ ++ /* Create new src key: */ ++ if (mode == BCH_RENAME_EXCHANGE) { ++ new_src = dirent_create_key(trans, 0, src_name, 0); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ ++ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); ++ new_src->k.p = src_iter->pos; ++ } else { ++ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ ++ bkey_init(&new_src->k); ++ new_src->k.p = src_iter->pos; ++ ++ if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && ++ bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { ++ /* ++ * We have a hash collision for the new dst key, ++ * and new_src - the key we're deleting - is between ++ * new_dst's hashed slot and the slot we're going to be ++ * inserting it into - oops. This will break the hash ++ * table if we don't deal with it: ++ */ ++ if (mode == BCH_RENAME) { ++ /* ++ * If we're not overwriting, we can just insert ++ * new_dst at the src position: ++ */ ++ new_dst->k.p = src_iter->pos; ++ bch2_trans_update(trans, src_iter, ++ &new_dst->k_i, 0); ++ goto out; ++ } else { ++ /* If we're overwriting, we can't insert new_dst ++ * at a different slot because it has to ++ * overwrite old_dst - just make sure to use a ++ * whiteout when deleting src: ++ */ ++ new_src->k.type = KEY_TYPE_whiteout; ++ } ++ } else { ++ /* Check if we need a whiteout to delete src: */ ++ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, ++ src_hash, src_iter); ++ if (ret < 0) ++ goto out; ++ ++ if (ret) ++ new_src->k.type = KEY_TYPE_whiteout; ++ } ++ } ++ ++ bch2_trans_update(trans, src_iter, &new_src->k_i, 0); ++ bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); ++out: ++ bch2_trans_iter_put(trans, src_iter); ++ bch2_trans_iter_put(trans, dst_iter); ++ return ret; ++} ++ ++int bch2_dirent_delete_at(struct btree_trans *trans, ++ const struct bch_hash_info *hash_info, ++ struct btree_iter *iter) ++{ ++ return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ hash_info, iter); ++} ++ ++struct btree_iter * ++__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, unsigned flags) ++{ ++ return bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ hash_info, dir_inum, name, flags); ++} ++ ++u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 inum = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = 
__bch2_dirent_lookup_trans(&trans, dir_inum, ++ hash_info, name, 0); ++ if (IS_ERR(iter)) { ++ BUG_ON(PTR_ERR(iter) == -EINTR); ++ goto out; ++ } ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); ++out: ++ bch2_trans_exit(&trans); ++ return inum; ++} ++ ++int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, ++ POS(dir_inum, 0), 0, k, ret) { ++ if (k.k->p.inode > dir_inum) ++ break; ++ ++ if (k.k->type == KEY_TYPE_dirent) { ++ ret = -ENOTEMPTY; ++ break; ++ } ++ } ++ bch2_trans_iter_put(trans, iter); ++ ++ return ret; ++} ++ ++int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent dirent; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, ++ POS(inum, ctx->pos), 0, k, ret) { ++ if (k.k->p.inode > inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ dirent = bkey_s_c_to_dirent(k); ++ ++ /* ++ * XXX: dir_emit() can fault and block, while we're holding ++ * locks ++ */ ++ ctx->pos = dirent.k->p.offset; ++ if (!dir_emit(ctx, dirent.v->d_name, ++ bch2_dirent_name_bytes(dirent), ++ le64_to_cpu(dirent.v->d_inum), ++ dirent.v->d_type)) ++ break; ++ ctx->pos = dirent.k->p.offset + 1; ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ return ret; ++} +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +new file mode 100644 +index 000000000000..34769371dd13 +--- /dev/null ++++ b/fs/bcachefs/dirent.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DIRENT_H ++#define _BCACHEFS_DIRENT_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_dirent_hash_desc; ++ ++const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_dirent (struct bkey_ops) { \ ++ .key_invalid = bch2_dirent_invalid, \ ++ .val_to_text = bch2_dirent_to_text, \ ++} ++ ++struct qstr; ++struct file; ++struct dir_context; ++struct bch_fs; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); ++ ++static inline unsigned dirent_val_u64s(unsigned len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, ++ sizeof(u64)); ++} ++ ++int bch2_dirent_create(struct btree_trans *, u64, ++ const struct bch_hash_info *, u8, ++ const struct qstr *, u64, int); ++ ++int bch2_dirent_delete_at(struct btree_trans *, ++ const struct bch_hash_info *, ++ struct btree_iter *); ++ ++enum bch_rename_mode { ++ BCH_RENAME, ++ BCH_RENAME_OVERWRITE, ++ BCH_RENAME_EXCHANGE, ++}; ++ ++int bch2_dirent_rename(struct btree_trans *, ++ u64, struct bch_hash_info *, ++ u64, struct bch_hash_info *, ++ const struct qstr *, u64 *, ++ const struct qstr *, u64 *, ++ enum bch_rename_mode); ++ ++struct btree_iter * ++__bch2_dirent_lookup_trans(struct btree_trans *, u64, ++ const struct bch_hash_info *, ++ const struct qstr *, unsigned); ++u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, ++ const struct qstr *); ++ ++int bch2_empty_dir_trans(struct btree_trans *, u64); ++int bch2_readdir(struct bch_fs *, u64, struct dir_context *); ++ ++#endif /* _BCACHEFS_DIRENT_H */ +diff --git a/fs/bcachefs/disk_groups.c 
b/fs/bcachefs/disk_groups.c +new file mode 100644 +index 000000000000..4a4ec8f46108 +--- /dev/null ++++ b/fs/bcachefs/disk_groups.c +@@ -0,0 +1,481 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "disk_groups.h" ++#include "super-io.h" ++ ++#include ++ ++static int group_cmp(const void *_l, const void *_r) ++{ ++ const struct bch_disk_group *l = _l; ++ const struct bch_disk_group *r = _r; ++ ++ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - ++ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: ++ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - ++ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: ++ strncmp(l->label, r->label, sizeof(l->label)); ++} ++ ++static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g, *sorted = NULL; ++ struct bch_sb_field_members *mi; ++ struct bch_member *m; ++ unsigned i, nr_groups, len; ++ const char *err = NULL; ++ ++ mi = bch2_sb_get_members(sb); ++ groups = bch2_sb_get_disk_groups(sb); ++ nr_groups = disk_groups_nr(groups); ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) { ++ unsigned g; ++ ++ if (!BCH_MEMBER_GROUP(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (g >= nr_groups || ++ BCH_GROUP_DELETED(&groups->entries[g])) ++ return "disk has invalid group"; ++ } ++ ++ if (!nr_groups) ++ return NULL; ++ ++ for (g = groups->entries; ++ g < groups->entries + nr_groups; ++ g++) { ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ len = strnlen(g->label, sizeof(g->label)); ++ if (!len) { ++ err = "group with empty label"; ++ goto err; ++ } ++ } ++ ++ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); ++ if (!sorted) ++ return "cannot allocate memory"; ++ ++ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); ++ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); ++ ++ for (i = 0; i + 1 < nr_groups; i++) ++ if (!BCH_GROUP_DELETED(sorted + i) && ++ !group_cmp(sorted + i, sorted + i + 1)) { ++ err = "duplicate groups"; ++ goto err; ++ } ++ ++ err = NULL; ++err: ++ kfree(sorted); ++ return err; ++} ++ ++static void bch2_sb_disk_groups_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g; ++ unsigned nr_groups = disk_groups_nr(groups); ++ ++ for (g = groups->entries; ++ g < groups->entries + nr_groups; ++ g++) { ++ if (g != groups->entries) ++ pr_buf(out, " "); ++ ++ if (BCH_GROUP_DELETED(g)) ++ pr_buf(out, "[deleted]"); ++ else ++ pr_buf(out, "[parent %llu name %s]", ++ BCH_GROUP_PARENT(g), g->label); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { ++ .validate = bch2_sb_disk_groups_validate, ++ .to_text = bch2_sb_disk_groups_to_text ++}; ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field_disk_groups *groups; ++ struct bch_disk_groups_cpu *cpu_g, *old_g; ++ unsigned i, g, nr_groups; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ groups = bch2_sb_get_disk_groups(c->disk_sb.sb); ++ nr_groups = disk_groups_nr(groups); ++ ++ if (!groups) ++ return 0; ++ ++ cpu_g = kzalloc(sizeof(*cpu_g) + ++ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); ++ if (!cpu_g) ++ return -ENOMEM; ++ ++ cpu_g->nr = nr_groups; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct 
bch_disk_group *src = &groups->entries[i]; ++ struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; ++ ++ dst->deleted = BCH_GROUP_DELETED(src); ++ dst->parent = BCH_GROUP_PARENT(src); ++ } ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ struct bch_disk_group_cpu *dst = ++ &cpu_g->entries[BCH_MEMBER_GROUP(m)]; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m); ++ while (g) { ++ dst = &cpu_g->entries[g - 1]; ++ __set_bit(i, dst->devs.d); ++ g = dst->parent; ++ } ++ } ++ ++ old_g = rcu_dereference_protected(c->disk_groups, ++ lockdep_is_held(&c->sb_lock)); ++ rcu_assign_pointer(c->disk_groups, cpu_g); ++ if (old_g) ++ kfree_rcu(old_g, rcu); ++ ++ return 0; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return NULL; ++ case TARGET_DEV: { ++ struct bch_dev *ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ return ca ? &ca->self : NULL; ++ } ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); ++ ++ return t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return false; ++ case TARGET_DEV: ++ return dev == t.dev; ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g; ++ const struct bch_devs_mask *m; ++ bool ret; ++ ++ rcu_read_lock(); ++ g = rcu_dereference(c->disk_groups); ++ m = t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ ++ ret = m ? 
test_bit(dev, m->d) : false; ++ rcu_read_unlock(); ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, ++ unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct bch_disk_group *g = groups->entries + i; ++ ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ if (!BCH_GROUP_DELETED(g) && ++ BCH_GROUP_PARENT(g) == parent && ++ strnlen(g->label, sizeof(g->label)) == namelen && ++ !memcmp(name, g->label, namelen)) ++ return i; ++ } ++ ++ return -1; ++} ++ ++static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ struct bch_disk_group *g; ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; ++ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); ++ i++) ++ ; ++ ++ if (i == nr_groups) { ++ unsigned u64s = ++ (sizeof(struct bch_sb_field_disk_groups) + ++ sizeof(struct bch_disk_group) * (nr_groups + 1)) / ++ sizeof(u64); ++ ++ groups = bch2_sb_resize_disk_groups(sb, u64s); ++ if (!groups) ++ return -ENOSPC; ++ ++ nr_groups = disk_groups_nr(groups); ++ } ++ ++ BUG_ON(i >= nr_groups); ++ ++ g = &groups->entries[i]; ++ ++ memcpy(g->label, name, namelen); ++ if (namelen < sizeof(g->label)) ++ g->label[namelen] = '\0'; ++ SET_BCH_GROUP_DELETED(g, 0); ++ SET_BCH_GROUP_PARENT(g, parent); ++ SET_BCH_GROUP_DATA_ALLOWED(g, ~0); ++ ++ return i; ++} ++ ++int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ v = __bch2_disk_group_find(groups, v + 1, name, len); ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups; ++ unsigned parent = 0; ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ groups = bch2_sb_get_disk_groups(sb->sb); ++ ++ v = __bch2_disk_group_find(groups, parent, name, len); ++ if (v < 0) ++ v = __bch2_disk_group_add(sb, parent, name, len); ++ if (v < 0) ++ return v; ++ ++ parent = v + 1; ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++void bch2_disk_path_to_text(struct printbuf *out, ++ struct bch_sb_handle *sb, ++ unsigned v) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ struct bch_disk_group *g; ++ unsigned nr = 0; ++ u16 path[32]; ++ ++ while (1) { ++ if (nr == ARRAY_SIZE(path)) ++ goto inval; ++ ++ if (v >= disk_groups_nr(groups)) ++ goto inval; ++ ++ g = groups->entries + v; ++ ++ if (BCH_GROUP_DELETED(g)) ++ goto inval; ++ ++ path[nr++] = v; ++ ++ if (!BCH_GROUP_PARENT(g)) ++ break; ++ ++ v = BCH_GROUP_PARENT(g) - 1; ++ } ++ ++ while (nr) { ++ v = path[--nr]; ++ g = groups->entries + v; ++ ++ bch_scnmemcpy(out, g->label, ++ strnlen(g->label, sizeof(g->label))); ++ ++ if (nr) ++ pr_buf(out, "."); ++ } ++ return; ++inval: ++ pr_buf(out, "invalid group %u", v); ++} ++ ++int 
bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) ++{ ++ struct bch_member *mi; ++ int v = -1; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (!strlen(name) || !strcmp(name, "none")) ++ goto write_sb; ++ ++ v = bch2_disk_path_find_or_create(&c->disk_sb, name); ++ if (v < 0) { ++ mutex_unlock(&c->sb_lock); ++ return v; ++ } ++ ++write_sb: ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ SET_BCH_MEMBER_GROUP(mi, v + 1); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) ++{ ++ struct bch_dev *ca; ++ int g; ++ ++ if (!strlen(buf) || !strcmp(buf, "none")) { ++ *v = 0; ++ return 0; ++ } ++ ++ /* Is it a device? */ ++ ca = bch2_dev_lookup(c, buf); ++ if (!IS_ERR(ca)) { ++ *v = dev_to_target(ca->dev_idx); ++ percpu_ref_put(&ca->ref); ++ return 0; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ g = bch2_disk_path_find(&c->disk_sb, buf); ++ mutex_unlock(&c->sb_lock); ++ ++ if (g >= 0) { ++ *v = group_to_target(g); ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) ++{ ++ struct target t = target_decode(v); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ pr_buf(out, "none"); ++ break; ++ case TARGET_DEV: { ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ ++ if (ca && percpu_ref_tryget(&ca->io_ref)) { ++ char b[BDEVNAME_SIZE]; ++ ++ pr_buf(out, "/dev/%s", ++ bdevname(ca->disk_sb.bdev, b)); ++ percpu_ref_put(&ca->io_ref); ++ } else if (ca) { ++ pr_buf(out, "offline device %u", t.dev); ++ } else { ++ pr_buf(out, "invalid device %u", t.dev); ++ } ++ ++ rcu_read_unlock(); ++ break; ++ } ++ case TARGET_GROUP: ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, &c->disk_sb, t.group); ++ mutex_unlock(&c->sb_lock); ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h +new file mode 100644 +index 000000000000..c8e0c37a5e1a +--- /dev/null ++++ b/fs/bcachefs/disk_groups.h +@@ -0,0 +1,88 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DISK_GROUPS_H ++#define _BCACHEFS_DISK_GROUPS_H ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; ++ ++static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) ++{ ++ return groups ++ ? 
(vstruct_end(&groups->field) - ++ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) ++ : 0; ++} ++ ++struct target { ++ enum { ++ TARGET_NULL, ++ TARGET_DEV, ++ TARGET_GROUP, ++ } type; ++ union { ++ unsigned dev; ++ unsigned group; ++ }; ++}; ++ ++#define TARGET_DEV_START 1 ++#define TARGET_GROUP_START (256 + TARGET_DEV_START) ++ ++static inline u16 dev_to_target(unsigned dev) ++{ ++ return TARGET_DEV_START + dev; ++} ++ ++static inline u16 group_to_target(unsigned group) ++{ ++ return TARGET_GROUP_START + group; ++} ++ ++static inline struct target target_decode(unsigned target) ++{ ++ if (target >= TARGET_GROUP_START) ++ return (struct target) { ++ .type = TARGET_GROUP, ++ .group = target - TARGET_GROUP_START ++ }; ++ ++ if (target >= TARGET_DEV_START) ++ return (struct target) { ++ .type = TARGET_DEV, ++ .group = target - TARGET_DEV_START ++ }; ++ ++ return (struct target) { .type = TARGET_NULL }; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); ++ ++static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, ++ enum bch_data_type data_type, ++ u16 target) ++{ ++ struct bch_devs_mask devs = c->rw_devs[data_type]; ++ const struct bch_devs_mask *t = bch2_target_to_mask(c, target); ++ ++ if (t) ++ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); ++ return devs; ++} ++ ++bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); ++ ++int bch2_disk_path_find(struct bch_sb_handle *, const char *); ++int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); ++void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, ++ unsigned); ++ ++int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); ++void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *); ++ ++int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); ++ ++const char *bch2_sb_validate_disk_groups(struct bch_sb *, ++ struct bch_sb_field *); ++ ++#endif /* _BCACHEFS_DISK_GROUPS_H */ +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +new file mode 100644 +index 000000000000..8c7e9cb74888 +--- /dev/null ++++ b/fs/bcachefs/ec.c +@@ -0,0 +1,1368 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++/* erasure coding */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "keylist.h" ++#include "recovery.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++ ++static void raid5_recov(unsigned disks, unsigned failed_idx, ++ size_t size, void **data) ++{ ++ unsigned i = 2, nr; ++ ++ BUG_ON(failed_idx >= disks); ++ ++ swap(data[0], data[failed_idx]); ++ memcpy(data[0], data[1], size); ++ ++ while (i < disks) { ++ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); ++ xor_blocks(nr, size, data[0], data + i); ++ i += nr; ++ } ++ ++ swap(data[0], data[failed_idx]); ++} ++ ++static void raid_gen(int nd, int np, size_t size, void **v) ++{ ++ if (np >= 1) ++ raid5_recov(nd + np, nd, size, v); ++ if (np >= 2) ++ raid6_call.gen_syndrome(nd + np, size, v); ++ BUG_ON(np > 2); ++} ++ ++static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) ++{ ++ switch (nr) { ++ case 0: ++ break; ++ case 1: ++ if (ir[0] < nd + 1) ++ raid5_recov(nd + 1, ir[0], size, v); ++ else ++ 
raid6_call.gen_syndrome(nd + np, size, v); ++ break; ++ case 2: ++ if (ir[1] < nd) { ++ /* data+data failure. */ ++ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); ++ } else if (ir[0] < nd) { ++ /* data + p/q failure */ ++ ++ if (ir[1] == nd) /* data + p failure */ ++ raid6_datap_recov(nd + np, size, ir[0], v); ++ else { /* data + q failure */ ++ raid5_recov(nd + 1, ir[0], size, v); ++ raid6_call.gen_syndrome(nd + np, size, v); ++ } ++ } else { ++ raid_gen(nd, np, size, v); ++ } ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++#else ++ ++#include ++ ++#endif ++ ++struct ec_bio { ++ struct bch_dev *ca; ++ struct ec_stripe_buf *buf; ++ size_t idx; ++ struct bio bio; ++}; ++ ++/* Stripes btree keys: */ ++ ++const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ ++ if (k.k->p.inode) ++ return "invalid stripe key"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*s)) ++ return "incorrect value size"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*s) || ++ bkey_val_u64s(k.k) < stripe_val_u64s(s)) ++ return "incorrect value size"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ unsigned i; ++ ++ pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", ++ s->algorithm, ++ le16_to_cpu(s->sectors), ++ s->nr_blocks - s->nr_redundant, ++ s->nr_redundant, ++ s->csum_type, ++ 1U << s->csum_granularity_bits); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, ++ (u64) s->ptrs[i].offset, ++ stripe_blockcount_get(s, i)); ++} ++ ++static int ptr_matches_stripe(struct bch_fs *c, ++ struct bch_stripe *v, ++ const struct bch_extent_ptr *ptr) ++{ ++ unsigned i; ++ ++ for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { ++ const struct bch_extent_ptr *ptr2 = v->ptrs + i; ++ ++ if (ptr->dev == ptr2->dev && ++ ptr->gen == ptr2->gen && ++ ptr->offset >= ptr2->offset && ++ ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) ++ return i; ++ } ++ ++ return -1; ++} ++ ++static int extent_matches_stripe(struct bch_fs *c, ++ struct bch_stripe *v, ++ struct bkey_s_c k) ++{ ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const struct bch_extent_ptr *ptr; ++ int idx; ++ ++ extent_for_each_ptr(e, ptr) { ++ idx = ptr_matches_stripe(c, v, ptr); ++ if (idx >= 0) ++ return idx; ++ } ++ break; ++ } ++ } ++ ++ return -1; ++} ++ ++static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ ++ extent_for_each_entry(e, entry) ++ if (extent_entry_type(entry) == ++ BCH_EXTENT_ENTRY_stripe_ptr && ++ entry->stripe_ptr.idx == idx) ++ return true; ++ ++ break; ++ } ++ } ++ ++ return false; ++} ++ ++static void ec_stripe_key_init(struct bch_fs *c, ++ struct bkey_i_stripe *s, ++ struct open_buckets *blocks, ++ struct open_buckets *parity, ++ unsigned stripe_size) ++{ ++ struct open_bucket *ob; ++ unsigned i, u64s; ++ ++ bkey_stripe_init(&s->k_i); ++ s->v.sectors = cpu_to_le16(stripe_size); ++ s->v.algorithm = 0; ++ s->v.nr_blocks = parity->nr + blocks->nr; ++ s->v.nr_redundant = parity->nr; ++ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); ++ s->v.csum_type = BCH_CSUM_CRC32C; ++ s->v.pad = 0; ++ ++ open_bucket_for_each(c, blocks, ob, i) ++ s->v.ptrs[i] 
= ob->ptr; ++ ++ open_bucket_for_each(c, parity, ob, i) ++ s->v.ptrs[blocks->nr + i] = ob->ptr; ++ ++ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { ++ BUG_ON(1 << s->v.csum_granularity_bits >= ++ le16_to_cpu(s->v.sectors) || ++ s->v.csum_granularity_bits == U8_MAX); ++ s->v.csum_granularity_bits++; ++ } ++ ++ set_bkey_val_u64s(&s->k, u64s); ++} ++ ++/* Checksumming: */ ++ ++static void ec_generate_checksums(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned csums_per_device = stripe_csums_per_device(v); ++ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; ++ unsigned i, j; ++ ++ if (!csum_bytes) ++ return; ++ ++ BUG_ON(buf->offset); ++ BUG_ON(buf->size != le16_to_cpu(v->sectors)); ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ for (j = 0; j < csums_per_device; j++) { ++ unsigned offset = j << v->csum_granularity_bits; ++ unsigned len = min(csum_granularity, buf->size - offset); ++ ++ struct bch_csum csum = ++ bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[i] + (offset << 9), ++ len << 9); ++ ++ memcpy(stripe_csum(v, i, j), &csum, csum_bytes); ++ } ++ } ++} ++ ++static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; ++ unsigned i; ++ ++ if (!csum_bytes) ++ return; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ unsigned offset = buf->offset; ++ unsigned end = buf->offset + buf->size; ++ ++ if (!test_bit(i, buf->valid)) ++ continue; ++ ++ while (offset < end) { ++ unsigned j = offset >> v->csum_granularity_bits; ++ unsigned len = min(csum_granularity, end - offset); ++ struct bch_csum csum; ++ ++ BUG_ON(offset & (csum_granularity - 1)); ++ BUG_ON(offset + len != le16_to_cpu(v->sectors) && ++ ((offset + len) & (csum_granularity - 1))); ++ ++ csum = bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[i] + ((offset - buf->offset) << 9), ++ len << 9); ++ ++ if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { ++ __bcache_io_error(c, ++ "checksum error while doing reconstruct read (%u:%u)", ++ i, j); ++ clear_bit(i, buf->valid); ++ break; ++ } ++ ++ offset += len; ++ } ++ } ++} ++ ++/* Erasure coding: */ ++ ++static void ec_generate_ec(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = le16_to_cpu(v->sectors) << 9; ++ ++ raid_gen(nr_data, v->nr_redundant, bytes, buf->data); ++} ++ ++static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) ++{ ++ return nr - bitmap_weight(buf->valid, nr); ++} ++ ++static unsigned ec_nr_failed(struct ec_stripe_buf *buf) ++{ ++ return __ec_nr_failed(buf, buf->key.v.nr_blocks); ++} ++ ++static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = buf->size << 9; ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ return -1; ++ } ++ ++ for (i = 0; i < nr_data; i++) ++ if (!test_bit(i, buf->valid)) ++ failed[nr_failed++] = i; ++ ++ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); ++ return 0; ++} ++ ++/* IO: */ ++ ++static void ec_block_endio(struct bio *bio) ++{ 
++ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); ++ struct bch_dev *ca = ec_bio->ca; ++ struct closure *cl = bio->bi_private; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", ++ bio_data_dir(bio) ? "write" : "read", ++ blk_status_to_str(bio->bi_status))) ++ clear_bit(ec_bio->idx, ec_bio->buf->valid); ++ ++ bio_put(&ec_bio->bio); ++ percpu_ref_put(&ca->io_ref); ++ closure_put(cl); ++} ++ ++static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ++ unsigned rw, unsigned idx, struct closure *cl) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned offset = 0, bytes = buf->size << 9; ++ struct bch_extent_ptr *ptr = &v->ptrs[idx]; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (!bch2_dev_get_ioref(ca, rw)) { ++ clear_bit(idx, buf->valid); ++ return; ++ } ++ ++ while (offset < bytes) { ++ unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, ++ DIV_ROUND_UP(bytes, PAGE_SIZE)); ++ unsigned b = min_t(size_t, bytes - offset, ++ nr_iovecs << PAGE_SHIFT); ++ struct ec_bio *ec_bio; ++ ++ ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs, ++ &c->ec_bioset), ++ struct ec_bio, bio); ++ ++ ec_bio->ca = ca; ++ ec_bio->buf = buf; ++ ec_bio->idx = idx; ++ ++ bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev); ++ bio_set_op_attrs(&ec_bio->bio, rw, 0); ++ ++ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ++ ec_bio->bio.bi_end_io = ec_block_endio; ++ ec_bio->bio.bi_private = cl; ++ ++ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); ++ ++ closure_get(cl); ++ percpu_ref_get(&ca->io_ref); ++ ++ submit_bio(&ec_bio->bio); ++ ++ offset += b; ++ } ++ ++ percpu_ref_put(&ca->io_ref); ++} ++ ++/* recovery read path: */ ++int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct ec_stripe_buf *buf; ++ struct closure cl; ++ struct bkey_s_c k; ++ struct bch_stripe *v; ++ unsigned stripe_idx; ++ unsigned offset, end; ++ unsigned i, nr_data, csum_granularity; ++ int ret = 0, idx; ++ ++ closure_init_stack(&cl); ++ ++ BUG_ON(!rbio->pick.has_ec); ++ ++ stripe_idx = rbio->pick.ec.idx; ++ ++ buf = kzalloc(sizeof(*buf), GFP_NOIO); ++ if (!buf) ++ return -ENOMEM; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, ++ POS(0, stripe_idx), ++ BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: stripe not found"); ++ kfree(buf); ++ return bch2_trans_exit(&trans) ?: -EIO; ++ } ++ ++ bkey_reassemble(&buf->key.k_i, k); ++ bch2_trans_exit(&trans); ++ ++ v = &buf->key.v; ++ ++ nr_data = v->nr_blocks - v->nr_redundant; ++ ++ idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); ++ BUG_ON(idx < 0); ++ ++ csum_granularity = 1U << v->csum_granularity_bits; ++ ++ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; ++ end = offset + bio_sectors(&rbio->bio); ++ ++ BUG_ON(end > le16_to_cpu(v->sectors)); ++ ++ buf->offset = round_down(offset, csum_granularity); ++ buf->size = min_t(unsigned, le16_to_cpu(v->sectors), ++ round_up(end, csum_granularity)) - buf->offset; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); ++ if (!buf->data[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ memset(buf->valid, 0xFF, sizeof(buf->valid)); ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ struct bch_extent_ptr *ptr = v->ptrs + i; ++ struct bch_dev *ca = 
bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ptr_stale(ca, ptr)) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: stale pointer"); ++ clear_bit(i, buf->valid); ++ continue; ++ } ++ ++ ec_block_io(c, buf, REQ_OP_READ, i, &cl); ++ } ++ ++ closure_sync(&cl); ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ec_validate_checksums(c, buf); ++ ++ ret = ec_do_recov(c, buf); ++ if (ret) ++ goto err; ++ ++ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, ++ buf->data[idx] + ((offset - buf->offset) << 9)); ++err: ++ for (i = 0; i < v->nr_blocks; i++) ++ kfree(buf->data[i]); ++ kfree(buf); ++ return ret; ++} ++ ++/* stripe bucket accounting: */ ++ ++static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) ++{ ++ ec_stripes_heap n, *h = &c->ec_stripes_heap; ++ ++ if (idx >= h->size) { ++ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) ++ return -ENOMEM; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ if (n.size > h->size) { ++ memcpy(n.data, h->data, h->used * sizeof(h->data[0])); ++ n.used = h->used; ++ swap(*h, n); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ free_heap(&n); ++ } ++ ++ if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) ++ return -ENOMEM; ++ ++ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && ++ !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int ec_stripe_mem_alloc(struct bch_fs *c, ++ struct btree_iter *iter) ++{ ++ size_t idx = iter->pos.offset; ++ int ret = 0; ++ ++ if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) ++ return ret; ++ ++ bch2_trans_unlock(iter->trans); ++ ret = -EINTR; ++ ++ if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) ++ return ret; ++ ++ return -ENOMEM; ++} ++ ++static ssize_t stripe_idx_to_delete(struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ ++ return h->used && h->data[0].blocks_nonempty == 0 ++ ? 
h->data[0].idx : -1; ++} ++ ++static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, ++ struct ec_stripe_heap_entry l, ++ struct ec_stripe_heap_entry r) ++{ ++ return ((l.blocks_nonempty > r.blocks_nonempty) - ++ (l.blocks_nonempty < r.blocks_nonempty)); ++} ++ ++static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, ++ size_t i) ++{ ++ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); ++ ++ genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; ++} ++ ++static void heap_verify_backpointer(struct bch_fs *c, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m = genradix_ptr(&c->stripes[0], idx); ++ ++ BUG_ON(!m->alive); ++ BUG_ON(m->heap_idx >= h->used); ++ BUG_ON(h->data[m->heap_idx].idx != idx); ++} ++ ++void bch2_stripes_heap_update(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ size_t i; ++ ++ if (m->alive) { ++ heap_verify_backpointer(c, idx); ++ ++ h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; ++ ++ i = m->heap_idx; ++ heap_sift_up(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ heap_sift_down(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++ } else { ++ bch2_stripes_heap_insert(c, m, idx); ++ } ++ ++ if (stripe_idx_to_delete(c) >= 0 && ++ !percpu_ref_is_dying(&c->writes)) ++ schedule_work(&c->ec_stripe_delete_work); ++} ++ ++void bch2_stripes_heap_del(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ heap_verify_backpointer(c, idx); ++ ++ m->alive = false; ++ heap_del(&c->ec_stripes_heap, m->heap_idx, ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++} ++ ++void bch2_stripes_heap_insert(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ BUG_ON(heap_full(&c->ec_stripes_heap)); ++ ++ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { ++ .idx = idx, ++ .blocks_nonempty = m->blocks_nonempty, ++ }), ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ m->alive = true; ++ ++ heap_verify_backpointer(c, idx); ++} ++ ++/* stripe deletion */ ++ ++static int ec_stripe_delete(struct bch_fs *c, size_t idx) ++{ ++ return bch2_btree_delete_range(c, BTREE_ID_EC, ++ POS(0, idx), ++ POS(0, idx + 1), ++ NULL); ++} ++ ++static void ec_stripe_delete_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, ec_stripe_delete_work); ++ ssize_t idx; ++ ++ down_read(&c->gc_lock); ++ mutex_lock(&c->ec_stripe_create_lock); ++ ++ while (1) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ idx = stripe_idx_to_delete(c); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ if (idx < 0) ++ break; ++ ++ if (ec_stripe_delete(c, idx)) ++ break; ++ } ++ ++ mutex_unlock(&c->ec_stripe_create_lock); ++ up_read(&c->gc_lock); ++} ++ ++/* stripe creation: */ ++ ++static int ec_stripe_bkey_insert(struct bch_fs *c, ++ struct bkey_i_stripe *stripe) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bpos start_pos = POS(0, c->ec_stripe_hint); ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { ++ if (start_pos.offset) { ++ start_pos = POS_MIN; ++ bch2_btree_iter_set_pos(iter, start_pos); ++ continue; ++ } ++ ++ ret = -ENOSPC; ++ break; ++ } ++ ++ if (bkey_deleted(k.k)) ++ goto found_slot; ++ } ++ ++ goto 
err; ++found_slot: ++ start_pos = iter->pos; ++ ++ ret = ec_stripe_mem_alloc(c, iter); ++ if (ret) ++ goto err; ++ ++ stripe->k.p = iter->pos; ++ ++ bch2_trans_update(&trans, iter, &stripe->k_i, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ bch2_trans_iter_put(&trans, iter); ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ c->ec_stripe_hint = ret ? start_pos.offset : start_pos.offset + 1; ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static void extent_stripe_ptr_add(struct bkey_s_extent e, ++ struct ec_stripe_buf *s, ++ struct bch_extent_ptr *ptr, ++ unsigned block) ++{ ++ struct bch_extent_stripe_ptr *dst = (void *) ptr; ++ union bch_extent_entry *end = extent_entry_last(e); ++ ++ memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); ++ e.k->u64s += sizeof(*dst) / sizeof(u64); ++ ++ *dst = (struct bch_extent_stripe_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, ++ .block = block, ++ .idx = s->key.k.p.offset, ++ }; ++} ++ ++static int ec_stripe_update_ptrs(struct bch_fs *c, ++ struct ec_stripe_buf *s, ++ struct bkey *pos) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_extent e; ++ struct bkey_on_stack sk; ++ int ret = 0, dev, idx; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ bkey_start_pos(pos), ++ BTREE_ITER_INTENT); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { ++ struct bch_extent_ptr *ptr, *ec_ptr = NULL; ++ ++ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ idx = extent_matches_stripe(c, &s->key.v, k); ++ if (idx < 0) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ dev = s->key.v.ptrs[idx].dev; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ e = bkey_i_to_s_extent(sk.k); ++ ++ extent_for_each_ptr(e, ptr) { ++ if (ptr->dev == dev) ++ ec_ptr = ptr; ++ else ++ ptr->cached = true; ++ } ++ ++ extent_stripe_ptr_add(e, s, ec_ptr, idx); ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); ++ bch2_trans_update(&trans, iter, sk.k, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ ++ return ret; ++} ++ ++/* ++ * data buckets of new stripe all written: create the stripe ++ */ ++static void ec_stripe_create(struct ec_stripe_new *s) ++{ ++ struct bch_fs *c = s->c; ++ struct open_bucket *ob; ++ struct bkey_i *k; ++ struct bch_stripe *v = &s->stripe.key.v; ++ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; ++ struct closure cl; ++ int ret; ++ ++ BUG_ON(s->h->s == s); ++ ++ closure_init_stack(&cl); ++ ++ if (s->err) { ++ bch_err(c, "error creating stripe: error writing data buckets"); ++ goto err; ++ } ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ goto err; ++ ++ BUG_ON(bitmap_weight(s->blocks_allocated, ++ s->blocks.nr) != s->blocks.nr); ++ ++ ec_generate_ec(&s->stripe); ++ ++ ec_generate_checksums(&s->stripe); ++ ++ /* write p/q: */ ++ for (i = nr_data; i < v->nr_blocks; i++) ++ ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); ++ ++ closure_sync(&cl); ++ ++ for (i = nr_data; i < v->nr_blocks; i++) ++ if (!test_bit(i, s->stripe.valid)) { ++ bch_err(c, "error creating stripe: error writing redundancy buckets"); ++ goto err_put_writes; ++ } ++ ++ 
mutex_lock(&c->ec_stripe_create_lock); ++ ++ ret = ec_stripe_bkey_insert(c, &s->stripe.key); ++ if (ret) { ++ bch_err(c, "error creating stripe: error creating stripe key"); ++ goto err_unlock; ++ } ++ ++ for_each_keylist_key(&s->keys, k) { ++ ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); ++ if (ret) ++ break; ++ } ++ ++err_unlock: ++ mutex_unlock(&c->ec_stripe_create_lock); ++err_put_writes: ++ percpu_ref_put(&c->writes); ++err: ++ open_bucket_for_each(c, &s->blocks, ob, i) { ++ ob->ec = NULL; ++ __bch2_open_bucket_put(c, ob); ++ } ++ ++ bch2_open_buckets_put(c, &s->parity); ++ ++ bch2_keylist_free(&s->keys, s->inline_keys); ++ ++ mutex_lock(&s->h->lock); ++ list_del(&s->list); ++ mutex_unlock(&s->h->lock); ++ ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) ++ kvpfree(s->stripe.data[i], s->stripe.size << 9); ++ kfree(s); ++} ++ ++static struct ec_stripe_new *ec_stripe_set_pending(struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s = h->s; ++ ++ list_add(&s->list, &h->stripes); ++ h->s = NULL; ++ ++ return s; ++} ++ ++static void ec_stripe_new_put(struct ec_stripe_new *s) ++{ ++ BUG_ON(atomic_read(&s->pin) <= 0); ++ if (atomic_dec_and_test(&s->pin)) ++ ec_stripe_create(s); ++} ++ ++/* have a full bucket - hand it off to be erasure coded: */ ++void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ if (ob->sectors_free) ++ s->err = -1; ++ ++ ec_stripe_new_put(s); ++} ++ ++void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ s->err = -EIO; ++} ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); ++ struct bch_dev *ca; ++ unsigned offset; ++ ++ if (!ob) ++ return NULL; ++ ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ offset = ca->mi.bucket_size - ob->sectors_free; ++ ++ return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); ++} ++ ++void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, ++ struct bpos pos, unsigned sectors) ++{ ++ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); ++ struct ec_stripe_new *ec; ++ ++ if (!ob) ++ return; ++ ++ ec = ob->ec; ++ mutex_lock(&ec->lock); ++ ++ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, ++ ARRAY_SIZE(ec->inline_keys), ++ BKEY_U64s)) { ++ BUG(); ++ } ++ ++ bkey_init(&ec->keys.top->k); ++ ec->keys.top->k.p = pos; ++ bch2_key_resize(&ec->keys.top->k, sectors); ++ bch2_keylist_push(&ec->keys); ++ ++ mutex_unlock(&ec->lock); ++} ++ ++static int unsigned_cmp(const void *_l, const void *_r) ++{ ++ unsigned l = *((const unsigned *) _l); ++ unsigned r = *((const unsigned *) _r); ++ ++ return cmp_int(l, r); ++} ++ ++/* pick most common bucket size: */ ++static unsigned pick_blocksize(struct bch_fs *c, ++ struct bch_devs_mask *devs) ++{ ++ struct bch_dev *ca; ++ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; ++ struct { ++ unsigned nr, size; ++ } cur = { 0, 0 }, best = { 0, 0 }; ++ ++ for_each_member_device_rcu(ca, c, i, devs) ++ sizes[nr++] = ca->mi.bucket_size; ++ ++ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); ++ ++ for (i = 0; i < nr; i++) { ++ if (sizes[i] != cur.size) { ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ cur.nr = 0; ++ cur.size = sizes[i]; ++ } ++ ++ cur.nr++; ++ } ++ ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ return best.size; ++} ++ ++int bch2_ec_stripe_new_alloc(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s; ++ unsigned i; ++ ++ BUG_ON(h->parity.nr != 
h->redundancy); ++ BUG_ON(!h->blocks.nr); ++ BUG_ON(h->parity.nr + h->blocks.nr > EC_STRIPE_MAX); ++ lockdep_assert_held(&h->lock); ++ ++ s = kzalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ mutex_init(&s->lock); ++ atomic_set(&s->pin, 1); ++ s->c = c; ++ s->h = h; ++ s->blocks = h->blocks; ++ s->parity = h->parity; ++ ++ memset(&h->blocks, 0, sizeof(h->blocks)); ++ memset(&h->parity, 0, sizeof(h->parity)); ++ ++ bch2_keylist_init(&s->keys, s->inline_keys); ++ ++ s->stripe.offset = 0; ++ s->stripe.size = h->blocksize; ++ memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); ++ ++ ec_stripe_key_init(c, &s->stripe.key, ++ &s->blocks, &s->parity, ++ h->blocksize); ++ ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { ++ s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); ++ if (!s->stripe.data[i]) ++ goto err; ++ } ++ ++ h->s = s; ++ ++ return 0; ++err: ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) ++ kvpfree(s->stripe.data[i], s->stripe.size << 9); ++ kfree(s); ++ return -ENOMEM; ++} ++ ++static struct ec_stripe_head * ++ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, ++ unsigned algo, unsigned redundancy) ++{ ++ struct ec_stripe_head *h; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ h = kzalloc(sizeof(*h), GFP_KERNEL); ++ if (!h) ++ return NULL; ++ ++ mutex_init(&h->lock); ++ mutex_lock(&h->lock); ++ INIT_LIST_HEAD(&h->stripes); ++ ++ h->target = target; ++ h->algo = algo; ++ h->redundancy = redundancy; ++ ++ rcu_read_lock(); ++ h->devs = target_rw_devs(c, BCH_DATA_USER, target); ++ ++ for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (!ca->mi.durability) ++ __clear_bit(i, h->devs.d); ++ ++ h->blocksize = pick_blocksize(c, &h->devs); ++ ++ for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (ca->mi.bucket_size == h->blocksize) ++ h->nr_active_devs++; ++ ++ rcu_read_unlock(); ++ list_add(&h->list, &c->ec_new_stripe_list); ++ return h; ++} ++ ++void bch2_ec_stripe_head_put(struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s = NULL; ++ ++ if (h->s && ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->blocks.nr) == h->s->blocks.nr) ++ s = ec_stripe_set_pending(h); ++ ++ mutex_unlock(&h->lock); ++ ++ if (s) ++ ec_stripe_new_put(s); ++} ++ ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy) ++{ ++ struct ec_stripe_head *h; ++ ++ if (!redundancy) ++ return NULL; ++ ++ mutex_lock(&c->ec_new_stripe_lock); ++ list_for_each_entry(h, &c->ec_new_stripe_list, list) ++ if (h->target == target && ++ h->algo == algo && ++ h->redundancy == redundancy) { ++ mutex_lock(&h->lock); ++ goto found; ++ } ++ ++ h = ec_new_stripe_head_alloc(c, target, algo, redundancy); ++found: ++ mutex_unlock(&c->ec_new_stripe_lock); ++ return h; ++} ++ ++void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ mutex_lock(&c->ec_new_stripe_lock); ++ list_for_each_entry(h, &c->ec_new_stripe_list, list) { ++ struct ec_stripe_new *s = NULL; ++ ++ mutex_lock(&h->lock); ++ bch2_open_buckets_stop_dev(c, ca, &h->blocks); ++ bch2_open_buckets_stop_dev(c, ca, &h->parity); ++ ++ if (!h->s) ++ goto unlock; ++ ++ open_bucket_for_each(c, &h->s->blocks, ob, i) ++ if (ob->ptr.dev == ca->dev_idx) ++ goto found; ++ open_bucket_for_each(c, &h->s->parity, ob, i) ++ if (ob->ptr.dev == ca->dev_idx) ++ goto found; ++ goto unlock; ++found: ++ h->s->err = -1; ++ s = ec_stripe_set_pending(h); ++unlock: ++ mutex_unlock(&h->lock); ++ ++ if 
(s) ++ ec_stripe_new_put(s); ++ } ++ mutex_unlock(&c->ec_new_stripe_lock); ++} ++ ++static int __bch2_stripe_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct stripe *m, ++ size_t idx, ++ struct bkey_i_stripe *new_key) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ unsigned i; ++ int ret; ++ ++ bch2_btree_iter_set_pos(iter, POS(0, idx)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_stripe) ++ return -EIO; ++ ++ bkey_reassemble(&new_key->k_i, k); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ for (i = 0; i < new_key->v.nr_blocks; i++) ++ stripe_blockcount_set(&new_key->v, i, ++ m->block_sectors[i]); ++ m->dirty = false; ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ bch2_trans_update(trans, iter, &new_key->k_i, 0); ++ return 0; ++} ++ ++int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct genradix_iter giter; ++ struct bkey_i_stripe *new_key; ++ struct stripe *m; ++ int ret = 0; ++ ++ new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); ++ BUG_ON(!new_key); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ genradix_for_each(&c->stripes[0], giter, m) { ++ if (!m->dirty) ++ continue; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|flags, ++ __bch2_stripe_write_key(&trans, iter, m, ++ giter.pos, new_key)); ++ ++ if (ret) ++ break; ++ ++ *wrote = true; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ kfree(new_key); ++ ++ return ret; ++} ++ ++static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k) ++{ ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_stripe) ++ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: ++ bch2_mark_key(c, k, 0, 0, NULL, 0, ++ BTREE_TRIGGER_ALLOC_READ| ++ BTREE_TRIGGER_NOATOMIC); ++ ++ return ret; ++} ++ ++int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) ++{ ++ int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, ++ NULL, bch2_stripes_read_fn); ++ if (ret) ++ bch_err(c, "error reading stripes: %i", ret); ++ ++ return ret; ++} ++ ++int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ size_t i, idx = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); ++ ++ k = bch2_btree_iter_prev(iter); ++ if (!IS_ERR_OR_NULL(k.k)) ++ idx = k.k->p.offset + 1; ++ ret = bch2_trans_exit(&trans); ++ if (ret) ++ return ret; ++ ++ if (!idx) ++ return 0; ++ ++ if (!gc && ++ !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), ++ GFP_KERNEL)) ++ return -ENOMEM; ++#if 0 ++ ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); ++#else ++ for (i = 0; i < idx; i++) ++ if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) ++ return -ENOMEM; ++#endif ++ return 0; ++} ++ ++void bch2_fs_ec_exit(struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ ++ while (1) { ++ mutex_lock(&c->ec_new_stripe_lock); ++ h = list_first_entry_or_null(&c->ec_new_stripe_list, ++ struct ec_stripe_head, list); ++ if (h) ++ list_del(&h->list); ++ mutex_unlock(&c->ec_new_stripe_lock); ++ if (!h) ++ break; ++ ++ BUG_ON(h->s); ++ BUG_ON(!list_empty(&h->stripes)); ++ kfree(h); ++ } ++ ++ free_heap(&c->ec_stripes_heap); ++ 
genradix_free(&c->stripes[0]); ++ bioset_exit(&c->ec_bioset); ++} ++ ++int bch2_fs_ec_init(struct bch_fs *c) ++{ ++ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); ++ ++ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), ++ BIOSET_NEED_BVECS); ++} +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +new file mode 100644 +index 000000000000..4dfaac034886 +--- /dev/null ++++ b/fs/bcachefs/ec.h +@@ -0,0 +1,163 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_H ++#define _BCACHEFS_EC_H ++ ++#include "ec_types.h" ++#include "keylist_types.h" ++ ++const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++#define bch2_bkey_ops_stripe (struct bkey_ops) { \ ++ .key_invalid = bch2_stripe_invalid, \ ++ .val_to_text = bch2_stripe_to_text, \ ++ .swab = bch2_ptr_swab, \ ++} ++ ++static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(le16_to_cpu(s->sectors), ++ 1 << s->csum_granularity_bits); ++} ++ ++static inline unsigned stripe_csum_offset(const struct bch_stripe *s, ++ unsigned dev, unsigned csum_idx) ++{ ++ unsigned csum_bytes = bch_crc_bytes[s->csum_type]; ++ ++ return sizeof(struct bch_stripe) + ++ sizeof(struct bch_extent_ptr) * s->nr_blocks + ++ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; ++} ++ ++static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return stripe_csum_offset(s, s->nr_blocks, 0) + ++ sizeof(u16) * idx; ++} ++ ++static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); ++} ++ ++static inline void stripe_blockcount_set(struct bch_stripe *s, ++ unsigned idx, unsigned v) ++{ ++ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); ++ ++ *p = cpu_to_le16(v); ++} ++ ++static inline unsigned stripe_val_u64s(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), ++ sizeof(u64)); ++} ++ ++static inline void *stripe_csum(struct bch_stripe *s, ++ unsigned dev, unsigned csum_idx) ++{ ++ return (void *) s + stripe_csum_offset(s, dev, csum_idx); ++} ++ ++struct bch_read_bio; ++ ++struct ec_stripe_buf { ++ /* might not be buffering the entire stripe: */ ++ unsigned offset; ++ unsigned size; ++ unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; ++ ++ void *data[EC_STRIPE_MAX]; ++ ++ union { ++ struct bkey_i_stripe key; ++ u64 pad[255]; ++ }; ++}; ++ ++struct ec_stripe_head; ++ ++struct ec_stripe_new { ++ struct bch_fs *c; ++ struct ec_stripe_head *h; ++ struct mutex lock; ++ struct list_head list; ++ ++ /* counts in flight writes, stripe is created when pin == 0 */ ++ atomic_t pin; ++ ++ int err; ++ ++ unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; ++ ++ struct open_buckets blocks; ++ struct open_buckets parity; ++ ++ struct keylist keys; ++ u64 inline_keys[BKEY_U64s * 8]; ++ ++ struct ec_stripe_buf stripe; ++}; ++ ++struct ec_stripe_head { ++ struct list_head list; ++ struct mutex lock; ++ ++ struct list_head stripes; ++ ++ unsigned target; ++ unsigned algo; ++ unsigned redundancy; ++ ++ struct bch_devs_mask devs; ++ unsigned nr_active_devs; ++ ++ unsigned blocksize; ++ ++ struct dev_stripe_state block_stripe; ++ struct dev_stripe_state parity_stripe; ++ ++ struct open_buckets blocks; ++ struct open_buckets parity; ++ ++ struct ec_stripe_new *s; ++}; ++ ++int 
bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); ++void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, ++ struct bpos, unsigned); ++ ++void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); ++void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); ++ ++int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); ++ ++void bch2_ec_stripe_head_put(struct ec_stripe_head *); ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, ++ unsigned, unsigned); ++ ++void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); ++void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); ++void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); ++ ++void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); ++ ++void bch2_ec_flush_new_stripes(struct bch_fs *); ++ ++struct journal_keys; ++int bch2_stripes_read(struct bch_fs *, struct journal_keys *); ++int bch2_stripes_write(struct bch_fs *, unsigned, bool *); ++ ++int bch2_ec_mem_alloc(struct bch_fs *, bool); ++ ++void bch2_fs_ec_exit(struct bch_fs *); ++int bch2_fs_ec_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_EC_H */ +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +new file mode 100644 +index 000000000000..5c3f77c8aac7 +--- /dev/null ++++ b/fs/bcachefs/ec_types.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_TYPES_H ++#define _BCACHEFS_EC_TYPES_H ++ ++#include ++ ++#define EC_STRIPE_MAX 16 ++ ++struct bch_replicas_padded { ++ struct bch_replicas_entry e; ++ u8 pad[EC_STRIPE_MAX]; ++}; ++ ++struct stripe { ++ size_t heap_idx; ++ ++ u16 sectors; ++ u8 algorithm; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; ++ unsigned dirty:1; ++ u8 blocks_nonempty; ++ u16 block_sectors[EC_STRIPE_MAX]; ++ ++ struct bch_replicas_padded r; ++}; ++ ++struct ec_stripe_heap_entry { ++ size_t idx; ++ unsigned blocks_nonempty; ++}; ++ ++typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; ++ ++#endif /* _BCACHEFS_EC_TYPES_H */ +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +new file mode 100644 +index 000000000000..cd46706fb6f5 +--- /dev/null ++++ b/fs/bcachefs/error.c +@@ -0,0 +1,172 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "error.h" ++#include "io.h" ++#include "super.h" ++ ++#define FSCK_ERR_RATELIMIT_NR 10 ++ ++bool bch2_inconsistent_error(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_ERROR, &c->flags); ++ ++ switch (c->opts.errors) { ++ case BCH_ON_ERROR_CONTINUE: ++ return false; ++ case BCH_ON_ERROR_RO: ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only"); ++ return true; ++ case BCH_ON_ERROR_PANIC: ++ panic(bch2_fmt(c, "panic after error")); ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_fatal_error(struct bch_fs *c) ++{ ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only"); ++} ++ ++void bch2_io_error_work(struct work_struct *work) ++{ ++ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); ++ struct bch_fs *c = ca->fs; ++ bool dev; ++ ++ down_write(&c->state_lock); ++ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, ++ BCH_FORCE_IF_DEGRADED); ++ if (dev ++ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, ++ BCH_FORCE_IF_DEGRADED) ++ : bch2_fs_emergency_read_only(c)) ++ bch_err(ca, ++ "too many IO errors, setting %s RO", ++ dev ? 
"device" : "filesystem"); ++ up_write(&c->state_lock); ++} ++ ++void bch2_io_error(struct bch_dev *ca) ++{ ++ //queue_work(system_long_wq, &ca->io_error_work); ++} ++ ++#ifdef __KERNEL__ ++#define ask_yn() false ++#else ++#include "tools-util.h" ++#endif ++ ++enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, ++ const char *fmt, ...) ++{ ++ struct fsck_err_state *s = NULL; ++ va_list args; ++ bool fix = false, print = true, suppressing = false; ++ char _buf[sizeof(s->buf)], *buf = _buf; ++ ++ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { ++ va_start(args, fmt); ++ vprintk(fmt, args); ++ va_end(args); ++ ++ return bch2_inconsistent_error(c) ++ ? FSCK_ERR_EXIT ++ : FSCK_ERR_FIX; ++ } ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry(s, &c->fsck_errors, list) ++ if (s->fmt == fmt) ++ goto found; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS); ++ if (!s) { ++ if (!c->fsck_alloc_err) ++ bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); ++ c->fsck_alloc_err = true; ++ buf = _buf; ++ goto print; ++ } ++ ++ INIT_LIST_HEAD(&s->list); ++ s->fmt = fmt; ++found: ++ list_move(&s->list, &c->fsck_errors); ++ s->nr++; ++ if (c->opts.ratelimit_errors && ++ s->nr >= FSCK_ERR_RATELIMIT_NR) { ++ if (s->nr == FSCK_ERR_RATELIMIT_NR) ++ suppressing = true; ++ else ++ print = false; ++ } ++ buf = s->buf; ++print: ++ va_start(args, fmt); ++ vscnprintf(buf, sizeof(_buf), fmt, args); ++ va_end(args); ++ ++ if (c->opts.fix_errors == FSCK_OPT_EXIT) { ++ bch_err(c, "%s, exiting", buf); ++ } else if (flags & FSCK_CAN_FIX) { ++ if (c->opts.fix_errors == FSCK_OPT_ASK) { ++ printk(KERN_ERR "%s: fix?", buf); ++ fix = ask_yn(); ++ } else if (c->opts.fix_errors == FSCK_OPT_YES || ++ (c->opts.nochanges && ++ !(flags & FSCK_CAN_IGNORE))) { ++ if (print) ++ bch_err(c, "%s, fixing", buf); ++ fix = true; ++ } else { ++ if (print) ++ bch_err(c, "%s, not fixing", buf); ++ fix = false; ++ } ++ } else if (flags & FSCK_NEED_FSCK) { ++ if (print) ++ bch_err(c, "%s (run fsck to correct)", buf); ++ } else { ++ if (print) ++ bch_err(c, "%s (repair unimplemented)", buf); ++ } ++ ++ if (suppressing) ++ bch_err(c, "Ratelimiting new instances of previous error"); ++ ++ mutex_unlock(&c->fsck_error_lock); ++ ++ if (fix) { ++ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); ++ return FSCK_ERR_FIX; ++ } else { ++ set_bit(BCH_FS_ERROR, &c->flags); ++ return c->opts.fix_errors == FSCK_OPT_EXIT || ++ !(flags & FSCK_CAN_IGNORE) ++ ? FSCK_ERR_EXIT ++ : FSCK_ERR_IGNORE; ++ } ++} ++ ++void bch2_flush_fsck_errs(struct bch_fs *c) ++{ ++ struct fsck_err_state *s, *n; ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { ++ if (s->ratelimited) ++ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); ++ ++ list_del(&s->list); ++ kfree(s); ++ } ++ ++ mutex_unlock(&c->fsck_error_lock); ++} +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +new file mode 100644 +index 000000000000..94b53312fbbd +--- /dev/null ++++ b/fs/bcachefs/error.h +@@ -0,0 +1,211 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERROR_H ++#define _BCACHEFS_ERROR_H ++ ++#include ++#include ++ ++struct bch_dev; ++struct bch_fs; ++struct work_struct; ++ ++/* ++ * XXX: separate out errors that indicate on disk data is inconsistent, and flag ++ * superblock as such ++ */ ++ ++/* Error messages: */ ++ ++/* ++ * Inconsistency errors: The on disk data is inconsistent. 
If these occur during ++ * initial recovery, they don't indicate a bug in the running code - we walk all ++ * the metadata before modifying anything. If they occur at runtime, they ++ * indicate either a bug in the running code or (less likely) data is being ++ * silently corrupted under us. ++ * ++ * XXX: audit all inconsistent errors and make sure they're all recoverable, in ++ * BCH_ON_ERROR_CONTINUE mode ++ */ ++ ++bool bch2_inconsistent_error(struct bch_fs *); ++ ++#define bch2_fs_inconsistent(c, ...) \ ++({ \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_inconsistent_error(c); \ ++}) ++ ++#define bch2_fs_inconsistent_on(cond, c, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_fs_inconsistent(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Later we might want to mark only the particular device inconsistent, not the ++ * entire filesystem: ++ */ ++ ++#define bch2_dev_inconsistent(ca, ...) \ ++do { \ ++ bch_err(ca, __VA_ARGS__); \ ++ bch2_inconsistent_error((ca)->fs); \ ++} while (0) ++ ++#define bch2_dev_inconsistent_on(cond, ca, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_inconsistent(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Fsck errors: inconsistency errors we detect at mount time, and should ideally ++ * be able to repair: ++ */ ++ ++enum { ++ BCH_FSCK_OK = 0, ++ BCH_FSCK_ERRORS_NOT_FIXED = 1, ++ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, ++ BCH_FSCK_REPAIR_IMPOSSIBLE = 3, ++ BCH_FSCK_UNKNOWN_VERSION = 4, ++}; ++ ++enum fsck_err_opts { ++ FSCK_OPT_EXIT, ++ FSCK_OPT_YES, ++ FSCK_OPT_NO, ++ FSCK_OPT_ASK, ++}; ++ ++enum fsck_err_ret { ++ FSCK_ERR_IGNORE = 0, ++ FSCK_ERR_FIX = 1, ++ FSCK_ERR_EXIT = 2, ++}; ++ ++struct fsck_err_state { ++ struct list_head list; ++ const char *fmt; ++ u64 nr; ++ bool ratelimited; ++ char buf[512]; ++}; ++ ++#define FSCK_CAN_FIX (1 << 0) ++#define FSCK_CAN_IGNORE (1 << 1) ++#define FSCK_NEED_FSCK (1 << 2) ++ ++__printf(3, 4) __cold ++enum fsck_err_ret bch2_fsck_err(struct bch_fs *, ++ unsigned, const char *, ...); ++void bch2_flush_fsck_errs(struct bch_fs *); ++ ++#define __fsck_err(c, _flags, msg, ...) \ ++({ \ ++ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ ++ \ ++ if (_fix == FSCK_ERR_EXIT) { \ ++ bch_err(c, "Unable to continue, halting"); \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ \ ++ _fix; \ ++}) ++ ++/* These macros return true if error should be fixed: */ ++ ++/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ ++ ++#define __fsck_err_on(cond, c, _flags, ...) \ ++ ((cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) ++ ++#define need_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define need_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++#define fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++/* ++ * Fatal errors: these don't indicate a bug, but we can't continue running in RW ++ * mode - pretty much just due to metadata IO errors: ++ */ ++ ++void bch2_fatal_error(struct bch_fs *); ++ ++#define bch2_fs_fatal_error(c, ...) 
\ ++do { \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_fatal_error(c); \ ++} while (0) ++ ++#define bch2_fs_fatal_err_on(cond, c, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_fs_fatal_error(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * IO errors: either recoverable metadata IO (because we have replicas), or data ++ * IO - we need to log it and print out a message, but we don't (necessarily) ++ * want to shut down the fs: ++ */ ++ ++void bch2_io_error_work(struct work_struct *); ++ ++/* Does the error handling without logging a message */ ++void bch2_io_error(struct bch_dev *); ++ ++/* Logs message and handles the error: */ ++#define bch2_dev_io_error(ca, fmt, ...) \ ++do { \ ++ printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ ++ "IO error on %s for " fmt), \ ++ (ca)->name, ##__VA_ARGS__); \ ++ bch2_io_error(ca); \ ++} while (0) ++ ++#define bch2_dev_io_err_on(cond, ca, ...) \ ++({ \ ++ bool _ret = (cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_io_error(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* kill? */ ++ ++#define __bcache_io_error(c, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt(c, \ ++ "IO error: " fmt), ##__VA_ARGS__) ++ ++#define bcache_io_error(c, bio, fmt, ...) \ ++do { \ ++ __bcache_io_error(c, fmt, ##__VA_ARGS__); \ ++ (bio)->bi_status = BLK_STS_IOERR; \ ++} while (0) ++ ++#endif /* _BCACHEFS_ERROR_H */ +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +new file mode 100644 +index 000000000000..fd011df3cb99 +--- /dev/null ++++ b/fs/bcachefs/extent_update.c +@@ -0,0 +1,229 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "debug.h" ++#include "extents.h" ++#include "extent_update.h" ++ ++/* ++ * This counts the number of iterators to the alloc & ec btrees we'll need ++ * inserting/removing this extent: ++ */ ++static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ unsigned ret = 0; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ret++; ++ } ++ } ++ ++ return ret; ++} ++ ++static int count_iters_for_insert(struct btree_trans *trans, ++ struct bkey_s_c k, ++ unsigned offset, ++ struct bpos *end, ++ unsigned *nr_iters, ++ unsigned max_iters) ++{ ++ int ret = 0, ret2 = 0; ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ break; ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx = le64_to_cpu(p.v->idx); ++ unsigned sectors = bpos_min(*end, p.k->p).offset - ++ bkey_start_offset(p.k); ++ struct btree_iter *iter; ++ struct bkey_s_c r_k; ++ ++ for_each_btree_key(trans, iter, ++ BTREE_ID_REFLINK, POS(0, idx + offset), ++ BTREE_ITER_SLOTS, r_k, ret2) { ++ if (bkey_cmp(bkey_start_pos(r_k.k), ++ POS(0, idx + sectors)) >= 0) ++ break; ++ ++ /* extent_update_to_keys(), for the reflink_v update */ ++ *nr_iters += 1; ++ ++ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); ++ ++ if (*nr_iters >= max_iters) { ++ struct bpos pos = bkey_start_pos(k.k); ++ pos.offset += min_t(u64, k.k->size, ++ r_k.k->p.offset - idx); 
++ ++ *end = bpos_min(*end, pos); ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ break; ++ } ++ } ++ ++ return ret2 ?: ret; ++} ++ ++#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) ++ ++int bch2_extent_atomic_end(struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bpos *end) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey_packed *_k; ++ unsigned nr_iters = 0; ++ int ret; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ b = iter->l[0].b; ++ node_iter = iter->l[0].iter; ++ ++ BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && ++ bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_predecessor(b->data->min_key)) < 0); ++ ++ *end = bpos_min(insert->k.p, b->key.k.p); ++ ++ /* extent_update_to_keys(): */ ++ nr_iters += 1; ++ ++ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, ++ &nr_iters, EXTENT_ITERS_MAX / 2); ++ if (ret < 0) ++ return ret; ++ ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ++ unsigned offset = 0; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), *end) >= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_start_pos(k.k)) > 0) ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ ++ /* extent_handle_overwrites(): */ ++ switch (bch2_extent_overlap(&insert->k, k.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ case BCH_EXTENT_OVERLAP_FRONT: ++ nr_iters += 1; ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ nr_iters += 2; ++ break; ++ } ++ ++ ret = count_iters_for_insert(trans, k, offset, end, ++ &nr_iters, EXTENT_ITERS_MAX); ++ if (ret) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return ret < 0 ? ret : 0; ++} ++ ++int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(iter, k, &end); ++ if (ret) ++ return ret; ++ ++ bch2_cut_back(end, k); ++ return 0; ++} ++ ++int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(iter, k, &end); ++ if (ret) ++ return ret; ++ ++ return !bkey_cmp(end, k->k.p); ++} ++ ++enum btree_insert_ret ++bch2_extent_can_insert(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct btree_node_iter node_iter = l->iter; ++ struct bkey_packed *_k; ++ struct bkey_s_c k; ++ struct bkey unpacked; ++ int sectors; ++ ++ _k = bch2_btree_node_iter_peek(&node_iter, l->b); ++ if (!_k) ++ return BTREE_INSERT_OK; ++ ++ k = bkey_disassemble(l->b, _k, &unpacked); ++ ++ /* Check if we're splitting a compressed extent: */ ++ ++ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && ++ bkey_cmp(insert->k.p, k.k->p) < 0 && ++ (sectors = bch2_bkey_sectors_compressed(k))) { ++ int flags = trans->flags & BTREE_INSERT_NOFAIL ++ ? 
BCH_DISK_RESERVATION_NOFAIL : 0; ++ ++ switch (bch2_disk_reservation_add(trans->c, trans->disk_res, ++ sectors, flags)) { ++ case 0: ++ break; ++ case -ENOSPC: ++ return BTREE_INSERT_ENOSPC; ++ default: ++ BUG(); ++ } ++ } ++ ++ return BTREE_INSERT_OK; ++} +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +new file mode 100644 +index 000000000000..38dc084627d2 +--- /dev/null ++++ b/fs/bcachefs/extent_update.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENT_UPDATE_H ++#define _BCACHEFS_EXTENT_UPDATE_H ++ ++#include "bcachefs.h" ++ ++int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, ++ struct bpos *); ++int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); ++int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); ++ ++enum btree_insert_ret ++bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *); ++ ++#endif /* _BCACHEFS_EXTENT_UPDATE_H */ +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +new file mode 100644 +index 000000000000..251d4af773a5 +--- /dev/null ++++ b/fs/bcachefs/extents.c +@@ -0,0 +1,1268 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * ++ * Code for managing the extent btree and dynamically updating the writeback ++ * dirty sector count. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++static unsigned bch2_crc_field_size_max[] = { ++ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, ++}; ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *, ++ struct bch_extent_crc_unpacked, ++ enum bch_extent_entry_type); ++ ++static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, ++ unsigned dev) ++{ ++ struct bch_dev_io_failures *i; ++ ++ for (i = f->devs; i < f->devs + f->nr; i++) ++ if (i->dev == dev) ++ return i; ++ ++ return NULL; ++} ++ ++void bch2_mark_io_failure(struct bch_io_failures *failed, ++ struct extent_ptr_decoded *p) ++{ ++ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); ++ ++ if (!f) { ++ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); ++ ++ f = &failed->devs[failed->nr++]; ++ f->dev = p->ptr.dev; ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else if (p->idx != f->idx) { ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else { ++ f->nr_failed++; ++ } ++} ++ ++/* ++ * returns true if p1 is better than p2: ++ */ ++static inline bool ptr_better(struct bch_fs *c, ++ const struct extent_ptr_decoded p1, ++ const struct extent_ptr_decoded p2) ++{ ++ if (likely(!p1.idx && !p2.idx)) { ++ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); ++ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); ++ ++ u64 l1 = atomic64_read(&dev1->cur_latency[READ]); ++ u64 l2 = atomic64_read(&dev2->cur_latency[READ]); ++ ++ /* Pick at random, biased in favor of the faster device: */ ++ ++ return bch2_rand_range(l1 + l2) > l1; ++ } ++ ++ if (force_reconstruct_read(c)) ++ return p1.idx > p2.idx; ++ ++ return p1.idx < p2.idx; ++} ++ ++/* ++ * This picks a non-stale 
pointer, preferably from a device other than @avoid. ++ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to ++ * other devices, it will still pick a pointer from avoid. ++ */ ++int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_io_failures *failed, ++ struct extent_ptr_decoded *pick) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_dev_io_failures *f; ++ struct bch_dev *ca; ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_error) ++ return -EIO; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ /* ++ * If there are any dirty pointers it's an error if we can't ++ * read: ++ */ ++ if (!ret && !p.ptr.cached) ++ ret = -EIO; ++ ++ if (p.ptr.cached && ptr_stale(ca, &p.ptr)) ++ continue; ++ ++ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; ++ if (f) ++ p.idx = f->nr_failed < f->nr_retries ++ ? f->idx ++ : f->idx + 1; ++ ++ if (!p.idx && ++ !bch2_dev_is_readable(ca)) ++ p.idx++; ++ ++ if (force_reconstruct_read(c) && ++ !p.idx && p.has_ec) ++ p.idx++; ++ ++ if (p.idx >= (unsigned) p.has_ec + 1) ++ continue; ++ ++ if (ret > 0 && !ptr_better(c, p, *pick)) ++ continue; ++ ++ *pick = p; ++ ret = 1; ++ } ++ ++ return ret; ++} ++ ++/* KEY_TYPE_btree_ptr: */ ++ ++const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) ++ return "value too big"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ const char *err; ++ char buf[160]; ++ struct bucket_mark mark; ++ struct bch_dev *ca; ++ ++ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; ++ ++ if (!percpu_down_read_trylock(&c->mark_lock)) ++ return; ++ ++ bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !bch2_bkey_replicas_marked_locked(c, k, false), c, ++ "btree key bad (replicas not marked in superblock):\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ mark = ptr_bucket_mark(ca, ptr); ++ ++ err = "stale"; ++ if (gen_after(mark.gen, ptr->gen)) ++ goto err; ++ ++ err = "inconsistent"; ++ if (mark.data_type != BCH_DATA_BTREE || ++ mark.dirty_sectors < c->opts.btree_node_size) ++ goto err; ++ } ++out: ++ percpu_up_read(&c->mark_lock); ++ return; ++err: ++ bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", ++ err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ PTR_BUCKET_NR(ca, ptr), ++ mark.gen, (unsigned) mark.v.counter); ++ goto out; ++} ++ ++void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ pr_buf(out, "seq %llx sectors %u written %u min_key ", ++ le64_to_cpu(bp.v->seq), ++ le16_to_cpu(bp.v->sectors), ++ le16_to_cpu(bp.v->sectors_written)); ++ ++ bch2_bpos_to_text(out, bp.v->min_key); ++ pr_buf(out, " "); ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s k) ++{ ++ struct 
bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); ++ ++ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bp.v->min_key, POS_MIN)) ++ bp.v->min_key = write ++ ? bkey_predecessor(bp.v->min_key) ++ : bkey_successor(bp.v->min_key); ++} ++ ++/* KEY_TYPE_extent: */ ++ ++const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ char buf[160]; ++ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; ++ ++ if (!percpu_down_read_trylock(&c->mark_lock)) ++ return; ++ ++ bch2_fs_inconsistent_on(!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !bch2_bkey_replicas_marked_locked(c, e.s_c, false), c, ++ "extent key bad (replicas not marked in superblock):\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf)); ++ ++ extent_for_each_ptr_decode(e, p, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); ++ unsigned stale = gen_after(mark.gen, p.ptr.gen); ++ unsigned disk_sectors = ptr_disk_sectors(p); ++ unsigned mark_sectors = p.ptr.cached ++ ? mark.cached_sectors ++ : mark.dirty_sectors; ++ ++ bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, ++ "stale dirty pointer (ptr gen %u bucket %u", ++ p.ptr.gen, mark.gen); ++ ++ bch2_fs_inconsistent_on(stale > 96, c, ++ "key too stale: %i", stale); ++ ++ bch2_fs_inconsistent_on(!stale && ++ (mark.data_type != BCH_DATA_USER || ++ mark_sectors < disk_sectors), c, ++ "extent pointer not marked: %s:\n" ++ "type %u sectors %u < %u", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), ++ mark.data_type, ++ mark_sectors, disk_sectors); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++} ++ ++void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++enum merge_result bch2_extent_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_extent l = bkey_s_to_extent(_l); ++ struct bkey_s_extent r = bkey_s_to_extent(_r); ++ union bch_extent_entry *en_l = l.v->start; ++ union bch_extent_entry *en_r = r.v->start; ++ struct bch_extent_crc_unpacked crc_l, crc_r; ++ ++ if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) ++ return BCH_MERGE_NOMERGE; ++ ++ crc_l = bch2_extent_crc_unpack(l.k, NULL); ++ ++ extent_for_each_entry(l, en_l) { ++ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ ++ if (extent_entry_type(en_l) != extent_entry_type(en_r)) ++ return BCH_MERGE_NOMERGE; ++ ++ switch (extent_entry_type(en_l)) { ++ case BCH_EXTENT_ENTRY_ptr: { ++ const struct bch_extent_ptr *lp = &en_l->ptr; ++ const struct bch_extent_ptr *rp = &en_r->ptr; ++ struct bch_dev *ca; ++ ++ if (lp->offset + crc_l.compressed_size != rp->offset || ++ lp->dev != rp->dev || ++ lp->gen != rp->gen) ++ return BCH_MERGE_NOMERGE; ++ ++ /* We don't allow extents to straddle buckets: */ ++ ca = bch_dev_bkey_exists(c, lp->dev); ++ ++ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) ++ return BCH_MERGE_NOMERGE; ++ ++ break; ++ } ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || ++ en_l->stripe_ptr.idx != 
en_r->stripe_ptr.idx) ++ return BCH_MERGE_NOMERGE; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ if (crc_l.csum_type != crc_r.csum_type || ++ crc_l.compression_type != crc_r.compression_type || ++ crc_l.nonce != crc_r.nonce) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || ++ crc_r.offset) ++ return BCH_MERGE_NOMERGE; ++ ++ if (!bch2_checksum_mergeable(crc_l.csum_type)) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_is_compressed(crc_l)) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.csum_type && ++ crc_l.uncompressed_size + ++ crc_r.uncompressed_size > c->sb.encoded_extent_max) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.uncompressed_size + crc_r.uncompressed_size > ++ bch2_crc_field_size_max[extent_entry_type(en_l)]) ++ return BCH_MERGE_NOMERGE; ++ ++ break; ++ default: ++ return BCH_MERGE_NOMERGE; ++ } ++ } ++ ++ extent_for_each_entry(l, en_l) { ++ struct bch_extent_crc_unpacked crc_l, crc_r; ++ ++ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ ++ if (!extent_entry_is_crc(en_l)) ++ continue; ++ ++ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, ++ crc_l.csum, ++ crc_r.csum, ++ crc_r.uncompressed_size << 9); ++ ++ crc_l.uncompressed_size += crc_r.uncompressed_size; ++ crc_l.compressed_size += crc_r.compressed_size; ++ ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, ++ extent_entry_type(en_l)); ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) ++ return "incorrect value size"; ++ ++ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) ++ return "invalid nr_replicas"; ++ ++ return NULL; ++} ++ ++void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ pr_buf(out, "generation %u replicas %u", ++ le32_to_cpu(r.v->generation), ++ r.v->nr_replicas); ++} ++ ++enum merge_result bch2_reservation_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_reservation l = bkey_s_to_reservation(_l); ++ struct bkey_s_reservation r = bkey_s_to_reservation(_r); ++ ++ if (l.v->generation != r.v->generation || ++ l.v->nr_replicas != r.v->nr_replicas) ++ return BCH_MERGE_NOMERGE; ++ ++ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { ++ bch2_key_resize(l.k, KEY_SIZE_MAX); ++ bch2_cut_front_s(l.k->p, r.s); ++ return BCH_MERGE_PARTIAL; ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* Extent checksum entries: */ ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, ++ struct bch_extent_crc_unpacked r) ++{ ++ return (l.csum_type != r.csum_type || ++ l.compression_type != r.compression_type || ++ l.compressed_size != r.compressed_size || ++ l.uncompressed_size != r.uncompressed_size || ++ l.offset != r.offset || ++ l.live_size != r.live_size || ++ l.nonce != r.nonce || ++ bch2_crc_cmp(l.csum, r.csum)); ++} ++ ++static 
inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, ++ struct bch_extent_crc_unpacked n) ++{ ++ return !crc_is_compressed(u) && ++ u.csum_type && ++ u.uncompressed_size > u.live_size && ++ bch2_csum_type_is_encryption(u.csum_type) == ++ bch2_csum_type_is_encryption(n.csum_type); ++} ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, ++ struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ if (!n.csum_type) ++ return false; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (can_narrow_crc(crc, n)) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * We're writing another replica for this extent, so while we've got the data in ++ * memory we'll be computing a new checksum for the currently live data. ++ * ++ * If there are other replicas we aren't moving, and they are checksummed but ++ * not compressed, we can modify them to point to only the data that is ++ * currently live (so that readers won't have to bounce) while we've got the ++ * checksum we need: ++ */ ++bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked u; ++ struct extent_ptr_decoded p; ++ union bch_extent_entry *i; ++ bool ret = false; ++ ++ /* Find a checksum entry that covers only live data: */ ++ if (!n.csum_type) { ++ bkey_for_each_crc(&k->k, ptrs, u, i) ++ if (!crc_is_compressed(u) && ++ u.csum_type && ++ u.live_size == u.uncompressed_size) { ++ n = u; ++ goto found; ++ } ++ return false; ++ } ++found: ++ BUG_ON(crc_is_compressed(n)); ++ BUG_ON(n.offset); ++ BUG_ON(n.live_size != k->k.size); ++ ++restart_narrow_pointers: ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ ++ bkey_for_each_ptr_decode(&k->k, ptrs, p, i) ++ if (can_narrow_crc(p.crc, n)) { ++ bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); ++ p.ptr.offset += p.crc.offset; ++ p.crc = n; ++ bch2_extent_ptr_decoded_append(k, &p); ++ ret = true; ++ goto restart_narrow_pointers; ++ } ++ ++ return ret; ++} ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *dst, ++ struct bch_extent_crc_unpacked src, ++ enum bch_extent_entry_type type) ++{ ++#define set_common_fields(_dst, _src) \ ++ _dst.type = 1 << type; \ ++ _dst.csum_type = _src.csum_type, \ ++ _dst.compression_type = _src.compression_type, \ ++ _dst._compressed_size = _src.compressed_size - 1, \ ++ _dst._uncompressed_size = _src.uncompressed_size - 1, \ ++ _dst.offset = _src.offset ++ ++ switch (type) { ++ case BCH_EXTENT_ENTRY_crc32: ++ set_common_fields(dst->crc32, src); ++ dst->crc32.csum = *((__le32 *) &src.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ set_common_fields(dst->crc64, src); ++ dst->crc64.nonce = src.nonce; ++ dst->crc64.csum_lo = src.csum.lo; ++ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ set_common_fields(dst->crc128, src); ++ dst->crc128.nonce = src.nonce; ++ dst->crc128.csum = src.csum; ++ break; ++ default: ++ BUG(); ++ } ++#undef set_common_fields ++} ++ ++void bch2_extent_crc_append(struct bkey_i *k, ++ struct bch_extent_crc_unpacked new) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ union bch_extent_crc *crc = (void *) ptrs.end; ++ enum bch_extent_entry_type type; ++ ++ if (bch_crc_bytes[new.csum_type] <= 4 && ++ new.uncompressed_size <= CRC32_SIZE_MAX && ++ new.nonce <= CRC32_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc32; ++ else if (bch_crc_bytes[new.csum_type] 
<= 10 && ++ new.uncompressed_size <= CRC64_SIZE_MAX && ++ new.nonce <= CRC64_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc64; ++ else if (bch_crc_bytes[new.csum_type] <= 16 && ++ new.uncompressed_size <= CRC128_SIZE_MAX && ++ new.nonce <= CRC128_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc128; ++ else ++ BUG(); ++ ++ bch2_extent_crc_pack(crc, new, type); ++ ++ k->k.u64s += extent_entry_u64s(ptrs.end); ++ ++ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); ++} ++ ++/* Generic code for keys with pointers: */ ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) ++{ ++ return bch2_bkey_devs(k).nr; ++} ++ ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) ++{ ++ return k.k->type == KEY_TYPE_reservation ++ ? bkey_s_c_to_reservation(k).v->nr_replicas ++ : bch2_bkey_dirty_devs(k).nr; ++} ++ ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) ++{ ++ unsigned ret = 0; ++ ++ if (k.k->type == KEY_TYPE_reservation) { ++ ret = bkey_s_c_to_reservation(k).v->nr_replicas; ++ } else { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ ret += !p.ptr.cached && !crc_is_compressed(p.crc); ++ } ++ ++ return ret; ++} ++ ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned ret = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && crc_is_compressed(p.crc)) ++ ret += p.crc.compressed_size; ++ ++ return ret; ++} ++ ++bool bch2_bkey_is_incompressible(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, entry) ++ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) ++ return true; ++ return false; ++} ++ ++bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, ++ unsigned nr_replicas) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bpos end = pos; ++ struct bkey_s_c k; ++ bool ret = true; ++ int err; ++ ++ end.offset += size; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, ++ BTREE_ITER_SLOTS, k, err) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { ++ ret = false; ++ break; ++ } ++ } ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static unsigned bch2_extent_ptr_durability(struct bch_fs *c, ++ struct extent_ptr_decoded p) ++{ ++ unsigned durability = 0; ++ struct bch_dev *ca; ++ ++ if (p.ptr.cached) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_FAILED) ++ durability = max_t(unsigned, durability, ca->mi.durability); ++ ++ if (p.has_ec) { ++ struct stripe *s = ++ genradix_ptr(&c->stripes[0], p.ec.idx); ++ ++ if (WARN_ON(!s)) ++ goto out; ++ ++ durability = max_t(unsigned, durability, s->nr_redundant); ++ } ++out: ++ return durability; ++} ++ ++unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned durability = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ durability += bch2_extent_ptr_durability(c, p); ++ ++ return durability; ++} ++ ++void 
bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, ++ unsigned target, ++ unsigned nr_desired_replicas) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; ++ ++ if (target && extra > 0) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int n = bch2_extent_ptr_durability(c, p); ++ ++ if (n && n <= extra && ++ !bch2_dev_in_target(c, p.ptr.dev, target)) { ++ entry->ptr.cached = true; ++ extra -= n; ++ } ++ } ++ ++ if (extra > 0) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int n = bch2_extent_ptr_durability(c, p); ++ ++ if (n && n <= extra) { ++ entry->ptr.cached = true; ++ extra -= n; ++ } ++ } ++} ++ ++void bch2_bkey_append_ptr(struct bkey_i *k, ++ struct bch_extent_ptr ptr) ++{ ++ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); ++ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ case KEY_TYPE_extent: ++ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); ++ ++ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ ++ memcpy((void *) &k->v + bkey_val_bytes(&k->k), ++ &ptr, ++ sizeof(ptr)); ++ k->u64s++; ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void __extent_entry_insert(struct bkey_i *k, ++ union bch_extent_entry *dst, ++ union bch_extent_entry *new) ++{ ++ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); ++ ++ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), ++ dst, (u64 *) end - (u64 *) dst); ++ k->k.u64s += extent_entry_u64s(new); ++ memcpy(dst, new, extent_entry_bytes(new)); ++} ++ ++void bch2_extent_ptr_decoded_append(struct bkey_i *k, ++ struct extent_ptr_decoded *p) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked crc = ++ bch2_extent_crc_unpack(&k->k, NULL); ++ union bch_extent_entry *pos; ++ ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = ptrs.start; ++ goto found; ++ } ++ ++ bkey_for_each_crc(&k->k, ptrs, crc, pos) ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = extent_entry_next(pos); ++ goto found; ++ } ++ ++ bch2_extent_crc_append(k, p->crc); ++ pos = bkey_val_end(bkey_i_to_s(k)); ++found: ++ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ptr)); ++ ++ if (p->has_ec) { ++ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ec)); ++ } ++} ++ ++static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, ++ union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *i = ptrs.start; ++ ++ if (i == entry) ++ return NULL; ++ ++ while (extent_entry_next(i) != entry) ++ i = extent_entry_next(i); ++ return i; ++} ++ ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *dst, *src, *prev; ++ bool drop_crc = true; ++ ++ EBUG_ON(ptr < &ptrs.start->ptr || ++ ptr >= &ptrs.end->ptr); ++ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); ++ ++ src = extent_entry_next(to_entry(ptr)); ++ if (src != ptrs.end && ++ !extent_entry_is_crc(src)) ++ drop_crc = false; ++ ++ dst = to_entry(ptr); ++ while ((prev = extent_entry_prev(ptrs, dst))) { ++ if (extent_entry_is_ptr(prev)) ++ break; ++ ++ if (extent_entry_is_crc(prev)) { ++ if (drop_crc) ++ dst = prev; ++ break; ++ } ++ ++ dst = prev; ++ } ++ ++ memmove_u64s_down(dst, src, ++ (u64 *) ptrs.end - (u64 *) src); ++ k.k->u64s -= (u64 *) src - (u64 *) 
dst; ++ ++ return dst; ++} ++ ++void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); ++} ++ ++const struct bch_extent_ptr * ++bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (ptr->dev == dev) ++ return ptr; ++ ++ return NULL; ++} ++ ++bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (bch2_dev_in_target(c, ptr->dev, target) && ++ (!ptr->cached || ++ !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) ++ return true; ++ ++ return false; ++} ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_extent_ptr m, u64 offset) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (p.ptr.dev == m.dev && ++ p.ptr.gen == m.gen && ++ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == ++ (s64) m.offset - offset) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * bch_extent_normalize - clean up an extent, dropping stale pointers etc. ++ * ++ * Returns true if @k should be dropped entirely ++ * ++ * For existing keys, only called when btree nodes are being rewritten, not when ++ * they're merely being compacted/resorted in memory. ++ */ ++bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ++ ptr->cached && ++ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); ++ ++ /* will only happen if all pointers were cached: */ ++ if (!bch2_bkey_nr_ptrs(k.s_c)) ++ k.k->type = KEY_TYPE_discard; ++ ++ return bkey_whiteout(k.k); ++} ++ ++void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ const struct bch_extent_ptr *ptr; ++ const struct bch_extent_stripe_ptr *ec; ++ struct bch_dev *ca; ++ bool first = true; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (!first) ++ pr_buf(out, " "); ++ ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ ptr = entry_to_ptr(entry); ++ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ? bch_dev_bkey_exists(c, ptr->dev) ++ : NULL; ++ ++ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, ++ (u64) ptr->offset, ptr->gen, ++ ptr->cached ? " cached" : "", ++ ca && ptr_stale(ca, ptr) ++ ? 
" stale" : ""); ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", ++ crc.compressed_size, ++ crc.uncompressed_size, ++ crc.offset, crc.nonce, ++ crc.csum_type, ++ crc.compression_type); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ec = &entry->stripe_ptr; ++ ++ pr_buf(out, "ec: idx %llu block %u", ++ (u64) ec->idx, ec->block); ++ break; ++ default: ++ pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); ++ return; ++ } ++ ++ first = false; ++ } ++} ++ ++static const char *extent_ptr_invalid(const struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ unsigned size_ondisk, ++ bool metadata) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr2; ++ struct bch_dev *ca; ++ ++ if (!bch2_dev_exists2(c, ptr->dev)) ++ return "pointer to invalid device"; ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!ca) ++ return "pointer to invalid device"; ++ ++ bkey_for_each_ptr(ptrs, ptr2) ++ if (ptr != ptr2 && ptr->dev == ptr2->dev) ++ return "multiple pointers to same device"; ++ ++ if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) ++ return "offset past end of device"; ++ ++ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) ++ return "offset before first bucket"; ++ ++ if (bucket_remainder(ca, ptr->offset) + ++ size_ondisk > ca->mi.bucket_size) ++ return "spans multiple buckets"; ++ ++ return NULL; ++} ++ ++const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ unsigned size_ondisk = k.k->size; ++ const char *reason; ++ unsigned nonce = UINT_MAX; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr) ++ size_ondisk = c->opts.btree_node_size; ++ if (k.k->type == KEY_TYPE_btree_ptr_v2) ++ size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) ++ return "invalid extent entry type"; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr && ++ !extent_entry_is_ptr(entry)) ++ return "has non ptr field"; ++ ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ reason = extent_ptr_invalid(c, k, &entry->ptr, ++ size_ondisk, false); ++ if (reason) ++ return reason; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ if (crc.offset + crc.live_size > ++ crc.uncompressed_size) ++ return "checksum offset + key size > uncompressed size"; ++ ++ size_ondisk = crc.compressed_size; ++ ++ if (!bch2_checksum_type_valid(c, crc.csum_type)) ++ return "invalid checksum type"; ++ ++ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) ++ return "invalid compression type"; ++ ++ if (bch2_csum_type_is_encryption(crc.csum_type)) { ++ if (nonce == UINT_MAX) ++ nonce = crc.offset + crc.nonce; ++ else if (nonce != crc.offset + crc.nonce) ++ return "incorrect nonce"; ++ } ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++ ++ return NULL; ++} ++ ++void bch2_ptr_swab(struct bkey_s k) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ u64 *d; ++ ++ for (d = (u64 *) ptrs.start; ++ d != (u64 *) 
ptrs.end; ++ d++) ++ *d = swab64(*d); ++ ++ for (entry = ptrs.start; ++ entry < ptrs.end; ++ entry = extent_entry_next(entry)) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.csum = swab32(entry->crc32.csum); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); ++ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.csum.hi = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.hi); ++ entry->crc128.csum.lo = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++} ++ ++/* Generic extent code: */ ++ ++int bch2_cut_front_s(struct bpos where, struct bkey_s k) ++{ ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 sub; ++ ++ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) ++ return 0; ++ ++ EBUG_ON(bkey_cmp(where, k.k->p) > 0); ++ ++ sub = where.offset - bkey_start_offset(k.k); ++ ++ k.k->size -= sub; ++ ++ if (!k.k->size) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: { ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ bool seen_crc = false; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ if (!seen_crc) ++ entry->ptr.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ ++ if (extent_entry_is_crc(entry)) ++ seen_crc = true; ++ } ++ ++ break; ++ } ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); ++ ++ le64_add_cpu(&p.v->idx, sub); ++ break; ++ } ++ case KEY_TYPE_inline_data: { ++ struct bkey_s_inline_data d = bkey_s_to_inline_data(k); ++ ++ sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); ++ ++ memmove(d.v->data, ++ d.v->data + sub, ++ bkey_val_bytes(d.k) - sub); ++ ++ new_val_u64s -= sub >> 3; ++ break; ++ } ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; ++} ++ ++int bch2_cut_back_s(struct bpos where, struct bkey_s k) ++{ ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 len = 0; ++ ++ if (bkey_cmp(where, k.k->p) >= 0) ++ return 0; ++ ++ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); ++ ++ len = where.offset - bkey_start_offset(k.k); ++ ++ k.k->p = where; ++ k.k->size = len; ++ ++ if (!len) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_inline_data: ++ new_val_u64s = min(new_val_u64s, k.k->size << 6); ++ break; ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; ++} +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +new file mode 100644 +index 000000000000..29b15365d19c +--- /dev/null ++++ b/fs/bcachefs/extents.h +@@ -0,0 +1,603 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef 
_BCACHEFS_EXTENTS_H ++#define _BCACHEFS_EXTENTS_H ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "extents_types.h" ++ ++struct bch_fs; ++struct btree_trans; ++ ++/* extent entries: */ ++ ++#define extent_entry_last(_e) \ ++ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) ++ ++#define entry_to_ptr(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ ++ \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const struct bch_extent_ptr *) (_entry), \ ++ (struct bch_extent_ptr *) (_entry)); \ ++}) ++ ++/* downcast, preserves const */ ++#define to_entry(_entry) \ ++({ \ ++ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ ++ !type_is(_entry, struct bch_extent_ptr *) && \ ++ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ ++ \ ++ __builtin_choose_expr( \ ++ (type_is_exact(_entry, const union bch_extent_crc *) || \ ++ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ ++ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ ++ (const union bch_extent_entry *) (_entry), \ ++ (union bch_extent_entry *) (_entry)); \ ++}) ++ ++#define extent_entry_next(_entry) \ ++ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) ++ ++static inline unsigned ++__extent_entry_type(const union bch_extent_entry *e) ++{ ++ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; ++} ++ ++static inline enum bch_extent_entry_type ++extent_entry_type(const union bch_extent_entry *e) ++{ ++ int ret = __ffs(e->type); ++ ++ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); ++ ++ return ret; ++} ++ ++static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) ++{ ++ switch (extent_entry_type(entry)) { ++#define x(f, n) \ ++ case BCH_EXTENT_ENTRY_##f: \ ++ return sizeof(struct bch_extent_##f); ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) ++{ ++ return extent_entry_bytes(entry) / sizeof(u64); ++} ++ ++static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) ++{ ++ switch (extent_entry_type(e)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool extent_entry_is_crc(const union bch_extent_entry *e) ++{ ++ switch (extent_entry_type(e)) { ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++union bch_extent_crc { ++ u8 type; ++ struct bch_extent_crc32 crc32; ++ struct bch_extent_crc64 crc64; ++ struct bch_extent_crc128 crc128; ++}; ++ ++#define __entry_to_crc(_entry) \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const union bch_extent_crc *) (_entry), \ ++ (union bch_extent_crc *) (_entry)) ++ ++#define entry_to_crc(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ ++ \ ++ __entry_to_crc(_entry); \ ++}) ++ ++static inline struct bch_extent_crc_unpacked ++bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) ++{ ++#define common_fields(_crc) \ ++ .csum_type = _crc.csum_type, \ ++ .compression_type = _crc.compression_type, \ ++ .compressed_size = _crc._compressed_size + 1, \ ++ .uncompressed_size = _crc._uncompressed_size + 1, \ ++ .offset = _crc.offset, \ ++ .live_size = k->size ++ ++ if (!crc) ++ return (struct bch_extent_crc_unpacked) { ++ .compressed_size = k->size, ++ .uncompressed_size = k->size, ++ .live_size = k->size, ++ }; ++ 
++ switch (extent_entry_type(to_entry(crc))) { ++ case BCH_EXTENT_ENTRY_crc32: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc32), ++ }; ++ ++ *((__le32 *) &ret.csum.lo) = crc->crc32.csum; ++ ++ memcpy(&ret.csum.lo, &crc->crc32.csum, ++ sizeof(crc->crc32.csum)); ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc64: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc64), ++ .nonce = crc->crc64.nonce, ++ .csum.lo = (__force __le64) crc->crc64.csum_lo, ++ }; ++ ++ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc128: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc128), ++ .nonce = crc->crc128.nonce, ++ .csum = crc->crc128.csum, ++ }; ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++#undef common_fields ++} ++ ++static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) ++{ ++ return (crc.compression_type != BCH_COMPRESSION_TYPE_none && ++ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); ++} ++ ++/* bkey_ptrs: generically over any key type that has ptrs */ ++ ++struct bkey_ptrs_c { ++ const union bch_extent_entry *start; ++ const union bch_extent_entry *end; ++}; ++ ++struct bkey_ptrs { ++ union bch_extent_entry *start; ++ union bch_extent_entry *end; ++}; ++ ++static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: { ++ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ return (struct bkey_ptrs_c) { ++ e.v->start, ++ extent_entry_last(e) ++ }; ++ } ++ case KEY_TYPE_stripe: { ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&s.v->ptrs[0]), ++ to_entry(&s.v->ptrs[s.v->nr_blocks]), ++ }; ++ } ++ case KEY_TYPE_reflink_v: { ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ return (struct bkey_ptrs_c) { ++ r.v->start, ++ bkey_val_end(r), ++ }; ++ } ++ case KEY_TYPE_btree_ptr_v2: { ++ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ default: ++ return (struct bkey_ptrs_c) { NULL, NULL }; ++ } ++} ++ ++static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ++{ ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); ++ ++ return (struct bkey_ptrs) { ++ (void *) p.start, ++ (void *) p.end ++ }; ++} ++ ++#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ ++ for ((_entry) = (_start); \ ++ (_entry) < (_end); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define __bkey_ptr_next(_ptr, _end) \ ++({ \ ++ typeof(_end) _entry; \ ++ \ ++ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ ++ if (extent_entry_is_ptr(_entry)) \ ++ break; \ ++ \ ++ _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ ++}) ++ ++#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) ++ ++#define bkey_extent_entry_for_each(_p, _entry) \ ++ bkey_extent_entry_for_each_from(_p, _entry, _p.start) ++ ++#define __bkey_for_each_ptr(_start, _end, _ptr) \ ++ for ((_ptr) = (_start); \ ++ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ ++ (_ptr)++) ++ ++#define bkey_ptr_next(_p, _ptr) \ ++ __bkey_ptr_next(_ptr, (_p).end) ++ ++#define bkey_for_each_ptr(_p, _ptr) \ ++ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) ++ ++#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ ++({ \ ++ __label__ out; \ ++ \ ++ (_ptr).idx = 0; \ ++ (_ptr).has_ec = false; \ ++ \ ++ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ ++ switch (extent_entry_type(_entry)) { \ ++ case BCH_EXTENT_ENTRY_ptr: \ ++ (_ptr).ptr = _entry->ptr; \ ++ goto out; \ ++ case BCH_EXTENT_ENTRY_crc32: \ ++ case BCH_EXTENT_ENTRY_crc64: \ ++ case BCH_EXTENT_ENTRY_crc128: \ ++ (_ptr).crc = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_entry)); \ ++ break; \ ++ case BCH_EXTENT_ENTRY_stripe_ptr: \ ++ (_ptr).ec = _entry->stripe_ptr; \ ++ (_ptr).has_ec = true; \ ++ break; \ ++ } \ ++out: \ ++ _entry < (_end); \ ++}) ++ ++#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ ++ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ ++ (_entry) = _start; \ ++ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ ++ _ptr, _entry) ++ ++#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ ++({ \ ++ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ ++ if (extent_entry_is_crc(_iter)) { \ ++ (_crc) = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_iter)); \ ++ break; \ ++ } \ ++ \ ++ (_iter) < (_end); \ ++}) ++ ++#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ ++ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ ++ (_iter) = (_start); \ ++ bkey_crc_next(_k, _start, _end, _crc, _iter); \ ++ (_iter) = extent_entry_next(_iter)) ++ ++#define bkey_for_each_crc(_k, _p, _crc, _iter) \ ++ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) ++ ++/* Iterate over pointers in KEY_TYPE_extent: */ ++ ++#define extent_for_each_entry_from(_e, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, \ ++ extent_entry_last(_e),_entry) ++ ++#define extent_for_each_entry(_e, _entry) \ ++ extent_for_each_entry_from(_e, _entry, (_e).v->start) ++ ++#define extent_ptr_next(_e, _ptr) \ ++ __bkey_ptr_next(_ptr, extent_entry_last(_e)) ++ ++#define extent_for_each_ptr(_e, _ptr) \ ++ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) ++ ++#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ ++ extent_entry_last(_e), _ptr, _entry) ++ ++/* utility code common to all keys with pointers: */ ++ ++void bch2_mark_io_failure(struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++ ++/* KEY_TYPE_btree_ptr: */ ++ ++const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); ++void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++void bch2_btree_ptr_v2_to_text(struct printbuf *, 
struct bch_fs *, ++ struct bkey_s_c); ++void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, ++ int, struct bkey_s); ++ ++#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_invalid, \ ++ .key_debugcheck = bch2_btree_ptr_debugcheck, \ ++ .val_to_text = bch2_btree_ptr_to_text, \ ++ .swab = bch2_ptr_swab, \ ++} ++ ++#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_invalid, \ ++ .key_debugcheck = bch2_btree_ptr_debugcheck, \ ++ .val_to_text = bch2_btree_ptr_v2_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .compat = bch2_btree_ptr_v2_compat, \ ++} ++ ++/* KEY_TYPE_extent: */ ++ ++const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); ++void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++enum merge_result bch2_extent_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_extent (struct bkey_ops) { \ ++ .key_invalid = bch2_extent_invalid, \ ++ .key_debugcheck = bch2_extent_debugcheck, \ ++ .val_to_text = bch2_extent_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .key_normalize = bch2_extent_normalize, \ ++ .key_merge = bch2_extent_merge, \ ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++enum merge_result bch2_reservation_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_reservation (struct bkey_ops) { \ ++ .key_invalid = bch2_reservation_invalid, \ ++ .val_to_text = bch2_reservation_to_text, \ ++ .key_merge = bch2_reservation_merge, \ ++} ++ ++/* Extent checksum entries: */ ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c, ++ struct bch_extent_crc_unpacked); ++bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); ++void bch2_extent_crc_append(struct bkey_i *, ++ struct bch_extent_crc_unpacked); ++ ++/* Generic code for keys with pointers: */ ++ ++static inline bool bkey_extent_is_direct_data(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool bkey_extent_is_data(const struct bkey *k) ++{ ++ return bkey_extent_is_direct_data(k) || ++ k->type == KEY_TYPE_inline_data || ++ k->type == KEY_TYPE_reflink_p; ++} ++ ++/* ++ * Should extent be counted under inode->i_sectors? 
++ */ ++static inline bool bkey_extent_is_allocation(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reservation: ++ case KEY_TYPE_reflink_p: ++ case KEY_TYPE_reflink_v: ++ case KEY_TYPE_inline_data: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (!ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); ++bool bch2_bkey_is_incompressible(struct bkey_s_c); ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); ++bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); ++unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); ++ ++void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, ++ unsigned, unsigned); ++ ++void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); ++void bch2_extent_ptr_decoded_append(struct bkey_i *, ++ struct extent_ptr_decoded *); ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, ++ struct bch_extent_ptr *); ++ ++#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ ++do { \ ++ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ ++ \ ++ _ptr = &_ptrs.start->ptr; \ ++ \ ++ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ ++ if (_cond) { \ ++ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ ++ _ptrs = bch2_bkey_ptrs(_k); \ ++ continue; \ ++ } \ ++ \ ++ (_ptr)++; \ ++ } \ ++} while (0) ++ ++void bch2_bkey_drop_device(struct bkey_s, unsigned); ++const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); ++bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, ++ struct bch_extent_ptr, u64); ++ ++bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); ++void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); ++ ++void bch2_ptr_swab(struct bkey_s); ++ ++/* Generic extent code: */ ++ ++int bch2_cut_front_s(struct bpos, struct bkey_s); ++int bch2_cut_back_s(struct bpos, struct bkey_s); ++ ++static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_front_s(where, bkey_i_to_s(k)); ++} ++ ++static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_back_s(where, bkey_i_to_s(k)); ++} ++ ++/** ++ * bch_key_resize - adjust size of @k ++ * ++ * bkey_start_offset(k) will be preserved, modifies where the extent ends ++ */ ++static inline void 
bch2_key_resize(struct bkey *k, unsigned new_size) ++{ ++ k->p.offset -= k->size; ++ k->p.offset += new_size; ++ k->size = new_size; ++} ++ ++/* ++ * In extent_sort_fix_overlapping(), insert_fixup_extent(), ++ * extent_merge_inline() - we're modifying keys in place that are packed. To do ++ * that we have to unpack the key, modify the unpacked key - then this ++ * copies/repacks the unpacked to the original as necessary. ++ */ ++static inline void extent_save(struct btree *b, struct bkey_packed *dst, ++ struct bkey *src) ++{ ++ struct bkey_format *f = &b->format; ++ struct bkey_i *dst_unpacked; ++ ++ if ((dst_unpacked = packed_to_bkey(dst))) ++ dst_unpacked->k = *src; ++ else ++ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); ++} ++ ++#endif /* _BCACHEFS_EXTENTS_H */ +diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h +new file mode 100644 +index 000000000000..43d6c341ecca +--- /dev/null ++++ b/fs/bcachefs/extents_types.h +@@ -0,0 +1,40 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENTS_TYPES_H ++#define _BCACHEFS_EXTENTS_TYPES_H ++ ++#include "bcachefs_format.h" ++ ++struct bch_extent_crc_unpacked { ++ u32 compressed_size; ++ u32 uncompressed_size; ++ u32 live_size; ++ ++ u8 csum_type; ++ u8 compression_type; ++ ++ u16 offset; ++ ++ u16 nonce; ++ ++ struct bch_csum csum; ++}; ++ ++struct extent_ptr_decoded { ++ unsigned idx; ++ bool has_ec; ++ struct bch_extent_crc_unpacked crc; ++ struct bch_extent_ptr ptr; ++ struct bch_extent_stripe_ptr ec; ++}; ++ ++struct bch_io_failures { ++ u8 nr; ++ struct bch_dev_io_failures { ++ u8 dev; ++ u8 idx; ++ u8 nr_failed; ++ u8 nr_retries; ++ } devs[BCH_REPLICAS_MAX]; ++}; ++ ++#endif /* _BCACHEFS_EXTENTS_TYPES_H */ +diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h +new file mode 100644 +index 000000000000..26d5cad7e6a5 +--- /dev/null ++++ b/fs/bcachefs/eytzinger.h +@@ -0,0 +1,285 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _EYTZINGER_H ++#define _EYTZINGER_H ++ ++#include ++#include ++ ++#include "util.h" ++ ++/* ++ * Traversal for trees in eytzinger layout - a full binary tree layed out in an ++ * array ++ */ ++ ++/* ++ * One based indexing version: ++ * ++ * With one based indexing each level of the tree starts at a power of two - ++ * good for cacheline alignment: ++ * ++ * Size parameter is treated as if we were using 0 based indexing, however: ++ * valid nodes, and inorder indices, are in the range [1..size) - that is, there ++ * are actually size - 1 elements ++ */ ++ ++static inline unsigned eytzinger1_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + child; ++} ++ ++static inline unsigned eytzinger1_left_child(unsigned i) ++{ ++ return eytzinger1_child(i, 0); ++} ++ ++static inline unsigned eytzinger1_right_child(unsigned i) ++{ ++ return eytzinger1_child(i, 1); ++} ++ ++static inline unsigned eytzinger1_first(unsigned size) ++{ ++ return rounddown_pow_of_two(size - 1); ++} ++ ++static inline unsigned eytzinger1_last(unsigned size) ++{ ++ return rounddown_pow_of_two(size) - 1; ++} ++ ++/* ++ * eytzinger1_next() and eytzinger1_prev() have the nice properties that ++ * ++ * eytzinger1_next(0) == eytzinger1_first()) ++ * eytzinger1_prev(0) == eytzinger1_last()) ++ * ++ * eytzinger1_prev(eytzinger1_first()) == 0 ++ * eytzinger1_next(eytzinger1_last()) == 0 ++ */ ++ ++static inline unsigned eytzinger1_next(unsigned i, unsigned size) ++{ ++ EBUG_ON(i >= size); ++ ++ if (eytzinger1_right_child(i) < size) { ++ i = eytzinger1_right_child(i); ++ ++ i <<= 
__fls(size) - __fls(i); ++ i >>= i >= size; ++ } else { ++ i >>= ffz(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_prev(unsigned i, unsigned size) ++{ ++ EBUG_ON(i >= size); ++ ++ if (eytzinger1_left_child(i) < size) { ++ i = eytzinger1_left_child(i) + 1; ++ ++ i <<= __fls(size) - __fls(i); ++ i -= 1; ++ i >>= i >= size; ++ } else { ++ i >>= __ffs(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_extra(unsigned size) ++{ ++ return (size - rounddown_pow_of_two(size - 1)) << 1; ++} ++ ++static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ unsigned b = __fls(i); ++ unsigned shift = __fls(size - 1) - b; ++ int s; ++ ++ EBUG_ON(!i || i >= size); ++ ++ i ^= 1U << b; ++ i <<= 1; ++ i |= 1; ++ i <<= shift; ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i -= (i - extra) >> 1; ++ */ ++ s = extra - i; ++ i += (s >> 1) & (s >> 31); ++ ++ return i; ++} ++ ++static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ unsigned shift; ++ int s; ++ ++ EBUG_ON(!i || i >= size); ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i += i - extra; ++ */ ++ s = extra - i; ++ i -= s & (s >> 31); ++ ++ shift = __ffs(i); ++ ++ i >>= shift + 1; ++ i |= 1U << (__fls(size - 1) - shift); ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); ++} ++ ++#define eytzinger1_for_each(_i, _size) \ ++ for ((_i) = eytzinger1_first((_size)); \ ++ (_i) != 0; \ ++ (_i) = eytzinger1_next((_i), (_size))) ++ ++/* Zero based indexing version: */ ++ ++static inline unsigned eytzinger0_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + 1 + child; ++} ++ ++static inline unsigned eytzinger0_left_child(unsigned i) ++{ ++ return eytzinger0_child(i, 0); ++} ++ ++static inline unsigned eytzinger0_right_child(unsigned i) ++{ ++ return eytzinger0_child(i, 1); ++} ++ ++static inline unsigned eytzinger0_first(unsigned size) ++{ ++ return eytzinger1_first(size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_last(unsigned size) ++{ ++ return eytzinger1_last(size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_next(unsigned i, unsigned size) ++{ ++ return eytzinger1_next(i + 1, size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_prev(unsigned i, unsigned size) ++{ ++ return eytzinger1_prev(i + 1, size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_extra(unsigned size) ++{ ++ return eytzinger1_extra(size + 1); ++} ++ ++static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; ++} ++ ++static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; ++} ++ ++static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); ++} ++ ++#define eytzinger0_for_each(_i, _size) \ ++ for ((_i) = eytzinger0_first((_size)); \ ++ (_i) != -1; \ ++ (_i) = 
eytzinger0_next((_i), (_size))) ++ ++typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); ++ ++/* return greatest node <= @search, or -1 if not found */ ++static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, ++ eytzinger_cmp_fn cmp, const void *search) ++{ ++ unsigned i, n = 0; ++ ++ if (!nr) ++ return -1; ++ ++ do { ++ i = n; ++ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); ++ } while (n < nr); ++ ++ if (n & 1) { ++ /* @i was greater than @search, return previous node: */ ++ ++ if (i == eytzinger0_first(nr)) ++ return -1; ++ ++ return eytzinger0_prev(i, nr); ++ } else { ++ return i; ++ } ++} ++ ++#define eytzinger0_find(base, nr, size, _cmp, search) \ ++({ \ ++ void *_base = (base); \ ++ void *_search = (search); \ ++ size_t _nr = (nr); \ ++ size_t _size = (size); \ ++ size_t _i = 0; \ ++ int _res; \ ++ \ ++ while (_i < _nr && \ ++ (_res = _cmp(_search, _base + _i * _size, _size))) \ ++ _i = eytzinger0_child(_i, _res > 0); \ ++ _i; \ ++}) ++ ++void eytzinger0_sort(void *, size_t, size_t, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++#endif /* _EYTZINGER_H */ +diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h +new file mode 100644 +index 000000000000..cdb272708a4b +--- /dev/null ++++ b/fs/bcachefs/fifo.h +@@ -0,0 +1,127 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FIFO_H ++#define _BCACHEFS_FIFO_H ++ ++#include "util.h" ++ ++#define FIFO(type) \ ++struct { \ ++ size_t front, back, size, mask; \ ++ type *data; \ ++} ++ ++#define DECLARE_FIFO(type, name) FIFO(type) name ++ ++#define fifo_buf_size(fifo) \ ++ ((fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ ++ : 0) ++ ++#define init_fifo(fifo, _size, _gfp) \ ++({ \ ++ (fifo)->front = (fifo)->back = 0; \ ++ (fifo)->size = (_size); \ ++ (fifo)->mask = (fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) - 1 \ ++ : 0; \ ++ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ ++}) ++ ++#define free_fifo(fifo) \ ++do { \ ++ kvpfree((fifo)->data, fifo_buf_size(fifo)); \ ++ (fifo)->data = NULL; \ ++} while (0) ++ ++#define fifo_swap(l, r) \ ++do { \ ++ swap((l)->front, (r)->front); \ ++ swap((l)->back, (r)->back); \ ++ swap((l)->size, (r)->size); \ ++ swap((l)->mask, (r)->mask); \ ++ swap((l)->data, (r)->data); \ ++} while (0) ++ ++#define fifo_move(dest, src) \ ++do { \ ++ typeof(*((dest)->data)) _t; \ ++ while (!fifo_full(dest) && \ ++ fifo_pop(src, _t)) \ ++ fifo_push(dest, _t); \ ++} while (0) ++ ++#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) ++#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) ++ ++#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) ++#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) ++ ++#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) ++#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) ++ ++#define fifo_entry_idx_abs(fifo, p) \ ++ ((((p) >= &fifo_peek_front(fifo) \ ++ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ ++ (((p) - (fifo)->data))) ++ ++#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) ++#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] ++ ++#define fifo_push_back_ref(f) \ ++ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) ++ ++#define fifo_push_front_ref(f) \ ++ (fifo_full((f)) ? 
NULL : &(f)->data[--(f)->front & (f)->mask]) ++ ++#define fifo_push_back(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_push_front(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_pop_front(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_pop_back(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) ++#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) ++#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) ++#define fifo_peek(fifo) fifo_peek_front(fifo) ++ ++#define fifo_for_each_entry(_entry, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#endif /* _BCACHEFS_FIFO_H */ +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +new file mode 100644 +index 000000000000..878419d40992 +--- /dev/null ++++ b/fs/bcachefs/fs-common.c +@@ -0,0 +1,317 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "fs-common.h" ++#include "inode.h" ++#include "xattr.h" ++ ++#include ++ ++int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, ++ struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *new_inode, ++ const struct qstr *name, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct posix_acl *default_acl, ++ struct posix_acl *acl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *dir_iter = NULL; ++ struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); ++ u64 now = bch2_current_time(trans->c); ++ int ret; ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); ++ ++ if (!name) ++ new_inode->bi_flags |= BCH_INODE_UNLINKED; ++ ++ ret = bch2_inode_create(trans, new_inode, ++ BLOCKDEV_INODE_MAX, 0, ++ &c->unused_inode_hint); ++ if (ret) ++ goto err; ++ ++ if (default_acl) { ++ ret = bch2_set_acl_trans(trans, new_inode, &hash, ++ default_acl, ACL_TYPE_DEFAULT); ++ if (ret) ++ goto err; ++ } ++ ++ if (acl) { ++ ret = bch2_set_acl_trans(trans, new_inode, &hash, ++ acl, ACL_TYPE_ACCESS); ++ if (ret) ++ goto err; ++ } ++ ++ if (name) { ++ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ if (S_ISDIR(new_inode->bi_mode)) ++ dir_u->bi_nlink++; ++ ++ ret = bch2_inode_write(trans, dir_iter, dir_u); ++ if (ret) ++ goto err; ++ ++ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, ++ mode_to_type(new_inode->bi_mode), ++ name, new_inode->bi_inum, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ goto err; ++ } ++err: ++ bch2_trans_iter_put(trans, dir_iter); ++ return ret; ++} ++ ++int bch2_link_trans(struct 
btree_trans *trans, u64 dir_inum, ++ u64 inum, struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *inode_u, const struct qstr *name) ++{ ++ struct btree_iter *dir_iter = NULL, *inode_iter = NULL; ++ struct bch_hash_info dir_hash; ++ u64 now = bch2_current_time(trans->c); ++ int ret; ++ ++ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto err; ++ ++ inode_u->bi_ctime = now; ++ bch2_inode_nlink_inc(inode_u); ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ dir_hash = bch2_hash_info_init(trans->c, dir_u); ++ ++ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, ++ mode_to_type(inode_u->bi_mode), ++ name, inum, BCH_HASH_SET_MUST_CREATE) ?: ++ bch2_inode_write(trans, dir_iter, dir_u) ?: ++ bch2_inode_write(trans, inode_iter, inode_u); ++err: ++ bch2_trans_iter_put(trans, dir_iter); ++ bch2_trans_iter_put(trans, inode_iter); ++ return ret; ++} ++ ++int bch2_unlink_trans(struct btree_trans *trans, ++ u64 dir_inum, struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *inode_u, ++ const struct qstr *name) ++{ ++ struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, ++ *inode_iter = NULL; ++ struct bch_hash_info dir_hash; ++ u64 inum, now = bch2_current_time(trans->c); ++ struct bkey_s_c k; ++ int ret; ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; ++ ++ dir_hash = bch2_hash_info_init(trans->c, dir_u); ++ ++ dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, ++ name, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dirent_iter); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek_slot(dirent_iter); ++ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); ++ ++ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto err; ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; ++ dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); ++ bch2_inode_nlink_dec(inode_u); ++ ++ ret = (S_ISDIR(inode_u->bi_mode) ++ ? 
bch2_empty_dir_trans(trans, inum) ++ : 0) ?: ++ bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: ++ bch2_inode_write(trans, dir_iter, dir_u) ?: ++ bch2_inode_write(trans, inode_iter, inode_u); ++err: ++ bch2_trans_iter_put(trans, inode_iter); ++ bch2_trans_iter_put(trans, dirent_iter); ++ bch2_trans_iter_put(trans, dir_iter); ++ return ret; ++} ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, ++ struct bch_inode_unpacked *src_u) ++{ ++ u64 src, dst; ++ unsigned id; ++ bool ret = false; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ if (dst_u->bi_fields_set & (1 << id)) ++ continue; ++ ++ src = bch2_inode_opt_get(src_u, id); ++ dst = bch2_inode_opt_get(dst_u, id); ++ ++ if (src == dst) ++ continue; ++ ++ bch2_inode_opt_set(dst_u, id, src); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++int bch2_rename_trans(struct btree_trans *trans, ++ u64 src_dir, struct bch_inode_unpacked *src_dir_u, ++ u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, ++ struct bch_inode_unpacked *src_inode_u, ++ struct bch_inode_unpacked *dst_inode_u, ++ const struct qstr *src_name, ++ const struct qstr *dst_name, ++ enum bch_rename_mode mode) ++{ ++ struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; ++ struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; ++ struct bch_hash_info src_hash, dst_hash; ++ u64 src_inode, dst_inode, now = bch2_current_time(trans->c); ++ int ret; ++ ++ src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(src_dir_iter); ++ if (ret) ++ goto err; ++ ++ src_hash = bch2_hash_info_init(trans->c, src_dir_u); ++ ++ if (dst_dir != src_dir) { ++ dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dst_dir_iter); ++ if (ret) ++ goto err; ++ ++ dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); ++ } else { ++ dst_dir_u = src_dir_u; ++ dst_hash = src_hash; ++ } ++ ++ ret = bch2_dirent_rename(trans, ++ src_dir, &src_hash, ++ dst_dir, &dst_hash, ++ src_name, &src_inode, ++ dst_name, &dst_inode, ++ mode); ++ if (ret) ++ goto err; ++ ++ src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(src_inode_iter); ++ if (ret) ++ goto err; ++ ++ if (dst_inode) { ++ dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dst_inode_iter); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ if (S_ISDIR(src_inode_u->bi_mode) != ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -ENOTDIR; ++ goto err; ++ } ++ ++ if (S_ISDIR(dst_inode_u->bi_mode) && ++ bch2_empty_dir_trans(trans, dst_inode)) { ++ ret = -ENOTEMPTY; ++ goto err; ++ } ++ } ++ ++ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && ++ S_ISDIR(src_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ ++ if (S_ISDIR(src_inode_u->bi_mode)) { ++ src_dir_u->bi_nlink--; ++ dst_dir_u->bi_nlink++; ++ } ++ ++ if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { ++ dst_dir_u->bi_nlink--; ++ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) ++ bch2_inode_nlink_dec(dst_inode_u); ++ ++ src_dir_u->bi_mtime = now; ++ src_dir_u->bi_ctime = now; ++ ++ if (src_dir != dst_dir) { ++ dst_dir_u->bi_mtime = now; ++ dst_dir_u->bi_ctime = now; ++ } ++ ++ src_inode_u->bi_ctime = now; ++ ++ if 
(dst_inode) ++ dst_inode_u->bi_ctime = now; ++ ++ ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: ++ (src_dir != dst_dir ++ ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) ++ : 0 ) ?: ++ bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: ++ (dst_inode ++ ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) ++ : 0 ); ++err: ++ bch2_trans_iter_put(trans, dst_inode_iter); ++ bch2_trans_iter_put(trans, src_inode_iter); ++ bch2_trans_iter_put(trans, dst_dir_iter); ++ bch2_trans_iter_put(trans, src_dir_iter); ++ return ret; ++} +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +new file mode 100644 +index 000000000000..2273b7961c9b +--- /dev/null ++++ b/fs/bcachefs/fs-common.h +@@ -0,0 +1,37 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_COMMON_H ++#define _BCACHEFS_FS_COMMON_H ++ ++struct posix_acl; ++ ++int bch2_create_trans(struct btree_trans *, u64, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct posix_acl *, ++ struct posix_acl *); ++ ++int bch2_link_trans(struct btree_trans *, u64, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *); ++ ++int bch2_unlink_trans(struct btree_trans *, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *); ++ ++int bch2_rename_trans(struct btree_trans *, ++ u64, struct bch_inode_unpacked *, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ const struct qstr *, ++ enum bch_rename_mode); ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *); ++ ++#endif /* _BCACHEFS_FS_COMMON_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +new file mode 100644 +index 000000000000..ec78e7b52375 +--- /dev/null ++++ b/fs/bcachefs/fs-io.c +@@ -0,0 +1,3132 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "error.h" ++#include "extents.h" ++#include "extent_update.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "journal.h" ++#include "io.h" ++#include "keylist.h" ++#include "quota.h" ++#include "reflink.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++struct quota_res { ++ u64 sectors; ++}; ++ ++struct bch_writepage_io { ++ struct closure cl; ++ struct bch_inode_info *inode; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_write { ++ struct completion done; ++ struct kiocb *req; ++ struct mm_struct *mm; ++ unsigned loop:1, ++ sync:1, ++ free_iov:1; ++ struct quota_res quota_res; ++ u64 written; ++ ++ struct iov_iter iter; ++ struct iovec inline_vecs[2]; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_read { ++ struct closure cl; ++ struct kiocb *req; ++ long ret; ++ struct bch_read_bio rbio; ++}; ++ ++/* pagecache_block must be held */ ++static int write_invalidate_inode_pages_range(struct address_space *mapping, ++ loff_t start, loff_t end) ++{ ++ int ret; ++ ++ /* ++ * XXX: the way this is currently implemented, we can spin if a process ++ * is continually redirtying a specific page ++ */ ++ do { ++ if (!mapping->nrpages && ++ !mapping->nrexceptional) ++ return 0; ++ 
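++		/*
++		 * Each pass writes back anything dirty in the range and then
++		 * tries to drop it from the page cache:
++		 * invalidate_inode_pages2_range() returns -EBUSY when a page
++		 * could not be invalidated (e.g. it was redirtied in the
++		 * meantime), so the whole writeback + invalidate sequence is
++		 * retried - hence the note above about spinning.
++		 */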
++ ret = filemap_write_and_wait_range(mapping, start, end); ++ if (ret) ++ break; ++ ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = invalidate_inode_pages2_range(mapping, ++ start >> PAGE_SHIFT, ++ end >> PAGE_SHIFT); ++ } while (ret == -EBUSY); ++ ++ return ret; ++} ++ ++/* quotas */ ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++ if (!res->sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ BUG_ON(res->sectors > inode->ei_quota_reserved); ++ ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, ++ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); ++ inode->ei_quota_reserved -= res->sectors; ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ res->sectors = 0; ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ int ret; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, ++ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); ++ if (likely(!ret)) { ++ inode->ei_quota_reserved += sectors; ++ res->sectors += sectors; ++ } ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++#else ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ return 0; ++} ++ ++#endif ++ ++/* i_size updates: */ ++ ++struct inode_new_size { ++ loff_t new_size; ++ u64 now; ++ unsigned fields; ++}; ++ ++static int inode_set_size(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_new_size *s = p; ++ ++ bi->bi_size = s->new_size; ++ if (s->fields & ATTR_ATIME) ++ bi->bi_atime = s->now; ++ if (s->fields & ATTR_MTIME) ++ bi->bi_mtime = s->now; ++ if (s->fields & ATTR_CTIME) ++ bi->bi_ctime = s->now; ++ ++ return 0; ++} ++ ++int __must_check bch2_write_inode_size(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ loff_t new_size, unsigned fields) ++{ ++ struct inode_new_size s = { ++ .new_size = new_size, ++ .now = bch2_current_time(c), ++ .fields = fields, ++ }; ++ ++ return bch2_write_inode(c, inode, inode_set_size, &s, fields); ++} ++ ++static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, ++ struct quota_res *quota_res, s64 sectors) ++{ ++ if (!sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++#ifdef CONFIG_BCACHEFS_QUOTA ++ if (quota_res && sectors > 0) { ++ BUG_ON(sectors > quota_res->sectors); ++ BUG_ON(sectors > inode->ei_quota_reserved); ++ ++ quota_res->sectors -= sectors; ++ inode->ei_quota_reserved -= sectors; ++ } else { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); ++ } ++#endif ++ inode->v.i_blocks += sectors; ++ mutex_unlock(&inode->ei_quota_lock); ++} ++ ++/* page state: */ ++ ++/* stored in page->private: */ ++ ++struct bch_page_sector { ++ /* Uncompressed, fully allocated replicas: */ ++ unsigned nr_replicas:3; ++ ++ /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ ++ unsigned replicas_reserved:3; ++ ++ /* i_sectors: */ ++ enum { ++ SECTOR_UNALLOCATED, ++ SECTOR_RESERVED, ++ SECTOR_DIRTY, ++ SECTOR_ALLOCATED, ++ } state:2; ++}; ++ ++struct bch_page_state { ++ spinlock_t lock; ++ atomic_t write_count; ++ struct bch_page_sector 
s[PAGE_SECTORS]; ++}; ++ ++static inline struct bch_page_state *__bch2_page_state(struct page *page) ++{ ++ return page_has_private(page) ++ ? (struct bch_page_state *) page_private(page) ++ : NULL; ++} ++ ++static inline struct bch_page_state *bch2_page_state(struct page *page) ++{ ++ EBUG_ON(!PageLocked(page)); ++ ++ return __bch2_page_state(page); ++} ++ ++/* for newly allocated pages: */ ++static void __bch2_page_state_release(struct page *page) ++{ ++ struct bch_page_state *s = __bch2_page_state(page); ++ ++ if (!s) ++ return; ++ ++ ClearPagePrivate(page); ++ set_page_private(page, 0); ++ put_page(page); ++ kfree(s); ++} ++ ++static void bch2_page_state_release(struct page *page) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ ++ if (!s) ++ return; ++ ++ ClearPagePrivate(page); ++ set_page_private(page, 0); ++ put_page(page); ++ kfree(s); ++} ++ ++/* for newly allocated pages: */ ++static struct bch_page_state *__bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ struct bch_page_state *s; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS|gfp); ++ if (!s) ++ return NULL; ++ ++ spin_lock_init(&s->lock); ++ /* ++ * migrate_page_move_mapping() assumes that pages with private data ++ * have their count elevated by 1. ++ */ ++ get_page(page); ++ set_page_private(page, (unsigned long) s); ++ SetPagePrivate(page); ++ return s; ++} ++ ++static struct bch_page_state *bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); ++} ++ ++static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) ++{ ++ /* XXX: this should not be open coded */ ++ return inode->ei_inode.bi_data_replicas ++ ? inode->ei_inode.bi_data_replicas - 1 ++ : c->opts.data_replicas; ++} ++ ++static inline unsigned sectors_to_reserve(struct bch_page_sector *s, ++ unsigned nr_replicas) ++{ ++ return max(0, (int) nr_replicas - ++ s->nr_replicas - ++ s->replicas_reserved); ++} ++ ++static int bch2_get_page_disk_reservation(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct page *page, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned nr_replicas = inode_nr_replicas(c, inode); ++ struct disk_reservation disk_res = { 0 }; ++ unsigned i, disk_res_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ if (!disk_res_sectors) ++ return 0; ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ disk_res_sectors, 1, ++ !check_enospc ++ ? 
BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ s->s[i].replicas_reserved += ++ sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ return 0; ++} ++ ++struct bch2_page_reservation { ++ struct disk_reservation disk; ++ struct quota_res quota; ++}; ++ ++static void bch2_page_reservation_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ memset(res, 0, sizeof(*res)); ++ ++ res->disk.nr_replicas = inode_nr_replicas(c, inode); ++} ++ ++static void bch2_page_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ bch2_disk_reservation_put(c, &res->disk); ++ bch2_quota_reservation_put(c, inode, &res->quota); ++} ++ ++static int bch2_page_reservation_get(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned i, disk_sectors = 0, quota_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ disk_sectors += sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; ++ } ++ ++ if (disk_sectors) { ++ ret = bch2_disk_reservation_add(c, &res->disk, ++ disk_sectors, ++ !check_enospc ++ ? BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ if (quota_sectors) { ++ ret = bch2_quota_reservation_add(c, inode, &res->quota, ++ quota_sectors, ++ check_enospc); ++ if (unlikely(ret)) { ++ struct disk_reservation tmp = { ++ .sectors = disk_sectors ++ }; ++ ++ bch2_disk_reservation_put(c, &tmp); ++ res->disk.sectors -= disk_sectors; ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_clear_page_bits(struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_page_state *s = bch2_page_state(page); ++ struct disk_reservation disk_res = { 0 }; ++ int i, dirty_sectors = 0; ++ ++ if (!s) ++ return; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(PageWriteback(page)); ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) { ++ disk_res.sectors += s->s[i].replicas_reserved; ++ s->s[i].replicas_reserved = 0; ++ ++ if (s->s[i].state == SECTOR_DIRTY) { ++ dirty_sectors++; ++ s->s[i].state = SECTOR_UNALLOCATED; ++ } ++ } ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (dirty_sectors) ++ i_sectors_acct(c, inode, NULL, -dirty_sectors); ++ ++ bch2_page_state_release(page); ++} ++ ++static void bch2_set_page_dirty(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i, dirty_sectors = 0; ++ ++ WARN_ON((u64) page_offset(page) + offset + len > ++ round_up((u64) i_size_read(&inode->v), block_bytes(c))); ++ ++ spin_lock(&s->lock); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ unsigned sectors = sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ ++ /* ++ * This can happen if we race with the error path in ++ * bch2_writepage_io_done(): ++ */ ++ sectors = min_t(unsigned, sectors, res->disk.sectors); ++ ++ s->s[i].replicas_reserved += 
sectors; ++ res->disk.sectors -= sectors; ++ ++ if (s->s[i].state == SECTOR_UNALLOCATED) ++ dirty_sectors++; ++ ++ s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); ++ } ++ ++ spin_unlock(&s->lock); ++ ++ if (dirty_sectors) ++ i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ ++ if (!PageDirty(page)) ++ __set_page_dirty_nobuffers(page); ++} ++ ++vm_fault_t bch2_page_fault(struct vm_fault *vmf) ++{ ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ int ret; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ret = filemap_fault(vmf); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return ret; ++} ++ ++vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) ++{ ++ struct page *page = vmf->page; ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation res; ++ unsigned len; ++ loff_t isize; ++ int ret = VM_FAULT_LOCKED; ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ sb_start_pagefault(inode->v.i_sb); ++ file_update_time(file); ++ ++ /* ++ * Not strictly necessary, but helps avoid dio writes livelocking in ++ * write_invalidate_inode_pages_range() - can drop this if/when we get ++ * a write_invalidate_inode_pages_range() that works without dropping ++ * page lock before invalidating page ++ */ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ lock_page(page); ++ isize = i_size_read(&inode->v); ++ ++ if (page->mapping != mapping || page_offset(page) >= isize) { ++ unlock_page(page); ++ ret = VM_FAULT_NOPAGE; ++ goto out; ++ } ++ ++ len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); ++ ++ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { ++ unlock_page(page); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ ++ bch2_set_page_dirty(c, inode, page, &res, 0, len); ++ bch2_page_reservation_put(c, inode, &res); ++ ++ wait_for_stable_page(page); ++out: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ sb_end_pagefault(inode->v.i_sb); ++ ++ return ret; ++} ++ ++void bch2_invalidatepage(struct page *page, unsigned int offset, ++ unsigned int length) ++{ ++ if (offset || length < PAGE_SIZE) ++ return; ++ ++ bch2_clear_page_bits(page); ++} ++ ++int bch2_releasepage(struct page *page, gfp_t gfp_mask) ++{ ++ if (PageDirty(page)) ++ return 0; ++ ++ bch2_clear_page_bits(page); ++ return 1; ++} ++ ++#ifdef CONFIG_MIGRATION ++int bch2_migrate_page(struct address_space *mapping, struct page *newpage, ++ struct page *page, enum migrate_mode mode) ++{ ++ int ret; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(!PageLocked(newpage)); ++ ++ ret = migrate_page_move_mapping(mapping, newpage, page, 0); ++ if (ret != MIGRATEPAGE_SUCCESS) ++ return ret; ++ ++ if (PagePrivate(page)) { ++ ClearPagePrivate(page); ++ get_page(newpage); ++ set_page_private(newpage, page_private(page)); ++ set_page_private(page, 0); ++ put_page(page); ++ SetPagePrivate(newpage); ++ } ++ ++ if (mode != MIGRATE_SYNC_NO_COPY) ++ migrate_page_copy(newpage, page); ++ else ++ migrate_page_states(newpage, page); ++ return MIGRATEPAGE_SUCCESS; ++} ++#endif ++ ++/* readpage(s): */ ++ ++static void bch2_readpages_end_io(struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) { ++ struct page *page = bv->bv_page; ++ ++ if (!bio->bi_status) { ++ SetPageUptodate(page); ++ } else { ++ 
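++			/*
++			 * Read error: leave the page !uptodate so a later
++			 * read will retry it, and record the error on the
++			 * page.
++			 */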
ClearPageUptodate(page); ++ SetPageError(page); ++ } ++ unlock_page(page); ++ } ++ ++ bio_put(bio); ++} ++ ++static inline void page_state_init_for_read(struct page *page) ++{ ++ SetPagePrivate(page); ++ page->private = 0; ++} ++ ++struct readpages_iter { ++ struct address_space *mapping; ++ struct page **pages; ++ unsigned nr_pages; ++ unsigned nr_added; ++ unsigned idx; ++ pgoff_t offset; ++}; ++ ++static int readpages_iter_init(struct readpages_iter *iter, ++ struct address_space *mapping, ++ struct list_head *pages, unsigned nr_pages) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->mapping = mapping; ++ iter->offset = list_last_entry(pages, struct page, lru)->index; ++ ++ iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); ++ if (!iter->pages) ++ return -ENOMEM; ++ ++ while (!list_empty(pages)) { ++ struct page *page = list_last_entry(pages, struct page, lru); ++ ++ __bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ iter->pages[iter->nr_pages++] = page; ++ list_del(&page->lru); ++ } ++ ++ return 0; ++} ++ ++static inline struct page *readpage_iter_next(struct readpages_iter *iter) ++{ ++ struct page *page; ++ unsigned i; ++ int ret; ++ ++ BUG_ON(iter->idx > iter->nr_added); ++ BUG_ON(iter->nr_added > iter->nr_pages); ++ ++ if (iter->idx < iter->nr_added) ++ goto out; ++ ++ while (1) { ++ if (iter->idx == iter->nr_pages) ++ return NULL; ++ ++ ret = add_to_page_cache_lru_vec(iter->mapping, ++ iter->pages + iter->nr_added, ++ iter->nr_pages - iter->nr_added, ++ iter->offset + iter->nr_added, ++ GFP_NOFS); ++ if (ret > 0) ++ break; ++ ++ page = iter->pages[iter->nr_added]; ++ iter->idx++; ++ iter->nr_added++; ++ ++ __bch2_page_state_release(page); ++ put_page(page); ++ } ++ ++ iter->nr_added += ret; ++ ++ for (i = iter->idx; i < iter->nr_added; i++) ++ put_page(iter->pages[i]); ++out: ++ EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); ++ ++ return iter->pages[iter->idx]; ++} ++ ++static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) ++{ ++ struct bvec_iter iter; ++ struct bio_vec bv; ++ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ++ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = k.k->type == KEY_TYPE_reservation ++ ? 
SECTOR_RESERVED ++ : SECTOR_ALLOCATED; ++ ++ bio_for_each_segment(bv, bio, iter) { ++ struct bch_page_state *s = bch2_page_state(bv.bv_page); ++ unsigned i; ++ ++ for (i = bv.bv_offset >> 9; ++ i < (bv.bv_offset + bv.bv_len) >> 9; ++ i++) { ++ s->s[i].nr_replicas = nr_ptrs; ++ s->s[i].state = state; ++ } ++ } ++} ++ ++static bool extent_partial_reads_expensive(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (crc.csum_type || crc.compression_type) ++ return true; ++ return false; ++} ++ ++static void readpage_bio_extend(struct readpages_iter *iter, ++ struct bio *bio, ++ unsigned sectors_this_extent, ++ bool get_more) ++{ ++ while (bio_sectors(bio) < sectors_this_extent && ++ bio->bi_vcnt < bio->bi_max_vecs) { ++ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; ++ struct page *page = readpage_iter_next(iter); ++ int ret; ++ ++ if (page) { ++ if (iter->offset + iter->idx != page_offset) ++ break; ++ ++ iter->idx++; ++ } else { ++ if (!get_more) ++ break; ++ ++ page = xa_load(&iter->mapping->i_pages, page_offset); ++ if (page && !xa_is_value(page)) ++ break; ++ ++ page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); ++ if (!page) ++ break; ++ ++ if (!__bch2_page_state_create(page, 0)) { ++ put_page(page); ++ break; ++ } ++ ++ ret = add_to_page_cache_lru(page, iter->mapping, ++ page_offset, GFP_NOFS); ++ if (ret) { ++ __bch2_page_state_release(page); ++ put_page(page); ++ break; ++ } ++ ++ put_page(page); ++ } ++ ++ BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); ++ } ++} ++ ++static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, ++ struct bch_read_bio *rbio, u64 inum, ++ struct readpages_iter *readpages_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_on_stack sk; ++ int flags = BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE; ++ int ret = 0; ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ ++ bkey_on_stack_init(&sk); ++retry: ++ while (1) { ++ struct bkey_s_c k; ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bch2_btree_iter_set_pos(iter, ++ POS(inum, rbio->bio.bi_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(trans, ++ &offset_into_extent, &sk); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bch2_trans_unlock(trans); ++ ++ if (readpages_iter) ++ readpage_bio_extend(readpages_iter, &rbio->bio, sectors, ++ extent_partial_reads_expensive(k)); ++ ++ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ ++ if (rbio->bio.bi_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ if (bkey_extent_is_allocation(k.k)) ++ bch2_add_page_sectors(&rbio->bio, k); ++ ++ bch2_read_extent(c, rbio, k, offset_into_extent, flags); ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ bio_advance(&rbio->bio, bytes); ++ } ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ if (ret) { ++ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); ++ bio_endio(&rbio->bio); ++ } ++ ++ bkey_on_stack_exit(&sk, c); ++} ++ ++int bch2_readpages(struct file *file, struct address_space *mapping, 
++ struct list_head *pages, unsigned nr_pages) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct page *page; ++ struct readpages_iter readpages_iter; ++ int ret; ++ ++ ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); ++ BUG_ON(ret); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS); ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ while ((page = readpage_iter_next(&readpages_iter))) { ++ pgoff_t index = readpages_iter.offset + readpages_iter.idx; ++ unsigned n = min_t(unsigned, ++ readpages_iter.nr_pages - ++ readpages_iter.idx, ++ BIO_MAX_PAGES); ++ struct bch_read_bio *rbio = ++ rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), ++ opts); ++ ++ readpages_iter.idx++; ++ ++ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); ++ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bchfs_read(&trans, iter, rbio, inode->v.i_ino, ++ &readpages_iter); ++ } ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_trans_exit(&trans); ++ kfree(readpages_iter.pages); ++ ++ return 0; ++} ++ ++static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, ++ u64 inum, struct page *page) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ ++ bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); ++ rbio->bio.bi_iter.bi_sector = ++ (sector_t) page->index << PAGE_SECTOR_SHIFT; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS); ++ ++ bchfs_read(&trans, iter, rbio, inum, NULL); ++ ++ bch2_trans_exit(&trans); ++} ++ ++int bch2_readpage(struct file *file, struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct bch_read_bio *rbio; ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ ++ __bchfs_readpage(c, rbio, inode->v.i_ino, page); ++ return 0; ++} ++ ++static void bch2_read_single_page_end_io(struct bio *bio) ++{ ++ complete(bio->bi_private); ++} ++ ++static int bch2_read_single_page(struct page *page, ++ struct address_space *mapping) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_read_bio *rbio; ++ int ret; ++ DECLARE_COMPLETION_ONSTACK(done); ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), ++ io_opts(c, &inode->ei_inode)); ++ rbio->bio.bi_private = &done; ++ rbio->bio.bi_end_io = bch2_read_single_page_end_io; ++ ++ __bchfs_readpage(c, rbio, inode->v.i_ino, page); ++ wait_for_completion(&done); ++ ++ ret = blk_status_to_errno(rbio->bio.bi_status); ++ bio_put(&rbio->bio); ++ ++ if (ret < 0) ++ return ret; ++ ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* writepages: */ ++ ++struct bch_writepage_state { ++ struct bch_writepage_io *io; ++ struct bch_io_opts opts; ++}; ++ ++static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, 
++ struct bch_inode_info *inode) ++{ ++ return (struct bch_writepage_state) { ++ .opts = io_opts(c, &inode->ei_inode) ++ }; ++} ++ ++static void bch2_writepage_io_free(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ ++ bio_put(&io->op.wbio.bio); ++} ++ ++static void bch2_writepage_io_done(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ struct bch_fs *c = io->op.c; ++ struct bio *bio = &io->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bvec; ++ unsigned i; ++ ++ if (io->op.error) { ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ SetPageError(bvec->bv_page); ++ mapping_set_error(bvec->bv_page->mapping, -EIO); ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ /* ++ * racing with fallocate can cause us to add fewer sectors than ++ * expected - but we shouldn't add more sectors than expected: ++ */ ++ BUG_ON(io->op.i_sectors_delta > 0); ++ ++ /* ++ * (error (due to going RO) halfway through a page can screw that up ++ * slightly) ++ * XXX wtf? ++ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); ++ */ ++ ++ /* ++ * PageWriteback is effectively our ref on the inode - fixup i_blocks ++ * before calling end_page_writeback: ++ */ ++ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); ++ ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s = __bch2_page_state(bvec->bv_page); ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(bvec->bv_page); ++ } ++ ++ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); ++} ++ ++static void bch2_writepage_do_io(struct bch_writepage_state *w) ++{ ++ struct bch_writepage_io *io = w->io; ++ ++ w->io = NULL; ++ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); ++ continue_at(&io->cl, bch2_writepage_io_done, NULL); ++} ++ ++/* ++ * Get a bch_writepage_io and add @page to it - appending to an existing one if ++ * possible, else allocating a new one: ++ */ ++static void bch2_writepage_io_alloc(struct bch_fs *c, ++ struct writeback_control *wbc, ++ struct bch_writepage_state *w, ++ struct bch_inode_info *inode, ++ u64 sector, ++ unsigned nr_replicas) ++{ ++ struct bch_write_op *op; ++ ++ w->io = container_of(bio_alloc_bioset(GFP_NOFS, ++ BIO_MAX_PAGES, ++ &c->writepage_bioset), ++ struct bch_writepage_io, op.wbio.bio); ++ ++ closure_init(&w->io->cl, NULL); ++ w->io->inode = inode; ++ ++ op = &w->io->op; ++ bch2_write_op_init(op, c, w->opts); ++ op->target = w->opts.foreground_target; ++ op_journal_seq_set(op, &inode->ei_journal_seq); ++ op->nr_replicas = nr_replicas; ++ op->res.nr_replicas = nr_replicas; ++ op->write_point = writepoint_hashed(inode->ei_last_dirtied); ++ op->pos = POS(inode->v.i_ino, sector); ++ op->wbio.bio.bi_iter.bi_sector = sector; ++ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); ++} ++ ++static int __bch2_writepage(struct page *page, ++ struct writeback_control *wbc, ++ void *data) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; 
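++	/*
++	 * Writeback callback, used directly by bch2_writepage() and via
++	 * write_cache_pages() by bch2_writepages(); @data is the
++	 * bch_writepage_state carrying the currently open bch_writepage_io
++	 * that contiguous dirty sectors get appended to.
++	 *
++	 * A page straddling i_size has its tail zeroed before being written
++	 * out: e.g. with 4K pages and i_size == 5000, page index 1 (file
++	 * bytes 4096..8191) straddles i_size, offset = 5000 & 4095 = 904,
++	 * and page bytes 904..4095 are zeroed by zero_user_segment() below.
++	 */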
++ struct bch_writepage_state *w = data; ++ struct bch_page_state *s, orig; ++ unsigned i, offset, nr_replicas_this_write = U32_MAX; ++ loff_t i_size = i_size_read(&inode->v); ++ pgoff_t end_index = i_size >> PAGE_SHIFT; ++ int ret; ++ ++ EBUG_ON(!PageUptodate(page)); ++ ++ /* Is the page fully inside i_size? */ ++ if (page->index < end_index) ++ goto do_io; ++ ++ /* Is the page fully outside i_size? (truncate in progress) */ ++ offset = i_size & (PAGE_SIZE - 1); ++ if (page->index > end_index || !offset) { ++ unlock_page(page); ++ return 0; ++ } ++ ++ /* ++ * The page straddles i_size. It must be zeroed out on each and every ++ * writepage invocation because it may be mmapped. "A file is mapped ++ * in multiples of the page size. For a file that is not a multiple of ++ * the page size, the remaining memory is zeroed when mapped, and ++ * writes to that region are not written out to the file." ++ */ ++ zero_user_segment(page, offset, PAGE_SIZE); ++do_io: ++ s = bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ ret = bch2_get_page_disk_reservation(c, inode, page, true); ++ if (ret) { ++ SetPageError(page); ++ mapping_set_error(page->mapping, ret); ++ unlock_page(page); ++ return 0; ++ } ++ ++ /* Before unlocking the page, get copy of reservations: */ ++ orig = *s; ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ nr_replicas_this_write = ++ min_t(unsigned, nr_replicas_this_write, ++ s->s[i].nr_replicas + ++ s->s[i].replicas_reserved); ++ } ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ s->s[i].nr_replicas = w->opts.compression ++ ? 0 : nr_replicas_this_write; ++ ++ s->s[i].replicas_reserved = 0; ++ s->s[i].state = SECTOR_ALLOCATED; ++ } ++ ++ BUG_ON(atomic_read(&s->write_count)); ++ atomic_set(&s->write_count, 1); ++ ++ BUG_ON(PageWriteback(page)); ++ set_page_writeback(page); ++ ++ unlock_page(page); ++ ++ offset = 0; ++ while (1) { ++ unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; ++ u64 sector; ++ ++ while (offset < PAGE_SECTORS && ++ orig.s[offset].state < SECTOR_DIRTY) ++ offset++; ++ ++ if (offset == PAGE_SECTORS) ++ break; ++ ++ sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; ++ ++ while (offset + sectors < PAGE_SECTORS && ++ orig.s[offset + sectors].state >= SECTOR_DIRTY) ++ sectors++; ++ ++ for (i = offset; i < offset + sectors; i++) { ++ reserved_sectors += orig.s[i].replicas_reserved; ++ dirty_sectors += orig.s[i].state == SECTOR_DIRTY; ++ } ++ ++ if (w->io && ++ (w->io->op.res.nr_replicas != nr_replicas_this_write || ++ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || ++ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= ++ (BIO_MAX_PAGES * PAGE_SIZE) || ++ bio_end_sector(&w->io->op.wbio.bio) != sector)) ++ bch2_writepage_do_io(w); ++ ++ if (!w->io) ++ bch2_writepage_io_alloc(c, wbc, w, inode, sector, ++ nr_replicas_this_write); ++ ++ atomic_inc(&s->write_count); ++ ++ BUG_ON(inode != w->io->inode); ++ BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, ++ sectors << 9, offset << 9)); ++ ++ /* Check for writing past i_size: */ ++ WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > ++ round_up(i_size, block_bytes(c))); ++ ++ w->io->op.res.sectors += reserved_sectors; ++ w->io->op.i_sectors_delta -= dirty_sectors; ++ w->io->op.new_i_size = i_size; ++ ++ offset += sectors; ++ } ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(page); ++ ++ return 0; ++} ++ ++int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) 
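++/*
++ * Walks all dirty pages in the mapping under a block layer plug: each
++ * __bch2_writepage() call either appends to w.io or submits it and starts a
++ * new one, and any bch_writepage_io still open after write_cache_pages()
++ * returns is submitted here before unplugging.
++ */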
++{ ++ struct bch_fs *c = mapping->host->i_sb->s_fs_info; ++ struct bch_writepage_state w = ++ bch_writepage_state_init(c, to_bch_ei(mapping->host)); ++ struct blk_plug plug; ++ int ret; ++ ++ blk_start_plug(&plug); ++ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); ++ if (w.io) ++ bch2_writepage_do_io(&w); ++ blk_finish_plug(&plug); ++ return ret; ++} ++ ++int bch2_writepage(struct page *page, struct writeback_control *wbc) ++{ ++ struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; ++ struct bch_writepage_state w = ++ bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); ++ int ret; ++ ++ ret = __bch2_writepage(page, wbc, &w); ++ if (w.io) ++ bch2_writepage_do_io(&w); ++ ++ return ret; ++} ++ ++/* buffered writes: */ ++ ++int bch2_write_begin(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res; ++ pgoff_t index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ struct page *page; ++ int ret = -ENOMEM; ++ ++ res = kmalloc(sizeof(*res), GFP_KERNEL); ++ if (!res) ++ return -ENOMEM; ++ ++ bch2_page_reservation_init(c, inode, res); ++ *fsdata = res; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ page = grab_cache_page_write_begin(mapping, index, flags); ++ if (!page) ++ goto err_unlock; ++ ++ if (PageUptodate(page)) ++ goto out; ++ ++ /* If we're writing entire page, don't need to read it in first: */ ++ if (len == PAGE_SIZE) ++ goto out; ++ ++ if (!offset && pos + len >= inode->v.i_size) { ++ zero_user_segment(page, len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++ ++ if (index > inode->v.i_size >> PAGE_SHIFT) { ++ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++readpage: ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto err; ++out: ++ ret = bch2_page_reservation_get(c, inode, page, res, ++ offset, len, true); ++ if (ret) { ++ if (!PageUptodate(page)) { ++ /* ++ * If the page hasn't been read in, we won't know if we ++ * actually need a reservation - we don't actually need ++ * to read here, we just need to check if the page is ++ * fully backed by uncompressed data: ++ */ ++ goto readpage; ++ } ++ ++ goto err; ++ } ++ ++ *pagep = page; ++ return 0; ++err: ++ unlock_page(page); ++ put_page(page); ++ *pagep = NULL; ++err_unlock: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ kfree(res); ++ *fsdata = NULL; ++ return ret; ++} ++ ++int bch2_write_end(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res = fsdata; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ if (unlikely(copied < len && !PageUptodate(page))) { ++ /* ++ * The page needs to be read in, but that would destroy ++ * our partial write - simplest thing is to just force ++ * userspace to redo the write: ++ */ ++ zero_user(page, 0, PAGE_SIZE); ++ flush_dcache_page(page); ++ copied = 0; ++ } ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ if (copied) { ++ if 
(!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, res, offset, copied); ++ ++ inode->ei_last_dirtied = (unsigned long) current; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_page_reservation_put(c, inode, res); ++ kfree(res); ++ ++ return copied; ++} ++ ++#define WRITE_BATCH_PAGES 32 ++ ++static int __bch2_buffered_write(struct bch_inode_info *inode, ++ struct address_space *mapping, ++ struct iov_iter *iter, ++ loff_t pos, unsigned len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct page *pages[WRITE_BATCH_PAGES]; ++ struct bch2_page_reservation res; ++ unsigned long index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); ++ unsigned i, reserved = 0, set_dirty = 0; ++ unsigned copied = 0, nr_pages_copied = 0; ++ int ret = 0; ++ ++ BUG_ON(!len); ++ BUG_ON(nr_pages > ARRAY_SIZE(pages)); ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ for (i = 0; i < nr_pages; i++) { ++ pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); ++ if (!pages[i]) { ++ nr_pages = i; ++ if (!i) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ len = min_t(unsigned, len, ++ nr_pages * PAGE_SIZE - offset); ++ break; ++ } ++ } ++ ++ if (offset && !PageUptodate(pages[0])) { ++ ret = bch2_read_single_page(pages[0], mapping); ++ if (ret) ++ goto out; ++ } ++ ++ if ((pos + len) & (PAGE_SIZE - 1) && ++ !PageUptodate(pages[nr_pages - 1])) { ++ if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { ++ zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); ++ } else { ++ ret = bch2_read_single_page(pages[nr_pages - 1], mapping); ++ if (ret) ++ goto out; ++ } ++ } ++ ++ while (reserved < len) { ++ struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - reserved, ++ PAGE_SIZE - pg_offset); ++retry_reservation: ++ ret = bch2_page_reservation_get(c, inode, page, &res, ++ pg_offset, pg_len, true); ++ ++ if (ret && !PageUptodate(page)) { ++ ret = bch2_read_single_page(page, mapping); ++ if (!ret) ++ goto retry_reservation; ++ } ++ ++ if (ret) ++ goto out; ++ ++ reserved += pg_len; ++ } ++ ++ if (mapping_writably_mapped(mapping)) ++ for (i = 0; i < nr_pages; i++) ++ flush_dcache_page(pages[i]); ++ ++ while (copied < len) { ++ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - copied, ++ PAGE_SIZE - pg_offset); ++ unsigned pg_copied = iov_iter_copy_from_user_atomic(page, ++ iter, pg_offset, pg_len); ++ ++ if (!pg_copied) ++ break; ++ ++ flush_dcache_page(page); ++ iov_iter_advance(iter, pg_copied); ++ copied += pg_copied; ++ } ++ ++ if (!copied) ++ goto out; ++ ++ if (copied < len && ++ ((offset + copied) & (PAGE_SIZE - 1))) { ++ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; ++ ++ if (!PageUptodate(page)) { ++ zero_user(page, 0, PAGE_SIZE); ++ copied -= (offset + copied) & (PAGE_SIZE - 1); ++ } ++ } ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ while (set_dirty < copied) { ++ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, copied - set_dirty, ++ PAGE_SIZE - pg_offset); ++ ++ 
if (!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); ++ unlock_page(page); ++ put_page(page); ++ ++ set_dirty += pg_len; ++ } ++ ++ nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); ++ inode->ei_last_dirtied = (unsigned long) current; ++out: ++ for (i = nr_pages_copied; i < nr_pages; i++) { ++ unlock_page(pages[i]); ++ put_page(pages[i]); ++ } ++ ++ bch2_page_reservation_put(c, inode, &res); ++ ++ return copied ?: ret; ++} ++ ++static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ loff_t pos = iocb->ki_pos; ++ ssize_t written = 0; ++ int ret = 0; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ do { ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE * WRITE_BATCH_PAGES - offset); ++again: ++ /* ++ * Bring in the user page that we will copy from _first_. ++ * Otherwise there's a nasty deadlock on copying from the ++ * same page as we're writing to, without it being marked ++ * up-to-date. ++ * ++ * Not only is this an optimisation, but it is also required ++ * to check that the address is actually valid, when atomic ++ * usercopies are used, below. ++ */ ++ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { ++ bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE - offset); ++ ++ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { ++ ret = -EFAULT; ++ break; ++ } ++ } ++ ++ if (unlikely(fatal_signal_pending(current))) { ++ ret = -EINTR; ++ break; ++ } ++ ++ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); ++ if (unlikely(ret < 0)) ++ break; ++ ++ cond_resched(); ++ ++ if (unlikely(ret == 0)) { ++ /* ++ * If we were unable to copy any data at all, we must ++ * fall back to a single segment length write. ++ * ++ * If we didn't fallback here, we could livelock ++ * because not all segments in the iov can be copied at ++ * once without a pagefault. ++ */ ++ bytes = min_t(unsigned long, PAGE_SIZE - offset, ++ iov_iter_single_seg_count(iter)); ++ goto again; ++ } ++ pos += ret; ++ written += ret; ++ ++ balance_dirty_pages_ratelimited(mapping); ++ } while (iov_iter_count(iter)); ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return written ? 
written : ret; ++} ++ ++/* O_DIRECT reads */ ++ ++static void bch2_dio_read_complete(struct closure *cl) ++{ ++ struct dio_read *dio = container_of(cl, struct dio_read, cl); ++ ++ dio->req->ki_complete(dio->req, dio->ret, 0); ++ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ ++} ++ ++static void bch2_direct_IO_read_endio(struct bio *bio) ++{ ++ struct dio_read *dio = bio->bi_private; ++ ++ if (bio->bi_status) ++ dio->ret = blk_status_to_errno(bio->bi_status); ++ ++ closure_put(&dio->cl); ++} ++ ++static void bch2_direct_IO_read_split_endio(struct bio *bio) ++{ ++ bch2_direct_IO_read_endio(bio); ++ bio_check_pages_dirty(bio); /* transfers ownership */ ++} ++ ++static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct dio_read *dio; ++ struct bio *bio; ++ loff_t offset = req->ki_pos; ++ bool sync = is_sync_kiocb(req); ++ size_t shorten; ++ ssize_t ret; ++ ++ if ((offset|iter->count) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ ret = min_t(loff_t, iter->count, ++ max_t(loff_t, 0, i_size_read(&inode->v) - offset)); ++ ++ if (!ret) ++ return ret; ++ ++ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); ++ iter->count -= shorten; ++ ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_PAGES), ++ &c->dio_read_bioset); ++ ++ bio->bi_end_io = bch2_direct_IO_read_endio; ++ ++ dio = container_of(bio, struct dio_read, rbio.bio); ++ closure_init(&dio->cl, NULL); ++ ++ /* ++ * this is a _really_ horrible hack just to avoid an atomic sub at the ++ * end: ++ */ ++ if (!sync) { ++ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); ++ atomic_set(&dio->cl.remaining, ++ CLOSURE_REMAINING_INITIALIZER - ++ CLOSURE_RUNNING + ++ CLOSURE_DESTRUCTOR); ++ } else { ++ atomic_set(&dio->cl.remaining, ++ CLOSURE_REMAINING_INITIALIZER + 1); ++ } ++ ++ dio->req = req; ++ dio->ret = ret; ++ ++ goto start; ++ while (iter->count) { ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_PAGES), ++ &c->bio_read); ++ bio->bi_end_io = bch2_direct_IO_read_split_endio; ++start: ++ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); ++ bio->bi_iter.bi_sector = offset >> 9; ++ bio->bi_private = dio; ++ ++ ret = bio_iov_iter_get_pages(bio, iter); ++ if (ret < 0) { ++ /* XXX: fault inject this path */ ++ bio->bi_status = BLK_STS_RESOURCE; ++ bio_endio(bio); ++ break; ++ } ++ ++ offset += bio->bi_iter.bi_size; ++ bio_set_pages_dirty(bio); ++ ++ if (iter->count) ++ closure_get(&dio->cl); ++ ++ bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); ++ } ++ ++ iter->count += shorten; ++ ++ if (sync) { ++ closure_sync(&dio->cl); ++ closure_debug_destroy(&dio->cl); ++ ret = dio->ret; ++ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ ++ return ret; ++ } else { ++ return -EIOCBQUEUED; ++ } ++} ++ ++ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ size_t count = iov_iter_count(iter); ++ ssize_t ret; ++ ++ if (!count) ++ return 0; /* skip atime */ ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ struct blk_plug plug; ++ ++ ret = filemap_write_and_wait_range(mapping, ++ iocb->ki_pos, ++ iocb->ki_pos + count - 1); ++ if (ret < 0) ++ return ret; ++ ++ file_accessed(file); ++ ++ 
blk_start_plug(&plug); ++ ret = bch2_direct_IO_read(iocb, iter); ++ blk_finish_plug(&plug); ++ ++ if (ret >= 0) ++ iocb->ki_pos += ret; ++ } else { ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ret = generic_file_read_iter(iocb, iter); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ } ++ ++ return ret; ++} ++ ++/* O_DIRECT writes */ ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *); ++ ++static long bch2_dio_write_loop(struct dio_write *dio) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct kiocb *req = dio->req; ++ struct address_space *mapping = req->ki_filp->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bio *bio = &dio->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ unsigned unaligned; ++ bool sync = dio->sync; ++ long ret; ++ ++ if (dio->loop) ++ goto loop; ++ ++ while (1) { ++ if (kthread) ++ use_mm(dio->mm); ++ BUG_ON(current->faults_disabled_mapping); ++ current->faults_disabled_mapping = mapping; ++ ++ ret = bio_iov_iter_get_pages(bio, &dio->iter); ++ ++ current->faults_disabled_mapping = NULL; ++ if (kthread) ++ unuse_mm(dio->mm); ++ ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); ++ bio->bi_iter.bi_size -= unaligned; ++ iov_iter_revert(&dio->iter, unaligned); ++ ++ if (!bio->bi_iter.bi_size) { ++ /* ++ * bio_iov_iter_get_pages was only able to get < ++ * blocksize worth of pages: ++ */ ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ ret = -EFAULT; ++ goto err; ++ } ++ ++ bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); ++ dio->op.end_io = bch2_dio_write_loop_async; ++ dio->op.target = dio->op.opts.foreground_target; ++ op_journal_seq_set(&dio->op, &inode->ei_journal_seq); ++ dio->op.write_point = writepoint_hashed((unsigned long) current); ++ dio->op.nr_replicas = dio->op.opts.data_replicas; ++ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); ++ ++ if ((req->ki_flags & IOCB_DSYNC) && ++ !c->opts.journal_flush_disabled) ++ dio->op.flags |= BCH_WRITE_FLUSH; ++ ++ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), ++ dio->op.opts.data_replicas, 0); ++ if (unlikely(ret) && ++ !bch2_check_range_allocated(c, dio->op.pos, ++ bio_sectors(bio), dio->op.opts.data_replicas)) ++ goto err; ++ ++ task_io_account_write(bio->bi_iter.bi_size); ++ ++ if (!dio->sync && !dio->loop && dio->iter.count) { ++ struct iovec *iov = dio->inline_vecs; ++ ++ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { ++ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), ++ GFP_KERNEL); ++ if (unlikely(!iov)) { ++ dio->sync = sync = true; ++ goto do_io; ++ } ++ ++ dio->free_iov = true; ++ } ++ ++ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); ++ dio->iter.iov = iov; ++ } ++do_io: ++ dio->loop = true; ++ closure_call(&dio->op.cl, bch2_write, NULL, NULL); ++ ++ if (sync) ++ wait_for_completion(&dio->done); ++ else ++ return -EIOCBQUEUED; ++loop: ++ i_sectors_acct(c, inode, &dio->quota_res, ++ dio->op.i_sectors_delta); ++ req->ki_pos += (u64) dio->op.written << 9; ++ dio->written += dio->op.written; ++ ++ spin_lock(&inode->v.i_lock); ++ if (req->ki_pos > inode->v.i_size) ++ i_size_write(&inode->v, req->ki_pos); ++ spin_unlock(&inode->v.i_lock); ++ ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ if (!dio->iter.count || dio->op.error) ++ break; ++ ++ bio_reset(bio); ++ reinit_completion(&dio->done); ++ } ++ 
++ ret = dio->op.error ?: ((long) dio->written << 9); ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ ++ if (dio->free_iov) ++ kfree(dio->iter.iov); ++ ++ bio_put(bio); ++ ++ /* inode->i_dio_count is our ref on inode and thus bch_fs */ ++ inode_dio_end(&inode->v); ++ ++ if (!sync) { ++ req->ki_complete(req, ret, 0); ++ ret = -EIOCBQUEUED; ++ } ++ return ret; ++} ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *op) ++{ ++ struct dio_write *dio = container_of(op, struct dio_write, op); ++ ++ if (dio->sync) ++ complete(&dio->done); ++ else ++ bch2_dio_write_loop(dio); ++} ++ ++static noinline ++ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct dio_write *dio; ++ struct bio *bio; ++ bool locked = true, extending; ++ ssize_t ret; ++ ++ prefetch(&c->opts); ++ prefetch((void *) &c->opts + 64); ++ prefetch(&inode->ei_inode); ++ prefetch((void *) &inode->ei_inode + 64); ++ ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(req, iter); ++ if (unlikely(ret <= 0)) ++ goto err; ++ ++ ret = file_remove_privs(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = file_update_time(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) ++ goto err; ++ ++ inode_dio_begin(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ extending = req->ki_pos + iter->count > inode->v.i_size; ++ if (!extending) { ++ inode_unlock(&inode->v); ++ locked = false; ++ } ++ ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_PAGES), ++ &c->dio_write_bioset); ++ dio = container_of(bio, struct dio_write, op.wbio.bio); ++ init_completion(&dio->done); ++ dio->req = req; ++ dio->mm = current->mm; ++ dio->loop = false; ++ dio->sync = is_sync_kiocb(req) || extending; ++ dio->free_iov = false; ++ dio->quota_res.sectors = 0; ++ dio->written = 0; ++ dio->iter = *iter; ++ ++ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, ++ iter->count >> 9, true); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter->count - 1); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = bch2_dio_write_loop(dio); ++err: ++ if (locked) ++ inode_unlock(&inode->v); ++ return ret; ++err_put_bio: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ bio_put(bio); ++ inode_dio_end(&inode->v); ++ goto err; ++} ++ ++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) ++ return bch2_direct_write(iocb, from); ++ ++ /* We can write back this queue in page reclaim */ ++ current->backing_dev_info = inode_to_bdi(&inode->v); ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto unlock; ++ ++ ret = file_remove_privs(file); ++ if (ret) ++ goto unlock; ++ ++ ret = file_update_time(file); ++ if (ret) ++ goto unlock; ++ ++ ret = bch2_buffered_write(iocb, from); ++ if (likely(ret > 0)) ++ iocb->ki_pos += ret; ++unlock: ++ inode_unlock(&inode->v); ++ current->backing_dev_info = NULL; ++ ++ if (ret > 0) 
++ ret = generic_write_sync(iocb, ret); ++ ++ return ret; ++} ++ ++/* fsync: */ ++ ++int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret, ret2; ++ ++ ret = file_write_and_wait_range(file, start, end); ++ if (ret) ++ return ret; ++ ++ if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) ++ goto out; ++ ++ ret = sync_inode_metadata(&inode->v, 1); ++ if (ret) ++ return ret; ++out: ++ if (!c->opts.journal_flush_disabled) ++ ret = bch2_journal_flush_seq(&c->journal, ++ inode->ei_journal_seq); ++ ret2 = file_check_and_advance_wb_err(file); ++ ++ return ret ?: ret2; ++} ++ ++/* truncate: */ ++ ++static inline int range_has_data(struct bch_fs *c, ++ struct bpos start, ++ struct bpos end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (bkey_extent_is_data(k.k)) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++static int __bch2_truncate_page(struct bch_inode_info *inode, ++ pgoff_t index, loff_t start, loff_t end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_page_state *s; ++ unsigned start_offset = start & (PAGE_SIZE - 1); ++ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; ++ unsigned i; ++ struct page *page; ++ int ret = 0; ++ ++ /* Page boundary? Nothing to do */ ++ if (!((index == start >> PAGE_SHIFT && start_offset) || ++ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) ++ return 0; ++ ++ /* Above i_size? */ ++ if (index << PAGE_SHIFT >= inode->v.i_size) ++ return 0; ++ ++ page = find_lock_page(mapping, index); ++ if (!page) { ++ /* ++ * XXX: we're doing two index lookups when we end up reading the ++ * page ++ */ ++ ret = range_has_data(c, ++ POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), ++ POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); ++ if (ret <= 0) ++ return ret; ++ ++ page = find_or_create_page(mapping, index, GFP_KERNEL); ++ if (unlikely(!page)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ s = bch2_page_state_create(page, 0); ++ if (!s) { ++ ret = -ENOMEM; ++ goto unlock; ++ } ++ ++ if (!PageUptodate(page)) { ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto unlock; ++ } ++ ++ if (index != start >> PAGE_SHIFT) ++ start_offset = 0; ++ if (index != end >> PAGE_SHIFT) ++ end_offset = PAGE_SIZE; ++ ++ for (i = round_up(start_offset, block_bytes(c)) >> 9; ++ i < round_down(end_offset, block_bytes(c)) >> 9; ++ i++) { ++ s->s[i].nr_replicas = 0; ++ s->s[i].state = SECTOR_UNALLOCATED; ++ } ++ ++ zero_user_segment(page, start_offset, end_offset); ++ ++ /* ++ * Bit of a hack - we don't want truncate to fail due to -ENOSPC. ++ * ++ * XXX: because we aren't currently tracking whether the page has actual ++ * data in it (vs. just 0s, or only partially written) this wrong. ick. 
++ */ ++ ret = bch2_get_page_disk_reservation(c, inode, page, false); ++ BUG_ON(ret); ++ ++ __set_page_dirty_nobuffers(page); ++unlock: ++ unlock_page(page); ++ put_page(page); ++out: ++ return ret; ++} ++ ++static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) ++{ ++ return __bch2_truncate_page(inode, from >> PAGE_SHIFT, ++ from, round_up(from, PAGE_SIZE)); ++} ++ ++static int bch2_extend(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *inode_u, ++ struct iattr *iattr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ int ret; ++ ++ /* ++ * sync appends: ++ * ++ * this has to be done _before_ extending i_size: ++ */ ++ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); ++ if (ret) ++ return ret; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ setattr_copy(&inode->v, iattr); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, inode->v.i_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static int bch2_truncate_finish_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); ++ return 0; ++} ++ ++static int bch2_truncate_start_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, void *p) ++{ ++ u64 *new_i_size = p; ++ ++ bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; ++ bi->bi_size = *new_i_size; ++ return 0; ++} ++ ++int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_inode_unpacked inode_u; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ u64 new_i_size = iattr->ia_size; ++ s64 i_sectors_delta = 0; ++ int ret = 0; ++ ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ /* ++ * fetch current on disk i_size: inode is locked, i_size can only ++ * increase underneath us: ++ */ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); ++ ret = PTR_ERR_OR_ZERO(iter); ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ goto err; ++ ++ /* ++ * check this before next assertion; on filesystem error our normal ++ * invariants are a bit broken (truncate has to truncate the page cache ++ * before the inode). ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ ++ BUG_ON(inode->v.i_size < inode_u.bi_size); ++ ++ if (iattr->ia_size > inode->v.i_size) { ++ ret = bch2_extend(inode, &inode_u, iattr); ++ goto err; ++ } ++ ++ ret = bch2_truncate_page(inode, iattr->ia_size); ++ if (unlikely(ret)) ++ goto err; ++ ++ /* ++ * When extending, we're going to write the new i_size to disk ++ * immediately so we need to flush anything above the current on disk ++ * i_size first: ++ * ++ * Also, when extending we need to flush the page that i_size currently ++ * straddles - if it's mapped to userspace, we need to ensure that ++ * userspace has to redirty it and call .mkwrite -> set_page_dirty ++ * again to allocate the part of the page that was extended. 
++ */
++ if (iattr->ia_size > inode_u.bi_size)
++ ret = filemap_write_and_wait_range(mapping,
++ inode_u.bi_size,
++ iattr->ia_size - 1);
++ else if (iattr->ia_size & (PAGE_SIZE - 1))
++ ret = filemap_write_and_wait_range(mapping,
++ round_down(iattr->ia_size, PAGE_SIZE),
++ iattr->ia_size - 1);
++ if (ret)
++ goto err;
++
++ mutex_lock(&inode->ei_update_lock);
++ ret = bch2_write_inode(c, inode, bch2_truncate_start_fn,
++ &new_i_size, 0);
++ mutex_unlock(&inode->ei_update_lock);
++
++ if (unlikely(ret))
++ goto err;
++
++ truncate_setsize(&inode->v, iattr->ia_size);
++
++ ret = bch2_fpunch(c, inode->v.i_ino,
++ round_up(iattr->ia_size, block_bytes(c)) >> 9,
++ U64_MAX, &inode->ei_journal_seq, &i_sectors_delta);
++ i_sectors_acct(c, inode, NULL, i_sectors_delta);
++
++ if (unlikely(ret))
++ goto err;
++
++ setattr_copy(&inode->v, iattr);
++
++ mutex_lock(&inode->ei_update_lock);
++ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL,
++ ATTR_MTIME|ATTR_CTIME);
++ mutex_unlock(&inode->ei_update_lock);
++err:
++ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
++ return ret;
++}
++
++/* fallocate: */
++
++static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len)
++{
++ struct bch_fs *c = inode->v.i_sb->s_fs_info;
++ u64 discard_start = round_up(offset, block_bytes(c)) >> 9;
++ u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9;
++ int ret = 0;
++
++ inode_lock(&inode->v);
++ inode_dio_wait(&inode->v);
++ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
++
++ ret = __bch2_truncate_page(inode,
++ offset >> PAGE_SHIFT,
++ offset, offset + len);
++ if (unlikely(ret))
++ goto err;
++
++ if (offset >> PAGE_SHIFT !=
++ (offset + len) >> PAGE_SHIFT) {
++ ret = __bch2_truncate_page(inode,
++ (offset + len) >> PAGE_SHIFT,
++ offset, offset + len);
++ if (unlikely(ret))
++ goto err;
++ }
++
++ truncate_pagecache_range(&inode->v, offset, offset + len - 1);
++
++ if (discard_start < discard_end) {
++ s64 i_sectors_delta = 0;
++
++ ret = bch2_fpunch(c, inode->v.i_ino,
++ discard_start, discard_end,
++ &inode->ei_journal_seq,
++ &i_sectors_delta);
++ i_sectors_acct(c, inode, NULL, i_sectors_delta);
++ }
++err:
++ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
++ inode_unlock(&inode->v);
++
++ return ret;
++}
++
++static long bchfs_fcollapse_finsert(struct bch_inode_info *inode,
++ loff_t offset, loff_t len,
++ bool insert)
++{
++ struct bch_fs *c = inode->v.i_sb->s_fs_info;
++ struct address_space *mapping = inode->v.i_mapping;
++ struct bkey_on_stack copy;
++ struct btree_trans trans;
++ struct btree_iter *src, *dst;
++ loff_t shift, new_size;
++ u64 src_start;
++ int ret;
++
++ if ((offset | len) & (block_bytes(c) - 1))
++ return -EINVAL;
++
++ bkey_on_stack_init(&copy);
++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256);
++
++ /*
++ * We need i_mutex to keep the page cache consistent with the extents
++ * btree, and the btree consistent with i_size - we don't need outside
++ * locking for the extents btree itself, because we're using linked
++ * iterators
++ */
++ inode_lock(&inode->v);
++ inode_dio_wait(&inode->v);
++ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
++
++ if (insert) {
++ ret = -EFBIG;
++ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len)
++ goto err;
++
++ ret = -EINVAL;
++ if (offset >= inode->v.i_size)
++ goto err;
++
++ src_start = U64_MAX;
++ shift = len;
++ } else {
++ ret = -EINVAL;
++ if (offset + len >= inode->v.i_size)
++ goto err;
++
++ src_start = offset + len;
++ shift = -len;
++ }
++
++ new_size = inode->v.i_size + shift;
++
++ ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX);
++ if (ret)
++ goto err;
++
++ if (insert) {
++ i_size_write(&inode->v, new_size);
++ mutex_lock(&inode->ei_update_lock);
++ ret = bch2_write_inode_size(c, inode, new_size,
++ ATTR_MTIME|ATTR_CTIME);
++ mutex_unlock(&inode->ei_update_lock);
++ } else {
++ s64 i_sectors_delta = 0;
++
++ ret = bch2_fpunch(c, inode->v.i_ino,
++ offset >> 9, (offset + len) >> 9,
++ &inode->ei_journal_seq,
++ &i_sectors_delta);
++ i_sectors_acct(c, inode, NULL, i_sectors_delta);
++
++ if (ret)
++ goto err;
++ }
++
++ src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
++ POS(inode->v.i_ino, src_start >> 9),
++ BTREE_ITER_INTENT);
++ BUG_ON(IS_ERR_OR_NULL(src));
++
++ dst = bch2_trans_copy_iter(&trans, src);
++ BUG_ON(IS_ERR_OR_NULL(dst));
++
++ while (1) {
++ struct disk_reservation disk_res =
++ bch2_disk_reservation_init(c, 0);
++ struct bkey_i delete;
++ struct bkey_s_c k;
++ struct bpos next_pos;
++ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9);
++ struct bpos atomic_end;
++ unsigned trigger_flags = 0;
++
++ k = insert
++ ? bch2_btree_iter_peek_prev(src)
++ : bch2_btree_iter_peek(src);
++ if ((ret = bkey_err(k)))
++ goto bkey_err;
++
++ if (!k.k || k.k->p.inode != inode->v.i_ino)
++ break;
++
++ BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k)));
++
++ if (insert &&
++ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0)
++ break;
++reassemble:
++ bkey_on_stack_reassemble(&copy, c, k);
++
++ if (insert &&
++ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0)
++ bch2_cut_front(move_pos, copy.k);
++
++ copy.k->k.p.offset += shift >> 9;
++ bch2_btree_iter_set_pos(dst, bkey_start_pos(&copy.k->k));
++
++ ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end);
++ if (ret)
++ goto bkey_err;
++
++ if (bkey_cmp(atomic_end, copy.k->k.p)) {
++ if (insert) {
++ move_pos = atomic_end;
++ move_pos.offset -= shift >> 9;
++ goto reassemble;
++ } else {
++ bch2_cut_back(atomic_end, copy.k);
++ }
++ }
++
++ bkey_init(&delete.k);
++ delete.k.p = copy.k->k.p;
++ delete.k.size = copy.k->k.size;
++ delete.k.p.offset -= shift >> 9;
++
++ next_pos = insert ? bkey_start_pos(&delete.k) : delete.k.p;
++
++ if (copy.k->k.size == k.k->size) {
++ /*
++ * If we're moving the entire extent, we can skip
++ * running triggers:
++ */
++ trigger_flags |= BTREE_TRIGGER_NORUN;
++ } else {
++ /* We might end up splitting compressed extents: */
++ unsigned nr_ptrs =
++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k));
++
++ ret = bch2_disk_reservation_get(c, &disk_res,
++ copy.k->k.size, nr_ptrs,
++ BCH_DISK_RESERVATION_NOFAIL);
++ BUG_ON(ret);
++ }
++
++ bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k));
++
++ ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?:
++ bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?:
++ bch2_trans_commit(&trans, &disk_res,
++ &inode->ei_journal_seq,
++ BTREE_INSERT_NOFAIL);
++ bch2_disk_reservation_put(c, &disk_res);
++bkey_err:
++ if (!ret)
++ bch2_btree_iter_set_pos(src, next_pos);
++
++ if (ret == -EINTR)
++ ret = 0;
++ if (ret)
++ goto err;
++
++ bch2_trans_cond_resched(&trans);
++ }
++ bch2_trans_unlock(&trans);
++
++ if (!insert) {
++ i_size_write(&inode->v, new_size);
++ mutex_lock(&inode->ei_update_lock);
++ ret = bch2_write_inode_size(c, inode, new_size,
++ ATTR_MTIME|ATTR_CTIME);
++ mutex_unlock(&inode->ei_update_lock);
++ }
++err:
++ bch2_trans_exit(&trans);
++ bkey_on_stack_exit(&copy, c);
++ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
++ inode_unlock(&inode->v);
++ return ret;
++}
++
++static long bchfs_fallocate(struct bch_inode_info *inode, int mode,
++ loff_t offset, loff_t len)
++{
++ struct address_space *mapping = inode->v.i_mapping;
++ struct bch_fs *c = inode->v.i_sb->s_fs_info;
++ struct btree_trans trans;
++ struct btree_iter *iter;
++ struct bpos end_pos;
++ loff_t end = offset + len;
++ loff_t block_start = round_down(offset, block_bytes(c));
++ loff_t block_end = round_up(end, block_bytes(c));
++ unsigned sectors;
++ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas;
++ int ret;
++
++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0);
++
++ inode_lock(&inode->v);
++ inode_dio_wait(&inode->v);
++ bch2_pagecache_block_get(&inode->ei_pagecache_lock);
++
++ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) {
++ ret = inode_newsize_ok(&inode->v, end);
++ if (ret)
++ goto err;
++ }
++
++ if (mode & FALLOC_FL_ZERO_RANGE) {
++ ret = __bch2_truncate_page(inode,
++ offset >> PAGE_SHIFT,
++ offset, end);
++
++ if (!ret &&
++ offset >> PAGE_SHIFT != end >> PAGE_SHIFT)
++ ret = __bch2_truncate_page(inode,
++ end >> PAGE_SHIFT,
++ offset, end);
++
++ if (unlikely(ret))
++ goto err;
++
++ truncate_pagecache_range(&inode->v, offset, end - 1);
++ }
++
++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS,
++ POS(inode->v.i_ino, block_start >> 9),
++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
++ end_pos = POS(inode->v.i_ino, block_end >> 9);
++
++ while (bkey_cmp(iter->pos, end_pos) < 0) {
++ s64 i_sectors_delta = 0;
++ struct disk_reservation disk_res = { 0 };
++ struct quota_res quota_res = { 0 };
++ struct bkey_i_reservation reservation;
++ struct bkey_s_c k;
++
++ bch2_trans_begin(&trans);
++
++ k = bch2_btree_iter_peek_slot(iter);
++ if ((ret = bkey_err(k)))
++ goto bkey_err;
++
++ /* already reserved */
++ if (k.k->type == KEY_TYPE_reservation &&
++ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) {
++ bch2_btree_iter_next_slot(iter);
++ continue;
++ }
++
++ if (bkey_extent_is_data(k.k) &&
++ !(mode & FALLOC_FL_ZERO_RANGE)) {
++ bch2_btree_iter_next_slot(iter);
++ continue;
++ }
++
++ bkey_reservation_init(&reservation.k_i);
++ reservation.k.type = KEY_TYPE_reservation;
++ reservation.k.p = k.k->p;
++ reservation.k.size = k.k->size;
++
++ bch2_cut_front(iter->pos, &reservation.k_i);
++ bch2_cut_back(end_pos, &reservation.k_i);
++
++ sectors = reservation.k.size;
++ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k);
++
++ if (!bkey_extent_is_allocation(k.k)) {
++ ret = bch2_quota_reservation_add(c, inode,
++ &quota_res,
++ sectors, true);
++ if (unlikely(ret))
++ goto bkey_err;
++ }
++
++ if (reservation.v.nr_replicas < replicas ||
++ bch2_bkey_sectors_compressed(k)) {
++ ret = bch2_disk_reservation_get(c, &disk_res, sectors,
++ replicas, 0);
++ if (unlikely(ret))
++ goto bkey_err;
++
++ reservation.v.nr_replicas = disk_res.nr_replicas;
++ }
++
++ ret = bch2_extent_update(&trans, iter, &reservation.k_i,
++ &disk_res, &inode->ei_journal_seq,
++ 0, &i_sectors_delta);
++ i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
++bkey_err:
++ bch2_quota_reservation_put(c, inode, &quota_res);
++ bch2_disk_reservation_put(c, &disk_res);
++ if (ret == -EINTR)
++ ret = 0;
++ if (ret)
++ goto err;
++ }
++
++ /*
++ * Do we need to extend the file?
++ *
++ * If we zeroed up to the end of the file, we dropped whatever writes
++ * were going to write out the current i_size, so we have to extend
++ * manually even if FL_KEEP_SIZE was set:
++ */
++ if (end >= inode->v.i_size &&
++ (!(mode & FALLOC_FL_KEEP_SIZE) ||
++ (mode & FALLOC_FL_ZERO_RANGE))) {
++ struct btree_iter *inode_iter;
++ struct bch_inode_unpacked inode_u;
++
++ do {
++ bch2_trans_begin(&trans);
++ inode_iter = bch2_inode_peek(&trans, &inode_u,
++ inode->v.i_ino, 0);
++ ret = PTR_ERR_OR_ZERO(inode_iter);
++ } while (ret == -EINTR);
++
++ bch2_trans_unlock(&trans);
++
++ if (ret)
++ goto err;
++
++ /*
++ * Sync existing appends before extending i_size,
++ * as in bch2_extend():
++ */
++ ret = filemap_write_and_wait_range(mapping,
++ inode_u.bi_size, S64_MAX);
++ if (ret)
++ goto err;
++
++ if (mode & FALLOC_FL_KEEP_SIZE)
++ end = inode->v.i_size;
++ else
++ i_size_write(&inode->v, end);
++
++ mutex_lock(&inode->ei_update_lock);
++ ret = bch2_write_inode_size(c, inode, end, 0);
++ mutex_unlock(&inode->ei_update_lock);
++ }
++err:
++ bch2_trans_exit(&trans);
++ bch2_pagecache_block_put(&inode->ei_pagecache_lock);
++ inode_unlock(&inode->v);
++ return ret;
++}
++
++long bch2_fallocate_dispatch(struct file *file, int mode,
++ loff_t offset, loff_t len)
++{
++ struct bch_inode_info *inode = file_bch_inode(file);
++ struct bch_fs *c = inode->v.i_sb->s_fs_info;
++ long ret;
++
++ if (!percpu_ref_tryget(&c->writes))
++ return -EROFS;
++
++ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE)))
++ ret = bchfs_fallocate(inode, mode, offset, len);
++ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE))
++ ret = bchfs_fpunch(inode, offset, len);
++ else if (mode == FALLOC_FL_INSERT_RANGE)
++ ret = bchfs_fcollapse_finsert(inode, offset, len, true);
++ else if (mode == FALLOC_FL_COLLAPSE_RANGE)
++ ret = bchfs_fcollapse_finsert(inode, offset, len, false);
++ else
++ ret = -EOPNOTSUPP;
++
++ percpu_ref_put(&c->writes);
++
++ return ret;
++}
++
++static void mark_range_unallocated(struct bch_inode_info *inode,
++ loff_t start, loff_t end)
++{
++ pgoff_t index = start >> PAGE_SHIFT;
++ pgoff_t end_index = (end - 1) >> PAGE_SHIFT;
++ struct pagevec pvec;
++
++ pagevec_init(&pvec);
++
++ do {
++ unsigned nr_pages, i, j;
++
++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping,
++ &index, end_index);
++ if (nr_pages == 0)
++ break;
++
++ for (i = 0; i < nr_pages; i++) {
++ struct page *page =
pvec.pages[i]; ++ struct bch_page_state *s; ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = 0; j < PAGE_SECTORS; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++} ++ ++loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, ++ struct file *file_dst, loff_t pos_dst, ++ loff_t len, unsigned remap_flags) ++{ ++ struct bch_inode_info *src = file_bch_inode(file_src); ++ struct bch_inode_info *dst = file_bch_inode(file_dst); ++ struct bch_fs *c = src->v.i_sb->s_fs_info; ++ s64 i_sectors_delta = 0; ++ u64 aligned_len; ++ loff_t ret = 0; ++ ++ if (!c->opts.reflink) ++ return -EOPNOTSUPP; ++ ++ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) ++ return -EINVAL; ++ ++ if (remap_flags & REMAP_FILE_DEDUP) ++ return -EOPNOTSUPP; ++ ++ if ((pos_src & (block_bytes(c) - 1)) || ++ (pos_dst & (block_bytes(c) - 1))) ++ return -EINVAL; ++ ++ if (src == dst && ++ abs(pos_src - pos_dst) < len) ++ return -EINVAL; ++ ++ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ file_update_time(file_dst); ++ ++ inode_dio_wait(&src->v); ++ inode_dio_wait(&dst->v); ++ ++ ret = generic_remap_file_range_prep(file_src, pos_src, ++ file_dst, pos_dst, ++ &len, remap_flags); ++ if (ret < 0 || len == 0) ++ goto err; ++ ++ aligned_len = round_up((u64) len, block_bytes(c)); ++ ++ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, ++ pos_dst, pos_dst + len - 1); ++ if (ret) ++ goto err; ++ ++ mark_range_unallocated(src, pos_src, pos_src + aligned_len); ++ ++ ret = bch2_remap_range(c, ++ POS(dst->v.i_ino, pos_dst >> 9), ++ POS(src->v.i_ino, pos_src >> 9), ++ aligned_len >> 9, ++ &dst->ei_journal_seq, ++ pos_dst + len, &i_sectors_delta); ++ if (ret < 0) ++ goto err; ++ ++ /* ++ * due to alignment, we might have remapped slightly more than requsted ++ */ ++ ret = min((u64) ret << 9, (u64) len); ++ ++ /* XXX get a quota reservation */ ++ i_sectors_acct(c, dst, NULL, i_sectors_delta); ++ ++ spin_lock(&dst->v.i_lock); ++ if (pos_dst + ret > dst->v.i_size) ++ i_size_write(&dst->v, pos_dst + ret); ++ spin_unlock(&dst->v.i_lock); ++err: ++ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ return ret; ++} ++ ++/* fseek: */ ++ ++static int page_data_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (s) ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state >= SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t bch2_seek_pagecache_data(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ struct page *page; ++ pgoff_t start_index = start_offset >> PAGE_SHIFT; ++ pgoff_t end_index = end_offset >> PAGE_SHIFT; ++ pgoff_t index = start_index; ++ loff_t ret; ++ int offset; ++ ++ while (index <= end_index) { ++ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { ++ lock_page(page); ++ ++ offset = page_data_offset(page, ++ page->index == start_index ++ ? 
start_offset & (PAGE_SIZE - 1) ++ : 0); ++ if (offset >= 0) { ++ ret = clamp(((loff_t) page->index << PAGE_SHIFT) + ++ offset, ++ start_offset, end_offset); ++ unlock_page(page); ++ put_page(page); ++ return ret; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ } else { ++ break; ++ } ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_data(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 isize, next_data = MAX_LFS_FILESIZE; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, offset >> 9), 0, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ break; ++ } else if (bkey_extent_is_data(k.k)) { ++ next_data = max(offset, bkey_start_offset(k.k) << 9); ++ break; ++ } else if (k.k->p.offset >> 9 > isize) ++ break; ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ if (next_data > offset) ++ next_data = bch2_seek_pagecache_data(&inode->v, ++ offset, next_data); ++ ++ if (next_data >= isize) ++ return -ENXIO; ++ ++ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); ++} ++ ++static int __page_hole_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (!s) ++ return 0; ++ ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state < SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) ++{ ++ pgoff_t index = offset >> PAGE_SHIFT; ++ struct page *page; ++ int pg_offset; ++ loff_t ret = -1; ++ ++ page = find_lock_entry(mapping, index); ++ if (!page || xa_is_value(page)) ++ return offset; ++ ++ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); ++ if (pg_offset >= 0) ++ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; ++ ++ unlock_page(page); ++ ++ return ret; ++} ++ ++static loff_t bch2_seek_pagecache_hole(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ loff_t offset = start_offset, hole; ++ ++ while (offset < end_offset) { ++ hole = page_hole_offset(mapping, offset); ++ if (hole >= 0 && hole <= end_offset) ++ return max(start_offset, hole); ++ ++ offset += PAGE_SIZE; ++ offset &= PAGE_MASK; ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_hole(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 isize, next_hole = MAX_LFS_FILESIZE; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, offset >> 9), ++ BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ offset, MAX_LFS_FILESIZE); ++ break; ++ } else if (!bkey_extent_is_data(k.k)) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ max(offset, bkey_start_offset(k.k) << 9), ++ k.k->p.offset << 9); ++ ++ if (next_hole < k.k->p.offset << 9) ++ break; ++ } else { ++ offset = max(offset, bkey_start_offset(k.k) << 9); 
++ } ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ if (next_hole > isize) ++ next_hole = isize; ++ ++ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); ++} ++ ++loff_t bch2_llseek(struct file *file, loff_t offset, int whence) ++{ ++ switch (whence) { ++ case SEEK_SET: ++ case SEEK_CUR: ++ case SEEK_END: ++ return generic_file_llseek(file, offset, whence); ++ case SEEK_DATA: ++ return bch2_seek_data(file, offset); ++ case SEEK_HOLE: ++ return bch2_seek_hole(file, offset); ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_fs_fsio_exit(struct bch_fs *c) ++{ ++ bioset_exit(&c->dio_write_bioset); ++ bioset_exit(&c->dio_read_bioset); ++ bioset_exit(&c->writepage_bioset); ++} ++ ++int bch2_fs_fsio_init(struct bch_fs *c) ++{ ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bioset_init(&c->writepage_bioset, ++ 4, offsetof(struct bch_writepage_io, op.wbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_read_bioset, ++ 4, offsetof(struct dio_read, rbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_write_bioset, ++ 4, offsetof(struct dio_write, op.wbio.bio), ++ BIOSET_NEED_BVECS)) ++ ret = -ENOMEM; ++ ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h +new file mode 100644 +index 000000000000..7063556d289b +--- /dev/null ++++ b/fs/bcachefs/fs-io.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_H ++#define _BCACHEFS_FS_IO_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++#include "buckets.h" ++#include "io_types.h" ++ ++#include ++ ++struct quota_res; ++ ++int __must_check bch2_write_inode_size(struct bch_fs *, ++ struct bch_inode_info *, ++ loff_t, unsigned); ++ ++int bch2_writepage(struct page *, struct writeback_control *); ++int bch2_readpage(struct file *, struct page *); ++ ++int bch2_writepages(struct address_space *, struct writeback_control *); ++int bch2_readpages(struct file *, struct address_space *, ++ struct list_head *, unsigned); ++ ++int bch2_write_begin(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page **, void **); ++int bch2_write_end(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page *, void *); ++ ++ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); ++ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); ++ ++int bch2_fsync(struct file *, loff_t, loff_t, int); ++ ++int bch2_truncate(struct bch_inode_info *, struct iattr *); ++long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); ++ ++loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, ++ loff_t, loff_t, unsigned); ++ ++loff_t bch2_llseek(struct file *, loff_t, int); ++ ++vm_fault_t bch2_page_fault(struct vm_fault *); ++vm_fault_t bch2_page_mkwrite(struct vm_fault *); ++void bch2_invalidatepage(struct page *, unsigned int, unsigned int); ++int bch2_releasepage(struct page *, gfp_t); ++int bch2_migrate_page(struct address_space *, struct page *, ++ struct page *, enum migrate_mode); ++ ++void bch2_fs_fsio_exit(struct bch_fs *); ++int bch2_fs_fsio_init(struct bch_fs *); ++#else ++static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} ++static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } ++#endif ++ ++#endif /* _BCACHEFS_FS_IO_H */ +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +new file mode 100644 +index 000000000000..031e6d931171 +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.c +@@ -0,0 +1,308 @@ ++// 
SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-ioctl.h" ++#include "quota.h" ++ ++#include ++#include ++ ++#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) ++ ++struct flags_set { ++ unsigned mask; ++ unsigned flags; ++ ++ unsigned projid; ++}; ++ ++static int bch2_inode_flags_set(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ /* ++ * We're relying on btree locking here for exclusion with other ioctl ++ * calls - use the flags in the btree (@bi), not inode->i_flags: ++ */ ++ struct flags_set *s = p; ++ unsigned newflags = s->flags; ++ unsigned oldflags = bi->bi_flags & s->mask; ++ ++ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && ++ !capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ ++ if (!S_ISREG(bi->bi_mode) && ++ !S_ISDIR(bi->bi_mode) && ++ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) ++ return -EINVAL; ++ ++ bi->bi_flags &= ~s->mask; ++ bi->bi_flags |= newflags; ++ ++ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); ++ return 0; ++} ++ ++static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) ++{ ++ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); ++ ++ return put_user(flags, arg); ++} ++ ++static int bch2_ioc_setflags(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ void __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; ++ unsigned uflags; ++ int ret; ++ ++ if (get_user(uflags, (int __user *) arg)) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_uflags, uflags); ++ if (uflags) ++ return -EOPNOTSUPP; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if (!inode_owner_or_capable(&inode->v)) { ++ ret = -EACCES; ++ goto setflags_out; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ++ ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++setflags_out: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct fsxattr fa = { 0 }; ++ ++ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); ++ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; ++ ++ return copy_to_user(arg, &fa, sizeof(fa)); ++} ++ ++static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct flags_set *s = p; ++ ++ if (s->projid != bi->bi_project) { ++ bi->bi_fields_set |= 1U << Inode_opt_project; ++ bi->bi_project = s->projid; ++ } ++ ++ return bch2_inode_flags_set(inode, bi, p); ++} ++ ++static int bch2_ioc_fssetxattr(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; ++ struct fsxattr fa; ++ int ret; ++ ++ if (copy_from_user(&fa, arg, sizeof(fa))) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); ++ if (fa.fsx_xflags) ++ return -EOPNOTSUPP; ++ ++ if (fa.fsx_projid >= U32_MAX) ++ return -EINVAL; ++ ++ s.projid = fa.fsx_projid + 1; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if 
(!inode_owner_or_capable(&inode->v)) { ++ ret = -EACCES; ++ goto err; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_set_projid(c, inode, s.projid); ++ if (ret) ++ goto err_unlock; ++ ++ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ++ ATTR_CTIME); ++err_unlock: ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_inode_info *dir = p; ++ ++ return !bch2_reinherit_attrs(bi, &dir->ei_inode); ++} ++ ++static int bch2_ioc_reinherit_attrs(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *src, ++ const char __user *name) ++{ ++ struct bch_inode_info *dst; ++ struct inode *vinode = NULL; ++ char *kname = NULL; ++ struct qstr qstr; ++ int ret = 0; ++ u64 inum; ++ ++ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); ++ if (!kname) ++ return -ENOMEM; ++ ++ ret = strncpy_from_user(kname, name, BCH_NAME_MAX); ++ if (unlikely(ret < 0)) ++ goto err1; ++ ++ qstr.len = ret; ++ qstr.name = kname; ++ ++ ret = -ENOENT; ++ inum = bch2_dirent_lookup(c, src->v.i_ino, ++ &src->ei_str_hash, ++ &qstr); ++ if (!inum) ++ goto err1; ++ ++ vinode = bch2_vfs_inode_get(c, inum); ++ ret = PTR_ERR_OR_ZERO(vinode); ++ if (ret) ++ goto err1; ++ ++ dst = to_bch_ei(vinode); ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ goto err2; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ if (inode_attr_changing(src, dst, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst, ++ src->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err3; ++ } ++ ++ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); ++err3: ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ /* return true if we did work */ ++ if (ret >= 0) ++ ret = !ret; ++ ++ mnt_drop_write_file(file); ++err2: ++ iput(vinode); ++err1: ++ kfree(kname); ++ ++ return ret; ++} ++ ++long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct super_block *sb = inode->v.i_sb; ++ struct bch_fs *c = sb->s_fs_info; ++ ++ switch (cmd) { ++ case FS_IOC_GETFLAGS: ++ return bch2_ioc_getflags(inode, (int __user *) arg); ++ ++ case FS_IOC_SETFLAGS: ++ return bch2_ioc_setflags(c, file, inode, (int __user *) arg); ++ ++ case FS_IOC_FSGETXATTR: ++ return bch2_ioc_fsgetxattr(inode, (void __user *) arg); ++ case FS_IOC_FSSETXATTR: ++ return bch2_ioc_fssetxattr(c, file, inode, ++ (void __user *) arg); ++ ++ case BCHFS_IOC_REINHERIT_ATTRS: ++ return bch2_ioc_reinherit_attrs(c, file, inode, ++ (void __user *) arg); ++ ++ case FS_IOC_GETVERSION: ++ return -ENOTTY; ++ case FS_IOC_SETVERSION: ++ return -ENOTTY; ++ ++ case FS_IOC_GOINGDOWN: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ down_write(&sb->s_umount); ++ sb->s_flags |= SB_RDONLY; ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only due to ioctl"); ++ up_write(&sb->s_umount); ++ return 0; ++ ++ default: ++ return bch2_fs_ioctl(c, cmd, (void __user *) arg); ++ } ++} ++ ++#ifdef CONFIG_COMPAT ++long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ /* These are just misnamed, they actually get/put from/to user an int */ ++ switch (cmd) { ++ case FS_IOC_GETFLAGS: ++ cmd = FS_IOC_GETFLAGS; ++ break; ++ case FS_IOC32_SETFLAGS: ++ cmd = FS_IOC_SETFLAGS; ++ break; ++ default: ++ return -ENOIOCTLCMD; 
++ } ++ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); ++} ++#endif ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h +new file mode 100644 +index 000000000000..f201980ef2c3 +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.h +@@ -0,0 +1,81 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IOCTL_H ++#define _BCACHEFS_FS_IOCTL_H ++ ++/* Inode flags: */ ++ ++/* bcachefs inode flags -> vfs inode flags: */ ++static const unsigned bch_flags_to_vfs[] = { ++ [__BCH_INODE_SYNC] = S_SYNC, ++ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, ++ [__BCH_INODE_APPEND] = S_APPEND, ++ [__BCH_INODE_NOATIME] = S_NOATIME, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ ++static const unsigned bch_flags_to_uflags[] = { ++ [__BCH_INODE_SYNC] = FS_SYNC_FL, ++ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, ++ [__BCH_INODE_APPEND] = FS_APPEND_FL, ++ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, ++ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ ++static const unsigned bch_flags_to_xflags[] = { ++ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, ++ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, ++ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, ++ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, ++ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, ++ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; ++}; ++ ++#define set_flags(_map, _in, _out) \ ++do { \ ++ unsigned _i; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & (1 << _i)) \ ++ (_out) |= _map[_i]; \ ++ else \ ++ (_out) &= ~_map[_i]; \ ++} while (0) ++ ++#define map_flags(_map, _in) \ ++({ \ ++ unsigned _out = 0; \ ++ \ ++ set_flags(_map, _in, _out); \ ++ _out; \ ++}) ++ ++#define map_flags_rev(_map, _in) \ ++({ \ ++ unsigned _i, _out = 0; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & _map[_i]) { \ ++ (_out) |= 1 << _i; \ ++ (_in) &= ~_map[_i]; \ ++ } \ ++ (_out); \ ++}) ++ ++#define map_defined(_map) \ ++({ \ ++ unsigned _in = ~0; \ ++ \ ++ map_flags_rev(_map, _in); \ ++}) ++ ++/* Set VFS inode flags from bcachefs inode: */ ++static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) ++{ ++ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); ++} ++ ++long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); ++long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); ++ ++#endif /* _BCACHEFS_FS_IOCTL_H */ +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +new file mode 100644 +index 000000000000..a47923d67f7a +--- /dev/null ++++ b/fs/bcachefs/fs.c +@@ -0,0 +1,1605 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "extents.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-io.h" ++#include "fs-ioctl.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "quota.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct kmem_cache *bch2_inode_cache; ++ ++static void bch2_vfs_inode_init(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *); ++ ++static void journal_seq_copy(struct bch_inode_info *dst, ++ u64 journal_seq) ++{ ++ u64 old, v = READ_ONCE(dst->ei_journal_seq); ++ ++ do { ++ old = v; ++ ++ if (old >= 
journal_seq) ++ break; ++ } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); ++} ++ ++static void __pagecache_lock_put(struct pagecache_lock *lock, long i) ++{ ++ BUG_ON(atomic_long_read(&lock->v) == 0); ++ ++ if (atomic_long_sub_return_release(i, &lock->v) == 0) ++ wake_up_all(&lock->wait); ++} ++ ++static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) ++{ ++ long v = atomic_long_read(&lock->v), old; ++ ++ do { ++ old = v; ++ ++ if (i > 0 ? v < 0 : v > 0) ++ return false; ++ } while ((v = atomic_long_cmpxchg_acquire(&lock->v, ++ old, old + i)) != old); ++ return true; ++} ++ ++static void __pagecache_lock_get(struct pagecache_lock *lock, long i) ++{ ++ wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, 1); ++} ++ ++void bch2_pagecache_add_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, 1); ++} ++ ++void bch2_pagecache_block_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, -1); ++} ++ ++void bch2_pagecache_block_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, -1); ++} ++ ++void bch2_inode_update_after_write(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ unsigned fields) ++{ ++ set_nlink(&inode->v, bch2_inode_nlink_get(bi)); ++ i_uid_write(&inode->v, bi->bi_uid); ++ i_gid_write(&inode->v, bi->bi_gid); ++ inode->v.i_mode = bi->bi_mode; ++ ++ if (fields & ATTR_ATIME) ++ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); ++ if (fields & ATTR_MTIME) ++ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); ++ if (fields & ATTR_CTIME) ++ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); ++ ++ inode->ei_inode = *bi; ++ ++ bch2_inode_flags_to_vfs(inode); ++} ++ ++int __must_check bch2_write_inode(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ inode_set_fn set, ++ void *p, unsigned fields) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_inode_unpacked inode_u; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter) ?: ++ (set ? set(inode, &inode_u, p) : 0) ?: ++ bch2_inode_write(&trans, iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++ ++ /* ++ * the btree node lock protects inode->ei_inode, not ei_update_lock; ++ * this is important for inode updates via bchfs_write_index_update ++ */ ++ if (!ret) ++ bch2_inode_update_after_write(c, inode, &inode_u, fields); ++ ++ bch2_trans_iter_put(&trans, iter); ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ ++int bch2_fs_quota_transfer(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_qid new_qid, ++ unsigned qtypes, ++ enum quota_acct_mode mode) ++{ ++ unsigned i; ++ int ret; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ for (i = 0; i < QTYP_NR; i++) ++ if (new_qid.q[i] == inode->ei_qid.q[i]) ++ qtypes &= ~(1U << i); ++ ++ if (!qtypes) ++ return 0; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ++ ret = bch2_quota_transfer(c, qtypes, new_qid, ++ inode->ei_qid, ++ inode->v.i_blocks + ++ inode->ei_quota_reserved, ++ mode); ++ if (!ret) ++ for (i = 0; i < QTYP_NR; i++) ++ if (qtypes & (1 << i)) ++ inode->ei_qid.q[i] = new_qid.q[i]; ++ ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) ++{ ++ struct bch_inode_unpacked inode_u; ++ struct bch_inode_info *inode; ++ int ret; ++ ++ inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); ++ if (unlikely(!inode)) ++ return ERR_PTR(-ENOMEM); ++ if (!(inode->v.i_state & I_NEW)) ++ return &inode->v; ++ ++ ret = bch2_inode_find_by_inum(c, inum, &inode_u); ++ if (ret) { ++ iget_failed(&inode->v); ++ return ERR_PTR(ret); ++ } ++ ++ bch2_vfs_inode_init(c, inode, &inode_u); ++ ++ inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); ++ ++ unlock_new_inode(&inode->v); ++ ++ return &inode->v; ++} ++ ++static struct bch_inode_info * ++__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, ++ umode_t mode, dev_t rdev, bool tmpfile) ++{ ++ struct bch_fs *c = dir->v.i_sb->s_fs_info; ++ struct user_namespace *ns = dir->v.i_sb->s_user_ns; ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u; ++ struct bch_inode_info *inode, *old; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *default_acl = NULL, *acl = NULL; ++ u64 journal_seq = 0; ++ int ret; ++ ++ /* ++ * preallocate acls + vfs inode before btree transaction, so that ++ * nothing can fail after the transaction succeeds: ++ */ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); ++ if (ret) ++ return ERR_PTR(ret); ++#endif ++ inode = to_bch_ei(new_inode(c->vfs_sb)); ++ if (unlikely(!inode)) { ++ inode = ERR_PTR(-ENOMEM); ++ goto err; ++ } ++ ++ bch2_inode_init_early(c, &inode_u); ++ ++ if (!tmpfile) ++ mutex_lock(&dir->ei_update_lock); ++ ++ bch2_trans_init(&trans, c, 8, 1024); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, ++ !tmpfile ? 
&dentry->d_name : NULL, ++ from_kuid(ns, current_fsuid()), ++ from_kgid(ns, current_fsgid()), ++ mode, rdev, ++ default_acl, acl) ?: ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (unlikely(ret)) ++ goto err_before_quota; ++ ++ ret = bch2_trans_commit(&trans, NULL, &journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++ if (unlikely(ret)) { ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++err_before_quota: ++ if (ret == -EINTR) ++ goto retry; ++ goto err_trans; ++ } ++ ++ if (!tmpfile) { ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(dir, journal_seq); ++ mutex_unlock(&dir->ei_update_lock); ++ } ++ ++ bch2_vfs_inode_init(c, inode, &inode_u); ++ journal_seq_copy(inode, journal_seq); ++ ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); ++ ++ /* ++ * we must insert the new inode into the inode cache before calling ++ * bch2_trans_exit() and dropping locks, else we could race with another ++ * thread pulling the inode in and modifying it: ++ */ ++ ++ old = to_bch_ei(insert_inode_locked2(&inode->v)); ++ if (unlikely(old)) { ++ /* ++ * We raced, another process pulled the new inode into cache ++ * before us: ++ */ ++ journal_seq_copy(old, journal_seq); ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ ++ inode = old; ++ } else { ++ /* ++ * we really don't want insert_inode_locked2() to be setting ++ * I_NEW... ++ */ ++ unlock_new_inode(&inode->v); ++ } ++ ++ bch2_trans_exit(&trans); ++err: ++ posix_acl_release(default_acl); ++ posix_acl_release(acl); ++ return inode; ++err_trans: ++ if (!tmpfile) ++ mutex_unlock(&dir->ei_update_lock); ++ ++ bch2_trans_exit(&trans); ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ inode = ERR_PTR(ret); ++ goto err; ++} ++ ++/* methods */ ++ ++static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, ++ unsigned int flags) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct inode *vinode = NULL; ++ u64 inum; ++ ++ inum = bch2_dirent_lookup(c, dir->v.i_ino, ++ &dir->ei_str_hash, ++ &dentry->d_name); ++ ++ if (inum) ++ vinode = bch2_vfs_inode_get(c, inum); ++ ++ return d_splice_alias(vinode, dentry); ++} ++ ++static int bch2_mknod(struct inode *vdir, struct dentry *dentry, ++ umode_t mode, dev_t rdev) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_create(struct inode *vdir, struct dentry *dentry, ++ umode_t mode, bool excl) ++{ ++ return bch2_mknod(vdir, dentry, mode|S_IFREG, 0); ++} ++ ++static int __bch2_link(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_info *dir, ++ struct dentry *dentry) ++{ ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u, inode_u; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ret = bch2_link_trans(&trans, ++ dir->v.i_ino, ++ inode->v.i_ino, &dir_u, &inode_u, ++ &dentry->d_name) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++ } while (ret == -EINTR); ++ ++ if (likely(!ret)) { ++ BUG_ON(inode_u.bi_inum != inode->v.i_ino); ++ ++ journal_seq_copy(inode, dir->ei_journal_seq); ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ 
ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); ++ } ++ ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ return ret; ++} ++ ++static int bch2_link(struct dentry *old_dentry, struct inode *vdir, ++ struct dentry *dentry) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ return ret; ++ ++ ihold(&inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct bch_inode_unpacked dir_u, inode_u; ++ struct btree_trans trans; ++ int ret; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_unlink_trans(&trans, ++ dir->v.i_ino, &dir_u, ++ &inode_u, &dentry->d_name) ?: ++ bch2_trans_commit(&trans, NULL, ++ &dir->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++ } while (ret == -EINTR); ++ ++ if (likely(!ret)) { ++ BUG_ON(inode_u.bi_inum != inode->v.i_ino); ++ ++ journal_seq_copy(inode, dir->ei_journal_seq); ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(c, inode, &inode_u, ++ ATTR_MTIME); ++ } ++ ++ bch2_trans_exit(&trans); ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ ++ return ret; ++} ++ ++static int bch2_symlink(struct inode *vdir, struct dentry *dentry, ++ const char *symname) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir), *inode; ++ int ret; ++ ++ inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); ++ if (unlikely(IS_ERR(inode))) ++ return PTR_ERR(inode); ++ ++ inode_lock(&inode->v); ++ ret = page_symlink(&inode->v, symname, strlen(symname) + 1); ++ inode_unlock(&inode->v); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); ++ if (unlikely(ret)) ++ goto err; ++ ++ journal_seq_copy(dir, inode->ei_journal_seq); ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ goto err; ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++err: ++ iput(&inode->v); ++ return ret; ++} ++ ++static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0); ++} ++ ++static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, ++ struct inode *dst_vdir, struct dentry *dst_dentry, ++ unsigned flags) ++{ ++ struct bch_fs *c = src_vdir->i_sb->s_fs_info; ++ struct bch_inode_info *src_dir = to_bch_ei(src_vdir); ++ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); ++ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); ++ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); ++ struct bch_inode_unpacked dst_dir_u, src_dir_u; ++ struct bch_inode_unpacked src_inode_u, dst_inode_u; ++ struct btree_trans trans; ++ enum bch_rename_mode mode = flags & RENAME_EXCHANGE ++ ? BCH_RENAME_EXCHANGE ++ : dst_dentry->d_inode ++ ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; ++ u64 journal_seq = 0; ++ int ret; ++ ++ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) ++ return -EINVAL; ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ ret = filemap_write_and_wait_range(src_inode->v.i_mapping, ++ 0, LLONG_MAX); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_trans_init(&trans, c, 8, 2048); ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, src_inode, ++ dst_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst_inode, ++ src_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++retry: ++ bch2_trans_begin(&trans); ++ ret = bch2_rename_trans(&trans, ++ src_dir->v.i_ino, &src_dir_u, ++ dst_dir->v.i_ino, &dst_dir_u, ++ &src_inode_u, ++ &dst_inode_u, ++ &src_dentry->d_name, ++ &dst_dentry->d_name, ++ mode) ?: ++ bch2_trans_commit(&trans, NULL, ++ &journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err; ++ ++ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); ++ BUG_ON(dst_inode && ++ dst_inode->v.i_ino != dst_inode_u.bi_inum); ++ ++ bch2_inode_update_after_write(c, src_dir, &src_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(src_dir, journal_seq); ++ ++ if (src_dir != dst_dir) { ++ bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(dst_dir, journal_seq); ++ } ++ ++ bch2_inode_update_after_write(c, src_inode, &src_inode_u, ++ ATTR_CTIME); ++ journal_seq_copy(src_inode, journal_seq); ++ ++ if (dst_inode) { ++ bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ++ ATTR_CTIME); ++ journal_seq_copy(dst_inode, journal_seq); ++ } ++err: ++ bch2_trans_exit(&trans); ++ ++ bch2_fs_quota_transfer(c, src_inode, ++ bch_qid(&src_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ if (dst_inode) ++ bch2_fs_quota_transfer(c, dst_inode, ++ bch_qid(&dst_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ return ret; ++} ++ ++void bch2_setattr_copy(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ unsigned int ia_valid = attr->ia_valid; ++ ++ if (ia_valid & ATTR_UID) ++ bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid); ++ if (ia_valid & ATTR_GID) ++ bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid); ++ ++ if (ia_valid & ATTR_ATIME) ++ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); ++ if (ia_valid & ATTR_MTIME) ++ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); ++ if (ia_valid & ATTR_CTIME) ++ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); ++ ++ if (ia_valid & ATTR_MODE) { ++ umode_t mode = attr->ia_mode; ++ kgid_t gid = ia_valid & ATTR_GID ++ ? 
attr->ia_gid ++ : inode->v.i_gid; ++ ++ if (!in_group_p(gid) && ++ !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) ++ mode &= ~S_ISGID; ++ bi->bi_mode = mode; ++ } ++} ++ ++static int bch2_setattr_nonsize(struct bch_inode_info *inode, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_qid qid; ++ struct btree_trans trans; ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl = NULL; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ++ qid = inode->ei_qid; ++ ++ if (attr->ia_valid & ATTR_UID) ++ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); ++ ++ if (attr->ia_valid & ATTR_GID) ++ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); ++ ++ ret = bch2_fs_quota_transfer(c, inode, qid, ~0, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ kfree(acl); ++ acl = NULL; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto btree_err; ++ ++ bch2_setattr_copy(inode, &inode_u, attr); ++ ++ if (attr->ia_valid & ATTR_MODE) { ++ ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); ++ if (ret) ++ goto btree_err; ++ } ++ ++ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++btree_err: ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err_trans; ++ ++ bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); ++ ++ if (acl) ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++err_trans: ++ bch2_trans_exit(&trans); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static int bch2_getattr(const struct path *path, struct kstat *stat, ++ u32 request_mask, unsigned query_flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ stat->dev = inode->v.i_sb->s_dev; ++ stat->ino = inode->v.i_ino; ++ stat->mode = inode->v.i_mode; ++ stat->nlink = inode->v.i_nlink; ++ stat->uid = inode->v.i_uid; ++ stat->gid = inode->v.i_gid; ++ stat->rdev = inode->v.i_rdev; ++ stat->size = i_size_read(&inode->v); ++ stat->atime = inode->v.i_atime; ++ stat->mtime = inode->v.i_mtime; ++ stat->ctime = inode->v.i_ctime; ++ stat->blksize = block_bytes(c); ++ stat->blocks = inode->v.i_blocks; ++ ++ if (request_mask & STATX_BTIME) { ++ stat->result_mask |= STATX_BTIME; ++ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); ++ } ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) ++ stat->attributes |= STATX_ATTR_IMMUTABLE; ++ stat->attributes_mask |= STATX_ATTR_IMMUTABLE; ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) ++ stat->attributes |= STATX_ATTR_APPEND; ++ stat->attributes_mask |= STATX_ATTR_APPEND; ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) ++ stat->attributes |= STATX_ATTR_NODUMP; ++ stat->attributes_mask |= STATX_ATTR_NODUMP; ++ ++ return 0; ++} ++ ++static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) ++{ ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = setattr_prepare(dentry, iattr); ++ if (ret) ++ return ret; ++ ++ return iattr->ia_valid & ATTR_SIZE ++ ? 
bch2_truncate(inode, iattr) ++ : bch2_setattr_nonsize(inode, iattr); ++} ++ ++static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_mark_tmpfile(dentry, &inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_fill_extent(struct bch_fs *c, ++ struct fiemap_extent_info *info, ++ struct bkey_s_c k, unsigned flags) ++{ ++ if (bkey_extent_is_data(k.k)) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int ret; ++ ++ if (k.k->type == KEY_TYPE_reflink_v) ++ flags |= FIEMAP_EXTENT_SHARED; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int flags2 = 0; ++ u64 offset = p.ptr.offset; ++ ++ if (p.crc.compression_type) ++ flags2 |= FIEMAP_EXTENT_ENCODED; ++ else ++ offset += p.crc.offset; ++ ++ if ((offset & (c->opts.block_size - 1)) || ++ (k.k->size & (c->opts.block_size - 1))) ++ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ++ ++ ret = fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ offset << 9, ++ k.k->size << 9, flags|flags2); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++ } else if (k.k->type == KEY_TYPE_reservation) { ++ return fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ 0, k.k->size << 9, ++ flags| ++ FIEMAP_EXTENT_DELALLOC| ++ FIEMAP_EXTENT_UNWRITTEN); ++ } else { ++ BUG(); ++ } ++} ++ ++static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, ++ u64 start, u64 len) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *ei = to_bch_ei(vinode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_on_stack cur, prev; ++ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); ++ unsigned offset_into_extent, sectors; ++ bool have_extent = false; ++ int ret = 0; ++ ++ if (start + len < start) ++ return -EINVAL; ++ ++ bkey_on_stack_init(&cur); ++ bkey_on_stack_init(&prev); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(ei->v.i_ino, start >> 9), 0); ++retry: ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(iter->pos, end) < 0) { ++ if (!bkey_extent_is_data(k.k) && ++ k.k->type != KEY_TYPE_reservation) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ bkey_on_stack_realloc(&cur, c, k.k->u64s); ++ bkey_on_stack_realloc(&prev, c, k.k->u64s); ++ bkey_reassemble(cur.k, k); ++ k = bkey_i_to_s_c(cur.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &cur); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ if (offset_into_extent) ++ bch2_cut_front(POS(k.k->p.inode, ++ bkey_start_offset(k.k) + ++ offset_into_extent), ++ cur.k); ++ bch2_key_resize(&cur.k->k, sectors); ++ cur.k->k.p = iter->pos; ++ cur.k->k.p.offset += cur.k->k.size; ++ ++ if (have_extent) { ++ ret = bch2_fill_extent(c, info, ++ bkey_i_to_s_c(prev.k), 0); ++ if (ret) ++ break; ++ } ++ ++ bkey_copy(prev.k, cur.k); ++ have_extent = true; ++ ++ if (k.k->type == KEY_TYPE_reflink_v) ++ bch2_btree_iter_set_pos(iter, k.k->p); ++ else ++ bch2_btree_iter_next(iter); ++ } ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ if (!ret && have_extent) ++ ret = bch2_fill_extent(c, 
info, bkey_i_to_s_c(prev.k), ++ FIEMAP_EXTENT_LAST); ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&cur, c); ++ bkey_on_stack_exit(&prev, c); ++ return ret < 0 ? ret : 0; ++} ++ ++static const struct vm_operations_struct bch_vm_ops = { ++ .fault = bch2_page_fault, ++ .map_pages = filemap_map_pages, ++ .page_mkwrite = bch2_page_mkwrite, ++}; ++ ++static int bch2_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ file_accessed(file); ++ ++ vma->vm_ops = &bch_vm_ops; ++ return 0; ++} ++ ++/* Directories: */ ++ ++static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) ++{ ++ return generic_file_llseek_size(file, offset, whence, ++ S64_MAX, S64_MAX); ++} ++ ++static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ if (!dir_emit_dots(file, ctx)) ++ return 0; ++ ++ return bch2_readdir(c, inode->v.i_ino, ctx); ++} ++ ++static const struct file_operations bch_file_operations = { ++ .llseek = bch2_llseek, ++ .read_iter = bch2_read_iter, ++ .write_iter = bch2_write_iter, ++ .mmap = bch2_mmap, ++ .open = generic_file_open, ++ .fsync = bch2_fsync, ++ .splice_read = generic_file_splice_read, ++ /* ++ * Broken, on v5.3: ++ .splice_write = iter_file_splice_write, ++ */ ++ .fallocate = bch2_fallocate_dispatch, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++ .remap_file_range = bch2_remap_file_range, ++}; ++ ++static const struct inode_operations bch_file_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .fiemap = bch2_fiemap, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_dir_inode_operations = { ++ .lookup = bch2_lookup, ++ .create = bch2_create, ++ .link = bch2_link, ++ .unlink = bch2_unlink, ++ .symlink = bch2_symlink, ++ .mkdir = bch2_mkdir, ++ .rmdir = bch2_unlink, ++ .mknod = bch2_mknod, ++ .rename = bch2_rename2, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .tmpfile = bch2_tmpfile, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct file_operations bch_dir_file_operations = { ++ .llseek = bch2_dir_llseek, ++ .read = generic_read_dir, ++ .iterate_shared = bch2_vfs_readdir, ++ .fsync = bch2_fsync, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++}; ++ ++static const struct inode_operations bch_symlink_inode_operations = { ++ .get_link = page_get_link, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_special_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct address_space_operations bch_address_space_operations = { ++ .writepage = bch2_writepage, ++ .readpage = bch2_readpage, ++ .writepages = bch2_writepages, ++ .readpages = bch2_readpages, ++ .set_page_dirty = __set_page_dirty_nobuffers, ++ .write_begin = bch2_write_begin, ++ 
.write_end = bch2_write_end, ++ .invalidatepage = bch2_invalidatepage, ++ .releasepage = bch2_releasepage, ++ .direct_IO = noop_direct_IO, ++#ifdef CONFIG_MIGRATION ++ .migratepage = bch2_migrate_page, ++#endif ++ .error_remove_page = generic_error_remove_page, ++}; ++ ++static struct inode *bch2_nfs_get_inode(struct super_block *sb, ++ u64 ino, u32 generation) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct inode *vinode; ++ ++ if (ino < BCACHEFS_ROOT_INO) ++ return ERR_PTR(-ESTALE); ++ ++ vinode = bch2_vfs_inode_get(c, ino); ++ if (IS_ERR(vinode)) ++ return ERR_CAST(vinode); ++ if (generation && vinode->i_generation != generation) { ++ /* we didn't find the right inode.. */ ++ iput(vinode); ++ return ERR_PTR(-ESTALE); ++ } ++ return vinode; ++} ++ ++static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ++ bch2_nfs_get_inode); ++} ++ ++static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ return generic_fh_to_parent(sb, fid, fh_len, fh_type, ++ bch2_nfs_get_inode); ++} ++ ++static const struct export_operations bch_export_ops = { ++ .fh_to_dentry = bch2_fh_to_dentry, ++ .fh_to_parent = bch2_fh_to_parent, ++ //.get_parent = bch2_get_parent, ++}; ++ ++static void bch2_vfs_inode_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi) ++{ ++ bch2_inode_update_after_write(c, inode, bi, ~0); ++ ++ inode->v.i_blocks = bi->bi_sectors; ++ inode->v.i_ino = bi->bi_inum; ++ inode->v.i_rdev = bi->bi_dev; ++ inode->v.i_generation = bi->bi_generation; ++ inode->v.i_size = bi->bi_size; ++ ++ inode->ei_journal_seq = 0; ++ inode->ei_quota_reserved = 0; ++ inode->ei_str_hash = bch2_hash_info_init(c, bi); ++ inode->ei_qid = bch_qid(bi); ++ ++ inode->v.i_mapping->a_ops = &bch_address_space_operations; ++ ++ switch (inode->v.i_mode & S_IFMT) { ++ case S_IFREG: ++ inode->v.i_op = &bch_file_inode_operations; ++ inode->v.i_fop = &bch_file_operations; ++ break; ++ case S_IFDIR: ++ inode->v.i_op = &bch_dir_inode_operations; ++ inode->v.i_fop = &bch_dir_file_operations; ++ break; ++ case S_IFLNK: ++ inode_nohighmem(&inode->v); ++ inode->v.i_op = &bch_symlink_inode_operations; ++ break; ++ default: ++ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); ++ inode->v.i_op = &bch_special_inode_operations; ++ break; ++ } ++} ++ ++static struct inode *bch2_alloc_inode(struct super_block *sb) ++{ ++ struct bch_inode_info *inode; ++ ++ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); ++ if (!inode) ++ return NULL; ++ ++ inode_init_once(&inode->v); ++ mutex_init(&inode->ei_update_lock); ++ pagecache_lock_init(&inode->ei_pagecache_lock); ++ mutex_init(&inode->ei_quota_lock); ++ inode->ei_journal_seq = 0; ++ ++ return &inode->v; ++} ++ ++static void bch2_i_callback(struct rcu_head *head) ++{ ++ struct inode *vinode = container_of(head, struct inode, i_rcu); ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ kmem_cache_free(bch2_inode_cache, inode); ++} ++ ++static void bch2_destroy_inode(struct inode *vinode) ++{ ++ call_rcu(&vinode->i_rcu, bch2_i_callback); ++} ++ ++static int inode_update_times_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); ++ bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); ++ bi->bi_ctime = timespec_to_bch2_time(c, 
inode->v.i_ctime); ++ ++ return 0; ++} ++ ++static int bch2_vfs_write_inode(struct inode *vinode, ++ struct writeback_control *wbc) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static void bch2_evict_inode(struct inode *vinode) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ truncate_inode_pages_final(&inode->v.i_data); ++ ++ clear_inode(&inode->v); ++ ++ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); ++ ++ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), ++ KEY_TYPE_QUOTA_WARN); ++ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++ bch2_inode_rm(c, inode->v.i_ino); ++ } ++} ++ ++static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ struct super_block *sb = dentry->d_sb; ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); ++ unsigned shift = sb->s_blocksize_bits - 9; ++ u64 fsid; ++ ++ buf->f_type = BCACHEFS_STATFS_MAGIC; ++ buf->f_bsize = sb->s_blocksize; ++ buf->f_blocks = usage.capacity >> shift; ++ buf->f_bfree = (usage.capacity - usage.used) >> shift; ++ buf->f_bavail = buf->f_bfree; ++ buf->f_files = usage.nr_inodes; ++ buf->f_ffree = U64_MAX; ++ ++ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ ++ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); ++ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; ++ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; ++ buf->f_namelen = BCH_NAME_MAX; ++ ++ return 0; ++} ++ ++static int bch2_sync_fs(struct super_block *sb, int wait) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (c->opts.journal_flush_disabled) ++ return 0; ++ ++ if (!wait) { ++ bch2_journal_flush_async(&c->journal, NULL); ++ return 0; ++ } ++ ++ return bch2_journal_flush(&c->journal); ++} ++ ++static struct bch_fs *bch2_path_to_fs(const char *dev) ++{ ++ struct bch_fs *c; ++ struct block_device *bdev = lookup_bdev(dev); ++ ++ if (IS_ERR(bdev)) ++ return ERR_CAST(bdev); ++ ++ c = bch2_bdev_to_fs(bdev); ++ bdput(bdev); ++ return c ?: ERR_PTR(-ENOENT); ++} ++ ++static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, ++ unsigned nr_devs, struct bch_opts opts) ++{ ++ struct bch_fs *c, *c1, *c2; ++ size_t i; ++ ++ if (!nr_devs) ++ return ERR_PTR(-EINVAL); ++ ++ c = bch2_fs_open(devs, nr_devs, opts); ++ ++ if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { ++ /* ++ * Already open? 
++ * Look up each block device, make sure they all belong to a ++ * filesystem and they all belong to the _same_ filesystem ++ */ ++ ++ c1 = bch2_path_to_fs(devs[0]); ++ if (IS_ERR(c1)) ++ return c; ++ ++ for (i = 1; i < nr_devs; i++) { ++ c2 = bch2_path_to_fs(devs[i]); ++ if (!IS_ERR(c2)) ++ closure_put(&c2->cl); ++ ++ if (c1 != c2) { ++ closure_put(&c1->cl); ++ return c; ++ } ++ } ++ ++ c = c1; ++ } ++ ++ if (IS_ERR(c)) ++ return c; ++ ++ down_write(&c->state_lock); ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) { ++ up_write(&c->state_lock); ++ closure_put(&c->cl); ++ pr_err("err mounting %s: incomplete filesystem", dev_name); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ up_write(&c->state_lock); ++ ++ set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); ++ return c; ++} ++ ++static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, ++ struct bch_opts opts) ++{ ++ char *dev_name = NULL, **devs = NULL, *s; ++ struct bch_fs *c = ERR_PTR(-ENOMEM); ++ size_t i, nr_devs = 0; ++ ++ dev_name = kstrdup(_dev_name, GFP_KERNEL); ++ if (!dev_name) ++ goto err; ++ ++ for (s = dev_name; s; s = strchr(s + 1, ':')) ++ nr_devs++; ++ ++ devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); ++ if (!devs) ++ goto err; ++ ++ for (i = 0, s = dev_name; ++ s; ++ (s = strchr(s, ':')) && (*s++ = '\0')) ++ devs[i++] = s; ++ ++ c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); ++err: ++ kfree(devs); ++ kfree(dev_name); ++ return c; ++} ++ ++static int bch2_remount(struct super_block *sb, int *flags, char *data) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_opts opts = bch2_opts_empty(); ++ int ret; ++ ++ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(&opts, data); ++ if (ret) ++ return ret; ++ ++ if (opts.read_only != c->opts.read_only) { ++ down_write(&c->state_lock); ++ ++ if (opts.read_only) { ++ bch2_fs_read_only(c); ++ ++ sb->s_flags |= SB_RDONLY; ++ } else { ++ ret = bch2_fs_read_write(c); ++ if (ret) { ++ bch_err(c, "error going rw: %i", ret); ++ up_write(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ sb->s_flags &= ~SB_RDONLY; ++ } ++ ++ c->opts.read_only = opts.read_only; ++ ++ up_write(&c->state_lock); ++ } ++ ++ if (opts.errors >= 0) ++ c->opts.errors = opts.errors; ++ ++ return ret; ++} ++ ++static int bch2_show_options(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ enum bch_opt_id i; ++ char buf[512]; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->mode & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ bch2_opt_to_text(&PBUF(buf), c, opt, v, ++ OPT_SHOW_MOUNT_STYLE); ++ seq_putc(seq, ','); ++ seq_puts(seq, buf); ++ } ++ ++ return 0; ++ ++} ++ ++static const struct super_operations bch_super_operations = { ++ .alloc_inode = bch2_alloc_inode, ++ .destroy_inode = bch2_destroy_inode, ++ .write_inode = bch2_vfs_write_inode, ++ .evict_inode = bch2_evict_inode, ++ .sync_fs = bch2_sync_fs, ++ .statfs = bch2_statfs, ++ .show_options = bch2_show_options, ++ .remount_fs = bch2_remount, ++#if 0 ++ .put_super = bch2_put_super, ++ .freeze_fs = bch2_freeze, ++ .unfreeze_fs = bch2_unfreeze, ++#endif ++}; ++ ++static int bch2_test_super(struct super_block *s, void *data) ++{ ++ return s->s_fs_info == data; ++} ++ ++static int bch2_set_super(struct super_block *s, void *data) ++{ ++ s->s_fs_info = data; ++ return 0; ++} ++ ++static struct dentry 
*bch2_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, void *data) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ struct super_block *sb; ++ struct inode *vinode; ++ struct bch_opts opts = bch2_opts_empty(); ++ unsigned i; ++ int ret; ++ ++ opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(&opts, data); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ c = bch2_open_as_blockdevs(dev_name, opts); ++ if (IS_ERR(c)) ++ return ERR_CAST(c); ++ ++ sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); ++ if (IS_ERR(sb)) { ++ closure_put(&c->cl); ++ return ERR_CAST(sb); ++ } ++ ++ BUG_ON(sb->s_fs_info != c); ++ ++ if (sb->s_root) { ++ closure_put(&c->cl); ++ ++ if ((flags ^ sb->s_flags) & SB_RDONLY) { ++ ret = -EBUSY; ++ goto err_put_super; ++ } ++ goto out; ++ } ++ ++ sb->s_blocksize = block_bytes(c); ++ sb->s_blocksize_bits = ilog2(block_bytes(c)); ++ sb->s_maxbytes = MAX_LFS_FILESIZE; ++ sb->s_op = &bch_super_operations; ++ sb->s_export_op = &bch_export_ops; ++#ifdef CONFIG_BCACHEFS_QUOTA ++ sb->s_qcop = &bch2_quotactl_operations; ++ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; ++#endif ++ sb->s_xattr = bch2_xattr_handlers; ++ sb->s_magic = BCACHEFS_STATFS_MAGIC; ++ sb->s_time_gran = c->sb.time_precision; ++ c->vfs_sb = sb; ++ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); ++ ++ ret = super_setup_bdi(sb); ++ if (ret) ++ goto err_put_super; ++ ++ sb->s_bdi->congested_fn = bch2_congested; ++ sb->s_bdi->congested_data = c; ++ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; ++ ++ for_each_online_member(ca, c, i) { ++ struct block_device *bdev = ca->disk_sb.bdev; ++ ++ /* XXX: create an anonymous device for multi device filesystems */ ++ sb->s_bdev = bdev; ++ sb->s_dev = bdev->bd_dev; ++ percpu_ref_put(&ca->io_ref); ++ break; ++ } ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ if (c->opts.acl) ++ sb->s_flags |= SB_POSIXACL; ++#endif ++ ++ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); ++ if (IS_ERR(vinode)) { ++ bch_err(c, "error mounting: error getting root inode %i", ++ (int) PTR_ERR(vinode)); ++ ret = PTR_ERR(vinode); ++ goto err_put_super; ++ } ++ ++ sb->s_root = d_make_root(vinode); ++ if (!sb->s_root) { ++ bch_err(c, "error mounting: error allocating root dentry"); ++ ret = -ENOMEM; ++ goto err_put_super; ++ } ++ ++ sb->s_flags |= SB_ACTIVE; ++out: ++ return dget(sb->s_root); ++ ++err_put_super: ++ deactivate_locked_super(sb); ++ return ERR_PTR(ret); ++} ++ ++static void bch2_kill_sb(struct super_block *sb) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ generic_shutdown_super(sb); ++ ++ if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) ++ bch2_fs_stop(c); ++ else ++ closure_put(&c->cl); ++} ++ ++static struct file_system_type bcache_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "bcachefs", ++ .mount = bch2_mount, ++ .kill_sb = bch2_kill_sb, ++ .fs_flags = FS_REQUIRES_DEV, ++}; ++ ++MODULE_ALIAS_FS("bcachefs"); ++ ++void bch2_vfs_exit(void) ++{ ++ unregister_filesystem(&bcache_fs_type); ++ if (bch2_inode_cache) ++ kmem_cache_destroy(bch2_inode_cache); ++} ++ ++int __init bch2_vfs_init(void) ++{ ++ int ret = -ENOMEM; ++ ++ bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); ++ if (!bch2_inode_cache) ++ goto err; ++ ++ ret = register_filesystem(&bcache_fs_type); ++ if (ret) ++ goto err; ++ ++ return 0; ++err: ++ bch2_vfs_exit(); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +new file mode 100644 +index 000000000000..eda903a45325 +--- /dev/null ++++ 
b/fs/bcachefs/fs.h +@@ -0,0 +1,174 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_H ++#define _BCACHEFS_FS_H ++ ++#include "inode.h" ++#include "opts.h" ++#include "str_hash.h" ++#include "quota_types.h" ++ ++#include ++#include ++ ++/* ++ * Two-state lock - can be taken for add or block - both states are shared, ++ * like read side of rwsem, but conflict with other state: ++ */ ++struct pagecache_lock { ++ atomic_long_t v; ++ wait_queue_head_t wait; ++}; ++ ++static inline void pagecache_lock_init(struct pagecache_lock *lock) ++{ ++ atomic_long_set(&lock->v, 0); ++ init_waitqueue_head(&lock->wait); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *); ++void bch2_pagecache_add_get(struct pagecache_lock *); ++void bch2_pagecache_block_put(struct pagecache_lock *); ++void bch2_pagecache_block_get(struct pagecache_lock *); ++ ++struct bch_inode_info { ++ struct inode v; ++ ++ struct mutex ei_update_lock; ++ u64 ei_journal_seq; ++ u64 ei_quota_reserved; ++ unsigned long ei_last_dirtied; ++ ++ struct pagecache_lock ei_pagecache_lock; ++ ++ struct mutex ei_quota_lock; ++ struct bch_qid ei_qid; ++ ++ struct bch_hash_info ei_str_hash; ++ ++ /* copy of inode in btree: */ ++ struct bch_inode_unpacked ei_inode; ++}; ++ ++#define to_bch_ei(_inode) \ ++ container_of_or_null(_inode, struct bch_inode_info, v) ++ ++static inline int ptrcmp(void *l, void *r) ++{ ++ return cmp_int(l, r); ++} ++ ++enum bch_inode_lock_op { ++ INODE_LOCK = (1U << 0), ++ INODE_PAGECACHE_BLOCK = (1U << 1), ++ INODE_UPDATE_LOCK = (1U << 2), ++}; ++ ++#define bch2_lock_inodes(_locks, ...) \ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ down_write_nested(&a[i]->v.i_rwsem, i); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_lock_nested(&a[i]->ei_update_lock, i);\ ++ } \ ++} while (0) ++ ++#define bch2_unlock_inodes(_locks, ...) 
\ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ up_write(&a[i]->v.i_rwsem); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_unlock(&a[i]->ei_update_lock); \ ++ } \ ++} while (0) ++ ++static inline struct bch_inode_info *file_bch_inode(struct file *file) ++{ ++ return to_bch_ei(file_inode(file)); ++} ++ ++static inline bool inode_attr_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode, ++ enum inode_opt_id id) ++{ ++ return !(inode->ei_inode.bi_fields_set & (1 << id)) && ++ bch2_inode_opt_get(&dir->ei_inode, id) != ++ bch2_inode_opt_get(&inode->ei_inode, id); ++} ++ ++static inline bool inode_attrs_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode) ++{ ++ unsigned id; ++ ++ for (id = 0; id < Inode_opt_nr; id++) ++ if (inode_attr_changing(dir, inode, id)) ++ return true; ++ ++ return false; ++} ++ ++struct bch_inode_unpacked; ++ ++#ifndef NO_BCACHEFS_FS ++ ++int bch2_fs_quota_transfer(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_qid, ++ unsigned, ++ enum quota_acct_mode); ++ ++static inline int bch2_set_projid(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ u32 projid) ++{ ++ struct bch_qid qid = inode->ei_qid; ++ ++ qid.q[QTYP_PRJ] = projid; ++ ++ return bch2_fs_quota_transfer(c, inode, qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); ++ ++/* returns 0 if we want to do the update, or error is passed up */ ++typedef int (*inode_set_fn)(struct bch_inode_info *, ++ struct bch_inode_unpacked *, void *); ++ ++void bch2_inode_update_after_write(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *, ++ unsigned); ++int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, ++ inode_set_fn, void *, unsigned); ++ ++void bch2_vfs_exit(void); ++int bch2_vfs_init(void); ++ ++#else ++ ++static inline void bch2_vfs_exit(void) {} ++static inline int bch2_vfs_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_FS_H */ +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +new file mode 100644 +index 000000000000..c6ca5968a2e0 +--- /dev/null ++++ b/fs/bcachefs/fsck.c +@@ -0,0 +1,1498 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "inode.h" ++#include "keylist.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include /* struct qstr */ ++#include ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 sectors = 0; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, ++ POS(inum, 0), 0, k, ret) { ++ if (k.k->p.inode != inum) ++ break; ++ ++ if (bkey_extent_is_allocation(k.k)) ++ sectors += k.k->size; ++ } ++ ++ bch2_trans_iter_free(trans, iter); ++ ++ return ret ?: sectors; ++} ++ ++static int __remove_dirent(struct btree_trans *trans, ++ struct bkey_s_c_dirent dirent) ++{ ++ struct bch_fs *c = trans->c; ++ struct qstr name; ++ struct bch_inode_unpacked dir_inode; ++ struct bch_hash_info 
dir_hash_info; ++ u64 dir_inum = dirent.k->p.inode; ++ int ret; ++ char *buf; ++ ++ name.len = bch2_dirent_name_bytes(dirent); ++ buf = bch2_trans_kmalloc(trans, name.len + 1); ++ if (IS_ERR(buf)) ++ return PTR_ERR(buf); ++ ++ memcpy(buf, dirent.v->d_name, name.len); ++ buf[name.len] = '\0'; ++ name.name = buf; ++ ++ ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); ++ if (ret && ret != -EINTR) ++ bch_err(c, "remove_dirent: err %i looking up directory inode", ret); ++ if (ret) ++ return ret; ++ ++ dir_hash_info = bch2_hash_info_init(c, &dir_inode); ++ ++ ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, ++ &dir_hash_info, dir_inum, &name); ++ if (ret && ret != -EINTR) ++ bch_err(c, "remove_dirent: err %i deleting dirent", ret); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++static int remove_dirent(struct btree_trans *trans, ++ struct bkey_s_c_dirent dirent) ++{ ++ return __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ __remove_dirent(trans, dirent)); ++} ++ ++static int reattach_inode(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ u64 inum) ++{ ++ struct bch_inode_unpacked dir_u, inode_u; ++ char name_buf[20]; ++ struct qstr name; ++ int ret; ++ ++ snprintf(name_buf, sizeof(name_buf), "%llu", inum); ++ name = (struct qstr) QSTR(name_buf); ++ ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_LAZY_RW, ++ bch2_link_trans(&trans, lostfound_inode->bi_inum, ++ inum, &dir_u, &inode_u, &name)); ++ if (ret) ++ bch_err(c, "error %i reattaching inode %llu", ret, inum); ++ ++ return ret; ++} ++ ++struct inode_walker { ++ bool first_this_inode; ++ bool have_inode; ++ u64 cur_inum; ++ struct bch_inode_unpacked inode; ++}; ++ ++static struct inode_walker inode_walker_init(void) ++{ ++ return (struct inode_walker) { ++ .cur_inum = -1, ++ .have_inode = false, ++ }; ++} ++ ++static int walk_inode(struct btree_trans *trans, ++ struct inode_walker *w, u64 inum) ++{ ++ if (inum != w->cur_inum) { ++ int ret = bch2_inode_find_by_inum_trans(trans, inum, ++ &w->inode); ++ ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ w->have_inode = !ret; ++ w->cur_inum = inum; ++ w->first_this_inode = true; ++ } else { ++ w->first_this_inode = false; ++ } ++ ++ return 0; ++} ++ ++struct hash_check { ++ struct bch_hash_info info; ++ ++ /* start of current chain of hash collisions: */ ++ struct btree_iter *chain; ++ ++ /* next offset in current chain of hash collisions: */ ++ u64 chain_end; ++}; ++ ++static void hash_check_init(struct hash_check *h) ++{ ++ h->chain = NULL; ++ h->chain_end = 0; ++} ++ ++static void hash_stop_chain(struct btree_trans *trans, ++ struct hash_check *h) ++{ ++ if (h->chain) ++ bch2_trans_iter_free(trans, h->chain); ++ h->chain = NULL; ++} ++ ++static void hash_check_set_inode(struct btree_trans *trans, ++ struct hash_check *h, ++ const struct bch_inode_unpacked *bi) ++{ ++ h->info = bch2_hash_info_init(trans->c, bi); ++ hash_stop_chain(trans, h); ++} ++ ++static int hash_redo_key(const struct bch_hash_desc desc, ++ struct btree_trans *trans, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k, ++ u64 hashed) ++{ ++ struct bkey_i delete; ++ struct bkey_i *tmp; ++ ++ tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if (IS_ERR(tmp)) ++ return PTR_ERR(tmp); ++ ++ bkey_reassemble(tmp, k); ++ ++ bkey_init(&delete.k); ++ delete.k.p = k_iter->pos; ++ bch2_trans_update(trans, k_iter, &delete, 0); ++ ++ return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, ++ tmp, BCH_HASH_SET_MUST_CREATE); 
++} ++ ++static int fsck_hash_delete_at(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *info, ++ struct btree_iter *iter) ++{ ++ int ret; ++retry: ++ ret = bch2_hash_delete_at(trans, desc, info, iter) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret == -EINTR) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (!ret) ++ goto retry; ++ } ++ ++ return ret; ++} ++ ++static int hash_check_duplicates(struct btree_trans *trans, ++ const struct bch_hash_desc desc, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k2; ++ char buf[200]; ++ int ret = 0; ++ ++ if (!bkey_cmp(h->chain->pos, k_iter->pos)) ++ return 0; ++ ++ iter = bch2_trans_copy_iter(trans, h->chain); ++ BUG_ON(IS_ERR(iter)); ++ ++ for_each_btree_key_continue(iter, 0, k2, ret) { ++ if (bkey_cmp(k2.k->p, k.k->p) >= 0) ++ break; ++ ++ if (fsck_err_on(k2.k->type == desc.key_type && ++ !desc.cmp_bkey(k, k2), c, ++ "duplicate hash table keys:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); ++ if (ret) ++ return ret; ++ ret = 1; ++ break; ++ } ++ } ++fsck_err: ++ bch2_trans_iter_free(trans, iter); ++ return ret; ++} ++ ++static void hash_set_chain_start(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ bool hole = (k.k->type != KEY_TYPE_whiteout && ++ k.k->type != desc.key_type); ++ ++ if (hole || k.k->p.offset > h->chain_end + 1) ++ hash_stop_chain(trans, h); ++ ++ if (!hole) { ++ if (!h->chain) { ++ h->chain = bch2_trans_copy_iter(trans, k_iter); ++ BUG_ON(IS_ERR(h->chain)); ++ } ++ ++ h->chain_end = k.k->p.offset; ++ } ++} ++ ++static bool key_has_correct_hash(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ u64 hash; ++ ++ hash_set_chain_start(trans, desc, h, k_iter, k); ++ ++ if (k.k->type != desc.key_type) ++ return true; ++ ++ hash = desc.hash_bkey(&h->info, k); ++ ++ return hash >= h->chain->pos.offset && ++ hash <= k.k->p.offset; ++} ++ ++static int hash_check_key(struct btree_trans *trans, ++ const struct bch_hash_desc desc, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ char buf[200]; ++ u64 hashed; ++ int ret = 0; ++ ++ hash_set_chain_start(trans, desc, h, k_iter, k); ++ ++ if (k.k->type != desc.key_type) ++ return 0; ++ ++ hashed = desc.hash_bkey(&h->info, k); ++ ++ if (fsck_err_on(hashed < h->chain->pos.offset || ++ hashed > k.k->p.offset, c, ++ "hash table key at wrong offset: btree %u, %llu, " ++ "hashed to %llu chain starts at %llu\n%s", ++ desc.btree_id, k.k->p.offset, ++ hashed, h->chain->pos.offset, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ hash_redo_key(desc, trans, h, k_iter, k, hashed)); ++ if (ret) { ++ bch_err(c, "hash_redo_key err %i", ret); ++ return ret; ++ } ++ return 1; ++ } ++ ++ ret = hash_check_duplicates(trans, desc, h, k_iter, k); ++fsck_err: ++ return ret; ++} ++ ++static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, ++ struct btree_iter *iter, struct bkey_s_c *k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i_dirent *d = NULL; ++ int ret = -EINVAL; ++ char 
buf[200]; ++ unsigned len; ++ u64 hash; ++ ++ if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) ++ return 0; ++ ++ len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); ++ BUG_ON(!len); ++ ++ memcpy(buf, bkey_s_c_to_dirent(*k).v->d_name, len); ++ buf[len] = '\0'; ++ ++ d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); ++ if (!d) { ++ bch_err(c, "memory allocation failure"); ++ return -ENOMEM; ++ } ++ ++ bkey_reassemble(&d->k_i, *k); ++ ++ do { ++ --len; ++ if (!len) ++ goto err_redo; ++ ++ d->k.u64s = BKEY_U64s + dirent_val_u64s(len); ++ ++ BUG_ON(bkey_val_bytes(&d->k) < ++ offsetof(struct bch_dirent, d_name) + len); ++ ++ memset(d->v.d_name + len, 0, ++ bkey_val_bytes(&d->k) - ++ offsetof(struct bch_dirent, d_name) - len); ++ ++ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, ++ bkey_i_to_s_c(&d->k_i)); ++ } while (hash < h->chain->pos.offset || ++ hash > k->k->p.offset); ++ ++ if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", ++ buf, strlen(buf), d->v.d_name, len)) { ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); ++ if (ret) ++ goto err; ++ ++ *k = bch2_btree_iter_peek(iter); ++ ++ BUG_ON(k->k->type != KEY_TYPE_dirent); ++ } ++err: ++fsck_err: ++ kfree(d); ++ return ret; ++err_redo: ++ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); ++ ++ if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" ++ "hash table key at wrong offset: btree %u, offset %llu, " ++ "hashed to %llu chain starts at %llu\n%s", ++ buf, strlen(buf), BTREE_ID_DIRENTS, ++ k->k->p.offset, hash, h->chain->pos.offset, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ *k), buf))) { ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ hash_redo_key(bch2_dirent_hash_desc, trans, ++ h, iter, *k, hash)); ++ if (ret) ++ bch_err(c, "hash_redo_key err %i", ret); ++ else ++ ret = 1; ++ } ++ ++ goto err; ++} ++ ++static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) ++{ ++ return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), ++ POS(inode_nr + 1, 0), NULL); ++} ++ ++static int bch2_fix_overlapping_extent(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, struct bpos cut_at) ++{ ++ struct btree_iter *u_iter; ++ struct bkey_i *u; ++ int ret; ++ ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(u, k); ++ bch2_cut_front(cut_at, u); ++ ++ u_iter = bch2_trans_copy_iter(trans, iter); ++ ret = PTR_ERR_OR_ZERO(u_iter); ++ if (ret) ++ return ret; ++ ++ /* ++ * We don't want to go through the ++ * extent_handle_overwrites path: ++ */ ++ __bch2_btree_iter_set_pos(u_iter, u->k.p, false); ++ ++ /* ++ * XXX: this is going to leave disk space ++ * accounting slightly wrong ++ */ ++ ret = bch2_trans_update(trans, u_iter, u, 0); ++ bch2_trans_iter_put(trans, u_iter); ++ return ret; ++} ++ ++/* ++ * Walk extents: verify that extents have a corresponding S_ISREG inode, and ++ * that i_size an i_sectors are consistent ++ */ ++noinline_for_stack ++static int check_extents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_on_stack prev; ++ u64 i_sectors; ++ int ret = 0; ++ ++ bkey_on_stack_init(&prev); ++ prev.k->k = KEY(0, 0, 0); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 
0); ++ ++ bch_verbose(c, "checking extents"); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ char buf1[200]; ++ char buf2[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, k); ++ ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_fix_overlapping_extent(&trans, ++ iter, k, prev.k->k.p)); ++ if (ret) ++ goto err; ++ } ++ } ++ bkey_on_stack_reassemble(&prev, c, k); ++ ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "extent type %u for missing inode %llu", ++ k.k->type, k.k->p.inode) || ++ fsck_err_on(w.have_inode && ++ !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, ++ "extent type %u for non regular file, inode %llu mode %o", ++ k.k->type, k.k->p.inode, w.inode.bi_mode)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = bch2_inode_truncate(c, k.k->p.inode, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(w.first_this_inode && ++ w.have_inode && ++ !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && ++ w.inode.bi_sectors != ++ (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), ++ c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", ++ w.inode.bi_inum, ++ w.inode.bi_sectors, i_sectors)) { ++ struct bkey_inode_buf p; ++ ++ w.inode.bi_sectors = i_sectors; ++ ++ bch2_trans_unlock(&trans); ++ ++ bch2_inode_pack(&p, &w.inode); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_INODES, ++ &p.inode.k_i, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret) { ++ bch_err(c, "error in fsck: error %i updating inode", ret); ++ goto err; ++ } ++ ++ /* revalidate iterator: */ ++ k = bch2_btree_iter_peek(iter); ++ } ++ ++ if (fsck_err_on(w.have_inode && ++ !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ k.k->type != KEY_TYPE_reservation && ++ k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, ++ "extent type %u offset %llu past end of inode %llu, i_size %llu", ++ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = bch2_inode_truncate(c, k.k->p.inode, ++ w.inode.bi_size); ++ if (ret) ++ goto err; ++ continue; ++ } ++ } ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ bkey_on_stack_exit(&prev, c); ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* ++ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, ++ * validate d_type ++ */ ++noinline_for_stack ++static int check_dirents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct hash_check h; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ unsigned name_len; ++ char buf[200]; ++ int ret = 0; ++ ++ bch_verbose(c, "checking dirents"); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ hash_check_init(&h); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, ++ POS(BCACHEFS_ROOT_INO, 0), 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ struct bkey_s_c_dirent d; ++ struct bch_inode_unpacked target; ++ bool have_target; ++ u64 d_inum; ++ ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "dirent in nonexisting directory:\n%s", ++ 
(bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf)) || ++ fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, ++ "dirent in non directory inode type %u:\n%s", ++ mode_to_type(w.inode.bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (w.first_this_inode && w.have_inode) ++ hash_check_set_inode(&trans, &h, &w.inode); ++ ++ ret = check_dirent_hash(&trans, &h, iter, &k); ++ if (ret > 0) { ++ ret = 0; ++ continue; ++ } ++ if (ret) ++ goto fsck_err; ++ ++ if (ret) ++ goto fsck_err; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ d = bkey_s_c_to_dirent(k); ++ d_inum = le64_to_cpu(d.v->d_inum); ++ ++ name_len = bch2_dirent_name_bytes(d); ++ ++ if (fsck_err_on(!name_len, c, "empty dirent") || ++ fsck_err_on(name_len == 1 && ++ !memcmp(d.v->d_name, ".", 1), c, ++ ". dirent") || ++ fsck_err_on(name_len == 2 && ++ !memcmp(d.v->d_name, "..", 2), c, ++ ".. dirent") || ++ fsck_err_on(name_len == 2 && ++ !memcmp(d.v->d_name, "..", 2), c, ++ ".. dirent") || ++ fsck_err_on(memchr(d.v->d_name, '/', name_len), c, ++ "dirent name has invalid chars")) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(d_inum == d.k->p.inode, c, ++ "dirent points to own directory:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); ++ if (ret && ret != -ENOENT) ++ break; ++ ++ have_target = !ret; ++ ret = 0; ++ ++ if (fsck_err_on(!have_target, c, ++ "dirent points to missing inode:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(have_target && ++ d.v->d_type != ++ mode_to_type(target.bi_mode), c, ++ "incorrect d_type: should be %u:\n%s", ++ mode_to_type(target.bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ struct bkey_i_dirent *n; ++ ++ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_type = mode_to_type(target.bi_mode); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); ++ kfree(n); ++ if (ret) ++ goto err; ++ ++ } ++ } ++ ++ hash_stop_chain(&trans, &h); ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* ++ * Walk xattrs: verify that they all have a corresponding inode ++ */ ++noinline_for_stack ++static int check_xattrs(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct hash_check h; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch_verbose(c, "checking xattrs"); ++ ++ hash_check_init(&h); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ POS(BCACHEFS_ROOT_INO, 0), 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "xattr for missing inode %llu", ++ k.k->p.inode)) { ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (w.first_this_inode && w.have_inode) ++ hash_check_set_inode(&trans, &h, &w.inode); ++ ++ ret = 
hash_check_key(&trans, bch2_xattr_hash_desc, ++ &h, iter, k); ++ if (ret) ++ goto fsck_err; ++ } ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* Get root directory, create if it doesn't exist: */ ++static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) ++{ ++ struct bkey_inode_buf packed; ++ int ret; ++ ++ bch_verbose(c, "checking root directory"); ++ ++ ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (fsck_err_on(ret, c, "root directory missing")) ++ goto create_root; ++ ++ if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, ++ "root inode not a directory")) ++ goto create_root; ++ ++ return 0; ++fsck_err: ++ return ret; ++create_root: ++ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, ++ 0, NULL); ++ root_inode->bi_inum = BCACHEFS_ROOT_INO; ++ ++ bch2_inode_pack(&packed, root_inode); ++ ++ return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++} ++ ++/* Get lost+found, create if it doesn't exist: */ ++static int check_lostfound(struct bch_fs *c, ++ struct bch_inode_unpacked *root_inode, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ struct qstr lostfound = QSTR("lost+found"); ++ struct bch_hash_info root_hash_info = ++ bch2_hash_info_init(c, root_inode); ++ u64 inum; ++ int ret; ++ ++ bch_verbose(c, "checking lost+found"); ++ ++ inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, ++ &lostfound); ++ if (!inum) { ++ bch_notice(c, "creating lost+found"); ++ goto create_lostfound; ++ } ++ ++ ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (fsck_err_on(ret, c, "lost+found missing")) ++ goto create_lostfound; ++ ++ if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, ++ "lost+found inode not a directory")) ++ goto create_lostfound; ++ ++ return 0; ++fsck_err: ++ return ret; ++create_lostfound: ++ bch2_inode_init_early(c, lostfound_inode); ++ ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_create_trans(&trans, ++ BCACHEFS_ROOT_INO, root_inode, ++ lostfound_inode, &lostfound, ++ 0, 0, S_IFDIR|0700, 0, NULL, NULL)); ++ if (ret) ++ bch_err(c, "error creating lost+found: %i", ret); ++ ++ return ret; ++} ++ ++struct inode_bitmap { ++ unsigned long *bits; ++ size_t size; ++}; ++ ++static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) ++{ ++ return nr < b->size ? 
test_bit(nr, b->bits) : false; ++} ++ ++static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) ++{ ++ if (nr >= b->size) { ++ size_t new_size = max_t(size_t, max_t(size_t, ++ PAGE_SIZE * 8, ++ b->size * 2), ++ nr + 1); ++ void *n; ++ ++ new_size = roundup_pow_of_two(new_size); ++ n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); ++ if (!n) { ++ return -ENOMEM; ++ } ++ ++ b->bits = n; ++ b->size = new_size; ++ } ++ ++ __set_bit(nr, b->bits); ++ return 0; ++} ++ ++struct pathbuf { ++ size_t nr; ++ size_t size; ++ ++ struct pathbuf_entry { ++ u64 inum; ++ u64 offset; ++ } *entries; ++}; ++ ++static int path_down(struct pathbuf *p, u64 inum) ++{ ++ if (p->nr == p->size) { ++ size_t new_size = max_t(size_t, 256UL, p->size * 2); ++ void *n = krealloc(p->entries, ++ new_size * sizeof(p->entries[0]), ++ GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ p->entries = n; ++ p->size = new_size; ++ }; ++ ++ p->entries[p->nr++] = (struct pathbuf_entry) { ++ .inum = inum, ++ .offset = 0, ++ }; ++ return 0; ++} ++ ++noinline_for_stack ++static int check_directory_structure(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ struct inode_bitmap dirs_done = { NULL, 0 }; ++ struct pathbuf path = { 0, 0, NULL }; ++ struct pathbuf_entry *e; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent dirent; ++ bool had_unreachable; ++ u64 d_inum; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ bch_verbose(c, "checking directory structure"); ++ ++ /* DFS: */ ++restart_dfs: ++ had_unreachable = false; ++ ++ ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); ++ if (ret) { ++ bch_err(c, "memory allocation failure in inode_bitmap_set()"); ++ goto err; ++ } ++ ++ ret = path_down(&path, BCACHEFS_ROOT_INO); ++ if (ret) ++ goto err; ++ ++ while (path.nr) { ++next: ++ e = &path.entries[path.nr - 1]; ++ ++ if (e->offset == U64_MAX) ++ goto up; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, ++ POS(e->inum, e->offset + 1), 0, k, ret) { ++ if (k.k->p.inode != e->inum) ++ break; ++ ++ e->offset = k.k->p.offset; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ dirent = bkey_s_c_to_dirent(k); ++ ++ if (dirent.v->d_type != DT_DIR) ++ continue; ++ ++ d_inum = le64_to_cpu(dirent.v->d_inum); ++ ++ if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, ++ "directory %llu has multiple hardlinks", ++ d_inum)) { ++ ret = remove_dirent(&trans, dirent); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ ret = inode_bitmap_set(&dirs_done, d_inum); ++ if (ret) { ++ bch_err(c, "memory allocation failure in inode_bitmap_set()"); ++ goto err; ++ } ++ ++ ret = path_down(&path, d_inum); ++ if (ret) { ++ goto err; ++ } ++ ++ ret = bch2_trans_iter_free(&trans, iter); ++ if (ret) { ++ bch_err(c, "btree error %i in fsck", ret); ++ goto err; ++ } ++ goto next; ++ } ++ ret = bch2_trans_iter_free(&trans, iter) ?: ret; ++ if (ret) { ++ bch_err(c, "btree error %i in fsck", ret); ++ goto err; ++ } ++up: ++ path.nr--; ++ } ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) ++ continue; ++ ++ ret = bch2_empty_dir_trans(&trans, k.k->p.inode); ++ if (ret == -EINTR) ++ goto retry; ++ if (!ret) ++ continue; ++ ++ if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, ++ "unreachable directory found (inum %llu)", ++ 
k.k->p.offset)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = reattach_inode(c, lostfound_inode, k.k->p.offset); ++ if (ret) { ++ goto err; ++ } ++ ++ had_unreachable = true; ++ } ++ } ++ bch2_trans_iter_free(&trans, iter); ++ if (ret) ++ goto err; ++ ++ if (had_unreachable) { ++ bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); ++ kfree(dirs_done.bits); ++ kfree(path.entries); ++ memset(&dirs_done, 0, sizeof(dirs_done)); ++ memset(&path, 0, sizeof(path)); ++ goto restart_dfs; ++ } ++err: ++fsck_err: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ kfree(dirs_done.bits); ++ kfree(path.entries); ++ return ret; ++} ++ ++struct nlink { ++ u32 count; ++ u32 dir_count; ++}; ++ ++typedef GENRADIX(struct nlink) nlink_table; ++ ++static void inc_link(struct bch_fs *c, nlink_table *links, ++ u64 range_start, u64 *range_end, ++ u64 inum, bool dir) ++{ ++ struct nlink *link; ++ ++ if (inum < range_start || inum >= *range_end) ++ return; ++ ++ link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); ++ if (!link) { ++ bch_verbose(c, "allocation failed during fsck - will need another pass"); ++ *range_end = inum; ++ return; ++ } ++ ++ if (dir) ++ link->dir_count++; ++ else ++ link->count++; ++} ++ ++noinline_for_stack ++static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, ++ u64 range_start, u64 *range_end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ u64 d_inum; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { ++ switch (k.k->type) { ++ case KEY_TYPE_dirent: ++ d = bkey_s_c_to_dirent(k); ++ d_inum = le64_to_cpu(d.v->d_inum); ++ ++ if (d.v->d_type == DT_DIR) ++ inc_link(c, links, range_start, range_end, ++ d.k->p.inode, true); ++ ++ inc_link(c, links, range_start, range_end, ++ d_inum, false); ++ ++ break; ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ bch_err(c, "error in fsck: btree error %i while walking dirents", ret); ++ ++ return ret; ++} ++ ++static int check_inode_nlink(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ struct bch_inode_unpacked *u, ++ struct nlink *link, ++ bool *do_update) ++{ ++ u32 i_nlink = bch2_inode_nlink_get(u); ++ u32 real_i_nlink = ++ link->count * nlink_bias(u->bi_mode) + ++ link->dir_count; ++ int ret = 0; ++ ++ /* ++ * These should have been caught/fixed by earlier passes, we don't ++ * repair them here: ++ */ ++ if (S_ISDIR(u->bi_mode) && link->count > 1) { ++ need_fsck_err(c, "directory %llu with multiple hardlinks: %u", ++ u->bi_inum, link->count); ++ return 0; ++ } ++ ++ if (S_ISDIR(u->bi_mode) && !link->count) { ++ need_fsck_err(c, "unreachable directory found (inum %llu)", ++ u->bi_inum); ++ return 0; ++ } ++ ++ if (!S_ISDIR(u->bi_mode) && link->dir_count) { ++ need_fsck_err(c, "non directory with subdirectories (inum %llu)", ++ u->bi_inum); ++ return 0; ++ } ++ ++ if (!link->count && ++ !(u->bi_flags & BCH_INODE_UNLINKED) && ++ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { ++ if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", ++ u->bi_inum, mode_to_type(u->bi_mode)) == ++ FSCK_ERR_IGNORE) ++ return 0; ++ ++ ret = reattach_inode(c, lostfound_inode, u->bi_inum); ++ if (ret) ++ return ret; ++ ++ link->count = 1; ++ real_i_nlink = nlink_bias(u->bi_mode) + 
link->dir_count; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink < link->count) { ++ if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", ++ u->bi_inum, i_nlink, link->count, ++ mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink != real_i_nlink && ++ c->sb.clean) { ++ if (fsck_err(c, "filesystem marked clean, " ++ "but inode %llu has wrong i_nlink " ++ "(type %u i_nlink %u, should be %u)", ++ u->bi_inum, mode_to_type(u->bi_mode), ++ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink != real_i_nlink && ++ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { ++ if (fsck_err(c, "inode %llu has wrong i_nlink " ++ "(type %u i_nlink %u, should be %u)", ++ u->bi_inum, mode_to_type(u->bi_mode), ++ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (real_i_nlink && i_nlink != real_i_nlink) ++ bch_verbose(c, "setting inode %llu nlink from %u to %u", ++ u->bi_inum, i_nlink, real_i_nlink); ++set_i_nlink: ++ if (i_nlink != real_i_nlink) { ++ bch2_inode_nlink_set(u, real_i_nlink); ++ *do_update = true; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int check_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *lostfound_inode, ++ struct btree_iter *iter, ++ struct bkey_s_c_inode inode, ++ struct nlink *link) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ bool do_update = false; ++ int ret = 0; ++ ++ ret = bch2_inode_unpack(inode, &u); ++ ++ bch2_trans_unlock(trans); ++ ++ if (bch2_fs_inconsistent_on(ret, c, ++ "error unpacking inode %llu in fsck", ++ inode.k->p.inode)) ++ return ret; ++ ++ if (link) { ++ ret = check_inode_nlink(c, lostfound_inode, &u, link, ++ &do_update); ++ if (ret) ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_UNLINKED && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", ++ u.bi_inum))) { ++ bch_verbose(c, "deleting inode %llu", u.bi_inum); ++ ++ ret = bch2_inode_rm(c, u.bi_inum); ++ if (ret) ++ bch_err(c, "error in fsck: error %i while deleting inode", ret); ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", ++ u.bi_inum))) { ++ bch_verbose(c, "truncating inode %llu", u.bi_inum); ++ ++ /* ++ * XXX: need to truncate partial blocks too here - or ideally ++ * just switch units to bytes and that issue goes away ++ */ ++ ++ ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); ++ if (ret) { ++ bch_err(c, "error in fsck: error %i truncating inode", ret); ++ return ret; ++ } ++ ++ /* ++ * We truncated without our normal sector accounting hook, just ++ * make sure we recalculate it: ++ */ ++ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; ++ ++ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ do_update = true; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", ++ u.bi_inum))) { ++ s64 sectors; ++ ++ bch_verbose(c, "recounting sectors for inode %llu", ++ u.bi_inum); ++ ++ sectors = bch2_count_inode_sectors(trans, u.bi_inum); ++ if (sectors < 0) { ++ bch_err(c, "error in fsck: error %i recounting inode sectors", ++ (int) sectors); ++ return sectors; ++ } ++ ++ u.bi_sectors = sectors; ++ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; ++ do_update = true; ++ } ++ ++ if (do_update) { ++ struct bkey_inode_buf p; ++ ++ bch2_inode_pack(&p, &u); ++ ++ ret = __bch2_trans_do(trans, 
NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); ++ if (ret) ++ bch_err(c, "error in fsck: error %i " ++ "updating inode", ret); ++ } ++fsck_err: ++ return ret; ++} ++ ++noinline_for_stack ++static int bch2_gc_walk_inodes(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ nlink_table *links, ++ u64 range_start, u64 range_end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct nlink *link, zero_links = { 0, 0 }; ++ struct genradix_iter nlinks_iter; ++ int ret = 0, ret2 = 0; ++ u64 nlinks_pos; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, ++ POS(0, range_start), 0); ++ nlinks_iter = genradix_iter_init(links, 0); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret2 = bkey_err(k))) { ++peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); ++ ++ if (!link && (!k.k || iter->pos.offset >= range_end)) ++ break; ++ ++ nlinks_pos = range_start + nlinks_iter.pos; ++ if (iter->pos.offset > nlinks_pos) { ++ /* Should have been caught by dirents pass: */ ++ need_fsck_err_on(link && link->count, c, ++ "missing inode %llu (nlink %u)", ++ nlinks_pos, link->count); ++ genradix_iter_advance(&nlinks_iter, links); ++ goto peek_nlinks; ++ } ++ ++ if (iter->pos.offset < nlinks_pos || !link) ++ link = &zero_links; ++ ++ if (k.k && k.k->type == KEY_TYPE_inode) { ++ ret = check_inode(&trans, lostfound_inode, iter, ++ bkey_s_c_to_inode(k), link); ++ BUG_ON(ret == -EINTR); ++ if (ret) ++ break; ++ } else { ++ /* Should have been caught by dirents pass: */ ++ need_fsck_err_on(link->count, c, ++ "missing inode %llu (nlink %u)", ++ nlinks_pos, link->count); ++ } ++ ++ if (nlinks_pos == iter->pos.offset) ++ genradix_iter_advance(&nlinks_iter, links); ++ ++ bch2_btree_iter_next(iter); ++ bch2_trans_cond_resched(&trans); ++ } ++fsck_err: ++ bch2_trans_exit(&trans); ++ ++ if (ret2) ++ bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); ++ ++ return ret ?: ret2; ++} ++ ++noinline_for_stack ++static int check_inode_nlinks(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ nlink_table links; ++ u64 this_iter_range_start, next_iter_range_start = 0; ++ int ret = 0; ++ ++ bch_verbose(c, "checking inode nlinks"); ++ ++ genradix_init(&links); ++ ++ do { ++ this_iter_range_start = next_iter_range_start; ++ next_iter_range_start = U64_MAX; ++ ++ ret = bch2_gc_walk_dirents(c, &links, ++ this_iter_range_start, ++ &next_iter_range_start); ++ if (ret) ++ break; ++ ++ ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, ++ this_iter_range_start, ++ next_iter_range_start); ++ if (ret) ++ break; ++ ++ genradix_free(&links); ++ } while (next_iter_range_start != U64_MAX); ++ ++ genradix_free(&links); ++ ++ return ret; ++} ++ ++/* ++ * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
++ * Doesn't fix them yet, mainly because they haven't yet been observed: ++ */ ++int bch2_fsck_full(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ ++ return check_extents(c) ?: ++ check_dirents(c) ?: ++ check_xattrs(c) ?: ++ check_root(c, &root_inode) ?: ++ check_lostfound(c, &root_inode, &lostfound_inode) ?: ++ check_directory_structure(c, &lostfound_inode) ?: ++ check_inode_nlinks(c, &lostfound_inode); ++} ++ ++int bch2_fsck_inode_nlink(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ ++ return check_root(c, &root_inode) ?: ++ check_lostfound(c, &root_inode, &lostfound_inode) ?: ++ check_inode_nlinks(c, &lostfound_inode); ++} ++ ++int bch2_fsck_walk_inodes_only(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_inode inode; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ inode = bkey_s_c_to_inode(k); ++ ++ if (inode.v->bi_flags & ++ (BCH_INODE_I_SIZE_DIRTY| ++ BCH_INODE_I_SECTORS_DIRTY| ++ BCH_INODE_UNLINKED)) { ++ ret = check_inode(&trans, NULL, iter, inode, NULL); ++ BUG_ON(ret == -EINTR); ++ if (ret) ++ break; ++ } ++ } ++ BUG_ON(ret == -EINTR); ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} +diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h +new file mode 100644 +index 000000000000..9e4af02bde1e +--- /dev/null ++++ b/fs/bcachefs/fsck.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FSCK_H ++#define _BCACHEFS_FSCK_H ++ ++int bch2_fsck_full(struct bch_fs *); ++int bch2_fsck_inode_nlink(struct bch_fs *); ++int bch2_fsck_walk_inodes_only(struct bch_fs *); ++ ++#endif /* _BCACHEFS_FSCK_H */ +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +new file mode 100644 +index 000000000000..7d20f082ad45 +--- /dev/null ++++ b/fs/bcachefs/inode.c +@@ -0,0 +1,554 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "str_hash.h" ++ ++#include ++ ++#include ++ ++const char * const bch2_inode_opts[] = { ++#define x(name, ...) #name, ++ BCH_INODE_OPTS() ++#undef x ++ NULL, ++}; ++ ++static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; ++static const u8 bits_table[8] = { ++ 1 * 8 - 1, ++ 2 * 8 - 2, ++ 3 * 8 - 3, ++ 4 * 8 - 4, ++ 6 * 8 - 5, ++ 8 * 8 - 6, ++ 10 * 8 - 7, ++ 13 * 8 - 8, ++}; ++ ++static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) ++{ ++ __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; ++ unsigned shift, bytes, bits = likely(!hi) ++ ? 
fls64(lo) ++ : fls64(hi) + 64; ++ ++ for (shift = 1; shift <= 8; shift++) ++ if (bits < bits_table[shift - 1]) ++ goto got_shift; ++ ++ BUG(); ++got_shift: ++ bytes = byte_table[shift - 1]; ++ ++ BUG_ON(out + bytes > end); ++ ++ memcpy(out, (u8 *) in + 16 - bytes, bytes); ++ *out |= (1 << 8) >> shift; ++ ++ return bytes; ++} ++ ++static int inode_decode_field(const u8 *in, const u8 *end, ++ u64 out[2], unsigned *out_bits) ++{ ++ __be64 be[2] = { 0, 0 }; ++ unsigned bytes, shift; ++ u8 *p; ++ ++ if (in >= end) ++ return -1; ++ ++ if (!*in) ++ return -1; ++ ++ /* ++ * position of highest set bit indicates number of bytes: ++ * shift = number of bits to remove in high byte: ++ */ ++ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ ++ bytes = byte_table[shift - 1]; ++ ++ if (in + bytes > end) ++ return -1; ++ ++ p = (u8 *) be + 16 - bytes; ++ memcpy(p, in, bytes); ++ *p ^= (1 << 8) >> shift; ++ ++ out[0] = be64_to_cpu(be[0]); ++ out[1] = be64_to_cpu(be[1]); ++ *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]); ++ ++ return bytes; ++} ++ ++void bch2_inode_pack(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ u8 *out = packed->inode.v.fields; ++ u8 *end = (void *) &packed[1]; ++ u8 *last_nonzero_field = out; ++ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; ++ unsigned bytes; ++ ++ bkey_inode_init(&packed->inode.k_i); ++ packed->inode.k.p.offset = inode->bi_inum; ++ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; ++ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); ++ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ ++#define x(_name, _bits) \ ++ out += inode_encode_field(out, end, 0, inode->_name); \ ++ nr_fields++; \ ++ \ ++ if (inode->_name) { \ ++ last_nonzero_field = out; \ ++ last_nonzero_fieldnr = nr_fields; \ ++ } ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ out = last_nonzero_field; ++ nr_fields = last_nonzero_fieldnr; ++ ++ bytes = out - (u8 *) &packed->inode.v; ++ set_bkey_val_bytes(&packed->inode.k, bytes); ++ memset_u64s_tail(&packed->inode.v, 0, bytes); ++ ++ SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ struct bch_inode_unpacked unpacked; ++ ++ int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), ++ &unpacked); ++ BUG_ON(ret); ++ BUG_ON(unpacked.bi_inum != inode->bi_inum); ++ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); ++ BUG_ON(unpacked.bi_mode != inode->bi_mode); ++ ++#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); ++ BCH_INODE_FIELDS() ++#undef x ++ } ++} ++ ++int bch2_inode_unpack(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) ++{ ++ const u8 *in = inode.v->fields; ++ const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); ++ u64 field[2]; ++ unsigned fieldnr = 0, field_bits; ++ int ret; ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++#define x(_name, _bits) \ ++ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ ++ memset(&unpacked->_name, 0, \ ++ sizeof(*unpacked) - \ ++ offsetof(struct bch_inode_unpacked, _name)); \ ++ return 0; \ ++ } \ ++ \ ++ ret = inode_decode_field(in, end, field, &field_bits); \ ++ if (ret < 0) \ ++ return ret; \ ++ \ ++ if (field_bits > sizeof(unpacked->_name) * 8) \ ++ return -1; \ ++ \ ++ unpacked->_name = field[1]; \ ++ in += ret; ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? 
*/ ++ ++ return 0; ++} ++ ++struct btree_iter *bch2_inode_peek(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u64 inum, unsigned flags) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), ++ BTREE_ITER_SLOTS|flags); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); ++ if (ret) ++ goto err; ++ ++ return iter; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ERR_PTR(ret); ++} ++ ++int bch2_inode_write(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode) ++{ ++ struct bkey_inode_buf *inode_p; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return PTR_ERR(inode_p); ++ ++ bch2_inode_pack(inode_p, inode); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++ return 0; ++} ++ ++const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ struct bch_inode_unpacked unpacked; ++ ++ if (k.k->p.inode) ++ return "nonzero k.p.inode"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) ++ return "incorrect value size"; ++ ++ if (k.k->p.offset < BLOCKDEV_INODE_MAX) ++ return "fs inode in blockdev range"; ++ ++ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) ++ return "invalid str hash type"; ++ ++ if (bch2_inode_unpack(inode, &unpacked)) ++ return "invalid variable length fields"; ++ ++ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) ++ return "invalid data checksum type"; ++ ++ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) ++ return "invalid data checksum type"; ++ ++ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && ++ unpacked.bi_nlink != 0) ++ return "flagged as unlinked but bi_nlink != 0"; ++ ++ return NULL; ++} ++ ++void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ struct bch_inode_unpacked unpacked; ++ ++ if (bch2_inode_unpack(inode, &unpacked)) { ++ pr_buf(out, "(unpack error)"); ++ return; ++ } ++ ++#define x(_name, _bits) \ ++ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); ++ BCH_INODE_FIELDS() ++#undef x ++} ++ ++const char *bch2_inode_generation_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ if (k.k->p.inode) ++ return "nonzero k.p.inode"; ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); ++ ++ pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); ++} ++ ++void bch2_inode_init_early(struct bch_fs *c, ++ struct bch_inode_unpacked *inode_u) ++{ ++ enum bch_str_hash_type str_hash = ++ bch2_str_hash_opt_to_type(c, c->opts.str_hash); ++ ++ memset(inode_u, 0, sizeof(*inode_u)); ++ ++ /* ick */ ++ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; ++ get_random_bytes(&inode_u->bi_hash_seed, ++ sizeof(inode_u->bi_hash_seed)); ++} ++ ++void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ inode_u->bi_mode = 
mode; ++ inode_u->bi_uid = uid; ++ inode_u->bi_gid = gid; ++ inode_u->bi_dev = rdev; ++ inode_u->bi_atime = now; ++ inode_u->bi_mtime = now; ++ inode_u->bi_ctime = now; ++ inode_u->bi_otime = now; ++ ++ if (parent && parent->bi_mode & S_ISGID) { ++ inode_u->bi_gid = parent->bi_gid; ++ if (S_ISDIR(mode)) ++ inode_u->bi_mode |= S_ISGID; ++ } ++ ++ if (parent) { ++#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ } ++} ++ ++void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ bch2_inode_init_early(c, inode_u); ++ bch2_inode_init_late(inode_u, bch2_current_time(c), ++ uid, gid, mode, rdev, parent); ++} ++ ++static inline u32 bkey_generation(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ BUG(); ++ case KEY_TYPE_inode_generation: ++ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); ++ default: ++ return 0; ++ } ++} ++ ++int bch2_inode_create(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ u64 min, u64 max, u64 *hint) ++{ ++ struct bkey_inode_buf *inode_p; ++ struct btree_iter *iter = NULL; ++ struct bkey_s_c k; ++ u64 start; ++ int ret; ++ ++ if (!max) ++ max = ULLONG_MAX; ++ ++ if (trans->c->opts.inodes_32bit) ++ max = min_t(u64, max, U32_MAX); ++ ++ start = READ_ONCE(*hint); ++ ++ if (start >= max || start < min) ++ start = min; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return PTR_ERR(inode_p); ++again: ++ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(iter->pos, POS(0, max)) > 0) ++ break; ++ ++ if (k.k->type != KEY_TYPE_inode) ++ goto found_slot; ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ ++ if (ret) ++ return ret; ++ ++ if (start != min) { ++ /* Retry from start */ ++ start = min; ++ goto again; ++ } ++ ++ return -ENOSPC; ++found_slot: ++ *hint = k.k->p.offset; ++ inode_u->bi_inum = k.k->p.offset; ++ inode_u->bi_generation = bkey_generation(k); ++ ++ bch2_inode_pack(inode_p, inode_u); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++ bch2_trans_iter_put(trans, iter); ++ return 0; ++} ++ ++int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_inode_generation delete; ++ struct bpos start = POS(inode_nr, 0); ++ struct bpos end = POS(inode_nr + 1, 0); ++ int ret; ++ ++ /* ++ * If this was a directory, there shouldn't be any real dirents left - ++ * but there could be whiteouts (from hash collisions) that we should ++ * delete: ++ * ++ * XXX: the dirent could ideally would delete whiteouts when they're no ++ * longer needed ++ */ ++ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_DIRENTS, ++ start, end, NULL); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ do { ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ u32 bi_generation = 0; ++ ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, ++ "inode %llu not found when deleting", ++ inode_nr); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: { ++ struct 
bch_inode_unpacked inode_u; ++ ++ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) ++ bi_generation = inode_u.bi_generation + 1; ++ break; ++ } ++ case KEY_TYPE_inode_generation: { ++ struct bkey_s_c_inode_generation g = ++ bkey_s_c_to_inode_generation(k); ++ bi_generation = le32_to_cpu(g.v->bi_generation); ++ break; ++ } ++ } ++ ++ if (!bi_generation) { ++ bkey_init(&delete.k); ++ delete.k.p.offset = inode_nr; ++ } else { ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p.offset = inode_nr; ++ delete.v.bi_generation = cpu_to_le32(bi_generation); ++ } ++ ++ bch2_trans_update(&trans, iter, &delete.k_i, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++ } while (ret == -EINTR); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, ++ POS(0, inode_nr), BTREE_ITER_SLOTS); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = k.k->type == KEY_TYPE_inode ++ ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) ++ : -ENOENT; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_inode_pack_test(void) ++{ ++ struct bch_inode_unpacked *u, test_inodes[] = { ++ { ++ .bi_atime = U64_MAX, ++ .bi_ctime = U64_MAX, ++ .bi_mtime = U64_MAX, ++ .bi_otime = U64_MAX, ++ .bi_size = U64_MAX, ++ .bi_sectors = U64_MAX, ++ .bi_uid = U32_MAX, ++ .bi_gid = U32_MAX, ++ .bi_nlink = U32_MAX, ++ .bi_generation = U32_MAX, ++ .bi_dev = U32_MAX, ++ }, ++ }; ++ ++ for (u = test_inodes; ++ u < test_inodes + ARRAY_SIZE(test_inodes); ++ u++) { ++ struct bkey_inode_buf p; ++ ++ bch2_inode_pack(&p, u); ++ } ++} ++#endif +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +new file mode 100644 +index 000000000000..bb759a46dc41 +--- /dev/null ++++ b/fs/bcachefs/inode.h +@@ -0,0 +1,177 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_INODE_H ++#define _BCACHEFS_INODE_H ++ ++#include "opts.h" ++ ++extern const char * const bch2_inode_opts[]; ++ ++const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++} ++ ++const char *bch2_inode_generation_invalid(const struct bch_fs *, ++ struct bkey_s_c); ++void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_generation_invalid, \ ++ .val_to_text = bch2_inode_generation_to_text, \ ++} ++ ++struct bch_inode_unpacked { ++ u64 bi_inum; ++ __le64 bi_hash_seed; ++ u32 bi_flags; ++ u16 bi_mode; ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_FIELDS() ++#undef x ++}; ++ ++struct bkey_inode_buf { ++ struct bkey_i_inode inode; ++ ++#define x(_name, _bits) + 8 + _bits / 8 ++ u8 _pad[0 + BCH_INODE_FIELDS()]; ++#undef x ++} __attribute__((packed, aligned(8))); ++ ++void bch2_inode_pack(struct 
bkey_inode_buf *, const struct bch_inode_unpacked *); ++int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); ++ ++struct btree_iter *bch2_inode_peek(struct btree_trans *, ++ struct bch_inode_unpacked *, u64, unsigned); ++int bch2_inode_write(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *); ++ ++void bch2_inode_init_early(struct bch_fs *, ++ struct bch_inode_unpacked *); ++void bch2_inode_init_late(struct bch_inode_unpacked *, u64, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++ ++int bch2_inode_create(struct btree_trans *, ++ struct bch_inode_unpacked *, ++ u64, u64, u64 *); ++ ++int bch2_inode_rm(struct bch_fs *, u64); ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, ++ struct bch_inode_unpacked *); ++int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); ++ ++static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts ret = { 0 }; ++ ++#define x(_name, _bits) \ ++ if (inode->bi_##_name) \ ++ opt_set(ret, _name, inode->bi_##_name - 1); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Inode_opt_##_name: \ ++ inode->bi_##_name = v; \ ++ break; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Inode_opt_##_name: \ ++ return inode->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct bch_io_opts ++io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); ++ ++ bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); ++ return opts; ++} ++ ++static inline u8 mode_to_type(umode_t mode) ++{ ++ return (mode >> 12) & 15; ++} ++ ++/* i_nlink: */ ++ ++static inline unsigned nlink_bias(umode_t mode) ++{ ++ return S_ISDIR(mode) ? 2 : 1; ++} ++ ++static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) ++{ ++ if (bi->bi_flags & BCH_INODE_UNLINKED) ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ else ++ bi->bi_nlink++; ++} ++ ++static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) ++{ ++ BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); ++ if (bi->bi_nlink) ++ bi->bi_nlink--; ++ else ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++} ++ ++static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) ++{ ++ return bi->bi_flags & BCH_INODE_UNLINKED ++ ? 
0 ++ : bi->bi_nlink + nlink_bias(bi->bi_mode); ++} ++ ++static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, ++ unsigned nlink) ++{ ++ if (nlink) { ++ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ } else { ++ bi->bi_nlink = 0; ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_inode_pack_test(void); ++#else ++static inline void bch2_inode_pack_test(void) {} ++#endif ++ ++#endif /* _BCACHEFS_INODE_H */ +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +new file mode 100644 +index 000000000000..8d608c900525 +--- /dev/null ++++ b/fs/bcachefs/io.c +@@ -0,0 +1,2355 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Some low level IO code, and hacks for various block layer limitations ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "bset.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "compress.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "extent_update.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "rebalance.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++#include ++ ++static bool bch2_target_congested(struct bch_fs *c, u16 target) ++{ ++ const struct bch_devs_mask *devs; ++ unsigned d, nr = 0, total = 0; ++ u64 now = local_clock(), last; ++ s64 congested; ++ struct bch_dev *ca; ++ ++ if (!target) ++ return false; ++ ++ rcu_read_lock(); ++ devs = bch2_target_to_mask(c, target); ++ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ++ ca = rcu_dereference(c->devs[d]); ++ if (!ca) ++ continue; ++ ++ congested = atomic_read(&ca->congested); ++ last = READ_ONCE(ca->congested_last); ++ if (time_after64(now, last)) ++ congested -= (now - last) >> 12; ++ ++ total += max(congested, 0LL); ++ nr++; ++ } ++ rcu_read_unlock(); ++ ++ return bch2_rand_range(nr * CONGESTED_MAX) < total; ++} ++ ++static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, ++ u64 now, int rw) ++{ ++ u64 latency_capable = ++ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; ++ /* ideally we'd be taking into account the device's variance here: */ ++ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); ++ s64 latency_over = io_latency - latency_threshold; ++ ++ if (latency_threshold && latency_over > 0) { ++ /* ++ * bump up congested by approximately latency_over * 4 / ++ * latency_threshold - we don't need much accuracy here so don't ++ * bother with the divide: ++ */ ++ if (atomic_read(&ca->congested) < CONGESTED_MAX) ++ atomic_add(latency_over >> ++ max_t(int, ilog2(latency_threshold) - 2, 0), ++ &ca->congested); ++ ++ ca->congested_last = now; ++ } else if (atomic_read(&ca->congested) > 0) { ++ atomic_dec(&ca->congested); ++ } ++} ++ ++void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) ++{ ++ atomic64_t *latency = &ca->cur_latency[rw]; ++ u64 now = local_clock(); ++ u64 io_latency = time_after64(now, submit_time) ++ ? 
now - submit_time ++ : 0; ++ u64 old, new, v = atomic64_read(latency); ++ ++ do { ++ old = v; ++ ++ /* ++ * If the io latency was reasonably close to the current ++ * latency, skip doing the update and atomic operation - most of ++ * the time: ++ */ ++ if (abs((int) (old - io_latency)) < (old >> 1) && ++ now & ~(~0 << 5)) ++ break; ++ ++ new = ewma_add(old, io_latency, 5); ++ } while ((v = atomic64_cmpxchg(latency, old, new)) != old); ++ ++ bch2_congested_acct(ca, io_latency, now, rw); ++ ++ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); ++} ++ ++/* Allocate, free from mempool: */ ++ ++void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) ++ if (bv->bv_page != ZERO_PAGE(0)) ++ mempool_free(bv->bv_page, &c->bio_bounce_pages); ++ bio->bi_vcnt = 0; ++} ++ ++static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) ++{ ++ struct page *page; ++ ++ if (likely(!*using_mempool)) { ++ page = alloc_page(GFP_NOIO); ++ if (unlikely(!page)) { ++ mutex_lock(&c->bio_bounce_pages_lock); ++ *using_mempool = true; ++ goto pool_alloc; ++ ++ } ++ } else { ++pool_alloc: ++ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); ++ } ++ ++ return page; ++} ++ ++void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, ++ size_t size) ++{ ++ bool using_mempool = false; ++ ++ while (size) { ++ struct page *page = __bio_alloc_page_pool(c, &using_mempool); ++ unsigned len = min(PAGE_SIZE, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, 0)); ++ size -= len; ++ } ++ ++ if (using_mempool) ++ mutex_unlock(&c->bio_bounce_pages_lock); ++} ++ ++/* Extent update path: */ ++ ++static int sum_sector_overwrites(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct bkey_i *new, ++ bool may_allocate, ++ bool *maybe_extending, ++ s64 *delta) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c old; ++ int ret = 0; ++ ++ *maybe_extending = true; ++ *delta = 0; ++ ++ iter = bch2_trans_copy_iter(trans, extent_iter); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { ++ if (!may_allocate && ++ bch2_bkey_nr_ptrs_fully_allocated(old) < ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { ++ ret = -ENOSPC; ++ break; ++ } ++ ++ *delta += (min(new->k.p.offset, ++ old.k->p.offset) - ++ max(bkey_start_offset(&new->k), ++ bkey_start_offset(old.k))) * ++ (bkey_extent_is_allocation(&new->k) - ++ bkey_extent_is_allocation(old.k)); ++ ++ if (bkey_cmp(old.k->p, new->k.p) >= 0) { ++ /* ++ * Check if there's already data above where we're ++ * going to be writing to - this means we're definitely ++ * not extending the file: ++ * ++ * Note that it's not sufficient to check if there's ++ * data up to the sector offset we're going to be ++ * writing to, because i_size could be up to one block ++ * less: ++ */ ++ if (!bkey_cmp(old.k->p, new->k.p)) ++ old = bch2_btree_iter_next(iter); ++ ++ if (old.k && !bkey_err(old) && ++ old.k->p.inode == extent_iter->pos.inode && ++ bkey_extent_is_data(old.k)) ++ *maybe_extending = false; ++ ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int bch2_extent_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ u64 new_i_size, ++ s64 *i_sectors_delta) ++{ ++ /* this must live until after bch2_trans_commit(): */ ++ struct bkey_inode_buf inode_p; ++ bool 
extending = false; ++ s64 delta = 0; ++ int ret; ++ ++ ret = bch2_extent_trim_atomic(k, iter); ++ if (ret) ++ return ret; ++ ++ ret = sum_sector_overwrites(trans, iter, k, ++ disk_res && disk_res->sectors != 0, ++ &extending, &delta); ++ if (ret) ++ return ret; ++ ++ new_i_size = extending ++ ? min(k->k.p.offset << 9, new_i_size) ++ : 0; ++ ++ if (delta || new_i_size) { ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ ++ inode_iter = bch2_inode_peek(trans, &inode_u, ++ k->k.p.inode, BTREE_ITER_INTENT); ++ if (IS_ERR(inode_iter)) ++ return PTR_ERR(inode_iter); ++ ++ /* ++ * XXX: ++ * writeback can race a bit with truncate, because truncate ++ * first updates the inode then truncates the pagecache. This is ++ * ugly, but lets us preserve the invariant that the in memory ++ * i_size is always >= the on disk i_size. ++ * ++ BUG_ON(new_i_size > inode_u.bi_size && ++ (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); ++ */ ++ BUG_ON(new_i_size > inode_u.bi_size && !extending); ++ ++ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ new_i_size > inode_u.bi_size) ++ inode_u.bi_size = new_i_size; ++ else ++ new_i_size = 0; ++ ++ inode_u.bi_sectors += delta; ++ ++ if (delta || new_i_size) { ++ bch2_inode_pack(&inode_p, &inode_u); ++ bch2_trans_update(trans, inode_iter, ++ &inode_p.inode.k_i, 0); ++ } ++ ++ bch2_trans_iter_put(trans, inode_iter); ++ } ++ ++ bch2_trans_update(trans, iter, k, 0); ++ ++ ret = bch2_trans_commit(trans, disk_res, journal_seq, ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE); ++ if (!ret && i_sectors_delta) ++ *i_sectors_delta += delta; ++ ++ return ret; ++} ++ ++int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos end, u64 *journal_seq, ++ s64 *i_sectors_delta) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); ++ struct bkey_s_c k; ++ int ret = 0, ret2 = 0; ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ bkey_cmp(iter->pos, end) < 0) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i delete; ++ ++ bch2_trans_begin(trans); ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto btree_err; ++ ++ bkey_init(&delete.k); ++ delete.k.p = iter->pos; ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete); ++ ++ ret = bch2_extent_update(trans, iter, &delete, ++ &disk_res, journal_seq, ++ 0, i_sectors_delta); ++ bch2_disk_reservation_put(c, &disk_res); ++btree_err: ++ if (ret == -EINTR) { ++ ret2 = ret; ++ ret = 0; ++ } ++ if (ret) ++ break; ++ } ++ ++ if (bkey_cmp(iter->pos, end) > 0) { ++ bch2_btree_iter_set_pos(iter, end); ++ ret = bch2_btree_iter_traverse(iter); ++ } ++ ++ return ret ?: ret2; ++} ++ ++int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, ++ u64 *journal_seq, s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inum, start), ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_fpunch_at(&trans, iter, POS(inum, end), ++ journal_seq, i_sectors_delta); ++ bch2_trans_exit(&trans); ++ ++ if (ret == -EINTR) ++ ret = 0; ++ ++ return ret; ++} ++ ++int bch2_write_index_default(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_on_stack sk; ++ struct keylist *keys = &op->insert_keys; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct btree_trans 
trans; ++ struct btree_iter *iter; ++ int ret; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ++ k = bch2_keylist_front(keys); ++ ++ bkey_on_stack_realloc(&sk, c, k->k.u64s); ++ bkey_copy(sk.k, k); ++ bch2_cut_front(iter->pos, sk.k); ++ ++ ret = bch2_extent_update(&trans, iter, sk.k, ++ &op->res, op_journal_seq(op), ++ op->new_i_size, &op->i_sectors_delta); ++ if (ret == -EINTR) ++ continue; ++ if (ret) ++ break; ++ ++ if (bkey_cmp(iter->pos, k->k.p) >= 0) ++ bch2_keylist_pop_front(keys); ++ } while (!bch2_keylist_empty(keys)); ++ ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ ++ return ret; ++} ++ ++/* Writes */ ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, ++ enum bch_data_type type, ++ const struct bkey_i *k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); ++ const struct bch_extent_ptr *ptr; ++ struct bch_write_bio *n; ++ struct bch_dev *ca; ++ ++ BUG_ON(c->opts.nochanges); ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || ++ !c->devs[ptr->dev]); ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (to_entry(ptr + 1) < ptrs.end) { ++ n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, ++ &ca->replica_set)); ++ ++ n->bio.bi_end_io = wbio->bio.bi_end_io; ++ n->bio.bi_private = wbio->bio.bi_private; ++ n->parent = wbio; ++ n->split = true; ++ n->bounce = false; ++ n->put_bio = true; ++ n->bio.bi_opf = wbio->bio.bi_opf; ++ bio_inc_remaining(&wbio->bio); ++ } else { ++ n = wbio; ++ n->split = false; ++ } ++ ++ n->c = c; ++ n->dev = ptr->dev; ++ n->have_ioref = bch2_dev_get_ioref(ca, WRITE); ++ n->submit_time = local_clock(); ++ n->bio.bi_iter.bi_sector = ptr->offset; ++ ++ if (!journal_flushes_device(ca)) ++ n->bio.bi_opf |= REQ_FUA; ++ ++ if (likely(n->have_ioref)) { ++ this_cpu_add(ca->io_done->sectors[WRITE][type], ++ bio_sectors(&n->bio)); ++ ++ bio_set_dev(&n->bio, ca->disk_sb.bdev); ++ submit_bio(&n->bio); ++ } else { ++ n->bio.bi_status = BLK_STS_REMOVED; ++ bio_endio(&n->bio); ++ } ++ } ++} ++ ++static void __bch2_write(struct closure *); ++ ++static void bch2_write_done(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) ++ op->error = bch2_journal_error(&c->journal); ++ ++ bch2_disk_reservation_put(c, &op->res); ++ percpu_ref_put(&c->writes); ++ bch2_keylist_free(&op->insert_keys, op->inline_keys); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); ++ ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ up(&c->io_in_flight); ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); ++ op->end_io(op); ++ } else { ++ closure_return(cl); ++ } ++} ++ ++/** ++ * bch_write_index - after a write, update index to point to new data ++ */ ++static void __bch2_write_index(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct keylist *keys = &op->insert_keys; ++ struct bch_extent_ptr *ptr; ++ struct bkey_i *src, *dst = keys->keys, *n, *k; ++ unsigned dev; ++ int ret; ++ ++ for (src = keys->keys; src != keys->top; src = n) { ++ n = bkey_next(src); ++ ++ if (bkey_extent_is_direct_data(&src->k)) { ++ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, ++ test_bit(ptr->dev, op->failed.d)); ++ ++ if 
(!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { ++ ret = -EIO; ++ goto err; ++ } ++ } ++ ++ if (dst != src) ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ ++ keys->top = dst; ++ ++ /* ++ * probably not the ideal place to hook this in, but I don't ++ * particularly want to plumb io_opts all the way through the btree ++ * update stack right now ++ */ ++ for_each_keylist_key(keys, k) { ++ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); ++ ++ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) ++ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); ++ ++ } ++ ++ if (!bch2_keylist_empty(keys)) { ++ u64 sectors_start = keylist_sectors(keys); ++ int ret = op->index_update_fn(op); ++ ++ BUG_ON(ret == -EINTR); ++ BUG_ON(keylist_sectors(keys) && !ret); ++ ++ op->written += sectors_start - keylist_sectors(keys); ++ ++ if (ret) { ++ __bcache_io_error(c, "btree IO error %i", ret); ++ op->error = ret; ++ } ++ } ++out: ++ /* If some a bucket wasn't written, we can't erasure code it: */ ++ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) ++ bch2_open_bucket_write_error(c, &op->open_buckets, dev); ++ ++ bch2_open_buckets_put(c, &op->open_buckets); ++ return; ++err: ++ keys->top = keys->keys; ++ op->error = ret; ++ goto out; ++} ++ ++static void bch2_write_index(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ __bch2_write_index(op); ++ ++ if (!(op->flags & BCH_WRITE_DONE)) { ++ continue_at(cl, __bch2_write, index_update_wq(op)); ++ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { ++ bch2_journal_flush_seq_async(&c->journal, ++ *op_journal_seq(op), ++ cl); ++ continue_at(cl, bch2_write_done, index_update_wq(op)); ++ } else { ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ } ++} ++ ++static void bch2_write_endio(struct bio *bio) ++{ ++ struct closure *cl = bio->bi_private; ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; ++ struct bch_fs *c = wbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", ++ blk_status_to_str(bio->bi_status))) ++ set_bit(wbio->dev, op->failed.d); ++ ++ if (wbio->have_ioref) { ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (wbio->bounce) ++ bch2_bio_free_pages_pool(c, bio); ++ ++ if (wbio->put_bio) ++ bio_put(bio); ++ ++ if (parent) ++ bio_endio(&parent->bio); ++ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) ++ closure_put(cl); ++ else ++ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); ++} ++ ++static void init_append_extent(struct bch_write_op *op, ++ struct write_point *wp, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_i_extent *e; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ BUG_ON(crc.compressed_size > wp->sectors_free); ++ wp->sectors_free -= crc.compressed_size; ++ op->pos.offset += crc.uncompressed_size; ++ ++ e = bkey_extent_init(op->insert_keys.top); ++ e->k.p = op->pos; ++ e->k.size = crc.uncompressed_size; ++ e->k.version = version; ++ ++ if (crc.csum_type || ++ crc.compression_type || ++ crc.nonce) ++ bch2_extent_crc_append(&e->k_i, crc); ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ union bch_extent_entry *end = ++ bkey_val_end(bkey_i_to_s(&e->k_i)); ++ ++ end->ptr = ob->ptr; ++ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ end->ptr.cached = !ca->mi.durability || ++ (op->flags & BCH_WRITE_CACHED) != 0; ++ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; ++ ++ e->k.u64s++; ++ ++ BUG_ON(crc.compressed_size > ob->sectors_free); ++ ob->sectors_free -= crc.compressed_size; ++ } ++ ++ bch2_keylist_push(&op->insert_keys); ++} ++ ++static struct bio *bch2_write_bio_alloc(struct bch_fs *c, ++ struct write_point *wp, ++ struct bio *src, ++ bool *page_alloc_failed, ++ void *buf) ++{ ++ struct bch_write_bio *wbio; ++ struct bio *bio; ++ unsigned output_available = ++ min(wp->sectors_free << 9, src->bi_iter.bi_size); ++ unsigned pages = DIV_ROUND_UP(output_available + ++ (buf ++ ? 
((unsigned long) buf & (PAGE_SIZE - 1)) ++ : 0), PAGE_SIZE); ++ ++ bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); ++ wbio = wbio_init(bio); ++ wbio->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ wbio->bio.bi_opf = src->bi_opf; ++ ++ if (buf) { ++ bch2_bio_map(bio, buf, output_available); ++ return bio; ++ } ++ ++ wbio->bounce = true; ++ ++ /* ++ * We can't use mempool for more than c->sb.encoded_extent_max ++ * worth of pages, but we'd like to allocate more if we can: ++ */ ++ bch2_bio_alloc_pages_pool(c, bio, ++ min_t(unsigned, output_available, ++ c->sb.encoded_extent_max << 9)); ++ ++ if (bio->bi_iter.bi_size < output_available) ++ *page_alloc_failed = ++ bch2_bio_alloc_pages(bio, ++ output_available - ++ bio->bi_iter.bi_size, ++ GFP_NOFS) != 0; ++ ++ return bio; ++} ++ ++static int bch2_write_rechecksum(struct bch_fs *c, ++ struct bch_write_op *op, ++ unsigned new_csum_type) ++{ ++ struct bio *bio = &op->wbio.bio; ++ struct bch_extent_crc_unpacked new_crc; ++ int ret; ++ ++ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ ++ ++ if (bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)) ++ new_csum_type = op->crc.csum_type; ++ ++ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, ++ NULL, &new_crc, ++ op->crc.offset, op->crc.live_size, ++ new_csum_type); ++ if (ret) ++ return ret; ++ ++ bio_advance(bio, op->crc.offset << 9); ++ bio->bi_iter.bi_size = op->crc.live_size << 9; ++ op->crc = new_crc; ++ return 0; ++} ++ ++static int bch2_write_decrypt(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct nonce nonce = extent_nonce(op->version, op->crc); ++ struct bch_csum csum; ++ ++ if (!bch2_csum_type_is_encryption(op->crc.csum_type)) ++ return 0; ++ ++ /* ++ * If we need to decrypt data in the write path, we'll no longer be able ++ * to verify the existing checksum (poly1305 mac, in this case) after ++ * it's decrypted - this is the last point we'll be able to reverify the ++ * checksum: ++ */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return -EIO; ++ ++ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ op->crc.csum_type = 0; ++ op->crc.csum = (struct bch_csum) { 0, 0 }; ++ return 0; ++} ++ ++static enum prep_encoded_ret { ++ PREP_ENCODED_OK, ++ PREP_ENCODED_ERR, ++ PREP_ENCODED_CHECKSUM_ERR, ++ PREP_ENCODED_DO_WRITE, ++} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *bio = &op->wbio.bio; ++ ++ if (!(op->flags & BCH_WRITE_DATA_ENCODED)) ++ return PREP_ENCODED_OK; ++ ++ BUG_ON(bio_sectors(bio) != op->crc.compressed_size); ++ ++ /* Can we just write the entire extent as is? 
*/ ++ if (op->crc.uncompressed_size == op->crc.live_size && ++ op->crc.compressed_size <= wp->sectors_free && ++ (op->crc.compression_type == op->compression_type || ++ op->incompressible)) { ++ if (!crc_is_compressed(op->crc) && ++ op->csum_type != op->crc.csum_type && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_DO_WRITE; ++ } ++ ++ /* ++ * If the data is compressed and we couldn't write the entire extent as ++ * is, we have to decompress it: ++ */ ++ if (crc_is_compressed(op->crc)) { ++ struct bch_csum csum; ++ ++ if (bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* Last point we can still verify checksum: */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, ++ extent_nonce(op->version, op->crc), ++ bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) ++ return PREP_ENCODED_ERR; ++ } ++ ++ /* ++ * No longer have compressed data after this point - data might be ++ * encrypted: ++ */ ++ ++ /* ++ * If the data is checksummed and we're only writing a subset, ++ * rechecksum and adjust bio to point to currently live data: ++ */ ++ if ((op->crc.live_size != op->crc.uncompressed_size || ++ op->crc.csum_type != op->csum_type) && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* ++ * If we want to compress the data, it has to be decrypted: ++ */ ++ if ((op->compression_type || ++ bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(op->csum_type)) && ++ bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_OK; ++} ++ ++static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ++ struct bio **_dst) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *src = &op->wbio.bio, *dst = src; ++ struct bvec_iter saved_iter; ++ void *ec_buf; ++ struct bpos ec_pos = op->pos; ++ unsigned total_output = 0, total_input = 0; ++ bool bounce = false; ++ bool page_alloc_failed = false; ++ int ret, more = 0; ++ ++ BUG_ON(!bio_sectors(src)); ++ ++ ec_buf = bch2_writepoint_ec_buf(c, wp); ++ ++ switch (bch2_write_prep_encoded_data(op, wp)) { ++ case PREP_ENCODED_OK: ++ break; ++ case PREP_ENCODED_ERR: ++ ret = -EIO; ++ goto err; ++ case PREP_ENCODED_CHECKSUM_ERR: ++ BUG(); ++ goto csum_err; ++ case PREP_ENCODED_DO_WRITE: ++ /* XXX look for bug here */ ++ if (ec_buf) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bio_copy_data(dst, src); ++ bounce = true; ++ } ++ init_append_extent(op, wp, op->version, op->crc); ++ goto do_write; ++ } ++ ++ if (ec_buf || ++ op->compression_type || ++ (op->csum_type && ++ !(op->flags & BCH_WRITE_PAGES_STABLE)) || ++ (bch2_csum_type_is_encryption(op->csum_type) && ++ !(op->flags & BCH_WRITE_PAGES_OWNED))) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bounce = true; ++ } ++ ++ saved_iter = dst->bi_iter; ++ ++ do { ++ struct bch_extent_crc_unpacked crc = ++ (struct bch_extent_crc_unpacked) { 0 }; ++ struct bversion version = op->version; ++ size_t dst_len, src_len; ++ ++ if (page_alloc_failed && ++ bio_sectors(dst) < wp->sectors_free && ++ bio_sectors(dst) < c->sb.encoded_extent_max) ++ break; ++ ++ BUG_ON(op->compression_type && ++ (op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_csum_type_is_encryption(op->crc.csum_type)); ++ BUG_ON(op->compression_type && !bounce); ++ ++ crc.compression_type = op->incompressible ++ ? 
BCH_COMPRESSION_TYPE_incompressible ++ : op->compression_type ++ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, ++ op->compression_type) ++ : 0; ++ if (!crc_is_compressed(crc)) { ++ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); ++ ++ if (op->csum_type) ++ dst_len = min_t(unsigned, dst_len, ++ c->sb.encoded_extent_max << 9); ++ ++ if (bounce) { ++ swap(dst->bi_iter.bi_size, dst_len); ++ bio_copy_data(dst, src); ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ src_len = dst_len; ++ } ++ ++ BUG_ON(!src_len || !dst_len); ++ ++ if (bch2_csum_type_is_encryption(op->csum_type)) { ++ if (bversion_zero(version)) { ++ version.lo = atomic64_inc_return(&c->key_version); ++ } else { ++ crc.nonce = op->nonce; ++ op->nonce += src_len >> 9; ++ } ++ } ++ ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ !crc_is_compressed(crc) && ++ bch2_csum_type_is_encryption(op->crc.csum_type) == ++ bch2_csum_type_is_encryption(op->csum_type)) { ++ /* ++ * Note: when we're using rechecksum(), we need to be ++ * checksumming @src because it has all the data our ++ * existing checksum covers - if we bounced (because we ++ * were trying to compress), @dst will only have the ++ * part of the data the new checksum will cover. ++ * ++ * But normally we want to be checksumming post bounce, ++ * because part of the reason for bouncing is so the ++ * data can't be modified (by userspace) while it's in ++ * flight. ++ */ ++ if (bch2_rechecksum_bio(c, src, version, op->crc, ++ &crc, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->csum_type)) ++ goto csum_err; ++ } else { ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_rechecksum_bio(c, src, version, op->crc, ++ NULL, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->crc.csum_type)) ++ goto csum_err; ++ ++ crc.compressed_size = dst_len >> 9; ++ crc.uncompressed_size = src_len >> 9; ++ crc.live_size = src_len >> 9; ++ ++ swap(dst->bi_iter.bi_size, dst_len); ++ bch2_encrypt_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ crc.csum = bch2_checksum_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ crc.csum_type = op->csum_type; ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ init_append_extent(op, wp, version, crc); ++ ++ if (dst != src) ++ bio_advance(dst, dst_len); ++ bio_advance(src, src_len); ++ total_output += dst_len; ++ total_input += src_len; ++ } while (dst->bi_iter.bi_size && ++ src->bi_iter.bi_size && ++ wp->sectors_free && ++ !bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)); ++ ++ more = src->bi_iter.bi_size != 0; ++ ++ dst->bi_iter = saved_iter; ++ ++ if (dst == src && more) { ++ BUG_ON(total_output != total_input); ++ ++ dst = bio_split(src, total_input >> 9, ++ GFP_NOIO, &c->bio_write); ++ wbio_init(dst)->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ dst->bi_opf = src->bi_opf; ++ } ++ ++ dst->bi_iter.bi_size = total_output; ++do_write: ++ /* might have done a realloc... 
*/ ++ bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); ++ ++ *_dst = dst; ++ return more; ++csum_err: ++ bch_err(c, "error verifying existing checksum while " ++ "rewriting existing data (memory corruption?)"); ++ ret = -EIO; ++err: ++ if (to_wbio(dst)->bounce) ++ bch2_bio_free_pages_pool(c, dst); ++ if (to_wbio(dst)->put_bio) ++ bio_put(dst); ++ ++ return ret; ++} ++ ++static void __bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ struct write_point *wp; ++ struct bio *bio; ++ bool skip_put = true; ++ int ret; ++again: ++ memset(&op->failed, 0, sizeof(op->failed)); ++ ++ do { ++ struct bkey_i *key_to_write; ++ unsigned key_to_write_offset = op->insert_keys.top_p - ++ op->insert_keys.keys_p; ++ ++ /* +1 for possible cache device: */ ++ if (op->open_buckets.nr + op->nr_replicas + 1 > ++ ARRAY_SIZE(op->open_buckets.v)) ++ goto flush_io; ++ ++ if (bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)) ++ goto flush_io; ++ ++ if ((op->flags & BCH_WRITE_FROM_INTERNAL) && ++ percpu_ref_is_dying(&c->writes)) { ++ ret = -EROFS; ++ goto err; ++ } ++ ++ wp = bch2_alloc_sectors_start(c, ++ op->target, ++ op->opts.erasure_code, ++ op->write_point, ++ &op->devs_have, ++ op->nr_replicas, ++ op->nr_replicas_required, ++ op->alloc_reserve, ++ op->flags, ++ (op->flags & BCH_WRITE_ALLOC_NOWAIT) ? NULL : cl); ++ EBUG_ON(!wp); ++ ++ if (unlikely(IS_ERR(wp))) { ++ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { ++ ret = PTR_ERR(wp); ++ goto err; ++ } ++ ++ goto flush_io; ++ } ++ ++ bch2_open_bucket_get(c, wp, &op->open_buckets); ++ ret = bch2_write_extent(op, wp, &bio); ++ bch2_alloc_sectors_done(c, wp); ++ ++ if (ret < 0) ++ goto err; ++ ++ if (ret) { ++ skip_put = false; ++ } else { ++ /* ++ * for the skip_put optimization this has to be set ++ * before we submit the bio: ++ */ ++ op->flags |= BCH_WRITE_DONE; ++ } ++ ++ bio->bi_end_io = bch2_write_endio; ++ bio->bi_private = &op->cl; ++ bio->bi_opf |= REQ_OP_WRITE; ++ ++ if (!skip_put) ++ closure_get(bio->bi_private); ++ else ++ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; ++ ++ key_to_write = (void *) (op->insert_keys.keys_p + ++ key_to_write_offset); ++ ++ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_USER, ++ key_to_write); ++ } while (ret); ++ ++ if (!skip_put) ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ return; ++err: ++ op->error = ret; ++ op->flags |= BCH_WRITE_DONE; ++ ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ return; ++flush_io: ++ /* ++ * If the write can't all be submitted at once, we generally want to ++ * block synchronously as that signals backpressure to the caller. 
++ * ++ * However, if we're running out of a workqueue, we can't block here ++ * because we'll be blocking other work items from completing: ++ */ ++ if (current->flags & PF_WQ_WORKER) { ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ return; ++ } ++ ++ closure_sync(cl); ++ ++ if (!bch2_keylist_empty(&op->insert_keys)) { ++ __bch2_write_index(op); ++ ++ if (op->error) { ++ op->flags |= BCH_WRITE_DONE; ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ return; ++ } ++ } ++ ++ goto again; ++} ++ ++static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) ++{ ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->wbio.bio; ++ struct bvec_iter iter; ++ struct bkey_i_inline_data *id; ++ unsigned sectors; ++ int ret; ++ ++ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ++ ++ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_U64s + DIV_ROUND_UP(data_len, 8)); ++ if (ret) { ++ op->error = ret; ++ goto err; ++ } ++ ++ sectors = bio_sectors(bio); ++ op->pos.offset += sectors; ++ ++ id = bkey_inline_data_init(op->insert_keys.top); ++ id->k.p = op->pos; ++ id->k.version = op->version; ++ id->k.size = sectors; ++ ++ iter = bio->bi_iter; ++ iter.bi_size = data_len; ++ memcpy_from_bio(id->v.data, bio, iter); ++ ++ while (data_len & 7) ++ id->v.data[data_len++] = '\0'; ++ set_bkey_val_bytes(&id->k, data_len); ++ bch2_keylist_push(&op->insert_keys); ++ ++ op->flags |= BCH_WRITE_WROTE_DATA_INLINE; ++ op->flags |= BCH_WRITE_DONE; ++ ++ continue_at_nobarrier(cl, bch2_write_index, NULL); ++ return; ++err: ++ bch2_write_done(&op->cl); ++} ++ ++/** ++ * bch_write - handle a write to a cache device or flash only volume ++ * ++ * This is the starting point for any data to end up in a cache device; it could ++ * be from a normal write, or a writeback write, or a write to a flash only ++ * volume - it's also used by the moving garbage collector to compact data in ++ * mostly empty buckets. ++ * ++ * It first writes the data to the cache, creating a list of keys to be inserted ++ * (if the data won't fit in a single open bucket, there will be multiple keys); ++ * after the data is written it calls bch_journal, and after the keys have been ++ * added to the next journal write they're inserted into the btree. ++ * ++ * If op->discard is true, instead of inserting the data it invalidates the ++ * region of the cache represented by op->bio and op->inode. 
++ */ ++void bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bio *bio = &op->wbio.bio; ++ struct bch_fs *c = op->c; ++ unsigned data_len; ++ ++ BUG_ON(!op->nr_replicas); ++ BUG_ON(!op->write_point.v); ++ BUG_ON(!bkey_cmp(op->pos, POS_MAX)); ++ ++ op->start_time = local_clock(); ++ bch2_keylist_init(&op->insert_keys, op->inline_keys); ++ wbio_init(bio)->put_bio = false; ++ ++ if (bio_sectors(bio) & (c->opts.block_size - 1)) { ++ __bcache_io_error(c, "misaligned write"); ++ op->error = -EIO; ++ goto err; ++ } ++ ++ if (c->opts.nochanges || ++ !percpu_ref_tryget(&c->writes)) { ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ __bcache_io_error(c, "read only"); ++ op->error = -EROFS; ++ goto err; ++ } ++ ++ /* ++ * Can't ratelimit copygc - we'd deadlock: ++ */ ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ down(&c->io_in_flight); ++ ++ bch2_increment_clock(c, bio_sectors(bio), WRITE); ++ ++ data_len = min_t(u64, bio->bi_iter.bi_size, ++ op->new_i_size - (op->pos.offset << 9)); ++ ++ if (c->opts.inline_data && ++ data_len <= min(block_bytes(c) / 2, 1024U)) { ++ bch2_write_data_inline(op, data_len); ++ return; ++ } ++ ++ continue_at_nobarrier(cl, __bch2_write, NULL); ++ return; ++err: ++ bch2_disk_reservation_put(c, &op->res); ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); ++ op->end_io(op); ++ } else { ++ closure_return(cl); ++ } ++} ++ ++/* Cache promotion on read */ ++ ++struct promote_op { ++ struct closure cl; ++ struct rcu_head rcu; ++ u64 start_time; ++ ++ struct rhash_head hash; ++ struct bpos pos; ++ ++ struct migrate_write write; ++ struct bio_vec bi_inline_vecs[0]; /* must be last */ ++}; ++ ++static const struct rhashtable_params bch_promote_params = { ++ .head_offset = offsetof(struct promote_op, hash), ++ .key_offset = offsetof(struct promote_op, pos), ++ .key_len = sizeof(struct bpos), ++}; ++ ++static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, ++ struct bpos pos, ++ struct bch_io_opts opts, ++ unsigned flags) ++{ ++ if (!(flags & BCH_READ_MAY_PROMOTE)) ++ return false; ++ ++ if (!opts.promote_target) ++ return false; ++ ++ if (bch2_bkey_has_target(c, k, opts.promote_target)) ++ return false; ++ ++ if (bch2_target_congested(c, opts.promote_target)) { ++ /* XXX trace this */ ++ return false; ++ } ++ ++ if (rhashtable_lookup_fast(&c->promote_table, &pos, ++ bch_promote_params)) ++ return false; ++ ++ return true; ++} ++ ++static void promote_free(struct bch_fs *c, struct promote_op *op) ++{ ++ int ret; ++ ++ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params); ++ BUG_ON(ret); ++ percpu_ref_put(&c->writes); ++ kfree_rcu(op, rcu); ++} ++ ++static void promote_done(struct closure *cl) ++{ ++ struct promote_op *op = ++ container_of(cl, struct promote_op, cl); ++ struct bch_fs *c = op->write.op.c; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], ++ op->start_time); ++ ++ bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); ++ promote_free(c, op); ++} ++ ++static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) ++{ ++ struct bch_fs *c = rbio->c; ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->write.op.wbio.bio; ++ ++ trace_promote(&rbio->bio); ++ ++ /* we now own pages: */ ++ BUG_ON(!rbio->bounce); ++ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); ++ ++ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, ++ sizeof(struct bio_vec) * rbio->bio.bi_vcnt); ++ swap(bio->bi_vcnt, rbio->bio.bi_vcnt); ++ ++ 
bch2_migrate_read_done(&op->write, rbio); ++ ++ closure_init(cl, NULL); ++ closure_call(&op->write.op.cl, bch2_write, c->wq, cl); ++ closure_return_with_destructor(cl, promote_done); ++} ++ ++static struct promote_op *__promote_alloc(struct bch_fs *c, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ struct bpos pos, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned sectors, ++ struct bch_read_bio **rbio) ++{ ++ struct promote_op *op = NULL; ++ struct bio *bio; ++ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ int ret; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return NULL; ++ ++ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); ++ if (!op) ++ goto err; ++ ++ op->start_time = local_clock(); ++ op->pos = pos; ++ ++ /* ++ * We don't use the mempool here because extents that aren't ++ * checksummed or compressed can be too big for the mempool: ++ */ ++ *rbio = kzalloc(sizeof(struct bch_read_bio) + ++ sizeof(struct bio_vec) * pages, ++ GFP_NOIO); ++ if (!*rbio) ++ goto err; ++ ++ rbio_init(&(*rbio)->bio, opts); ++ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); ++ ++ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, ++ GFP_NOIO)) ++ goto err; ++ ++ (*rbio)->bounce = true; ++ (*rbio)->split = true; ++ (*rbio)->kmalloc = true; ++ ++ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, ++ bch_promote_params)) ++ goto err; ++ ++ bio = &op->write.op.wbio.bio; ++ bio_init(bio, bio->bi_inline_vecs, pages); ++ ++ ret = bch2_migrate_write_init(c, &op->write, ++ writepoint_hashed((unsigned long) current), ++ opts, ++ DATA_PROMOTE, ++ (struct data_opts) { ++ .target = opts.promote_target ++ }, ++ btree_id, k); ++ BUG_ON(ret); ++ ++ return op; ++err: ++ if (*rbio) ++ bio_free_pages(&(*rbio)->bio); ++ kfree(*rbio); ++ *rbio = NULL; ++ kfree(op); ++ percpu_ref_put(&c->writes); ++ return NULL; ++} ++ ++noinline ++static struct promote_op *promote_alloc(struct bch_fs *c, ++ struct bvec_iter iter, ++ struct bkey_s_c k, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned flags, ++ struct bch_read_bio **rbio, ++ bool *bounce, ++ bool *read_full) ++{ ++ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); ++ /* data might have to be decompressed in the write path: */ ++ unsigned sectors = promote_full ++ ? max(pick->crc.compressed_size, pick->crc.live_size) ++ : bvec_iter_sectors(iter); ++ struct bpos pos = promote_full ++ ? bkey_start_pos(k.k) ++ : POS(k.k->p.inode, iter.bi_sector); ++ struct promote_op *promote; ++ ++ if (!should_promote(c, k, pos, opts, flags)) ++ return NULL; ++ ++ promote = __promote_alloc(c, ++ k.k->type == KEY_TYPE_reflink_v ++ ? BTREE_ID_REFLINK ++ : BTREE_ID_EXTENTS, ++ k, pos, pick, opts, sectors, rbio); ++ if (!promote) ++ return NULL; ++ ++ *bounce = true; ++ *read_full = promote_full; ++ return promote; ++} ++ ++/* Read */ ++ ++#define READ_RETRY_AVOID 1 ++#define READ_RETRY 2 ++#define READ_ERR 3 ++ ++enum rbio_context { ++ RBIO_CONTEXT_NULL, ++ RBIO_CONTEXT_HIGHPRI, ++ RBIO_CONTEXT_UNBOUND, ++}; ++ ++static inline struct bch_read_bio * ++bch2_rbio_parent(struct bch_read_bio *rbio) ++{ ++ return rbio->split ? 
rbio->parent : rbio; ++} ++ ++__always_inline ++static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, ++ enum rbio_context context, ++ struct workqueue_struct *wq) ++{ ++ if (context <= rbio->context) { ++ fn(&rbio->work); ++ } else { ++ rbio->work.func = fn; ++ rbio->context = context; ++ queue_work(wq, &rbio->work); ++ } ++} ++ ++static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) ++{ ++ BUG_ON(rbio->bounce && !rbio->split); ++ ++ if (rbio->promote) ++ promote_free(rbio->c, rbio->promote); ++ rbio->promote = NULL; ++ ++ if (rbio->bounce) ++ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); ++ ++ if (rbio->split) { ++ struct bch_read_bio *parent = rbio->parent; ++ ++ if (rbio->kmalloc) ++ kfree(rbio); ++ else ++ bio_put(&rbio->bio); ++ ++ rbio = parent; ++ } ++ ++ return rbio; ++} ++ ++/* ++ * Only called on a top level bch_read_bio to complete an entire read request, ++ * not a split: ++ */ ++static void bch2_rbio_done(struct bch_read_bio *rbio) ++{ ++ if (rbio->start_time) ++ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], ++ rbio->start_time); ++ bio_endio(&rbio->bio); ++} ++ ++static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, u64 inode, ++ struct bch_io_failures *failed, ++ unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_on_stack sk; ++ struct bkey_s_c k; ++ int ret; ++ ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ rbio->pos, BTREE_ITER_SLOTS); ++retry: ++ rbio->bio.bi_status = 0; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k)) ++ goto err; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ bch2_trans_unlock(&trans); ++ ++ if (!bch2_bkey_matches_ptr(c, k, ++ rbio->pick.ptr, ++ rbio->pos.offset - ++ rbio->pick.crc.offset)) { ++ /* extent we wanted to read no longer exists: */ ++ rbio->hole = true; ++ goto out; ++ } ++ ++ ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); ++ if (ret == READ_RETRY) ++ goto retry; ++ if (ret) ++ goto err; ++out: ++ bch2_rbio_done(rbio); ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ return; ++err: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ goto out; ++} ++ ++static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, u64 inode, ++ struct bch_io_failures *failed, unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_on_stack sk; ++ struct bkey_s_c k; ++ int ret; ++ ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode, bvec_iter.bi_sector), ++ BTREE_ITER_SLOTS, k, ret) { ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &sk); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bch2_trans_unlock(&trans); ++ ++ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; ++ swap(bvec_iter.bi_size, bytes); ++ ++ ret = 
__bch2_read_extent(c, rbio, bvec_iter, k, ++ offset_into_extent, failed, flags); ++ switch (ret) { ++ case READ_RETRY: ++ goto retry; ++ case READ_ERR: ++ goto err; ++ }; ++ ++ if (bytes == bvec_iter.bi_size) ++ goto out; ++ ++ swap(bvec_iter.bi_size, bytes); ++ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); ++ } ++ ++ if (ret == -EINTR) ++ goto retry; ++ /* ++ * If we get here, it better have been because there was an error ++ * reading a btree node ++ */ ++ BUG_ON(!ret); ++ __bcache_io_error(c, "btree IO error: %i", ret); ++err: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++out: ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ bch2_rbio_done(rbio); ++} ++ ++static void bch2_rbio_retry(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bvec_iter iter = rbio->bvec_iter; ++ unsigned flags = rbio->flags; ++ u64 inode = rbio->pos.inode; ++ struct bch_io_failures failed = { .nr = 0 }; ++ ++ trace_read_retry(&rbio->bio); ++ ++ if (rbio->retry == READ_RETRY_AVOID) ++ bch2_mark_io_failure(&failed, &rbio->pick); ++ ++ rbio->bio.bi_status = 0; ++ ++ rbio = bch2_rbio_free(rbio); ++ ++ flags |= BCH_READ_IN_RETRY; ++ flags &= ~BCH_READ_MAY_PROMOTE; ++ ++ if (flags & BCH_READ_NODECODE) ++ bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); ++ else ++ bch2_read_retry(c, rbio, iter, inode, &failed, flags); ++} ++ ++static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, ++ blk_status_t error) ++{ ++ rbio->retry = retry; ++ ++ if (rbio->flags & BCH_READ_IN_RETRY) ++ return; ++ ++ if (retry == READ_ERR) { ++ rbio = bch2_rbio_free(rbio); ++ ++ rbio->bio.bi_status = error; ++ bch2_rbio_done(rbio); ++ } else { ++ bch2_rbio_punt(rbio, bch2_rbio_retry, ++ RBIO_CONTEXT_UNBOUND, system_unbound_wq); ++ } ++} ++ ++static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, ++ struct bch_read_bio *rbio) ++{ ++ struct bch_fs *c = rbio->c; ++ u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; ++ struct bch_extent_crc_unpacked new_crc; ++ struct btree_iter *iter = NULL; ++ struct bkey_i *new; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ if (crc_is_compressed(rbio->pick.crc)) ++ return 0; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ if ((ret = PTR_ERR_OR_ZERO(iter))) ++ goto out; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ if ((ret = bkey_err(k))) ++ goto out; ++ ++ /* ++ * going to be temporarily appending another checksum entry: ++ */ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + ++ BKEY_EXTENT_U64s_MAX * 8); ++ if ((ret = PTR_ERR_OR_ZERO(new))) ++ goto out; ++ ++ bkey_reassemble(new, k); ++ k = bkey_i_to_s_c(new); ++ ++ if (bversion_cmp(k.k->version, rbio->version) || ++ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) ++ goto out; ++ ++ /* Extent was merged? 
*/ ++ if (bkey_start_offset(k.k) < data_offset || ++ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) ++ goto out; ++ ++ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, ++ rbio->pick.crc, NULL, &new_crc, ++ bkey_start_offset(k.k) - data_offset, k.k->size, ++ rbio->pick.crc.csum_type)) { ++ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); ++ ret = 0; ++ goto out; ++ } ++ ++ if (!bch2_bkey_narrow_crcs(new, new_crc)) ++ goto out; ++ ++ bch2_trans_update(trans, iter, new, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) ++{ ++ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ __bch2_rbio_narrow_crcs(&trans, rbio)); ++} ++ ++/* Inner part that may run in process context */ ++static void __bch2_read_endio(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct bio *src = &rbio->bio; ++ struct bio *dst = &bch2_rbio_parent(rbio)->bio; ++ struct bvec_iter dst_iter = rbio->bvec_iter; ++ struct bch_extent_crc_unpacked crc = rbio->pick.crc; ++ struct nonce nonce = extent_nonce(rbio->version, crc); ++ struct bch_csum csum; ++ ++ /* Reset iterator for checksumming and copying bounced data: */ ++ if (rbio->bounce) { ++ src->bi_iter.bi_size = crc.compressed_size << 9; ++ src->bi_iter.bi_idx = 0; ++ src->bi_iter.bi_bvec_done = 0; ++ } else { ++ src->bi_iter = rbio->bvec_iter; ++ } ++ ++ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); ++ if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) ++ goto csum_err; ++ ++ if (unlikely(rbio->narrow_crcs)) ++ bch2_rbio_narrow_crcs(rbio); ++ ++ if (rbio->flags & BCH_READ_NODECODE) ++ goto nodecode; ++ ++ /* Adjust crc to point to subset of data we want: */ ++ crc.offset += rbio->offset_into_extent; ++ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); ++ ++ if (crc_is_compressed(crc)) { ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) ++ goto decompression_err; ++ } else { ++ /* don't need to decrypt the entire bio: */ ++ nonce = nonce_add(nonce, crc.offset << 9); ++ bio_advance(src, crc.offset << 9); ++ ++ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); ++ src->bi_iter.bi_size = dst_iter.bi_size; ++ ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ++ if (rbio->bounce) { ++ struct bvec_iter src_iter = src->bi_iter; ++ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); ++ } ++ } ++ ++ if (rbio->promote) { ++ /* ++ * Re encrypt data we decrypted, so it's consistent with ++ * rbio->crc: ++ */ ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ promote_start(rbio->promote, rbio); ++ rbio->promote = NULL; ++ } ++nodecode: ++ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { ++ rbio = bch2_rbio_free(rbio); ++ bch2_rbio_done(rbio); ++ } ++ return; ++csum_err: ++ /* ++ * Checksum error: if the bio wasn't bounced, we may have been ++ * reading into buffers owned by userspace (that userspace can ++ * scribble over) - retry the read, bouncing it this time: ++ */ ++ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { ++ rbio->flags |= BCH_READ_MUST_BOUNCE; ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); ++ return; ++ } ++ ++ bch2_dev_io_error(ca, ++ "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", ++ rbio->pos.inode, (u64) 
rbio->bvec_iter.bi_sector, ++ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, ++ csum.hi, csum.lo, crc.csum_type); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ return; ++decompression_err: ++ __bcache_io_error(c, "decompression error, inode %llu offset %llu", ++ rbio->pos.inode, ++ (u64) rbio->bvec_iter.bi_sector); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ return; ++} ++ ++static void bch2_read_endio(struct bio *bio) ++{ ++ struct bch_read_bio *rbio = ++ container_of(bio, struct bch_read_bio, bio); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct workqueue_struct *wq = NULL; ++ enum rbio_context context = RBIO_CONTEXT_NULL; ++ ++ if (rbio->have_ioref) { ++ bch2_latency_acct(ca, rbio->submit_time, READ); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (!rbio->split) ++ rbio->bio.bi_end_io = rbio->end_io; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", ++ blk_status_to_str(bio->bi_status))) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); ++ return; ++ } ++ ++ if (rbio->pick.ptr.cached && ++ (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ ptr_stale(ca, &rbio->pick.ptr))) { ++ atomic_long_inc(&c->read_realloc_races); ++ ++ if (rbio->flags & BCH_READ_RETRY_IF_STALE) ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); ++ else ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); ++ return; ++ } ++ ++ if (rbio->narrow_crcs || ++ crc_is_compressed(rbio->pick.crc) || ++ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) ++ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; ++ else if (rbio->pick.crc.csum_type) ++ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; ++ ++ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); ++} ++ ++int __bch2_read_indirect_extent(struct btree_trans *trans, ++ unsigned *offset_into_extent, ++ struct bkey_on_stack *orig_k) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 reflink_offset; ++ int ret; ++ ++ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + ++ *offset_into_extent; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, ++ POS(0, reflink_offset), ++ BTREE_ITER_SLOTS); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_reflink_v) { ++ __bcache_io_error(trans->c, ++ "pointer to nonexistent indirect extent"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); ++ bkey_on_stack_reassemble(orig_k, trans->c, k); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, ++ struct bvec_iter iter, struct bkey_s_c k, ++ unsigned offset_into_extent, ++ struct bch_io_failures *failed, unsigned flags) ++{ ++ struct extent_ptr_decoded pick; ++ struct bch_read_bio *rbio = NULL; ++ struct bch_dev *ca; ++ struct promote_op *promote = NULL; ++ bool bounce = false, read_full = false, narrow_crcs = false; ++ struct bpos pos = bkey_start_pos(k.k); ++ int pick_ret; ++ ++ if (k.k->type == KEY_TYPE_inline_data) { ++ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); ++ unsigned bytes = min_t(unsigned, iter.bi_size, ++ bkey_val_bytes(d.k)); ++ ++ swap(iter.bi_size, bytes); ++ memcpy_to_bio(&orig->bio, iter, d.v->data); ++ swap(iter.bi_size, bytes); ++ bio_advance_iter(&orig->bio, &iter, bytes); ++ zero_fill_bio_iter(&orig->bio, iter); ++ 
goto out_read_done; ++ } ++ ++ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); ++ ++ /* hole or reservation - just zero fill: */ ++ if (!pick_ret) ++ goto hole; ++ ++ if (pick_ret < 0) { ++ __bcache_io_error(c, "no device to read from"); ++ goto err; ++ } ++ ++ if (pick_ret > 0) ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ if (flags & BCH_READ_NODECODE) { ++ /* ++ * can happen if we retry, and the extent we were going to read ++ * has been merged in the meantime: ++ */ ++ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) ++ goto hole; ++ ++ iter.bi_size = pick.crc.compressed_size << 9; ++ goto get_bio; ++ } ++ ++ if (!(flags & BCH_READ_LAST_FRAGMENT) || ++ bio_flagged(&orig->bio, BIO_CHAIN)) ++ flags |= BCH_READ_MUST_CLONE; ++ ++ narrow_crcs = !(flags & BCH_READ_IN_RETRY) && ++ bch2_can_narrow_extent_crcs(k, pick.crc); ++ ++ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) ++ flags |= BCH_READ_MUST_BOUNCE; ++ ++ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); ++ ++ if (crc_is_compressed(pick.crc) || ++ (pick.crc.csum_type != BCH_CSUM_NONE && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ (bch2_csum_type_is_encryption(pick.crc.csum_type) && ++ (flags & BCH_READ_USER_MAPPED)) || ++ (flags & BCH_READ_MUST_BOUNCE)))) { ++ read_full = true; ++ bounce = true; ++ } ++ ++ if (orig->opts.promote_target) ++ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, ++ &rbio, &bounce, &read_full); ++ ++ if (!read_full) { ++ EBUG_ON(crc_is_compressed(pick.crc)); ++ EBUG_ON(pick.crc.csum_type && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ bvec_iter_sectors(iter) != pick.crc.live_size || ++ pick.crc.offset || ++ offset_into_extent)); ++ ++ pos.offset += offset_into_extent; ++ pick.ptr.offset += pick.crc.offset + ++ offset_into_extent; ++ offset_into_extent = 0; ++ pick.crc.compressed_size = bvec_iter_sectors(iter); ++ pick.crc.uncompressed_size = bvec_iter_sectors(iter); ++ pick.crc.offset = 0; ++ pick.crc.live_size = bvec_iter_sectors(iter); ++ offset_into_extent = 0; ++ } ++get_bio: ++ if (rbio) { ++ /* ++ * promote already allocated bounce rbio: ++ * promote needs to allocate a bio big enough for uncompressing ++ * data in the write path, but we're not going to use it all ++ * here: ++ */ ++ EBUG_ON(rbio->bio.bi_iter.bi_size < ++ pick.crc.compressed_size << 9); ++ rbio->bio.bi_iter.bi_size = ++ pick.crc.compressed_size << 9; ++ } else if (bounce) { ++ unsigned sectors = pick.crc.compressed_size; ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, ++ DIV_ROUND_UP(sectors, PAGE_SECTORS), ++ &c->bio_read_split), ++ orig->opts); ++ ++ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); ++ rbio->bounce = true; ++ rbio->split = true; ++ } else if (flags & BCH_READ_MUST_CLONE) { ++ /* ++ * Have to clone if there were any splits, due to error ++ * reporting issues (if a split errored, and retrying didn't ++ * work, when it reports the error to its parent (us) we don't ++ * know if the error was from our bio, and we should retry, or ++ * from the whole bio, in which case we don't want to retry and ++ * lose the error) ++ */ ++ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, ++ &c->bio_read_split), ++ orig->opts); ++ rbio->bio.bi_iter = iter; ++ rbio->split = true; ++ } else { ++ rbio = orig; ++ rbio->bio.bi_iter = iter; ++ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); ++ } ++ ++ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); ++ ++ rbio->c = c; ++ rbio->submit_time = local_clock(); ++ if 
(rbio->split) ++ rbio->parent = orig; ++ else ++ rbio->end_io = orig->bio.bi_end_io; ++ rbio->bvec_iter = iter; ++ rbio->offset_into_extent= offset_into_extent; ++ rbio->flags = flags; ++ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); ++ rbio->narrow_crcs = narrow_crcs; ++ rbio->hole = 0; ++ rbio->retry = 0; ++ rbio->context = 0; ++ /* XXX: only initialize this if needed */ ++ rbio->devs_have = bch2_bkey_devs(k); ++ rbio->pick = pick; ++ rbio->pos = pos; ++ rbio->version = k.k->version; ++ rbio->promote = promote; ++ INIT_WORK(&rbio->work, NULL); ++ ++ rbio->bio.bi_opf = orig->bio.bi_opf; ++ rbio->bio.bi_iter.bi_sector = pick.ptr.offset; ++ rbio->bio.bi_end_io = bch2_read_endio; ++ ++ if (rbio->bounce) ++ trace_read_bounce(&rbio->bio); ++ ++ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); ++ ++ rcu_read_lock(); ++ bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); ++ rcu_read_unlock(); ++ ++ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { ++ bio_inc_remaining(&orig->bio); ++ trace_read_split(&orig->bio); ++ } ++ ++ if (!rbio->pick.idx) { ++ if (!rbio->have_ioref) { ++ __bcache_io_error(c, "no device to read from"); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_USER], ++ bio_sectors(&rbio->bio)); ++ bio_set_dev(&rbio->bio, ca->disk_sb.bdev); ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ submit_bio(&rbio->bio); ++ else ++ submit_bio_wait(&rbio->bio); ++ } else { ++ /* Attempting reconstruct read: */ ++ if (bch2_ec_read_extent(c, rbio)) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ bio_endio(&rbio->bio); ++ } ++out: ++ if (likely(!(flags & BCH_READ_IN_RETRY))) { ++ return 0; ++ } else { ++ int ret; ++ ++ rbio->context = RBIO_CONTEXT_UNBOUND; ++ bch2_read_endio(&rbio->bio); ++ ++ ret = rbio->retry; ++ rbio = bch2_rbio_free(rbio); ++ ++ if (ret == READ_RETRY_AVOID) { ++ bch2_mark_io_failure(failed, &pick); ++ ret = READ_RETRY; ++ } ++ ++ return ret; ++ } ++ ++err: ++ if (flags & BCH_READ_IN_RETRY) ++ return READ_ERR; ++ ++ orig->bio.bi_status = BLK_STS_IOERR; ++ goto out_read_done; ++ ++hole: ++ /* ++ * won't normally happen in the BCH_READ_NODECODE ++ * (bch2_move_extent()) path, but if we retry and the extent we wanted ++ * to read no longer exists we have to signal that: ++ */ ++ if (flags & BCH_READ_NODECODE) ++ orig->hole = true; ++ ++ zero_fill_bio_iter(&orig->bio, iter); ++out_read_done: ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ bch2_rbio_done(orig); ++ return 0; ++} ++ ++void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_on_stack sk; ++ struct bkey_s_c k; ++ unsigned flags = BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE| ++ BCH_READ_USER_MAPPED; ++ int ret; ++ ++ BUG_ON(rbio->_state); ++ BUG_ON(flags & BCH_READ_NODECODE); ++ BUG_ON(flags & BCH_READ_IN_RETRY); ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inode, rbio->bio.bi_iter.bi_sector), ++ BTREE_ITER_SLOTS); ++ while (1) { ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bch2_btree_iter_set_pos(iter, ++ POS(inode, rbio->bio.bi_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto 
err; ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &sk); ++ if (ret) ++ goto err; ++ ++ /* ++ * With indirect extents, the amount of data to read is the min ++ * of the original extent and the indirect extent: ++ */ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ /* ++ * Unlock the iterator while the btree node's lock is still in ++ * cache, before doing the IO: ++ */ ++ bch2_trans_unlock(&trans); ++ ++ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ ++ if (rbio->bio.bi_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ bch2_read_extent(c, rbio, k, offset_into_extent, flags); ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ bio_advance(&rbio->bio, bytes); ++ } ++out: ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ return; ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); ++ bch2_rbio_done(rbio); ++ goto out; ++} ++ ++void bch2_fs_io_exit(struct bch_fs *c) ++{ ++ if (c->promote_table.tbl) ++ rhashtable_destroy(&c->promote_table); ++ mempool_exit(&c->bio_bounce_pages); ++ bioset_exit(&c->bio_write); ++ bioset_exit(&c->bio_read_split); ++ bioset_exit(&c->bio_read); ++} ++ ++int bch2_fs_io_init(struct bch_fs *c) ++{ ++ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), ++ BIOSET_NEED_BVECS) || ++ mempool_init_page_pool(&c->bio_bounce_pages, ++ max_t(unsigned, ++ c->opts.btree_node_size, ++ c->sb.encoded_extent_max) / ++ PAGE_SECTORS, 0) || ++ rhashtable_init(&c->promote_table, &bch_promote_params)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +new file mode 100644 +index 000000000000..0ad293bd6295 +--- /dev/null ++++ b/fs/bcachefs/io.h +@@ -0,0 +1,167 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IO_H ++#define _BCACHEFS_IO_H ++ ++#include "checksum.h" ++#include "bkey_on_stack.h" ++#include "io_types.h" ++ ++#define to_wbio(_bio) \ ++ container_of((_bio), struct bch_write_bio, bio) ++ ++#define to_rbio(_bio) \ ++ container_of((_bio), struct bch_read_bio, bio) ++ ++void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); ++void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); ++ ++void bch2_latency_acct(struct bch_dev *, u64, int); ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, ++ enum bch_data_type, const struct bkey_i *); ++ ++#define BLK_STS_REMOVED ((__force blk_status_t)128) ++ ++enum bch_write_flags { ++ BCH_WRITE_ALLOC_NOWAIT = (1 << 0), ++ BCH_WRITE_CACHED = (1 << 1), ++ BCH_WRITE_FLUSH = (1 << 2), ++ BCH_WRITE_DATA_ENCODED = (1 << 3), ++ BCH_WRITE_PAGES_STABLE = (1 << 4), ++ BCH_WRITE_PAGES_OWNED = (1 << 5), ++ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), ++ BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), ++ BCH_WRITE_FROM_INTERNAL = (1 << 8), ++ ++ /* Internal: */ ++ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), ++ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), ++ BCH_WRITE_DONE = (1 << 11), ++}; ++ ++static inline u64 *op_journal_seq(struct bch_write_op *op) ++{ ++ return 
(op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) ++ ? op->journal_seq_p : &op->journal_seq; ++} ++ ++static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) ++{ ++ op->journal_seq_p = journal_seq; ++ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; ++} ++ ++static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) ++{ ++ return op->alloc_reserve == RESERVE_MOVINGGC ++ ? op->c->copygc_wq ++ : op->c->wq; ++} ++ ++int bch2_extent_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct disk_reservation *, ++ u64 *, u64, s64 *); ++int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64 *, s64 *); ++int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); ++ ++int bch2_write_index_default(struct bch_write_op *); ++ ++static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, ++ struct bch_io_opts opts) ++{ ++ op->c = c; ++ op->end_io = NULL; ++ op->flags = 0; ++ op->written = 0; ++ op->error = 0; ++ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); ++ op->compression_type = bch2_compression_opt_to_type[opts.compression]; ++ op->nr_replicas = 0; ++ op->nr_replicas_required = c->opts.data_replicas_required; ++ op->alloc_reserve = RESERVE_NONE; ++ op->incompressible = 0; ++ op->open_buckets.nr = 0; ++ op->devs_have.nr = 0; ++ op->target = 0; ++ op->opts = opts; ++ op->pos = POS_MAX; ++ op->version = ZERO_VERSION; ++ op->write_point = (struct write_point_specifier) { 0 }; ++ op->res = (struct disk_reservation) { 0 }; ++ op->journal_seq = 0; ++ op->new_i_size = U64_MAX; ++ op->i_sectors_delta = 0; ++ op->index_update_fn = bch2_write_index_default; ++} ++ ++void bch2_write(struct closure *); ++ ++static inline struct bch_write_bio *wbio_init(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ ++ memset(wbio, 0, offsetof(struct bch_write_bio, bio)); ++ return wbio; ++} ++ ++struct bch_devs_mask; ++struct cache_promote_op; ++struct extent_ptr_decoded; ++ ++int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, ++ struct bkey_on_stack *); ++ ++static inline int bch2_read_indirect_extent(struct btree_trans *trans, ++ unsigned *offset_into_extent, ++ struct bkey_on_stack *k) ++{ ++ return k->k->k.type == KEY_TYPE_reflink_p ++ ? 
__bch2_read_indirect_extent(trans, offset_into_extent, k) ++ : 0; ++} ++ ++enum bch_read_flags { ++ BCH_READ_RETRY_IF_STALE = 1 << 0, ++ BCH_READ_MAY_PROMOTE = 1 << 1, ++ BCH_READ_USER_MAPPED = 1 << 2, ++ BCH_READ_NODECODE = 1 << 3, ++ BCH_READ_LAST_FRAGMENT = 1 << 4, ++ ++ /* internal: */ ++ BCH_READ_MUST_BOUNCE = 1 << 5, ++ BCH_READ_MUST_CLONE = 1 << 6, ++ BCH_READ_IN_RETRY = 1 << 7, ++}; ++ ++int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, ++ struct bvec_iter, struct bkey_s_c, unsigned, ++ struct bch_io_failures *, unsigned); ++ ++static inline void bch2_read_extent(struct bch_fs *c, ++ struct bch_read_bio *rbio, ++ struct bkey_s_c k, ++ unsigned offset_into_extent, ++ unsigned flags) ++{ ++ __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, ++ offset_into_extent, NULL, flags); ++} ++ ++void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); ++ ++static inline struct bch_read_bio *rbio_init(struct bio *bio, ++ struct bch_io_opts opts) ++{ ++ struct bch_read_bio *rbio = to_rbio(bio); ++ ++ rbio->_state = 0; ++ rbio->promote = NULL; ++ rbio->opts = opts; ++ return rbio; ++} ++ ++void bch2_fs_io_exit(struct bch_fs *); ++int bch2_fs_io_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_IO_H */ +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +new file mode 100644 +index 000000000000..684e4c9a5d98 +--- /dev/null ++++ b/fs/bcachefs/io_types.h +@@ -0,0 +1,149 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IO_TYPES_H ++#define _BCACHEFS_IO_TYPES_H ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "extents_types.h" ++#include "keylist_types.h" ++#include "opts.h" ++#include "super_types.h" ++ ++#include ++#include ++ ++struct bch_read_bio { ++ struct bch_fs *c; ++ u64 start_time; ++ u64 submit_time; ++ ++ /* ++ * Reads will often have to be split, and if the extent being read from ++ * was checksummed or compressed we'll also have to allocate bounce ++ * buffers and copy the data back into the original bio. ++ * ++ * If we didn't have to split, we have to save and restore the original ++ * bi_end_io - @split below indicates which: ++ */ ++ union { ++ struct bch_read_bio *parent; ++ bio_end_io_t *end_io; ++ }; ++ ++ /* ++ * Saved copy of bio->bi_iter, from submission time - allows us to ++ * resubmit on IO error, and also to copy data back to the original bio ++ * when we're bouncing: ++ */ ++ struct bvec_iter bvec_iter; ++ ++ unsigned offset_into_extent; ++ ++ u16 flags; ++ union { ++ struct { ++ u16 bounce:1, ++ split:1, ++ kmalloc:1, ++ have_ioref:1, ++ narrow_crcs:1, ++ hole:1, ++ retry:2, ++ context:2; ++ }; ++ u16 _state; ++ }; ++ ++ struct bch_devs_list devs_have; ++ ++ struct extent_ptr_decoded pick; ++ /* start pos of data we read (may not be pos of data we want) */ ++ struct bpos pos; ++ struct bversion version; ++ ++ struct promote_op *promote; ++ ++ struct bch_io_opts opts; ++ ++ struct work_struct work; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_bio { ++ struct bch_fs *c; ++ struct bch_write_bio *parent; ++ ++ u64 submit_time; ++ ++ struct bch_devs_list failed; ++ u8 order; ++ u8 dev; ++ ++ unsigned split:1, ++ bounce:1, ++ put_bio:1, ++ have_ioref:1, ++ used_mempool:1; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_op { ++ struct closure cl; ++ struct bch_fs *c; ++ void (*end_io)(struct bch_write_op *); ++ u64 start_time; ++ ++ unsigned written; /* sectors */ ++ u16 flags; ++ s16 error; /* dio write path expects it to hold -ERESTARTSYS... 
*/ ++ ++ unsigned csum_type:4; ++ unsigned compression_type:4; ++ unsigned nr_replicas:4; ++ unsigned nr_replicas_required:4; ++ unsigned alloc_reserve:3; ++ unsigned incompressible:1; ++ ++ struct bch_devs_list devs_have; ++ u16 target; ++ u16 nonce; ++ struct bch_io_opts opts; ++ ++ struct bpos pos; ++ struct bversion version; ++ ++ /* For BCH_WRITE_DATA_ENCODED: */ ++ struct bch_extent_crc_unpacked crc; ++ ++ struct write_point_specifier write_point; ++ ++ struct disk_reservation res; ++ ++ struct open_buckets open_buckets; ++ ++ /* ++ * If caller wants to flush but hasn't passed us a journal_seq ptr, we ++ * still need to stash the journal_seq somewhere: ++ */ ++ union { ++ u64 *journal_seq_p; ++ u64 journal_seq; ++ }; ++ u64 new_i_size; ++ s64 i_sectors_delta; ++ ++ int (*index_update_fn)(struct bch_write_op *); ++ ++ struct bch_devs_mask failed; ++ ++ struct keylist insert_keys; ++ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; ++ ++ /* Must be last: */ ++ struct bch_write_bio wbio; ++}; ++ ++#endif /* _BCACHEFS_IO_TYPES_H */ +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +new file mode 100644 +index 000000000000..b4f7b61ba9ac +--- /dev/null ++++ b/fs/bcachefs/journal.c +@@ -0,0 +1,1254 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs journalling code, for btree insertions ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++#include ++ ++static bool __journal_entry_is_open(union journal_res_state state) ++{ ++ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; ++} ++ ++static bool journal_entry_is_open(struct journal *j) ++{ ++ return __journal_entry_is_open(j->reservations); ++} ++ ++static void journal_pin_new_entry(struct journal *j, int count) ++{ ++ struct journal_entry_pin_list *p; ++ ++ /* ++ * The fifo_push() needs to happen at the same time as j->seq is ++ * incremented for journal_last_seq() to be calculated correctly ++ */ ++ atomic64_inc(&j->seq); ++ p = fifo_push_ref(&j->pin); ++ ++ INIT_LIST_HEAD(&p->list); ++ INIT_LIST_HEAD(&p->flushed); ++ atomic_set(&p->count, count); ++ p->devs.nr = 0; ++} ++ ++static void bch2_journal_buf_init(struct journal *j) ++{ ++ struct journal_buf *buf = journal_cur_buf(j); ++ ++ memset(buf->has_inode, 0, sizeof(buf->has_inode)); ++ ++ memset(buf->data, 0, sizeof(*buf->data)); ++ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ++ buf->data->u64s = 0; ++} ++ ++void bch2_journal_halt(struct journal *j) ++{ ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ ++ do { ++ old.v = new.v = v; ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) ++ return; ++ ++ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ journal_wake(j); ++ closure_wake_up(&journal_cur_buf(j)->wait); ++} ++ ++/* journal entry close/open: */ ++ ++void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) ++{ ++ if (!need_write_just_set && ++ test_bit(JOURNAL_NEED_WRITE, &j->flags)) ++ bch2_time_stats_update(j->delay_time, ++ j->need_write_time); ++ ++ clear_bit(JOURNAL_NEED_WRITE, &j->flags); ++ ++ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); ++} ++ ++/* ++ * Returns true if journal entry is now closed: ++ */ 
++static bool __journal_entry_close(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf = journal_cur_buf(j); ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ bool set_need_write = false; ++ unsigned sectors; ++ ++ lockdep_assert_held(&j->lock); ++ ++ do { ++ old.v = new.v = v; ++ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) ++ return true; ++ ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { ++ /* this entry will never be written: */ ++ closure_wake_up(&buf->wait); ++ return true; ++ } ++ ++ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { ++ set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ j->need_write_time = local_clock(); ++ set_need_write = true; ++ } ++ ++ if (new.prev_buf_unwritten) ++ return false; ++ ++ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; ++ new.idx++; ++ new.prev_buf_unwritten = 1; ++ ++ BUG_ON(journal_state_count(new, new.idx)); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); ++ ++ sectors = vstruct_blocks_plus(buf->data, c->block_bits, ++ buf->u64s_reserved) << c->block_bits; ++ BUG_ON(sectors > buf->sectors); ++ buf->sectors = sectors; ++ ++ bkey_extent_init(&buf->key); ++ ++ /* ++ * We have to set last_seq here, _before_ opening a new journal entry: ++ * ++ * A threads may replace an old pin with a new pin on their current ++ * journal reservation - the expectation being that the journal will ++ * contain either what the old pin protected or what the new pin ++ * protects. ++ * ++ * After the old pin is dropped journal_last_seq() won't include the old ++ * pin, so we can only write the updated last_seq on the entry that ++ * contains whatever the new pin protects. ++ * ++ * Restated, we can _not_ update last_seq for a given entry if there ++ * could be a newer entry open with reservations/pins that have been ++ * taken against it. 
++ * ++ * Hence, we want update/set last_seq on the current journal entry right ++ * before we open a new one: ++ */ ++ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); ++ ++ if (journal_entry_empty(buf->data)) ++ clear_bit(JOURNAL_NOT_EMPTY, &j->flags); ++ else ++ set_bit(JOURNAL_NOT_EMPTY, &j->flags); ++ ++ journal_pin_new_entry(j, 1); ++ ++ bch2_journal_buf_init(j); ++ ++ cancel_delayed_work(&j->write_work); ++ ++ bch2_journal_space_available(j); ++ ++ bch2_journal_buf_put(j, old.idx, set_need_write); ++ return true; ++} ++ ++static bool journal_entry_close(struct journal *j) ++{ ++ bool ret; ++ ++ spin_lock(&j->lock); ++ ret = __journal_entry_close(j); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* ++ * should _only_ called from journal_res_get() - when we actually want a ++ * journal reservation - journal entry is open means journal is dirty: ++ * ++ * returns: ++ * 0: success ++ * -ENOSPC: journal currently full, must invoke reclaim ++ * -EAGAIN: journal blocked, must wait ++ * -EROFS: insufficient rw devices or journal error ++ */ ++static int journal_entry_open(struct journal *j) ++{ ++ struct journal_buf *buf = journal_cur_buf(j); ++ union journal_res_state old, new; ++ int u64s; ++ u64 v; ++ ++ lockdep_assert_held(&j->lock); ++ BUG_ON(journal_entry_is_open(j)); ++ ++ if (j->blocked) ++ return -EAGAIN; ++ ++ if (j->cur_entry_error) ++ return j->cur_entry_error; ++ ++ BUG_ON(!j->cur_entry_sectors); ++ ++ buf->u64s_reserved = j->entry_u64s_reserved; ++ buf->disk_sectors = j->cur_entry_sectors; ++ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); ++ ++ u64s = (int) (buf->sectors << 9) / sizeof(u64) - ++ journal_entry_overhead(j); ++ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); ++ ++ if (u64s <= le32_to_cpu(buf->data->u64s)) ++ return -ENOSPC; ++ ++ /* ++ * Must be set before marking the journal entry as open: ++ */ ++ j->cur_entry_u64s = u64s; ++ ++ v = atomic64_read(&j->reservations.counter); ++ do { ++ old.v = new.v = v; ++ ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) ++ return -EROFS; ++ ++ /* Handle any already added entries */ ++ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); ++ ++ EBUG_ON(journal_state_count(new, new.idx)); ++ journal_state_inc(&new); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ if (j->res_get_blocked_start) ++ bch2_time_stats_update(j->blocked_time, ++ j->res_get_blocked_start); ++ j->res_get_blocked_start = 0; ++ ++ mod_delayed_work(system_freezable_wq, ++ &j->write_work, ++ msecs_to_jiffies(j->write_delay_ms)); ++ journal_wake(j); ++ return 0; ++} ++ ++static bool journal_quiesced(struct journal *j) ++{ ++ union journal_res_state state = READ_ONCE(j->reservations); ++ bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); ++ ++ if (!ret) ++ journal_entry_close(j); ++ return ret; ++} ++ ++static void journal_quiesce(struct journal *j) ++{ ++ wait_event(j->wait, journal_quiesced(j)); ++} ++ ++static void journal_write_work(struct work_struct *work) ++{ ++ struct journal *j = container_of(work, struct journal, write_work.work); ++ ++ journal_entry_close(j); ++} ++ ++/* ++ * Given an inode number, if that inode number has data in the journal that ++ * hasn't yet been flushed, return the journal sequence number that needs to be ++ * flushed: ++ */ ++u64 bch2_inode_journal_seq(struct journal *j, u64 inode) ++{ ++ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); ++ u64 seq = 0; ++ ++ if (!test_bit(h, j->buf[0].has_inode) && 
++ !test_bit(h, j->buf[1].has_inode)) ++ return 0; ++ ++ spin_lock(&j->lock); ++ if (test_bit(h, journal_cur_buf(j)->has_inode)) ++ seq = journal_cur_seq(j); ++ else if (test_bit(h, journal_prev_buf(j)->has_inode)) ++ seq = journal_cur_seq(j) - 1; ++ spin_unlock(&j->lock); ++ ++ return seq; ++} ++ ++static int __journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf; ++ bool can_discard; ++ int ret; ++retry: ++ if (journal_res_get_fast(j, res, flags)) ++ return 0; ++ ++ if (bch2_journal_error(j)) ++ return -EROFS; ++ ++ spin_lock(&j->lock); ++ ++ /* ++ * Recheck after taking the lock, so we don't race with another thread ++ * that just did journal_entry_open() and call journal_entry_close() ++ * unnecessarily ++ */ ++ if (journal_res_get_fast(j, res, flags)) { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ ++ if (!(flags & JOURNAL_RES_GET_RESERVED) && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ /* ++ * Don't want to close current journal entry, just need to ++ * invoke reclaim: ++ */ ++ ret = -ENOSPC; ++ goto unlock; ++ } ++ ++ /* ++ * If we couldn't get a reservation because the current buf filled up, ++ * and we had room for a bigger entry on disk, signal that we want to ++ * realloc the journal bufs: ++ */ ++ buf = journal_cur_buf(j); ++ if (journal_entry_is_open(j) && ++ buf->buf_size >> 9 < buf->disk_sectors && ++ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) ++ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); ++ ++ if (journal_entry_is_open(j) && ++ !__journal_entry_close(j)) { ++ /* ++ * We failed to get a reservation on the current open journal ++ * entry because it's full, and we can't close it because ++ * there's still a previous one in flight: ++ */ ++ trace_journal_entry_full(c); ++ ret = -EAGAIN; ++ } else { ++ ret = journal_entry_open(j); ++ } ++unlock: ++ if ((ret == -EAGAIN || ret == -ENOSPC) && ++ !j->res_get_blocked_start) ++ j->res_get_blocked_start = local_clock() ?: 1; ++ ++ can_discard = j->can_discard; ++ spin_unlock(&j->lock); ++ ++ if (!ret) ++ goto retry; ++ ++ if (ret == -ENOSPC) { ++ WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), ++ "JOURNAL_RES_GET_RESERVED set but journal full"); ++ ++ /* ++ * Journal is full - can't rely on reclaim from work item due to ++ * freezing: ++ */ ++ trace_journal_full(c); ++ ++ if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { ++ if (can_discard) { ++ bch2_journal_do_discards(j); ++ goto retry; ++ } ++ ++ if (mutex_trylock(&j->reclaim_lock)) { ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++ } ++ } ++ ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Essentially the entry function to the journaling code. When bcachefs is doing ++ * a btree insert, it calls this function to get the current journal write. ++ * Journal write is the structure used set up journal writes. The calling ++ * function will then add its keys to the structure, queuing them for the next ++ * write. ++ * ++ * To ensure forward progress, the current task must not be holding any ++ * btree node write locks. 
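 *
 * A simplified caller looks roughly like the following - real callers go
 * through the btree update path and do more bookkeeping, and btree_id/k
 * here just stand in for the caller's btree and key:
 *
 *	struct journal_res res = { 0 };
 *	int ret;
 *
 *	ret = bch2_journal_res_get(j, &res, jset_u64s(k->k.u64s), 0);
 *	if (ret)
 *		return ret;
 *
 *	bch2_journal_add_keys(j, &res, btree_id, k);
 *	bch2_journal_res_put(j, &res);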
++ */ ++int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->async_wait, ++ (ret = __journal_res_get(j, res, flags)) != -EAGAIN || ++ (flags & JOURNAL_RES_GET_NONBLOCK)); ++ return ret; ++} ++ ++/* journal_preres: */ ++ ++static bool journal_preres_available(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); ++ ++ if (!ret) ++ bch2_journal_reclaim_work(&j->reclaim_work.work); ++ ++ return ret; ++} ++ ++int __bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->preres_wait, ++ (ret = bch2_journal_error(j)) || ++ journal_preres_available(j, res, new_u64s, flags)); ++ return ret; ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *j, ++ struct journal_entry_res *res, ++ unsigned new_u64s) ++{ ++ union journal_res_state state; ++ int d = new_u64s - res->u64s; ++ ++ spin_lock(&j->lock); ++ ++ j->entry_u64s_reserved += d; ++ if (d <= 0) ++ goto out; ++ ++ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); ++ smp_mb(); ++ state = READ_ONCE(j->reservations); ++ ++ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && ++ state.cur_entry_offset > j->cur_entry_u64s) { ++ j->cur_entry_u64s += d; ++ /* ++ * Not enough room in current journal entry, have to flush it: ++ */ ++ __journal_entry_close(j); ++ } else { ++ journal_cur_buf(j)->u64s_reserved += d; ++ } ++out: ++ spin_unlock(&j->lock); ++ res->u64s += d; ++} ++ ++/* journal flushing: */ ++ ++u64 bch2_journal_last_unwritten_seq(struct journal *j) ++{ ++ u64 seq; ++ ++ spin_lock(&j->lock); ++ seq = journal_cur_seq(j); ++ if (j->reservations.prev_buf_unwritten) ++ seq--; ++ spin_unlock(&j->lock); ++ ++ return seq; ++} ++ ++/** ++ * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't ++ * open yet, or wait if we cannot ++ * ++ * used by the btree interior update machinery, when it needs to write a new ++ * btree root - every journal entry contains the roots of all the btrees, so it ++ * doesn't need to bother with getting a journal reservation ++ */ ++int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ int ret; ++ ++ spin_lock(&j->lock); ++ ++ /* ++ * Can't try to open more than one sequence number ahead: ++ */ ++ BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); ++ ++ if (journal_cur_seq(j) > seq || ++ journal_entry_is_open(j)) { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ ++ if (journal_cur_seq(j) < seq && ++ !__journal_entry_close(j)) { ++ /* haven't finished writing out the previous one: */ ++ trace_journal_entry_full(c); ++ ret = -EAGAIN; ++ } else { ++ BUG_ON(journal_cur_seq(j) != seq); ++ ++ ret = journal_entry_open(j); ++ } ++ ++ if ((ret == -EAGAIN || ret == -ENOSPC) && ++ !j->res_get_blocked_start) ++ j->res_get_blocked_start = local_clock() ?: 1; ++ ++ if (ret == -EAGAIN || ret == -ENOSPC) ++ closure_wait(&j->async_wait, cl); ++ ++ spin_unlock(&j->lock); ++ ++ if (ret == -ENOSPC) { ++ trace_journal_full(c); ++ bch2_journal_reclaim_work(&j->reclaim_work.work); ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++ ++static int journal_seq_error(struct journal *j, u64 seq) ++{ ++ union journal_res_state state = READ_ONCE(j->reservations); ++ ++ if (seq == 
journal_cur_seq(j)) ++ return bch2_journal_error(j); ++ ++ if (seq + 1 == journal_cur_seq(j) && ++ !state.prev_buf_unwritten && ++ seq > j->seq_ondisk) ++ return -EIO; ++ ++ return 0; ++} ++ ++static inline struct journal_buf * ++journal_seq_to_buf(struct journal *j, u64 seq) ++{ ++ /* seq should be for a journal entry that has been opened: */ ++ BUG_ON(seq > journal_cur_seq(j)); ++ BUG_ON(seq == journal_cur_seq(j) && ++ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); ++ ++ if (seq == journal_cur_seq(j)) ++ return journal_cur_buf(j); ++ if (seq + 1 == journal_cur_seq(j) && ++ j->reservations.prev_buf_unwritten) ++ return journal_prev_buf(j); ++ return NULL; ++} ++ ++/** ++ * bch2_journal_wait_on_seq - wait for a journal entry to be written ++ * ++ * does _not_ cause @seq to be written immediately - if there is no other ++ * activity to cause the relevant journal entry to be filled up or flushed it ++ * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is ++ * configurable). ++ */ ++void bch2_journal_wait_on_seq(struct journal *j, u64 seq, ++ struct closure *parent) ++{ ++ struct journal_buf *buf; ++ ++ spin_lock(&j->lock); ++ ++ if ((buf = journal_seq_to_buf(j, seq))) { ++ if (!closure_wait(&buf->wait, parent)) ++ BUG(); ++ ++ if (seq == journal_cur_seq(j)) { ++ smp_mb(); ++ if (bch2_journal_error(j)) ++ closure_wake_up(&buf->wait); ++ } ++ } ++ ++ spin_unlock(&j->lock); ++} ++ ++/** ++ * bch2_journal_flush_seq_async - wait for a journal entry to be written ++ * ++ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if ++ * necessary ++ */ ++void bch2_journal_flush_seq_async(struct journal *j, u64 seq, ++ struct closure *parent) ++{ ++ struct journal_buf *buf; ++ ++ spin_lock(&j->lock); ++ ++ if (parent && ++ (buf = journal_seq_to_buf(j, seq))) ++ if (!closure_wait(&buf->wait, parent)) ++ BUG(); ++ ++ if (seq == journal_cur_seq(j)) ++ __journal_entry_close(j); ++ spin_unlock(&j->lock); ++} ++ ++static int journal_seq_flushed(struct journal *j, u64 seq) ++{ ++ int ret; ++ ++ spin_lock(&j->lock); ++ ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); ++ ++ if (seq == journal_cur_seq(j)) ++ __journal_entry_close(j); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++int bch2_journal_flush_seq(struct journal *j, u64 seq) ++{ ++ u64 start_time = local_clock(); ++ int ret, ret2; ++ ++ ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); ++ ++ bch2_time_stats_update(j->flush_seq_time, start_time); ++ ++ return ret ?: ret2 < 0 ? 
ret2 : 0; ++} ++ ++/** ++ * bch2_journal_meta_async - force a journal entry to be written ++ */ ++void bch2_journal_meta_async(struct journal *j, struct closure *parent) ++{ ++ struct journal_res res; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ bch2_journal_res_put(j, &res); ++ ++ bch2_journal_flush_seq_async(j, res.seq, parent); ++} ++ ++int bch2_journal_meta(struct journal *j) ++{ ++ struct journal_res res; ++ int ret; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ if (ret) ++ return ret; ++ ++ bch2_journal_res_put(j, &res); ++ ++ return bch2_journal_flush_seq(j, res.seq); ++} ++ ++/* ++ * bch2_journal_flush_async - if there is an open journal entry, or a journal ++ * still being written, write it and wait for the write to complete ++ */ ++void bch2_journal_flush_async(struct journal *j, struct closure *parent) ++{ ++ u64 seq, journal_seq; ++ ++ spin_lock(&j->lock); ++ journal_seq = journal_cur_seq(j); ++ ++ if (journal_entry_is_open(j)) { ++ seq = journal_seq; ++ } else if (journal_seq) { ++ seq = journal_seq - 1; ++ } else { ++ spin_unlock(&j->lock); ++ return; ++ } ++ spin_unlock(&j->lock); ++ ++ bch2_journal_flush_seq_async(j, seq, parent); ++} ++ ++int bch2_journal_flush(struct journal *j) ++{ ++ u64 seq, journal_seq; ++ ++ spin_lock(&j->lock); ++ journal_seq = journal_cur_seq(j); ++ ++ if (journal_entry_is_open(j)) { ++ seq = journal_seq; ++ } else if (journal_seq) { ++ seq = journal_seq - 1; ++ } else { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ spin_unlock(&j->lock); ++ ++ return bch2_journal_flush_seq(j, seq); ++} ++ ++/* block/unlock the journal: */ ++ ++void bch2_journal_unblock(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked--; ++ spin_unlock(&j->lock); ++ ++ journal_wake(j); ++} ++ ++void bch2_journal_block(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked++; ++ spin_unlock(&j->lock); ++ ++ journal_quiesce(j); ++} ++ ++/* allocate journal on a device: */ ++ ++static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ++ bool new_fs, struct closure *cl) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal *journal_buckets; ++ u64 *new_bucket_seq = NULL, *new_buckets = NULL; ++ int ret = 0; ++ ++ /* don't handle reducing nr of buckets yet: */ ++ if (nr <= ja->nr) ++ return 0; ++ ++ ret = -ENOMEM; ++ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ if (!new_buckets || !new_bucket_seq) ++ goto err; ++ ++ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, ++ nr + sizeof(*journal_buckets) / sizeof(u64)); ++ if (!journal_buckets) ++ goto err; ++ ++ /* ++ * We may be called from the device add path, before the new device has ++ * actually been added to the running filesystem: ++ */ ++ if (c) ++ spin_lock(&c->journal.lock); ++ ++ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); ++ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ++ if (c) ++ spin_unlock(&c->journal.lock); ++ ++ while (ja->nr < nr) { ++ struct open_bucket *ob = NULL; ++ unsigned pos; ++ long bucket; ++ ++ if (new_fs) { ++ bucket = bch2_bucket_alloc_new_fs(ca); ++ if (bucket < 0) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ } else { ++ ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, ++ false, cl); ++ if (IS_ERR(ob)) { ++ ret = cl ? 
-EAGAIN : -ENOSPC; ++ goto err; ++ } ++ ++ bucket = sector_to_bucket(ca, ob->ptr.offset); ++ } ++ ++ if (c) { ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->journal.lock); ++ } ++ ++ pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; ++ __array_insert_item(ja->buckets, ja->nr, pos); ++ __array_insert_item(ja->bucket_seq, ja->nr, pos); ++ __array_insert_item(journal_buckets->buckets, ja->nr, pos); ++ ja->nr++; ++ ++ ja->buckets[pos] = bucket; ++ ja->bucket_seq[pos] = 0; ++ journal_buckets->buckets[pos] = cpu_to_le64(bucket); ++ ++ if (pos <= ja->discard_idx) ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ if (pos <= ja->dirty_idx_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ if (pos <= ja->dirty_idx) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ if (pos <= ja->cur_idx) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ++ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_JOURNAL, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), ++ 0); ++ ++ if (c) { ++ spin_unlock(&c->journal.lock); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ if (!new_fs) ++ bch2_open_bucket_put(c, ob); ++ } ++ ++ ret = 0; ++err: ++ kfree(new_bucket_seq); ++ kfree(new_buckets); ++ ++ return ret; ++} ++ ++/* ++ * Allocate more journal space at runtime - not currently making use if it, but ++ * the code works: ++ */ ++int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, ++ unsigned nr) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct closure cl; ++ unsigned current_nr; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ struct disk_reservation disk_res = { 0, 0 }; ++ ++ closure_sync(&cl); ++ ++ mutex_lock(&c->sb_lock); ++ current_nr = ja->nr; ++ ++ /* ++ * note: journal buckets aren't really counted as _sectors_ used yet, so ++ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c ++ * when space used goes up without a reservation - but we do need the ++ * reservation to ensure we'll actually be able to allocate: ++ */ ++ ++ if (bch2_disk_reservation_get(c, &disk_res, ++ bucket_to_sector(ca, nr - ja->nr), 1, 0)) { ++ mutex_unlock(&c->sb_lock); ++ return -ENOSPC; ++ } ++ ++ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (ja->nr != current_nr) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } while (ret == -EAGAIN); ++ ++ return ret; ++} ++ ++int bch2_dev_journal_alloc(struct bch_dev *ca) ++{ ++ unsigned nr; ++ ++ if (dynamic_fault("bcachefs:add:journal_alloc")) ++ return -ENOMEM; ++ ++ /* ++ * clamp journal size to 1024 buckets or 512MB (in sectors), whichever ++ * is smaller: ++ */ ++ nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, ++ BCH_JOURNAL_BUCKETS_MIN, ++ min(1 << 10, ++ (1 << 20) / ca->mi.bucket_size)); ++ ++ return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); ++} ++ ++/* startup/shutdown: */ ++ ++static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) ++{ ++ union journal_res_state state; ++ struct journal_buf *w; ++ bool ret; ++ ++ spin_lock(&j->lock); ++ state = READ_ONCE(j->reservations); ++ w = j->buf + !state.idx; ++ ++ ret = state.prev_buf_unwritten && ++ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) ++{ ++ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); ++} ++ ++void bch2_fs_journal_stop(struct journal *j) ++{ ++ bch2_journal_flush_all_pins(j); ++ ++ wait_event(j->wait, 
journal_entry_close(j)); ++ ++ /* do we need to write another journal entry? */ ++ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) ++ bch2_journal_meta(j); ++ ++ journal_quiesce(j); ++ ++ BUG_ON(!bch2_journal_error(j) && ++ test_bit(JOURNAL_NOT_EMPTY, &j->flags)); ++ ++ cancel_delayed_work_sync(&j->write_work); ++ cancel_delayed_work_sync(&j->reclaim_work); ++} ++ ++int bch2_fs_journal_start(struct journal *j, u64 cur_seq, ++ struct list_head *journal_entries) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ struct journal_replay *i; ++ u64 last_seq = cur_seq, nr, seq; ++ ++ if (!list_empty(journal_entries)) ++ last_seq = le64_to_cpu(list_last_entry(journal_entries, ++ struct journal_replay, list)->j.last_seq); ++ ++ nr = cur_seq - last_seq; ++ ++ if (nr + 1 > j->pin.size) { ++ free_fifo(&j->pin); ++ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); ++ if (!j->pin.data) { ++ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); ++ return -ENOMEM; ++ } ++ } ++ ++ j->replay_journal_seq = last_seq; ++ j->replay_journal_seq_end = cur_seq; ++ j->last_seq_ondisk = last_seq; ++ j->pin.front = last_seq; ++ j->pin.back = cur_seq; ++ atomic64_set(&j->seq, cur_seq - 1); ++ ++ fifo_for_each_entry_ptr(p, &j->pin, seq) { ++ INIT_LIST_HEAD(&p->list); ++ INIT_LIST_HEAD(&p->flushed); ++ atomic_set(&p->count, 1); ++ p->devs.nr = 0; ++ } ++ ++ list_for_each_entry(i, journal_entries, list) { ++ seq = le64_to_cpu(i->j.seq); ++ BUG_ON(seq >= cur_seq); ++ ++ if (seq < last_seq) ++ continue; ++ ++ journal_seq_pin(j, seq)->devs = i->devs; ++ } ++ ++ spin_lock(&j->lock); ++ ++ set_bit(JOURNAL_STARTED, &j->flags); ++ ++ journal_pin_new_entry(j, 1); ++ bch2_journal_buf_init(j); ++ ++ c->last_bucket_seq_cleanup = journal_cur_seq(j); ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ return 0; ++} ++ ++/* init/exit: */ ++ ++void bch2_dev_journal_exit(struct bch_dev *ca) ++{ ++ kfree(ca->journal.bio); ++ kfree(ca->journal.buckets); ++ kfree(ca->journal.bucket_seq); ++ ++ ca->journal.bio = NULL; ++ ca->journal.buckets = NULL; ++ ca->journal.bucket_seq = NULL; ++} ++ ++int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(sb); ++ unsigned i; ++ ++ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ ++ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->bucket_seq) ++ return -ENOMEM; ++ ++ ca->journal.bio = bio_kmalloc(GFP_KERNEL, ++ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); ++ if (!ca->journal.bio) ++ return -ENOMEM; ++ ++ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->buckets) ++ return -ENOMEM; ++ ++ for (i = 0; i < ja->nr; i++) ++ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ ++ return 0; ++} ++ ++void bch2_fs_journal_exit(struct journal *j) ++{ ++ kvpfree(j->buf[1].data, j->buf[1].buf_size); ++ kvpfree(j->buf[0].data, j->buf[0].buf_size); ++ free_fifo(&j->pin); ++} ++ ++int bch2_fs_journal_init(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ static struct lock_class_key res_key; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ spin_lock_init(&j->lock); ++ spin_lock_init(&j->err_lock); ++ init_waitqueue_head(&j->wait); ++ INIT_DELAYED_WORK(&j->write_work, journal_write_work); ++ INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); ++ 
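	/*
	 * write_work (above) closes the current entry once write_delay_ms has
	 * elapsed - the write itself is issued when the last buf reference is
	 * dropped - while reclaim_work frees journal space by discarding and
	 * flushing the oldest pinned entries.
	 */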
init_waitqueue_head(&j->pin_flush_wait); ++ mutex_init(&j->reclaim_lock); ++ mutex_init(&j->discard_lock); ++ ++ lockdep_init_map(&j->res_map, "journal res", &res_key, 0); ++ ++ j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->write_delay_ms = 1000; ++ j->reclaim_delay_ms = 100; ++ ++ /* Btree roots: */ ++ j->entry_u64s_reserved += ++ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); ++ ++ atomic64_set(&j->reservations.counter, ++ ((union journal_res_state) ++ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); ++ ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || ++ !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || ++ !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ j->pin.front = j->pin.back = 1; ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++/* debug: */ ++ ++ssize_t bch2_journal_print_debug(struct journal *j, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ union journal_res_state s; ++ struct bch_dev *ca; ++ unsigned iter; ++ ++ rcu_read_lock(); ++ spin_lock(&j->lock); ++ s = READ_ONCE(j->reservations); ++ ++ pr_buf(&out, ++ "active journal entries:\t%llu\n" ++ "seq:\t\t\t%llu\n" ++ "last_seq:\t\t%llu\n" ++ "last_seq_ondisk:\t%llu\n" ++ "prereserved:\t\t%u/%u\n" ++ "current entry sectors:\t%u\n" ++ "current entry:\t\t", ++ fifo_used(&j->pin), ++ journal_cur_seq(j), ++ journal_last_seq(j), ++ j->last_seq_ondisk, ++ j->prereserved.reserved, ++ j->prereserved.remaining, ++ j->cur_entry_sectors); ++ ++ switch (s.cur_entry_offset) { ++ case JOURNAL_ENTRY_ERROR_VAL: ++ pr_buf(&out, "error\n"); ++ break; ++ case JOURNAL_ENTRY_CLOSED_VAL: ++ pr_buf(&out, "closed\n"); ++ break; ++ default: ++ pr_buf(&out, "%u/%u\n", ++ s.cur_entry_offset, ++ j->cur_entry_u64s); ++ break; ++ } ++ ++ pr_buf(&out, ++ "current entry refs:\t%u\n" ++ "prev entry unwritten:\t", ++ journal_state_count(s, s.idx)); ++ ++ if (s.prev_buf_unwritten) ++ pr_buf(&out, "yes, ref %u sectors %u\n", ++ journal_state_count(s, !s.idx), ++ journal_prev_buf(j)->sectors); ++ else ++ pr_buf(&out, "no\n"); ++ ++ pr_buf(&out, ++ "need write:\t\t%i\n" ++ "replay done:\t\t%i\n", ++ test_bit(JOURNAL_NEED_WRITE, &j->flags), ++ test_bit(JOURNAL_REPLAY_DONE, &j->flags)); ++ ++ for_each_member_device_rcu(ca, c, iter, ++ &c->rw_devs[BCH_DATA_JOURNAL]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!ja->nr) ++ continue; ++ ++ pr_buf(&out, ++ "dev %u:\n" ++ "\tnr\t\t%u\n" ++ "\tavailable\t%u:%u\n" ++ "\tdiscard_idx\t\t%u\n" ++ "\tdirty_idx_ondisk\t%u (seq %llu)\n" ++ "\tdirty_idx\t\t%u (seq %llu)\n" ++ "\tcur_idx\t\t%u (seq %llu)\n", ++ iter, ja->nr, ++ bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ++ ja->sectors_free, ++ ja->discard_idx, ++ ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], ++ ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], ++ ja->cur_idx, ja->bucket_seq[ja->cur_idx]); ++ } ++ ++ spin_unlock(&j->lock); ++ rcu_read_unlock(); ++ ++ return out.pos - buf; ++} ++ ++ssize_t bch2_journal_print_pins(struct journal *j, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *pin; ++ u64 i; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(pin_list, &j->pin, i) { ++ pr_buf(&out, "%llu: count %u\n", ++ i, atomic_read(&pin_list->count)); ++ ++ list_for_each_entry(pin, 
&pin_list->list, list) ++ pr_buf(&out, "\t%px %ps\n", ++ pin, pin->flush); ++ ++ if (!list_empty(&pin_list->flushed)) ++ pr_buf(&out, "flushed:\n"); ++ ++ list_for_each_entry(pin, &pin_list->flushed, list) ++ pr_buf(&out, "\t%px %ps\n", ++ pin, pin->flush); ++ } ++ spin_unlock(&j->lock); ++ ++ return out.pos - buf; ++} +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +new file mode 100644 +index 000000000000..30de6d96188e +--- /dev/null ++++ b/fs/bcachefs/journal.h +@@ -0,0 +1,519 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_H ++#define _BCACHEFS_JOURNAL_H ++ ++/* ++ * THE JOURNAL: ++ * ++ * The primary purpose of the journal is to log updates (insertions) to the ++ * b-tree, to avoid having to do synchronous updates to the b-tree on disk. ++ * ++ * Without the journal, the b-tree is always internally consistent on ++ * disk - and in fact, in the earliest incarnations bcache didn't have a journal ++ * but did handle unclean shutdowns by doing all index updates synchronously ++ * (with coalescing). ++ * ++ * Updates to interior nodes still happen synchronously and without the journal ++ * (for simplicity) - this may change eventually but updates to interior nodes ++ * are rare enough it's not a huge priority. ++ * ++ * This means the journal is relatively separate from the b-tree; it consists of ++ * just a list of keys and journal replay consists of just redoing those ++ * insertions in same order that they appear in the journal. ++ * ++ * PERSISTENCE: ++ * ++ * For synchronous updates (where we're waiting on the index update to hit ++ * disk), the journal entry will be written out immediately (or as soon as ++ * possible, if the write for the previous journal entry was still in flight). ++ * ++ * Synchronous updates are specified by passing a closure (@flush_cl) to ++ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter ++ * down to the journalling code. That closure will will wait on the journal ++ * write to complete (via closure_wait()). ++ * ++ * If the index update wasn't synchronous, the journal entry will be ++ * written out after 10 ms have elapsed, by default (the delay_ms field ++ * in struct journal). ++ * ++ * JOURNAL ENTRIES: ++ * ++ * A journal entry is variable size (struct jset), it's got a fixed length ++ * header and then a variable number of struct jset_entry entries. ++ * ++ * Journal entries are identified by monotonically increasing 64 bit sequence ++ * numbers - jset->seq; other places in the code refer to this sequence number. ++ * ++ * A jset_entry entry contains one or more bkeys (which is what gets inserted ++ * into the b-tree). We need a container to indicate which b-tree the key is ++ * for; also, the roots of the various b-trees are stored in jset_entry entries ++ * (one for each b-tree) - this lets us add new b-tree types without changing ++ * the on disk format. ++ * ++ * We also keep some things in the journal header that are logically part of the ++ * superblock - all the things that are frequently updated. This is for future ++ * bcache on raw flash support; the superblock (which will become another ++ * journal) can't be moved or wear leveled, so it contains just enough ++ * information to find the main journal, and the superblock only has to be ++ * rewritten when we want to move/wear level the main journal. 
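 *
 * Putting the pieces together, a journal entry looks informally like this
 * (field names as they are used elsewhere in this patch, not the
 * authoritative on-disk definition):
 *
 *	struct jset {
 *		magic, version, seq, last_seq, u64s, csum, ...
 *		struct jset_entry { u64s, type, btree_id, level;
 *			keys (struct bkey_i) - what journal replay re-inserts
 *		}
 *		... more jset_entries: btree roots, usage, blacklists ...
 *	}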
++ * ++ * JOURNAL LAYOUT ON DISK: ++ * ++ * The journal is written to a ringbuffer of buckets (which is kept in the ++ * superblock); the individual buckets are not necessarily contiguous on disk ++ * which means that journal entries are not allowed to span buckets, but also ++ * that we can resize the journal at runtime if desired (unimplemented). ++ * ++ * The journal buckets exist in the same pool as all the other buckets that are ++ * managed by the allocator and garbage collection - garbage collection marks ++ * the journal buckets as metadata buckets. ++ * ++ * OPEN/DIRTY JOURNAL ENTRIES: ++ * ++ * Open/dirty journal entries are journal entries that contain b-tree updates ++ * that have not yet been written out to the b-tree on disk. We have to track ++ * which journal entries are dirty, and we also have to avoid wrapping around ++ * the journal and overwriting old but still dirty journal entries with new ++ * journal entries. ++ * ++ * On disk, this is represented with the "last_seq" field of struct jset; ++ * last_seq is the first sequence number that journal replay has to replay. ++ * ++ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in ++ * journal_device->seq) of for each journal bucket, the highest sequence number ++ * any journal entry it contains. Then, by comparing that against last_seq we ++ * can determine whether that journal bucket contains dirty journal entries or ++ * not. ++ * ++ * To track which journal entries are dirty, we maintain a fifo of refcounts ++ * (where each entry corresponds to a specific sequence number) - when a ref ++ * goes to 0, that journal entry is no longer dirty. ++ * ++ * Journalling of index updates is done at the same time as the b-tree itself is ++ * being modified (see btree_insert_key()); when we add the key to the journal ++ * the pending b-tree write takes a ref on the journal entry the key was added ++ * to. If a pending b-tree write would need to take refs on multiple dirty ++ * journal entries, it only keeps the ref on the oldest one (since a newer ++ * journal entry will still be replayed if an older entry was dirty). ++ * ++ * JOURNAL FILLING UP: ++ * ++ * There are two ways the journal could fill up; either we could run out of ++ * space to write to, or we could have too many open journal entries and run out ++ * of room in the fifo of refcounts. Since those refcounts are decremented ++ * without any locking we can't safely resize that fifo, so we handle it the ++ * same way. ++ * ++ * If the journal fills up, we start flushing dirty btree nodes until we can ++ * allocate space for a journal write again - preferentially flushing btree ++ * nodes that are pinning the oldest journal entries first. 
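 *
 * In the code this surfaces as the errors handled by __journal_res_get()
 * in journal.c: -ENOSPC means the journal is out of space and reclaim
 * (discards and/or bch2_journal_reclaim()) must run, -EAGAIN means the
 * journal is blocked or the previous entry is still in flight and the
 * caller must wait, and -EROFS means the journal hit an error or has too
 * few rw devices.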
++ */ ++ ++#include ++ ++#include "journal_types.h" ++ ++struct bch_fs; ++ ++static inline void journal_wake(struct journal *j) ++{ ++ wake_up(&j->wait); ++ closure_wake_up(&j->async_wait); ++ closure_wake_up(&j->preres_wait); ++} ++ ++static inline struct journal_buf *journal_cur_buf(struct journal *j) ++{ ++ return j->buf + j->reservations.idx; ++} ++ ++static inline struct journal_buf *journal_prev_buf(struct journal *j) ++{ ++ return j->buf + !j->reservations.idx; ++} ++ ++/* Sequence number of oldest dirty journal entry */ ++ ++static inline u64 journal_last_seq(struct journal *j) ++{ ++ return j->pin.front; ++} ++ ++static inline u64 journal_cur_seq(struct journal *j) ++{ ++ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); ++ ++ return j->pin.back - 1; ++} ++ ++u64 bch2_inode_journal_seq(struct journal *, u64); ++ ++static inline int journal_state_count(union journal_res_state s, int idx) ++{ ++ return idx == 0 ? s.buf0_count : s.buf1_count; ++} ++ ++static inline void journal_state_inc(union journal_res_state *s) ++{ ++ s->buf0_count += s->idx == 0; ++ s->buf1_count += s->idx == 1; ++} ++ ++static inline void bch2_journal_set_has_inode(struct journal *j, ++ struct journal_res *res, ++ u64 inum) ++{ ++ struct journal_buf *buf = &j->buf[res->idx]; ++ unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); ++ ++ /* avoid atomic op if possible */ ++ if (unlikely(!test_bit(bit, buf->has_inode))) ++ set_bit(bit, buf->has_inode); ++} ++ ++/* ++ * Amount of space that will be taken up by some keys in the journal (i.e. ++ * including the jset header) ++ */ ++static inline unsigned jset_u64s(unsigned u64s) ++{ ++ return u64s + sizeof(struct jset_entry) / sizeof(u64); ++} ++ ++static inline int journal_entry_overhead(struct journal *j) ++{ ++ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; ++} ++ ++static inline struct jset_entry * ++bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) ++{ ++ struct jset *jset = buf->data; ++ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); ++ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ ++ le32_add_cpu(&jset->u64s, jset_u64s(u64s)); ++ ++ return entry; ++} ++ ++static inline struct jset_entry * ++journal_res_entry(struct journal *j, struct journal_res *res) ++{ ++ return vstruct_idx(j->buf[res->idx].data, res->offset); ++} ++ ++static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, ++ enum btree_id id, unsigned level, ++ const void *data, unsigned u64s) ++{ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ entry->type = type; ++ entry->btree_id = id; ++ entry->level = level; ++ memcpy_u64s_small(entry->_data, data, u64s); ++ ++ return jset_u64s(u64s); ++} ++ ++static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, ++ unsigned type, enum btree_id id, ++ unsigned level, ++ const void *data, unsigned u64s) ++{ ++ unsigned actual = journal_entry_set(journal_res_entry(j, res), ++ type, id, level, data, u64s); ++ ++ EBUG_ON(!res->ref); ++ EBUG_ON(actual > res->u64s); ++ ++ res->offset += actual; ++ res->u64s -= actual; ++} ++ ++static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, ++ enum btree_id id, const struct bkey_i *k) ++{ ++ bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, ++ id, 0, k, k->k.u64s); ++} ++ ++static inline bool journal_entry_empty(struct jset *j) ++{ ++ struct jset_entry *i; ++ ++ if (j->seq != j->last_seq) ++ 
return false; ++ ++ vstruct_for_each(j, i) ++ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) ++ return false; ++ return true; ++} ++ ++void __bch2_journal_buf_put(struct journal *, bool); ++ ++static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, ++ bool need_write_just_set) ++{ ++ union journal_res_state s; ++ ++ s.v = atomic64_sub_return(((union journal_res_state) { ++ .buf0_count = idx == 0, ++ .buf1_count = idx == 1, ++ }).v, &j->reservations.counter); ++ if (!journal_state_count(s, idx)) { ++ EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); ++ __bch2_journal_buf_put(j, need_write_just_set); ++ } ++} ++ ++/* ++ * This function releases the journal write structure so other threads can ++ * then proceed to add their keys as well. ++ */ ++static inline void bch2_journal_res_put(struct journal *j, ++ struct journal_res *res) ++{ ++ if (!res->ref) ++ return; ++ ++ lock_release(&j->res_map, _THIS_IP_); ++ ++ while (res->u64s) ++ bch2_journal_add_entry(j, res, ++ BCH_JSET_ENTRY_btree_keys, ++ 0, 0, NULL, 0); ++ ++ bch2_journal_buf_put(j, res->idx, false); ++ ++ res->ref = 0; ++} ++ ++int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, ++ unsigned); ++ ++#define JOURNAL_RES_GET_NONBLOCK (1 << 0) ++#define JOURNAL_RES_GET_CHECK (1 << 1) ++#define JOURNAL_RES_GET_RESERVED (1 << 2) ++#define JOURNAL_RES_GET_RECLAIM (1 << 3) ++ ++static inline int journal_res_get_fast(struct journal *j, ++ struct journal_res *res, ++ unsigned flags) ++{ ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ ++ do { ++ old.v = new.v = v; ++ ++ /* ++ * Check if there is still room in the current journal ++ * entry: ++ */ ++ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) ++ return 0; ++ ++ EBUG_ON(!journal_state_count(new, new.idx)); ++ ++ if (!(flags & JOURNAL_RES_GET_RESERVED) && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_CHECK) ++ return 1; ++ ++ new.cur_entry_offset += res->u64s; ++ journal_state_inc(&new); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ res->ref = true; ++ res->idx = old.idx; ++ res->offset = old.cur_entry_offset; ++ res->seq = le64_to_cpu(j->buf[old.idx].data->seq); ++ return 1; ++} ++ ++static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned u64s, unsigned flags) ++{ ++ int ret; ++ ++ EBUG_ON(res->ref); ++ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ ++ res->u64s = u64s; ++ ++ if (journal_res_get_fast(j, res, flags)) ++ goto out; ++ ++ ret = bch2_journal_res_get_slowpath(j, res, flags); ++ if (ret) ++ return ret; ++out: ++ if (!(flags & JOURNAL_RES_GET_CHECK)) { ++ lock_acquire_shared(&j->res_map, 0, ++ (flags & JOURNAL_RES_GET_NONBLOCK) != 0, ++ NULL, _THIS_IP_); ++ EBUG_ON(!res->ref); ++ } ++ return 0; ++} ++ ++/* journal_preres: */ ++ ++static inline bool journal_check_may_get_unreserved(struct journal *j) ++{ ++ union journal_preres_state s = READ_ONCE(j->prereserved); ++ bool ret = s.reserved <= s.remaining && ++ fifo_free(&j->pin) > 8; ++ ++ lockdep_assert_held(&j->lock); ++ ++ if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ if (ret) { ++ set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); ++ journal_wake(j); ++ } else { ++ clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); ++ } ++ } ++ return ret; ++} ++ ++static inline void bch2_journal_preres_put(struct journal *j, ++ struct journal_preres *res) ++{ ++ union journal_preres_state s = { 
.reserved = res->u64s }; ++ ++ if (!res->u64s) ++ return; ++ ++ s.v = atomic64_sub_return(s.v, &j->prereserved.counter); ++ res->u64s = 0; ++ closure_wake_up(&j->preres_wait); ++ ++ if (s.reserved <= s.remaining && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ spin_lock(&j->lock); ++ journal_check_may_get_unreserved(j); ++ spin_unlock(&j->lock); ++ } ++} ++ ++int __bch2_journal_preres_get(struct journal *, ++ struct journal_preres *, unsigned, unsigned); ++ ++static inline int bch2_journal_preres_get_fast(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ int d = new_u64s - res->u64s; ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ ++ do { ++ old.v = new.v = v; ++ ++ new.reserved += d; ++ ++ /* ++ * If we're being called from the journal reclaim path, we have ++ * to unconditionally give out the pre-reservation, there's ++ * nothing else sensible we can do - otherwise we'd recurse back ++ * into the reclaim path and deadlock: ++ */ ++ ++ if (!(flags & JOURNAL_RES_GET_RECLAIM) && ++ new.reserved > new.remaining) ++ return 0; ++ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++ ++ res->u64s += d; ++ return 1; ++} ++ ++static inline int bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ if (new_u64s <= res->u64s) ++ return 0; ++ ++ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_NONBLOCK) ++ return -EAGAIN; ++ ++ return __bch2_journal_preres_get(j, res, new_u64s, flags); ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *, ++ struct journal_entry_res *, ++ unsigned); ++ ++u64 bch2_journal_last_unwritten_seq(struct journal *); ++int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); ++ ++void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); ++void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); ++void bch2_journal_flush_async(struct journal *, struct closure *); ++void bch2_journal_meta_async(struct journal *, struct closure *); ++ ++int bch2_journal_flush_seq(struct journal *, u64); ++int bch2_journal_flush(struct journal *); ++int bch2_journal_meta(struct journal *); ++ ++void bch2_journal_halt(struct journal *); ++ ++static inline int bch2_journal_error(struct journal *j) ++{ ++ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ++ ? 
-EIO : 0; ++} ++ ++struct bch_dev; ++ ++static inline bool journal_flushes_device(struct bch_dev *ca) ++{ ++ return true; ++} ++ ++static inline void bch2_journal_set_replay_done(struct journal *j) ++{ ++ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ set_bit(JOURNAL_REPLAY_DONE, &j->flags); ++} ++ ++void bch2_journal_unblock(struct journal *); ++void bch2_journal_block(struct journal *); ++ ++ssize_t bch2_journal_print_debug(struct journal *, char *); ++ssize_t bch2_journal_print_pins(struct journal *, char *); ++ ++int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, ++ unsigned nr); ++int bch2_dev_journal_alloc(struct bch_dev *); ++ ++void bch2_dev_journal_stop(struct journal *, struct bch_dev *); ++ ++void bch2_fs_journal_stop(struct journal *); ++int bch2_fs_journal_start(struct journal *, u64, struct list_head *); ++ ++void bch2_dev_journal_exit(struct bch_dev *); ++int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); ++void bch2_fs_journal_exit(struct journal *); ++int bch2_fs_journal_init(struct journal *); ++ ++#endif /* _BCACHEFS_JOURNAL_H */ +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +new file mode 100644 +index 000000000000..c298c2b7721d +--- /dev/null ++++ b/fs/bcachefs/journal_io.c +@@ -0,0 +1,1150 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_io.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++ ++#include ++ ++struct journal_list { ++ struct closure cl; ++ struct mutex lock; ++ struct list_head *head; ++ int ret; ++}; ++ ++#define JOURNAL_ENTRY_ADD_OK 0 ++#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 ++ ++/* ++ * Given a journal entry we just read, add it to the list of journal entries to ++ * be replayed: ++ */ ++static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, ++ struct journal_list *jlist, struct jset *j) ++{ ++ struct journal_replay *i, *pos; ++ struct list_head *where; ++ size_t bytes = vstruct_bytes(j); ++ __le64 last_seq; ++ int ret; ++ ++ last_seq = !list_empty(jlist->head) ++ ? list_last_entry(jlist->head, struct journal_replay, ++ list)->j.last_seq ++ : 0; ++ ++ if (!c->opts.read_entire_journal) { ++ /* Is this entry older than the range we need? */ ++ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { ++ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; ++ goto out; ++ } ++ ++ /* Drop entries we don't need anymore */ ++ list_for_each_entry_safe(i, pos, jlist->head, list) { ++ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) ++ break; ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } ++ } ++ ++ list_for_each_entry_reverse(i, jlist->head, list) { ++ /* Duplicate? 
*/ ++ if (le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { ++ fsck_err_on(bytes != vstruct_bytes(&i->j) || ++ memcmp(j, &i->j, bytes), c, ++ "found duplicate but non identical journal entries (seq %llu)", ++ le64_to_cpu(j->seq)); ++ goto found; ++ } ++ ++ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { ++ where = &i->list; ++ goto add; ++ } ++ } ++ ++ where = jlist->head; ++add: ++ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); ++ if (!i) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_add(&i->list, where); ++ i->devs.nr = 0; ++ memcpy(&i->j, j, bytes); ++found: ++ if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) ++ bch2_dev_list_add_dev(&i->devs, ca->dev_idx); ++ else ++ fsck_err_on(1, c, "duplicate journal entries on same device"); ++ ret = JOURNAL_ENTRY_ADD_OK; ++out: ++fsck_err: ++ return ret; ++} ++ ++static struct nonce journal_nonce(const struct jset *jset) ++{ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = ((__le32 *) &jset->seq)[0], ++ [2] = ((__le32 *) &jset->seq)[1], ++ [3] = BCH_NONCE_JOURNAL, ++ }}; ++} ++ ++/* this fills in a range with empty jset_entries: */ ++static void journal_entry_null_range(void *start, void *end) ++{ ++ struct jset_entry *entry; ++ ++ for (entry = start; entry != end; entry = vstruct_next(entry)) ++ memset(entry, 0, sizeof(*entry)); ++} ++ ++#define JOURNAL_ENTRY_REREAD 5 ++#define JOURNAL_ENTRY_NONE 6 ++#define JOURNAL_ENTRY_BAD 7 ++ ++#define journal_entry_err(c, msg, ...) \ ++({ \ ++ switch (write) { \ ++ case READ: \ ++ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write:\n" \ ++ msg, ##__VA_ARGS__); \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++ true; \ ++}) ++ ++#define journal_entry_err_on(cond, c, msg, ...) \ ++ ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) ++ ++static int journal_validate_key(struct bch_fs *c, struct jset *jset, ++ struct jset_entry *entry, ++ unsigned level, enum btree_id btree_id, ++ struct bkey_i *k, ++ const char *type, int write) ++{ ++ void *next = vstruct_next(entry); ++ const char *invalid; ++ unsigned version = le32_to_cpu(jset->version); ++ int ret = 0; ++ ++ if (journal_entry_err_on(!k->k.u64s, c, ++ "invalid %s in journal: k->u64s 0", type)) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (journal_entry_err_on((void *) bkey_next(k) > ++ (void *) vstruct_next(entry), c, ++ "invalid %s in journal: extends past end of journal entry", ++ type)) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, ++ "invalid %s in journal: bad format %u", ++ type, k->k.format)) { ++ le16_add_cpu(&entry->u64s, -k->k.u64s); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (!write) ++ bch2_bkey_compat(level, btree_id, version, ++ JSET_BIG_ENDIAN(jset), write, ++ NULL, bkey_to_packed(k)); ++ ++ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), ++ __btree_node_type(level, btree_id)); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); ++ mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", ++ type, invalid, buf); ++ ++ le16_add_cpu(&entry->u64s, -k->k.u64s); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (write) ++ bch2_bkey_compat(level, btree_id, version, ++ JSET_BIG_ENDIAN(jset), write, ++ NULL, bkey_to_packed(k)); ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_btree_keys(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct bkey_i *k; ++ ++ vstruct_for_each(entry, k) { ++ int ret = journal_validate_key(c, jset, entry, ++ entry->level, ++ entry->btree_id, ++ k, "key", write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int journal_entry_validate_btree_root(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct bkey_i *k = entry->start; ++ int ret = 0; ++ ++ if (journal_entry_err_on(!entry->u64s || ++ le16_to_cpu(entry->u64s) != k->k.u64s, c, ++ "invalid btree root journal entry: wrong number of keys")) { ++ void *next = vstruct_next(entry); ++ /* ++ * we don't want to null out this jset_entry, ++ * just the contents, so that later we can tell ++ * we were _supposed_ to have a btree root ++ */ ++ entry->u64s = 0; ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, ++ "btree root", write); ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_prio_ptrs(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ /* obsolete, don't care: */ ++ return 0; ++} ++ ++static int journal_entry_validate_blacklist(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, 
vstruct_next(entry)); ++ } ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_blacklist_v2(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_blacklist_v2 *bl_entry; ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ goto out; ++ } ++ ++ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > ++ le64_to_cpu(bl_entry->end), c, ++ "invalid journal seq blacklist entry: start > end")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ } ++out: ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_usage(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < sizeof(*u), ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_data_usage(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < sizeof(*u) || ++ bytes < sizeof(*u) + u->r.nr_devs, ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++struct jset_entry_ops { ++ int (*validate)(struct bch_fs *, struct jset *, ++ struct jset_entry *, int); ++}; ++ ++static const struct jset_entry_ops bch2_jset_entry_ops[] = { ++#define x(f, nr) \ ++ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ ++ .validate = journal_entry_validate_##f, \ ++ }, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++}; ++ ++static int journal_entry_validate(struct bch_fs *c, struct jset *jset, ++ struct jset_entry *entry, int write) ++{ ++ return entry->type < BCH_JSET_ENTRY_NR ++ ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, ++ entry, write) ++ : 0; ++} ++ ++static int jset_validate_entries(struct bch_fs *c, struct jset *jset, ++ int write) ++{ ++ struct jset_entry *entry; ++ int ret = 0; ++ ++ vstruct_for_each(jset, entry) { ++ if (journal_entry_err_on(vstruct_next(entry) > ++ vstruct_last(jset), c, ++ "journal entry extends past end of jset")) { ++ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); ++ break; ++ } ++ ++ ret = journal_entry_validate(c, jset, entry, write); ++ if (ret) ++ break; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int jset_validate(struct bch_fs *c, ++ struct jset *jset, u64 sector, ++ unsigned bucket_sectors_left, ++ unsigned sectors_read, ++ int write) ++{ ++ size_t bytes = vstruct_bytes(jset); ++ struct bch_csum csum; ++ unsigned version; ++ int ret = 0; ++ ++ if (le64_to_cpu(jset->magic) != jset_magic(c)) ++ return JOURNAL_ENTRY_NONE; ++ ++ version = le32_to_cpu(jset->version); ++ if ((version != BCH_JSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max) { ++ bch_err(c, "unknown journal entry version %u", jset->version); ++ return BCH_FSCK_UNKNOWN_VERSION; ++ } ++ ++ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, ++ "journal entry too big (%zu bytes), sector %lluu", ++ bytes, sector)) { ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ if (bytes > sectors_read << 9) ++ return JOURNAL_ENTRY_REREAD; ++ ++ if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, ++ "journal entry with unknown csum type %llu sector %lluu", ++ JSET_CSUM_TYPE(jset), sector)) ++ return JOURNAL_ENTRY_BAD; ++ ++ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); ++ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, ++ "journal checksum bad, sector %llu", sector)) { ++ /* XXX: retry IO, when we start retrying checksum errors */ ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ ++ if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, ++ "invalid journal entry: last_seq > seq")) ++ jset->last_seq = jset->seq; ++ ++ return 0; ++fsck_err: ++ return ret; ++} ++ ++struct journal_read_buf { ++ void *data; ++ size_t size; ++}; ++ ++static int journal_read_buf_realloc(struct journal_read_buf *b, ++ size_t new_size) ++{ ++ void *n; ++ ++ /* the bios are sized for this many pages, max: */ ++ if (new_size > JOURNAL_ENTRY_SIZE_MAX) ++ return -ENOMEM; ++ ++ new_size = roundup_pow_of_two(new_size); ++ n = kvpmalloc(new_size, GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ kvpfree(b->data, b->size); ++ b->data = n; ++ b->size = new_size; ++ return 0; ++} ++ ++static int journal_read_bucket(struct bch_dev *ca, ++ struct journal_read_buf *buf, ++ struct journal_list *jlist, ++ unsigned bucket) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ struct jset *j = NULL; ++ unsigned sectors, sectors_read = 0; ++ u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), ++ end = offset + ca->mi.bucket_size; ++ bool saw_bad = false; ++ int ret = 0; ++ ++ pr_debug("reading %u", bucket); ++ ++ while (offset < end) { ++ if (!sectors_read) { ++ struct bio *bio; ++reread: ++ sectors_read = min_t(unsigned, ++ end - offset, buf->size >> 9); ++ ++ bio = bio_kmalloc(GFP_KERNEL, 
++ buf_pages(buf->data, ++ sectors_read << 9)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = offset; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch2_bio_map(bio, buf->data, sectors_read << 9); ++ ++ ret = submit_bio_wait(bio); ++ bio_put(bio); ++ ++ if (bch2_dev_io_err_on(ret, ca, ++ "journal read from sector %llu", ++ offset) || ++ bch2_meta_read_fault("journal")) ++ return -EIO; ++ ++ j = buf->data; ++ } ++ ++ ret = jset_validate(c, j, offset, ++ end - offset, sectors_read, ++ READ); ++ switch (ret) { ++ case BCH_FSCK_OK: ++ break; ++ case JOURNAL_ENTRY_REREAD: ++ if (vstruct_bytes(j) > buf->size) { ++ ret = journal_read_buf_realloc(buf, ++ vstruct_bytes(j)); ++ if (ret) ++ return ret; ++ } ++ goto reread; ++ case JOURNAL_ENTRY_NONE: ++ if (!saw_bad) ++ return 0; ++ sectors = c->opts.block_size; ++ goto next_block; ++ case JOURNAL_ENTRY_BAD: ++ saw_bad = true; ++ sectors = c->opts.block_size; ++ goto next_block; ++ default: ++ return ret; ++ } ++ ++ /* ++ * This happens sometimes if we don't have discards on - ++ * when we've partially overwritten a bucket with new ++ * journal entries. We don't need the rest of the ++ * bucket: ++ */ ++ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) ++ return 0; ++ ++ ja->bucket_seq[bucket] = le64_to_cpu(j->seq); ++ ++ mutex_lock(&jlist->lock); ++ ret = journal_entry_add(c, ca, jlist, j); ++ mutex_unlock(&jlist->lock); ++ ++ switch (ret) { ++ case JOURNAL_ENTRY_ADD_OK: ++ break; ++ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: ++ break; ++ default: ++ return ret; ++ } ++ ++ sectors = vstruct_sectors(j, c->block_bits); ++next_block: ++ pr_debug("next"); ++ offset += sectors; ++ sectors_read -= sectors; ++ j = ((void *) j) + (sectors << 9); ++ } ++ ++ return 0; ++} ++ ++static void bch2_journal_read_device(struct closure *cl) ++{ ++ struct journal_device *ja = ++ container_of(cl, struct journal_device, read); ++ struct bch_dev *ca = container_of(ja, struct bch_dev, journal); ++ struct journal_list *jlist = ++ container_of(cl->parent, struct journal_list, cl); ++ struct journal_read_buf buf = { NULL, 0 }; ++ u64 min_seq = U64_MAX; ++ unsigned i; ++ int ret; ++ ++ if (!ja->nr) ++ goto out; ++ ++ ret = journal_read_buf_realloc(&buf, PAGE_SIZE); ++ if (ret) ++ goto err; ++ ++ pr_debug("%u journal buckets", ja->nr); ++ ++ for (i = 0; i < ja->nr; i++) { ++ ret = journal_read_bucket(ca, &buf, jlist, i); ++ if (ret) ++ goto err; ++ } ++ ++ /* Find the journal bucket with the highest sequence number: */ ++ for (i = 0; i < ja->nr; i++) { ++ if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) ++ ja->cur_idx = i; ++ ++ min_seq = min(ja->bucket_seq[i], min_seq); ++ } ++ ++ /* ++ * If there's duplicate journal entries in multiple buckets (which ++ * definitely isn't supposed to happen, but...) 
- make sure to start ++ * cur_idx at the last of those buckets, so we don't deadlock trying to ++ * allocate ++ */ ++ while (ja->bucket_seq[ja->cur_idx] > min_seq && ++ ja->bucket_seq[ja->cur_idx] > ++ ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ++ ja->sectors_free = 0; ++ ++ /* ++ * Set dirty_idx to indicate the entire journal is full and needs to be ++ * reclaimed - journal reclaim will immediately reclaim whatever isn't ++ * pinned when it first runs: ++ */ ++ ja->discard_idx = ja->dirty_idx_ondisk = ++ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; ++out: ++ kvpfree(buf.data, buf.size); ++ percpu_ref_put(&ca->io_ref); ++ closure_return(cl); ++ return; ++err: ++ mutex_lock(&jlist->lock); ++ jlist->ret = ret; ++ mutex_unlock(&jlist->lock); ++ goto out; ++} ++ ++int bch2_journal_read(struct bch_fs *c, struct list_head *list) ++{ ++ struct journal_list jlist; ++ struct journal_replay *i; ++ struct bch_dev *ca; ++ unsigned iter; ++ size_t keys = 0, entries = 0; ++ bool degraded = false; ++ int ret = 0; ++ ++ closure_init_stack(&jlist.cl); ++ mutex_init(&jlist.lock); ++ jlist.head = list; ++ jlist.ret = 0; ++ ++ for_each_member_device(ca, c, iter) { ++ if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_JOURNAL))) ++ continue; ++ ++ if ((ca->mi.state == BCH_MEMBER_STATE_RW || ++ ca->mi.state == BCH_MEMBER_STATE_RO) && ++ percpu_ref_tryget(&ca->io_ref)) ++ closure_call(&ca->journal.read, ++ bch2_journal_read_device, ++ system_unbound_wq, ++ &jlist.cl); ++ else ++ degraded = true; ++ } ++ ++ closure_sync(&jlist.cl); ++ ++ if (jlist.ret) ++ return jlist.ret; ++ ++ list_for_each_entry(i, list, list) { ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct bch_replicas_padded replicas; ++ char buf[80]; ++ ++ ret = jset_validate_entries(c, &i->j, READ); ++ if (ret) ++ goto fsck_err; ++ ++ /* ++ * If we're mounting in degraded mode - if we didn't read all ++ * the devices - this is wrong: ++ */ ++ ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs); ++ ++ if (!degraded && ++ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c, ++ "superblock not marked as containing replicas %s", ++ (bch2_replicas_entry_to_text(&PBUF(buf), ++ &replicas.e), buf)))) { ++ ret = bch2_mark_replicas(c, &replicas.e); ++ if (ret) ++ return ret; ++ } ++ ++ for_each_jset_key(k, _n, entry, &i->j) ++ keys++; ++ entries++; ++ } ++ ++ if (!list_empty(list)) { ++ i = list_last_entry(list, struct journal_replay, list); ++ ++ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", ++ keys, entries, le64_to_cpu(i->j.seq)); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* journal write: */ ++ ++static void __journal_write_alloc(struct journal *j, ++ struct journal_buf *w, ++ struct dev_alloc_list *devs_sorted, ++ unsigned sectors, ++ unsigned *replicas, ++ unsigned replicas_want) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (*replicas >= replicas_want) ++ return; ++ ++ for (i = 0; i < devs_sorted->nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ /* ++ * Check that we can use this device, and aren't already using ++ * it: ++ */ ++ if (!ca->mi.durability || ++ ca->mi.state != BCH_MEMBER_STATE_RW || ++ !ja->nr || ++ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), ++ ca->dev_idx) || ++ 
sectors > ja->sectors_free) ++ continue; ++ ++ bch2_dev_stripe_increment(c, ca, &j->wp.stripe); ++ ++ bch2_bkey_append_ptr(&w->key, ++ (struct bch_extent_ptr) { ++ .offset = bucket_to_sector(ca, ++ ja->buckets[ja->cur_idx]) + ++ ca->mi.bucket_size - ++ ja->sectors_free, ++ .dev = ca->dev_idx, ++ }); ++ ++ ja->sectors_free -= sectors; ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ ++ *replicas += ca->mi.durability; ++ ++ if (*replicas >= replicas_want) ++ break; ++ } ++} ++ ++/** ++ * journal_next_bucket - move on to the next journal bucket if possible ++ */ ++static int journal_write_alloc(struct journal *j, struct journal_buf *w, ++ unsigned sectors) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ struct dev_alloc_list devs_sorted; ++ unsigned i, replicas = 0, replicas_want = ++ READ_ONCE(c->opts.metadata_replicas); ++ ++ rcu_read_lock(); ++ ++ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, ++ &c->rw_devs[BCH_DATA_JOURNAL]); ++ ++ __journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++ ++ if (replicas >= replicas_want) ++ goto done; ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ if (sectors > ja->sectors_free && ++ sectors <= ca->mi.bucket_size && ++ bch2_journal_dev_buckets_available(j, ja, ++ journal_space_discarded)) { ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ /* ++ * ja->bucket_seq[ja->cur_idx] must always have ++ * something sensible: ++ */ ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ } ++ } ++ ++ __journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++done: ++ rcu_read_unlock(); ++ ++ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; ++} ++ ++static void journal_write_compact(struct jset *jset) ++{ ++ struct jset_entry *i, *next, *prev = NULL; ++ ++ /* ++ * Simple compaction, dropping empty jset_entries (from journal ++ * reservations that weren't fully used) and merging jset_entries that ++ * can be. ++ * ++ * If we wanted to be really fancy here, we could sort all the keys in ++ * the jset and drop keys that were overwritten - probably not worth it: ++ */ ++ vstruct_for_each_safe(jset, i, next) { ++ unsigned u64s = le16_to_cpu(i->u64s); ++ ++ /* Empty entry: */ ++ if (!u64s) ++ continue; ++ ++ /* Can we merge with previous entry? */ ++ if (prev && ++ i->btree_id == prev->btree_id && ++ i->level == prev->level && ++ i->type == prev->type && ++ i->type == BCH_JSET_ENTRY_btree_keys && ++ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { ++ memmove_u64s_down(vstruct_next(prev), ++ i->_data, ++ u64s); ++ le16_add_cpu(&prev->u64s, u64s); ++ continue; ++ } ++ ++ /* Couldn't merge, move i into new position (after prev): */ ++ prev = prev ? vstruct_next(prev) : jset->start; ++ if (i != prev) ++ memmove_u64s_down(prev, i, jset_u64s(u64s)); ++ } ++ ++ prev = prev ? 
vstruct_next(prev) : jset->start; ++ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); ++} ++ ++static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) ++{ ++ /* we aren't holding j->lock: */ ++ unsigned new_size = READ_ONCE(j->buf_size_want); ++ void *new_buf; ++ ++ if (buf->buf_size >= new_size) ++ return; ++ ++ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); ++ if (!new_buf) ++ return; ++ ++ memcpy(new_buf, buf->data, buf->buf_size); ++ kvpfree(buf->data, buf->buf_size); ++ buf->data = new_buf; ++ buf->buf_size = new_size; ++} ++ ++static void journal_write_done(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *w = journal_prev_buf(j); ++ struct bch_devs_list devs = ++ bch2_bkey_devs(bkey_i_to_s_c(&w->key)); ++ struct bch_replicas_padded replicas; ++ u64 seq = le64_to_cpu(w->data->seq); ++ u64 last_seq = le64_to_cpu(w->data->last_seq); ++ ++ bch2_time_stats_update(j->write_time, j->write_start_time); ++ ++ if (!devs.nr) { ++ bch_err(c, "unable to write journal to sufficient devices"); ++ goto err; ++ } ++ ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs); ++ ++ if (bch2_mark_replicas(c, &replicas.e)) ++ goto err; ++ ++ spin_lock(&j->lock); ++ if (seq >= j->pin.front) ++ journal_seq_pin(j, seq)->devs = devs; ++ ++ j->seq_ondisk = seq; ++ j->last_seq_ondisk = last_seq; ++ bch2_journal_space_available(j); ++ ++ /* ++ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard ++ * more buckets: ++ * ++ * Must come before signaling write completion, for ++ * bch2_fs_journal_stop(): ++ */ ++ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); ++out: ++ /* also must come before signalling write completion: */ ++ closure_debug_destroy(cl); ++ ++ BUG_ON(!j->reservations.prev_buf_unwritten); ++ atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, ++ &j->reservations.counter); ++ ++ closure_wake_up(&w->wait); ++ journal_wake(j); ++ ++ if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) ++ mod_delayed_work(system_freezable_wq, &j->write_work, 0); ++ spin_unlock(&j->lock); ++ return; ++err: ++ bch2_fatal_error(c); ++ spin_lock(&j->lock); ++ goto out; ++} ++ ++static void journal_write_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ struct journal *j = &ca->fs->journal; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", ++ blk_status_to_str(bio->bi_status)) || ++ bch2_meta_write_fault("journal")) { ++ struct journal_buf *w = journal_prev_buf(j); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&j->err_lock, flags); ++ bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); ++ spin_unlock_irqrestore(&j->err_lock, flags); ++ } ++ ++ closure_put(&j->io); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++void bch2_journal_write(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_buf *w = journal_prev_buf(j); ++ struct jset_entry *start, *end; ++ struct jset *jset; ++ struct bio *bio; ++ struct bch_extent_ptr *ptr; ++ bool validate_before_checksum = false; ++ unsigned i, sectors, bytes, u64s; ++ int ret; ++ ++ bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); ++ ++ journal_buf_realloc(j, w); ++ jset = w->data; ++ ++ j->write_start_time = local_clock(); ++ ++ /* ++ * New btree roots are set by journalling them; when the journal entry ++ * 
gets written we have to propagate them to c->btree_roots ++ * ++ * But, every journal entry we write has to contain all the btree roots ++ * (at least for now); so after we copy btree roots to c->btree_roots we ++ * have to get any missing btree roots and add them to this journal ++ * entry: ++ */ ++ ++ bch2_journal_entries_to_btree_roots(c, jset); ++ ++ start = end = vstruct_last(jset); ++ ++ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); ++ ++ end = bch2_journal_super_entries_add_common(c, end, ++ le64_to_cpu(jset->seq)); ++ u64s = (u64 *) end - (u64 *) start; ++ BUG_ON(u64s > j->entry_u64s_reserved); ++ ++ le32_add_cpu(&jset->u64s, u64s); ++ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); ++ ++ journal_write_compact(jset); ++ ++ jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); ++ jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); ++ jset->magic = cpu_to_le64(jset_magic(c)); ++ ++ jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ ? cpu_to_le32(BCH_JSET_VERSION_OLD) ++ : cpu_to_le32(c->sb.version); ++ ++ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); ++ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); ++ ++ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) ++ validate_before_checksum = true; ++ ++ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) ++ validate_before_checksum = true; ++ ++ if (validate_before_checksum && ++ jset_validate_entries(c, jset, WRITE)) ++ goto err; ++ ++ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ ++ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), ++ journal_nonce(jset), jset); ++ ++ if (!validate_before_checksum && ++ jset_validate_entries(c, jset, WRITE)) ++ goto err; ++ ++ sectors = vstruct_sectors(jset, c->block_bits); ++ BUG_ON(sectors > w->sectors); ++ ++ bytes = vstruct_bytes(jset); ++ memset((void *) jset + bytes, 0, (sectors << 9) - bytes); ++ ++retry_alloc: ++ spin_lock(&j->lock); ++ ret = journal_write_alloc(j, w, sectors); ++ ++ if (ret && j->can_discard) { ++ spin_unlock(&j->lock); ++ bch2_journal_do_discards(j); ++ goto retry_alloc; ++ } ++ ++ /* ++ * write is allocated, no longer need to account for it in ++ * bch2_journal_space_available(): ++ */ ++ w->sectors = 0; ++ ++ /* ++ * journal entry has been compacted and allocated, recalculate space ++ * available: ++ */ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ if (ret) { ++ bch_err(c, "Unable to allocate journal write"); ++ bch2_fatal_error(c); ++ continue_at(cl, journal_write_done, system_highpri_wq); ++ return; ++ } ++ ++ /* ++ * XXX: we really should just disable the entire journal in nochanges ++ * mode ++ */ ++ if (c->opts.nochanges) ++ goto no_io; ++ ++ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ /* XXX: fix this */ ++ bch_err(c, "missing device for journal write\n"); ++ continue; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_JOURNAL], ++ sectors); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = ptr->offset; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_WRITE, ++ REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); ++ bch2_bio_map(bio, jset, sectors << 9); ++ ++ trace_journal_write(bio); ++ closure_bio_submit(bio, cl); ++ ++ 
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); ++ } ++ ++ for_each_rw_member(ca, c, i) ++ if (journal_flushes_device(ca) && ++ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { ++ percpu_ref_get(&ca->io_ref); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_opf = REQ_OP_FLUSH; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ closure_bio_submit(bio, cl); ++ } ++ ++no_io: ++ bch2_bucket_seq_cleanup(c); ++ ++ continue_at(cl, journal_write_done, system_highpri_wq); ++ return; ++err: ++ bch2_inconsistent_error(c); ++ continue_at(cl, journal_write_done, system_highpri_wq); ++} +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +new file mode 100644 +index 000000000000..72e575f360af +--- /dev/null ++++ b/fs/bcachefs/journal_io.h +@@ -0,0 +1,42 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_IO_H ++#define _BCACHEFS_JOURNAL_IO_H ++ ++/* ++ * Only used for holding the journal entries we read in btree_journal_read() ++ * during cache_registration ++ */ ++struct journal_replay { ++ struct list_head list; ++ struct bch_devs_list devs; ++ /* must be last: */ ++ struct jset j; ++}; ++ ++static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, ++ struct jset_entry *entry, unsigned type) ++{ ++ while (entry < vstruct_last(jset)) { ++ if (entry->type == type) ++ return entry; ++ ++ entry = vstruct_next(entry); ++ } ++ ++ return NULL; ++} ++ ++#define for_each_jset_entry_type(entry, jset, type) \ ++ for (entry = (jset)->start; \ ++ (entry = __jset_entry_type_next(jset, entry, type)); \ ++ entry = vstruct_next(entry)) ++ ++#define for_each_jset_key(k, _n, entry, jset) \ ++ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ ++ vstruct_for_each_safe(entry, k, _n) ++ ++int bch2_journal_read(struct bch_fs *, struct list_head *); ++ ++void bch2_journal_write(struct closure *); ++ ++#endif /* _BCACHEFS_JOURNAL_IO_H */ +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +new file mode 100644 +index 000000000000..4811ab9f879e +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.c +@@ -0,0 +1,644 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++#include "super.h" ++ ++/* Free space calculations: */ ++ ++static unsigned journal_space_from(struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ switch (from) { ++ case journal_space_discarded: ++ return ja->discard_idx; ++ case journal_space_clean_ondisk: ++ return ja->dirty_idx_ondisk; ++ case journal_space_clean: ++ return ja->dirty_idx; ++ default: ++ BUG(); ++ } ++} ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *j, ++ struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ unsigned available = (journal_space_from(ja, from) - ++ ja->cur_idx - 1 + ja->nr) % ja->nr; ++ ++ /* ++ * Don't use the last bucket unless writing the new last_seq ++ * will make another bucket available: ++ */ ++ if (available && ja->dirty_idx_ondisk == ja->dirty_idx) ++ --available; ++ ++ return available; ++} ++ ++static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) ++{ ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ ++ do { ++ old.v = new.v = v; ++ new.remaining = u64s_remaining; ++ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++} ++ 
++static struct journal_space { ++ unsigned next_entry; ++ unsigned remaining; ++} __journal_space_available(struct journal *j, unsigned nr_devs_want, ++ enum journal_space_from from) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned sectors_next_entry = UINT_MAX; ++ unsigned sectors_total = UINT_MAX; ++ unsigned i, nr_devs = 0; ++ unsigned unwritten_sectors = j->reservations.prev_buf_unwritten ++ ? journal_prev_buf(j)->sectors ++ : 0; ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_JOURNAL]) { ++ struct journal_device *ja = &ca->journal; ++ unsigned buckets_this_device, sectors_this_device; ++ ++ if (!ja->nr) ++ continue; ++ ++ buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); ++ sectors_this_device = ja->sectors_free; ++ ++ /* ++ * Note that we don't allocate the space for a journal entry ++ * until we write it out - thus, account for it here: ++ */ ++ if (unwritten_sectors >= sectors_this_device) { ++ if (!buckets_this_device) ++ continue; ++ ++ buckets_this_device--; ++ sectors_this_device = ca->mi.bucket_size; ++ } ++ ++ sectors_this_device -= unwritten_sectors; ++ ++ if (sectors_this_device < ca->mi.bucket_size && ++ buckets_this_device) { ++ buckets_this_device--; ++ sectors_this_device = ca->mi.bucket_size; ++ } ++ ++ if (!sectors_this_device) ++ continue; ++ ++ sectors_next_entry = min(sectors_next_entry, ++ sectors_this_device); ++ ++ sectors_total = min(sectors_total, ++ buckets_this_device * ca->mi.bucket_size + ++ sectors_this_device); ++ ++ nr_devs++; ++ } ++ rcu_read_unlock(); ++ ++ if (nr_devs < nr_devs_want) ++ return (struct journal_space) { 0, 0 }; ++ ++ return (struct journal_space) { ++ .next_entry = sectors_next_entry, ++ .remaining = max_t(int, 0, sectors_total - sectors_next_entry), ++ }; ++} ++ ++void bch2_journal_space_available(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_space discarded, clean_ondisk, clean; ++ unsigned overhead, u64s_remaining = 0; ++ unsigned max_entry_size = min(j->buf[0].buf_size >> 9, ++ j->buf[1].buf_size >> 9); ++ unsigned i, nr_online = 0, nr_devs_want; ++ bool can_discard = false; ++ int ret = 0; ++ ++ lockdep_assert_held(&j->lock); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_JOURNAL]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!ja->nr) ++ continue; ++ ++ while (ja->dirty_idx != ja->cur_idx && ++ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ ++ while (ja->dirty_idx_ondisk != ja->dirty_idx && ++ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ ++ if (ja->discard_idx != ja->dirty_idx_ondisk) ++ can_discard = true; ++ ++ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); ++ nr_online++; ++ } ++ rcu_read_unlock(); ++ ++ j->can_discard = can_discard; ++ ++ if (nr_online < c->opts.metadata_replicas_required) { ++ ret = -EROFS; ++ goto out; ++ } ++ ++ if (!fifo_free(&j->pin)) { ++ ret = -ENOSPC; ++ goto out; ++ } ++ ++ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); ++ ++ discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); ++ clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); ++ clean = __journal_space_available(j, nr_devs_want, journal_space_clean); ++ ++ if
(!discarded.next_entry) ++ ret = -ENOSPC; ++ ++ overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * ++ journal_entry_overhead(j); ++ u64s_remaining = clean.remaining << 6; ++ u64s_remaining = max_t(int, 0, u64s_remaining - overhead); ++ u64s_remaining /= 4; ++out: ++ j->cur_entry_sectors = !ret ? discarded.next_entry : 0; ++ j->cur_entry_error = ret; ++ journal_set_remaining(j, u64s_remaining); ++ journal_check_may_get_unreserved(j); ++ ++ if (!ret) ++ journal_wake(j); ++} ++ ++/* Discards - last part of journal reclaim: */ ++ ++static bool should_discard_bucket(struct journal *j, struct journal_device *ja) ++{ ++ bool ret; ++ ++ spin_lock(&j->lock); ++ ret = ja->discard_idx != ja->dirty_idx_ondisk; ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* ++ * Advance ja->discard_idx as long as it points to buckets that are no longer ++ * dirty, issuing discards if necessary: ++ */ ++void bch2_journal_do_discards(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned iter; ++ ++ mutex_lock(&j->discard_lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ ++ while (should_discard_bucket(j, ja)) { ++ if (ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, ++ ja->buckets[ja->discard_idx]), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++ ++ spin_lock(&j->lock); ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ } ++ } ++ ++ mutex_unlock(&j->discard_lock); ++} ++ ++/* ++ * Journal entry pinning - machinery for holding a reference on a given journal ++ * entry, holding it open to ensure it gets replayed during recovery: ++ */ ++ ++static void bch2_journal_reclaim_fast(struct journal *j) ++{ ++ struct journal_entry_pin_list temp; ++ bool popped = false; ++ ++ lockdep_assert_held(&j->lock); ++ ++ /* ++ * Unpin journal entries whose reference counts reached zero, meaning ++ * all btree nodes got written out ++ */ ++ while (!fifo_empty(&j->pin) && ++ !atomic_read(&fifo_peek_front(&j->pin).count)) { ++ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); ++ BUG_ON(!fifo_pop(&j->pin, temp)); ++ popped = true; ++ } ++ ++ if (popped) ++ bch2_journal_space_available(j); ++} ++ ++void bch2_journal_pin_put(struct journal *j, u64 seq) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ if (atomic_dec_and_test(&pin_list->count)) { ++ spin_lock(&j->lock); ++ bch2_journal_reclaim_fast(j); ++ spin_unlock(&j->lock); ++ } ++} ++ ++static inline void __journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ struct journal_entry_pin_list *pin_list; ++ ++ if (!journal_pin_active(pin)) ++ return; ++ ++ pin_list = journal_seq_pin(j, pin->seq); ++ pin->seq = 0; ++ list_del_init(&pin->list); ++ ++ /* ++ * Unpinning a journal entry may make journal_next_bucket() succeed, if ++ * writing a new last_seq will now make another bucket available: ++ */ ++ if (atomic_dec_and_test(&pin_list->count) && ++ pin_list == &fifo_peek_front(&j->pin)) ++ bch2_journal_reclaim_fast(j); ++ else if (fifo_used(&j->pin) == 1 && ++ atomic_read(&pin_list->count) == 1) ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ spin_lock(&j->lock); ++ __journal_pin_drop(j, pin); ++ spin_unlock(&j->lock); ++} ++ ++static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, ++
struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ __journal_pin_drop(j, pin); ++ ++ BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); ++ ++ atomic_inc(&pin_list->count); ++ pin->seq = seq; ++ pin->flush = flush_fn; ++ ++ list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); ++} ++ ++void __bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ spin_lock(&j->lock); ++ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); ++ spin_unlock(&j->lock); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_update(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (journal_pin_active(pin) && pin->seq < seq) ++ return; ++ ++ spin_lock(&j->lock); ++ ++ if (pin->seq != seq) { ++ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); ++ } else { ++ struct journal_entry_pin_list *pin_list = ++ journal_seq_pin(j, seq); ++ ++ /* ++ * If the pin is already pinning the right sequence number, it ++ * still might've already been flushed: ++ */ ++ list_move(&pin->list, &pin_list->list); ++ } ++ ++ spin_unlock(&j->lock); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_copy(struct journal *j, ++ struct journal_entry_pin *dst, ++ struct journal_entry_pin *src, ++ journal_pin_flush_fn flush_fn) ++{ ++ spin_lock(&j->lock); ++ ++ if (journal_pin_active(src) && ++ (!journal_pin_active(dst) || src->seq < dst->seq)) ++ bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); ++ ++ spin_unlock(&j->lock); ++} ++ ++/** ++ * bch2_journal_pin_flush: ensure journal pin callback is no longer running ++ */ ++void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) ++{ ++ BUG_ON(journal_pin_active(pin)); ++ ++ wait_event(j->pin_flush_wait, j->flush_in_progress != pin); ++} ++ ++/* ++ * Journal reclaim: flush references to open journal entries to reclaim space in ++ * the journal ++ * ++ * May be done by the journal code in the background as needed to free up space ++ * for more journal entries, or as part of doing a clean shutdown, or to migrate ++ * data off of a specific device: ++ */ ++ ++static struct journal_entry_pin * ++journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *ret = NULL; ++ ++ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) ++ return NULL; ++ ++ spin_lock(&j->lock); ++ ++ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) ++ if (*seq > max_seq || ++ (ret = list_first_entry_or_null(&pin_list->list, ++ struct journal_entry_pin, list))) ++ break; ++ ++ if (ret) { ++ list_move(&ret->list, &pin_list->flushed); ++ BUG_ON(j->flush_in_progress); ++ j->flush_in_progress = ret; ++ j->last_flushed = jiffies; ++ } ++ ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* returns true if we did work */ ++static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, ++ unsigned min_nr) ++{ ++ struct journal_entry_pin *pin; ++ bool ret = false; ++ u64 seq; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ while ((pin = journal_get_next_pin(j, min_nr ++ ? 
U64_MAX : seq_to_flush, &seq))) { ++ if (min_nr) ++ min_nr--; ++ ++ pin->flush(j, pin, seq); ++ ++ BUG_ON(j->flush_in_progress != pin); ++ j->flush_in_progress = NULL; ++ wake_up(&j->pin_flush_wait); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++/** ++ * bch2_journal_reclaim - free up journal buckets ++ * ++ * Background journal reclaim writes out btree nodes. It should be run ++ * early enough so that we never completely run out of journal buckets. ++ * ++ * High watermarks for triggering background reclaim: ++ * - FIFO has fewer than 512 entries left ++ * - fewer than 25% journal buckets free ++ * ++ * Background reclaim runs until low watermarks are reached: ++ * - FIFO has more than 1024 entries left ++ * - more than 50% journal buckets free ++ * ++ * As long as a reclaim can complete in the time it takes to fill up ++ * 512 journal entries or 25% of all journal buckets, then ++ * journal_next_bucket() should not stall. ++ */ ++void bch2_journal_reclaim(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned iter, min_nr = 0; ++ u64 seq_to_flush = 0; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ bch2_journal_do_discards(j); ++ ++ spin_lock(&j->lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ unsigned nr_buckets, bucket_to_flush; ++ ++ if (!ja->nr) ++ continue; ++ ++ /* Try to keep the journal at most half full: */ ++ nr_buckets = ja->nr / 2; ++ ++ /* And include pre-reservations: */ ++ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, ++ (ca->mi.bucket_size << 6) - ++ journal_entry_overhead(j)); ++ ++ nr_buckets = min(nr_buckets, ja->nr); ++ ++ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; ++ seq_to_flush = max(seq_to_flush, ++ ja->bucket_seq[bucket_to_flush]); ++ } ++ ++ /* Also flush if the pin fifo is more than half full */ ++ seq_to_flush = max_t(s64, seq_to_flush, ++ (s64) journal_cur_seq(j) - ++ (j->pin.size >> 1)); ++ spin_unlock(&j->lock); ++ ++ /* ++ * If it's been longer than j->reclaim_delay_ms since we last flushed, ++ * make sure to flush at least one journal pin: ++ */ ++ if (time_after(jiffies, j->last_flushed + ++ msecs_to_jiffies(j->reclaim_delay_ms))) ++ min_nr = 1; ++ ++ if (j->prereserved.reserved * 2 > j->prereserved.remaining) { ++ seq_to_flush = max(seq_to_flush, journal_last_seq(j)); ++ min_nr = 1; ++ } ++ ++ journal_flush_pins(j, seq_to_flush, min_nr); ++ ++ if (!bch2_journal_error(j)) ++ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, ++ msecs_to_jiffies(j->reclaim_delay_ms)); ++} ++ ++void bch2_journal_reclaim_work(struct work_struct *work) ++{ ++ struct journal *j = container_of(to_delayed_work(work), ++ struct journal, reclaim_work); ++ ++ mutex_lock(&j->reclaim_lock); ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++} ++ ++static int journal_flush_done(struct journal *j, u64 seq_to_flush, ++ bool *did_work) ++{ ++ int ret; ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&j->reclaim_lock); ++ ++ *did_work = journal_flush_pins(j, seq_to_flush, 0); ++ ++ spin_lock(&j->lock); ++ /* ++ * If journal replay hasn't completed, the unreplayed journal entries ++ * hold refs on their corresponding sequence numbers ++ */ ++ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || ++ journal_last_seq(j) > seq_to_flush || ++ (fifo_used(&j->pin) == 1 && ++ atomic_read(&fifo_peek_front(&j->pin).count) == 1); ++ ++ spin_unlock(&j->lock); ++ mutex_unlock(&j->reclaim_lock); ++ ++ return ret; ++} ++ 
++bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) ++{ ++ bool did_work = false; ++ ++ if (!test_bit(JOURNAL_STARTED, &j->flags)) ++ return false; ++ ++ closure_wait_event(&j->async_wait, ++ journal_flush_done(j, seq_to_flush, &did_work)); ++ ++ return did_work; ++} ++ ++int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ u64 iter, seq = 0; ++ int ret = 0; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(p, &j->pin, iter) ++ if (dev_idx >= 0 ++ ? bch2_dev_list_has_dev(p->devs, dev_idx) ++ : p->devs.nr < c->opts.metadata_replicas) ++ seq = iter; ++ spin_unlock(&j->lock); ++ ++ bch2_journal_flush_pins(j, seq); ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->replicas_gc_lock); ++ bch2_replicas_gc_start(c, 1 << BCH_DATA_JOURNAL); ++ ++ seq = 0; ++ ++ spin_lock(&j->lock); ++ while (!ret && seq < j->pin.back) { ++ struct bch_replicas_padded replicas; ++ ++ seq = max(seq, journal_last_seq(j)); ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, ++ journal_seq_pin(j, seq)->devs); ++ seq++; ++ ++ spin_unlock(&j->lock); ++ ret = bch2_mark_replicas(c, &replicas.e); ++ spin_lock(&j->lock); ++ } ++ spin_unlock(&j->lock); ++ ++ ret = bch2_replicas_gc_end(c, ret); ++ mutex_unlock(&c->replicas_gc_lock); ++ ++ return ret; ++} +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +new file mode 100644 +index 000000000000..8128907a7623 +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.h +@@ -0,0 +1,69 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_RECLAIM_H ++#define _BCACHEFS_JOURNAL_RECLAIM_H ++ ++#define JOURNAL_PIN (32 * 1024) ++ ++enum journal_space_from { ++ journal_space_discarded, ++ journal_space_clean_ondisk, ++ journal_space_clean, ++}; ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *, ++ struct journal_device *, ++ enum journal_space_from); ++void bch2_journal_space_available(struct journal *); ++ ++static inline bool journal_pin_active(struct journal_entry_pin *pin) ++{ ++ return pin->seq != 0; ++} ++ ++static inline struct journal_entry_pin_list * ++journal_seq_pin(struct journal *j, u64 seq) ++{ ++ EBUG_ON(seq < j->pin.front || seq >= j->pin.back); ++ ++ return &j->pin.data[seq & j->pin.mask]; ++} ++ ++void bch2_journal_pin_put(struct journal *, u64); ++void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); ++ ++void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++static inline void bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) ++ __bch2_journal_pin_add(j, seq, pin, flush_fn); ++} ++ ++void bch2_journal_pin_update(struct journal *, u64, ++ struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++void bch2_journal_pin_copy(struct journal *, ++ struct journal_entry_pin *, ++ struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); ++ ++void bch2_journal_do_discards(struct journal *); ++void bch2_journal_reclaim(struct journal *); ++void bch2_journal_reclaim_work(struct work_struct *); ++ ++bool bch2_journal_flush_pins(struct journal *, u64); ++ ++static inline bool bch2_journal_flush_all_pins(struct journal *j) ++{ ++ return bch2_journal_flush_pins(j, U64_MAX); ++} 
++ ++int bch2_journal_flush_device_pins(struct journal *, int); ++ ++#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +new file mode 100644 +index 000000000000..a21de0088753 +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -0,0 +1,318 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_iter.h" ++#include "eytzinger.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++/* ++ * journal_seq_blacklist machinery: ++ * ++ * To guarantee order of btree updates after a crash, we need to detect when a ++ * btree node entry (bset) is newer than the newest journal entry that was ++ * successfully written, and ignore it - effectively ignoring any btree updates ++ * that didn't make it into the journal. ++ * ++ * If we didn't do this, we might have two btree nodes, a and b, both with ++ * updates that weren't written to the journal yet: if b was updated after a, ++ * but b was flushed and not a - oops; on recovery we'll find that the updates ++ * to b happened, but not the updates to a that happened before it. ++ * ++ * Ignoring bsets that are newer than the newest journal entry is always safe, ++ * because everything they contain will also have been journalled - and must ++ * still be present in the journal on disk until a journal entry has been ++ * written _after_ that bset was written. ++ * ++ * To accomplish this, bsets record the newest journal sequence number they ++ * contain updates for; then, on startup, the btree code queries the journal ++ * code to ask "Is this sequence number newer than the newest journal entry? If ++ * so, ignore it." ++ * ++ * When this happens, we must blacklist that journal sequence number: the ++ * journal must not write any entries with that sequence number, and it must ++ * record that it was blacklisted so that a) on recovery we don't think we have ++ * missing journal entries and b) so that the btree code continues to ignore ++ * that bset, until that btree node is rewritten. ++ */ ++ ++static unsigned ++blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) ++{ ++ return bl ++ ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / ++ sizeof(struct journal_seq_blacklist_entry)) ++ : 0; ++} ++ ++static unsigned sb_blacklist_u64s(unsigned nr) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ ++ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); ++} ++ ++static struct bch_sb_field_journal_seq_blacklist * ++blacklist_entry_try_merge(struct bch_fs *c, ++ struct bch_sb_field_journal_seq_blacklist *bl, ++ unsigned i) ++{ ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ if (le64_to_cpu(bl->start[i].end) >= ++ le64_to_cpu(bl->start[i + 1].start)) { ++ bl->start[i].end = bl->start[i + 1].end; ++ --nr; ++ memmove(&bl->start[i], ++ &bl->start[i + 1], ++ sizeof(bl->start[0]) * (nr - i)); ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr)); ++ BUG_ON(!bl); ++ } ++ ++ return bl; ++} ++ ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ unsigned i, nr; ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ nr = blacklist_nr_entries(bl); ++ ++ if (bl) { ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = ++ bl->start + i; ++ ++ if (start == le64_to_cpu(e->start) && ++ end == le64_to_cpu(e->end)) ++ goto out; ++ ++ if (start <= le64_to_cpu(e->start) && ++ end >= le64_to_cpu(e->end)) { ++ e->start = cpu_to_le64(start); ++ e->end = cpu_to_le64(end); ++ ++ if (i + 1 < nr) ++ bl = blacklist_entry_try_merge(c, ++ bl, i); ++ if (i) ++ bl = blacklist_entry_try_merge(c, ++ bl, i - 1); ++ goto out_write_sb; ++ } ++ } ++ } ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr + 1)); ++ if (!bl) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ bl->start[nr].start = cpu_to_le64(start); ++ bl->start[nr].end = cpu_to_le64(end); ++out_write_sb: ++ c->disk_sb.sb->features[0] |= ++ 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; ++ ++ ret = bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static int journal_seq_blacklist_table_cmp(const void *_l, ++ const void *_r, size_t size) ++{ ++ const struct journal_seq_blacklist_table_entry *l = _l; ++ const struct journal_seq_blacklist_table_entry *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, ++ bool dirty) ++{ ++ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; ++ struct journal_seq_blacklist_table_entry search = { .start = seq }; ++ int idx; ++ ++ if (!t) ++ return false; ++ ++ idx = eytzinger0_find_le(t->entries, t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ &search); ++ if (idx < 0) ++ return false; ++ ++ BUG_ON(t->entries[idx].start > seq); ++ ++ if (seq >= t->entries[idx].end) ++ return false; ++ ++ if (dirty) ++ t->entries[idx].dirty = true; ++ return true; ++} ++ ++int bch2_blacklist_table_initialize(struct bch_fs *c) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ struct journal_seq_blacklist_table *t; ++ unsigned i, nr = blacklist_nr_entries(bl); ++ ++ BUG_ON(c->journal_seq_blacklist_table); ++ ++ if (!bl) ++ return 0; ++ ++ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, ++ GFP_KERNEL); ++ if (!t) ++ return -ENOMEM; ++ ++ t->nr = nr; ++ ++ for (i = 0; i < nr; i++) { ++ t->entries[i].start = le64_to_cpu(bl->start[i].start); ++ t->entries[i].end = le64_to_cpu(bl->start[i].end); ++ } ++ ++ 
eytzinger0_sort(t->entries, ++ t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ NULL); ++ ++ c->journal_seq_blacklist_table = t; ++ return 0; ++} ++ ++static const char * ++bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ struct journal_seq_blacklist_entry *i; ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ for (i = bl->start; i < bl->start + nr; i++) { ++ if (le64_to_cpu(i->start) >= ++ le64_to_cpu(i->end)) ++ return "entry start >= end"; ++ ++ if (i + 1 < bl->start + nr && ++ le64_to_cpu(i[0].end) > ++ le64_to_cpu(i[1].start)) ++ return "entries out of order"; ++ } ++ ++ return NULL; ++} ++ ++static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ struct journal_seq_blacklist_entry *i; ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ for (i = bl->start; i < bl->start + nr; i++) { ++ if (i != bl->start) ++ pr_buf(out, " "); ++ ++ pr_buf(out, "%llu-%llu", ++ le64_to_cpu(i->start), ++ le64_to_cpu(i->end)); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { ++ .validate = bch2_sb_journal_seq_blacklist_validate, ++ .to_text = bch2_sb_journal_seq_blacklist_to_text ++}; ++ ++void bch2_blacklist_entries_gc(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ journal_seq_blacklist_gc_work); ++ struct journal_seq_blacklist_table *t; ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ struct journal_seq_blacklist_entry *src, *dst; ++ struct btree_trans trans; ++ unsigned i, nr, new_nr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_iter *iter; ++ struct btree *b; ++ ++ for_each_btree_node(&trans, iter, i, POS_MIN, ++ BTREE_ITER_PREFETCH, b) ++ if (test_bit(BCH_FS_STOPPING, &c->flags)) { ++ bch2_trans_exit(&trans); ++ return; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ ret = bch2_trans_exit(&trans); ++ if (ret) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ if (!bl) ++ goto out; ++ ++ nr = blacklist_nr_entries(bl); ++ dst = bl->start; ++ ++ t = c->journal_seq_blacklist_table; ++ BUG_ON(nr != t->nr); ++ ++ for (src = bl->start, i = eytzinger0_first(t->nr); ++ src < bl->start + nr; ++ src++, i = eytzinger0_next(i, nr)) { ++ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); ++ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); ++ ++ if (t->entries[i].dirty) ++ *dst++ = *src; ++ } ++ ++ new_nr = dst - bl->start; ++ ++ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); ++ ++ if (new_nr != nr) { ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ new_nr ? 
sb_blacklist_u64s(new_nr) : 0); ++ BUG_ON(new_nr && !bl); ++ ++ if (!new_nr) ++ c->disk_sb.sb->features[0] &= ++ ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ++ ++ bch2_write_super(c); ++ } ++out: ++ mutex_unlock(&c->sb_lock); ++} +diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h +new file mode 100644 +index 000000000000..03f4b97247fd +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.h +@@ -0,0 +1,13 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); ++int bch2_blacklist_table_initialize(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; ++ ++void bch2_blacklist_entries_gc(struct work_struct *); ++ ++#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +new file mode 100644 +index 000000000000..154b51b891d3 +--- /dev/null ++++ b/fs/bcachefs/journal_types.h +@@ -0,0 +1,277 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_TYPES_H ++#define _BCACHEFS_JOURNAL_TYPES_H ++ ++#include ++#include ++ ++#include "alloc_types.h" ++#include "super_types.h" ++#include "fifo.h" ++ ++struct journal_res; ++ ++/* ++ * We put two of these in struct journal; we used them for writes to the ++ * journal that are being staged or in flight. ++ */ ++struct journal_buf { ++ struct jset *data; ++ ++ BKEY_PADDED(key); ++ ++ struct closure_waitlist wait; ++ ++ unsigned buf_size; /* size in bytes of @data */ ++ unsigned sectors; /* maximum size for current entry */ ++ unsigned disk_sectors; /* maximum size entry could have been, if ++ buf_size was bigger */ ++ unsigned u64s_reserved; ++ /* bloom filter: */ ++ unsigned long has_inode[1024 / sizeof(unsigned long)]; ++}; ++ ++/* ++ * Something that makes a journal entry dirty - i.e. 
a btree node that has to be ++ * flushed: ++ */ ++ ++struct journal_entry_pin_list { ++ struct list_head list; ++ struct list_head flushed; ++ atomic_t count; ++ struct bch_devs_list devs; ++}; ++ ++struct journal; ++struct journal_entry_pin; ++typedef void (*journal_pin_flush_fn)(struct journal *j, ++ struct journal_entry_pin *, u64); ++ ++struct journal_entry_pin { ++ struct list_head list; ++ journal_pin_flush_fn flush; ++ u64 seq; ++}; ++ ++struct journal_res { ++ bool ref; ++ u8 idx; ++ u16 u64s; ++ u32 offset; ++ u64 seq; ++}; ++ ++/* ++ * For reserving space in the journal prior to getting a reservation on a ++ * particular journal entry: ++ */ ++struct journal_preres { ++ unsigned u64s; ++}; ++ ++union journal_res_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u64 cur_entry_offset:20, ++ idx:1, ++ prev_buf_unwritten:1, ++ buf0_count:21, ++ buf1_count:21; ++ }; ++}; ++ ++union journal_preres_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u32 reserved; ++ u32 remaining; ++ }; ++}; ++ ++/* bytes: */ ++#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ ++#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ ++ ++/* ++ * We stash some journal state as sentinal values in cur_entry_offset: ++ * note - cur_entry_offset is in units of u64s ++ */ ++#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) ++ ++#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) ++#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) ++ ++/* ++ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, ++ * either because something's waiting on the write to complete or because it's ++ * been dirty too long and the timer's expired. ++ */ ++ ++enum { ++ JOURNAL_REPLAY_DONE, ++ JOURNAL_STARTED, ++ JOURNAL_RECLAIM_STARTED, ++ JOURNAL_NEED_WRITE, ++ JOURNAL_NOT_EMPTY, ++ JOURNAL_MAY_GET_UNRESERVED, ++}; ++ ++/* Embedded in struct bch_fs */ ++struct journal { ++ /* Fastpath stuff up front: */ ++ ++ unsigned long flags; ++ ++ union journal_res_state reservations; ++ ++ /* Max size of current journal entry */ ++ unsigned cur_entry_u64s; ++ unsigned cur_entry_sectors; ++ ++ /* ++ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if ++ * insufficient devices: ++ */ ++ int cur_entry_error; ++ ++ union journal_preres_state prereserved; ++ ++ /* Reserved space in journal entry to be used just prior to write */ ++ unsigned entry_u64s_reserved; ++ ++ unsigned buf_size_want; ++ ++ /* ++ * Two journal entries -- one is currently open for new entries, the ++ * other is possibly being written out. ++ */ ++ struct journal_buf buf[2]; ++ ++ spinlock_t lock; ++ ++ /* if nonzero, we may not open a new journal entry: */ ++ unsigned blocked; ++ ++ /* Used when waiting because the journal was full */ ++ wait_queue_head_t wait; ++ struct closure_waitlist async_wait; ++ struct closure_waitlist preres_wait; ++ ++ struct closure io; ++ struct delayed_work write_work; ++ ++ /* Sequence number of most recent journal entry (last entry in @pin) */ ++ atomic64_t seq; ++ ++ /* seq, last_seq from the most recent journal entry successfully written */ ++ u64 seq_ondisk; ++ u64 last_seq_ondisk; ++ ++ /* ++ * FIFO of journal entries whose btree updates have not yet been ++ * written out. ++ * ++ * Each entry is a reference count. The position in the FIFO is the ++ * entry's sequence number relative to @seq. ++ * ++ * The journal entry itself holds a reference count, put when the ++ * journal entry is written out. 
Each btree node modified by the journal ++ * entry also holds a reference count, put when the btree node is ++ * written. ++ * ++ * When a reference count reaches zero, the journal entry is no longer ++ * needed. When all journal entries in the oldest journal bucket are no ++ * longer needed, the bucket can be discarded and reused. ++ */ ++ struct { ++ u64 front, back, size, mask; ++ struct journal_entry_pin_list *data; ++ } pin; ++ ++ u64 replay_journal_seq; ++ u64 replay_journal_seq_end; ++ ++ struct write_point wp; ++ spinlock_t err_lock; ++ ++ struct delayed_work reclaim_work; ++ struct mutex reclaim_lock; ++ unsigned long last_flushed; ++ struct journal_entry_pin *flush_in_progress; ++ wait_queue_head_t pin_flush_wait; ++ ++ /* protects advancing ja->discard_idx: */ ++ struct mutex discard_lock; ++ bool can_discard; ++ ++ unsigned write_delay_ms; ++ unsigned reclaim_delay_ms; ++ ++ u64 res_get_blocked_start; ++ u64 need_write_time; ++ u64 write_start_time; ++ ++ struct time_stats *write_time; ++ struct time_stats *delay_time; ++ struct time_stats *blocked_time; ++ struct time_stats *flush_seq_time; ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map res_map; ++#endif ++}; ++ ++/* ++ * Embedded in struct bch_dev. First three fields refer to the array of journal ++ * buckets, in bch_sb. ++ */ ++struct journal_device { ++ /* ++ * For each journal bucket, contains the max sequence number of the ++ * journal writes it contains - so we know when a bucket can be reused. ++ */ ++ u64 *bucket_seq; ++ ++ unsigned sectors_free; ++ ++ /* ++ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: ++ */ ++ unsigned discard_idx; /* Next bucket to discard */ ++ unsigned dirty_idx_ondisk; ++ unsigned dirty_idx; ++ unsigned cur_idx; /* Journal bucket we're currently writing to */ ++ unsigned nr; ++ ++ u64 *buckets; ++ ++ /* Bio for journal reads/writes to this device */ ++ struct bio *bio; ++ ++ /* for bch_journal_read_device */ ++ struct closure read; ++}; ++ ++/* ++ * journal_entry_res - reserve space in every journal entry: ++ */ ++struct journal_entry_res { ++ unsigned u64s; ++}; ++ ++#endif /* _BCACHEFS_JOURNAL_TYPES_H */ +diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c +new file mode 100644 +index 000000000000..864dfaa67b7a +--- /dev/null ++++ b/fs/bcachefs/keylist.c +@@ -0,0 +1,67 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "keylist.h" ++ ++int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, ++ size_t nr_inline_u64s, size_t new_u64s) ++{ ++ size_t oldsize = bch2_keylist_u64s(l); ++ size_t newsize = oldsize + new_u64s; ++ u64 *old_buf = l->keys_p == inline_u64s ? 
NULL : l->keys_p; ++ u64 *new_keys; ++ ++ newsize = roundup_pow_of_two(newsize); ++ ++ if (newsize <= nr_inline_u64s || ++ (old_buf && roundup_pow_of_two(oldsize) == newsize)) ++ return 0; ++ ++ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); ++ if (!new_keys) ++ return -ENOMEM; ++ ++ if (!old_buf) ++ memcpy_u64s(new_keys, inline_u64s, oldsize); ++ ++ l->keys_p = new_keys; ++ l->top_p = new_keys + oldsize; ++ ++ return 0; ++} ++ ++void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) ++{ ++ struct bkey_i *where; ++ ++ for_each_keylist_key(l, where) ++ if (bkey_cmp(insert->k.p, where->k.p) < 0) ++ break; ++ ++ memmove_u64s_up((u64 *) where + insert->k.u64s, ++ where, ++ ((u64 *) l->top) - ((u64 *) where)); ++ ++ l->top_p += insert->k.u64s; ++ bkey_copy(where, insert); ++} ++ ++void bch2_keylist_pop_front(struct keylist *l) ++{ ++ l->top_p -= bch2_keylist_front(l)->k.u64s; ++ ++ memmove_u64s_down(l->keys, ++ bkey_next(l->keys), ++ bch2_keylist_u64s(l)); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *l) ++{ ++ struct bkey_i *k; ++ ++ for_each_keylist_key(l, k) ++ BUG_ON(bkey_next(k) != l->top && ++ bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); ++} ++#endif +diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h +new file mode 100644 +index 000000000000..195799bb20bc +--- /dev/null ++++ b/fs/bcachefs/keylist.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_H ++#define _BCACHEFS_KEYLIST_H ++ ++#include "keylist_types.h" ++ ++int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); ++void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); ++void bch2_keylist_pop_front(struct keylist *); ++ ++static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) ++{ ++ l->top_p = l->keys_p = inline_keys; ++} ++ ++static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) ++{ ++ if (l->keys_p != inline_keys) ++ kfree(l->keys_p); ++ bch2_keylist_init(l, inline_keys); ++} ++ ++static inline void bch2_keylist_push(struct keylist *l) ++{ ++ l->top = bkey_next(l->top); ++} ++ ++static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) ++{ ++ bkey_copy(l->top, k); ++ bch2_keylist_push(l); ++} ++ ++static inline bool bch2_keylist_empty(struct keylist *l) ++{ ++ return l->top == l->keys; ++} ++ ++static inline size_t bch2_keylist_u64s(struct keylist *l) ++{ ++ return l->top_p - l->keys_p; ++} ++ ++static inline size_t bch2_keylist_bytes(struct keylist *l) ++{ ++ return bch2_keylist_u64s(l) * sizeof(u64); ++} ++ ++static inline struct bkey_i *bch2_keylist_front(struct keylist *l) ++{ ++ return l->keys; ++} ++ ++#define for_each_keylist_key(_keylist, _k) \ ++ for (_k = (_keylist)->keys; \ ++ _k != (_keylist)->top; \ ++ _k = bkey_next(_k)) ++ ++static inline u64 keylist_sectors(struct keylist *keys) ++{ ++ struct bkey_i *k; ++ u64 ret = 0; ++ ++ for_each_keylist_key(keys, k) ++ ret += k->k.size; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *); ++#else ++static inline void bch2_verify_keylist_sorted(struct keylist *l) {} ++#endif ++ ++#endif /* _BCACHEFS_KEYLIST_H */ +diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h +new file mode 100644 +index 000000000000..4b3ff7d8a875 +--- /dev/null ++++ b/fs/bcachefs/keylist_types.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_TYPES_H ++#define _BCACHEFS_KEYLIST_TYPES_H ++ 
++struct keylist { ++ union { ++ struct bkey_i *keys; ++ u64 *keys_p; ++ }; ++ union { ++ struct bkey_i *top; ++ u64 *top_p; ++ }; ++}; ++ ++#endif /* _BCACHEFS_KEYLIST_TYPES_H */ +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +new file mode 100644 +index 000000000000..96c8690adc5b +--- /dev/null ++++ b/fs/bcachefs/migrate.c +@@ -0,0 +1,170 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for moving data off a device. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "extents.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "migrate.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, ++ unsigned dev_idx, int flags, bool metadata) ++{ ++ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; ++ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; ++ unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; ++ unsigned nr_good; ++ ++ bch2_bkey_drop_device(k, dev_idx); ++ ++ nr_good = bch2_bkey_durability(c, k.s_c); ++ if ((!nr_good && !(flags & lost)) || ++ (nr_good < replicas && !(flags & degraded))) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, ++ enum btree_id btree_id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_on_stack sk; ++ int ret = 0; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k))) { ++ if (!bch2_bkey_has_device(k, dev_idx)) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), ++ dev_idx, flags, false); ++ if (ret) ++ break; ++ ++ /* ++ * If the new extent no longer has any pointers, bch2_extent_normalize() ++ * will do the appropriate thing with it (turning it into a ++ * KEY_TYPE_error key, or just a discard if it was a cached extent) ++ */ ++ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); ++ ++ bch2_trans_update(&trans, iter, sk.k, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++ ++ /* ++ * don't want to leave ret == -EINTR, since if we raced and ++ * something else overwrote the key we could spuriously return ++ * -EINTR below: ++ */ ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&sk, c); ++ ++ BUG_ON(ret == -EINTR); ++ ++ return ret; ++} ++ ++static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: ++ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); ++} ++ ++static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct closure cl; ++ struct btree *b; ++ unsigned id; ++ int ret; ++ ++ /* don't handle this yet: */ ++ if (flags & BCH_FORCE_IF_METADATA_LOST) ++ return -EINVAL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ closure_init_stack(&cl); ++ ++ for (id = 0; id < 
BTREE_ID_NR; id++) { ++ for_each_btree_node(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH, b) { ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++retry: ++ if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), ++ dev_idx)) ++ continue; ++ ++ bkey_copy(&tmp.k, &b->key); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), ++ dev_idx, flags, true); ++ if (ret) { ++ bch_err(c, "Cannot drop device without losing data"); ++ goto err; ++ } ++ ++ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); ++ if (ret == -EINTR) { ++ b = bch2_btree_iter_peek_node(iter); ++ goto retry; ++ } ++ if (ret) { ++ bch_err(c, "Error updating btree node key: %i", ret); ++ goto err; ++ } ++ } ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ /* flush relevant btree updates */ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ ++ ret = 0; ++err: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ BUG_ON(ret == -EINTR); ++ ++ return ret; ++} ++ ++int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: ++ bch2_dev_metadata_drop(c, dev_idx, flags); ++} +diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h +new file mode 100644 +index 000000000000..027efaa0d575 +--- /dev/null ++++ b/fs/bcachefs/migrate.h +@@ -0,0 +1,7 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MIGRATE_H ++#define _BCACHEFS_MIGRATE_H ++ ++int bch2_dev_data_drop(struct bch_fs *, unsigned, int); ++ ++#endif /* _BCACHEFS_MIGRATE_H */ +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +new file mode 100644 +index 000000000000..b42350f9e9fb +--- /dev/null ++++ b/fs/bcachefs/move.c +@@ -0,0 +1,815 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "inode.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "keylist.h" ++ ++#include ++#include ++ ++#include ++ ++#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 ++ ++struct moving_io { ++ struct list_head list; ++ struct closure cl; ++ bool read_completed; ++ ++ unsigned read_sectors; ++ unsigned write_sectors; ++ ++ struct bch_read_bio rbio; ++ ++ struct migrate_write write; ++ /* Must be last since it is variable size */ ++ struct bio_vec bi_inline_vecs[0]; ++}; ++ ++struct moving_context { ++ /* Closure for waiting on all reads and writes to complete */ ++ struct closure cl; ++ ++ struct bch_move_stats *stats; ++ ++ struct list_head reads; ++ ++ /* in flight sectors: */ ++ atomic_t read_sectors; ++ atomic_t write_sectors; ++ ++ wait_queue_head_t wait; ++}; ++ ++static int bch2_migrate_index_update(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct migrate_write *m = ++ container_of(op, struct migrate_write, op); ++ struct keylist *keys = &op->insert_keys; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, m->btree_id, ++ bkey_start_pos(&bch2_keylist_front(keys)->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ while (1) { ++ struct bkey_s_c k; ++ struct bkey_i *insert; ++ struct bkey_i_extent *new; ++ BKEY_PADDED(k) _new, _insert; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ bool did_work = false; ++ int 
nr; ++ ++ bch2_trans_reset(&trans, 0); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ if (ret == -EINTR) ++ continue; ++ break; ++ } ++ ++ new = bkey_i_to_extent(bch2_keylist_front(keys)); ++ ++ if (bversion_cmp(k.k->version, new->k.version) || ++ !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) ++ goto nomatch; ++ ++ if (m->data_cmd == DATA_REWRITE && ++ !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) ++ goto nomatch; ++ ++ bkey_reassemble(&_insert.k, k); ++ insert = &_insert.k; ++ ++ bkey_copy(&_new.k, bch2_keylist_front(keys)); ++ new = bkey_i_to_extent(&_new.k); ++ bch2_cut_front(iter->pos, &new->k_i); ++ ++ bch2_cut_front(iter->pos, insert); ++ bch2_cut_back(new->k.p, insert); ++ bch2_cut_back(insert->k.p, &new->k_i); ++ ++ if (m->data_cmd == DATA_REWRITE) ++ bch2_bkey_drop_device(bkey_i_to_s(insert), ++ m->data_opts.rewrite_dev); ++ ++ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { ++ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { ++ /* ++ * raced with another move op? extent already ++ * has a pointer to the device we just wrote ++ * data to ++ */ ++ continue; ++ } ++ ++ bch2_extent_ptr_decoded_append(insert, &p); ++ did_work = true; ++ } ++ ++ if (!did_work) ++ goto nomatch; ++ ++ bch2_bkey_narrow_crcs(insert, ++ (struct bch_extent_crc_unpacked) { 0 }); ++ bch2_extent_normalize(c, bkey_i_to_s(insert)); ++ bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), ++ op->opts.background_target, ++ op->opts.data_replicas); ++ ++ /* ++ * If we're not fully overwriting @k, and it's compressed, we ++ * need a reservation for all the pointers in @insert ++ */ ++ nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - ++ m->nr_ptrs_reserved; ++ ++ if (insert->k.size < k.k->size && ++ bch2_bkey_sectors_compressed(k) && ++ nr > 0) { ++ ret = bch2_disk_reservation_add(c, &op->res, ++ keylist_sectors(keys) * nr, 0); ++ if (ret) ++ goto out; ++ ++ m->nr_ptrs_reserved += nr; ++ goto next; ++ } ++ ++ bch2_trans_update(&trans, iter, insert, 0); ++ ++ ret = bch2_trans_commit(&trans, &op->res, ++ op_journal_seq(op), ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ m->data_opts.btree_insert_flags); ++ if (!ret) ++ atomic_long_inc(&c->extent_migrate_done); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++next: ++ while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { ++ bch2_keylist_pop_front(keys); ++ if (bch2_keylist_empty(keys)) ++ goto out; ++ } ++ continue; ++nomatch: ++ if (m->ctxt) { ++ BUG_ON(k.k->p.offset <= iter->pos.offset); ++ atomic64_inc(&m->ctxt->stats->keys_raced); ++ atomic64_add(k.k->p.offset - iter->pos.offset, ++ &m->ctxt->stats->sectors_raced); ++ } ++ atomic_long_inc(&c->extent_migrate_raced); ++ trace_move_race(&new->k); ++ bch2_btree_iter_next_slot(iter); ++ goto next; ++ } ++out: ++ bch2_trans_exit(&trans); ++ BUG_ON(ret == -EINTR); ++ return ret; ++} ++ ++void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) ++{ ++ /* write bio must own pages: */ ++ BUG_ON(!m->op.wbio.bio.bi_vcnt); ++ ++ m->ptr = rbio->pick.ptr; ++ m->offset = rbio->pos.offset - rbio->pick.crc.offset; ++ m->op.devs_have = rbio->devs_have; ++ m->op.pos = rbio->pos; ++ m->op.version = rbio->version; ++ m->op.crc = rbio->pick.crc; ++ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; ++ ++ if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { ++ m->op.nonce = m->op.crc.nonce + m->op.crc.offset; ++ m->op.csum_type = m->op.crc.csum_type; ++ } ++ ++ if (m->data_cmd == DATA_REWRITE) 
++ bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); ++} ++ ++int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, ++ struct write_point_specifier wp, ++ struct bch_io_opts io_opts, ++ enum data_cmd data_cmd, ++ struct data_opts data_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int ret; ++ ++ m->btree_id = btree_id; ++ m->data_cmd = data_cmd; ++ m->data_opts = data_opts; ++ m->nr_ptrs_reserved = 0; ++ ++ bch2_write_op_init(&m->op, c, io_opts); ++ ++ if (!bch2_bkey_is_incompressible(k)) ++ m->op.compression_type = ++ bch2_compression_opt_to_type[io_opts.background_compression ?: ++ io_opts.compression]; ++ else ++ m->op.incompressible = true; ++ ++ m->op.target = data_opts.target, ++ m->op.write_point = wp; ++ ++ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) ++ m->op.alloc_reserve = RESERVE_MOVINGGC; ++ ++ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS| ++ BCH_WRITE_PAGES_STABLE| ++ BCH_WRITE_PAGES_OWNED| ++ BCH_WRITE_DATA_ENCODED| ++ BCH_WRITE_FROM_INTERNAL; ++ ++ m->op.nr_replicas = 1; ++ m->op.nr_replicas_required = 1; ++ m->op.index_update_fn = bch2_migrate_index_update; ++ ++ switch (data_cmd) { ++ case DATA_ADD_REPLICAS: { ++ /* ++ * DATA_ADD_REPLICAS is used for moving data to a different ++ * device in the background, and due to compression the new copy ++ * might take up more space than the old copy: ++ */ ++#if 0 ++ int nr = (int) io_opts.data_replicas - ++ bch2_bkey_nr_ptrs_allocated(k); ++#endif ++ int nr = (int) io_opts.data_replicas; ++ ++ if (nr > 0) { ++ m->op.nr_replicas = m->nr_ptrs_reserved = nr; ++ ++ ret = bch2_disk_reservation_get(c, &m->op.res, ++ k.k->size, m->op.nr_replicas, 0); ++ if (ret) ++ return ret; ++ } ++ break; ++ } ++ case DATA_REWRITE: { ++ unsigned compressed_sectors = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ crc_is_compressed(p.crc) && ++ bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) ++ compressed_sectors += p.crc.compressed_size; ++ ++ if (compressed_sectors) { ++ ret = bch2_disk_reservation_add(c, &m->op.res, ++ compressed_sectors, ++ BCH_DISK_RESERVATION_NOFAIL); ++ if (ret) ++ return ret; ++ } ++ break; ++ } ++ case DATA_PROMOTE: ++ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; ++ m->op.flags |= BCH_WRITE_CACHED; ++ break; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++static void move_free(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ struct moving_context *ctxt = io->write.ctxt; ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); ++ ++ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) ++ if (bv->bv_page) ++ __free_page(bv->bv_page); ++ ++ wake_up(&ctxt->wait); ++ ++ kfree(io); ++} ++ ++static void move_write_done(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ ++ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); ++ closure_return_with_destructor(cl, move_free); ++} ++ ++static void move_write(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ ++ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { ++ closure_return_with_destructor(cl, move_free); ++ return; ++ } ++ ++ bch2_migrate_read_done(&io->write, &io->rbio); ++ ++ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); ++ 
closure_call(&io->write.op.cl, bch2_write, NULL, cl); ++ continue_at(cl, move_write_done, NULL); ++} ++ ++static inline struct moving_io *next_pending_write(struct moving_context *ctxt) ++{ ++ struct moving_io *io = ++ list_first_entry_or_null(&ctxt->reads, struct moving_io, list); ++ ++ return io && io->read_completed ? io : NULL; ++} ++ ++static void move_read_endio(struct bio *bio) ++{ ++ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); ++ struct moving_context *ctxt = io->write.ctxt; ++ ++ atomic_sub(io->read_sectors, &ctxt->read_sectors); ++ io->read_completed = true; ++ ++ if (next_pending_write(ctxt)) ++ wake_up(&ctxt->wait); ++ ++ closure_put(&ctxt->cl); ++} ++ ++static void do_pending_writes(struct moving_context *ctxt) ++{ ++ struct moving_io *io; ++ ++ while ((io = next_pending_write(ctxt))) { ++ list_del(&io->list); ++ closure_call(&io->cl, move_write, NULL, &ctxt->cl); ++ } ++} ++ ++#define move_ctxt_wait_event(_ctxt, _cond) \ ++do { \ ++ do_pending_writes(_ctxt); \ ++ \ ++ if (_cond) \ ++ break; \ ++ __wait_event((_ctxt)->wait, \ ++ next_pending_write(_ctxt) || (_cond)); \ ++} while (1) ++ ++static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) ++{ ++ unsigned sectors_pending = atomic_read(&ctxt->write_sectors); ++ ++ move_ctxt_wait_event(ctxt, ++ !atomic_read(&ctxt->write_sectors) || ++ atomic_read(&ctxt->write_sectors) != sectors_pending); ++} ++ ++static int bch2_move_extent(struct bch_fs *c, ++ struct moving_context *ctxt, ++ struct write_point_specifier wp, ++ struct bch_io_opts io_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ enum data_cmd data_cmd, ++ struct data_opts data_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct moving_io *io; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned sectors = k.k->size, pages; ++ int ret = -ENOMEM; ++ ++ move_ctxt_wait_event(ctxt, ++ atomic_read(&ctxt->write_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ move_ctxt_wait_event(ctxt, ++ atomic_read(&ctxt->read_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ /* write path might have to decompress data: */ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); ++ ++ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ io = kzalloc(sizeof(struct moving_io) + ++ sizeof(struct bio_vec) * pages, GFP_KERNEL); ++ if (!io) ++ goto err; ++ ++ io->write.ctxt = ctxt; ++ io->read_sectors = k.k->size; ++ io->write_sectors = k.k->size; ++ ++ bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); ++ bio_set_prio(&io->write.op.wbio.bio, ++ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ ++ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, ++ GFP_KERNEL)) ++ goto err_free; ++ ++ io->rbio.c = c; ++ io->rbio.opts = io_opts; ++ bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); ++ io->rbio.bio.bi_vcnt = pages; ++ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ io->rbio.bio.bi_iter.bi_size = sectors << 9; ++ ++ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); ++ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); ++ io->rbio.bio.bi_end_io = move_read_endio; ++ ++ ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, ++ data_cmd, data_opts, btree_id, k); ++ if (ret) ++ goto err_free_pages; ++ ++ atomic64_inc(&ctxt->stats->keys_moved); ++ atomic64_add(k.k->size, &ctxt->stats->sectors_moved); ++ ++ trace_move_extent(k.k); ++ ++ atomic_add(io->read_sectors, &ctxt->read_sectors); ++ 
list_add_tail(&io->list, &ctxt->reads); ++ ++ /* ++ * dropped by move_read_endio() - guards against use after free of ++ * ctxt when doing wakeup ++ */ ++ closure_get(&ctxt->cl); ++ bch2_read_extent(c, &io->rbio, k, 0, ++ BCH_READ_NODECODE| ++ BCH_READ_LAST_FRAGMENT); ++ return 0; ++err_free_pages: ++ bio_free_pages(&io->write.op.wbio.bio); ++err_free: ++ kfree(io); ++err: ++ trace_move_alloc_fail(k.k); ++ return ret; ++} ++ ++static int __bch2_move_data(struct bch_fs *c, ++ struct moving_context *ctxt, ++ struct bch_ratelimit *rate, ++ struct write_point_specifier wp, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ struct bch_move_stats *stats, ++ enum btree_id btree_id) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct bkey_on_stack sk; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct data_opts data_opts; ++ enum data_cmd data_cmd; ++ u64 delay, cur_inum = U64_MAX; ++ int ret = 0, ret2; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ stats->data_type = BCH_DATA_USER; ++ stats->btree_id = btree_id; ++ stats->pos = POS_MIN; ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, start, ++ BTREE_ITER_PREFETCH); ++ ++ if (rate) ++ bch2_ratelimit_reset(rate); ++ ++ while (1) { ++ do { ++ delay = rate ? bch2_ratelimit_delay(rate) : 0; ++ ++ if (delay) { ++ bch2_trans_unlock(&trans); ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ ++ if (kthread && (ret = kthread_should_stop())) { ++ __set_current_state(TASK_RUNNING); ++ goto out; ++ } ++ ++ if (delay) ++ schedule_timeout(delay); ++ ++ if (unlikely(freezing(current))) { ++ bch2_trans_unlock(&trans); ++ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); ++ try_to_freeze(); ++ } ++ } while (delay); ++peek: ++ k = bch2_btree_iter_peek(iter); ++ ++ stats->pos = iter->pos; ++ ++ if (!k.k) ++ break; ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (!bkey_extent_is_direct_data(k.k)) ++ goto next_nondata; ++ ++ if (btree_id == BTREE_ID_EXTENTS && ++ cur_inum != k.k->p.inode) { ++ struct bch_inode_unpacked inode; ++ ++ /* don't hold btree locks while looking up inode: */ ++ bch2_trans_unlock(&trans); ++ ++ io_opts = bch2_opts_to_inode_opts(c->opts); ++ if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) ++ bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); ++ cur_inum = k.k->p.inode; ++ goto peek; ++ } ++ ++ switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { ++ case DATA_SKIP: ++ goto next; ++ case DATA_SCRUB: ++ BUG(); ++ case DATA_ADD_REPLICAS: ++ case DATA_REWRITE: ++ case DATA_PROMOTE: ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* unlock before doing IO: */ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ bch2_trans_unlock(&trans); ++ ++ ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, ++ data_cmd, data_opts); ++ if (ret2) { ++ if (ret2 == -ENOMEM) { ++ /* memory allocation failure, wait for some IO to finish */ ++ bch2_move_ctxt_wait_for_io(ctxt); ++ continue; ++ } ++ ++ /* XXX signal failure */ ++ goto next; ++ } ++ ++ if (rate) ++ bch2_ratelimit_increment(rate, k.k->size); ++next: ++ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), ++ &stats->sectors_seen); ++next_nondata: ++ bch2_btree_iter_next(iter); ++ bch2_trans_cond_resched(&trans); ++ } ++out: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&sk, c); ++ ++ return ret; ++} ++ 
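++/*
++ * bch2_move_data() sets up a moving_context on the stack, runs
++ * __bch2_move_data() over the extents btree and then the reflink btree,
++ * waits for all outstanding reads and writes to drain, and records the
++ * totals via trace_move_data(). Callers in this patch include
++ * bch2_data_job() below and the copygc code in movinggc.c.
++ */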
++int bch2_move_data(struct bch_fs *c, ++ struct bch_ratelimit *rate, ++ struct write_point_specifier wp, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ struct bch_move_stats *stats) ++{ ++ struct moving_context ctxt = { .stats = stats }; ++ int ret; ++ ++ closure_init_stack(&ctxt.cl); ++ INIT_LIST_HEAD(&ctxt.reads); ++ init_waitqueue_head(&ctxt.wait); ++ ++ stats->data_type = BCH_DATA_USER; ++ ++ ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, ++ pred, arg, stats, BTREE_ID_EXTENTS) ?: ++ __bch2_move_data(c, &ctxt, rate, wp, start, end, ++ pred, arg, stats, BTREE_ID_REFLINK); ++ ++ move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); ++ closure_sync(&ctxt.cl); ++ ++ EBUG_ON(atomic_read(&ctxt.write_sectors)); ++ ++ trace_move_data(c, ++ atomic64_read(&stats->sectors_moved), ++ atomic64_read(&stats->keys_moved)); ++ ++ return ret; ++} ++ ++static int bch2_move_btree(struct bch_fs *c, ++ move_pred_fn pred, ++ void *arg, ++ struct bch_move_stats *stats) ++{ ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned id; ++ struct data_opts data_opts; ++ enum data_cmd cmd; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ stats->data_type = BCH_DATA_BTREE; ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ stats->btree_id = id; ++ ++ for_each_btree_node(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH, b) { ++ stats->pos = iter->pos; ++ ++ switch ((cmd = pred(c, arg, ++ bkey_i_to_s_c(&b->key), ++ &io_opts, &data_opts))) { ++ case DATA_SKIP: ++ goto next; ++ case DATA_SCRUB: ++ BUG(); ++ case DATA_ADD_REPLICAS: ++ case DATA_REWRITE: ++ break; ++ default: ++ BUG(); ++ } ++ ++ ret = bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, 0) ?: ret; ++next: ++ bch2_trans_cond_resched(&trans); ++ } ++ ++ ret = bch2_trans_iter_free(&trans, iter) ?: ret; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++#if 0 ++static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ return DATA_SCRUB; ++} ++#endif ++ ++static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ unsigned nr_good = bch2_bkey_durability(c, k); ++ unsigned replicas = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ replicas = c->opts.metadata_replicas; ++ break; ++ case KEY_TYPE_extent: ++ replicas = io_opts->data_replicas; ++ break; ++ } ++ ++ if (!nr_good || nr_good >= replicas) ++ return DATA_SKIP; ++ ++ data_opts->target = 0; ++ data_opts->btree_insert_flags = 0; ++ return DATA_ADD_REPLICAS; ++} ++ ++static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ struct bch_ioctl_data *op = arg; ++ ++ if (!bch2_bkey_has_device(k, op->migrate.dev)) ++ return DATA_SKIP; ++ ++ data_opts->target = 0; ++ data_opts->btree_insert_flags = 0; ++ data_opts->rewrite_dev = op->migrate.dev; ++ return DATA_REWRITE; ++} ++ ++int bch2_data_job(struct bch_fs *c, ++ struct bch_move_stats *stats, ++ struct bch_ioctl_data op) ++{ ++ int ret = 0; ++ ++ switch (op.op) { ++ case BCH_DATA_OP_REREPLICATE: ++ stats->data_type = BCH_DATA_JOURNAL; ++ ret = bch2_journal_flush_device_pins(&c->journal, -1); ++ ++ ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; ++ ++ 
closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, NULL, ++ writepoint_hashed((unsigned long) current), ++ op.start, ++ op.end, ++ rereplicate_pred, c, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ case BCH_DATA_OP_MIGRATE: ++ if (op.migrate.dev >= c->sb.nr_devices) ++ return -EINVAL; ++ ++ stats->data_type = BCH_DATA_JOURNAL; ++ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ++ ++ ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, NULL, ++ writepoint_hashed((unsigned long) current), ++ op.start, ++ op.end, ++ migrate_pred, &op, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +new file mode 100644 +index 000000000000..0acd1720d4f8 +--- /dev/null ++++ b/fs/bcachefs/move.h +@@ -0,0 +1,64 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_H ++#define _BCACHEFS_MOVE_H ++ ++#include "btree_iter.h" ++#include "buckets.h" ++#include "io_types.h" ++#include "move_types.h" ++ ++struct bch_read_bio; ++struct moving_context; ++ ++enum data_cmd { ++ DATA_SKIP, ++ DATA_SCRUB, ++ DATA_ADD_REPLICAS, ++ DATA_REWRITE, ++ DATA_PROMOTE, ++}; ++ ++struct data_opts { ++ u16 target; ++ unsigned rewrite_dev; ++ int btree_insert_flags; ++}; ++ ++struct migrate_write { ++ enum btree_id btree_id; ++ enum data_cmd data_cmd; ++ struct data_opts data_opts; ++ ++ unsigned nr_ptrs_reserved; ++ ++ struct moving_context *ctxt; ++ ++ /* what we read: */ ++ struct bch_extent_ptr ptr; ++ u64 offset; ++ ++ struct bch_write_op op; ++}; ++ ++void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); ++int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, ++ struct write_point_specifier, ++ struct bch_io_opts, ++ enum data_cmd, struct data_opts, ++ enum btree_id, struct bkey_s_c); ++ ++typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, ++ struct bkey_s_c, ++ struct bch_io_opts *, struct data_opts *); ++ ++int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, ++ struct write_point_specifier, ++ struct bpos, struct bpos, ++ move_pred_fn, void *, ++ struct bch_move_stats *); ++ ++int bch2_data_job(struct bch_fs *, ++ struct bch_move_stats *, ++ struct bch_ioctl_data); ++ ++#endif /* _BCACHEFS_MOVE_H */ +diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h +new file mode 100644 +index 000000000000..fc0de165af9f +--- /dev/null ++++ b/fs/bcachefs/move_types.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_TYPES_H ++#define _BCACHEFS_MOVE_TYPES_H ++ ++struct bch_move_stats { ++ enum bch_data_type data_type; ++ enum btree_id btree_id; ++ struct bpos pos; ++ ++ atomic64_t keys_moved; ++ atomic64_t keys_raced; ++ atomic64_t sectors_moved; ++ atomic64_t sectors_seen; ++ atomic64_t sectors_raced; ++}; ++ ++#endif /* _BCACHEFS_MOVE_TYPES_H */ +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +new file mode 100644 +index 000000000000..0a87cd7405dd +--- /dev/null ++++ b/fs/bcachefs/movinggc.c +@@ -0,0 +1,322 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Moving/copying garbage collector ++ * ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "extents.h" ++#include "eytzinger.h" ++#include "io.h" ++#include "keylist.h" ++#include "move.h" ++#include "movinggc.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * We can't use the entire copygc reserve in one iteration of copygc: we may ++ * need the buckets we're freeing up to go back into the copygc reserve to make ++ * forward progress, but if the copygc reserve is full they'll be available for ++ * any allocation - and it's possible that in a given iteration, we free up most ++ * of the buckets we're going to free before we allocate most of the buckets ++ * we're going to allocate. ++ * ++ * If we only use half of the reserve per iteration, then in steady state we'll ++ * always have room in the reserve for the buckets we're going to need in the ++ * next iteration: ++ */ ++#define COPYGC_BUCKETS_PER_ITER(ca) \ ++ ((ca)->free[RESERVE_MOVINGGC].size / 2) ++ ++/* ++ * Max sectors to move per iteration: Have to take into account internal ++ * fragmentation from the multiple write points for each generation: ++ */ ++#define COPYGC_SECTORS_PER_ITER(ca) \ ++ ((ca)->mi.bucket_size * COPYGC_BUCKETS_PER_ITER(ca)) ++ ++static inline int sectors_used_cmp(copygc_heap *heap, ++ struct copygc_heap_entry l, ++ struct copygc_heap_entry r) ++{ ++ return cmp_int(l.sectors, r.sectors); ++} ++ ++static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) ++{ ++ const struct copygc_heap_entry *l = _l; ++ const struct copygc_heap_entry *r = _r; ++ ++ return cmp_int(l->offset, r->offset); ++} ++ ++static bool __copygc_pred(struct bch_dev *ca, ++ struct bkey_s_c k) ++{ ++ copygc_heap *h = &ca->copygc_heap; ++ const struct bch_extent_ptr *ptr = ++ bch2_bkey_has_device(k, ca->dev_idx); ++ ++ if (ptr) { ++ struct copygc_heap_entry search = { .offset = ptr->offset }; ++ ++ ssize_t i = eytzinger0_find_le(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, &search); ++#if 0 ++ /* eytzinger search verify code: */ ++ ssize_t j = -1, k; ++ ++ for (k = 0; k < h->used; k++) ++ if (h->data[k].offset <= ptr->offset && ++ (j < 0 || h->data[k].offset > h->data[j].offset)) ++ j = k; ++ ++ BUG_ON(i != j); ++#endif ++ return (i >= 0 && ++ ptr->offset < h->data[i].offset + ca->mi.bucket_size && ++ ptr->gen == h->data[i].gen); ++ } ++ ++ return false; ++} ++ ++static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ struct bch_dev *ca = arg; ++ ++ if (!__copygc_pred(ca, k)) ++ return DATA_SKIP; ++ ++ data_opts->target = dev_to_target(ca->dev_idx); ++ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; ++ data_opts->rewrite_dev = ca->dev_idx; ++ return DATA_REWRITE; ++} ++ ++static bool have_copygc_reserve(struct bch_dev *ca) ++{ ++ bool ret; ++ ++ spin_lock(&ca->fs->freelist_lock); ++ ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || ++ ca->allocator_state != ALLOCATOR_RUNNING; ++ spin_unlock(&ca->fs->freelist_lock); ++ ++ return ret; ++} ++ ++static void bch2_copygc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ copygc_heap *h = &ca->copygc_heap; ++ struct copygc_heap_entry e, *i; ++ struct bucket_array *buckets; ++ struct bch_move_stats move_stats; ++ u64 sectors_to_move = 0, sectors_not_moved = 0; ++ u64 buckets_to_move, 
buckets_not_moved = 0; ++ size_t b; ++ int ret; ++ ++ memset(&move_stats, 0, sizeof(move_stats)); ++ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); ++ ++ /* ++ * Find buckets with lowest sector counts, skipping completely ++ * empty buckets, by building a maxheap sorted by sector count, ++ * and repeatedly replacing the maximum element until all ++ * buckets have been visited. ++ */ ++ h->used = 0; ++ ++ /* ++ * We need bucket marks to be up to date - gc can't be recalculating ++ * them: ++ */ ++ down_read(&c->gc_lock); ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ struct copygc_heap_entry e; ++ ++ if (m.owned_by_allocator || ++ m.data_type != BCH_DATA_USER || ++ !bucket_sectors_used(m) || ++ bucket_sectors_used(m) >= ca->mi.bucket_size) ++ continue; ++ ++ e = (struct copygc_heap_entry) { ++ .gen = m.gen, ++ .sectors = bucket_sectors_used(m), ++ .offset = bucket_to_sector(ca, b), ++ }; ++ heap_add_or_replace(h, e, -sectors_used_cmp, NULL); ++ } ++ up_read(&ca->bucket_lock); ++ up_read(&c->gc_lock); ++ ++ for (i = h->data; i < h->data + h->used; i++) ++ sectors_to_move += i->sectors; ++ ++ while (sectors_to_move > COPYGC_SECTORS_PER_ITER(ca)) { ++ BUG_ON(!heap_pop(h, e, -sectors_used_cmp, NULL)); ++ sectors_to_move -= e.sectors; ++ } ++ ++ buckets_to_move = h->used; ++ ++ if (!buckets_to_move) ++ return; ++ ++ eytzinger0_sort(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, NULL); ++ ++ ret = bch2_move_data(c, &ca->copygc_pd.rate, ++ writepoint_ptr(&ca->copygc_write_point), ++ POS_MIN, POS_MAX, ++ copygc_pred, ca, ++ &move_stats); ++ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ for (i = h->data; i < h->data + h->used; i++) { ++ size_t b = sector_to_bucket(ca, i->offset); ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ ++ if (i->gen == m.gen && bucket_sectors_used(m)) { ++ sectors_not_moved += bucket_sectors_used(m); ++ buckets_not_moved++; ++ } ++ } ++ up_read(&ca->bucket_lock); ++ ++ if (sectors_not_moved && !ret) ++ bch_warn_ratelimited(c, ++ "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", ++ sectors_not_moved, sectors_to_move, ++ buckets_not_moved, buckets_to_move, ++ atomic64_read(&move_stats.sectors_moved), ++ atomic64_read(&move_stats.keys_raced), ++ atomic64_read(&move_stats.sectors_raced)); ++ ++ trace_copygc(ca, ++ atomic64_read(&move_stats.sectors_moved), sectors_not_moved, ++ buckets_to_move, buckets_not_moved); ++} ++ ++/* ++ * Copygc runs when the amount of fragmented data is above some arbitrary ++ * threshold: ++ * ++ * The threshold at the limit - when the device is full - is the amount of space ++ * we reserved in bch2_recalc_capacity; we can't have more than that amount of ++ * disk space stranded due to fragmentation and store everything we have ++ * promised to store. ++ * ++ * But we don't want to be running copygc unnecessarily when the device still ++ * has plenty of free space - rather, we want copygc to smoothly run every so ++ * often and continually reduce the amount of fragmented space as the device ++ * fills up. So, we increase the threshold by half the current free space. 
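++ *
++ * For example (numbers purely illustrative): with copygc_threshold at
++ * 2M sectors and 8M sectors' worth of free buckets, copygc is allowed
++ * 2M + 8M/2 = 6M sectors of fragmented data. bch2_copygc_wait_amount()
++ * below returns how far below that limit the device currently is, and
++ * the copygc thread runs once that slack drops to at most the IO
++ * clock's max_slop.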
++ */ ++unsigned long bch2_copygc_wait_amount(struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_dev_usage usage = bch2_dev_usage_read(c, ca); ++ u64 fragmented_allowed = ca->copygc_threshold + ++ ((__dev_buckets_available(ca, usage) * ca->mi.bucket_size) >> 1); ++ ++ return max_t(s64, 0, fragmented_allowed - usage.sectors_fragmented); ++} ++ ++static int bch2_copygc_thread(void *arg) ++{ ++ struct bch_dev *ca = arg; ++ struct bch_fs *c = ca->fs; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ unsigned long last, wait; ++ ++ set_freezable(); ++ ++ while (!kthread_should_stop()) { ++ if (kthread_wait_freezable(c->copy_gc_enabled)) ++ break; ++ ++ last = atomic_long_read(&clock->now); ++ wait = bch2_copygc_wait_amount(ca); ++ ++ if (wait > clock->max_slop) { ++ bch2_kthread_io_clock_wait(clock, last + wait, ++ MAX_SCHEDULE_TIMEOUT); ++ continue; ++ } ++ ++ bch2_copygc(c, ca); ++ } ++ ++ return 0; ++} ++ ++void bch2_copygc_stop(struct bch_dev *ca) ++{ ++ ca->copygc_pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&ca->copygc_pd.rate); ++ ++ if (ca->copygc_thread) { ++ kthread_stop(ca->copygc_thread); ++ put_task_struct(ca->copygc_thread); ++ } ++ ca->copygc_thread = NULL; ++} ++ ++int bch2_copygc_start(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct task_struct *t; ++ ++ if (ca->copygc_thread) ++ return 0; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ if (bch2_fs_init_fault("copygc_start")) ++ return -ENOMEM; ++ ++ t = kthread_create(bch2_copygc_thread, ca, ++ "bch_copygc[%s]", ca->name); ++ if (IS_ERR(t)) ++ return PTR_ERR(t); ++ ++ get_task_struct(t); ++ ++ ca->copygc_thread = t; ++ wake_up_process(ca->copygc_thread); ++ ++ return 0; ++} ++ ++void bch2_dev_copygc_init(struct bch_dev *ca) ++{ ++ bch2_pd_controller_init(&ca->copygc_pd); ++ ca->copygc_pd.d_term = 0; ++} +diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h +new file mode 100644 +index 000000000000..dcd479632cf1 +--- /dev/null ++++ b/fs/bcachefs/movinggc.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVINGGC_H ++#define _BCACHEFS_MOVINGGC_H ++ ++void bch2_copygc_stop(struct bch_dev *); ++int bch2_copygc_start(struct bch_fs *, struct bch_dev *); ++void bch2_dev_copygc_init(struct bch_dev *); ++ ++#endif /* _BCACHEFS_MOVINGGC_H */ +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +new file mode 100644 +index 000000000000..94d6c044a27d +--- /dev/null ++++ b/fs/bcachefs/opts.c +@@ -0,0 +1,440 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++ ++#include "bcachefs.h" ++#include "compress.h" ++#include "disk_groups.h" ++#include "opts.h" ++#include "super-io.h" ++#include "util.h" ++ ++const char * const bch2_error_actions[] = { ++ "continue", ++ "remount-ro", ++ "panic", ++ NULL ++}; ++ ++const char * const bch2_sb_features[] = { ++#define x(f, n) #f, ++ BCH_SB_FEATURES() ++#undef x ++ NULL ++}; ++ ++const char * const bch2_csum_opts[] = { ++ "none", ++ "crc32c", ++ "crc64", ++ NULL ++}; ++ ++const char * const bch2_compression_opts[] = { ++#define x(t, n) #t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++ NULL ++}; ++ ++const char * const bch2_str_hash_types[] = { ++ "crc32c", ++ "crc64", ++ "siphash", ++ NULL ++}; ++ ++const char * const bch2_data_types[] = { ++ "none", ++ "sb", ++ "journal", ++ "btree", ++ "data", ++ "cached", ++ NULL ++}; ++ ++const char * const bch2_cache_replacement_policies[] = { ++ "lru", ++ "fifo", ++ "random", ++ NULL ++}; ++ ++/* Default is -1; we skip past it for struct cached_dev's cache mode */ ++const char * const 
bch2_cache_modes[] = { ++ "default", ++ "writethrough", ++ "writeback", ++ "writearound", ++ "none", ++ NULL ++}; ++ ++const char * const bch2_dev_state[] = { ++ "readwrite", ++ "readonly", ++ "failed", ++ "spare", ++ NULL ++}; ++ ++void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) ++{ ++#define x(_name, ...) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ ++ BCH_OPTS() ++#undef x ++} ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opt_defined(*opts, _name); ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opts->_name; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ opt_set(*opts, _name, v); \ ++ break; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Initial options from superblock - here we don't want any options undefined, ++ * any options the superblock doesn't specify are set to 0: ++ */ ++struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ ++#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ ++ if (_sb_opt != NO_SB_OPT) \ ++ opt_set(opts, _name, _sb_opt(sb)); ++ BCH_OPTS() ++#undef x ++ ++ return opts; ++} ++ ++const struct bch_option bch2_opt_table[] = { ++#define OPT_BOOL() .type = BCH_OPT_BOOL ++#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max ++#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max ++#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices ++#define OPT_FN(_fn) .type = BCH_OPT_FN, \ ++ .parse = _fn##_parse, \ ++ .to_text = _fn##_to_text ++ ++#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ ++ [Opt_##_name] = { \ ++ .attr = { \ ++ .name = #_name, \ ++ .mode = (_mode) & OPT_RUNTIME ? 
0644 : 0444, \ ++ }, \ ++ .mode = _mode, \ ++ .hint = _hint, \ ++ .help = _help, \ ++ .set_sb = SET_##_sb_opt, \ ++ _type \ ++ }, ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++int bch2_opt_lookup(const char *name) ++{ ++ const struct bch_option *i; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); ++ i++) ++ if (!strcmp(name, i->attr.name)) ++ return i - bch2_opt_table; ++ ++ return -1; ++} ++ ++struct synonym { ++ const char *s1, *s2; ++}; ++ ++static const struct synonym bch_opt_synonyms[] = { ++ { "quota", "usrquota" }, ++}; ++ ++static int bch2_mount_opt_lookup(const char *name) ++{ ++ const struct synonym *i; ++ ++ for (i = bch_opt_synonyms; ++ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); ++ i++) ++ if (!strcmp(name, i->s1)) ++ name = i->s2; ++ ++ return bch2_opt_lookup(name); ++} ++ ++int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, ++ const char *val, u64 *res) ++{ ++ ssize_t ret; ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ ret = kstrtou64(val, 10, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res > 1) ++ return -ERANGE; ++ break; ++ case BCH_OPT_UINT: ++ ret = kstrtou64(val, 10, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res < opt->min || *res >= opt->max) ++ return -ERANGE; ++ break; ++ case BCH_OPT_SECTORS: ++ ret = bch2_strtou64_h(val, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res & 511) ++ return -EINVAL; ++ ++ *res >>= 9; ++ ++ if (*res < opt->min || *res >= opt->max) ++ return -ERANGE; ++ break; ++ case BCH_OPT_STR: ++ ret = match_string(opt->choices, -1, val); ++ if (ret < 0) ++ return ret; ++ ++ *res = ret; ++ break; ++ case BCH_OPT_FN: ++ if (!c) ++ return -EINVAL; ++ ++ return opt->parse(c, val, res); ++ } ++ ++ return 0; ++} ++ ++void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, ++ const struct bch_option *opt, u64 v, ++ unsigned flags) ++{ ++ if (flags & OPT_SHOW_MOUNT_STYLE) { ++ if (opt->type == BCH_OPT_BOOL) { ++ pr_buf(out, "%s%s", ++ v ? 
"" : "no", ++ opt->attr.name); ++ return; ++ } ++ ++ pr_buf(out, "%s=", opt->attr.name); ++ } ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ case BCH_OPT_UINT: ++ pr_buf(out, "%lli", v); ++ break; ++ case BCH_OPT_SECTORS: ++ bch2_hprint(out, v); ++ break; ++ case BCH_OPT_STR: ++ if (flags & OPT_SHOW_FULL_LIST) ++ bch2_string_opt_to_text(out, opt->choices, v); ++ else ++ pr_buf(out, opt->choices[v]); ++ break; ++ case BCH_OPT_FN: ++ opt->to_text(out, c, v); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) ++{ ++ int ret = 0; ++ ++ switch (id) { ++ case Opt_compression: ++ case Opt_background_compression: ++ ret = bch2_check_set_has_compressed_data(c, v); ++ break; ++ case Opt_erasure_code: ++ if (v) ++ bch2_check_set_feature(c, BCH_FEATURE_ec); ++ break; ++ } ++ ++ return ret; ++} ++ ++int bch2_opts_check_may_set(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ ret = bch2_opt_check_may_set(c, i, ++ bch2_opt_get_by_id(&c->opts, i)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_parse_mount_opts(struct bch_opts *opts, char *options) ++{ ++ char *opt, *name, *val; ++ int ret, id; ++ u64 v; ++ ++ while ((opt = strsep(&options, ",")) != NULL) { ++ name = strsep(&opt, "="); ++ val = opt; ++ ++ if (val) { ++ id = bch2_mount_opt_lookup(name); ++ if (id < 0) ++ goto bad_opt; ++ ++ ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); ++ if (ret < 0) ++ goto bad_val; ++ } else { ++ id = bch2_mount_opt_lookup(name); ++ v = 1; ++ ++ if (id < 0 && ++ !strncmp("no", name, 2)) { ++ id = bch2_mount_opt_lookup(name + 2); ++ v = 0; ++ } ++ ++ if (id < 0) ++ goto bad_opt; ++ ++ if (bch2_opt_table[id].type != BCH_OPT_BOOL) ++ goto no_val; ++ } ++ ++ if (!(bch2_opt_table[id].mode & OPT_MOUNT)) ++ goto bad_opt; ++ ++ if (id == Opt_acl && ++ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) ++ goto bad_opt; ++ ++ if ((id == Opt_usrquota || ++ id == Opt_grpquota) && ++ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) ++ goto bad_opt; ++ ++ bch2_opt_set_by_id(opts, id, v); ++ } ++ ++ return 0; ++bad_opt: ++ pr_err("Bad mount option %s", name); ++ return -1; ++bad_val: ++ pr_err("Invalid value %s for mount option %s", val, name); ++ return -1; ++no_val: ++ pr_err("Mount option %s requires a value", name); ++ return -1; ++} ++ ++/* io opts: */ ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) ++{ ++ struct bch_io_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) ++{ ++ struct bch_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) ++{ ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++} ++ ++bool bch2_opt_is_inode_opt(enum bch_opt_id id) ++{ ++ static const enum bch_opt_id inode_opt_list[] = { ++#define x(_name, _bits) Opt_##_name, ++ BCH_INODE_OPTS() ++#undef x ++ }; ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) ++ if (inode_opt_list[i] == id) ++ return true; ++ ++ return false; ++} +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +new file mode 100644 +index 000000000000..3b051e7a8f1d +--- /dev/null ++++ 
b/fs/bcachefs/opts.h +@@ -0,0 +1,435 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_OPTS_H ++#define _BCACHEFS_OPTS_H ++ ++#include ++#include ++#include ++#include ++#include "bcachefs_format.h" ++ ++extern const char * const bch2_error_actions[]; ++extern const char * const bch2_sb_features[]; ++extern const char * const bch2_csum_opts[]; ++extern const char * const bch2_compression_opts[]; ++extern const char * const bch2_str_hash_types[]; ++extern const char * const bch2_data_types[]; ++extern const char * const bch2_cache_replacement_policies[]; ++extern const char * const bch2_cache_modes[]; ++extern const char * const bch2_dev_state[]; ++ ++/* ++ * Mount options; we also store defaults in the superblock. ++ * ++ * Also exposed via sysfs: if an option is writeable, and it's also stored in ++ * the superblock, changing it via sysfs (currently? might change this) also ++ * updates the superblock. ++ * ++ * We store options as signed integers, where -1 means undefined. This means we ++ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only ++ * apply the options from that struct that are defined. ++ */ ++ ++/* dummy option, for options that aren't stored in the superblock */ ++LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); ++ ++/* When can be set: */ ++enum opt_mode { ++ OPT_FORMAT = (1 << 0), ++ OPT_MOUNT = (1 << 1), ++ OPT_RUNTIME = (1 << 2), ++ OPT_INODE = (1 << 3), ++ OPT_DEVICE = (1 << 4), ++}; ++ ++enum opt_type { ++ BCH_OPT_BOOL, ++ BCH_OPT_UINT, ++ BCH_OPT_SECTORS, ++ BCH_OPT_STR, ++ BCH_OPT_FN, ++}; ++ ++/** ++ * x(name, shortopt, type, in mem type, mode, sb_opt) ++ * ++ * @name - name of mount option, sysfs attribute, and struct bch_opts ++ * member ++ * ++ * @mode - when opt may be set ++ * ++ * @sb_option - name of corresponding superblock option ++ * ++ * @type - one of OPT_BOOL, OPT_UINT, OPT_STR ++ */ ++ ++/* ++ * XXX: add fields for ++ * - default value ++ * - helptext ++ */ ++ ++#ifdef __KERNEL__ ++#define RATELIMIT_ERRORS true ++#else ++#define RATELIMIT_ERRORS false ++#endif ++ ++#define BCH_OPTS() \ ++ x(block_size, u16, \ ++ OPT_FORMAT, \ ++ OPT_SECTORS(1, 128), \ ++ BCH_SB_BLOCK_SIZE, 8, \ ++ "size", NULL) \ ++ x(btree_node_size, u16, \ ++ OPT_FORMAT, \ ++ OPT_SECTORS(1, 128), \ ++ BCH_SB_BTREE_NODE_SIZE, 512, \ ++ "size", "Btree node size, default 256k") \ ++ x(errors, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_error_actions), \ ++ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ ++ NULL, "Action to take on filesystem error") \ ++ x(metadata_replicas, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_WANT, 1, \ ++ "#", "Number of metadata replicas") \ ++ x(data_replicas, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_WANT, 1, \ ++ "#", "Number of data replicas") \ ++ x(metadata_replicas_required, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(data_replicas_required, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(metadata_checksum, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_csum_opts), \ ++ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ ++ NULL, NULL) \ ++ x(data_checksum, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_csum_opts), \ ++ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ ++ NULL, 
NULL) \ ++ x(compression, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ ++ NULL, NULL) \ ++ x(background_compression, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ ++ NULL, NULL) \ ++ x(str_hash, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_str_hash_types), \ ++ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ ++ NULL, "Hash function for directory entries and xattrs")\ ++ x(foreground_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_FOREGROUND_TARGET, 0, \ ++ "(target)", "Device or disk group for foreground writes") \ ++ x(background_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_BACKGROUND_TARGET, 0, \ ++ "(target)", "Device or disk group to move data to in the background")\ ++ x(promote_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_PROMOTE_TARGET, 0, \ ++ "(target)", "Device or disk group to promote data to on read")\ ++ x(erasure_code, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_BOOL(), \ ++ BCH_SB_ERASURE_CODE, false, \ ++ NULL, "Enable erasure coding (DO NOT USE YET)") \ ++ x(inodes_32bit, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_INODE_32BIT, false, \ ++ NULL, "Constrain inode numbers to 32 bits") \ ++ x(gc_reserve_percent, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(5, 21), \ ++ BCH_SB_GC_RESERVE, 8, \ ++ "%", "Percentage of disk space to reserve for copygc")\ ++ x(gc_reserve_bytes, u64, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_SECTORS(0, U64_MAX), \ ++ BCH_SB_GC_RESERVE_BYTES, 0, \ ++ "%", "Amount of disk space to reserve for copygc\n" \ ++ "Takes precedence over gc_reserve_percent if set")\ ++ x(root_reserve_percent, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(0, 100), \ ++ BCH_SB_ROOT_RESERVE, 0, \ ++ "%", "Percentage of disk space to reserve for superuser")\ ++ x(wide_macs, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_128_BIT_MACS, false, \ ++ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ ++ x(inline_data, u8, \ ++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Enable inline data extents") \ ++ x(acl, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_POSIX_ACL, true, \ ++ NULL, "Enable POSIX acls") \ ++ x(usrquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_USRQUOTA, false, \ ++ NULL, "Enable user quotas") \ ++ x(grpquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_GRPQUOTA, false, \ ++ NULL, "Enable group quotas") \ ++ x(prjquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_PRJQUOTA, false, \ ++ NULL, "Enable project quotas") \ ++ x(reflink, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_REFLINK, true, \ ++ NULL, "Enable reflink support") \ ++ x(degraded, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Allow mounting in degraded mode") \ ++ x(discard, u8, \ ++ OPT_MOUNT|OPT_DEVICE, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Enable discard/TRIM support") \ ++ x(verbose, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Extra debugging information during mount/recovery")\ ++ x(journal_flush_disabled, u8, \ 
++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Disable journal flush on sync/fsync\n" \ ++ "If enabled, writes can be lost, but only since the\n"\ ++ "last journal write (default 1 second)") \ ++ x(fsck, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Run fsck on mount") \ ++ x(fix_errors, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Fix errors during fsck without asking") \ ++ x(ratelimit_errors, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, RATELIMIT_ERRORS, \ ++ NULL, "Ratelimit error messages during fsck") \ ++ x(nochanges, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Super read only mode - no writes at all will be issued,\n"\ ++ "even if we have to replay the journal") \ ++ x(norecovery, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't replay the journal") \ ++ x(keep_journal, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't free journal entries/keys after startup")\ ++ x(read_entire_journal, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Read all journal entries, not just dirty ones")\ ++ x(noexcl, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't open device in exclusive mode") \ ++ x(sb, u64, \ ++ OPT_MOUNT, \ ++ OPT_UINT(0, S64_MAX), \ ++ NO_SB_OPT, BCH_SB_SECTOR, \ ++ "offset", "Sector offset of superblock") \ ++ x(read_only, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(nostart, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don\'t start filesystem, only open devices") \ ++ x(reconstruct_alloc, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Reconstruct alloc btree") \ ++ x(version_upgrade, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Set superblock to latest version,\n" \ ++ "allowing any new features to be used") \ ++ x(project, u8, \ ++ OPT_INODE, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(fs_size, u64, \ ++ OPT_DEVICE, \ ++ OPT_SECTORS(0, S64_MAX), \ ++ NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(bucket, u32, \ ++ OPT_DEVICE, \ ++ OPT_SECTORS(0, S64_MAX), \ ++ NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(durability, u8, \ ++ OPT_DEVICE, \ ++ OPT_UINT(0, BCH_REPLICAS_MAX), \ ++ NO_SB_OPT, 1, \ ++ "n", "Data written to this device will be considered\n"\ ++ "to have already been replicated n times") ++ ++struct bch_opts { ++#define x(_name, _bits, ...) unsigned _name##_defined:1; ++ BCH_OPTS() ++#undef x ++ ++#define x(_name, _bits, ...) _bits _name; ++ BCH_OPTS() ++#undef x ++}; ++ ++static const struct bch_opts bch2_opts_default = { ++#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ++ ._name##_defined = true, \ ++ ._name = _default, \ ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++#define opt_defined(_opts, _name) ((_opts)._name##_defined) ++ ++#define opt_get(_opts, _name) \ ++ (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) ++ ++#define opt_set(_opts, _name, _v) \ ++do { \ ++ (_opts)._name##_defined = true; \ ++ (_opts)._name = _v; \ ++} while (0) ++ ++static inline struct bch_opts bch2_opts_empty(void) ++{ ++ return (struct bch_opts) { 0 }; ++} ++ ++void bch2_opts_apply(struct bch_opts *, struct bch_opts); ++ ++enum bch_opt_id { ++#define x(_name, ...) 
Opt_##_name, ++ BCH_OPTS() ++#undef x ++ bch2_opts_nr ++}; ++ ++struct bch_fs; ++struct printbuf; ++ ++struct bch_option { ++ struct attribute attr; ++ void (*set_sb)(struct bch_sb *, u64); ++ enum opt_mode mode; ++ enum opt_type type; ++ ++ union { ++ struct { ++ u64 min, max; ++ }; ++ struct { ++ const char * const *choices; ++ }; ++ struct { ++ int (*parse)(struct bch_fs *, const char *, u64 *); ++ void (*to_text)(struct printbuf *, struct bch_fs *, u64); ++ }; ++ }; ++ ++ const char *hint; ++ const char *help; ++ ++}; ++ ++extern const struct bch_option bch2_opt_table[]; ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); ++u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); ++void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); ++ ++struct bch_opts bch2_opts_from_sb(struct bch_sb *); ++ ++int bch2_opt_lookup(const char *); ++int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); ++ ++#define OPT_SHOW_FULL_LIST (1 << 0) ++#define OPT_SHOW_MOUNT_STYLE (1 << 1) ++ ++void bch2_opt_to_text(struct printbuf *, struct bch_fs *, ++ const struct bch_option *, u64, unsigned); ++ ++int bch2_opt_check_may_set(struct bch_fs *, int, u64); ++int bch2_opts_check_may_set(struct bch_fs *); ++int bch2_parse_mount_opts(struct bch_opts *, char *); ++ ++/* inode opts: */ ++ ++struct bch_io_opts { ++#define x(_name, _bits) unsigned _name##_defined:1; ++ BCH_INODE_OPTS() ++#undef x ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_OPTS() ++#undef x ++}; ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); ++void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); ++bool bch2_opt_is_inode_opt(enum bch_opt_id); ++ ++#endif /* _BCACHEFS_OPTS_H */ +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +new file mode 100644 +index 000000000000..d3032a46e7f3 +--- /dev/null ++++ b/fs/bcachefs/quota.c +@@ -0,0 +1,783 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "inode.h" ++#include "quota.h" ++#include "super-io.h" ++ ++static const char *bch2_sb_validate_quota(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ ++ if (vstruct_bytes(&q->field) != sizeof(*q)) ++ return "invalid field quota: wrong size"; ++ ++ return NULL; ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_quota = { ++ .validate = bch2_sb_validate_quota, ++}; ++ ++const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (k.k->p.inode >= QTYP_NR) ++ return "invalid quota type"; ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++static const char * const bch2_quota_counters[] = { ++ "space", ++ "inodes", ++}; ++ ++void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); ++ unsigned i; ++ ++ for (i = 0; i < Q_COUNTERS; i++) ++ pr_buf(out, "%s hardlimit %llu softlimit %llu", ++ bch2_quota_counters[i], ++ le64_to_cpu(dq.v->c[i].hardlimit), ++ le64_to_cpu(dq.v->c[i].softlimit)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++#include ++#include ++#include ++ ++static inline unsigned __next_qtype(unsigned i, unsigned qtypes) ++{ ++ qtypes >>= i; ++ return qtypes ? 
i + __ffs(qtypes) : QTYP_NR; ++} ++ ++#define for_each_set_qtype(_c, _i, _q, _qtypes) \ ++ for (_i = 0; \ ++ (_i = __next_qtype(_i, _qtypes), \ ++ _q = &(_c)->quotas[_i], \ ++ _i < QTYP_NR); \ ++ _i++) ++ ++static bool ignore_hardlimit(struct bch_memquota_type *q) ++{ ++ if (capable(CAP_SYS_RESOURCE)) ++ return true; ++#if 0 ++ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; ++ ++ return capable(CAP_SYS_RESOURCE) && ++ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || ++ !(info->dqi_flags & DQF_ROOT_SQUASH)); ++#endif ++ return false; ++} ++ ++enum quota_msg { ++ SOFTWARN, /* Softlimit reached */ ++ SOFTLONGWARN, /* Grace time expired */ ++ HARDWARN, /* Hardlimit reached */ ++ ++ HARDBELOW, /* Usage got below inode hardlimit */ ++ SOFTBELOW, /* Usage got below inode softlimit */ ++}; ++ ++static int quota_nl[][Q_COUNTERS] = { ++ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, ++ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, ++ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, ++ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, ++ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, ++ ++ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, ++ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, ++ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, ++ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, ++ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, ++}; ++ ++struct quota_msgs { ++ u8 nr; ++ struct { ++ u8 qtype; ++ u8 msg; ++ } m[QTYP_NR * Q_COUNTERS]; ++}; ++ ++static void prepare_msg(unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); ++ ++ msgs->m[msgs->nr].qtype = qtype; ++ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; ++ msgs->nr++; ++} ++ ++static void prepare_warning(struct memquota_counter *qc, ++ unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ if (qc->warning_issued & (1 << msg_type)) ++ return; ++ ++ prepare_msg(qtype, counter, msgs, msg_type); ++} ++ ++static void flush_warnings(struct bch_qid qid, ++ struct super_block *sb, ++ struct quota_msgs *msgs) ++{ ++ unsigned i; ++ ++ for (i = 0; i < msgs->nr; i++) ++ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), ++ sb->s_dev, msgs->m[i].msg); ++} ++ ++static int bch2_quota_check_limit(struct bch_fs *c, ++ unsigned qtype, ++ struct bch_memquota *mq, ++ struct quota_msgs *msgs, ++ enum quota_counters counter, ++ s64 v, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q = &c->quotas[qtype]; ++ struct memquota_counter *qc = &mq->c[counter]; ++ u64 n = qc->v + v; ++ ++ BUG_ON((s64) n < 0); ++ ++ if (mode == KEY_TYPE_QUOTA_NOCHECK) ++ return 0; ++ ++ if (v <= 0) { ++ if (n < qc->hardlimit && ++ (qc->warning_issued & (1 << HARDWARN))) { ++ qc->warning_issued &= ~(1 << HARDWARN); ++ prepare_msg(qtype, counter, msgs, HARDBELOW); ++ } ++ ++ if (n < qc->softlimit && ++ (qc->warning_issued & (1 << SOFTWARN))) { ++ qc->warning_issued &= ~(1 << SOFTWARN); ++ prepare_msg(qtype, counter, msgs, SOFTBELOW); ++ } ++ ++ qc->warning_issued = 0; ++ return 0; ++ } ++ ++ if (qc->hardlimit && ++ qc->hardlimit < n && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, HARDWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer && ++ ktime_get_real_seconds() >= qc->timer && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, 
msgs, SOFTLONGWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer == 0) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); ++ ++ /* XXX is this the right one? */ ++ qc->timer = ktime_get_real_seconds() + ++ q->limits[counter].warnlimit; ++ } ++ ++ return 0; ++} ++ ++int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ unsigned qtypes = enabled_qtypes(c); ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq[QTYP_NR]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); ++ if (!mq[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mq[i]->c[counter].v += v; ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(qid, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static void __bch2_quota_transfer(struct bch_memquota *src_q, ++ struct bch_memquota *dst_q, ++ enum quota_counters counter, s64 v) ++{ ++ BUG_ON(v > src_q->c[counter].v); ++ BUG_ON(v + dst_q->c[counter].v < v); ++ ++ src_q->c[counter].v -= v; ++ dst_q->c[counter].v += v; ++} ++ ++int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q; ++ struct bch_memquota *src_q[3], *dst_q[3]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); ++ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); ++ ++ if (!src_q[i] || !dst_q[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, ++ dst_q[i]->c[Q_SPC].v + space, ++ mode); ++ if (ret) ++ goto err; ++ ++ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, ++ dst_q[i]->c[Q_INO].v + 1, ++ mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); ++ } ++ ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(dst, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq; ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq; ++ unsigned i; ++ ++ BUG_ON(k.k->p.inode >= QTYP_NR); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_quota: ++ dq = bkey_s_c_to_quota(k); ++ q = &c->quotas[k.k->p.inode]; ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); ++ if (!mq) { ++ mutex_unlock(&q->lock); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < Q_COUNTERS; i++) { ++ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); ++ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); ++ } ++ ++ mutex_unlock(&q->lock); ++ } ++ ++ return 0; ++} ++ ++static int bch2_quota_init_type(struct bch_fs *c, 
enum quota_types type) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->p.inode != type) ++ break; ++ ++ ret = __bch2_quota_set(c, k); ++ if (ret) ++ break; ++ } ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++void bch2_fs_quota_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ genradix_free(&c->quotas[i].table); ++} ++ ++void bch2_fs_quota_init(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ mutex_init(&c->quotas[i].lock); ++} ++ ++static void bch2_sb_quota_read(struct bch_fs *c) ++{ ++ struct bch_sb_field_quota *sb_quota; ++ unsigned i, j; ++ ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) ++ return; ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ struct bch_memquota_type *q = &c->quotas[i]; ++ ++ for (j = 0; j < Q_COUNTERS; j++) { ++ q->limits[j].timelimit = ++ le32_to_cpu(sb_quota->q[i].c[j].timelimit); ++ q->limits[j].warnlimit = ++ le32_to_cpu(sb_quota->q[i].c[j].warnlimit); ++ } ++ } ++} ++ ++int bch2_fs_quota_read(struct bch_fs *c) ++{ ++ unsigned i, qtypes = enabled_qtypes(c); ++ struct bch_memquota_type *q; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_inode_unpacked u; ++ struct bkey_s_c k; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ bch2_sb_quota_read(c); ++ mutex_unlock(&c->sb_lock); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ ret = bch2_quota_init_type(c, i); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); ++ if (ret) ++ return ret; ++ ++ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, ++ KEY_TYPE_QUOTA_NOCHECK); ++ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, ++ KEY_TYPE_QUOTA_NOCHECK); ++ } ++ } ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* Enable/disable/delete quotas for an entire filesystem: */ ++ ++static int bch2_quota_enable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ /* Accounting must be enabled at mount time: */ ++ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) ++ return -EINVAL; ++ ++ /* Can't enable enforcement without accounting: */ ++ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) ++ return -EINVAL; ++ ++ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) ++ return -EINVAL; ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_disable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); ++ ++ if 
(uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (uflags & FS_USER_QUOTA) { ++ if (c->opts.usrquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_USR, 0), ++ POS(QTYP_USR + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_GROUP_QUOTA) { ++ if (c->opts.grpquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_GRP, 0), ++ POS(QTYP_GRP + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_PROJ_QUOTA) { ++ if (c->opts.prjquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_PRJ, 0), ++ POS(QTYP_PRJ + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Return quota status information, such as enforcements, quota file inode ++ * numbers etc. ++ */ ++static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ unsigned qtypes = enabled_qtypes(c); ++ unsigned i; ++ ++ memset(state, 0, sizeof(*state)); ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ state->s_state[i].flags |= QCI_SYSFILE; ++ ++ if (!(qtypes & (1 << i))) ++ continue; ++ ++ state->s_state[i].flags |= QCI_ACCT_ENABLED; ++ ++ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; ++ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; ++ ++ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; ++ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Adjust quota timers & warnings ++ */ ++static int bch2_quota_set_info(struct super_block *sb, int type, ++ struct qc_info *info) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_sb_field_quota *sb_quota; ++ struct bch_memquota_type *q; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (type >= QTYP_NR) ++ return -EINVAL; ++ ++ if (!((1 << type) & enabled_qtypes(c))) ++ return -ESRCH; ++ ++ if (info->i_fieldmask & ++ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) ++ return -EINVAL; ++ ++ q = &c->quotas[type]; ++ ++ mutex_lock(&c->sb_lock); ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) { ++ sb_quota = bch2_sb_resize_quota(&c->disk_sb, ++ sizeof(*sb_quota) / sizeof(u64)); ++ if (!sb_quota) ++ return -ENOSPC; ++ } ++ ++ if (info->i_fieldmask & QC_SPC_TIMER) ++ sb_quota->q[type].c[Q_SPC].timelimit = ++ cpu_to_le32(info->i_spc_timelimit); ++ ++ if (info->i_fieldmask & QC_SPC_WARNS) ++ sb_quota->q[type].c[Q_SPC].warnlimit = ++ cpu_to_le32(info->i_spc_warnlimit); ++ ++ if (info->i_fieldmask & QC_INO_TIMER) ++ sb_quota->q[type].c[Q_INO].timelimit = ++ cpu_to_le32(info->i_ino_timelimit); ++ ++ if (info->i_fieldmask & QC_INO_WARNS) ++ sb_quota->q[type].c[Q_INO].warnlimit = ++ cpu_to_le32(info->i_ino_warnlimit); ++ ++ bch2_sb_quota_read(c); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++/* Get/set individual quotas: */ ++ ++static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) ++{ ++ dst->d_space = src->c[Q_SPC].v << 9; ++ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; ++ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; ++ dst->d_spc_timer = 
src->c[Q_SPC].timer; ++ dst->d_spc_warns = src->c[Q_SPC].warns; ++ ++ dst->d_ino_count = src->c[Q_INO].v; ++ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; ++ dst->d_ino_softlimit = src->c[Q_INO].softlimit; ++ dst->d_ino_timer = src->c[Q_INO].timer; ++ dst->d_ino_warns = src->c[Q_INO].warns; ++} ++ ++static int bch2_get_quota(struct super_block *sb, struct kqid kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid.type]; ++ qid_t qid = from_kqid(&init_user_ns, kqid); ++ struct bch_memquota *mq; ++ ++ memset(qdq, 0, sizeof(*qdq)); ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr(&q->table, qid); ++ if (mq) ++ __bch2_quota_get(qdq, mq); ++ mutex_unlock(&q->lock); ++ ++ return 0; ++} ++ ++static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid->type]; ++ qid_t qid = from_kqid(&init_user_ns, *kqid); ++ struct genradix_iter iter; ++ struct bch_memquota *mq; ++ int ret = 0; ++ ++ mutex_lock(&q->lock); ++ ++ genradix_for_each_from(&q->table, iter, mq, qid) ++ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { ++ __bch2_quota_get(qdq, mq); ++ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); ++ goto found; ++ } ++ ++ ret = -ENOENT; ++found: ++ mutex_unlock(&q->lock); ++ return ret; ++} ++ ++static int bch2_set_quota_trans(struct btree_trans *trans, ++ struct bkey_i_quota *new_quota, ++ struct qc_dqblk *qdq) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ++ ret = bkey_err(k); ++ if (unlikely(ret)) ++ return ret; ++ ++ if (k.k->type == KEY_TYPE_quota) ++ new_quota->v = *bkey_s_c_to_quota(k).v; ++ ++ if (qdq->d_fieldmask & QC_SPC_SOFT) ++ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); ++ if (qdq->d_fieldmask & QC_SPC_HARD) ++ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); ++ ++ if (qdq->d_fieldmask & QC_INO_SOFT) ++ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); ++ if (qdq->d_fieldmask & QC_INO_HARD) ++ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); ++ ++ return bch2_trans_update(trans, iter, &new_quota->k_i, 0); ++} ++ ++static int bch2_set_quota(struct super_block *sb, struct kqid qid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct btree_trans trans; ++ struct bkey_i_quota new_quota; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ bkey_quota_init(&new_quota.k_i); ++ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, ++ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: ++ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++const struct quotactl_ops bch2_quotactl_operations = { ++ .quota_enable = bch2_quota_enable, ++ .quota_disable = bch2_quota_disable, ++ .rm_xquota = bch2_quota_remove, ++ ++ .get_state = bch2_quota_get_state, ++ .set_info = bch2_quota_set_info, ++ ++ .get_dqblk = bch2_get_quota, ++ .get_nextdqblk = bch2_get_next_quota, ++ .set_dqblk = bch2_set_quota, ++}; ++ ++#endif /* CONFIG_BCACHEFS_QUOTA */ +diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h +new file mode 100644 +index 
000000000000..51e4f9713ef0 +--- /dev/null ++++ b/fs/bcachefs/quota.h +@@ -0,0 +1,71 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_H ++#define _BCACHEFS_QUOTA_H ++ ++#include "inode.h" ++#include "quota_types.h" ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_quota; ++ ++const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_quota (struct bkey_ops) { \ ++ .key_invalid = bch2_quota_invalid, \ ++ .val_to_text = bch2_quota_to_text, \ ++} ++ ++static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) ++{ ++ return (struct bch_qid) { ++ .q[QTYP_USR] = u->bi_uid, ++ .q[QTYP_GRP] = u->bi_gid, ++ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, ++ }; ++} ++ ++static inline unsigned enabled_qtypes(struct bch_fs *c) ++{ ++ return ((c->opts.usrquota << QTYP_USR)| ++ (c->opts.grpquota << QTYP_GRP)| ++ (c->opts.prjquota << QTYP_PRJ)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, ++ s64, enum quota_acct_mode); ++ ++int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, ++ struct bch_qid, u64, enum quota_acct_mode); ++ ++void bch2_fs_quota_exit(struct bch_fs *); ++void bch2_fs_quota_init(struct bch_fs *); ++int bch2_fs_quota_read(struct bch_fs *); ++ ++extern const struct quotactl_ops bch2_quotactl_operations; ++ ++#else ++ ++static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline void bch2_fs_quota_exit(struct bch_fs *c) {} ++static inline void bch2_fs_quota_init(struct bch_fs *c) {} ++static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } ++ ++#endif ++ ++#endif /* _BCACHEFS_QUOTA_H */ +diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h +new file mode 100644 +index 000000000000..6a136083d389 +--- /dev/null ++++ b/fs/bcachefs/quota_types.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_TYPES_H ++#define _BCACHEFS_QUOTA_TYPES_H ++ ++#include ++ ++struct bch_qid { ++ u32 q[QTYP_NR]; ++}; ++ ++enum quota_acct_mode { ++ KEY_TYPE_QUOTA_PREALLOC, ++ KEY_TYPE_QUOTA_WARN, ++ KEY_TYPE_QUOTA_NOCHECK, ++}; ++ ++struct memquota_counter { ++ u64 v; ++ u64 hardlimit; ++ u64 softlimit; ++ s64 timer; ++ int warns; ++ int warning_issued; ++}; ++ ++struct bch_memquota { ++ struct memquota_counter c[Q_COUNTERS]; ++}; ++ ++typedef GENRADIX(struct bch_memquota) bch_memquota_table; ++ ++struct quota_limit { ++ u32 timelimit; ++ u32 warnlimit; ++}; ++ ++struct bch_memquota_type { ++ struct quota_limit limits[Q_COUNTERS]; ++ bch_memquota_table table; ++ struct mutex lock; ++}; ++ ++#endif /* _BCACHEFS_QUOTA_TYPES_H */ +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +new file mode 100644 +index 000000000000..e15a2b1dc5d0 +--- /dev/null ++++ b/fs/bcachefs/rebalance.c +@@ -0,0 +1,334 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "extents.h" ++#include "io.h" ++#include "move.h" ++#include "rebalance.h" ++#include "super-io.h" ++ 
++#include ++#include ++#include ++#include ++ ++/* ++ * Check if an extent should be moved: ++ * returns -1 if it should not be moved, or ++ * device of pointer that should be moved, if known, or INT_MAX if unknown ++ */ ++static int __bch2_rebalance_pred(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ if (io_opts->background_compression && ++ !bch2_bkey_is_incompressible(k)) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ p.crc.compression_type != ++ bch2_compression_opt_to_type[io_opts->background_compression]) ++ return p.ptr.dev; ++ ++ if (io_opts->background_target) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) ++ return p.ptr.dev; ++ ++ return -1; ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts) ++{ ++ atomic64_t *counter; ++ int dev; ++ ++ dev = __bch2_rebalance_pred(c, k, io_opts); ++ if (dev < 0) ++ return; ++ ++ counter = dev < INT_MAX ++ ? &bch_dev_bkey_exists(c, dev)->rebalance_work ++ : &c->rebalance.work_unknown_dev; ++ ++ if (atomic64_add_return(k.k->size, counter) == k.k->size) ++ rebalance_wakeup(c); ++} ++ ++static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { ++ data_opts->target = io_opts->background_target; ++ data_opts->btree_insert_flags = 0; ++ return DATA_ADD_REPLICAS; ++ } else { ++ return DATA_SKIP; ++ } ++} ++ ++void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) ++{ ++ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == ++ sectors) ++ rebalance_wakeup(c); ++} ++ ++struct rebalance_work { ++ int dev_most_full_idx; ++ unsigned dev_most_full_percent; ++ u64 dev_most_full_work; ++ u64 dev_most_full_capacity; ++ u64 total_work; ++}; ++ ++static void rebalance_work_accumulate(struct rebalance_work *w, ++ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) ++{ ++ unsigned percent_full; ++ u64 work = dev_work + unknown_dev; ++ ++ if (work < dev_work || work < unknown_dev) ++ work = U64_MAX; ++ work = min(work, capacity); ++ ++ percent_full = div64_u64(work * 100, capacity); ++ ++ if (percent_full >= w->dev_most_full_percent) { ++ w->dev_most_full_idx = idx; ++ w->dev_most_full_percent = percent_full; ++ w->dev_most_full_work = work; ++ w->dev_most_full_capacity = capacity; ++ } ++ ++ if (w->total_work + dev_work >= w->total_work && ++ w->total_work + dev_work >= dev_work) ++ w->total_work += dev_work; ++} ++ ++static struct rebalance_work rebalance_work(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct rebalance_work ret = { .dev_most_full_idx = -1 }; ++ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ rebalance_work_accumulate(&ret, ++ atomic64_read(&ca->rebalance_work), ++ unknown_dev, ++ bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket), ++ i); ++ ++ rebalance_work_accumulate(&ret, ++ unknown_dev, 0, c->capacity, -1); ++ ++ return ret; ++} ++ ++static void rebalance_work_reset(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ atomic64_set(&ca->rebalance_work, 0); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, 0); 
++} ++ ++static unsigned long curr_cputime(void) ++{ ++ u64 utime, stime; ++ ++ task_cputime_adjusted(current, &utime, &stime); ++ return nsecs_to_jiffies(utime + stime); ++} ++ ++static int bch2_rebalance_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ struct rebalance_work w, p; ++ unsigned long start, prev_start; ++ unsigned long prev_run_time, prev_run_cputime; ++ unsigned long cputime, prev_cputime; ++ unsigned long io_start; ++ long throttle; ++ ++ set_freezable(); ++ ++ io_start = atomic_long_read(&clock->now); ++ p = rebalance_work(c); ++ prev_start = jiffies; ++ prev_cputime = curr_cputime(); ++ ++ while (!kthread_wait_freezable(r->enabled)) { ++ cond_resched(); ++ ++ start = jiffies; ++ cputime = curr_cputime(); ++ ++ prev_run_time = start - prev_start; ++ prev_run_cputime = cputime - prev_cputime; ++ ++ w = rebalance_work(c); ++ BUG_ON(!w.dev_most_full_capacity); ++ ++ if (!w.total_work) { ++ r->state = REBALANCE_WAITING; ++ kthread_wait_freezable(rebalance_work(c).total_work); ++ continue; ++ } ++ ++ /* ++ * If there isn't much work to do, throttle cpu usage: ++ */ ++ throttle = prev_run_cputime * 100 / ++ max(1U, w.dev_most_full_percent) - ++ prev_run_time; ++ ++ if (w.dev_most_full_percent < 20 && throttle > 0) { ++ r->throttled_until_iotime = io_start + ++ div_u64(w.dev_most_full_capacity * ++ (20 - w.dev_most_full_percent), ++ 50); ++ ++ if (atomic_long_read(&clock->now) + clock->max_slop < ++ r->throttled_until_iotime) { ++ r->throttled_until_cputime = start + throttle; ++ r->state = REBALANCE_THROTTLED; ++ ++ bch2_kthread_io_clock_wait(clock, ++ r->throttled_until_iotime, ++ throttle); ++ continue; ++ } ++ } ++ ++ /* minimum 1 mb/sec: */ ++ r->pd.rate.rate = ++ max_t(u64, 1 << 11, ++ r->pd.rate.rate * ++ max(p.dev_most_full_percent, 1U) / ++ max(w.dev_most_full_percent, 1U)); ++ ++ io_start = atomic_long_read(&clock->now); ++ p = w; ++ prev_start = start; ++ prev_cputime = cputime; ++ ++ r->state = REBALANCE_RUNNING; ++ memset(&r->move_stats, 0, sizeof(r->move_stats)); ++ rebalance_work_reset(c); ++ ++ bch2_move_data(c, ++ /* ratelimiting disabled for now */ ++ NULL, /* &r->pd.rate, */ ++ writepoint_ptr(&c->rebalance_write_point), ++ POS_MIN, POS_MAX, ++ rebalance_pred, NULL, ++ &r->move_stats); ++ } ++ ++ return 0; ++} ++ ++ssize_t bch2_rebalance_work_show(struct bch_fs *c, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct rebalance_work w = rebalance_work(c); ++ char h1[21], h2[21]; ++ ++ bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); ++ bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); ++ pr_buf(&out, "fullest_dev (%i):\t%s/%s\n", ++ w.dev_most_full_idx, h1, h2); ++ ++ bch2_hprint(&PBUF(h1), w.total_work << 9); ++ bch2_hprint(&PBUF(h2), c->capacity << 9); ++ pr_buf(&out, "total work:\t\t%s/%s\n", h1, h2); ++ ++ pr_buf(&out, "rate:\t\t\t%u\n", r->pd.rate.rate); ++ ++ switch (r->state) { ++ case REBALANCE_WAITING: ++ pr_buf(&out, "waiting\n"); ++ break; ++ case REBALANCE_THROTTLED: ++ bch2_hprint(&PBUF(h1), ++ (r->throttled_until_iotime - ++ atomic_long_read(&c->io_clock[WRITE].now)) << 9); ++ pr_buf(&out, "throttled for %lu sec or %s io\n", ++ (r->throttled_until_cputime - jiffies) / HZ, ++ h1); ++ break; ++ case REBALANCE_RUNNING: ++ pr_buf(&out, "running\n"); ++ pr_buf(&out, "pos %llu:%llu\n", ++ r->move_stats.pos.inode, ++ r->move_stats.pos.offset); ++ break; ++ } ++ ++ return out.pos - buf; 
++} ++ ++void bch2_rebalance_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ c->rebalance.pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&c->rebalance.pd.rate); ++ ++ p = rcu_dereference_protected(c->rebalance.thread, 1); ++ c->rebalance.thread = NULL; ++ ++ if (p) { ++ /* for sychronizing with rebalance_wakeup() */ ++ synchronize_rcu(); ++ ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_rebalance_start(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ rcu_assign_pointer(c->rebalance.thread, p); ++ wake_up_process(p); ++ return 0; ++} ++ ++void bch2_fs_rebalance_init(struct bch_fs *c) ++{ ++ bch2_pd_controller_init(&c->rebalance.pd); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); ++} +diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h +new file mode 100644 +index 000000000000..99e2a1fb6084 +--- /dev/null ++++ b/fs/bcachefs/rebalance.h +@@ -0,0 +1,28 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_H ++#define _BCACHEFS_REBALANCE_H ++ ++#include "rebalance_types.h" ++ ++static inline void rebalance_wakeup(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = rcu_dereference(c->rebalance.thread); ++ if (p) ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_opts *); ++void bch2_rebalance_add_work(struct bch_fs *, u64); ++ ++ssize_t bch2_rebalance_work_show(struct bch_fs *, char *); ++ ++void bch2_rebalance_stop(struct bch_fs *); ++int bch2_rebalance_start(struct bch_fs *); ++void bch2_fs_rebalance_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REBALANCE_H */ +diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h +new file mode 100644 +index 000000000000..192c6be20ced +--- /dev/null ++++ b/fs/bcachefs/rebalance_types.h +@@ -0,0 +1,27 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_TYPES_H ++#define _BCACHEFS_REBALANCE_TYPES_H ++ ++#include "move_types.h" ++ ++enum rebalance_state { ++ REBALANCE_WAITING, ++ REBALANCE_THROTTLED, ++ REBALANCE_RUNNING, ++}; ++ ++struct bch_fs_rebalance { ++ struct task_struct __rcu *thread; ++ struct bch_pd_controller pd; ++ ++ atomic64_t work_unknown_dev; ++ ++ enum rebalance_state state; ++ unsigned long throttled_until_iotime; ++ unsigned long throttled_until_cputime; ++ struct bch_move_stats move_stats; ++ ++ unsigned enabled:1; ++}; ++ ++#endif /* _BCACHEFS_REBALANCE_TYPES_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +new file mode 100644 +index 000000000000..41b864dcdc39 +--- /dev/null ++++ b/fs/bcachefs/recovery.c +@@ -0,0 +1,1317 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "buckets.h" ++#include "dirent.h" ++#include "ec.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "quota.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++/* iterate over keys read from the journal: */ ++ ++static struct journal_key 
*journal_key_search(struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ size_t l = 0, r = journal_keys->nr, m; ++ ++ while (l < r) { ++ m = l + ((r - l) >> 1); ++ if ((cmp_int(id, journal_keys->d[m].btree_id) ?: ++ cmp_int(level, journal_keys->d[m].level) ?: ++ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ BUG_ON(l < journal_keys->nr && ++ (cmp_int(id, journal_keys->d[l].btree_id) ?: ++ cmp_int(level, journal_keys->d[l].level) ?: ++ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); ++ ++ BUG_ON(l && ++ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: ++ cmp_int(level, journal_keys->d[l - 1].level) ?: ++ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); ++ ++ return l < journal_keys->nr ? journal_keys->d + l : NULL; ++} ++ ++static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) ++{ ++ if (iter->k && ++ iter->k < iter->keys->d + iter->keys->nr && ++ iter->k->btree_id == iter->btree_id && ++ iter->k->level == iter->level) ++ return iter->k->k; ++ ++ iter->k = NULL; ++ return NULL; ++} ++ ++static void bch2_journal_iter_advance(struct journal_iter *iter) ++{ ++ if (iter->k) ++ iter->k++; ++} ++ ++static void bch2_journal_iter_init(struct journal_iter *iter, ++ struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ iter->btree_id = id; ++ iter->level = level; ++ iter->keys = journal_keys; ++ iter->k = journal_key_search(journal_keys, id, level, pos); ++} ++ ++static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) ++{ ++ return iter->btree ++ ? bch2_btree_iter_peek(iter->btree) ++ : bch2_btree_node_iter_peek_unpack(&iter->node_iter, ++ iter->b, &iter->unpacked); ++} ++ ++static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) ++{ ++ if (iter->btree) ++ bch2_btree_iter_next(iter->btree); ++ else ++ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); ++} ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) ++{ ++ switch (iter->last) { ++ case none: ++ break; ++ case btree: ++ bch2_journal_iter_advance_btree(iter); ++ break; ++ case journal: ++ bch2_journal_iter_advance(&iter->journal); ++ break; ++ } ++ ++ iter->last = none; ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) ++{ ++ struct bkey_s_c ret; ++ ++ while (1) { ++ struct bkey_s_c btree_k = ++ bch2_journal_iter_peek_btree(iter); ++ struct bkey_s_c journal_k = ++ bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); ++ ++ if (btree_k.k && journal_k.k) { ++ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); ++ ++ if (!cmp) ++ bch2_journal_iter_advance_btree(iter); ++ ++ iter->last = cmp < 0 ? btree : journal; ++ } else if (btree_k.k) { ++ iter->last = btree; ++ } else if (journal_k.k) { ++ iter->last = journal; ++ } else { ++ iter->last = none; ++ return bkey_s_c_null; ++ } ++ ++ ret = iter->last == journal ? 
journal_k : btree_k; ++ ++ if (iter->b && ++ bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { ++ iter->journal.k = NULL; ++ iter->last = none; ++ return bkey_s_c_null; ++ } ++ ++ if (!bkey_deleted(ret.k)) ++ break; ++ ++ bch2_btree_and_journal_iter_advance(iter); ++ } ++ ++ return ret; ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) ++{ ++ bch2_btree_and_journal_iter_advance(iter); ++ ++ return bch2_btree_and_journal_iter_peek(iter); ++} ++ ++void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, ++ struct btree_trans *trans, ++ struct journal_keys *journal_keys, ++ enum btree_id id, struct bpos pos) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->btree = bch2_trans_get_iter(trans, id, pos, 0); ++ bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); ++} ++ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct journal_keys *journal_keys, ++ struct btree *b) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->b = b; ++ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); ++ bch2_journal_iter_init(&iter->journal, journal_keys, ++ b->c.btree_id, b->c.level, b->data->min_key); ++} ++ ++/* Walk btree, overlaying keys from the journal: */ ++ ++static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, ++ struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ btree_walk_node_fn node_fn, ++ btree_walk_key_fn key_fn) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ ret = key_fn(c, btree_id, b->c.level, k); ++ if (ret) ++ break; ++ ++ if (b->c.level) { ++ struct btree *child; ++ BKEY_PADDED(k) tmp; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ if (b->c.level > 0) { ++ child = bch2_btree_node_get_noiter(c, &tmp.k, ++ b->c.btree_id, b->c.level - 1); ++ ret = PTR_ERR_OR_ZERO(child); ++ if (ret) ++ break; ++ ++ ret = (node_fn ? node_fn(c, b) : 0) ?: ++ bch2_btree_and_journal_walk_recurse(c, child, ++ journal_keys, btree_id, node_fn, key_fn); ++ six_unlock_read(&child->c.lock); ++ ++ if (ret) ++ break; ++ } ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ } ++ ++ return ret; ++} ++ ++int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ btree_walk_node_fn node_fn, ++ btree_walk_key_fn key_fn) ++{ ++ struct btree *b = c->btree_roots[btree_id].b; ++ int ret = 0; ++ ++ if (btree_node_fake(b)) ++ return 0; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ ret = (node_fn ? 
node_fn(c, b) : 0) ?: ++ bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, ++ node_fn, key_fn) ?: ++ key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); ++ six_unlock_read(&b->c.lock); ++ ++ return ret; ++} ++ ++/* sort and dedup all keys in the journal: */ ++ ++void bch2_journal_entries_free(struct list_head *list) ++{ ++ ++ while (!list_empty(list)) { ++ struct journal_replay *i = ++ list_first_entry(list, struct journal_replay, list); ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } ++} ++ ++/* ++ * When keys compare equal, oldest compares first: ++ */ ++static int journal_sort_key_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ cmp_int(l->level, r->level) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->journal_offset, r->journal_offset); ++} ++ ++void bch2_journal_keys_free(struct journal_keys *keys) ++{ ++ kvfree(keys->d); ++ keys->d = NULL; ++ keys->nr = 0; ++} ++ ++static struct journal_keys journal_keys_sort(struct list_head *journal_entries) ++{ ++ struct journal_replay *p; ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct journal_keys keys = { NULL }; ++ struct journal_key *src, *dst; ++ size_t nr_keys = 0; ++ ++ if (list_empty(journal_entries)) ++ return keys; ++ ++ keys.journal_seq_base = ++ le64_to_cpu(list_last_entry(journal_entries, ++ struct journal_replay, list)->j.last_seq); ++ ++ list_for_each_entry(p, journal_entries, list) { ++ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) ++ continue; ++ ++ for_each_jset_key(k, _n, entry, &p->j) ++ nr_keys++; ++ } ++ ++ ++ keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); ++ if (!keys.d) ++ goto err; ++ ++ list_for_each_entry(p, journal_entries, list) { ++ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) ++ continue; ++ ++ for_each_jset_key(k, _n, entry, &p->j) ++ keys.d[keys.nr++] = (struct journal_key) { ++ .btree_id = entry->btree_id, ++ .level = entry->level, ++ .k = k, ++ .journal_seq = le64_to_cpu(p->j.seq) - ++ keys.journal_seq_base, ++ .journal_offset = k->_data - p->j._data, ++ }; ++ } ++ ++ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); ++ ++ src = dst = keys.d; ++ while (src < keys.d + keys.nr) { ++ while (src + 1 < keys.d + keys.nr && ++ src[0].btree_id == src[1].btree_id && ++ src[0].level == src[1].level && ++ !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) ++ src++; ++ ++ *dst++ = *src++; ++ } ++ ++ keys.nr = dst - keys.d; ++err: ++ return keys; ++} ++ ++/* journal replay: */ ++ ++static void replay_now_at(struct journal *j, u64 seq) ++{ ++ BUG_ON(seq < j->replay_journal_seq); ++ BUG_ON(seq > j->replay_journal_seq_end); ++ ++ while (j->replay_journal_seq < seq) ++ bch2_journal_pin_put(j, j->replay_journal_seq++); ++} ++ ++static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, ++ struct bkey_i *k) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter, *split_iter; ++ /* ++ * We might cause compressed extents to be split, so we need to pass in ++ * a disk_reservation: ++ */ ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i *split; ++ struct bpos atomic_end; ++ /* ++ * Some extents aren't equivalent - w.r.t. 
what the triggers do ++ * - if they're split: ++ */ ++ bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || ++ k->k.type == KEY_TYPE_reflink_p; ++ bool remark = false; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ ++ do { ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto err; ++ ++ atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); ++ ++ split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); ++ ret = PTR_ERR_OR_ZERO(split); ++ if (ret) ++ goto err; ++ ++ if (!remark && ++ remark_if_split && ++ bkey_cmp(atomic_end, k->k.p) < 0) { ++ ret = bch2_disk_reservation_add(c, &disk_res, ++ k->k.size * ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), ++ BCH_DISK_RESERVATION_NOFAIL); ++ BUG_ON(ret); ++ ++ remark = true; ++ } ++ ++ bkey_copy(split, k); ++ bch2_cut_front(iter->pos, split); ++ bch2_cut_back(atomic_end, split); ++ ++ split_iter = bch2_trans_copy_iter(&trans, iter); ++ ret = PTR_ERR_OR_ZERO(split_iter); ++ if (ret) ++ goto err; ++ ++ /* ++ * It's important that we don't go through the ++ * extent_handle_overwrites() and extent_update_to_keys() path ++ * here: journal replay is supposed to treat extents like ++ * regular keys ++ */ ++ __bch2_btree_iter_set_pos(split_iter, split->k.p, false); ++ bch2_trans_update(&trans, split_iter, split, !remark ++ ? BTREE_TRIGGER_NORUN ++ : BTREE_TRIGGER_NOOVERWRITES); ++ ++ bch2_btree_iter_set_pos(iter, split->k.p); ++ } while (bkey_cmp(iter->pos, k->k.p) < 0); ++ ++ if (remark) { ++ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), ++ 0, -((s64) k->k.size), ++ BTREE_TRIGGER_OVERWRITE); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_trans_commit(&trans, &disk_res, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++static int __bch2_journal_replay_key(struct btree_trans *trans, ++ enum btree_id id, unsigned level, ++ struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_node_iter(trans, id, k->k.p, ++ BTREE_MAX_DEPTH, level, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ /* ++ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run ++ * extent_handle_overwrites() and extent_update_to_keys() - but we don't ++ * want that here, journal replay is supposed to treat extents like ++ * regular keys: ++ */ ++ __bch2_btree_iter_set_pos(iter, k->k.p, false); ++ ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY, ++ __bch2_journal_replay_key(&trans, id, level, k)); ++} ++ ++static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter) ?: ++ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ 
++static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) ++{ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY, ++ __bch2_alloc_replay_key(&trans, k)); ++} ++ ++static int journal_sort_seq_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return cmp_int(r->level, l->level) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->btree_id, r->btree_id) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p); ++} ++ ++static int bch2_journal_replay(struct bch_fs *c, ++ struct journal_keys keys) ++{ ++ struct journal *j = &c->journal; ++ struct journal_key *i; ++ u64 seq; ++ int ret; ++ ++ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); ++ ++ if (keys.nr) ++ replay_now_at(j, keys.journal_seq_base); ++ ++ seq = j->replay_journal_seq; ++ ++ /* ++ * First replay updates to the alloc btree - these will only update the ++ * btree key cache: ++ */ ++ for_each_journal_key(keys, i) { ++ cond_resched(); ++ ++ if (!i->level && i->btree_id == BTREE_ID_ALLOC) { ++ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; ++ ret = bch2_alloc_replay_key(c, i->k); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ /* ++ * Next replay updates to interior btree nodes: ++ */ ++ for_each_journal_key(keys, i) { ++ cond_resched(); ++ ++ if (i->level) { ++ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; ++ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ /* ++ * Now that the btree is in a consistent state, we can start journal ++ * reclaim (which will be flushing entries from the btree key cache back ++ * to the btree: ++ */ ++ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); ++ set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); ++ ++ j->replay_journal_seq = seq; ++ ++ /* ++ * Now replay leaf node updates: ++ */ ++ for_each_journal_key(keys, i) { ++ cond_resched(); ++ ++ if (i->level || i->btree_id == BTREE_ID_ALLOC) ++ continue; ++ ++ replay_now_at(j, keys.journal_seq_base + i->journal_seq); ++ ++ ret = i->k->k.size ++ ? bch2_extent_replay_key(c, i->btree_id, i->k) ++ : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ if (ret) ++ goto err; ++ } ++ ++ replay_now_at(j, j->replay_journal_seq_end); ++ j->replay_journal_seq = 0; ++ ++ bch2_journal_set_replay_done(j); ++ bch2_journal_flush_all_pins(j); ++ return bch2_journal_error(j); ++err: ++ bch_err(c, "journal replay: error %d while replaying key", ret); ++ return ret; ++} ++ ++static bool journal_empty(struct list_head *journal) ++{ ++ return list_empty(journal) || ++ journal_entry_empty(&list_last_entry(journal, ++ struct journal_replay, list)->j); ++} ++ ++static int ++verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, ++ struct list_head *journal) ++{ ++ struct journal_replay *i = ++ list_last_entry(journal, struct journal_replay, list); ++ u64 start_seq = le64_to_cpu(i->j.last_seq); ++ u64 end_seq = le64_to_cpu(i->j.seq); ++ u64 seq = start_seq; ++ int ret = 0; ++ ++ list_for_each_entry(i, journal, list) { ++ if (le64_to_cpu(i->j.seq) < start_seq) ++ continue; ++ ++ fsck_err_on(seq != le64_to_cpu(i->j.seq), c, ++ "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", ++ seq, le64_to_cpu(i->j.seq) - 1, ++ start_seq, end_seq); ++ ++ seq = le64_to_cpu(i->j.seq); ++ ++ fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, ++ "found blacklisted journal entry %llu", seq); ++ ++ do { ++ seq++; ++ } while (bch2_journal_seq_is_blacklisted(c, seq, false)); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* journal replay early: */ ++ ++static int journal_replay_entry_early(struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ int ret = 0; ++ ++ switch (entry->type) { ++ case BCH_JSET_ENTRY_btree_root: { ++ struct btree_root *r; ++ ++ if (entry->btree_id >= BTREE_ID_NR) { ++ bch_err(c, "filesystem has unknown btree type %u", ++ entry->btree_id); ++ return -EINVAL; ++ } ++ ++ r = &c->btree_roots[entry->btree_id]; ++ ++ if (entry->u64s) { ++ r->level = entry->level; ++ bkey_copy(&r->key, &entry->start[0]); ++ r->error = 0; ++ } else { ++ r->error = -EIO; ++ } ++ r->alive = true; ++ break; ++ } ++ case BCH_JSET_ENTRY_usage: { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ switch (entry->btree_id) { ++ case FS_USAGE_RESERVED: ++ if (entry->level < BCH_REPLICAS_MAX) ++ c->usage_base->persistent_reserved[entry->level] = ++ le64_to_cpu(u->v); ++ break; ++ case FS_USAGE_INODES: ++ c->usage_base->nr_inodes = le64_to_cpu(u->v); ++ break; ++ case FS_USAGE_KEY_VERSION: ++ atomic64_set(&c->key_version, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ ++ break; ++ } ++ case BCH_JSET_ENTRY_data_usage: { ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ret = bch2_replicas_set_usage(c, &u->r, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist: { ++ struct jset_entry_blacklist *bl_entry = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->seq), ++ le64_to_cpu(bl_entry->seq) + 1); ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist_v2: { ++ struct jset_entry_blacklist_v2 *bl_entry = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->start), ++ le64_to_cpu(bl_entry->end) + 1); ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static int journal_replay_early(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct list_head *journal) ++{ ++ struct jset_entry *entry; ++ int ret; ++ ++ if (clean) { ++ c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); ++ c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } else { ++ struct journal_replay *i = ++ list_last_entry(journal, struct journal_replay, list); ++ ++ c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); ++ c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); ++ ++ list_for_each_entry(i, journal, list) ++ vstruct_for_each(&i->j, entry) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ bch2_fs_usage_initialize(c); ++ ++ return 0; ++} ++ ++/* sb clean section: */ ++ ++static struct bkey_i *btree_root_find(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct jset *j, ++ enum btree_id id, unsigned *level) ++{ ++ struct bkey_i *k; ++ struct jset_entry *entry, *start, *end; ++ ++ if (clean) { ++ start = clean->start; ++ end = vstruct_end(&clean->field); ++ } else { ++ 
start = j->start; ++ end = vstruct_last(j); ++ } ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root && ++ entry->btree_id == id) ++ goto found; ++ ++ return NULL; ++found: ++ if (!entry->u64s) ++ return ERR_PTR(-EINVAL); ++ ++ k = entry->start; ++ *level = entry->level; ++ return k; ++} ++ ++static int verify_superblock_clean(struct bch_fs *c, ++ struct bch_sb_field_clean **cleanp, ++ struct jset *j) ++{ ++ unsigned i; ++ struct bch_sb_field_clean *clean = *cleanp; ++ int ret = 0; ++ ++ if (!c->sb.clean || !j) ++ return 0; ++ ++ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, ++ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", ++ le64_to_cpu(clean->journal_seq), ++ le64_to_cpu(j->seq))) { ++ kfree(clean); ++ *cleanp = NULL; ++ return 0; ++ } ++ ++ mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, ++ "superblock read clock doesn't match journal after clean shutdown"); ++ mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, ++ "superblock read clock doesn't match journal after clean shutdown"); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ char buf1[200], buf2[200]; ++ struct bkey_i *k1, *k2; ++ unsigned l1 = 0, l2 = 0; ++ ++ k1 = btree_root_find(c, clean, NULL, i, &l1); ++ k2 = btree_root_find(c, NULL, j, i, &l2); ++ ++ if (!k1 && !k2) ++ continue; ++ ++ mustfix_fsck_err_on(!k1 || !k2 || ++ IS_ERR(k1) || ++ IS_ERR(k2) || ++ k1->k.u64s != k2->k.u64s || ++ memcmp(k1, k2, bkey_bytes(k1)) || ++ l1 != l2, c, ++ "superblock btree root %u doesn't match journal after clean shutdown\n" ++ "sb: l=%u %s\n" ++ "journal: l=%u %s\n", i, ++ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), ++ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); ++ } ++fsck_err: ++ return ret; ++} ++ ++static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *clean, *sb_clean; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); ++ ++ if (fsck_err_on(!sb_clean, c, ++ "superblock marked clean but clean section not present")) { ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ mutex_unlock(&c->sb_lock); ++ return NULL; ++ } ++ ++ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), ++ GFP_KERNEL); ++ if (!clean) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ if (le16_to_cpu(c->disk_sb.sb->version) < ++ bcachefs_metadata_version_bkey_renumber) ++ bch2_sb_clean_renumber(clean, READ); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return clean; ++fsck_err: ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++} ++ ++static int read_btree_roots(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret = 0; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_root *r = &c->btree_roots[i]; ++ ++ if (!r->alive) ++ continue; ++ ++ if (i == BTREE_ID_ALLOC && ++ c->opts.reconstruct_alloc) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ continue; ++ } ++ ++ ++ if (r->error) { ++ __fsck_err(c, i == BTREE_ID_ALLOC ++ ? FSCK_CAN_IGNORE : 0, ++ "invalid btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_ALLOC) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ } ++ ++ ret = bch2_btree_root_read(c, i, &r->key, r->level); ++ if (ret) { ++ __fsck_err(c, i == BTREE_ID_ALLOC ++ ? 
FSCK_CAN_IGNORE : 0, ++ "error reading btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_ALLOC) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ } ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (!c->btree_roots[i].b) ++ bch2_btree_root_alloc(c, i); ++fsck_err: ++ return ret; ++} ++ ++int bch2_fs_recovery(struct bch_fs *c) ++{ ++ const char *err = "cannot allocate memory"; ++ struct bch_sb_field_clean *clean = NULL; ++ u64 journal_seq; ++ bool wrote = false, write_sb = false; ++ int ret; ++ ++ if (c->sb.clean) ++ clean = read_superblock_clean(c); ++ ret = PTR_ERR_OR_ZERO(clean); ++ if (ret) ++ goto err; ++ ++ if (c->sb.clean) ++ bch_info(c, "recovering from clean shutdown, journal seq %llu", ++ le64_to_cpu(clean->journal_seq)); ++ ++ if (!c->replicas.entries) { ++ bch_info(c, "building replicas info"); ++ set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); ++ } ++ ++ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { ++ struct jset *j; ++ ++ ret = bch2_journal_read(c, &c->journal_entries); ++ if (ret) ++ goto err; ++ ++ if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, ++ "filesystem marked clean but journal not empty")) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ } ++ ++ if (!c->sb.clean && list_empty(&c->journal_entries)) { ++ bch_err(c, "no journal entries found"); ++ ret = BCH_FSCK_REPAIR_IMPOSSIBLE; ++ goto err; ++ } ++ ++ c->journal_keys = journal_keys_sort(&c->journal_entries); ++ if (!c->journal_keys.d) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ j = &list_last_entry(&c->journal_entries, ++ struct journal_replay, list)->j; ++ ++ ret = verify_superblock_clean(c, &clean, j); ++ if (ret) ++ goto err; ++ ++ journal_seq = le64_to_cpu(j->seq) + 1; ++ } else { ++ journal_seq = le64_to_cpu(clean->journal_seq) + 1; ++ } ++ ++ if (!c->sb.clean && ++ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { ++ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = journal_replay_early(c, clean, &c->journal_entries); ++ if (ret) ++ goto err; ++ ++ if (!c->sb.clean) { ++ ret = bch2_journal_seq_blacklist_add(c, ++ journal_seq, ++ journal_seq + 4); ++ if (ret) { ++ bch_err(c, "error creating new journal seq blacklist entry"); ++ goto err; ++ } ++ ++ journal_seq += 4; ++ } ++ ++ ret = bch2_blacklist_table_initialize(c); ++ ++ if (!list_empty(&c->journal_entries)) { ++ ret = verify_journal_entries_not_blacklisted_or_missing(c, ++ &c->journal_entries); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_fs_journal_start(&c->journal, journal_seq, ++ &c->journal_entries); ++ if (ret) ++ goto err; ++ ++ ret = read_btree_roots(c); ++ if (ret) ++ goto err; ++ ++ bch_verbose(c, "starting alloc read"); ++ err = "error reading allocation information"; ++ ret = bch2_alloc_read(c, &c->journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "alloc read done"); ++ ++ bch_verbose(c, "starting stripes_read"); ++ err = "error reading stripes"; ++ ret = bch2_stripes_read(c, &c->journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "stripes_read done"); ++ ++ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ++ ++ if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && ++ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA))) { ++ /* ++ * interior btree node updates aren't consistent with the ++ * journal; after an unclean shutdown we have to walk all ++ * 
pointers to metadata: ++ */ ++ bch_info(c, "starting metadata mark and sweep"); ++ err = "error in mark and sweep"; ++ ret = bch2_gc(c, &c->journal_keys, true, true); ++ if (ret) ++ goto err; ++ bch_verbose(c, "mark and sweep done"); ++ } ++ ++ if (c->opts.fsck || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || ++ test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { ++ bch_info(c, "starting mark and sweep"); ++ err = "error in mark and sweep"; ++ ret = bch2_gc(c, &c->journal_keys, true, false); ++ if (ret) ++ goto err; ++ bch_verbose(c, "mark and sweep done"); ++ } ++ ++ clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ ++ /* ++ * Skip past versions that might have possibly been used (as nonces), ++ * but hadn't had their pointers written: ++ */ ++ if (c->sb.encryption_type && !c->sb.clean) ++ atomic64_add(1 << 16, &c->key_version); ++ ++ if (c->opts.norecovery) ++ goto out; ++ ++ bch_verbose(c, "starting journal replay"); ++ err = "journal replay failed"; ++ ret = bch2_journal_replay(c, c->journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "journal replay done"); ++ ++ if (!c->opts.nochanges) { ++ /* ++ * note that even when filesystem was clean there might be work ++ * to do here, if we ran gc (because of fsck) which recalculated ++ * oldest_gen: ++ */ ++ bch_verbose(c, "writing allocation info"); ++ err = "error writing out alloc info"; ++ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: ++ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); ++ if (ret) { ++ bch_err(c, "error writing alloc info"); ++ goto err; ++ } ++ bch_verbose(c, "alloc write done"); ++ ++ set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); ++ } ++ ++ if (!c->sb.clean) { ++ if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { ++ bch_info(c, "checking inode link counts"); ++ err = "error in recovery"; ++ ret = bch2_fsck_inode_nlink(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); ++ ++ } else { ++ bch_verbose(c, "checking for deleted inodes"); ++ err = "error in recovery"; ++ ret = bch2_fsck_walk_inodes_only(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); ++ } ++ } ++ ++ if (c->opts.fsck) { ++ bch_info(c, "starting fsck"); ++ err = "error in fsck"; ++ ret = bch2_fsck_full(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "fsck done"); ++ } ++ ++ if (enabled_qtypes(c)) { ++ bch_verbose(c, "reading quotas"); ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "quotas done"); ++ } ++ ++ mutex_lock(&c->sb_lock); ++ if (c->opts.version_upgrade) { ++ if (c->sb.version < bcachefs_metadata_version_new_versioning) ++ c->disk_sb.sb->version_min = ++ le16_to_cpu(bcachefs_metadata_version_min); ++ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ write_sb = true; ++ } ++ ++ if (!test_bit(BCH_FS_ERROR, &c->flags)) { ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; ++ write_sb = true; ++ } ++ ++ if (c->opts.fsck && ++ !test_bit(BCH_FS_ERROR, &c->flags)) { ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); ++ write_sb = true; ++ } ++ ++ if (write_sb) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (c->journal_seq_blacklist_table && ++ c->journal_seq_blacklist_table->nr > 128) ++ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); ++out: ++ ret = 0; ++err: ++fsck_err: ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ 
bch2_flush_fsck_errs(c); ++ ++ if (!c->opts.keep_journal) { ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(&c->journal_entries); ++ } ++ kfree(clean); ++ if (ret) ++ bch_err(c, "Error in recovery: %s (%i)", err, ret); ++ else ++ bch_verbose(c, "ret %i", ret); ++ return ret; ++} ++ ++int bch2_fs_initialize(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ struct bkey_inode_buf packed_inode; ++ struct qstr lostfound = QSTR("lost+found"); ++ const char *err = "cannot allocate memory"; ++ struct bch_dev *ca; ++ LIST_HEAD(journal); ++ unsigned i; ++ int ret; ++ ++ bch_notice(c, "initializing new filesystem"); ++ ++ mutex_lock(&c->sb_lock); ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, 0); ++ mutex_unlock(&c->sb_lock); ++ ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version = c->disk_sb.sb->version_min = ++ le16_to_cpu(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ bch2_btree_root_alloc(c, i); ++ ++ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); ++ set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); ++ ++ err = "unable to allocate journal buckets"; ++ for_each_online_member(ca, c, i) { ++ ret = bch2_dev_journal_alloc(ca); ++ if (ret) { ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ } ++ ++ /* ++ * journal_res_get() will crash if called before this has ++ * set up the journal.pin FIFO and journal.cur pointer: ++ */ ++ bch2_fs_journal_start(&c->journal, 1, &journal); ++ bch2_journal_set_replay_done(&c->journal); ++ ++ bch2_inode_init(c, &root_inode, 0, 0, ++ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); ++ root_inode.bi_inum = BCACHEFS_ROOT_INO; ++ bch2_inode_pack(&packed_inode, &root_inode); ++ ++ err = "error creating root directory"; ++ ret = bch2_btree_insert(c, BTREE_ID_INODES, ++ &packed_inode.inode.k_i, ++ NULL, NULL, BTREE_INSERT_LAZY_RW); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init_early(c, &lostfound_inode); ++ ++ err = "error creating lost+found"; ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_create_trans(&trans, BCACHEFS_ROOT_INO, ++ &root_inode, &lostfound_inode, ++ &lostfound, ++ 0, 0, S_IFDIR|0700, 0, ++ NULL, NULL)); ++ if (ret) ++ goto err; ++ ++ if (enabled_qtypes(c)) { ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ } ++ ++ err = "error writing first journal entry"; ++ ret = bch2_journal_meta(&c->journal); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++err: ++ pr_err("Error initializing new filesystem: %s (%i)", err, ret); ++ return ret; ++} +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +new file mode 100644 +index 000000000000..a66827c9addf +--- /dev/null ++++ b/fs/bcachefs/recovery.h +@@ -0,0 +1,60 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_RECOVERY_H ++#define _BCACHEFS_RECOVERY_H ++ ++#define for_each_journal_key(keys, i) \ ++ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) ++ ++struct journal_iter { ++ enum btree_id btree_id; ++ unsigned level; ++ struct journal_keys *keys; ++ struct journal_key *k; ++}; ++ ++/* ++ * Iterate over keys in the 
btree, with keys from the journal overlaid on top: ++ */ ++ ++struct btree_and_journal_iter { ++ struct btree_iter *btree; ++ ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey unpacked; ++ ++ struct journal_iter journal; ++ ++ enum last_key_returned { ++ none, ++ btree, ++ journal, ++ } last; ++}; ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); ++ ++void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, ++ struct btree_trans *, ++ struct journal_keys *, ++ enum btree_id, struct bpos); ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct journal_keys *, ++ struct btree *); ++ ++typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); ++typedef int (*btree_walk_key_fn)(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k); ++ ++int bch2_btree_and_journal_walk(struct bch_fs *, struct journal_keys *, enum btree_id, ++ btree_walk_node_fn, btree_walk_key_fn); ++ ++void bch2_journal_keys_free(struct journal_keys *); ++void bch2_journal_entries_free(struct list_head *); ++ ++int bch2_fs_recovery(struct bch_fs *); ++int bch2_fs_initialize(struct bch_fs *); ++ ++#endif /* _BCACHEFS_RECOVERY_H */ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +new file mode 100644 +index 000000000000..3c473f1380a6 +--- /dev/null ++++ b/fs/bcachefs/reflink.c +@@ -0,0 +1,303 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "inode.h" ++#include "io.h" ++#include "reflink.h" ++ ++#include ++ ++/* reflink pointers */ ++ ++const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ if (bkey_val_bytes(p.k) != sizeof(*p.v)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); ++} ++ ++enum merge_result bch2_reflink_p_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); ++ struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); ++ ++ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) ++ return BCH_MERGE_NOMERGE; ++ ++ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { ++ bch2_key_resize(l.k, KEY_SIZE_MAX); ++ bch2_cut_front_s(l.k->p, _r); ++ return BCH_MERGE_PARTIAL; ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* indirect extents */ ++ ++const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ if (bkey_val_bytes(r.k) < sizeof(*r.v)) ++ return "incorrect value size"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); ++ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++static int bch2_make_extent_indirect(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct 
bkey_i_extent *e) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *reflink_iter; ++ struct bkey_s_c k; ++ struct bkey_i_reflink_v *r_v; ++ struct bkey_i_reflink_p *r_p; ++ int ret; ++ ++ for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, ++ POS(0, c->reflink_hint), ++ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { ++ if (reflink_iter->pos.inode) { ++ bch2_btree_iter_set_pos(reflink_iter, POS_MIN); ++ continue; ++ } ++ ++ if (bkey_deleted(k.k) && e->k.size <= k.k->size) ++ break; ++ } ++ ++ if (ret) ++ goto err; ++ ++ /* rewind iter to start of hole, if necessary: */ ++ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); ++ ++ r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); ++ ret = PTR_ERR_OR_ZERO(r_v); ++ if (ret) ++ goto err; ++ ++ bkey_reflink_v_init(&r_v->k_i); ++ r_v->k.p = reflink_iter->pos; ++ bch2_key_resize(&r_v->k, e->k.size); ++ r_v->k.version = e->k.version; ++ ++ set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + ++ bkey_val_u64s(&e->k)); ++ r_v->v.refcount = 0; ++ memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); ++ ++ bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); ++ ++ r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); ++ if (IS_ERR(r_p)) ++ return PTR_ERR(r_p); ++ ++ e->k.type = KEY_TYPE_reflink_p; ++ r_p = bkey_i_to_reflink_p(&e->k_i); ++ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); ++ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ++ ++ bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); ++err: ++ if (!IS_ERR(reflink_iter)) ++ c->reflink_hint = reflink_iter->pos.offset; ++ bch2_trans_iter_put(trans, reflink_iter); ++ ++ return ret; ++} ++ ++static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) ++{ ++ struct bkey_s_c k = bch2_btree_iter_peek(iter); ++ int ret; ++ ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (bkey_cmp(iter->pos, end) >= 0) ++ return bkey_s_c_null; ++ ++ if (k.k->type == KEY_TYPE_extent || ++ k.k->type == KEY_TYPE_reflink_p) ++ break; ++ } ++ ++ return k; ++} ++ ++s64 bch2_remap_range(struct bch_fs *c, ++ struct bpos dst_start, struct bpos src_start, ++ u64 remap_sectors, u64 *journal_seq, ++ u64 new_i_size, s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter *dst_iter, *src_iter; ++ struct bkey_s_c src_k; ++ BKEY_PADDED(k) new_dst; ++ struct bkey_on_stack new_src; ++ struct bpos dst_end = dst_start, src_end = src_start; ++ struct bpos dst_want, src_want; ++ u64 src_done, dst_done; ++ int ret = 0, ret2 = 0; ++ ++ if (!c->opts.reflink) ++ return -EOPNOTSUPP; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return -EROFS; ++ ++ bch2_check_set_feature(c, BCH_FEATURE_reflink); ++ ++ dst_end.offset += remap_sectors; ++ src_end.offset += remap_sectors; ++ ++ bkey_on_stack_init(&new_src); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); ++ ++ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, ++ BTREE_ITER_INTENT); ++ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, ++ BTREE_ITER_INTENT); ++ ++ while (1) { ++ bch2_trans_begin(&trans); ++ ++ trans.mem_top = 0; ++ ++ if (fatal_signal_pending(current)) { ++ ret = -EINTR; ++ goto err; ++ } ++ ++ src_k = get_next_src(src_iter, src_end); ++ ret = bkey_err(src_k); ++ if (ret) ++ goto btree_err; ++ ++ src_done = bpos_min(src_iter->pos, src_end).offset - ++ src_start.offset; ++ dst_want = POS(dst_start.inode, dst_start.offset + src_done); ++ ++ if (bkey_cmp(dst_iter->pos, dst_want) < 0) { ++ ret = bch2_fpunch_at(&trans, dst_iter, dst_want, ++ journal_seq, 
i_sectors_delta); ++ if (ret) ++ goto btree_err; ++ continue; ++ } ++ ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); ++ ++ if (!bkey_cmp(dst_iter->pos, dst_end)) ++ break; ++ ++ if (src_k.k->type == KEY_TYPE_extent) { ++ bkey_on_stack_reassemble(&new_src, c, src_k); ++ src_k = bkey_i_to_s_c(new_src.k); ++ ++ bch2_cut_front(src_iter->pos, new_src.k); ++ bch2_cut_back(src_end, new_src.k); ++ ++ ret = bch2_make_extent_indirect(&trans, src_iter, ++ bkey_i_to_extent(new_src.k)); ++ if (ret) ++ goto btree_err; ++ ++ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); ++ } ++ ++ if (src_k.k->type == KEY_TYPE_reflink_p) { ++ struct bkey_s_c_reflink_p src_p = ++ bkey_s_c_to_reflink_p(src_k); ++ struct bkey_i_reflink_p *dst_p = ++ bkey_reflink_p_init(&new_dst.k); ++ ++ u64 offset = le64_to_cpu(src_p.v->idx) + ++ (src_iter->pos.offset - ++ bkey_start_offset(src_k.k)); ++ ++ dst_p->v.idx = cpu_to_le64(offset); ++ } else { ++ BUG(); ++ } ++ ++ new_dst.k.k.p = dst_iter->pos; ++ bch2_key_resize(&new_dst.k.k, ++ min(src_k.k->p.offset - src_iter->pos.offset, ++ dst_end.offset - dst_iter->pos.offset)); ++ ++ ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, ++ NULL, journal_seq, ++ new_i_size, i_sectors_delta); ++ if (ret) ++ goto btree_err; ++ ++ dst_done = dst_iter->pos.offset - dst_start.offset; ++ src_want = POS(src_start.inode, src_start.offset + dst_done); ++ bch2_btree_iter_set_pos(src_iter, src_want); ++btree_err: ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ goto err; ++ } ++ ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); ++err: ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); ++ ++ dst_done = dst_iter->pos.offset - dst_start.offset; ++ new_i_size = min(dst_iter->pos.offset << 9, new_i_size); ++ ++ bch2_trans_begin(&trans); ++ ++ do { ++ struct bch_inode_unpacked inode_u; ++ struct btree_iter *inode_iter; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, ++ dst_start.inode, BTREE_ITER_INTENT); ++ ret2 = PTR_ERR_OR_ZERO(inode_iter); ++ ++ if (!ret2 && ++ inode_u.bi_size < new_i_size) { ++ inode_u.bi_size = new_i_size; ++ ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, journal_seq, 0); ++ } ++ } while (ret2 == -EINTR); ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&new_src, c); ++ ++ percpu_ref_put(&c->writes); ++ ++ return dst_done ?: ret ?: ret2; ++} +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +new file mode 100644 +index 000000000000..5445c1cf0797 +--- /dev/null ++++ b/fs/bcachefs/reflink.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REFLINK_H ++#define _BCACHEFS_REFLINK_H ++ ++const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++enum merge_result bch2_reflink_p_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_p_invalid, \ ++ .val_to_text = bch2_reflink_p_to_text, \ ++ .key_merge = bch2_reflink_p_merge, \ ++} ++ ++const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++ ++#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_v_invalid, \ ++ .val_to_text = bch2_reflink_v_to_text, \ ++ .swab = bch2_ptr_swab, \ ++} ++ ++s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, ++ u64, u64 *, u64, s64 *); ++ ++#endif /* 
_BCACHEFS_REFLINK_H */ +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +new file mode 100644 +index 000000000000..67a7128fd9af +--- /dev/null ++++ b/fs/bcachefs/replicas.c +@@ -0,0 +1,1084 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, ++ struct bch_replicas_cpu *); ++ ++/* Replicas tracking - in memory: */ ++ ++static inline int u8_cmp(u8 l, u8 r) ++{ ++ return cmp_int(l, r); ++} ++ ++static void verify_replicas_entry(struct bch_replicas_entry *e) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned i; ++ ++ BUG_ON(e->data_type >= BCH_DATA_NR); ++ BUG_ON(!e->nr_devs); ++ BUG_ON(e->nr_required > 1 && ++ e->nr_required >= e->nr_devs); ++ ++ for (i = 0; i + 1 < e->nr_devs; i++) ++ BUG_ON(e->devs[i] >= e->devs[i + 1]); ++#endif ++} ++ ++static void replicas_entry_sort(struct bch_replicas_entry *e) ++{ ++ bubble_sort(e->devs, e->nr_devs, u8_cmp); ++} ++ ++static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) ++{ ++ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); ++} ++ ++void bch2_replicas_entry_to_text(struct printbuf *out, ++ struct bch_replicas_entry *e) ++{ ++ unsigned i; ++ ++ pr_buf(out, "%s: %u/%u [", ++ bch2_data_types[e->data_type], ++ e->nr_required, ++ e->nr_devs); ++ ++ for (i = 0; i < e->nr_devs; i++) ++ pr_buf(out, i ? " %u" : "%u", e->devs[i]); ++ pr_buf(out, "]"); ++} ++ ++void bch2_cpu_replicas_to_text(struct printbuf *out, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_cpu_replicas_entry(r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++} ++ ++static void extent_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ r->nr_required = 1; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ continue; ++ ++ if (!p.has_ec) ++ r->devs[r->nr_devs++] = p.ptr.dev; ++ else ++ r->nr_required = 0; ++ } ++} ++ ++static void stripe_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ const struct bch_extent_ptr *ptr; ++ ++ r->nr_required = s.v->nr_blocks - s.v->nr_redundant; ++ ++ for (ptr = s.v->ptrs; ++ ptr < s.v->ptrs + s.v->nr_blocks; ++ ptr++) ++ r->devs[r->nr_devs++] = ptr->dev; ++} ++ ++void bch2_bkey_to_replicas(struct bch_replicas_entry *e, ++ struct bkey_s_c k) ++{ ++ e->nr_devs = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ e->data_type = BCH_DATA_BTREE; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ e->data_type = BCH_DATA_USER; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_stripe: ++ e->data_type = BCH_DATA_USER; ++ stripe_to_replicas(k, e); ++ break; ++ } ++ ++ replicas_entry_sort(e); ++} ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *e, ++ enum bch_data_type data_type, ++ struct bch_devs_list devs) ++{ ++ unsigned i; ++ ++ BUG_ON(!data_type || ++ data_type == BCH_DATA_SB || ++ data_type >= BCH_DATA_NR); ++ ++ e->data_type = data_type; ++ e->nr_devs = 0; ++ e->nr_required = 1; ++ ++ for (i = 0; i < devs.nr; i++) ++ e->devs[e->nr_devs++] = devs.devs[i]; ++ ++ replicas_entry_sort(e); ++} ++ ++static 
struct bch_replicas_cpu ++cpu_replicas_add_entry(struct bch_replicas_cpu *old, ++ struct bch_replicas_entry *new_entry) ++{ ++ unsigned i; ++ struct bch_replicas_cpu new = { ++ .nr = old->nr + 1, ++ .entry_size = max_t(unsigned, old->entry_size, ++ replicas_entry_bytes(new_entry)), ++ }; ++ ++ BUG_ON(!new_entry->data_type); ++ verify_replicas_entry(new_entry); ++ ++ new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); ++ if (!new.entries) ++ return new; ++ ++ for (i = 0; i < old->nr; i++) ++ memcpy(cpu_replicas_entry(&new, i), ++ cpu_replicas_entry(old, i), ++ old->entry_size); ++ ++ memcpy(cpu_replicas_entry(&new, old->nr), ++ new_entry, ++ replicas_entry_bytes(new_entry)); ++ ++ bch2_cpu_replicas_sort(&new); ++ return new; ++} ++ ++static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ int idx, entry_size = replicas_entry_bytes(search); ++ ++ if (unlikely(entry_size > r->entry_size)) ++ return -1; ++ ++ verify_replicas_entry(search); ++ ++#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) ++ idx = eytzinger0_find(r->entries, r->nr, r->entry_size, ++ entry_cmp, search); ++#undef entry_cmp ++ ++ return idx < r->nr ? idx : -1; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *c, ++ struct bch_replicas_entry *search) ++{ ++ replicas_entry_sort(search); ++ ++ return __replicas_entry_idx(&c->replicas, search); ++} ++ ++static bool __replicas_has_entry(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ return __replicas_entry_idx(r, search) >= 0; ++} ++ ++static bool bch2_replicas_marked_locked(struct bch_fs *c, ++ struct bch_replicas_entry *search, ++ bool check_gc_replicas) ++{ ++ if (!search->nr_devs) ++ return true; ++ ++ verify_replicas_entry(search); ++ ++ return __replicas_has_entry(&c->replicas, search) && ++ (!check_gc_replicas || ++ likely((!c->replicas_gc.entries)) || ++ __replicas_has_entry(&c->replicas_gc, search)); ++} ++ ++bool bch2_replicas_marked(struct bch_fs *c, ++ struct bch_replicas_entry *search, ++ bool check_gc_replicas) ++{ ++ bool marked; ++ ++ percpu_down_read(&c->mark_lock); ++ marked = bch2_replicas_marked_locked(c, search, check_gc_replicas); ++ percpu_up_read(&c->mark_lock); ++ ++ return marked; ++} ++ ++static void __replicas_table_update(struct bch_fs_usage *dst, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage *src, ++ struct bch_replicas_cpu *src_r) ++{ ++ int src_idx, dst_idx; ++ ++ *dst = *src; ++ ++ for (src_idx = 0; src_idx < src_r->nr; src_idx++) { ++ if (!src->replicas[src_idx]) ++ continue; ++ ++ dst_idx = __replicas_entry_idx(dst_r, ++ cpu_replicas_entry(src_r, src_idx)); ++ BUG_ON(dst_idx < 0); ++ ++ dst->replicas[dst_idx] = src->replicas[src_idx]; ++ } ++} ++ ++static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage __percpu *src_p, ++ struct bch_replicas_cpu *src_r) ++{ ++ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; ++ struct bch_fs_usage *dst, *src = (void *) ++ bch2_acc_percpu_u64s((void *) src_p, src_nr); ++ ++ preempt_disable(); ++ dst = this_cpu_ptr(dst_p); ++ preempt_enable(); ++ ++ __replicas_table_update(dst, dst_r, src, src_r); ++} ++ ++/* ++ * Resize filesystem accounting: ++ */ ++static int replicas_table_update(struct bch_fs *c, ++ struct bch_replicas_cpu *new_r) ++{ ++ struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; ++ struct bch_fs_usage *new_scratch = NULL; ++ struct bch_fs_usage __percpu *new_gc = NULL; ++ struct 
bch_fs_usage *new_base = NULL; ++ unsigned bytes = sizeof(struct bch_fs_usage) + ++ sizeof(u64) * new_r->nr; ++ int ret = -ENOMEM; ++ ++ if (!(new_base = kzalloc(bytes, GFP_NOIO)) || ++ !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), ++ GFP_NOIO)) || ++ !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), ++ GFP_NOIO)) || ++ !(new_scratch = kmalloc(bytes, GFP_NOIO)) || ++ (c->usage_gc && ++ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { ++ bch_err(c, "error updating replicas table: memory allocation failure"); ++ goto err; ++ } ++ ++ if (c->usage_base) ++ __replicas_table_update(new_base, new_r, ++ c->usage_base, &c->replicas); ++ if (c->usage[0]) ++ __replicas_table_update_pcpu(new_usage[0], new_r, ++ c->usage[0], &c->replicas); ++ if (c->usage[1]) ++ __replicas_table_update_pcpu(new_usage[1], new_r, ++ c->usage[1], &c->replicas); ++ if (c->usage_gc) ++ __replicas_table_update_pcpu(new_gc, new_r, ++ c->usage_gc, &c->replicas); ++ ++ swap(c->usage_base, new_base); ++ swap(c->usage[0], new_usage[0]); ++ swap(c->usage[1], new_usage[1]); ++ swap(c->usage_scratch, new_scratch); ++ swap(c->usage_gc, new_gc); ++ swap(c->replicas, *new_r); ++ ret = 0; ++err: ++ free_percpu(new_gc); ++ kfree(new_scratch); ++ free_percpu(new_usage[1]); ++ free_percpu(new_usage[0]); ++ kfree(new_base); ++ return ret; ++} ++ ++static unsigned reserve_journal_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ unsigned journal_res_u64s = 0; ++ ++ /* nr_inodes: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* key_version: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* persistent_reserved: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * ++ BCH_REPLICAS_MAX; ++ ++ for_each_cpu_replicas_entry(r, e) ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + ++ e->nr_devs, sizeof(u64)); ++ return journal_res_u64s; ++} ++ ++noinline ++static int bch2_mark_replicas_slowpath(struct bch_fs *c, ++ struct bch_replicas_entry *new_entry) ++{ ++ struct bch_replicas_cpu new_r, new_gc; ++ int ret = 0; ++ ++ verify_replicas_entry(new_entry); ++ ++ memset(&new_r, 0, sizeof(new_r)); ++ memset(&new_gc, 0, sizeof(new_gc)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (c->replicas_gc.entries && ++ !__replicas_has_entry(&c->replicas_gc, new_entry)) { ++ new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); ++ if (!new_gc.entries) ++ goto err; ++ } ++ ++ if (!__replicas_has_entry(&c->replicas, new_entry)) { ++ new_r = cpu_replicas_add_entry(&c->replicas, new_entry); ++ if (!new_r.entries) ++ goto err; ++ ++ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); ++ if (ret) ++ goto err; ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->replicas_journal_res, ++ reserve_journal_replicas(c, &new_r)); ++ } ++ ++ if (!new_r.entries && ++ !new_gc.entries) ++ goto out; ++ ++ /* allocations done, now commit: */ ++ ++ if (new_r.entries) ++ bch2_write_super(c); ++ ++ /* don't update in memory replicas until changes are persistent */ ++ percpu_down_write(&c->mark_lock); ++ if (new_r.entries) ++ ret = replicas_table_update(c, &new_r); ++ if (new_gc.entries) ++ swap(new_gc, c->replicas_gc); ++ percpu_up_write(&c->mark_lock); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ kfree(new_r.entries); ++ kfree(new_gc.entries); ++ ++ return ret; ++err: ++ bch_err(c, "error adding replicas entry: memory allocation failure"); ++ ret = 
-ENOMEM; ++ goto out; ++} ++ ++int bch2_mark_replicas(struct bch_fs *c, ++ struct bch_replicas_entry *r) ++{ ++ return likely(bch2_replicas_marked(c, r, true)) ++ ? 0 ++ : bch2_mark_replicas_slowpath(c, r); ++} ++ ++bool bch2_bkey_replicas_marked_locked(struct bch_fs *c, ++ struct bkey_s_c k, ++ bool check_gc_replicas) ++{ ++ struct bch_replicas_padded search; ++ struct bch_devs_list cached = bch2_bkey_cached_devs(k); ++ unsigned i; ++ ++ for (i = 0; i < cached.nr; i++) { ++ bch2_replicas_entry_cached(&search.e, cached.devs[i]); ++ ++ if (!bch2_replicas_marked_locked(c, &search.e, ++ check_gc_replicas)) ++ return false; ++ } ++ ++ bch2_bkey_to_replicas(&search.e, k); ++ ++ return bch2_replicas_marked_locked(c, &search.e, check_gc_replicas); ++} ++ ++bool bch2_bkey_replicas_marked(struct bch_fs *c, ++ struct bkey_s_c k, ++ bool check_gc_replicas) ++{ ++ bool marked; ++ ++ percpu_down_read(&c->mark_lock); ++ marked = bch2_bkey_replicas_marked_locked(c, k, check_gc_replicas); ++ percpu_up_read(&c->mark_lock); ++ ++ return marked; ++} ++ ++int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bch_replicas_padded search; ++ struct bch_devs_list cached = bch2_bkey_cached_devs(k); ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < cached.nr; i++) { ++ bch2_replicas_entry_cached(&search.e, cached.devs[i]); ++ ++ ret = bch2_mark_replicas(c, &search.e); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_bkey_to_replicas(&search.e, k); ++ ++ return bch2_mark_replicas(c, &search.e); ++} ++ ++int bch2_replicas_gc_end(struct bch_fs *c, int ret) ++{ ++ unsigned i; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ /* ++ * this is kind of crappy; the replicas gc mechanism needs to be ripped ++ * out ++ */ ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct bch_replicas_cpu n; ++ ++ if (!__replicas_has_entry(&c->replicas_gc, e) && ++ (c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i]))) { ++ n = cpu_replicas_add_entry(&c->replicas_gc, e); ++ if (!n.entries) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ swap(n, c->replicas_gc); ++ kfree(n.entries); ++ } ++ } ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &c->replicas_gc); ++err: ++ kfree(c->replicas_gc.entries); ++ c->replicas_gc.entries = NULL; ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) ++{ ++ struct bch_replicas_entry *e; ++ unsigned i = 0; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ BUG_ON(c->replicas_gc.entries); ++ ++ c->replicas_gc.nr = 0; ++ c->replicas_gc.entry_size = 0; ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) { ++ c->replicas_gc.nr++; ++ c->replicas_gc.entry_size = ++ max_t(unsigned, c->replicas_gc.entry_size, ++ replicas_entry_bytes(e)); ++ } ++ ++ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, ++ c->replicas_gc.entry_size, ++ GFP_NOIO); ++ if (!c->replicas_gc.entries) { ++ mutex_unlock(&c->sb_lock); ++ bch_err(c, "error allocating c->replicas_gc"); ++ return -ENOMEM; ++ } ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) ++ 
memcpy(cpu_replicas_entry(&c->replicas_gc, i++), ++ e, c->replicas_gc.entry_size); ++ ++ bch2_cpu_replicas_sort(&c->replicas_gc); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_replicas_gc2(struct bch_fs *c) ++{ ++ struct bch_replicas_cpu new = { 0 }; ++ unsigned i, nr; ++ int ret = 0; ++ ++ bch2_journal_meta(&c->journal); ++retry: ++ nr = READ_ONCE(c->replicas.nr); ++ new.entry_size = READ_ONCE(c->replicas.entry_size); ++ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); ++ if (!new.entries) { ++ bch_err(c, "error allocating c->replicas_gc"); ++ return -ENOMEM; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ if (nr != c->replicas.nr || ++ new.entry_size != c->replicas.entry_size) { ++ percpu_up_write(&c->mark_lock); ++ mutex_unlock(&c->sb_lock); ++ kfree(new.entries); ++ goto retry; ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ if (e->data_type == BCH_DATA_JOURNAL || ++ c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i])) ++ memcpy(cpu_replicas_entry(&new, new.nr++), ++ e, new.entry_size); ++ } ++ ++ bch2_cpu_replicas_sort(&new); ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &new); ++err: ++ kfree(new.entries); ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_set_usage(struct bch_fs *c, ++ struct bch_replicas_entry *r, ++ u64 sectors) ++{ ++ int ret, idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) { ++ struct bch_replicas_cpu n; ++ ++ n = cpu_replicas_add_entry(&c->replicas, r); ++ if (!n.entries) ++ return -ENOMEM; ++ ++ ret = replicas_table_update(c, &n); ++ if (ret) ++ return ret; ++ ++ kfree(n.entries); ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ BUG_ON(ret < 0); ++ } ++ ++ c->usage_base->replicas[idx] = sectors; ++ ++ return 0; ++} ++ ++/* Replicas tracking - superblock: */ ++ ++static int ++__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry *e, *dst; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ dst = cpu_replicas_entry(cpu_r, idx++); ++ memcpy(dst, e, replicas_entry_bytes(e)); ++ replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++static int ++__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry_v0 *e; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ entry_size += sizeof(struct bch_replicas_entry) - ++ sizeof(struct bch_replicas_entry_v0); ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ struct bch_replicas_entry *dst = ++ cpu_replicas_entry(cpu_r, idx++); ++ ++ dst->data_type = e->data_type; ++ 
dst->nr_devs = e->nr_devs; ++ dst->nr_required = 1; ++ memcpy(dst->devs, e->devs, e->nr_devs); ++ replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) ++{ ++ struct bch_sb_field_replicas *sb_v1; ++ struct bch_sb_field_replicas_v0 *sb_v0; ++ struct bch_replicas_cpu new_r = { 0, 0, NULL }; ++ int ret = 0; ++ ++ if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); ++ else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); ++ ++ if (ret) ++ return -ENOMEM; ++ ++ bch2_cpu_replicas_sort(&new_r); ++ ++ percpu_down_write(&c->mark_lock); ++ ++ ret = replicas_table_update(c, &new_r); ++ percpu_up_write(&c->mark_lock); ++ ++ kfree(new_r.entries); ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r; ++ struct bch_replicas_entry_v0 *dst; ++ struct bch_replicas_entry *src; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) ++ bytes += replicas_entry_bytes(src) - 1; ++ ++ sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); ++ sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ dst->data_type = src->data_type; ++ dst->nr_devs = src->nr_devs; ++ memcpy(dst->devs, src->devs, src->nr_devs); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas *sb_r; ++ struct bch_replicas_entry *dst, *src; ++ bool need_v1 = false; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) { ++ bytes += replicas_entry_bytes(src); ++ if (src->nr_required != 1) ++ need_v1 = true; ++ } ++ ++ if (!need_v1) ++ return bch2_cpu_replicas_to_sb_replicas_v0(c, r); ++ ++ sb_r = bch2_sb_resize_replicas(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); ++ sb_r = bch2_sb_get_replicas(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ memcpy(dst, src, replicas_entry_bytes(src)); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) ++{ ++ unsigned i; ++ ++ sort_cmp_size(cpu_r->entries, ++ cpu_r->nr, ++ cpu_r->entry_size, ++ memcmp, NULL); ++ ++ for (i = 0; i + 1 < cpu_r->nr; i++) { ++ struct bch_replicas_entry *l = ++ cpu_replicas_entry(cpu_r, i); ++ struct bch_replicas_entry *r = ++ cpu_replicas_entry(cpu_r, i + 1); ++ ++ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); ++ ++ if (!memcmp(l, r, cpu_r->entry_size)) ++ return "duplicate replicas entry"; ++ } ++ ++ return NULL; ++} ++ ++static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) ++{ ++ struct 
bch_sb_field_replicas *sb_r = field_to_type(f, replicas); ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_replicas_cpu cpu_r = { .entries = NULL }; ++ struct bch_replicas_entry *e; ++ const char *err; ++ unsigned i; ++ ++ for_each_replicas_entry(sb_r, e) { ++ err = "invalid replicas entry: invalid data type"; ++ if (e->data_type >= BCH_DATA_NR) ++ goto err; ++ ++ err = "invalid replicas entry: no devices"; ++ if (!e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: bad nr_required"; ++ if (e->nr_required > 1 && ++ e->nr_required >= e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: invalid device"; ++ for (i = 0; i < e->nr_devs; i++) ++ if (!bch2_dev_exists(sb, mi, e->devs[i])) ++ goto err; ++ } ++ ++ err = "cannot allocate memory"; ++ if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) ++ goto err; ++ ++ err = check_dup_replicas_entries(&cpu_r); ++err: ++ kfree(cpu_r.entries); ++ return err; ++} ++ ++static void bch2_sb_replicas_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas *r = field_to_type(f, replicas); ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_replicas_entry(r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas = { ++ .validate = bch2_sb_validate_replicas, ++ .to_text = bch2_sb_replicas_to_text, ++}; ++ ++static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_replicas_cpu cpu_r = { .entries = NULL }; ++ struct bch_replicas_entry_v0 *e; ++ const char *err; ++ unsigned i; ++ ++ for_each_replicas_entry_v0(sb_r, e) { ++ err = "invalid replicas entry: invalid data type"; ++ if (e->data_type >= BCH_DATA_NR) ++ goto err; ++ ++ err = "invalid replicas entry: no devices"; ++ if (!e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: invalid device"; ++ for (i = 0; i < e->nr_devs; i++) ++ if (!bch2_dev_exists(sb, mi, e->devs[i])) ++ goto err; ++ } ++ ++ err = "cannot allocate memory"; ++ if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) ++ goto err; ++ ++ err = check_dup_replicas_entries(&cpu_r); ++err: ++ kfree(cpu_r.entries); ++ return err; ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { ++ .validate = bch2_sb_validate_replicas_v0, ++}; ++ ++/* Query replicas: */ ++ ++struct replicas_status __bch2_replicas_status(struct bch_fs *c, ++ struct bch_devs_mask online_devs) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_replicas_entry *e; ++ unsigned i, nr_online, nr_offline; ++ struct replicas_status ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ ++ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) ++ ret.replicas[i].redundancy = INT_MAX; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ percpu_down_read(&c->mark_lock); ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) { ++ if (e->data_type >= ARRAY_SIZE(ret.replicas)) ++ panic("e %p data_type %u\n", e, e->data_type); ++ ++ nr_online = nr_offline = 0; ++ ++ for (i = 0; i < e->nr_devs; i++) { ++ BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, ++ e->devs[i])); ++ ++ if (test_bit(e->devs[i], online_devs.d)) ++ nr_online++; ++ else ++ nr_offline++; ++ } ++ ++ ret.replicas[e->data_type].redundancy = ++ min(ret.replicas[e->data_type].redundancy, ++ (int) nr_online - 
(int) e->nr_required); ++ ++ ret.replicas[e->data_type].nr_offline = ++ max(ret.replicas[e->data_type].nr_offline, ++ nr_offline); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++ ++ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) ++ if (ret.replicas[i].redundancy == INT_MAX) ++ ret.replicas[i].redundancy = 0; ++ ++ return ret; ++} ++ ++struct replicas_status bch2_replicas_status(struct bch_fs *c) ++{ ++ return __bch2_replicas_status(c, bch2_online_devs(c)); ++} ++ ++static bool have_enough_devs(struct replicas_status s, ++ enum bch_data_type type, ++ bool force_if_degraded, ++ bool force_if_lost) ++{ ++ return (!s.replicas[type].nr_offline || force_if_degraded) && ++ (s.replicas[type].redundancy >= 0 || force_if_lost); ++} ++ ++bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) ++{ ++ return (have_enough_devs(s, BCH_DATA_JOURNAL, ++ flags & BCH_FORCE_IF_METADATA_DEGRADED, ++ flags & BCH_FORCE_IF_METADATA_LOST) && ++ have_enough_devs(s, BCH_DATA_BTREE, ++ flags & BCH_FORCE_IF_METADATA_DEGRADED, ++ flags & BCH_FORCE_IF_METADATA_LOST) && ++ have_enough_devs(s, BCH_DATA_USER, ++ flags & BCH_FORCE_IF_DATA_DEGRADED, ++ flags & BCH_FORCE_IF_DATA_LOST)); ++} ++ ++int bch2_replicas_online(struct bch_fs *c, bool meta) ++{ ++ struct replicas_status s = bch2_replicas_status(c); ++ ++ return (meta ++ ? min(s.replicas[BCH_DATA_JOURNAL].redundancy, ++ s.replicas[BCH_DATA_BTREE].redundancy) ++ : s.replicas[BCH_DATA_USER].redundancy) + 1; ++} ++ ++unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_replicas_entry *e; ++ unsigned i, ret = 0; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ for (i = 0; i < e->nr_devs; i++) ++ if (e->devs[i] == ca->dev_idx) ++ ret |= 1 << e->data_type; ++ ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++int bch2_fs_replicas_init(struct bch_fs *c) ++{ ++ c->journal.entry_u64s_reserved += ++ reserve_journal_replicas(c, &c->replicas); ++ ++ return replicas_table_update(c, &c->replicas); ++} +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +new file mode 100644 +index 000000000000..8527d82841bb +--- /dev/null ++++ b/fs/bcachefs/replicas.h +@@ -0,0 +1,95 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REPLICAS_H ++#define _BCACHEFS_REPLICAS_H ++ ++#include "eytzinger.h" ++#include "replicas_types.h" ++ ++void bch2_replicas_entry_to_text(struct printbuf *, ++ struct bch_replicas_entry *); ++void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); ++ ++static inline struct bch_replicas_entry * ++cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) ++{ ++ return (void *) r->entries + r->entry_size * i; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *, ++ enum bch_data_type, ++ struct bch_devs_list); ++bool bch2_replicas_marked(struct bch_fs *, ++ struct bch_replicas_entry *, bool); ++int bch2_mark_replicas(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++bool bch2_bkey_replicas_marked_locked(struct bch_fs *, ++ struct bkey_s_c, bool); ++void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); ++bool bch2_bkey_replicas_marked(struct bch_fs *, ++ struct bkey_s_c, bool); ++int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); ++ ++static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, ++ unsigned dev) ++{ ++ e->data_type = BCH_DATA_CACHED; ++ e->nr_devs = 1; ++ e->nr_required = 1; 
++ e->devs[0] = dev; ++} ++ ++struct replicas_status { ++ struct { ++ int redundancy; ++ unsigned nr_offline; ++ } replicas[BCH_DATA_NR]; ++}; ++ ++struct replicas_status __bch2_replicas_status(struct bch_fs *, ++ struct bch_devs_mask); ++struct replicas_status bch2_replicas_status(struct bch_fs *); ++bool bch2_have_enough_devs(struct replicas_status, unsigned); ++ ++int bch2_replicas_online(struct bch_fs *, bool); ++unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); ++ ++int bch2_replicas_gc_end(struct bch_fs *, int); ++int bch2_replicas_gc_start(struct bch_fs *, unsigned); ++int bch2_replicas_gc2(struct bch_fs *); ++ ++int bch2_replicas_set_usage(struct bch_fs *, ++ struct bch_replicas_entry *, ++ u64); ++ ++#define for_each_cpu_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ ++ _i = (void *) (_i) + (_r)->entry_size) ++ ++/* iterate over superblock replicas - used by userspace tools: */ ++ ++#define replicas_entry_next(_i) \ ++ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) ++ ++#define for_each_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++#define for_each_replicas_entry_v0(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; ++ ++int bch2_fs_replicas_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REPLICAS_H */ +diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h +new file mode 100644 +index 000000000000..0535b1d3760e +--- /dev/null ++++ b/fs/bcachefs/replicas_types.h +@@ -0,0 +1,10 @@ ++#ifndef _BCACHEFS_REPLICAS_TYPES_H ++#define _BCACHEFS_REPLICAS_TYPES_H ++ ++struct bch_replicas_cpu { ++ unsigned nr; ++ unsigned entry_size; ++ struct bch_replicas_entry *entries; ++}; ++ ++#endif /* _BCACHEFS_REPLICAS_TYPES_H */ +diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c +new file mode 100644 +index 000000000000..c062edb3fbc2 +--- /dev/null ++++ b/fs/bcachefs/siphash.c +@@ -0,0 +1,173 @@ ++// SPDX-License-Identifier: BSD-3-Clause ++/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ ++ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ */ ++ ++/* ++ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d ++ * are the number of compression rounds and the number of finalization rounds. ++ * A compression round is identical to a finalization round and this round ++ * function is called SipRound. Given a 128-bit key k and a (possibly empty) ++ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). ++ * ++ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, ++ * by Jean-Philippe Aumasson and Daniel J. Bernstein, ++ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa ++ * https://131002.net/siphash/siphash.pdf ++ * https://131002.net/siphash/ ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "siphash.h" ++ ++static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) ++{ ++ while (rounds--) { ++ ctx->v[0] += ctx->v[1]; ++ ctx->v[2] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 13); ++ ctx->v[3] = rol64(ctx->v[3], 16); ++ ++ ctx->v[1] ^= ctx->v[0]; ++ ctx->v[3] ^= ctx->v[2]; ++ ctx->v[0] = rol64(ctx->v[0], 32); ++ ++ ctx->v[2] += ctx->v[1]; ++ ctx->v[0] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 17); ++ ctx->v[3] = rol64(ctx->v[3], 21); ++ ++ ctx->v[1] ^= ctx->v[2]; ++ ctx->v[3] ^= ctx->v[0]; ++ ctx->v[2] = rol64(ctx->v[2], 32); ++ } ++} ++ ++static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) ++{ ++ u64 m = get_unaligned_le64(ptr); ++ ++ ctx->v[3] ^= m; ++ SipHash_Rounds(ctx, rounds); ++ ctx->v[0] ^= m; ++} ++ ++void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) ++{ ++ u64 k0, k1; ++ ++ k0 = le64_to_cpu(key->k0); ++ k1 = le64_to_cpu(key->k1); ++ ++ ctx->v[0] = 0x736f6d6570736575ULL ^ k0; ++ ctx->v[1] = 0x646f72616e646f6dULL ^ k1; ++ ctx->v[2] = 0x6c7967656e657261ULL ^ k0; ++ ctx->v[3] = 0x7465646279746573ULL ^ k1; ++ ++ memset(ctx->buf, 0, sizeof(ctx->buf)); ++ ctx->bytes = 0; ++} ++ ++void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, ++ const void *src, size_t len) ++{ ++ const u8 *ptr = src; ++ size_t left, used; ++ ++ if (len == 0) ++ return; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ ctx->bytes += len; ++ ++ if (used > 0) { ++ left = sizeof(ctx->buf) - used; ++ ++ if (len >= left) { ++ memcpy(&ctx->buf[used], ptr, left); ++ SipHash_CRounds(ctx, ctx->buf, rc); ++ len -= left; ++ ptr += left; ++ } else { ++ memcpy(&ctx->buf[used], ptr, len); ++ return; ++ } ++ } ++ ++ while (len >= sizeof(ctx->buf)) { ++ SipHash_CRounds(ctx, ptr, rc); ++ len -= sizeof(ctx->buf); ++ ptr += sizeof(ctx->buf); ++ } ++ ++ if (len > 0) ++ memcpy(&ctx->buf[used], ptr, len); ++} ++ ++void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ ++ r = SipHash_End(ctx, rc, rf); ++ ++ *((__le64 *) dst) = cpu_to_le64(r); ++} ++ ++u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ size_t left, used; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ left = sizeof(ctx->buf) - used; ++ memset(&ctx->buf[used], 0, left - 1); ++ ctx->buf[7] = ctx->bytes; ++ ++ 
SipHash_CRounds(ctx, ctx->buf, rc); ++ ctx->v[2] ^= 0xff; ++ SipHash_Rounds(ctx, rf); ++ ++ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); ++ memset(ctx, 0, sizeof(*ctx)); ++ return (r); ++} ++ ++u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) ++{ ++ SIPHASH_CTX ctx; ++ ++ SipHash_Init(&ctx, key); ++ SipHash_Update(&ctx, rc, rf, src, len); ++ return SipHash_End(&ctx, rc, rf); ++} +diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h +new file mode 100644 +index 000000000000..3dfaf34a43b2 +--- /dev/null ++++ b/fs/bcachefs/siphash.h +@@ -0,0 +1,87 @@ ++/* SPDX-License-Identifier: BSD-3-Clause */ ++/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ * ++ * $FreeBSD$ ++ */ ++ ++/* ++ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) ++ * optimized for speed on short messages returning a 64bit hash/digest value. 
++ * ++ * The number of rounds is defined during the initialization: ++ * SipHash24_Init() for the fast and resonable strong version ++ * SipHash48_Init() for the strong version (half as fast) ++ * ++ * struct SIPHASH_CTX ctx; ++ * SipHash24_Init(&ctx); ++ * SipHash_SetKey(&ctx, "16bytes long key"); ++ * SipHash_Update(&ctx, pointer_to_string, length_of_string); ++ * SipHash_Final(output, &ctx); ++ */ ++ ++#ifndef _SIPHASH_H_ ++#define _SIPHASH_H_ ++ ++#include ++ ++#define SIPHASH_BLOCK_LENGTH 8 ++#define SIPHASH_KEY_LENGTH 16 ++#define SIPHASH_DIGEST_LENGTH 8 ++ ++typedef struct _SIPHASH_CTX { ++ u64 v[4]; ++ u8 buf[SIPHASH_BLOCK_LENGTH]; ++ u32 bytes; ++} SIPHASH_CTX; ++ ++typedef struct { ++ __le64 k0; ++ __le64 k1; ++} SIPHASH_KEY; ++ ++void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); ++void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); ++u64 SipHash_End(SIPHASH_CTX *, int, int); ++void SipHash_Final(void *, SIPHASH_CTX *, int, int); ++u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); ++ ++#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) ++#define SipHash24_End(_d) SipHash_End((_d), 2, 4) ++#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) ++#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) ++ ++#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) ++#define SipHash48_End(_d) SipHash_End((_d), 4, 8) ++#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) ++#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) ++ ++#endif /* _SIPHASH_H_ */ +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +new file mode 100644 +index 000000000000..dea9b7252b88 +--- /dev/null ++++ b/fs/bcachefs/str_hash.h +@@ -0,0 +1,336 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_STR_HASH_H ++#define _BCACHEFS_STR_HASH_H ++ ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "checksum.h" ++#include "error.h" ++#include "inode.h" ++#include "siphash.h" ++#include "super.h" ++ ++#include ++#include ++#include ++ ++static inline enum bch_str_hash_type ++bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) ++{ ++ switch (opt) { ++ case BCH_STR_HASH_OPT_CRC32C: ++ return BCH_STR_HASH_CRC32C; ++ case BCH_STR_HASH_OPT_CRC64: ++ return BCH_STR_HASH_CRC64; ++ case BCH_STR_HASH_OPT_SIPHASH: ++ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) ++ ? 
BCH_STR_HASH_SIPHASH ++ : BCH_STR_HASH_SIPHASH_OLD; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_info { ++ u8 type; ++ union { ++ __le64 crc_key; ++ SIPHASH_KEY siphash_key; ++ }; ++}; ++ ++static inline struct bch_hash_info ++bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) ++{ ++ /* XXX ick */ ++ struct bch_hash_info info = { ++ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & ++ ~(~0U << INODE_STR_HASH_BITS), ++ .crc_key = bi->bi_hash_seed, ++ }; ++ ++ if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { ++ SHASH_DESC_ON_STACK(desc, c->sha256); ++ u8 digest[SHA256_DIGEST_SIZE]; ++ ++ desc->tfm = c->sha256; ++ ++ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, ++ sizeof(bi->bi_hash_seed), digest); ++ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); ++ } ++ ++ return info; ++} ++ ++struct bch_str_hash_ctx { ++ union { ++ u32 crc32c; ++ u64 crc64; ++ SIPHASH_CTX siphash; ++ }; ++}; ++ ++static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); ++ break; ++ case BCH_STR_HASH_CRC64: ++ ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); ++ break; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ SipHash24_Init(&ctx->siphash, &info->siphash_key); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info, ++ const void *data, size_t len) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ ctx->crc32c = crc32c(ctx->crc32c, data, len); ++ break; ++ case BCH_STR_HASH_CRC64: ++ ctx->crc64 = crc64_be(ctx->crc64, data, len); ++ break; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ SipHash24_Update(&ctx->siphash, data, len); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ return ctx->crc32c; ++ case BCH_STR_HASH_CRC64: ++ return ctx->crc64 >> 1; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ return SipHash24_End(&ctx->siphash) >> 1; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_desc { ++ enum btree_id btree_id; ++ u8 key_type; ++ ++ u64 (*hash_key)(const struct bch_hash_info *, const void *); ++ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); ++ bool (*cmp_key)(struct bkey_s_c, const void *); ++ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); ++}; ++ ++static __always_inline struct btree_iter * ++bch2_hash_lookup(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, const void *key, ++ unsigned flags) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_key(info, key)), ++ BTREE_ITER_SLOTS|flags, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type == desc.key_type) { ++ if (!desc.cmp_key(k, key)) ++ return iter; ++ } else if (k.k->type == KEY_TYPE_whiteout) { ++ ; ++ } else { ++ /* hole, not found */ ++ break; ++ } ++ } ++ bch2_trans_iter_put(trans, iter); ++ ++ return ERR_PTR(ret ?: -ENOENT); ++} ++ ++static __always_inline struct btree_iter * ++bch2_hash_hole(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info 
*info, ++ u64 inode, const void *key) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_key(info, key)), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type != desc.key_type) ++ return iter; ++ } ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ bch2_trans_iter_put(trans, iter); ++ ++ return ERR_PTR(ret ?: -ENOSPC); ++} ++ ++static __always_inline ++int bch2_hash_needs_whiteout(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *start) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_copy_iter(trans, start); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ bch2_btree_iter_next_slot(iter); ++ ++ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->type != desc.key_type && ++ k.k->type != KEY_TYPE_whiteout) ++ break; ++ ++ if (k.k->type == desc.key_type && ++ desc.hash_bkey(info, k) <= start->pos.offset) { ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static __always_inline ++int bch2_hash_set(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, struct bkey_i *insert, int flags) ++{ ++ struct btree_iter *iter, *slot = NULL; ++ struct bkey_s_c k; ++ bool found = false; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type == desc.key_type) { ++ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) ++ goto found; ++ ++ /* hash collision: */ ++ continue; ++ } ++ ++ if (!slot && ++ !(flags & BCH_HASH_SET_MUST_REPLACE)) { ++ slot = bch2_trans_copy_iter(trans, iter); ++ if (IS_ERR(slot)) ++ return PTR_ERR(slot); ++ } ++ ++ if (k.k->type != KEY_TYPE_whiteout) ++ goto not_found; ++ } ++ ++ if (!ret) ++ ret = -ENOSPC; ++out: ++ bch2_trans_iter_put(trans, slot); ++ bch2_trans_iter_put(trans, iter); ++ ++ return ret; ++found: ++ found = true; ++not_found: ++ ++ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { ++ ret = -ENOENT; ++ } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ++ ret = -EEXIST; ++ } else { ++ if (!found && slot) ++ swap(iter, slot); ++ ++ insert->k.p = iter->pos; ++ bch2_trans_update(trans, iter, insert, 0); ++ } ++ ++ goto out; ++} ++ ++static __always_inline ++int bch2_hash_delete_at(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *iter) ++{ ++ struct bkey_i *delete; ++ int ret; ++ ++ ret = bch2_hash_needs_whiteout(trans, desc, info, iter); ++ if (ret < 0) ++ return ret; ++ ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ if (IS_ERR(delete)) ++ return PTR_ERR(delete); ++ ++ bkey_init(&delete->k); ++ delete->k.p = iter->pos; ++ delete->k.type = ret ? 
KEY_TYPE_whiteout : KEY_TYPE_deleted; ++ ++ bch2_trans_update(trans, iter, delete, 0); ++ return 0; ++} ++ ++static __always_inline ++int bch2_hash_delete(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, const void *key) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_hash_lookup(trans, desc, info, inode, key, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ ret = bch2_hash_delete_at(trans, desc, info, iter); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++#endif /* _BCACHEFS_STR_HASH_H */ +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +new file mode 100644 +index 000000000000..9a221d3e1652 +--- /dev/null ++++ b/fs/bcachefs/super-io.c +@@ -0,0 +1,1158 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_seq_blacklist.h" ++#include "replicas.h" ++#include "quota.h" ++#include "super-io.h" ++#include "super.h" ++#include "vstructs.h" ++ ++#include ++#include ++ ++const char * const bch2_sb_fields[] = { ++#define x(name, nr) #name, ++ BCH_SB_FIELDS() ++#undef x ++ NULL ++}; ++ ++static const char *bch2_sb_field_validate(struct bch_sb *, ++ struct bch_sb_field *); ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f; ++ ++ /* XXX: need locking around superblock to access optional fields */ ++ ++ vstruct_for_each(sb, f) ++ if (le32_to_cpu(f->type) == type) ++ return f; ++ return NULL; ++} ++ ++static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, ++ struct bch_sb_field *f, ++ unsigned u64s) ++{ ++ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; ++ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; ++ ++ BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > ++ sb->page_order); ++ ++ if (!f && !u64s) { ++ /* nothing to do: */ ++ } else if (!f) { ++ f = vstruct_last(sb->sb); ++ memset(f, 0, sizeof(u64) * u64s); ++ f->u64s = cpu_to_le32(u64s); ++ f->type = 0; ++ } else { ++ void *src, *dst; ++ ++ src = vstruct_end(f); ++ ++ if (u64s) { ++ f->u64s = cpu_to_le32(u64s); ++ dst = vstruct_end(f); ++ } else { ++ dst = f; ++ } ++ ++ memmove(dst, src, vstruct_end(sb->sb) - src); ++ ++ if (dst > src) ++ memset(src, 0, dst - src); ++ } ++ ++ sb->sb->u64s = cpu_to_le32(sb_u64s); ++ ++ return u64s ? 
f : NULL; ++} ++ ++void bch2_sb_field_delete(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ++ if (f) ++ __bch2_sb_field_resize(sb, f, 0); ++} ++ ++/* Superblock realloc/free: */ ++ ++void bch2_free_super(struct bch_sb_handle *sb) ++{ ++ if (sb->bio) ++ bio_put(sb->bio); ++ if (!IS_ERR_OR_NULL(sb->bdev)) ++ blkdev_put(sb->bdev, sb->mode); ++ ++ free_pages((unsigned long) sb->sb, sb->page_order); ++ memset(sb, 0, sizeof(*sb)); ++} ++ ++int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) ++{ ++ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); ++ unsigned order = get_order(new_bytes); ++ struct bch_sb *new_sb; ++ struct bio *bio; ++ ++ if (sb->sb && sb->page_order >= order) ++ return 0; ++ ++ if (sb->have_layout) { ++ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; ++ ++ if (new_bytes > max_bytes) { ++ char buf[BDEVNAME_SIZE]; ++ ++ pr_err("%s: superblock too big: want %zu but have %llu", ++ bdevname(sb->bdev, buf), new_bytes, max_bytes); ++ return -ENOSPC; ++ } ++ } ++ ++ if (sb->page_order >= order && sb->sb) ++ return 0; ++ ++ if (dynamic_fault("bcachefs:add:super_realloc")) ++ return -ENOMEM; ++ ++ if (sb->have_bio) { ++ bio = bio_kmalloc(GFP_KERNEL, 1 << order); ++ if (!bio) ++ return -ENOMEM; ++ ++ if (sb->bio) ++ bio_put(sb->bio); ++ sb->bio = bio; ++ } ++ ++ new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); ++ if (!new_sb) ++ return -ENOMEM; ++ ++ if (sb->sb) ++ memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); ++ ++ free_pages((unsigned long) sb->sb, sb->page_order); ++ sb->sb = new_sb; ++ ++ sb->page_order = order; ++ ++ return 0; ++} ++ ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type, ++ unsigned u64s) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; ++ ssize_t d = -old_u64s + u64s; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) ++ return NULL; ++ ++ if (sb->fs_sb) { ++ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ /* XXX: we're not checking that offline device have enough space */ ++ ++ for_each_online_member(ca, c, i) { ++ struct bch_sb_handle *sb = &ca->disk_sb; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { ++ percpu_ref_put(&ca->ref); ++ return NULL; ++ } ++ } ++ } ++ ++ f = bch2_sb_field_get(sb->sb, type); ++ f = __bch2_sb_field_resize(sb, f, u64s); ++ if (f) ++ f->type = cpu_to_le32(type); ++ return f; ++} ++ ++/* Superblock validate: */ ++ ++static inline void __bch2_sb_layout_size_assert(void) ++{ ++ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); ++} ++ ++static const char *validate_sb_layout(struct bch_sb_layout *layout) ++{ ++ u64 offset, prev_offset, max_sectors; ++ unsigned i; ++ ++ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) ++ return "Not a bcachefs superblock layout"; ++ ++ if (layout->layout_type != 0) ++ return "Invalid superblock layout type"; ++ ++ if (!layout->nr_superblocks) ++ return "Invalid superblock layout: no superblocks"; ++ ++ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) ++ return "Invalid superblock layout: too many superblocks"; ++ ++ max_sectors = 1 << layout->sb_max_size_bits; ++ ++ prev_offset = le64_to_cpu(layout->sb_offset[0]); ++ ++ for (i = 1; i < layout->nr_superblocks; i++) { ++ offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset < prev_offset + max_sectors) ++ return "Invalid superblock layout: superblocks overlap"; ++ prev_offset = offset; ++ } ++ ++ return NULL; ++} ++ ++const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) ++{ ++ struct bch_sb *sb = disk_sb->sb; ++ struct bch_sb_field *f; ++ struct bch_sb_field_members *mi; ++ const char *err; ++ u32 version, version_min; ++ u16 block_size; ++ ++ version = le16_to_cpu(sb->version); ++ version_min = version >= bcachefs_metadata_version_new_versioning ++ ? 
le16_to_cpu(sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max || ++ version_min < bcachefs_metadata_version_min) ++ return "Unsupported superblock version"; ++ ++ if (version_min > version) ++ return "Bad minimum version"; ++ ++ if (sb->features[1] || ++ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) ++ return "Filesystem has incompatible features"; ++ ++ block_size = le16_to_cpu(sb->block_size); ++ ++ if (!is_power_of_2(block_size) || ++ block_size > PAGE_SECTORS) ++ return "Bad block size"; ++ ++ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) ++ return "Bad user UUID"; ++ ++ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) ++ return "Bad internal UUID"; ++ ++ if (!sb->nr_devices || ++ sb->nr_devices <= sb->dev_idx || ++ sb->nr_devices > BCH_SB_MEMBERS_MAX) ++ return "Bad number of member devices"; ++ ++ if (!BCH_SB_META_REPLICAS_WANT(sb) || ++ BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of metadata replicas"; ++ ++ if (!BCH_SB_META_REPLICAS_REQ(sb) || ++ BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of metadata replicas"; ++ ++ if (!BCH_SB_DATA_REPLICAS_WANT(sb) || ++ BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of data replicas"; ++ ++ if (!BCH_SB_DATA_REPLICAS_REQ(sb) || ++ BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of data replicas"; ++ ++ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) ++ return "Invalid metadata checksum type"; ++ ++ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) ++ return "Invalid metadata checksum type"; ++ ++ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) ++ return "Invalid compression type"; ++ ++ if (!BCH_SB_BTREE_NODE_SIZE(sb)) ++ return "Btree node size not set"; ++ ++ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) ++ return "Btree node size not a power of two"; ++ ++ if (BCH_SB_GC_RESERVE(sb) < 5) ++ return "gc reserve percentage too small"; ++ ++ if (!sb->time_precision || ++ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) ++ return "invalid time precision"; ++ ++ /* validate layout */ ++ err = validate_sb_layout(&sb->layout); ++ if (err) ++ return err; ++ ++ vstruct_for_each(sb, f) { ++ if (!f->u64s) ++ return "Invalid superblock: invalid optional field"; ++ ++ if (vstruct_next(f) > vstruct_last(sb)) ++ return "Invalid superblock: invalid optional field"; ++ } ++ ++ /* members must be validated first: */ ++ mi = bch2_sb_get_members(sb); ++ if (!mi) ++ return "Invalid superblock: member info area missing"; ++ ++ err = bch2_sb_field_validate(sb, &mi->field); ++ if (err) ++ return err; ++ ++ vstruct_for_each(sb, f) { ++ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) ++ continue; ++ ++ err = bch2_sb_field_validate(sb, f); ++ if (err) ++ return err; ++ } ++ ++ return NULL; ++} ++ ++/* device open: */ ++ ++static void bch2_sb_update(struct bch_fs *c) ++{ ++ struct bch_sb *src = c->disk_sb.sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(src); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ c->sb.uuid = src->uuid; ++ c->sb.user_uuid = src->user_uuid; ++ c->sb.version = le16_to_cpu(src->version); ++ c->sb.nr_devices = src->nr_devices; ++ c->sb.clean = BCH_SB_CLEAN(src); ++ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); ++ c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); ++ c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); ++ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); ++ 
c->sb.time_precision = le32_to_cpu(src->time_precision); ++ c->sb.features = le64_to_cpu(src->features[0]); ++ c->sb.compat = le64_to_cpu(src->compat[0]); ++ ++ for_each_member_device(ca, c, i) ++ ca->mi = bch2_mi_to_cpu(mi->members + i); ++} ++ ++/* doesn't copy member info */ ++static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) ++{ ++ struct bch_sb_field *src_f, *dst_f; ++ struct bch_sb *dst = dst_handle->sb; ++ unsigned i; ++ ++ dst->version = src->version; ++ dst->version_min = src->version_min; ++ dst->seq = src->seq; ++ dst->uuid = src->uuid; ++ dst->user_uuid = src->user_uuid; ++ memcpy(dst->label, src->label, sizeof(dst->label)); ++ ++ dst->block_size = src->block_size; ++ dst->nr_devices = src->nr_devices; ++ ++ dst->time_base_lo = src->time_base_lo; ++ dst->time_base_hi = src->time_base_hi; ++ dst->time_precision = src->time_precision; ++ ++ memcpy(dst->flags, src->flags, sizeof(dst->flags)); ++ memcpy(dst->features, src->features, sizeof(dst->features)); ++ memcpy(dst->compat, src->compat, sizeof(dst->compat)); ++ ++ for (i = 0; i < BCH_SB_FIELD_NR; i++) { ++ if (i == BCH_SB_FIELD_journal) ++ continue; ++ ++ src_f = bch2_sb_field_get(src, i); ++ dst_f = bch2_sb_field_get(dst, i); ++ dst_f = __bch2_sb_field_resize(dst_handle, dst_f, ++ src_f ? le32_to_cpu(src_f->u64s) : 0); ++ ++ if (src_f) ++ memcpy(dst_f, src_f, vstruct_bytes(src_f)); ++ } ++} ++ ++int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) ++{ ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(src); ++ unsigned journal_u64s = journal_buckets ++ ? le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ int ret; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ ret = bch2_sb_realloc(&c->disk_sb, ++ le32_to_cpu(src->u64s) - journal_u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&c->disk_sb, src); ++ ++ ret = bch2_sb_replicas_to_cpu_replicas(c); ++ if (ret) ++ return ret; ++ ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ return ret; ++ ++ bch2_sb_update(c); ++ return 0; ++} ++ ++int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(dst); ++ unsigned journal_u64s = journal_buckets ++ ? 
le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; ++ int ret; ++ ++ ret = bch2_sb_realloc(&ca->disk_sb, u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&ca->disk_sb, src); ++ return 0; ++} ++ ++/* read superblock: */ ++ ++static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) ++{ ++ struct bch_csum csum; ++ size_t bytes; ++reread: ++ bio_reset(sb->bio); ++ bio_set_dev(sb->bio, sb->bdev); ++ sb->bio->bi_iter.bi_sector = offset; ++ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order); ++ ++ if (submit_bio_wait(sb->bio)) ++ return "IO error"; ++ ++ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) ++ return "Not a bcachefs superblock"; ++ ++ if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || ++ le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) ++ return "Unsupported superblock version"; ++ ++ bytes = vstruct_bytes(sb->sb); ++ ++ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) ++ return "Bad superblock: too big"; ++ ++ if (get_order(bytes) > sb->page_order) { ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) ++ return "cannot allocate memory"; ++ goto reread; ++ } ++ ++ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) ++ return "unknown csum type"; ++ ++ /* XXX: verify MACs */ ++ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), ++ null_nonce(), sb->sb); ++ ++ if (bch2_crc_cmp(csum, sb->sb->csum)) ++ return "bad checksum reading superblock"; ++ ++ sb->seq = le64_to_cpu(sb->sb->seq); ++ ++ return NULL; ++} ++ ++int bch2_read_super(const char *path, struct bch_opts *opts, ++ struct bch_sb_handle *sb) ++{ ++ u64 offset = opt_get(*opts, sb); ++ struct bch_sb_layout layout; ++ const char *err; ++ __le64 *i; ++ int ret; ++ ++ pr_verbose_init(*opts, ""); ++ ++ memset(sb, 0, sizeof(*sb)); ++ sb->mode = FMODE_READ; ++ sb->have_bio = true; ++ ++ if (!opt_get(*opts, noexcl)) ++ sb->mode |= FMODE_EXCL; ++ ++ if (!opt_get(*opts, nochanges)) ++ sb->mode |= FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (IS_ERR(sb->bdev) && ++ PTR_ERR(sb->bdev) == -EACCES && ++ opt_get(*opts, read_only)) { ++ sb->mode &= ~FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (!IS_ERR(sb->bdev)) ++ opt_set(*opts, nochanges, true); ++ } ++ ++ if (IS_ERR(sb->bdev)) { ++ ret = PTR_ERR(sb->bdev); ++ goto out; ++ } ++ ++ err = "cannot allocate memory"; ++ ret = bch2_sb_realloc(sb, 0); ++ if (ret) ++ goto err; ++ ++ ret = -EFAULT; ++ err = "dynamic fault"; ++ if (bch2_fs_init_fault("read_super")) ++ goto err; ++ ++ ret = -EINVAL; ++ err = read_one_super(sb, offset); ++ if (!err) ++ goto got_super; ++ ++ if (opt_defined(*opts, sb)) ++ goto err; ++ ++ pr_err("error reading default superblock: %s", err); ++ ++ /* ++ * Error reading primary superblock - read location of backup ++ * superblocks: ++ */ ++ bio_reset(sb->bio); ++ bio_set_dev(sb->bio, sb->bdev); ++ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; ++ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ /* ++ * use sb buffer to read layout, since sb buffer is page aligned but ++ * layout won't be: ++ */ ++ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); ++ ++ err = "IO error"; ++ if (submit_bio_wait(sb->bio)) ++ goto err; ++ ++ memcpy(&layout, sb->sb, sizeof(layout)); ++ err = validate_sb_layout(&layout); ++ if (err) ++ goto err; ++ ++ for (i = layout.sb_offset; ++ i < layout.sb_offset + layout.nr_superblocks; i++) { ++ 
offset = le64_to_cpu(*i); ++ ++ if (offset == opt_get(*opts, sb)) ++ continue; ++ ++ err = read_one_super(sb, offset); ++ if (!err) ++ goto got_super; ++ } ++ ++ ret = -EINVAL; ++ goto err; ++ ++got_super: ++ err = "Superblock block size smaller than device block size"; ++ ret = -EINVAL; ++ if (le16_to_cpu(sb->sb->block_size) << 9 < ++ bdev_logical_block_size(sb->bdev)) ++ goto err; ++ ++ if (sb->mode & FMODE_WRITE) ++ bdev_get_queue(sb->bdev)->backing_dev_info->capabilities ++ |= BDI_CAP_STABLE_WRITES; ++ ret = 0; ++ sb->have_layout = true; ++out: ++ pr_verbose_init(*opts, "ret %i", ret); ++ return ret; ++err: ++ bch2_free_super(sb); ++ pr_err("error reading superblock: %s", err); ++ goto out; ++} ++ ++/* write superblock: */ ++ ++static void write_super_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ ++ /* XXX: return errors directly */ ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", ++ blk_status_to_str(bio->bi_status))) ++ ca->sb_write_error = 1; ++ ++ closure_put(&ca->fs->sb_write); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++static void read_back_super(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_SB], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ sb->offset = sb->layout.sb_offset[idx]; ++ ++ SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); ++ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), ++ null_nonce(), sb); ++ ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); ++ bch2_bio_map(bio, sb, ++ roundup((size_t) vstruct_bytes(sb), ++ bdev_logical_block_size(ca->disk_sb.bdev))); ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_SB], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++int bch2_write_super(struct bch_fs *c) ++{ ++ struct closure *cl = &c->sb_write; ++ struct bch_dev *ca; ++ unsigned i, sb = 0, nr_wrote; ++ const char *err; ++ struct bch_devs_mask sb_written; ++ bool wrote, can_mount_without_written, can_mount_with_written; ++ int ret = 0; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ closure_init_stack(cl); ++ memset(&sb_written, 0, sizeof(sb_written)); ++ ++ le64_add_cpu(&c->disk_sb.sb->seq, 1); ++ ++ if (test_bit(BCH_FS_ERROR, &c->flags)) ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ for_each_online_member(ca, c, i) { ++ err = bch2_sb_validate(&ca->disk_sb); ++ if (err) { ++ bch2_fs_inconsistent(c, "sb invalid before write: %s", err); ++ ret = -1; ++ goto out; ++ } ++ } ++ ++ if (c->opts.nochanges) ++ goto out; ++ ++ for_each_online_member(ca, c, i) { ++ __set_bit(ca->dev_idx, sb_written.d); ++ ca->sb_write_error = 0; ++ } ++ ++ for_each_online_member(ca, c, i) ++ 
read_back_super(c, ca); ++ closure_sync(cl); ++ ++ for_each_online_member(ca, c, i) { ++ if (!ca->sb_write_error && ++ ca->disk_sb.seq != ++ le64_to_cpu(ca->sb_read_scratch->seq)) { ++ bch2_fs_fatal_error(c, ++ "Superblock modified by another process"); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ } ++ ++ do { ++ wrote = false; ++ for_each_online_member(ca, c, i) ++ if (!ca->sb_write_error && ++ sb < ca->disk_sb.sb->layout.nr_superblocks) { ++ write_one_super(c, ca, sb); ++ wrote = true; ++ } ++ closure_sync(cl); ++ sb++; ++ } while (wrote); ++ ++ for_each_online_member(ca, c, i) { ++ if (ca->sb_write_error) ++ __clear_bit(ca->dev_idx, sb_written.d); ++ else ++ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); ++ } ++ ++ nr_wrote = dev_mask_nr(&sb_written); ++ ++ can_mount_with_written = ++ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), ++ BCH_FORCE_IF_DEGRADED); ++ ++ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) ++ sb_written.d[i] = ~sb_written.d[i]; ++ ++ can_mount_without_written = ++ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), ++ BCH_FORCE_IF_DEGRADED); ++ ++ /* ++ * If we would be able to mount _without_ the devices we successfully ++ * wrote superblocks to, we weren't able to write to enough devices: ++ * ++ * Exception: if we can mount without the successes because we haven't ++ * written anything (new filesystem), we continue if we'd be able to ++ * mount with the devices we did successfully write to: ++ */ ++ if (bch2_fs_fatal_err_on(!nr_wrote || ++ (can_mount_without_written && ++ !can_mount_with_written), c, ++ "Unable to write superblock to sufficient devices")) ++ ret = -1; ++out: ++ /* Make new options visible after they're persistent: */ ++ bch2_sb_update(c); ++ return ret; ++} ++ ++void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ mutex_lock(&c->sb_lock); ++ if (!(c->sb.features & (1ULL << feat))) { ++ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); ++ ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++} ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static int u64_cmp(const void *_l, const void *_r) ++{ ++ u64 l = *((const u64 *) _l), r = *((const u64 *) _r); ++ ++ return l < r ? -1 : l > r ? 
1 : 0; ++} ++ ++static const char *bch2_sb_validate_journal(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ const char *err; ++ unsigned nr; ++ unsigned i; ++ u64 *b; ++ ++ journal = bch2_sb_get_journal(sb); ++ if (!journal) ++ return NULL; ++ ++ nr = bch2_nr_journal_buckets(journal); ++ if (!nr) ++ return NULL; ++ ++ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ if (!b) ++ return "cannot allocate memory"; ++ ++ for (i = 0; i < nr; i++) ++ b[i] = le64_to_cpu(journal->buckets[i]); ++ ++ sort(b, nr, sizeof(u64), u64_cmp, NULL); ++ ++ err = "journal bucket at sector 0"; ++ if (!b[0]) ++ goto err; ++ ++ err = "journal bucket before first bucket"; ++ if (m && b[0] < le16_to_cpu(m->first_bucket)) ++ goto err; ++ ++ err = "journal bucket past end of device"; ++ if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) ++ goto err; ++ ++ err = "duplicate journal buckets"; ++ for (i = 0; i + 1 < nr; i++) ++ if (b[i] == b[i + 1]) ++ goto err; ++ ++ err = NULL; ++err: ++ kfree(b); ++ return err; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_journal = { ++ .validate = bch2_sb_validate_journal, ++}; ++ ++/* BCH_SB_FIELD_members: */ ++ ++static const char *bch2_sb_validate_members(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_member *m; ++ ++ if ((void *) (mi->members + sb->nr_devices) > ++ vstruct_end(&mi->field)) ++ return "Invalid superblock: bad member info"; ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) { ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) ++ return "Too many buckets"; ++ ++ if (le64_to_cpu(m->nbuckets) - ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) ++ return "Not enough buckets"; ++ ++ if (le16_to_cpu(m->bucket_size) < ++ le16_to_cpu(sb->block_size)) ++ return "bucket size smaller than block size"; ++ ++ if (le16_to_cpu(m->bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(sb)) ++ return "bucket size smaller than btree node size"; ++ } ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_members = { ++ .validate = bch2_sb_validate_members, ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++static const char *bch2_sb_validate_crypt(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) ++ return "invalid field crypt: wrong size"; ++ ++ if (BCH_CRYPT_KDF_TYPE(crypt)) ++ return "invalid field crypt: bad kdf type"; ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { ++ .validate = bch2_sb_validate_crypt, ++}; ++ ++/* BCH_SB_FIELD_clean: */ ++ ++void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) ++{ ++ struct jset_entry *entry; ++ ++ for (entry = clean->start; ++ entry < (struct jset_entry *) vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) ++ bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); ++} ++ ++int bch2_fs_mark_dirty(struct bch_fs *c) ++{ ++ int ret; ++ ++ /* ++ * Unconditionally write superblock, to verify it hasn't changed before ++ * we go rw: ++ */ ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; ++ c->disk_sb.sb->features[0] |= 1ULL << 
BCH_FEATURE_extents_above_btree_updates; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; ++ ret = bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static void ++entry_init_u64s(struct jset_entry *entry, unsigned u64s) ++{ ++ memset(entry, 0, u64s * sizeof(u64)); ++ ++ /* ++ * The u64s field counts from the start of data, ignoring the shared ++ * fields. ++ */ ++ entry->u64s = u64s - 1; ++} ++ ++static void ++entry_init_size(struct jset_entry *entry, size_t size) ++{ ++ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); ++ entry_init_u64s(entry, u64s); ++} ++ ++struct jset_entry * ++bch2_journal_super_entries_add_common(struct bch_fs *c, ++ struct jset_entry *entry, ++ u64 journal_seq) ++{ ++ unsigned i; ++ ++ percpu_down_write(&c->mark_lock); ++ ++ if (!journal_seq) { ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ } else { ++ bch2_fs_usage_acc_to_base(c, journal_seq & 1); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_INODES; ++ u->v = cpu_to_le64(c->usage_base->nr_inodes); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_KEY_VERSION; ++ u->v = cpu_to_le64(atomic64_read(&c->key_version)); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_RESERVED; ++ u->entry.level = i; ++ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u) + e->nr_devs); ++ u->entry.type = BCH_JSET_ENTRY_data_usage; ++ u->v = cpu_to_le64(c->usage_base->replicas[i]); ++ memcpy(&u->r, e, replicas_entry_bytes(e)); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return entry; ++} ++ ++void bch2_fs_mark_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *sb_clean; ++ struct jset_entry *entry; ++ unsigned u64s; ++ ++ mutex_lock(&c->sb_lock); ++ if (BCH_SB_CLEAN(c->disk_sb.sb)) ++ goto out; ++ ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); ++ ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; ++ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); ++ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); ++ ++ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; ++ ++ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); ++ if (!sb_clean) { ++ bch_err(c, "error resizing superblock while setting filesystem clean"); ++ goto out; ++ } ++ ++ sb_clean->flags = 0; ++ sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); ++ sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); ++ sb_clean->journal_seq = 
cpu_to_le64(journal_cur_seq(&c->journal) - 1); ++ ++ /* Trying to catch outstanding bug: */ ++ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); ++ ++ entry = sb_clean->start; ++ entry = bch2_journal_super_entries_add_common(c, entry, 0); ++ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); ++ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); ++ ++ memset(entry, 0, ++ vstruct_end(&sb_clean->field) - (void *) entry); ++ ++ if (le16_to_cpu(c->disk_sb.sb->version) < ++ bcachefs_metadata_version_bkey_renumber) ++ bch2_sb_clean_renumber(sb_clean, WRITE); ++ ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++} ++ ++static const char *bch2_sb_validate_clean(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) ++ return "invalid field crypt: wrong size"; ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_clean = { ++ .validate = bch2_sb_validate_clean, ++}; ++ ++static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { ++#define x(f, nr) \ ++ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, ++ BCH_SB_FIELDS() ++#undef x ++}; ++ ++static const char *bch2_sb_field_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ ++ return type < BCH_SB_FIELD_NR ++ ? bch2_sb_field_ops[type]->validate(sb, f) ++ : NULL; ++} ++ ++void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR ++ ? bch2_sb_field_ops[type] : NULL; ++ ++ if (ops) ++ pr_buf(out, "%s", bch2_sb_fields[type]); ++ else ++ pr_buf(out, "(unknown field %u)", type); ++ ++ pr_buf(out, " (size %llu):", vstruct_bytes(f)); ++ ++ if (ops && ops->to_text) ++ bch2_sb_field_ops[type]->to_text(out, sb, f); ++} +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +new file mode 100644 +index 000000000000..7a068158efca +--- /dev/null ++++ b/fs/bcachefs/super-io.h +@@ -0,0 +1,137 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_IO_H ++#define _BCACHEFS_SUPER_IO_H ++ ++#include "extents.h" ++#include "eytzinger.h" ++#include "super_types.h" ++#include "super.h" ++ ++#include ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, ++ enum bch_sb_field_type, unsigned); ++void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); ++ ++#define field_to_type(_f, _name) \ ++ container_of_or_null(_f, struct bch_sb_field_##_name, field) ++ ++#define x(_name, _nr) \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_get_##_name(struct bch_sb *sb) \ ++{ \ ++ return field_to_type(bch2_sb_field_get(sb, \ ++ BCH_SB_FIELD_##_name), _name); \ ++} \ ++ \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ ++{ \ ++ return field_to_type(bch2_sb_field_resize(sb, \ ++ BCH_SB_FIELD_##_name, u64s), _name); \ ++} ++ ++BCH_SB_FIELDS() ++#undef x ++ ++extern const char * const bch2_sb_fields[]; ++ ++struct bch_sb_field_ops { ++ const char * (*validate)(struct bch_sb *, struct bch_sb_field *); ++ void (*to_text)(struct printbuf *, struct bch_sb *, ++ struct bch_sb_field *); ++}; ++ ++static inline __le64 bch2_sb_magic(struct bch_fs *c) ++{ ++ __le64 ret; ++ memcpy(&ret, &c->sb.uuid, sizeof(ret)); ++ return 
ret; ++} ++ ++static inline __u64 jset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); ++} ++ ++static inline __u64 bset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); ++} ++ ++int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); ++int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); ++ ++void bch2_free_super(struct bch_sb_handle *); ++int bch2_sb_realloc(struct bch_sb_handle *, unsigned); ++ ++const char *bch2_sb_validate(struct bch_sb_handle *); ++ ++int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); ++int bch2_write_super(struct bch_fs *); ++void __bch2_check_set_feature(struct bch_fs *, unsigned); ++ ++static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ if (!(c->sb.features & (1ULL << feat))) ++ __bch2_check_set_feature(c, feat); ++} ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) ++{ ++ return j ++ ? (__le64 *) vstruct_end(&j->field) - j->buckets ++ : 0; ++} ++ ++/* BCH_SB_FIELD_members: */ ++ ++static inline bool bch2_member_exists(struct bch_member *m) ++{ ++ return !bch2_is_zero(m->uuid.b, sizeof(uuid_le)); ++} ++ ++static inline bool bch2_dev_exists(struct bch_sb *sb, ++ struct bch_sb_field_members *mi, ++ unsigned dev) ++{ ++ return dev < sb->nr_devices && ++ bch2_member_exists(&mi->members[dev]); ++} ++ ++static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ++{ ++ return (struct bch_member_cpu) { ++ .nbuckets = le64_to_cpu(mi->nbuckets), ++ .first_bucket = le16_to_cpu(mi->first_bucket), ++ .bucket_size = le16_to_cpu(mi->bucket_size), ++ .group = BCH_MEMBER_GROUP(mi), ++ .state = BCH_MEMBER_STATE(mi), ++ .replacement = BCH_MEMBER_REPLACEMENT(mi), ++ .discard = BCH_MEMBER_DISCARD(mi), ++ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), ++ .durability = BCH_MEMBER_DURABILITY(mi) ++ ? BCH_MEMBER_DURABILITY(mi) - 1 ++ : 1, ++ .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), ++ }; ++} ++ ++/* BCH_SB_FIELD_clean: */ ++ ++struct jset_entry * ++bch2_journal_super_entries_add_common(struct bch_fs *, ++ struct jset_entry *, u64); ++ ++void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); ++ ++int bch2_fs_mark_dirty(struct bch_fs *); ++void bch2_fs_mark_clean(struct bch_fs *); ++ ++void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, ++ struct bch_sb_field *); ++ ++#endif /* _BCACHEFS_SUPER_IO_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +new file mode 100644 +index 000000000000..0cdf285e4ffd +--- /dev/null ++++ b/fs/bcachefs/super.c +@@ -0,0 +1,2046 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs setup/teardown code, and some metadata io - read a superblock and ++ * figure out what to do with it. ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_key_cache.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "chardev.h" ++#include "checksum.h" ++#include "clock.h" ++#include "compress.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "move.h" ++#include "migrate.h" ++#include "movinggc.h" ++#include "quota.h" ++#include "rebalance.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++#include "sysfs.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Kent Overstreet "); ++ ++#define KTYPE(type) \ ++struct kobj_type type ## _ktype = { \ ++ .release = type ## _release, \ ++ .sysfs_ops = &type ## _sysfs_ops, \ ++ .default_attrs = type ## _files \ ++} ++ ++static void bch2_fs_release(struct kobject *); ++static void bch2_dev_release(struct kobject *); ++ ++static void bch2_fs_internal_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_opts_dir_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_time_stats_release(struct kobject *k) ++{ ++} ++ ++static KTYPE(bch2_fs); ++static KTYPE(bch2_fs_internal); ++static KTYPE(bch2_fs_opts_dir); ++static KTYPE(bch2_fs_time_stats); ++static KTYPE(bch2_dev); ++ ++static struct kset *bcachefs_kset; ++static LIST_HEAD(bch_fs_list); ++static DEFINE_MUTEX(bch_fs_list_lock); ++ ++static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); ++ ++static void bch2_dev_free(struct bch_dev *); ++static int bch2_dev_alloc(struct bch_fs *, unsigned); ++static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); ++static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); ++ ++struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&bch_fs_list_lock); ++ rcu_read_lock(); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ for_each_member_device_rcu(ca, c, i, NULL) ++ if (ca->disk_sb.bdev == bdev) { ++ closure_get(&c->cl); ++ goto found; ++ } ++ c = NULL; ++found: ++ rcu_read_unlock(); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) ++ return c; ++ ++ return NULL; ++} ++ ++struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ mutex_lock(&bch_fs_list_lock); ++ c = __bch2_uuid_to_fs(uuid); ++ if (c) ++ closure_get(&c->cl); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++int bch2_congested(void *data, int bdi_bits) ++{ ++ struct bch_fs *c = data; ++ struct backing_dev_info *bdi; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ rcu_read_lock(); ++ if (bdi_bits & (1 << WB_sync_congested)) { ++ /* Reads - check all devices: */ ++ for_each_readable_member(ca, c, i) { ++ bdi = ca->disk_sb.bdev->bd_bdi; ++ ++ if (bdi_congested(bdi, bdi_bits)) { ++ ret = 1; ++ break; ++ } ++ } ++ } else { ++ 
unsigned target = READ_ONCE(c->opts.foreground_target); ++ const struct bch_devs_mask *devs = target ++ ? bch2_target_to_mask(c, target) ++ : &c->rw_devs[BCH_DATA_USER]; ++ ++ for_each_member_device_rcu(ca, c, i, devs) { ++ bdi = ca->disk_sb.bdev->bd_bdi; ++ ++ if (bdi_congested(bdi, bdi_bits)) { ++ ret = 1; ++ break; ++ } ++ } ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++/* Filesystem RO/RW: */ ++ ++/* ++ * For startup/shutdown of RW stuff, the dependencies are: ++ * ++ * - foreground writes depend on copygc and rebalance (to free up space) ++ * ++ * - copygc and rebalance depend on mark and sweep gc (they actually probably ++ * don't because they either reserve ahead of time or don't block if ++ * allocations fail, but allocations can require mark and sweep gc to run ++ * because of generation number wraparound) ++ * ++ * - all of the above depends on the allocator threads ++ * ++ * - allocator depends on the journal (when it rewrites prios and gens) ++ */ ++ ++static void __bch2_fs_read_only(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ bool wrote = false; ++ unsigned i, clean_passes = 0; ++ int ret; ++ ++ bch2_rebalance_stop(c); ++ ++ for_each_member_device(ca, c, i) ++ bch2_copygc_stop(ca); ++ ++ bch2_gc_thread_stop(c); ++ ++ /* ++ * Flush journal before stopping allocators, because flushing journal ++ * blacklist entries involves allocating new btree nodes: ++ */ ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ /* ++ * If the allocator threads didn't all start up, the btree updates to ++ * write out alloc info aren't going to work: ++ */ ++ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) ++ goto nowrote_alloc; ++ ++ bch_verbose(c, "writing alloc info"); ++ /* ++ * This should normally just be writing the bucket read/write clocks: ++ */ ++ ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: ++ bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); ++ bch_verbose(c, "writing alloc info complete"); ++ ++ if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) ++ bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); ++ ++ if (ret) ++ goto nowrote_alloc; ++ ++ bch_verbose(c, "flushing journal and stopping allocators"); ++ ++ bch2_journal_flush_all_pins(&c->journal); ++ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); ++ ++ do { ++ clean_passes++; ++ ++ if (bch2_journal_flush_all_pins(&c->journal)) ++ clean_passes = 0; ++ ++ /* ++ * In flight interior btree updates will generate more journal ++ * updates and btree updates (alloc btree): ++ */ ++ if (bch2_btree_interior_updates_nr_pending(c)) { ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ clean_passes = 0; ++ } ++ flush_work(&c->btree_interior_update_work); ++ ++ if (bch2_journal_flush_all_pins(&c->journal)) ++ clean_passes = 0; ++ } while (clean_passes < 2); ++ bch_verbose(c, "flushing journal and stopping allocators complete"); ++ ++ set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); ++nowrote_alloc: ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ flush_work(&c->btree_interior_update_work); ++ ++ for_each_member_device(ca, c, i) ++ bch2_dev_allocator_stop(ca); ++ ++ clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); ++ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); ++ ++ bch2_fs_journal_stop(&c->journal); ++ ++ /* ++ * the journal kicks off btree writes via reclaim - wait for in flight ++ * writes after stopping journal: ++ */ ++ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) ++ 
bch2_btree_flush_all_writes(c); ++ else ++ bch2_btree_verify_flushed(c); ++ ++ /* ++ * After stopping journal: ++ */ ++ for_each_member_device(ca, c, i) ++ bch2_dev_allocator_remove(c, ca); ++} ++ ++static void bch2_writes_disabled(struct percpu_ref *writes) ++{ ++ struct bch_fs *c = container_of(writes, struct bch_fs, writes); ++ ++ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ wake_up(&bch_read_only_wait); ++} ++ ++void bch2_fs_read_only(struct bch_fs *c) ++{ ++ if (!test_bit(BCH_FS_RW, &c->flags)) { ++ cancel_delayed_work_sync(&c->journal.reclaim_work); ++ return; ++ } ++ ++ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ /* ++ * Block new foreground-end write operations from starting - any new ++ * writes will return -EROFS: ++ * ++ * (This is really blocking new _allocations_, writes to previously ++ * allocated space can still happen until stopping the allocator in ++ * bch2_dev_allocator_stop()). ++ */ ++ percpu_ref_kill(&c->writes); ++ ++ cancel_work_sync(&c->ec_stripe_delete_work); ++ cancel_delayed_work(&c->pd_controllers_update); ++ ++ /* ++ * If we're not doing an emergency shutdown, we want to wait on ++ * outstanding writes to complete so they don't see spurious errors due ++ * to shutting down the allocator: ++ * ++ * If we are doing an emergency shutdown outstanding writes may ++ * hang until we shutdown the allocator so we don't want to wait ++ * on outstanding writes before shutting everything down - but ++ * we do need to wait on them before returning and signalling ++ * that going RO is complete: ++ */ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || ++ test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); ++ ++ __bch2_fs_read_only(c); ++ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ ++ if (!bch2_journal_error(&c->journal) && ++ !test_bit(BCH_FS_ERROR, &c->flags) && ++ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && ++ test_bit(BCH_FS_STARTED, &c->flags) && ++ test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && ++ !c->opts.norecovery) { ++ bch_verbose(c, "marking filesystem clean"); ++ bch2_fs_mark_clean(c); ++ } ++ ++ clear_bit(BCH_FS_RW, &c->flags); ++} ++ ++static void bch2_fs_read_only_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, read_only_work); ++ ++ down_write(&c->state_lock); ++ bch2_fs_read_only(c); ++ up_write(&c->state_lock); ++} ++ ++static void bch2_fs_read_only_async(struct bch_fs *c) ++{ ++ queue_work(system_long_wq, &c->read_only_work); ++} ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *c) ++{ ++ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); ++ ++ bch2_fs_read_only_async(c); ++ bch2_journal_halt(&c->journal); ++ ++ wake_up(&bch_read_only_wait); ++ return ret; ++} ++ ++static int bch2_fs_read_write_late(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ ret = bch2_gc_thread_start(c); ++ if (ret) { ++ bch_err(c, "error starting gc thread"); ++ return ret; ++ } ++ ++ for_each_rw_member(ca, c, i) { ++ ret = bch2_copygc_start(c, ca); ++ if (ret) { ++ bch_err(c, "error starting copygc threads"); ++ percpu_ref_put(&ca->io_ref); ++ return ret; ++ } ++ } ++ ++ ret = bch2_rebalance_start(c); ++ if (ret) { ++ bch_err(c, "error starting rebalance thread"); ++ return ret; ++ } ++ ++ schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); ++ ++ schedule_work(&c->ec_stripe_delete_work); ++ ++ return 0; ++} ++ ++static int 
__bch2_fs_read_write(struct bch_fs *c, bool early) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ if (test_bit(BCH_FS_RW, &c->flags)) ++ return 0; ++ ++ /* ++ * nochanges is used for fsck -n mode - we have to allow going rw ++ * during recovery for that to work: ++ */ ++ if (c->opts.norecovery || ++ (c->opts.nochanges && ++ (!early || c->opts.read_only))) ++ return -EROFS; ++ ++ ret = bch2_fs_mark_dirty(c); ++ if (ret) ++ goto err; ++ ++ clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ for_each_rw_member(ca, c, i) { ++ ret = bch2_dev_allocator_start(ca); ++ if (ret) { ++ bch_err(c, "error starting allocator threads"); ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ } ++ ++ set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); ++ ++ if (!early) { ++ ret = bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ percpu_ref_reinit(&c->writes); ++ set_bit(BCH_FS_RW, &c->flags); ++ ++ queue_delayed_work(c->journal_reclaim_wq, ++ &c->journal.reclaim_work, 0); ++ return 0; ++err: ++ __bch2_fs_read_only(c); ++ return ret; ++} ++ ++int bch2_fs_read_write(struct bch_fs *c) ++{ ++ return __bch2_fs_read_write(c, false); ++} ++ ++int bch2_fs_read_write_early(struct bch_fs *c) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ return __bch2_fs_read_write(c, true); ++} ++ ++/* Filesystem startup/shutdown: */ ++ ++static void bch2_fs_free(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_exit(&c->times[i]); ++ ++ bch2_fs_quota_exit(c); ++ bch2_fs_fsio_exit(c); ++ bch2_fs_ec_exit(c); ++ bch2_fs_encryption_exit(c); ++ bch2_fs_io_exit(c); ++ bch2_fs_btree_interior_update_exit(c); ++ bch2_fs_btree_iter_exit(c); ++ bch2_fs_btree_key_cache_exit(&c->btree_key_cache); ++ bch2_fs_btree_cache_exit(c); ++ bch2_fs_journal_exit(&c->journal); ++ bch2_io_clock_exit(&c->io_clock[WRITE]); ++ bch2_io_clock_exit(&c->io_clock[READ]); ++ bch2_fs_compress_exit(c); ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(&c->journal_entries); ++ percpu_free_rwsem(&c->mark_lock); ++ kfree(c->usage_scratch); ++ free_percpu(c->usage[1]); ++ free_percpu(c->usage[0]); ++ kfree(c->usage_base); ++ free_percpu(c->pcpu); ++ mempool_exit(&c->large_bkey_pool); ++ mempool_exit(&c->btree_bounce_pool); ++ bioset_exit(&c->btree_bio); ++ mempool_exit(&c->fill_iter); ++ percpu_ref_exit(&c->writes); ++ kfree(c->replicas.entries); ++ kfree(c->replicas_gc.entries); ++ kfree(rcu_dereference_protected(c->disk_groups, 1)); ++ kfree(c->journal_seq_blacklist_table); ++ ++ if (c->journal_reclaim_wq) ++ destroy_workqueue(c->journal_reclaim_wq); ++ if (c->copygc_wq) ++ destroy_workqueue(c->copygc_wq); ++ if (c->wq) ++ destroy_workqueue(c->wq); ++ ++ free_pages((unsigned long) c->disk_sb.sb, ++ c->disk_sb.page_order); ++ kvpfree(c, sizeof(*c)); ++ module_put(THIS_MODULE); ++} ++ ++static void bch2_fs_release(struct kobject *kobj) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ bch2_fs_free(c); ++} ++ ++void bch2_fs_stop(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ bch_verbose(c, "shutting down"); ++ ++ set_bit(BCH_FS_STOPPING, &c->flags); ++ ++ cancel_work_sync(&c->journal_seq_blacklist_gc_work); ++ ++ down_write(&c->state_lock); ++ bch2_fs_read_only(c); ++ up_write(&c->state_lock); ++ ++ for_each_member_device(ca, c, i) ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ 
sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, ++ "bcachefs"); ++ ++ if (c->kobj.state_in_sysfs) ++ kobject_del(&c->kobj); ++ ++ bch2_fs_debug_exit(c); ++ bch2_fs_chardev_exit(c); ++ ++ kobject_put(&c->time_stats); ++ kobject_put(&c->opts_dir); ++ kobject_put(&c->internal); ++ ++ mutex_lock(&bch_fs_list_lock); ++ list_del(&c->list); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ closure_sync(&c->cl); ++ closure_debug_destroy(&c->cl); ++ ++ /* btree prefetch might have kicked off reads in the background: */ ++ bch2_btree_flush_all_reads(c); ++ ++ for_each_member_device(ca, c, i) ++ cancel_work_sync(&ca->io_error_work); ++ ++ cancel_work_sync(&c->btree_write_error_work); ++ cancel_delayed_work_sync(&c->pd_controllers_update); ++ cancel_work_sync(&c->read_only_work); ++ ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (c->devs[i]) ++ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); ++ ++ bch_verbose(c, "shutdown complete"); ++ ++ kobject_put(&c->kobj); ++} ++ ++static const char *bch2_fs_online(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ const char *err = NULL; ++ unsigned i; ++ int ret; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ if (!list_empty(&c->list)) ++ return NULL; ++ ++ if (__bch2_uuid_to_fs(c->sb.uuid)) ++ return "filesystem UUID already open"; ++ ++ ret = bch2_fs_chardev_init(c); ++ if (ret) ++ return "error creating character device"; ++ ++ bch2_fs_debug_init(c); ++ ++ if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || ++ kobject_add(&c->internal, &c->kobj, "internal") || ++ kobject_add(&c->opts_dir, &c->kobj, "options") || ++ kobject_add(&c->time_stats, &c->kobj, "time_stats") || ++ bch2_opts_create_sysfs_files(&c->opts_dir)) ++ return "error creating sysfs objects"; ++ ++ down_write(&c->state_lock); ++ ++ err = "error creating sysfs objects"; ++ __for_each_member_device(ca, c, i, NULL) ++ if (bch2_dev_sysfs_online(c, ca)) ++ goto err; ++ ++ list_add(&c->list, &bch_fs_list); ++ err = NULL; ++err: ++ up_write(&c->state_lock); ++ return err; ++} ++ ++static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_fs *c; ++ unsigned i, iter_size; ++ const char *err; ++ ++ pr_verbose_init(opts, ""); ++ ++ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); ++ if (!c) ++ goto out; ++ ++ __module_get(THIS_MODULE); ++ ++ c->minor = -1; ++ c->disk_sb.fs_sb = true; ++ ++ init_rwsem(&c->state_lock); ++ mutex_init(&c->sb_lock); ++ mutex_init(&c->replicas_gc_lock); ++ mutex_init(&c->btree_root_lock); ++ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); ++ ++ init_rwsem(&c->gc_lock); ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_init(&c->times[i]); ++ ++ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); ++ bch2_fs_allocator_background_init(c); ++ bch2_fs_allocator_foreground_init(c); ++ bch2_fs_rebalance_init(c); ++ bch2_fs_quota_init(c); ++ ++ INIT_LIST_HEAD(&c->list); ++ ++ mutex_init(&c->usage_scratch_lock); ++ ++ mutex_init(&c->bio_bounce_pages_lock); ++ ++ bio_list_init(&c->btree_write_error_list); ++ spin_lock_init(&c->btree_write_error_lock); ++ INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); ++ ++ INIT_WORK(&c->journal_seq_blacklist_gc_work, ++ bch2_blacklist_entries_gc); ++ ++ INIT_LIST_HEAD(&c->journal_entries); ++ ++ INIT_LIST_HEAD(&c->fsck_errors); ++ mutex_init(&c->fsck_error_lock); ++ ++ INIT_LIST_HEAD(&c->ec_new_stripe_list); ++ mutex_init(&c->ec_new_stripe_lock); ++ mutex_init(&c->ec_stripe_create_lock); ++ 
spin_lock_init(&c->ec_stripes_heap_lock); ++ ++ seqcount_init(&c->gc_pos_lock); ++ ++ seqcount_init(&c->usage_lock); ++ ++ sema_init(&c->io_in_flight, 64); ++ ++ c->copy_gc_enabled = 1; ++ c->rebalance.enabled = 1; ++ c->promote_whole_extents = true; ++ ++ c->journal.write_time = &c->times[BCH_TIME_journal_write]; ++ c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; ++ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; ++ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; ++ ++ bch2_fs_btree_cache_init_early(&c->btree_cache); ++ ++ if (percpu_init_rwsem(&c->mark_lock)) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (bch2_sb_to_fs(c, sb)) { ++ mutex_unlock(&c->sb_lock); ++ goto err; ++ } ++ ++ mutex_unlock(&c->sb_lock); ++ ++ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); ++ ++ c->opts = bch2_opts_default; ++ bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); ++ bch2_opts_apply(&c->opts, opts); ++ ++ c->block_bits = ilog2(c->opts.block_size); ++ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); ++ ++ if (bch2_fs_init_fault("fs_alloc")) ++ goto err; ++ ++ iter_size = sizeof(struct sort_iter) + ++ (btree_blocks(c) + 1) * 2 * ++ sizeof(struct sort_iter_set); ++ ++ if (!(c->wq = alloc_workqueue("bcachefs", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->copygc_wq = alloc_workqueue("bcache_copygc", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || ++ percpu_ref_init(&c->writes, bch2_writes_disabled, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || ++ bioset_init(&c->btree_bio, 1, ++ max(offsetof(struct btree_read_bio, bio), ++ offsetof(struct btree_write_bio, wbio.bio)), ++ BIOSET_NEED_BVECS) || ++ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || ++ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, ++ btree_bytes(c)) || ++ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || ++ bch2_io_clock_init(&c->io_clock[READ]) || ++ bch2_io_clock_init(&c->io_clock[WRITE]) || ++ bch2_fs_journal_init(&c->journal) || ++ bch2_fs_replicas_init(c) || ++ bch2_fs_btree_cache_init(c) || ++ bch2_fs_btree_key_cache_init(&c->btree_key_cache) || ++ bch2_fs_btree_iter_init(c) || ++ bch2_fs_btree_interior_update_init(c) || ++ bch2_fs_io_init(c) || ++ bch2_fs_encryption_init(c) || ++ bch2_fs_compress_init(c) || ++ bch2_fs_ec_init(c) || ++ bch2_fs_fsio_init(c)) ++ goto err; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (bch2_dev_exists(c->disk_sb.sb, mi, i) && ++ bch2_dev_alloc(c, i)) ++ goto err; ++ ++ /* ++ * Now that all allocations have succeeded, init various refcounty ++ * things that let us shutdown: ++ */ ++ closure_init(&c->cl, NULL); ++ ++ c->kobj.kset = bcachefs_kset; ++ kobject_init(&c->kobj, &bch2_fs_ktype); ++ kobject_init(&c->internal, &bch2_fs_internal_ktype); ++ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); ++ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); ++ ++ mutex_lock(&bch_fs_list_lock); ++ err = bch2_fs_online(c); ++ mutex_unlock(&bch_fs_list_lock); ++ if (err) { ++ bch_err(c, "bch2_fs_online() error: %s", err); ++ goto err; ++ } ++out: ++ pr_verbose_init(opts, "ret %i", c ? 
0 : -ENOMEM); ++ return c; ++err: ++ bch2_fs_free(c); ++ c = NULL; ++ goto out; ++} ++ ++noinline_for_stack ++static void print_mount_opts(struct bch_fs *c) ++{ ++ enum bch_opt_id i; ++ char buf[512]; ++ struct printbuf p = PBUF(buf); ++ bool first = true; ++ ++ strcpy(buf, "(null)"); ++ ++ if (c->opts.read_only) { ++ pr_buf(&p, "ro"); ++ first = false; ++ } ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->mode & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ if (!first) ++ pr_buf(&p, ","); ++ first = false; ++ bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); ++ } ++ ++ bch_info(c, "mounted with opts: %s", buf); ++} ++ ++int bch2_fs_start(struct bch_fs *c) ++{ ++ const char *err = "cannot allocate memory"; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ time64_t now = ktime_get_real_seconds(); ++ unsigned i; ++ int ret = -EINVAL; ++ ++ down_write(&c->state_lock); ++ ++ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for_each_online_member(ca, c, i) ++ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ++ ? bch2_fs_recovery(c) ++ : bch2_fs_initialize(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_opts_check_may_set(c); ++ if (ret) ++ goto err; ++ ++ err = "dynamic fault"; ++ ret = -EINVAL; ++ if (bch2_fs_init_fault("fs_start")) ++ goto err; ++ ++ set_bit(BCH_FS_STARTED, &c->flags); ++ ++ if (c->opts.read_only || c->opts.nochanges) { ++ bch2_fs_read_only(c); ++ } else { ++ err = "error going read write"; ++ ret = !test_bit(BCH_FS_RW, &c->flags) ++ ? 
bch2_fs_read_write(c) ++ : bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ print_mount_opts(c); ++ ret = 0; ++out: ++ up_write(&c->state_lock); ++ return ret; ++err: ++ switch (ret) { ++ case BCH_FSCK_ERRORS_NOT_FIXED: ++ bch_err(c, "filesystem contains errors: please report this to the developers"); ++ pr_cont("mount with -o fix_errors to repair\n"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_REPAIR_UNIMPLEMENTED: ++ bch_err(c, "filesystem contains errors: please report this to the developers"); ++ pr_cont("repair unimplemented: inform the developers so that it can be added\n"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_REPAIR_IMPOSSIBLE: ++ bch_err(c, "filesystem contains errors, but repair impossible"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_UNKNOWN_VERSION: ++ err = "unknown metadata version"; ++ break; ++ case -ENOMEM: ++ err = "cannot allocate memory"; ++ break; ++ case -EIO: ++ err = "IO error"; ++ break; ++ } ++ ++ if (ret >= 0) ++ ret = -EIO; ++ goto out; ++} ++ ++static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) ++{ ++ struct bch_sb_field_members *sb_mi; ++ ++ sb_mi = bch2_sb_get_members(sb); ++ if (!sb_mi) ++ return "Invalid superblock: member info area missing"; ++ ++ if (le16_to_cpu(sb->block_size) != c->opts.block_size) ++ return "mismatched block size"; ++ ++ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) ++ return "new cache bucket size is too small"; ++ ++ return NULL; ++} ++ ++static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) ++{ ++ struct bch_sb *newest = ++ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(newest); ++ ++ if (uuid_le_cmp(fs->uuid, sb->uuid)) ++ return "device not a member of filesystem"; ++ ++ if (!bch2_dev_exists(newest, mi, sb->dev_idx)) ++ return "device has been removed"; ++ ++ if (fs->block_size != sb->block_size) ++ return "mismatched block size"; ++ ++ return NULL; ++} ++ ++/* Device startup/shutdown: */ ++ ++static void bch2_dev_release(struct kobject *kobj) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ ++ kfree(ca); ++} ++ ++static void bch2_dev_free(struct bch_dev *ca) ++{ ++ cancel_work_sync(&ca->io_error_work); ++ ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, ++ "bcachefs"); ++ ++ if (ca->kobj.state_in_sysfs) ++ kobject_del(&ca->kobj); ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); ++ ++ free_percpu(ca->io_done); ++ bioset_exit(&ca->replica_set); ++ bch2_dev_buckets_free(ca); ++ free_page((unsigned long) ca->sb_read_scratch); ++ ++ bch2_time_stats_exit(&ca->io_latency[WRITE]); ++ bch2_time_stats_exit(&ca->io_latency[READ]); ++ ++ percpu_ref_exit(&ca->io_ref); ++ percpu_ref_exit(&ca->ref); ++ kobject_put(&ca->kobj); ++} ++ ++static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) ++{ ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (percpu_ref_is_zero(&ca->io_ref)) ++ return; ++ ++ __bch2_dev_read_only(c, ca); ++ ++ reinit_completion(&ca->io_ref_completion); ++ percpu_ref_kill(&ca->io_ref); ++ wait_for_completion(&ca->io_ref_completion); ++ ++ if (ca->kobj.state_in_sysfs) { ++ struct kobject *block = ++ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; ++ ++ sysfs_remove_link(block, "bcachefs"); ++ sysfs_remove_link(&ca->kobj, "block"); ++ } ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca);
++} ++ ++static void bch2_dev_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, ref); ++ ++ complete(&ca->ref_completion); ++} ++ ++static void bch2_dev_io_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); ++ ++ complete(&ca->io_ref_completion); ++} ++ ++static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) ++{ ++ int ret; ++ ++ if (!c->kobj.state_in_sysfs) ++ return 0; ++ ++ if (!ca->kobj.state_in_sysfs) { ++ ret = kobject_add(&ca->kobj, &c->kobj, ++ "dev-%u", ca->dev_idx); ++ if (ret) ++ return ret; ++ } ++ ++ if (ca->disk_sb.bdev) { ++ struct kobject *block = ++ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; ++ ++ ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); ++ if (ret) ++ return ret; ++ ret = sysfs_create_link(&ca->kobj, block, "block"); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ++ struct bch_member *member) ++{ ++ struct bch_dev *ca; ++ ++ ca = kzalloc(sizeof(*ca), GFP_KERNEL); ++ if (!ca) ++ return NULL; ++ ++ kobject_init(&ca->kobj, &bch2_dev_ktype); ++ init_completion(&ca->ref_completion); ++ init_completion(&ca->io_ref_completion); ++ ++ init_rwsem(&ca->bucket_lock); ++ ++ writepoint_init(&ca->copygc_write_point, BCH_DATA_USER); ++ ++ bch2_dev_copygc_init(ca); ++ ++ INIT_WORK(&ca->io_error_work, bch2_io_error_work); ++ ++ bch2_time_stats_init(&ca->io_latency[READ]); ++ bch2_time_stats_init(&ca->io_latency[WRITE]); ++ ++ ca->mi = bch2_mi_to_cpu(member); ++ ca->uuid = member->uuid; ++ ++ if (opt_defined(c->opts, discard)) ++ ca->mi.discard = opt_get(c->opts, discard); ++ ++ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, ++ 0, GFP_KERNEL) || ++ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || ++ bch2_dev_buckets_alloc(c, ca) || ++ bioset_init(&ca->replica_set, 4, ++ offsetof(struct bch_write_bio, bio), 0) || ++ !(ca->io_done = alloc_percpu(*ca->io_done))) ++ goto err; ++ ++ return ca; ++err: ++ bch2_dev_free(ca); ++ return NULL; ++} ++ ++static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, ++ unsigned dev_idx) ++{ ++ ca->dev_idx = dev_idx; ++ __set_bit(ca->dev_idx, ca->self.d); ++ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ++ ++ ca->fs = c; ++ rcu_assign_pointer(c->devs[ca->dev_idx], ca); ++ ++ if (bch2_dev_sysfs_online(c, ca)) ++ pr_warn("error creating sysfs objects"); ++} ++ ++static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ++{ ++ struct bch_member *member = ++ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; ++ struct bch_dev *ca = NULL; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bch2_fs_init_fault("dev_alloc")) ++ goto err; ++ ++ ca = __bch2_dev_alloc(c, member); ++ if (!ca) ++ goto err; ++ ++ bch2_dev_attach(c, ca, dev_idx); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ++{ ++ unsigned ret; ++ ++ if (bch2_dev_is_online(ca)) { ++ bch_err(ca, "already have device online in slot %u", ++ sb->sb->dev_idx); ++ return -EINVAL; ++ } ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * ca->mi.nbuckets) { ++ bch_err(ca, "cannot online: device too small"); ++ return -EINVAL; ++ } ++ ++ 
BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * ca->mi.nbuckets) { ++ bch_err(ca, "device too small"); ++ return -EINVAL; ++ } ++ ++ ret = bch2_dev_journal_init(ca, sb->sb); ++ if (ret) ++ return ret; ++ ++ /* Commit: */ ++ ca->disk_sb = *sb; ++ if (sb->mode & FMODE_EXCL) ++ ca->disk_sb.bdev->bd_holder = ca; ++ memset(sb, 0, sizeof(*sb)); ++ ++ percpu_ref_reinit(&ca->io_ref); ++ ++ return 0; ++} ++ ++static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (le64_to_cpu(sb->sb->seq) > ++ le64_to_cpu(c->disk_sb.sb->seq)) ++ bch2_sb_to_fs(c, sb->sb); ++ ++ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || ++ !c->devs[sb->sb->dev_idx]); ++ ++ ca = bch_dev_locked(c, sb->sb->dev_idx); ++ ++ ret = __bch2_dev_attach_bdev(ca, sb); ++ if (ret) ++ return ret; ++ ++ if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && ++ !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_SB])) { ++ mutex_lock(&c->sb_lock); ++ bch2_mark_dev_superblock(ca->fs, ca, 0); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ bch2_dev_sysfs_online(c, ca); ++ ++ if (c->sb.nr_devices == 1) ++ bdevname(ca->disk_sb.bdev, c->name); ++ bdevname(ca->disk_sb.bdev, ca->name); ++ ++ rebalance_wakeup(c); ++ return 0; ++} ++ ++/* Device management: */ ++ ++/* ++ * Note: this function is also used by the error paths - when a particular ++ * device sees an error, we call it to determine whether we can just set the ++ * device RO, or - if this function returns false - we'll set the whole ++ * filesystem RO: ++ * ++ * XXX: maybe we should be more explicit about whether we're changing state ++ * because we got an error or what have you? ++ */ ++bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_devs_mask new_online_devs; ++ struct replicas_status s; ++ struct bch_dev *ca2; ++ int i, nr_rw = 0, required; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ switch (new_state) { ++ case BCH_MEMBER_STATE_RW: ++ return true; ++ case BCH_MEMBER_STATE_RO: ++ if (ca->mi.state != BCH_MEMBER_STATE_RW) ++ return true; ++ ++ /* do we have enough devices to write to? */ ++ for_each_member_device(ca2, c, i) ++ if (ca2 != ca) ++ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; ++ ++ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ++ ? c->opts.metadata_replicas ++ : c->opts.metadata_replicas_required, ++ !(flags & BCH_FORCE_IF_DATA_DEGRADED) ++ ? c->opts.data_replicas ++ : c->opts.data_replicas_required); ++ ++ return nr_rw >= required; ++ case BCH_MEMBER_STATE_FAILED: ++ case BCH_MEMBER_STATE_SPARE: ++ if (ca->mi.state != BCH_MEMBER_STATE_RW && ++ ca->mi.state != BCH_MEMBER_STATE_RO) ++ return true; ++ ++ /* do we have enough devices to read from? */ ++ new_online_devs = bch2_online_devs(c); ++ __clear_bit(ca->dev_idx, new_online_devs.d); ++ ++ s = __bch2_replicas_status(c, new_online_devs); ++ ++ return bch2_have_enough_devs(s, flags); ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_fs_may_start(struct bch_fs *c) ++{ ++ struct replicas_status s; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned i, flags = c->opts.degraded ++ ? 
BCH_FORCE_IF_DEGRADED ++ : 0; ++ ++ if (!c->opts.degraded) { ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) ++ continue; ++ ++ ca = bch_dev_locked(c, i); ++ ++ if (!bch2_dev_is_online(ca) && ++ (ca->mi.state == BCH_MEMBER_STATE_RW || ++ ca->mi.state == BCH_MEMBER_STATE_RO)) { ++ mutex_unlock(&c->sb_lock); ++ return false; ++ } ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ s = bch2_replicas_status(c); ++ ++ return bch2_have_enough_devs(s, flags); ++} ++ ++static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) ++{ ++ bch2_copygc_stop(ca); ++ ++ /* ++ * The allocator thread itself allocates btree nodes, so stop it first: ++ */ ++ bch2_dev_allocator_stop(ca); ++ bch2_dev_allocator_remove(c, ca); ++ bch2_dev_journal_stop(&c->journal, ca); ++} ++ ++static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); ++ ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ if (bch2_dev_allocator_start(ca)) ++ return "error starting allocator thread"; ++ ++ if (bch2_copygc_start(c, ca)) ++ return "error starting copygc thread"; ++ ++ return NULL; ++} ++ ++int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ int ret = 0; ++ ++ if (ca->mi.state == new_state) ++ return 0; ++ ++ if (!bch2_dev_state_allowed(c, ca, new_state, flags)) ++ return -EINVAL; ++ ++ if (new_state != BCH_MEMBER_STATE_RW) ++ __bch2_dev_read_only(c, ca); ++ ++ bch_notice(ca, "%s", bch2_dev_state[new_state]); ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (new_state == BCH_MEMBER_STATE_RW && ++ __bch2_dev_read_write(c, ca)) ++ ret = -ENOMEM; ++ ++ rebalance_wakeup(c); ++ ++ return ret; ++} ++ ++int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ int ret; ++ ++ down_write(&c->state_lock); ++ ret = __bch2_dev_set_state(c, ca, new_state, flags); ++ up_write(&c->state_lock); ++ ++ return ret; ++} ++ ++/* Device add/removal: */ ++ ++int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ size_t i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < ca->mi.nbuckets; i++) { ++ ret = bch2_btree_key_cache_flush(&trans, ++ BTREE_ID_ALLOC, POS(ca->dev_idx, i)); ++ if (ret) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return ret; ++ ++ return bch2_btree_delete_range(c, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, 0), ++ POS(ca->dev_idx + 1, 0), ++ NULL); ++} ++ ++int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ unsigned dev_idx = ca->dev_idx, data; ++ int ret = -EINVAL; ++ ++ down_write(&c->state_lock); ++ ++ /* ++ * We consume a reference to ca->ref, regardless of whether we succeed ++ * or fail: ++ */ ++ percpu_ref_put(&ca->ref); ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { ++ bch_err(ca, "Cannot remove without losing data"); ++ goto err; ++ } ++ ++ __bch2_dev_read_only(c, ca); ++ ++ ret = bch2_dev_data_drop(c, ca->dev_idx, flags); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i dropping data", ret); ++ 
goto err; ++ } ++ ++ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i flushing journal", ret); ++ goto err; ++ } ++ ++ ret = bch2_dev_remove_alloc(c, ca); ++ if (ret) { ++ bch_err(ca, "Remove failed, error deleting alloc info"); ++ goto err; ++ } ++ ++ /* ++ * must flush all existing journal entries, they might have ++ * (overwritten) keys that point to the device we're removing: ++ */ ++ bch2_journal_flush_all_pins(&c->journal); ++ /* ++ * hack to ensure bch2_replicas_gc2() clears out entries to this device ++ */ ++ bch2_journal_meta(&c->journal); ++ ret = bch2_journal_error(&c->journal); ++ if (ret) { ++ bch_err(ca, "Remove failed, journal error"); ++ goto err; ++ } ++ ++ ret = bch2_replicas_gc2(c); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i from replicas gc", ret); ++ goto err; ++ } ++ ++ data = bch2_dev_has_data(c, ca); ++ if (data) { ++ char data_has_str[100]; ++ ++ bch2_flags_to_text(&PBUF(data_has_str), ++ bch2_data_types, data); ++ bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ++ ret = -EBUSY; ++ goto err; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ mutex_lock(&c->sb_lock); ++ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); ++ mutex_unlock(&c->sb_lock); ++ ++ percpu_ref_kill(&ca->ref); ++ wait_for_completion(&ca->ref_completion); ++ ++ bch2_dev_free(ca); ++ ++ /* ++ * Free this device's slot in the bch_member array - all pointers to ++ * this device must be gone: ++ */ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); ++ ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ up_write(&c->state_lock); ++ return 0; ++err: ++ if (ca->mi.state == BCH_MEMBER_STATE_RW && ++ !percpu_ref_is_zero(&ca->io_ref)) ++ __bch2_dev_read_write(c, ca); ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++static void dev_usage_clear(struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ ++ percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); ++ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); ++ up_read(&ca->bucket_lock); ++} ++ ++/* Add new device to running filesystem: */ ++int bch2_dev_add(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb; ++ const char *err; ++ struct bch_dev *ca = NULL; ++ struct bch_sb_field_members *mi; ++ struct bch_member dev_mi; ++ unsigned dev_idx, nr_devices, u64s; ++ int ret; ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) ++ return ret; ++ ++ err = bch2_sb_validate(&sb); ++ if (err) ++ return -EINVAL; ++ ++ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; ++ ++ err = bch2_dev_may_add(sb.sb, c); ++ if (err) ++ return -EINVAL; ++ ++ ca = __bch2_dev_alloc(c, &dev_mi); ++ if (!ca) { ++ bch2_free_super(&sb); ++ return -ENOMEM; ++ } ++ ++ ret = __bch2_dev_attach_bdev(ca, &sb); ++ if (ret) { ++ bch2_dev_free(ca); ++ return ret; ++ } ++ ++ /* ++ * We want to allocate journal on the new device before adding the new ++ * device to the filesystem because allocating after we attach requires ++ * spinning up the allocator thread, and the allocator thread requires ++ * doing btree writes, which if the existing devices are RO isn't going ++ * to work ++ * ++ * So we have to mark where the superblocks are, but marking allocated ++ * data normally updates the filesystem usage too, so we have to mark, ++ * 
allocate the journal, reset all the marks, then remark after we ++ * attach... ++ */ ++ bch2_mark_dev_superblock(ca->fs, ca, 0); ++ ++ err = "journal alloc failed"; ++ ret = bch2_dev_journal_alloc(ca); ++ if (ret) ++ goto err; ++ ++ dev_usage_clear(ca); ++ ++ down_write(&c->state_lock); ++ mutex_lock(&c->sb_lock); ++ ++ err = "insufficient space in new superblock"; ++ ret = bch2_sb_from_fs(c, ca); ++ if (ret) ++ goto err_unlock; ++ ++ mi = bch2_sb_get_members(ca->disk_sb.sb); ++ ++ if (!bch2_sb_resize_members(&ca->disk_sb, ++ le32_to_cpu(mi->field.u64s) + ++ sizeof(dev_mi) / sizeof(u64))) { ++ ret = -ENOSPC; ++ goto err_unlock; ++ } ++ ++ if (dynamic_fault("bcachefs:add:no_slot")) ++ goto no_slot; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) ++ goto have_slot; ++no_slot: ++ err = "no slots available in superblock"; ++ ret = -ENOSPC; ++ goto err_unlock; ++ ++have_slot: ++ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); ++ u64s = (sizeof(struct bch_sb_field_members) + ++ sizeof(struct bch_member) * nr_devices) / sizeof(u64); ++ ++ err = "no space in superblock for member info"; ++ ret = -ENOSPC; ++ ++ mi = bch2_sb_resize_members(&c->disk_sb, u64s); ++ if (!mi) ++ goto err_unlock; ++ ++ /* success: */ ++ ++ mi->members[dev_idx] = dev_mi; ++ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); ++ c->disk_sb.sb->nr_devices = nr_devices; ++ ++ ca->disk_sb.sb->dev_idx = dev_idx; ++ bch2_dev_attach(c, ca, dev_idx); ++ ++ bch2_mark_dev_superblock(c, ca, 0); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_RW) { ++ err = __bch2_dev_read_write(c, ca); ++ if (err) ++ goto err_late; ++ } ++ ++ up_write(&c->state_lock); ++ return 0; ++ ++err_unlock: ++ mutex_unlock(&c->sb_lock); ++ up_write(&c->state_lock); ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ bch2_free_super(&sb); ++ bch_err(c, "Unable to add device: %s", err); ++ return ret; ++err_late: ++ bch_err(c, "Error going rw after adding device: %s", err); ++ return -EINVAL; ++} ++ ++/* Hot add existing device to running filesystem: */ ++int bch2_dev_online(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb = { NULL }; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ const char *err; ++ int ret; ++ ++ down_write(&c->state_lock); ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) { ++ up_write(&c->state_lock); ++ return ret; ++ } ++ ++ dev_idx = sb.sb->dev_idx; ++ ++ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); ++ if (err) ++ goto err; ++ ++ if (bch2_dev_attach_bdev(c, &sb)) { ++ err = "bch2_dev_attach_bdev() error"; ++ goto err; ++ } ++ ++ ca = bch_dev_locked(c, dev_idx); ++ if (ca->mi.state == BCH_MEMBER_STATE_RW) { ++ err = __bch2_dev_read_write(c, ca); ++ if (err) ++ goto err; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ mi->members[ca->dev_idx].last_mount = ++ cpu_to_le64(ktime_get_real_seconds()); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ up_write(&c->state_lock); ++ return 0; ++err: ++ up_write(&c->state_lock); ++ bch2_free_super(&sb); ++ bch_err(c, "error bringing %s online: %s", path, err); ++ return -EINVAL; ++} ++ ++int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ down_write(&c->state_lock); ++ ++ if (!bch2_dev_is_online(ca)) { ++ bch_err(ca, "Already 
offline"); ++ up_write(&c->state_lock); ++ return 0; ++ } ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { ++ bch_err(ca, "Cannot offline required disk"); ++ up_write(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ up_write(&c->state_lock); ++ return 0; ++} ++ ++int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bch_member *mi; ++ int ret = 0; ++ ++ down_write(&c->state_lock); ++ ++ if (nbuckets < ca->mi.nbuckets) { ++ bch_err(ca, "Cannot shrink yet"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (bch2_dev_is_online(ca) && ++ get_capacity(ca->disk_sb.bdev->bd_disk) < ++ ca->mi.bucket_size * nbuckets) { ++ bch_err(ca, "New size larger than device"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = bch2_dev_buckets_resize(c, ca, nbuckets); ++ if (ret) { ++ bch_err(ca, "Resize error: %i", ret); ++ goto err; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ mi->nbuckets = cpu_to_le64(nbuckets); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch2_recalc_capacity(c); ++err: ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++/* return with ref on ca->ref: */ ++struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) ++{ ++ ++ struct block_device *bdev = lookup_bdev(path); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (IS_ERR(bdev)) ++ return ERR_CAST(bdev); ++ ++ for_each_member_device(ca, c, i) ++ if (ca->disk_sb.bdev == bdev) ++ goto found; ++ ++ ca = ERR_PTR(-ENOENT); ++found: ++ bdput(bdev); ++ return ca; ++} ++ ++/* Filesystem open: */ ++ ++struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, ++ struct bch_opts opts) ++{ ++ struct bch_sb_handle *sb = NULL; ++ struct bch_fs *c = NULL; ++ unsigned i, best_sb = 0; ++ const char *err; ++ int ret = -ENOMEM; ++ ++ pr_verbose_init(opts, ""); ++ ++ if (!nr_devices) { ++ c = ERR_PTR(-EINVAL); ++ goto out2; ++ } ++ ++ if (!try_module_get(THIS_MODULE)) { ++ c = ERR_PTR(-ENODEV); ++ goto out2; ++ } ++ ++ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); ++ if (!sb) ++ goto err; ++ ++ for (i = 0; i < nr_devices; i++) { ++ ret = bch2_read_super(devices[i], &opts, &sb[i]); ++ if (ret) ++ goto err; ++ ++ err = bch2_sb_validate(&sb[i]); ++ if (err) ++ goto err_print; ++ } ++ ++ for (i = 1; i < nr_devices; i++) ++ if (le64_to_cpu(sb[i].sb->seq) > ++ le64_to_cpu(sb[best_sb].sb->seq)) ++ best_sb = i; ++ ++ for (i = 0; i < nr_devices; i++) { ++ err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); ++ if (err) ++ goto err_print; ++ } ++ ++ ret = -ENOMEM; ++ c = bch2_fs_alloc(sb[best_sb].sb, opts); ++ if (!c) ++ goto err; ++ ++ err = "bch2_dev_online() error"; ++ down_write(&c->state_lock); ++ for (i = 0; i < nr_devices; i++) ++ if (bch2_dev_attach_bdev(c, &sb[i])) { ++ up_write(&c->state_lock); ++ goto err_print; ++ } ++ up_write(&c->state_lock); ++ ++ err = "insufficient devices"; ++ if (!bch2_fs_may_start(c)) ++ goto err_print; ++ ++ if (!c->opts.nostart) { ++ ret = bch2_fs_start(c); ++ if (ret) ++ goto err; ++ } ++out: ++ kfree(sb); ++ module_put(THIS_MODULE); ++out2: ++ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); ++ return c; ++err_print: ++ pr_err("bch_fs_open err opening %s: %s", ++ devices[0], err); ++ ret = -EINVAL; ++err: ++ if (c) ++ bch2_fs_stop(c); ++ for (i = 0; i < nr_devices; i++) ++ bch2_free_super(&sb[i]); ++ c = ERR_PTR(ret); ++ goto out; ++} ++ ++static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, ++ struct bch_opts opts) ++{ 
++ const char *err; ++ struct bch_fs *c; ++ bool allocated_fs = false; ++ int ret; ++ ++ err = bch2_sb_validate(sb); ++ if (err) ++ return err; ++ ++ mutex_lock(&bch_fs_list_lock); ++ c = __bch2_uuid_to_fs(sb->sb->uuid); ++ if (c) { ++ closure_get(&c->cl); ++ ++ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); ++ if (err) ++ goto err; ++ } else { ++ c = bch2_fs_alloc(sb->sb, opts); ++ err = "cannot allocate memory"; ++ if (!c) ++ goto err; ++ ++ allocated_fs = true; ++ } ++ ++ err = "bch2_dev_online() error"; ++ ++ mutex_lock(&c->sb_lock); ++ if (bch2_dev_attach_bdev(c, sb)) { ++ mutex_unlock(&c->sb_lock); ++ goto err; ++ } ++ mutex_unlock(&c->sb_lock); ++ ++ if (!c->opts.nostart && bch2_fs_may_start(c)) { ++ err = "error starting filesystem"; ++ ret = bch2_fs_start(c); ++ if (ret) ++ goto err; ++ } ++ ++ closure_put(&c->cl); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return NULL; ++err: ++ mutex_unlock(&bch_fs_list_lock); ++ ++ if (allocated_fs) ++ bch2_fs_stop(c); ++ else if (c) ++ closure_put(&c->cl); ++ ++ return err; ++} ++ ++const char *bch2_fs_open_incremental(const char *path) ++{ ++ struct bch_sb_handle sb; ++ struct bch_opts opts = bch2_opts_empty(); ++ const char *err; ++ ++ if (bch2_read_super(path, &opts, &sb)) ++ return "error reading superblock"; ++ ++ err = __bch2_fs_open_incremental(&sb, opts); ++ bch2_free_super(&sb); ++ ++ return err; ++} ++ ++/* Global interfaces/init */ ++ ++static void bcachefs_exit(void) ++{ ++ bch2_debug_exit(); ++ bch2_vfs_exit(); ++ bch2_chardev_exit(); ++ if (bcachefs_kset) ++ kset_unregister(bcachefs_kset); ++} ++ ++static int __init bcachefs_init(void) ++{ ++ bch2_bkey_pack_test(); ++ bch2_inode_pack_test(); ++ ++ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || ++ bch2_chardev_init() || ++ bch2_vfs_init() || ++ bch2_debug_init()) ++ goto err; ++ ++ return 0; ++err: ++ bcachefs_exit(); ++ return -ENOMEM; ++} ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ bool bch2_##name; \ ++ module_param_named(name, bch2_##name, bool, 0644); \ ++ MODULE_PARM_DESC(name, description); ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++module_exit(bcachefs_exit); ++module_init(bcachefs_init); +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +new file mode 100644 +index 000000000000..4aa5dd7917cf +--- /dev/null ++++ b/fs/bcachefs/super.h +@@ -0,0 +1,231 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_H ++#define _BCACHEFS_SUPER_H ++ ++#include "extents.h" ++ ++#include "bcachefs_ioctl.h" ++ ++#include <linux/math64.h> ++ ++static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) ++{ ++ return div_u64(s, ca->mi.bucket_size); ++} ++ ++static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) ++{ ++ return ((sector_t) b) * ca->mi.bucket_size; ++} ++ ++static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) ++{ ++ u32 remainder; ++ ++ div_u64_rem(s, ca->mi.bucket_size, &remainder); ++ return remainder; ++} ++ ++static inline bool bch2_dev_is_online(struct bch_dev *ca) ++{ ++ return !percpu_ref_is_zero(&ca->io_ref); ++} ++ ++static inline bool bch2_dev_is_readable(struct bch_dev *ca) ++{ ++ return bch2_dev_is_online(ca) && ++ ca->mi.state != BCH_MEMBER_STATE_FAILED; ++} ++ ++static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) ++{ ++ if (!percpu_ref_tryget(&ca->io_ref)) ++ return false; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_RW || ++ (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) ++ return true; ++ ++ percpu_ref_put(&ca->io_ref); ++ return false; ++}
++ ++static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) ++{ ++ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); ++} ++ ++static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs.nr; i++) ++ if (devs.devs[i] == dev) ++ return true; ++ ++ return false; ++} ++ ++static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs->nr; i++) ++ if (devs->devs[i] == dev) { ++ array_remove_item(devs->devs, devs->nr, i); ++ return; ++ } ++} ++ ++static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ BUG_ON(bch2_dev_list_has_dev(*devs, dev)); ++ BUG_ON(devs->nr >= BCH_REPLICAS_MAX); ++ devs->devs[devs->nr++] = dev; ++} ++ ++static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) ++{ ++ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; ++} ++ ++static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, ++ const struct bch_devs_mask *mask) ++{ ++ struct bch_dev *ca = NULL; ++ ++ while ((*iter = mask ++ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) ++ : *iter) < c->sb.nr_devices && ++ !(ca = rcu_dereference_check(c->devs[*iter], ++ lockdep_is_held(&c->state_lock)))) ++ (*iter)++; ++ ++ return ca; ++} ++ ++#define __for_each_member_device(ca, c, iter, mask) \ ++ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) ++ ++#define for_each_member_device_rcu(ca, c, iter, mask) \ ++ __for_each_member_device(ca, c, iter, mask) ++ ++static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ if ((ca = __bch2_next_dev(c, iter, NULL))) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++/* ++ * If you break early, you must drop your ref on the current device ++ */ ++#define for_each_member_device(ca, c, iter) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_dev(c, &(iter))); \ ++ percpu_ref_put(&ca->ref), (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, ++ unsigned *iter, ++ int state_mask) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ while ((ca = __bch2_next_dev(c, iter, NULL)) && ++ (!((1 << ca->mi.state) & state_mask) || ++ !percpu_ref_tryget(&ca->io_ref))) ++ (*iter)++; ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++#define __for_each_online_member(ca, c, iter, state_mask) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ ++ percpu_ref_put(&ca->io_ref), (iter)++) ++ ++#define for_each_online_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, ~0) ++ ++#define for_each_rw_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) ++ ++#define for_each_readable_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, \ ++ (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) ++ ++/* ++ * If a key exists that references a device, the device won't be going away and ++ * we can omit rcu_read_lock(): ++ */ ++static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_check(c->devs[idx], 1); ++} ++ ++static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_protected(c->devs[idx], ++ lockdep_is_held(&c->sb_lock) 
|| ++ lockdep_is_held(&c->state_lock)); ++} ++ ++/* XXX kill, move to struct bch_fs */ ++static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) ++{ ++ struct bch_devs_mask devs; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ memset(&devs, 0, sizeof(devs)); ++ for_each_online_member(ca, c, i) ++ __set_bit(ca->dev_idx, devs.d); ++ return devs; ++} ++ ++struct bch_fs *bch2_bdev_to_fs(struct block_device *); ++struct bch_fs *bch2_uuid_to_fs(uuid_le); ++int bch2_congested(void *, int); ++ ++bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++ ++int bch2_dev_fail(struct bch_dev *, int); ++int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_add(struct bch_fs *, const char *); ++int bch2_dev_online(struct bch_fs *, const char *); ++int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); ++struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *); ++void bch2_fs_read_only(struct bch_fs *); ++ ++int bch2_fs_read_write(struct bch_fs *); ++int bch2_fs_read_write_early(struct bch_fs *); ++ ++void bch2_fs_stop(struct bch_fs *); ++ ++int bch2_fs_start(struct bch_fs *); ++struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); ++const char *bch2_fs_open_incremental(const char *path); ++ ++#endif /* _BCACHEFS_SUPER_H */ +diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h +new file mode 100644 +index 000000000000..20406ebd6f5b +--- /dev/null ++++ b/fs/bcachefs/super_types.h +@@ -0,0 +1,51 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_TYPES_H ++#define _BCACHEFS_SUPER_TYPES_H ++ ++struct bch_sb_handle { ++ struct bch_sb *sb; ++ struct block_device *bdev; ++ struct bio *bio; ++ unsigned page_order; ++ fmode_t mode; ++ unsigned have_layout:1; ++ unsigned have_bio:1; ++ unsigned fs_sb:1; ++ u64 seq; ++}; ++ ++struct bch_devs_mask { ++ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; ++}; ++ ++struct bch_devs_list { ++ u8 nr; ++ u8 devs[BCH_REPLICAS_MAX + 1]; ++}; ++ ++struct bch_member_cpu { ++ u64 nbuckets; /* device size */ ++ u16 first_bucket; /* index of first bucket used */ ++ u16 bucket_size; /* sectors */ ++ u16 group; ++ u8 state; ++ u8 replacement; ++ u8 discard; ++ u8 data_allowed; ++ u8 durability; ++ u8 valid; ++}; ++ ++struct bch_disk_group_cpu { ++ bool deleted; ++ u16 parent; ++ struct bch_devs_mask devs; ++}; ++ ++struct bch_disk_groups_cpu { ++ struct rcu_head rcu; ++ unsigned nr; ++ struct bch_disk_group_cpu entries[]; ++}; ++ ++#endif /* _BCACHEFS_SUPER_TYPES_H */ +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +new file mode 100644 +index 000000000000..c169d282a1f9 +--- /dev/null ++++ b/fs/bcachefs/sysfs.c +@@ -0,0 +1,1091 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcache sysfs interfaces ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "sysfs.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "inode.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "opts.h" ++#include "rebalance.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "tests.h" ++ ++#include <linux/blkdev.h> ++#include <linux/sort.h> ++#include <linux/sched/clock.h> ++ ++#include "util.h" ++ ++#define SYSFS_OPS(type) \ ++struct sysfs_ops type ## _sysfs_ops = { \ ++ .show = type ## _show, \ ++ .store = type ## _store \ ++} ++ ++#define SHOW(fn) \ ++static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ ++ char *buf) \ ++ ++#define STORE(fn) \ ++static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ ++ const char *buf, size_t size) \ ++ ++#define __sysfs_attribute(_name, _mode) \ ++ static struct attribute sysfs_##_name = \ ++ { .name = #_name, .mode = _mode } ++ ++#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) ++#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) ++#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) ++ ++#define sysfs_printf(file, fmt, ...) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ ++} while (0) ++ ++#define sysfs_print(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return snprint(buf, PAGE_SIZE, var); \ ++} while (0) ++ ++#define sysfs_hprint(file, val) \ ++do { \ ++ if (attr == &sysfs_ ## file) { \ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); \ ++ bch2_hprint(&out, val); \ ++ pr_buf(&out, "\n"); \ ++ return out.pos - buf; \ ++ } \ ++} while (0) ++ ++#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) ++#define var_print(_var) sysfs_print(_var, var(_var)) ++#define var_hprint(_var) sysfs_hprint(_var, var(_var)) ++ ++#define sysfs_strtoul(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe(buf, var) ?: (ssize_t) size; \ ++} while (0) ++ ++#define sysfs_strtoul_clamp(file, var, min, max) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe_clamp(buf, var, min, max) \ ++ ?: (ssize_t) size; \ ++} while (0) ++ ++#define strtoul_or_return(cp) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define strtoul_restrict_or_return(cp, min, max) \ ++({ \ ++ unsigned long __v = 0; \ ++ int _r = strtoul_safe_restrict(cp, __v, min, max); \ ++ if (_r) \ ++ return _r; \ ++ __v; \ ++}) ++ ++#define strtoi_h_or_return(cp) \ ++({ \ ++ u64 _v; \ ++ int _r = strtoi_h(cp, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define sysfs_hatoi(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoi_h(buf, &var) ?: (ssize_t) size; \ ++} while (0) ++ ++write_attribute(trigger_journal_flush); ++write_attribute(trigger_btree_coalesce); ++write_attribute(trigger_gc); ++write_attribute(prune_cache); ++rw_attribute(btree_gc_periodic); ++ ++read_attribute(uuid); ++read_attribute(minor); ++read_attribute(bucket_size); ++read_attribute(block_size); ++read_attribute(btree_node_size); ++read_attribute(first_bucket); ++read_attribute(nbuckets); ++read_attribute(durability); ++read_attribute(iodone); ++ ++read_attribute(io_latency_read);
++read_attribute(io_latency_write); ++read_attribute(io_latency_stats_read); ++read_attribute(io_latency_stats_write); ++read_attribute(congested); ++ ++read_attribute(bucket_quantiles_last_read); ++read_attribute(bucket_quantiles_last_write); ++read_attribute(bucket_quantiles_fragmentation); ++read_attribute(bucket_quantiles_oldest_gen); ++ ++read_attribute(reserve_stats); ++read_attribute(btree_cache_size); ++read_attribute(compression_stats); ++read_attribute(journal_debug); ++read_attribute(journal_pins); ++read_attribute(btree_updates); ++read_attribute(dirty_btree_nodes); ++read_attribute(btree_key_cache); ++read_attribute(btree_transactions); ++ ++read_attribute(internal_uuid); ++ ++read_attribute(has_data); ++read_attribute(alloc_debug); ++write_attribute(wake_allocator); ++ ++read_attribute(read_realloc_races); ++read_attribute(extent_migrate_done); ++read_attribute(extent_migrate_raced); ++ ++rw_attribute(journal_write_delay_ms); ++rw_attribute(journal_reclaim_delay_ms); ++ ++rw_attribute(discard); ++rw_attribute(cache_replacement_policy); ++rw_attribute(label); ++ ++rw_attribute(copy_gc_enabled); ++sysfs_pd_controller_attribute(copy_gc); ++ ++rw_attribute(rebalance_enabled); ++sysfs_pd_controller_attribute(rebalance); ++read_attribute(rebalance_work); ++rw_attribute(promote_whole_extents); ++ ++read_attribute(new_stripes); ++ ++rw_attribute(pd_controllers_update_seconds); ++ ++read_attribute(meta_replicas_have); ++read_attribute(data_replicas_have); ++ ++read_attribute(io_timers_read); ++read_attribute(io_timers_write); ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++write_attribute(perf_test); ++#endif /* CONFIG_BCACHEFS_TESTS */ ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ rw_attribute(name); ++ ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#define x(_name) \ ++ static struct attribute sysfs_time_stat_##_name = \ ++ { .name = #_name, .mode = S_IRUGO }; ++ BCH_TIME_STATS() ++#undef x ++ ++static struct attribute sysfs_state_rw = { ++ .name = "state", ++ .mode = S_IRUGO ++}; ++ ++static size_t bch2_btree_cache_size(struct bch_fs *c) ++{ ++ size_t ret = 0; ++ struct btree *b; ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_for_each_entry(b, &c->btree_cache.live, list) ++ ret += btree_bytes(c); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ return ret; ++} ++ ++static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); ++ ++ if (!fs_usage) ++ return -ENOMEM; ++ ++ bch2_fs_usage_to_text(&out, c, fs_usage); ++ ++ percpu_up_read(&c->mark_lock); ++ ++ kfree(fs_usage); ++ ++ return out.pos - buf; ++} ++ ++static ssize_t bch2_compression_stats(struct bch_fs *c, char *buf) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, ++ nr_compressed_extents = 0, ++ compressed_sectors_compressed = 0, ++ compressed_sectors_uncompressed = 0; ++ int ret; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) ++ if (k.k->type == KEY_TYPE_extent) { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ extent_for_each_ptr_decode(e, p, entry) { ++ if (!crc_is_compressed(p.crc)) { ++ nr_uncompressed_extents++; ++ uncompressed_sectors += e.k->size; ++ } else { ++ nr_compressed_extents++; ++ 
compressed_sectors_compressed += ++ p.crc.compressed_size; ++ compressed_sectors_uncompressed += ++ p.crc.uncompressed_size; ++ } ++ ++ /* only looking at the first ptr */ ++ break; ++ } ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ return scnprintf(buf, PAGE_SIZE, ++ "uncompressed data:\n" ++ " nr extents: %llu\n" ++ " size (bytes): %llu\n" ++ "compressed data:\n" ++ " nr extents: %llu\n" ++ " compressed size (bytes): %llu\n" ++ " uncompressed size (bytes): %llu\n", ++ nr_uncompressed_extents, ++ uncompressed_sectors << 9, ++ nr_compressed_extents, ++ compressed_sectors_compressed << 9, ++ compressed_sectors_uncompressed << 9); ++} ++ ++static ssize_t bch2_new_stripes(struct bch_fs *c, char *buf) ++{ ++ char *out = buf, *end = buf + PAGE_SIZE; ++ struct ec_stripe_head *h; ++ struct ec_stripe_new *s; ++ ++ mutex_lock(&c->ec_new_stripe_lock); ++ list_for_each_entry(h, &c->ec_new_stripe_list, list) { ++ out += scnprintf(out, end - out, ++ "target %u algo %u redundancy %u:\n", ++ h->target, h->algo, h->redundancy); ++ ++ if (h->s) ++ out += scnprintf(out, end - out, ++ "\tpending: blocks %u allocated %u\n", ++ h->s->blocks.nr, ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->blocks.nr)); ++ ++ mutex_lock(&h->lock); ++ list_for_each_entry(s, &h->stripes, list) ++ out += scnprintf(out, end - out, ++ "\tin flight: blocks %u allocated %u pin %u\n", ++ s->blocks.nr, ++ bitmap_weight(s->blocks_allocated, ++ s->blocks.nr), ++ atomic_read(&s->pin)); ++ mutex_unlock(&h->lock); ++ ++ } ++ mutex_unlock(&c->ec_new_stripe_lock); ++ ++ return out - buf; ++} ++ ++SHOW(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ sysfs_print(minor, c->minor); ++ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); ++ ++ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); ++ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); ++ ++ sysfs_print(block_size, block_bytes(c)); ++ sysfs_print(btree_node_size, btree_bytes(c)); ++ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); ++ ++ sysfs_print(read_realloc_races, ++ atomic_long_read(&c->read_realloc_races)); ++ sysfs_print(extent_migrate_done, ++ atomic_long_read(&c->extent_migrate_done)); ++ sysfs_print(extent_migrate_raced, ++ atomic_long_read(&c->extent_migrate_raced)); ++ ++ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); ++ ++ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); ++ ++ sysfs_print(pd_controllers_update_seconds, ++ c->pd_controllers_update_seconds); ++ ++ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); ++ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ ++ ++ if (attr == &sysfs_rebalance_work) ++ return bch2_rebalance_work_show(c, buf); ++ ++ sysfs_print(promote_whole_extents, c->promote_whole_extents); ++ ++ sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); ++ sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); ++ ++ /* Debugging: */ ++ ++ if (attr == &sysfs_alloc_debug) ++ return show_fs_alloc_debug(c, buf); ++ ++ if (attr == &sysfs_journal_debug) ++ return bch2_journal_print_debug(&c->journal, buf); ++ ++ if (attr == &sysfs_journal_pins) ++ return bch2_journal_print_pins(&c->journal, buf); ++ ++ if (attr == &sysfs_btree_updates) ++ return bch2_btree_updates_print(c, buf); ++ ++ if (attr == &sysfs_dirty_btree_nodes) ++ return bch2_dirty_btree_nodes_print(c, buf); ++ ++ if (attr == &sysfs_btree_key_cache) { ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ 
bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_btree_transactions) { ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ bch2_btree_trans_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_compression_stats) ++ return bch2_compression_stats(c, buf); ++ ++ if (attr == &sysfs_new_stripes) ++ return bch2_new_stripes(c, buf); ++ ++ if (attr == &sysfs_io_timers_read) ++ return bch2_io_timers_show(&c->io_clock[READ], buf); ++ if (attr == &sysfs_io_timers_write) ++ return bch2_io_timers_show(&c->io_clock[WRITE], buf); ++ ++#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ return 0; ++} ++ ++STORE(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); ++ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); ++ ++ if (attr == &sysfs_btree_gc_periodic) { ++ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ++ ?: (ssize_t) size; ++ ++ wake_up_process(c->gc_thread); ++ return ret; ++ } ++ ++ if (attr == &sysfs_copy_gc_enabled) { ++ struct bch_dev *ca; ++ unsigned i; ++ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ++ ?: (ssize_t) size; ++ ++ for_each_member_device(ca, c, i) ++ if (ca->copygc_thread) ++ wake_up_process(ca->copygc_thread); ++ return ret; ++ } ++ ++ if (attr == &sysfs_rebalance_enabled) { ++ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) ++ ?: (ssize_t) size; ++ ++ rebalance_wakeup(c); ++ return ret; ++ } ++ ++ sysfs_strtoul(pd_controllers_update_seconds, ++ c->pd_controllers_update_seconds); ++ sysfs_pd_controller_store(rebalance, &c->rebalance.pd); ++ ++ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); ++ ++ /* Debugging: */ ++ ++#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ /* Debugging: */ ++ ++ if (attr == &sysfs_trigger_journal_flush) ++ bch2_journal_meta_async(&c->journal, NULL); ++ ++ if (attr == &sysfs_trigger_btree_coalesce) ++ bch2_coalesce(c); ++ ++ if (attr == &sysfs_trigger_gc) { ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 ++ down_read(&c->state_lock); ++ bch2_gc(c, NULL, false, false); ++ up_read(&c->state_lock); ++#else ++ bch2_gc_gens(c); ++#endif ++ } ++ ++ if (attr == &sysfs_prune_cache) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = strtoul_or_return(buf); ++ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); ++ } ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ if (attr == &sysfs_perf_test) { ++ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; ++ char *test = strsep(&p, " \t\n"); ++ char *nr_str = strsep(&p, " \t\n"); ++ char *threads_str = strsep(&p, " \t\n"); ++ unsigned threads; ++ u64 nr; ++ int ret = -EINVAL; ++ ++ if (threads_str && ++ !(ret = kstrtouint(threads_str, 10, &threads)) && ++ !(ret = bch2_strtoull_h(nr_str, &nr))) ++ bch2_btree_perf_test(c, test, nr, threads); ++ else ++ size = ret; ++ kfree(tmp); ++ } ++#endif ++ return size; ++} ++SYSFS_OPS(bch2_fs); ++ ++struct attribute *bch2_fs_files[] = { ++ &sysfs_minor, ++ &sysfs_block_size, ++ &sysfs_btree_node_size, ++ &sysfs_btree_cache_size, ++ ++ &sysfs_meta_replicas_have, ++ &sysfs_data_replicas_have, ++ ++ &sysfs_journal_write_delay_ms, ++ &sysfs_journal_reclaim_delay_ms, ++ ++ 
&sysfs_promote_whole_extents, ++ ++ &sysfs_compression_stats, ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ &sysfs_perf_test, ++#endif ++ NULL ++}; ++ ++/* internal dir - just a wrapper */ ++ ++SHOW(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_show(&c->kobj, attr, buf); ++} ++ ++STORE(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_store(&c->kobj, attr, buf, size); ++} ++SYSFS_OPS(bch2_fs_internal); ++ ++struct attribute *bch2_fs_internal_files[] = { ++ &sysfs_alloc_debug, ++ &sysfs_journal_debug, ++ &sysfs_journal_pins, ++ &sysfs_btree_updates, ++ &sysfs_dirty_btree_nodes, ++ &sysfs_btree_key_cache, ++ &sysfs_btree_transactions, ++ ++ &sysfs_read_realloc_races, ++ &sysfs_extent_migrate_done, ++ &sysfs_extent_migrate_raced, ++ ++ &sysfs_trigger_journal_flush, ++ &sysfs_trigger_btree_coalesce, ++ &sysfs_trigger_gc, ++ &sysfs_prune_cache, ++ ++ &sysfs_copy_gc_enabled, ++ ++ &sysfs_rebalance_enabled, ++ &sysfs_rebalance_work, ++ sysfs_pd_controller_files(rebalance), ++ ++ &sysfs_new_stripes, ++ ++ &sysfs_io_timers_read, ++ &sysfs_io_timers_write, ++ ++ &sysfs_internal_uuid, ++ ++#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ NULL ++}; ++ ++/* options */ ++ ++SHOW(bch2_fs_opts_dir) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int id = opt - bch2_opt_table; ++ u64 v = bch2_opt_get_by_id(&c->opts, id); ++ ++ bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); ++ pr_buf(&out, "\n"); ++ ++ return out.pos - buf; ++} ++ ++STORE(bch2_fs_opts_dir) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int ret, id = opt - bch2_opt_table; ++ char *tmp; ++ u64 v; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ ret = bch2_opt_parse(c, opt, strim(tmp), &v); ++ kfree(tmp); ++ ++ if (ret < 0) ++ return ret; ++ ++ ret = bch2_opt_check_may_set(c, id, v); ++ if (ret < 0) ++ return ret; ++ ++ if (opt->set_sb != SET_NO_SB_OPT) { ++ mutex_lock(&c->sb_lock); ++ opt->set_sb(c->disk_sb.sb, v); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ bch2_opt_set_by_id(&c->opts, id, v); ++ ++ if ((id == Opt_background_target || ++ id == Opt_background_compression) && v) { ++ bch2_rebalance_add_work(c, S64_MAX); ++ rebalance_wakeup(c); ++ } ++ ++ return size; ++} ++SYSFS_OPS(bch2_fs_opts_dir); ++ ++struct attribute *bch2_fs_opts_dir_files[] = { NULL }; ++ ++int bch2_opts_create_sysfs_files(struct kobject *kobj) ++{ ++ const struct bch_option *i; ++ int ret; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + bch2_opts_nr; ++ i++) { ++ if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) ++ continue; ++ ++ ret = sysfs_create_file(kobj, &i->attr); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* time stats */ ++ ++SHOW(bch2_fs_time_stats) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); ++ ++#define x(name) \ ++ if (attr == &sysfs_time_stat_##name) \ ++ return bch2_time_stats_print(&c->times[BCH_TIME_##name],\ ++ buf, PAGE_SIZE); ++ BCH_TIME_STATS() ++#undef x ++ ++ return 0; ++} ++ ++STORE(bch2_fs_time_stats) ++{ ++ return size; ++} ++SYSFS_OPS(bch2_fs_time_stats); ++ ++struct attribute *bch2_fs_time_stats_files[] = { 
++#define x(name) \ ++ &sysfs_time_stat_##name, ++ BCH_TIME_STATS() ++#undef x ++ NULL ++}; ++ ++typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, ++ size_t, void *); ++ ++static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ int rw = (private ? 1 : 0); ++ ++ return bucket_last_io(c, bucket(ca, b), rw); ++} ++ ++static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ struct bucket *g = bucket(ca, b); ++ return bucket_sectors_used(g->mark); ++} ++ ++static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ return bucket_gc_gen(ca, b); ++} ++ ++static int unsigned_cmp(const void *_l, const void *_r) ++{ ++ const unsigned *l = _l; ++ const unsigned *r = _r; ++ ++ return cmp_int(*l, *r); ++} ++ ++static ssize_t show_quantiles(struct bch_fs *c, struct bch_dev *ca, ++ char *buf, bucket_map_fn *fn, void *private) ++{ ++ size_t i, n; ++ /* Compute 31 quantiles */ ++ unsigned q[31], *p; ++ ssize_t ret = 0; ++ ++ down_read(&ca->bucket_lock); ++ n = ca->mi.nbuckets; ++ ++ p = vzalloc(n * sizeof(unsigned)); ++ if (!p) { ++ up_read(&ca->bucket_lock); ++ return -ENOMEM; ++ } ++ ++ for (i = ca->mi.first_bucket; i < n; i++) ++ p[i] = fn(c, ca, i, private); ++ ++ sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); ++ up_read(&ca->bucket_lock); ++ ++ while (n && ++ !p[n - 1]) ++ --n; ++ ++ for (i = 0; i < ARRAY_SIZE(q); i++) ++ q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; ++ ++ vfree(p); ++ ++ for (i = 0; i < ARRAY_SIZE(q); i++) ++ ret += scnprintf(buf + ret, PAGE_SIZE - ret, ++ "%u ", q[i]); ++ buf[ret - 1] = '\n'; ++ ++ return ret; ++} ++ ++static ssize_t show_reserve_stats(struct bch_dev *ca, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ enum alloc_reserve i; ++ ++ spin_lock(&ca->fs->freelist_lock); ++ ++ pr_buf(&out, "free_inc:\t%zu\t%zu\n", ++ fifo_used(&ca->free_inc), ++ ca->free_inc.size); ++ ++ for (i = 0; i < RESERVE_NR; i++) ++ pr_buf(&out, "free[%u]:\t%zu\t%zu\n", i, ++ fifo_used(&ca->free[i]), ++ ca->free[i].size); ++ ++ spin_unlock(&ca->fs->freelist_lock); ++ ++ return out.pos - buf; ++} ++ ++static ssize_t show_dev_alloc_debug(struct bch_dev *ca, char *buf) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_dev_usage stats = bch2_dev_usage_read(c, ca); ++ unsigned i, nr[BCH_DATA_NR]; ++ ++ memset(nr, 0, sizeof(nr)); ++ ++ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) ++ nr[c->open_buckets[i].type]++; ++ ++ return scnprintf(buf, PAGE_SIZE, ++ "free_inc: %zu/%zu\n" ++ "free[RESERVE_BTREE]: %zu/%zu\n" ++ "free[RESERVE_MOVINGGC]: %zu/%zu\n" ++ "free[RESERVE_NONE]: %zu/%zu\n" ++ "buckets:\n" ++ " capacity: %llu\n" ++ " alloc: %llu\n" ++ " sb: %llu\n" ++ " journal: %llu\n" ++ " meta: %llu\n" ++ " user: %llu\n" ++ " cached: %llu\n" ++ " erasure coded: %llu\n" ++ " available: %lli\n" ++ "sectors:\n" ++ " sb: %llu\n" ++ " journal: %llu\n" ++ " meta: %llu\n" ++ " user: %llu\n" ++ " cached: %llu\n" ++ " erasure coded: %llu\n" ++ " fragmented: %llu\n" ++ " copygc threshold: %llu\n" ++ "freelist_wait: %s\n" ++ "open buckets: %u/%u (reserved %u)\n" ++ "open_buckets_wait: %s\n" ++ "open_buckets_btree: %u\n" ++ "open_buckets_user: %u\n" ++ "btree reserve cache: %u\n", ++ fifo_used(&ca->free_inc), ca->free_inc.size, ++ fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, ++ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, ++ fifo_used(&ca->free[RESERVE_NONE]), 
ca->free[RESERVE_NONE].size, ++ ca->mi.nbuckets - ca->mi.first_bucket, ++ stats.buckets_alloc, ++ stats.buckets[BCH_DATA_SB], ++ stats.buckets[BCH_DATA_JOURNAL], ++ stats.buckets[BCH_DATA_BTREE], ++ stats.buckets[BCH_DATA_USER], ++ stats.buckets[BCH_DATA_CACHED], ++ stats.buckets_ec, ++ ca->mi.nbuckets - ca->mi.first_bucket - stats.buckets_unavailable, ++ stats.sectors[BCH_DATA_SB], ++ stats.sectors[BCH_DATA_JOURNAL], ++ stats.sectors[BCH_DATA_BTREE], ++ stats.sectors[BCH_DATA_USER], ++ stats.sectors[BCH_DATA_CACHED], ++ stats.sectors_ec, ++ stats.sectors_fragmented, ++ ca->copygc_threshold, ++ c->freelist_wait.list.first ? "waiting" : "empty", ++ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, ++ BTREE_NODE_OPEN_BUCKET_RESERVE, ++ c->open_buckets_wait.list.first ? "waiting" : "empty", ++ nr[BCH_DATA_BTREE], ++ nr[BCH_DATA_USER], ++ c->btree_reserve_cache_nr); ++} ++ ++static const char * const bch2_rw[] = { ++ "read", ++ "write", ++ NULL ++}; ++ ++static ssize_t show_dev_iodone(struct bch_dev *ca, char *buf) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ int rw, i; ++ ++ for (rw = 0; rw < 2; rw++) { ++ pr_buf(&out, "%s:\n", bch2_rw[rw]); ++ ++ for (i = 1; i < BCH_DATA_NR; i++) ++ pr_buf(&out, "%-12s:%12llu\n", ++ bch2_data_types[i], ++ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); ++ } ++ ++ return out.pos - buf; ++} ++ ++SHOW(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ sysfs_printf(uuid, "%pU\n", ca->uuid.b); ++ ++ sysfs_print(bucket_size, bucket_bytes(ca)); ++ sysfs_print(block_size, block_bytes(c)); ++ sysfs_print(first_bucket, ca->mi.first_bucket); ++ sysfs_print(nbuckets, ca->mi.nbuckets); ++ sysfs_print(durability, ca->mi.durability); ++ sysfs_print(discard, ca->mi.discard); ++ ++ if (attr == &sysfs_label) { ++ if (ca->mi.group) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(&out, &c->disk_sb, ++ ca->mi.group - 1); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_has_data) { ++ bch2_flags_to_text(&out, bch2_data_types, ++ bch2_dev_has_data(c, ca)); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ sysfs_pd_controller_show(copy_gc, &ca->copygc_pd); ++ ++ if (attr == &sysfs_cache_replacement_policy) { ++ bch2_string_opt_to_text(&out, ++ bch2_cache_replacement_policies, ++ ca->mi.replacement); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_state_rw) { ++ bch2_string_opt_to_text(&out, bch2_dev_state, ++ ca->mi.state); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_iodone) ++ return show_dev_iodone(ca, buf); ++ ++ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); ++ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); ++ ++ if (attr == &sysfs_io_latency_stats_read) ++ return bch2_time_stats_print(&ca->io_latency[READ], buf, PAGE_SIZE); ++ if (attr == &sysfs_io_latency_stats_write) ++ return bch2_time_stats_print(&ca->io_latency[WRITE], buf, PAGE_SIZE); ++ ++ sysfs_printf(congested, "%u%%", ++ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) ++ * 100 / CONGESTED_MAX); ++ ++ if (attr == &sysfs_bucket_quantiles_last_read) ++ return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 0); ++ if (attr == &sysfs_bucket_quantiles_last_write) ++ return show_quantiles(c, ca, buf, bucket_last_io_fn, (void *) 1); ++ if (attr == &sysfs_bucket_quantiles_fragmentation) ++ return 
show_quantiles(c, ca, buf, bucket_sectors_used_fn, NULL); ++ if (attr == &sysfs_bucket_quantiles_oldest_gen) ++ return show_quantiles(c, ca, buf, bucket_oldest_gen_fn, NULL); ++ ++ if (attr == &sysfs_reserve_stats) ++ return show_reserve_stats(ca, buf); ++ if (attr == &sysfs_alloc_debug) ++ return show_dev_alloc_debug(ca, buf); ++ ++ return 0; ++} ++ ++STORE(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ struct bch_member *mi; ++ ++ sysfs_pd_controller_store(copy_gc, &ca->copygc_pd); ++ ++ if (attr == &sysfs_discard) { ++ bool v = strtoul_or_return(buf); ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if (v != BCH_MEMBER_DISCARD(mi)) { ++ SET_BCH_MEMBER_DISCARD(mi, v); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (attr == &sysfs_cache_replacement_policy) { ++ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); ++ ++ if (v < 0) ++ return v; ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { ++ SET_BCH_MEMBER_REPLACEMENT(mi, v); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (attr == &sysfs_label) { ++ char *tmp; ++ int ret; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ ret = bch2_dev_group_set(c, ca, strim(tmp)); ++ kfree(tmp); ++ if (ret) ++ return ret; ++ } ++ ++ if (attr == &sysfs_wake_allocator) ++ bch2_wake_allocator(ca); ++ ++ return size; ++} ++SYSFS_OPS(bch2_dev); ++ ++struct attribute *bch2_dev_files[] = { ++ &sysfs_uuid, ++ &sysfs_bucket_size, ++ &sysfs_block_size, ++ &sysfs_first_bucket, ++ &sysfs_nbuckets, ++ &sysfs_durability, ++ ++ /* settings: */ ++ &sysfs_discard, ++ &sysfs_cache_replacement_policy, ++ &sysfs_state_rw, ++ &sysfs_label, ++ ++ &sysfs_has_data, ++ &sysfs_iodone, ++ ++ &sysfs_io_latency_read, ++ &sysfs_io_latency_write, ++ &sysfs_io_latency_stats_read, ++ &sysfs_io_latency_stats_write, ++ &sysfs_congested, ++ ++ /* alloc info - other stats: */ ++ &sysfs_bucket_quantiles_last_read, ++ &sysfs_bucket_quantiles_last_write, ++ &sysfs_bucket_quantiles_fragmentation, ++ &sysfs_bucket_quantiles_oldest_gen, ++ ++ &sysfs_reserve_stats, ++ ++ /* debug: */ ++ &sysfs_alloc_debug, ++ &sysfs_wake_allocator, ++ ++ sysfs_pd_controller_files(copy_gc), ++ NULL ++}; ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h +new file mode 100644 +index 000000000000..525fd05d91f7 +--- /dev/null ++++ b/fs/bcachefs/sysfs.h +@@ -0,0 +1,44 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SYSFS_H_ ++#define _BCACHEFS_SYSFS_H_ ++ ++#include ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++struct attribute; ++struct sysfs_ops; ++ ++extern struct attribute *bch2_fs_files[]; ++extern struct attribute *bch2_fs_internal_files[]; ++extern struct attribute *bch2_fs_opts_dir_files[]; ++extern struct attribute *bch2_fs_time_stats_files[]; ++extern struct attribute *bch2_dev_files[]; ++ ++extern struct sysfs_ops bch2_fs_sysfs_ops; ++extern struct sysfs_ops bch2_fs_internal_sysfs_ops; ++extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++extern struct sysfs_ops bch2_dev_sysfs_ops; ++ ++int bch2_opts_create_sysfs_files(struct kobject *); ++ ++#else ++ ++static struct attribute *bch2_fs_files[] = {}; ++static struct attribute *bch2_fs_internal_files[] = {}; ++static struct attribute 
*bch2_fs_opts_dir_files[] = {}; ++static struct attribute *bch2_fs_time_stats_files[] = {}; ++static struct attribute *bch2_dev_files[] = {}; ++ ++static const struct sysfs_ops bch2_fs_sysfs_ops; ++static const struct sysfs_ops bch2_fs_internal_sysfs_ops; ++static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++static const struct sysfs_ops bch2_dev_sysfs_ops; ++ ++static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } ++ ++#endif /* NO_BCACHEFS_SYSFS */ ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +new file mode 100644 +index 000000000000..4dcace650416 +--- /dev/null ++++ b/fs/bcachefs/tests.c +@@ -0,0 +1,725 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_TESTS ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "journal_reclaim.h" ++#include "tests.h" ++ ++#include "linux/kthread.h" ++#include "linux/random.h" ++ ++static void delete_test_keys(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++} ++ ++/* unit tests */ ++ ++static void test_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ BUG_ON(ret); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ BUG_ON(ret); ++ ++ pr_info("deleting once"); ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ pr_info("deleting twice"); ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_delete_written(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ BUG_ON(ret); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ BUG_ON(ret); ++ ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, ++ POS_MIN, 0, k, ret) { ++ if (k.k->p.inode) ++ break; ++ ++ BUG_ON(k.k->p.offset != i++); ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) ++ BUG_ON(k.k->p.offset != --i); ++ ++ BUG_ON(i); ++ ++ 
bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test extents"); ++ ++ for (i = 0; i < nr; i += 8) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 8; ++ k.k.size = 8; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS_MIN, 0, k, ret) { ++ BUG_ON(bkey_start_offset(k.k) != i); ++ i = k.k->p.offset; ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { ++ BUG_ON(k.k->p.offset != i); ++ i = bkey_start_offset(k.k); ++ } ++ ++ BUG_ON(i); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_slots(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i * 2; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ 0, k, ret) { ++ if (k.k->p.inode) ++ break; ++ ++ BUG_ON(k.k->p.offset != i); ++ i += 2; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ ++ BUG_ON(i != nr * 2); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ BTREE_ITER_SLOTS, k, ret) { ++ BUG_ON(k.k->p.offset != i); ++ BUG_ON(bkey_deleted(k.k) != (i & 1)); ++ ++ i++; ++ if (i == nr * 2) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i += 16) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 16; ++ k.k.size = 8; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, ++ 0, k, ret) { ++ BUG_ON(bkey_start_offset(k.k) != i + 8); ++ BUG_ON(k.k->size != 8); ++ i += 16; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS, k, ret) { ++ BUG_ON(bkey_deleted(k.k) != !(i % 16)); ++ ++ BUG_ON(bkey_start_offset(k.k) != i); ++ BUG_ON(k.k->size != 8); ++ i = k.k->p.offset; ++ ++ if (i == nr) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++/* ++ * XXX: we really want to make sure we've got a btree with depth > 0 for these ++ * tests ++ */ ++static void test_peek_end(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, 
BTREE_ID_XATTRS, POS_MIN, 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_peek_end_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ bch2_trans_exit(&trans); ++} ++ ++/* extent unit tests */ ++ ++u64 test_version; ++ ++static void insert_test_extent(struct bch_fs *c, ++ u64 start, u64 end) ++{ ++ struct bkey_i_cookie k; ++ int ret; ++ ++ //pr_info("inserting %llu-%llu v %llu", start, end, test_version); ++ ++ bkey_cookie_init(&k.k_i); ++ k.k_i.k.p.offset = end; ++ k.k_i.k.size = end - start; ++ k.k_i.k.version.lo = test_version++; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++} ++ ++static void __test_extent_overwrite(struct bch_fs *c, ++ u64 e1_start, u64 e1_end, ++ u64 e2_start, u64 e2_end) ++{ ++ insert_test_extent(c, e1_start, e1_end); ++ insert_test_extent(c, e2_start, e2_end); ++ ++ delete_test_keys(c); ++} ++ ++static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 0, 32); ++ __test_extent_overwrite(c, 8, 64, 0, 32); ++} ++ ++static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 32, 64); ++ __test_extent_overwrite(c, 0, 64, 32, 72); ++} ++ ++static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 32, 40); ++} ++ ++static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 32, 64, 0, 64); ++ __test_extent_overwrite(c, 32, 64, 0, 128); ++ __test_extent_overwrite(c, 32, 64, 32, 64); ++ __test_extent_overwrite(c, 32, 64, 32, 128); ++} ++ ++/* perf tests */ ++ ++static u64 test_rand(void) ++{ ++ u64 v; ++#if 0 ++ v = prandom_u32(); ++#else ++ prandom_bytes(&v, sizeof(v)); ++#endif ++ return v; ++} ++ ++static void rand_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct bkey_i_cookie k; ++ int ret; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = test_rand(); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); ++ ++ BUG_ON(ret); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void rand_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ POS(0, test_rand()), 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void rand_mixed(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ POS(0, test_rand()), 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ ++ if (!(i & 3) && k.k) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p = iter->pos; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 
0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ ++ BUG_ON(ret); ++ } ++ ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static int __do_delete(struct btree_trans *trans, struct bpos pos) ++{ ++ struct btree_iter *iter; ++ struct bkey_i delete; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bkey_init(&delete.k); ++ delete.k.p = k.k->p; ++ ++ bch2_trans_update(trans, iter, &delete, 0); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static void rand_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ int ret; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ struct bpos pos = POS(0, test_rand()); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __do_delete(&trans, pos)); ++ BUG_ON(ret); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie insert; ++ int ret; ++ u64 i = 0; ++ ++ bkey_cookie_init(&insert.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ insert.k.p = iter->pos; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &insert.k_i, 0)); ++ ++ BUG_ON(ret); ++ ++ if (++i == nr) ++ break; ++ } ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) ++ ; ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_overwrite(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ BTREE_ITER_INTENT, k, ret) { ++ struct bkey_i_cookie u; ++ ++ bkey_reassemble(&u.k_i, k); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &u.k_i, 0)); ++ ++ BUG_ON(ret); ++ } ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_delete(struct bch_fs *c, u64 nr) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++} ++ ++typedef void (*perf_test_fn)(struct bch_fs *, u64); ++ ++struct test_job { ++ struct bch_fs *c; ++ u64 nr; ++ unsigned nr_threads; ++ perf_test_fn fn; ++ ++ atomic_t ready; ++ wait_queue_head_t ready_wait; ++ ++ atomic_t done; ++ struct completion done_completion; ++ ++ u64 start; ++ u64 finish; ++}; ++ ++static int btree_perf_test_thread(void *data) ++{ ++ struct test_job *j = data; ++ ++ if (atomic_dec_and_test(&j->ready)) { ++ wake_up(&j->ready_wait); ++ j->start = sched_clock(); ++ } else { ++ wait_event(j->ready_wait, !atomic_read(&j->ready)); ++ } ++ ++ j->fn(j->c, j->nr / j->nr_threads); ++ ++ if (atomic_dec_and_test(&j->done)) { ++ j->finish = sched_clock(); ++ complete(&j->done_completion); ++ } ++ ++ return 0; ++} ++ ++void bch2_btree_perf_test(struct bch_fs *c, const char *testname, ++ u64 nr, unsigned nr_threads) ++{ ++ struct test_job j = { .c = c, 
.nr = nr, .nr_threads = nr_threads }; ++ char name_buf[20], nr_buf[20], per_sec_buf[20]; ++ unsigned i; ++ u64 time; ++ ++ atomic_set(&j.ready, nr_threads); ++ init_waitqueue_head(&j.ready_wait); ++ ++ atomic_set(&j.done, nr_threads); ++ init_completion(&j.done_completion); ++ ++#define perf_test(_test) \ ++ if (!strcmp(testname, #_test)) j.fn = _test ++ ++ perf_test(rand_insert); ++ perf_test(rand_lookup); ++ perf_test(rand_mixed); ++ perf_test(rand_delete); ++ ++ perf_test(seq_insert); ++ perf_test(seq_lookup); ++ perf_test(seq_overwrite); ++ perf_test(seq_delete); ++ ++ /* a unit test, not a perf test: */ ++ perf_test(test_delete); ++ perf_test(test_delete_written); ++ perf_test(test_iterate); ++ perf_test(test_iterate_extents); ++ perf_test(test_iterate_slots); ++ perf_test(test_iterate_slots_extents); ++ perf_test(test_peek_end); ++ perf_test(test_peek_end_extents); ++ ++ perf_test(test_extent_overwrite_front); ++ perf_test(test_extent_overwrite_back); ++ perf_test(test_extent_overwrite_middle); ++ perf_test(test_extent_overwrite_all); ++ ++ if (!j.fn) { ++ pr_err("unknown test %s", testname); ++ return; ++ } ++ ++ //pr_info("running test %s:", testname); ++ ++ if (nr_threads == 1) ++ btree_perf_test_thread(&j); ++ else ++ for (i = 0; i < nr_threads; i++) ++ kthread_run(btree_perf_test_thread, &j, ++ "bcachefs perf test[%u]", i); ++ ++ while (wait_for_completion_interruptible(&j.done_completion)) ++ ; ++ ++ time = j.finish - j.start; ++ ++ scnprintf(name_buf, sizeof(name_buf), "%s:", testname); ++ bch2_hprint(&PBUF(nr_buf), nr); ++ bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); ++ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", ++ name_buf, nr_buf, nr_threads, ++ time / NSEC_PER_SEC, ++ time * nr_threads / nr, ++ per_sec_buf); ++} ++ ++#endif /* CONFIG_BCACHEFS_TESTS */ +diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h +new file mode 100644 +index 000000000000..551d0764225e +--- /dev/null ++++ b/fs/bcachefs/tests.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_TEST_H ++#define _BCACHEFS_TEST_H ++ ++struct bch_fs; ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ ++void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); ++ ++#else ++ ++#endif /* CONFIG_BCACHEFS_TESTS */ ++ ++#endif /* _BCACHEFS_TEST_H */ +diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c +new file mode 100644 +index 000000000000..59e8dfa3d245 +--- /dev/null ++++ b/fs/bcachefs/trace.c +@@ -0,0 +1,12 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_types.h" ++#include "buckets.h" ++#include "btree_types.h" ++#include "keylist.h" ++ ++#include ++#include "keylist.h" ++ ++#define CREATE_TRACE_POINTS ++#include +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +new file mode 100644 +index 000000000000..e69d03d1109f +--- /dev/null ++++ b/fs/bcachefs/util.c +@@ -0,0 +1,910 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * random utiility code, for bcache but in theory not specific to bcache ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "eytzinger.h" ++#include "util.h" ++ ++static const char si_units[] = "?kMGTPEZY"; ++ ++static int __bch2_strtoh(const char *cp, u64 *res, ++ u64 t_max, bool t_signed) ++{ ++ bool positive = *cp != '-'; ++ unsigned u; ++ u64 v = 0; ++ ++ if (*cp == '+' || *cp == '-') ++ cp++; ++ ++ if (!isdigit(*cp)) ++ return -EINVAL; ++ ++ do { ++ if (v > U64_MAX / 10) ++ return -ERANGE; ++ v *= 10; ++ if (v > U64_MAX - (*cp - '0')) ++ return -ERANGE; ++ v += *cp - '0'; ++ cp++; ++ } while (isdigit(*cp)); ++ ++ for (u = 1; u < strlen(si_units); u++) ++ if (*cp == si_units[u]) { ++ cp++; ++ goto got_unit; ++ } ++ u = 0; ++got_unit: ++ if (*cp == '\n') ++ cp++; ++ if (*cp) ++ return -EINVAL; ++ ++ if (fls64(v) + u * 10 > 64) ++ return -ERANGE; ++ ++ v <<= u * 10; ++ ++ if (positive) { ++ if (v > t_max) ++ return -ERANGE; ++ } else { ++ if (v && !t_signed) ++ return -ERANGE; ++ ++ if (v > t_max + 1) ++ return -ERANGE; ++ v = -v; ++ } ++ ++ *res = v; ++ return 0; ++} ++ ++#define STRTO_H(name, type) \ ++int bch2_ ## name ## _h(const char *cp, type *res) \ ++{ \ ++ u64 v; \ ++ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ ++ ANYSINT_MAX(type) != ((type) ~0ULL)); \ ++ *res = v; \ ++ return ret; \ ++} ++ ++STRTO_H(strtoint, int) ++STRTO_H(strtouint, unsigned int) ++STRTO_H(strtoll, long long) ++STRTO_H(strtoull, unsigned long long) ++STRTO_H(strtou64, u64) ++ ++void bch2_hprint(struct printbuf *buf, s64 v) ++{ ++ int u, t = 0; ++ ++ for (u = 0; v >= 1024 || v <= -1024; u++) { ++ t = v & ~(~0U << 10); ++ v >>= 10; ++ } ++ ++ pr_buf(buf, "%lli", v); ++ ++ /* ++ * 103 is magic: t is in the range [-1023, 1023] and we want ++ * to turn it into [-9, 9] ++ */ ++ if (u && v < 100 && v > -100) ++ pr_buf(buf, ".%i", t / 103); ++ if (u) ++ pr_buf(buf, "%c", si_units[u]); ++} ++ ++void bch2_string_opt_to_text(struct printbuf *out, ++ const char * const list[], ++ size_t selected) ++{ ++ size_t i; ++ ++ for (i = 0; list[i]; i++) ++ pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); ++} ++ ++void bch2_flags_to_text(struct printbuf *out, ++ const char * const list[], u64 flags) ++{ ++ unsigned bit, nr = 0; ++ bool first = true; ++ ++ if (out->pos != out->end) ++ *out->pos = '\0'; ++ ++ while (list[nr]) ++ nr++; ++ ++ while (flags && (bit = __ffs(flags)) < nr) { ++ if (!first) ++ pr_buf(out, ","); ++ first = false; ++ pr_buf(out, "%s", list[bit]); ++ flags ^= 1 << bit; ++ } ++} ++ ++u64 bch2_read_flag_list(char *opt, const char * const list[]) ++{ ++ u64 ret = 0; ++ char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); ++ ++ if (!d) ++ return -ENOMEM; ++ ++ s = strim(d); ++ ++ while ((p = strsep(&s, ","))) { ++ int flag = match_string(list, -1, p); ++ if (flag < 0) { ++ ret = -1; ++ break; ++ } ++ ++ ret |= 1 << flag; ++ } ++ ++ kfree(d); ++ ++ return ret; ++} ++ ++bool bch2_is_zero(const void *_p, size_t n) ++{ ++ const char *p = _p; ++ size_t i; ++ ++ for (i = 0; i < n; i++) ++ if (p[i]) ++ return false; ++ return true; ++} ++ ++static void bch2_quantiles_update(struct quantiles *q, u64 v) ++{ ++ unsigned i = 0; ++ ++ while (i < ARRAY_SIZE(q->entries)) { ++ struct quantile_entry *e = q->entries + i; ++ ++ if (unlikely(!e->step)) { ++ e->m = v; ++ e->step = max_t(unsigned, v / 2, 1024); ++ } else if (e->m > v) { ++ e->m = e->m >= e->step ++ ? 
e->m - e->step ++ : 0; ++ } else if (e->m < v) { ++ e->m = e->m + e->step > e->m ++ ? e->m + e->step ++ : U32_MAX; ++ } ++ ++ if ((e->m > v ? e->m - v : v - e->m) < e->step) ++ e->step = max_t(unsigned, e->step / 2, 1); ++ ++ if (v >= e->m) ++ break; ++ ++ i = eytzinger0_child(i, v > e->m); ++ } ++} ++ ++/* time stats: */ ++ ++static void bch2_time_stats_update_one(struct time_stats *stats, ++ u64 start, u64 end) ++{ ++ u64 duration, freq; ++ ++ duration = time_after64(end, start) ++ ? end - start : 0; ++ freq = time_after64(end, stats->last_event) ++ ? end - stats->last_event : 0; ++ ++ stats->count++; ++ ++ stats->average_duration = stats->average_duration ++ ? ewma_add(stats->average_duration, duration, 6) ++ : duration; ++ ++ stats->average_frequency = stats->average_frequency ++ ? ewma_add(stats->average_frequency, freq, 6) ++ : freq; ++ ++ stats->max_duration = max(stats->max_duration, duration); ++ ++ stats->last_event = end; ++ ++ bch2_quantiles_update(&stats->quantiles, duration); ++} ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) ++{ ++ unsigned long flags; ++ ++ if (!stats->buffer) { ++ spin_lock_irqsave(&stats->lock, flags); ++ bch2_time_stats_update_one(stats, start, end); ++ ++ if (stats->average_frequency < 32 && ++ stats->count > 1024) ++ stats->buffer = ++ alloc_percpu_gfp(struct time_stat_buffer, ++ GFP_ATOMIC); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ } else { ++ struct time_stat_buffer_entry *i; ++ struct time_stat_buffer *b; ++ ++ preempt_disable(); ++ b = this_cpu_ptr(stats->buffer); ++ ++ BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); ++ b->entries[b->nr++] = (struct time_stat_buffer_entry) { ++ .start = start, ++ .end = end ++ }; ++ ++ if (b->nr == ARRAY_SIZE(b->entries)) { ++ spin_lock_irqsave(&stats->lock, flags); ++ for (i = b->entries; ++ i < b->entries + ARRAY_SIZE(b->entries); ++ i++) ++ bch2_time_stats_update_one(stats, i->start, i->end); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ ++ b->nr = 0; ++ } ++ ++ preempt_enable(); ++ } ++} ++ ++static const struct time_unit { ++ const char *name; ++ u32 nsecs; ++} time_units[] = { ++ { "ns", 1 }, ++ { "us", NSEC_PER_USEC }, ++ { "ms", NSEC_PER_MSEC }, ++ { "sec", NSEC_PER_SEC }, ++}; ++ ++static const struct time_unit *pick_time_units(u64 ns) ++{ ++ const struct time_unit *u; ++ ++ for (u = time_units; ++ u + 1 < time_units + ARRAY_SIZE(time_units) && ++ ns >= u[1].nsecs << 1; ++ u++) ++ ; ++ ++ return u; ++} ++ ++static void pr_time_units(struct printbuf *out, u64 ns) ++{ ++ const struct time_unit *u = pick_time_units(ns); ++ ++ pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); ++} ++ ++size_t bch2_time_stats_print(struct time_stats *stats, char *buf, size_t len) ++{ ++ struct printbuf out = _PBUF(buf, len); ++ const struct time_unit *u; ++ u64 freq = READ_ONCE(stats->average_frequency); ++ u64 q, last_q = 0; ++ int i; ++ ++ pr_buf(&out, "count:\t\t%llu\n", ++ stats->count); ++ pr_buf(&out, "rate:\t\t%llu/sec\n", ++ freq ? 
div64_u64(NSEC_PER_SEC, freq) : 0); ++ ++ pr_buf(&out, "frequency:\t"); ++ pr_time_units(&out, freq); ++ ++ pr_buf(&out, "\navg duration:\t"); ++ pr_time_units(&out, stats->average_duration); ++ ++ pr_buf(&out, "\nmax duration:\t"); ++ pr_time_units(&out, stats->max_duration); ++ ++ i = eytzinger0_first(NR_QUANTILES); ++ u = pick_time_units(stats->quantiles.entries[i].m); ++ ++ pr_buf(&out, "\nquantiles (%s):\t", u->name); ++ eytzinger0_for_each(i, NR_QUANTILES) { ++ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; ++ ++ q = max(stats->quantiles.entries[i].m, last_q); ++ pr_buf(&out, "%llu%s", ++ div_u64(q, u->nsecs), ++ is_last ? "\n" : " "); ++ last_q = q; ++ } ++ ++ return out.pos - buf; ++} ++ ++void bch2_time_stats_exit(struct time_stats *stats) ++{ ++ free_percpu(stats->buffer); ++} ++ ++void bch2_time_stats_init(struct time_stats *stats) ++{ ++ memset(stats, 0, sizeof(*stats)); ++ spin_lock_init(&stats->lock); ++} ++ ++/* ratelimit: */ ++ ++/** ++ * bch2_ratelimit_delay() - return how long to delay until the next time to do ++ * some work ++ * ++ * @d - the struct bch_ratelimit to update ++ * ++ * Returns the amount of time to delay by, in jiffies ++ */ ++u64 bch2_ratelimit_delay(struct bch_ratelimit *d) ++{ ++ u64 now = local_clock(); ++ ++ return time_after64(d->next, now) ++ ? nsecs_to_jiffies(d->next - now) ++ : 0; ++} ++ ++/** ++ * bch2_ratelimit_increment() - increment @d by the amount of work done ++ * ++ * @d - the struct bch_ratelimit to update ++ * @done - the amount of work done, in arbitrary units ++ */ ++void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) ++{ ++ u64 now = local_clock(); ++ ++ d->next += div_u64(done * NSEC_PER_SEC, d->rate); ++ ++ if (time_before64(now + NSEC_PER_SEC, d->next)) ++ d->next = now + NSEC_PER_SEC; ++ ++ if (time_after64(now - NSEC_PER_SEC * 2, d->next)) ++ d->next = now - NSEC_PER_SEC * 2; ++} ++ ++/* pd controller: */ ++ ++/* ++ * Updates pd_controller. Attempts to scale inputed values to units per second. ++ * @target: desired value ++ * @actual: current value ++ * ++ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing ++ * it makes actual go down. 
++ */ ++void bch2_pd_controller_update(struct bch_pd_controller *pd, ++ s64 target, s64 actual, int sign) ++{ ++ s64 proportional, derivative, change; ++ ++ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; ++ ++ if (seconds_since_update == 0) ++ return; ++ ++ pd->last_update = jiffies; ++ ++ proportional = actual - target; ++ proportional *= seconds_since_update; ++ proportional = div_s64(proportional, pd->p_term_inverse); ++ ++ derivative = actual - pd->last_actual; ++ derivative = div_s64(derivative, seconds_since_update); ++ derivative = ewma_add(pd->smoothed_derivative, derivative, ++ (pd->d_term / seconds_since_update) ?: 1); ++ derivative = derivative * pd->d_term; ++ derivative = div_s64(derivative, pd->p_term_inverse); ++ ++ change = proportional + derivative; ++ ++ /* Don't increase rate if not keeping up */ ++ if (change > 0 && ++ pd->backpressure && ++ time_after64(local_clock(), ++ pd->rate.next + NSEC_PER_MSEC)) ++ change = 0; ++ ++ change *= (sign * -1); ++ ++ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, ++ 1, UINT_MAX); ++ ++ pd->last_actual = actual; ++ pd->last_derivative = derivative; ++ pd->last_proportional = proportional; ++ pd->last_change = change; ++ pd->last_target = target; ++} ++ ++void bch2_pd_controller_init(struct bch_pd_controller *pd) ++{ ++ pd->rate.rate = 1024; ++ pd->last_update = jiffies; ++ pd->p_term_inverse = 6000; ++ pd->d_term = 30; ++ pd->d_smooth = pd->d_term; ++ pd->backpressure = 1; ++} ++ ++size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) ++{ ++ /* 2^64 - 1 is 20 digits, plus null byte */ ++ char rate[21]; ++ char actual[21]; ++ char target[21]; ++ char proportional[21]; ++ char derivative[21]; ++ char change[21]; ++ s64 next_io; ++ ++ bch2_hprint(&PBUF(rate), pd->rate.rate); ++ bch2_hprint(&PBUF(actual), pd->last_actual); ++ bch2_hprint(&PBUF(target), pd->last_target); ++ bch2_hprint(&PBUF(proportional), pd->last_proportional); ++ bch2_hprint(&PBUF(derivative), pd->last_derivative); ++ bch2_hprint(&PBUF(change), pd->last_change); ++ ++ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); ++ ++ return sprintf(buf, ++ "rate:\t\t%s/sec\n" ++ "target:\t\t%s\n" ++ "actual:\t\t%s\n" ++ "proportional:\t%s\n" ++ "derivative:\t%s\n" ++ "change:\t\t%s/sec\n" ++ "next io:\t%llims\n", ++ rate, target, actual, proportional, ++ derivative, change, next_io); ++} ++ ++/* misc: */ ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t size) ++{ ++ while (size) { ++ struct page *page = is_vmalloc_addr(base) ++ ? 
vmalloc_to_page(base) ++ : virt_to_page(base); ++ unsigned offset = offset_in_page(base); ++ unsigned len = min_t(size_t, PAGE_SIZE - offset, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, offset)); ++ size -= len; ++ base += len; ++ } ++} ++ ++int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) ++{ ++ while (size) { ++ struct page *page = alloc_page(gfp_mask); ++ unsigned len = min(PAGE_SIZE, size); ++ ++ if (!page) ++ return -ENOMEM; ++ ++ BUG_ON(!bio_add_page(bio, page, len, 0)); ++ size -= len; ++ } ++ ++ return 0; ++} ++ ++size_t bch2_rand_range(size_t max) ++{ ++ size_t rand; ++ ++ if (!max) ++ return 0; ++ ++ do { ++ rand = get_random_long(); ++ rand &= roundup_pow_of_two(max) - 1; ++ } while (rand >= max); ++ ++ return rand; ++} ++ ++void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, dst, iter, dst_iter) { ++ void *dstp = kmap_atomic(bv.bv_page); ++ memcpy(dstp + bv.bv_offset, src, bv.bv_len); ++ kunmap_atomic(dstp); ++ ++ src += bv.bv_len; ++ } ++} ++ ++void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, src, iter, src_iter) { ++ void *srcp = kmap_atomic(bv.bv_page); ++ memcpy(dst, srcp + bv.bv_offset, bv.bv_len); ++ kunmap_atomic(srcp); ++ ++ dst += bv.bv_len; ++ } ++} ++ ++void bch_scnmemcpy(struct printbuf *out, ++ const char *src, size_t len) ++{ ++ size_t n = printbuf_remaining(out); ++ ++ if (n) { ++ n = min(n - 1, len); ++ memcpy(out->pos, src, n); ++ out->pos += n; ++ *out->pos = '\0'; ++ } ++} ++ ++#include "eytzinger.h" ++ ++static int alignment_ok(const void *base, size_t align) ++{ ++ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || ++ ((unsigned long)base & (align - 1)) == 0; ++} ++ ++static void u32_swap(void *a, void *b, size_t size) ++{ ++ u32 t = *(u32 *)a; ++ *(u32 *)a = *(u32 *)b; ++ *(u32 *)b = t; ++} ++ ++static void u64_swap(void *a, void *b, size_t size) ++{ ++ u64 t = *(u64 *)a; ++ *(u64 *)a = *(u64 *)b; ++ *(u64 *)b = t; ++} ++ ++static void generic_swap(void *a, void *b, size_t size) ++{ ++ char t; ++ ++ do { ++ t = *(char *)a; ++ *(char *)a++ = *(char *)b; ++ *(char *)b++ = t; ++ } while (--size > 0); ++} ++ ++static inline int do_cmp(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ size_t l, size_t r) ++{ ++ return cmp_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++static inline void do_swap(void *base, size_t n, size_t size, ++ void (*swap_func)(void *, void *, size_t), ++ size_t l, size_t r) ++{ ++ swap_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++void eytzinger0_sort(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)) ++{ ++ int i, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for (i = n / 2 - 1; i >= 0; --i) { ++ for (r = i; r * 2 + 1 < n; r = c) { ++ c = r * 2 + 1; ++ ++ if (c + 1 < n && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, 
c); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - 1; i > 0; --i) { ++ do_swap(base, n, size, swap_func, 0, i); ++ ++ for (r = 0; r * 2 + 1 < i; r = c) { ++ c = r * 2 + 1; ++ ++ if (c + 1 < i && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, c); ++ } ++ } ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t size)) ++{ ++ /* pre-scale counters for performance */ ++ int i = (num/2 - 1) * size, n = num * size, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for ( ; i >= 0; i -= size) { ++ for (r = i; r * 2 + size < n; r = c) { ++ c = r * 2 + size; ++ if (c < n - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - size; i > 0; i -= size) { ++ swap_func(base, base + i, size); ++ for (r = 0; r * 2 + size < i; r = c) { ++ c = r * 2 + size; ++ if (c < i - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++} ++ ++static void mempool_free_vp(void *element, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ vpfree(element, size); ++} ++ ++static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ return vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) ++{ ++ return size < PAGE_SIZE ++ ? 
mempool_init_kmalloc_pool(pool, min_nr, size) ++ : mempool_init(pool, min_nr, mempool_alloc_vp, ++ mempool_free_vp, (void *) size); ++} ++ ++#if 0 ++void eytzinger1_test(void) ++{ ++ unsigned inorder, eytz, size; ++ ++ pr_info("1 based eytzinger test:"); ++ ++ for (size = 2; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger1_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); ++ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); ++ ++ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); ++ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); ++ ++ inorder = 1; ++ eytzinger1_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger1_last(size) && ++ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++void eytzinger0_test(void) ++{ ++ ++ unsigned inorder, eytz, size; ++ ++ pr_info("0 based eytzinger test:"); ++ ++ for (size = 1; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger0_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); ++ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); ++ ++ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); ++ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); ++ ++ inorder = 0; ++ eytzinger0_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger0_last(size) && ++ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++static inline int cmp_u16(const void *_l, const void *_r, size_t size) ++{ ++ const u16 *l = _l, *r = _r; ++ ++ return (*l > *r) - (*r - *l); ++} ++ ++static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) ++{ ++ int i, c1 = -1, c2 = -1; ++ ssize_t r; ++ ++ r = eytzinger0_find_le(test_array, nr, ++ sizeof(test_array[0]), ++ cmp_u16, &search); ++ if (r >= 0) ++ c1 = test_array[r]; ++ ++ for (i = 0; i < nr; i++) ++ if (test_array[i] <= search && test_array[i] > c2) ++ c2 = test_array[i]; ++ ++ if (c1 != c2) { ++ eytzinger0_for_each(i, nr) ++ pr_info("[%3u] = %12u", i, test_array[i]); ++ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", ++ i, r, c1, c2); ++ } ++} ++ ++void eytzinger0_find_test(void) ++{ ++ unsigned i, nr, allocated = 1 << 12; ++ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); ++ ++ for (nr = 1; nr < allocated; nr++) { ++ pr_info("testing %u elems", nr); ++ ++ get_random_bytes(test_array, nr * sizeof(test_array[0])); ++ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); ++ ++ /* verify array is sorted correctly: */ ++ eytzinger0_for_each(i, nr) ++ BUG_ON(i != eytzinger0_last(nr) && ++ test_array[i] > test_array[eytzinger0_next(i, nr)]); ++ ++ for (i = 0; i < U16_MAX; i += 1 << 12) ++ eytzinger0_find_test_val(test_array, nr, i); ++ ++ for (i = 0; i < nr; i++) { ++ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); ++ eytzinger0_find_test_val(test_array, nr, test_array[i]); ++ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); ++ } ++ } ++ ++ kfree(test_array); ++} ++#endif ++ ++/* ++ * Accumulate percpu counters onto one cpu's copy - only valid when access ++ * 
against any percpu counter is guarded against ++ */ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) ++{ ++ u64 *ret; ++ int cpu; ++ ++ preempt_disable(); ++ ret = this_cpu_ptr(p); ++ preempt_enable(); ++ ++ for_each_possible_cpu(cpu) { ++ u64 *i = per_cpu_ptr(p, cpu); ++ ++ if (i != ret) { ++ acc_u64s(ret, i, nr); ++ memset(i, 0, nr * sizeof(u64)); ++ } ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +new file mode 100644 +index 000000000000..0128daba5970 +--- /dev/null ++++ b/fs/bcachefs/util.h +@@ -0,0 +1,761 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_UTIL_H ++#define _BCACHEFS_UTIL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) ++#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT) ++ ++struct closure; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++#define EBUG_ON(cond) BUG_ON(cond) ++#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) ++#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) ++#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) ++#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) ++#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) ++#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) ++#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) ++#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) ++#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) ++#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) ++ ++#define memcpy(dst, src, len) \ ++({ \ ++ void *_dst = (dst); \ ++ const void *_src = (src); \ ++ size_t _len = (len); \ ++ \ ++ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ ++ (void *) (_dst) + (_len) <= (void *) (_src))); \ ++ memcpy(_dst, _src, _len); \ ++}) ++ ++#else /* DEBUG */ ++ ++#define EBUG_ON(cond) ++#define atomic_dec_bug(v) atomic_dec(v) ++#define atomic_inc_bug(v, i) atomic_inc(v) ++#define atomic_sub_bug(i, v) atomic_sub(i, v) ++#define atomic_add_bug(i, v) atomic_add(i, v) ++#define atomic_long_dec_bug(v) atomic_long_dec(v) ++#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) ++#define atomic64_dec_bug(v) atomic64_dec(v) ++#define atomic64_inc_bug(v, i) atomic64_inc(v) ++#define atomic64_sub_bug(i, v) atomic64_sub(i, v) ++#define atomic64_add_bug(i, v) atomic64_add(i, v) ++ ++#endif ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++#define CPU_BIG_ENDIAN 0 ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++#define CPU_BIG_ENDIAN 1 ++#endif ++ ++/* type hackery */ ++ ++#define type_is_exact(_val, _type) \ ++ __builtin_types_compatible_p(typeof(_val), _type) ++ ++#define type_is(_val, _type) \ ++ (__builtin_types_compatible_p(typeof(_val), _type) || \ ++ __builtin_types_compatible_p(typeof(_val), const _type)) ++ ++/* Userspace doesn't align allocations as nicely as the kernel allocators: */ ++static inline size_t buf_pages(void *p, size_t len) ++{ ++ return DIV_ROUND_UP(len + ++ ((unsigned long) p & (PAGE_SIZE - 1)), ++ PAGE_SIZE); ++} ++ ++static inline void vpfree(void *p, size_t size) ++{ ++ if (is_vmalloc_addr(p)) ++ vfree(p); ++ else ++ free_pages((unsigned long) p, get_order(size)); ++} ++ ++static inline void *vpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, ++ get_order(size)) ?: ++ __vmalloc(size, gfp_mask, PAGE_KERNEL); 
++} ++ ++static inline void kvpfree(void *p, size_t size) ++{ ++ if (size < PAGE_SIZE) ++ kfree(p); ++ else ++ vpfree(p, size); ++} ++ ++static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return size < PAGE_SIZE ++ ? kmalloc(size, gfp_mask) ++ : vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); ++ ++#define HEAP(type) \ ++struct { \ ++ size_t size, used; \ ++ type *data; \ ++} ++ ++#define DECLARE_HEAP(type, name) HEAP(type) name ++ ++#define init_heap(heap, _size, gfp) \ ++({ \ ++ (heap)->used = 0; \ ++ (heap)->size = (_size); \ ++ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ ++ (gfp)); \ ++}) ++ ++#define free_heap(heap) \ ++do { \ ++ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ ++ (heap)->data = NULL; \ ++} while (0) ++ ++#define heap_set_backpointer(h, i, _fn) \ ++do { \ ++ void (*fn)(typeof(h), size_t) = _fn; \ ++ if (fn) \ ++ fn(h, i); \ ++} while (0) ++ ++#define heap_swap(h, i, j, set_backpointer) \ ++do { \ ++ swap((h)->data[i], (h)->data[j]); \ ++ heap_set_backpointer(h, i, set_backpointer); \ ++ heap_set_backpointer(h, j, set_backpointer); \ ++} while (0) ++ ++#define heap_peek(h) \ ++({ \ ++ EBUG_ON(!(h)->used); \ ++ (h)->data[0]; \ ++}) ++ ++#define heap_full(h) ((h)->used == (h)->size) ++ ++#define heap_sift_down(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _c, _j = i; \ ++ \ ++ for (; _j * 2 + 1 < (h)->used; _j = _c) { \ ++ _c = _j * 2 + 1; \ ++ if (_c + 1 < (h)->used && \ ++ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ ++ _c++; \ ++ \ ++ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ ++ break; \ ++ heap_swap(h, _c, _j, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_sift_up(h, i, cmp, set_backpointer) \ ++do { \ ++ while (i) { \ ++ size_t p = (i - 1) / 2; \ ++ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ ++ break; \ ++ heap_swap(h, i, p, set_backpointer); \ ++ i = p; \ ++ } \ ++} while (0) ++ ++#define __heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ size_t _i = (h)->used++; \ ++ (h)->data[_i] = d; \ ++ heap_set_backpointer(h, _i, set_backpointer); \ ++ \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ _i; \ ++}) ++ ++#define heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = !heap_full(h); \ ++ if (_r) \ ++ __heap_add(h, d, cmp, set_backpointer); \ ++ _r; \ ++}) ++ ++#define heap_add_or_replace(h, new, cmp, set_backpointer) \ ++do { \ ++ if (!heap_add(h, new, cmp, set_backpointer) && \ ++ cmp(h, new, heap_peek(h)) >= 0) { \ ++ (h)->data[0] = new; \ ++ heap_set_backpointer(h, 0, set_backpointer); \ ++ heap_sift_down(h, 0, cmp, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_del(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _i = (i); \ ++ \ ++ BUG_ON(_i >= (h)->used); \ ++ (h)->used--; \ ++ heap_swap(h, _i, (h)->used, set_backpointer); \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ heap_sift_down(h, _i, cmp, set_backpointer); \ ++} while (0) ++ ++#define heap_pop(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = (h)->used; \ ++ if (_r) { \ ++ (d) = (h)->data[0]; \ ++ heap_del(h, 0, cmp, set_backpointer); \ ++ } \ ++ _r; \ ++}) ++ ++#define heap_resort(heap, cmp, set_backpointer) \ ++do { \ ++ ssize_t _i; \ ++ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ ++ heap_sift_down(heap, _i, cmp, set_backpointer); \ ++} while (0) ++ ++#define ANYSINT_MAX(t) \ ++ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) ++ ++struct printbuf { ++ char *pos; ++ char *end; ++}; ++ ++static inline size_t 
printbuf_remaining(struct printbuf *buf) ++{ ++ return buf->end - buf->pos; ++} ++ ++#define _PBUF(_buf, _len) \ ++ ((struct printbuf) { \ ++ .pos = _buf, \ ++ .end = _buf + _len, \ ++ }) ++ ++#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) ++ ++#define pr_buf(_out, ...) \ ++do { \ ++ (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ ++ __VA_ARGS__); \ ++} while (0) ++ ++void bch_scnmemcpy(struct printbuf *, const char *, size_t); ++ ++int bch2_strtoint_h(const char *, int *); ++int bch2_strtouint_h(const char *, unsigned int *); ++int bch2_strtoll_h(const char *, long long *); ++int bch2_strtoull_h(const char *, unsigned long long *); ++int bch2_strtou64_h(const char *, u64 *); ++ ++static inline int bch2_strtol_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtoint_h(cp, (int *) res); ++#else ++ return bch2_strtoll_h(cp, (long long *) res); ++#endif ++} ++ ++static inline int bch2_strtoul_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtouint_h(cp, (unsigned int *) res); ++#else ++ return bch2_strtoull_h(cp, (unsigned long long *) res); ++#endif ++} ++ ++#define strtoi_h(cp, res) \ ++ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ ++ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ ++ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ ++ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\ ++ : -EINVAL) ++ ++#define strtoul_safe(cp, var) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = _v; \ ++ _r; \ ++}) ++ ++#define strtoul_safe_clamp(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = clamp_t(typeof(var), _v, min, max); \ ++ _r; \ ++}) ++ ++#define strtoul_safe_restrict(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r && _v >= min && _v <= max) \ ++ var = _v; \ ++ else \ ++ _r = -EINVAL; \ ++ _r; \ ++}) ++ ++#define snprint(buf, size, var) \ ++ snprintf(buf, size, \ ++ type_is(var, int) ? "%i\n" \ ++ : type_is(var, unsigned) ? "%u\n" \ ++ : type_is(var, long) ? "%li\n" \ ++ : type_is(var, unsigned long) ? "%lu\n" \ ++ : type_is(var, s64) ? "%lli\n" \ ++ : type_is(var, u64) ? "%llu\n" \ ++ : type_is(var, char *) ? 
"%s\n" \ ++ : "%i\n", var) ++ ++void bch2_hprint(struct printbuf *, s64); ++ ++bool bch2_is_zero(const void *, size_t); ++ ++void bch2_string_opt_to_text(struct printbuf *, ++ const char * const [], size_t); ++ ++void bch2_flags_to_text(struct printbuf *, const char * const[], u64); ++u64 bch2_read_flag_list(char *, const char * const[]); ++ ++#define NR_QUANTILES 15 ++#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) ++#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) ++#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) ++ ++struct quantiles { ++ struct quantile_entry { ++ u64 m; ++ u64 step; ++ } entries[NR_QUANTILES]; ++}; ++ ++struct time_stat_buffer { ++ unsigned nr; ++ struct time_stat_buffer_entry { ++ u64 start; ++ u64 end; ++ } entries[32]; ++}; ++ ++struct time_stats { ++ spinlock_t lock; ++ u64 count; ++ /* all fields are in nanoseconds */ ++ u64 average_duration; ++ u64 average_frequency; ++ u64 max_duration; ++ u64 last_event; ++ struct quantiles quantiles; ++ ++ struct time_stat_buffer __percpu *buffer; ++}; ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64, u64); ++ ++static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) ++{ ++ __bch2_time_stats_update(stats, start, local_clock()); ++} ++ ++size_t bch2_time_stats_print(struct time_stats *, char *, size_t); ++ ++void bch2_time_stats_exit(struct time_stats *); ++void bch2_time_stats_init(struct time_stats *); ++ ++#define ewma_add(ewma, val, weight) \ ++({ \ ++ typeof(ewma) _ewma = (ewma); \ ++ typeof(weight) _weight = (weight); \ ++ \ ++ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ ++}) ++ ++struct bch_ratelimit { ++ /* Next time we want to do some work, in nanoseconds */ ++ u64 next; ++ ++ /* ++ * Rate at which we want to do work, in units per nanosecond ++ * The units here correspond to the units passed to ++ * bch2_ratelimit_increment() ++ */ ++ unsigned rate; ++}; ++ ++static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) ++{ ++ d->next = local_clock(); ++} ++ ++u64 bch2_ratelimit_delay(struct bch_ratelimit *); ++void bch2_ratelimit_increment(struct bch_ratelimit *, u64); ++ ++struct bch_pd_controller { ++ struct bch_ratelimit rate; ++ unsigned long last_update; ++ ++ s64 last_actual; ++ s64 smoothed_derivative; ++ ++ unsigned p_term_inverse; ++ unsigned d_smooth; ++ unsigned d_term; ++ ++ /* for exporting to sysfs (no effect on behavior) */ ++ s64 last_derivative; ++ s64 last_proportional; ++ s64 last_change; ++ s64 last_target; ++ ++ /* If true, the rate will not increase if bch2_ratelimit_delay() ++ * is not being called often enough. 
*/ ++ bool backpressure; ++}; ++ ++void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); ++void bch2_pd_controller_init(struct bch_pd_controller *); ++size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); ++ ++#define sysfs_pd_controller_attribute(name) \ ++ rw_attribute(name##_rate); \ ++ rw_attribute(name##_rate_bytes); \ ++ rw_attribute(name##_rate_d_term); \ ++ rw_attribute(name##_rate_p_term_inverse); \ ++ read_attribute(name##_rate_debug) ++ ++#define sysfs_pd_controller_files(name) \ ++ &sysfs_##name##_rate, \ ++ &sysfs_##name##_rate_bytes, \ ++ &sysfs_##name##_rate_d_term, \ ++ &sysfs_##name##_rate_p_term_inverse, \ ++ &sysfs_##name##_rate_debug ++ ++#define sysfs_pd_controller_show(name, var) \ ++do { \ ++ sysfs_hprint(name##_rate, (var)->rate.rate); \ ++ sysfs_print(name##_rate_bytes, (var)->rate.rate); \ ++ sysfs_print(name##_rate_d_term, (var)->d_term); \ ++ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ ++ \ ++ if (attr == &sysfs_##name##_rate_debug) \ ++ return bch2_pd_controller_print_debug(var, buf); \ ++} while (0) ++ ++#define sysfs_pd_controller_store(name, var) \ ++do { \ ++ sysfs_strtoul_clamp(name##_rate, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul_clamp(name##_rate_bytes, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ ++ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ ++ (var)->p_term_inverse, 1, INT_MAX); \ ++} while (0) ++ ++#define container_of_or_null(ptr, type, member) \ ++({ \ ++ typeof(ptr) _ptr = ptr; \ ++ _ptr ? container_of(_ptr, type, member) : NULL; \ ++}) ++ ++/* Does linear interpolation between powers of two */ ++static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) ++{ ++ unsigned fract = x & ~(~0 << fract_bits); ++ ++ x >>= fract_bits; ++ x = 1 << x; ++ x += (x * fract) >> fract_bits; ++ ++ return x; ++} ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t); ++int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); ++ ++static inline sector_t bdev_sectors(struct block_device *bdev) ++{ ++ return bdev->bd_inode->i_size >> 9; ++} ++ ++#define closure_bio_submit(bio, cl) \ ++do { \ ++ closure_get(cl); \ ++ submit_bio(bio); \ ++} while (0) ++ ++#define kthread_wait_freezable(cond) \ ++({ \ ++ int _ret = 0; \ ++ while (1) { \ ++ set_current_state(TASK_INTERRUPTIBLE); \ ++ if (kthread_should_stop()) { \ ++ _ret = -1; \ ++ break; \ ++ } \ ++ \ ++ if (cond) \ ++ break; \ ++ \ ++ schedule(); \ ++ try_to_freeze(); \ ++ } \ ++ set_current_state(TASK_RUNNING); \ ++ _ret; \ ++}) ++ ++size_t bch2_rand_range(size_t); ++ ++void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); ++void memcpy_from_bio(void *, struct bio *, struct bvec_iter); ++ ++static inline void memcpy_u64s_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++} ++ ++static inline void __memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("rep ; movsq" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++#endif ++} ++ ++static inline void memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(!(dst >= src + u64s * sizeof(u64) || ++ dst + u64s * sizeof(u64) <= src)); ++ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_down(void 
*dst, const void *src, ++ unsigned u64s) ++{ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void memmove_u64s_down(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst > src); ++ ++ __memmove_u64s_down(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up_small(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s; ++ u64 *src = (u64 *) _src + u64s; ++ ++ while (u64s--) ++ *--dst = *--src; ++} ++ ++static inline void memmove_u64s_up_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up_small(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s - 1; ++ u64 *src = (u64 *) _src + u64s - 1; ++ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("std ;\n" ++ "rep ; movsq\n" ++ "cld ;\n" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ while (u64s--) ++ *dst-- = *src--; ++#endif ++} ++ ++static inline void memmove_u64s_up(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++static inline void memmove_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ if (dst < src) ++ __memmove_u64s_down(dst, src, u64s); ++ else ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */ ++static inline void memset_u64s_tail(void *s, int c, unsigned bytes) ++{ ++ unsigned rem = round_up(bytes, sizeof(u64)) - bytes; ++ ++ memset(s + bytes, c, rem); ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++/* just the memmove, doesn't update @_nr */ ++#define __array_insert_item(_array, _nr, _pos) \ ++ memmove(&(_array)[(_pos) + 1], \ ++ &(_array)[(_pos)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))) ++ ++#define array_insert_item(_array, _nr, _pos, _new_item) \ ++do { \ ++ __array_insert_item(_array, _nr, _pos); \ ++ (_nr)++; \ ++ (_array)[(_pos)] = (_new_item); \ ++} while (0) ++ ++#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ ++do { \ ++ (_nr) -= (_nr_to_remove); \ ++ memmove(&(_array)[(_pos)], \ ++ &(_array)[(_pos) + (_nr_to_remove)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))); \ ++} while (0) ++ ++#define array_remove_item(_array, _nr, _pos) \ ++ array_remove_items(_array, _nr, _pos, 1) ++ ++#define bubble_sort(_base, _nr, _cmp) \ ++do { \ ++ ssize_t _i, _end; \ ++ bool _swapped = true; \ ++ \ ++ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ ++ _swapped = false; \ ++ for (_i = 0; _i < _end; _i++) \ ++ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ ++ swap((_base)[_i], (_base)[_i + 1]); \ ++ _swapped = true; \ ++ } \ ++ } \ ++} while (0) ++ ++static inline u64 percpu_u64_get(u64 __percpu *src) ++{ ++ u64 ret = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ ret += *per_cpu_ptr(src, cpu); ++ return ret; ++} ++ ++static inline void percpu_u64_set(u64 __percpu *dst, u64 src) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ *per_cpu_ptr(dst, cpu) = 0; ++ ++ preempt_disable(); ++ *this_cpu_ptr(dst) = src; ++ preempt_enable(); ++} ++ ++static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) ++{ ++ unsigned i; ++ ++ for (i = 0; i < nr; i++) ++ acc[i] += src[i]; ++} ++ ++static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, 
++ unsigned nr) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ acc_u64s(acc, per_cpu_ptr(src, cpu), nr); ++} ++ ++static inline void percpu_memset(void __percpu *p, int c, size_t bytes) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(p, cpu), c, bytes); ++} ++ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); ++ ++#define cmp_int(l, r) ((l > r) - (l < r)) ++ ++#endif /* _BCACHEFS_UTIL_H */ +diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h +new file mode 100644 +index 000000000000..c099cdc0605f +--- /dev/null ++++ b/fs/bcachefs/vstructs.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _VSTRUCTS_H ++#define _VSTRUCTS_H ++ ++#include "util.h" ++ ++/* ++ * NOTE: we can't differentiate between __le64 and u64 with type_is - this ++ * assumes u64 is little endian: ++ */ ++#define __vstruct_u64s(_s) \ ++({ \ ++ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ ++ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ ++ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \ ++ : ((__force u8) ((_s)->u64s))); \ ++}) ++ ++#define __vstruct_bytes(_type, _u64s) \ ++({ \ ++ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ ++ \ ++ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ ++}) ++ ++#define vstruct_bytes(_s) \ ++ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) ++ ++#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ ++ (round_up(__vstruct_bytes(_type, _u64s), \ ++ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) ++ ++#define vstruct_blocks(_s, _sector_block_bits) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) ++ ++#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ ++ __vstruct_u64s(_s) + (_u64s)) ++ ++#define vstruct_sectors(_s, _sector_block_bits) \ ++ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) ++ ++#define vstruct_next(_s) \ ++ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_last(_s) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_end(_s) \ ++ ((void *) ((_s)->_data + __vstruct_u64s(_s))) ++ ++#define vstruct_for_each(_s, _i) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s); \ ++ _i = vstruct_next(_i)) ++ ++#define vstruct_for_each_safe(_s, _i, _t) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ ++ _i = _t) ++ ++#define vstruct_idx(_s, _idx) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) ++ ++#endif /* _VSTRUCTS_H */ +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +new file mode 100644 +index 000000000000..725a6f3ef8ce +--- /dev/null ++++ b/fs/bcachefs/xattr.c +@@ -0,0 +1,582 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "fs.h" ++#include "rebalance.h" ++#include "str_hash.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); ++ ++static u64 bch2_xattr_hash(const struct bch_hash_info *info, ++ const struct xattr_search_key *key) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); ++ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); ++ ++ return bch2_str_hash_end(&ctx, info); ++} ++ ++static u64 
xattr_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_xattr_hash(info, key); ++} ++ ++static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); ++ ++ return bch2_xattr_hash(info, ++ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); ++} ++ ++static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ const struct xattr_search_key *r = _r; ++ ++ return l.v->x_type != r->type || ++ l.v->x_name_len != r->name.len || ++ memcmp(l.v->x_name, r->name.name, r->name.len); ++} ++ ++static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); ++ ++ return l.v->x_type != r.v->x_type || ++ l.v->x_name_len != r.v->x_name_len || ++ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); ++} ++ ++const struct bch_hash_desc bch2_xattr_hash_desc = { ++ .btree_id = BTREE_ID_XATTRS, ++ .key_type = KEY_TYPE_xattr, ++ .hash_key = xattr_hash_key, ++ .hash_bkey = xattr_hash_bkey, ++ .cmp_key = xattr_cmp_key, ++ .cmp_bkey = xattr_cmp_bkey, ++}; ++ ++const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) ++ return "value too small"; ++ ++ if (bkey_val_u64s(k.k) < ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len))) ++ return "value too small"; ++ ++ if (bkey_val_u64s(k.k) > ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len) + 4)) ++ return "value too big"; ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (!handler) ++ return "invalid type"; ++ ++ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) ++ return "xattr name has invalid characters"; ++ ++ return NULL; ++} ++ ++void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (handler && handler->prefix) ++ pr_buf(out, "%s", handler->prefix); ++ else if (handler) ++ pr_buf(out, "(type %u)", xattr.v->x_type); ++ else ++ pr_buf(out, "(unknown type %u)", xattr.v->x_type); ++ ++ bch_scnmemcpy(out, xattr.v->x_name, ++ xattr.v->x_name_len); ++ pr_buf(out, ":"); ++ bch_scnmemcpy(out, xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++} ++ ++int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, ++ const char *name, void *buffer, size_t size, int type) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(type, name, strlen(name)), ++ 0); ++ if (IS_ERR(iter)) { ++ bch2_trans_exit(&trans); ++ BUG_ON(PTR_ERR(iter) == -EINTR); ++ ++ return PTR_ERR(iter) == -ENOENT ? 
-ENODATA : PTR_ERR(iter); ++ } ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ret = le16_to_cpu(xattr.v->x_val_len); ++ if (buffer) { ++ if (ret > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, xattr_val(xattr.v), ret); ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_xattr_set(struct btree_trans *trans, u64 inum, ++ const struct bch_hash_info *hash_info, ++ const char *name, const void *value, size_t size, ++ int type, int flags) ++{ ++ int ret; ++ ++ if (value) { ++ struct bkey_i_xattr *xattr; ++ unsigned namelen = strlen(name); ++ unsigned u64s = BKEY_U64s + ++ xattr_val_u64s(namelen, size); ++ ++ if (u64s > U8_MAX) ++ return -ERANGE; ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = type; ++ xattr->v.x_name_len = namelen; ++ xattr->v.x_val_len = cpu_to_le16(size); ++ memcpy(xattr->v.x_name, name, namelen); ++ memcpy(xattr_val(&xattr->v), value, size); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, ++ inum, &xattr->k_i, ++ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| ++ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(type, name, strlen(name)); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, ++ hash_info, inum, &search); ++ } ++ ++ if (ret == -ENOENT) ++ ret = flags & XATTR_REPLACE ? -ENODATA : 0; ++ ++ return ret; ++} ++ ++struct xattr_buf { ++ char *buf; ++ size_t len; ++ size_t used; ++}; ++ ++static int __bch2_xattr_emit(const char *prefix, ++ const char *name, size_t name_len, ++ struct xattr_buf *buf) ++{ ++ const size_t prefix_len = strlen(prefix); ++ const size_t total_len = prefix_len + name_len + 1; ++ ++ if (buf->buf) { ++ if (buf->used + total_len > buf->len) ++ return -ERANGE; ++ ++ memcpy(buf->buf + buf->used, prefix, prefix_len); ++ memcpy(buf->buf + buf->used + prefix_len, ++ name, name_len); ++ buf->buf[buf->used + prefix_len + name_len] = '\0'; ++ } ++ ++ buf->used += total_len; ++ return 0; ++} ++ ++static int bch2_xattr_emit(struct dentry *dentry, ++ const struct bch_xattr *xattr, ++ struct xattr_buf *buf) ++{ ++ const struct xattr_handler *handler = ++ bch2_xattr_type_to_handler(xattr->x_type); ++ ++ return handler && (!handler->list || handler->list(dentry)) ++ ? __bch2_xattr_emit(handler->prefix ?: handler->name, ++ xattr->x_name, xattr->x_name_len, buf) ++ : 0; ++} ++ ++static int bch2_xattr_list_bcachefs(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct xattr_buf *buf, ++ bool all) ++{ ++ const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; ++ unsigned id; ++ int ret = 0; ++ u64 v; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ v = bch2_inode_opt_get(&inode->ei_inode, id); ++ if (!v) ++ continue; ++ ++ if (!all && ++ !(inode->ei_inode.bi_fields_set & (1 << id))) ++ continue; ++ ++ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], ++ strlen(bch2_inode_opts[id]), buf); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) ++{ ++ struct bch_fs *c = dentry->d_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; ++ u64 inum = dentry->d_inode->i_ino; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, ++ POS(inum, 0), 0, k, ret) { ++ BUG_ON(k.k->p.inode < inum); ++ ++ if (k.k->p.inode > inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_xattr) ++ continue; ++ ++ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); ++ if (ret) ++ break; ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); ++ if (ret) ++ return ret; ++ ++ return buf.used; ++} ++ ++static int bch2_xattr_get_handler(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); ++} ++ ++static int bch2_xattr_set_handler(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, ++ bch2_xattr_set(&trans, inode->v.i_ino, ++ &inode->ei_str_hash, ++ name, value, size, ++ handler->flags, flags)); ++} ++ ++static const struct xattr_handler bch_xattr_user_handler = { ++ .prefix = XATTR_USER_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_USER, ++}; ++ ++static bool bch2_xattr_trusted_list(struct dentry *dentry) ++{ ++ return capable(CAP_SYS_ADMIN); ++} ++ ++static const struct xattr_handler bch_xattr_trusted_handler = { ++ .prefix = XATTR_TRUSTED_PREFIX, ++ .list = bch2_xattr_trusted_list, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, ++}; ++ ++static const struct xattr_handler bch_xattr_security_handler = { ++ .prefix = XATTR_SECURITY_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_SECURITY, ++}; ++ ++#ifndef NO_BCACHEFS_FS ++ ++static int opt_to_inode_opt(int id) ++{ ++ switch (id) { ++#define x(name, ...) 
\ ++ case Opt_##name: return Inode_opt_##name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ return -1; ++ } ++} ++ ++static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size, ++ bool all) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_opts opts = ++ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); ++ const struct bch_option *opt; ++ int id, inode_opt_id; ++ char buf[512]; ++ struct printbuf out = PBUF(buf); ++ unsigned val_len; ++ u64 v; ++ ++ id = bch2_opt_lookup(name); ++ if (id < 0 || !bch2_opt_is_inode_opt(id)) ++ return -EINVAL; ++ ++ inode_opt_id = opt_to_inode_opt(id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + id; ++ ++ if (!bch2_opt_defined_by_id(&opts, id)) ++ return -ENODATA; ++ ++ if (!all && ++ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) ++ return -ENODATA; ++ ++ v = bch2_opt_get_by_id(&opts, id); ++ bch2_opt_to_text(&out, c, opt, v, 0); ++ ++ val_len = out.pos - buf; ++ ++ if (buffer && val_len > size) ++ return -ERANGE; ++ ++ if (buffer) ++ memcpy(buffer, buf, val_len); ++ return val_len; ++} ++ ++static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, false); ++} ++ ++struct inode_opt_set { ++ int id; ++ u64 v; ++ bool defined; ++}; ++ ++static int inode_opt_set_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_opt_set *s = p; ++ ++ if (s->defined) ++ bi->bi_fields_set |= 1U << s->id; ++ else ++ bi->bi_fields_set &= ~(1U << s->id); ++ ++ bch2_inode_opt_set(bi, s->id, s->v); ++ ++ return 0; ++} ++ ++static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ const struct bch_option *opt; ++ char *buf; ++ struct inode_opt_set s; ++ int opt_id, inode_opt_id, ret; ++ ++ opt_id = bch2_opt_lookup(name); ++ if (opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + opt_id; ++ ++ inode_opt_id = opt_to_inode_opt(opt_id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ s.id = inode_opt_id; ++ ++ if (value) { ++ u64 v = 0; ++ ++ buf = kmalloc(size + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ memcpy(buf, value, size); ++ buf[size] = '\0'; ++ ++ ret = bch2_opt_parse(c, opt, buf, &v); ++ kfree(buf); ++ ++ if (ret < 0) ++ return ret; ++ ++ ret = bch2_opt_check_may_set(c, opt_id, v); ++ if (ret < 0) ++ return ret; ++ ++ s.v = v + 1; ++ s.defined = true; ++ } else { ++ if (!IS_ROOT(dentry)) { ++ struct bch_inode_info *dir = ++ to_bch_ei(d_inode(dentry->d_parent)); ++ ++ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); ++ } else { ++ s.v = 0; ++ } ++ ++ s.defined = false; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ if (inode_opt_id == Inode_opt_project) { ++ ret = bch2_set_projid(c, inode, s.v); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (value && ++ (opt_id == Opt_background_compression || ++ opt_id == Opt_background_target)) ++ 
bch2_rebalance_add_work(c, inode->v.i_blocks); ++ ++ return ret; ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_handler = { ++ .prefix = "bcachefs.", ++ .get = bch2_xattr_bcachefs_get, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++static int bch2_xattr_bcachefs_get_effective( ++ const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, true); ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { ++ .prefix = "bcachefs_effective.", ++ .get = bch2_xattr_bcachefs_get_effective, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++const struct xattr_handler *bch2_xattr_handlers[] = { ++ &bch_xattr_user_handler, ++ &posix_acl_access_xattr_handler, ++ &posix_acl_default_xattr_handler, ++ &bch_xattr_trusted_handler, ++ &bch_xattr_security_handler, ++#ifndef NO_BCACHEFS_FS ++ &bch_xattr_bcachefs_handler, ++ &bch_xattr_bcachefs_effective_handler, ++#endif ++ NULL ++}; ++ ++static const struct xattr_handler *bch_xattr_handler_map[] = { ++ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = ++ &posix_acl_access_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = ++ &posix_acl_default_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, ++ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, ++}; ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) ++{ ++ return type < ARRAY_SIZE(bch_xattr_handler_map) ++ ? bch_xattr_handler_map[type] ++ : NULL; ++} +diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h +new file mode 100644 +index 000000000000..4151065ab853 +--- /dev/null ++++ b/fs/bcachefs/xattr.h +@@ -0,0 +1,49 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_XATTR_H ++#define _BCACHEFS_XATTR_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_xattr_hash_desc; ++ ++const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_xattr (struct bkey_ops) { \ ++ .key_invalid = bch2_xattr_invalid, \ ++ .val_to_text = bch2_xattr_to_text, \ ++} ++ ++static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + ++ name_len + val_len, sizeof(u64)); ++} ++ ++#define xattr_val(_xattr) \ ++ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) ++ ++struct xattr_search_key { ++ u8 type; ++ struct qstr name; ++}; ++ ++#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ ++ { .type = _type, .name = QSTR_INIT(_name, _len) }) ++ ++struct dentry; ++struct xattr_handler; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, ++ const char *, void *, size_t, int); ++ ++int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, ++ const char *, const void *, size_t, int, int); ++ ++ssize_t bch2_xattr_list(struct dentry *, char *, size_t); ++ ++extern const struct xattr_handler *bch2_xattr_handlers[]; ++ ++#endif /* _BCACHEFS_XATTR_H */ +diff --git a/fs/cifs/file.c b/fs/cifs/file.c +index 75ddce8ef456..31d4aff3bbe5 100644 +--- a/fs/cifs/file.c ++++ b/fs/cifs/file.c +@@ -4299,20 +4299,12 @@ readpages_get_pages(struct address_space *mapping, struct list_head 
*page_list, + + page = lru_to_page(page_list); + +- /* +- * Lock the page and put it in the cache. Since no one else +- * should have access to this page, we're safe to simply set +- * PG_locked without checking it first. +- */ +- __SetPageLocked(page); +- rc = add_to_page_cache_locked(page, mapping, +- page->index, gfp); ++ rc = add_to_page_cache(page, mapping, ++ page->index, gfp); + + /* give up if we can't stick it in the cache */ +- if (rc) { +- __ClearPageLocked(page); ++ if (rc) + return rc; +- } + + /* move first page to the tmplist */ + *offset = (loff_t)page->index << PAGE_SHIFT; +@@ -4331,11 +4323,8 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + if (*bytes + PAGE_SIZE > rsize) + break; + +- __SetPageLocked(page); +- if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { +- __ClearPageLocked(page); ++ if (add_to_page_cache(page, mapping, page->index, gfp)) + break; +- } + list_move_tail(&page->lru, tmplist); + (*bytes) += PAGE_SIZE; + expected_index++; +diff --git a/fs/dcache.c b/fs/dcache.c +index b280e07e162b..7a73f5bf9c76 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -3113,9 +3113,8 @@ void d_genocide(struct dentry *parent) + + EXPORT_SYMBOL(d_genocide); + +-void d_tmpfile(struct dentry *dentry, struct inode *inode) ++void d_mark_tmpfile(struct dentry *dentry, struct inode *inode) + { +- inode_dec_link_count(inode); + BUG_ON(dentry->d_name.name != dentry->d_iname || + !hlist_unhashed(&dentry->d_u.d_alias) || + !d_unlinked(dentry)); +@@ -3125,6 +3124,13 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) + (unsigned long long)inode->i_ino); + spin_unlock(&dentry->d_lock); + spin_unlock(&dentry->d_parent->d_lock); ++} ++EXPORT_SYMBOL(d_mark_tmpfile); ++ ++void d_tmpfile(struct dentry *dentry, struct inode *inode) ++{ ++ inode_dec_link_count(inode); ++ d_mark_tmpfile(dentry, inode); + d_instantiate(dentry, inode); + } + EXPORT_SYMBOL(d_tmpfile); +diff --git a/fs/inode.c b/fs/inode.c +index 93d9252a00ab..f2b6d24f3456 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -1503,6 +1503,46 @@ int insert_inode_locked(struct inode *inode) + } + EXPORT_SYMBOL(insert_inode_locked); + ++struct inode *insert_inode_locked2(struct inode *inode) ++{ ++ struct super_block *sb = inode->i_sb; ++ ino_t ino = inode->i_ino; ++ struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ ++ while (1) { ++ struct inode *old = NULL; ++ spin_lock(&inode_hash_lock); ++ hlist_for_each_entry(old, head, i_hash) { ++ if (old->i_ino != ino) ++ continue; ++ if (old->i_sb != sb) ++ continue; ++ spin_lock(&old->i_lock); ++ if (old->i_state & (I_FREEING|I_WILL_FREE)) { ++ spin_unlock(&old->i_lock); ++ continue; ++ } ++ break; ++ } ++ if (likely(!old)) { ++ spin_lock(&inode->i_lock); ++ inode->i_state |= I_NEW | I_CREATING; ++ hlist_add_head(&inode->i_hash, head); ++ spin_unlock(&inode->i_lock); ++ spin_unlock(&inode_hash_lock); ++ return NULL; ++ } ++ __iget(old); ++ spin_unlock(&old->i_lock); ++ spin_unlock(&inode_hash_lock); ++ wait_on_inode(old); ++ if (unlikely(!inode_unhashed(old))) ++ return old; ++ iput(old); ++ } ++} ++EXPORT_SYMBOL(insert_inode_locked2); ++ + int insert_inode_locked4(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h +index 71e387a5fe90..e916f046fed4 100644 +--- a/include/asm-generic/vmlinux.lds.h ++++ b/include/asm-generic/vmlinux.lds.h +@@ -323,6 +323,10 @@ + __start___verbose = .; \ + 
KEEP(*(__verbose)) \ + __stop___verbose = .; \ ++ . = ALIGN(8); \ ++ __start___faults = .; \ ++ *(__faults) \ ++ __stop___faults = .; \ + LIKELY_PROFILE() \ + BRANCH_PROFILE() \ + TRACE_PRINTKS() \ +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 32868fbedc9e..2979f9082a98 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -889,6 +889,7 @@ extern const char *blk_op_str(unsigned int op); + + int blk_status_to_errno(blk_status_t status); + blk_status_t errno_to_blk_status(int errno); ++const char *blk_status_to_str(blk_status_t status); + + int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); + +diff --git a/include/linux/closure.h b/include/linux/closure.h +new file mode 100644 +index 000000000000..abacb91c3565 +--- /dev/null ++++ b/include/linux/closure.h +@@ -0,0 +1,404 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _LINUX_CLOSURE_H ++#define _LINUX_CLOSURE_H ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Closure is perhaps the most overused and abused term in computer science, but ++ * since I've been unable to come up with anything better you're stuck with it ++ * again. ++ * ++ * What are closures? ++ * ++ * They embed a refcount. The basic idea is they count "things that are in ++ * progress" - in flight bios, some other thread that's doing something else - ++ * anything you might want to wait on. ++ * ++ * The refcount may be manipulated with closure_get() and closure_put(). ++ * closure_put() is where many of the interesting things happen, when it causes ++ * the refcount to go to 0. ++ * ++ * Closures can be used to wait on things both synchronously and asynchronously, ++ * and synchronous and asynchronous use can be mixed without restriction. To ++ * wait synchronously, use closure_sync() - you will sleep until your closure's ++ * refcount hits 1. ++ * ++ * To wait asynchronously, use ++ * continue_at(cl, next_function, workqueue); ++ * ++ * passing it, as you might expect, the function to run when nothing is pending ++ * and the workqueue to run that function out of. ++ * ++ * continue_at() also, critically, requires a 'return' immediately following the ++ * location where this macro is referenced, to return to the calling function. ++ * There's good reason for this. ++ * ++ * To use safely closures asynchronously, they must always have a refcount while ++ * they are running owned by the thread that is running them. Otherwise, suppose ++ * you submit some bios and wish to have a function run when they all complete: ++ * ++ * foo_endio(struct bio *bio) ++ * { ++ * closure_put(cl); ++ * } ++ * ++ * closure_init(cl); ++ * ++ * do_stuff(); ++ * closure_get(cl); ++ * bio1->bi_endio = foo_endio; ++ * bio_submit(bio1); ++ * ++ * do_more_stuff(); ++ * closure_get(cl); ++ * bio2->bi_endio = foo_endio; ++ * bio_submit(bio2); ++ * ++ * continue_at(cl, complete_some_read, system_wq); ++ * ++ * If closure's refcount started at 0, complete_some_read() could run before the ++ * second bio was submitted - which is almost always not what you want! More ++ * importantly, it wouldn't be possible to say whether the original thread or ++ * complete_some_read()'s thread owned the closure - and whatever state it was ++ * associated with! ++ * ++ * So, closure_init() initializes a closure's refcount to 1 - and when a ++ * closure_fn is run, the refcount will be reset to 1 first. ++ * ++ * Then, the rule is - if you got the refcount with closure_get(), release it ++ * with closure_put() (i.e, in a bio->bi_endio function). 
If you have a refcount ++ * on a closure because you called closure_init() or you were run out of a ++ * closure - _always_ use continue_at(). Doing so consistently will help ++ * eliminate an entire class of particularly pernicious races. ++ * ++ * Lastly, you might have a wait list dedicated to a specific event, and have no ++ * need for specifying the condition - you just want to wait until someone runs ++ * closure_wake_up() on the appropriate wait list. In that case, just use ++ * closure_wait(). It will return either true or false, depending on whether the ++ * closure was already on a wait list or not - a closure can only be on one wait ++ * list at a time. ++ * ++ * Parents: ++ * ++ * closure_init() takes two arguments - it takes the closure to initialize, and ++ * a (possibly null) parent. ++ * ++ * If parent is non null, the new closure will have a refcount for its lifetime; ++ * a closure is considered to be "finished" when its refcount hits 0 and the ++ * function to run is null. Hence ++ * ++ * continue_at(cl, NULL, NULL); ++ * ++ * returns up the (spaghetti) stack of closures, precisely like normal return ++ * returns up the C stack. continue_at() with non null fn is better thought of ++ * as doing a tail call. ++ * ++ * All this implies that a closure should typically be embedded in a particular ++ * struct (which its refcount will normally control the lifetime of), and that ++ * struct can very much be thought of as a stack frame. ++ */ ++ ++struct closure; ++struct closure_syncer; ++typedef void (closure_fn) (struct closure *); ++extern struct dentry *bcache_debug; ++ ++struct closure_waitlist { ++ struct llist_head list; ++}; ++ ++enum closure_state { ++ /* ++ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by ++ * the thread that owns the closure, and cleared by the thread that's ++ * waking up the closure. ++ * ++ * The rest are for debugging and don't affect behaviour: ++ * ++ * CLOSURE_RUNNING: Set when a closure is running (i.e. by ++ * closure_init() and when closure_put() runs then next function), and ++ * must be cleared before remaining hits 0. Primarily to help guard ++ * against incorrect usage and accidentally transferring references. ++ * continue_at() and closure_return() clear it for you, if you're doing ++ * something unusual you can use closure_set_dead() which also helps ++ * annotate where references are being transferred. 
++ */ ++ ++ CLOSURE_BITS_START = (1U << 26), ++ CLOSURE_DESTRUCTOR = (1U << 26), ++ CLOSURE_WAITING = (1U << 28), ++ CLOSURE_RUNNING = (1U << 30), ++}; ++ ++#define CLOSURE_GUARD_MASK \ ++ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) ++ ++#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) ++#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) ++ ++struct closure { ++ union { ++ struct { ++ struct workqueue_struct *wq; ++ struct closure_syncer *s; ++ struct llist_node list; ++ closure_fn *fn; ++ }; ++ struct work_struct work; ++ }; ++ ++ struct closure *parent; ++ ++ atomic_t remaining; ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++#define CLOSURE_MAGIC_DEAD 0xc054dead ++#define CLOSURE_MAGIC_ALIVE 0xc054a11e ++ ++ unsigned int magic; ++ struct list_head all; ++ unsigned long ip; ++ unsigned long waiting_on; ++#endif ++}; ++ ++void closure_sub(struct closure *cl, int v); ++void closure_put(struct closure *cl); ++void __closure_wake_up(struct closure_waitlist *list); ++bool closure_wait(struct closure_waitlist *list, struct closure *cl); ++void __closure_sync(struct closure *cl); ++ ++/** ++ * closure_sync - sleep until a closure a closure has nothing left to wait on ++ * ++ * Sleeps until the refcount hits 1 - the thread that's running the closure owns ++ * the last refcount. ++ */ ++static inline void closure_sync(struct closure *cl) ++{ ++ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) ++ __closure_sync(cl); ++} ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++ ++void closure_debug_create(struct closure *cl); ++void closure_debug_destroy(struct closure *cl); ++ ++#else ++ ++static inline void closure_debug_create(struct closure *cl) {} ++static inline void closure_debug_destroy(struct closure *cl) {} ++ ++#endif ++ ++static inline void closure_set_ip(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->ip = _THIS_IP_; ++#endif ++} ++ ++static inline void closure_set_ret_ip(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->ip = _RET_IP_; ++#endif ++} ++ ++static inline void closure_set_waiting(struct closure *cl, unsigned long f) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->waiting_on = f; ++#endif ++} ++ ++static inline void closure_set_stopped(struct closure *cl) ++{ ++ atomic_sub(CLOSURE_RUNNING, &cl->remaining); ++} ++ ++static inline void set_closure_fn(struct closure *cl, closure_fn *fn, ++ struct workqueue_struct *wq) ++{ ++ closure_set_ip(cl); ++ cl->fn = fn; ++ cl->wq = wq; ++ /* between atomic_dec() in closure_put() */ ++ smp_mb__before_atomic(); ++} ++ ++static inline void closure_queue(struct closure *cl) ++{ ++ struct workqueue_struct *wq = cl->wq; ++ /** ++ * Changes made to closure, work_struct, or a couple of other structs ++ * may cause work.func not pointing to the right location. ++ */ ++ BUILD_BUG_ON(offsetof(struct closure, fn) ++ != offsetof(struct work_struct, func)); ++ ++ if (wq) { ++ INIT_WORK(&cl->work, cl->work.func); ++ queue_work(wq, &cl->work); ++ } else ++ cl->fn(cl); ++} ++ ++/** ++ * closure_get - increment a closure's refcount ++ */ ++static inline void closure_get(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ BUG_ON((atomic_inc_return(&cl->remaining) & ++ CLOSURE_REMAINING_MASK) <= 1); ++#else ++ atomic_inc(&cl->remaining); ++#endif ++} ++ ++/** ++ * closure_init - Initialize a closure, setting the refcount to 1 ++ * @cl: closure to initialize ++ * @parent: parent of the new closure. cl will take a refcount on it for its ++ * lifetime; may be NULL. 
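++ *
++ * A minimal sketch of the non-NULL parent case (op and parent_cl are
++ * hypothetical names): a closure embedded in an operation, tied to the
++ * caller's closure:
++ *
++ *   closure_init(&op->cl, parent_cl);
++ *
++ * parent_cl will then not finish until op->cl has finished.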
++ */ ++static inline void closure_init(struct closure *cl, struct closure *parent) ++{ ++ cl->fn = NULL; ++ cl->parent = parent; ++ if (parent) ++ closure_get(parent); ++ ++ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); ++ ++ closure_debug_create(cl); ++ closure_set_ip(cl); ++} ++ ++static inline void closure_init_stack(struct closure *cl) ++{ ++ memset(cl, 0, sizeof(struct closure)); ++ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); ++} ++ ++/** ++ * closure_wake_up - wake up all closures on a wait list, ++ * with memory barrier ++ */ ++static inline void closure_wake_up(struct closure_waitlist *list) ++{ ++ /* Memory barrier for the wait list */ ++ smp_mb(); ++ __closure_wake_up(list); ++} ++ ++/** ++ * continue_at - jump to another function with barrier ++ * ++ * After @cl is no longer waiting on anything (i.e. all outstanding refs have ++ * been dropped with closure_put()), it will resume execution at @fn running out ++ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). ++ * ++ * This is because after calling continue_at() you no longer have a ref on @cl, ++ * and whatever @cl owns may be freed out from under you - a running closure fn ++ * has a ref on its own closure which continue_at() drops. ++ * ++ * Note you are expected to immediately return after using this macro. ++ */ ++#define continue_at(_cl, _fn, _wq) \ ++do { \ ++ set_closure_fn(_cl, _fn, _wq); \ ++ closure_sub(_cl, CLOSURE_RUNNING + 1); \ ++} while (0) ++ ++/** ++ * closure_return - finish execution of a closure ++ * ++ * This is used to indicate that @cl is finished: when all outstanding refs on ++ * @cl have been dropped @cl's ref on its parent closure (as passed to ++ * closure_init()) will be dropped, if one was specified - thus this can be ++ * thought of as returning to the parent closure. ++ */ ++#define closure_return(_cl) continue_at((_cl), NULL, NULL) ++ ++/** ++ * continue_at_nobarrier - jump to another function without barrier ++ * ++ * Causes @fn to be executed out of @cl, in @wq context (or called directly if ++ * @wq is NULL). ++ * ++ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, ++ * thus it's not safe to touch anything protected by @cl after a ++ * continue_at_nobarrier(). ++ */ ++#define continue_at_nobarrier(_cl, _fn, _wq) \ ++do { \ ++ closure_set_ip(_cl); \ ++ if (_wq) { \ ++ INIT_WORK(&(_cl)->work, (void *) _fn); \ ++ queue_work((_wq), &(_cl)->work); \ ++ } else { \ ++ (_fn)(_cl); \ ++ } \ ++} while (0) ++ ++/** ++ * closure_return_with_destructor - finish execution of a closure, ++ * with destructor ++ * ++ * Works like closure_return(), except @destructor will be called when all ++ * outstanding refs on @cl have been dropped; @destructor may be used to safely ++ * free the memory occupied by @cl, and it is called with the ref on the parent ++ * closure still held - so @destructor could safely return an item to a ++ * freelist protected by @cl's parent. ++ */ ++#define closure_return_with_destructor(_cl, _destructor) \ ++do { \ ++ set_closure_fn(_cl, _destructor, NULL); \ ++ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ ++} while (0) ++ ++/** ++ * closure_call - execute @fn out of a new, uninitialized closure ++ * ++ * Typically used when running out of one closure, and we want to run @fn ++ * asynchronously out of a new closure - @parent will then wait for @cl to ++ * finish. 
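++ *
++ * Minimal sketch (op, do_io_work and io_wq are hypothetical names):
++ *
++ *   closure_call(&op->cl, do_io_work, io_wq, cl);
++ *
++ * do_io_work() runs out of io_wq owning the ref on op->cl and should end with
++ * continue_at() or closure_return(); cl will not finish until op->cl does.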
++ */ ++static inline void closure_call(struct closure *cl, closure_fn fn, ++ struct workqueue_struct *wq, ++ struct closure *parent) ++{ ++ closure_init(cl, parent); ++ continue_at_nobarrier(cl, fn, wq); ++} ++ ++#define __closure_wait_event(waitlist, _cond) \ ++do { \ ++ struct closure cl; \ ++ \ ++ closure_init_stack(&cl); \ ++ \ ++ while (1) { \ ++ closure_wait(waitlist, &cl); \ ++ if (_cond) \ ++ break; \ ++ closure_sync(&cl); \ ++ } \ ++ closure_wake_up(waitlist); \ ++ closure_sync(&cl); \ ++} while (0) ++ ++#define closure_wait_event(waitlist, _cond) \ ++do { \ ++ if (!(_cond)) \ ++ __closure_wait_event(waitlist, _cond); \ ++} while (0) ++ ++#endif /* _LINUX_CLOSURE_H */ +diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h +index cdf016596659..d3ab422fd4bf 100644 +--- a/include/linux/compiler_attributes.h ++++ b/include/linux/compiler_attributes.h +@@ -270,4 +270,9 @@ + */ + #define __weak __attribute__((__weak__)) + ++/* ++ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute ++ */ ++#define __flatten __attribute__((flatten)) ++ + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index c1488cc84fd9..3d6c4102ecc1 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -254,6 +254,7 @@ extern struct dentry * d_make_root(struct inode *); + /* - the ramfs-type tree */ + extern void d_genocide(struct dentry *); + ++extern void d_mark_tmpfile(struct dentry *, struct inode *); + extern void d_tmpfile(struct dentry *, struct inode *); + + extern struct dentry *d_find_alias(struct inode *); +diff --git a/include/linux/dynamic_fault.h b/include/linux/dynamic_fault.h +new file mode 100644 +index 000000000000..6e7bb56ae8b4 +--- /dev/null ++++ b/include/linux/dynamic_fault.h +@@ -0,0 +1,117 @@ ++#ifndef _DYNAMIC_FAULT_H ++#define _DYNAMIC_FAULT_H ++ ++#include ++#include ++#include ++ ++enum dfault_enabled { ++ DFAULT_DISABLED, ++ DFAULT_ENABLED, ++ DFAULT_ONESHOT, ++}; ++ ++union dfault_state { ++ struct { ++ unsigned enabled:2; ++ unsigned count:30; ++ }; ++ ++ struct { ++ unsigned v; ++ }; ++}; ++ ++/* ++ * An instance of this structure is created in a special ++ * ELF section at every dynamic fault callsite. At runtime, ++ * the special section is treated as an array of these. ++ */ ++struct _dfault { ++ const char *modname; ++ const char *function; ++ const char *filename; ++ const char *class; ++ ++ const u16 line; ++ ++ unsigned frequency; ++ union dfault_state state; ++ ++ struct static_key enabled; ++} __aligned(8); ++ ++ ++#ifdef CONFIG_DYNAMIC_FAULT ++ ++int dfault_add_module(struct _dfault *tab, unsigned int n, const char *mod); ++int dfault_remove_module(char *mod_name); ++bool __dynamic_fault_enabled(struct _dfault *); ++ ++#define dynamic_fault(_class) \ ++({ \ ++ static struct _dfault descriptor \ ++ __used __aligned(8) __attribute__((section("__faults"))) = { \ ++ .modname = KBUILD_MODNAME, \ ++ .function = __func__, \ ++ .filename = __FILE__, \ ++ .line = __LINE__, \ ++ .class = _class, \ ++ }; \ ++ \ ++ static_key_false(&descriptor.enabled) && \ ++ __dynamic_fault_enabled(&descriptor); \ ++}) ++ ++#define memory_fault() dynamic_fault("memory") ++#define race_fault() dynamic_fault("race") ++ ++#define kmalloc(...) \ ++ (memory_fault() ? NULL : kmalloc(__VA_ARGS__)) ++#define kzalloc(...) \ ++ (memory_fault() ? NULL : kzalloc(__VA_ARGS__)) ++#define krealloc(...) \ ++ (memory_fault() ? 
NULL : krealloc(__VA_ARGS__)) ++ ++#define mempool_alloc(pool, gfp_mask) \ ++ ((!gfpflags_allow_blocking(gfp_mask) && memory_fault()) \ ++ ? NULL : mempool_alloc(pool, gfp_mask)) ++ ++#define __get_free_pages(...) \ ++ (memory_fault() ? 0 : __get_free_pages(__VA_ARGS__)) ++#define alloc_pages_node(...) \ ++ (memory_fault() ? NULL : alloc_pages_node(__VA_ARGS__)) ++#define alloc_pages_nodemask(...) \ ++ (memory_fault() ? NULL : alloc_pages_nodemask(__VA_ARGS__)) ++ ++#define bio_alloc_bioset(gfp_mask, ...) \ ++ ((!gfpflags_allow_blocking(gfp_mask) && memory_fault()) \ ++ ? NULL : bio_alloc_bioset(gfp_mask, __VA_ARGS__)) ++ ++#define bio_clone(bio, gfp_mask) \ ++ ((!gfpflags_allow_blocking(gfp_mask) && memory_fault()) \ ++ ? NULL : bio_clone(bio, gfp_mask)) ++ ++#define bio_clone_bioset(bio, gfp_mask, bs) \ ++ ((!gfpflags_allow_blocking(gfp_mask) && memory_fault()) \ ++ ? NULL : bio_clone_bioset(bio, gfp_mask, bs)) ++ ++#define bio_kmalloc(...) \ ++ (memory_fault() ? NULL : bio_kmalloc(__VA_ARGS__)) ++#define bio_clone_kmalloc(...) \ ++ (memory_fault() ? NULL : bio_clone_kmalloc(__VA_ARGS__)) ++ ++#define bio_iov_iter_get_pages(...) \ ++ (memory_fault() ? -ENOMEM : bio_iov_iter_get_pages(__VA_ARGS__)) ++ ++#else /* CONFIG_DYNAMIC_FAULT */ ++ ++#define dfault_add_module(tab, n, modname) 0 ++#define dfault_remove_module(mod) 0 ++#define dynamic_fault(_class) 0 ++#define memory_fault() 0 ++#define race_fault() 0 ++ ++#endif /* CONFIG_DYNAMIC_FAULT */ ++ ++#endif +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 45cc10cdf6dd..51f2268a3eaa 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3072,6 +3072,7 @@ extern struct inode *find_inode_nowait(struct super_block *, + void *data); + extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); + extern int insert_inode_locked(struct inode *); ++extern struct inode *insert_inode_locked2(struct inode *); + #ifdef CONFIG_DEBUG_LOCK_ALLOC + extern void lockdep_annotate_inode_mutex_key(struct inode *inode); + #else +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index a8f7bd8ea1c6..2b41ba4377ec 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -605,32 +605,21 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size) + return 0; + } + +-int add_to_page_cache_locked(struct page *page, struct address_space *mapping, +- pgoff_t index, gfp_t gfp_mask); ++int add_to_page_cache(struct page *page, struct address_space *mapping, ++ pgoff_t index, gfp_t gfp_mask); + int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t index, gfp_t gfp_mask); ++int add_to_page_cache_lru_vec(struct address_space *mapping, ++ struct page **pages, ++ unsigned nr_pages, ++ pgoff_t offset, gfp_t gfp_mask); ++ + extern void delete_from_page_cache(struct page *page); + extern void __delete_from_page_cache(struct page *page, void *shadow); + int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); + void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec); + +-/* +- * Like add_to_page_cache_locked, but used to add newly allocated pages: +- * the page is new, so we can just run __SetPageLocked() against it. 
+- */ +-static inline int add_to_page_cache(struct page *page, +- struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) +-{ +- int error; +- +- __SetPageLocked(page); +- error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); +- if (unlikely(error)) +- __ClearPageLocked(page); +- return error; +-} +- + static inline unsigned long dir_pages(struct inode *inode) + { + return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4418f5cb8324..3f99f17a095b 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -45,6 +45,7 @@ struct io_context; + struct mempolicy; + struct nameidata; + struct nsproxy; ++struct pagecache_lock; + struct perf_event_context; + struct pid_namespace; + struct pipe_inode_info; +@@ -734,6 +735,7 @@ struct task_struct { + + struct mm_struct *mm; + struct mm_struct *active_mm; ++ struct address_space *faults_disabled_mapping; + + /* Per-thread vma caching: */ + struct vmacache vmacache; +diff --git a/include/linux/six.h b/include/linux/six.h +new file mode 100644 +index 000000000000..a16e94f482e9 +--- /dev/null ++++ b/include/linux/six.h +@@ -0,0 +1,197 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _LINUX_SIX_H ++#define _LINUX_SIX_H ++ ++/* ++ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw ++ * semaphores, except with a third intermediate state, intent. Basic operations ++ * are: ++ * ++ * six_lock_read(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * ++ * six_lock_intent(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * ++ * Intent locks block other intent locks, but do not block read locks, and you ++ * must have an intent lock held before taking a write lock, like so: ++ * ++ * six_lock_intent(&foo->lock); ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * Other operations: ++ * ++ * six_trylock_read() ++ * six_trylock_intent() ++ * six_trylock_write() ++ * ++ * six_lock_downgrade(): convert from intent to read ++ * six_lock_tryupgrade(): attempt to convert from read to intent ++ * ++ * Locks also embed a sequence number, which is incremented when the lock is ++ * locked or unlocked for write. The current sequence number can be grabbed ++ * while a lock is held from lock->state.seq; then, if you drop the lock you can ++ * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock ++ * iff it hasn't been locked for write in the meantime. ++ * ++ * There are also operations that take the lock type as a parameter, where the ++ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: ++ * ++ * six_lock_type(lock, type) ++ * six_unlock_type(lock, type) ++ * six_relock(lock, type, seq) ++ * six_trylock_type(lock, type) ++ * six_trylock_convert(lock, from, to) ++ * ++ * A lock may be held multiple types by the same thread (for read or intent, ++ * not write). However, the six locks code does _not_ implement the actual ++ * recursive checks itself though - rather, if your code (e.g. btree iterator ++ * code) knows that the current thread already has a lock held, and for the ++ * correct type, six_lock_increment() may be used to bump up the counter for ++ * that type - the only effect is that one more call to unlock will be required ++ * before the lock is unlocked. 
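++ *
++ * A minimal sketch of the seq/relock pattern described above (struct foo is
++ * hypothetical; the three-argument lock calls follow the declarations below,
++ * and a NULL should_sleep callback is assumed to be allowed):
++ *
++ *   six_lock_read(&foo->lock, NULL, NULL);
++ *   u32 seq = foo->lock.state.seq;
++ *   six_unlock_read(&foo->lock);
++ *
++ *   // do work that cannot be done while holding the lock
++ *
++ *   if (!six_relock_read(&foo->lock, seq)) {
++ *           // the lock was taken for write in the meantime: take it again
++ *           // and revalidate whatever was read under the old lock
++ *           six_lock_read(&foo->lock, NULL, NULL);
++ *   }
++ *   six_unlock_read(&foo->lock);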
++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define SIX_LOCK_SEPARATE_LOCKFNS ++ ++union six_lock_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ /* for waitlist_bitnr() */ ++ unsigned long l; ++ }; ++ ++ struct { ++ unsigned read_lock:28; ++ unsigned intent_lock:1; ++ unsigned waiters:3; ++ /* ++ * seq works much like in seqlocks: it's incremented every time ++ * we lock and unlock for write. ++ * ++ * If it's odd write lock is held, even unlocked. ++ * ++ * Thus readers can unlock, and then lock again later iff it ++ * hasn't been modified in the meantime. ++ */ ++ u32 seq; ++ }; ++}; ++ ++enum six_lock_type { ++ SIX_LOCK_read, ++ SIX_LOCK_intent, ++ SIX_LOCK_write, ++}; ++ ++struct six_lock { ++ union six_lock_state state; ++ unsigned intent_lock_recurse; ++ struct task_struct *owner; ++ struct optimistic_spin_queue osq; ++ ++ raw_spinlock_t wait_lock; ++ struct list_head wait_list[2]; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); ++ ++static __always_inline void __six_lock_init(struct six_lock *lock, ++ const char *name, ++ struct lock_class_key *key) ++{ ++ atomic64_set(&lock->state.counter, 0); ++ raw_spin_lock_init(&lock->wait_lock); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++} ++ ++#define six_lock_init(lock) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __six_lock_init((lock), #lock, &__key); \ ++} while (0) ++ ++#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *); \ ++bool six_relock_##type(struct six_lock *, u32); \ ++int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ ++void six_unlock_##type(struct six_lock *); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++#undef __SIX_LOCK ++ ++#define SIX_LOCK_DISPATCH(type, fn, ...) 
\ ++ switch (type) { \ ++ case SIX_LOCK_read: \ ++ return fn##_read(__VA_ARGS__); \ ++ case SIX_LOCK_intent: \ ++ return fn##_intent(__VA_ARGS__); \ ++ case SIX_LOCK_write: \ ++ return fn##_write(__VA_ARGS__); \ ++ default: \ ++ BUG(); \ ++ } ++ ++static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_trylock, lock); ++} ++ ++static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ SIX_LOCK_DISPATCH(type, six_relock, lock, seq); ++} ++ ++static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); ++} ++ ++static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_unlock, lock); ++} ++ ++void six_lock_downgrade(struct six_lock *); ++bool six_lock_tryupgrade(struct six_lock *); ++bool six_trylock_convert(struct six_lock *, enum six_lock_type, ++ enum six_lock_type); ++ ++void six_lock_increment(struct six_lock *, enum six_lock_type); ++ ++void six_lock_wakeup_all(struct six_lock *); ++ ++#endif /* _LINUX_SIX_H */ +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +new file mode 100644 +index 000000000000..bafbccafae30 +--- /dev/null ++++ b/include/trace/events/bcachefs.h +@@ -0,0 +1,664 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM bcachefs ++ ++#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_BCACHE_H ++ ++#include ++ ++DECLARE_EVENT_CLASS(bpos, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = p->inode; ++ __entry->offset = p->offset; ++ ), ++ ++ TP_printk("%llu:%llu", __entry->inode, __entry->offset) ++); ++ ++DECLARE_EVENT_CLASS(bkey, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, size ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = k->p.inode; ++ __entry->offset = k->p.offset; ++ __entry->size = k->size; ++ ), ++ ++ TP_printk("%llu:%llu len %u", __entry->inode, ++ __entry->offset, __entry->size) ++); ++ ++DECLARE_EVENT_CLASS(bch_fs, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU", __entry->uuid) ++); ++ ++DECLARE_EVENT_CLASS(bio, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(sector_t, sector ) ++ __field(unsigned int, nr_sector ) ++ __array(char, rwbs, 6 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = bio->bi_disk ? 
bio_dev(bio) : 0; ++ __entry->sector = bio->bi_iter.bi_sector; ++ __entry->nr_sector = bio->bi_iter.bi_size >> 9; ++ blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ++ ), ++ ++ TP_printk("%d,%d %s %llu + %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, ++ (unsigned long long)__entry->sector, __entry->nr_sector) ++); ++ ++/* io.c: */ ++ ++DEFINE_EVENT(bio, read_split, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_bounce, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_retry, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, promote, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++/* Journal */ ++ ++DEFINE_EVENT(bch_fs, journal_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, journal_entry_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bio, journal_write, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++/* bset.c: */ ++ ++DEFINE_EVENT(bpos, bkey_pack_pos_fail, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p) ++); ++ ++/* Btree */ ++ ++DECLARE_EVENT_CLASS(btree_node, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u8, level ) ++ __field(u8, id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->level = b->c.level; ++ __entry->id = b->c.btree_id; ++ __entry->inode = b->key.k.p.inode; ++ __entry->offset = b->key.k.p.offset; ++ ), ++ ++ TP_printk("%pU %u id %u %llu:%llu", ++ __entry->uuid, __entry->level, __entry->id, ++ __entry->inode, __entry->offset) ++); ++ ++DEFINE_EVENT(btree_node, btree_read, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_write, ++ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), ++ TP_ARGS(b, bytes, sectors), ++ ++ TP_STRUCT__entry( ++ __field(enum btree_node_type, type) ++ __field(unsigned, bytes ) ++ __field(unsigned, sectors ) ++ ), ++ ++ TP_fast_assign( ++ __entry->type = btree_node_type(b); ++ __entry->bytes = bytes; ++ __entry->sectors = sectors; ++ ), ++ ++ TP_printk("bkey type %u bytes %u sectors %u", ++ __entry->type , __entry->bytes, __entry->sectors) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_alloc, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_free, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_reap, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU", __entry->uuid) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++TRACE_EVENT(btree_reserve_get_fail, ++ TP_PROTO(struct bch_fs *c, size_t required, struct closure 
*cl), ++ TP_ARGS(c, required, cl), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(size_t, required ) ++ __field(struct closure *, cl ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->required = required; ++ __entry->cl = cl; ++ ), ++ ++ TP_printk("%pU required %zu by %p", __entry->uuid, ++ __entry->required, __entry->cl) ++); ++ ++TRACE_EVENT(btree_insert_key, ++ TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), ++ TP_ARGS(c, b, k), ++ ++ TP_STRUCT__entry( ++ __field(u8, id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, size ) ++ ), ++ ++ TP_fast_assign( ++ __entry->id = b->c.btree_id; ++ __entry->inode = k->k.p.inode; ++ __entry->offset = k->k.p.offset; ++ __entry->size = k->k.size; ++ ), ++ ++ TP_printk("btree %u: %llu:%llu len %u", __entry->id, ++ __entry->inode, __entry->offset, __entry->size) ++); ++ ++DEFINE_EVENT(btree_node, btree_split, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_compact, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_merge, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_set_root, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++/* Garbage collection */ ++ ++DEFINE_EVENT(btree_node, btree_gc_coalesce, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_gc_coalesce_fail, ++ TP_PROTO(struct bch_fs *c, int reason), ++ TP_ARGS(c, reason), ++ ++ TP_STRUCT__entry( ++ __field(u8, reason ) ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->reason = reason; ++ memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU: %u", __entry->uuid, __entry->reason) ++); ++ ++DEFINE_EVENT(btree_node, btree_gc_rewrite_node, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(bch_fs, gc_start, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_end, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_coalesce_start, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_coalesce_end, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++/* Allocator */ ++ ++TRACE_EVENT(alloc_batch, ++ TP_PROTO(struct bch_dev *ca, size_t free, size_t total), ++ TP_ARGS(ca, free, total), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(size_t, free ) ++ __field(size_t, total ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, ca->uuid.b, 16); ++ __entry->free = free; ++ __entry->total = total; ++ ), ++ ++ TP_printk("%pU free %zu total %zu", ++ __entry->uuid, __entry->free, __entry->total) ++); ++ ++TRACE_EVENT(invalidate, ++ TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), ++ TP_ARGS(ca, offset, sectors), ++ ++ TP_STRUCT__entry( ++ __field(unsigned, sectors ) ++ __field(dev_t, dev ) ++ __field(__u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->offset = offset, ++ __entry->sectors = sectors; ++ ), ++ ++ TP_printk("invalidated %u sectors at %d,%d sector=%llu", ++ __entry->sectors, MAJOR(__entry->dev), ++ MINOR(__entry->dev), 
__entry->offset) ++); ++ ++DEFINE_EVENT(bch_fs, rescale_prios, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DECLARE_EVENT_CLASS(bucket_alloc, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16) ++ __field(enum alloc_reserve, reserve ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, ca->uuid.b, 16); ++ __entry->reserve = reserve; ++ ), ++ ++ TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, bucket_alloc, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++/* Moving IO */ ++ ++DEFINE_EVENT(bkey, move_extent, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_alloc_fail, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_race, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++TRACE_EVENT(move_data, ++ TP_PROTO(struct bch_fs *c, u64 sectors_moved, ++ u64 keys_moved), ++ TP_ARGS(c, sectors_moved, keys_moved), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, sectors_moved ) ++ __field(u64, keys_moved ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->sectors_moved = sectors_moved; ++ __entry->keys_moved = keys_moved; ++ ), ++ ++ TP_printk("%pU sectors_moved %llu keys_moved %llu", ++ __entry->uuid, __entry->sectors_moved, __entry->keys_moved) ++); ++ ++TRACE_EVENT(copygc, ++ TP_PROTO(struct bch_dev *ca, ++ u64 sectors_moved, u64 sectors_not_moved, ++ u64 buckets_moved, u64 buckets_not_moved), ++ TP_ARGS(ca, ++ sectors_moved, sectors_not_moved, ++ buckets_moved, buckets_not_moved), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, sectors_moved ) ++ __field(u64, sectors_not_moved ) ++ __field(u64, buckets_moved ) ++ __field(u64, buckets_not_moved ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, ca->uuid.b, 16); ++ __entry->sectors_moved = sectors_moved; ++ __entry->sectors_not_moved = sectors_not_moved; ++ __entry->buckets_moved = buckets_moved; ++ __entry->buckets_not_moved = buckets_moved; ++ ), ++ ++ TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", ++ __entry->uuid, ++ __entry->sectors_moved, __entry->sectors_not_moved, ++ __entry->buckets_moved, __entry->buckets_not_moved) ++); ++ ++TRACE_EVENT(transaction_restart_ip, ++ TP_PROTO(unsigned long caller, unsigned long ip), ++ TP_ARGS(caller, ip), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, caller ) ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->caller = caller; ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) ++); ++ ++DECLARE_EVENT_CLASS(transaction_restart, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%pf", (void *) __entry->ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ 
++TRACE_EVENT(trans_restart_iters_realloced, ++ TP_PROTO(unsigned long ip, unsigned nr), ++ TP_ARGS(ip, nr), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ __field(unsigned, nr ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ __entry->nr = nr; ++ ), ++ ++ TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) ++); ++ ++TRACE_EVENT(trans_restart_mem_realloced, ++ TP_PROTO(unsigned long ip, unsigned long bytes), ++ TP_ARGS(ip, bytes), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ __field(unsigned long, bytes ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ __entry->bytes = bytes; ++ ), ++ ++ TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_mark, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_upgrade, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_traverse, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_atomic, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DECLARE_EVENT_CLASS(node_lock_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq), ++ ++ TP_STRUCT__entry( ++ __field(u32, level) ++ __field(u32, iter_seq) ++ __field(u32, node) ++ __field(u32, node_seq) ++ ), ++ ++ TP_fast_assign( ++ __entry->level = level; ++ __entry->iter_seq = iter_seq; ++ __entry->node = node; ++ __entry->node_seq = node_seq; ++ ), ++ ++ TP_printk("level %u iter seq %u node %u node seq %u", ++ __entry->level, __entry->iter_seq, ++ __entry->node, __entry->node_seq) ++); ++ ++DEFINE_EVENT(node_lock_fail, node_upgrade_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq) ++); ++ ++DEFINE_EVENT(node_lock_fail, node_relock_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq) ++); ++ ++#endif /* _TRACE_BCACHE_H */ ++ ++/* This part must be outside protection */ ++#include +diff --git a/init/init_task.c b/init/init_task.c +index bd403ed3e418..3035fffd976b 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -76,6 +76,7 @@ struct task_struct init_task + .nr_cpus_allowed= NR_CPUS, + .mm = NULL, + .active_mm = &init_mm, ++ .faults_disabled_mapping = NULL, + .restart_block = { + .fn = do_no_restart_syscall, + }, +diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks +index 3de8fd11873b..ab8aa082ce56 100644 +--- a/kernel/Kconfig.locks ++++ b/kernel/Kconfig.locks +@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB + config MMIOWB + def_bool y if ARCH_HAS_MMIOWB + depends on SMP ++ ++config SIXLOCKS ++ bool +diff --git 
a/kernel/locking/Makefile b/kernel/locking/Makefile +index 45452facff3b..6c8f7340c0a2 100644 +--- a/kernel/locking/Makefile ++++ b/kernel/locking/Makefile +@@ -29,3 +29,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o + obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o + obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o + obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o ++obj-$(CONFIG_SIXLOCKS) += six.o +diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h +index baca699b94e9..4abb462d914d 100644 +--- a/kernel/locking/lockdep_internals.h ++++ b/kernel/locking/lockdep_internals.h +@@ -96,7 +96,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = + #else + #define MAX_LOCKDEP_ENTRIES 32768UL + +-#define MAX_LOCKDEP_CHAINS_BITS 16 ++#define MAX_LOCKDEP_CHAINS_BITS 18 + + /* + * Stack-trace: tightly packed array of stack backtrace +@@ -114,7 +114,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = + + #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +-#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) ++#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*10) + + extern struct list_head all_lock_classes; + extern struct lock_chain lock_chains[]; +diff --git a/kernel/locking/six.c b/kernel/locking/six.c +new file mode 100644 +index 000000000000..49d46ed2e18e +--- /dev/null ++++ b/kernel/locking/six.c +@@ -0,0 +1,553 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef DEBUG ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) do {} while (0) ++#endif ++ ++#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) ++#define six_release(l) lock_release(l, _RET_IP_) ++ ++struct six_lock_vals { ++ /* Value we add to the lock in order to take the lock: */ ++ u64 lock_val; ++ ++ /* If the lock has this value (used as a mask), taking the lock fails: */ ++ u64 lock_fail; ++ ++ /* Value we add to the lock in order to release the lock: */ ++ u64 unlock_val; ++ ++ /* Mask that indicates lock is held for this type: */ ++ u64 held_mask; ++ ++ /* Waitlist we wakeup when releasing the lock: */ ++ enum six_lock_type unlock_wakeup; ++}; ++ ++#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) ++#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) ++#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) ++ ++#define LOCK_VALS { \ ++ [SIX_LOCK_read] = { \ ++ .lock_val = __SIX_VAL(read_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_write, \ ++ .unlock_val = -__SIX_VAL(read_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_read, \ ++ .unlock_wakeup = SIX_LOCK_write, \ ++ }, \ ++ [SIX_LOCK_intent] = { \ ++ .lock_val = __SIX_VAL(intent_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_intent, \ ++ .unlock_val = -__SIX_VAL(intent_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_intent, \ ++ .unlock_wakeup = SIX_LOCK_intent, \ ++ }, \ ++ [SIX_LOCK_write] = { \ ++ .lock_val = __SIX_VAL(seq, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_read, \ ++ .unlock_val = __SIX_VAL(seq, 1), \ ++ .held_mask = __SIX_LOCK_HELD_write, \ ++ .unlock_wakeup = SIX_LOCK_read, \ ++ }, \ ++} ++ ++static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, ++ union six_lock_state old) ++{ ++ if (type != SIX_LOCK_intent) ++ return; ++ ++ if (!old.intent_lock) { ++ EBUG_ON(lock->owner); ++ lock->owner = current; ++ } else { ++ EBUG_ON(lock->owner != current); ++ } ++} ++ ++static __always_inline bool do_six_trylock_type(struct six_lock *lock, ++ enum six_lock_type type) ++{ ++ const struct 
six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ EBUG_ON(type == SIX_LOCK_write && lock->owner != current); ++ ++ do { ++ old.v = v; ++ ++ EBUG_ON(type == SIX_LOCK_write && ++ ((old.v & __SIX_LOCK_HELD_write) || ++ !(old.v & __SIX_LOCK_HELD_intent))); ++ ++ if (old.v & l[type].lock_fail) ++ return false; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, ++ old.v + l[type].lock_val)) != old.v); ++ ++ six_set_owner(lock, type, old); ++ return true; ++} ++ ++__always_inline __flatten ++static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ if (!do_six_trylock_type(lock, type)) ++ return false; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++__always_inline __flatten ++static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ do { ++ old.v = v; ++ ++ if (old.seq != seq || old.v & l[type].lock_fail) ++ return false; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, ++ old.v + l[type].lock_val)) != old.v); ++ ++ six_set_owner(lock, type, old); ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++struct six_lock_waiter { ++ struct list_head list; ++ struct task_struct *task; ++}; ++ ++/* This is probably up there with the more evil things I've done */ ++#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) ++ ++#ifdef CONFIG_LOCK_SPIN_ON_OWNER ++ ++static inline int six_can_spin_on_owner(struct six_lock *lock) ++{ ++ struct task_struct *owner; ++ int retval = 1; ++ ++ if (need_resched()) ++ return 0; ++ ++ rcu_read_lock(); ++ owner = READ_ONCE(lock->owner); ++ if (owner) ++ retval = owner->on_cpu; ++ rcu_read_unlock(); ++ /* ++ * if lock->owner is not set, the mutex owner may have just acquired ++ * it and not set the owner yet or the mutex has been released. ++ */ ++ return retval; ++} ++ ++static inline bool six_spin_on_owner(struct six_lock *lock, ++ struct task_struct *owner) ++{ ++ bool ret = true; ++ ++ rcu_read_lock(); ++ while (lock->owner == owner) { ++ /* ++ * Ensure we emit the owner->on_cpu, dereference _after_ ++ * checking lock->owner still matches owner. If that fails, ++ * owner might point to freed memory. If it still matches, ++ * the rcu_read_lock() ensures the memory stays valid. ++ */ ++ barrier(); ++ ++ if (!owner->on_cpu || need_resched()) { ++ ret = false; ++ break; ++ } ++ ++ cpu_relax(); ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ struct task_struct *task = current; ++ ++ if (type == SIX_LOCK_write) ++ return false; ++ ++ preempt_disable(); ++ if (!six_can_spin_on_owner(lock)) ++ goto fail; ++ ++ if (!osq_lock(&lock->osq)) ++ goto fail; ++ ++ while (1) { ++ struct task_struct *owner; ++ ++ /* ++ * If there's an owner, wait for it to either ++ * release the lock or go to sleep. ++ */ ++ owner = READ_ONCE(lock->owner); ++ if (owner && !six_spin_on_owner(lock, owner)) ++ break; ++ ++ if (do_six_trylock_type(lock, type)) { ++ osq_unlock(&lock->osq); ++ preempt_enable(); ++ return true; ++ } ++ ++ /* ++ * When there's no owner, we might have preempted between the ++ * owner acquiring the lock and setting the owner field. 
If ++ * we're an RT task that will live-lock because we won't let ++ * the owner complete. ++ */ ++ if (!owner && (need_resched() || rt_task(task))) ++ break; ++ ++ /* ++ * The cpu_relax() call is a compiler barrier which forces ++ * everything in this loop to be re-loaded. We don't need ++ * memory barriers as we'll eventually observe the right ++ * values at the cost of a few extra spins. ++ */ ++ cpu_relax(); ++ } ++ ++ osq_unlock(&lock->osq); ++fail: ++ preempt_enable(); ++ ++ /* ++ * If we fell out of the spin path because of need_resched(), ++ * reschedule now, before we try-lock again. This avoids getting ++ * scheduled out right after we obtained the lock. ++ */ ++ if (need_resched()) ++ schedule(); ++ ++ return false; ++} ++ ++#else /* CONFIG_LOCK_SPIN_ON_OWNER */ ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ return false; ++} ++ ++#endif ++ ++noinline ++static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old, new; ++ struct six_lock_waiter wait; ++ int ret = 0; ++ u64 v; ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ return ret; ++ ++ if (six_optimistic_spin(lock, type)) ++ return 0; ++ ++ lock_contended(&lock->dep_map, _RET_IP_); ++ ++ INIT_LIST_HEAD(&wait.list); ++ wait.task = current; ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (type == SIX_LOCK_write) ++ EBUG_ON(lock->owner != current); ++ else if (list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_add_tail(&wait.list, &lock->wait_list[type]); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ break; ++ ++ v = READ_ONCE(lock->state.v); ++ do { ++ new.v = old.v = v; ++ ++ if (!(old.v & l[type].lock_fail)) ++ new.v += l[type].lock_val; ++ else if (!(new.waiters & (1 << type))) ++ new.waiters |= 1 << type; ++ else ++ break; /* waiting bit already set */ ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ if (!(old.v & l[type].lock_fail)) ++ break; ++ ++ schedule(); ++ } ++ ++ if (!ret) ++ six_set_owner(lock, type, old); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (!list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_del_init(&wait.list); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++ ++ return ret; ++} ++ ++__always_inline ++static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ int ret; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 0); ++ ++ ret = do_six_trylock_type(lock, type) ? 
0 ++ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); ++ ++ if (ret && type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ if (!ret) ++ lock_acquired(&lock->dep_map, _RET_IP_); ++ ++ return ret; ++} ++ ++static inline void six_lock_wakeup(struct six_lock *lock, ++ union six_lock_state state, ++ unsigned waitlist_id) ++{ ++ struct list_head *wait_list = &lock->wait_list[waitlist_id]; ++ struct six_lock_waiter *w, *next; ++ ++ if (waitlist_id == SIX_LOCK_write && state.read_lock) ++ return; ++ ++ if (!(state.waiters & (1 << waitlist_id))) ++ return; ++ ++ clear_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ ++ if (waitlist_id == SIX_LOCK_write) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ ++ if (p) ++ wake_up_process(p); ++ return; ++ } ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry_safe(w, next, wait_list, list) { ++ list_del_init(&w->list); ++ ++ if (wake_up_process(w->task) && ++ waitlist_id != SIX_LOCK_read) { ++ if (!list_empty(wait_list)) ++ set_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ break; ++ } ++ } ++ ++ raw_spin_unlock(&lock->wait_lock); ++} ++ ++__always_inline __flatten ++static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state state; ++ ++ EBUG_ON(!(lock->state.v & l[type].held_mask)); ++ EBUG_ON(type == SIX_LOCK_write && ++ !(lock->state.v & __SIX_LOCK_HELD_intent)); ++ ++ if (type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ ++ if (type == SIX_LOCK_intent) { ++ EBUG_ON(lock->owner != current); ++ ++ if (lock->intent_lock_recurse) { ++ --lock->intent_lock_recurse; ++ return; ++ } ++ ++ lock->owner = NULL; ++ } ++ ++ state.v = atomic64_add_return_release(l[type].unlock_val, ++ &lock->state.counter); ++ six_lock_wakeup(lock, state, l[type].unlock_wakeup); ++} ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *lock) \ ++{ \ ++ return __six_trylock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_trylock_##type); \ ++ \ ++bool six_relock_##type(struct six_lock *lock, u32 seq) \ ++{ \ ++ return __six_relock_type(lock, SIX_LOCK_##type, seq); \ ++} \ ++EXPORT_SYMBOL_GPL(six_relock_##type); \ ++ \ ++int six_lock_##type(struct six_lock *lock, \ ++ six_lock_should_sleep_fn should_sleep_fn, void *p) \ ++{ \ ++ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ ++} \ ++EXPORT_SYMBOL_GPL(six_lock_##type); \ ++ \ ++void six_unlock_##type(struct six_lock *lock) \ ++{ \ ++ __six_unlock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_unlock_##type); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++ ++#undef __SIX_LOCK ++ ++/* Convert from intent to read: */ ++void six_lock_downgrade(struct six_lock *lock) ++{ ++ six_lock_increment(lock, SIX_LOCK_read); ++ six_unlock_intent(lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_downgrade); ++ ++bool six_lock_tryupgrade(struct six_lock *lock) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old, new; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ do { ++ new.v = old.v = v; ++ ++ EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask)); ++ ++ new.v += l[SIX_LOCK_read].unlock_val; ++ ++ if (new.v & l[SIX_LOCK_intent].lock_fail) ++ return false; ++ ++ new.v += l[SIX_LOCK_intent].lock_val; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ six_set_owner(lock, SIX_LOCK_intent, old); ++ six_lock_wakeup(lock, new, 
l[SIX_LOCK_read].unlock_wakeup); ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_lock_tryupgrade); ++ ++bool six_trylock_convert(struct six_lock *lock, ++ enum six_lock_type from, ++ enum six_lock_type to) ++{ ++ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); ++ ++ if (to == from) ++ return true; ++ ++ if (to == SIX_LOCK_read) { ++ six_lock_downgrade(lock); ++ return true; ++ } else { ++ return six_lock_tryupgrade(lock); ++ } ++} ++EXPORT_SYMBOL_GPL(six_trylock_convert); ++ ++/* ++ * Increment read/intent lock count, assuming we already have it read or intent ++ * locked: ++ */ ++void six_lock_increment(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ ++ EBUG_ON(type == SIX_LOCK_write); ++ six_acquire(&lock->dep_map, 0); ++ ++ /* XXX: assert already locked, and that we don't overflow: */ ++ ++ switch (type) { ++ case SIX_LOCK_read: ++ atomic64_add(l[type].lock_val, &lock->state.counter); ++ break; ++ case SIX_LOCK_intent: ++ lock->intent_lock_recurse++; ++ break; ++ case SIX_LOCK_write: ++ BUG(); ++ break; ++ } ++} ++EXPORT_SYMBOL_GPL(six_lock_increment); ++ ++void six_lock_wakeup_all(struct six_lock *lock) ++{ ++ struct six_lock_waiter *w; ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry(w, &lock->wait_list[0], list) ++ wake_up_process(w->task); ++ list_for_each_entry(w, &lock->wait_list[1], list) ++ wake_up_process(w->task); ++ ++ raw_spin_unlock(&lock->wait_lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_wakeup_all); +diff --git a/lib/Kconfig b/lib/Kconfig +index 5d53f9609c25..a7024d19e000 100644 +--- a/lib/Kconfig ++++ b/lib/Kconfig +@@ -451,6 +451,9 @@ config ASSOCIATIVE_ARRAY + + for more information. + ++config CLOSURES ++ bool ++ + config HAS_IOMEM + bool + depends on !NO_IOMEM +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 21d9c5f6e7ec..aa82ecff7123 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -1411,6 +1411,15 @@ config DEBUG_CREDENTIALS + + source "kernel/rcu/Kconfig.debug" + ++config DEBUG_CLOSURES ++ bool "Debug closures (bcache async widgits)" ++ depends on CLOSURES ++ select DEBUG_FS ++ help ++ Keeps all active closures in a linked list and provides a debugfs ++ interface to list them, which makes it possible to see asynchronous ++ operations that get stuck. 
++ + config DEBUG_WQ_FORCE_RR_CPU + bool "Force round-robin CPU selection for unbound work items" + depends on DEBUG_KERNEL +@@ -1721,6 +1730,11 @@ config FAULT_INJECTION_STACKTRACE_FILTER + help + Provide stacktrace filter for fault-injection capabilities + ++config DYNAMIC_FAULT ++ bool "Enable dynamic fault support" ++ default n ++ depends on DEBUG_FS ++ + config ARCH_HAS_KCOV + bool + help +diff --git a/lib/Makefile b/lib/Makefile +index 685aee60de1d..74a60979c41c 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -189,6 +189,8 @@ obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o + obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o + obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o + ++obj-$(CONFIG_DYNAMIC_FAULT) += dynamic_fault.o ++ + obj-$(CONFIG_NLATTR) += nlattr.o + + obj-$(CONFIG_LRU_CACHE) += lru_cache.o +@@ -201,6 +203,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o + + obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o + ++obj-$(CONFIG_CLOSURES) += closure.o ++ + obj-$(CONFIG_DQL) += dynamic_queue_limits.o + + obj-$(CONFIG_GLOB) += glob.o +diff --git a/lib/closure.c b/lib/closure.c +new file mode 100644 +index 000000000000..3e6366c26209 +--- /dev/null ++++ b/lib/closure.c +@@ -0,0 +1,214 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Asynchronous refcounty things ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++static inline void closure_put_after_sub(struct closure *cl, int flags) ++{ ++ int r = flags & CLOSURE_REMAINING_MASK; ++ ++ BUG_ON(flags & CLOSURE_GUARD_MASK); ++ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); ++ ++ if (!r) { ++ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { ++ atomic_set(&cl->remaining, ++ CLOSURE_REMAINING_INITIALIZER); ++ closure_queue(cl); ++ } else { ++ struct closure *parent = cl->parent; ++ closure_fn *destructor = cl->fn; ++ ++ closure_debug_destroy(cl); ++ ++ if (destructor) ++ destructor(cl); ++ ++ if (parent) ++ closure_put(parent); ++ } ++ } ++} ++ ++/* For clearing flags with the same atomic op as a put */ ++void closure_sub(struct closure *cl, int v) ++{ ++ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); ++} ++EXPORT_SYMBOL(closure_sub); ++ ++/* ++ * closure_put - decrement a closure's refcount ++ */ ++void closure_put(struct closure *cl) ++{ ++ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); ++} ++EXPORT_SYMBOL(closure_put); ++ ++/* ++ * closure_wake_up - wake up all closures on a wait list, without memory barrier ++ */ ++void __closure_wake_up(struct closure_waitlist *wait_list) ++{ ++ struct llist_node *list; ++ struct closure *cl, *t; ++ struct llist_node *reverse = NULL; ++ ++ list = llist_del_all(&wait_list->list); ++ ++ /* We first reverse the list to preserve FIFO ordering and fairness */ ++ reverse = llist_reverse_order(list); ++ ++ /* Then do the wakeups */ ++ llist_for_each_entry_safe(cl, t, reverse, list) { ++ closure_set_waiting(cl, 0); ++ closure_sub(cl, CLOSURE_WAITING + 1); ++ } ++} ++EXPORT_SYMBOL(__closure_wake_up); ++ ++/** ++ * closure_wait - add a closure to a waitlist ++ * @waitlist: will own a ref on @cl, which will be released when ++ * closure_wake_up() is called on @waitlist. ++ * @cl: closure pointer. 
++ * ++ */ ++bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) ++{ ++ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) ++ return false; ++ ++ closure_set_waiting(cl, _RET_IP_); ++ atomic_add(CLOSURE_WAITING + 1, &cl->remaining); ++ llist_add(&cl->list, &waitlist->list); ++ ++ return true; ++} ++EXPORT_SYMBOL(closure_wait); ++ ++struct closure_syncer { ++ struct task_struct *task; ++ int done; ++}; ++ ++static void closure_sync_fn(struct closure *cl) ++{ ++ struct closure_syncer *s = cl->s; ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = READ_ONCE(s->task); ++ s->done = 1; ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++void __sched __closure_sync(struct closure *cl) ++{ ++ struct closure_syncer s = { .task = current }; ++ ++ cl->s = &s; ++ continue_at(cl, closure_sync_fn, NULL); ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (s.done) ++ break; ++ schedule(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++} ++EXPORT_SYMBOL(__closure_sync); ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++ ++static LIST_HEAD(closure_list); ++static DEFINE_SPINLOCK(closure_list_lock); ++ ++void closure_debug_create(struct closure *cl) ++{ ++ unsigned long flags; ++ ++ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); ++ cl->magic = CLOSURE_MAGIC_ALIVE; ++ ++ spin_lock_irqsave(&closure_list_lock, flags); ++ list_add(&cl->all, &closure_list); ++ spin_unlock_irqrestore(&closure_list_lock, flags); ++} ++EXPORT_SYMBOL(closure_debug_create); ++ ++void closure_debug_destroy(struct closure *cl) ++{ ++ unsigned long flags; ++ ++ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); ++ cl->magic = CLOSURE_MAGIC_DEAD; ++ ++ spin_lock_irqsave(&closure_list_lock, flags); ++ list_del(&cl->all); ++ spin_unlock_irqrestore(&closure_list_lock, flags); ++} ++EXPORT_SYMBOL(closure_debug_destroy); ++ ++static int debug_seq_show(struct seq_file *f, void *data) ++{ ++ struct closure *cl; ++ ++ spin_lock_irq(&closure_list_lock); ++ ++ list_for_each_entry(cl, &closure_list, all) { ++ int r = atomic_read(&cl->remaining); ++ ++ seq_printf(f, "%p: %pS -> %pS p %p r %i ", ++ cl, (void *) cl->ip, cl->fn, cl->parent, ++ r & CLOSURE_REMAINING_MASK); ++ ++ seq_printf(f, "%s%s\n", ++ test_bit(WORK_STRUCT_PENDING_BIT, ++ work_data_bits(&cl->work)) ? "Q" : "", ++ r & CLOSURE_RUNNING ? "R" : ""); ++ ++ if (r & CLOSURE_WAITING) ++ seq_printf(f, " W %pS\n", ++ (void *) cl->waiting_on); ++ ++ seq_puts(f, "\n"); ++ } ++ ++ spin_unlock_irq(&closure_list_lock); ++ return 0; ++} ++ ++static int debug_seq_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, debug_seq_show, NULL); ++} ++ ++static const struct file_operations debug_ops = { ++ .owner = THIS_MODULE, ++ .open = debug_seq_open, ++ .read = seq_read, ++ .release = single_release ++}; ++ ++static int __init closure_debug_init(void) ++{ ++ debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); ++ return 0; ++} ++late_initcall(closure_debug_init) ++ ++#endif +diff --git a/lib/dynamic_fault.c b/lib/dynamic_fault.c +new file mode 100644 +index 000000000000..75fc9a1b4bce +--- /dev/null ++++ b/lib/dynamic_fault.c +@@ -0,0 +1,760 @@ ++/* ++ * lib/dynamic_fault.c ++ * ++ * make dynamic_fault() calls runtime configurable based upon their ++ * source module. ++ * ++ * Copyright (C) 2011 Adam Berkan ++ * Based on dynamic_debug.c: ++ * Copyright (C) 2008 Jason Baron ++ * By Greg Banks ++ * Copyright (c) 2008 Silicon Graphics Inc. All Rights Reserved. 
++ * ++ */ ++ ++#define pr_fmt(fmt) "dfault: " fmt "\n" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#undef kzalloc ++ ++extern struct _dfault __start___faults[]; ++extern struct _dfault __stop___faults[]; ++ ++struct dfault_table { ++ struct list_head link; ++ char *mod_name; ++ unsigned int num_dfaults; ++ struct _dfault *dfaults; ++}; ++ ++struct dfault_query { ++ const char *filename; ++ const char *module; ++ const char *function; ++ const char *class; ++ unsigned int first_line, last_line; ++ unsigned int first_index, last_index; ++ ++ unsigned match_line:1; ++ unsigned match_index:1; ++ ++ unsigned set_enabled:1; ++ unsigned enabled:2; ++ ++ unsigned set_frequency:1; ++ unsigned frequency; ++}; ++ ++struct dfault_iter { ++ struct dfault_table *table; ++ unsigned int idx; ++}; ++ ++static DEFINE_MUTEX(dfault_lock); ++static LIST_HEAD(dfault_tables); ++ ++bool __dynamic_fault_enabled(struct _dfault *df) ++{ ++ union dfault_state old, new; ++ unsigned v = df->state.v; ++ bool ret; ++ ++ do { ++ old.v = new.v = v; ++ ++ if (new.enabled == DFAULT_DISABLED) ++ return false; ++ ++ ret = df->frequency ++ ? ++new.count >= df->frequency ++ : true; ++ if (ret) ++ new.count = 0; ++ if (ret && new.enabled == DFAULT_ONESHOT) ++ new.enabled = DFAULT_DISABLED; ++ } while ((v = cmpxchg(&df->state.v, old.v, new.v)) != old.v); ++ ++ if (ret) ++ pr_debug("returned true for %s:%u", df->filename, df->line); ++ ++ return ret; ++} ++EXPORT_SYMBOL(__dynamic_fault_enabled); ++ ++/* Return the last part of a pathname */ ++static inline const char *basename(const char *path) ++{ ++ const char *tail = strrchr(path, '/'); ++ ++ return tail ? tail + 1 : path; ++} ++ ++/* format a string into buf[] which describes the _dfault's flags */ ++static char *dfault_describe_flags(struct _dfault *df, char *buf, size_t buflen) ++{ ++ switch (df->state.enabled) { ++ case DFAULT_DISABLED: ++ strlcpy(buf, "disabled", buflen); ++ break; ++ case DFAULT_ENABLED: ++ strlcpy(buf, "enabled", buflen); ++ break; ++ case DFAULT_ONESHOT: ++ strlcpy(buf, "oneshot", buflen); ++ break; ++ default: ++ BUG(); ++ } ++ ++ return buf; ++} ++ ++/* ++ * must be called with dfault_lock held ++ */ ++ ++/* ++ * Search the tables for _dfault's which match the given ++ * `query' and apply the `flags' and `mask' to them. Tells ++ * the user which dfault's were changed, or whether none ++ * were matched. 
++ */ ++static int dfault_change(const struct dfault_query *query) ++{ ++ struct dfault_table *dt; ++ unsigned int nfound = 0; ++ unsigned i, index = 0; ++ char flagbuf[16]; ++ ++ /* search for matching dfaults */ ++ mutex_lock(&dfault_lock); ++ list_for_each_entry(dt, &dfault_tables, link) { ++ ++ /* match against the module name */ ++ if (query->module != NULL && ++ strcmp(query->module, dt->mod_name)) ++ continue; ++ ++ for (i = 0 ; i < dt->num_dfaults ; i++) { ++ struct _dfault *df = &dt->dfaults[i]; ++ ++ /* match against the source filename */ ++ if (query->filename != NULL && ++ strcmp(query->filename, df->filename) && ++ strcmp(query->filename, basename(df->filename))) ++ continue; ++ ++ /* match against the function */ ++ if (query->function != NULL && ++ strcmp(query->function, df->function)) ++ continue; ++ ++ /* match against the class */ ++ if (query->class) { ++ size_t len = strlen(query->class); ++ ++ if (strncmp(query->class, df->class, len)) ++ continue; ++ ++ if (df->class[len] && df->class[len] != ':') ++ continue; ++ } ++ ++ /* match against the line number range */ ++ if (query->match_line && ++ (df->line < query->first_line || ++ df->line > query->last_line)) ++ continue; ++ ++ /* match against the fault index */ ++ if (query->match_index && ++ (index < query->first_index || ++ index > query->last_index)) { ++ index++; ++ continue; ++ } ++ ++ if (query->set_enabled && ++ query->enabled != df->state.enabled) { ++ if (query->enabled != DFAULT_DISABLED) ++ static_key_slow_inc(&df->enabled); ++ else if (df->state.enabled != DFAULT_DISABLED) ++ static_key_slow_dec(&df->enabled); ++ ++ df->state.enabled = query->enabled; ++ } ++ ++ if (query->set_frequency) ++ df->frequency = query->frequency; ++ ++ pr_debug("changed %s:%d [%s]%s #%d %s", ++ df->filename, df->line, dt->mod_name, ++ df->function, index, ++ dfault_describe_flags(df, flagbuf, ++ sizeof(flagbuf))); ++ ++ index++; ++ nfound++; ++ } ++ } ++ mutex_unlock(&dfault_lock); ++ ++ pr_debug("dfault: %u matches", nfound); ++ ++ return nfound ? 0 : -ENOENT; ++} ++ ++/* ++ * Split the buffer `buf' into space-separated words. ++ * Handles simple " and ' quoting, i.e. without nested, ++ * embedded or escaped \". Return the number of words ++ * or <0 on error. ++ */ ++static int dfault_tokenize(char *buf, char *words[], int maxwords) ++{ ++ int nwords = 0; ++ ++ while (*buf) { ++ char *end; ++ ++ /* Skip leading whitespace */ ++ buf = skip_spaces(buf); ++ if (!*buf) ++ break; /* oh, it was trailing whitespace */ ++ ++ /* Run `end' over a word, either whitespace separated or quoted ++ */ ++ if (*buf == '"' || *buf == '\'') { ++ int quote = *buf++; ++ ++ for (end = buf ; *end && *end != quote ; end++) ++ ; ++ if (!*end) ++ return -EINVAL; /* unclosed quote */ ++ } else { ++ for (end = buf ; *end && !isspace(*end) ; end++) ++ ; ++ BUG_ON(end == buf); ++ } ++ /* Here `buf' is the start of the word, `end' is one past the ++ * end ++ */ ++ ++ if (nwords == maxwords) ++ return -EINVAL; /* ran out of words[] before bytes */ ++ if (*end) ++ *end++ = '\0'; /* terminate the word */ ++ words[nwords++] = buf; ++ buf = end; ++ } ++ ++ return nwords; ++} ++ ++/* ++ * Parse a range. 
++ */ ++static inline int parse_range(char *str, ++ unsigned int *first, ++ unsigned int *last) ++{ ++ char *first_str = str; ++ char *last_str = strchr(first_str, '-'); ++ ++ if (last_str) ++ *last_str++ = '\0'; ++ ++ if (kstrtouint(first_str, 10, first)) ++ return -EINVAL; ++ ++ if (!last_str) ++ *last = *first; ++ else if (kstrtouint(last_str, 10, last)) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++enum dfault_token { ++ TOK_INVALID, ++ ++ /* Queries */ ++ TOK_FUNC, ++ TOK_FILE, ++ TOK_LINE, ++ TOK_MODULE, ++ TOK_CLASS, ++ TOK_INDEX, ++ ++ /* Commands */ ++ TOK_DISABLE, ++ TOK_ENABLE, ++ TOK_ONESHOT, ++ TOK_FREQUENCY, ++}; ++ ++static const struct { ++ const char *str; ++ enum dfault_token tok; ++ unsigned args_required; ++} dfault_token_strs[] = { ++ { "func", TOK_FUNC, 1, }, ++ { "file", TOK_FILE, 1, }, ++ { "line", TOK_LINE, 1, }, ++ { "module", TOK_MODULE, 1, }, ++ { "class", TOK_CLASS, 1, }, ++ { "index", TOK_INDEX, 1, }, ++ { "disable", TOK_DISABLE, 0, }, ++ { "enable", TOK_ENABLE, 0, }, ++ { "oneshot", TOK_ONESHOT, 0, }, ++ { "frequency", TOK_FREQUENCY, 1, }, ++}; ++ ++static enum dfault_token str_to_token(const char *word, unsigned nr_words) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(dfault_token_strs); i++) ++ if (!strcmp(word, dfault_token_strs[i].str)) { ++ if (nr_words < dfault_token_strs[i].args_required) { ++ pr_debug("insufficient arguments to \"%s\"", ++ word); ++ return TOK_INVALID; ++ } ++ ++ return dfault_token_strs[i].tok; ++ } ++ ++ pr_debug("unknown keyword \"%s\"", word); ++ ++ return TOK_INVALID; ++} ++ ++static int dfault_parse_command(struct dfault_query *query, ++ enum dfault_token tok, ++ char *words[], size_t nr_words) ++{ ++ unsigned i = 0; ++ int ret; ++ ++ switch (tok) { ++ case TOK_INVALID: ++ return -EINVAL; ++ case TOK_FUNC: ++ query->function = words[i++]; ++ case TOK_FILE: ++ query->filename = words[i++]; ++ return 1; ++ case TOK_LINE: ++ ret = parse_range(words[i++], ++ &query->first_line, ++ &query->last_line); ++ if (ret) ++ return ret; ++ query->match_line = true; ++ break; ++ case TOK_MODULE: ++ query->module = words[i++]; ++ break; ++ case TOK_CLASS: ++ query->class = words[i++]; ++ break; ++ case TOK_INDEX: ++ ret = parse_range(words[i++], ++ &query->first_index, ++ &query->last_index); ++ if (ret) ++ return ret; ++ query->match_index = true; ++ break; ++ case TOK_DISABLE: ++ query->set_enabled = true; ++ query->enabled = DFAULT_DISABLED; ++ break; ++ case TOK_ENABLE: ++ query->set_enabled = true; ++ query->enabled = DFAULT_ENABLED; ++ break; ++ case TOK_ONESHOT: ++ query->set_enabled = true; ++ query->enabled = DFAULT_ONESHOT; ++ break; ++ case TOK_FREQUENCY: ++ query->set_frequency = 1; ++ ret = kstrtouint(words[i++], 10, &query->frequency); ++ if (ret) ++ return ret; ++ ++ if (!query->set_enabled) { ++ query->set_enabled = 1; ++ query->enabled = DFAULT_ENABLED; ++ } ++ break; ++ } ++ ++ return i; ++} ++ ++/* ++ * Parse words[] as a dfault query specification, which is a series ++ * of (keyword, value) pairs chosen from these possibilities: ++ * ++ * func ++ * file ++ * file ++ * module ++ * line ++ * line - // where either may be empty ++ * index - // dynamic faults numbered from ++ * // to inside each matching function ++ */ ++static int dfault_parse_query(struct dfault_query *query, ++ char *words[], size_t nr_words) ++{ ++ unsigned i = 0; ++ ++ while (i < nr_words) { ++ const char *tok_str = words[i++]; ++ enum dfault_token tok = str_to_token(tok_str, nr_words - i); ++ int ret = dfault_parse_command(query, tok, words + i, ++ 
nr_words - i); ++ ++ if (ret < 0) ++ return ret; ++ i += ret; ++ BUG_ON(i > nr_words); ++ } ++ ++ return 0; ++} ++ ++/* ++ * File_ops->write method for /dynamic_fault/conrol. Gathers the ++ * command text from userspace, parses and executes it. ++ */ ++static ssize_t dfault_proc_write(struct file *file, const char __user *ubuf, ++ size_t len, loff_t *offp) ++{ ++ struct dfault_query query; ++#define MAXWORDS 9 ++ int nwords; ++ char *words[MAXWORDS]; ++ char tmpbuf[256]; ++ int ret; ++ ++ memset(&query, 0, sizeof(query)); ++ ++ if (len == 0) ++ return 0; ++ /* we don't check *offp -- multiple writes() are allowed */ ++ if (len > sizeof(tmpbuf)-1) ++ return -E2BIG; ++ if (copy_from_user(tmpbuf, ubuf, len)) ++ return -EFAULT; ++ tmpbuf[len] = '\0'; ++ ++ pr_debug("read %zu bytes from userspace", len); ++ ++ nwords = dfault_tokenize(tmpbuf, words, MAXWORDS); ++ if (nwords < 0) ++ return -EINVAL; ++ if (dfault_parse_query(&query, words, nwords)) ++ return -EINVAL; ++ ++ /* actually go and implement the change */ ++ ret = dfault_change(&query); ++ if (ret < 0) ++ return ret; ++ ++ *offp += len; ++ return len; ++} ++ ++/* Control file read code */ ++ ++/* ++ * Set the iterator to point to the first _dfault object ++ * and return a pointer to that first object. Returns ++ * NULL if there are no _dfaults at all. ++ */ ++static struct _dfault *dfault_iter_first(struct dfault_iter *iter) ++{ ++ if (list_empty(&dfault_tables)) { ++ iter->table = NULL; ++ iter->idx = 0; ++ return NULL; ++ } ++ iter->table = list_entry(dfault_tables.next, ++ struct dfault_table, link); ++ iter->idx = 0; ++ return &iter->table->dfaults[iter->idx]; ++} ++ ++/* ++ * Advance the iterator to point to the next _dfault ++ * object from the one the iterator currently points at, ++ * and returns a pointer to the new _dfault. Returns ++ * NULL if the iterator has seen all the _dfaults. ++ */ ++static struct _dfault *dfault_iter_next(struct dfault_iter *iter) ++{ ++ if (iter->table == NULL) ++ return NULL; ++ if (++iter->idx == iter->table->num_dfaults) { ++ /* iterate to next table */ ++ iter->idx = 0; ++ if (list_is_last(&iter->table->link, &dfault_tables)) { ++ iter->table = NULL; ++ return NULL; ++ } ++ iter->table = list_entry(iter->table->link.next, ++ struct dfault_table, link); ++ } ++ return &iter->table->dfaults[iter->idx]; ++} ++ ++/* ++ * Seq_ops start method. Called at the start of every ++ * read() call from userspace. Takes the dfault_lock and ++ * seeks the seq_file's iterator to the given position. ++ */ ++static void *dfault_proc_start(struct seq_file *m, loff_t *pos) ++{ ++ struct dfault_iter *iter = m->private; ++ struct _dfault *dp; ++ int n = *pos; ++ ++ mutex_lock(&dfault_lock); ++ ++ if (n < 0) ++ return NULL; ++ dp = dfault_iter_first(iter); ++ while (dp != NULL && --n >= 0) ++ dp = dfault_iter_next(iter); ++ return dp; ++} ++ ++/* ++ * Seq_ops next method. Called several times within a read() ++ * call from userspace, with dfault_lock held. Walks to the ++ * next _dfault object with a special case for the header line. ++ */ ++static void *dfault_proc_next(struct seq_file *m, void *p, loff_t *pos) ++{ ++ struct dfault_iter *iter = m->private; ++ struct _dfault *dp; ++ ++ if (p == SEQ_START_TOKEN) ++ dp = dfault_iter_first(iter); ++ else ++ dp = dfault_iter_next(iter); ++ ++*pos; ++ return dp; ++} ++ ++/* ++ * Seq_ops show method. Called several times within a read() ++ * call from userspace, with dfault_lock held. 
Formats the ++ * current _dfault as a single human-readable line, with a ++ * special case for the header line. ++ */ ++static int dfault_proc_show(struct seq_file *m, void *p) ++{ ++ struct dfault_iter *iter = m->private; ++ struct _dfault *df = p; ++ char flagsbuf[8]; ++ ++ seq_printf(m, "%s:%u class:%s module:%s func:%s %s \"\"\n", ++ df->filename, df->line, df->class, ++ iter->table->mod_name, df->function, ++ dfault_describe_flags(df, flagsbuf, sizeof(flagsbuf))); ++ ++ return 0; ++} ++ ++/* ++ * Seq_ops stop method. Called at the end of each read() ++ * call from userspace. Drops dfault_lock. ++ */ ++static void dfault_proc_stop(struct seq_file *m, void *p) ++{ ++ mutex_unlock(&dfault_lock); ++} ++ ++static const struct seq_operations dfault_proc_seqops = { ++ .start = dfault_proc_start, ++ .next = dfault_proc_next, ++ .show = dfault_proc_show, ++ .stop = dfault_proc_stop ++}; ++ ++/* ++ * File_ops->open method for /dynamic_fault/control. Does the seq_file ++ * setup dance, and also creates an iterator to walk the _dfaults. ++ * Note that we create a seq_file always, even for O_WRONLY files ++ * where it's not needed, as doing so simplifies the ->release method. ++ */ ++static int dfault_proc_open(struct inode *inode, struct file *file) ++{ ++ struct dfault_iter *iter; ++ int err; ++ ++ iter = kzalloc(sizeof(*iter), GFP_KERNEL); ++ if (iter == NULL) ++ return -ENOMEM; ++ ++ err = seq_open(file, &dfault_proc_seqops); ++ if (err) { ++ kfree(iter); ++ return err; ++ } ++ ((struct seq_file *) file->private_data)->private = iter; ++ return 0; ++} ++ ++static const struct file_operations dfault_proc_fops = { ++ .owner = THIS_MODULE, ++ .open = dfault_proc_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release_private, ++ .write = dfault_proc_write ++}; ++ ++/* ++ * Allocate a new dfault_table for the given module ++ * and add it to the global list. ++ */ ++int dfault_add_module(struct _dfault *tab, unsigned int n, ++ const char *name) ++{ ++ struct dfault_table *dt; ++ char *new_name; ++ const char *func = NULL; ++ int i; ++ ++ dt = kzalloc(sizeof(*dt), GFP_KERNEL); ++ if (dt == NULL) ++ return -ENOMEM; ++ new_name = kstrdup(name, GFP_KERNEL); ++ if (new_name == NULL) { ++ kfree(dt); ++ return -ENOMEM; ++ } ++ dt->mod_name = new_name; ++ dt->num_dfaults = n; ++ dt->dfaults = tab; ++ ++ mutex_lock(&dfault_lock); ++ list_add_tail(&dt->link, &dfault_tables); ++ mutex_unlock(&dfault_lock); ++ ++ /* __attribute__(("section")) emits things in reverse order */ ++ for (i = n - 1; i >= 0; i--) ++ if (!func || strcmp(tab[i].function, func)) ++ func = tab[i].function; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(dfault_add_module); ++ ++static void dfault_table_free(struct dfault_table *dt) ++{ ++ list_del_init(&dt->link); ++ kfree(dt->mod_name); ++ kfree(dt); ++} ++ ++/* ++ * Called in response to a module being unloaded. Removes ++ * any dfault_table's which point at the module. 
++ */ ++int dfault_remove_module(char *mod_name) ++{ ++ struct dfault_table *dt, *nextdt; ++ int ret = -ENOENT; ++ ++ mutex_lock(&dfault_lock); ++ list_for_each_entry_safe(dt, nextdt, &dfault_tables, link) { ++ if (!strcmp(dt->mod_name, mod_name)) { ++ dfault_table_free(dt); ++ ret = 0; ++ } ++ } ++ mutex_unlock(&dfault_lock); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(dfault_remove_module); ++ ++static void dfault_remove_all_tables(void) ++{ ++ mutex_lock(&dfault_lock); ++ while (!list_empty(&dfault_tables)) { ++ struct dfault_table *dt = list_entry(dfault_tables.next, ++ struct dfault_table, ++ link); ++ dfault_table_free(dt); ++ } ++ mutex_unlock(&dfault_lock); ++} ++ ++static int __init dynamic_fault_init(void) ++{ ++ struct dentry *dir, *file; ++ struct _dfault *iter, *iter_start; ++ const char *modname = NULL; ++ int ret = 0; ++ int n = 0; ++ ++ dir = debugfs_create_dir("dynamic_fault", NULL); ++ if (!dir) ++ return -ENOMEM; ++ file = debugfs_create_file("control", 0644, dir, NULL, ++ &dfault_proc_fops); ++ if (!file) { ++ debugfs_remove(dir); ++ return -ENOMEM; ++ } ++ if (__start___faults != __stop___faults) { ++ iter = __start___faults; ++ modname = iter->modname; ++ iter_start = iter; ++ for (; iter < __stop___faults; iter++) { ++ if (strcmp(modname, iter->modname)) { ++ ret = dfault_add_module(iter_start, n, modname); ++ if (ret) ++ goto out_free; ++ n = 0; ++ modname = iter->modname; ++ iter_start = iter; ++ } ++ n++; ++ } ++ ret = dfault_add_module(iter_start, n, modname); ++ } ++out_free: ++ if (ret) { ++ dfault_remove_all_tables(); ++ debugfs_remove(dir); ++ debugfs_remove(file); ++ } ++ return 0; ++} ++module_init(dynamic_fault_init); +diff --git a/mm/filemap.c b/mm/filemap.c +index 23a051a7ef0f..d39a3f28d6a9 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -116,6 +116,69 @@ + * ->tasklist_lock (memory_failure, collect_procs_ao) + */ + ++static int page_cache_tree_insert_vec(struct page *pages[], ++ unsigned nr_pages, ++ struct address_space *mapping, ++ pgoff_t index, ++ gfp_t gfp_mask, ++ void *shadow[]) ++{ ++ XA_STATE(xas, &mapping->i_pages, index); ++ void *old; ++ int i = 0, error = 0; ++ ++ mapping_set_update(&xas, mapping); ++ ++ if (!nr_pages) ++ return 0; ++ ++ xa_lock_irq(&mapping->i_pages); ++ ++ while (1) { ++ old = xas_load(&xas); ++ if (old && !xa_is_value(old)) { ++ error = -EEXIST; ++ break; ++ } ++ ++ xas_store(&xas, pages[i]); ++ error = xas_error(&xas); ++ ++ if (error == -ENOMEM) { ++ xa_unlock_irq(&mapping->i_pages); ++ if (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)) ++ error = 0; ++ xa_lock_irq(&mapping->i_pages); ++ ++ if (!error) ++ continue; ++ break; ++ } ++ ++ if (error) ++ break; ++ ++ if (shadow) ++ shadow[i] = old; ++ if (xa_is_value(old)) ++ mapping->nrexceptional--; ++ mapping->nrpages++; ++ ++ /* hugetlb pages do not participate in page cache accounting. 
*/ ++ if (!PageHuge(pages[i])) ++ __inc_node_page_state(pages[i], NR_FILE_PAGES); ++ ++ if (++i == nr_pages) ++ break; ++ ++ xas_next(&xas); ++ } ++ ++ xa_unlock_irq(&mapping->i_pages); ++ ++ return i ?: error; ++} ++ + static void page_cache_delete(struct address_space *mapping, + struct page *page, void *shadow) + { +@@ -825,118 +888,154 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) + } + EXPORT_SYMBOL_GPL(replace_page_cache_page); + +-static int __add_to_page_cache_locked(struct page *page, +- struct address_space *mapping, +- pgoff_t offset, gfp_t gfp_mask, +- void **shadowp) ++static int add_to_page_cache_vec(struct page **pages, unsigned nr_pages, ++ struct address_space *mapping, ++ pgoff_t index, gfp_t gfp_mask, ++ void *shadow[]) + { +- XA_STATE(xas, &mapping->i_pages, offset); +- int huge = PageHuge(page); + struct mem_cgroup *memcg; +- int error; +- void *old; ++ int i, nr_added = 0, error = 0; + +- VM_BUG_ON_PAGE(!PageLocked(page), page); +- VM_BUG_ON_PAGE(PageSwapBacked(page), page); +- mapping_set_update(&xas, mapping); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pages[i]; + +- if (!huge) { +- error = mem_cgroup_try_charge(page, current->mm, +- gfp_mask, &memcg, false); +- if (error) +- return error; ++ VM_BUG_ON_PAGE(PageSwapBacked(page), page); ++ VM_BUG_ON_PAGE(PageSwapCache(page), page); ++ ++ if (!PageHuge(page)) { ++ error = mem_cgroup_try_charge(page, current->mm, ++ gfp_mask, &memcg, false); ++ if (error) { ++ if (!i) ++ return error; ++ nr_pages = i; ++ break; ++ } ++ } ++ ++ __SetPageLocked(page); ++ get_page(page); ++ page->mapping = mapping; ++ page->index = index + i; + } + +- get_page(page); +- page->mapping = mapping; +- page->index = offset; ++ error = page_cache_tree_insert_vec(pages, nr_pages, mapping, ++ index, gfp_mask, shadow); ++ if (error > 0) { ++ nr_added = error; ++ error = 0; ++ } + +- do { +- xas_lock_irq(&xas); +- old = xas_load(&xas); +- if (old && !xa_is_value(old)) +- xas_set_err(&xas, -EEXIST); +- xas_store(&xas, page); +- if (xas_error(&xas)) +- goto unlock; ++ for (i = 0; i < nr_added; i++) { ++ struct page *page = pages[i]; + +- if (xa_is_value(old)) { +- mapping->nrexceptional--; +- if (shadowp) +- *shadowp = old; +- } +- mapping->nrpages++; ++ if (!PageHuge(page)) ++ mem_cgroup_commit_charge(page, memcg, false, false); + +- /* hugetlb pages do not participate in page cache accounting */ +- if (!huge) +- __inc_node_page_state(page, NR_FILE_PAGES); +-unlock: +- xas_unlock_irq(&xas); +- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); ++ trace_mm_filemap_add_to_page_cache(page); ++ } + +- if (xas_error(&xas)) +- goto error; ++ for (i = nr_added; i < nr_pages; i++) { ++ struct page *page = pages[i]; + +- if (!huge) +- mem_cgroup_commit_charge(page, memcg, false, false); +- trace_mm_filemap_add_to_page_cache(page); +- return 0; +-error: +- page->mapping = NULL; +- /* Leave page->index set: truncation relies upon it */ +- if (!huge) +- mem_cgroup_cancel_charge(page, memcg, false); +- put_page(page); +- return xas_error(&xas); ++ if (!PageHuge(page)) ++ mem_cgroup_cancel_charge(page, memcg, false); ++ ++ /* Leave page->index set: truncation relies upon it */ ++ page->mapping = NULL; ++ put_page(page); ++ __ClearPageLocked(page); ++ } ++ ++ return nr_added ?: error; + } +-ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); + + /** +- * add_to_page_cache_locked - add a locked page to the pagecache ++ * add_to_page_cache - add a newly allocated page to the pagecache + * @page: page to 
add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * +- * This function is used to add a page to the pagecache. It must be locked. +- * This function does not add the page to the LRU. The caller must do that. ++ * This function is used to add a page to the pagecache. It must be newly ++ * allocated. This function does not add the page to the LRU. The caller must ++ * do that. + * + * Return: %0 on success, negative error code otherwise. + */ +-int add_to_page_cache_locked(struct page *page, struct address_space *mapping, +- pgoff_t offset, gfp_t gfp_mask) ++int add_to_page_cache(struct page *page, struct address_space *mapping, ++ pgoff_t offset, gfp_t gfp_mask) + { +- return __add_to_page_cache_locked(page, mapping, offset, +- gfp_mask, NULL); ++ int ret = add_to_page_cache_vec(&page, 1, mapping, offset, ++ gfp_mask, NULL); ++ if (ret < 0) ++ return ret; ++ return 0; + } +-EXPORT_SYMBOL(add_to_page_cache_locked); ++EXPORT_SYMBOL(add_to_page_cache); ++ALLOW_ERROR_INJECTION(add_to_page_cache, ERRNO); + +-int add_to_page_cache_lru(struct page *page, struct address_space *mapping, +- pgoff_t offset, gfp_t gfp_mask) ++int add_to_page_cache_lru_vec(struct address_space *mapping, ++ struct page **pages, ++ unsigned nr_pages, ++ pgoff_t offset, gfp_t gfp_mask) + { +- void *shadow = NULL; +- int ret; ++ void *shadow_stack[8], **shadow = shadow_stack; ++ int i, ret = 0, err = 0, nr_added; ++ ++ if (nr_pages > ARRAY_SIZE(shadow_stack)) { ++ shadow = kmalloc_array(nr_pages, sizeof(void *), gfp_mask); ++ if (!shadow) ++ goto slowpath; ++ } ++ ++ for (i = 0; i < nr_pages; i++) ++ VM_BUG_ON_PAGE(PageActive(pages[i]), pages[i]); ++ ++ ret = add_to_page_cache_vec(pages, nr_pages, mapping, ++ offset, gfp_mask, shadow); ++ nr_added = ret > 0 ? ret : 0; ++ ++ /* ++ * The page might have been evicted from cache only recently, in which ++ * case it should be activated like any other repeatedly accessed page. ++ * The exception is pages getting rewritten; evicting other data from ++ * the working set, only to cache data that will get overwritten with ++ * something else, is a waste of memory. ++ */ ++ for (i = 0; i < nr_added; i++) { ++ struct page *page = pages[i]; ++ void *s = shadow[i]; + +- __SetPageLocked(page); +- ret = __add_to_page_cache_locked(page, mapping, offset, +- gfp_mask, &shadow); +- if (unlikely(ret)) +- __ClearPageLocked(page); +- else { +- /* +- * The page might have been evicted from cache only +- * recently, in which case it should be activated like +- * any other repeatedly accessed page. +- * The exception is pages getting rewritten; evicting other +- * data from the working set, only to cache data that will +- * get overwritten with something else, is a waste of memory. 
+- */ + WARN_ON_ONCE(PageActive(page)); +- if (!(gfp_mask & __GFP_WRITE) && shadow) +- workingset_refault(page, shadow); ++ if (!(gfp_mask & __GFP_WRITE) && s) ++ workingset_refault(page, s); + lru_cache_add(page); + } ++ ++ if (shadow != shadow_stack) ++ kfree(shadow); ++ + return ret; ++slowpath: ++ for (i = 0; i < nr_pages; i++) { ++ err = add_to_page_cache_lru(pages[i], mapping, ++ offset + i, gfp_mask); ++ if (err) ++ break; ++ } ++ ++ return i ?: err; ++} ++EXPORT_SYMBOL_GPL(add_to_page_cache_lru_vec); ++ ++int add_to_page_cache_lru(struct page *page, struct address_space *mapping, ++ pgoff_t offset, gfp_t gfp_mask) ++{ ++ int ret = add_to_page_cache_lru_vec(mapping, &page, 1, offset, gfp_mask); ++ if (ret < 0) ++ return ret; ++ return 0; + } + EXPORT_SYMBOL_GPL(add_to_page_cache_lru); + +@@ -1827,6 +1926,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + + return ret; + } ++EXPORT_SYMBOL(find_get_pages_range); + + /** + * find_get_pages_contig - gang contiguous pagecache lookup +@@ -1975,6 +2075,222 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) + ra->ra_pages /= 4; + } + ++static struct page * ++generic_file_buffered_read_readpage(struct file *filp, ++ struct address_space *mapping, ++ struct page *page) ++{ ++ struct file_ra_state *ra = &filp->f_ra; ++ int error; ++ ++ /* ++ * A previous I/O error may have been due to temporary ++ * failures, eg. multipath errors. ++ * PG_error will be set again if readpage fails. ++ */ ++ ClearPageError(page); ++ /* Start the actual read. The read will unlock the page. */ ++ error = mapping->a_ops->readpage(filp, page); ++ ++ if (unlikely(error)) { ++ put_page(page); ++ return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL; ++ } ++ ++ if (!PageUptodate(page)) { ++ error = lock_page_killable(page); ++ if (unlikely(error)) { ++ put_page(page); ++ return ERR_PTR(error); ++ } ++ if (!PageUptodate(page)) { ++ if (page->mapping == NULL) { ++ /* ++ * invalidate_mapping_pages got it ++ */ ++ unlock_page(page); ++ put_page(page); ++ return NULL; ++ } ++ unlock_page(page); ++ shrink_readahead_size_eio(ra); ++ put_page(page); ++ return ERR_PTR(-EIO); ++ } ++ unlock_page(page); ++ } ++ ++ return page; ++} ++ ++static struct page * ++generic_file_buffered_read_pagenotuptodate(struct file *filp, ++ struct iov_iter *iter, ++ struct page *page, ++ loff_t pos, loff_t count) ++{ ++ struct address_space *mapping = filp->f_mapping; ++ struct inode *inode = mapping->host; ++ int error; ++ ++ /* ++ * See comment in do_read_cache_page on why ++ * wait_on_page_locked is used to avoid unnecessarily ++ * serialisations and why it's safe. ++ */ ++ error = wait_on_page_locked_killable(page); ++ if (unlikely(error)) { ++ put_page(page); ++ return ERR_PTR(error); ++ } ++ ++ if (PageUptodate(page)) ++ return page; ++ ++ if (inode->i_blkbits == PAGE_SHIFT || ++ !mapping->a_ops->is_partially_uptodate) ++ goto page_not_up_to_date; ++ /* pipes can't handle partially uptodate pages */ ++ if (unlikely(iov_iter_is_pipe(iter))) ++ goto page_not_up_to_date; ++ if (!trylock_page(page)) ++ goto page_not_up_to_date; ++ /* Did it get truncated before we got the lock? */ ++ if (!page->mapping) ++ goto page_not_up_to_date_locked; ++ ++ if (!mapping->a_ops->is_partially_uptodate(page, ++ pos & ~PAGE_MASK, count)) ++ goto page_not_up_to_date_locked; ++ unlock_page(page); ++ return page; ++ ++page_not_up_to_date: ++ /* Get exclusive access to the page ... 
*/ ++ error = lock_page_killable(page); ++ if (unlikely(error)) { ++ put_page(page); ++ return ERR_PTR(error); ++ } ++ ++page_not_up_to_date_locked: ++ /* Did it get truncated before we got the lock? */ ++ if (!page->mapping) { ++ unlock_page(page); ++ put_page(page); ++ return NULL; ++ } ++ ++ /* Did somebody else fill it already? */ ++ if (PageUptodate(page)) { ++ unlock_page(page); ++ return page; ++ } ++ ++ return generic_file_buffered_read_readpage(filp, mapping, page); ++} ++ ++static struct page * ++generic_file_buffered_read_no_cached_page(struct kiocb *iocb, ++ struct iov_iter *iter) ++{ ++ struct file *filp = iocb->ki_filp; ++ struct address_space *mapping = filp->f_mapping; ++ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; ++ struct page *page; ++ int error; ++ ++ /* ++ * Ok, it wasn't cached, so we need to create a new ++ * page.. ++ */ ++ page = page_cache_alloc(mapping); ++ if (!page) ++ return ERR_PTR(-ENOMEM); ++ ++ error = add_to_page_cache_lru(page, mapping, index, ++ mapping_gfp_constraint(mapping, GFP_KERNEL)); ++ if (error) { ++ put_page(page); ++ return error != -EEXIST ? ERR_PTR(error) : NULL; ++ } ++ ++ return generic_file_buffered_read_readpage(filp, mapping, page); ++} ++ ++static int generic_file_buffered_read_get_pages(struct kiocb *iocb, ++ struct iov_iter *iter, ++ struct page **pages, ++ unsigned nr) ++{ ++ struct file *filp = iocb->ki_filp; ++ struct address_space *mapping = filp->f_mapping; ++ struct file_ra_state *ra = &filp->f_ra; ++ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; ++ pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; ++ int i, j, ret, err = 0; ++ ++ nr = min_t(unsigned long, last_index - index, nr); ++find_page: ++ if (fatal_signal_pending(current)) ++ return -EINTR; ++ ++ ret = find_get_pages_contig(mapping, index, nr, pages); ++ if (ret) ++ goto got_pages; ++ ++ if (iocb->ki_flags & IOCB_NOWAIT) ++ return -EAGAIN; ++ ++ page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); ++ ++ ret = find_get_pages_contig(mapping, index, nr, pages); ++ if (ret) ++ goto got_pages; ++ ++ pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); ++ err = PTR_ERR_OR_ZERO(pages[0]); ++ ret = !IS_ERR_OR_NULL(pages[0]); ++got_pages: ++ for (i = 0; i < ret; i++) { ++ struct page *page = pages[i]; ++ pgoff_t pg_index = index +i; ++ loff_t pg_pos = max(iocb->ki_pos, ++ (loff_t) pg_index << PAGE_SHIFT); ++ loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; ++ ++ if (PageReadahead(page)) ++ page_cache_async_readahead(mapping, ra, filp, page, ++ pg_index, last_index - pg_index); ++ ++ if (!PageUptodate(page)) { ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ for (j = i; j < ret; j++) ++ put_page(pages[j]); ++ ret = i; ++ err = -EAGAIN; ++ break; ++ } ++ ++ page = generic_file_buffered_read_pagenotuptodate(filp, ++ iter, page, pg_pos, pg_count); ++ if (IS_ERR_OR_NULL(page)) { ++ for (j = i + 1; j < ret; j++) ++ put_page(pages[j]); ++ ret = i; ++ err = PTR_ERR_OR_ZERO(page); ++ break; ++ } ++ } ++ } ++ ++ if (likely(ret)) ++ return ret; ++ if (err) ++ return err; ++ goto find_page; ++} ++ + /** + * generic_file_buffered_read - generic file read routine + * @iocb: the iocb to read +@@ -1995,252 +2311,108 @@ static ssize_t generic_file_buffered_read(struct kiocb *iocb, + struct iov_iter *iter, ssize_t written) + { + struct file *filp = iocb->ki_filp; ++ struct file_ra_state *ra = &filp->f_ra; + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; +- struct file_ra_state *ra = &filp->f_ra; 
+- loff_t *ppos = &iocb->ki_pos; +- pgoff_t index; +- pgoff_t last_index; +- pgoff_t prev_index; +- unsigned long offset; /* offset into pagecache page */ +- unsigned int prev_offset; +- int error = 0; +- +- if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) ++ size_t orig_count = iov_iter_count(iter); ++ struct page *page_array[8], **pages; ++ unsigned nr_pages = ARRAY_SIZE(page_array); ++ unsigned read_nr_pages = ((iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT) - ++ (iocb->ki_pos >> PAGE_SHIFT); ++ int i, pg_nr, error = 0; ++ bool writably_mapped; ++ loff_t isize, end_offset; ++ ++ if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) + return 0; + iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + +- index = *ppos >> PAGE_SHIFT; +- prev_index = ra->prev_pos >> PAGE_SHIFT; +- prev_offset = ra->prev_pos & (PAGE_SIZE-1); +- last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; +- offset = *ppos & ~PAGE_MASK; +- +- for (;;) { +- struct page *page; +- pgoff_t end_index; +- loff_t isize; +- unsigned long nr, ret; ++ if (read_nr_pages > nr_pages && ++ (pages = kmalloc_array(read_nr_pages, sizeof(void *), GFP_KERNEL))) ++ nr_pages = read_nr_pages; ++ else ++ pages = page_array; + ++ do { + cond_resched(); +-find_page: +- if (fatal_signal_pending(current)) { +- error = -EINTR; +- goto out; +- } + +- page = find_get_page(mapping, index); +- if (!page) { +- if (iocb->ki_flags & IOCB_NOWAIT) +- goto would_block; +- page_cache_sync_readahead(mapping, +- ra, filp, +- index, last_index - index); +- page = find_get_page(mapping, index); +- if (unlikely(page == NULL)) +- goto no_cached_page; +- } +- if (PageReadahead(page)) { +- page_cache_async_readahead(mapping, +- ra, filp, page, +- index, last_index - index); ++ i = 0; ++ pg_nr = generic_file_buffered_read_get_pages(iocb, iter, ++ pages, nr_pages); ++ if (pg_nr < 0) { ++ error = pg_nr; ++ break; + } +- if (!PageUptodate(page)) { +- if (iocb->ki_flags & IOCB_NOWAIT) { +- put_page(page); +- goto would_block; +- } + +- /* +- * See comment in do_read_cache_page on why +- * wait_on_page_locked is used to avoid unnecessarily +- * serialisations and why it's safe. +- */ +- error = wait_on_page_locked_killable(page); +- if (unlikely(error)) +- goto readpage_error; +- if (PageUptodate(page)) +- goto page_ok; +- +- if (inode->i_blkbits == PAGE_SHIFT || +- !mapping->a_ops->is_partially_uptodate) +- goto page_not_up_to_date; +- /* pipes can't handle partially uptodate pages */ +- if (unlikely(iov_iter_is_pipe(iter))) +- goto page_not_up_to_date; +- if (!trylock_page(page)) +- goto page_not_up_to_date; +- /* Did it get truncated before we got the lock? */ +- if (!page->mapping) +- goto page_not_up_to_date_locked; +- if (!mapping->a_ops->is_partially_uptodate(page, +- offset, iter->count)) +- goto page_not_up_to_date_locked; +- unlock_page(page); +- } +-page_ok: + /* +- * i_size must be checked after we know the page is Uptodate. ++ * i_size must be checked after we know the pages are Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). 
+ */ +- + isize = i_size_read(inode); +- end_index = (isize - 1) >> PAGE_SHIFT; +- if (unlikely(!isize || index > end_index)) { +- put_page(page); +- goto out; +- } ++ if (unlikely(iocb->ki_pos >= isize)) ++ goto put_pages; + +- /* nr is the maximum number of bytes to copy from this page */ +- nr = PAGE_SIZE; +- if (index == end_index) { +- nr = ((isize - 1) & ~PAGE_MASK) + 1; +- if (nr <= offset) { +- put_page(page); +- goto out; +- } +- } +- nr = nr - offset; ++ end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); + +- /* If users can be writing to this page using arbitrary +- * virtual addresses, take care about potential aliasing +- * before reading the page on the kernel side. +- */ +- if (mapping_writably_mapped(mapping)) +- flush_dcache_page(page); ++ while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > ++ (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) ++ put_page(pages[--pg_nr]); + + /* +- * When a sequential read accesses a page several times, +- * only mark it as accessed the first time. ++ * Once we start copying data, we don't want to be touching any ++ * cachelines that might be contended: + */ +- if (prev_index != index || offset != prev_offset) +- mark_page_accessed(page); +- prev_index = index; ++ writably_mapped = mapping_writably_mapped(mapping); + + /* +- * Ok, we have the page, and it's up-to-date, so +- * now we can copy it to user space... ++ * When a sequential read accesses a page several times, only ++ * mark it as accessed the first time. + */ ++ if (iocb->ki_pos >> PAGE_SHIFT != ++ ra->prev_pos >> PAGE_SHIFT) ++ mark_page_accessed(pages[0]); ++ for (i = 1; i < pg_nr; i++) ++ mark_page_accessed(pages[i]); ++ ++ for (i = 0; i < pg_nr; i++) { ++ unsigned offset = iocb->ki_pos & ~PAGE_MASK; ++ unsigned bytes = min_t(loff_t, end_offset - iocb->ki_pos, ++ PAGE_SIZE - offset); ++ unsigned copied; + +- ret = copy_page_to_iter(page, offset, nr, iter); +- offset += ret; +- index += offset >> PAGE_SHIFT; +- offset &= ~PAGE_MASK; +- prev_offset = offset; +- +- put_page(page); +- written += ret; +- if (!iov_iter_count(iter)) +- goto out; +- if (ret < nr) { +- error = -EFAULT; +- goto out; +- } +- continue; +- +-page_not_up_to_date: +- /* Get exclusive access to the page ... */ +- error = lock_page_killable(page); +- if (unlikely(error)) +- goto readpage_error; +- +-page_not_up_to_date_locked: +- /* Did it get truncated before we got the lock? */ +- if (!page->mapping) { +- unlock_page(page); +- put_page(page); +- continue; +- } ++ /* ++ * If users can be writing to this page using arbitrary ++ * virtual addresses, take care about potential aliasing ++ * before reading the page on the kernel side. ++ */ ++ if (writably_mapped) ++ flush_dcache_page(pages[i]); + +- /* Did somebody else fill it already? */ +- if (PageUptodate(page)) { +- unlock_page(page); +- goto page_ok; +- } ++ copied = copy_page_to_iter(pages[i], offset, bytes, iter); + +-readpage: +- /* +- * A previous I/O error may have been due to temporary +- * failures, eg. multipath errors. +- * PG_error will be set again if readpage fails. +- */ +- ClearPageError(page); +- /* Start the actual read. The read will unlock the page. 
*/ +- error = mapping->a_ops->readpage(filp, page); ++ iocb->ki_pos += copied; ++ ra->prev_pos = iocb->ki_pos; + +- if (unlikely(error)) { +- if (error == AOP_TRUNCATED_PAGE) { +- put_page(page); +- error = 0; +- goto find_page; +- } +- goto readpage_error; +- } +- +- if (!PageUptodate(page)) { +- error = lock_page_killable(page); +- if (unlikely(error)) +- goto readpage_error; +- if (!PageUptodate(page)) { +- if (page->mapping == NULL) { +- /* +- * invalidate_mapping_pages got it +- */ +- unlock_page(page); +- put_page(page); +- goto find_page; +- } +- unlock_page(page); +- shrink_readahead_size_eio(ra); +- error = -EIO; +- goto readpage_error; ++ if (copied < bytes) { ++ error = -EFAULT; ++ break; + } +- unlock_page(page); + } ++put_pages: ++ for (i = 0; i < pg_nr; i++) ++ put_page(pages[i]); ++ } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); + +- goto page_ok; +- +-readpage_error: +- /* UHHUH! A synchronous read error occurred. Report it */ +- put_page(page); +- goto out; +- +-no_cached_page: +- /* +- * Ok, it wasn't cached, so we need to create a new +- * page.. +- */ +- page = page_cache_alloc(mapping); +- if (!page) { +- error = -ENOMEM; +- goto out; +- } +- error = add_to_page_cache_lru(page, mapping, index, +- mapping_gfp_constraint(mapping, GFP_KERNEL)); +- if (error) { +- put_page(page); +- if (error == -EEXIST) { +- error = 0; +- goto find_page; +- } +- goto out; +- } +- goto readpage; +- } ++ file_accessed(filp); ++ written += orig_count - iov_iter_count(iter); + +-would_block: +- error = -EAGAIN; +-out: +- ra->prev_pos = prev_index; +- ra->prev_pos <<= PAGE_SHIFT; +- ra->prev_pos |= prev_offset; ++ if (pages != page_array) ++ kfree(pages); + +- *ppos = ((loff_t)index << PAGE_SHIFT) + offset; +- file_accessed(filp); + return written ? 
written : error; + } + +diff --git a/mm/gup.c b/mm/gup.c +index 87a6a59fe667..6ecc36d28c04 100644 +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1093,6 +1093,13 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + } + cond_resched(); + ++ if (current->faults_disabled_mapping && ++ vma->vm_file && ++ vma->vm_file->f_mapping == current->faults_disabled_mapping) { ++ ret = -EFAULT; ++ goto out; ++ } ++ + page = follow_page_mask(vma, start, foll_flags, &ctx); + if (!page) { + ret = faultin_page(tsk, vma, start, &foll_flags, +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 7326b54ab728..bdc2eb057b3b 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -2467,20 +2467,19 @@ int __set_page_dirty_nobuffers(struct page *page) + lock_page_memcg(page); + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); +- unsigned long flags; + + if (!mapping) { + unlock_page_memcg(page); + return 1; + } + +- xa_lock_irqsave(&mapping->i_pages, flags); ++ xa_lock_irq(&mapping->i_pages); + BUG_ON(page_mapping(page) != mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); + account_page_dirtied(page, mapping); + __xa_set_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); +- xa_unlock_irqrestore(&mapping->i_pages, flags); ++ xa_unlock_irq(&mapping->i_pages); + unlock_page_memcg(page); + + if (mapping->host) { diff --git a/linux-tkg/linux-tkg-patches/5.7/0009-glitched-bmq.patch b/linux-tkg/linux-tkg-patches/5.7/0009-glitched-bmq.patch new file mode 100644 index 0000000..38666e4 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0009-glitched-bmq.patch @@ -0,0 +1,90 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - BMQ + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. diff --git a/linux-tkg/linux-tkg-patches/5.7/0009-glitched-ondemand-bmq.patch b/linux-tkg/linux-tkg-patches/5.7/0009-glitched-ondemand-bmq.patch new file mode 100644 index 0000000..a926040 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0009-glitched-ondemand-bmq.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux-tkg/linux-tkg-patches/5.7/0009-prjc_v5.7-r3.patch b/linux-tkg/linux-tkg-patches/5.7/0009-prjc_v5.7-r3.patch new file mode 100644 index 0000000..d95c1c6 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0009-prjc_v5.7-r3.patch @@ -0,0 +1,7817 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 5e2ce88d6eda..eda08ad54201 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4445,6 +4445,12 @@ + + sbni= [NET] Granch SBNI12 leased line adapter + ++ sched_timeslice= ++ [KNL] Time slice in us for BMQ scheduler. ++ Format: (must be >= 1000) ++ Default: 4000 ++ See Documentation/scheduler/sched-BMQ.txt ++ + sched_debug [KNL] Enables verbose scheduler debug messages. + + schedstats= [KNL,X86] Enable or disable scheduled statistics. +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 0d427fd10941..e0e112c68fa5 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -1230,3 +1230,13 @@ is 10 seconds. + + The softlockup threshold is (``2 * watchdog_thresh``). Setting this + tunable to zero will disable lockup detection altogether. ++ ++yield_type: ++=========== ++ ++BMQ CPU scheduler only. This determines what type of yield calls to ++sched_yield will perform. ++ ++ 0 - No yield. ++ 1 - Deboost and requeue task. (default) ++ 2 - Set run queue skip task. 
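The two tunables documented in the hunks just above are the main runtime knobs this scheduler exposes: the sched_timeslice= boot parameter takes a value in microseconds (minimum 1000, default 4000, i.e. 4 ms), and the yield_type sysctl selects what sched_yield() does (0, 1 or 2). The stand-alone C sketch below only restates the timeslice rule as written in that documentation; the helper name set_sched_timeslice and the user-space framing are illustrative and are not the kernel's early_param handler. Likewise, on a running kernel yield_type would normally be changed through the kernel sysctl interface (e.g. /proc/sys/kernel/yield_type, a path assumed from the sysctl documentation above, not spelled out in the hunk).

/*
 * Minimal illustration of the sched_timeslice= rule described above:
 * the value is given in microseconds, anything below 1000 is ignored,
 * and the default corresponds to 4 ms.  Stand-alone sketch, not the
 * in-kernel parameter handler.
 */
#include <stdio.h>
#include <stdlib.h>

static unsigned long long timeslice_ns = 4ULL * 1000 * 1000;	/* default: 4 ms */

static void set_sched_timeslice(const char *arg)
{
	long us = strtol(arg, NULL, 10);

	if (us >= 1000)		/* values below 1000 us keep the default */
		timeslice_ns = (unsigned long long)us * 1000;
}

int main(int argc, char **argv)
{
	if (argc > 1)
		set_sched_timeslice(argv[1]);
	printf("BMQ timeslice: %llu ns\n", timeslice_ns);
	return 0;
}

Compile with any C compiler and pass the microsecond value as the first argument (for example ./a.out 2000) to see the resulting nanosecond slice.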
+diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt +new file mode 100644 +index 000000000000..05c84eec0f31 +--- /dev/null ++++ b/Documentation/scheduler/sched-BMQ.txt +@@ -0,0 +1,110 @@ ++ BitMap queue CPU Scheduler ++ -------------------------- ++ ++CONTENT ++======== ++ ++ Background ++ Design ++ Overview ++ Task policy ++ Priority management ++ BitMap Queue ++ CPU Assignment and Migration ++ ++ ++Background ++========== ++ ++BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution ++of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), ++and inspired by Zircon scheduler. The goal of it is to keep the scheduler code ++simple, while efficiency and scalable for interactive tasks, such as desktop, ++movie playback and gaming etc. ++ ++Design ++====== ++ ++Overview ++-------- ++ ++BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, ++each CPU is responsible for scheduling the tasks that are putting into it's ++run queue. ++ ++The run queue is a set of priority queues. Note that these queues are fifo ++queue for non-rt tasks or priority queue for rt tasks in data structure. See ++BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact ++that most applications are non-rt tasks. No matter the queue is fifo or ++priority, In each queue is an ordered list of runnable tasks awaiting execution ++and the data structures are the same. When it is time for a new task to run, ++the scheduler simply looks the lowest numbered queueue that contains a task, ++and runs the first task from the head of that queue. And per CPU idle task is ++also in the run queue, so the scheduler can always find a task to run on from ++its run queue. ++ ++Each task will assigned the same timeslice(default 4ms) when it is picked to ++start running. Task will be reinserted at the end of the appropriate priority ++queue when it uses its whole timeslice. When the scheduler selects a new task ++from the priority queue it sets the CPU's preemption timer for the remainder of ++the previous timeslice. When that timer fires the scheduler will stop execution ++on that task, select another task and start over again. ++ ++If a task blocks waiting for a shared resource then it's taken out of its ++priority queue and is placed in a wait queue for the shared resource. When it ++is unblocked it will be reinserted in the appropriate priority queue of an ++eligible CPU. ++ ++Task policy ++----------- ++ ++BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the ++mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's ++NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each ++policy. ++ ++DEADLINE ++ It is squashed as priority 0 FIFO task. ++ ++FIFO/RR ++ All RT tasks share one single priority queue in BMQ run queue designed. The ++complexity of insert operation is O(n). BMQ is not designed for system runs ++with major rt policy tasks. ++ ++NORMAL/BATCH/IDLE ++ BATCH and IDLE tasks are treated as the same policy. They compete CPU with ++NORMAL policy tasks, but they just don't boost. To control the priority of ++NORMAL/BATCH/IDLE tasks, simply use nice level. ++ ++ISO ++ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy ++task instead. ++ ++Priority management ++------------------- ++ ++RT tasks have priority from 0-99. For non-rt tasks, there are three different ++factors used to determine the effective priority of a task. 
The effective ++priority being what is used to determine which queue it will be in. ++ ++The first factor is simply the task’s static priority. Which is assigned from ++task's nice level, within [-20, 19] in userland's point of view and [0, 39] ++internally. ++ ++The second factor is the priority boost. This is a value bounded between ++[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is ++modified by the following cases: ++ ++*When a thread has used up its entire timeslice, always deboost its boost by ++increasing by one. ++*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, ++and its switch-in time(time after last switch and run) below the thredhold ++based on its priority boost, will boost its boost by decreasing by one buti is ++capped at 0 (won’t go negative). ++ ++The intent in this system is to ensure that interactive threads are serviced ++quickly. These are usually the threads that interact directly with the user ++and cause user-perceivable latency. These threads usually do little work and ++spend most of their time blocked awaiting another user event. So they get the ++priority boost from unblocking while background threads that do most of the ++processing receive the priority penalty for using their entire timeslice. +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
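To make the priority-management rules from the sched-BMQ.txt text above concrete, the stand-alone C model below condenses them: RT tasks keep their static priority, non-RT tasks add a boost value bounded by MAX_PRIORITY_ADJ, the boost is decreased (a lower value means a better queue) on quick interactive switch-ins and increased again when a task burns its whole timeslice. The struct demo_task and the helper names are illustrative; the arithmetic mirrors the task_sched_prio()/boost_prio logic added elsewhere in this patch, with the SCHED_NORMAL bounds applied and the BATCH/IDLE special cases and the switch-in timing check omitted.

/*
 * Condensed model of the BMQ priority rules described above.
 * Constants mirror the patch (MAX_PRIORITY_ADJ = 4, effective
 * priority = static priority + boost for non-RT tasks); the struct
 * and helpers are simplified stand-ins, not kernel code.
 */
#include <stdio.h>

#define MAX_RT_PRIO		100
#define MAX_PRIORITY_ADJ	4

struct demo_task {
	int prio;	/* static priority: <100 is RT, >=100 comes from nice */
	int boost;	/* bounded in [-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] */
};

/* Effective priority: RT tasks keep their prio, others add the boost. */
static int effective_prio(const struct demo_task *p)
{
	return p->prio < MAX_RT_PRIO ? p->prio : p->prio + p->boost;
}

/* Quick interactive switch-in: strengthen the boost (lower = better queue). */
static void boost(struct demo_task *p)
{
	if (p->boost > -MAX_PRIORITY_ADJ)
		p->boost--;
}

/* Whole timeslice consumed: weaken the boost. */
static void deboost(struct demo_task *p)
{
	if (p->boost < MAX_PRIORITY_ADJ)
		p->boost++;
}

int main(void)
{
	struct demo_task t = { .prio = 120, .boost = 0 };	/* nice 0 */

	deboost(&t);					/* used a full slice */
	printf("after deboost: %d\n", effective_prio(&t));	/* 121 */
	boost(&t);
	boost(&t);					/* two quick wakeups */
	printf("after boosts:  %d\n", effective_prio(&t));	/* 119 */
	return 0;
}

Keeping the boost inside a small signed window means an interactive task can climb at most a few queues above its nice level, so a task cannot gain unbounded priority by sleeping strategically, which is the intent stated in the documentation above.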
+diff --git a/fs/proc/base.c b/fs/proc/base.c +index eb2255e95f62..62b8cedbccb6 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h +index 8874f681b056..59eb72bf7d5f 100644 +--- a/include/asm-generic/resource.h ++++ b/include/asm-generic/resource.h +@@ -23,7 +23,7 @@ + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ + [RLIMIT_SIGPENDING] = { 0, 0 }, \ + [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ +- [RLIMIT_NICE] = { 0, 0 }, \ ++ [RLIMIT_NICE] = { 30, 30 }, \ + [RLIMIT_RTPRIO] = { 0, 0 }, \ + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ + } +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4418f5cb8324..1e8030513489 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -652,13 +652,18 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) + struct llist_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; + #endif ++#ifndef CONFIG_SCHED_ALT + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; +@@ -672,6 +677,7 @@ struct task_struct { + */ + int recent_used_cpu; + int wake_cpu; ++#endif /* !CONFIG_SCHED_ALT */ + #endif + int on_rq; + +@@ -680,13 +686,25 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_ALT ++ u64 last_ran; ++ s64 time_slice; ++ int boost_prio; ++#ifdef CONFIG_SCHED_BMQ ++ int bmq_idx; ++ struct list_head bmq_node; ++#endif /* CONFIG_SCHED_BMQ */ ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* !CONFIG_SCHED_ALT */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1306,6 +1324,15 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_ALT ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++#endif /* !CONFIG_SCHED_ALT */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..da0306d2fedb 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,20 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_ALT ++ ++#ifdef CONFIG_SCHED_BMQ ++#define __tsk_deadline(p) (0UL) ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return 0; ++} ++#endif ++ ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ 
-19,6 +34,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_ALT */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..ba6fd6a5b4b1 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,11 +20,17 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + ++#ifdef CONFIG_SCHED_ALT ++/* +/- priority levels from the base priority */ ++#define MAX_PRIORITY_ADJ 4 ++#endif ++ + /* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..0a7565d0d3cf 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_ALT + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/init/Kconfig b/init/Kconfig +index 74a5ac65644f..4ef358fc7b51 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -689,9 +689,33 @@ config GENERIC_SCHED_CLOCK + + menu "Scheduler features" + ++menuconfig SCHED_ALT ++ bool "Alternative CPU Schedulers" ++ default y ++ help ++ This feature enable alternative CPU scheduler" ++ ++if SCHED_ALT ++ ++choice ++ prompt "Alternative CPU Scheduler" ++ default SCHED_BMQ ++ ++config SCHED_BMQ ++ bool "BMQ CPU scheduler" ++ help ++ The BitMap Queue CPU scheduler for excellent interactivity and ++ responsiveness on the desktop and solid scalability on normal ++ hardware and commodity servers. ++ ++endchoice ++ ++endif ++ + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_BMQ + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -777,6 +801,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_BMQ + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -878,7 +903,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. 
+ +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_BMQ + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1134,6 +1159,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_BMQ + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index bd403ed3e418..737a814482d6 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -67,9 +67,15 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_ALT ++ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++ .static_prio = DEFAULT_PRIO, ++ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -79,6 +85,14 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifdef CONFIG_SCHED_ALT ++ .boost_prio = 0, ++#ifdef CONFIG_SCHED_BMQ ++ .bmq_idx = 15, ++ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), ++#endif ++ .time_slice = HZ, ++#else + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -86,6 +100,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 729d3a5c772e..1e3dac9b6a43 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? 
+@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index d56fe51bdf07..3aa2c1e822b0 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..4176ad070bc9 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_ALT ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index c9f090d64f00..b5d0c7088021 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -238,6 +238,7 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, + if (left->prio < right->prio) + return 1; + ++#ifndef CONFIG_SCHED_BMQ + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. +@@ -246,6 +247,7 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, + */ + if (dl_prio(left->prio)) + return dl_time_before(left->deadline, right->deadline); ++#endif + + return 0; + } +@@ -257,6 +259,7 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + if (left->prio != right->prio) + return 0; + ++#ifndef CONFIG_SCHED_BMQ + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. 
+@@ -265,6 +268,7 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + */ + if (dl_prio(left->prio)) + return left->deadline == right->deadline; ++#endif + + return 1; + } +@@ -680,7 +684,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. + */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +957,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..1cad9ff599a4 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,14 +16,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_ALT ++obj-y += alt_core.o alt_debug.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c +new file mode 100644 +index 000000000000..48e5fac710bc +--- /dev/null ++++ b/kernel/sched/alt_core.c +@@ -0,0 +1,6057 @@ ++/* ++ * kernel/sched/alt_core.c ++ * ++ * Core alternative kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. 
++ */ ++#include "sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++/* rt_prio(prio) defined in include/linux/sched/rt.h */ ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ ++u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000); ++ ++static int __init sched_timeslice(char *str) ++{ ++ int timeslice_us; ++ ++ get_option(&str, ×lice_us); ++ if (timeslice_us >= 1000) ++ sched_timeslice_ns = timeslice_us * 1000; ++ ++ return 0; ++} ++early_param("sched_timeslice", sched_timeslice); ++ ++/* Reschedule if less than this many μs left */ ++#define RESCHED_NS (100 * 1000) ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Deboost and requeue task. (default) ++ * 2: Set rq skip task. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) ++#define boost_threshold(p) (sched_timeslice_ns >>\ ++ (10 - MAX_PRIORITY_ADJ - (p)->boost_prio)) ++ ++static inline void boost_task(struct task_struct *p) ++{ ++ int limit; ++ ++ switch (p->policy) { ++ case SCHED_NORMAL: ++ limit = -MAX_PRIORITY_ADJ; ++ break; ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ limit = 0; ++ break; ++ default: ++ return; ++ } ++ ++ if (p->boost_prio > limit) ++ p->boost_prio--; ++} ++ ++static inline void deboost_task(struct task_struct *p) ++{ ++ if (p->boost_prio < MAX_PRIORITY_ADJ) ++ p->boost_prio++; ++} ++ ++#ifdef CONFIG_SMP ++static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++#define IDLE_WM (IDLE_TASK_SCHED_PRIO) ++ ++static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; ++static cpumask_t sched_rq_watermark[SCHED_BITS] ____cacheline_aligned_in_smp; ++ ++static inline void update_sched_rq_watermark(struct rq *rq) ++{ ++ unsigned long watermark = find_first_bit(rq->queue.bitmap, SCHED_BITS); ++ unsigned long last_wm = rq->watermark; ++ unsigned long i; ++ int cpu; ++ ++ if (watermark == last_wm) ++ return; ++ ++ rq->watermark = watermark; ++ cpu = cpu_of(rq); ++ if (watermark < last_wm) { ++ for (i = watermark + 1; i <= last_wm; i++) ++ cpumask_andnot(&sched_rq_watermark[i], ++ &sched_rq_watermark[i], cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == last_wm) ++ cpumask_andnot(&sched_sg_idle_mask, ++ &sched_sg_idle_mask, cpu_smt_mask(cpu)); ++#endif ++ return; ++ } ++ /* last_wm < watermark */ ++ for (i = last_wm + 1; i <= watermark; i++) ++ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == watermark) { ++ cpumask_t tmp; ++ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_sg_idle_mask); ++ } ++#endif ++} ++ ++static inline int task_sched_prio(struct task_struct *p) ++{ ++ return (p->prio < MAX_RT_PRIO)? 
p->prio : p->prio + p->boost_prio; ++} ++ ++#include "bmq_imp.h" ++ ++static inline struct task_struct *rq_runnable_task(struct rq *rq) ++{ ++ struct task_struct *next = sched_rq_first_task(rq); ++ ++ if (unlikely(next == rq->skip)) ++ next = sched_rq_next_task(next, rq); ++ ++ return next; ++} ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. ++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. 
++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++static inline void ++rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(&rq->lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. ++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Add/Remove/Requeue task to/from the runqueue routines ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ __SCHED_DEQUEUE_TASK(p, rq, flags, update_sched_rq_watermark(rq)); ++ --rq->nr_running; ++#ifdef CONFIG_SMP ++ if (1 == rq->nr_running) ++ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++} ++ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ __SCHED_ENQUEUE_TASK(p, rq, flags); ++ update_sched_rq_watermark(rq); ++ ++rq->nr_running; ++#ifdef CONFIG_SMP ++ if (2 == rq->nr_running) ++ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. ++ */ ++ if (p->in_iowait) ++ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ lockdep_assert_held(&rq->lock); ++ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ __requeue_task(p, rq); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. 
++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. 
++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(), default_cpu = -1; ++ struct cpumask *mask; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { ++ if (!idle_cpu(cpu)) ++ return cpu; ++ default_cpu = cpu; ++ } ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) ++ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) ++ if (!idle_cpu(i)) ++ return i; ++ ++ if (default_cpu == -1) ++ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ cpu = default_cpu; ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++static inline void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++static inline bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. 
++ */ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (cpu_online(cpu) && !wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++ ++static inline bool got_nohz_idle_kick(void) ++{ ++ int cpu = smp_processor_id(); ++ ++ /* TODO: need to support nohz_flag ++ if (!(atomic_read(nohz_flags(cpu)) & NOHZ_KICK_MASK)) ++ return false; ++ */ ++ ++ if (idle_cpu(cpu) && !need_resched()) ++ return true; ++ ++ /* ++ * We can't run Idle Load Balance on this CPU for this time so we ++ * cancel it and clear NOHZ_BALANCE_KICK ++ */ ++ /* TODO: need to support nohz_flag ++ atomic_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); ++ */ ++ return false; ++} ++ ++#else /* CONFIG_NO_HZ_COMMON */ ++ ++static inline bool got_nohz_idle_kick(void) ++{ ++ return false; ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++#endif /* CONFIG_SMP */ ++ ++static inline void check_preempt_curr(struct rq *rq) ++{ ++ if (sched_rq_first_task(rq) != rq->curr) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * Alt schedule FW doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. ++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) ++ __hrtick_restart(rq); ++ else ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. 
++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ ++ return p->static_prio + MAX_PRIORITY_ADJ; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ cpufreq_update_util(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_util(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 
++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. ++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). */ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ /* ++ * We need to explicitly wake pending tasks before running ++ * __migrate_task() such that we will not miss enforcing cpus_ptr ++ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. ++ */ ++ sched_ttwu_pending(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. 
++ */ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. 
++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_ptr is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. */ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, p->cpus_ptr) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. 
++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask, tmp; ++ ++ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) ++ return select_fallback_rq(task_cpu(p), p); ++ ++ if ( ++#ifdef CONFIG_SCHED_SMT ++ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || ++#endif ++ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || ++ cpumask_and(&tmp, &chk_mask, ++ &sched_rq_watermark[task_sched_prio(p) + 1])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ return best_mask_cpu(task_cpu(p), &chk_mask); ++} ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. 
*/ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else /* CONFIG_SMP */ ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++ ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** Alt schedule FW ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void) ++{ ++ struct rq *rq = this_rq(); ++ struct llist_node *llist = llist_del_all(&rq->wake_list); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ rq_lock_irqsave(rq, &rf); ++ update_rq_clock(rq); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry) ++ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0); ++ check_preempt_curr(rq); ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick()) ++ return; ++ ++ irq_enter(); ++ sched_ttwu_pending(); ++ ++ /* ++ * Check if someone kicked us for doing the nohz idle load balance. 
++ */ ++ if (unlikely(got_nohz_idle_kick())) { ++ /* TODO need to kick off balance ++ this_rq()->idle_balance = 1; ++ raise_softirq_irqoff(SCHED_SOFTIRQ); ++ */ ++ } ++ irq_exit(); ++} ++ ++static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); ++ ++ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { ++ if (!set_nr_if_polling(rq->idle)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++ } ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++#if defined(CONFIG_SMP) ++ if (!cpus_share_cache(smp_processor_id(), cpu)) { ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ ttwu_queue_remote(p, cpu, wake_flags); ++ return; ++ } ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. 
Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. ++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ cpu = task_cpu(p); ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. 
++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) ++ boost_task(p); ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, cpu, wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. 
++ * ++ * __sched_fork() is basic setup used by init_idle() too: ++ */ ++static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ p->on_rq = 0; ++ p->on_cpu = 0; ++ p->utime = 0; ++ p->stime = 0; ++ p->sched_time = 0; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++} ++ ++/* ++ * fork()/clone()-time setup: ++ */ ++int sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ __sched_fork(clone_flags, p); ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ p->boost_prio = (p->boost_prio < 0) ? ++ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, rq->curr->time_slice); ++#endif ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = sched_timeslice_ns; ++ resched_curr(rq); ++ } ++ raw_spin_unlock(&rq->lock); ++ ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. 
++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_ptr can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. 
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. 
++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. 
++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. 
The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * sched_exec - execve() is a valuable balancing opportunity, because at ++ * this point the task has the smallest effective memory and cache ++ * footprint. 
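
[Editor's note] nr_running(), nr_context_switches() and nr_iowait() defined above are what /proc/stat exposes as procs_running, ctxt and procs_blocked. A minimal user-space reader, just to show where these counters surface (illustrative only, not part of the patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/stat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		/* ctxt = nr_context_switches(), procs_running = nr_running(),
		 * procs_blocked = nr_iowait() */
		if (!strncmp(line, "ctxt ", 5) ||
		    !strncmp(line, "procs_running ", 14) ||
		    !strncmp(line, "procs_blocked ", 14))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}
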
++ */ ++void sched_exec(void) ++{ ++ struct task_struct *p = current; ++ int dest_cpu; ++ ++ if (task_rq(p)->nr_running < 2) ++ return; ++ ++ dest_cpu = cpumask_any_and(p->cpus_ptr, &sched_rq_watermark[IDLE_WM]); ++ if ( dest_cpu < nr_cpu_ids) { ++#ifdef CONFIG_SCHED_SMT ++ int smt = cpumask_any_and(p->cpus_ptr, &sched_sg_idle_mask); ++ if (smt < nr_cpu_ids) ++ dest_cpu = smt; ++#endif ++ if (likely(cpu_active(dest_cpu))) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); ++ return; ++ } ++ } ++} ++ ++#endif ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ p->time_slice -= ns; ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++DEFINE_PER_CPU(unsigned long, thermal_pressure); ++ ++void arch_set_thermal_pressure(struct cpumask *cpus, ++ unsigned long th_pressure) ++{ ++ int cpu; ++ ++ for_each_cpu(cpu, cpus) ++ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ update_curr(rq, p); ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks have less than RESCHED_NS of time slice left they will be ++ * rescheduled. ++ */ ++ if (p->time_slice >= RESCHED_NS) ++ return; ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. 
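
[Editor's note] task_sched_runtime() above is the scheduler half of the per-thread CPU clocks; from user space it is ultimately what clock_gettime(CLOCK_THREAD_CPUTIME_ID) reads via the POSIX cpu-timer code. A small illustrative consumer (the busy loop is only there so some runtime gets accounted):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	/* burn a little CPU so there is something to account */
	for (volatile unsigned long i = 0; i < 100000000UL; i++)
		;

	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)
		printf("thread CPU time: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}
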
++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ arch_scale_freq_tick(); ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ scheduler_task_tick(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_SCHED_SMT ++static inline int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ cpumask_t tmp; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* _something_ may have changed the task, double check again */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) { ++ int cpu = cpu_of(rq); ++ int dcpu = __best_mask_cpu(cpu, &tmp, ++ per_cpu(sched_cpu_llc_mask, cpu)); ++ rq = move_queued_task(rq, p, dcpu); ++ } ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* sg_balance_trigger - trigger slibing group balance for @cpu */ ++static inline int sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq= cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ int res; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return 0; ++ curr = rq->curr; ++ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ ++ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ ++ (!rq->active_balance); ++ ++ if (res) ++ rq->active_balance = 1; ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (res) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ return res; ++} ++ ++/* ++ * sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void sg_balance_check(struct rq *rq) ++{ ++ cpumask_t chk; ++ int cpu; ++ ++ /* exit when no sg in idle */ ++ if (cpumask_empty(&sched_sg_idle_mask)) ++ return; ++ ++ cpu = cpu_of(rq); ++ /* ++ * Only cpu in slibing idle group will do the checking and then ++ * find potential cpus which can migrate the current running task ++ */ ++ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && ++ cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && ++ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { ++ int i, tried = 0; ++ ++ for_each_cpu_wrap(i, &chk, cpu) { ++ if (cpumask_subset(cpu_smt_mask(i), &chk)) { ++ if (sg_balance_trigger(i)) ++ return; ++ if (tried) ++ return; ++ tried++; ++ } ++ } ++ } ++} ++#endif /* CONFIG_SCHED_SMT */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. 
++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (!tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ if (cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ if (!is_idle_task(curr)) { ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ delta = rq_clock_task(rq) - curr->last_ran; ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ } ++ scheduler_task_tick(rq); ++ ++ calc_load_nohz_remote(rq); ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? 
++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) ++{ ++ struct task_struct *p, *skip = rq->curr; ++ int nr_migrated = 0; ++ int nr_tries = 
min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); ++ ++ while (skip != rq->idle && nr_tries && ++ (p = sched_rq_next_task(skip, rq)) != rq->idle) { ++ skip = sched_rq_next_task(p, rq); ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { ++ __SCHED_DEQUEUE_TASK(p, rq, 0, ); ++ set_task_cpu(p, dest_cpu); ++ __SCHED_ENQUEUE_TASK(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int take_other_rq_tasks(struct rq *rq, int cpu) ++{ ++ struct cpumask *affinity_mask, *end_mask; ++ ++ if (unlikely(!rq->online)) ++ return 0; ++ ++ if (cpumask_empty(&sched_rq_pending_mask)) ++ return 0; ++ ++ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); ++ do { ++ int i; ++ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { ++ int nr_migrated; ++ struct rq *src_rq; ++ ++ src_rq = cpu_rq(i); ++ if (!do_raw_spin_trylock(&src_rq->lock)) ++ continue; ++ spin_acquire(&src_rq->lock.dep_map, ++ SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) { ++ src_rq->nr_running -= nr_migrated; ++#ifdef CONFIG_SMP ++ if (src_rq->nr_running < 2) ++ cpumask_clear_cpu(i, &sched_rq_pending_mask); ++#endif ++ rq->nr_running += nr_migrated; ++#ifdef CONFIG_SMP ++ if (rq->nr_running > 1) ++ cpumask_set_cpu(cpu, &sched_rq_pending_mask); ++#endif ++ update_sched_rq_watermark(rq); ++ cpufreq_update_util(rq, 0); ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ return 1; ++ } ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ } ++ } while (++affinity_mask < end_mask); ++ ++ return 0; ++} ++#endif ++ ++/* ++ * Timeslices below RESCHED_NS are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. ++ */ ++static inline void check_curr(struct task_struct *p, struct rq *rq) ++{ ++ if (unlikely(rq->idle == p)) ++ return; ++ ++ update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = sched_timeslice_ns; ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { ++ if (SCHED_RR != p->policy) ++ deboost_task(p); ++ requeue_task(p, rq); ++ } ++ } ++} ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next; ++ ++ if (unlikely(rq->skip)) { ++ next = rq_runnable_task(rq); ++ if (next == rq->idle) { ++#ifdef CONFIG_SMP ++ if (!take_other_rq_tasks(rq, cpu)) { ++#endif ++ rq->skip = NULL; ++ schedstat_inc(rq->sched_goidle); ++ return next; ++#ifdef CONFIG_SMP ++ } ++ next = rq_runnable_task(rq); ++#endif ++ } ++ rq->skip = NULL; ++#ifdef CONFIG_HIGH_RES_TIMERS ++ hrtick_start(rq, next->time_slice); ++#endif ++ return next; ++ } ++ ++ next = sched_rq_first_task(rq); ++ if (next == rq->idle) { ++#ifdef CONFIG_SMP ++ if (!take_other_rq_tasks(rq, cpu)) { ++#endif ++ schedstat_inc(rq->sched_goidle); ++ return next; ++#ifdef CONFIG_SMP ++ } ++ next = sched_rq_first_task(rq); ++#endif ++ } ++#ifdef CONFIG_HIGH_RES_TIMERS ++ hrtick_start(rq, next->time_slice); ++#endif ++ return next; ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. 
++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ if (rq_switch_time(rq) < boost_threshold(prev)) ++ boost_task(prev); ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_curr(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ if (likely(prev != next)) { ++ next->last_ran = rq->clock_task; ++ rq->last_ts_switch = rq->clock; ++ ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++ ++#ifdef CONFIG_SCHED_SMT ++ sg_balance_check(rq); ++#endif ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker and because wq_worker_sleeping() ++ * requires it. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. 
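
[Editor's note] Entry point 1 in the comment above (explicit blocking) is the classic prepare-to-wait pattern; every sleep of that kind funnels through schedule() into __schedule(false). A generic kernel-style sketch — my_wq, my_condition and wait_for_my_condition are placeholders, and wait_event_interruptible() wraps the same steps:

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static bool my_condition;

static int wait_for_my_condition(void)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	for (;;) {
		prepare_to_wait(&my_wq, &wait, TASK_INTERRUPTIBLE);
		if (my_condition)
			break;
		if (signal_pending(current)) {
			ret = -ERESTATSYS + 0 * 0; /* see note below */
			ret = -ERESTARTSYS;
			break;
		}
		schedule();	/* blocks; deactivate_task() and __schedule(false) above */
	}
	finish_wait(&my_wq, &wait);
	return ret;
}

/* The waker side sets my_condition = true and calls wake_up(&my_wq). */
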
++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. 
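
[Editor's note] The usual way code reaches preempt_schedule() above is simply the closing preempt_enable() of a critical section after need_resched was set in between. A trivial sketch — my_touch_percpu and the pr_info are only illustrative:

#include <linux/preempt.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void my_touch_percpu(void)
{
	int cpu = get_cpu();	/* preempt_disable() */

	pr_info("running on CPU%d\n", cpu);

	/*
	 * put_cpu() is preempt_enable(); if NEED_RESCHED was set while
	 * preemption was off, this is where preempt_schedule() runs.
	 */
	put_cpu();
}
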
++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* Trigger resched if task sched_prio has been modified. */ ++ if (task_on_rq_queued(p) && sched_task_need_requeue(p)) { ++ requeue_task(p, rq); ++ check_preempt_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. 
++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ p->static_prio = NICE_TO_PRIO(nice); ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->prio = effective_prio(p); ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. 
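
[Editor's note] rt_mutex_setprio() above is the kernel end of priority inheritance; from user space it is reached through PI futexes, for example a pthread mutex initialized with PTHREAD_PRIO_INHERIT. A minimal sketch (error handling trimmed; build with -lpthread):

#include <pthread.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t lock;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&lock, &attr);
	pthread_mutexattr_destroy(&attr);

	/* while held, a blocked higher-priority waiter lends us its priority */
	pthread_mutex_lock(&lock);
	pthread_mutex_unlock(&lock);

	pthread_mutex_destroy(&lock);
	return 0;
}
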
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ if (p->prio < MAX_RT_PRIO) ++ return (p->prio - MAX_RT_PRIO); ++ return (p->prio - MAX_RT_PRIO + p->boost_prio); ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (rq->curr != rq->idle) ++ return 0; ++ ++ if (rq->nr_running) ++ return 0; ++ ++#ifdef CONFIG_SMP ++ if (!llist_empty(&rq->wake_list)) ++ return 0; ++#endif ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
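
[Editor's note] The nice(2) path above (clamp, can_nice(), security hook, set_user_nice()) is visible through the libc wrapper; note that nice() can legitimately return -1, so errno has to be checked. Illustrative only:

#include <errno.h>
#include <stdio.h>
#include <sys/resource.h>
#include <unistd.h>

int main(void)
{
	int ret;

	errno = 0;
	ret = nice(5);			/* drop our priority by 5 nice levels */
	if (ret == -1 && errno != 0)
		perror("nice");
	else
		printf("new nice value: %d\n", ret);

	/* same -1/errno caveat applies to getpriority(); kept simple here */
	printf("getpriority says: %d\n", getpriority(PRIO_PROCESS, 0));
	return 0;
}
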
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. ++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. ++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. 
++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. ++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. 
++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. 
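
[Editor's note] sys_sched_setattr above has historically had no glibc wrapper, so callers go through syscall(2) with a hand-declared attribute struct. A hedged sketch — struct my_sched_attr below mirrors the SCHED_ATTR_SIZE_VER0 layout as I understand it, but include/uapi/linux/sched/types.h is the authoritative definition, and the call needs CAP_SYS_NICE (or a suitable RLIMIT_RTPRIO) for RT policies:

#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

struct my_sched_attr {		/* mirrors struct sched_attr, VER0 (48 bytes) */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct my_sched_attr attr = {
		.size           = sizeof(attr),
		.sched_policy   = SCHED_FIFO,
		.sched_priority = 10,
	};

	/* pid 0 means the calling thread; flags must be 0 */
	if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1)
		perror("sched_setattr");
	return 0;
}
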
++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. 
++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (p->sched_reset_on_fork) ++ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ if (task_has_rt_policy(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
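
[Editor's note] The affinity syscalls above are normally reached through the glibc wrappers and the CPU_* macros; the kernel side then funnels into __set_cpus_allowed_ptr() as shown. A minimal illustration that pins the calling thread to CPU 0:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);		/* run only on CPU 0 */

	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return 1;
	}

	if (sched_getaffinity(0, sizeof(set), &set) == 0)
		printf("pinned: CPU0 %s in mask\n",
		       CPU_ISSET(0, &set) ? "is" : "is not");
	return 0;
}
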
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ schedstat_inc(rq->yld_count); ++ ++ if (1 == sched_yield_type) { ++ if (!rt_task(current)) { ++ current->boost_prio = MAX_PRIORITY_ADJ; ++ requeue_task(current, rq); ++ } ++ } else if (2 == sched_yield_type) { ++ if (rq->nr_running > 1) ++ rq->skip = current; ++ } ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In Alt schedule FW, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(sched_timeslice_ns); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* TODO: Alt schedule FW should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: CPU the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ __sched_fork(0, idle); ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ sched_queue_init_idle(rq, idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. 
++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(current != this_rq()->idle); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ finish_arch_post_lock_switch(); ++ } ++ ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ p = sched_rq_first_task(rq); ++ while (p != rq->idle) { ++ int dest_cpu; ++ ++ /* skip the running task */ ++ if (task_running(p) || 1 == p->nr_cpus_allowed) { ++ p = sched_rq_next_task(p, rq); ++ continue; ++ } ++ ++ /* ++ * Rules for changing task_struct::cpus_allowed are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. ++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ p = sched_rq_next_task(p, rq); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ p = sched_rq_first_task(rq); ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. 
++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { ++ static_branch_dec_cpuslocked(&sched_smt_present); ++ if (!static_branch_likely(&sched_smt_present)) ++ cpumask_clear(&sched_sg_idle_mask); ++ } ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_ttwu_pending(); ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); ++ /*per_cpu(sd_llc_id, cpu) = cpu;*/ ++ } ++} ++ ++#define TOPOLOGY_CPUMASK(name, mask, last) \ ++ if (cpumask_and(chk, chk, mask)) \ ++ printk(KERN_INFO "sched: cpu#%02d affinity mask: 0x%08lx - "#name,\ ++ cpu, (chk++)->bits[0]); \ ++ if (!last) \ ++ cpumask_complement(chk, mask) ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ /* take chance to reset time slice for idle tasks */ ++ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; ++ ++ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ ++ cpumask_complement(chk, cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); ++#endif ++ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); ++ per_cpu(sched_cpu_llc_mask, cpu) = chk; ++ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); ++ ++ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); ++ ++ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); ++ ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; ++ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", ++ cpu, per_cpu(sd_llc_id, cpu), ++ (int) (per_cpu(sched_cpu_llc_mask, cpu) - ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]))); ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ 
++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ printk(KERN_INFO ALT_SCHED_VERSION_MSG); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < SCHED_BITS; i++) ++ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ ++ sched_queue_init(rq); ++ rq->watermark = IDLE_WM; ++ rq->skip = NULL; ++ ++ raw_spin_lock_init(&rq->lock); ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++#ifdef CONFIG_SCHED_SMT ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. 
++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. 
Using them for anything else would
++ * be a serious bug, and as a result, they aren't even visible
++ * under any other configuration.
++ */
++
++/**
++ * curr_task - return the current task for a given CPU.
++ * @cpu: the processor in question.
++ *
++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
++ *
++ * Return: The current task for @cpu.
++ */
++struct task_struct *curr_task(int cpu)
++{
++	return cpu_curr(cpu);
++}
++
++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
++
++#ifdef CONFIG_IA64
++/**
++ * ia64_set_curr_task - set the current task for a given CPU.
++ * @cpu: the processor in question.
++ * @p: the task pointer to set.
++ *
++ * Description: This function must only be used when non-maskable interrupts
++ * are serviced on a separate stack. It allows the architecture to switch the
++ * notion of the current task on a CPU in a non-blocking manner. This function
++ * must be called with all CPUs synchronised, and interrupts disabled; the
++ * caller must save the original value of the current task (see
++ * curr_task() above) and restore that value before reenabling interrupts and
++ * re-starting the system.
++ *
++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
++ */
++void ia64_set_curr_task(int cpu, struct task_struct *p)
++{
++	cpu_curr(cpu) = p;
++}
++
++#endif
++
++#ifdef CONFIG_CGROUP_SCHED
++static void sched_free_group(struct task_group *tg)
++{
++	kmem_cache_free(task_group_cache, tg);
++}
++
++/* allocate runqueue etc for a new task group */
++struct task_group *sched_create_group(struct task_group *parent)
++{
++	struct task_group *tg;
++
++	tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
++	if (!tg)
++		return ERR_PTR(-ENOMEM);
++
++	return tg;
++}
++
++void sched_online_group(struct task_group *tg, struct task_group *parent)
++{
++}
++
++/* rcu callback to free various structures associated with a task group */
++static void sched_free_group_rcu(struct rcu_head *rhp)
++{
++	/* Now it should be safe to free those cfs_rqs */
++	sched_free_group(container_of(rhp, struct task_group, rcu));
++}
++
++void sched_destroy_group(struct task_group *tg)
++{
++	/* Wait for possible concurrent references to cfs_rqs to complete */
++	call_rcu(&tg->rcu, sched_free_group_rcu);
++}
++
++void sched_offline_group(struct task_group *tg)
++{
++}
++
++static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
++{
++	return css ? container_of(css, struct task_group, css) : NULL;
++}
++
++static struct cgroup_subsys_state *
++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
++{
++	struct task_group *parent = css_tg(parent_css);
++	struct task_group *tg;
++
++	if (!parent) {
++		/* This is early initialization for the top cgroup */
++		return &root_task_group.css;
++	}
++
++	tg = sched_create_group(parent);
++	if (IS_ERR(tg))
++		return ERR_PTR(-ENOMEM);
++	return &tg->css;
++}
++
++/* Expose task group only after completing cgroup initialization */
++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
++{
++	struct task_group *tg = css_tg(css);
++	struct task_group *parent = css_tg(css->parent);
++
++	if (parent)
++		sched_online_group(tg, parent);
++	return 0;
++}
++
++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
++{
++	struct task_group *tg = css_tg(css);
++
++	sched_offline_group(tg);
++}
++
++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
++{
++	struct task_group *tg = css_tg(css);
++
++	/*
++	 * Relies on the RCU grace period between css_released() and this.
++	 */
++	sched_free_group(tg);
++}
++
++static void cpu_cgroup_fork(struct task_struct *task)
++{
++}
++
++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
++{
++	return 0;
++}
++
++static void cpu_cgroup_attach(struct cgroup_taskset *tset)
++{
++}
++
++static struct cftype cpu_legacy_files[] = {
++	{ }	/* Terminate */
++};
++
++
++static struct cftype cpu_files[] = {
++	{ }	/* terminate */
++};
++
++static int cpu_extra_stat_show(struct seq_file *sf,
++			       struct cgroup_subsys_state *css)
++{
++	return 0;
++}
++
++struct cgroup_subsys cpu_cgrp_subsys = {
++	.css_alloc	= cpu_cgroup_css_alloc,
++	.css_online	= cpu_cgroup_css_online,
++	.css_released	= cpu_cgroup_css_released,
++	.css_free	= cpu_cgroup_css_free,
++	.css_extra_stat_show	= cpu_extra_stat_show,
++	.fork		= cpu_cgroup_fork,
++	.can_attach	= cpu_cgroup_can_attach,
++	.attach		= cpu_cgroup_attach,
++	.legacy_cftypes	= cpu_legacy_files,
++	.dfl_cftypes	= cpu_files,
++	.early_init	= true,
++	.threaded	= true,
++};
++#endif	/* CONFIG_CGROUP_SCHED */
++
++#undef CREATE_TRACE_POINTS
+diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c
+new file mode 100644
+index 000000000000..835e6bb98dda
+--- /dev/null
++++ b/kernel/sched/alt_debug.c
+@@ -0,0 +1,31 @@
++/*
++ * kernel/sched/alt_debug.c
++ *
++ * Print the BMQ debugging details
++ *
++ * Author: Alfred Chen
++ * Date : 2020
++ */
++#include "sched.h"
++
++/*
++ * This allows printing both to /proc/sched_debug and
++ * to the console
++ */
++#define SEQ_printf(m, x...)			\
++ do {						\
++	if (m)					\
++		seq_printf(m, x);		\
++	else					\
++		pr_cont(x);			\
++ } while (0)
++
++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
++			  struct seq_file *m)
++{
++	SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
++		   get_nr_threads(p));
++}
++
++void proc_sched_set_task(struct task_struct *p)
++{}
+diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h
+new file mode 100644
+index 000000000000..2b66983cce42
+--- /dev/null
++++ b/kernel/sched/alt_sched.h
+@@ -0,0 +1,527 @@
++#ifndef ALT_SCHED_H
++#define ALT_SCHED_H
++
++#include
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++
++#ifdef CONFIG_PARAVIRT
++# include
++#endif
++
++#include "cpupri.h"
++
++#ifdef CONFIG_SCHED_BMQ
++#include "bmq.h"
++#endif
++
++/* task_struct::on_rq states: */
++#define TASK_ON_RQ_QUEUED	1
++#define TASK_ON_RQ_MIGRATING	2
++
++static inline int task_on_rq_queued(struct task_struct *p)
++{
++	return p->on_rq == TASK_ON_RQ_QUEUED;
++}
++
++static inline int task_on_rq_migrating(struct task_struct *p)
++{
++	return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING;
++}
++
++/*
++ * wake flags
++ */
++#define WF_SYNC		0x01	/* waker goes to sleep after wakeup */
++#define WF_FORK		0x02	/* child wakeup after fork */
++#define WF_MIGRATED	0x04	/* internal use, task got migrated */
++
++/*
++ * This is the main, per-CPU runqueue data structure.
++ * This data should only be modified by the local cpu.
++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct __rcu *curr; ++ struct task_struct *idle, *stop, *skip; ++ struct mm_struct *prev_mm; ++ ++#ifdef CONFIG_SCHED_BMQ ++ struct bmq queue; ++#endif ++ unsigned long watermark; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 last_ts_switch; ++ u64 clock_task; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_SMP ++ struct llist_head wake_list; ++#endif ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++ ++#ifdef CONFIG_SMP ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern bool sched_smp_initialized; ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++ ++static inline int __best_mask_cpu(int cpu, const cpumask_t *cpumask, ++ const cpumask_t *mask) ++{ ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ return cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ return cpumask_test_cpu(cpu, cpumask)? 
cpu : ++ __best_mask_cpu(cpu, cpumask, &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); ++} ++ ++extern void sched_ttwu_pending(void); ++#else /* !CONFIG_SMP */ ++static inline void sched_ttwu_pending(void) { } ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_tick ++static __always_inline ++void arch_scale_freq_tick(void) ++{ ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : BMQ need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; 
++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); ++} ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++void swake_up_all_locked(struct swait_queue_head *q); ++void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++#endif /* ALT_SCHED_H */ +diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h +new file mode 100644 +index 000000000000..4ce30c30bd3e +--- /dev/null ++++ b/kernel/sched/bmq.h +@@ -0,0 +1,14 @@ ++#ifndef BMQ_H ++#define BMQ_H ++ ++/* bits: ++ * RT(0-99), Low prio adj range, nice width, high prio adj range, cpu idle task */ ++#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH + 2 * MAX_PRIORITY_ADJ + 1) ++#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) ++ ++struct bmq { ++ DECLARE_BITMAP(bitmap, SCHED_BITS); ++ struct list_head heads[SCHED_BITS]; ++}; ++ ++#endif +diff --git a/kernel/sched/bmq_imp.h b/kernel/sched/bmq_imp.h +new file mode 100644 +index 000000000000..cb0fc0688a89 +--- /dev/null ++++ b/kernel/sched/bmq_imp.h +@@ -0,0 +1,86 @@ ++#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler 5.7-r3 by Alfred Chen.\n" ++ ++static inline void sched_queue_init(struct rq *rq) ++{ ++ struct bmq *q = &rq->queue; ++ int i; ++ ++ bitmap_zero(q->bitmap, SCHED_BITS); ++ for(i = 0; i < SCHED_BITS; i++) ++ INIT_LIST_HEAD(&q->heads[i]); ++} ++ ++static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) ++{ ++ struct bmq *q = &rq->queue; ++ ++ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; ++ INIT_LIST_HEAD(&q->heads[idle->bmq_idx]); ++ list_add(&idle->bmq_node, &q->heads[idle->bmq_idx]); ++ set_bit(idle->bmq_idx, q->bitmap); ++} ++ ++/* ++ * This routine used in bmq scheduler only which assume the idle task in the bmq ++ */ ++static inline struct task_struct *sched_rq_first_task(struct rq *rq) ++{ ++ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_BITS); ++ const struct list_head *head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++} ++ ++static inline struct task_struct * ++sched_rq_next_task(struct task_struct *p, struct rq *rq) ++{ ++ unsigned long idx = p->bmq_idx; ++ struct list_head *head = &rq->queue.heads[idx]; ++ ++ if (list_is_last(&p->bmq_node, head)) { ++ idx = find_next_bit(rq->queue.bitmap, SCHED_BITS, idx + 1); ++ head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++ } ++ ++ return list_next_entry(p, bmq_node); ++} ++ ++#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ ++ sched_info_dequeued(rq, p); \ ++ \ ++ list_del(&p->bmq_node); \ ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) { \ ++ clear_bit(p->bmq_idx, rq->queue.bitmap);\ ++ func; \ ++ } ++ ++#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ 
++ sched_info_queued(rq, p); \ ++ psi_enqueue(p, flags); \ ++ \ ++ p->bmq_idx = task_sched_prio(p); \ ++ list_add_tail(&p->bmq_node, &rq->queue.heads[p->bmq_idx]); \ ++ set_bit(p->bmq_idx, rq->queue.bitmap) ++ ++static inline void __requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ int idx = task_sched_prio(p); ++ ++ list_del(&p->bmq_node); ++ list_add_tail(&p->bmq_node, &rq->queue.heads[idx]); ++ if (idx != p->bmq_idx) { ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) ++ clear_bit(p->bmq_idx, rq->queue.bitmap); ++ p->bmq_idx = idx; ++ set_bit(p->bmq_idx, rq->queue.bitmap); ++ update_sched_rq_watermark(rq); ++ } ++} ++ ++static inline bool sched_task_need_requeue(struct task_struct *p) ++{ ++ return (task_sched_prio(p) != p->bmq_idx); ++} +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 7fbaee24c824..0d7ad05b84fe 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_ALT */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. +@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_ALT + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -916,6 +927,7 @@ static int __init sugov_register(void) + core_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_ALT + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -946,4 +958,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_ALT */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index ff9435dee1df..0ee9967d2d74 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + +- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. 
*/ +- if (task_nice(p) > 0) { ++ if (task_running_nice(p)) { + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -658,7 +658,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index b743bf38f08f..472478a4f2a8 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -361,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * idle-task scheduling class. + */ +@@ -481,3 +482,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index b647d04d9c8b..f1983eb87f13 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -250,6 +250,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * sched_entity: + * +@@ -367,6 +368,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_SCHED_THERMAL_PRESSURE + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index eb034d9f024d..49aa805750c5 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_ALT + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_SCHED_THERMAL_PRESSURE + int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); +@@ -37,6 +39,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_ALT + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
+@@ -157,9 +160,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_ALT */ + + #else + ++#ifndef CONFIG_SCHED_ALT + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -177,6 +182,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 1f58677a8f23..682e6b3802c1 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_ALT ++#include "alt_sched.h" ++#else ++ + #include + + #include +@@ -2548,3 +2552,9 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) + + void swake_up_all_locked(struct swait_queue_head *q); + void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (task_nice(p) > 0); ++} ++#endif /* !CONFIG_SCHED_ALT */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..108422ebc7bf 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_ALT + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_ALT + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 8344757bba6e..558ce8a70926 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -4,6 +4,7 @@ + */ + #include "sched.h" + ++#ifndef CONFIG_SCHED_ALT + DEFINE_MUTEX(sched_domains_mutex); + + /* Protected by sched_domains_mutex: */ +@@ -1190,8 +1191,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + */ + + static int default_relax_domain_level = -1; ++#endif /* CONFIG_SCHED_ALT */ + int sched_domain_level_max; + ++#ifndef CONFIG_SCHED_ALT + static int __init setup_relax_domain_level(char *str) + { + if (kstrtoint(str, 0, &default_relax_domain_level)) +@@ -1424,6 +1427,7 @@ sd_init(struct sched_domain_topology_level *tl, + + return sd; + } ++#endif /* CONFIG_SCHED_ALT */ + + /* + * Topology list, bottom-up. 
+@@ -1453,6 +1457,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) + sched_domain_topology = tl; + } + ++#ifndef CONFIG_SCHED_ALT + #ifdef CONFIG_NUMA + + static const struct cpumask *sd_numa_mask(int cpu) +@@ -2327,3 +2332,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + mutex_unlock(&sched_domains_mutex); + } ++#else /* CONFIG_SCHED_ALT */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{} ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++#endif +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 8a176d8727a3..8e2ba49be0e1 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -132,6 +132,10 @@ static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; + static int one_hundred = 100; + static int one_thousand = 1000; ++#ifdef CONFIG_SCHED_ALT ++static int __maybe_unused zero = 0; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -288,7 +292,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_ALT) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -305,6 +309,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_ALT + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -486,6 +491,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_ALT */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1049,6 +1055,17 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_ALT ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index d89da1c7e005..a73adff9f309 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1923,8 +1923,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + int ret = 0; + u64 slack; + ++#ifndef CONFIG_SCHED_ALT + slack = current->timer_slack_ns; + if (dl_task(current) || rt_task(current)) ++#endif + slack = 0; + + hrtimer_init_sleeper_on_stack(&t, clockid, mode); +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 2fd3b3fa68bf..e053bc56c019 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -236,7 +236,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -806,6 +806,7 @@ static void 
collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_ALT + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -813,6 +814,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -840,8 +842,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_ALT + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -855,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +@@ -1091,8 +1095,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_ALT + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index b5e3496cf803..cfbae0a21cef 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_ALT ++ /* No deadline on BMQ, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux-tkg/linux-tkg-patches/5.7/0010-5.7-glitched-cachy.patch b/linux-tkg/linux-tkg-patches/5.7/0010-5.7-glitched-cachy.patch new file mode 100644 index 0000000..c6f0a34 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0010-5.7-glitched-cachy.patch @@ -0,0 +1,3936 @@ +diff --color -rubN linux-5.7.6/include/linux/sched.h linux-5.7.6.cachy/include/linux/sched.h +--- linux-5.7.6/include/linux/sched.h 2020-06-25 01:49:26.000000000 +1000 ++++ linux-5.7.6.cachy/include/linux/sched.h 2020-07-24 17:51:45.879582847 +1000 +@@ -452,9 +452,14 @@ + /* For load-balancing: */ + struct load_weight load; + struct rb_node run_node; ++ ++ struct sched_entity* next[2]; ++ + struct list_head group_node; + unsigned int on_rq; + ++ int quantom; ++ + u64 exec_start; + u64 sum_exec_runtime; + u64 vruntime; +@@ -464,16 +469,6 @@ + + struct sched_statistics statistics; + +-#ifdef CONFIG_FAIR_GROUP_SCHED +- int depth; +- struct sched_entity *parent; +- /* rq on which this entity is (to be) queued: */ +- struct cfs_rq *cfs_rq; +- /* rq "owned" by this entity/group: */ +- struct cfs_rq *my_q; +- /* cached value of my_q->h_nr_running */ +- unsigned long runnable_weight; +-#endif + + #ifdef CONFIG_SMP + /* +diff --color -rubN linux-5.7.6/kernel/sched/core.c linux-5.7.6.cachy/kernel/sched/core.c +--- linux-5.7.6/kernel/sched/core.c 2020-06-25 01:49:26.000000000 +1000 ++++ linux-5.7.6.cachy/kernel/sched/core.c 2020-07-24 17:51:57.991504128 +1000 +@@ -2672,18 +2672,14 @@ + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations 
= 0; + p->se.vruntime = 0; +- INIT_LIST_HEAD(&p->se.group_node); + +-#ifdef CONFIG_FAIR_GROUP_SCHED +- p->se.cfs_rq = NULL; +-#endif ++ INIT_LIST_HEAD(&p->se.group_node); + + #ifdef CONFIG_SCHEDSTATS + /* Even if schedstat is disabled, there should not be garbage */ + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); + #endif + +- RB_CLEAR_NODE(&p->dl.rb_node); + init_dl_task_timer(&p->dl); + init_dl_inactive_task_timer(&p->dl); + __dl_clear_params(p); +@@ -3246,31 +3242,10 @@ + + #ifdef CONFIG_SMP + +-/* rq->lock is NOT held, but preemption is disabled */ +-static void __balance_callback(struct rq *rq) +-{ +- struct callback_head *head, *next; +- void (*func)(struct rq *rq); +- unsigned long flags; +- +- raw_spin_lock_irqsave(&rq->lock, flags); +- head = rq->balance_callback; +- rq->balance_callback = NULL; +- while (head) { +- func = (void (*)(struct rq *))head->func; +- next = head->next; +- head->next = NULL; +- head = next; +- +- func(rq); +- } +- raw_spin_unlock_irqrestore(&rq->lock, flags); +-} ++///* rq->lock is NOT held, but preemption is disabled */ + + static inline void balance_callback(struct rq *rq) + { +- if (unlikely(rq->balance_callback)) +- __balance_callback(rq); + } + + #else +@@ -3606,7 +3581,6 @@ + + #ifdef CONFIG_SMP + rq->idle_balance = idle_cpu(cpu); +- trigger_load_balance(rq); + #endif + } + +@@ -6574,23 +6548,12 @@ + + wait_bit_init(); + +-#ifdef CONFIG_FAIR_GROUP_SCHED +- ptr += 2 * nr_cpu_ids * sizeof(void **); +-#endif + #ifdef CONFIG_RT_GROUP_SCHED + ptr += 2 * nr_cpu_ids * sizeof(void **); + #endif + if (ptr) { + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); + +-#ifdef CONFIG_FAIR_GROUP_SCHED +- root_task_group.se = (struct sched_entity **)ptr; +- ptr += nr_cpu_ids * sizeof(void **); +- +- root_task_group.cfs_rq = (struct cfs_rq **)ptr; +- ptr += nr_cpu_ids * sizeof(void **); +- +-#endif /* CONFIG_FAIR_GROUP_SCHED */ + #ifdef CONFIG_RT_GROUP_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); +@@ -6641,32 +6604,7 @@ + init_cfs_rq(&rq->cfs); + init_rt_rq(&rq->rt); + init_dl_rq(&rq->dl); +-#ifdef CONFIG_FAIR_GROUP_SCHED +- root_task_group.shares = ROOT_TASK_GROUP_LOAD; +- INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); +- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; +- /* +- * How much CPU bandwidth does root_task_group get? +- * +- * In case of task-groups formed thr' the cgroup filesystem, it +- * gets 100% of the CPU resources in the system. This overall +- * system CPU resource is divided among the tasks of +- * root_task_group and its child task-groups in a fair manner, +- * based on each entity's (task or task-group's) weight +- * (se->load.weight). +- * +- * In other words, if root_task_group has 10 tasks of weight +- * 1024) and two child groups A0 and A1 (of weight 1024 each), +- * then A0's share of the CPU resource is: +- * +- * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% +- * +- * We achieve this by letting root_task_group's tasks sit +- * directly in rq->cfs (i.e root_task_group->se[] = NULL). 
+- */ +- init_cfs_bandwidth(&root_task_group.cfs_bandwidth); +- init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); +-#endif /* CONFIG_FAIR_GROUP_SCHED */ ++ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; + #ifdef CONFIG_RT_GROUP_SCHED +diff --color -rubN linux-5.7.6/kernel/sched/debug.c linux-5.7.6.cachy/kernel/sched/debug.c +--- linux-5.7.6/kernel/sched/debug.c 2020-06-25 01:49:26.000000000 +1000 ++++ linux-5.7.6.cachy/kernel/sched/debug.c 2020-07-24 17:52:15.419390856 +1000 +@@ -385,7 +385,7 @@ + return; + + PN(se->exec_start); +- PN(se->vruntime); ++ //PN(se->vruntime); + PN(se->sum_exec_runtime); + + if (schedstat_enabled()) { +@@ -437,9 +437,9 @@ + else + SEQ_printf(m, " %c", task_state_to_char(p)); + +- SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", +- p->comm, task_pid_nr(p), +- SPLIT_NS(p->se.vruntime), ++ SEQ_printf(m, "%15s %5d %9d %9Ld %8d ", ++ p->comm, task_pid_nr(p), p->se.quantom, ++ //SPLIT_NS(p->se.vruntime),%9Ld.%06ld + (long long)(p->nvcsw + p->nivcsw), + p->prio); + +@@ -464,9 +464,9 @@ + + SEQ_printf(m, "\n"); + SEQ_printf(m, "runnable tasks:\n"); +- SEQ_printf(m, " S task PID tree-key switches prio" ++ SEQ_printf(m, " S task PID quantom switches prio" + " wait-time sum-exec sum-sleep\n"); +- SEQ_printf(m, "-------------------------------------------------------" ++ SEQ_printf(m, "--------------------------------------------------------------------" + "----------------------------------------------------\n"); + + rcu_read_lock(); +@@ -481,10 +481,8 @@ + + void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + { +- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, +- spread, rq0_min_vruntime, spread0; + struct rq *rq = cpu_rq(cpu); +- struct sched_entity *last; ++ //struct sched_entity *last; + unsigned long flags; + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -498,26 +496,26 @@ + SPLIT_NS(cfs_rq->exec_clock)); + + raw_spin_lock_irqsave(&rq->lock, flags); +- if (rb_first_cached(&cfs_rq->tasks_timeline)) +- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; +- last = __pick_last_entity(cfs_rq); +- if (last) +- max_vruntime = last->vruntime; +- min_vruntime = cfs_rq->min_vruntime; +- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; ++ //if (rb_first_cached(&cfs_rq->tasks_timeline)) ++ //MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; ++ //last = __pick_last_entity(cfs_rq); ++ //if (last) ++ //max_vruntime = last->vruntime; ++ //min_vruntime = cfs_rq->min_vruntime; ++ //rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; + raw_spin_unlock_irqrestore(&rq->lock, flags); +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", +- SPLIT_NS(MIN_vruntime)); +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", +- SPLIT_NS(min_vruntime)); +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", +- SPLIT_NS(max_vruntime)); +- spread = max_vruntime - MIN_vruntime; +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", +- SPLIT_NS(spread)); +- spread0 = min_vruntime - rq0_min_vruntime; +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", +- SPLIT_NS(spread0)); ++ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", ++ //SPLIT_NS(MIN_vruntime)); ++ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", ++ //SPLIT_NS(min_vruntime)); ++ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", ++ //SPLIT_NS(max_vruntime)); ++ //spread = max_vruntime - MIN_vruntime; ++ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", ++ //SPLIT_NS(spread)); ++ //spread0 = min_vruntime - rq0_min_vruntime; ++ //SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", ++ 
//SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", + cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); +@@ -875,7 +873,7 @@ + #define PN_SCHEDSTAT(F) __PSN(#F, schedstat_val(p->F)) + + PN(se.exec_start); +- PN(se.vruntime); ++ //PN(se.vruntime); + PN(se.sum_exec_runtime); + + nr_switches = p->nvcsw + p->nivcsw; +diff --color -rubN linux-5.7.6/kernel/sched/fair.c linux-5.7.6.cachy/kernel/sched/fair.c +--- linux-5.7.6/kernel/sched/fair.c 2020-06-25 01:49:26.000000000 +1000 ++++ linux-5.7.6.cachy/kernel/sched/fair.c 2020-07-24 17:52:09.159431543 +1000 +@@ -86,6 +86,9 @@ + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + ++#define DIR_RIGHT 0 ++#define DIR_LEFT 1 ++ + int sched_thermal_decay_shift; + static int __init setup_sched_thermal_decay_shift(char *str) + { +@@ -259,193 +262,6 @@ + * CFS operations on generic schedulable entities: + */ + +-#ifdef CONFIG_FAIR_GROUP_SCHED +-static inline struct task_struct *task_of(struct sched_entity *se) +-{ +- SCHED_WARN_ON(!entity_is_task(se)); +- return container_of(se, struct task_struct, se); +-} +- +-/* Walk up scheduling entities hierarchy */ +-#define for_each_sched_entity(se) \ +- for (; se; se = se->parent) +- +-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +-{ +- return p->se.cfs_rq; +-} +- +-/* runqueue on which this entity is (to be) queued */ +-static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +-{ +- return se->cfs_rq; +-} +- +-/* runqueue "owned" by this group */ +-static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +-{ +- return grp->my_q; +-} +- +-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len) +-{ +- if (!path) +- return; +- +- if (cfs_rq && task_group_is_autogroup(cfs_rq->tg)) +- autogroup_path(cfs_rq->tg, path, len); +- else if (cfs_rq && cfs_rq->tg->css.cgroup) +- cgroup_path(cfs_rq->tg->css.cgroup, path, len); +- else +- strlcpy(path, "(null)", len); +-} +- +-static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +-{ +- struct rq *rq = rq_of(cfs_rq); +- int cpu = cpu_of(rq); +- +- if (cfs_rq->on_list) +- return rq->tmp_alone_branch == &rq->leaf_cfs_rq_list; +- +- cfs_rq->on_list = 1; +- +- /* +- * Ensure we either appear before our parent (if already +- * enqueued) or force our parent to appear after us when it is +- * enqueued. The fact that we always enqueue bottom-up +- * reduces this to two cases and a special case for the root +- * cfs_rq. Furthermore, it also means that we will always reset +- * tmp_alone_branch either when the branch is connected +- * to a tree or when we reach the top of the tree +- */ +- if (cfs_rq->tg->parent && +- cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { +- /* +- * If parent is already on the list, we add the child +- * just before. Thanks to circular linked property of +- * the list, this means to put the child at the tail +- * of the list that starts by parent. +- */ +- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, +- &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); +- /* +- * The branch is now connected to its tree so we can +- * reset tmp_alone_branch to the beginning of the +- * list. +- */ +- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; +- return true; +- } +- +- if (!cfs_rq->tg->parent) { +- /* +- * cfs rq without parent should be put +- * at the tail of the list. 
+- */ +- list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, +- &rq->leaf_cfs_rq_list); +- /* +- * We have reach the top of a tree so we can reset +- * tmp_alone_branch to the beginning of the list. +- */ +- rq->tmp_alone_branch = &rq->leaf_cfs_rq_list; +- return true; +- } +- +- /* +- * The parent has not already been added so we want to +- * make sure that it will be put after us. +- * tmp_alone_branch points to the begin of the branch +- * where we will add parent. +- */ +- list_add_rcu(&cfs_rq->leaf_cfs_rq_list, rq->tmp_alone_branch); +- /* +- * update tmp_alone_branch to points to the new begin +- * of the branch +- */ +- rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list; +- return false; +-} +- +-static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +-{ +- if (cfs_rq->on_list) { +- struct rq *rq = rq_of(cfs_rq); +- +- /* +- * With cfs_rq being unthrottled/throttled during an enqueue, +- * it can happen the tmp_alone_branch points the a leaf that +- * we finally want to del. In this case, tmp_alone_branch moves +- * to the prev element but it will point to rq->leaf_cfs_rq_list +- * at the end of the enqueue. +- */ +- if (rq->tmp_alone_branch == &cfs_rq->leaf_cfs_rq_list) +- rq->tmp_alone_branch = cfs_rq->leaf_cfs_rq_list.prev; +- +- list_del_rcu(&cfs_rq->leaf_cfs_rq_list); +- cfs_rq->on_list = 0; +- } +-} +- +-static inline void assert_list_leaf_cfs_rq(struct rq *rq) +-{ +- SCHED_WARN_ON(rq->tmp_alone_branch != &rq->leaf_cfs_rq_list); +-} +- +-/* Iterate thr' all leaf cfs_rq's on a runqueue */ +-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \ +- list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \ +- leaf_cfs_rq_list) +- +-/* Do the two (enqueued) entities belong to the same group ? */ +-static inline struct cfs_rq * +-is_same_group(struct sched_entity *se, struct sched_entity *pse) +-{ +- if (se->cfs_rq == pse->cfs_rq) +- return se->cfs_rq; +- +- return NULL; +-} +- +-static inline struct sched_entity *parent_entity(struct sched_entity *se) +-{ +- return se->parent; +-} +- +-static void +-find_matching_se(struct sched_entity **se, struct sched_entity **pse) +-{ +- int se_depth, pse_depth; +- +- /* +- * preemption test can be made between sibling entities who are in the +- * same cfs_rq i.e who have a common parent. Walk up the hierarchy of +- * both tasks until we find their ancestors who are siblings of common +- * parent. 
+- */ +- +- /* First walk up until both entities are at same depth */ +- se_depth = (*se)->depth; +- pse_depth = (*pse)->depth; +- +- while (se_depth > pse_depth) { +- se_depth--; +- *se = parent_entity(*se); +- } +- +- while (pse_depth > se_depth) { +- pse_depth--; +- *pse = parent_entity(*pse); +- } +- +- while (!is_same_group(*se, *pse)) { +- *se = parent_entity(*se); +- *pse = parent_entity(*pse); +- } +-} +- +-#else /* !CONFIG_FAIR_GROUP_SCHED */ + + static inline struct task_struct *task_of(struct sched_entity *se) + { +@@ -506,138 +322,67 @@ + { + } + +-#endif /* CONFIG_FAIR_GROUP_SCHED */ + + static __always_inline + void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec); + +-/************************************************************** +- * Scheduling class tree data structure manipulation methods: ++/* ++ * Enqueue an entity + */ +- +-static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime) ++static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- s64 delta = (s64)(vruntime - max_vruntime); +- if (delta > 0) +- max_vruntime = vruntime; ++ se->next[DIR_RIGHT] = NULL; ++ se->next[DIR_LEFT] = NULL; + +- return max_vruntime; +-} ++ if (likely(cfs_rq->head)) ++ { ++ se->next[DIR_RIGHT] = cfs_rq->head; ++ cfs_rq->head->next[DIR_LEFT] = se; + +-static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +-{ +- s64 delta = (s64)(vruntime - min_vruntime); +- if (delta < 0) +- min_vruntime = vruntime; ++ // lastly reset the head ++ cfs_rq->head = se; + +- return min_vruntime; +-} ++ return; ++ } + +-static inline int entity_before(struct sched_entity *a, +- struct sched_entity *b) +-{ +- return (s64)(a->vruntime - b->vruntime) < 0; ++ // if empty rq ++ cfs_rq->head = se; + } + +-static void update_min_vruntime(struct cfs_rq *cfs_rq) ++static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- struct sched_entity *curr = cfs_rq->curr; +- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); +- +- u64 vruntime = cfs_rq->min_vruntime; + +- if (curr) { +- if (curr->on_rq) +- vruntime = curr->vruntime; +- else +- curr = NULL; ++ // if only one se in rq ++ if (unlikely(cfs_rq->head->next[DIR_RIGHT] == NULL)) ++ cfs_rq->head = NULL; ++ else if (unlikely(se == cfs_rq->head)) ++ { ++ // if it is the head ++ cfs_rq->head = cfs_rq->head->next[DIR_RIGHT]; ++ cfs_rq->head->next[DIR_LEFT] = NULL; + } +- +- if (leftmost) { /* non-empty tree */ +- struct sched_entity *se; +- se = rb_entry(leftmost, struct sched_entity, run_node); +- +- if (!curr) +- vruntime = se->vruntime; + else +- vruntime = min_vruntime(vruntime, se->vruntime); +- } +- +- /* ensure we never gain time by being placed backwards. */ +- cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +-#ifndef CONFIG_64BIT +- smp_wmb(); +- cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +-#endif +-} ++ { ++ // if in the middle ++ struct sched_entity *prev = se->next[DIR_LEFT]; ++ struct sched_entity *next = se->next[DIR_RIGHT]; + +-/* +- * Enqueue an entity into the rb-tree: +- */ +-static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +-{ +- struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; +- struct rb_node *parent = NULL; +- struct sched_entity *entry; +- bool leftmost = true; ++ prev->next[DIR_RIGHT] = next; + +- /* +- * Find the right place in the rbtree: +- */ +- while (*link) { +- parent = *link; +- entry = rb_entry(parent, struct sched_entity, run_node); +- /* +- * We dont care about collisions. 
Nodes with +- * the same key stay together. +- */ +- if (entity_before(se, entry)) { +- link = &parent->rb_left; +- } else { +- link = &parent->rb_right; +- leftmost = false; ++ if (next) ++ next->next[DIR_LEFT] = prev; + } +- } +- +- rb_link_node(&se->run_node, parent, link); +- rb_insert_color_cached(&se->run_node, +- &cfs_rq->tasks_timeline, leftmost); +-} +- +-static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +-{ +- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); + } + + struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) + { +- struct rb_node *left = rb_first_cached(&cfs_rq->tasks_timeline); +- +- if (!left) +- return NULL; +- +- return rb_entry(left, struct sched_entity, run_node); +-} +- +-static struct sched_entity *__pick_next_entity(struct sched_entity *se) +-{ +- struct rb_node *next = rb_next(&se->run_node); +- +- if (!next) +- return NULL; +- +- return rb_entry(next, struct sched_entity, run_node); ++ return cfs_rq->head; + } + + #ifdef CONFIG_SCHED_DEBUG + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + { +- struct rb_node *last = rb_last(&cfs_rq->tasks_timeline.rb_root); +- +- if (!last) +- return NULL; +- +- return rb_entry(last, struct sched_entity, run_node); ++ return cfs_rq->head; + } + + /************************************************************** +@@ -723,16 +468,6 @@ + return slice; + } + +-/* +- * We calculate the vruntime slice of a to-be-inserted task. +- * +- * vs = s/w +- */ +-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) +-{ +- return calc_delta_fair(sched_slice(cfs_rq, se), se); +-} +- + #include "pelt.h" + #ifdef CONFIG_SMP + +@@ -856,6 +591,7 @@ + return; + + curr->exec_start = now; ++ curr->quantom++; + + schedstat_set(curr->statistics.exec_max, + max(delta_exec, curr->statistics.exec_max)); +@@ -864,12 +600,10 @@ + schedstat_add(cfs_rq->exec_clock, delta_exec); + + curr->vruntime += calc_delta_fair(delta_exec, curr); +- update_min_vruntime(cfs_rq); + + if (entity_is_task(curr)) { + struct task_struct *curtask = task_of(curr); + +- trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); + cgroup_account_cputime(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); + } +@@ -2897,39 +2631,6 @@ + } + } + +-/* +- * Drive the periodic memory faults.. +- */ +-static void task_tick_numa(struct rq *rq, struct task_struct *curr) +-{ +- struct callback_head *work = &curr->numa_work; +- u64 period, now; +- +- /* +- * We don't care about NUMA placement if we don't have memory. +- */ +- if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work) +- return; +- +- /* +- * Using runtime rather than walltime has the dual advantage that +- * we (mostly) drive the selection from busy threads and that the +- * task needs to have done some actual work before we bother with +- * NUMA placement. 
+- */ +- now = curr->se.sum_exec_runtime; +- period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; +- +- if (now > curr->node_stamp + period) { +- if (!curr->node_stamp) +- curr->numa_scan_period = task_scan_start(curr); +- curr->node_stamp += period; +- +- if (!time_before(jiffies, curr->mm->numa_next_scan)) +- task_work_add(curr, work, true); +- } +-} +- + static void update_scan_period(struct task_struct *p, int new_cpu) + { + int src_nid = cpu_to_node(task_cpu(p)); +@@ -2965,9 +2666,6 @@ + } + + #else +-static void task_tick_numa(struct rq *rq, struct task_struct *curr) +-{ +-} + + static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p) + { +@@ -4072,50 +3770,9 @@ + static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + #ifdef CONFIG_SCHED_DEBUG +- s64 d = se->vruntime - cfs_rq->min_vruntime; +- +- if (d < 0) +- d = -d; +- +- if (d > 3*sysctl_sched_latency) +- schedstat_inc(cfs_rq->nr_spread_over); + #endif + } + +-static void +-place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) +-{ +- u64 vruntime = cfs_rq->min_vruntime; +- +- /* +- * The 'current' period is already promised to the current tasks, +- * however the extra weight of the new task will slow them down a +- * little, place the new task so that it fits in the slot that +- * stays open at the end. +- */ +- if (initial && sched_feat(START_DEBIT)) +- vruntime += sched_vslice(cfs_rq, se); +- +- /* sleeps up to a single latency don't count. */ +- if (!initial) { +- unsigned long thresh = sysctl_sched_latency; +- +- /* +- * Halve their sleep time's effect, to allow +- * for a gentler effect of sleepers: +- */ +- if (sched_feat(GENTLE_FAIR_SLEEPERS)) +- thresh >>= 1; +- +- vruntime -= thresh; +- } +- +- /* ensure we never gain time by being placed backwards. */ +- se->vruntime = max_vruntime(se->vruntime, vruntime); +-} +- +-static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +- + static inline void check_schedstat_required(void) + { + #ifdef CONFIG_SCHEDSTATS +@@ -4171,28 +3828,11 @@ + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED); + bool curr = cfs_rq->curr == se; + +- /* +- * If we're the current task, we must renormalise before calling +- * update_curr(). +- */ +- if (renorm && curr) +- se->vruntime += cfs_rq->min_vruntime; +- + update_curr(cfs_rq); + + /* +- * Otherwise, renormalise after, such that we're placed at the current +- * moment in time, instead of some random moment in the past. Being +- * placed in the past could significantly boost this task to the +- * fairness detriment of existing tasks. +- */ +- if (renorm && !curr) +- se->vruntime += cfs_rq->min_vruntime; +- +- /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg +@@ -4205,71 +3845,12 @@ + update_cfs_group(se); + account_entity_enqueue(cfs_rq, se); + +- if (flags & ENQUEUE_WAKEUP) +- place_entity(cfs_rq, se, 0); +- + check_schedstat_required(); + update_stats_enqueue(cfs_rq, se, flags); + check_spread(cfs_rq, se); + if (!curr) + __enqueue_entity(cfs_rq, se); + se->on_rq = 1; +- +- /* +- * When bandwidth control is enabled, cfs might have been removed +- * because of a parent been throttled but cfs->nr_running > 1. Try to +- * add it unconditionnally. 
+- */ +- if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) +- list_add_leaf_cfs_rq(cfs_rq); +- +- if (cfs_rq->nr_running == 1) +- check_enqueue_throttle(cfs_rq); +-} +- +-static void __clear_buddies_last(struct sched_entity *se) +-{ +- for_each_sched_entity(se) { +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- if (cfs_rq->last != se) +- break; +- +- cfs_rq->last = NULL; +- } +-} +- +-static void __clear_buddies_next(struct sched_entity *se) +-{ +- for_each_sched_entity(se) { +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- if (cfs_rq->next != se) +- break; +- +- cfs_rq->next = NULL; +- } +-} +- +-static void __clear_buddies_skip(struct sched_entity *se) +-{ +- for_each_sched_entity(se) { +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- if (cfs_rq->skip != se) +- break; +- +- cfs_rq->skip = NULL; +- } +-} +- +-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +-{ +- if (cfs_rq->last == se) +- __clear_buddies_last(se); +- +- if (cfs_rq->next == se) +- __clear_buddies_next(se); +- +- if (cfs_rq->skip == se) +- __clear_buddies_skip(se); + } + + static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -4295,75 +3876,15 @@ + + update_stats_dequeue(cfs_rq, se, flags); + +- clear_buddies(cfs_rq, se); +- +- if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); ++ + se->on_rq = 0; + account_entity_dequeue(cfs_rq, se); + +- /* +- * Normalize after update_curr(); which will also have moved +- * min_vruntime if @se is the one holding it back. But before doing +- * update_min_vruntime() again, which will discount @se's position and +- * can move min_vruntime forward still more. +- */ +- if (!(flags & DEQUEUE_SLEEP)) +- se->vruntime -= cfs_rq->min_vruntime; +- + /* return excess runtime on last dequeue */ + return_cfs_rq_runtime(cfs_rq); + + update_cfs_group(se); +- +- /* +- * Now advance min_vruntime if @se was the entity holding it back, +- * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be +- * put back on, and if we advance min_vruntime, we'll be placed back +- * further than we started -- ie. we'll be penalized. +- */ +- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) +- update_min_vruntime(cfs_rq); +-} +- +-/* +- * Preempt the current task with a newly woken task if needed: +- */ +-static void +-check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +-{ +- unsigned long ideal_runtime, delta_exec; +- struct sched_entity *se; +- s64 delta; +- +- ideal_runtime = sched_slice(cfs_rq, curr); +- delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; +- if (delta_exec > ideal_runtime) { +- resched_curr(rq_of(cfs_rq)); +- /* +- * The current task ran long enough, ensure it doesn't get +- * re-elected due to buddy favours. +- */ +- clear_buddies(cfs_rq, curr); +- return; +- } +- +- /* +- * Ensure that a task that missed wakeup preemption by a +- * narrow margin doesn't have to wait for a full slice. +- * This also mitigates buddy induced latencies under load. +- */ +- if (delta_exec < sysctl_sched_min_granularity) +- return; +- +- se = __pick_first_entity(cfs_rq); +- delta = curr->vruntime - se->vruntime; +- +- if (delta < 0) +- return; +- +- if (delta > ideal_runtime) +- resched_curr(rq_of(cfs_rq)); + } + + static void +@@ -4371,96 +3892,18 @@ + { + /* 'current' is not kept within the tree. */ + if (se->on_rq) { +- /* +- * Any task has to be enqueued before it get to execute on +- * a CPU. So account for the time it spent waiting on the +- * runqueue. 
+- */ + update_stats_wait_end(cfs_rq, se); +- __dequeue_entity(cfs_rq, se); + update_load_avg(cfs_rq, se, UPDATE_TG); + } + + update_stats_curr_start(cfs_rq, se); + cfs_rq->curr = se; + +- /* +- * Track our maximum slice length, if the CPU's load is at +- * least twice that of our own weight (i.e. dont track it +- * when there are only lesser-weight tasks around): +- */ +- if (schedstat_enabled() && +- rq_of(cfs_rq)->cfs.load.weight >= 2*se->load.weight) { +- schedstat_set(se->statistics.slice_max, +- max((u64)schedstat_val(se->statistics.slice_max), +- se->sum_exec_runtime - se->prev_sum_exec_runtime)); +- } +- + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + + static int +-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); +- +-/* +- * Pick the next process, keeping these things in mind, in this order: +- * 1) keep things fair between processes/task groups +- * 2) pick the "next" process, since someone really wants that to run +- * 3) pick the "last" process, for cache locality +- * 4) do not run the "skip" process, if something else is available +- */ +-static struct sched_entity * +-pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +-{ +- struct sched_entity *left = __pick_first_entity(cfs_rq); +- struct sched_entity *se; +- +- /* +- * If curr is set we have to see if its left of the leftmost entity +- * still in the tree, provided there was anything in the tree at all. +- */ +- if (!left || (curr && entity_before(curr, left))) +- left = curr; +- +- se = left; /* ideally we run the leftmost entity */ +- +- /* +- * Avoid running the skip buddy, if running something else can +- * be done without getting too unfair. +- */ +- if (cfs_rq->skip == se) { +- struct sched_entity *second; +- +- if (se == curr) { +- second = __pick_first_entity(cfs_rq); +- } else { +- second = __pick_next_entity(se); +- if (!second || (curr && entity_before(curr, second))) +- second = curr; +- } +- +- if (second && wakeup_preempt_entity(second, left) < 1) +- se = second; +- } +- +- /* +- * Prefer last buddy, try to return the CPU to a preempted task. +- */ +- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) +- se = cfs_rq->last; +- +- /* +- * Someone really wants this to run. If it's not unfair, run it. +- */ +- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) +- se = cfs_rq->next; +- +- clear_buddies(cfs_rq, se); +- +- return se; +-} +- +-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); ++wakeup_preempt_entity(u64 now, struct sched_entity *curr, struct sched_entity *se); + + static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) + { +@@ -4471,21 +3914,19 @@ + if (prev->on_rq) + update_curr(cfs_rq); + +- /* throttle cfs_rqs exceeding runtime */ +- check_cfs_rq_runtime(cfs_rq); +- +- check_spread(cfs_rq, prev); +- + if (prev->on_rq) { + update_stats_wait_start(cfs_rq, prev); +- /* Put 'current' back into the tree. 
*/ +- __enqueue_entity(cfs_rq, prev); + /* in !on_rq case, update occurred at dequeue */ + update_load_avg(cfs_rq, prev, 0); + } + cfs_rq->curr = NULL; + } + ++static int check_preempt_curr_fair(struct sched_entity *curr) ++{ ++ return 1; ++} ++ + static void + entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) + { +@@ -4509,6 +3950,12 @@ + resched_curr(rq_of(cfs_rq)); + return; + } ++ ++ if (check_preempt_curr_fair(curr) == 1) { ++ resched_curr(rq_of(cfs_rq)); ++ return; ++ } ++ + /* + * don't let the period tick interfere with the hrtick preemption + */ +@@ -4516,9 +3963,6 @@ + hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) + return; + #endif +- +- if (cfs_rq->nr_running > 1) +- check_preempt_tick(cfs_rq, curr); + } + + +@@ -5082,30 +4526,6 @@ + raw_spin_unlock_irqrestore(&cfs_b->lock, flags); + } + +-/* +- * When a group wakes up we want to make sure that its quota is not already +- * expired/exceeded, otherwise it may be allowed to steal additional ticks of +- * runtime as update_curr() throttling can not not trigger until it's on-rq. +- */ +-static void check_enqueue_throttle(struct cfs_rq *cfs_rq) +-{ +- if (!cfs_bandwidth_used()) +- return; +- +- /* an active group must be handled by the update_curr()->put() path */ +- if (!cfs_rq->runtime_enabled || cfs_rq->curr) +- return; +- +- /* ensure the group is not already throttled */ +- if (cfs_rq_throttled(cfs_rq)) +- return; +- +- /* update runtime allocation */ +- account_cfs_rq_runtime(cfs_rq, 0); +- if (cfs_rq->runtime_remaining <= 0) +- throttle_cfs_rq(cfs_rq); +-} +- + static void sync_throttle(struct task_group *tg, int cpu) + { + struct cfs_rq *pcfs_rq, *cfs_rq; +@@ -5123,26 +4543,6 @@ + cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); + } + +-/* conditionally throttle active cfs_rq's from put_prev_entity() */ +-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) +-{ +- if (!cfs_bandwidth_used()) +- return false; +- +- if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) +- return false; +- +- /* +- * it's possible for a throttled entity to be forced into a running +- * state (e.g. set_curr_task), in this case we're finished. +- */ +- if (cfs_rq_throttled(cfs_rq)) +- return true; +- +- throttle_cfs_rq(cfs_rq); +- return true; +-} +- + static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) + { + struct cfs_bandwidth *cfs_b = +@@ -5318,8 +4718,6 @@ + } + + static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} +-static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } +-static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} + static inline void sync_throttle(struct task_group *tg, int cpu) {} + static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} + +@@ -5548,8 +4946,6 @@ + hrtick_update(rq); + } + +-static void set_next_buddy(struct sched_entity *se); +- + /* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and +@@ -5578,12 +4974,6 @@ + if (cfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); +- /* +- * Bias pick_next to pick a task from this cfs_rq, as +- * p is sleeping when it is within its sched_slice. 
+- */ +- if (task_sleep && se && !throttled_hierarchy(cfs_rq)) +- set_next_buddy(se); + break; + } + flags |= DEQUEUE_SLEEP; +@@ -5699,53 +5089,6 @@ + return cpu_rq(cpu)->cpu_capacity; + } + +-static void record_wakee(struct task_struct *p) +-{ +- /* +- * Only decay a single time; tasks that have less then 1 wakeup per +- * jiffy will not have built up many flips. +- */ +- if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) { +- current->wakee_flips >>= 1; +- current->wakee_flip_decay_ts = jiffies; +- } +- +- if (current->last_wakee != p) { +- current->last_wakee = p; +- current->wakee_flips++; +- } +-} +- +-/* +- * Detect M:N waker/wakee relationships via a switching-frequency heuristic. +- * +- * A waker of many should wake a different task than the one last awakened +- * at a frequency roughly N times higher than one of its wakees. +- * +- * In order to determine whether we should let the load spread vs consolidating +- * to shared cache, we look for a minimum 'flip' frequency of llc_size in one +- * partner, and a factor of lls_size higher frequency in the other. +- * +- * With both conditions met, we can be relatively sure that the relationship is +- * non-monogamous, with partner count exceeding socket size. +- * +- * Waker/wakee being client/server, worker/dispatcher, interrupt source or +- * whatever is irrelevant, spread criteria is apparent partner count exceeds +- * socket size. +- */ +-static int wake_wide(struct task_struct *p) +-{ +- unsigned int master = current->wakee_flips; +- unsigned int slave = p->wakee_flips; +- int factor = this_cpu_read(sd_llc_size); +- +- if (master < slave) +- swap(master, slave); +- if (slave < factor || master < slave * factor) +- return 0; +- return 1; +-} +- + /* + * The purpose of wake_affine() is to quickly determine on which CPU we can run + * soonest. For the purpose of speed we only consider the waking and previous +@@ -6402,238 +5745,6 @@ + return min_t(unsigned long, util, capacity_orig_of(cpu)); + } + +-/* +- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued) +- * to @dst_cpu. +- */ +-static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu) +-{ +- struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs; +- unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg); +- +- /* +- * If @p migrates from @cpu to another, remove its contribution. Or, +- * if @p migrates from another CPU to @cpu, add its contribution. In +- * the other cases, @cpu is not impacted by the migration, so the +- * util_avg should already be correct. +- */ +- if (task_cpu(p) == cpu && dst_cpu != cpu) +- sub_positive(&util, task_util(p)); +- else if (task_cpu(p) != cpu && dst_cpu == cpu) +- util += task_util(p); +- +- if (sched_feat(UTIL_EST)) { +- util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued); +- +- /* +- * During wake-up, the task isn't enqueued yet and doesn't +- * appear in the cfs_rq->avg.util_est.enqueued of any rq, +- * so just add it (if needed) to "simulate" what will be +- * cpu_util() after the task has been enqueued. +- */ +- if (dst_cpu == cpu) +- util_est += _task_util_est(p); +- +- util = max(util, util_est); +- } +- +- return min(util, capacity_orig_of(cpu)); +-} +- +-/* +- * compute_energy(): Estimates the energy that @pd would consume if @p was +- * migrated to @dst_cpu. compute_energy() predicts what will be the utilization +- * landscape of @pd's CPUs after the task migration, and uses the Energy Model +- * to compute what would be the energy if we decided to actually migrate that +- * task. 
+- */ +-static long +-compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) +-{ +- struct cpumask *pd_mask = perf_domain_span(pd); +- unsigned long cpu_cap = arch_scale_cpu_capacity(cpumask_first(pd_mask)); +- unsigned long max_util = 0, sum_util = 0; +- int cpu; +- +- /* +- * The capacity state of CPUs of the current rd can be driven by CPUs +- * of another rd if they belong to the same pd. So, account for the +- * utilization of these CPUs too by masking pd with cpu_online_mask +- * instead of the rd span. +- * +- * If an entire pd is outside of the current rd, it will not appear in +- * its pd list and will not be accounted by compute_energy(). +- */ +- for_each_cpu_and(cpu, pd_mask, cpu_online_mask) { +- unsigned long cpu_util, util_cfs = cpu_util_next(cpu, p, dst_cpu); +- struct task_struct *tsk = cpu == dst_cpu ? p : NULL; +- +- /* +- * Busy time computation: utilization clamping is not +- * required since the ratio (sum_util / cpu_capacity) +- * is already enough to scale the EM reported power +- * consumption at the (eventually clamped) cpu_capacity. +- */ +- sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, +- ENERGY_UTIL, NULL); +- +- /* +- * Performance domain frequency: utilization clamping +- * must be considered since it affects the selection +- * of the performance domain frequency. +- * NOTE: in case RT tasks are running, by default the +- * FREQUENCY_UTIL's utilization can be max OPP. +- */ +- cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, +- FREQUENCY_UTIL, tsk); +- max_util = max(max_util, cpu_util); +- } +- +- return em_pd_energy(pd->em_pd, max_util, sum_util); +-} +- +-/* +- * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the +- * waking task. find_energy_efficient_cpu() looks for the CPU with maximum +- * spare capacity in each performance domain and uses it as a potential +- * candidate to execute the task. Then, it uses the Energy Model to figure +- * out which of the CPU candidates is the most energy-efficient. +- * +- * The rationale for this heuristic is as follows. In a performance domain, +- * all the most energy efficient CPU candidates (according to the Energy +- * Model) are those for which we'll request a low frequency. When there are +- * several CPUs for which the frequency request will be the same, we don't +- * have enough data to break the tie between them, because the Energy Model +- * only includes active power costs. With this model, if we assume that +- * frequency requests follow utilization (e.g. using schedutil), the CPU with +- * the maximum spare capacity in a performance domain is guaranteed to be among +- * the best candidates of the performance domain. +- * +- * In practice, it could be preferable from an energy standpoint to pack +- * small tasks on a CPU in order to let other CPUs go in deeper idle states, +- * but that could also hurt our chances to go cluster idle, and we have no +- * ways to tell with the current Energy Model if this is actually a good +- * idea or not. So, find_energy_efficient_cpu() basically favors +- * cluster-packing, and spreading inside a cluster. That should at least be +- * a good thing for latency, and this is consistent with the idea that most +- * of the energy savings of EAS come from the asymmetry of the system, and +- * not so much from breaking the tie between identical CPUs. That's also the +- * reason why EAS is enabled in the topology code only for systems where +- * SD_ASYM_CPUCAPACITY is set. 
+- * +- * NOTE: Forkees are not accepted in the energy-aware wake-up path because +- * they don't have any useful utilization data yet and it's not possible to +- * forecast their impact on energy consumption. Consequently, they will be +- * placed by find_idlest_cpu() on the least loaded CPU, which might turn out +- * to be energy-inefficient in some use-cases. The alternative would be to +- * bias new tasks towards specific types of CPUs first, or to try to infer +- * their util_avg from the parent task, but those heuristics could hurt +- * other use-cases too. So, until someone finds a better way to solve this, +- * let's keep things simple by re-using the existing slow path. +- */ +-static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) +-{ +- unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; +- struct root_domain *rd = cpu_rq(smp_processor_id())->rd; +- unsigned long cpu_cap, util, base_energy = 0; +- int cpu, best_energy_cpu = prev_cpu; +- struct sched_domain *sd; +- struct perf_domain *pd; +- +- rcu_read_lock(); +- pd = rcu_dereference(rd->pd); +- if (!pd || READ_ONCE(rd->overutilized)) +- goto fail; +- +- /* +- * Energy-aware wake-up happens on the lowest sched_domain starting +- * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu. +- */ +- sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity)); +- while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) +- sd = sd->parent; +- if (!sd) +- goto fail; +- +- sync_entity_load_avg(&p->se); +- if (!task_util_est(p)) +- goto unlock; +- +- for (; pd; pd = pd->next) { +- unsigned long cur_delta, spare_cap, max_spare_cap = 0; +- unsigned long base_energy_pd; +- int max_spare_cap_cpu = -1; +- +- /* Compute the 'base' energy of the pd, without @p */ +- base_energy_pd = compute_energy(p, -1, pd); +- base_energy += base_energy_pd; +- +- for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) { +- if (!cpumask_test_cpu(cpu, p->cpus_ptr)) +- continue; +- +- util = cpu_util_next(cpu, p, cpu); +- cpu_cap = capacity_of(cpu); +- spare_cap = cpu_cap - util; +- +- /* +- * Skip CPUs that cannot satisfy the capacity request. +- * IOW, placing the task there would make the CPU +- * overutilized. Take uclamp into account to see how +- * much capacity we can get out of the CPU; this is +- * aligned with schedutil_cpu_util(). +- */ +- util = uclamp_rq_util_with(cpu_rq(cpu), util, p); +- if (!fits_capacity(util, cpu_cap)) +- continue; +- +- /* Always use prev_cpu as a candidate. */ +- if (cpu == prev_cpu) { +- prev_delta = compute_energy(p, prev_cpu, pd); +- prev_delta -= base_energy_pd; +- best_delta = min(best_delta, prev_delta); +- } +- +- /* +- * Find the CPU with the maximum spare capacity in +- * the performance domain +- */ +- if (spare_cap > max_spare_cap) { +- max_spare_cap = spare_cap; +- max_spare_cap_cpu = cpu; +- } +- } +- +- /* Evaluate the energy impact of using this CPU. */ +- if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { +- cur_delta = compute_energy(p, max_spare_cap_cpu, pd); +- cur_delta -= base_energy_pd; +- if (cur_delta < best_delta) { +- best_delta = cur_delta; +- best_energy_cpu = max_spare_cap_cpu; +- } +- } +- } +-unlock: +- rcu_read_unlock(); +- +- /* +- * Pick the best CPU if prev_cpu cannot be used, or if it saves at +- * least 6% of the energy used by prev_cpu. 
+- */ +- if (prev_delta == ULONG_MAX) +- return best_energy_cpu; +- +- if ((prev_delta - best_delta) > ((prev_delta + base_energy) >> 4)) +- return best_energy_cpu; +- +- return prev_cpu; +- +-fail: +- rcu_read_unlock(); +- +- return -1; +-} + + /* + * select_task_rq_fair: Select target runqueue for the waking task in domains +@@ -6656,19 +5767,6 @@ + int want_affine = 0; + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + +- if (sd_flag & SD_BALANCE_WAKE) { +- record_wakee(p); +- +- if (sched_energy_enabled()) { +- new_cpu = find_energy_efficient_cpu(p, prev_cpu); +- if (new_cpu >= 0) +- return new_cpu; +- new_cpu = prev_cpu; +- } +- +- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); +- } +- + rcu_read_lock(); + for_each_domain(cpu, tmp) { + if (!(tmp->flags & SD_LOAD_BALANCE)) +@@ -6696,7 +5794,9 @@ + if (unlikely(sd)) { + /* Slow path */ + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); +- } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ ++ } ++ ++ else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ + /* Fast path */ + + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); +@@ -6718,59 +5818,6 @@ + */ + static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + { +- /* +- * As blocked tasks retain absolute vruntime the migration needs to +- * deal with this by subtracting the old and adding the new +- * min_vruntime -- the latter is done by enqueue_entity() when placing +- * the task on the new runqueue. +- */ +- if (p->state == TASK_WAKING) { +- struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- u64 min_vruntime; +- +-#ifndef CONFIG_64BIT +- u64 min_vruntime_copy; +- +- do { +- min_vruntime_copy = cfs_rq->min_vruntime_copy; +- smp_rmb(); +- min_vruntime = cfs_rq->min_vruntime; +- } while (min_vruntime != min_vruntime_copy); +-#else +- min_vruntime = cfs_rq->min_vruntime; +-#endif +- +- se->vruntime -= min_vruntime; +- } +- +- if (p->on_rq == TASK_ON_RQ_MIGRATING) { +- /* +- * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' +- * rq->lock and can modify state directly. +- */ +- lockdep_assert_held(&task_rq(p)->lock); +- detach_entity_cfs_rq(&p->se); +- +- } else { +- /* +- * We are supposed to update the task to "current" time, then +- * its up to date and ready to go to new CPU/cfs_rq. But we +- * have difficulty in getting what current time is, so simply +- * throw away the out-of-date time. This will result in the +- * wakee task is less decayed, but giving the wakee more load +- * sounds not bad. +- */ +- remove_entity_load_avg(&p->se); +- } +- +- /* Tell new CPU we are migrated */ +- p->se.avg.last_update_time = 0; +- +- /* We have migrated, no longer consider this task hot */ +- p->se.exec_start = 0; +- +- update_scan_period(p, new_cpu); + } + + static void task_dead_fair(struct task_struct *p) +@@ -6781,32 +5828,10 @@ + static int + balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + { +- if (rq->nr_running) + return 1; +- +- return newidle_balance(rq, rf) != 0; + } + #endif /* CONFIG_SMP */ + +-static unsigned long wakeup_gran(struct sched_entity *se) +-{ +- unsigned long gran = sysctl_sched_wakeup_granularity; +- +- /* +- * Since its curr running now, convert the gran from real-time +- * to virtual-time in his units. +- * +- * By using 'se' instead of 'curr' we penalize light tasks, so +- * they get preempted easier. 
That is, if 'se' < 'curr' then +- * the resulting gran will be larger, therefore penalizing the +- * lighter, if otoh 'se' > 'curr' then the resulting gran will +- * be smaller, again penalizing the lighter task. +- * +- * This is especially important for buddies when the leftmost +- * task is higher priority than the buddy. +- */ +- return calc_delta_fair(gran, se); +-} + + /* + * Should 'se' preempt 'curr'. +@@ -6817,54 +5842,43 @@ + * g + * |<--->|c + * +- * w(c, s1) = -1 ++ * w(c, s1) = -1 // don't preempt + * w(c, s2) = 0 +- * w(c, s3) = 1 ++ * w(c, s3) = 1 // preempt + * + */ + static int +-wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +-{ +- s64 gran, vdiff = curr->vruntime - se->vruntime; +- +- if (vdiff <= 0) +- return -1; +- +- gran = wakeup_gran(se); +- if (vdiff > gran) +- return 1; +- +- return 0; +-} +- +-static void set_last_buddy(struct sched_entity *se) ++wakeup_preempt_entity(u64 now, struct sched_entity *curr, struct sched_entity *se) + { +- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) +- return; ++ u64 r_curr, r_se, w_curr, w_se; ++ struct task_struct *t_curr = task_of(curr); ++ struct task_struct *t_se = task_of(se); ++ u64 vr_curr = curr->sum_exec_runtime + 1; ++ u64 vr_se = se->sum_exec_runtime + 1; ++ s64 diff; ++ ++ w_curr = (now - t_curr->start_boottime) - vr_curr; ++ w_se = (now - t_se->start_boottime) - vr_se; ++ ++ w_curr *= (140 - t_curr->prio); ++ w_se *= (140 - t_se->prio); ++ ++ r_curr = w_curr / vr_curr; ++ r_se = w_se / vr_se; ++ diff = (s64)(r_se) - (s64)(r_curr); + +- for_each_sched_entity(se) { +- if (SCHED_WARN_ON(!se->on_rq)) +- return; +- cfs_rq_of(se)->last = se; ++ if (diff == 0) ++ { ++ r_curr = w_curr % vr_curr; ++ r_se = w_se % vr_se; ++ diff = (s64)(r_se) - (s64)(r_curr); + } +-} + +-static void set_next_buddy(struct sched_entity *se) +-{ +- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se)))) +- return; + +- for_each_sched_entity(se) { +- if (SCHED_WARN_ON(!se->on_rq)) +- return; +- cfs_rq_of(se)->next = se; +- } +-} ++ if (diff > 0) ++ return 1; + +-static void set_skip_buddy(struct sched_entity *se) +-{ +- for_each_sched_entity(se) +- cfs_rq_of(se)->skip = se; ++ return -1; + } + + /* +@@ -6874,28 +5888,12 @@ + { + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se, *pse = &p->se; +- struct cfs_rq *cfs_rq = task_cfs_rq(curr); +- int scale = cfs_rq->nr_running >= sched_nr_latency; +- int next_buddy_marked = 0; ++ u64 now = rq_clock_task(rq); + + if (unlikely(se == pse)) + return; + + /* +- * This is possible from callers such as attach_tasks(), in which we +- * unconditionally check_prempt_curr() after an enqueue (which may have +- * lead to a throttle). This both saves work and prevents false +- * next-buddy nomination below. +- */ +- if (unlikely(throttled_hierarchy(cfs_rq_of(pse)))) +- return; +- +- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { +- set_next_buddy(pse); +- next_buddy_marked = 1; +- } +- +- /* + * We can come here with TIF_NEED_RESCHED already set from new task + * wake up path. + * +@@ -6923,13 +5921,7 @@ + find_matching_se(&se, &pse); + update_curr(cfs_rq_of(se)); + BUG_ON(!pse); +- if (wakeup_preempt_entity(se, pse) == 1) { +- /* +- * Bias pick_next to pick the sched entity that is +- * triggering this preemption. 
+- */ +- if (!next_buddy_marked) +- set_next_buddy(pse); ++ if (wakeup_preempt_entity(now, se, pse) == 1) { + goto preempt; + } + +@@ -6948,113 +5940,36 @@ + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; +- +- if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) +- set_last_buddy(se); + } + + struct task_struct * + pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + { + struct cfs_rq *cfs_rq = &rq->cfs; +- struct sched_entity *se; ++ struct sched_entity *se, *next; + struct task_struct *p; +- int new_tasks; ++ u64 now = rq_clock_task(rq); + +-again: +- if (!sched_fair_runnable(rq)) ++ if (unlikely(!sched_fair_runnable(rq))) + goto idle; + +-#ifdef CONFIG_FAIR_GROUP_SCHED +- if (!prev || prev->sched_class != &fair_sched_class) +- goto simple; +- +- /* +- * Because of the set_next_buddy() in dequeue_task_fair() it is rather +- * likely that a next task is from the same cgroup as the current. +- * +- * Therefore attempt to avoid putting and setting the entire cgroup +- * hierarchy, only change the part that actually changes. +- */ ++ se = next = cfs_rq->head; ++ next = next->next[DIR_RIGHT]; + +- do { +- struct sched_entity *curr = cfs_rq->curr; +- +- /* +- * Since we got here without doing put_prev_entity() we also +- * have to consider cfs_rq->curr. If it is still a runnable +- * entity, update_curr() will update its vruntime, otherwise +- * forget we've ever seen it. +- */ +- if (curr) { +- if (curr->on_rq) +- update_curr(cfs_rq); +- else +- curr = NULL; +- +- /* +- * This call to check_cfs_rq_runtime() will do the +- * throttle and dequeue its entity in the parent(s). +- * Therefore the nr_running test will indeed +- * be correct. +- */ +- if (unlikely(check_cfs_rq_runtime(cfs_rq))) { +- cfs_rq = &rq->cfs; +- +- if (!cfs_rq->nr_running) +- goto idle; +- +- goto simple; +- } +- } +- +- se = pick_next_entity(cfs_rq, curr); +- cfs_rq = group_cfs_rq(se); +- } while (cfs_rq); +- +- p = task_of(se); +- +- /* +- * Since we haven't yet done put_prev_entity and if the selected task +- * is a different task than we started out with, try and touch the +- * least amount of cfs_rqs. +- */ +- if (prev != p) { +- struct sched_entity *pse = &prev->se; +- +- while (!(cfs_rq = is_same_group(se, pse))) { +- int se_depth = se->depth; +- int pse_depth = pse->depth; +- +- if (se_depth <= pse_depth) { +- put_prev_entity(cfs_rq_of(pse), pse); +- pse = parent_entity(pse); +- } +- if (se_depth >= pse_depth) { +- set_next_entity(cfs_rq_of(se), se); +- se = parent_entity(se); +- } +- } ++ while (next) ++ { ++ if (wakeup_preempt_entity(now, se, next) == 1) ++ se = next; + +- put_prev_entity(cfs_rq, pse); +- set_next_entity(cfs_rq, se); ++ next = next->next[DIR_RIGHT]; + } + +- goto done; +-simple: +-#endif +- if (prev) +- put_prev_task(rq, prev); +- +- do { +- se = pick_next_entity(cfs_rq, NULL); + set_next_entity(cfs_rq, se); +- cfs_rq = group_cfs_rq(se); +- } while (cfs_rq); + + p = task_of(se); + +-done: __maybe_unused; ++ se->quantom = 0; ++ + #ifdef CONFIG_SMP + /* + * Move the next running task to the front of +@@ -7075,19 +5990,6 @@ + if (!rf) + return NULL; + +- new_tasks = newidle_balance(rq, rf); +- +- /* +- * Because newidle_balance() releases (and re-acquires) rq->lock, it is +- * possible for any higher priority task to appear. In that case we +- * must re-start the pick_next_entity() loop. 
+- */ +- if (new_tasks < 0) +- return RETRY_TASK; +- +- if (new_tasks > 0) +- goto again; +- + /* + * rq is about to be idle, check if we need to update the + * lost_idle_time of clock_pelt +@@ -7125,7 +6027,6 @@ + { + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); +- struct sched_entity *se = &curr->se; + + /* + * Are we the only task in the tree? +@@ -7133,8 +6034,6 @@ + if (unlikely(rq->nr_running == 1)) + return; + +- clear_buddies(cfs_rq, se); +- + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* +@@ -7148,8 +6047,6 @@ + */ + rq_clock_skip_update(rq); + } +- +- set_skip_buddy(se); + } + + static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +@@ -7160,9 +6057,6 @@ + if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se))) + return false; + +- /* Tell the scheduler that we'd really like pse to run next. */ +- set_next_buddy(se); +- + yield_task_fair(rq); + + return true; +@@ -7370,39 +6264,6 @@ + struct list_head tasks; + }; + +-/* +- * Is this task likely cache-hot: +- */ +-static int task_hot(struct task_struct *p, struct lb_env *env) +-{ +- s64 delta; +- +- lockdep_assert_held(&env->src_rq->lock); +- +- if (p->sched_class != &fair_sched_class) +- return 0; +- +- if (unlikely(task_has_idle_policy(p))) +- return 0; +- +- /* +- * Buddy candidates are cache hot: +- */ +- if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running && +- (&p->se == cfs_rq_of(&p->se)->next || +- &p->se == cfs_rq_of(&p->se)->last)) +- return 1; +- +- if (sysctl_sched_migration_cost == -1) +- return 1; +- if (sysctl_sched_migration_cost == 0) +- return 0; +- +- delta = rq_clock_task(env->src_rq) - p->se.exec_start; +- +- return delta < (s64)sysctl_sched_migration_cost; +-} +- + #ifdef CONFIG_NUMA_BALANCING + /* + * Returns 1, if task migration degrades locality +@@ -7463,302 +6324,10 @@ + } + #endif + +-/* +- * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? +- */ +-static +-int can_migrate_task(struct task_struct *p, struct lb_env *env) +-{ +- int tsk_cache_hot; +- +- lockdep_assert_held(&env->src_rq->lock); +- +- /* +- * We do not migrate tasks that are: +- * 1) throttled_lb_pair, or +- * 2) cannot be migrated to this CPU due to cpus_ptr, or +- * 3) running (obviously), or +- * 4) are cache-hot on their current CPU. +- */ +- if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) +- return 0; +- +- if (!cpumask_test_cpu(env->dst_cpu, p->cpus_ptr)) { +- int cpu; +- +- schedstat_inc(p->se.statistics.nr_failed_migrations_affine); +- +- env->flags |= LBF_SOME_PINNED; +- +- /* +- * Remember if this task can be migrated to any other CPU in +- * our sched_group. We may want to revisit it if we couldn't +- * meet load balance goals by pulling other tasks on src_cpu. +- * +- * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have +- * already computed one in current iteration. 
+- */ +- if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) +- return 0; +- +- /* Prevent to re-select dst_cpu via env's CPUs: */ +- for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { +- if (cpumask_test_cpu(cpu, p->cpus_ptr)) { +- env->flags |= LBF_DST_PINNED; +- env->new_dst_cpu = cpu; +- break; +- } +- } +- +- return 0; +- } +- +- /* Record that we found atleast one task that could run on dst_cpu */ +- env->flags &= ~LBF_ALL_PINNED; +- +- if (task_running(env->src_rq, p)) { +- schedstat_inc(p->se.statistics.nr_failed_migrations_running); +- return 0; +- } +- +- /* +- * Aggressive migration if: +- * 1) destination numa is preferred +- * 2) task is cache cold, or +- * 3) too many balance attempts have failed. +- */ +- tsk_cache_hot = migrate_degrades_locality(p, env); +- if (tsk_cache_hot == -1) +- tsk_cache_hot = task_hot(p, env); +- +- if (tsk_cache_hot <= 0 || +- env->sd->nr_balance_failed > env->sd->cache_nice_tries) { +- if (tsk_cache_hot == 1) { +- schedstat_inc(env->sd->lb_hot_gained[env->idle]); +- schedstat_inc(p->se.statistics.nr_forced_migrations); +- } +- return 1; +- } +- +- schedstat_inc(p->se.statistics.nr_failed_migrations_hot); +- return 0; +-} +- +-/* +- * detach_task() -- detach the task for the migration specified in env +- */ +-static void detach_task(struct task_struct *p, struct lb_env *env) +-{ +- lockdep_assert_held(&env->src_rq->lock); +- +- deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); +- set_task_cpu(p, env->dst_cpu); +-} + +-/* +- * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as +- * part of active balancing operations within "domain". +- * +- * Returns a task if successful and NULL otherwise. +- */ +-static struct task_struct *detach_one_task(struct lb_env *env) +-{ +- struct task_struct *p; +- +- lockdep_assert_held(&env->src_rq->lock); +- +- list_for_each_entry_reverse(p, +- &env->src_rq->cfs_tasks, se.group_node) { +- if (!can_migrate_task(p, env)) +- continue; +- +- detach_task(p, env); +- +- /* +- * Right now, this is only the second place where +- * lb_gained[env->idle] is updated (other is detach_tasks) +- * so we can safely collect stats here rather than +- * inside detach_tasks(). +- */ +- schedstat_inc(env->sd->lb_gained[env->idle]); +- return p; +- } +- return NULL; +-} + + static const unsigned int sched_nr_migrate_break = 32; + +-/* +- * detach_tasks() -- tries to detach up to imbalance load/util/tasks from +- * busiest_rq, as part of a balancing operation within domain "sd". +- * +- * Returns number of detached tasks if successful and 0 otherwise. +- */ +-static int detach_tasks(struct lb_env *env) +-{ +- struct list_head *tasks = &env->src_rq->cfs_tasks; +- unsigned long util, load; +- struct task_struct *p; +- int detached = 0; +- +- lockdep_assert_held(&env->src_rq->lock); +- +- if (env->imbalance <= 0) +- return 0; +- +- while (!list_empty(tasks)) { +- /* +- * We don't want to steal all, otherwise we may be treated likewise, +- * which could at worst lead to a livelock crash. 
+- */ +- if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1) +- break; +- +- p = list_last_entry(tasks, struct task_struct, se.group_node); +- +- env->loop++; +- /* We've more or less seen every task there is, call it quits */ +- if (env->loop > env->loop_max) +- break; +- +- /* take a breather every nr_migrate tasks */ +- if (env->loop > env->loop_break) { +- env->loop_break += sched_nr_migrate_break; +- env->flags |= LBF_NEED_BREAK; +- break; +- } +- +- if (!can_migrate_task(p, env)) +- goto next; +- +- switch (env->migration_type) { +- case migrate_load: +- load = task_h_load(p); +- +- if (sched_feat(LB_MIN) && +- load < 16 && !env->sd->nr_balance_failed) +- goto next; +- +- /* +- * Make sure that we don't migrate too much load. +- * Nevertheless, let relax the constraint if +- * scheduler fails to find a good waiting task to +- * migrate. +- */ +- if (load/2 > env->imbalance && +- env->sd->nr_balance_failed <= env->sd->cache_nice_tries) +- goto next; +- +- env->imbalance -= load; +- break; +- +- case migrate_util: +- util = task_util_est(p); +- +- if (util > env->imbalance) +- goto next; +- +- env->imbalance -= util; +- break; +- +- case migrate_task: +- env->imbalance--; +- break; +- +- case migrate_misfit: +- /* This is not a misfit task */ +- if (task_fits_capacity(p, capacity_of(env->src_cpu))) +- goto next; +- +- env->imbalance = 0; +- break; +- } +- +- detach_task(p, env); +- list_add(&p->se.group_node, &env->tasks); +- +- detached++; +- +-#ifdef CONFIG_PREEMPTION +- /* +- * NEWIDLE balancing is a source of latency, so preemptible +- * kernels will stop after the first task is detached to minimize +- * the critical section. +- */ +- if (env->idle == CPU_NEWLY_IDLE) +- break; +-#endif +- +- /* +- * We only want to steal up to the prescribed amount of +- * load/util/tasks. +- */ +- if (env->imbalance <= 0) +- break; +- +- continue; +-next: +- list_move(&p->se.group_node, tasks); +- } +- +- /* +- * Right now, this is one of only two places we collect this stat +- * so we can safely collect detach_one_task() stats here rather +- * than inside detach_one_task(). +- */ +- schedstat_add(env->sd->lb_gained[env->idle], detached); +- +- return detached; +-} +- +-/* +- * attach_task() -- attach the task detached by detach_task() to its new rq. +- */ +-static void attach_task(struct rq *rq, struct task_struct *p) +-{ +- lockdep_assert_held(&rq->lock); +- +- BUG_ON(task_rq(p) != rq); +- activate_task(rq, p, ENQUEUE_NOCLOCK); +- check_preempt_curr(rq, p, 0); +-} +- +-/* +- * attach_one_task() -- attaches the task returned from detach_one_task() to +- * its new rq. +- */ +-static void attach_one_task(struct rq *rq, struct task_struct *p) +-{ +- struct rq_flags rf; +- +- rq_lock(rq, &rf); +- update_rq_clock(rq); +- attach_task(rq, p); +- rq_unlock(rq, &rf); +-} +- +-/* +- * attach_tasks() -- attaches all tasks detached by detach_tasks() to their +- * new rq. 
+- */ +-static void attach_tasks(struct lb_env *env) +-{ +- struct list_head *tasks = &env->tasks; +- struct task_struct *p; +- struct rq_flags rf; +- +- rq_lock(env->dst_rq, &rf); +- update_rq_clock(env->dst_rq); +- +- while (!list_empty(tasks)) { +- p = list_first_entry(tasks, struct task_struct, se.group_node); +- list_del_init(&p->se.group_node); +- +- attach_task(env->dst_rq, p); +- } +- +- rq_unlock(env->dst_rq, &rf); +-} + + #ifdef CONFIG_NO_HZ_COMMON + static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) +@@ -9086,293 +7655,6 @@ + ) / SCHED_CAPACITY_SCALE; + } + +-/******* find_busiest_group() helpers end here *********************/ +- +-/* +- * Decision matrix according to the local and busiest group type: +- * +- * busiest \ local has_spare fully_busy misfit asym imbalanced overloaded +- * has_spare nr_idle balanced N/A N/A balanced balanced +- * fully_busy nr_idle nr_idle N/A N/A balanced balanced +- * misfit_task force N/A N/A N/A force force +- * asym_packing force force N/A N/A force force +- * imbalanced force force N/A N/A force force +- * overloaded force force N/A N/A force avg_load +- * +- * N/A : Not Applicable because already filtered while updating +- * statistics. +- * balanced : The system is balanced for these 2 groups. +- * force : Calculate the imbalance as load migration is probably needed. +- * avg_load : Only if imbalance is significant enough. +- * nr_idle : dst_cpu is not busy and the number of idle CPUs is quite +- * different in groups. +- */ +- +-/** +- * find_busiest_group - Returns the busiest group within the sched_domain +- * if there is an imbalance. +- * +- * Also calculates the amount of runnable load which should be moved +- * to restore balance. +- * +- * @env: The load balancing environment. +- * +- * Return: - The busiest group if imbalance exists. +- */ +-static struct sched_group *find_busiest_group(struct lb_env *env) +-{ +- struct sg_lb_stats *local, *busiest; +- struct sd_lb_stats sds; +- +- init_sd_lb_stats(&sds); +- +- /* +- * Compute the various statistics relevant for load balancing at +- * this level. +- */ +- update_sd_lb_stats(env, &sds); +- +- if (sched_energy_enabled()) { +- struct root_domain *rd = env->dst_rq->rd; +- +- if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized)) +- goto out_balanced; +- } +- +- local = &sds.local_stat; +- busiest = &sds.busiest_stat; +- +- /* There is no busy sibling group to pull tasks from */ +- if (!sds.busiest) +- goto out_balanced; +- +- /* Misfit tasks should be dealt with regardless of the avg load */ +- if (busiest->group_type == group_misfit_task) +- goto force_balance; +- +- /* ASYM feature bypasses nice load balance check */ +- if (busiest->group_type == group_asym_packing) +- goto force_balance; +- +- /* +- * If the busiest group is imbalanced the below checks don't +- * work because they assume all things are equal, which typically +- * isn't true due to cpus_ptr constraints and the like. +- */ +- if (busiest->group_type == group_imbalanced) +- goto force_balance; +- +- /* +- * If the local group is busier than the selected busiest group +- * don't try and pull any tasks. +- */ +- if (local->group_type > busiest->group_type) +- goto out_balanced; +- +- /* +- * When groups are overloaded, use the avg_load to ensure fairness +- * between tasks. +- */ +- if (local->group_type == group_overloaded) { +- /* +- * If the local group is more loaded than the selected +- * busiest group don't try to pull any tasks. 
+- */ +- if (local->avg_load >= busiest->avg_load) +- goto out_balanced; +- +- /* XXX broken for overlapping NUMA groups */ +- sds.avg_load = (sds.total_load * SCHED_CAPACITY_SCALE) / +- sds.total_capacity; +- +- /* +- * Don't pull any tasks if this group is already above the +- * domain average load. +- */ +- if (local->avg_load >= sds.avg_load) +- goto out_balanced; +- +- /* +- * If the busiest group is more loaded, use imbalance_pct to be +- * conservative. +- */ +- if (100 * busiest->avg_load <= +- env->sd->imbalance_pct * local->avg_load) +- goto out_balanced; +- } +- +- /* Try to move all excess tasks to child's sibling domain */ +- if (sds.prefer_sibling && local->group_type == group_has_spare && +- busiest->sum_nr_running > local->sum_nr_running + 1) +- goto force_balance; +- +- if (busiest->group_type != group_overloaded) { +- if (env->idle == CPU_NOT_IDLE) +- /* +- * If the busiest group is not overloaded (and as a +- * result the local one too) but this CPU is already +- * busy, let another idle CPU try to pull task. +- */ +- goto out_balanced; +- +- if (busiest->group_weight > 1 && +- local->idle_cpus <= (busiest->idle_cpus + 1)) +- /* +- * If the busiest group is not overloaded +- * and there is no imbalance between this and busiest +- * group wrt idle CPUs, it is balanced. The imbalance +- * becomes significant if the diff is greater than 1 +- * otherwise we might end up to just move the imbalance +- * on another group. Of course this applies only if +- * there is more than 1 CPU per group. +- */ +- goto out_balanced; +- +- if (busiest->sum_h_nr_running == 1) +- /* +- * busiest doesn't have any tasks waiting to run +- */ +- goto out_balanced; +- } +- +-force_balance: +- /* Looks like there is an imbalance. Compute it */ +- calculate_imbalance(env, &sds); +- return env->imbalance ? sds.busiest : NULL; +- +-out_balanced: +- env->imbalance = 0; +- return NULL; +-} +- +-/* +- * find_busiest_queue - find the busiest runqueue among the CPUs in the group. +- */ +-static struct rq *find_busiest_queue(struct lb_env *env, +- struct sched_group *group) +-{ +- struct rq *busiest = NULL, *rq; +- unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; +- unsigned int busiest_nr = 0; +- int i; +- +- for_each_cpu_and(i, sched_group_span(group), env->cpus) { +- unsigned long capacity, load, util; +- unsigned int nr_running; +- enum fbq_type rt; +- +- rq = cpu_rq(i); +- rt = fbq_classify_rq(rq); +- +- /* +- * We classify groups/runqueues into three groups: +- * - regular: there are !numa tasks +- * - remote: there are numa tasks that run on the 'wrong' node +- * - all: there is no distinction +- * +- * In order to avoid migrating ideally placed numa tasks, +- * ignore those when there's better options. +- * +- * If we ignore the actual busiest queue to migrate another +- * task, the next balance pass can still reduce the busiest +- * queue by moving tasks around inside the node. +- * +- * If we cannot move enough load due to this classification +- * the next pass will adjust the group classification and +- * allow migration of more tasks. +- * +- * Both cases only affect the total convergence complexity. +- */ +- if (rt > env->fbq_type) +- continue; +- +- capacity = capacity_of(i); +- nr_running = rq->cfs.h_nr_running; +- +- /* +- * For ASYM_CPUCAPACITY domains, don't pick a CPU that could +- * eventually lead to active_balancing high->low capacity. +- * Higher per-CPU capacity is considered better than balancing +- * average load. 
+- */ +- if (env->sd->flags & SD_ASYM_CPUCAPACITY && +- capacity_of(env->dst_cpu) < capacity && +- nr_running == 1) +- continue; +- +- switch (env->migration_type) { +- case migrate_load: +- /* +- * When comparing with load imbalance, use cpu_load() +- * which is not scaled with the CPU capacity. +- */ +- load = cpu_load(rq); +- +- if (nr_running == 1 && load > env->imbalance && +- !check_cpu_capacity(rq, env->sd)) +- break; +- +- /* +- * For the load comparisons with the other CPUs, +- * consider the cpu_load() scaled with the CPU +- * capacity, so that the load can be moved away +- * from the CPU that is potentially running at a +- * lower capacity. +- * +- * Thus we're looking for max(load_i / capacity_i), +- * crosswise multiplication to rid ourselves of the +- * division works out to: +- * load_i * capacity_j > load_j * capacity_i; +- * where j is our previous maximum. +- */ +- if (load * busiest_capacity > busiest_load * capacity) { +- busiest_load = load; +- busiest_capacity = capacity; +- busiest = rq; +- } +- break; +- +- case migrate_util: +- util = cpu_util(cpu_of(rq)); +- +- /* +- * Don't try to pull utilization from a CPU with one +- * running task. Whatever its utilization, we will fail +- * detach the task. +- */ +- if (nr_running <= 1) +- continue; +- +- if (busiest_util < util) { +- busiest_util = util; +- busiest = rq; +- } +- break; +- +- case migrate_task: +- if (busiest_nr < nr_running) { +- busiest_nr = nr_running; +- busiest = rq; +- } +- break; +- +- case migrate_misfit: +- /* +- * For ASYM_CPUCAPACITY domains with misfit tasks we +- * simply seek the "biggest" misfit task. +- */ +- if (rq->misfit_task_load > busiest_load) { +- busiest_load = rq->misfit_task_load; +- busiest = rq; +- } +- +- break; +- +- } +- } +- +- return busiest; +-} + + /* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but +@@ -9419,334 +7701,6 @@ + return 0; + } + +-static int need_active_balance(struct lb_env *env) +-{ +- struct sched_domain *sd = env->sd; +- +- if (voluntary_active_balance(env)) +- return 1; +- +- return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); +-} +- +-static int active_load_balance_cpu_stop(void *data); +- +-static int should_we_balance(struct lb_env *env) +-{ +- struct sched_group *sg = env->sd->groups; +- int cpu, balance_cpu = -1; +- +- /* +- * Ensure the balancing environment is consistent; can happen +- * when the softirq triggers 'during' hotplug. +- */ +- if (!cpumask_test_cpu(env->dst_cpu, env->cpus)) +- return 0; +- +- /* +- * In the newly idle case, we will allow all the CPUs +- * to do the newly idle load balance. +- */ +- if (env->idle == CPU_NEWLY_IDLE) +- return 1; +- +- /* Try to find first idle CPU */ +- for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { +- if (!idle_cpu(cpu)) +- continue; +- +- balance_cpu = cpu; +- break; +- } +- +- if (balance_cpu == -1) +- balance_cpu = group_balance_cpu(sg); +- +- /* +- * First idle CPU or the first CPU(busiest) in this sched group +- * is eligible for doing load balancing at this and above domains. +- */ +- return balance_cpu == env->dst_cpu; +-} +- +-/* +- * Check this_cpu to ensure it is balanced within domain. Attempt to move +- * tasks if there is an imbalance. 
+- */ +-static int load_balance(int this_cpu, struct rq *this_rq, +- struct sched_domain *sd, enum cpu_idle_type idle, +- int *continue_balancing) +-{ +- int ld_moved, cur_ld_moved, active_balance = 0; +- struct sched_domain *sd_parent = sd->parent; +- struct sched_group *group; +- struct rq *busiest; +- struct rq_flags rf; +- struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); +- +- struct lb_env env = { +- .sd = sd, +- .dst_cpu = this_cpu, +- .dst_rq = this_rq, +- .dst_grpmask = sched_group_span(sd->groups), +- .idle = idle, +- .loop_break = sched_nr_migrate_break, +- .cpus = cpus, +- .fbq_type = all, +- .tasks = LIST_HEAD_INIT(env.tasks), +- }; +- +- cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask); +- +- schedstat_inc(sd->lb_count[idle]); +- +-redo: +- if (!should_we_balance(&env)) { +- *continue_balancing = 0; +- goto out_balanced; +- } +- +- group = find_busiest_group(&env); +- if (!group) { +- schedstat_inc(sd->lb_nobusyg[idle]); +- goto out_balanced; +- } +- +- busiest = find_busiest_queue(&env, group); +- if (!busiest) { +- schedstat_inc(sd->lb_nobusyq[idle]); +- goto out_balanced; +- } +- +- BUG_ON(busiest == env.dst_rq); +- +- schedstat_add(sd->lb_imbalance[idle], env.imbalance); +- +- env.src_cpu = busiest->cpu; +- env.src_rq = busiest; +- +- ld_moved = 0; +- if (busiest->nr_running > 1) { +- /* +- * Attempt to move tasks. If find_busiest_group has found +- * an imbalance but busiest->nr_running <= 1, the group is +- * still unbalanced. ld_moved simply stays zero, so it is +- * correctly treated as an imbalance. +- */ +- env.flags |= LBF_ALL_PINNED; +- env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); +- +-more_balance: +- rq_lock_irqsave(busiest, &rf); +- update_rq_clock(busiest); +- +- /* +- * cur_ld_moved - load moved in current iteration +- * ld_moved - cumulative load moved across iterations +- */ +- cur_ld_moved = detach_tasks(&env); +- +- /* +- * We've detached some tasks from busiest_rq. Every +- * task is masked "TASK_ON_RQ_MIGRATING", so we can safely +- * unlock busiest->lock, and we are able to be sure +- * that nobody can manipulate the tasks in parallel. +- * See task_rq_lock() family for the details. +- */ +- +- rq_unlock(busiest, &rf); +- +- if (cur_ld_moved) { +- attach_tasks(&env); +- ld_moved += cur_ld_moved; +- } +- +- local_irq_restore(rf.flags); +- +- if (env.flags & LBF_NEED_BREAK) { +- env.flags &= ~LBF_NEED_BREAK; +- goto more_balance; +- } +- +- /* +- * Revisit (affine) tasks on src_cpu that couldn't be moved to +- * us and move them to an alternate dst_cpu in our sched_group +- * where they can run. The upper limit on how many times we +- * iterate on same src_cpu is dependent on number of CPUs in our +- * sched_group. +- * +- * This changes load balance semantics a bit on who can move +- * load to a given_cpu. In addition to the given_cpu itself +- * (or a ilb_cpu acting on its behalf where given_cpu is +- * nohz-idle), we now have balance_cpu in a position to move +- * load to given_cpu. In rare situations, this may cause +- * conflicts (balance_cpu and given_cpu/ilb_cpu deciding +- * _independently_ and at _same_ time to move some load to +- * given_cpu) causing exceess load to be moved to given_cpu. +- * This however should not happen so much in practice and +- * moreover subsequent load balance cycles should correct the +- * excess load moved. 
+- */ +- if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { +- +- /* Prevent to re-select dst_cpu via env's CPUs */ +- __cpumask_clear_cpu(env.dst_cpu, env.cpus); +- +- env.dst_rq = cpu_rq(env.new_dst_cpu); +- env.dst_cpu = env.new_dst_cpu; +- env.flags &= ~LBF_DST_PINNED; +- env.loop = 0; +- env.loop_break = sched_nr_migrate_break; +- +- /* +- * Go back to "more_balance" rather than "redo" since we +- * need to continue with same src_cpu. +- */ +- goto more_balance; +- } +- +- /* +- * We failed to reach balance because of affinity. +- */ +- if (sd_parent) { +- int *group_imbalance = &sd_parent->groups->sgc->imbalance; +- +- if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) +- *group_imbalance = 1; +- } +- +- /* All tasks on this runqueue were pinned by CPU affinity */ +- if (unlikely(env.flags & LBF_ALL_PINNED)) { +- __cpumask_clear_cpu(cpu_of(busiest), cpus); +- /* +- * Attempting to continue load balancing at the current +- * sched_domain level only makes sense if there are +- * active CPUs remaining as possible busiest CPUs to +- * pull load from which are not contained within the +- * destination group that is receiving any migrated +- * load. +- */ +- if (!cpumask_subset(cpus, env.dst_grpmask)) { +- env.loop = 0; +- env.loop_break = sched_nr_migrate_break; +- goto redo; +- } +- goto out_all_pinned; +- } +- } +- +- if (!ld_moved) { +- schedstat_inc(sd->lb_failed[idle]); +- /* +- * Increment the failure counter only on periodic balance. +- * We do not want newidle balance, which can be very +- * frequent, pollute the failure counter causing +- * excessive cache_hot migrations and active balances. +- */ +- if (idle != CPU_NEWLY_IDLE) +- sd->nr_balance_failed++; +- +- if (need_active_balance(&env)) { +- unsigned long flags; +- +- raw_spin_lock_irqsave(&busiest->lock, flags); +- +- /* +- * Don't kick the active_load_balance_cpu_stop, +- * if the curr task on busiest CPU can't be +- * moved to this_cpu: +- */ +- if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { +- raw_spin_unlock_irqrestore(&busiest->lock, +- flags); +- env.flags |= LBF_ALL_PINNED; +- goto out_one_pinned; +- } +- +- /* +- * ->active_balance synchronizes accesses to +- * ->active_balance_work. Once set, it's cleared +- * only after active load balance is finished. +- */ +- if (!busiest->active_balance) { +- busiest->active_balance = 1; +- busiest->push_cpu = this_cpu; +- active_balance = 1; +- } +- raw_spin_unlock_irqrestore(&busiest->lock, flags); +- +- if (active_balance) { +- stop_one_cpu_nowait(cpu_of(busiest), +- active_load_balance_cpu_stop, busiest, +- &busiest->active_balance_work); +- } +- +- /* We've kicked active balancing, force task migration. */ +- sd->nr_balance_failed = sd->cache_nice_tries+1; +- } +- } else +- sd->nr_balance_failed = 0; +- +- if (likely(!active_balance) || voluntary_active_balance(&env)) { +- /* We were unbalanced, so reset the balancing interval */ +- sd->balance_interval = sd->min_interval; +- } else { +- /* +- * If we've begun active balancing, start to back off. This +- * case may not be covered by the all_pinned logic if there +- * is only 1 task on the busy runqueue (because we don't call +- * detach_tasks). +- */ +- if (sd->balance_interval < sd->max_interval) +- sd->balance_interval *= 2; +- } +- +- goto out; +- +-out_balanced: +- /* +- * We reach balance although we may have faced some affinity +- * constraints. Clear the imbalance flag only if other tasks got +- * a chance to move and fix the imbalance. 
+- */ +- if (sd_parent && !(env.flags & LBF_ALL_PINNED)) { +- int *group_imbalance = &sd_parent->groups->sgc->imbalance; +- +- if (*group_imbalance) +- *group_imbalance = 0; +- } +- +-out_all_pinned: +- /* +- * We reach balance because all tasks are pinned at this level so +- * we can't migrate them. Let the imbalance flag set so parent level +- * can try to migrate them. +- */ +- schedstat_inc(sd->lb_balanced[idle]); +- +- sd->nr_balance_failed = 0; +- +-out_one_pinned: +- ld_moved = 0; +- +- /* +- * newidle_balance() disregards balance intervals, so we could +- * repeatedly reach this code, which would lead to balance_interval +- * skyrocketting in a short amount of time. Skip the balance_interval +- * increase logic to avoid that. +- */ +- if (env.idle == CPU_NEWLY_IDLE) +- goto out; +- +- /* tune up the balancing interval */ +- if ((env.flags & LBF_ALL_PINNED && +- sd->balance_interval < MAX_PINNED_INTERVAL) || +- sd->balance_interval < sd->max_interval) +- sd->balance_interval *= 2; +-out: +- return ld_moved; +-} +- + static inline unsigned long + get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) + { +@@ -9776,99 +7730,6 @@ + } + + /* +- * active_load_balance_cpu_stop is run by the CPU stopper. It pushes +- * running tasks off the busiest CPU onto idle CPUs. It requires at +- * least 1 task to be running on each physical CPU where possible, and +- * avoids physical / logical imbalances. +- */ +-static int active_load_balance_cpu_stop(void *data) +-{ +- struct rq *busiest_rq = data; +- int busiest_cpu = cpu_of(busiest_rq); +- int target_cpu = busiest_rq->push_cpu; +- struct rq *target_rq = cpu_rq(target_cpu); +- struct sched_domain *sd; +- struct task_struct *p = NULL; +- struct rq_flags rf; +- +- rq_lock_irq(busiest_rq, &rf); +- /* +- * Between queueing the stop-work and running it is a hole in which +- * CPUs can become inactive. We should not move tasks from or to +- * inactive CPUs. +- */ +- if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) +- goto out_unlock; +- +- /* Make sure the requested CPU hasn't gone down in the meantime: */ +- if (unlikely(busiest_cpu != smp_processor_id() || +- !busiest_rq->active_balance)) +- goto out_unlock; +- +- /* Is there any task to move? */ +- if (busiest_rq->nr_running <= 1) +- goto out_unlock; +- +- /* +- * This condition is "impossible", if it occurs +- * we need to fix it. Originally reported by +- * Bjorn Helgaas on a 128-CPU setup. +- */ +- BUG_ON(busiest_rq == target_rq); +- +- /* Search for an sd spanning us and the target CPU. */ +- rcu_read_lock(); +- for_each_domain(target_cpu, sd) { +- if ((sd->flags & SD_LOAD_BALANCE) && +- cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) +- break; +- } +- +- if (likely(sd)) { +- struct lb_env env = { +- .sd = sd, +- .dst_cpu = target_cpu, +- .dst_rq = target_rq, +- .src_cpu = busiest_rq->cpu, +- .src_rq = busiest_rq, +- .idle = CPU_IDLE, +- /* +- * can_migrate_task() doesn't need to compute new_dst_cpu +- * for active balancing. Since we have CPU_IDLE, but no +- * @dst_grpmask we need to make that test go away with lying +- * about DST_PINNED. +- */ +- .flags = LBF_DST_PINNED, +- }; +- +- schedstat_inc(sd->alb_count); +- update_rq_clock(busiest_rq); +- +- p = detach_one_task(&env); +- if (p) { +- schedstat_inc(sd->alb_pushed); +- /* Active balancing done, reset the failure counter. 
*/ +- sd->nr_balance_failed = 0; +- } else { +- schedstat_inc(sd->alb_failed); +- } +- } +- rcu_read_unlock(); +-out_unlock: +- busiest_rq->active_balance = 0; +- rq_unlock(busiest_rq, &rf); +- +- if (p) +- attach_one_task(target_rq, p); +- +- local_irq_enable(); +- +- return 0; +-} +- +-static DEFINE_SPINLOCK(balancing); +- +-/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +@@ -9877,114 +7738,6 @@ + max_load_balance_interval = HZ*num_online_cpus()/10; + } + +-/* +- * It checks each scheduling domain to see if it is due to be balanced, +- * and initiates a balancing operation if so. +- * +- * Balancing parameters are set up in init_sched_domains. +- */ +-static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) +-{ +- int continue_balancing = 1; +- int cpu = rq->cpu; +- int busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); +- unsigned long interval; +- struct sched_domain *sd; +- /* Earliest time when we have to do rebalance again */ +- unsigned long next_balance = jiffies + 60*HZ; +- int update_next_balance = 0; +- int need_serialize, need_decay = 0; +- u64 max_cost = 0; +- +- rcu_read_lock(); +- for_each_domain(cpu, sd) { +- /* +- * Decay the newidle max times here because this is a regular +- * visit to all the domains. Decay ~1% per second. +- */ +- if (time_after(jiffies, sd->next_decay_max_lb_cost)) { +- sd->max_newidle_lb_cost = +- (sd->max_newidle_lb_cost * 253) / 256; +- sd->next_decay_max_lb_cost = jiffies + HZ; +- need_decay = 1; +- } +- max_cost += sd->max_newidle_lb_cost; +- +- if (!(sd->flags & SD_LOAD_BALANCE)) +- continue; +- +- /* +- * Stop the load balance at this level. There is another +- * CPU in our sched group which is doing load balancing more +- * actively. +- */ +- if (!continue_balancing) { +- if (need_decay) +- continue; +- break; +- } +- +- interval = get_sd_balance_interval(sd, busy); +- +- need_serialize = sd->flags & SD_SERIALIZE; +- if (need_serialize) { +- if (!spin_trylock(&balancing)) +- goto out; +- } +- +- if (time_after_eq(jiffies, sd->last_balance + interval)) { +- if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { +- /* +- * The LBF_DST_PINNED logic could have changed +- * env->dst_cpu, so we can't know our idle +- * state even if we migrated tasks. Update it. +- */ +- idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; +- busy = idle != CPU_IDLE && !sched_idle_cpu(cpu); +- } +- sd->last_balance = jiffies; +- interval = get_sd_balance_interval(sd, busy); +- } +- if (need_serialize) +- spin_unlock(&balancing); +-out: +- if (time_after(next_balance, sd->last_balance + interval)) { +- next_balance = sd->last_balance + interval; +- update_next_balance = 1; +- } +- } +- if (need_decay) { +- /* +- * Ensure the rq-wide value also decays but keep it at a +- * reasonable floor to avoid funnies with rq->avg_idle. +- */ +- rq->max_idle_balance_cost = +- max((u64)sysctl_sched_migration_cost, max_cost); +- } +- rcu_read_unlock(); +- +- /* +- * next_balance will be updated only when there is a need. +- * When the cpu is attached to null domain for ex, it will not be +- * updated. +- */ +- if (likely(update_next_balance)) { +- rq->next_balance = next_balance; +- +-#ifdef CONFIG_NO_HZ_COMMON +- /* +- * If this CPU has been elected to perform the nohz idle +- * balance. Other idle CPUs have already rebalanced with +- * nohz_idle_balance() and nohz.next_balance has been +- * updated accordingly. 
This CPU is now running the idle load +- * balance for itself and we need to update the +- * nohz.next_balance accordingly. +- */ +- if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance)) +- nohz.next_balance = rq->next_balance; +-#endif +- } +-} + + static inline int on_null_domain(struct rq *rq) + { +@@ -10014,420 +7767,12 @@ + return nr_cpu_ids; + } + +-/* +- * Kick a CPU to do the nohz balancing, if it is time for it. We pick any +- * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). +- */ +-static void kick_ilb(unsigned int flags) +-{ +- int ilb_cpu; +- +- nohz.next_balance++; +- +- ilb_cpu = find_new_ilb(); +- +- if (ilb_cpu >= nr_cpu_ids) +- return; +- +- flags = atomic_fetch_or(flags, nohz_flags(ilb_cpu)); +- if (flags & NOHZ_KICK_MASK) +- return; +- +- /* +- * Use smp_send_reschedule() instead of resched_cpu(). +- * This way we generate a sched IPI on the target CPU which +- * is idle. And the softirq performing nohz idle load balance +- * will be run before returning from the IPI. +- */ +- smp_send_reschedule(ilb_cpu); +-} +- +-/* +- * Current decision point for kicking the idle load balancer in the presence +- * of idle CPUs in the system. +- */ +-static void nohz_balancer_kick(struct rq *rq) +-{ +- unsigned long now = jiffies; +- struct sched_domain_shared *sds; +- struct sched_domain *sd; +- int nr_busy, i, cpu = rq->cpu; +- unsigned int flags = 0; +- +- if (unlikely(rq->idle_balance)) +- return; +- +- /* +- * We may be recently in ticked or tickless idle mode. At the first +- * busy tick after returning from idle, we will update the busy stats. +- */ +- nohz_balance_exit_idle(rq); +- +- /* +- * None are in tickless mode and hence no need for NOHZ idle load +- * balancing. +- */ +- if (likely(!atomic_read(&nohz.nr_cpus))) +- return; +- +- if (READ_ONCE(nohz.has_blocked) && +- time_after(now, READ_ONCE(nohz.next_blocked))) +- flags = NOHZ_STATS_KICK; +- +- if (time_before(now, nohz.next_balance)) +- goto out; +- +- if (rq->nr_running >= 2) { +- flags = NOHZ_KICK_MASK; +- goto out; +- } +- +- rcu_read_lock(); +- +- sd = rcu_dereference(rq->sd); +- if (sd) { +- /* +- * If there's a CFS task and the current CPU has reduced +- * capacity; kick the ILB to see if there's a better CPU to run +- * on. +- */ +- if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { +- flags = NOHZ_KICK_MASK; +- goto unlock; +- } +- } +- +- sd = rcu_dereference(per_cpu(sd_asym_packing, cpu)); +- if (sd) { +- /* +- * When ASYM_PACKING; see if there's a more preferred CPU +- * currently idle; in which case, kick the ILB to move tasks +- * around. +- */ +- for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) { +- if (sched_asym_prefer(i, cpu)) { +- flags = NOHZ_KICK_MASK; +- goto unlock; +- } +- } +- } +- +- sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, cpu)); +- if (sd) { +- /* +- * When ASYM_CPUCAPACITY; see if there's a higher capacity CPU +- * to run the misfit task on. +- */ +- if (check_misfit_status(rq, sd)) { +- flags = NOHZ_KICK_MASK; +- goto unlock; +- } +- +- /* +- * For asymmetric systems, we do not want to nicely balance +- * cache use, instead we want to embrace asymmetry and only +- * ensure tasks have enough CPU capacity. +- * +- * Skip the LLC logic because it's not relevant in that case. 
+- */ +- goto unlock; +- } +- +- sds = rcu_dereference(per_cpu(sd_llc_shared, cpu)); +- if (sds) { +- /* +- * If there is an imbalance between LLC domains (IOW we could +- * increase the overall cache use), we need some less-loaded LLC +- * domain to pull some load. Likewise, we may need to spread +- * load within the current LLC domain (e.g. packed SMT cores but +- * other CPUs are idle). We can't really know from here how busy +- * the others are - so just get a nohz balance going if it looks +- * like this LLC domain has tasks we could move. +- */ +- nr_busy = atomic_read(&sds->nr_busy_cpus); +- if (nr_busy > 1) { +- flags = NOHZ_KICK_MASK; +- goto unlock; +- } +- } +-unlock: +- rcu_read_unlock(); +-out: +- if (flags) +- kick_ilb(flags); +-} +- +-static void set_cpu_sd_state_busy(int cpu) +-{ +- struct sched_domain *sd; +- +- rcu_read_lock(); +- sd = rcu_dereference(per_cpu(sd_llc, cpu)); +- +- if (!sd || !sd->nohz_idle) +- goto unlock; +- sd->nohz_idle = 0; +- +- atomic_inc(&sd->shared->nr_busy_cpus); +-unlock: +- rcu_read_unlock(); +-} +- + void nohz_balance_exit_idle(struct rq *rq) + { +- SCHED_WARN_ON(rq != this_rq()); +- +- if (likely(!rq->nohz_tick_stopped)) +- return; +- +- rq->nohz_tick_stopped = 0; +- cpumask_clear_cpu(rq->cpu, nohz.idle_cpus_mask); +- atomic_dec(&nohz.nr_cpus); +- +- set_cpu_sd_state_busy(rq->cpu); + } + +-static void set_cpu_sd_state_idle(int cpu) +-{ +- struct sched_domain *sd; +- +- rcu_read_lock(); +- sd = rcu_dereference(per_cpu(sd_llc, cpu)); +- +- if (!sd || sd->nohz_idle) +- goto unlock; +- sd->nohz_idle = 1; +- +- atomic_dec(&sd->shared->nr_busy_cpus); +-unlock: +- rcu_read_unlock(); +-} +- +-/* +- * This routine will record that the CPU is going idle with tick stopped. +- * This info will be used in performing idle load balancing in the future. +- */ + void nohz_balance_enter_idle(int cpu) + { +- struct rq *rq = cpu_rq(cpu); +- +- SCHED_WARN_ON(cpu != smp_processor_id()); +- +- /* If this CPU is going down, then nothing needs to be done: */ +- if (!cpu_active(cpu)) +- return; +- +- /* Spare idle load balancing on CPUs that don't want to be disturbed: */ +- if (!housekeeping_cpu(cpu, HK_FLAG_SCHED)) +- return; +- +- /* +- * Can be set safely without rq->lock held +- * If a clear happens, it will have evaluated last additions because +- * rq->lock is held during the check and the clear +- */ +- rq->has_blocked_load = 1; +- +- /* +- * The tick is still stopped but load could have been added in the +- * meantime. We set the nohz.has_blocked flag to trig a check of the +- * *_avg. The CPU is already part of nohz.idle_cpus_mask so the clear +- * of nohz.has_blocked can only happen after checking the new load +- */ +- if (rq->nohz_tick_stopped) +- goto out; +- +- /* If we're a completely isolated CPU, we don't play: */ +- if (on_null_domain(rq)) +- return; +- +- rq->nohz_tick_stopped = 1; +- +- cpumask_set_cpu(cpu, nohz.idle_cpus_mask); +- atomic_inc(&nohz.nr_cpus); +- +- /* +- * Ensures that if nohz_idle_balance() fails to observe our +- * @idle_cpus_mask store, it must observe the @has_blocked +- * store. +- */ +- smp_mb__after_atomic(); +- +- set_cpu_sd_state_idle(cpu); +- +-out: +- /* +- * Each time a cpu enter idle, we assume that it has blocked load and +- * enable the periodic update of the load of idle cpus +- */ +- WRITE_ONCE(nohz.has_blocked, 1); +-} +- +-/* +- * Internal function that runs load balance for all idle cpus. 
The load balance +- * can be a simple update of blocked load or a complete load balance with +- * tasks movement depending of flags. +- * The function returns false if the loop has stopped before running +- * through all idle CPUs. +- */ +-static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags, +- enum cpu_idle_type idle) +-{ +- /* Earliest time when we have to do rebalance again */ +- unsigned long now = jiffies; +- unsigned long next_balance = now + 60*HZ; +- bool has_blocked_load = false; +- int update_next_balance = 0; +- int this_cpu = this_rq->cpu; +- int balance_cpu; +- int ret = false; +- struct rq *rq; +- +- SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK); +- +- /* +- * We assume there will be no idle load after this update and clear +- * the has_blocked flag. If a cpu enters idle in the mean time, it will +- * set the has_blocked flag and trig another update of idle load. +- * Because a cpu that becomes idle, is added to idle_cpus_mask before +- * setting the flag, we are sure to not clear the state and not +- * check the load of an idle cpu. +- */ +- WRITE_ONCE(nohz.has_blocked, 0); +- +- /* +- * Ensures that if we miss the CPU, we must see the has_blocked +- * store from nohz_balance_enter_idle(). +- */ +- smp_mb(); +- +- for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { +- if (balance_cpu == this_cpu || !idle_cpu(balance_cpu)) +- continue; +- +- /* +- * If this CPU gets work to do, stop the load balancing +- * work being done for other CPUs. Next load +- * balancing owner will pick it up. +- */ +- if (need_resched()) { +- has_blocked_load = true; +- goto abort; +- } +- +- rq = cpu_rq(balance_cpu); +- +- has_blocked_load |= update_nohz_stats(rq, true); +- +- /* +- * If time for next balance is due, +- * do the balance. +- */ +- if (time_after_eq(jiffies, rq->next_balance)) { +- struct rq_flags rf; +- +- rq_lock_irqsave(rq, &rf); +- update_rq_clock(rq); +- rq_unlock_irqrestore(rq, &rf); +- +- if (flags & NOHZ_BALANCE_KICK) +- rebalance_domains(rq, CPU_IDLE); +- } +- +- if (time_after(next_balance, rq->next_balance)) { +- next_balance = rq->next_balance; +- update_next_balance = 1; +- } +- } +- +- /* Newly idle CPU doesn't need an update */ +- if (idle != CPU_NEWLY_IDLE) { +- update_blocked_averages(this_cpu); +- has_blocked_load |= this_rq->has_blocked_load; +- } +- +- if (flags & NOHZ_BALANCE_KICK) +- rebalance_domains(this_rq, CPU_IDLE); +- +- WRITE_ONCE(nohz.next_blocked, +- now + msecs_to_jiffies(LOAD_AVG_PERIOD)); +- +- /* The full idle balance loop has been done */ +- ret = true; +- +-abort: +- /* There is still blocked load, enable periodic update */ +- if (has_blocked_load) +- WRITE_ONCE(nohz.has_blocked, 1); +- +- /* +- * next_balance will be updated only when there is a need. +- * When the CPU is attached to null domain for ex, it will not be +- * updated. +- */ +- if (likely(update_next_balance)) +- nohz.next_balance = next_balance; +- +- return ret; +-} +- +-/* +- * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the +- * rebalancing for all the cpus for whom scheduler ticks are stopped. 
+- */ +-static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) +-{ +- int this_cpu = this_rq->cpu; +- unsigned int flags; +- +- if (!(atomic_read(nohz_flags(this_cpu)) & NOHZ_KICK_MASK)) +- return false; +- +- if (idle != CPU_IDLE) { +- atomic_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); +- return false; +- } +- +- /* could be _relaxed() */ +- flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu)); +- if (!(flags & NOHZ_KICK_MASK)) +- return false; +- +- _nohz_idle_balance(this_rq, flags, idle); +- +- return true; +-} +- +-static void nohz_newidle_balance(struct rq *this_rq) +-{ +- int this_cpu = this_rq->cpu; +- +- /* +- * This CPU doesn't want to be disturbed by scheduler +- * housekeeping +- */ +- if (!housekeeping_cpu(this_cpu, HK_FLAG_SCHED)) +- return; +- +- /* Will wake up very soon. No time for doing anything else*/ +- if (this_rq->avg_idle < sysctl_sched_migration_cost) +- return; +- +- /* Don't need to update blocked load of idle CPUs*/ +- if (!READ_ONCE(nohz.has_blocked) || +- time_before(jiffies, READ_ONCE(nohz.next_blocked))) +- return; +- +- raw_spin_unlock(&this_rq->lock); +- /* +- * This CPU is going to be idle and blocked load of idle CPUs +- * need to be updated. Run the ilb locally as it is a good +- * candidate for ilb instead of waking up another idle CPU. +- * Kick an normal ilb if we failed to do the update. +- */ +- if (!_nohz_idle_balance(this_rq, NOHZ_STATS_KICK, CPU_NEWLY_IDLE)) +- kick_ilb(NOHZ_STATS_KICK); +- raw_spin_lock(&this_rq->lock); + } + + #else /* !CONFIG_NO_HZ_COMMON */ +@@ -10441,169 +7786,6 @@ + static inline void nohz_newidle_balance(struct rq *this_rq) { } + #endif /* CONFIG_NO_HZ_COMMON */ + +-/* +- * idle_balance is called by schedule() if this_cpu is about to become +- * idle. Attempts to pull tasks from other CPUs. +- * +- * Returns: +- * < 0 - we released the lock and there are !fair tasks present +- * 0 - failed, no new tasks +- * > 0 - success, new (fair) tasks present +- */ +-int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +-{ +- unsigned long next_balance = jiffies + HZ; +- int this_cpu = this_rq->cpu; +- struct sched_domain *sd; +- int pulled_task = 0; +- u64 curr_cost = 0; +- +- update_misfit_status(NULL, this_rq); +- /* +- * We must set idle_stamp _before_ calling idle_balance(), such that we +- * measure the duration of idle_balance() as idle time. +- */ +- this_rq->idle_stamp = rq_clock(this_rq); +- +- /* +- * Do not pull tasks towards !active CPUs... +- */ +- if (!cpu_active(this_cpu)) +- return 0; +- +- /* +- * This is OK, because current is on_cpu, which avoids it being picked +- * for load-balance and preemption/IRQs are still disabled avoiding +- * further scheduler activity on it and we're being very careful to +- * re-start the picking loop. 
+- */ +- rq_unpin_lock(this_rq, rf); +- +- if (this_rq->avg_idle < sysctl_sched_migration_cost || +- !READ_ONCE(this_rq->rd->overload)) { +- +- rcu_read_lock(); +- sd = rcu_dereference_check_sched_domain(this_rq->sd); +- if (sd) +- update_next_balance(sd, &next_balance); +- rcu_read_unlock(); +- +- nohz_newidle_balance(this_rq); +- +- goto out; +- } +- +- raw_spin_unlock(&this_rq->lock); +- +- update_blocked_averages(this_cpu); +- rcu_read_lock(); +- for_each_domain(this_cpu, sd) { +- int continue_balancing = 1; +- u64 t0, domain_cost; +- +- if (!(sd->flags & SD_LOAD_BALANCE)) +- continue; +- +- if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { +- update_next_balance(sd, &next_balance); +- break; +- } +- +- if (sd->flags & SD_BALANCE_NEWIDLE) { +- t0 = sched_clock_cpu(this_cpu); +- +- pulled_task = load_balance(this_cpu, this_rq, +- sd, CPU_NEWLY_IDLE, +- &continue_balancing); +- +- domain_cost = sched_clock_cpu(this_cpu) - t0; +- if (domain_cost > sd->max_newidle_lb_cost) +- sd->max_newidle_lb_cost = domain_cost; +- +- curr_cost += domain_cost; +- } +- +- update_next_balance(sd, &next_balance); +- +- /* +- * Stop searching for tasks to pull if there are +- * now runnable tasks on this rq. +- */ +- if (pulled_task || this_rq->nr_running > 0) +- break; +- } +- rcu_read_unlock(); +- +- raw_spin_lock(&this_rq->lock); +- +- if (curr_cost > this_rq->max_idle_balance_cost) +- this_rq->max_idle_balance_cost = curr_cost; +- +-out: +- /* +- * While browsing the domains, we released the rq lock, a task could +- * have been enqueued in the meantime. Since we're not going idle, +- * pretend we pulled a task. +- */ +- if (this_rq->cfs.h_nr_running && !pulled_task) +- pulled_task = 1; +- +- /* Move the next balance forward */ +- if (time_after(this_rq->next_balance, next_balance)) +- this_rq->next_balance = next_balance; +- +- /* Is there a task of a high priority class? */ +- if (this_rq->nr_running != this_rq->cfs.h_nr_running) +- pulled_task = -1; +- +- if (pulled_task) +- this_rq->idle_stamp = 0; +- +- rq_repin_lock(this_rq, rf); +- +- return pulled_task; +-} +- +-/* +- * run_rebalance_domains is triggered when needed from the scheduler tick. +- * Also triggered for nohz idle balancing (with nohz_balancing_kick set). +- */ +-static __latent_entropy void run_rebalance_domains(struct softirq_action *h) +-{ +- struct rq *this_rq = this_rq(); +- enum cpu_idle_type idle = this_rq->idle_balance ? +- CPU_IDLE : CPU_NOT_IDLE; +- +- /* +- * If this CPU has a pending nohz_balance_kick, then do the +- * balancing on behalf of the other idle CPUs whose ticks are +- * stopped. Do nohz_idle_balance *before* rebalance_domains to +- * give the idle CPUs a chance to load balance. Else we may +- * load balance only within the local sched_domain hierarchy +- * and abort nohz_idle_balance altogether if we pull some load. +- */ +- if (nohz_idle_balance(this_rq, idle)) +- return; +- +- /* normal load balance */ +- update_blocked_averages(this_rq->cpu); +- rebalance_domains(this_rq, idle); +-} +- +-/* +- * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 
+- */ +-void trigger_load_balance(struct rq *rq) +-{ +- /* Don't need to rebalance while attached to NULL domain */ +- if (unlikely(on_null_domain(rq))) +- return; +- +- if (time_after_eq(jiffies, rq->next_balance)) +- raise_softirq(SCHED_SOFTIRQ); +- +- nohz_balancer_kick(rq); +-} + + static void rq_online_fair(struct rq *rq) + { +@@ -10640,9 +7822,6 @@ + entity_tick(cfs_rq, se, queued); + } + +- if (static_branch_unlikely(&sched_numa_balancing)) +- task_tick_numa(rq, curr); +- + update_misfit_status(curr, rq); + update_overutilized_status(task_rq(curr)); + } +@@ -10655,7 +7834,7 @@ + static void task_fork_fair(struct task_struct *p) + { + struct cfs_rq *cfs_rq; +- struct sched_entity *se = &p->se, *curr; ++ struct sched_entity *curr; + struct rq *rq = this_rq(); + struct rq_flags rf; + +@@ -10666,20 +7845,9 @@ + curr = cfs_rq->curr; + if (curr) { + update_curr(cfs_rq); +- se->vruntime = curr->vruntime; + } +- place_entity(cfs_rq, se, 1); + +- if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { +- /* +- * Upon rescheduling, sched_class::put_prev_task() will place +- * 'current' within the tree based on its new key value. +- */ +- swap(curr->vruntime, se->vruntime); +- resched_curr(rq); +- } + +- se->vruntime -= cfs_rq->min_vruntime; + rq_unlock(rq, &rf); + } + +@@ -10708,58 +7876,9 @@ + check_preempt_curr(rq, p, 0); + } + +-static inline bool vruntime_normalized(struct task_struct *p) +-{ +- struct sched_entity *se = &p->se; +- +- /* +- * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases, +- * the dequeue_entity(.flags=0) will already have normalized the +- * vruntime. +- */ +- if (p->on_rq) +- return true; +- +- /* +- * When !on_rq, vruntime of the task has usually NOT been normalized. +- * But there are some cases where it has already been normalized: +- * +- * - A forked child which is waiting for being woken up by +- * wake_up_new_task(). +- * - A task which has been woken up by try_to_wake_up() and +- * waiting for actually being woken up by sched_ttwu_pending(). +- */ +- if (!se->sum_exec_runtime || +- (p->state == TASK_WAKING && p->sched_remote_wakeup)) +- return true; +- +- return false; +-} +- +-#ifdef CONFIG_FAIR_GROUP_SCHED +-/* +- * Propagate the changes of the sched_entity across the tg tree to make it +- * visible to the root +- */ +-static void propagate_entity_cfs_rq(struct sched_entity *se) +-{ +- struct cfs_rq *cfs_rq; +- +- /* Start to propagate at parent */ +- se = se->parent; + +- for_each_sched_entity(se) { +- cfs_rq = cfs_rq_of(se); +- +- if (cfs_rq_throttled(cfs_rq)) +- break; +- +- update_load_avg(cfs_rq, se, UPDATE_TG); +- } +-} +-#else + static void propagate_entity_cfs_rq(struct sched_entity *se) { } +-#endif ++ + + static void detach_entity_cfs_rq(struct sched_entity *se) + { +@@ -10776,14 +7895,6 @@ + { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + +-#ifdef CONFIG_FAIR_GROUP_SCHED +- /* +- * Since the real-depth could have been changed (only FAIR +- * class maintain depth value), reset depth properly. +- */ +- se->depth = se->parent ? se->parent->depth + 1 : 0; +-#endif +- + /* Synchronize entity with its cfs_rq */ + update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD); + attach_entity_load_avg(cfs_rq, se); +@@ -10794,29 +7905,13 @@ + static void detach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- +- if (!vruntime_normalized(p)) { +- /* +- * Fix up our vruntime so that the current sleep doesn't +- * cause 'unlimited' sleep bonus. 
+- */ +- place_entity(cfs_rq, se, 0); +- se->vruntime -= cfs_rq->min_vruntime; +- } +- + detach_entity_cfs_rq(se); + } + + static void attach_task_cfs_rq(struct task_struct *p) + { + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); +- + attach_entity_cfs_rq(se); +- +- if (!vruntime_normalized(p)) +- se->vruntime += cfs_rq->min_vruntime; + } + + static void switched_from_fair(struct rq *rq, struct task_struct *p) +@@ -10879,6 +7974,8 @@ + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); + #endif ++ ++ cfs_rq->head = NULL; + } + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -11203,7 +8300,6 @@ + __init void init_sched_fair_class(void) + { + #ifdef CONFIG_SMP +- open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); + + #ifdef CONFIG_NO_HZ_COMMON + nohz.next_balance = jiffies; +diff --color -rubN linux-5.7.6/kernel/sched/sched.h linux-5.7.6.cachy/kernel/sched/sched.h +--- linux-5.7.6/kernel/sched/sched.h 2020-06-25 01:49:26.000000000 +1000 ++++ linux-5.7.6.cachy/kernel/sched/sched.h 2020-07-24 17:52:04.479461959 +1000 +@@ -516,6 +516,7 @@ + * 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ ++ struct sched_entity *head; + struct sched_entity *curr; + struct sched_entity *next; + struct sched_entity *last; +@@ -541,50 +542,7 @@ + unsigned long runnable_avg; + } removed; + +-#ifdef CONFIG_FAIR_GROUP_SCHED +- unsigned long tg_load_avg_contrib; +- long propagate; +- long prop_runnable_sum; +- +- /* +- * h_load = weight * f(tg) +- * +- * Where f(tg) is the recursive weight fraction assigned to +- * this group. +- */ +- unsigned long h_load; +- u64 last_h_load_update; +- struct sched_entity *h_load_next; +-#endif /* CONFIG_FAIR_GROUP_SCHED */ + #endif /* CONFIG_SMP */ +- +-#ifdef CONFIG_FAIR_GROUP_SCHED +- struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ +- +- /* +- * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in +- * a hierarchy). Non-leaf lrqs hold other higher schedulable entities +- * (like users, containers etc.) +- * +- * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. +- * This list is used during load balance. 
+- */ +- int on_list; +- struct list_head leaf_cfs_rq_list; +- struct task_group *tg; /* group that "owns" this runqueue */ +- +-#ifdef CONFIG_CFS_BANDWIDTH +- int runtime_enabled; +- s64 runtime_remaining; +- +- u64 throttled_clock; +- u64 throttled_clock_task; +- u64 throttled_clock_task_time; +- int throttled; +- int throttle_count; +- struct list_head throttled_list; +-#endif /* CONFIG_CFS_BANDWIDTH */ +-#endif /* CONFIG_FAIR_GROUP_SCHED */ + }; + + static inline int rt_bandwidth_enabled(void) +diff --color -rubN linux-5.7.6/Makefile linux-5.7.6.cachy/Makefile +--- linux-5.7.6/Makefile 2020-06-25 01:49:26.000000000 +1000 ++++ linux-5.7.6.cachy/Makefile 2020-07-24 14:33:53.453645295 +1000 +@@ -2,8 +2,8 @@ + VERSION = 5 + PATCHLEVEL = 7 + SUBLEVEL = 6 +-EXTRAVERSION = +-NAME = Kleptomaniac Octopus ++EXTRAVERSION = -cachy ++NAME = Cachy + + # *DOCUMENTATION* + # To see a list of typical targets execute "make help" diff --git a/linux-tkg/linux-tkg-patches/5.7/0011-ZFS-fix.patch b/linux-tkg/linux-tkg-patches/5.7/0011-ZFS-fix.patch new file mode 100644 index 0000000..af71d04 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0011-ZFS-fix.patch @@ -0,0 +1,43 @@ +From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= +Date: Thu, 2 May 2019 05:28:08 +0100 +Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with + EXPORT_SYMBOL_GPL +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +We need these symbols in zfs as the fpu implementation breaks userspace: + +https://github.com/zfsonlinux/zfs/issues/9346 +Signed-off-by: Jörg Thalheim +--- + arch/x86/kernel/fpu/core.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c +index 12c70840980e..352538b3bb5d 100644 +--- a/arch/x86/kernel/fpu/core.c ++++ b/arch/x86/kernel/fpu/core.c +@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) + } + __cpu_invalidate_fpregs_state(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_begin); ++EXPORT_SYMBOL(kernel_fpu_begin); + + void kernel_fpu_end(void) + { +@@ -111,7 +111,7 @@ void kernel_fpu_end(void) + this_cpu_write(in_kernel_fpu, false); + preempt_enable(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_end); ++EXPORT_SYMBOL(kernel_fpu_end); + + /* + * Save the FPU state (mark it for reload if necessary): +-- +2.23.0 + + diff --git a/linux-tkg/linux-tkg-patches/5.7/0012-linux-hardened.patch b/linux-tkg/linux-tkg-patches/5.7/0012-linux-hardened.patch new file mode 100644 index 0000000..6f20939 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0012-linux-hardened.patch @@ -0,0 +1,2916 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 5e2ce88d6eda..5cdeccf3459f 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -518,17 +518,6 @@ + nosocket -- Disable socket memory accounting. + nokmem -- Disable kernel memory accounting. + +- checkreqprot [SELINUX] Set initial checkreqprot flag value. +- Format: { "0" | "1" } +- See security/selinux/Kconfig help text. +- 0 -- check protection applied by kernel (includes +- any implied execute protection). +- 1 -- check protection requested by application. +- Default value is set via a kernel config option. +- Value can be changed at runtime via +- /sys/fs/selinux/checkreqprot. +- Setting checkreqprot to 1 is deprecated. 
+- + cio_ignore= [S390] + See Documentation/s390/common_io.rst for details. + clk_ignore_unused +@@ -3446,6 +3435,11 @@ + the specified number of seconds. This is to be used if + your oopses keep scrolling off the screen. + ++ extra_latent_entropy ++ Enable a very simple form of latent entropy extraction ++ from the first 4GB of memory as the bootmem allocator ++ passes the memory pages to the buddy allocator. ++ + pcbit= [HW,ISDN] + + pcd. [PARIDE] +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 0d427fd10941..e0042d797c38 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -1167,6 +1167,26 @@ If a value outside of this range is written to ``threads-max`` an + ``EINVAL`` error occurs. + + ++tiocsti_restrict ++================ ++ ++This toggle indicates whether unprivileged users are prevented from ++using the ``TIOCSTI`` ioctl to inject commands into other processes ++which share a tty session. ++ ++When ``tiocsti_restrict`` is set to (0) there are no restrictions(accept ++the default restriction of only being able to injection commands into ++one's own tty). When ``tiocsti_restrict`` is set to (1), users must have ++``CAP_SYS_ADMIN`` to use the ``TIOCSTI`` ioctl. ++ ++When user namespaces are in use, the check for the capability ++``CAP_SYS_ADMIN`` is done against the user namespace that originally ++opened the tty. ++ ++The kernel config option ``CONFIG_SECURITY_TIOCSTI_RESTRICT`` sets the ++default value of ``tiocsti_restrict``. ++ ++ + unknown_nmi_panic + ================= + +diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt +index 9375324aa8e1..7bd9b330493c 100644 +--- a/Documentation/networking/ip-sysctl.txt ++++ b/Documentation/networking/ip-sysctl.txt +@@ -587,6 +587,23 @@ tcp_comp_sack_nr - INTEGER + + Default : 44 + ++tcp_simult_connect - BOOLEAN ++ Enable TCP simultaneous connect that adds a weakness in Linux's strict ++ implementation of TCP that allows two clients to connect to each other ++ without either entering a listening state. The weakness allows an attacker ++ to easily prevent a client from connecting to a known server provided the ++ source port for the connection is guessed correctly. ++ ++ As the weakness could be used to prevent an antivirus or IPS from fetching ++ updates, or prevent an SSL gateway from fetching a CRL, it should be ++ eliminated by disabling this option. Though Linux is one of few operating ++ systems supporting simultaneous connect, it has no legitimate use in ++ practice and is rarely supported by firewalls. ++ ++ Disabling this may break TCP STUNT which is used by some applications for ++ NAT traversal. ++ Default: Value of CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON ++ + tcp_slow_start_after_idle - BOOLEAN + If set, provide RFC2861 behavior and time out the congestion + window after an idle period. 
An idle period is defined at +diff --git a/arch/Kconfig b/arch/Kconfig +index 786a85d4ad40..78ae69e78a81 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -671,7 +671,7 @@ config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT +- default ARCH_MMAP_RND_BITS_MIN ++ default ARCH_MMAP_RND_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to +@@ -705,7 +705,7 @@ config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT +- default ARCH_MMAP_RND_COMPAT_BITS_MIN ++ default ARCH_MMAP_RND_COMPAT_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 5d513f461957..39abe5fd57fb 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -1216,6 +1216,7 @@ config RODATA_FULL_DEFAULT_ENABLED + + config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" ++ default y + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved +@@ -1706,6 +1707,7 @@ config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select ARM64_MODULE_PLTS if MODULES + select RELOCATABLE ++ default y + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts +diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug +index a1efa246c9ed..ccacb3619b59 100644 +--- a/arch/arm64/Kconfig.debug ++++ b/arch/arm64/Kconfig.debug +@@ -26,6 +26,7 @@ config ARM64_RANDOMIZE_TEXT_OFFSET + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. + +diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig +index 03d0189f7d68..d7c642f8f063 100644 +--- a/arch/arm64/configs/defconfig ++++ b/arch/arm64/configs/defconfig +@@ -1,4 +1,3 @@ +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ_IDLE=y +diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h +index b618017205a3..0a228dbcad65 100644 +--- a/arch/arm64/include/asm/elf.h ++++ b/arch/arm64/include/asm/elf.h +@@ -103,14 +103,10 @@ + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +-#ifdef CONFIG_ARM64_FORCE_52BIT +-#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) +-#else +-#define ELF_ET_DYN_BASE (2 * DEFAULT_MAP_WINDOW_64 / 3) +-#endif /* CONFIG_ARM64_FORCE_52BIT */ ++#define ELF_ET_DYN_BASE 0x100000000UL + + #ifndef __ASSEMBLY__ + +@@ -164,10 +160,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, + /* 1GB of VA */ + #ifdef CONFIG_COMPAT + #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? 
\ +- 0x7ff >> (PAGE_SHIFT - 12) : \ +- 0x3ffff >> (PAGE_SHIFT - 12)) ++ ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ ++ ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #else +-#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) ++#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #endif + + #ifdef __AARCH64EB__ +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 2d3f963fd6f1..7b5923dd44e1 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1191,8 +1191,7 @@ config VM86 + default X86_LEGACY_VM86 + + config X86_16BIT +- bool "Enable support for 16-bit segments" if EXPERT +- default y ++ bool "Enable support for 16-bit segments" + depends on MODIFY_LDT_SYSCALL + ---help--- + This option is required by programs like Wine to run 16-bit +@@ -2329,7 +2328,7 @@ config COMPAT_VDSO + choice + prompt "vsyscall table for legacy applications" + depends on X86_64 +- default LEGACY_VSYSCALL_XONLY ++ default LEGACY_VSYSCALL_NONE + help + Legacy user code that does not know how to find the vDSO expects + to be able to issue three syscalls by calling fixed addresses in +@@ -2425,8 +2424,7 @@ config CMDLINE_OVERRIDE + be set to 'N' under normal conditions. + + config MODIFY_LDT_SYSCALL +- bool "Enable the LDT (local descriptor table)" if EXPERT +- default y ++ bool "Enable the LDT (local descriptor table)" + ---help--- + Linux can allow user programs to install a per-process x86 + Local Descriptor Table (LDT) using the modify_ldt(2) system +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index 2e74690b028a..87c7294dd172 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -75,6 +75,7 @@ config EFI_PGT_DUMP + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. + +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index 614961009075..06c473ba6b1a 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -1,5 +1,4 @@ + # CONFIG_LOCALVERSION_AUTO is not set +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_TASKSTATS=y +diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c +index 43428cc514c8..1b01bf6a6fe7 100644 +--- a/arch/x86/entry/vdso/vma.c ++++ b/arch/x86/entry/vdso/vma.c +@@ -316,55 +316,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) + } + + #ifdef CONFIG_X86_64 +-/* +- * Put the vdso above the (randomized) stack with another randomized +- * offset. This way there is no hole in the middle of address space. +- * To save memory make sure it is still in the same PTE as the stack +- * top. This doesn't give that many random bits. +- * +- * Note that this algorithm is imperfect: the distribution of the vdso +- * start address within a PMD is biased toward the end. +- * +- * Only used for the 64-bit and x32 vdsos. +- */ +-static unsigned long vdso_addr(unsigned long start, unsigned len) +-{ +- unsigned long addr, end; +- unsigned offset; +- +- /* +- * Round up the start address. It can start out unaligned as a result +- * of stack start randomization. +- */ +- start = PAGE_ALIGN(start); +- +- /* Round the lowest possible end address up to a PMD boundary. 
*/ +- end = (start + len + PMD_SIZE - 1) & PMD_MASK; +- if (end >= TASK_SIZE_MAX) +- end = TASK_SIZE_MAX; +- end -= len; +- +- if (end > start) { +- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); +- addr = start + (offset << PAGE_SHIFT); +- } else { +- addr = start; +- } +- +- /* +- * Forcibly align the final address in case we have a hardware +- * issue that requires alignment for performance reasons. +- */ +- addr = align_vdso_addr(addr); +- +- return addr; +-} +- + static int map_vdso_randomized(const struct vdso_image *image) + { +- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); +- +- return map_vdso(image, addr); ++ return map_vdso(image, 0); + } + #endif + +diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h +index 69c0f892e310..f9f7a85bb71e 100644 +--- a/arch/x86/include/asm/elf.h ++++ b/arch/x86/include/asm/elf.h +@@ -248,11 +248,11 @@ extern int force_personality32; + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ + #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ +- (DEFAULT_MAP_WINDOW / 3 * 2)) ++ 0x100000000UL) + + /* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. This could be done in user space, +@@ -312,8 +312,8 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); + + #ifdef CONFIG_X86_32 + +-#define __STACK_RND_MASK(is32bit) (0x7ff) +-#define STACK_RND_MASK (0x7ff) ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) + + #define ARCH_DLINFO ARCH_DLINFO_IA32 + +@@ -322,7 +322,11 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); + #else /* CONFIG_X86_32 */ + + /* 1GB for 64bit, 8MB for 32bit */ +-#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) ++#ifdef CONFIG_COMPAT ++#define __STACK_RND_MASK(is32bit) ((is32bit) ? 
(1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) ++#else ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#endif + #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) + + #define ARCH_DLINFO \ +@@ -380,5 +384,4 @@ struct va_alignment { + } ____cacheline_aligned; + + extern struct va_alignment va_align; +-extern unsigned long align_vdso_addr(unsigned long); + #endif /* _ASM_X86_ELF_H */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 6f66d841262d..b786e7cb395d 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -295,6 +295,7 @@ static inline void cr4_set_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 | mask) != cr4) + __cr4_set(cr4 | mask); + } +@@ -305,6 +306,7 @@ static inline void cr4_clear_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 & ~mask) != cr4) + __cr4_set(cr4 & ~mask); + } +@@ -334,6 +336,7 @@ static inline void cr4_toggle_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + __cr4_set(cr4 ^ mask); + } + +@@ -440,6 +443,7 @@ static inline void __native_flush_tlb_global(void) + raw_local_irq_save(flags); + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 8f4533c1a4ec..632ef7ef4615 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -42,6 +42,8 @@ + #include + #include + #include ++#include ++#include + + #include "process.h" + +@@ -907,7 +909,10 @@ unsigned long arch_align_stack(unsigned long sp) + + unsigned long arch_randomize_brk(struct mm_struct *mm) + { +- return randomize_page(mm->brk, 0x02000000); ++ if (mmap_is_ia32()) ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; ++ else ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + /* +diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c +index 504fa5425bce..e30ec4c750d1 100644 +--- a/arch/x86/kernel/sys_x86_64.c ++++ b/arch/x86/kernel/sys_x86_64.c +@@ -52,13 +52,6 @@ static unsigned long get_align_bits(void) + return va_align.bits & get_align_mask(); + } + +-unsigned long align_vdso_addr(unsigned long addr) +-{ +- unsigned long align_mask = get_align_mask(); +- addr = (addr + align_mask) & ~align_mask; +- return addr | get_align_bits(); +-} +- + static int __init control_va_addr_alignment(char *str) + { + /* guard against enabling this on other CPU families */ +@@ -120,10 +113,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, + } + + *begin = get_mmap_base(1); +- if (in_32bit_syscall()) +- *end = task_size_32bit(); +- else +- *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); ++ *end = get_mmap_base(0); + } + + unsigned long +@@ -200,7 +190,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; +- info.low_limit = PAGE_SIZE; ++ info.low_limit = get_mmap_base(1); + info.high_limit = get_mmap_base(0); + + /* +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 4222a010057a..2c0c6b47b75b 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -566,9 +566,9 
@@ static void __init pagetable_init(void) + + #define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) + /* Bits supported by the hardware: */ +-pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; ++pteval_t __supported_pte_mask __ro_after_init = DEFAULT_PTE_MASK; + /* Bits allowed in normal kernel mappings: */ +-pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; ++pteval_t __default_kernel_pte_mask __ro_after_init = DEFAULT_PTE_MASK; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ + EXPORT_SYMBOL(__default_kernel_pte_mask); +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c +index 8b5f73f5e207..83f76a72f684 100644 +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -98,9 +98,9 @@ DEFINE_ENTRY(pte, pte, init) + */ + + /* Bits supported by the hardware: */ +-pteval_t __supported_pte_mask __read_mostly = ~0; ++pteval_t __supported_pte_mask __ro_after_init = ~0; + /* Bits allowed in normal kernel mappings: */ +-pteval_t __default_kernel_pte_mask __read_mostly = ~0; ++pteval_t __default_kernel_pte_mask __ro_after_init = ~0; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ + EXPORT_SYMBOL(__default_kernel_pte_mask); +diff --git a/block/blk-softirq.c b/block/blk-softirq.c +index 6e7ec87d49fa..d6ee3f8b3e74 100644 +--- a/block/blk-softirq.c ++++ b/block/blk-softirq.c +@@ -20,7 +20,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. + */ +-static __latent_entropy void blk_done_softirq(struct softirq_action *h) ++static __latent_entropy void blk_done_softirq(void) + { + struct list_head *cpu_list, local_list; + +diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c +index e74c8fe2a5fd..ec43f04b1687 100644 +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -4541,7 +4541,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) + struct ata_port *ap; + unsigned int tag; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + ap = qc->ap; + + qc->flags = 0; +@@ -4558,7 +4558,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) + struct ata_port *ap; + struct ata_link *link; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); + ap = qc->ap; + link = qc->dev->link; +diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig +index d4665fe9ccd2..315576465ca0 100644 +--- a/drivers/char/Kconfig ++++ b/drivers/char/Kconfig +@@ -326,7 +326,6 @@ config NSC_GPIO + + config DEVMEM + bool "/dev/mem virtual device support" +- default y + help + Say Y here if you want to support the /dev/mem device. + The /dev/mem device is used to access areas of physical +@@ -390,7 +389,6 @@ config MAX_RAW_DEVS + config DEVPORT + bool "/dev/port character device" + depends on ISA || PCI +- default y + help + Say Y here if you want to support the /dev/port device. The /dev/port + device is similar to /dev/mem, but for I/O ports. 
+diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index 2dff93d7a501..f1da13f791cd 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -122,7 +122,6 @@ config UNIX98_PTYS + + config LEGACY_PTYS + bool "Legacy (BSD) PTY support" +- default y + ---help--- + A pseudo terminal (PTY) is a software device consisting of two + halves: a master and a slave. The slave device behaves identical to +diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c +index 5a6f36b391d9..616d82a19160 100644 +--- a/drivers/tty/tty_io.c ++++ b/drivers/tty/tty_io.c +@@ -174,6 +174,7 @@ static void free_tty_struct(struct tty_struct *tty) + put_device(tty->dev); + kfree(tty->write_buf); + tty->magic = 0xDEADDEAD; ++ put_user_ns(tty->owner_user_ns); + kfree(tty); + } + +@@ -2179,11 +2180,19 @@ static int tty_fasync(int fd, struct file *filp, int on) + * FIXME: may race normal receive processing + */ + ++int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); ++ + static int tiocsti(struct tty_struct *tty, char __user *p) + { + char ch, mbz = 0; + struct tty_ldisc *ld; + ++ if (tiocsti_restrict && ++ !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { ++ dev_warn_ratelimited(tty->dev, ++ "Denied TIOCSTI ioctl for non-privileged process\n"); ++ return -EPERM; ++ } + if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ch, p)) +@@ -3009,6 +3018,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) + tty->index = idx; + tty_line_name(driver, idx, tty->name); + tty->dev = tty_get_device(tty); ++ tty->owner_user_ns = get_user_ns(current_user_ns()); + + return tty; + } +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c +index fc748c731832..f745c9ee5885 100644 +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -46,6 +46,8 @@ + #define USB_TP_TRANSMISSION_DELAY 40 /* ns */ + #define USB_TP_TRANSMISSION_DELAY_MAX 65535 /* ns */ + ++extern int deny_new_usb; ++ + /* Protect struct usb_device->state and ->children members + * Note: Both are also protected by ->dev.sem, except that ->state can + * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. 
*/ +@@ -5100,6 +5102,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, + goto done; + return; + } ++ ++ if (deny_new_usb) { ++ dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); ++ goto done; ++ } ++ + if (hub_is_superspeed(hub->hdev)) + unit_load = 150; + else +diff --git a/fs/exec.c b/fs/exec.c +index 2c465119affc..bf220ff8c019 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -62,6 +62,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -274,6 +275,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) + mm->stack_vm = mm->total_vm = 1; + up_write(&mm->mmap_sem); + bprm->p = vma->vm_end - sizeof(void *); ++ if (randomize_va_space) ++ bprm->p ^= get_random_int() & ~PAGE_MASK; + return 0; + err: + up_write(&mm->mmap_sem); +diff --git a/fs/namei.c b/fs/namei.c +index a320371899cf..6cc595eed647 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -918,10 +918,10 @@ static inline void put_link(struct nameidata *nd) + path_put(&last->link); + } + +-int sysctl_protected_symlinks __read_mostly = 0; +-int sysctl_protected_hardlinks __read_mostly = 0; +-int sysctl_protected_fifos __read_mostly; +-int sysctl_protected_regular __read_mostly; ++int sysctl_protected_symlinks __read_mostly = 1; ++int sysctl_protected_hardlinks __read_mostly = 1; ++int sysctl_protected_fifos __read_mostly = 2; ++int sysctl_protected_regular __read_mostly = 2; + + /** + * may_follow_link - Check symlink following for unsafe situations +diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig +index 88e1763e02f3..71820a515c91 100644 +--- a/fs/nfs/Kconfig ++++ b/fs/nfs/Kconfig +@@ -195,7 +195,6 @@ config NFS_DEBUG + bool + depends on NFS_FS && SUNRPC_DEBUG + select CRC32 +- default y + + config NFS_DISABLE_UDP_SUPPORT + bool "NFS: Disable NFS UDP protocol support" +diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig +index 27ef84d99f59..fb27f99a5e66 100644 +--- a/fs/proc/Kconfig ++++ b/fs/proc/Kconfig +@@ -41,7 +41,6 @@ config PROC_KCORE + config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP +- default y + help + Exports the dump image of crashed kernel in ELF format. 
+ +diff --git a/fs/stat.c b/fs/stat.c +index 030008796479..b1c2c0d5b874 100644 +--- a/fs/stat.c ++++ b/fs/stat.c +@@ -42,8 +42,13 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); +- stat->atime = inode->i_atime; +- stat->mtime = inode->i_mtime; ++ if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = inode->i_ctime; ++ stat->mtime = inode->i_ctime; ++ } else { ++ stat->atime = inode->i_atime; ++ stat->mtime = inode->i_mtime; ++ } + stat->ctime = inode->i_ctime; + stat->blksize = i_blocksize(inode); + stat->blocks = inode->i_blocks; +@@ -79,9 +84,14 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, + if (IS_AUTOMOUNT(inode)) + stat->attributes |= STATX_ATTR_AUTOMOUNT; + +- if (inode->i_op->getattr) +- return inode->i_op->getattr(path, stat, request_mask, +- query_flags); ++ if (inode->i_op->getattr) { ++ int retval = inode->i_op->getattr(path, stat, request_mask, query_flags); ++ if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = stat->ctime; ++ stat->mtime = stat->ctime; ++ } ++ return retval; ++ } + + generic_fillattr(inode, stat); + return 0; +diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c +index e39fdec8a0b0..08610405fdae 100644 +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -28,7 +28,11 @@ + #include + #include + ++#ifdef CONFIG_USERFAULTFD_UNPRIVILEGED + int sysctl_unprivileged_userfaultfd __read_mostly = 1; ++#else ++int sysctl_unprivileged_userfaultfd __read_mostly; ++#endif + + static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly; + +diff --git a/include/linux/cache.h b/include/linux/cache.h +index 750621e41d1c..e7157c18c62c 100644 +--- a/include/linux/cache.h ++++ b/include/linux/cache.h +@@ -31,6 +31,8 @@ + #define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) + #endif + ++#define __read_only __ro_after_init ++ + #ifndef ____cacheline_aligned + #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) + #endif +diff --git a/include/linux/capability.h b/include/linux/capability.h +index ecce0f43c73a..e46306dd4401 100644 +--- a/include/linux/capability.h ++++ b/include/linux/capability.h +@@ -208,6 +208,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); + extern bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap); + extern bool capable(int cap); ++extern bool capable_noaudit(int cap); + extern bool ns_capable(struct user_namespace *ns, int cap); + extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); + extern bool ns_capable_setid(struct user_namespace *ns, int cap); +@@ -234,6 +235,10 @@ static inline bool capable(int cap) + { + return true; + } ++static inline bool capable_noaudit(int cap) ++{ ++ return true; ++} + static inline bool ns_capable(struct user_namespace *ns, int cap) + { + return true; +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 45cc10cdf6dd..162d589f120a 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3659,4 +3659,15 @@ static inline int inode_drain_writes(struct inode *inode) + return filemap_write_and_wait(inode->i_mapping); + } + ++extern int device_sidechannel_restrict; ++ ++static inline bool is_sidechannel_device(const struct inode *inode) ++{ ++ umode_t mode; ++ if (!device_sidechannel_restrict) ++ return false; ++ mode = inode->i_mode; ++ return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & 
(S_IROTH | S_IWOTH))); ++} ++ + #endif /* _LINUX_FS_H */ +diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h +index 5ab28f6c7d26..6333478e581c 100644 +--- a/include/linux/fsnotify.h ++++ b/include/linux/fsnotify.h +@@ -65,6 +65,9 @@ static inline int fsnotify_file(struct file *file, __u32 mask) + struct inode *inode = file_inode(file); + int ret; + ++ if (mask & (FS_ACCESS | FS_MODIFY) && is_sidechannel_device(inode)) ++ return 0; ++ + if (file->f_mode & FMODE_NONOTIFY) + return 0; + +diff --git a/include/linux/gfp.h b/include/linux/gfp.h +index 4aba4c86c626..7d2bd45f35ed 100644 +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -561,9 +561,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); + extern unsigned long get_zeroed_page(gfp_t gfp_mask); + +-void *alloc_pages_exact(size_t size, gfp_t gfp_mask); ++void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __attribute__((alloc_size(1))); + void free_pages_exact(void *virt, size_t size); +-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); ++void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __attribute__((alloc_size(2))); + + #define __get_free_page(gfp_mask) \ + __get_free_pages((gfp_mask), 0) +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index ea5cdbd8c2c3..805b84d6bbca 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -215,6 +215,13 @@ static inline void clear_highpage(struct page *page) + kunmap_atomic(kaddr); + } + ++static inline void verify_zero_highpage(struct page *page) ++{ ++ void *kaddr = kmap_atomic(page); ++ BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); ++ kunmap_atomic(kaddr); ++} ++ + static inline void zero_user_segments(struct page *page, + unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index 80f637c3a6f3..0188c5fa11cb 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -554,7 +554,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; + + struct softirq_action + { +- void (*action)(struct softirq_action *); ++ void (*action)(void); + }; + + asmlinkage void do_softirq(void); +@@ -569,7 +569,7 @@ static inline void do_softirq_own_stack(void) + } + #endif + +-extern void open_softirq(int nr, void (*action)(struct softirq_action *)); ++extern void __init open_softirq(int nr, void (*action)(void)); + extern void softirq_init(void); + extern void __raise_softirq_irqoff(unsigned int nr); + +diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h +index 069aa2ebef90..cb9e3637a620 100644 +--- a/include/linux/kobject_ns.h ++++ b/include/linux/kobject_ns.h +@@ -45,7 +45,7 @@ struct kobj_ns_type_operations { + void (*drop_ns)(void *); + }; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); + int kobj_ns_type_registered(enum kobj_ns_type type); + const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); + const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 465e8ad671f8..57f78e2fcdac 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -751,7 +751,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) + } + #endif + +-extern void *kvmalloc_node(size_t size, gfp_t flags, int node); 
++extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __attribute__((alloc_size(1))); + static inline void *kvmalloc(size_t size, gfp_t flags) + { + return kvmalloc_node(size, flags, NUMA_NO_NODE); +diff --git a/include/linux/percpu.h b/include/linux/percpu.h +index 5e76af742c80..9a6c682ec127 100644 +--- a/include/linux/percpu.h ++++ b/include/linux/percpu.h +@@ -123,7 +123,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + #endif + +-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); + extern bool is_kernel_percpu_address(unsigned long addr); + +@@ -131,8 +131,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); + extern void __init setup_per_cpu_areas(void); + #endif + +-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); +-extern void __percpu *__alloc_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __attribute__((alloc_size(1))); ++extern void __percpu *__alloc_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern void free_percpu(void __percpu *__pdata); + extern phys_addr_t per_cpu_ptr_to_phys(void *addr); + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 9c3e7619c929..2976a90b927c 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1303,6 +1303,14 @@ static inline int perf_is_paranoid(void) + return sysctl_perf_event_paranoid > -1; + } + ++static inline int perf_allow_open(struct perf_event_attr *attr) ++{ ++ if (sysctl_perf_event_paranoid > 2 && !capable(CAP_SYS_ADMIN)) ++ return -EACCES; ++ ++ return security_perf_event_open(attr, PERF_SECURITY_OPEN); ++} ++ + static inline int perf_allow_kernel(struct perf_event_attr *attr) + { + if (sysctl_perf_event_paranoid > 1 && !capable(CAP_SYS_ADMIN)) +diff --git a/include/linux/slab.h b/include/linux/slab.h +index 6d454886bcaf..60e0df2ccc59 100644 +--- a/include/linux/slab.h ++++ b/include/linux/slab.h +@@ -184,7 +184,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); + /* + * Common kmalloc functions provided by all allocators + */ +-void * __must_check krealloc(const void *, size_t, gfp_t); ++void * __must_check krealloc(const void *, size_t, gfp_t) __attribute((alloc_size(2))); + void kfree(const void *); + void kzfree(const void *); + size_t __ksize(const void *); +@@ -389,7 +389,7 @@ static __always_inline unsigned int kmalloc_index(size_t size) + } + #endif /* !CONFIG_SLOB */ + +-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; ++void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; + void kmem_cache_free(struct kmem_cache *, void *); + +@@ -413,7 +413,7 @@ static __always_inline void kfree_bulk(size_t size, void **p) + } + + #ifdef CONFIG_NUMA +-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; ++void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment 
__malloc; + #else + static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) +@@ -538,7 +538,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) + * Try really hard to succeed the allocation but fail + * eventually. + */ +-static __always_inline void *kmalloc(size_t size, gfp_t flags) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc(size_t size, gfp_t flags) + { + if (__builtin_constant_p(size)) { + #ifndef CONFIG_SLOB +@@ -560,7 +560,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags) + return __kmalloc(size, flags); + } + +-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc_node(size_t size, gfp_t flags, int node) + { + #ifndef CONFIG_SLOB + if (__builtin_constant_p(size) && +diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h +index d2153789bd9f..97da977d6060 100644 +--- a/include/linux/slub_def.h ++++ b/include/linux/slub_def.h +@@ -121,6 +121,11 @@ struct kmem_cache { + unsigned long random; + #endif + ++#ifdef CONFIG_SLAB_CANARY ++ unsigned long random_active; ++ unsigned long random_inactive; ++#endif ++ + #ifdef CONFIG_NUMA + /* + * Defragmentation by allocating from a remote node. +diff --git a/include/linux/string.h b/include/linux/string.h +index 9b7a0632e87a..5c2420dfe2e7 100644 +--- a/include/linux/string.h ++++ b/include/linux/string.h +@@ -271,6 +271,12 @@ void __read_overflow2(void) __compiletime_error("detected read beyond size of ob + void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); + void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); + ++#ifdef CONFIG_FORTIFY_SOURCE_STRICT_STRING ++#define __string_size(p) __builtin_object_size(p, 1) ++#else ++#define __string_size(p) __builtin_object_size(p, 0) ++#endif ++ + #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + + #ifdef CONFIG_KASAN +@@ -299,7 +305,7 @@ extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) + + __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) +@@ -309,7 +315,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + + __FORTIFY_INLINE char *strcat(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (p_size == (size_t)-1) + return __underlying_strcat(p, q); + if (strlcat(p, q, p_size) >= p_size) +@@ -320,7 +326,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) + __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + { + __kernel_size_t ret; +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || +@@ -335,7 +341,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); + __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? 
maxlen : p_size); + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); +@@ -347,8 +353,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); + __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + { + size_t ret; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __real_strlcpy(p, q, size); + ret = strlen(q); +@@ -368,8 +374,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) + { + size_t p_len, copy_len; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __underlying_strncat(p, q, count); + p_len = strlen(p); +@@ -482,8 +488,8 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) + /* defined after fortified strlen and memcpy to reuse them */ + __FORTIFY_INLINE char *strcpy(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __underlying_strcpy(p, q); + memcpy(p, q, strlen(q) + 1); +diff --git a/include/linux/tty.h b/include/linux/tty.h +index a99e9b8e4e31..ee272abea5f9 100644 +--- a/include/linux/tty.h ++++ b/include/linux/tty.h +@@ -14,6 +14,7 @@ + #include + #include + #include ++#include + + + /* +@@ -338,6 +339,7 @@ struct tty_struct { + /* If the tty has a pending do_SAK, queue it here - akpm */ + struct work_struct SAK_work; + struct tty_port *port; ++ struct user_namespace *owner_user_ns; + } __randomize_layout; + + /* Each of a tty's open files has private_data pointing to tty_file_private */ +@@ -347,6 +349,8 @@ struct tty_file_private { + struct list_head list; + }; + ++extern int tiocsti_restrict; ++ + /* tty magic number */ + #define TTY_MAGIC 0x5401 + +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index a95d3cc74d79..93c9cc5baa23 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -102,20 +102,20 @@ static inline void vmalloc_init(void) + static inline unsigned long vmalloc_nr_pages(void) { return 0; } + #endif + +-extern void *vmalloc(unsigned long size); +-extern void *vzalloc(unsigned long size); +-extern void *vmalloc_user(unsigned long size); +-extern void *vmalloc_node(unsigned long size, int node); +-extern void *vzalloc_node(unsigned long size, int node); +-extern void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags); +-extern void *vmalloc_exec(unsigned long size); +-extern void *vmalloc_32(unsigned long size); +-extern void *vmalloc_32_user(unsigned long size); +-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); ++extern void *vmalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vzalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); ++extern void *vzalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); ++extern void 
*vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags) __attribute__((alloc_size(1))); ++extern void *vmalloc_exec(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_32(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_32_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) __attribute__((alloc_size(1))); + extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, +- const void *caller); ++ const void *caller) __attribute__((alloc_size(1))); + #ifndef CONFIG_MMU + extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); + static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 6f8e60c6fbc7..fe971ed1978b 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -244,6 +244,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); + /* sysctl variables for tcp */ + extern int sysctl_tcp_max_orphans; + extern long sysctl_tcp_mem[3]; ++extern int sysctl_tcp_simult_connect; + + #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ + #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ +diff --git a/init/Kconfig b/init/Kconfig +index 74a5ac65644f..b0f67731c203 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -349,6 +349,7 @@ config USELIB + config AUDIT + bool "Auditing support" + depends on NET ++ default y + help + Enable auditing infrastructure that can be used with another + kernel subsystem, such as SELinux (which requires this for +@@ -1102,6 +1103,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ depends on USER_NS ++ default n ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say N. ++ + config PID_NS + bool "PID Namespaces" + default y +@@ -1515,8 +1532,7 @@ config SHMEM + which may be appropriate on small systems without swap. + + config AIO +- bool "Enable AIO support" if EXPERT +- default y ++ bool "Enable AIO support" + help + This option enables POSIX asynchronous I/O which may by used + by some high performance threaded applications. Disabling +@@ -1652,6 +1668,23 @@ config USERFAULTFD + Enable the userfaultfd() system call that allows to intercept and + handle page faults in userland. + ++config USERFAULTFD_UNPRIVILEGED ++ bool "Allow unprivileged users to use the userfaultfd syscall" ++ depends on USERFAULTFD ++ default n ++ help ++ When disabled, unprivileged users will not be able to use the userfaultfd ++ syscall. Userfaultfd provide attackers with a way to stall a kernel ++ thread in the middle of memory accesses from userspace by initiating an ++ access on an unmapped page. To avoid various heap grooming and heap ++ spraying techniques for exploiting use-after-free flaws this should be ++ disabled by default. ++ ++ This setting can be overridden at runtime via the ++ vm.unprivileged_userfaultfd sysctl. 
++ ++ If unsure, say N. ++ + config ARCH_HAS_MEMBARRIER_CALLBACKS + bool + +@@ -1764,7 +1797,7 @@ config VM_EVENT_COUNTERS + + config SLUB_DEBUG + default y +- bool "Enable SLUB debugging support" if EXPERT ++ bool "Enable SLUB debugging support" + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. Disabling these can +@@ -1788,7 +1821,6 @@ config SLUB_MEMCG_SYSFS_ON + + config COMPAT_BRK + bool "Disable heap randomization" +- default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). +@@ -1835,7 +1867,6 @@ endchoice + + config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" +- default y + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. +@@ -1848,9 +1879,9 @@ config SLAB_MERGE_DEFAULT + command line. + + config SLAB_FREELIST_RANDOM +- default n + depends on SLAB || SLUB + bool "SLAB freelist randomization" ++ default y + help + Randomizes the freelist order used on creating new pages. This + security feature reduces the predictability of the kernel slab +@@ -1859,12 +1890,30 @@ config SLAB_FREELIST_RANDOM + config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLUB ++ default y + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This options makes minor performance + sacrifices to harden the kernel slab allocator against common + freelist exploit methods. + ++config SLAB_CANARY ++ depends on SLUB ++ depends on !SLAB_MERGE_DEFAULT ++ bool "SLAB canaries" ++ default y ++ help ++ Place canaries at the end of kernel slab allocations, sacrificing ++ some performance and memory usage for security. ++ ++ Canaries can detect some forms of heap corruption when allocations ++ are freed and as part of the HARDENED_USERCOPY feature. It provides ++ basic use-after-free detection for HARDENED_USERCOPY. ++ ++ Canaries absorb small overflows (rendering them harmless), mitigate ++ non-NUL terminated C string overflows on 64-bit via a guaranteed zero ++ byte and provide basic double-free detection. ++ + config SHUFFLE_PAGE_ALLOCATOR + bool "Page allocator randomization" + default SLAB_FREELIST_RANDOM && ACPI_NUMA +diff --git a/kernel/audit.c b/kernel/audit.c +index f711f424a28a..f15d1d41244c 100644 +--- a/kernel/audit.c ++++ b/kernel/audit.c +@@ -1642,6 +1642,9 @@ static int __init audit_enable(char *str) + + if (audit_default == AUDIT_OFF) + audit_initialized = AUDIT_DISABLED; ++ else if (!audit_ever_enabled) ++ audit_initialized = AUDIT_UNINITIALIZED; ++ + if (audit_set_enabled(audit_default)) + pr_err("audit: error setting audit state (%d)\n", + audit_default); +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index 916f5132a984..296a07014999 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -520,7 +520,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) + /* All BPF JIT sysctl knobs here. 
*/ + int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); + int bpf_jit_kallsyms __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_DEFAULT_ON); +-int bpf_jit_harden __read_mostly; ++int bpf_jit_harden __read_mostly = 2; + long bpf_jit_limit __read_mostly; + + static void +diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c +index c8acc8f37583..ccf05cdfd932 100644 +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(prog_idr_lock); + static DEFINE_IDR(map_idr); + static DEFINE_SPINLOCK(map_idr_lock); + +-int sysctl_unprivileged_bpf_disabled __read_mostly; ++int sysctl_unprivileged_bpf_disabled __read_mostly = 1; + + static const struct bpf_map_ops * const bpf_map_types[] = { + #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) +diff --git a/kernel/capability.c b/kernel/capability.c +index 1444f3954d75..8cc9dd7992f2 100644 +--- a/kernel/capability.c ++++ b/kernel/capability.c +@@ -449,6 +449,12 @@ bool capable(int cap) + return ns_capable(&init_user_ns, cap); + } + EXPORT_SYMBOL(capable); ++ ++bool capable_noaudit(int cap) ++{ ++ return ns_capable_noaudit(&init_user_ns, cap); ++} ++EXPORT_SYMBOL(capable_noaudit); + #endif /* CONFIG_MULTIUSER */ + + /** +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 1dd91f960839..90a629557f9e 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -406,8 +406,13 @@ static cpumask_var_t perf_online_mask; + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv ++ * 3 - disallow all unpriv perf event use + */ ++#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT ++int sysctl_perf_event_paranoid __read_mostly = 3; ++#else + int sysctl_perf_event_paranoid __read_mostly = 2; ++#endif + + /* Minimum for 512 kiB + 1 user control page */ + int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +@@ -11501,7 +11506,7 @@ SYSCALL_DEFINE5(perf_event_open, + return -EINVAL; + + /* Do we allow access to perf_event_open(2) ? */ +- err = security_perf_event_open(&attr, PERF_SECURITY_OPEN); ++ err = perf_allow_open(&attr); + if (err) + return err; + +diff --git a/kernel/fork.c b/kernel/fork.c +index 48ed22774efa..ec61454a18d5 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -106,6 +106,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1848,6 +1853,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
+@@ -2948,6 +2957,12 @@ int ksys_unshare(unsigned long unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c +index dd572ce7c747..95af139ac6ba 100644 +--- a/kernel/rcu/tiny.c ++++ b/kernel/rcu/tiny.c +@@ -100,7 +100,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) + } + + /* Invoke the RCU callbacks whose grace period has elapsed. */ +-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) ++static __latent_entropy void rcu_process_callbacks(void) + { + struct rcu_head *next, *list; + unsigned long flags; +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index d9a49cd6065a..9f63b28e3ebe 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -2437,7 +2437,7 @@ static __latent_entropy void rcu_core(void) + trace_rcu_utilization(TPS("End RCU core")); + } + +-static void rcu_core_si(struct softirq_action *h) ++static void rcu_core_si(void) + { + rcu_core(); + } +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 5725199b32dc..dfb99620cb41 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10568,7 +10568,7 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf) + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ +-static __latent_entropy void run_rebalance_domains(struct softirq_action *h) ++static __latent_entropy void run_rebalance_domains(void) + { + struct rq *this_rq = this_rq(); + enum cpu_idle_type idle = this_rq->idle_balance ? 
+diff --git a/kernel/softirq.c b/kernel/softirq.c +index a47c6dd57452..c12cb85a6504 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -52,7 +52,7 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); + EXPORT_PER_CPU_SYMBOL(irq_stat); + #endif + +-static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; ++static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); + + DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + +@@ -289,7 +289,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); +- h->action(h); ++ h->action(); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", +@@ -453,7 +453,7 @@ void __raise_softirq_irqoff(unsigned int nr) + or_softirq_pending(1UL << nr); + } + +-void open_softirq(int nr, void (*action)(struct softirq_action *)) ++void __init open_softirq(int nr, void (*action)(void)) + { + softirq_vec[nr].action = action; + } +@@ -499,8 +499,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) + } + EXPORT_SYMBOL(__tasklet_hi_schedule); + +-static void tasklet_action_common(struct softirq_action *a, +- struct tasklet_head *tl_head, ++static void tasklet_action_common(struct tasklet_head *tl_head, + unsigned int softirq_nr) + { + struct tasklet_struct *list; +@@ -537,14 +536,14 @@ static void tasklet_action_common(struct softirq_action *a, + } + } + +-static __latent_entropy void tasklet_action(struct softirq_action *a) ++static __latent_entropy void tasklet_action(void) + { +- tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); ++ tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); + } + +-static __latent_entropy void tasklet_hi_action(struct softirq_action *a) ++static __latent_entropy void tasklet_hi_action(void) + { +- tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); ++ tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); + } + + void tasklet_init(struct tasklet_struct *t, +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 8a176d8727a3..87bc1d26c376 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include "../lib/kstrtox.h" + +@@ -104,12 +105,19 @@ + #if defined(CONFIG_SYSCTL) + + /* External variables not in a header file. 
*/ ++#if IS_ENABLED(CONFIG_USB) ++int deny_new_usb __read_mostly = 0; ++EXPORT_SYMBOL(deny_new_usb); ++#endif + extern int suid_dumpable; + #ifdef CONFIG_COREDUMP + extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -121,32 +129,32 @@ extern int sysctl_nr_trim_pages; + + /* Constants used for minimum and maximum */ + #ifdef CONFIG_LOCKUP_DETECTOR +-static int sixty = 60; ++static int sixty __read_only = 60; + #endif + +-static int __maybe_unused neg_one = -1; +-static int __maybe_unused two = 2; +-static int __maybe_unused four = 4; +-static unsigned long zero_ul; +-static unsigned long one_ul = 1; +-static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __maybe_unused neg_one __read_only = -1; ++static int __maybe_unused two __read_only = 2; ++static int __maybe_unused four __read_only = 4; ++static unsigned long zero_ul __read_only; ++static unsigned long one_ul __read_only = 1; ++static unsigned long long_max __read_only = LONG_MAX; ++static int one_hundred __read_only = 100; ++static int one_thousand __read_only = 1000; + #ifdef CONFIG_PRINTK +-static int ten_thousand = 10000; ++static int ten_thousand __read_only = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +-static int six_hundred_forty_kb = 640 * 1024; ++static int six_hundred_forty_kb __read_only = 640 * 1024; + #endif + + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +-static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; ++static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +-static int maxolduid = 65535; +-static int minolduid; ++static int maxolduid __read_only = 65535; ++static int minolduid __read_only; + +-static int ngroups_max = NGROUPS_MAX; ++static int ngroups_max __read_only = NGROUPS_MAX; + static const int cap_last_cap = CAP_LAST_CAP; + + /* +@@ -154,9 +162,12 @@ static const int cap_last_cap = CAP_LAST_CAP; + * and hung_task_check_interval_secs + */ + #ifdef CONFIG_DETECT_HUNG_TASK +-static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); ++static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ); + #endif + ++int device_sidechannel_restrict __read_mostly = 1; ++EXPORT_SYMBOL(device_sidechannel_restrict); ++ + #ifdef CONFIG_INOTIFY_USER + #include + #endif +@@ -289,19 +300,19 @@ static struct ctl_table sysctl_base_table[] = { + }; + + #ifdef CONFIG_SCHED_DEBUG +-static int min_sched_granularity_ns = 100000; /* 100 usecs */ +-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +-static int min_wakeup_granularity_ns; /* 0 usecs */ +-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ ++static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ ++static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ ++static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ ++static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ + #ifdef CONFIG_SMP +-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; ++static int min_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_NONE; ++static int max_sched_tunable_scaling 
__read_only = SCHED_TUNABLESCALING_END-1; + #endif /* CONFIG_SMP */ + #endif /* CONFIG_SCHED_DEBUG */ + + #ifdef CONFIG_COMPACTION +-static int min_extfrag_threshold; +-static int max_extfrag_threshold = 1000; ++static int min_extfrag_threshold __read_only; ++static int max_extfrag_threshold __read_only = 1000; + #endif + + static struct ctl_table kern_table[] = { +@@ -534,6 +545,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +@@ -880,6 +900,37 @@ static struct ctl_table kern_table[] = { + .extra1 = SYSCTL_ZERO, + .extra2 = &two, + }, ++#endif ++#if defined CONFIG_TTY ++ { ++ .procname = "tiocsti_restrict", ++ .data = &tiocsti_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++#endif ++ { ++ .procname = "device_sidechannel_restrict", ++ .data = &device_sidechannel_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++#if IS_ENABLED(CONFIG_USB) ++ { ++ .procname = "deny_new_usb", ++ .data = &deny_new_usb, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, + #endif + { + .procname = "ngroups_max", +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index d89da1c7e005..8e1003ef3ebb 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1588,7 +1588,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + } + } + +-static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) ++static __latent_entropy void hrtimer_run_softirq(void) + { + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index a5221abb4594..636f4f9566fa 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1780,7 +1780,7 @@ static inline void __run_timers(struct timer_base *base) + /* + * This function runs timers and the timer-tq in bottom half context. + */ +-static __latent_entropy void run_timer_softirq(struct softirq_action *h) ++static __latent_entropy void run_timer_softirq(void) + { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 8eadadc478f9..c36ecd19562c 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -21,6 +21,13 @@ + #include + #include + ++/* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else ++int unprivileged_userns_clone; ++#endif ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 21d9c5f6e7ec..ab5ae07fa69a 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -337,6 +337,9 @@ config SECTION_MISMATCH_WARN_ONLY + + If unsure, say Y. 
+ ++config DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE ++ bool "Enable verbose reporting of writable function pointers" ++ + # + # Select this config option from the architecture Kconfig, if it + # is preferred to always offer frame pointers as a config +@@ -798,6 +801,7 @@ menu "Debug Oops, Lockups and Hangs" + + config PANIC_ON_OOPS + bool "Panic on Oops" ++ default y + help + Say Y here to enable the kernel to panic when it oopses. This + has the same effect as setting oops=panic on the kernel command +@@ -807,7 +811,7 @@ config PANIC_ON_OOPS + anything erroneous after an oops which could result in data + corruption or other issues. + +- Say N if unsure. ++ Say Y if unsure. + + config PANIC_ON_OOPS_VALUE + int +@@ -1346,6 +1350,7 @@ menu "Debug kernel data structures" + config DEBUG_LIST + bool "Debug linked list manipulation" + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION ++ default y + help + Enable this to turn on extended checks in the linked-list + walking routines. +@@ -1385,6 +1390,7 @@ config DEBUG_NOTIFIERS + config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST ++ default y + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked +@@ -1540,6 +1546,7 @@ config STRICT_DEVMEM + config IO_STRICT_DEVMEM + bool "Filter I/O access to /dev/mem" + depends on STRICT_DEVMEM ++ default y + help + If this option is disabled, you allow userspace (root) access to all + io-memory regardless of whether a driver is actively using that +diff --git a/lib/irq_poll.c b/lib/irq_poll.c +index 2f17b488d58e..b6e7996a0058 100644 +--- a/lib/irq_poll.c ++++ b/lib/irq_poll.c +@@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) + } + EXPORT_SYMBOL(irq_poll_complete); + +-static void __latent_entropy irq_poll_softirq(struct softirq_action *h) ++static void __latent_entropy irq_poll_softirq(void) + { + struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); + int rearm = 0, budget = irq_poll_budget; +diff --git a/lib/kobject.c b/lib/kobject.c +index 83198cb37d8d..4a053b7aef42 100644 +--- a/lib/kobject.c ++++ b/lib/kobject.c +@@ -1009,9 +1009,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); + + + static DEFINE_SPINLOCK(kobj_ns_type_lock); +-static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; ++static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) + { + enum kobj_ns_type type = ops->type; + int error; +diff --git a/lib/nlattr.c b/lib/nlattr.c +index cace9b307781..39ba1387045d 100644 +--- a/lib/nlattr.c ++++ b/lib/nlattr.c +@@ -571,6 +571,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) + { + int minlen = min_t(int, count, nla_len(src)); + ++ BUG_ON(minlen < 0); ++ + memcpy(dest, nla_data(src), minlen); + if (count > minlen) + memset(dest + minlen, 0, count - minlen); +diff --git a/lib/vsprintf.c b/lib/vsprintf.c +index 7c47ad52ce2f..d1e002579732 100644 +--- a/lib/vsprintf.c ++++ b/lib/vsprintf.c +@@ -817,7 +817,7 @@ static char *ptr_to_id(char *buf, char *end, const void *ptr, + return pointer_string(buf, end, (const void *)hashval, spec); + } + +-int kptr_restrict __read_mostly; ++int kptr_restrict __read_mostly = 2; + + static noinline_for_stack + char *restricted_pointer(char *buf, char *end, const void *ptr, +diff --git a/mm/Kconfig 
b/mm/Kconfig +index c1acc34c1c35..06dd0aa41a1b 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -320,7 +320,8 @@ config KSM + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + depends on MMU +- default 4096 ++ default 32768 if ARM || (ARM64 && COMPAT) ++ default 65536 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages +diff --git a/mm/mmap.c b/mm/mmap.c +index f609e9ec4a25..66297ff169d9 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -231,6 +231,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) + + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); ++ /* properly handle unaligned min_brk as an empty heap */ ++ if (min_brk & ~PAGE_MASK) { ++ if (brk == min_brk) ++ newbrk -= PAGE_SIZE; ++ if (mm->brk == min_brk) ++ oldbrk -= PAGE_SIZE; ++ } + if (oldbrk == newbrk) { + mm->brk = brk; + goto success; +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index d0c0d9364aa6..1f1a45afac2a 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -106,6 +107,15 @@ struct pcpu_drain { + static DEFINE_MUTEX(pcpu_drain_mutex); + static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain); + ++bool __meminitdata extra_latent_entropy; ++ ++static int __init setup_extra_latent_entropy(char *str) ++{ ++ extra_latent_entropy = true; ++ return 0; ++} ++early_param("extra_latent_entropy", setup_extra_latent_entropy); ++ + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY + volatile unsigned long latent_entropy __latent_entropy; + EXPORT_SYMBOL(latent_entropy); +@@ -1479,6 +1489,25 @@ static void __free_pages_ok(struct page *page, unsigned int order) + local_irq_restore(flags); + } + ++static void __init __gather_extra_latent_entropy(struct page *page, ++ unsigned int nr_pages) ++{ ++ if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { ++ unsigned long hash = 0; ++ size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; ++ const unsigned long *data = lowmem_page_address(page); ++ ++ for (index = 0; index < end; index++) ++ hash ^= hash + data[index]; ++#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY ++ latent_entropy ^= hash; ++ add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); ++#else ++ add_device_randomness((const void *)&hash, sizeof(hash)); ++#endif ++ } ++} ++ + void __free_pages_core(struct page *page, unsigned int order) + { + unsigned int nr_pages = 1 << order; +@@ -1493,7 +1522,6 @@ void __free_pages_core(struct page *page, unsigned int order) + } + __ClearPageReserved(p); + set_page_count(p, 0); +- + atomic_long_add(nr_pages, &page_zone(page)->managed_pages); + set_page_refcounted(page); + __free_pages(page, order); +@@ -1544,6 +1572,7 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, + { + if (early_page_uninitialised(pfn)) + return; ++ __gather_extra_latent_entropy(page, 1 << order); + __free_pages_core(page, order); + } + +@@ -1635,6 +1664,7 @@ static void __init deferred_free_range(unsigned long pfn, + if (nr_pages == pageblock_nr_pages && + (pfn & (pageblock_nr_pages - 1)) == 0) { + set_pageblock_migratetype(page, MIGRATE_MOVABLE); ++ __gather_extra_latent_entropy(page, 1 << pageblock_order); + __free_pages_core(page, pageblock_order); + return; + } +@@ -1642,6 +1672,7 @@ static void __init deferred_free_range(unsigned long pfn, + for (i = 0; i < nr_pages; i++, page++, pfn++) { + if ((pfn & (pageblock_nr_pages - 1)) == 0) + 
set_pageblock_migratetype(page, MIGRATE_MOVABLE); ++ __gather_extra_latent_entropy(page, 1); + __free_pages_core(page, 0); + } + } +@@ -2202,6 +2233,12 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags + { + post_alloc_hook(page, order, gfp_flags); + ++ if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY) && want_init_on_free()) { ++ int i; ++ for (i = 0; i < (1 << order); i++) ++ verify_zero_highpage(page + i); ++ } ++ + if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags)) + kernel_init_free_pages(page, 1 << order); + +diff --git a/mm/slab.h b/mm/slab.h +index 74f7e09a7cfd..ce786e0af610 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -472,9 +472,13 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) + struct page *page; + + page = virt_to_head_page(obj); ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(!PageSlab(page)); ++#else + if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n", + __func__)) + return NULL; ++#endif + return page->slab_cache; + } + +@@ -520,9 +524,14 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) + return s; + + cachep = virt_to_cache(x); +- WARN_ONCE(cachep && !slab_equal_or_root(cachep, s), +- "%s: Wrong slab cache. %s but object is from %s\n", +- __func__, s->name, cachep->name); ++ if (cachep && !slab_equal_or_root(cachep, s)) { ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG(); ++#else ++ WARN_ONCE(1, "%s: Wrong slab cache. %s but object is from %s\n", ++ __func__, s->name, cachep->name); ++#endif ++ } + return cachep; + } + +@@ -547,7 +556,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) + * back there or track user information then we can + * only use the space before that information. + */ +- if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) ++ if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation +@@ -676,8 +685,10 @@ static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { } + static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) + { + if (static_branch_unlikely(&init_on_alloc)) { ++#ifndef CONFIG_SLUB + if (c->ctor) + return false; ++#endif + if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) + return flags & __GFP_ZERO; + return true; +@@ -687,9 +698,15 @@ static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c) + + static inline bool slab_want_init_on_free(struct kmem_cache *c) + { +- if (static_branch_unlikely(&init_on_free)) +- return !(c->ctor || +- (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))); ++ if (static_branch_unlikely(&init_on_free)) { ++#ifndef CONFIG_SLUB ++ if (c->ctor) ++ return false; ++#endif ++ if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ++ return false; ++ return true; ++ } + return false; + } + +diff --git a/mm/slab_common.c b/mm/slab_common.c +index 37d48a56431d..b8947336d0e1 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -28,10 +28,10 @@ + + #include "slab.h" + +-enum slab_state slab_state; ++enum slab_state slab_state __ro_after_init; + LIST_HEAD(slab_caches); + DEFINE_MUTEX(slab_mutex); +-struct kmem_cache *kmem_cache; ++struct kmem_cache *kmem_cache __ro_after_init; + + #ifdef CONFIG_HARDENED_USERCOPY + bool usercopy_fallback __ro_after_init = +@@ -59,7 +59,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, + /* + * Merge control. If this is set then no merging of slab caches will occur. 
+ */ +-static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); ++static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); + + static int __init setup_slab_nomerge(char *str) + { +diff --git a/mm/slub.c b/mm/slub.c +index 660f4324c097..54c3291a7571 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -123,6 +123,12 @@ static inline int kmem_cache_debug(struct kmem_cache *s) + #endif + } + ++static inline bool has_sanitize_verify(struct kmem_cache *s) ++{ ++ return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && ++ slab_want_init_on_free(s); ++} ++ + void *fixup_red_left(struct kmem_cache *s, void *p) + { + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) +@@ -494,13 +500,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) + * Debug settings: + */ + #if defined(CONFIG_SLUB_DEBUG_ON) +-static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; ++static slab_flags_t slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; + #else +-static slab_flags_t slub_debug; ++static slab_flags_t slub_debug __ro_after_init; + #endif + +-static char *slub_debug_slabs; +-static int disable_higher_order_debug; ++static char *slub_debug_slabs __ro_after_init; ++static int disable_higher_order_debug __ro_after_init; + + /* + * slub is about to manipulate internal object metadata. This memory lies +@@ -571,6 +577,33 @@ static inline unsigned int get_info_end(struct kmem_cache *s) + return s->inuse; + } + ++#ifdef CONFIG_SLAB_CANARY ++static inline unsigned long *get_canary(struct kmem_cache *s, void *object) ++{ ++ return object + get_info_end(s); ++} ++ ++static inline unsigned long get_canary_value(const void *canary, unsigned long value) ++{ ++ return (value ^ (unsigned long)canary) & CANARY_MASK; ++} ++ ++static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ *canary = get_canary_value(canary, value); ++} ++ ++static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ BUG_ON(*canary != get_canary_value(canary, value)); ++} ++#else ++#define set_canary(s, object, value) ++#define check_canary(s, object, value) ++#endif ++ + static struct track *get_track(struct kmem_cache *s, void *object, + enum track_item alloc) + { +@@ -578,6 +611,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, + + p = object + get_info_end(s); + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ p = (void *)p + sizeof(void *); ++ + return p + alloc; + } + +@@ -719,6 +755,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) + + off = get_info_end(s); + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + off += 2 * sizeof(struct track); + +@@ -827,8 +866,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, + * Meta data starts here. + * + * A. Free pointer (if we cannot overwrite object on free) +- * B. Tracking data for SLAB_STORE_USER +- * C. Padding to reach required alignment boundary or at mininum ++ * B. Canary for SLAB_CANARY ++ * C. Tracking data for SLAB_STORE_USER ++ * D. Padding to reach required alignment boundary or at mininum + * one word if debugging is on to be able to detect writes + * before the word boundary. 
+ * +@@ -846,6 +886,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) + { + unsigned long off = get_info_end(s); /* The end of info */ + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); +@@ -1491,6 +1534,8 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, + object = next; + next = get_freepointer(s, object); + ++ check_canary(s, object, s->random_active); ++ + if (slab_want_init_on_free(s)) { + /* + * Clear the object and the metadata, but don't touch +@@ -1501,8 +1546,12 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, + : 0; + memset((char *)object + s->inuse, 0, + s->size - s->inuse - rsize); +- ++ if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) ++ s->ctor(object); + } ++ ++ set_canary(s, object, s->random_inactive); ++ + /* If object's reuse doesn't have to be delayed */ + if (!slab_free_hook(s, object)) { + /* Move object to the new freelist */ +@@ -1510,6 +1559,18 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, + *head = object; + if (!*tail) + *tail = object; ++ } else if (slab_want_init_on_free(s) && s->ctor) { ++ /* Objects that are put into quarantine by KASAN will ++ * still undergo free_consistency_checks() and thus ++ * need to show a valid freepointer to check_object(). ++ * ++ * Note that doing this for all caches (not just ctor ++ * ones, which have s->offset >= object_size)) causes a ++ * GPF, due to KASAN poisoning and the way ++ * set_freepointer() eventually dereferences the ++ * freepointer. ++ */ ++ set_freepointer(s, object, NULL); + } + } while (object != old_tail); + +@@ -1523,8 +1584,9 @@ static void *setup_object(struct kmem_cache *s, struct page *page, + void *object) + { + setup_object_debug(s, page, object); ++ set_canary(s, object, s->random_inactive); + object = kasan_init_slab_obj(s, object); +- if (unlikely(s->ctor)) { ++ if (unlikely(s->ctor) && !has_sanitize_verify(s)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); +@@ -2818,8 +2880,28 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, + + maybe_wipe_obj_freeptr(s, object); + +- if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) ++ if (has_sanitize_verify(s) && object) { ++ /* KASAN hasn't unpoisoned the object yet (this is done in the ++ * post-alloc hook), so let's do it temporarily. 
++ */ ++ kasan_unpoison_object_data(s, object); ++ BUG_ON(memchr_inv(object, 0, s->object_size)); ++ if (s->ctor) ++ s->ctor(object); ++ kasan_poison_object_data(s, object); ++ } else if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object) { + memset(object, 0, s->object_size); ++ if (s->ctor) { ++ kasan_unpoison_object_data(s, object); ++ s->ctor(object); ++ kasan_poison_object_data(s, object); ++ } ++ } ++ ++ if (object) { ++ check_canary(s, object, s->random_inactive); ++ set_canary(s, object, s->random_active); ++ } + + slab_post_alloc_hook(s, gfpflags, 1, &object); + +@@ -3204,7 +3286,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) + { + struct kmem_cache_cpu *c; +- int i; ++ int i, k; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); +@@ -3253,11 +3335,35 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ +- if (unlikely(slab_want_init_on_alloc(flags, s))) { ++ if (has_sanitize_verify(s)) { ++ int j; ++ ++ for (j = 0; j < i; j++) { ++ /* KASAN hasn't unpoisoned the object yet (this is done ++ * in the post-alloc hook), so let's do it temporarily. ++ */ ++ kasan_unpoison_object_data(s, p[j]); ++ BUG_ON(memchr_inv(p[j], 0, s->object_size)); ++ if (s->ctor) ++ s->ctor(p[j]); ++ kasan_poison_object_data(s, p[j]); ++ } ++ } else if (unlikely(slab_want_init_on_alloc(flags, s))) { + int j; + +- for (j = 0; j < i; j++) ++ for (j = 0; j < i; j++) { + memset(p[j], 0, s->object_size); ++ if (s->ctor) { ++ kasan_unpoison_object_data(s, p[j]); ++ s->ctor(p[j]); ++ kasan_poison_object_data(s, p[j]); ++ } ++ } ++ } ++ ++ for (k = 0; k < i; k++) { ++ check_canary(s, p[k], s->random_inactive); ++ set_canary(s, p[k], s->random_active); + } + + /* memcg and kmem_cache debug support */ +@@ -3291,9 +3397,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); + * and increases the number of allocations possible without having to + * take the list_lock. + */ +-static unsigned int slub_min_order; +-static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +-static unsigned int slub_min_objects; ++static unsigned int slub_min_order __ro_after_init; ++static unsigned int slub_max_order __ro_after_init = PAGE_ALLOC_COSTLY_ORDER; ++static unsigned int slub_min_objects __ro_after_init; + + /* + * Calculate the order of allocation given an slab object size. 
+@@ -3461,6 +3567,7 @@ static void early_kmem_cache_node_alloc(int node) + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); + #endif ++ set_canary(kmem_cache_node, n, kmem_cache_node->random_active); + n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); + page->freelist = get_freepointer(kmem_cache_node, n); +@@ -3641,6 +3748,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) + s->offset = ALIGN(freepointer_area / 2, sizeof(void *)); + } + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ size += sizeof(void *); ++ + #ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_STORE_USER) + /* +@@ -3713,6 +3823,10 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) + #ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); + #endif ++#ifdef CONFIG_SLAB_CANARY ++ s->random_active = get_random_long(); ++ s->random_inactive = get_random_long(); ++#endif + + if (!calculate_sizes(s, -1)) + goto error; +@@ -3988,6 +4102,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, + offset -= s->red_left_pad; + } + ++ check_canary(s, (void *)ptr - offset, s->random_active); ++ + /* Allow address range falling entirely within usercopy region. */ + if (offset >= s->useroffset && + offset - s->useroffset <= s->usersize && +@@ -4021,7 +4137,11 @@ size_t __ksize(const void *object) + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) { ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(!PageCompound(page)); ++#else + WARN_ON(!PageCompound(page)); ++#endif + return page_size(page); + } + +@@ -4848,7 +4968,7 @@ enum slab_stat_type { + #define SO_TOTAL (1 << SL_TOTAL) + + #ifdef CONFIG_MEMCG +-static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); ++static bool memcg_sysfs_enabled __ro_after_init = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); + + static int __init setup_slub_memcg_sysfs(char *str) + { +diff --git a/mm/swap.c b/mm/swap.c +index bf9a79fed62d..3375d4cf4ee8 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -94,6 +94,13 @@ static void __put_compound_page(struct page *page) + if (!PageHuge(page)) + __page_cache_release(page); + dtor = get_compound_page_dtor(page); ++ if (!PageHuge(page)) ++ BUG_ON(dtor != free_compound_page ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ && dtor != free_transhuge_page ++#endif ++ ); ++ + (*dtor)(page); + } + +diff --git a/mm/util.c b/mm/util.c +index dc1c877d5481..4872ec1b8858 100644 +--- a/mm/util.c ++++ b/mm/util.c +@@ -335,9 +335,9 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) + { + /* Is the current task 32bit ? 
*/ + if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task()) +- return randomize_page(mm->brk, SZ_32M); ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; + +- return randomize_page(mm->brk, SZ_1G); ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + unsigned long arch_mmap_rnd(void) +diff --git a/net/core/dev.c b/net/core/dev.c +index c9ee5d80d5ea..9904a4aefa8b 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4750,7 +4750,7 @@ int netif_rx_ni(struct sk_buff *skb) + } + EXPORT_SYMBOL(netif_rx_ni); + +-static __latent_entropy void net_tx_action(struct softirq_action *h) ++static __latent_entropy void net_tx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + +@@ -6622,7 +6622,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) + return work; + } + +-static __latent_entropy void net_rx_action(struct softirq_action *h) ++static __latent_entropy void net_rx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 25a8888826b8..7343a827e166 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -267,6 +267,7 @@ config IP_PIMSM_V2 + + config SYN_COOKIES + bool "IP: TCP syncookie support" ++ default y + ---help--- + Normal TCP/IP networking is open to an attack known as "SYN + flooding". This denial-of-service attack prevents legitimate remote +@@ -739,3 +740,26 @@ config TCP_MD5SIG + on the Internet. + + If unsure, say N. ++ ++config TCP_SIMULT_CONNECT_DEFAULT_ON ++ bool "Enable TCP simultaneous connect" ++ help ++ Enable TCP simultaneous connect that adds a weakness in Linux's strict ++ implementation of TCP that allows two clients to connect to each other ++ without either entering a listening state. The weakness allows an ++ attacker to easily prevent a client from connecting to a known server ++ provided the source port for the connection is guessed correctly. ++ ++ As the weakness could be used to prevent an antivirus or IPS from ++ fetching updates, or prevent an SSL gateway from fetching a CRL, it ++ should be eliminated by disabling this option. Though Linux is one of ++ few operating systems supporting simultaneous connect, it has no ++ legitimate use in practice and is rarely supported by firewalls. ++ ++ Disabling this may break TCP STUNT which is used by some applications ++ for NAT traversal. ++ ++ This setting can be overridden at runtime via the ++ net.ipv4.tcp_simult_connect sysctl. ++ ++ If unsure, say N. +diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c +index 81b267e990a1..587dbfdbcf1a 100644 +--- a/net/ipv4/sysctl_net_ipv4.c ++++ b/net/ipv4/sysctl_net_ipv4.c +@@ -604,6 +604,15 @@ static struct ctl_table ipv4_table[] = { + .mode = 0644, + .proc_handler = proc_do_static_key, + }, ++ { ++ .procname = "tcp_simult_connect", ++ .data = &sysctl_tcp_simult_connect, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, + { } + }; + +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 1fa009999f57..43aa2340feb2 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -82,6 +82,7 @@ + #include + + int sysctl_tcp_max_orphans __read_mostly = NR_FILE; ++int sysctl_tcp_simult_connect __read_mostly = IS_ENABLED(CONFIG_TCP_SIMULT_CONNECT_DEFAULT_ON); + + #define FLAG_DATA 0x01 /* Incoming frame contained data. */ + #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. 
*/ +@@ -6064,7 +6065,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + tcp_paws_reject(&tp->rx_opt, 0)) + goto discard_and_undo; + +- if (th->syn) { ++ if (th->syn && sysctl_tcp_simult_connect) { + /* We see SYN without ACK. It is attempt of + * simultaneous connect with crossed SYNs. + * Particularly, it can be connect to self. +diff --git a/scripts/Makefile.modpost b/scripts/Makefile.modpost +index 33aaa572f686..447648fc48f4 100644 +--- a/scripts/Makefile.modpost ++++ b/scripts/Makefile.modpost +@@ -53,6 +53,7 @@ MODPOST = scripts/mod/modpost \ + $(if $(KBUILD_EXTMOD),$(addprefix -e ,$(KBUILD_EXTRA_SYMBOLS))) \ + $(if $(KBUILD_EXTMOD),-o $(modulesymfile)) \ + $(if $(CONFIG_SECTION_MISMATCH_WARN_ONLY),,-E) \ ++ $(if $(CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE),-f) \ + $(if $(CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS)$(KBUILD_NSDEPS),-N) \ + $(if $(KBUILD_MODPOST_WARN),-w) + +diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig +index 013ba3a57669..31ce967a1959 100644 +--- a/scripts/gcc-plugins/Kconfig ++++ b/scripts/gcc-plugins/Kconfig +@@ -53,6 +53,11 @@ config GCC_PLUGIN_LATENT_ENTROPY + is some slowdown of the boot process (about 0.5%) and fork and + irq processing. + ++ When extra_latent_entropy is passed on the kernel command line, ++ entropy will be extracted from up to the first 4GB of RAM while the ++ runtime memory allocator is being initialized. This costs even more ++ slowdown of the boot process. ++ + Note that entropy extracted this way is not cryptographically + secure! + +diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c +index 5c3c50c5ec52..b539cd7159be 100644 +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -37,6 +37,8 @@ static int warn_unresolved = 0; + /* How a symbol is exported */ + static int sec_mismatch_count = 0; + static int sec_mismatch_fatal = 0; ++static int writable_fptr_count = 0; ++static int writable_fptr_verbose = 0; + /* ignore missing files */ + static int ignore_missing_files; + /* If set to 1, only warn (instead of error) about missing ns imports */ +@@ -1007,6 +1009,7 @@ enum mismatch { + ANY_EXIT_TO_ANY_INIT, + EXPORT_TO_INIT_EXIT, + EXTABLE_TO_NON_TEXT, ++ DATA_TO_TEXT + }; + + /** +@@ -1133,6 +1136,12 @@ static const struct sectioncheck sectioncheck[] = { + .good_tosec = {ALL_TEXT_SECTIONS , NULL}, + .mismatch = EXTABLE_TO_NON_TEXT, + .handler = extable_mismatch_handler, ++}, ++/* Do not reference code from writable data */ ++{ ++ .fromsec = { DATA_SECTIONS, NULL }, ++ .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, ++ .mismatch = DATA_TO_TEXT + } + }; + +@@ -1320,10 +1329,10 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr, + continue; + if (!is_valid_name(elf, sym)) + continue; +- if (sym->st_value == addr) +- return sym; + /* Find a symbol nearby - addr are maybe negative */ + d = sym->st_value - addr; ++ if (d == 0) ++ return sym; + if (d < 0) + d = addr - sym->st_value; + if (d < distance) { +@@ -1458,7 +1467,13 @@ static void report_sec_mismatch(const char *modname, + char *prl_from; + char *prl_to; + +- sec_mismatch_count++; ++ if (mismatch->mismatch == DATA_TO_TEXT) { ++ writable_fptr_count++; ++ if (!writable_fptr_verbose) ++ return; ++ } else { ++ sec_mismatch_count++; ++ } + + get_pretty_name(from_is_func, &from, &from_p); + get_pretty_name(to_is_func, &to, &to_p); +@@ -1580,6 +1595,12 @@ static void report_sec_mismatch(const char *modname, + fatal("There's a special handler for this mismatch type, " + "we should never get here."); 
+ break; ++ case DATA_TO_TEXT: ++ fprintf(stderr, ++ "The %s %s:%s references\n" ++ "the %s %s:%s%s\n", ++ from, fromsec, fromsym, to, tosec, tosym, to_p); ++ break; + } + fprintf(stderr, "\n"); + } +@@ -2559,7 +2580,7 @@ int main(int argc, char **argv) + struct ext_sym_list *extsym_iter; + struct ext_sym_list *extsym_start = NULL; + +- while ((opt = getopt(argc, argv, "i:e:mnsT:o:awENd:")) != -1) { ++ while ((opt = getopt(argc, argv, "i:e:fmnsT:o:awENd:")) != -1) { + switch (opt) { + case 'i': + kernel_read = optarg; +@@ -2573,6 +2594,9 @@ int main(int argc, char **argv) + extsym_iter->file = optarg; + extsym_start = extsym_iter; + break; ++ case 'f': ++ writable_fptr_verbose = 1; ++ break; + case 'm': + modversions = 1; + break; +@@ -2676,6 +2700,11 @@ int main(int argc, char **argv) + } + + free(buf.p); ++ if (writable_fptr_count && !writable_fptr_verbose) ++ warn("modpost: Found %d writable function pointer%s.\n" ++ "To see full details build your kernel with:\n" ++ "'make CONFIG_DEBUG_WRITABLE_FUNCTION_POINTERS_VERBOSE=y'\n", ++ writable_fptr_count, (writable_fptr_count == 1 ? "" : "s")); + + return err; + } +diff --git a/security/Kconfig b/security/Kconfig +index cd3cc7da3a55..127b54aecf87 100644 +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -9,7 +9,7 @@ source "security/keys/Kconfig" + + config SECURITY_DMESG_RESTRICT + bool "Restrict unprivileged access to the kernel syslog" +- default n ++ default y + help + This enforces restrictions on unprivileged users reading the kernel + syslog via dmesg(8). +@@ -19,10 +19,34 @@ config SECURITY_DMESG_RESTRICT + + If you are unsure how to answer this question, answer N. + ++config SECURITY_PERF_EVENTS_RESTRICT ++ bool "Restrict unprivileged use of performance events" ++ depends on PERF_EVENTS ++ default y ++ help ++ If you say Y here, the kernel.perf_event_paranoid sysctl ++ will be set to 3 by default, and no unprivileged use of the ++ perf_event_open syscall will be permitted unless it is ++ changed. ++ ++config SECURITY_TIOCSTI_RESTRICT ++ bool "Restrict unprivileged use of tiocsti command injection" ++ default y ++ help ++ This enforces restrictions on unprivileged users injecting commands ++ into other processes which share a tty session using the TIOCSTI ++ ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN. ++ ++ If this option is not selected, no restrictions will be enforced ++ unless the tiocsti_restrict sysctl is explicitly set to (1). ++ ++ If you are unsure how to answer this question, answer N. ++ + config SECURITY + bool "Enable different security models" + depends on SYSFS + depends on MULTIUSER ++ default y + help + This allows you to choose different security modules to be + configured into your kernel. +@@ -48,6 +72,7 @@ config SECURITYFS + config SECURITY_NETWORK + bool "Socket and Networking Security Hooks" + depends on SECURITY ++ default y + help + This enables the socket and networking security hooks. 
+ If enabled, a security module can use these hooks to +@@ -154,6 +179,7 @@ config HARDENED_USERCOPY + bool "Harden memory copies between kernel and userspace" + depends on HAVE_HARDENED_USERCOPY_ALLOCATOR + imply STRICT_DEVMEM ++ default y + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and +@@ -166,7 +192,6 @@ config HARDENED_USERCOPY + config HARDENED_USERCOPY_FALLBACK + bool "Allow usercopy whitelist violations to fallback to object size" + depends on HARDENED_USERCOPY +- default y + help + This is a temporary option that allows missing usercopy whitelists + to be discovered via a WARN() to the kernel log, instead of +@@ -191,10 +216,21 @@ config HARDENED_USERCOPY_PAGESPAN + config FORTIFY_SOURCE + bool "Harden common str/mem functions against buffer overflows" + depends on ARCH_HAS_FORTIFY_SOURCE ++ default y + help + Detect overflows of buffers in common string and memory functions + where the compiler can determine and validate the buffer sizes. + ++config FORTIFY_SOURCE_STRICT_STRING ++ bool "Harden common functions against buffer overflows" ++ depends on FORTIFY_SOURCE ++ depends on EXPERT ++ help ++ Perform stricter overflow checks catching overflows within objects ++ for common C string functions rather than only between objects. ++ ++ This is not yet intended for production use, only bug finding. ++ + config STATIC_USERMODEHELPER + bool "Force all usermode helper calls through a single binary" + help +diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening +index af4c979b38ee..001796a391e9 100644 +--- a/security/Kconfig.hardening ++++ b/security/Kconfig.hardening +@@ -169,6 +169,7 @@ config STACKLEAK_RUNTIME_DISABLE + + config INIT_ON_ALLOC_DEFAULT_ON + bool "Enable heap memory zeroing on allocation by default" ++ default yes + help + This has the effect of setting "init_on_alloc=1" on the kernel + command line. This can be disabled with "init_on_alloc=0". +@@ -181,6 +182,7 @@ config INIT_ON_ALLOC_DEFAULT_ON + + config INIT_ON_FREE_DEFAULT_ON + bool "Enable heap memory zeroing on free by default" ++ default yes + help + This has the effect of setting "init_on_free=1" on the kernel + command line. This can be disabled with "init_on_free=0". +@@ -196,6 +198,21 @@ config INIT_ON_FREE_DEFAULT_ON + touching "cold" memory areas. Most cases see 3-5% impact. Some + synthetic workloads have measured as high as 8%. + ++config PAGE_SANITIZE_VERIFY ++ bool "Verify sanitized pages" ++ default y ++ help ++ When init_on_free is enabled, verify that newly allocated pages ++ are zeroed to detect write-after-free bugs. ++ ++config SLAB_SANITIZE_VERIFY ++ default y ++ bool "Verify sanitized SLAB allocations" ++ depends on !KASAN ++ help ++ When init_on_free is enabled, verify that newly allocated slab ++ objects are zeroed to detect write-after-free bugs. ++ + endmenu + + endmenu +diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig +index 9e921fc72538..ae851a826c26 100644 +--- a/security/selinux/Kconfig ++++ b/security/selinux/Kconfig +@@ -3,7 +3,7 @@ config SECURITY_SELINUX + bool "NSA SELinux Support" + depends on SECURITY_NETWORK && AUDIT && NET && INET + select NETWORK_SECMARK +- default n ++ default y + help + This selects NSA Security-Enhanced Linux (SELinux). + You will also need a policy configuration and a labeled filesystem. +@@ -70,29 +70,6 @@ config SECURITY_SELINUX_AVC_STATS + /sys/fs/selinux/avc/cache_stats, which may be monitored via + tools such as avcstat. 
+ +-config SECURITY_SELINUX_CHECKREQPROT_VALUE +- int "NSA SELinux checkreqprot default value" +- depends on SECURITY_SELINUX +- range 0 1 +- default 0 +- help +- This option sets the default value for the 'checkreqprot' flag +- that determines whether SELinux checks the protection requested +- by the application or the protection that will be applied by the +- kernel (including any implied execute for read-implies-exec) for +- mmap and mprotect calls. If this option is set to 0 (zero), +- SELinux will default to checking the protection that will be applied +- by the kernel. If this option is set to 1 (one), SELinux will +- default to checking the protection requested by the application. +- The checkreqprot flag may be changed from the default via the +- 'checkreqprot=' boot parameter. It may also be changed at runtime +- via /sys/fs/selinux/checkreqprot if authorized by policy. +- +- WARNING: this option is deprecated and will be removed in a future +- kernel release. +- +- If you are unsure how to answer this question, answer 0. +- + config SECURITY_SELINUX_SIDTAB_HASH_BITS + int "NSA SELinux sidtab hashtable size" + depends on SECURITY_SELINUX +diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c +index 4c037c2545c1..2437a1895baa 100644 +--- a/security/selinux/hooks.c ++++ b/security/selinux/hooks.c +@@ -135,21 +135,7 @@ static int __init selinux_enabled_setup(char *str) + __setup("selinux=", selinux_enabled_setup); + #endif + +-static unsigned int selinux_checkreqprot_boot = +- CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; +- +-static int __init checkreqprot_setup(char *str) +-{ +- unsigned long checkreqprot; +- +- if (!kstrtoul(str, 0, &checkreqprot)) { +- selinux_checkreqprot_boot = checkreqprot ? 1 : 0; +- if (checkreqprot) +- pr_warn("SELinux: checkreqprot set to 1 via kernel parameter. This is deprecated and will be rejected in a future kernel release.\n"); +- } +- return 1; +-} +-__setup("checkreqprot=", checkreqprot_setup); ++static const unsigned int selinux_checkreqprot_boot; + + /** + * selinux_secmark_enabled - Check to see if SECMARK is currently enabled +diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c +index 4781314c2510..7f068515d799 100644 +--- a/security/selinux/selinuxfs.c ++++ b/security/selinux/selinuxfs.c +@@ -641,7 +641,6 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, + static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { +- struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; + char *page; + ssize_t length; + unsigned int new_value; +@@ -665,18 +664,9 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + return PTR_ERR(page); + + length = -EINVAL; +- if (sscanf(page, "%u", &new_value) != 1) ++ if (sscanf(page, "%u", &new_value) != 1 || new_value) + goto out; + +- if (new_value) { +- char comm[sizeof(current->comm)]; +- +- memcpy(comm, current->comm, sizeof(comm)); +- pr_warn_once("SELinux: %s (%d) set checkreqprot to 1. This is deprecated and will be rejected in a future kernel release.\n", +- comm, current->pid); +- } +- +- fsi->state->checkreqprot = new_value ? 
1 : 0; + length = count; + out: + kfree(page); +diff --git a/security/yama/Kconfig b/security/yama/Kconfig +index a810304123ca..b809050b25d2 100644 +--- a/security/yama/Kconfig ++++ b/security/yama/Kconfig +@@ -2,7 +2,7 @@ + config SECURITY_YAMA + bool "Yama support" + depends on SECURITY +- default n ++ default y + help + This selects Yama, which extends DAC support with additional + system-wide security settings beyond regular Linux discretionary diff --git a/linux-tkg/linux-tkg-patches/5.7/0012-misc-additions.patch b/linux-tkg/linux-tkg-patches/5.7/0012-misc-additions.patch new file mode 100644 index 0000000..33f5502 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.7/0012-misc-additions.patch @@ -0,0 +1,55 @@ +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index 0840d27381ea..73aba9a31064 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP + def_bool y + depends on VT_CONSOLE && PM_SLEEP + ++config NR_TTY_DEVICES ++ int "Maximum tty device number" ++ depends on VT ++ range 12 63 ++ default 63 ++ ---help--- ++ This option is used to change the number of tty devices in /dev. ++ The default value is 63. The lowest number you can set is 12, ++ 63 is also the upper limit so we don't overrun the serial ++ consoles. ++ ++ If unsure, say 63. ++ + config HW_CONSOLE + bool + depends on VT && !UML +diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h +index e9d39c48520a..3bceead8da40 100644 +--- a/include/uapi/linux/vt.h ++++ b/include/uapi/linux/vt.h +@@ -3,12 +3,25 @@ + #define _UAPI_LINUX_VT_H + + ++/* ++ * We will make this definition solely for the purpose of making packages ++ * such as splashutils build, because they can not understand that ++ * NR_TTY_DEVICES is defined in the kernel configuration. ++ */ ++#ifndef CONFIG_NR_TTY_DEVICES ++#define CONFIG_NR_TTY_DEVICES 63 ++#endif ++ + /* + * These constants are also useful for user-level apps (e.g., VC + * resizing). 
+ */ + #define MIN_NR_CONSOLES 1 /* must be at least 1 */ +-#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ ++/* ++ * NR_TTY_DEVICES: ++ * Value MUST be at least 12 and must never be higher then 63 ++ */ ++#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ + /* Note: the ioctl VT_GETSTATE does not work for + consoles 16 and higher (since it returns a short) */ + diff --git a/linux-tkg/linux-tkg-patches/5.8/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch b/linux-tkg/linux-tkg-patches/5.8/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch new file mode 100644 index 0000000..83240cb --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch @@ -0,0 +1,156 @@ +From 5ec2dd3a095442ec1a21d86042a4994f2ba24e63 Mon Sep 17 00:00:00 2001 +Message-Id: <5ec2dd3a095442ec1a21d86042a4994f2ba24e63.1512651251.git.jan.steffens@gmail.com> +From: Serge Hallyn +Date: Fri, 31 May 2013 19:12:12 +0100 +Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default + +Signed-off-by: Serge Hallyn +[bwh: Remove unneeded binary sysctl bits] +Signed-off-by: Daniel Micay +--- + kernel/fork.c | 15 +++++++++++++++ + kernel/sysctl.c | 12 ++++++++++++ + kernel/user_namespace.c | 3 +++ + 3 files changed, 30 insertions(+) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 07cc743698d3668e..4011d68a8ff9305c 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -102,6 +102,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
+@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b86520ed3fb60fbf..f7dab3760839f1a1 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -105,6 +105,9 @@ extern int core_uses_pid; + + #if defined(CONFIG_SYSCTL) + ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + /* Constants used for minimum and maximum */ + #ifdef CONFIG_LOCKUP_DETECTOR + static int sixty = 60; +@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index c490f1e4313b998a..dd03bd39d7bf194d 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -24,6 +24,9 @@ + #include + #include + ++/* sysctl */ ++int unprivileged_userns_clone; ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +-- +2.15.1 + +From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Thu, 7 Dec 2017 13:50:48 +0100 +Subject: ZEN: Add CONFIG for unprivileged_userns_clone + +This way our default behavior continues to match the vanilla kernel. +--- + init/Kconfig | 16 ++++++++++++++++ + kernel/user_namespace.c | 4 ++++ + 2 files changed, 20 insertions(+) + +diff --git a/init/Kconfig b/init/Kconfig +index 4592bf7997c0..f3df02990aff 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1004,6 +1004,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. 
++ + config PID_NS + bool "PID Namespaces" + default y +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 6b9dbc257e34..107b17f0d528 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -27,7 +27,11 @@ + #include + + /* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else + int unprivileged_userns_clone; ++#endif + + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux-tkg/linux-tkg-patches/5.8/0002-clear-patches.patch b/linux-tkg/linux-tkg-patches/5.8/0002-clear-patches.patch new file mode 100644 index 0000000..22a32f5 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0002-clear-patches.patch @@ -0,0 +1,360 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 14 Mar 2016 11:10:58 -0600 +Subject: [PATCH] pci pme wakeups + +Reduce wakeups for PME checks, which are a workaround for miswired +boards (sadly, too many of them) in laptops. +--- + drivers/pci/pci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index c9338f9..6974fbf 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -62,7 +62,7 @@ struct pci_pme_device { + struct pci_dev *dev; + }; + +-#define PME_TIMEOUT 1000 /* How long between PME checks */ ++#define PME_TIMEOUT 4000 /* How long between PME checks */ + + static void pci_dev_d3_sleep(struct pci_dev *dev) + { +-- +https://clearlinux.org + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sat, 19 Mar 2016 21:32:19 -0400 +Subject: [PATCH] intel_idle: tweak cpuidle cstates + +Increase target_residency in cpuidle cstate + +Tune intel_idle to be a bit less agressive; +Clear linux is cleaner in hygiene (wakupes) than the average linux, +so we can afford changing these in a way that increases +performance while keeping power efficiency +--- + drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- + 1 file changed, 22 insertions(+), 22 deletions(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index f449584..c994d24 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -531,7 +531,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -539,7 +539,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, +- .target_residency = 100, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -547,7 +547,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -555,7 +555,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 1500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -563,7 +563,7 @@ static struct 
cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -571,7 +571,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -579,7 +579,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -599,7 +599,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -607,7 +607,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -615,7 +615,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -623,7 +623,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -631,7 +631,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -639,7 +639,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 7000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -647,7 +647,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -668,7 +668,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -676,7 +676,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 70, +- .target_residency = 100, ++ .target_residency = 
1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -684,7 +684,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 85, +- .target_residency = 200, ++ .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -692,7 +692,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x33", + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 124, +- .target_residency = 800, ++ .target_residency = 3000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -700,7 +700,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, +- .target_residency = 800, ++ .target_residency = 3200, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -708,7 +708,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 480, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -716,7 +716,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 890, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -737,7 +737,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +-- +https://clearlinux.org + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Fri, 6 Jan 2017 15:34:09 +0000 +Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little + bigger than default + +--- + net/ipv4/tcp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 30c1142..4345075 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -4201,8 +4201,8 @@ void __init tcp_init(void) + tcp_init_mem(); + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +- max_wshare = min(4UL*1024*1024, limit); +- max_rshare = min(6UL*1024*1024, limit); ++ max_wshare = min(16UL*1024*1024, limit); ++ max_rshare = min(16UL*1024*1024, limit); + + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +-- +https://clearlinux.org + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sun, 18 Feb 2018 23:35:41 +0000 +Subject: [PATCH] locking: rwsem: spin faster + +tweak rwsem owner spinning a bit +--- + kernel/locking/rwsem.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index f11b9bd..1bbfcc1 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -717,6 +717,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + owner = 
rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags, nonspinnable); +@@ -750,7 +751,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + rcu_read_unlock(); + +-- +https://clearlinux.org + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: [PATCH] initialize ata before graphics + +ATA init is the long pole in the boot process, and its asynchronous. +move the graphics init after it so that ata and graphics initialize +in parallel +--- + drivers/Makefile | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index c0cd1b9..af1e2fb 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -59,15 +59,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb and intelfb depend on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-$(CONFIG_NVM) += lightnvm/ + obj-y += base/ block/ misc/ mfd/ nfc/ +@@ -80,6 +73,14 @@ obj-$(CONFIG_IDE) += ide/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb and intelfb depend on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ +-- +https://clearlinux.org + diff --git a/linux-tkg/linux-tkg-patches/5.8/0003-glitched-base.patch b/linux-tkg/linux-tkg-patches/5.8/0003-glitched-base.patch new file mode 100644 index 0000000..fb09b35 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0003-glitched-base.patch @@ -0,0 +1,708 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: [PATCH 01/17] glitched + +--- + scripts/mkcompile_h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h +index baf3ab8d9d49..854e32e6aec7 100755 +--- a/scripts/mkcompile_h ++++ b/scripts/mkcompile_h +@@ -41,8 +41,8 @@ else + fi + + UTS_VERSION="#$VERSION" +-CONFIG_FLAGS="" +-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi ++CONFIG_FLAGS="TKG" ++if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi + if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi + if [ -n "$PREEMPT_RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT_RT"; fi + +-- +2.28.0 + + +From c304f43d14e98d4bf1215fc10bc5012f554bdd8a Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 29 Jan 2018 16:59:22 +0000 +Subject: [PATCH 02/17] dcache: cache_pressure = 50 decreases the rate at which + VFS caches are reclaimed + +Signed-off-by: Alexandre Frade +--- + fs/dcache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/dcache.c b/fs/dcache.c +index 361ea7ab30ea..0c5cf69b241a 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -71,7 +71,7 @@ + * If no ancestor relationship: + * arbitrary, since it's serialized on rename_lock + */ +-int sysctl_vfs_cache_pressure __read_mostly = 100; ++int sysctl_vfs_cache_pressure __read_mostly = 50; + 
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); +-- +2.28.0 + + +From 28f32f59d9d55ac7ec3a20b79bdd02d2a0a5f7e1 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 29 Jan 2018 18:29:13 +0000 +Subject: [PATCH 03/17] sched/core: nr_migrate = 128 increases number of tasks + to iterate in a single balance run. + +Signed-off-by: Alexandre Frade +--- + kernel/sched/core.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index f788cd61df21..2bfbb4213707 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -59,7 +59,7 @@ const_debug unsigned int sysctl_sched_features = + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +-const_debug unsigned int sysctl_sched_nr_migrate = 32; ++const_debug unsigned int sysctl_sched_nr_migrate = 128; + + /* + * period over which we measure -rt task CPU usage in us. +@@ -71,9 +71,9 @@ __read_mostly int scheduler_running; + + /* + * part of the period that we allow rt tasks to run in us. +- * default: 0.95s ++ * XanMod default: 0.98s + */ +-int sysctl_sched_rt_runtime = 950000; ++int sysctl_sched_rt_runtime = 980000; + + /* + * __task_rq_lock - lock the rq @p resides on. +-- +2.28.0 + + +From acc49f33a10f61dc66c423888cbb883ba46710e4 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 29 Jan 2018 17:41:29 +0000 +Subject: [PATCH 04/17] scripts: disable the localversion "+" tag of a git repo + +Signed-off-by: Alexandre Frade +--- + scripts/setlocalversion | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/scripts/setlocalversion b/scripts/setlocalversion +index 20f2efd57b11..0552d8b9f582 100755 +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -54,7 +54,7 @@ scm_version() + # If only the short version is requested, don't bother + # running further git commands + if $short; then +- echo "+" ++ # echo "+" + return + fi + # If we are past a tagged commit (like +-- +2.28.0 + + +From 61fcb33fb0de8bc0f060e0a1ada38ed149217f4d Mon Sep 17 00:00:00 2001 +From: Oleksandr Natalenko +Date: Wed, 11 Dec 2019 11:46:19 +0100 +Subject: [PATCH 05/17] init/Kconfig: enable -O3 for all arches + +Building a kernel with -O3 may help in hunting bugs like [1] and thus +using this switch should not be restricted to one specific arch only. + +With that, lets expose it for everyone. + +[1] https://lore.kernel.org/lkml/673b885183fb64f1cbb3ed2387524077@natalenko.name/ + +Signed-off-by: Oleksandr Natalenko +--- + init/Kconfig | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/init/Kconfig b/init/Kconfig +index 0498af567f70..3ae8678e1145 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1278,7 +1278,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + + config CC_OPTIMIZE_FOR_PERFORMANCE_O3 + bool "Optimize more for performance (-O3)" +- depends on ARC + help + Choosing this option will pass "-O3" to your compiler to optimize + the kernel yet more for performance. 
+-- +2.28.0 + + +From 360c6833e07cc9fdef5746f6bc45bdbc7212288d Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Fri, 26 Oct 2018 11:22:33 +0100 +Subject: [PATCH 06/17] infiniband: Fix __read_overflow2 error with -O3 + inlining + +--- + drivers/infiniband/core/addr.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c +index 3a98439bba83..6efc4f907f58 100644 +--- a/drivers/infiniband/core/addr.c ++++ b/drivers/infiniband/core/addr.c +@@ -820,6 +820,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + union { + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; ++ struct sockaddr_ib _sockaddr_ib; + } sgid_addr, dgid_addr; + int ret; + +-- +2.28.0 + + +From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 +From: Etienne Juvigny +Date: Mon, 3 Sep 2018 17:36:25 +0200 +Subject: [PATCH 07/17] Zenify & stuff + +--- + init/Kconfig | 32 ++++++++++++++++++++++++++++++++ + kernel/sched/fair.c | 25 +++++++++++++++++++++++++ + mm/page-writeback.c | 8 ++++++++ + 3 files changed, 65 insertions(+) + +diff --git a/init/Kconfig b/init/Kconfig +index 3ae8678e1145..da708eed0f1e 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -92,6 +92,38 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config ZENIFY ++ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience" ++ default y ++ help ++ Tunes the kernel for responsiveness at the cost of throughput and power usage. ++ ++ --- Virtual Memory Subsystem --------------------------- ++ ++ Mem dirty before bg writeback..: 10 % -> 20 % ++ Mem dirty before sync writeback: 20 % -> 50 % ++ ++ --- Block Layer ---------------------------------------- ++ ++ Queue depth...............: 128 -> 512 ++ Default MQ scheduler......: mq-deadline -> bfq ++ ++ --- CFS CPU Scheduler ---------------------------------- ++ ++ Scheduling latency.............: 6 -> 3 ms ++ Minimal granularity............: 0.75 -> 0.3 ms ++ Wakeup granularity.............: 1 -> 0.5 ms ++ CPU migration cost.............: 0.5 -> 0.25 ms ++ Bandwidth slice size...........: 5 -> 3 ms ++ Ondemand fine upscaling limit..: 95 % -> 85 % ++ ++ --- MuQSS CPU Scheduler -------------------------------- ++ ++ Scheduling interval............: 6 -> 3 ms ++ ISO task max realtime use......: 70 % -> 25 % ++ Ondemand coarse upscaling limit: 80 % -> 45 % ++ Ondemand fine upscaling limit..: 95 % -> 45 % ++ + config BROKEN + bool + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6b3b59cc51d6..2a0072192c3d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -37,8 +37,13 @@ + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_latency = 3000000ULL; ++static unsigned int normalized_sysctl_sched_latency = 3000000ULL; ++#else + unsigned int sysctl_sched_latency = 6000000ULL; + static unsigned int normalized_sysctl_sched_latency = 6000000ULL; ++#endif + + /* + * The initial- and re-scaling of tunables is configurable +@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_min_granularity = 300000ULL; ++static unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; ++#else + unsigned int sysctl_sched_min_granularity = 750000ULL; + static unsigned int 
normalized_sysctl_sched_min_granularity = 750000ULL; ++#endif + + /* + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity + */ ++#ifdef CONFIG_ZENIFY ++static unsigned int sched_nr_latency = 10; ++#else + static unsigned int sched_nr_latency = 8; ++#endif + + /* + * After fork, child runs first. If set to 0 (default) then +@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_wakeup_granularity = 500000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; ++ ++const_debug unsigned int sysctl_sched_migration_cost = 50000UL; ++#else + unsigned int sysctl_sched_wakeup_granularity = 1000000UL; + static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + int sched_thermal_decay_shift; + static int __init setup_sched_thermal_decay_shift(char *str) +@@ -128,8 +149,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + static inline void update_load_add(struct load_weight *lw, unsigned long inc) + { +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 28b3e7a67565..01a1aef2b9b1 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int dirty_background_ratio = 20; ++#else + int dirty_background_ratio = 10; ++#endif + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; + /* + * The generator of dirty data starts writeback at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int vm_dirty_ratio = 50; ++#else + int vm_dirty_ratio = 20; ++#endif + + /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of +-- +2.28.0 + + +From e92e67143385cf285851e12aa8b7f083dd38dd24 Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Sun, 16 Jan 2011 18:57:32 -0600 +Subject: [PATCH 08/17] ZEN: Allow TCP YeAH as default congestion control + +4.4: In my tests YeAH dramatically slowed down transfers over a WLAN, + reducing throughput from ~65Mbps (CUBIC) to ~7MBps (YeAH) over 10 + seconds (netperf TCP_STREAM) including long stalls. + + Be careful when choosing this. 
~heftig +--- + net/ipv4/Kconfig | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index e64e59b536d3..bfb55ef7ebbe 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -691,6 +691,9 @@ choice + config DEFAULT_VEGAS + bool "Vegas" if TCP_CONG_VEGAS=y + ++ config DEFAULT_YEAH ++ bool "YeAH" if TCP_CONG_YEAH=y ++ + config DEFAULT_VENO + bool "Veno" if TCP_CONG_VENO=y + +@@ -724,6 +727,7 @@ config DEFAULT_TCP_CONG + default "htcp" if DEFAULT_HTCP + default "hybla" if DEFAULT_HYBLA + default "vegas" if DEFAULT_VEGAS ++ default "yeah" if DEFAULT_YEAH + default "westwood" if DEFAULT_WESTWOOD + default "veno" if DEFAULT_VENO + default "reno" if DEFAULT_RENO +-- +2.28.0 + + +From 76dbe7477bfde1b5e8bf29a71b5af7ab2be9b98e Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Wed, 28 Nov 2018 19:01:27 -0600 +Subject: [PATCH 09/17] zen: Use [defer+madvise] as default khugepaged defrag + strategy + +For some reason, the default strategy to respond to THP fault fallbacks +is still just madvise, meaning stall if the program wants transparent +hugepages, but don't trigger a background reclaim / compaction if THP +begins to fail allocations. This creates a snowball affect where we +still use the THP code paths, but we almost always fail once a system +has been active and busy for a while. + +The option "defer" was created for interactive systems where THP can +still improve performance. If we have to fallback to a regular page due +to an allocation failure or anything else, we will trigger a background +reclaim and compaction so future THP attempts succeed and previous +attempts eventually have their smaller pages combined without stalling +running applications. + +We still want madvise to stall applications that explicitely want THP, +so defer+madvise _does_ make a ton of sense. Make it the default for +interactive systems, especially if the kernel maintainer left +transparent hugepages on "always". + +Reasoning and details in the original patch: https://lwn.net/Articles/711248/ +--- + mm/huge_memory.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 74300e337c3c..9277f22c10a7 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1< +Date: Wed, 24 Oct 2018 16:58:52 -0300 +Subject: [PATCH 10/17] net/sched: allow configuring cake qdisc as default + +Signed-off-by: Alexandre Frade +--- + net/sched/Kconfig | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/net/sched/Kconfig b/net/sched/Kconfig +index 84badf00647e..6a922bca9f39 100644 +--- a/net/sched/Kconfig ++++ b/net/sched/Kconfig +@@ -471,6 +471,9 @@ choice + config DEFAULT_SFQ + bool "Stochastic Fair Queue" if NET_SCH_SFQ + ++ config DEFAULT_CAKE ++ bool "Common Applications Kept Enhanced" if NET_SCH_CAKE ++ + config DEFAULT_PFIFO_FAST + bool "Priority FIFO Fast" + endchoice +@@ -481,6 +484,7 @@ config DEFAULT_NET_SCH + default "fq" if DEFAULT_FQ + default "fq_codel" if DEFAULT_FQ_CODEL + default "sfq" if DEFAULT_SFQ ++ default "cake" if DEFAULT_CAKE + default "pfifo_fast" + endif + +-- +2.28.0 + + +From 816ee502759e954304693813bd03d94986b28dba Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 18 Feb 2019 17:40:57 +0100 +Subject: [PATCH 11/17] mm: Set watermark_scale_factor to 200 (from 10) + +Multiple users have reported it's helping reducing/eliminating stuttering +with DXVK. 
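For a quick runtime test of the same value on a stock kernel, the tunable is
also exposed as the vm.watermark_scale_factor sysctl (units are fractions of
10000, so 200 keeps a free-memory gap of roughly 2% per zone instead of the
default 0.1%), e.g.:

    sysctl -w vm.watermark_scale_factor=200   # takes effect immediately; not persistent across reboots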
+--- + mm/page_alloc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 898ff44f2c7b..e72074034793 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -330,7 +330,7 @@ int watermark_boost_factor __read_mostly; + #else + int watermark_boost_factor __read_mostly = 15000; + #endif +-int watermark_scale_factor = 10; ++int watermark_scale_factor = 200; + + static unsigned long nr_kernel_pages __initdata; + static unsigned long nr_all_pages __initdata; +-- +2.28.0 + + +From 90240bcd90a568878738e66c0d45bed3e38e347b Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Fri, 19 Apr 2019 12:33:38 +0200 +Subject: [PATCH 12/17] Set vm.max_map_count to 262144 by default + +The value is still pretty low, and AMD64-ABI and ELF extended numbering +supports that, so we should be fine on modern x86 systems. + +This fixes crashes in some applications using more than 65535 vmas (also +affects some windows games running in wine, such as Star Citizen). +--- + include/linux/mm.h | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index bc05c3588aa3..b0cefe94920d 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -190,8 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) + * not a hard limit any more. Although some userspace tools can be surprised by + * that. + */ +-#define MAPCOUNT_ELF_CORE_MARGIN (5) +-#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) ++#define DEFAULT_MAX_MAP_COUNT (262144) + + extern int sysctl_max_map_count; + +-- +2.28.0 + + +From 3a34034dba5efe91bcec491efe8c66e8087f509b Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 27 Jul 2020 00:19:18 +0200 +Subject: [PATCH 13/17] mm: bump DEFAULT_MAX_MAP_COUNT + +Some games such as Detroit: Become Human tend to be very crash prone with +lower values. +--- + include/linux/mm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index b0cefe94920d..890165099b07 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -190,7 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) + * not a hard limit any more. Although some userspace tools can be surprised by + * that. + */ +-#define DEFAULT_MAX_MAP_COUNT (262144) ++#define DEFAULT_MAX_MAP_COUNT (524288) + + extern int sysctl_max_map_count; + +-- +2.28.0 + + +From 977812938da7c7226415778c340832141d9278b7 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 25 Nov 2019 15:13:06 -0300 +Subject: [PATCH 14/17] elevator: set default scheduler to bfq for blk-mq + +Signed-off-by: Alexandre Frade +--- + block/elevator.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/block/elevator.c b/block/elevator.c +index 4eab3d70e880..79669aa39d79 100644 +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) + } + + /* +- * For single queue devices, default to using mq-deadline. If we have multiple +- * queues or mq-deadline is not available, default to "none". ++ * For single queue devices, default to using bfq. If we have multiple ++ * queues or bfq is not available, default to "none". 
+ */ + static struct elevator_type *elevator_get_default(struct request_queue *q) + { + if (q->nr_hw_queues != 1) + return NULL; + +- return elevator_get(q, "mq-deadline", false); ++ return elevator_get(q, "bfq", false); + } + + /* +-- +2.28.0 + + +From e2111bc5989131c675659d40e0cc4f214df2f990 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 16:45:59 -0300 +Subject: [PATCH 15/17] block: set rq_affinity = 2 for full multithreading I/O + requests + +Signed-off-by: Alexandre Frade +--- + include/linux/blkdev.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 28efe374a2e1..d4e5d35d2ece 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -624,7 +624,8 @@ struct request_queue { + #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ + + #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ +- (1 << QUEUE_FLAG_SAME_COMP)) ++ (1 << QUEUE_FLAG_SAME_COMP) | \ ++ (1 << QUEUE_FLAG_SAME_FORCE)) + + void blk_queue_flag_set(unsigned int flag, struct request_queue *q); + void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); +-- +2.28.0 + + +From 3c229f434aca65c4ca61772bc03c3e0370817b92 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 3 Aug 2020 17:05:04 +0000 +Subject: [PATCH 16/17] mm: set 2 megabytes for address_space-level file + read-ahead pages size + +Signed-off-by: Alexandre Frade +--- + include/linux/pagemap.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index cf2468da68e9..007dea784451 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -655,7 +655,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); + void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec); + +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) + + void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, + struct file *, pgoff_t index, unsigned long req_count); +-- +2.28.0 + + +From 716f41cf6631f3a85834dcb67b4ce99185b6387f Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Wed, 15 Jan 2020 20:43:56 -0600 +Subject: [PATCH 17/17] ZEN: intel-pstate: Implement "enable" parameter + +If intel-pstate is compiled into the kernel, it will preempt the loading +of acpi-cpufreq so you can take advantage of hardware p-states without +any friction. + +However, intel-pstate is not completely superior to cpufreq's ondemand +for one reason. There's no concept of an up_threshold property. + +In ondemand, up_threshold essentially reduces the maximum utilization to +compare against, allowing you to hit max frequencies and turbo boost +from a much lower core utilization. + +With intel-pstate, you have the concept of minimum and maximum +performance, but no tunable that lets you define, maximum frequency +means 50% core utilization. For just this oversight, there's reasons +you may want ondemand. + +Lets support setting "enable" in kernel boot parameters. This lets +kernel maintainers include "intel_pstate=disable" statically in the +static boot parameters, but let users of the kernel override this +selection. 
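In practice this means a build can carry "intel_pstate=disable" in its
built-in command line while the user re-enables the driver from the
bootloader; on x86 the built-in string is normally prepended, so the later
"enable" takes effect. A hypothetical GRUB example (file path and variable
name are illustrative only):

    # /etc/default/grub, then regenerate the config (grub-mkconfig or update-grub)
    GRUB_CMDLINE_LINUX_DEFAULT="quiet intel_pstate=enable"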
+--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + drivers/cpufreq/intel_pstate.c | 2 ++ + 2 files changed, 5 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index fb95fad81c79..3e92fee81e33 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -1857,6 +1857,9 @@ + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors ++ enable ++ Enable intel_pstate in-case "disable" was passed ++ previously in the kernel boot parameters + passive + Use intel_pstate as a scaling driver, but configure it + to work with generic cpufreq governors (instead of +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index 36a469150ff9..aee891c9b78a 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -2845,6 +2845,8 @@ static int __init intel_pstate_setup(char *str) + pr_info("HWP disabled\n"); + no_hwp = 1; + } ++ if (!strcmp(str, "enable")) ++ no_load = 0; + if (!strcmp(str, "force")) + force_load = 1; + if (!strcmp(str, "hwp_only")) +-- +2.28.0 + diff --git a/linux-tkg/linux-tkg-patches/5.8/0003-glitched-cfs.patch b/linux-tkg/linux-tkg-patches/5.8/0003-glitched-cfs.patch new file mode 100644 index 0000000..06b7f02 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0003-glitched-cfs.patch @@ -0,0 +1,72 @@ +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + diff --git a/linux-tkg/linux-tkg-patches/5.8/0005-glitched-pds.patch b/linux-tkg/linux-tkg-patches/5.8/0005-glitched-pds.patch new file mode 100644 index 0000000..4307c45 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0005-glitched-pds.patch @@ -0,0 +1,90 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - PDS + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. 
diff --git a/linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-ondemand-pds.patch b/linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-ondemand-pds.patch new file mode 100644 index 0000000..c1929e8 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-ondemand-pds.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (63) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-pds.patch b/linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-pds.patch new file mode 100644 index 0000000..23271f5 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0005-undead-glitched-pds.patch @@ -0,0 +1,166 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - PDS + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. + +diff --git a/init/Kconfig b/init/Kconfig +index 11fd9b502d06..e9bc34d3019b 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -715,6 +715,7 @@ menu "Scheduler features" + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_PDS + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -948,7 +948,6 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" +- depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index b23231bae996..cab4e5c5b38e 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -24,13 +24,13 @@ obj-y += fair.o rt.o deadline.o + obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o +-obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + endif + obj-y += loadavg.o clock.o cputime.o + obj-y += idle.o + obj-y += wait.o wait_bit.o swait.o completion.o + obj-$(CONFIG_SMP) += cpupri.o pelt.o + obj-$(CONFIG_SCHEDSTATS) += stats.o ++obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o + +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +index 9281ad164..f09a609cf 100644 +--- a/kernel/sched/pds.c ++++ b/kernel/sched/pds.c +@@ -81,6 +81,18 @@ enum { + NR_CPU_AFFINITY_CHK_LEVEL + }; + ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) 
\ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ + static inline void print_scheduler_version(void) + { + printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen.\n"); +@@ -6353,7 +6365,10 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) + #ifdef CONFIG_SCHED_DEBUG + void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + struct seq_file *m) +-{} ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} + + void proc_sched_set_task(struct task_struct *p) + {} diff --git a/linux-tkg/linux-tkg-patches/5.8/0005-v5.8_undead-pds099o.patch b/linux-tkg/linux-tkg-patches/5.8/0005-v5.8_undead-pds099o.patch new file mode 100644 index 0000000..7cb7e91 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0005-v5.8_undead-pds099o.patch @@ -0,0 +1,8530 @@ +From 68f1a9541ef3185b1021e8e54d2712c7039418d7 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 15 Jun 2020 23:58:41 +0200 +Subject: PDS 099o, initial 5.8 rebase + + +diff --git a/Documentation/scheduler/sched-PDS-mq.txt b/Documentation/scheduler/sched-PDS-mq.txt +new file mode 100644 +index 000000000000..709e86f6487e +--- /dev/null ++++ b/Documentation/scheduler/sched-PDS-mq.txt +@@ -0,0 +1,56 @@ ++ Priority and Deadline based Skiplist multiple queue Scheduler ++ ------------------------------------------------------------- ++ ++CONTENT ++======== ++ ++ 0. Development ++ 1. Overview ++ 1.1 Design goal ++ 1.2 Design summary ++ 2. Design Detail ++ 2.1 Skip list implementation ++ 2.2 Task preempt ++ 2.3 Task policy, priority and deadline ++ 2.4 Task selection ++ 2.5 Run queue balance ++ 2.6 Task migration ++ ++ ++0. Development ++============== ++ ++Priority and Deadline based Skiplist multiple queue scheduler, referred to as ++PDS from here on, is developed upon the enhancement patchset VRQ(Variable Run ++Queue) for BFS(Brain Fuck Scheduler by Con Kolivas). PDS inherits the existing ++design from VRQ and inspired by the introduction of skiplist data structure ++to the scheduler by Con Kolivas. However, PDS is different from MuQSS(Multiple ++Queue Skiplist Scheduler, the successor after BFS) in many ways. ++ ++1. Overview ++=========== ++ ++1.1 Design goal ++--------------- ++ ++PDS is designed to make the cpu process scheduler code to be simple, but while ++efficiency and scalable. Be Simple, the scheduler code will be easy to be read ++and the behavious of scheduler will be easy to predict. Be efficiency, the ++scheduler shall be well balance the thoughput performance and task interactivity ++at the same time for different properties the tasks behave. Be scalable, the ++performance of the scheduler should be in good shape with the glowing of ++workload or with the growing of the cpu numbers. ++ ++1.2 Design summary ++------------------ ++ ++PDS is described as a multiple run queues cpu scheduler. Each cpu has its own ++run queue. A heavry customized skiplist is used as the backend data structure ++of the cpu run queue. Tasks in run queue is sorted by priority then virtual ++deadline(simplfy to just deadline from here on). In PDS, balance action among ++run queues are kept as less as possible to reduce the migration cost. Cpumask ++data structure is widely used in cpu affinity checking and cpu preemption/ ++selection to make PDS scalable with increasing cpu number. ++ ++ ++To be continued... 
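Once a kernel built with CONFIG_SCHED_PDS is booted, the scheduler announces
itself with the "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen." message
shown as context in the glitched-pds patch above, so a quick sanity check is:

    dmesg | grep "PDS-mq CPU Scheduler"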
+diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 2d3f963fd6f1..5f41ead019b1 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1006,6 +1006,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_PDS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c +index 737ff3b9c2c0..b5bc5a1b6de7 100644 +--- a/drivers/cpufreq/cpufreq_conservative.c ++++ b/drivers/cpufreq/cpufreq_conservative.c +@@ -28,8 +28,8 @@ struct cs_dbs_tuners { + }; + + /* Conservative governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_FREQUENCY_DOWN_THRESHOLD (20) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) ++#define DEF_FREQUENCY_DOWN_THRESHOLD (26) + #define DEF_FREQUENCY_STEP (5) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (10) +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 82a4d37ddecb..1130e0f5db72 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -18,7 +18,7 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) ++#define DEF_FREQUENCY_UP_THRESHOLD (63) + #define DEF_SAMPLING_DOWN_FACTOR (1) + #define MAX_SAMPLING_DOWN_FACTOR (100000) + #define MICRO_FREQUENCY_UP_THRESHOLD (95) +@@ -127,7 +127,7 @@ static void dbs_freq_increase(struct cpufreq_policy *policy, unsigned int freq) + } + + /* +- * Every sampling_rate, we check, if current idle time is less than 20% ++ * Every sampling_rate, we check, if current idle time is less than 37% + * (default), then we try to increase frequency. Else, we adjust the frequency + * proportional to load. 
+ */ +diff --git a/fs/proc/base.c b/fs/proc/base.c +index eb2255e95f62..62b8cedbccb6 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..1a7987c40c80 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_PDS ++#define INIT_TASK_COMM "PDS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif /* !CONFIG_SCHED_PDS */ + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h +index fed6ba96c527..f03a5ee419a1 100644 +--- a/include/linux/jiffies.h ++++ b/include/linux/jiffies.h +@@ -169,7 +169,7 @@ static inline u64 get_jiffies_64(void) + * Have the 32 bit jiffies value wrap 5 minutes after boot + * so jiffies wrap bugs show up earlier. + */ +-#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) ++#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ)) + + /* + * Change timeval to jiffies, trying to avoid the +diff --git a/kernel/smp.c b/kernel/smp.c +index 4418f5cb8324..2b51afac5b06 100644 +--- a/kernel/smp.c ++++ b/kernel/smp.c +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4418f5cb8324..2b51afac5b06 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -31,6 +31,7 @@ + #include + #include + #include ++#include + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -652,9 +653,13 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_PDS) + int on_cpu; ++#endif ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + struct __call_single_node wake_entry; ++#endif ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -663,6 +668,7 @@ struct task_struct { + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; + ++#ifndef CONFIG_SCHED_PDS + /* + * recent_used_cpu is initially set as the last CPU used by a task + * that wakes affine another task. Waker/wakee relationships can +@@ -671,6 +677,7 @@ struct task_struct { + * used CPU that may be idle. 
+ */ + int recent_used_cpu; ++#endif /* CONFIG_SCHED_PDS */ + int wake_cpu; + #endif + int on_rq; +@@ -680,13 +687,27 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_PDS ++ int time_slice; ++ u64 deadline; ++ /* skip list level */ ++ int sl_level; ++ /* skip list node */ ++ struct skiplist_node sl_node; ++ /* 8bits prio and 56bits deadline for quick processing */ ++ u64 priodl; ++ u64 last_ran; ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* CONFIG_SCHED_PDS */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1306,6 +1327,29 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_PDS ++void cpu_scaling(int cpu); ++void cpu_nonscaling(int cpu); ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++ ++#define task_running_idle(p) ((p)->prio == IDLE_PRIO) ++#else /* CFS */ ++extern int runqueue_is_locked(int cpu); ++static inline void cpu_scaling(int cpu) ++{ ++} ++ ++static inline void cpu_nonscaling(int cpu) ++{ ++} ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++#define iso_task(p) (false) ++#endif /* CONFIG_SCHED_PDS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..a5e5fc2c9170 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,22 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_PDS ++ ++#define __tsk_deadline(p) ((p)->deadline) ++ ++static inline int dl_prio(int prio) ++{ ++ return 1; ++} ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ return 1; ++} ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +36,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_PDS */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..fba04bb91492 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,7 +20,18 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_PDS ++#define ISO_PRIO (MAX_USER_RT_PRIO) ++ ++#define MAX_RT_PRIO ((MAX_USER_RT_PRIO) + 1) ++ ++#define NORMAL_PRIO (MAX_RT_PRIO) ++#define IDLE_PRIO ((MAX_RT_PRIO) + 1) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* !CONFIG_SCHED_PDS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO ++#endif /* CONFIG_SCHED_PDS */ + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..a96012e6f15e 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_PDS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return 
false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index 38359071236a..90328ccd527f 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -106,7 +106,7 @@ extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..713fedd8034f +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,177 @@ ++/* ++ Copyright (C) 2016 Alfred Chen. ++ ++ Code based on Con Kolivas's skip list implementation for BFS, and ++ which is based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++ ++This file only provides a infrastructure of skip list. ++ ++skiplist_node is embedded into container data structure, to get rid the ++dependency of kmalloc/kfree operation in scheduler code. ++ ++A customized search function should be defined using DEFINE_SKIPLIST_INSERT ++macro and be used for skip list insert operation. ++ ++Random Level is also not defined in this file, instead, it should be customized ++implemented and set to node->level then pass to the customized skiplist_insert ++function. ++ ++Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) ++ ++NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, ++considering that there will be 256 entries to enable the top level when using ++random level p=0.5, and that number is more than enough for a run queue usage ++in a scheduler usage. And it also help to reduce the memory usage of the ++embedded skip list node in task_struct to about 50%. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++BFS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. 
++*/ ++#ifndef _LINUX_SKIP_LIST_H ++#define _LINUX_SKIP_LIST_H ++ ++#include ++ ++#define NUM_SKIPLIST_LEVEL (8) ++ ++struct skiplist_node { ++ int level; /* Levels in this node */ ++ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; ++ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; ++}; ++ ++#define SKIPLIST_NODE_INIT(name) { 0,\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ } ++ ++static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ /* only level 0 ->next matters in skiplist_empty()*/ ++ WRITE_ONCE(node->next[0], node); ++} ++ ++/** ++ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header ++ * @node: the skip list node to be inited. ++ */ ++static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ int i; ++ ++ node->level = 0; ++ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { ++ WRITE_ONCE(node->next[i], node); ++ node->prev[i] = node; ++ } ++} ++ ++/** ++ * skiplist_empty - test whether a skip list is empty ++ * @head: the skip list to test. ++ */ ++static inline int skiplist_empty(const struct skiplist_node *head) ++{ ++ return READ_ONCE(head->next[0]) == head; ++} ++ ++/** ++ * skiplist_entry - get the struct for this entry ++ * @ptr: the &struct skiplist_node pointer. ++ * @type: the type of the struct this is embedded in. ++ * @member: the name of the skiplist_node within the struct. ++ */ ++#define skiplist_entry(ptr, type, member) \ ++ container_of(ptr, type, member) ++ ++/** ++ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert ++ * function, which takes two parameters, first one is the header node of the ++ * skip list, second one is the skip list node to be inserted ++ * @func_name: the customized skip list insert function name ++ * @search_func: the search function to be used, which takes two parameters, ++ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list ++ * node to be inserted, the function should return true if search should be ++ * continued, otherwise return false. ++ * Returns 1 if @node is inserted as the first item of skip list at level zero, ++ * otherwise 0 ++ */ ++#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ ++static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ ++{\ ++ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ ++ struct skiplist_node *p, *q;\ ++ int k = head->level;\ ++\ ++ p = head;\ ++ do {\ ++ while (q = p->next[k], q != head && search_func(q, node))\ ++ p = q;\ ++ update[k] = p;\ ++ } while (--k >= 0);\ ++\ ++ k = node->level;\ ++ if (unlikely(k > head->level)) {\ ++ node->level = k = ++head->level;\ ++ update[k] = head;\ ++ }\ ++\ ++ do {\ ++ p = update[k];\ ++ q = p->next[k];\ ++ node->next[k] = q;\ ++ p->next[k] = node;\ ++ node->prev[k] = p;\ ++ q->prev[k] = node;\ ++ } while (--k >= 0);\ ++\ ++ return (p == head);\ ++} ++ ++/** ++ * skiplist_del_init -- delete skip list node from a skip list and reset it's ++ * init state ++ * @head: the header node of the skip list to be deleted from. ++ * @node: the skip list node to be deleted, the caller need to ensure @node is ++ * in skip list which @head represent. 
++ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 ++ */ ++static inline int ++skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) ++{ ++ int l, m = node->level; ++ ++ for (l = 0; l <= m; l++) { ++ node->prev[l]->next[l] = node->next[l]; ++ node->next[l]->prev[l] = node->prev[l]; ++ } ++ if (m == head->level && m > 0) { ++ while (head->next[m] == head && m > 0) ++ m--; ++ head->level = m; ++ } ++ INIT_SKIPLIST_NODE(node); ++ ++ return (node->prev[0] == head); ++} ++#endif /* _LINUX_SKIP_LIST_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..d6d384ddb57d 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -115,7 +115,10 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented in BFS/MuQSSPDS only */ ++ ++#define SCHED_ISO 4 ++ + #define SCHED_IDLE 5 + #define SCHED_DEADLINE 6 + +diff --git a/init/Kconfig b/init/Kconfig +index 74a5ac65644f..e4fd406b58dd 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -61,6 +61,21 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_PDS ++ bool "PDS-mq cpu scheduler" ++ help ++ The Priority and Deadline based Skip list multiple queue CPU ++ Scheduler for excellent interactivity and responsiveness on the ++ desktop and solid scalability on normal hardware and commodity ++ servers. ++ ++ Currently incompatible with the Group CPU scheduler, and RCU TORTURE ++ TEST so these options are disabled. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -777,6 +792,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_PDS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -878,7 +894,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_PDS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1007,6 +1023,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_PDS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. 
+@@ -1134,6 +1151,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_PDS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index bd403ed3e418..162d3deddd45 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -59,6 +59,126 @@ struct task_struct init_task + __init_task_data + #endif + = { ++#ifdef CONFIG_SCHED_PDS ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ .thread_info = INIT_THREAD_INFO(init_task), ++ .stack_refcount = ATOMIC_INIT(1), ++#endif ++ .state = 0, ++ .stack = init_stack, ++ .usage = ATOMIC_INIT(2), ++ .flags = PF_KTHREAD, ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, /* PDS only */ ++ .policy = SCHED_NORMAL, ++ .cpus_ptr = &init_task.cpus_mask, ++ .cpus_mask = CPU_MASK_ALL, ++ .nr_cpus_allowed= NR_CPUS, ++ .mm = NULL, ++ .active_mm = &init_mm, ++ .restart_block = { ++ .fn = do_no_restart_syscall, ++ }, ++ .sl_level = 0, /* PDS only */ ++ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), /* PDS only */ ++ .time_slice = HZ, /* PDS only */ ++ .tasks = LIST_HEAD_INIT(init_task.tasks), ++#ifdef CONFIG_SMP ++ .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ .sched_task_group = &root_task_group, ++#endif ++ .ptraced = LIST_HEAD_INIT(init_task.ptraced), ++ .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), ++ .real_parent = &init_task, ++ .parent = &init_task, ++ .children = LIST_HEAD_INIT(init_task.children), ++ .sibling = LIST_HEAD_INIT(init_task.sibling), ++ .group_leader = &init_task, ++ RCU_POINTER_INITIALIZER(real_cred, &init_cred), ++ RCU_POINTER_INITIALIZER(cred, &init_cred), ++ .comm = INIT_TASK_COMM, ++ .thread = INIT_THREAD, ++ .fs = &init_fs, ++ .files = &init_files, ++ .signal = &init_signals, ++ .sighand = &init_sighand, ++ .nsproxy = &init_nsproxy, ++ .pending = { ++ .list = LIST_HEAD_INIT(init_task.pending.list), ++ .signal = {{0}} ++ }, ++ .blocked = {{0}}, ++ .alloc_lock = __SPIN_LOCK_UNLOCKED(init_task.alloc_lock), ++ .journal_info = NULL, ++ INIT_CPU_TIMERS(init_task) ++ .pi_lock = __RAW_SPIN_LOCK_UNLOCKED(init_task.pi_lock), ++ .timer_slack_ns = 50000, /* 50 usec default slack */ ++ .thread_pid = &init_struct_pid, ++ .thread_group = LIST_HEAD_INIT(init_task.thread_group), ++ .thread_node = LIST_HEAD_INIT(init_signals.thread_head), ++#ifdef CONFIG_AUDITSYSCALL ++ .loginuid = INVALID_UID, ++ .sessionid = AUDIT_SID_UNSET, ++#endif ++#ifdef CONFIG_PERF_EVENTS ++ .perf_event_mutex = __MUTEX_INITIALIZER(init_task.perf_event_mutex), ++ .perf_event_list = LIST_HEAD_INIT(init_task.perf_event_list), ++#endif ++#ifdef CONFIG_PREEMPT_RCU ++ .rcu_read_lock_nesting = 0, ++ .rcu_read_unlock_special.s = 0, ++ .rcu_node_entry = LIST_HEAD_INIT(init_task.rcu_node_entry), ++ .rcu_blocked_node = NULL, ++#endif ++#ifdef CONFIG_TASKS_RCU ++ .rcu_tasks_holdout = false, ++ .rcu_tasks_holdout_list = LIST_HEAD_INIT(init_task.rcu_tasks_holdout_list), ++ .rcu_tasks_idle_cpu = -1, ++#endif ++#ifdef CONFIG_CPUSETS ++ .mems_allowed_seq = SEQCNT_ZERO(init_task.mems_allowed_seq), ++#endif ++#ifdef CONFIG_RT_MUTEXES ++ .pi_waiters = RB_ROOT_CACHED, ++ .pi_top_task = NULL, ++#endif ++ INIT_PREV_CPUTIME(init_task) ++#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN ++ .vtime.seqcount = SEQCNT_ZERO(init_task.vtime_seqcount), ++ .vtime.starttime = 0, ++ .vtime.state = VTIME_SYS, ++#endif ++#ifdef CONFIG_NUMA_BALANCING ++ .numa_preferred_nid = -1, ++ 
.numa_group = NULL, ++ .numa_faults = NULL, ++#endif ++#ifdef CONFIG_KASAN ++ .kasan_depth = 1, ++#endif ++#ifdef CONFIG_TRACE_IRQFLAGS ++ .softirqs_enabled = 1, ++#endif ++#ifdef CONFIG_LOCKDEP ++ .lockdep_recursion = 0, ++#endif ++#ifdef CONFIG_FUNCTION_GRAPH_TRACER ++ .ret_stack = NULL, ++#endif ++#if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPT) ++ .trace_recursion = 0, ++#endif ++#ifdef CONFIG_LIVEPATCH ++ .patch_state = KLP_UNDEFINED, ++#endif ++#ifdef CONFIG_SECURITY ++ .security = NULL, ++#endif ++#else /* CONFIG_SCHED_PDS */ + #ifdef CONFIG_THREAD_INFO_IN_TASK + .thread_info = INIT_THREAD_INFO(init_task), + .stack_refcount = REFCOUNT_INIT(1), +@@ -182,6 +302,7 @@ struct task_struct init_task + #ifdef CONFIG_SECURITY + .security = NULL, + #endif ++#endif /* CONFIG_SCHED_PDS */ + }; + EXPORT_SYMBOL(init_task); + +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 729d3a5c772e..10a7c52b90d5 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_PDS) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? +@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_PDS */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index ce2a75bc0ade..f0f864bc1ab9 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -122,7 +122,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -143,7 +143,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..b5de980c7d4e 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_PDS ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index c9f090d64f00..063d15a1ab8b 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c 
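Aside, not part of the patch: the delayacct and exit hunks above (and the cputime hunks further down) swap tsk->se.sum_exec_runtime for a tsk_seruntime() wrapper whose definition sits earlier in this patch, outside this excerpt. A hedged sketch of its presumable shape follows; the PDS-side field name sched_time is an assumption here.

/* Sketch only; the real definition lives in include/linux/sched.h as patched. */
#ifdef CONFIG_SCHED_PDS
#define tsk_seruntime(t)	((t)->sched_time)		/* runtime accounted by PDS (assumed field) */
#else
#define tsk_seruntime(t)	((t)->se.sum_exec_runtime)	/* stock CFS bookkeeping */
#endif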
+@@ -229,7 +229,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, +@@ -680,7 +680,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. + */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -953,7 +953,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 21fb5a5662b5..8ebe4e33fb5f 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -16,15 +16,21 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_PDS ++obj-y += pds.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o topology.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 7fbaee24c824..28377ad56248 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_PDS */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. 
+@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_PDS + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -916,6 +927,7 @@ static int __init sugov_register(void) + core_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_PDS + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -946,4 +958,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_PDS */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index ff9435dee1df..1377ea3d1b76 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,12 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + ++#ifdef CONFIG_SCHED_PDS ++ index = (task_nice(p) > 0 || task_running_idle(p)) ? CPUTIME_NICE : ++ CPUTIME_USER; ++#else + index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++#endif + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +151,11 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. */ ++#ifdef CONFIG_SCHED_PDS ++ if (task_nice(p) > 0 || task_running_idle(p)) { ++#else + if (task_nice(p) > 0) { ++#endif + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +278,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +288,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -658,7 +667,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index b743bf38f08f..16e5754af1cf 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -361,6 +361,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * idle-task scheduling class. 
+ */ +@@ -481,3 +482,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pds.c b/kernel/sched/pds.c +new file mode 100644 +index 000000000000..02d7d5a67c77 +--- /dev/null ++++ b/kernel/sched/pds.c +@@ -0,0 +1,6619 @@ ++/* ++ * kernel/sched/pds.c, was kernel/sched.c ++ * ++ * PDS-mq Core kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ */ ++#include "pds_sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++#include "smp.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++ ++#define rt_prio(prio) ((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR || \ ++ (policy) == SCHED_ISO) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define idle_policy(policy) ((policy) == SCHED_IDLE) ++#define idleprio_task(p) unlikely(idle_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) ++#define JIFFY_NS (1000000000 / HZ) ++#define HALF_JIFFY_NS (1000000000 / HZ / 2) ++#define HALF_JIFFY_US (1000000 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "pds: PDS-mq CPU Scheduler 0.99o by Alfred Chen and kept alive artificially by Tk-Glitch.\n"); ++} ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus. ++ * Tunable via /proc interface. 
++ */ ++#define SCHED_DEFAULT_RR (4) ++int rr_interval __read_mostly = SCHED_DEFAULT_RR; ++ ++static int __init rr_interval_set(char *str) ++{ ++ u32 rr; ++ ++ pr_info("rr_interval: "); ++ if (kstrtouint(str, 0, &rr)) { ++ pr_cont("using default of %u, unable to parse %s\n", ++ rr_interval, str); ++ return 1; ++ } ++ ++ rr_interval = rr; ++ pr_cont("%d\n", rr_interval); ++ ++ return 1; ++} ++__setup("rr_interval=", rr_interval_set); ++ ++ ++static const u64 sched_prio2deadline[NICE_WIDTH] = { ++/* -20 */ 6291456, 6920601, 7612661, 8373927, 9211319, ++/* -15 */ 10132450, 11145695, 12260264, 13486290, 14834919, ++/* -10 */ 16318410, 17950251, 19745276, 21719803, 23891783, ++/* -5 */ 26280961, 28909057, 31799962, 34979958, 38477953, ++/* 0 */ 42325748, 46558322, 51214154, 56335569, 61969125, ++/* 5 */ 68166037, 74982640, 82480904, 90728994, 99801893, ++/* 10 */ 109782082, 120760290, 132836319, 146119950, 160731945, ++/* 15 */ 176805139, 194485652, 213934217, 235327638, 258860401 ++}; ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++#ifdef CONFIG_SMP ++enum { ++SCHED_RQ_EMPTY = 0, ++SCHED_RQ_IDLE, ++SCHED_RQ_NORMAL_0, ++SCHED_RQ_NORMAL_1, ++SCHED_RQ_NORMAL_2, ++SCHED_RQ_NORMAL_3, ++SCHED_RQ_NORMAL_4, ++SCHED_RQ_NORMAL_5, ++SCHED_RQ_NORMAL_6, ++SCHED_RQ_NORMAL_7, ++SCHED_RQ_ISO, ++SCHED_RQ_RT, ++NR_SCHED_RQ_QUEUED_LEVEL ++}; ++ ++static cpumask_t sched_rq_queued_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_queued_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++static cpumask_t sched_rq_pending_masks[NR_SCHED_RQ_QUEUED_LEVEL] ++____cacheline_aligned_in_smp; ++ ++static DECLARE_BITMAP(sched_rq_pending_masks_bitmap, NR_SCHED_RQ_QUEUED_LEVEL) ++____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_chk_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_start_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_chk_end_masks); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_PER_CPU(int, sched_sibling_cpu); ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++ ++static cpumask_t sched_cpu_sg_idle_mask ____cacheline_aligned_in_smp; ++ ++#ifdef CONFIG_SMT_NICE ++/* ++ * Preemptible sibling group mask ++ * Which all sibling cpus are running at PRIO_LIMIT or IDLE_PRIO ++ */ ++static cpumask_t sched_cpu_psg_mask ____cacheline_aligned_in_smp; ++/* ++ * SMT supressed mask ++ * When a cpu is running task with NORMAL/ISO/RT policy, its sibling cpu ++ * will be supressed to run IDLE priority task. ++ */ ++static cpumask_t sched_smt_supressed_mask ____cacheline_aligned_in_smp; ++#endif /* CONFIG_SMT_NICE */ ++#endif ++ ++static int sched_rq_prio[NR_CPUS] ____cacheline_aligned; ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. 
++ *
++ * If we observe the new CPU in task_rq_lock(), the address
++ * dependency headed by '[L] rq = task_rq()' and the acquire
++ * will pair with the WMB to ensure we then also see migrating.
++ */
++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
++ return rq;
++ }
++ raw_spin_unlock(&rq->lock);
++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
++
++ while (unlikely(task_on_rq_migrating(p)))
++ cpu_relax();
++ }
++}
++
++/*
++ * RQ-clock updating methods:
++ */
++
++static void update_rq_clock_task(struct rq *rq, s64 delta)
++{
++/*
++ * In theory, the compile should just see 0 here, and optimize out the call
++ * to sched_rt_avg_update. But I don't trust it...
++ */
++ s64 __maybe_unused steal = 0, irq_delta = 0;
++
++#ifdef CONFIG_IRQ_TIME_ACCOUNTING
++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
++
++ /*
++ * Since irq_time is only updated on {soft,}irq_exit, we might run into
++ * this case when a previous update_rq_clock() happened inside a
++ * {soft,}irq region.
++ *
++ * When this happens, we stop ->clock_task and only update the
++ * prev_irq_time stamp to account for the part that fit, so that a next
++ * update will consume the rest. This ensures ->clock_task is
++ * monotonic.
++ *
++ * It does however cause some slight miss-attribution of {soft,}irq
++ * time, a more accurate solution would be to update the irq_time using
++ * the current rq->clock timestamp, except that would require using
++ * atomic ops.
++ */
++ if (irq_delta > delta)
++ irq_delta = delta;
++
++ rq->prev_irq_time += irq_delta;
++ delta -= irq_delta;
++#endif
++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
++ if (static_key_false((&paravirt_steal_rq_enabled))) {
++ steal = paravirt_steal_clock(cpu_of(rq));
++ steal -= rq->prev_steal_time_rq;
++
++ if (unlikely(steal > delta))
++ steal = delta;
++
++ rq->prev_steal_time_rq += steal;
++
++ delta -= steal;
++ }
++#endif
++
++ rq->clock_task += delta;
++
++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
++ if ((irq_delta + steal))
++ update_irq_load_avg(rq, irq_delta + steal);
++#endif
++}
++
++static inline void update_rq_clock(struct rq *rq)
++{
++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
++
++ if (unlikely(delta <= 0))
++ return;
++ rq->clock += delta;
++ update_rq_clock_task(rq, delta);
++}
++
++static inline void update_task_priodl(struct task_struct *p)
++{
++ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8);
++}
++
++/*
++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
++ * is the key to everything. It distributes CPU fairly amongst tasks of the
++ * same nice value, it proportions CPU according to nice level, it means the
++ * task that last woke up the longest ago has the earliest deadline, thus
++ * ensuring that interactive tasks get low latency on wake up. The CPU
++ * proportion works out to the square of the virtual deadline difference, so
++ * this equation will give nice 19 3% CPU compared to nice 0.
++ */
++static inline u64 task_deadline_diff(const struct task_struct *p)
++{
++ return sched_prio2deadline[TASK_USER_PRIO(p)];
++}
++
++static inline u64 static_deadline_diff(int static_prio)
++{
++ return sched_prio2deadline[USER_PRIO(static_prio)];
++}
++
++/*
++ * The time_slice is only refilled when it is empty and that is when we set a
++ * new deadline for non-rt tasks.
++ */ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ if (p->prio >= NORMAL_PRIO) ++ p->deadline = rq->clock + task_deadline_diff(p); ++ ++ update_task_priodl(p); ++} ++ ++static inline struct task_struct *rq_first_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline struct task_struct *rq_second_queued_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]->next[0]; ++ ++ if (node == &rq->sl_header) ++ return rq->idle; ++ ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline int is_second_in_rq(struct task_struct *p, struct rq *rq) ++{ ++ return (p->sl_node.prev[0]->prev[0] == &rq->sl_header); ++} ++ ++static const int task_dl_hash_tbl[] = { ++/* 0 4 8 12 */ ++ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ++/* 16 20 24 28 */ ++ 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 5, 6, 7 ++}; ++ ++static inline int ++task_deadline_level(const struct task_struct *p, const struct rq *rq) ++{ ++ u64 delta = (rq->clock + sched_prio2deadline[39] - p->deadline) >> 23; ++ ++ delta = min((size_t)delta, ARRAY_SIZE(task_dl_hash_tbl) - 1); ++ return task_dl_hash_tbl[delta]; ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * flush_smp_call_function_from_idle() and reschedule soon. 
++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_SMT_NICE ++static void resched_cpu_if_curr_is(int cpu, int priority) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rcu_read_lock(); ++ ++ if (rcu_dereference(rq->curr)->prio != priority) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ if (!do_raw_spin_trylock(&rq->lock)) ++ goto out; ++ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ if (priority == rq->curr->prio) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ ++ spin_release(&rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&rq->lock); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline bool ++__update_cpumasks_bitmap(int cpu, unsigned long *plevel, unsigned long level, ++ cpumask_t cpumasks[], unsigned long bitmap[]) ++{ ++ if (*plevel == level) ++ return false; ++ ++ cpumask_clear_cpu(cpu, cpumasks + *plevel); ++ if (cpumask_empty(cpumasks + *plevel)) ++ clear_bit(*plevel, bitmap); ++ cpumask_set_cpu(cpu, cpumasks + level); ++ set_bit(level, bitmap); ++ ++ *plevel = level; ++ ++ return true; ++} ++ ++static inline int ++task_running_policy_level(const struct task_struct *p, const struct rq *rq) ++{ ++ int prio = p->prio; ++ ++ if (NORMAL_PRIO == prio) ++ return SCHED_RQ_NORMAL_0 + task_deadline_level(p, rq); ++ ++ if (ISO_PRIO == prio) ++ return SCHED_RQ_ISO; ++ if (prio < MAX_RT_PRIO) ++ return SCHED_RQ_RT; ++ return PRIO_LIMIT - prio; ++} ++ ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) ++{ ++ struct task_struct *p = rq_first_queued_task(rq); ++ ++ if (p->prio != NORMAL_PRIO) ++ return; ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->queued_level, ++ task_running_policy_level(p, rq), ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0]); ++} ++ ++#ifdef CONFIG_SMT_NICE ++static inline void update_sched_cpu_psg_mask(const int cpu) ++{ ++ cpumask_t tmp; ++ ++ cpumask_or(&tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_IDLE]); ++ cpumask_and(&tmp, &tmp, cpu_smt_mask(cpu)); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++ else ++ cpumask_andnot(&sched_cpu_psg_mask, &sched_cpu_psg_mask, ++ cpu_smt_mask(cpu)); ++} ++#endif ++ ++static inline void update_sched_rq_queued_masks(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ struct task_struct *p = rq_first_queued_task(rq); ++ unsigned long level; ++#ifdef CONFIG_SCHED_SMT ++ unsigned long last_level = rq->queued_level; ++#endif ++ ++ level = task_running_policy_level(p, rq); ++ sched_rq_prio[cpu] = p->prio; ++ ++ if (!__update_cpumasks_bitmap(cpu, &rq->queued_level, level, ++ &sched_rq_queued_masks[0], ++ &sched_rq_queued_masks_bitmap[0])) ++ return; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpu == 
per_cpu(sched_sibling_cpu, cpu)) ++ return; ++ ++ if (SCHED_RQ_EMPTY == last_level) { ++ cpumask_andnot(&sched_cpu_sg_idle_mask, &sched_cpu_sg_idle_mask, ++ cpu_smt_mask(cpu)); ++ } else if (SCHED_RQ_EMPTY == level) { ++ cpumask_t tmp; ++ ++ cpumask_and(&tmp, cpu_smt_mask(cpu), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_cpu_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_cpu_sg_idle_mask); ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (level <= SCHED_RQ_IDLE && last_level > SCHED_RQ_IDLE) { ++ cpumask_clear_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), PRIO_LIMIT); ++ } else if (last_level <= SCHED_RQ_IDLE && level > SCHED_RQ_IDLE) { ++ cpumask_set_cpu(per_cpu(sched_sibling_cpu, cpu), ++ &sched_smt_supressed_mask); ++ update_sched_cpu_psg_mask(cpu); ++ resched_cpu_if_curr_is(per_cpu(sched_sibling_cpu, cpu), IDLE_PRIO); ++ } ++#endif /* CONFIG_SMT_NICE */ ++#endif ++} ++ ++static inline void update_sched_rq_pending_masks(struct rq *rq) ++{ ++ unsigned long level; ++ struct task_struct *p = rq_second_queued_task(rq); ++ ++ level = task_running_policy_level(p, rq); ++ ++ __update_cpumasks_bitmap(cpu_of(rq), &rq->pending_level, level, ++ &sched_rq_pending_masks[0], ++ &sched_rq_pending_masks_bitmap[0]); ++} ++ ++#else /* CONFIG_SMP */ ++static inline void update_sched_rq_queued_masks(struct rq *rq) {} ++static inline void update_sched_rq_queued_masks_normal(struct rq *rq) {} ++static inline void update_sched_rq_pending_masks(struct rq *rq) {} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Removing from the runqueue. Deleting a task from the skip list is done ++ * via the stored node reference in the task struct and does not require a full ++ * look up. Thus it occurs in O(k) time where k is the "level" of the list the ++ * task was stored at - usually < 4, max 16. ++ * ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running--; ++ ++ sched_update_tick_dependency(rq); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ sched_info_dequeued(rq, p); ++} ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLE to actually run as ++ * an idle task, we ensure none of the following conditions are met. 
++ */ ++static inline bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!freezing(p) && !signal_pending(p) && ++ !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); ++} ++ ++/* ++ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip ++ * list node which is used in PDS run queue. ++ * ++ * In current implementation, based on testing, the first 8 bits in microseconds ++ * of niffies are suitable for random level population. ++ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there ++ * should be platform hardware supported instruction(known as ctz/clz) to speed ++ * up this function. ++ * The skiplist level for a task is populated when task is created and doesn't ++ * change in task's life time. When task is being inserted into run queue, this ++ * skiplist level is set to task's sl_node->level, the skiplist insert function ++ * may change it based on current level of the skip lsit. ++ */ ++static inline int pds_skiplist_random_level(const struct task_struct *p) ++{ ++ long unsigned int randseed; ++ ++ /* ++ * 1. Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as a factor of the random seed for skiplist ++ * insertion. ++ * 2. Use address of task structure pointer as another factor of the ++ * random seed for task burst forking scenario. ++ */ ++ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; ++ ++ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); ++} ++ ++/** ++ * pds_skiplist_task_search -- search function used in PDS run queue skip list ++ * node insert operation. ++ * @it: iterator pointer to the node in the skip list ++ * @node: pointer to the skiplist_node to be inserted ++ * ++ * Returns true if key of @it is less or equal to key value of @node, otherwise ++ * false. ++ */ ++static inline bool ++pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) ++{ ++ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= ++ skiplist_entry(node, struct task_struct, sl_node)->priodl); ++} ++ ++/* ++ * Define the skip list insert function for PDS ++ */ ++DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); ++ ++/* ++ * Adding task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node)) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq)) ++ update_sched_rq_pending_masks(rq); ++ rq->nr_running++; ++ ++ sched_update_tick_dependency(rq); ++ ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. 
++ */ ++ if (p->in_iowait) ++ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ bool b_first, b_second; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ WARN_ONCE(task_rq(p) != rq, "pds: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); ++ b_second = is_second_in_rq(p, rq); ++ ++ p->sl_node.level = p->sl_level; ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { ++ update_sched_rq_queued_masks(rq); ++ update_sched_rq_pending_masks(rq); ++ } else if (is_second_in_rq(p, rq) || b_second) ++ update_sched_rq_pending_masks(rq); ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) ++{ ++ struct task_struct *curr = rq->curr; ++ ++ if (curr->prio == PRIO_LIMIT) ++ resched_curr(rq); ++ ++ if (task_running_idle(p)) ++ return; ++ ++ if (p->priodl < curr->priodl) ++ resched_curr(rq); ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * PDS doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. 
++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) ++ __hrtick_restart(rq); ++ else ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. ++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq->hrtick_csd.flags = 0; ++ rq->hrtick_csd.func = __hrtick_start; ++ rq->hrtick_csd.info = rq; ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if ((rq->clock - rq->last_tick > HALF_JIFFY_NS) || hrtick_enabled(rq)) ++ return 0; ++ ++ return HALF_JIFFY_NS; ++} ++ ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ return (rq->clock - rq->last_tick > HALF_JIFFY_NS)? 0:HALF_JIFFY_NS; ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ static const int policy_to_prio[] = { ++ NORMAL_PRIO, /* SCHED_NORMAL */ ++ 0, /* SCHED_FIFO */ ++ 0, /* SCHED_RR */ ++ IDLE_PRIO, /* SCHED_BATCH */ ++ ISO_PRIO, /* SCHED_ISO */ ++ IDLE_PRIO /* SCHED_IDLE */ ++ }; ++ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ return policy_to_prio[p->policy]; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = 1; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_this_cpu(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. 
++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. ++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, &p->cpus_mask)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. ++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ p->on_rq = TASK_ON_RQ_MIGRATING; ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq, p); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). 
*/ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. ++ */ ++ if (task_rq(p) == rq) ++ if (task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/* Enter with rq lock held. We know p is on the local CPU */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! 
We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_mask is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. 
*/ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, &p->cpus_mask) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. ++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ cpumask_t *mask; ++ ++ if (cpumask_test_cpu(cpu, cpumask)) ++ return cpu; ++ ++ mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ ++ return cpu; ++} ++ ++/* ++ * task_preemptible_rq - return the rq which the given task can preempt on ++ * @p: task wants to preempt CPU ++ * @only_preempt_low_policy: indicate only preempt rq running low policy than @p ++ */ ++static inline int ++task_preemptible_rq_idle(struct task_struct *p, cpumask_t *chk_mask) ++{ ++ cpumask_t tmp; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++ ++#ifdef CONFIG_SMT_NICE ++ /* Only ttwu on cpu which is not smt supressed */ ++ if (cpumask_andnot(&tmp, chk_mask, &sched_smt_supressed_mask)) { ++ cpumask_t t; ++ if (cpumask_and(&t, &tmp, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &t); ++ return best_mask_cpu(task_cpu(p), &tmp); ++ } ++#endif ++ ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++static inline int ++task_preemptible_rq(struct task_struct *p, cpumask_t *chk_mask, ++ int preempt_level) ++{ ++ cpumask_t tmp; ++ int level; ++ ++#ifdef CONFIG_SCHED_SMT ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_psg_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#else ++ if (cpumask_and(&tmp, chk_mask, &sched_cpu_sg_idle_mask)) ++ return best_mask_cpu(task_cpu(p), &tmp); ++#endif ++#endif ++ ++ level = find_first_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL); ++ ++ while (level < preempt_level) { ++ if (cpumask_and(&tmp, chk_mask, &sched_rq_queued_masks[level])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ level = find_next_bit(sched_rq_queued_masks_bitmap, ++ NR_SCHED_RQ_QUEUED_LEVEL, ++ level + 1); ++ } ++ ++ if (unlikely(SCHED_RQ_RT == level && ++ level == preempt_level && ++ cpumask_and(&tmp, chk_mask, ++ &sched_rq_queued_masks[SCHED_RQ_RT]))) { ++ unsigned int cpu; ++ ++ for_each_cpu (cpu, &tmp) ++ if (p->prio < sched_rq_prio[cpu]) ++ return cpu; ++ } ++ ++ return best_mask_cpu(task_cpu(p), chk_mask); ++} ++ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ cpumask_t chk_mask; ++ ++ if (unlikely(!cpumask_and(&chk_mask, &p->cpus_mask, cpu_online_mask))) 
++ return select_fallback_rq(task_cpu(p), p); ++ ++ /* Check IDLE tasks suitable to run normal priority */ ++ if (idleprio_task(p)) { ++ if (idleprio_suitable(p)) { ++ p->prio = p->normal_prio; ++ update_task_priodl(p); ++ return task_preemptible_rq_idle(p, &chk_mask); ++ } ++ p->prio = NORMAL_PRIO; ++ update_task_priodl(p); ++ } ++ ++ return task_preemptible_rq(p, &chk_mask, ++ task_running_policy_level(p, this_rq())); ++} ++#else /* CONFIG_SMP */ ++static inline int select_task_rq(struct task_struct *p) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** PDS ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. 
Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. ++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ int cpu, success = 0; ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * flush_smp_call_function_from_idle() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto stat; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. 
++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ if (SCHED_ISO == p->policy && ISO_PRIO != p->prio) { ++ p->prio = ISO_PRIO; ++ p->deadline = 0UL; ++ update_task_priodl(p); ++ } ++ ++ cpu = select_task_rq(p); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else /* CONFIG_SMP */ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++#endif ++ ++ rq = cpu_rq(cpu); ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ ++stat: ++ ttwu_stat(p, cpu, wake_flags); ++out: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return success; ++} ++ ++/** ++ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state ++ * @p: Process for which the function is to be invoked. ++ * @func: Function to invoke. ++ * @arg: Argument to function. ++ * ++ * If the specified task can be quickly locked into a definite state ++ * (either sleeping or on a given runqueue), arrange to keep it in that ++ * state while invoking @func(@arg). This function can use ->on_rq and ++ * task_curr() to work out what the state is, if required. Given that ++ * @func can be invoked with a runqueue lock held, it had better be quite ++ * lightweight. ++ * ++ * Returns: ++ * @false if the task slipped out from under the locks. ++ * @true if the task was locked onto a runqueue or is sleeping. ++ * However, @func can override this by returning @false. ++ */ ++bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) ++{ ++ bool ret = false; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ lockdep_assert_irqs_enabled(); ++ raw_spin_lock_irq(&p->pi_lock); ++ if (p->on_rq) { ++ rq = __task_rq_lock(p, &rf); ++ if (task_rq(p) == rq) ++ ret = func(p, arg); ++ rq_unlock(rq, &rf); ++ } else { ++ switch (p->state) { ++ case TASK_RUNNING: ++ case TASK_WAKING: ++ break; ++ default: ++ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). ++ if (!p->on_rq) ++ ret = func(p, arg); ++ } ++ } ++ raw_spin_unlock_irq(&p->pi_lock); ++ return ret; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. 
++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ int cpu = get_cpu(); ++ struct rq *rq = this_rq(); ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* Should be reset in fork.c but done here for ease of PDS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = 0; ++ ++ p->sl_level = pds_skiplist_random_level(p); ++ INIT_SKIPLIST_NODE(&p->sl_node); ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, US_TO_NS(rq->curr->time_slice)); ++#endif ++ ++ if (p->time_slice < RESCHED_US) { ++ update_rq_clock(rq); ++ time_slice_expired(p, rq); ++ resched_curr(rq); ++ } else ++ update_task_priodl(p); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). 
++ */ ++ __set_task_cpu(p, cpu); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ put_cpu(); ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p)); ++#ifdef CONFIG_SMP ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_mask can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. 
++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq, p); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). 
++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. 
The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). 
++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. 
++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void pds_update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ p->time_slice -= NS_TO_US(ns); ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ pds_update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void pds_scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++ ++ /** ++ * p->time_slice < RESCHED_US. 
We will modify task_struct under ++ * rq lock as p is rq->curr ++ */ ++ __set_tsk_resched(p); ++} ++ ++#ifdef CONFIG_SMP ++ ++#ifdef CONFIG_SCHED_SMT ++static int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ int cpu; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* ++ * _something_ may have changed the task, double check again ++ */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ (cpu = cpumask_any_and(&p->cpus_mask, &sched_cpu_sg_idle_mask)) < nr_cpu_ids) ++ rq = __migrate_task(rq, p, cpu); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* pds_sg_balance_trigger - trigger slibing group balance for @cpu */ ++static void pds_sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return; ++ curr = rq->curr; ++ if (!is_idle_task(curr) && ++ cpumask_intersects(&curr->cpus_mask, &sched_cpu_sg_idle_mask)) { ++ int active_balance = 0; ++ ++ if (likely(!rq->active_balance)) { ++ rq->active_balance = 1; ++ active_balance = 1; ++ } ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (likely(active_balance)) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ } else ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++/* ++ * pds_sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void pds_sg_balance_check(const struct rq *rq) ++{ ++ cpumask_t chk; ++ int i; ++ ++ /* Only online cpu will do sg balance checking */ ++ if (unlikely(!rq->online)) ++ return; ++ ++ /* Only cpu in slibing idle group will do the checking */ ++ if (!cpumask_test_cpu(cpu_of(rq), &sched_cpu_sg_idle_mask)) ++ return; ++ ++ /* Find potential cpus which can migrate the currently running task */ ++ if (!cpumask_andnot(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY], ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ return; ++ ++ for_each_cpu(i, &chk) { ++ /* skip the cpu which has idle slibing cpu */ ++ if (cpumask_test_cpu(per_cpu(sched_sibling_cpu, i), ++ &sched_rq_queued_masks[SCHED_RQ_EMPTY])) ++ continue; ++ pds_sg_balance_trigger(i); ++ } ++} ++DEFINE_PER_CPU(unsigned long, thermal_pressure); ++ ++void arch_set_thermal_pressure(struct cpumask *cpus, ++ unsigned long th_pressure) ++{ ++ int cpu; ++ ++ for_each_cpu(cpu, cpus) ++ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); ++} ++#endif /* CONFIG_SCHED_SMT */ ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ arch_scale_freq_tick(); ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. 
*/ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (!tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ if (cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ if (!is_idle_task(curr)) { ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ delta = rq_clock_task(rq) - curr->last_ran; ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ } ++ pds_scheduler_task_tick(rq); ++ update_sched_rq_queued_masks_normal(rq); ++ calc_load_nohz_remote(rq); ++ ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. 
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. 
SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (rq->idle == p) ++ return; ++ ++ pds_update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_US) { ++ time_slice_expired(p, rq); ++ if (SCHED_ISO == p->policy && ISO_PRIO == p->prio) { ++ p->prio = NORMAL_PRIO; ++ p->deadline = rq->clock + task_deadline_diff(p); ++ update_task_priodl(p); ++ } ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) ++ requeue_task(p, rq); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, int filter_prio) ++{ ++ struct task_struct *p; ++ int dest_cpu = cpu_of(dest_rq); ++ int nr_migrated = 0; ++ int nr_tries = min((rq->nr_running + 1) / 2, SCHED_RQ_NR_MIGRATION); ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ while (nr_tries && node != &rq->sl_header) { ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ node = node->next[0]; ++ ++ if (task_running(p)) ++ continue; ++ if (p->prio >= filter_prio) ++ break; ++ if (cpumask_test_cpu(dest_cpu, &p->cpus_mask)) { ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, dest_cpu); ++ enqueue_task(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ /* make a jump */ ++ if (node == &rq->sl_header) ++ break; ++ node = node->next[0]; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int ++take_queued_task_cpumask(struct rq *rq, cpumask_t *chk_mask, int filter_prio) ++{ ++ int src_cpu; ++ ++ for_each_cpu(src_cpu, chk_mask) { ++ int nr_migrated; ++ struct rq *src_rq = cpu_rq(src_cpu); ++ ++ if (!do_raw_spin_trylock(&src_rq->lock)) { ++ if (PRIO_LIMIT == filter_prio) ++ continue; ++ return 0; ++ } ++ spin_acquire(&src_rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ update_rq_clock(src_rq); ++ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, filter_prio))) ++ cpufreq_update_this_cpu(rq, 0); ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ if (nr_migrated || PRIO_LIMIT != filter_prio) ++ return nr_migrated; ++ } ++ return 0; ++} ++ ++static inline int take_other_rq_task(struct rq *rq, int cpu, int filter_prio) ++{ ++ struct cpumask *affinity_mask, *end; ++ struct cpumask chk; ++ ++ if (PRIO_LIMIT == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++#ifdef CONFIG_SMT_NICE ++ { ++ /* also try to take IDLE priority tasks from smt supressed cpu */ ++ struct cpumask t; ++ if (cpumask_and(&t, &sched_smt_supressed_mask, ++ &sched_rq_queued_masks[SCHED_RQ_IDLE])) ++ cpumask_or(&chk, &chk, &t); ++ } ++#endif ++ } else if (NORMAL_PRIO == filter_prio) { ++ cpumask_or(&chk, &sched_rq_pending_masks[SCHED_RQ_RT], ++ &sched_rq_pending_masks[SCHED_RQ_ISO]); ++ } else if (IDLE_PRIO == filter_prio) { ++ cpumask_complement(&chk, &sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ cpumask_andnot(&chk, &chk, &sched_rq_pending_masks[SCHED_RQ_IDLE]); ++ } else ++ cpumask_copy(&chk, &sched_rq_pending_masks[SCHED_RQ_RT]); ++ ++ if (cpumask_empty(&chk)) ++ return 0; ++ ++ affinity_mask = per_cpu(sched_cpu_llc_start_mask, cpu); ++ 
end = per_cpu(sched_cpu_affinity_chk_end_masks, cpu); ++ do { ++ struct cpumask tmp; ++ ++ if (cpumask_and(&tmp, &chk, affinity_mask) && ++ take_queued_task_cpumask(rq, &tmp, filter_prio)) ++ return 1; ++ } while (++affinity_mask < end); ++ ++ return 0; ++} ++#endif ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next = rq_first_queued_task(rq); ++ ++#ifdef CONFIG_SMT_NICE ++ if (cpumask_test_cpu(cpu, &sched_smt_supressed_mask)) { ++ if (next->prio >= IDLE_PRIO) { ++ if (rq->online && ++ take_other_rq_task(rq, cpu, IDLE_PRIO)) ++ return rq_first_queued_task(rq); ++ return rq->idle; ++ } ++ } ++#endif ++ ++#ifdef CONFIG_SMP ++ if (likely(rq->online)) ++ if (take_other_rq_task(rq, cpu, next->prio)) { ++ resched_curr(rq); ++ return rq_first_queued_task(rq); ++ } ++#endif ++ return next; ++} ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ p->last_ran = rq->clock_task; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (p != rq->idle) ++ hrtick_start(rq, US_TO_NS(p->time_slice)); ++#endif ++ /* update rq->dither */ ++ rq->dither = rq_dither(rq); ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). 
They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which PDS doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (signal_pending_state(prev->state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_deadline(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ set_rq_task(rq, next); ++ ++ if (prev != next) { ++ if (next->prio == PRIO_LIMIT) ++ schedstat_inc(rq->sched_goidle); ++ ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ rq->nr_switches++; ++ ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++#ifdef CONFIG_SCHED_SMT ++ pds_sg_balance_check(rq); ++#endif ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state || tsk_is_pi_blocked(tsk) || ++ signal_pending_state(tsk->state, tsk)) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. 
++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. 
++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void ++check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* ++ * Trigger changes when task priority/deadline modified. ++ */ ++ if (task_on_rq_queued(p)) { ++ struct task_struct *first; ++ ++ requeue_task(p, rq); ++ ++ /* Resched if first queued task not running and not IDLE */ ++ if ((first = rq_first_queued_task(rq)) != rq->curr && ++ !task_running_idle(first)) ++ resched_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. 
++ */
++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
++{
++	int prio;
++	struct rq *rq;
++	raw_spinlock_t *lock;
++
++	/* XXX used to be waiter->prio, not waiter->task->prio */
++	prio = __rt_effective_prio(pi_task, p->normal_prio);
++
++	/*
++	 * If nothing changed; bail early.
++	 */
++	if (p->pi_top_task == pi_task && prio == p->prio)
++		return;
++
++	rq = __task_access_lock(p, &lock);
++	/*
++	 * Set under pi_lock && rq->lock, such that the value can be used under
++	 * either lock.
++	 *
++	 * Note that there is loads of tricky to make this pointer cache work
++	 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
++	 * ensure a task is de-boosted (pi_task is set to NULL) before the
++	 * task is allowed to run again (and can exit). This ensures the pointer
++	 * points to a blocked task -- which guarantees the task is present.
++	 */
++	p->pi_top_task = pi_task;
++
++	/*
++	 * For FIFO/RR we only need to set prio, if that matches we're done.
++	 */
++	if (prio == p->prio)
++		goto out_unlock;
++
++	/*
++	 * Idle task boosting is a no-no in general. There is one
++	 * exception, when PREEMPT_RT and NOHZ is active:
++	 *
++	 * The idle task calls get_next_timer_interrupt() and holds
++	 * the timer wheel base->lock on the CPU and another CPU wants
++	 * to access the timer (probably to cancel it). We can safely
++	 * ignore the boosting request, as the idle CPU runs this code
++	 * with interrupts disabled and will complete the lock
++	 * protected section without being interrupted. So there is no
++	 * real need to boost.
++	 */
++	if (unlikely(p == rq->idle)) {
++		WARN_ON(p != rq->curr);
++		WARN_ON(p->pi_blocked_on);
++		goto out_unlock;
++	}
++
++	trace_sched_pi_setprio(p, pi_task);
++	p->prio = prio;
++	update_task_priodl(p);
++
++	check_task_changed(rq, p);
++
++out_unlock:
++	__task_access_unlock(p, lock);
++}
++#else
++static inline int rt_effective_prio(struct task_struct *p, int prio)
++{
++	return prio;
++}
++#endif
++
++void set_user_nice(struct task_struct *p, long nice)
++{
++	int new_static;
++	unsigned long flags;
++	struct rq *rq;
++	raw_spinlock_t *lock;
++
++	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
++		return;
++	new_static = NICE_TO_PRIO(nice);
++	/*
++	 * We have to be careful, if called from sys_setpriority(),
++	 * the task might be in the middle of scheduling on another CPU.
++	 */
++	raw_spin_lock_irqsave(&p->pi_lock, flags);
++	rq = __task_access_lock(p, &lock);
++
++	/* rq lock may not be held!! */
++	update_rq_clock(rq);
++
++	p->static_prio = new_static;
++	/*
++	 * The RT priorities are set via sched_setscheduler(), but we still
++	 * allow the 'normal' nice value to be set - but as expected
++	 * it won't have any effect on scheduling as long as the task
++	 * is not SCHED_NORMAL/SCHED_BATCH:
++	 */
++	if (task_has_rt_policy(p))
++		goto out_unlock;
++
++	p->deadline -= task_deadline_diff(p);
++	p->deadline += static_deadline_diff(new_static);
++	p->prio = effective_prio(p);
++	update_task_priodl(p);
++
++	check_task_changed(rq, p);
++out_unlock:
++	__task_access_unlock(p, lock);
++	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
++}
++EXPORT_SYMBOL(set_user_nice);
++
++/*
++ * can_nice - check if a task can reduce its nice value
++ * @p: task
++ * @nice: nice value
++ */
++int can_nice(const struct task_struct *p, const int nice)
++{
++	/* Convert nice value [19,-20] to rlimit style value [1,40] */
++	int nice_rlim = nice_to_rlimit(nice);
++
++	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
++		capable(CAP_SYS_NICE));
++}
++
++#ifdef __ARCH_WANT_SYS_NICE
++
++/*
++ * sys_nice - change the priority of the current process.
++ * @increment: priority increment
++ *
++ * sys_setpriority is a more generic, but much slower function that
++ * does similar things.
++ */
++SYSCALL_DEFINE1(nice, int, increment)
++{
++	long nice, retval;
++
++	/*
++	 * Setpriority might change our priority at the same moment.
++	 * We don't have to worry. Conceptually one call occurs first
++	 * and we have a single winner.
++	 */
++
++	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
++	nice = task_nice(current) + increment;
++
++	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
++	if (increment < 0 && !can_nice(current, nice))
++		return -EPERM;
++
++	retval = security_task_setnice(current, nice);
++	if (retval)
++		return retval;
++
++	set_user_nice(current, nice);
++	return 0;
++}
++
++#endif
++
++/**
++ * task_prio - return the priority value of a given task.
++ * @p: the task in question.
++ *
++ * Return: The priority value as seen by users in /proc.
++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE).
++ */
++int task_prio(const struct task_struct *p)
++{
++	int level, prio = p->prio - MAX_RT_PRIO;
++	static const int level_to_nice_prio[] = {39, 33, 26, 20, 14, 7, 0, 0};
++
++	/* rt tasks */
++	if (prio <= 0)
++		goto out;
++
++	preempt_disable();
++	level = task_deadline_level(p, this_rq());
++	preempt_enable();
++	prio += level_to_nice_prio[level];
++	if (idleprio_task(p))
++		prio += NICE_WIDTH;
++out:
++	return prio;
++}
++
++/**
++ * idle_cpu - is a given CPU idle currently?
++ * @cpu: the processor in question.
++ *
++ * Return: 1 if the CPU is currently idle. 0 otherwise.
++ */
++int idle_cpu(int cpu)
++{
++	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
++}
++
++/**
++ * idle_task - return the idle task for a given CPU.
++ * @cpu: the processor in question.
++ *
++ * Return: The idle task for the cpu @cpu.
++ */
++struct task_struct *idle_task(int cpu)
++{
++	return cpu_rq(cpu)->idle;
++}
++
++/**
++ * find_process_by_pid - find a process with a matching PID value.
++ * @pid: the pid in question.
++ *
++ * The task of @pid, if found. %NULL otherwise.
++ */
++static inline struct task_struct *find_process_by_pid(pid_t pid)
++{
++	return pid ?
find_task_by_vpid(pid) : current; ++} ++ ++#ifdef CONFIG_SMP ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. 
++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif ++ ++static u64 task_init_deadline(const struct task_struct *p) ++{ ++ return task_rq(p)->clock + task_deadline_diff(p); ++} ++ ++u64 (* task_init_deadline_func_tbl[])(const struct task_struct *p) = { ++ task_init_deadline, /* SCHED_NORMAL */ ++ NULL, /* SCHED_FIFO */ ++ NULL, /* SCHED_RR */ ++ task_init_deadline, /* SCHED_BATCH */ ++ NULL, /* SCHED_ISO */ ++ task_init_deadline /* SCHED_IDLE */ ++}; ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int old_policy = p->policy; ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++ ++ if (old_policy != policy) ++ p->deadline = (task_init_deadline_func_tbl[p->policy])? ++ task_init_deadline_func_tbl[p->policy](p):0ULL; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
++ */
++	p->prio = normal_prio(p);
++	if (keep_boost)
++		p->prio = rt_effective_prio(p, p->prio);
++	update_task_priodl(p);
++}
++
++/*
++ * check the target process has a UID that matches the current process's
++ */
++static bool check_same_owner(struct task_struct *p)
++{
++	const struct cred *cred = current_cred(), *pcred;
++	bool match;
++
++	rcu_read_lock();
++	pcred = __task_cred(p);
++	match = (uid_eq(cred->euid, pcred->euid) ||
++		 uid_eq(cred->euid, pcred->uid));
++	rcu_read_unlock();
++	return match;
++}
++
++static int
++__sched_setscheduler(struct task_struct *p,
++		     const struct sched_attr *attr, bool user, bool pi)
++{
++	const struct sched_attr dl_squash_attr = {
++		.size = sizeof(struct sched_attr),
++		.sched_policy = SCHED_FIFO,
++		.sched_nice = 0,
++		.sched_priority = 99,
++	};
++	int newprio = MAX_RT_PRIO - 1 - attr->sched_priority;
++	int retval, oldpolicy = -1;
++	int policy = attr->sched_policy;
++	unsigned long flags;
++	struct rq *rq;
++	int reset_on_fork;
++	raw_spinlock_t *lock;
++
++	/* The pi code expects interrupts enabled */
++	BUG_ON(pi && in_interrupt());
++
++	/*
++	 * PDS supports SCHED_DEADLINE by squashing it as prio 0 SCHED_FIFO
++	 */
++	if (unlikely(SCHED_DEADLINE == policy)) {
++		attr = &dl_squash_attr;
++		policy = attr->sched_policy;
++		newprio = MAX_RT_PRIO - 1 - attr->sched_priority;
++	}
++recheck:
++	/* Double check policy once rq lock held */
++	if (policy < 0) {
++		reset_on_fork = p->sched_reset_on_fork;
++		policy = oldpolicy = p->policy;
++	} else {
++		reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK);
++
++		if (policy > SCHED_IDLE)
++			return -EINVAL;
++	}
++
++	if (attr->sched_flags & ~(SCHED_FLAG_ALL))
++		return -EINVAL;
++
++	/*
++	 * Valid priorities for SCHED_FIFO and SCHED_RR are
++	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
++	 * SCHED_BATCH and SCHED_IDLE is 0.
++	 */
++	if (attr->sched_priority < 0 ||
++	    (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) ||
++	    (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1))
++		return -EINVAL;
++	if ((SCHED_RR == policy || SCHED_FIFO == policy) !=
++	    (attr->sched_priority != 0))
++		return -EINVAL;
++
++	/*
++	 * Allow unprivileged RT tasks to decrease priority:
++	 */
++	if (user && !capable(CAP_SYS_NICE)) {
++		if (SCHED_FIFO == policy || SCHED_RR == policy) {
++			unsigned long rlim_rtprio =
++				task_rlimit(p, RLIMIT_RTPRIO);
++
++			/* Can't set/change the rt policy */
++			if (policy != p->policy && !rlim_rtprio)
++				return -EPERM;
++
++			/* Can't increase priority */
++			if (attr->sched_priority > p->rt_priority &&
++			    attr->sched_priority > rlim_rtprio)
++				return -EPERM;
++		}
++
++		/* Can't change other user's priorities */
++		if (!check_same_owner(p))
++			return -EPERM;
++
++		/* Normal users shall not reset the sched_reset_on_fork flag */
++		if (p->sched_reset_on_fork && !reset_on_fork)
++			return -EPERM;
++	}
++
++	if (user) {
++		retval = security_task_setscheduler(p);
++		if (retval)
++			return retval;
++	}
++
++	if (pi)
++		cpuset_read_lock();
++
++	/*
++	 * Make sure no PI-waiters arrive (or leave) while we are
++	 * changing the priority of the task:
++	 */
++	raw_spin_lock_irqsave(&p->pi_lock, flags);
++
++	/*
++	 * To be able to change p->policy safely, task_access_lock()
++	 * must be called.
++	 * If task_access_lock() is used here:
++	 * For the task p which is not running, reading rq->stop is
++	 * racy but acceptable as ->stop doesn't change much.
++	 * An enhancement can be made to read rq->stop safely.
++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. ++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. 
++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. 
++ */
++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
++		unsigned int, flags)
++{
++	struct sched_attr attr;
++	struct task_struct *p;
++	int retval;
++
++	if (!uattr || pid < 0 || flags)
++		return -EINVAL;
++
++	retval = sched_copy_attr(uattr, &attr);
++	if (retval)
++		return retval;
++
++	if ((int)attr.sched_policy < 0)
++		return -EINVAL;
++
++	rcu_read_lock();
++	retval = -ESRCH;
++	p = find_process_by_pid(pid);
++	if (p != NULL)
++		retval = sched_setattr(p, &attr);
++	rcu_read_unlock();
++
++	return retval;
++}
++
++/**
++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
++ * @pid: the pid in question.
++ *
++ * Return: On success, the policy of the thread. Otherwise, a negative error
++ * code.
++ */
++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
++{
++	struct task_struct *p;
++	int retval = -EINVAL;
++
++	if (pid < 0)
++		goto out_nounlock;
++
++	retval = -ESRCH;
++	rcu_read_lock();
++	p = find_process_by_pid(pid);
++	if (p) {
++		retval = security_task_getscheduler(p);
++		if (!retval)
++			retval = p->policy;
++	}
++	rcu_read_unlock();
++
++out_nounlock:
++	return retval;
++}
++
++/**
++ * sys_sched_getparam - get the RT priority of a thread
++ * @pid: the pid in question.
++ * @param: structure containing the RT priority.
++ *
++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
++ * code.
++ */
++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
++{
++	struct sched_param lp = { .sched_priority = 0 };
++	struct task_struct *p;
++	int retval = -EINVAL;
++
++	if (!param || pid < 0)
++		goto out_nounlock;
++
++	rcu_read_lock();
++	p = find_process_by_pid(pid);
++	retval = -ESRCH;
++	if (!p)
++		goto out_unlock;
++
++	retval = security_task_getscheduler(p);
++	if (retval)
++		goto out_unlock;
++
++	if (task_has_rt_policy(p))
++		lp.sched_priority = p->rt_priority;
++	rcu_read_unlock();
++
++	/*
++	 * This one might sleep, we cannot do it with a spinlock held ...
++	 */
++	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
++
++out_nounlock:
++	return retval;
++
++out_unlock:
++	rcu_read_unlock();
++	return retval;
++}
++
++/*
++ * Copy the kernel size attribute structure (which might be larger
++ * than what user-space knows about) to user-space.
++ *
++ * Note that all cases are valid: user-space buffer can be larger or
++ * smaller than the kernel-space buffer. The usual case is that both
++ * have the same size.
++ */
++static int
++sched_attr_copy_to_user(struct sched_attr __user *uattr,
++			struct sched_attr *kattr,
++			unsigned int usize)
++{
++	unsigned int ksize = sizeof(*kattr);
++
++	if (!access_ok(uattr, usize))
++		return -EFAULT;
++
++	/*
++	 * sched_getattr() ABI forwards and backwards compatibility:
++	 *
++	 * If usize == ksize then we just copy everything to user-space and all is good.
++	 *
++	 * If usize < ksize then we only copy as much as user-space has space for,
++	 * this keeps ABI compatibility as well. We skip the rest.
++	 *
++	 * If usize > ksize then user-space is using a newer version of the ABI,
++	 * which part the kernel doesn't know about. Just ignore it - tooling can
++	 * detect the kernel's knowledge of attributes from the attr->size value
++	 * which is set to ksize in this case.
++	 */
++	kattr->size = min(usize, ksize);
++
++	if (copy_to_user(uattr, kattr, kattr->size))
++		return -EFAULT;
++
++	return 0;
++}
++
++/**
++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
++ * @pid: the pid in question.
++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_mask, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_mask); ++ cpumask_and(new_mask, in_mask, cpus_mask); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_mask); ++ if (!cpumask_subset(new_mask, cpus_mask)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_mask to the ++ * cpuset's cpus_mask ++ */ ++ cpumask_copy(new_mask, cpus_mask); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_mask); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ if (sched_yield_type > 1) { ++ time_slice_expired(current, rq); ++ requeue_task(current, rq); ++ } ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In PDS, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(MS_TO_NS(rr_interval)); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL, KERN_INFO); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* PDS TODO: should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->deadline = rq_clock(rq) + task_deadline_diff(idle); ++ update_task_priodl(idle); ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. 
++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++static bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. 
++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(), default_cpu = -1; ++ struct cpumask *mask; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { ++ if (!idle_cpu(cpu)) ++ return cpu; ++ default_cpu = cpu; ++ } ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_chk_end_masks, cpu); mask++) ++ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) ++ if (!idle_cpu(i)) ++ return i; ++ ++ if (default_cpu == -1) ++ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ cpu = default_cpu; ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(current != this_rq()->idle); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ finish_arch_post_lock_switch(); ++ } ++ ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ struct skiplist_node *node; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ node = &rq->sl_header; ++ while ((node = node->next[0]) != &rq->sl_header) { ++ int dest_cpu; ++ ++ p = skiplist_entry(node, struct task_struct, sl_node); ++ ++ /* skip the running task */ ++ if (task_running(p)) ++ continue; ++ ++ /* ++ * Rules for changing task_struct::cpus_mask are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. 
++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ node = &rq->sl_header; ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++#ifdef CONFIG_SCHED_DEBUG ++ ++static __read_mostly int sched_debug_enabled; ++ ++static int __init sched_debug_setup(char *str) ++{ ++ sched_debug_enabled = 1; ++ ++ return 0; ++} ++early_param("sched_debug", sched_debug_setup); ++ ++static inline bool sched_debug(void) ++{ ++ return sched_debug_enabled; ++} ++#else /* !CONFIG_SCHED_DEBUG */ ++static inline bool sched_debug(void) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_DEBUG */ ++ ++#ifdef CONFIG_SMP ++void send_call_function_single_ipi(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (!set_nr_if_polling(rq->idle)) ++ arch_send_call_function_single_ipi(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void sched_ttwu_pending(void *arg) ++{ ++ struct llist_node *llist = arg; ++ struct rq *rq = this_rq(); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ /* ++ * rq::ttwu_pending racy indication of out-standing wakeups. ++ * Races such that false-negatives are possible, since they ++ * are shorter lived that false-positives would be. ++ */ ++ WRITE_ONCE(rq->ttwu_pending, 0); ++ ++ rq_lock_irqsave(rq, &rf); ++ update_rq_clock(rq); ++ ++ /*llist_for_each_entry_safe(p, t, llist, wake_entry) ++ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);*/ ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Topology list, bottom-up. 
++ */ ++static struct sched_domain_topology_level default_topology[] = { ++#ifdef CONFIG_SCHED_SMT ++ { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, ++#endif ++#ifdef CONFIG_SCHED_MC ++ { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, ++#endif ++ { cpu_cpu_mask, SD_INIT_NAME(DIE) }, ++ { NULL, }, ++}; ++ ++static struct sched_domain_topology_level *sched_domain_topology = ++ default_topology; ++ ++#define for_each_sd_topology(tl) \ ++ for (tl = sched_domain_topology; tl->mask; tl++) ++ ++void set_sched_topology(struct sched_domain_topology_level *tl) ++{ ++ if (WARN_ON_ONCE(sched_smp_initialized)) ++ return; ++ ++ sched_domain_topology = tl; ++} ++ ++/* ++ * Initializers for schedule domains ++ * Non-inlined to reduce accumulated stack pressure in build_sched_domains() ++ */ ++ ++int sched_domain_level_max; ++ ++/* ++ * Partition sched domains as specified by the 'ndoms_new' ++ * cpumasks in the array doms_new[] of cpumasks. This compares ++ * doms_new[] to the current sched domain partitioning, doms_cur[]. ++ * It destroys each deleted domain and builds each new domain. ++ * ++ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. ++ * The masks don't intersect (don't overlap.) We should setup one ++ * sched domain for each mask. CPUs not in any of the cpumasks will ++ * not be load balanced. If the same cpumask appears both in the ++ * current 'doms_cur' domains and in the new 'doms_new', we can leave ++ * it as it is. ++ * ++ * The passed in 'doms_new' should be allocated using ++ * alloc_sched_domains. This routine takes ownership of it and will ++ * free_sched_domains it when done with it. If the caller failed the ++ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, ++ * and partition_sched_domains() will fallback to the single partition ++ * 'fallback_doms', it also forces the domains to be rebuilt. ++ * ++ * If doms_new == NULL it will be replaced with cpu_online_mask. ++ * ndoms_new == 0 is a special case for destroying existing domains, ++ * and it will not create the default domain. ++ * ++ * Call with hotplug lock held ++ */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{ ++ /** ++ * PDS doesn't depend on sched domains, but just keep this api ++ */ ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++/* ++ * sched_numa_find_closest() - given the NUMA topology, find the cpu ++ * closest to @cpu from @cpumask. ++ * cpumask: cpumask to find a cpu from ++ * cpu: cpu to be close to ++ * ++ * returns: cpu, or nr_cpu_ids when nothing found. ++ */ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. 
++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ sched_tick_stop(cpu); ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_start_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = ++ &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[1]); ++ } ++} ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ chk = &(per_cpu(sched_cpu_affinity_chk_masks, cpu)[0]); ++ ++#ifdef CONFIG_SCHED_SMT ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, topology_sibling_cpumask(cpu))) { ++ per_cpu(sched_sibling_cpu, cpu) = cpumask_first(chk); ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - smt 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++#endif ++#ifdef CONFIG_SCHED_MC ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++ if (cpumask_and(chk, chk, cpu_coregroup_mask(cpu))) { ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - coregroup 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ } ++ cpumask_complement(chk, cpu_coregroup_mask(cpu)); ++ ++ /** ++ * Set up sd_llc_id per CPU ++ */ ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(cpu_coregroup_mask(cpu)); ++#else ++ per_cpu(sd_llc_id, cpu) = ++ cpumask_first(topology_core_cpumask(cpu)); ++ ++ per_cpu(sched_cpu_llc_start_mask, cpu) = chk; ++ ++ cpumask_setall(chk); ++ cpumask_clear_cpu(cpu, chk); ++#endif /* NOT CONFIG_SCHED_MC */ ++ if (cpumask_and(chk, chk, topology_core_cpumask(cpu))) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - core 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ cpumask_complement(chk, topology_core_cpumask(cpu)); ++ ++ if (cpumask_and(chk, chk, cpu_online_mask)) ++ printk(KERN_INFO "pds: cpu #%d affinity check mask - others 0x%08lx", ++ cpu, (chk++)->bits[0]); ++ ++ per_cpu(sched_cpu_affinity_chk_end_masks, cpu) = chk; ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ cpumask_copy(&sched_rq_queued_masks[SCHED_RQ_EMPTY], cpu_online_mask); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) 
++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ print_scheduler_version(); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < NR_SCHED_RQ_QUEUED_LEVEL; i++) ++ cpumask_clear(&sched_rq_queued_masks[i]); ++ cpumask_setall(&sched_rq_queued_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_queued_masks_bitmap); ++ ++ cpumask_setall(&sched_rq_pending_masks[SCHED_RQ_EMPTY]); ++ set_bit(SCHED_RQ_EMPTY, sched_rq_pending_masks_bitmap); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); ++ raw_spin_lock_init(&rq->lock); ++ rq->dither = 0; ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++ rq->queued_level = SCHED_RQ_EMPTY; ++ rq->pending_level = SCHED_RQ_EMPTY; ++#ifdef CONFIG_SCHED_SMT ++ per_cpu(sched_sibling_cpu, i) = i; ++ rq->active_balance = 0; ++#endif ++#endif ++ rq->nr_switches = 0; ++ atomic_set(&rq->nr_iowait, 0); ++ hrtick_rq_init(rq); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. 
++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. 
Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/pds_sched.h b/kernel/sched/pds_sched.h +new file mode 100644 +index 000000000000..6c3361f06087 +--- /dev/null ++++ b/kernel/sched/pds_sched.h +@@ -0,0 +1,577 @@ ++#ifndef PDS_SCHED_H ++#define PDS_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++/* ++ * rq::clock_update_flags bits ++ */ ++#define RQCF_REQ_SKIP 0x01 ++#define RQCF_ACT_SKIP 
0x02 ++#define RQCF_UPDATED 0x04 ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct __rcu *curr; ++ struct task_struct *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ struct skiplist_node sl_header; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ unsigned int ttwu_pending; ++ unsigned int clock_update_flags; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++ struct sched_avg avg_thermal; ++#endif ++ ++ unsigned long queued_level; ++ unsigned long pending_level; ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 clock_task; ++ int dither; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++#define task_contributes_to_load(task) ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ ++ (task->flags & PF_FROZEN) == 0 && \ ++ (task->state & TASK_NOLOAD) == 0) ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++#endif /* CONFIG_SMP */ ++ ++#ifndef arch_scale_freq_tick ++static __always_inline ++void arch_scale_freq_tick(void) ++{ ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/** ++ * By default the decay is the default pelt decay period. ++ * The decay shift can change the decay period in ++ * multiples of 32. ++ * Decay shift Decay period(ms) ++ * 0 32 ++ * 1 64 ++ * 2 128 ++ * 3 256 ++ * 4 512 ++ */ ++extern int sched_thermal_decay_shift; ++ ++static inline u64 rq_clock_thermal(struct rq *rq) ++{ ++ return rq_clock_task(rq) >> sched_thermal_decay_shift; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : PDS need to support these APIs for compatibility with mainline ++ * scheduler code. 
++ */ ++struct rq_flags { ++ unsigned long flags; ++ struct pin_cookie cookie; ++ unsigned int clock_update_flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) ++{ ++ rf->cookie = lockdep_pin_lock(&rq->lock); ++ ++#ifdef CONFIG_SCHED_DEBUG ++ rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); ++ rf->clock_update_flags = 0; ++#endif ++} ++ ++static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ if (rq->clock_update_flags > RQCF_ACT_SKIP) ++ rf->clock_update_flags = RQCF_UPDATED; ++#endif ++ ++ lockdep_unpin_lock(&rq->lock, rf->cookie); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(&rq->lock, rf->flags); ++ rq_pin_lock(rq, rf); ++} ++ ++static inline void ++rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ rq_unpin_lock(rq, rf); ++ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline void ++rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ rq_unpin_lock(rq, rf); ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++extern void flush_smp_call_function_from_idle(void); ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. 
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++ ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) ++{ ++ if (cpu_of(rq) == smp_processor_id()) ++ cpufreq_update_util(rq, flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. :( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++void swake_up_all_locked(struct swait_queue_head *q); ++void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++#endif /* PDS_SCHED_H */ +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index b647d04d9c8b..05b6cfd91842 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -250,6 +250,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_PDS + /* + * sched_entity: + * +@@ -367,6 +368,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + + #ifdef CONFIG_SCHED_THERMAL_PRESSURE + /* +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index eb034d9f024d..a074572f2976 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,11 +1,13 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_PDS + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + + #ifdef CONFIG_SCHED_THERMAL_PRESSURE + int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); +@@ -37,6 +39,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_PDS + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
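Editor's illustration — not part of the patch: the struct rq_flags / rq_lock_irqsave() / rq_unlock_irqrestore() helpers added in pds_sched.h above are meant to be used in matched pairs around short critical sections, much like the mainline scheduler's rq_lock API. A minimal sketch under that assumption (kernel context, pds_sched.h included; the function name example_read_rq_clock() is hypothetical):

/* Hypothetical helper; illustrates the lock/unlock pairing only. */
static u64 example_read_rq_clock(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	struct rq_flags rf;
	u64 clock;

	rq_lock_irqsave(rq, &rf);
	/* rq->lock is held, interrupts are disabled; rf carries the saved flags */
	clock = rq_clock(rq);
	rq_unlock_irqrestore(rq, &rf);

	return clock;
}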
+@@ -157,9 +160,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_PDS */ + + #else + ++#ifndef CONFIG_SCHED_PDS + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -188,6 +193,7 @@ static inline u64 thermal_load_avg(struct rq *rq) + { + return 0; + } ++#endif + + static inline int + update_irq_load_avg(struct rq *rq, u64 running) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index db3a57675ccf..5a8060bd2343 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_PDS ++#include "pds_sched.h" ++#else ++ + #include + + #include +@@ -2546,3 +2550,5 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) + + void swake_up_all_locked(struct swait_queue_head *q); + void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++#endif /* !CONFIG_SCHED_PDS */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..45bd43942575 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_PDS + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index 8a176d8727a3..b9dde576b576 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -130,9 +130,13 @@ static int __maybe_unused four = 4; + static unsigned long zero_ul; + static unsigned long one_ul = 1; + static unsigned long long_max = LONG_MAX; +-static int one_hundred = 100; +-static int two_hundred = 200; +-static int one_thousand = 1000; ++static int __read_mostly one_hundred = 100; ++static int __read_mostly two_hundred = 200; ++static int __read_mostly one_thousand = 1000; ++#ifdef CONFIG_SCHED_PDS ++extern int rr_interval; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -288,7 +292,7 @@ static struct ctl_table sysctl_base_table[] = { + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_PDS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -305,6 +309,7 @@ static int max_extfrag_threshold = 1000; + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_PDS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -486,6 +491,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_PDS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1049,6 +1055,26 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_PDS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ 
.mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ONE, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 2fd3b3fa68bf..6f3b08afdd4c 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -236,7 +236,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -806,6 +806,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_PDS + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -813,6 +814,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -840,8 +842,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -855,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. 
*/ +@@ -1091,8 +1095,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_PDS + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index b5e3496cf803..0816db0b9c16 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_PDS ++ /* No deadline on BFS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux-tkg/linux-tkg-patches/5.8/0006-add-acs-overrides_iommu.patch b/linux-tkg/linux-tkg-patches/5.8/0006-add-acs-overrides_iommu.patch new file mode 100644 index 0000000..d1303a5 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0006-add-acs-overrides_iommu.patch @@ -0,0 +1,193 @@ +From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 +From: Mark Weiman +Date: Sun, 12 Aug 2018 11:36:21 -0400 +Subject: [PATCH] pci: Enable overrides for missing ACS capabilities + +This an updated version of Alex Williamson's patch from: +https://lkml.org/lkml/2013/5/30/513 + +Original commit message follows: + +PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that +allows us to control whether transactions are allowed to be redirected +in various subnodes of a PCIe topology. For instance, if two +endpoints are below a root port or downsteam switch port, the +downstream port may optionally redirect transactions between the +devices, bypassing upstream devices. The same can happen internally +on multifunction devices. The transaction may never be visible to the +upstream devices. + +One upstream device that we particularly care about is the IOMMU. If +a redirection occurs in the topology below the IOMMU, then the IOMMU +cannot provide isolation between devices. This is why the PCIe spec +encourages topologies to include ACS support. Without it, we have to +assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. + +Unfortunately, far too many topologies do not support ACS to make this +a steadfast requirement. Even the latest chipsets from Intel are only +sporadically supporting ACS. We have trouble getting interconnect +vendors to include the PCIe spec required PCIe capability, let alone +suggested features. + +Therefore, we need to add some flexibility. The pcie_acs_override= +boot option lets users opt-in specific devices or sets of devices to +assume ACS support. The "downstream" option assumes full ACS support +on root ports and downstream switch ports. The "multifunction" +option assumes the subset of ACS features available on multifunction +endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" +option enables ACS support on devices matching the provided vendor +and device IDs, allowing more strategic ACS overrides. These options +may be combined in any order. A maximum of 16 id specific overrides +are available. It's suggested to use the most limited set of options +necessary to avoid completely disabling ACS across the topology. +Note to hardware vendors, we have facilities to permanently quirk +specific devices which enforce isolation but not provide an ACS +capability. 
Please contact me to have your devices added and save +your customers the hassle of this boot option. + +Signed-off-by: Mark Weiman +--- + .../admin-guide/kernel-parameters.txt | 9 ++ + drivers/pci/quirks.c | 101 ++++++++++++++++++ + 2 files changed, 110 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index aefd358a5ca3..173b3596fd9e 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -3190,6 +3190,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multifunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specific device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index 4700d24e5d55..8f7a3d7fd9c1 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case 
PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. + * The device will throw a Link Down error on AER-capable systems and +@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, + /* Zhaoxin Root/Downstream Ports */ + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + + diff --git a/linux-tkg/linux-tkg-patches/5.8/0007-v5.8-fsync.patch b/linux-tkg/linux-tkg-patches/5.8/0007-v5.8-fsync.patch new file mode 100644 index 0000000..01c86d8 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0007-v5.8-fsync.patch @@ -0,0 +1,908 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 20 Apr 2020 14:09:11 +0200 +Subject: Import Fsync v3 patchset - Squashed from https://gitlab.collabora.com/tonyk/linux/-/commits/futex-proton-v3 + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e2ee527be1e3e11b1117ff5bf94b4..580001e89c6caed57dd8b3cb491d65dce846caff 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,6 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 ++#define FUTEX_WAIT_MULTIPLE 13 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +@@ -40,6 +41,8 @@ + FUTEX_PRIVATE_FLAG) + #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) ++#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ ++ FUTEX_PRIVATE_FLAG) + + /* + * Support for robust futexes: the kernel cleans up held futexes at +@@ -150,4 +153,21 @@ struct robust_list_head { + (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ + | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) + ++/* ++ * Maximum number of multiple futexes to wait for ++ */ ++#define FUTEX_MULTIPLE_MAX_COUNT 128 ++ ++/** ++ * struct futex_wait_block - Block of futexes to be waited for ++ * @uaddr: User address of the futex ++ * @val: Futex value expected by userspace ++ * @bitset: Bitset for the optional bitmasked wakeup ++ */ ++struct futex_wait_block { ++ __u32 __user *uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ + #endif /* _UAPI_LINUX_FUTEX_H */ +diff --git a/kernel/futex.c b/kernel/futex.c +index 0cf84c8664f207c574325b899ef2e57f01295a94..58cf9eb2b851b4858e29b5ef4114a29a92e676ba 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -215,6 +215,8 @@ struct futex_pi_state { + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup ++ * @uaddr: userspace address of futex ++ * @uval: expected futex's value + * + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so + * we can wake only the relevant ones (hashed queues may be shared). 
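Editor's illustration — not part of the patch: based on the uapi additions above (FUTEX_WAIT_MULTIPLE, struct futex_wait_block) and the selftest wrapper further down, userspace passes an array of wait blocks as the futex address and the block count as the value argument. A minimal, hypothetical userspace sketch; the wrapper name wait_any_futex() and the fallback definitions for headers that predate this patch are assumptions:

#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

#ifndef FUTEX_WAIT_MULTIPLE
#define FUTEX_WAIT_MULTIPLE	13	/* mirrors the uapi value added above */
struct futex_wait_block {
	uint32_t *uaddr;
	uint32_t val;
	uint32_t bitset;
};
#endif

/*
 * Block on every futex in blocks[]. Returns the index of the futex that was
 * woken (a hint), or -1 with errno set (ETIMEDOUT, EWOULDBLOCK, ...).
 * The timeout, if given, is an absolute CLOCK_REALTIME time, as in the
 * futex_wait_timeout selftest below.
 */
static long wait_any_futex(struct futex_wait_block *blocks, unsigned int count,
			   const struct timespec *abs_timeout)
{
	return syscall(SYS_futex, blocks,
		       FUTEX_WAIT_MULTIPLE | FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME,
		       count, abs_timeout, NULL, 0);
}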
+@@ -237,6 +239,8 @@ struct futex_q { + struct rt_mutex_waiter *rt_waiter; + union futex_key *requeue_pi_key; + u32 bitset; ++ u32 __user *uaddr; ++ u32 uval; + } __randomize_layout; + + static const struct futex_q futex_q_init = { +@@ -2420,6 +2424,29 @@ static int unqueue_me(struct futex_q *q) + return ret; + } + ++/** ++ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket ++ * @q: The list of futexes to unqueue ++ * @count: Number of futexes in the list ++ * ++ * Helper to unqueue a list of futexes. This can't fail. ++ * ++ * Return: ++ * - >=0 - Index of the last futex that was awoken; ++ * - -1 - If no futex was awoken ++ */ ++static int unqueue_multiple(struct futex_q *q, int count) ++{ ++ int ret = -1; ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ if (!unqueue_me(&q[i])) ++ ret = i; ++ } ++ return ret; ++} ++ + /* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry +@@ -2783,6 +2810,211 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + return ret; + } + ++/** ++ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes ++ * @qs: The corresponding futex list ++ * @count: The size of the lists ++ * @flags: Futex flags (FLAGS_SHARED, etc.) ++ * @awaken: Index of the last awoken futex ++ * ++ * Prepare multiple futexes in a single step and enqueue them. This may fail if ++ * the futex list is invalid or if any futex was already awoken. On success the ++ * task is ready to interruptible sleep. ++ * ++ * Return: ++ * - 1 - One of the futexes was awaken by another thread ++ * - 0 - Success ++ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL ++ */ ++static int futex_wait_multiple_setup(struct futex_q *qs, int count, ++ unsigned int flags, int *awaken) ++{ ++ struct futex_hash_bucket *hb; ++ int ret, i; ++ u32 uval; ++ ++ /* ++ * Enqueuing multiple futexes is tricky, because we need to ++ * enqueue each futex in the list before dealing with the next ++ * one to avoid deadlocking on the hash bucket. But, before ++ * enqueuing, we need to make sure that current->state is ++ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which ++ * cannot be done before the get_futex_key of the next key, ++ * because it calls get_user_pages, which can sleep. Thus, we ++ * fetch the list of futexes keys in two steps, by first pinning ++ * all the memory keys in the futex key, and only then we read ++ * each key and queue the corresponding futex. ++ */ ++retry: ++ for (i = 0; i < count; i++) { ++ qs[i].key = FUTEX_KEY_INIT; ++ ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, ++ &qs[i].key, FUTEX_READ); ++ if (unlikely(ret)) { ++ for (--i; i >= 0; i--) ++ put_futex_key(&qs[i].key); ++ return ret; ++ } ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ struct futex_q *q = &qs[i]; ++ ++ hb = queue_lock(q); ++ ++ ret = get_futex_value_locked(&uval, q->uaddr); ++ if (ret) { ++ /* ++ * We need to try to handle the fault, which ++ * cannot be done without sleep, so we need to ++ * undo all the work already done, to make sure ++ * we don't miss any wake ups. Therefore, clean ++ * up, handle the fault and retry from the ++ * beginning. ++ */ ++ queue_unlock(hb); ++ ++ /* ++ * Keys 0..(i-1) are implicitly put ++ * on unqueue_multiple. 
++ */ ++ put_futex_key(&q->key); ++ ++ *awaken = unqueue_multiple(qs, i); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * On a real fault, prioritize the error even if ++ * some other futex was awoken. Userspace gave ++ * us a bad address, -EFAULT them. ++ */ ++ ret = get_user(uval, q->uaddr); ++ if (ret) ++ return ret; ++ ++ /* ++ * Even if the page fault was handled, If ++ * something was already awaken, we can safely ++ * give up and succeed to give a hint for userspace to ++ * acquire the right futex faster. ++ */ ++ if (*awaken >= 0) ++ return 1; ++ ++ goto retry; ++ } ++ ++ if (uval != q->uval) { ++ queue_unlock(hb); ++ ++ put_futex_key(&qs[i].key); ++ ++ /* ++ * If something was already awaken, we can ++ * safely ignore the error and succeed. ++ */ ++ *awaken = unqueue_multiple(qs, i); ++ __set_current_state(TASK_RUNNING); ++ if (*awaken >= 0) ++ return 1; ++ ++ return -EWOULDBLOCK; ++ } ++ ++ /* ++ * The bucket lock can't be held while dealing with the ++ * next futex. Queue each futex at this moment so hb can ++ * be unlocked. ++ */ ++ queue_me(&qs[i], hb); ++ } ++ return 0; ++} ++ ++/** ++ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes ++ * @qs: The list of futexes to wait on ++ * @op: Operation code from futex's syscall ++ * @count: The number of objects ++ * @abs_time: Timeout before giving up and returning to userspace ++ * ++ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function ++ * sleeps on a group of futexes and returns on the first futex that ++ * triggered, or after the timeout has elapsed. ++ * ++ * Return: ++ * - >=0 - Hint to the futex that was awoken ++ * - <0 - On error ++ */ ++static int futex_wait_multiple(struct futex_q *qs, int op, ++ u32 count, ktime_t *abs_time) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ int ret, flags = 0, hint = 0; ++ unsigned int i; ++ ++ if (!(op & FUTEX_PRIVATE_FLAG)) ++ flags |= FLAGS_SHARED; ++ ++ if (op & FUTEX_CLOCK_REALTIME) ++ flags |= FLAGS_CLOCKRT; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, 0); ++ while (1) { ++ ret = futex_wait_multiple_setup(qs, count, flags, &hint); ++ if (ret) { ++ if (ret > 0) { ++ /* A futex was awaken during setup */ ++ ret = hint; ++ } ++ break; ++ } ++ ++ if (to) ++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ++ ++ /* ++ * Avoid sleeping if another thread already tried to ++ * wake us. ++ */ ++ for (i = 0; i < count; i++) { ++ if (plist_node_empty(&qs[i].list)) ++ break; ++ } ++ ++ if (i == count && (!to || to->task)) ++ freezable_schedule(); ++ ++ ret = unqueue_multiple(qs, count); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (ret >= 0) ++ break; ++ if (to && !to->task) { ++ ret = -ETIMEDOUT; ++ break; ++ } else if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ break; ++ } ++ /* ++ * The final case is a spurious wakeup, for ++ * which just retry. 
++ */ ++ } ++ ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ ++ return ret; ++} ++ + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) + { +@@ -3907,6 +4139,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + return -ENOSYS; + } + ++/** ++ * futex_read_wait_block - Read an array of futex_wait_block from userspace ++ * @uaddr: Userspace address of the block ++ * @count: Number of blocks to be read ++ * ++ * This function creates and allocate an array of futex_q (we zero it to ++ * initialize the fields) and then, for each futex_wait_block element from ++ * userspace, fill a futex_q element with proper values. ++ */ ++inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) ++{ ++ unsigned int i; ++ struct futex_q *qs; ++ struct futex_wait_block fwb; ++ struct futex_wait_block __user *entry = ++ (struct futex_wait_block __user *)uaddr; ++ ++ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) ++ return ERR_PTR(-EINVAL); ++ ++ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); ++ if (!qs) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { ++ kfree(qs); ++ return ERR_PTR(-EFAULT); ++ } ++ ++ qs[i].uaddr = fwb.uaddr; ++ qs[i].uval = fwb.val; ++ qs[i].bitset = fwb.bitset; ++ } ++ ++ return qs; ++} + + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, +@@ -3919,7 +4188,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; + if (get_timespec64(&ts, utime)) +@@ -3940,6 +4210,25 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (u32) (unsigned long) utime; + ++ if (cmd == FUTEX_WAIT_MULTIPLE) { ++ int ret; ++ struct futex_q *qs; ++ ++#ifdef CONFIG_X86_X32 ++ if (unlikely(in_x32_syscall())) ++ return -ENOSYS; ++#endif ++ qs = futex_read_wait_block(uaddr, val); ++ ++ if (IS_ERR(qs)) ++ return PTR_ERR(qs); ++ ++ ret = futex_wait_multiple(qs, op, val, tp); ++ kfree(qs); ++ ++ return ret; ++ } ++ + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + } + +@@ -4102,6 +4391,57 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + #endif /* CONFIG_COMPAT */ + + #ifdef CONFIG_COMPAT_32BIT_TIME ++/** ++ * struct compat_futex_wait_block - Block of futexes to be waited for ++ * @uaddr: User address of the futex (compatible pointer) ++ * @val: Futex value expected by userspace ++ * @bitset: Bitset for the optional bitmasked wakeup ++ */ ++struct compat_futex_wait_block { ++ compat_uptr_t uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ ++/** ++ * compat_futex_read_wait_block - Read an array of futex_wait_block from ++ * userspace ++ * @uaddr: Userspace address of the block ++ * @count: Number of blocks to be read ++ * ++ * This function does the same as futex_read_wait_block(), except that it ++ * converts the pointer to the futex from the compat version to the regular one. 
++ */ ++inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, ++ u32 count) ++{ ++ unsigned int i; ++ struct futex_q *qs; ++ struct compat_futex_wait_block fwb; ++ struct compat_futex_wait_block __user *entry = ++ (struct compat_futex_wait_block __user *)uaddr; ++ ++ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) ++ return ERR_PTR(-EINVAL); ++ ++ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); ++ if (!qs) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { ++ kfree(qs); ++ return ERR_PTR(-EFAULT); ++ } ++ ++ qs[i].uaddr = compat_ptr(fwb.uaddr); ++ qs[i].uval = fwb.val; ++ qs[i].bitset = fwb.bitset; ++ } ++ ++ return qs; ++} ++ + SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + struct old_timespec32 __user *, utime, u32 __user *, uaddr2, + u32, val3) +@@ -4113,7 +4453,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (get_old_timespec32(&ts, utime)) + return -EFAULT; + if (!timespec64_valid(&ts)) +@@ -4128,6 +4469,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (int) (unsigned long) utime; + ++ if (cmd == FUTEX_WAIT_MULTIPLE) { ++ int ret; ++ struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); ++ ++ if (IS_ERR(qs)) ++ return PTR_ERR(qs); ++ ++ ret = futex_wait_multiple(qs, op, val, tp); ++ kfree(qs); ++ ++ return ret; ++ } ++ + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + } + #endif /* CONFIG_COMPAT_32BIT_TIME */ +diff --git a/tools/testing/selftests/futex/functional/futex_wait_timeout.c b/tools/testing/selftests/futex/functional/futex_wait_timeout.c +index ee55e6d389a3f053194435342c4e471dc7cf8786..2a63e1c2cfb6407a5988233217cff2e52787bc66 100644 +--- a/tools/testing/selftests/futex/functional/futex_wait_timeout.c ++++ b/tools/testing/selftests/futex/functional/futex_wait_timeout.c +@@ -11,6 +11,7 @@ + * + * HISTORY + * 2009-Nov-6: Initial version by Darren Hart ++ * 2019-Dec-13: Add WAIT_MULTIPLE test by Krisman + * + *****************************************************************************/ + +@@ -41,6 +42,8 @@ int main(int argc, char *argv[]) + { + futex_t f1 = FUTEX_INITIALIZER; + struct timespec to; ++ time_t secs; ++ struct futex_wait_block fwb = {&f1, f1, 0}; + int res, ret = RET_PASS; + int c; + +@@ -65,7 +68,7 @@ int main(int argc, char *argv[]) + } + + ksft_print_header(); +- ksft_set_plan(1); ++ ksft_set_plan(2); + ksft_print_msg("%s: Block on a futex and wait for timeout\n", + basename(argv[0])); + ksft_print_msg("\tArguments: timeout=%ldns\n", timeout_ns); +@@ -79,8 +82,39 @@ int main(int argc, char *argv[]) + if (!res || errno != ETIMEDOUT) { + fail("futex_wait returned %d\n", ret < 0 ? 
errno : ret); + ret = RET_FAIL; ++ } else ++ ksft_test_result_pass("futex_wait timeout succeeds\n"); ++ ++ info("Calling futex_wait_multiple on f1: %u @ %p\n", f1, &f1); ++ ++ /* Setup absolute time */ ++ ret = clock_gettime(CLOCK_REALTIME, &to); ++ secs = (to.tv_nsec + timeout_ns) / 1000000000; ++ to.tv_nsec = ((int64_t)to.tv_nsec + timeout_ns) % 1000000000; ++ to.tv_sec += secs; ++ info("to.tv_sec = %ld\n", to.tv_sec); ++ info("to.tv_nsec = %ld\n", to.tv_nsec); ++ ++ res = futex_wait_multiple(&fwb, 1, &to, ++ FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME); ++ ++#ifdef __ILP32__ ++ if (res == -1 && errno == ENOSYS) { ++ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); ++ } else { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; + } ++#else ++ if (!res || errno != ETIMEDOUT) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; ++ } else ++ ksft_test_result_pass("futex_wait_multiple timeout succeeds\n"); ++#endif /* __ILP32__ */ + +- print_result(TEST_NAME, ret); ++ ksft_print_cnts(); + return ret; + } +diff --git a/tools/testing/selftests/futex/include/futextest.h b/tools/testing/selftests/futex/include/futextest.h +index ddbcfc9b7bac4aebb5bac2f249e26ecfd948aa84..bb103bef4557012ef9a389ca74c868e4476a8a31 100644 +--- a/tools/testing/selftests/futex/include/futextest.h ++++ b/tools/testing/selftests/futex/include/futextest.h +@@ -38,6 +38,14 @@ typedef volatile u_int32_t futex_t; + #ifndef FUTEX_CMP_REQUEUE_PI + #define FUTEX_CMP_REQUEUE_PI 12 + #endif ++#ifndef FUTEX_WAIT_MULTIPLE ++#define FUTEX_WAIT_MULTIPLE 13 ++struct futex_wait_block { ++ futex_t *uaddr; ++ futex_t val; ++ __u32 bitset; ++}; ++#endif + #ifndef FUTEX_WAIT_REQUEUE_PI_PRIVATE + #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) +@@ -80,6 +88,20 @@ futex_wait(futex_t *uaddr, futex_t val, struct timespec *timeout, int opflags) + return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); + } + ++/** ++ * futex_wait_multiple() - block on several futexes with optional timeout ++ * @fwb: wait block user space address ++ * @count: number of entities at fwb ++ * @timeout: absolute timeout ++ */ ++static inline int ++futex_wait_multiple(struct futex_wait_block *fwb, int count, ++ struct timespec *timeout, int opflags) ++{ ++ return futex(fwb, FUTEX_WAIT_MULTIPLE, count, timeout, NULL, 0, ++ opflags); ++} ++ + /** + * futex_wake() - wake one or more tasks blocked on uaddr + * @nr_wake: wake up to this many tasks +diff --git a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +index 0ae390ff816449c88d0bb655a26eb014382c2b4f..bcbac042992d447e0bc9ef5fefe94e875de310f2 100644 +--- a/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c ++++ b/tools/testing/selftests/futex/functional/futex_wait_wouldblock.c +@@ -12,6 +12,7 @@ + * + * HISTORY + * 2009-Nov-14: Initial version by Gowrishankar ++ * 2019-Dec-13: Add WAIT_MULTIPLE test by Krisman + * + *****************************************************************************/ + +@@ -40,6 +41,7 @@ int main(int argc, char *argv[]) + { + struct timespec to = {.tv_sec = 0, .tv_nsec = timeout_ns}; + futex_t f1 = FUTEX_INITIALIZER; ++ struct futex_wait_block fwb = {&f1, f1+1, 0}; + int res, ret = RET_PASS; + int c; + +@@ -61,7 +63,7 @@ int main(int argc, char *argv[]) + } + + ksft_print_header(); +- ksft_set_plan(1); ++ 
ksft_set_plan(2); + ksft_print_msg("%s: Test the unexpected futex value in FUTEX_WAIT\n", + basename(argv[0])); + +@@ -71,8 +73,30 @@ int main(int argc, char *argv[]) + fail("futex_wait returned: %d %s\n", + res ? errno : res, res ? strerror(errno) : ""); + ret = RET_FAIL; ++ } else ++ ksft_test_result_pass("futex_wait wouldblock succeeds\n"); ++ ++ info("Calling futex_wait_multiple on f1: %u @ %p with val=%u\n", ++ f1, &f1, f1+1); ++ res = futex_wait_multiple(&fwb, 1, NULL, FUTEX_PRIVATE_FLAG); ++ ++#ifdef __ILP32__ ++ if (res != -1 || errno != ENOSYS) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); ++ } ++#else ++ if (!res || errno != EWOULDBLOCK) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; + } ++ ksft_test_result_pass("futex_wait_multiple wouldblock succeeds\n"); ++#endif /* __ILP32__ */ + +- print_result(TEST_NAME, ret); ++ ksft_print_cnts(); + return ret; + } +diff --git a/tools/testing/selftests/futex/functional/.gitignore b/tools/testing/selftests/futex/functional/.gitignore +index a09f570619023750f558c84004aff166b4337d72..4660128a545edb04a17cc6bd9760931c1386122f 100644 +--- a/tools/testing/selftests/futex/functional/.gitignore ++++ b/tools/testing/selftests/futex/functional/.gitignore +@@ -5,3 +5,4 @@ futex_wait_private_mapped_file + futex_wait_timeout + futex_wait_uninitialized_heap + futex_wait_wouldblock ++futex_wait_multiple +diff --git a/tools/testing/selftests/futex/functional/Makefile b/tools/testing/selftests/futex/functional/Makefile +index 30996306cabcfe89a47977643e529b122893bb7e..75f9fface11fa3c90c1bdb9a49b3ea51291afd58 100644 +--- a/tools/testing/selftests/futex/functional/Makefile ++++ b/tools/testing/selftests/futex/functional/Makefile +@@ -14,7 +14,8 @@ TEST_GEN_FILES := \ + futex_requeue_pi_signal_restart \ + futex_requeue_pi_mismatched_ops \ + futex_wait_uninitialized_heap \ +- futex_wait_private_mapped_file ++ futex_wait_private_mapped_file \ ++ futex_wait_multiple + + TEST_PROGS := run.sh + +diff --git a/tools/testing/selftests/futex/functional/futex_wait_multiple.c b/tools/testing/selftests/futex/functional/futex_wait_multiple.c +new file mode 100644 +index 0000000000000000000000000000000000000000..b48422e79f42edba1653bb0bd2a4c4fd98d2d48d +--- /dev/null ++++ b/tools/testing/selftests/futex/functional/futex_wait_multiple.c +@@ -0,0 +1,173 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/****************************************************************************** ++ * ++ * Copyright © Collabora, Ltd., 2019 ++ * ++ * DESCRIPTION ++ * Test basic semantics of FUTEX_WAIT_MULTIPLE ++ * ++ * AUTHOR ++ * Gabriel Krisman Bertazi ++ * ++ * HISTORY ++ * 2019-Dec-13: Initial version by Krisman ++ * ++ *****************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "futextest.h" ++#include "logging.h" ++ ++#define TEST_NAME "futex-wait-multiple" ++#define timeout_ns 100000 ++#define MAX_COUNT 128 ++#define WAKE_WAIT_US 3000000 ++ ++int ret = RET_PASS; ++char *progname; ++futex_t f[MAX_COUNT] = {0}; ++struct futex_wait_block fwb[MAX_COUNT]; ++ ++void usage(char *prog) ++{ ++ printf("Usage: %s\n", prog); ++ printf(" -c Use color\n"); ++ printf(" -h Display this help message\n"); ++ printf(" -v L Verbosity level: %d=QUIET %d=CRITICAL %d=INFO\n", ++ VQUIET, 
VCRITICAL, VINFO); ++} ++ ++void test_count_overflow(void) ++{ ++ futex_t f = FUTEX_INITIALIZER; ++ struct futex_wait_block fwb[MAX_COUNT+1]; ++ int res, i; ++ ++ ksft_print_msg("%s: Test a too big number of futexes\n", progname); ++ ++ for (i = 0; i < MAX_COUNT+1; i++) { ++ fwb[i].uaddr = &f; ++ fwb[i].val = f; ++ fwb[i].bitset = 0; ++ } ++ ++ res = futex_wait_multiple(fwb, MAX_COUNT+1, NULL, FUTEX_PRIVATE_FLAG); ++ ++#ifdef __ILP32__ ++ if (res != -1 || errno != ENOSYS) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); ++ } ++#else ++ if (res != -1 || errno != EINVAL) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_pass("futex_wait_multiple count overflow succeed\n"); ++ } ++ ++#endif /* __ILP32__ */ ++} ++ ++void *waiterfn(void *arg) ++{ ++ int res; ++ ++ res = futex_wait_multiple(fwb, MAX_COUNT, NULL, FUTEX_PRIVATE_FLAG); ++ ++#ifdef __ILP32__ ++ if (res != -1 || errno != ENOSYS) { ++ ksft_test_result_fail("futex_wait_multiple returned %d\n", ++ res < 0 ? errno : res); ++ ret = RET_FAIL; ++ } else { ++ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); ++ } ++#else ++ if (res < 0) ++ ksft_print_msg("waiter failed %d\n", res); ++ ++ info("futex_wait_multiple: Got hint futex %d was freed\n", res); ++#endif /* __ILP32__ */ ++ ++ return NULL; ++} ++ ++void test_fwb_wakeup(void) ++{ ++ int res, i; ++ pthread_t waiter; ++ ++ ksft_print_msg("%s: Test wake up in a list of futex\n", progname); ++ ++ for (i = 0; i < MAX_COUNT; i++) { ++ fwb[i].uaddr = &f[i]; ++ fwb[i].val = f[i]; ++ fwb[i].bitset = 0xffffffff; ++ } ++ ++ res = pthread_create(&waiter, NULL, waiterfn, NULL); ++ if (res) { ++ ksft_test_result_fail("Creating waiting thread failed"); ++ ksft_exit_fail(); ++ } ++ ++ usleep(WAKE_WAIT_US); ++ res = futex_wake(&(f[MAX_COUNT-1]), 1, FUTEX_PRIVATE_FLAG); ++ if (res != 1) { ++ ksft_test_result_fail("Failed to wake thread res=%d\n", res); ++ ksft_exit_fail(); ++ } ++ ++ pthread_join(waiter, NULL); ++ ksft_test_result_pass("%s succeed\n", __func__); ++} ++ ++int main(int argc, char *argv[]) ++{ ++ int c; ++ ++ while ((c = getopt(argc, argv, "cht:v:")) != -1) { ++ switch (c) { ++ case 'c': ++ log_color(1); ++ break; ++ case 'h': ++ usage(basename(argv[0])); ++ exit(0); ++ case 'v': ++ log_verbosity(atoi(optarg)); ++ break; ++ default: ++ usage(basename(argv[0])); ++ exit(1); ++ } ++ } ++ ++ progname = basename(argv[0]); ++ ++ ksft_print_header(); ++ ksft_set_plan(2); ++ ++ test_count_overflow(); ++ ++#ifdef __ILP32__ ++ // if it's a 32x binary, there's no futex to wakeup ++ ksft_test_result_skip("futex_wait_multiple not supported at x32\n"); ++#else ++ test_fwb_wakeup(); ++#endif /* __ILP32__ */ ++ ++ ksft_print_cnts(); ++ return ret; ++} +diff --git a/tools/testing/selftests/futex/functional/run.sh b/tools/testing/selftests/futex/functional/run.sh +index 1acb6ace1680e8f3d6b3ee2dc528c19ddfdb018e..a8be94f28ff78b4879d2d19bca5d9b0fcb26c1f8 100755 +--- a/tools/testing/selftests/futex/functional/run.sh ++++ b/tools/testing/selftests/futex/functional/run.sh +@@ -73,3 +73,6 @@ echo + echo + ./futex_wait_uninitialized_heap $COLOR + ./futex_wait_private_mapped_file $COLOR ++ ++echo ++./futex_wait_multiple $COLOR +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index 
580001e89c6caed57dd8b3cb491d65dce846caff..a3e760886b8e7e74285fdcf2caaaa6f66ad16675 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,7 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 +-#define FUTEX_WAIT_MULTIPLE 13 ++#define FUTEX_WAIT_MULTIPLE 31 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +diff --git a/kernel/futex.c b/kernel/futex.c +index 58cf9eb2b851b4858e29b5ef4114a29a92e676ba..e0bb628a5e1988dcc9ae5442a4259edc229d578d 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -4198,7 +4198,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +@@ -4399,6 +4399,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + */ + struct compat_futex_wait_block { + compat_uptr_t uaddr; ++ __u32 pad; + __u32 val; + __u32 bitset; + }; +@@ -4461,7 +4462,7 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } diff --git a/linux-tkg/linux-tkg-patches/5.8/0008-5.8-bcachefs.patch b/linux-tkg/linux-tkg-patches/5.8/0008-5.8-bcachefs.patch new file mode 100644 index 0000000..69cd9f9 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0008-5.8-bcachefs.patch @@ -0,0 +1,70598 @@ +diff --git a/block/bio.c b/block/bio.c +index a7366c02c9b5..9a5a289757f9 100644 +--- a/block/bio.c ++++ b/block/bio.c +@@ -1316,6 +1316,7 @@ void bio_set_pages_dirty(struct bio *bio) + set_page_dirty_lock(bvec->bv_page); + } + } ++EXPORT_SYMBOL_GPL(bio_set_pages_dirty); + + /* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. +@@ -1375,6 +1376,7 @@ void bio_check_pages_dirty(struct bio *bio) + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); + } ++EXPORT_SYMBOL_GPL(bio_check_pages_dirty); + + static inline bool bio_remaining_done(struct bio *bio) + { +diff --git a/block/blk-core.c b/block/blk-core.c +index 03252af8c82c..71907944fa78 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -215,18 +215,23 @@ int blk_status_to_errno(blk_status_t status) + } + EXPORT_SYMBOL_GPL(blk_status_to_errno); + +-static void print_req_error(struct request *req, blk_status_t status, +- const char *caller) ++const char *blk_status_to_str(blk_status_t status) + { + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) +- return; ++ return "(invalid error)"; ++ return blk_errors[idx].name; ++} ++EXPORT_SYMBOL_GPL(blk_status_to_str); + ++static void print_req_error(struct request *req, blk_status_t status, ++ const char *caller) ++{ + printk_ratelimited(KERN_ERR + "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "phys_seg %u prio class %u\n", +- caller, blk_errors[idx].name, ++ caller, blk_status_to_str(status), + req->rq_disk ? 
req->rq_disk->disk_name : "?", + blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), + req->cmd_flags & ~REQ_OP_MASK, +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index bf7dd96db9b3..14274562f6e1 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -3,6 +3,7 @@ + config BCACHE + tristate "Block device as cache" + select CRC64 ++ select CLOSURES + help + Allows a block device to be used as cache for other devices; uses + a btree for indexing and the layout is optimized for SSDs. +@@ -18,15 +19,6 @@ config BCACHE_DEBUG + Enables extra debugging tools, allows expensive runtime checks to be + turned on. + +-config BCACHE_CLOSURES_DEBUG +- bool "Debug closures" +- depends on BCACHE +- select DEBUG_FS +- help +- Keeps all active closures in a linked list and provides a debugfs +- interface to list them, which makes it possible to see asynchronous +- operations that get stuck. +- + config BCACHE_ASYNC_REGISTRAION + bool "Asynchronous device registration (EXPERIMENTAL)" + depends on BCACHE +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index fd714628da6a..0fb1b6009da3 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -2,6 +2,6 @@ + + obj-$(CONFIG_BCACHE) += bcache.o + +-bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ +- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ ++bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ ++ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 221e0191b687..4e82115c5524 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -180,6 +180,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -192,7 +193,6 @@ + + #include "bset.h" + #include "util.h" +-#include "closure.h" + + struct bucket { + atomic_t pin; +diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c +deleted file mode 100644 +index 0164a1fe94a9..000000000000 +--- a/drivers/md/bcache/closure.c ++++ /dev/null +@@ -1,217 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * Asynchronous refcounty things +- * +- * Copyright 2010, 2011 Kent Overstreet +- * Copyright 2012 Google, Inc. 
+- */ +- +-#include +-#include +-#include +-#include +- +-#include "closure.h" +- +-static inline void closure_put_after_sub(struct closure *cl, int flags) +-{ +- int r = flags & CLOSURE_REMAINING_MASK; +- +- BUG_ON(flags & CLOSURE_GUARD_MASK); +- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); +- +- if (!r) { +- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { +- atomic_set(&cl->remaining, +- CLOSURE_REMAINING_INITIALIZER); +- closure_queue(cl); +- } else { +- struct closure *parent = cl->parent; +- closure_fn *destructor = cl->fn; +- +- closure_debug_destroy(cl); +- +- if (destructor) +- destructor(cl); +- +- if (parent) +- closure_put(parent); +- } +- } +-} +- +-/* For clearing flags with the same atomic op as a put */ +-void closure_sub(struct closure *cl, int v) +-{ +- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); +-} +- +-/* +- * closure_put - decrement a closure's refcount +- */ +-void closure_put(struct closure *cl) +-{ +- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); +-} +- +-/* +- * closure_wake_up - wake up all closures on a wait list, without memory barrier +- */ +-void __closure_wake_up(struct closure_waitlist *wait_list) +-{ +- struct llist_node *list; +- struct closure *cl, *t; +- struct llist_node *reverse = NULL; +- +- list = llist_del_all(&wait_list->list); +- +- /* We first reverse the list to preserve FIFO ordering and fairness */ +- reverse = llist_reverse_order(list); +- +- /* Then do the wakeups */ +- llist_for_each_entry_safe(cl, t, reverse, list) { +- closure_set_waiting(cl, 0); +- closure_sub(cl, CLOSURE_WAITING + 1); +- } +-} +- +-/** +- * closure_wait - add a closure to a waitlist +- * @waitlist: will own a ref on @cl, which will be released when +- * closure_wake_up() is called on @waitlist. +- * @cl: closure pointer. 
+- * +- */ +-bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +-{ +- if (atomic_read(&cl->remaining) & CLOSURE_WAITING) +- return false; +- +- closure_set_waiting(cl, _RET_IP_); +- atomic_add(CLOSURE_WAITING + 1, &cl->remaining); +- llist_add(&cl->list, &waitlist->list); +- +- return true; +-} +- +-struct closure_syncer { +- struct task_struct *task; +- int done; +-}; +- +-static void closure_sync_fn(struct closure *cl) +-{ +- struct closure_syncer *s = cl->s; +- struct task_struct *p; +- +- rcu_read_lock(); +- p = READ_ONCE(s->task); +- s->done = 1; +- wake_up_process(p); +- rcu_read_unlock(); +-} +- +-void __sched __closure_sync(struct closure *cl) +-{ +- struct closure_syncer s = { .task = current }; +- +- cl->s = &s; +- continue_at(cl, closure_sync_fn, NULL); +- +- while (1) { +- set_current_state(TASK_UNINTERRUPTIBLE); +- if (s.done) +- break; +- schedule(); +- } +- +- __set_current_state(TASK_RUNNING); +-} +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- +-static LIST_HEAD(closure_list); +-static DEFINE_SPINLOCK(closure_list_lock); +- +-void closure_debug_create(struct closure *cl) +-{ +- unsigned long flags; +- +- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); +- cl->magic = CLOSURE_MAGIC_ALIVE; +- +- spin_lock_irqsave(&closure_list_lock, flags); +- list_add(&cl->all, &closure_list); +- spin_unlock_irqrestore(&closure_list_lock, flags); +-} +- +-void closure_debug_destroy(struct closure *cl) +-{ +- unsigned long flags; +- +- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); +- cl->magic = CLOSURE_MAGIC_DEAD; +- +- spin_lock_irqsave(&closure_list_lock, flags); +- list_del(&cl->all); +- spin_unlock_irqrestore(&closure_list_lock, flags); +-} +- +-static struct dentry *closure_debug; +- +-static int debug_seq_show(struct seq_file *f, void *data) +-{ +- struct closure *cl; +- +- spin_lock_irq(&closure_list_lock); +- +- list_for_each_entry(cl, &closure_list, all) { +- int r = atomic_read(&cl->remaining); +- +- seq_printf(f, "%p: %pS -> %pS p %p r %i ", +- cl, (void *) cl->ip, cl->fn, cl->parent, +- r & CLOSURE_REMAINING_MASK); +- +- seq_printf(f, "%s%s\n", +- test_bit(WORK_STRUCT_PENDING_BIT, +- work_data_bits(&cl->work)) ? "Q" : "", +- r & CLOSURE_RUNNING ? "R" : ""); +- +- if (r & CLOSURE_WAITING) +- seq_printf(f, " W %pS\n", +- (void *) cl->waiting_on); +- +- seq_printf(f, "\n"); +- } +- +- spin_unlock_irq(&closure_list_lock); +- return 0; +-} +- +-static int debug_seq_open(struct inode *inode, struct file *file) +-{ +- return single_open(file, debug_seq_show, NULL); +-} +- +-static const struct file_operations debug_ops = { +- .owner = THIS_MODULE, +- .open = debug_seq_open, +- .read = seq_read, +- .release = single_release +-}; +- +-void __init closure_debug_init(void) +-{ +- if (!IS_ERR_OR_NULL(bcache_debug)) +- /* +- * it is unnecessary to check return value of +- * debugfs_create_file(), we should not care +- * about this. 
+- */ +- closure_debug = debugfs_create_file( +- "closures", 0400, bcache_debug, NULL, &debug_ops); +-} +-#endif +- +-MODULE_AUTHOR("Kent Overstreet "); +-MODULE_LICENSE("GPL"); +diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h +deleted file mode 100644 +index c88cdc4ae4ec..000000000000 +--- a/drivers/md/bcache/closure.h ++++ /dev/null +@@ -1,378 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _LINUX_CLOSURE_H +-#define _LINUX_CLOSURE_H +- +-#include +-#include +-#include +-#include +- +-/* +- * Closure is perhaps the most overused and abused term in computer science, but +- * since I've been unable to come up with anything better you're stuck with it +- * again. +- * +- * What are closures? +- * +- * They embed a refcount. The basic idea is they count "things that are in +- * progress" - in flight bios, some other thread that's doing something else - +- * anything you might want to wait on. +- * +- * The refcount may be manipulated with closure_get() and closure_put(). +- * closure_put() is where many of the interesting things happen, when it causes +- * the refcount to go to 0. +- * +- * Closures can be used to wait on things both synchronously and asynchronously, +- * and synchronous and asynchronous use can be mixed without restriction. To +- * wait synchronously, use closure_sync() - you will sleep until your closure's +- * refcount hits 1. +- * +- * To wait asynchronously, use +- * continue_at(cl, next_function, workqueue); +- * +- * passing it, as you might expect, the function to run when nothing is pending +- * and the workqueue to run that function out of. +- * +- * continue_at() also, critically, requires a 'return' immediately following the +- * location where this macro is referenced, to return to the calling function. +- * There's good reason for this. +- * +- * To use safely closures asynchronously, they must always have a refcount while +- * they are running owned by the thread that is running them. Otherwise, suppose +- * you submit some bios and wish to have a function run when they all complete: +- * +- * foo_endio(struct bio *bio) +- * { +- * closure_put(cl); +- * } +- * +- * closure_init(cl); +- * +- * do_stuff(); +- * closure_get(cl); +- * bio1->bi_endio = foo_endio; +- * bio_submit(bio1); +- * +- * do_more_stuff(); +- * closure_get(cl); +- * bio2->bi_endio = foo_endio; +- * bio_submit(bio2); +- * +- * continue_at(cl, complete_some_read, system_wq); +- * +- * If closure's refcount started at 0, complete_some_read() could run before the +- * second bio was submitted - which is almost always not what you want! More +- * importantly, it wouldn't be possible to say whether the original thread or +- * complete_some_read()'s thread owned the closure - and whatever state it was +- * associated with! +- * +- * So, closure_init() initializes a closure's refcount to 1 - and when a +- * closure_fn is run, the refcount will be reset to 1 first. +- * +- * Then, the rule is - if you got the refcount with closure_get(), release it +- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount +- * on a closure because you called closure_init() or you were run out of a +- * closure - _always_ use continue_at(). Doing so consistently will help +- * eliminate an entire class of particularly pernicious races. +- * +- * Lastly, you might have a wait list dedicated to a specific event, and have no +- * need for specifying the condition - you just want to wait until someone runs +- * closure_wake_up() on the appropriate wait list. 
In that case, just use +- * closure_wait(). It will return either true or false, depending on whether the +- * closure was already on a wait list or not - a closure can only be on one wait +- * list at a time. +- * +- * Parents: +- * +- * closure_init() takes two arguments - it takes the closure to initialize, and +- * a (possibly null) parent. +- * +- * If parent is non null, the new closure will have a refcount for its lifetime; +- * a closure is considered to be "finished" when its refcount hits 0 and the +- * function to run is null. Hence +- * +- * continue_at(cl, NULL, NULL); +- * +- * returns up the (spaghetti) stack of closures, precisely like normal return +- * returns up the C stack. continue_at() with non null fn is better thought of +- * as doing a tail call. +- * +- * All this implies that a closure should typically be embedded in a particular +- * struct (which its refcount will normally control the lifetime of), and that +- * struct can very much be thought of as a stack frame. +- */ +- +-struct closure; +-struct closure_syncer; +-typedef void (closure_fn) (struct closure *); +-extern struct dentry *bcache_debug; +- +-struct closure_waitlist { +- struct llist_head list; +-}; +- +-enum closure_state { +- /* +- * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by +- * the thread that owns the closure, and cleared by the thread that's +- * waking up the closure. +- * +- * The rest are for debugging and don't affect behaviour: +- * +- * CLOSURE_RUNNING: Set when a closure is running (i.e. by +- * closure_init() and when closure_put() runs then next function), and +- * must be cleared before remaining hits 0. Primarily to help guard +- * against incorrect usage and accidentally transferring references. +- * continue_at() and closure_return() clear it for you, if you're doing +- * something unusual you can use closure_set_dead() which also helps +- * annotate where references are being transferred. +- */ +- +- CLOSURE_BITS_START = (1U << 26), +- CLOSURE_DESTRUCTOR = (1U << 26), +- CLOSURE_WAITING = (1U << 28), +- CLOSURE_RUNNING = (1U << 30), +-}; +- +-#define CLOSURE_GUARD_MASK \ +- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) +- +-#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +-#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) +- +-struct closure { +- union { +- struct { +- struct workqueue_struct *wq; +- struct closure_syncer *s; +- struct llist_node list; +- closure_fn *fn; +- }; +- struct work_struct work; +- }; +- +- struct closure *parent; +- +- atomic_t remaining; +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +-#define CLOSURE_MAGIC_DEAD 0xc054dead +-#define CLOSURE_MAGIC_ALIVE 0xc054a11e +- +- unsigned int magic; +- struct list_head all; +- unsigned long ip; +- unsigned long waiting_on; +-#endif +-}; +- +-void closure_sub(struct closure *cl, int v); +-void closure_put(struct closure *cl); +-void __closure_wake_up(struct closure_waitlist *list); +-bool closure_wait(struct closure_waitlist *list, struct closure *cl); +-void __closure_sync(struct closure *cl); +- +-/** +- * closure_sync - sleep until a closure a closure has nothing left to wait on +- * +- * Sleeps until the refcount hits 1 - the thread that's running the closure owns +- * the last refcount. 
+- */ +-static inline void closure_sync(struct closure *cl) +-{ +- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) +- __closure_sync(cl); +-} +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- +-void closure_debug_init(void); +-void closure_debug_create(struct closure *cl); +-void closure_debug_destroy(struct closure *cl); +- +-#else +- +-static inline void closure_debug_init(void) {} +-static inline void closure_debug_create(struct closure *cl) {} +-static inline void closure_debug_destroy(struct closure *cl) {} +- +-#endif +- +-static inline void closure_set_ip(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->ip = _THIS_IP_; +-#endif +-} +- +-static inline void closure_set_ret_ip(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->ip = _RET_IP_; +-#endif +-} +- +-static inline void closure_set_waiting(struct closure *cl, unsigned long f) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->waiting_on = f; +-#endif +-} +- +-static inline void closure_set_stopped(struct closure *cl) +-{ +- atomic_sub(CLOSURE_RUNNING, &cl->remaining); +-} +- +-static inline void set_closure_fn(struct closure *cl, closure_fn *fn, +- struct workqueue_struct *wq) +-{ +- closure_set_ip(cl); +- cl->fn = fn; +- cl->wq = wq; +- /* between atomic_dec() in closure_put() */ +- smp_mb__before_atomic(); +-} +- +-static inline void closure_queue(struct closure *cl) +-{ +- struct workqueue_struct *wq = cl->wq; +- /** +- * Changes made to closure, work_struct, or a couple of other structs +- * may cause work.func not pointing to the right location. +- */ +- BUILD_BUG_ON(offsetof(struct closure, fn) +- != offsetof(struct work_struct, func)); +- if (wq) { +- INIT_WORK(&cl->work, cl->work.func); +- BUG_ON(!queue_work(wq, &cl->work)); +- } else +- cl->fn(cl); +-} +- +-/** +- * closure_get - increment a closure's refcount +- */ +-static inline void closure_get(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- BUG_ON((atomic_inc_return(&cl->remaining) & +- CLOSURE_REMAINING_MASK) <= 1); +-#else +- atomic_inc(&cl->remaining); +-#endif +-} +- +-/** +- * closure_init - Initialize a closure, setting the refcount to 1 +- * @cl: closure to initialize +- * @parent: parent of the new closure. cl will take a refcount on it for its +- * lifetime; may be NULL. +- */ +-static inline void closure_init(struct closure *cl, struct closure *parent) +-{ +- memset(cl, 0, sizeof(struct closure)); +- cl->parent = parent; +- if (parent) +- closure_get(parent); +- +- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +- +- closure_debug_create(cl); +- closure_set_ip(cl); +-} +- +-static inline void closure_init_stack(struct closure *cl) +-{ +- memset(cl, 0, sizeof(struct closure)); +- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +-} +- +-/** +- * closure_wake_up - wake up all closures on a wait list, +- * with memory barrier +- */ +-static inline void closure_wake_up(struct closure_waitlist *list) +-{ +- /* Memory barrier for the wait list */ +- smp_mb(); +- __closure_wake_up(list); +-} +- +-/** +- * continue_at - jump to another function with barrier +- * +- * After @cl is no longer waiting on anything (i.e. all outstanding refs have +- * been dropped with closure_put()), it will resume execution at @fn running out +- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). 
+- * +- * This is because after calling continue_at() you no longer have a ref on @cl, +- * and whatever @cl owns may be freed out from under you - a running closure fn +- * has a ref on its own closure which continue_at() drops. +- * +- * Note you are expected to immediately return after using this macro. +- */ +-#define continue_at(_cl, _fn, _wq) \ +-do { \ +- set_closure_fn(_cl, _fn, _wq); \ +- closure_sub(_cl, CLOSURE_RUNNING + 1); \ +-} while (0) +- +-/** +- * closure_return - finish execution of a closure +- * +- * This is used to indicate that @cl is finished: when all outstanding refs on +- * @cl have been dropped @cl's ref on its parent closure (as passed to +- * closure_init()) will be dropped, if one was specified - thus this can be +- * thought of as returning to the parent closure. +- */ +-#define closure_return(_cl) continue_at((_cl), NULL, NULL) +- +-/** +- * continue_at_nobarrier - jump to another function without barrier +- * +- * Causes @fn to be executed out of @cl, in @wq context (or called directly if +- * @wq is NULL). +- * +- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, +- * thus it's not safe to touch anything protected by @cl after a +- * continue_at_nobarrier(). +- */ +-#define continue_at_nobarrier(_cl, _fn, _wq) \ +-do { \ +- set_closure_fn(_cl, _fn, _wq); \ +- closure_queue(_cl); \ +-} while (0) +- +-/** +- * closure_return_with_destructor - finish execution of a closure, +- * with destructor +- * +- * Works like closure_return(), except @destructor will be called when all +- * outstanding refs on @cl have been dropped; @destructor may be used to safely +- * free the memory occupied by @cl, and it is called with the ref on the parent +- * closure still held - so @destructor could safely return an item to a +- * freelist protected by @cl's parent. +- */ +-#define closure_return_with_destructor(_cl, _destructor) \ +-do { \ +- set_closure_fn(_cl, _destructor, NULL); \ +- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ +-} while (0) +- +-/** +- * closure_call - execute @fn out of a new, uninitialized closure +- * +- * Typically used when running out of one closure, and we want to run @fn +- * asynchronously out of a new closure - @parent will then wait for @cl to +- * finish. 
+- */ +-static inline void closure_call(struct closure *cl, closure_fn fn, +- struct workqueue_struct *wq, +- struct closure *parent) +-{ +- closure_init(cl, parent); +- continue_at_nobarrier(cl, fn, wq); +-} +- +-#endif /* _LINUX_CLOSURE_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 2014016f9a60..331febeabade 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2819,7 +2819,6 @@ static int __init bcache_init(void) + goto err; + + bch_debug_init(); +- closure_debug_init(); + + bcache_is_reboot = false; + +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index c029f7443190..59093f9f1793 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -4,6 +4,7 @@ + #define _BCACHE_UTIL_H + + #include ++#include + #include + #include + #include +@@ -13,8 +14,6 @@ + #include + #include + +-#include "closure.h" +- + #define PAGE_SECTORS (PAGE_SIZE / 512) + + struct closure; +diff --git a/fs/Kconfig b/fs/Kconfig +index a88aa3af73c1..18e1627b95f9 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" + source "fs/btrfs/Kconfig" + source "fs/nilfs2/Kconfig" + source "fs/f2fs/Kconfig" ++source "fs/bcachefs/Kconfig" + source "fs/zonefs/Kconfig" + + config FS_DAX +diff --git a/fs/Makefile b/fs/Makefile +index 2ce5112b02c8..8e926e6bf48f 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -130,6 +130,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ + obj-$(CONFIG_BTRFS_FS) += btrfs/ + obj-$(CONFIG_GFS2_FS) += gfs2/ + obj-$(CONFIG_F2FS_FS) += f2fs/ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ + obj-$(CONFIG_EFIVAR_FS) += efivarfs/ +diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +new file mode 100644 +index 000000000000..10abddae6a80 +--- /dev/null ++++ b/fs/bcachefs/Kconfig +@@ -0,0 +1,50 @@ ++ ++config BCACHEFS_FS ++ tristate "bcachefs filesystem support" ++ depends on BLOCK ++ select EXPORTFS ++ select CLOSURES ++ select LIBCRC32C ++ select CRC64 ++ select FS_POSIX_ACL ++ select LZ4_COMPRESS ++ select LZ4_DECOMPRESS ++ select ZLIB_DEFLATE ++ select ZLIB_INFLATE ++ select ZSTD_COMPRESS ++ select ZSTD_DECOMPRESS ++ select CRYPTO_SHA256 ++ select CRYPTO_CHACHA20 ++ select CRYPTO_POLY1305 ++ select KEYS ++ select SIXLOCKS ++ select RAID6_PQ ++ select XOR_BLOCKS ++ ---help--- ++ The bcachefs filesystem - a modern, copy on write filesystem, with ++ support for multiple devices, compression, checksumming, etc. ++ ++config BCACHEFS_QUOTA ++ bool "bcachefs quota support" ++ depends on BCACHEFS_FS ++ select QUOTACTL ++ ++config BCACHEFS_POSIX_ACL ++ bool "bcachefs POSIX ACL support" ++ depends on BCACHEFS_FS ++ select FS_POSIX_ACL ++ ++config BCACHEFS_DEBUG ++ bool "bcachefs debugging" ++ depends on BCACHEFS_FS ++ ---help--- ++ Enables many extra debugging checks and assertions. ++ ++ The resulting code will be significantly slower than normal; you ++ probably shouldn't select this option unless you're a developer. 
++ ++config BCACHEFS_TESTS ++ bool "bcachefs unit and performance tests" ++ depends on BCACHEFS_FS ++ ---help--- ++ Include some unit and performance tests for the core btree code +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +new file mode 100644 +index 000000000000..d85ced62c0dd +--- /dev/null ++++ b/fs/bcachefs/Makefile +@@ -0,0 +1,59 @@ ++ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o ++ ++bcachefs-y := \ ++ acl.o \ ++ alloc_background.o \ ++ alloc_foreground.o \ ++ bkey.o \ ++ bkey_methods.o \ ++ bkey_sort.o \ ++ bset.o \ ++ btree_cache.o \ ++ btree_gc.o \ ++ btree_io.o \ ++ btree_iter.o \ ++ btree_key_cache.o \ ++ btree_update_interior.o \ ++ btree_update_leaf.o \ ++ buckets.o \ ++ chardev.o \ ++ checksum.o \ ++ clock.o \ ++ compress.o \ ++ debug.o \ ++ dirent.o \ ++ disk_groups.o \ ++ ec.o \ ++ error.o \ ++ extents.o \ ++ extent_update.o \ ++ fs.o \ ++ fs-common.o \ ++ fs-ioctl.o \ ++ fs-io.o \ ++ fsck.o \ ++ inode.o \ ++ io.o \ ++ journal.o \ ++ journal_io.o \ ++ journal_reclaim.o \ ++ journal_seq_blacklist.o \ ++ keylist.o \ ++ migrate.o \ ++ move.o \ ++ movinggc.o \ ++ opts.o \ ++ quota.o \ ++ rebalance.o \ ++ recovery.o \ ++ reflink.o \ ++ replicas.o \ ++ siphash.o \ ++ super.o \ ++ super-io.o \ ++ sysfs.o \ ++ tests.o \ ++ trace.o \ ++ util.o \ ++ xattr.o +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +new file mode 100644 +index 000000000000..76c98ddbf628 +--- /dev/null ++++ b/fs/bcachefs/acl.c +@@ -0,0 +1,388 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#include "bcachefs.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "acl.h" ++#include "fs.h" ++#include "xattr.h" ++ ++static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) ++{ ++ return sizeof(bch_acl_header) + ++ sizeof(bch_acl_entry_short) * nr_short + ++ sizeof(bch_acl_entry) * nr_long; ++} ++ ++static inline int acl_to_xattr_type(int type) ++{ ++ switch (type) { ++ case ACL_TYPE_ACCESS: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; ++ case ACL_TYPE_DEFAULT: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Convert from filesystem to in-memory representation. 
++ */ ++static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) ++{ ++ const void *p, *end = value + size; ++ struct posix_acl *acl; ++ struct posix_acl_entry *out; ++ unsigned count = 0; ++ ++ if (!value) ++ return NULL; ++ if (size < sizeof(bch_acl_header)) ++ goto invalid; ++ if (((bch_acl_header *)value)->a_version != ++ cpu_to_le32(BCH_ACL_VERSION)) ++ goto invalid; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *entry = p; ++ ++ if (p + sizeof(bch_acl_entry_short) > end) ++ goto invalid; ++ ++ switch (le16_to_cpu(entry->e_tag)) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ case ACL_GROUP: ++ p += sizeof(bch_acl_entry); ++ break; ++ default: ++ goto invalid; ++ } ++ ++ count++; ++ } ++ ++ if (p > end) ++ goto invalid; ++ ++ if (!count) ++ return NULL; ++ ++ acl = posix_acl_alloc(count, GFP_KERNEL); ++ if (!acl) ++ return ERR_PTR(-ENOMEM); ++ ++ out = acl->a_entries; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *in = p; ++ ++ out->e_tag = le16_to_cpu(in->e_tag); ++ out->e_perm = le16_to_cpu(in->e_perm); ++ ++ switch (out->e_tag) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ out->e_uid = make_kuid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ out->e_gid = make_kgid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ } ++ ++ out++; ++ } ++ ++ BUG_ON(out != acl->a_entries + acl->a_count); ++ ++ return acl; ++invalid: ++ pr_err("invalid acl entry"); ++ return ERR_PTR(-EINVAL); ++} ++ ++#define acl_for_each_entry(acl, acl_e) \ ++ for (acl_e = acl->a_entries; \ ++ acl_e < acl->a_entries + acl->a_count; \ ++ acl_e++) ++ ++/* ++ * Convert from in-memory to filesystem representation. 
++ */ ++static struct bkey_i_xattr * ++bch2_acl_to_xattr(struct btree_trans *trans, ++ const struct posix_acl *acl, ++ int type) ++{ ++ struct bkey_i_xattr *xattr; ++ bch_acl_header *acl_header; ++ const struct posix_acl_entry *acl_e; ++ void *outptr; ++ unsigned nr_short = 0, nr_long = 0, acl_len, u64s; ++ ++ acl_for_each_entry(acl, acl_e) { ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ case ACL_GROUP: ++ nr_long++; ++ break; ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ nr_short++; ++ break; ++ default: ++ return ERR_PTR(-EINVAL); ++ } ++ } ++ ++ acl_len = bch2_acl_size(nr_short, nr_long); ++ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); ++ ++ if (u64s > U8_MAX) ++ return ERR_PTR(-E2BIG); ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return xattr; ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = acl_to_xattr_type(type); ++ xattr->v.x_name_len = 0, ++ xattr->v.x_val_len = cpu_to_le16(acl_len); ++ ++ acl_header = xattr_val(&xattr->v); ++ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); ++ ++ outptr = (void *) acl_header + sizeof(*acl_header); ++ ++ acl_for_each_entry(acl, acl_e) { ++ bch_acl_entry *entry = outptr; ++ ++ entry->e_tag = cpu_to_le16(acl_e->e_tag); ++ entry->e_perm = cpu_to_le16(acl_e->e_perm); ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ entry->e_id = cpu_to_le32( ++ from_kuid(&init_user_ns, acl_e->e_uid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ entry->e_id = cpu_to_le32( ++ from_kgid(&init_user_ns, acl_e->e_gid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ outptr += sizeof(bch_acl_entry_short); ++ break; ++ } ++ } ++ ++ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); ++ ++ return xattr; ++} ++ ++struct posix_acl *bch2_get_acl(struct inode *vinode, int type) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ struct posix_acl *acl = NULL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(acl_to_xattr_type(type), "", 0), ++ 0); ++ if (IS_ERR(iter)) { ++ if (PTR_ERR(iter) == -EINTR) ++ goto retry; ++ ++ if (PTR_ERR(iter) != -ENOENT) ++ acl = ERR_CAST(iter); ++ goto out; ++ } ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ ++ if (!IS_ERR(acl)) ++ set_cached_acl(&inode->v, type, acl); ++out: ++ bch2_trans_exit(&trans); ++ return acl; ++} ++ ++int bch2_set_acl_trans(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ const struct bch_hash_info *hash_info, ++ struct posix_acl *acl, int type) ++{ ++ int ret; ++ ++ if (type == ACL_TYPE_DEFAULT && ++ !S_ISDIR(inode_u->bi_mode)) ++ return acl ? 
-EACCES : 0; ++ ++ if (acl) { ++ struct bkey_i_xattr *xattr = ++ bch2_acl_to_xattr(trans, acl, type); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, ++ inode_u->bi_inum, &xattr->k_i, 0); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(acl_to_xattr_type(type), "", 0); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, ++ inode_u->bi_inum, &search); ++ } ++ ++ return ret == -ENOENT ? 0 : ret; ++} ++ ++int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl; ++ umode_t mode; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ acl = _acl; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto btree_err; ++ ++ mode = inode_u.bi_mode; ++ ++ if (type == ACL_TYPE_ACCESS) { ++ ret = posix_acl_update_mode(&inode->v, &mode, &acl); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_set_acl_trans(&trans, &inode_u, ++ &inode->ei_str_hash, ++ acl, type); ++ if (ret) ++ goto btree_err; ++ ++ inode_u.bi_ctime = bch2_current_time(c); ++ inode_u.bi_mode = mode; ++ ++ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++btree_err: ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err; ++ ++ bch2_inode_update_after_write(c, inode, &inode_u, ++ ATTR_CTIME|ATTR_MODE); ++ ++ set_cached_acl(&inode->v, type, acl); ++err: ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++int bch2_acl_chmod(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ struct bkey_i_xattr *new; ++ struct posix_acl *acl; ++ int ret = 0; ++ ++ iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter) != -ENOENT ? 
PTR_ERR(iter) : 0; ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ if (IS_ERR_OR_NULL(acl)) ++ return PTR_ERR(acl); ++ ++ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); ++ if (ret) ++ goto err; ++ ++ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); ++ if (IS_ERR(new)) { ++ ret = PTR_ERR(new); ++ goto err; ++ } ++ ++ new->k.p = iter->pos; ++ bch2_trans_update(trans, iter, &new->k_i, 0); ++ *new_acl = acl; ++ acl = NULL; ++err: ++ kfree(acl); ++ return ret; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ +diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h +new file mode 100644 +index 000000000000..cb62d502a7ff +--- /dev/null ++++ b/fs/bcachefs/acl.h +@@ -0,0 +1,59 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ACL_H ++#define _BCACHEFS_ACL_H ++ ++struct bch_inode_unpacked; ++struct bch_hash_info; ++struct bch_inode_info; ++struct posix_acl; ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#define BCH_ACL_VERSION 0x0001 ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++ __le32 e_id; ++} bch_acl_entry; ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++} bch_acl_entry_short; ++ ++typedef struct { ++ __le32 a_version; ++} bch_acl_header; ++ ++struct posix_acl *bch2_get_acl(struct inode *, int); ++ ++int bch2_set_acl_trans(struct btree_trans *, ++ struct bch_inode_unpacked *, ++ const struct bch_hash_info *, ++ struct posix_acl *, int); ++int bch2_set_acl(struct inode *, struct posix_acl *, int); ++int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, ++ umode_t, struct posix_acl **); ++ ++#else ++ ++static inline int bch2_set_acl_trans(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ const struct bch_hash_info *hash_info, ++ struct posix_acl *acl, int type) ++{ ++ return 0; ++} ++ ++static inline int bch2_acl_chmod(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ ++ ++#endif /* _BCACHEFS_ACL_H */ +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +new file mode 100644 +index 000000000000..9aa0b42b26b6 +--- /dev/null ++++ b/fs/bcachefs/alloc_background.c +@@ -0,0 +1,1436 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "recovery.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static const char * const bch2_alloc_field_names[] = { ++#define x(name, bytes) #name, ++ BCH_ALLOC_FIELDS() ++#undef x ++ NULL ++}; ++ ++static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); ++ ++/* Ratelimiting/PD controllers */ ++ ++static void pd_controllers_update(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(to_delayed_work(work), ++ struct bch_fs, ++ pd_controllers_update); ++ struct bch_dev *ca; ++ s64 free = 0, fragmented = 0; ++ unsigned i; ++ ++ for_each_member_device(ca, c, i) { ++ struct bch_dev_usage stats = bch2_dev_usage_read(ca); ++ ++ free += bucket_to_sector(ca, ++ __dev_buckets_free(ca, stats)) << 9; ++ /* ++ * Bytes of internal 
fragmentation, which can be ++ * reclaimed by copy GC ++ */ ++ fragmented += max_t(s64, 0, (bucket_to_sector(ca, ++ stats.buckets[BCH_DATA_user] + ++ stats.buckets[BCH_DATA_cached]) - ++ (stats.sectors[BCH_DATA_user] + ++ stats.sectors[BCH_DATA_cached])) << 9); ++ } ++ ++ bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); ++ schedule_delayed_work(&c->pd_controllers_update, ++ c->pd_controllers_update_seconds * HZ); ++} ++ ++/* Persistent alloc info: */ ++ ++static inline u64 get_alloc_field(const struct bch_alloc *a, ++ const void **p, unsigned field) ++{ ++ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; ++ u64 v; ++ ++ if (!(a->fields & (1 << field))) ++ return 0; ++ ++ switch (bytes) { ++ case 1: ++ v = *((const u8 *) *p); ++ break; ++ case 2: ++ v = le16_to_cpup(*p); ++ break; ++ case 4: ++ v = le32_to_cpup(*p); ++ break; ++ case 8: ++ v = le64_to_cpup(*p); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++ return v; ++} ++ ++static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, ++ unsigned field, u64 v) ++{ ++ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; ++ ++ if (!v) ++ return; ++ ++ a->v.fields |= 1 << field; ++ ++ switch (bytes) { ++ case 1: ++ *((u8 *) *p) = v; ++ break; ++ case 2: ++ *((__le16 *) *p) = cpu_to_le16(v); ++ break; ++ case 4: ++ *((__le32 *) *p) = cpu_to_le32(v); ++ break; ++ case 8: ++ *((__le64 *) *p) = cpu_to_le64(v); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++} ++ ++struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++{ ++ struct bkey_alloc_unpacked ret = { .gen = 0 }; ++ ++ if (k.k->type == KEY_TYPE_alloc) { ++ const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; ++ const void *d = a->data; ++ unsigned idx = 0; ++ ++ ret.gen = a->gen; ++ ++#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); ++ BCH_ALLOC_FIELDS() ++#undef x ++ } ++ return ret; ++} ++ ++void bch2_alloc_pack(struct bkey_i_alloc *dst, ++ const struct bkey_alloc_unpacked src) ++{ ++ unsigned idx = 0; ++ void *d = dst->v.data; ++ unsigned bytes; ++ ++ dst->v.fields = 0; ++ dst->v.gen = src.gen; ++ ++#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); ++ BCH_ALLOC_FIELDS() ++#undef x ++ ++ bytes = (void *) d - (void *) &dst->v; ++ set_bkey_val_bytes(&dst->k, bytes); ++ memset_u64s_tail(&dst->v, 0, bytes); ++} ++ ++static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) ++{ ++ unsigned i, bytes = offsetof(struct bch_alloc, data); ++ ++ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) ++ if (a->fields & (1 << i)) ++ bytes += BCH_ALLOC_FIELD_BYTES[i]; ++ ++ return DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ ++ if (k.k->p.inode >= c->sb.nr_devices || ++ !c->devs[k.k->p.inode]) ++ return "invalid device"; ++ ++ /* allow for unknown fields */ ++ if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ const void *d = a.v->data; ++ unsigned i; ++ ++ pr_buf(out, "gen %u", a.v->gen); ++ ++ for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) ++ if (a.v->fields & (1 << i)) ++ pr_buf(out, " %s %llu", ++ bch2_alloc_field_names[i], ++ get_alloc_field(a.v, &d, i)); ++} ++ ++static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k) ++{ ++ if (!level) ++ 
bch2_mark_key(c, k, 0, 0, NULL, 0, ++ BTREE_TRIGGER_ALLOC_READ| ++ BTREE_TRIGGER_NOATOMIC); ++ ++ return 0; ++} ++ ++int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, ++ NULL, bch2_alloc_read_fn); ++ if (ret) { ++ bch_err(c, "error reading alloc info: %i", ret); ++ return ret; ++ } ++ ++ percpu_down_write(&c->mark_lock); ++ bch2_dev_usage_from_buckets(c); ++ percpu_up_write(&c->mark_lock); ++ ++ mutex_lock(&c->bucket_clock[READ].lock); ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ bch2_recalc_oldest_io(c, ca, READ); ++ up_read(&ca->bucket_lock); ++ } ++ mutex_unlock(&c->bucket_clock[READ].lock); ++ ++ mutex_lock(&c->bucket_clock[WRITE].lock); ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ bch2_recalc_oldest_io(c, ca, WRITE); ++ up_read(&ca->bucket_lock); ++ } ++ mutex_unlock(&c->bucket_clock[WRITE].lock); ++ ++ return 0; ++} ++ ++enum alloc_write_ret { ++ ALLOC_WROTE, ++ ALLOC_NOWROTE, ++ ALLOC_END, ++}; ++ ++static int bch2_alloc_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ struct bucket_array *ba; ++ struct bucket *g; ++ struct bucket_mark m; ++ struct bkey_alloc_unpacked old_u, new_u; ++ __BKEY_PADDED(k, 8) alloc_key; /* hack: */ ++ struct bkey_i_alloc *a; ++ int ret; ++retry: ++ bch2_trans_begin(trans); ++ ++ ret = bch2_btree_key_cache_flush(trans, ++ BTREE_ID_ALLOC, iter->pos); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ old_u = bch2_alloc_unpack(k); ++ ++ if (iter->pos.inode >= c->sb.nr_devices || ++ !c->devs[iter->pos.inode]) ++ return ALLOC_END; ++ ++ percpu_down_read(&c->mark_lock); ++ ca = bch_dev_bkey_exists(c, iter->pos.inode); ++ ba = bucket_array(ca); ++ ++ if (iter->pos.offset >= ba->nbuckets) { ++ percpu_up_read(&c->mark_lock); ++ return ALLOC_END; ++ } ++ ++ g = &ba->b[iter->pos.offset]; ++ m = READ_ONCE(g->mark); ++ new_u = alloc_mem_to_key(g, m); ++ percpu_up_read(&c->mark_lock); ++ ++ if (!bkey_alloc_unpacked_cmp(old_u, new_u)) ++ return ALLOC_NOWROTE; ++ ++ a = bkey_alloc_init(&alloc_key.k); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, new_u); ++ ++ bch2_trans_update(trans, iter, &a->k_i, ++ BTREE_TRIGGER_NORUN); ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ flags); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ return ret; ++} ++ ++int bch2_alloc_write(struct bch_fs *c, unsigned flags, bool *wrote) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ for_each_rw_member(ca, c, i) { ++ unsigned first_bucket; ++ ++ percpu_down_read(&c->mark_lock); ++ first_bucket = bucket_array(ca)->first_bucket; ++ percpu_up_read(&c->mark_lock); ++ ++ bch2_btree_iter_set_pos(iter, POS(i, first_bucket)); ++ ++ while (1) { ++ bch2_trans_cond_resched(&trans); ++ ++ ret = bch2_alloc_write_key(&trans, iter, flags); ++ if (ret < 0 || ret == ALLOC_END) ++ break; ++ if (ret == ALLOC_WROTE) ++ *wrote = true; ++ bch2_btree_iter_next_slot(iter); ++ } ++ ++ if (ret < 0) { ++ 
percpu_ref_put(&ca->io_ref); ++ break; ++ } ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret < 0 ? ret : 0; ++} ++ ++/* Bucket IO clocks: */ ++ ++static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket *g; ++ u16 max_last_io = 0; ++ unsigned i; ++ ++ lockdep_assert_held(&c->bucket_clock[rw].lock); ++ ++ /* Recalculate max_last_io for this device: */ ++ for_each_bucket(g, buckets) ++ max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); ++ ++ ca->max_last_bucket_io[rw] = max_last_io; ++ ++ /* Recalculate global max_last_io: */ ++ max_last_io = 0; ++ ++ for_each_member_device(ca, c, i) ++ max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); ++ ++ clock->max_last_io = max_last_io; ++} ++ ++static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ struct bucket_array *buckets; ++ struct bch_dev *ca; ++ struct bucket *g; ++ unsigned i; ++ ++ trace_rescale_prios(c); ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->io_time[rw] = clock->hand - ++ bucket_last_io(c, g, rw) / 2; ++ ++ bch2_recalc_oldest_io(c, ca, rw); ++ ++ up_read(&ca->bucket_lock); ++ } ++} ++ ++static inline u64 bucket_clock_freq(u64 capacity) ++{ ++ return max(capacity >> 10, 2028ULL); ++} ++ ++static void bch2_inc_clock_hand(struct io_timer *timer) ++{ ++ struct bucket_clock *clock = container_of(timer, ++ struct bucket_clock, rescale); ++ struct bch_fs *c = container_of(clock, ++ struct bch_fs, bucket_clock[clock->rw]); ++ struct bch_dev *ca; ++ u64 capacity; ++ unsigned i; ++ ++ mutex_lock(&clock->lock); ++ ++ /* if clock cannot be advanced more, rescale prio */ ++ if (clock->max_last_io >= U16_MAX - 2) ++ bch2_rescale_bucket_io_times(c, clock->rw); ++ ++ BUG_ON(clock->max_last_io >= U16_MAX - 2); ++ ++ for_each_member_device(ca, c, i) ++ ca->max_last_bucket_io[clock->rw]++; ++ clock->max_last_io++; ++ clock->hand++; ++ ++ mutex_unlock(&clock->lock); ++ ++ capacity = READ_ONCE(c->capacity); ++ ++ if (!capacity) ++ return; ++ ++ /* ++ * we only increment when 0.1% of the filesystem capacity has been read ++ * or written too, this determines if it's time ++ * ++ * XXX: we shouldn't really be going off of the capacity of devices in ++ * RW mode (that will be 0 when we're RO, yet we can still service ++ * reads) ++ */ ++ timer->expire += bucket_clock_freq(capacity); ++ ++ bch2_io_timer_add(&c->io_clock[clock->rw], timer); ++} ++ ++static void bch2_bucket_clock_init(struct bch_fs *c, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ ++ clock->hand = 1; ++ clock->rw = rw; ++ clock->rescale.fn = bch2_inc_clock_hand; ++ clock->rescale.expire = bucket_clock_freq(c->capacity); ++ mutex_init(&clock->lock); ++} ++ ++/* Background allocator thread: */ ++ ++/* ++ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens ++ * (marking them as invalidated on disk), then optionally issues discard ++ * commands to the newly free buckets, then puts them on the various freelists. ++ */ ++ ++#define BUCKET_GC_GEN_MAX 96U ++ ++/** ++ * wait_buckets_available - wait on reclaimable buckets ++ * ++ * If there aren't enough available buckets to fill up free_inc, wait until ++ * there are. 
++ */ ++static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned long gc_count = c->gc_count; ++ u64 available; ++ int ret = 0; ++ ++ ca->allocator_state = ALLOCATOR_BLOCKED; ++ closure_wake_up(&c->freelist_wait); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread_should_stop()) { ++ ret = 1; ++ break; ++ } ++ ++ if (gc_count != c->gc_count) ++ ca->inc_gen_really_needs_gc = 0; ++ ++ available = max_t(s64, 0, dev_buckets_available(ca) - ++ ca->inc_gen_really_needs_gc); ++ ++ if (available > fifo_free(&ca->free_inc) || ++ (available && ++ (!fifo_full(&ca->free[RESERVE_BTREE]) || ++ !fifo_full(&ca->free[RESERVE_MOVINGGC])))) ++ break; ++ ++ up_read(&c->gc_lock); ++ schedule(); ++ try_to_freeze(); ++ down_read(&c->gc_lock); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ closure_wake_up(&c->freelist_wait); ++ ++ return ret; ++} ++ ++static bool bch2_can_invalidate_bucket(struct bch_dev *ca, ++ size_t bucket, ++ struct bucket_mark mark) ++{ ++ u8 gc_gen; ++ ++ if (!is_available_bucket(mark)) ++ return false; ++ ++ if (ca->buckets_nouse && ++ test_bit(bucket, ca->buckets_nouse)) ++ return false; ++ ++ gc_gen = bucket_gc_gen(ca, bucket); ++ ++ if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ++ ca->inc_gen_needs_gc++; ++ ++ if (gc_gen >= BUCKET_GC_GEN_MAX) ++ ca->inc_gen_really_needs_gc++; ++ ++ return gc_gen < BUCKET_GC_GEN_MAX; ++} ++ ++/* ++ * Determines what order we're going to reuse buckets, smallest bucket_key() ++ * first. ++ * ++ * ++ * - We take into account the read prio of the bucket, which gives us an ++ * indication of how hot the data is -- we scale the prio so that the prio ++ * farthest from the clock is worth 1/8th of the closest. ++ * ++ * - The number of sectors of cached data in the bucket, which gives us an ++ * indication of the cost in cache misses this eviction will cause. ++ * ++ * - If hotness * sectors used compares equal, we pick the bucket with the ++ * smallest bucket_gc_gen() - since incrementing the same bucket's generation ++ * number repeatedly forces us to run mark and sweep gc to avoid generation ++ * number wraparound. 
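A quick worked example of this ordering, with hypothetical numbers plugged into the bucket_sort_key() computation that follows: if max_last_io is 800 and a bucket was last read at last_io = 200, hotness = (800 - 200) * 7 / 800 = 5; if that bucket holds 32 cached sectors, data_wantness = (5 + 1) * 32 = 192, and the sort key becomes

	(192 << 9) | (needs_journal_commit << 8) | (bucket_gc_gen(ca, b) / 16)

so colder and emptier buckets, which produce smaller keys, are invalidated first.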
++ */ ++ ++static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark m) ++{ ++ unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); ++ unsigned max_last_io = ca->max_last_bucket_io[READ]; ++ ++ /* ++ * Time since last read, scaled to [0, 8) where larger value indicates ++ * more recently read data: ++ */ ++ unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; ++ ++ /* How much we want to keep the data in this bucket: */ ++ unsigned long data_wantness = ++ (hotness + 1) * bucket_sectors_used(m); ++ ++ unsigned long needs_journal_commit = ++ bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); ++ ++ return (data_wantness << 9) | ++ (needs_journal_commit << 8) | ++ (bucket_gc_gen(ca, b) / 16); ++} ++ ++static inline int bucket_alloc_cmp(alloc_heap *h, ++ struct alloc_heap_entry l, ++ struct alloc_heap_entry r) ++{ ++ return cmp_int(l.key, r.key) ?: ++ cmp_int(r.nr, l.nr) ?: ++ cmp_int(l.bucket, r.bucket); ++} ++ ++static inline int bucket_idx_cmp(const void *_l, const void *_r) ++{ ++ const struct alloc_heap_entry *l = _l, *r = _r; ++ ++ return cmp_int(l->bucket, r->bucket); ++} ++ ++static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ struct alloc_heap_entry e = { 0 }; ++ size_t b, i, nr = 0; ++ ++ ca->alloc_heap.used = 0; ++ ++ mutex_lock(&c->bucket_clock[READ].lock); ++ down_read(&ca->bucket_lock); ++ ++ buckets = bucket_array(ca); ++ ++ bch2_recalc_oldest_io(c, ca, READ); ++ ++ /* ++ * Find buckets with lowest read priority, by building a maxheap sorted ++ * by read priority and repeatedly replacing the maximum element until ++ * all buckets have been visited. ++ */ ++ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ unsigned long key = bucket_sort_key(c, ca, b, m); ++ ++ if (!bch2_can_invalidate_bucket(ca, b, m)) ++ continue; ++ ++ if (e.nr && e.bucket + e.nr == b && e.key == key) { ++ e.nr++; ++ } else { ++ if (e.nr) ++ heap_add_or_replace(&ca->alloc_heap, e, ++ -bucket_alloc_cmp, NULL); ++ ++ e = (struct alloc_heap_entry) { ++ .bucket = b, ++ .nr = 1, ++ .key = key, ++ }; ++ } ++ ++ cond_resched(); ++ } ++ ++ if (e.nr) ++ heap_add_or_replace(&ca->alloc_heap, e, ++ -bucket_alloc_cmp, NULL); ++ ++ for (i = 0; i < ca->alloc_heap.used; i++) ++ nr += ca->alloc_heap.data[i].nr; ++ ++ while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { ++ nr -= ca->alloc_heap.data[0].nr; ++ heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); ++ } ++ ++ up_read(&ca->bucket_lock); ++ mutex_unlock(&c->bucket_clock[READ].lock); ++} ++ ++static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket_mark m; ++ size_t b, start; ++ ++ if (ca->fifo_last_bucket < ca->mi.first_bucket || ++ ca->fifo_last_bucket >= ca->mi.nbuckets) ++ ca->fifo_last_bucket = ca->mi.first_bucket; ++ ++ start = ca->fifo_last_bucket; ++ ++ do { ++ ca->fifo_last_bucket++; ++ if (ca->fifo_last_bucket == ca->mi.nbuckets) ++ ca->fifo_last_bucket = ca->mi.first_bucket; ++ ++ b = ca->fifo_last_bucket; ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (bch2_can_invalidate_bucket(ca, b, m)) { ++ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; ++ ++ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ if (heap_full(&ca->alloc_heap)) ++ break; ++ } ++ ++ cond_resched(); ++ } while (ca->fifo_last_bucket != start); ++} ++ ++static 
void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket_mark m; ++ size_t checked, i; ++ ++ for (checked = 0; ++ checked < ca->mi.nbuckets / 2; ++ checked++) { ++ size_t b = bch2_rand_range(ca->mi.nbuckets - ++ ca->mi.first_bucket) + ++ ca->mi.first_bucket; ++ ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (bch2_can_invalidate_bucket(ca, b, m)) { ++ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; ++ ++ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ if (heap_full(&ca->alloc_heap)) ++ break; ++ } ++ ++ cond_resched(); ++ } ++ ++ sort(ca->alloc_heap.data, ++ ca->alloc_heap.used, ++ sizeof(ca->alloc_heap.data[0]), ++ bucket_idx_cmp, NULL); ++ ++ /* remove duplicates: */ ++ for (i = 0; i + 1 < ca->alloc_heap.used; i++) ++ if (ca->alloc_heap.data[i].bucket == ++ ca->alloc_heap.data[i + 1].bucket) ++ ca->alloc_heap.data[i].nr = 0; ++} ++ ++static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ size_t i, nr = 0; ++ ++ ca->inc_gen_needs_gc = 0; ++ ++ switch (ca->mi.replacement) { ++ case CACHE_REPLACEMENT_LRU: ++ find_reclaimable_buckets_lru(c, ca); ++ break; ++ case CACHE_REPLACEMENT_FIFO: ++ find_reclaimable_buckets_fifo(c, ca); ++ break; ++ case CACHE_REPLACEMENT_RANDOM: ++ find_reclaimable_buckets_random(c, ca); ++ break; ++ } ++ ++ heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); ++ ++ for (i = 0; i < ca->alloc_heap.used; i++) ++ nr += ca->alloc_heap.data[i].nr; ++ ++ return nr; ++} ++ ++static inline long next_alloc_bucket(struct bch_dev *ca) ++{ ++ struct alloc_heap_entry e, *top = ca->alloc_heap.data; ++ ++ while (ca->alloc_heap.used) { ++ if (top->nr) { ++ size_t b = top->bucket; ++ ++ top->bucket++; ++ top->nr--; ++ return b; ++ } ++ ++ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ } ++ ++ return -1; ++} ++ ++/* ++ * returns sequence number of most recent journal entry that updated this ++ * bucket: ++ */ ++static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) ++{ ++ if (m.journal_seq_valid) { ++ u64 journal_seq = atomic64_read(&c->journal.seq); ++ u64 bucket_seq = journal_seq; ++ ++ bucket_seq &= ~((u64) U16_MAX); ++ bucket_seq |= m.journal_seq; ++ ++ if (bucket_seq > journal_seq) ++ bucket_seq -= 1 << 16; ++ ++ return bucket_seq; ++ } else { ++ return 0; ++ } ++} ++ ++static int bch2_invalidate_one_bucket2(struct btree_trans *trans, ++ struct bch_dev *ca, ++ struct btree_iter *iter, ++ u64 *journal_seq, unsigned flags) ++{ ++#if 0 ++ __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; ++#else ++ /* hack: */ ++ __BKEY_PADDED(k, 8) alloc_key; ++#endif ++ struct bch_fs *c = trans->c; ++ struct bkey_i_alloc *a; ++ struct bkey_alloc_unpacked u; ++ struct bucket *g; ++ struct bucket_mark m; ++ bool invalidating_cached_data; ++ size_t b; ++ int ret = 0; ++ ++ BUG_ON(!ca->alloc_heap.used || ++ !ca->alloc_heap.data[0].nr); ++ b = ca->alloc_heap.data[0].bucket; ++ ++ /* first, put on free_inc and mark as owned by allocator: */ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->freelist_lock); ++ ++ verify_not_on_freelist(c, ca, b); ++ ++ BUG_ON(!fifo_push(&ca->free_inc, b)); ++ ++ g = bucket(ca, b); ++ m = READ_ONCE(g->mark); ++ ++ invalidating_cached_data = m.cached_sectors != 0; ++ ++ /* ++ * If we're not invalidating cached data, we only increment the bucket ++ * gen in memory here, the incremented gen will be updated in the btree ++ * by bch2_trans_mark_pointer(): ++ */ ++ ++ if (!invalidating_cached_data) ++ 
bch2_invalidate_bucket(c, ca, b, &m); ++ else ++ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); ++ ++ spin_unlock(&c->freelist_lock); ++ percpu_up_read(&c->mark_lock); ++ ++ if (!invalidating_cached_data) ++ goto out; ++ ++ /* ++ * If the read-only path is trying to shut down, we can't be generating ++ * new btree updates: ++ */ ++ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { ++ ret = 1; ++ goto out; ++ } ++ ++ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); ++ ++ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); ++retry: ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, iter->pos.offset); ++ m = READ_ONCE(g->mark); ++ u = alloc_mem_to_key(g, m); ++ ++ percpu_up_read(&c->mark_lock); ++ ++ invalidating_cached_data = u.cached_sectors != 0; ++ ++ u.gen++; ++ u.data_type = 0; ++ u.dirty_sectors = 0; ++ u.cached_sectors = 0; ++ u.read_time = c->bucket_clock[READ].hand; ++ u.write_time = c->bucket_clock[WRITE].hand; ++ ++ a = bkey_alloc_init(&alloc_key.k); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, u); ++ ++ bch2_trans_update(trans, iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE); ++ ++ /* ++ * XXX: ++ * when using deferred btree updates, we have journal reclaim doing ++ * btree updates and thus requiring the allocator to make forward ++ * progress, and here the allocator is requiring space in the journal - ++ * so we need a journal pre-reservation: ++ */ ++ ret = bch2_trans_commit(trans, NULL, ++ invalidating_cached_data ? journal_seq : NULL, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ flags); ++ if (ret == -EINTR) ++ goto retry; ++out: ++ if (!ret) { ++ /* remove from alloc_heap: */ ++ struct alloc_heap_entry e, *top = ca->alloc_heap.data; ++ ++ top->bucket++; ++ top->nr--; ++ ++ if (!top->nr) ++ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ ++ /* ++ * Make sure we flush the last journal entry that updated this ++ * bucket (i.e. deleting the last reference) before writing to ++ * this bucket again: ++ */ ++ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); ++ } else { ++ size_t b2; ++ ++ /* remove from free_inc: */ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->freelist_lock); ++ ++ bch2_mark_alloc_bucket(c, ca, b, false, ++ gc_pos_alloc(c, NULL), 0); ++ ++ BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); ++ BUG_ON(b != b2); ++ ++ spin_unlock(&c->freelist_lock); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ return ret < 0 ? ret : 0; ++} ++ ++/* ++ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: ++ */ ++static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ u64 journal_seq = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, 0), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ++ /* Only use nowait if we've already invalidated at least one bucket: */ ++ while (!ret && ++ !fifo_full(&ca->free_inc) && ++ ca->alloc_heap.used) ++ ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, ++ BTREE_INSERT_GC_LOCK_HELD| ++ (!fifo_empty(&ca->free_inc) ++ ? 
BTREE_INSERT_NOWAIT : 0)); ++ ++ bch2_trans_exit(&trans); ++ ++ /* If we used NOWAIT, don't return the error: */ ++ if (!fifo_empty(&ca->free_inc)) ++ ret = 0; ++ if (ret) { ++ bch_err(ca, "error invalidating buckets: %i", ret); ++ return ret; ++ } ++ ++ if (journal_seq) ++ ret = bch2_journal_flush_seq(&c->journal, journal_seq); ++ if (ret) { ++ bch_err(ca, "journal error: %i", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) ++{ ++ unsigned i; ++ int ret = 0; ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) { ++ ++ /* ++ * Don't strand buckets on the copygc freelist until ++ * after recovery is finished: ++ */ ++ if (!test_bit(BCH_FS_STARTED, &c->flags) && ++ i == RESERVE_MOVINGGC) ++ continue; ++ ++ if (fifo_push(&ca->free[i], bucket)) { ++ fifo_pop(&ca->free_inc, bucket); ++ ++ closure_wake_up(&c->freelist_wait); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ ++ spin_unlock(&c->freelist_lock); ++ goto out; ++ } ++ } ++ ++ if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { ++ ca->allocator_state = ALLOCATOR_BLOCKED_FULL; ++ closure_wake_up(&c->freelist_wait); ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ if ((current->flags & PF_KTHREAD) && ++ kthread_should_stop()) { ++ ret = 1; ++ break; ++ } ++ ++ schedule(); ++ try_to_freeze(); ++ } ++out: ++ __set_current_state(TASK_RUNNING); ++ return ret; ++} ++ ++/* ++ * Pulls buckets off free_inc, discards them (if enabled), then adds them to ++ * freelists, waiting until there's room if necessary: ++ */ ++static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ while (!fifo_empty(&ca->free_inc)) { ++ size_t bucket = fifo_peek(&ca->free_inc); ++ ++ if (ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, bucket), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++ ++ if (push_invalidated_bucket(c, ca, bucket)) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/** ++ * bch_allocator_thread - move buckets from free_inc to reserves ++ * ++ * The free_inc FIFO is populated by find_reclaimable_buckets(), and ++ * the reserves are depleted by bucket allocation. When we run out ++ * of free_inc, try to invalidate some buckets and write out ++ * prios and gens. 
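Condensed, the main loop that follows amounts to the sketch below (a paraphrase only; locking, error handling, freezing and the gc kick are left out):

	while (!kthread_should_stop()) {
		discard_invalidated_buckets(c, ca);	/* free_inc -> reserves, optionally discarding */
		bch2_invalidate_buckets(c, ca);		/* alloc_heap -> free_inc, via btree updates */
		while (!find_reclaimable_buckets(c, ca))
			wait_buckets_available(c, ca);	/* refill alloc_heap, or sleep for buckets */
	}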
++ */ ++static int bch2_allocator_thread(void *arg) ++{ ++ struct bch_dev *ca = arg; ++ struct bch_fs *c = ca->fs; ++ size_t nr; ++ int ret; ++ ++ set_freezable(); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ ++ while (1) { ++ cond_resched(); ++ if (kthread_should_stop()) ++ break; ++ ++ pr_debug("discarding %zu invalidated buckets", ++ fifo_used(&ca->free_inc)); ++ ++ ret = discard_invalidated_buckets(c, ca); ++ if (ret) ++ goto stop; ++ ++ down_read(&c->gc_lock); ++ ++ ret = bch2_invalidate_buckets(c, ca); ++ if (ret) { ++ up_read(&c->gc_lock); ++ goto stop; ++ } ++ ++ if (!fifo_empty(&ca->free_inc)) { ++ up_read(&c->gc_lock); ++ continue; ++ } ++ ++ pr_debug("free_inc now empty"); ++ ++ do { ++ /* ++ * Find some buckets that we can invalidate, either ++ * they're completely unused, or only contain clean data ++ * that's been written back to the backing device or ++ * another cache tier ++ */ ++ ++ pr_debug("scanning for reclaimable buckets"); ++ ++ nr = find_reclaimable_buckets(c, ca); ++ ++ pr_debug("found %zu buckets", nr); ++ ++ trace_alloc_batch(ca, nr, ca->alloc_heap.size); ++ ++ if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || ++ ca->inc_gen_really_needs_gc) && ++ c->gc_thread) { ++ atomic_inc(&c->kick_gc); ++ wake_up_process(c->gc_thread); ++ } ++ ++ /* ++ * If we found any buckets, we have to invalidate them ++ * before we scan for more - but if we didn't find very ++ * many we may want to wait on more buckets being ++ * available so we don't spin: ++ */ ++ if (!nr || ++ (nr < ALLOC_SCAN_BATCH(ca) && ++ !fifo_empty(&ca->free[RESERVE_NONE]))) { ++ ret = wait_buckets_available(c, ca); ++ if (ret) { ++ up_read(&c->gc_lock); ++ goto stop; ++ } ++ } ++ } while (!nr); ++ ++ up_read(&c->gc_lock); ++ ++ pr_debug("%zu buckets to invalidate", nr); ++ ++ /* ++ * alloc_heap is now full of newly-invalidated buckets: next, ++ * write out the new bucket gens: ++ */ ++ } ++ ++stop: ++ pr_debug("alloc thread stopping (ret %i)", ret); ++ ca->allocator_state = ALLOCATOR_STOPPED; ++ closure_wake_up(&c->freelist_wait); ++ return 0; ++} ++ ++/* Startup/shutdown (ro/rw): */ ++ ++void bch2_recalc_capacity(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; ++ unsigned bucket_size_max = 0; ++ unsigned long ra_pages = 0; ++ unsigned i, j; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ for_each_online_member(ca, c, i) { ++ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; ++ ++ ra_pages += bdi->ra_pages; ++ } ++ ++ bch2_set_ra_pages(c, ra_pages); ++ ++ for_each_rw_member(ca, c, i) { ++ u64 dev_reserve = 0; ++ ++ /* ++ * We need to reserve buckets (from the number ++ * of currently available buckets) against ++ * foreground writes so that mainly copygc can ++ * make forward progress. ++ * ++ * We need enough to refill the various reserves ++ * from scratch - copygc will use its entire ++ * reserve all at once, then run against when ++ * its reserve is refilled (from the formerly ++ * available buckets). ++ * ++ * This reserve is just used when considering if ++ * allocations for foreground writes must wait - ++ * not -ENOSPC calculations. 
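To put rough, purely illustrative numbers on the arithmetic below: a device whose RESERVE_BTREE and RESERVE_MOVINGGC freelists are sized for 4 and 8 buckets gets dev_reserve = 4 + 8 + 3 write points = 15 buckets; with 1 MiB buckets (2048 sectors) that is

	dev_reserve      = (4 + 8 + 3) * 2048 = 30720 sectors
	reserved_sectors += 2 * 30720         = 61440 sectors (~30 MiB for this device)

and after summing over all rw devices, reserved_sectors is raised to at least gc_reserve (gc_reserve_percent of total capacity, or gc_reserve_bytes if set), clamped to the capacity, and subtracted from it to give c->capacity.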
++ */ ++ for (j = 0; j < RESERVE_NONE; j++) ++ dev_reserve += ca->free[j].size; ++ ++ dev_reserve += 1; /* btree write point */ ++ dev_reserve += 1; /* copygc write point */ ++ dev_reserve += 1; /* rebalance write point */ ++ ++ dev_reserve *= ca->mi.bucket_size; ++ ++ copygc_threshold += dev_reserve; ++ ++ capacity += bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket); ++ ++ reserved_sectors += dev_reserve * 2; ++ ++ bucket_size_max = max_t(unsigned, bucket_size_max, ++ ca->mi.bucket_size); ++ } ++ ++ gc_reserve = c->opts.gc_reserve_bytes ++ ? c->opts.gc_reserve_bytes >> 9 ++ : div64_u64(capacity * c->opts.gc_reserve_percent, 100); ++ ++ reserved_sectors = max(gc_reserve, reserved_sectors); ++ ++ reserved_sectors = min(reserved_sectors, capacity); ++ ++ c->copygc_threshold = copygc_threshold; ++ c->capacity = capacity - reserved_sectors; ++ ++ c->bucket_size_max = bucket_size_max; ++ ++ if (c->capacity) { ++ bch2_io_timer_add(&c->io_clock[READ], ++ &c->bucket_clock[READ].rescale); ++ bch2_io_timer_add(&c->io_clock[WRITE], ++ &c->bucket_clock[WRITE].rescale); ++ } else { ++ bch2_io_timer_del(&c->io_clock[READ], ++ &c->bucket_clock[READ].rescale); ++ bch2_io_timer_del(&c->io_clock[WRITE], ++ &c->bucket_clock[WRITE].rescale); ++ } ++ ++ /* Wake up in case someone was waiting for buckets */ ++ closure_wake_up(&c->freelist_wait); ++} ++ ++static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct open_bucket *ob; ++ bool ret = false; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list && ++ ob->ptr.dev == ca->dev_idx) ++ ret = true; ++ spin_unlock(&ob->lock); ++ } ++ ++ return ret; ++} ++ ++/* device goes ro: */ ++void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ BUG_ON(ca->alloc_thread); ++ ++ /* First, remove device from allocation groups: */ ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ clear_bit(ca->dev_idx, c->rw_devs[i].d); ++ ++ /* ++ * Capacity is calculated based off of devices in allocation groups: ++ */ ++ bch2_recalc_capacity(c); ++ ++ /* Next, close write points that point to this device... 
*/ ++ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) ++ bch2_writepoint_stop(c, ca, &c->write_points[i]); ++ ++ bch2_writepoint_stop(c, ca, &c->copygc_write_point); ++ bch2_writepoint_stop(c, ca, &c->rebalance_write_point); ++ bch2_writepoint_stop(c, ca, &c->btree_write_point); ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ while (c->btree_reserve_cache_nr) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ bch2_open_buckets_put(c, &a->ob); ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++ while (1) { ++ struct open_bucket *ob; ++ ++ spin_lock(&c->freelist_lock); ++ if (!ca->open_buckets_partial_nr) { ++ spin_unlock(&c->freelist_lock); ++ break; ++ } ++ ob = c->open_buckets + ++ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; ++ ob->on_partial_list = false; ++ spin_unlock(&c->freelist_lock); ++ ++ bch2_open_bucket_put(c, ob); ++ } ++ ++ bch2_ec_stop_dev(c, ca); ++ ++ /* ++ * Wake up threads that were blocked on allocation, so they can notice ++ * the device can no longer be removed and the capacity has changed: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ ++ /* ++ * journal_res_get() can block waiting for free space in the journal - ++ * it needs to notice there may not be devices to allocate from anymore: ++ */ ++ wake_up(&c->journal.wait); ++ ++ /* Now wait for any in flight writes: */ ++ ++ closure_wait_event(&c->open_buckets_wait, ++ !bch2_dev_has_open_write_point(c, ca)); ++} ++ ++/* device goes rw: */ ++void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ if (ca->mi.data_allowed & (1 << i)) ++ set_bit(ca->dev_idx, c->rw_devs[i].d); ++} ++ ++void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) ++{ ++ if (ca->alloc_thread) ++ closure_wait_event(&c->freelist_wait, ++ ca->allocator_state != ALLOCATOR_RUNNING); ++} ++ ++/* stop allocator thread: */ ++void bch2_dev_allocator_stop(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ p = rcu_dereference_protected(ca->alloc_thread, 1); ++ ca->alloc_thread = NULL; ++ ++ /* ++ * We need an rcu barrier between setting ca->alloc_thread = NULL and ++ * the thread shutting down to avoid bch2_wake_allocator() racing: ++ * ++ * XXX: it would be better to have the rcu barrier be asynchronous ++ * instead of blocking us here ++ */ ++ synchronize_rcu(); ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++/* start allocator thread: */ ++int bch2_dev_allocator_start(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ /* ++ * allocator thread already started? 
++ */ ++ if (ca->alloc_thread) ++ return 0; ++ ++ p = kthread_create(bch2_allocator_thread, ca, ++ "bch_alloc[%s]", ca->name); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ rcu_assign_pointer(ca->alloc_thread, p); ++ wake_up_process(p); ++ return 0; ++} ++ ++void bch2_fs_allocator_background_init(struct bch_fs *c) ++{ ++ spin_lock_init(&c->freelist_lock); ++ bch2_bucket_clock_init(c, READ); ++ bch2_bucket_clock_init(c, WRITE); ++ ++ c->pd_controllers_update_seconds = 5; ++ INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); ++} +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +new file mode 100644 +index 000000000000..f6b9f27f0713 +--- /dev/null ++++ b/fs/bcachefs/alloc_background.h +@@ -0,0 +1,97 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_BACKGROUND_H ++#define _BCACHEFS_ALLOC_BACKGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++#include "debug.h" ++ ++struct bkey_alloc_unpacked { ++ u8 gen; ++#define x(_name, _bits) u##_bits _name; ++ BCH_ALLOC_FIELDS() ++#undef x ++}; ++ ++/* returns true if not equal */ ++static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, ++ struct bkey_alloc_unpacked r) ++{ ++ return l.gen != r.gen ++#define x(_name, _bits) || l._name != r._name ++ BCH_ALLOC_FIELDS() ++#undef x ++ ; ++} ++ ++struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); ++void bch2_alloc_pack(struct bkey_i_alloc *, ++ const struct bkey_alloc_unpacked); ++ ++static inline struct bkey_alloc_unpacked ++alloc_mem_to_key(struct bucket *g, struct bucket_mark m) ++{ ++ return (struct bkey_alloc_unpacked) { ++ .gen = m.gen, ++ .oldest_gen = g->oldest_gen, ++ .data_type = m.data_type, ++ .dirty_sectors = m.dirty_sectors, ++ .cached_sectors = m.cached_sectors, ++ .read_time = g->io_time[READ], ++ .write_time = g->io_time[WRITE], ++ }; ++} ++ ++#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) ++ ++const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_alloc (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++} ++ ++struct journal_keys; ++int bch2_alloc_read(struct bch_fs *, struct journal_keys *); ++ ++static inline void bch2_wake_allocator(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = rcu_dereference(ca->alloc_thread); ++ if (p) ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, ++ size_t bucket) ++{ ++ if (expensive_debug_checks(c)) { ++ size_t iter; ++ long i; ++ unsigned j; ++ ++ for (j = 0; j < RESERVE_NR; j++) ++ fifo_for_each_entry(i, &ca->free[j], iter) ++ BUG_ON(i == bucket); ++ fifo_for_each_entry(i, &ca->free_inc, iter) ++ BUG_ON(i == bucket); ++ } ++} ++ ++void bch2_recalc_capacity(struct bch_fs *); ++ ++void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); ++void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); ++ ++void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); ++void bch2_dev_allocator_stop(struct bch_dev *); ++int bch2_dev_allocator_start(struct bch_dev *); ++ ++int bch2_alloc_write(struct bch_fs *, unsigned, bool *); ++void bch2_fs_allocator_background_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/alloc_foreground.c 
b/fs/bcachefs/alloc_foreground.c +new file mode 100644 +index 000000000000..4a048828869b +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.c +@@ -0,0 +1,992 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Primary bucket allocation code ++ * ++ * Copyright 2012 Google, Inc. ++ * ++ * Allocation in bcache is done in terms of buckets: ++ * ++ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in ++ * btree pointers - they must match for the pointer to be considered valid. ++ * ++ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a ++ * bucket simply by incrementing its gen. ++ * ++ * The gens (along with the priorities; it's really the gens are important but ++ * the code is named as if it's the priorities) are written in an arbitrary list ++ * of buckets on disk, with a pointer to them in the journal header. ++ * ++ * When we invalidate a bucket, we have to write its new gen to disk and wait ++ * for that write to complete before we use it - otherwise after a crash we ++ * could have pointers that appeared to be good but pointed to data that had ++ * been overwritten. ++ * ++ * Since the gens and priorities are all stored contiguously on disk, we can ++ * batch this up: We fill up the free_inc list with freshly invalidated buckets, ++ * call prio_write(), and when prio_write() finishes we pull buckets off the ++ * free_inc list and optionally discard them. ++ * ++ * free_inc isn't the only freelist - if it was, we'd often have to sleep while ++ * priorities and gens were being written before we could allocate. c->free is a ++ * smaller freelist, and buckets on that list are always ready to be used. ++ * ++ * If we've got discards enabled, that happens when a bucket moves from the ++ * free_inc list to the free list. ++ * ++ * It's important to ensure that gens don't wrap around - with respect to ++ * either the oldest gen in the btree or the gen on disk. This is quite ++ * difficult to do in practice, but we explicitly guard against it anyways - if ++ * a bucket is in danger of wrapping around we simply skip invalidating it that ++ * time around, and we garbage collect or rewrite the priorities sooner than we ++ * would have otherwise. ++ * ++ * bch2_bucket_alloc() allocates a single bucket from a specific device. ++ * ++ * bch2_bucket_alloc_set() allocates one or more buckets from different devices ++ * in a given filesystem. ++ * ++ * invalidate_buckets() drives all the processes described above. It's called ++ * from bch2_bucket_alloc() and a few other places that need to make sure free ++ * buckets are ready. ++ * ++ * invalidate_buckets_(lru|fifo)() find buckets that are available to be ++ * invalidated, and then invalidate them and stick them on the free_inc list - ++ * in either lru or fifo order. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "io.h" ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Open buckets represent a bucket that's currently being allocated from. 
They ++ * serve two purposes: ++ * ++ * - They track buckets that have been partially allocated, allowing for ++ * sub-bucket sized allocations - they're used by the sector allocator below ++ * ++ * - They provide a reference to the buckets they own that mark and sweep GC ++ * can find, until the new allocation has a pointer to it inserted into the ++ * btree ++ * ++ * When allocating some space with the sector allocator, the allocation comes ++ * with a reference to an open bucket - the caller is required to put that ++ * reference _after_ doing the index update that makes its allocation reachable. ++ */ ++ ++void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ if (ob->ec) { ++ bch2_ec_bucket_written(c, ob); ++ return; ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&ob->lock); ++ ++ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), ++ false, gc_pos_alloc(c, ob), 0); ++ ob->valid = false; ++ ob->type = 0; ++ ++ spin_unlock(&ob->lock); ++ percpu_up_read(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ c->open_buckets_nr_free++; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *c, ++ struct open_buckets *obs, ++ unsigned dev) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->ptr.dev == dev && ++ ob->ec) ++ bch2_ec_bucket_cancel(c, ob); ++} ++ ++static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); ++ ++ ob = c->open_buckets + c->open_buckets_freelist; ++ c->open_buckets_freelist = ob->freelist; ++ atomic_set(&ob->pin, 1); ++ ob->type = 0; ++ ++ c->open_buckets_nr_free--; ++ return ob; ++} ++ ++static void open_bucket_free_unused(struct bch_fs *c, ++ struct write_point *wp, ++ struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ bool may_realloc = wp->type == BCH_DATA_user; ++ ++ BUG_ON(ca->open_buckets_partial_nr > ++ ARRAY_SIZE(ca->open_buckets_partial)); ++ ++ if (ca->open_buckets_partial_nr < ++ ARRAY_SIZE(ca->open_buckets_partial) && ++ may_realloc) { ++ spin_lock(&c->freelist_lock); ++ ob->on_partial_list = true; ++ ca->open_buckets_partial[ca->open_buckets_partial_nr++] = ++ ob - c->open_buckets; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++ closure_wake_up(&c->freelist_wait); ++ } else { ++ bch2_open_bucket_put(c, ob); ++ } ++} ++ ++static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ BUG_ON(ptr_stale(ca, &ob->ptr)); ++ } ++#endif ++} ++ ++/* _only_ for allocating the journal on a new device: */ ++long bch2_bucket_alloc_new_fs(struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ ssize_t b; ++ ++ rcu_read_lock(); ++ buckets = bucket_array(ca); ++ ++ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) ++ if (is_available_bucket(buckets->b[b].mark)) ++ goto success; ++ b = -1; ++success: ++ rcu_read_unlock(); ++ return b; ++} ++ ++static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) ++{ ++ switch (reserve) { ++ case 
RESERVE_ALLOC: ++ return 0; ++ case RESERVE_BTREE: ++ return OPEN_BUCKETS_COUNT / 4; ++ default: ++ return OPEN_BUCKETS_COUNT / 2; ++ } ++} ++ ++/** ++ * bch_bucket_alloc - allocate a single bucket from a specific device ++ * ++ * Returns index of bucket on success, 0 on failure ++ * */ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct bucket_array *buckets; ++ struct open_bucket *ob; ++ long bucket = 0; ++ ++ spin_lock(&c->freelist_lock); ++ ++ if (may_alloc_partial) { ++ int i; ++ ++ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { ++ ob = c->open_buckets + ca->open_buckets_partial[i]; ++ ++ if (reserve <= ob->alloc_reserve) { ++ array_remove_item(ca->open_buckets_partial, ++ ca->open_buckets_partial_nr, ++ i); ++ ob->on_partial_list = false; ++ ob->alloc_reserve = reserve; ++ spin_unlock(&c->freelist_lock); ++ return ob; ++ } ++ } ++ } ++ ++ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { ++ if (cl) ++ closure_wait(&c->open_buckets_wait, cl); ++ ++ if (!c->blocked_allocate_open_bucket) ++ c->blocked_allocate_open_bucket = local_clock(); ++ ++ spin_unlock(&c->freelist_lock); ++ trace_open_bucket_alloc_fail(ca, reserve); ++ return ERR_PTR(-OPEN_BUCKETS_EMPTY); ++ } ++ ++ if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) ++ goto out; ++ ++ switch (reserve) { ++ case RESERVE_ALLOC: ++ if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) ++ goto out; ++ break; ++ case RESERVE_BTREE: ++ if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= ++ ca->free[RESERVE_BTREE].size && ++ fifo_pop(&ca->free[RESERVE_BTREE], bucket)) ++ goto out; ++ break; ++ case RESERVE_MOVINGGC: ++ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) ++ goto out; ++ break; ++ default: ++ break; ++ } ++ ++ if (cl) ++ closure_wait(&c->freelist_wait, cl); ++ ++ if (!c->blocked_allocate) ++ c->blocked_allocate = local_clock(); ++ ++ spin_unlock(&c->freelist_lock); ++ ++ trace_bucket_alloc_fail(ca, reserve); ++ return ERR_PTR(-FREELIST_EMPTY); ++out: ++ verify_not_on_freelist(c, ca, bucket); ++ ++ ob = bch2_open_bucket_alloc(c); ++ ++ spin_lock(&ob->lock); ++ buckets = bucket_array(ca); ++ ++ ob->valid = true; ++ ob->sectors_free = ca->mi.bucket_size; ++ ob->alloc_reserve = reserve; ++ ob->ptr = (struct bch_extent_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_ptr, ++ .gen = buckets->b[bucket].mark.gen, ++ .offset = bucket_to_sector(ca, bucket), ++ .dev = ca->dev_idx, ++ }; ++ ++ bucket_io_clock_reset(c, ca, bucket, READ); ++ bucket_io_clock_reset(c, ca, bucket, WRITE); ++ spin_unlock(&ob->lock); ++ ++ if (c->blocked_allocate_open_bucket) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate_open_bucket], ++ c->blocked_allocate_open_bucket); ++ c->blocked_allocate_open_bucket = 0; ++ } ++ ++ if (c->blocked_allocate) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate], ++ c->blocked_allocate); ++ c->blocked_allocate = 0; ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ bch2_wake_allocator(ca); ++ ++ trace_bucket_alloc(ca, reserve); ++ return ob; ++} ++ ++static int __dev_stripe_cmp(struct dev_stripe_state *stripe, ++ unsigned l, unsigned r) ++{ ++ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - ++ (stripe->next_alloc[l] < stripe->next_alloc[r])); ++} ++ ++#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs) 
++{ ++ struct dev_alloc_list ret = { .nr = 0 }; ++ unsigned i; ++ ++ for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) ++ ret.devs[ret.nr++] = i; ++ ++ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); ++ return ret; ++} ++ ++void bch2_dev_stripe_increment(struct bch_dev *ca, ++ struct dev_stripe_state *stripe) ++{ ++ u64 *v = stripe->next_alloc + ca->dev_idx; ++ u64 free_space = dev_buckets_free(ca); ++ u64 free_space_inv = free_space ++ ? div64_u64(1ULL << 48, free_space) ++ : 1ULL << 48; ++ u64 scale = *v / 4; ++ ++ if (*v + free_space_inv >= *v) ++ *v += free_space_inv; ++ else ++ *v = U64_MAX; ++ ++ for (v = stripe->next_alloc; ++ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) ++ *v = *v < scale ? 0 : *v - scale; ++} ++ ++#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) ++#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) ++ ++static void add_new_bucket(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ struct open_bucket *ob) ++{ ++ unsigned durability = ++ bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; ++ ++ __clear_bit(ob->ptr.dev, devs_may_alloc->d); ++ *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) ++ ? durability : 1; ++ *have_cache |= !durability; ++ ++ ob_push(c, ptrs, ob); ++} ++ ++enum bucket_alloc_ret ++bch2_bucket_alloc_set(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct dev_alloc_list devs_sorted = ++ bch2_dev_alloc_list(c, stripe, devs_may_alloc); ++ struct bch_dev *ca; ++ enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; ++ unsigned i; ++ ++ BUG_ON(*nr_effective >= nr_replicas); ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ struct open_bucket *ob; ++ ++ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ if (!ca) ++ continue; ++ ++ if (!ca->mi.durability && *have_cache) ++ continue; ++ ++ ob = bch2_bucket_alloc(c, ca, reserve, ++ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ if (IS_ERR(ob)) { ++ ret = -PTR_ERR(ob); ++ ++ if (cl) ++ return ret; ++ continue; ++ } ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, flags, ob); ++ ++ bch2_dev_stripe_increment(ca, stripe); ++ ++ if (*nr_effective >= nr_replicas) ++ return ALLOC_SUCCESS; ++ } ++ ++ return ret; ++} ++ ++/* Allocate from stripes: */ ++ ++/* ++ * if we can't allocate a new stripe because there are already too many ++ * partially filled stripes, force allocating from an existing stripe even when ++ * it's to a device we don't want: ++ */ ++ ++static void bucket_alloc_from_stripe(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags) ++{ ++ struct dev_alloc_list devs_sorted; ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ struct bch_dev *ca; ++ unsigned i, ec_idx; ++ ++ if (!erasure_code) ++ return; ++ ++ if (nr_replicas < 2) ++ return; ++ ++ if (ec_open_bucket(c, ptrs)) ++ return; ++ ++ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); ++ if (!h) ++ return; ++ ++ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); ++ ++ for (i = 0; i < devs_sorted.nr; i++) ++ open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) ++ if 
(ob->ptr.dev == devs_sorted.devs[i] && ++ !test_and_set_bit(h->s->data_block_idx[ec_idx], ++ h->s->blocks_allocated)) ++ goto got_bucket; ++ goto out_put_head; ++got_bucket: ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ ob->ec_idx = h->s->data_block_idx[ec_idx]; ++ ob->ec = h->s; ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, flags, ob); ++ atomic_inc(&h->s->pin); ++out_put_head: ++ bch2_ec_stripe_head_put(c, h); ++} ++ ++/* Sector allocator */ ++ ++static void get_buckets_from_writepoint(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ bool need_ec) ++{ ++ struct open_buckets ptrs_skip = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ if (*nr_effective < nr_replicas && ++ test_bit(ob->ptr.dev, devs_may_alloc->d) && ++ (ca->mi.durability || ++ (wp->type == BCH_DATA_user && !*have_cache)) && ++ (ob->ec || !need_ec)) { ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, ++ flags, ob); ++ } else { ++ ob_push(c, &ptrs_skip, ob); ++ } ++ } ++ wp->ptrs = ptrs_skip; ++} ++ ++static enum bucket_alloc_ret ++open_bucket_add_buckets(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_list *devs_have, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *_cl) ++{ ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ struct closure *cl = NULL; ++ enum bucket_alloc_ret ret; ++ unsigned i; ++ ++ rcu_read_lock(); ++ devs = target_rw_devs(c, wp->type, target); ++ rcu_read_unlock(); ++ ++ /* Don't allocate from devices we already have pointers to: */ ++ for (i = 0; i < devs_have->nr; i++) ++ __clear_bit(devs_have->devs[i], devs.d); ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ ++ if (erasure_code) { ++ if (!ec_open_bucket(c, ptrs)) { ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, true); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ ++ if (!ec_open_bucket(c, ptrs)) { ++ bucket_alloc_from_stripe(c, ptrs, wp, &devs, ++ target, erasure_code, ++ nr_replicas, nr_effective, ++ have_cache, flags); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ } ++ ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, false); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ ++ percpu_down_read(&c->mark_lock); ++ rcu_read_lock(); ++ ++retry_blocking: ++ /* ++ * Try nonblocking first, so that if one device is full we'll try from ++ * other devices: ++ */ ++ ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, ++ nr_replicas, nr_effective, have_cache, ++ reserve, flags, cl); ++ if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { ++ cl = _cl; ++ goto retry_blocking; ++ } ++ ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, ++ struct open_buckets *obs) ++{ ++ struct open_buckets ptrs = { .nr = 0 }; ++ struct open_bucket *ob, *ob2; ++ unsigned i, j; ++ ++ open_bucket_for_each(c, obs, ob, i) { ++ bool drop = !ca || ob->ptr.dev == ca->dev_idx; ++ ++ if (!drop 
&& ob->ec) { ++ mutex_lock(&ob->ec->lock); ++ open_bucket_for_each(c, &ob->ec->blocks, ob2, j) ++ drop |= ob2->ptr.dev == ca->dev_idx; ++ open_bucket_for_each(c, &ob->ec->parity, ob2, j) ++ drop |= ob2->ptr.dev == ca->dev_idx; ++ mutex_unlock(&ob->ec->lock); ++ } ++ ++ if (drop) ++ bch2_open_bucket_put(c, ob); ++ else ++ ob_push(c, &ptrs, ob); ++ } ++ ++ *obs = ptrs; ++} ++ ++void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, ++ struct write_point *wp) ++{ ++ mutex_lock(&wp->lock); ++ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); ++ mutex_unlock(&wp->lock); ++} ++ ++static inline struct hlist_head *writepoint_hash(struct bch_fs *c, ++ unsigned long write_point) ++{ ++ unsigned hash = ++ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); ++ ++ return &c->write_points_hash[hash]; ++} ++ ++static struct write_point *__writepoint_find(struct hlist_head *head, ++ unsigned long write_point) ++{ ++ struct write_point *wp; ++ ++ hlist_for_each_entry_rcu(wp, head, node) ++ if (wp->write_point == write_point) ++ return wp; ++ ++ return NULL; ++} ++ ++static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) ++{ ++ u64 stranded = c->write_points_nr * c->bucket_size_max; ++ u64 free = bch2_fs_usage_read_short(c).free; ++ ++ return stranded * factor > free; ++} ++ ++static bool try_increase_writepoints(struct bch_fs *c) ++{ ++ struct write_point *wp; ++ ++ if (c->write_points_nr == ARRAY_SIZE(c->write_points) || ++ too_many_writepoints(c, 32)) ++ return false; ++ ++ wp = c->write_points + c->write_points_nr++; ++ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); ++ return true; ++} ++ ++static bool try_decrease_writepoints(struct bch_fs *c, ++ unsigned old_nr) ++{ ++ struct write_point *wp; ++ ++ mutex_lock(&c->write_points_hash_lock); ++ if (c->write_points_nr < old_nr) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return true; ++ } ++ ++ if (c->write_points_nr == 1 || ++ !too_many_writepoints(c, 8)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return false; ++ } ++ ++ wp = c->write_points + --c->write_points_nr; ++ ++ hlist_del_rcu(&wp->node); ++ mutex_unlock(&c->write_points_hash_lock); ++ ++ bch2_writepoint_stop(c, NULL, wp); ++ return true; ++} ++ ++static struct write_point *writepoint_find(struct bch_fs *c, ++ unsigned long write_point) ++{ ++ struct write_point *wp, *oldest; ++ struct hlist_head *head; ++ ++ if (!(write_point & 1UL)) { ++ wp = (struct write_point *) write_point; ++ mutex_lock(&wp->lock); ++ return wp; ++ } ++ ++ head = writepoint_hash(c, write_point); ++restart_find: ++ wp = __writepoint_find(head, write_point); ++ if (wp) { ++lock_wp: ++ mutex_lock(&wp->lock); ++ if (wp->write_point == write_point) ++ goto out; ++ mutex_unlock(&wp->lock); ++ goto restart_find; ++ } ++restart_find_oldest: ++ oldest = NULL; ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) ++ if (!oldest || time_before64(wp->last_used, oldest->last_used)) ++ oldest = wp; ++ ++ mutex_lock(&oldest->lock); ++ mutex_lock(&c->write_points_hash_lock); ++ if (oldest >= c->write_points + c->write_points_nr || ++ try_increase_writepoints(c)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto restart_find_oldest; ++ } ++ ++ wp = __writepoint_find(head, write_point); ++ if (wp && wp != oldest) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto lock_wp; ++ } ++ ++ wp = oldest; ++ hlist_del_rcu(&wp->node); ++ wp->write_point = write_point; ++ 
hlist_add_head_rcu(&wp->node, head); ++ mutex_unlock(&c->write_points_hash_lock); ++out: ++ wp->last_used = sched_clock(); ++ return wp; ++} ++ ++/* ++ * Get us an open_bucket we can allocate from, return with it locked: ++ */ ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct write_point *wp; ++ struct open_bucket *ob; ++ struct open_buckets ptrs; ++ unsigned nr_effective, write_points_nr; ++ unsigned ob_flags = 0; ++ bool have_cache; ++ enum bucket_alloc_ret ret; ++ int i; ++ ++ if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) ++ ob_flags |= BUCKET_ALLOC_USE_DURABILITY; ++ ++ BUG_ON(!nr_replicas || !nr_replicas_required); ++retry: ++ ptrs.nr = 0; ++ nr_effective = 0; ++ write_points_nr = c->write_points_nr; ++ have_cache = false; ++ ++ wp = writepoint_find(c, write_point.v); ++ ++ if (wp->type == BCH_DATA_user) ++ ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; ++ ++ /* metadata may not allocate on cache devices: */ ++ if (wp->type != BCH_DATA_user) ++ have_cache = true; ++ ++ if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } else { ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, NULL); ++ if (!ret) ++ goto alloc_done; ++ ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ 0, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } ++alloc_done: ++ BUG_ON(!ret && nr_effective < nr_replicas); ++ ++ if (erasure_code && !ec_open_bucket(c, &ptrs)) ++ pr_debug("failed to get ec bucket: ret %u", ret); ++ ++ if (ret == INSUFFICIENT_DEVICES && ++ nr_effective >= nr_replicas_required) ++ ret = 0; ++ ++ if (ret) ++ goto err; ++ ++ /* Free buckets we didn't use: */ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ open_bucket_free_unused(c, wp, ob); ++ ++ wp->ptrs = ptrs; ++ ++ wp->sectors_free = UINT_MAX; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ wp->sectors_free = min(wp->sectors_free, ob->sectors_free); ++ ++ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); ++ ++ verify_not_stale(c, &wp->ptrs); ++ ++ return wp; ++err: ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ++ ob_push(c, &ptrs, ob); ++ else ++ open_bucket_free_unused(c, wp, ob); ++ wp->ptrs = ptrs; ++ ++ mutex_unlock(&wp->lock); ++ ++ if (ret == FREELIST_EMPTY && ++ try_decrease_writepoints(c, write_points_nr)) ++ goto retry; ++ ++ switch (ret) { ++ case OPEN_BUCKETS_EMPTY: ++ case FREELIST_EMPTY: ++ return cl ? 
ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); ++ case INSUFFICIENT_DEVICES: ++ return ERR_PTR(-EROFS); ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, ++ struct bkey_i *k, unsigned sectors) ++ ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ BUG_ON(sectors > wp->sectors_free); ++ wp->sectors_free -= sectors; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_extent_ptr tmp = ob->ptr; ++ ++ tmp.cached = !ca->mi.durability && ++ wp->type == BCH_DATA_user; ++ ++ tmp.offset += ca->mi.bucket_size - ob->sectors_free; ++ bch2_bkey_append_ptr(k, tmp); ++ ++ BUG_ON(sectors > ob->sectors_free); ++ ob->sectors_free -= sectors; ++ } ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); ++ wp->ptrs = keep; ++ ++ mutex_unlock(&wp->lock); ++ ++ bch2_open_buckets_put(c, &ptrs); ++} ++ ++static inline void writepoint_init(struct write_point *wp, ++ enum bch_data_type type) ++{ ++ mutex_init(&wp->lock); ++ wp->type = type; ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ struct write_point *wp; ++ ++ mutex_init(&c->write_points_hash_lock); ++ c->write_points_nr = ARRAY_SIZE(c->write_points); ++ ++ /* open bucket 0 is a sentinal NULL: */ ++ spin_lock_init(&c->open_buckets[0].lock); ++ ++ for (ob = c->open_buckets + 1; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { ++ spin_lock_init(&ob->lock); ++ c->open_buckets_nr_free++; ++ ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ } ++ ++ writepoint_init(&c->btree_write_point, BCH_DATA_btree); ++ writepoint_init(&c->rebalance_write_point, BCH_DATA_user); ++ writepoint_init(&c->copygc_write_point, BCH_DATA_user); ++ ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) { ++ writepoint_init(wp, BCH_DATA_user); ++ ++ wp->last_used = sched_clock(); ++ wp->write_point = (unsigned long) wp; ++ hlist_add_head_rcu(&wp->node, ++ writepoint_hash(c, wp->write_point)); ++ } ++} +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +new file mode 100644 +index 000000000000..c658295cb8e0 +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.h +@@ -0,0 +1,138 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_FOREGROUND_H ++#define _BCACHEFS_ALLOC_FOREGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++ ++#include ++ ++struct bkey; ++struct bch_dev; ++struct bch_fs; ++struct bch_devs_List; ++ ++enum bucket_alloc_ret { ++ ALLOC_SUCCESS, ++ OPEN_BUCKETS_EMPTY, ++ FREELIST_EMPTY, /* Allocator thread not keeping up */ ++ INSUFFICIENT_DEVICES, ++}; ++ ++struct dev_alloc_list { ++ unsigned nr; ++ u8 devs[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, ++ struct dev_stripe_state *, ++ struct bch_devs_mask *); ++void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); ++ ++long bch2_bucket_alloc_new_fs(struct bch_dev *); ++ 
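As a rough sketch of how the sector-allocator entry points in this header are typically strung together (not code from this patch; wp, devs_have, cl, k and request_sectors are placeholder names, and error handling plus the surrounding transaction machinery are left out):

	wp = bch2_alloc_sectors_start(c, target, 0 /* no erasure coding */,
				      writepoint_hashed(v), &devs_have,
				      nr_replicas, nr_replicas,
				      RESERVE_NONE, flags, &cl);
	if (!IS_ERR(wp)) {
		sectors = min(wp->sectors_free, request_sectors);
		bch2_alloc_sectors_append_ptrs(c, wp, &k->k_i, sectors);
		bch2_alloc_sectors_done(c, wp);	/* unlocks wp, puts now-full open buckets */
	}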
++struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, ++ enum alloc_reserve, bool, ++ struct closure *); ++ ++static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, ++ struct open_bucket *ob) ++{ ++ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); ++ ++ obs->v[obs->nr++] = ob - c->open_buckets; ++} ++ ++#define open_bucket_for_each(_c, _obs, _ob, _i) \ ++ for ((_i) = 0; \ ++ (_i) < (_obs)->nr && \ ++ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ ++ (_i)++) ++ ++static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, ++ struct open_buckets *obs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->ec) ++ return ob; ++ ++ return NULL; ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *, ++ struct open_buckets *, unsigned); ++ ++void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); ++ ++static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ if (atomic_dec_and_test(&ob->pin)) ++ __bch2_open_bucket_put(c, ob); ++} ++ ++static inline void bch2_open_buckets_put(struct bch_fs *c, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ bch2_open_bucket_put(c, ob); ++ ptrs->nr = 0; ++} ++ ++static inline void bch2_open_bucket_get(struct bch_fs *c, ++ struct write_point *wp, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ ob->type = wp->type; ++ atomic_inc(&ob->pin); ++ ob_push(c, ptrs, ob); ++ } ++} ++ ++enum bucket_alloc_ret ++bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, ++ struct dev_stripe_state *, struct bch_devs_mask *, ++ unsigned, unsigned *, bool *, enum alloc_reserve, ++ unsigned, struct closure *); ++ ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *); ++ ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, ++ struct bkey_i *, unsigned); ++void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, ++ struct open_buckets *); ++ ++void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, ++ struct write_point *); ++ ++static inline struct write_point_specifier writepoint_hashed(unsigned long v) ++{ ++ return (struct write_point_specifier) { .v = v | 1 }; ++} ++ ++static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) ++{ ++ return (struct write_point_specifier) { .v = (unsigned long) wp }; ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +new file mode 100644 +index 000000000000..20705460bb0a +--- /dev/null ++++ b/fs/bcachefs/alloc_types.h +@@ -0,0 +1,113 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_TYPES_H ++#define _BCACHEFS_ALLOC_TYPES_H ++ ++#include ++#include ++ ++#include "clock_types.h" ++#include "fifo.h" ++ ++struct ec_bucket_buf; ++ ++/* There's two of these clocks, one for reads and one for writes: */ ++struct bucket_clock { ++ /* ++ * "now" in (read/write) IO time - incremented whenever we do X amount ++ * of reads or writes. 
++ * ++ * Goes with the bucket read/write prios: when we read or write to a ++ * bucket we reset the bucket's prio to the current hand; thus hand - ++ * prio = time since bucket was last read/written. ++ * ++ * The units are some amount (bytes/sectors) of data read/written, and ++ * the units can change on the fly if we need to rescale to fit ++ * everything in a u16 - your only guarantee is that the units are ++ * consistent. ++ */ ++ u16 hand; ++ u16 max_last_io; ++ ++ int rw; ++ ++ struct io_timer rescale; ++ struct mutex lock; ++}; ++ ++/* There is one reserve for each type of btree, one for prios and gens ++ * and one for moving GC */ ++enum alloc_reserve { ++ RESERVE_ALLOC = -1, ++ RESERVE_BTREE = 0, ++ RESERVE_MOVINGGC = 1, ++ RESERVE_NONE = 2, ++ RESERVE_NR = 3, ++}; ++ ++typedef FIFO(long) alloc_fifo; ++ ++#define OPEN_BUCKETS_COUNT 1024 ++ ++#define WRITE_POINT_HASH_NR 32 ++#define WRITE_POINT_MAX 32 ++ ++typedef u16 open_bucket_idx_t; ++ ++struct open_bucket { ++ spinlock_t lock; ++ atomic_t pin; ++ open_bucket_idx_t freelist; ++ ++ /* ++ * When an open bucket has an ec_stripe attached, this is the index of ++ * the block in the stripe this open_bucket corresponds to: ++ */ ++ u8 ec_idx; ++ u8 type; ++ unsigned valid:1; ++ unsigned on_partial_list:1; ++ int alloc_reserve:3; ++ unsigned sectors_free; ++ struct bch_extent_ptr ptr; ++ struct ec_stripe_new *ec; ++}; ++ ++#define OPEN_BUCKET_LIST_MAX 15 ++ ++struct open_buckets { ++ open_bucket_idx_t nr; ++ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; ++}; ++ ++struct dev_stripe_state { ++ u64 next_alloc[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct write_point { ++ struct hlist_node node; ++ struct mutex lock; ++ u64 last_used; ++ unsigned long write_point; ++ enum bch_data_type type; ++ bool is_ec; ++ ++ /* calculated based on how many pointers we're actually going to use: */ ++ unsigned sectors_free; ++ ++ struct open_buckets ptrs; ++ struct dev_stripe_state stripe; ++}; ++ ++struct write_point_specifier { ++ unsigned long v; ++}; ++ ++struct alloc_heap_entry { ++ size_t bucket; ++ size_t nr; ++ unsigned long key; ++}; ++ ++typedef HEAP(struct alloc_heap_entry) alloc_heap; ++ ++#endif /* _BCACHEFS_ALLOC_TYPES_H */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +new file mode 100644 +index 000000000000..3a5a00e53cbf +--- /dev/null ++++ b/fs/bcachefs/bcachefs.h +@@ -0,0 +1,883 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_H ++#define _BCACHEFS_H ++ ++/* ++ * SOME HIGH LEVEL CODE DOCUMENTATION: ++ * ++ * Bcache mostly works with cache sets, cache devices, and backing devices. ++ * ++ * Support for multiple cache devices hasn't quite been finished off yet, but ++ * it's about 95% plumbed through. A cache set and its cache devices is sort of ++ * like a md raid array and its component devices. Most of the code doesn't care ++ * about individual cache devices, the main abstraction is the cache set. ++ * ++ * Multiple cache devices is intended to give us the ability to mirror dirty ++ * cached data and metadata, without mirroring clean cached data. ++ * ++ * Backing devices are different, in that they have a lifetime independent of a ++ * cache set. When you register a newly formatted backing device it'll come up ++ * in passthrough mode, and then you can attach and detach a backing device from ++ * a cache set at runtime - while it's mounted and in use. Detaching implicitly ++ * invalidates any cached data for that backing device. ++ * ++ * A cache set can have multiple (many) backing devices attached to it. 
++ * ++ * There's also flash only volumes - this is the reason for the distinction ++ * between struct cached_dev and struct bcache_device. A flash only volume ++ * works much like a bcache device that has a backing device, except the ++ * "cached" data is always dirty. The end result is that we get thin ++ * provisioning with very little additional code. ++ * ++ * Flash only volumes work but they're not production ready because the moving ++ * garbage collector needs more work. More on that later. ++ * ++ * BUCKETS/ALLOCATION: ++ * ++ * Bcache is primarily designed for caching, which means that in normal ++ * operation all of our available space will be allocated. Thus, we need an ++ * efficient way of deleting things from the cache so we can write new things to ++ * it. ++ * ++ * To do this, we first divide the cache device up into buckets. A bucket is the ++ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ ++ * works efficiently. ++ * ++ * Each bucket has a 16 bit priority, and an 8 bit generation associated with ++ * it. The gens and priorities for all the buckets are stored contiguously and ++ * packed on disk (in a linked list of buckets - aside from the superblock, all ++ * of bcache's metadata is stored in buckets). ++ * ++ * The priority is used to implement an LRU. We reset a bucket's priority when ++ * we allocate it or on a cache hit, and every so often we decrement the priority ++ * of each bucket. It could be used to implement something more sophisticated, ++ * if anyone ever gets around to it. ++ * ++ * The generation is used for invalidating buckets. Each pointer also has an 8 ++ * bit generation embedded in it; for a pointer to be considered valid, its gen ++ * must match the gen of the bucket it points into. Thus, to reuse a bucket all ++ * we have to do is increment its gen (and write its new gen to disk; we batch ++ * this up). ++ * ++ * Bcache is entirely COW - we never write twice to a bucket, even buckets that ++ * contain metadata (including btree nodes). ++ * ++ * THE BTREE: ++ * ++ * Bcache is in large part designed around the btree. ++ * ++ * At a high level, the btree is just an index of key -> ptr tuples. ++ * ++ * Keys represent extents, and thus have a size field. Keys also have a variable ++ * number of pointers attached to them (potentially zero, which is handy for ++ * invalidating the cache). ++ * ++ * The key itself is an inode:offset pair. The inode number corresponds to a ++ * backing device or a flash only volume. The offset is the ending offset of the ++ * extent within the inode - not the starting offset; this makes lookups ++ * slightly more convenient. ++ * ++ * Pointers contain the cache device id, the offset on that device, and an 8 bit ++ * generation number. More on the gen later. ++ * ++ * Index lookups are not fully abstracted - cache lookups in particular are ++ * still somewhat mixed in with the btree code, but things are headed in that ++ * direction. ++ * ++ * Updates are fairly well abstracted, though. There are two different ways of ++ * updating the btree; insert and replace. ++ * ++ * BTREE_INSERT will just take a list of keys and insert them into the btree - ++ * overwriting (possibly only partially) any extents they overlap with. This is ++ * used to update the index after a write. ++ * ++ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is ++ * overwriting a key that matches another given key.
This is used for inserting ++ * data into the cache after a cache miss, and for background writeback, and for ++ * the moving garbage collector. ++ * ++ * There is no "delete" operation; deleting things from the index is ++ * accomplished either by invalidating pointers (by incrementing a bucket's ++ * gen) or by inserting a key with 0 pointers - which will overwrite anything ++ * previously present at that location in the index. ++ * ++ * This means that there are always stale/invalid keys in the btree. They're ++ * filtered out by the code that iterates through a btree node, and removed when ++ * a btree node is rewritten. ++ * ++ * BTREE NODES: ++ * ++ * Our unit of allocation is a bucket, and we can't arbitrarily allocate and ++ * free smaller than a bucket - so, that's how big our btree nodes are. ++ * ++ * (If buckets are really big we'll only use part of the bucket for a btree node ++ * - no less than 1/4th - but a bucket still contains no more than a single ++ * btree node. I'd actually like to change this, but for now we rely on the ++ * bucket's gen for deleting btree nodes when we rewrite/split a node.) ++ * ++ * Anyways, btree nodes are big - big enough to be inefficient with a textbook ++ * btree implementation. ++ * ++ * The way this is solved is that btree nodes are internally log structured; we ++ * can append new keys to an existing btree node without rewriting it. This ++ * means each set of keys we write is sorted, but the node is not. ++ * ++ * We maintain this log structure in memory - keeping 1Mb of keys sorted would ++ * be expensive, and we have to distinguish between the keys we have written and ++ * the keys we haven't. So to do a lookup in a btree node, we have to search ++ * each sorted set. But we do merge written sets together lazily, so the cost of ++ * these extra searches is quite low (normally most of the keys in a btree node ++ * will be in one big set, and then there'll be one or two sets that are much ++ * smaller). ++ * ++ * This log structure makes bcache's btree more of a hybrid between a ++ * conventional btree and a compacting data structure, with some of the ++ * advantages of both. ++ * ++ * GARBAGE COLLECTION: ++ * ++ * We can't just invalidate any bucket - it might contain dirty data or ++ * metadata. If it once contained dirty data, other writes might overwrite it ++ * later, leaving no valid pointers into that bucket in the index. ++ * ++ * Thus, the primary purpose of garbage collection is to find buckets to reuse. ++ * It also counts how much valid data each bucket currently contains, so that ++ * allocation can reuse buckets sooner when they've been mostly overwritten. ++ * ++ * It also does some things that are really internal to the btree ++ * implementation. If a btree node contains pointers that are stale by more than ++ * some threshold, it rewrites the btree node to avoid the bucket's generation ++ * wrapping around. It also merges adjacent btree nodes if they're empty enough. ++ * ++ * THE JOURNAL: ++ * ++ * Bcache's journal is not necessary for consistency; we always strictly ++ * order metadata writes so that the btree and everything else is consistent on ++ * disk in the event of an unclean shutdown, and in fact bcache had writeback ++ * caching (with recovery from unclean shutdown) before journalling was ++ * implemented.
++ * ++ * Rather, the journal is purely a performance optimization; we can't complete a ++ * write until we've updated the index on disk, otherwise the cache would be ++ * inconsistent in the event of an unclean shutdown. This means that without the ++ * journal, on random write workloads we constantly have to update all the leaf ++ * nodes in the btree, and those writes will be mostly empty (appending at most ++ * a few keys each) - highly inefficient in terms of amount of metadata writes, ++ * and it puts more strain on the various btree resorting/compacting code. ++ * ++ * The journal is just a log of keys we've inserted; on startup we just reinsert ++ * all the keys in the open journal entries. That means that when we're updating ++ * a node in the btree, we can wait until a 4k block of keys fills up before ++ * writing them out. ++ * ++ * For simplicity, we only journal updates to leaf nodes; updates to parent ++ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth ++ * the complexity to deal with journalling them (in particular, journal replay) ++ * - updates to non leaf nodes just happen synchronously (see btree_split()). ++ */ ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "bcachefs_format.h" ++#include "fifo.h" ++#include "opts.h" ++#include "util.h" ++ ++#define dynamic_fault(...) 0 ++#define race_fault(...) 0 ++ ++#define bch2_fs_init_fault(name) \ ++ dynamic_fault("bcachefs:bch_fs_init:" name) ++#define bch2_meta_read_fault(name) \ ++ dynamic_fault("bcachefs:meta:read:" name) ++#define bch2_meta_write_fault(name) \ ++ dynamic_fault("bcachefs:meta:write:" name) ++ ++#ifdef __KERNEL__ ++#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) ++#else ++#define bch2_fmt(_c, fmt) fmt "\n" ++#endif ++ ++#define bch_info(c, fmt, ...) \ ++ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_notice(c, fmt, ...) \ ++ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn(c, fmt, ...) \ ++ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err(c, fmt, ...) \ ++ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++ ++#define bch_verbose(c, fmt, ...) \ ++do { \ ++ if ((c)->opts.verbose) \ ++ bch_info(c, fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define pr_verbose_init(opts, fmt, ...) 
\ ++do { \ ++ if (opt_get(opts, verbose)) \ ++ pr_info(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++/* Parameters that are useful for debugging, but should always be compiled in: */ ++#define BCH_DEBUG_PARAMS_ALWAYS() \ ++ BCH_DEBUG_PARAM(key_merging_disabled, \ ++ "Disables merging of extents") \ ++ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ ++ "Causes mark and sweep to compact and rewrite every " \ ++ "btree node it traverses") \ ++ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ ++ "Disables rewriting of btree nodes during mark and sweep")\ ++ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ ++ "Disables the shrinker callback for the btree node cache") ++ ++/* Parameters that should only be compiled in in debug mode: */ ++#define BCH_DEBUG_PARAMS_DEBUG() \ ++ BCH_DEBUG_PARAM(expensive_debug_checks, \ ++ "Enables various runtime debugging checks that " \ ++ "significantly affect performance") \ ++ BCH_DEBUG_PARAM(debug_check_iterators, \ ++ "Enables extra verification for btree iterators") \ ++ BCH_DEBUG_PARAM(debug_check_bkeys, \ ++ "Run bkey_debugcheck (primarily checking GC/allocation "\ ++ "information) when iterating over keys") \ ++ BCH_DEBUG_PARAM(verify_btree_ondisk, \ ++ "Reread btree nodes at various points to verify the " \ ++ "mergesort in the read path against modifications " \ ++ "done in memory") \ ++ BCH_DEBUG_PARAM(journal_seq_verify, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(inject_invalid_keys, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(test_alloc_startup, \ ++ "Force allocator startup to use the slowpath where it" \ ++ "can't find enough free buckets without invalidating" \ ++ "cached data") \ ++ BCH_DEBUG_PARAM(force_reconstruct_read, \ ++ "Force reads to use the reconstruct path, when reading" \ ++ "from erasure coded extents") \ ++ BCH_DEBUG_PARAM(test_restart_gc, \ ++ "Test restarting mark and sweep gc when bucket gens change") ++ ++#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() ++#else ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() ++#endif ++ ++#define BCH_TIME_STATS() \ ++ x(btree_node_mem_alloc) \ ++ x(btree_node_split) \ ++ x(btree_node_sort) \ ++ x(btree_node_read) \ ++ x(btree_gc) \ ++ x(btree_lock_contended_read) \ ++ x(btree_lock_contended_intent) \ ++ x(btree_lock_contended_write) \ ++ x(data_write) \ ++ x(data_read) \ ++ x(data_promote) \ ++ x(journal_write) \ ++ x(journal_delay) \ ++ x(journal_flush_seq) \ ++ x(blocked_journal) \ ++ x(blocked_allocate) \ ++ x(blocked_allocate_open_bucket) ++ ++enum bch_time_stats { ++#define x(name) BCH_TIME_##name, ++ BCH_TIME_STATS() ++#undef x ++ BCH_TIME_STAT_NR ++}; ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "clock_types.h" ++#include "ec_types.h" ++#include "journal_types.h" ++#include "keylist_types.h" ++#include "quota_types.h" ++#include "rebalance_types.h" ++#include "replicas_types.h" ++#include "super_types.h" ++ ++/* Number of nodes btree coalesce will try to coalesce at once */ ++#define GC_MERGE_NODES 4U ++ ++/* Maximum number of nodes we might need to allocate atomically: */ ++#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) ++ 
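Aside (editorial, not part of the bcachefs patch): BCH_TIME_STATS() and the BCH_DEBUG_PARAMS* macros above use the x-macro idiom - the list is written once and expanded with different definitions of x() so the generated enum (and any tables built from the same list) can never drift out of sync. A minimal standalone sketch of that idiom follows; EXAMPLE_STATS, EXAMPLE_STAT_* and example_stat_names are invented names for this illustration.

/*
 * x-macro sketch: one list, two expansions (an enum and a name table).
 * Build: cc -std=c99 -Wall xmacro.c
 */
#include <stdio.h>

#define EXAMPLE_STATS()		\
	x(btree_node_read)	\
	x(data_write)		\
	x(journal_flush)

/* Expansion 1: an enum with one entry per list element */
enum example_stat {
#define x(name)	EXAMPLE_STAT_##name,
	EXAMPLE_STATS()
#undef x
	EXAMPLE_STAT_NR
};

/* Expansion 2: a parallel table of human-readable names */
static const char * const example_stat_names[] = {
#define x(name)	[EXAMPLE_STAT_##name] = #name,
	EXAMPLE_STATS()
#undef x
};

int main(void)
{
	int i;

	for (i = 0; i < EXAMPLE_STAT_NR; i++)
		printf("%d: %s\n", i, example_stat_names[i]);
	return 0;
}

Adding an entry to the list automatically grows both the enum and the name table, which is the same property the BCH_TIME_##name enum generation above relies on.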
++/* Size of the freelist we allocate btree nodes from: */ ++#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) ++ ++#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) ++ ++struct btree; ++ ++enum gc_phase { ++ GC_PHASE_NOT_RUNNING, ++ GC_PHASE_START, ++ GC_PHASE_SB, ++ ++ GC_PHASE_BTREE_EC, ++ GC_PHASE_BTREE_EXTENTS, ++ GC_PHASE_BTREE_INODES, ++ GC_PHASE_BTREE_DIRENTS, ++ GC_PHASE_BTREE_XATTRS, ++ GC_PHASE_BTREE_ALLOC, ++ GC_PHASE_BTREE_QUOTAS, ++ GC_PHASE_BTREE_REFLINK, ++ ++ GC_PHASE_PENDING_DELETE, ++ GC_PHASE_ALLOC, ++}; ++ ++struct gc_pos { ++ enum gc_phase phase; ++ struct bpos pos; ++ unsigned level; ++}; ++ ++struct io_count { ++ u64 sectors[2][BCH_DATA_NR]; ++}; ++ ++struct bch_dev { ++ struct kobject kobj; ++ struct percpu_ref ref; ++ struct completion ref_completion; ++ struct percpu_ref io_ref; ++ struct completion io_ref_completion; ++ ++ struct bch_fs *fs; ++ ++ u8 dev_idx; ++ /* ++ * Cached version of this device's member info from superblock ++ * Committed by bch2_write_super() -> bch_fs_mi_update() ++ */ ++ struct bch_member_cpu mi; ++ uuid_le uuid; ++ char name[BDEVNAME_SIZE]; ++ ++ struct bch_sb_handle disk_sb; ++ struct bch_sb *sb_read_scratch; ++ int sb_write_error; ++ ++ struct bch_devs_mask self; ++ ++ /* biosets used in cloned bios for writing multiple replicas */ ++ struct bio_set replica_set; ++ ++ /* ++ * Buckets: ++ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and ++ * gc_lock, for device resize - holding any is sufficient for access: ++ * Or rcu_read_lock(), but only for ptr_stale(): ++ */ ++ struct bucket_array __rcu *buckets[2]; ++ unsigned long *buckets_nouse; ++ struct rw_semaphore bucket_lock; ++ ++ struct bch_dev_usage __percpu *usage[2]; ++ ++ /* Allocator: */ ++ struct task_struct __rcu *alloc_thread; ++ ++ /* ++ * free: Buckets that are ready to be used ++ * ++ * free_inc: Incoming buckets - these are buckets that currently have ++ * cached data in them, and we can't reuse them until after we write ++ * their new gen to disk. 
After prio_write() finishes writing the new ++ * gens/prios, they'll be moved to the free list (and possibly discarded ++ * in the process) ++ */ ++ alloc_fifo free[RESERVE_NR]; ++ alloc_fifo free_inc; ++ ++ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_partial_nr; ++ ++ size_t fifo_last_bucket; ++ ++ /* last calculated minimum prio */ ++ u16 max_last_bucket_io[2]; ++ ++ size_t inc_gen_needs_gc; ++ size_t inc_gen_really_needs_gc; ++ ++ /* ++ * XXX: this should be an enum for allocator state, so as to include ++ * error state ++ */ ++ enum { ++ ALLOCATOR_STOPPED, ++ ALLOCATOR_RUNNING, ++ ALLOCATOR_BLOCKED, ++ ALLOCATOR_BLOCKED_FULL, ++ } allocator_state; ++ ++ alloc_heap alloc_heap; ++ ++ atomic64_t rebalance_work; ++ ++ struct journal_device journal; ++ ++ struct work_struct io_error_work; ++ ++ /* The rest of this all shows up in sysfs */ ++ atomic64_t cur_latency[2]; ++ struct time_stats io_latency[2]; ++ ++#define CONGESTED_MAX 1024 ++ atomic_t congested; ++ u64 congested_last; ++ ++ struct io_count __percpu *io_done; ++}; ++ ++enum { ++ /* startup: */ ++ BCH_FS_ALLOC_READ_DONE, ++ BCH_FS_ALLOC_CLEAN, ++ BCH_FS_ALLOCATOR_RUNNING, ++ BCH_FS_ALLOCATOR_STOPPING, ++ BCH_FS_INITIAL_GC_DONE, ++ BCH_FS_BTREE_INTERIOR_REPLAY_DONE, ++ BCH_FS_FSCK_DONE, ++ BCH_FS_STARTED, ++ BCH_FS_RW, ++ ++ /* shutdown: */ ++ BCH_FS_STOPPING, ++ BCH_FS_EMERGENCY_RO, ++ BCH_FS_WRITE_DISABLE_COMPLETE, ++ ++ /* errors: */ ++ BCH_FS_ERROR, ++ BCH_FS_ERRORS_FIXED, ++ ++ /* misc: */ ++ BCH_FS_BDEV_MOUNTED, ++ BCH_FS_FIXED_GENS, ++ BCH_FS_ALLOC_WRITTEN, ++ BCH_FS_REBUILD_REPLICAS, ++ BCH_FS_HOLD_BTREE_WRITES, ++}; ++ ++struct btree_debug { ++ unsigned id; ++ struct dentry *btree; ++ struct dentry *btree_format; ++ struct dentry *failed; ++}; ++ ++struct bch_fs_pcpu { ++ u64 sectors_available; ++}; ++ ++struct journal_seq_blacklist_table { ++ size_t nr; ++ struct journal_seq_blacklist_table_entry { ++ u64 start; ++ u64 end; ++ bool dirty; ++ } entries[0]; ++}; ++ ++struct journal_keys { ++ struct journal_key { ++ enum btree_id btree_id:8; ++ unsigned level:8; ++ struct bkey_i *k; ++ u32 journal_seq; ++ u32 journal_offset; ++ } *d; ++ size_t nr; ++ u64 journal_seq_base; ++}; ++ ++struct bch_fs { ++ struct closure cl; ++ ++ struct list_head list; ++ struct kobject kobj; ++ struct kobject internal; ++ struct kobject opts_dir; ++ struct kobject time_stats; ++ unsigned long flags; ++ ++ int minor; ++ struct device *chardev; ++ struct super_block *vfs_sb; ++ char name[40]; ++ ++ /* ro/rw, add/remove/resize devices: */ ++ struct rw_semaphore state_lock; ++ ++ /* Counts outstanding writes, for clean transition to read-only */ ++ struct percpu_ref writes; ++ struct work_struct read_only_work; ++ ++ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; ++ ++ struct bch_replicas_cpu replicas; ++ struct bch_replicas_cpu replicas_gc; ++ struct mutex replicas_gc_lock; ++ ++ struct journal_entry_res replicas_journal_res; ++ ++ struct bch_disk_groups_cpu __rcu *disk_groups; ++ ++ struct bch_opts opts; ++ ++ /* Updated by bch2_sb_update():*/ ++ struct { ++ uuid_le uuid; ++ uuid_le user_uuid; ++ ++ u16 version; ++ u16 encoded_extent_max; ++ ++ u8 nr_devices; ++ u8 clean; ++ ++ u8 encryption_type; ++ ++ u64 time_base_lo; ++ u32 time_base_hi; ++ u32 time_precision; ++ u64 features; ++ u64 compat; ++ } sb; ++ ++ struct bch_sb_handle disk_sb; ++ ++ unsigned short block_bits; /* ilog2(block_size) */ ++ ++ u16 btree_foreground_merge_threshold; ++ ++ struct closure sb_write; ++ struct mutex sb_lock; ++ 
++ /* BTREE CACHE */ ++ struct bio_set btree_bio; ++ ++ struct btree_root btree_roots[BTREE_ID_NR]; ++ struct mutex btree_root_lock; ++ ++ struct btree_cache btree_cache; ++ ++ /* ++ * Cache of allocated btree nodes - if we allocate a btree node and ++ * don't use it, if we free it that space can't be reused until going ++ * _all_ the way through the allocator (which exposes us to a livelock ++ * when allocating btree reserves fail halfway through) - instead, we ++ * can stick them here: ++ */ ++ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; ++ unsigned btree_reserve_cache_nr; ++ struct mutex btree_reserve_cache_lock; ++ ++ mempool_t btree_interior_update_pool; ++ struct list_head btree_interior_update_list; ++ struct list_head btree_interior_updates_unwritten; ++ struct mutex btree_interior_update_lock; ++ struct closure_waitlist btree_interior_update_wait; ++ ++ struct workqueue_struct *btree_interior_update_worker; ++ struct work_struct btree_interior_update_work; ++ ++ /* btree_iter.c: */ ++ struct mutex btree_trans_lock; ++ struct list_head btree_trans_list; ++ mempool_t btree_iters_pool; ++ ++ struct btree_key_cache btree_key_cache; ++ ++ struct workqueue_struct *wq; ++ /* copygc needs its own workqueue for index updates.. */ ++ struct workqueue_struct *copygc_wq; ++ struct workqueue_struct *journal_reclaim_wq; ++ ++ /* ALLOCATION */ ++ struct delayed_work pd_controllers_update; ++ unsigned pd_controllers_update_seconds; ++ ++ struct bch_devs_mask rw_devs[BCH_DATA_NR]; ++ ++ u64 capacity; /* sectors */ ++ ++ /* ++ * When capacity _decreases_ (due to a disk being removed), we ++ * increment capacity_gen - this invalidates outstanding reservations ++ * and forces them to be revalidated ++ */ ++ u32 capacity_gen; ++ unsigned bucket_size_max; ++ ++ atomic64_t sectors_available; ++ ++ struct bch_fs_pcpu __percpu *pcpu; ++ ++ struct percpu_rw_semaphore mark_lock; ++ ++ seqcount_t usage_lock; ++ struct bch_fs_usage *usage_base; ++ struct bch_fs_usage __percpu *usage[2]; ++ struct bch_fs_usage __percpu *usage_gc; ++ ++ /* single element mempool: */ ++ struct mutex usage_scratch_lock; ++ struct bch_fs_usage *usage_scratch; ++ ++ /* ++ * When we invalidate buckets, we use both the priority and the amount ++ * of good data to determine which buckets to reuse first - to weight ++ * those together consistently we keep track of the smallest nonzero ++ * priority of any bucket. ++ */ ++ struct bucket_clock bucket_clock[2]; ++ ++ struct io_clock io_clock[2]; ++ ++ /* JOURNAL SEQ BLACKLIST */ ++ struct journal_seq_blacklist_table * ++ journal_seq_blacklist_table; ++ struct work_struct journal_seq_blacklist_gc_work; ++ ++ /* ALLOCATOR */ ++ spinlock_t freelist_lock; ++ struct closure_waitlist freelist_wait; ++ u64 blocked_allocate; ++ u64 blocked_allocate_open_bucket; ++ open_bucket_idx_t open_buckets_freelist; ++ open_bucket_idx_t open_buckets_nr_free; ++ struct closure_waitlist open_buckets_wait; ++ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; ++ ++ struct write_point btree_write_point; ++ struct write_point rebalance_write_point; ++ ++ struct write_point write_points[WRITE_POINT_MAX]; ++ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; ++ struct mutex write_points_hash_lock; ++ unsigned write_points_nr; ++ ++ /* GARBAGE COLLECTION */ ++ struct task_struct *gc_thread; ++ atomic_t kick_gc; ++ unsigned long gc_count; ++ ++ /* ++ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] ++ * has been marked by GC. 
++ * ++ * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) ++ * ++ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread ++ * can read without a lock. ++ */ ++ seqcount_t gc_pos_lock; ++ struct gc_pos gc_pos; ++ ++ /* ++ * The allocation code needs gc_mark in struct bucket to be correct, but ++ * it's not while a gc is in progress. ++ */ ++ struct rw_semaphore gc_lock; ++ ++ /* IO PATH */ ++ struct semaphore io_in_flight; ++ struct bio_set bio_read; ++ struct bio_set bio_read_split; ++ struct bio_set bio_write; ++ struct mutex bio_bounce_pages_lock; ++ mempool_t bio_bounce_pages; ++ struct rhashtable promote_table; ++ ++ mempool_t compression_bounce[2]; ++ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; ++ mempool_t decompress_workspace; ++ ZSTD_parameters zstd_params; ++ ++ struct crypto_shash *sha256; ++ struct crypto_sync_skcipher *chacha20; ++ struct crypto_shash *poly1305; ++ ++ atomic64_t key_version; ++ ++ mempool_t large_bkey_pool; ++ ++ /* REBALANCE */ ++ struct bch_fs_rebalance rebalance; ++ ++ /* COPYGC */ ++ struct task_struct *copygc_thread; ++ copygc_heap copygc_heap; ++ struct bch_pd_controller copygc_pd; ++ struct write_point copygc_write_point; ++ u64 copygc_threshold; ++ ++ /* STRIPES: */ ++ GENRADIX(struct stripe) stripes[2]; ++ ++ ec_stripes_heap ec_stripes_heap; ++ spinlock_t ec_stripes_heap_lock; ++ ++ /* ERASURE CODING */ ++ struct list_head ec_stripe_head_list; ++ struct mutex ec_stripe_head_lock; ++ ++ struct list_head ec_stripe_new_list; ++ struct mutex ec_stripe_new_lock; ++ ++ struct work_struct ec_stripe_create_work; ++ u64 ec_stripe_hint; ++ ++ struct bio_set ec_bioset; ++ ++ struct work_struct ec_stripe_delete_work; ++ struct llist_head ec_stripe_delete_list; ++ ++ /* REFLINK */ ++ u64 reflink_hint; ++ ++ /* VFS IO PATH - fs-io.c */ ++ struct bio_set writepage_bioset; ++ struct bio_set dio_write_bioset; ++ struct bio_set dio_read_bioset; ++ ++ struct bio_list btree_write_error_list; ++ struct work_struct btree_write_error_work; ++ spinlock_t btree_write_error_lock; ++ ++ /* ERRORS */ ++ struct list_head fsck_errors; ++ struct mutex fsck_error_lock; ++ bool fsck_alloc_err; ++ ++ /* QUOTAS */ ++ struct bch_memquota_type quotas[QTYP_NR]; ++ ++ /* DEBUG JUNK */ ++ struct dentry *debug; ++ struct btree_debug btree_debug[BTREE_ID_NR]; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree *verify_data; ++ struct btree_node *verify_ondisk; ++ struct mutex verify_lock; ++#endif ++ ++ u64 unused_inode_hint; ++ ++ /* ++ * A btree node on disk could have too many bsets for an iterator to fit ++ * on the stack - have to dynamically allocate them ++ */ ++ mempool_t fill_iter; ++ ++ mempool_t btree_bounce_pool; ++ ++ struct journal journal; ++ struct list_head journal_entries; ++ struct journal_keys journal_keys; ++ ++ u64 last_bucket_seq_cleanup; ++ ++ /* The rest of this all shows up in sysfs */ ++ atomic_long_t read_realloc_races; ++ atomic_long_t extent_migrate_done; ++ atomic_long_t extent_migrate_raced; ++ ++ unsigned btree_gc_periodic:1; ++ unsigned copy_gc_enabled:1; ++ bool promote_whole_extents; ++ ++#define BCH_DEBUG_PARAM(name, description) bool name; ++ BCH_DEBUG_PARAMS_ALL() ++#undef BCH_DEBUG_PARAM ++ ++ struct time_stats times[BCH_TIME_STAT_NR]; ++}; ++ ++static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) ++{ ++#ifndef NO_BCACHEFS_FS ++ if (c->vfs_sb) ++ c->vfs_sb->s_bdi->ra_pages = ra_pages; ++#endif ++} ++ ++static inline unsigned bucket_bytes(const struct bch_dev *ca) ++{ ++ return ca->mi.bucket_size 
<< 9; ++} ++ ++static inline unsigned block_bytes(const struct bch_fs *c) ++{ ++ return c->opts.block_size << 9; ++} ++ ++static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) ++{ ++ return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); ++} ++ ++static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) ++{ ++ s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; ++ ++ if (c->sb.time_precision == 1) ++ return ns; ++ ++ return div_s64(ns, c->sb.time_precision); ++} ++ ++static inline s64 bch2_current_time(struct bch_fs *c) ++{ ++ struct timespec64 now; ++ ++ ktime_get_coarse_real_ts64(&now); ++ return timespec_to_bch2_time(c, now); ++} ++ ++static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) ++{ ++ return dev < c->sb.nr_devices && c->devs[dev]; ++} ++ ++#endif /* _BCACHEFS_H */ +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +new file mode 100644 +index 000000000000..d5a2230e403c +--- /dev/null ++++ b/fs/bcachefs/bcachefs_format.h +@@ -0,0 +1,1671 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FORMAT_H ++#define _BCACHEFS_FORMAT_H ++ ++/* ++ * bcachefs on disk data structures ++ * ++ * OVERVIEW: ++ * ++ * There are three main types of on disk data structures in bcachefs (this is ++ * reduced from 5 in bcache) ++ * ++ * - superblock ++ * - journal ++ * - btree ++ * ++ * The btree is the primary structure; most metadata exists as keys in the ++ * various btrees. There are only a small number of btrees, they're not ++ * sharded - we have one btree for extents, another for inodes, et cetera. ++ * ++ * SUPERBLOCK: ++ * ++ * The superblock contains the location of the journal, the list of devices in ++ * the filesystem, and in general any metadata we need in order to decide ++ * whether we can start a filesystem or prior to reading the journal/btree ++ * roots. ++ * ++ * The superblock is extensible, and most of the contents of the superblock are ++ * in variable length, type tagged fields; see struct bch_sb_field. ++ * ++ * Backup superblocks do not reside in a fixed location; also, superblocks do ++ * not have a fixed size. To locate backup superblocks we have struct ++ * bch_sb_layout; we store a copy of this inside every superblock, and also ++ * before the first superblock. ++ * ++ * JOURNAL: ++ * ++ * The journal primarily records btree updates in the order they occurred; ++ * journal replay consists of just iterating over all the keys in the open ++ * journal entries and re-inserting them into the btrees. ++ * ++ * The journal also contains entry types for the btree roots, and blacklisted ++ * journal sequence numbers (see journal_seq_blacklist.c). ++ * ++ * BTREE: ++ * ++ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically ++ * 128k-256k) and log structured. We use struct btree_node for writing the first ++ * entry in a given node (offset 0), and struct btree_node_entry for all ++ * subsequent writes. ++ * ++ * After the header, btree node entries contain a list of keys in sorted order. ++ * Values are stored inline with the keys; since values are variable length (and ++ * keys effectively are variable length too, due to packing) we can't do random ++ * access without building up additional in memory tables in the btree node read ++ * path. 
++ * ++ * BTREE KEYS (struct bkey): ++ * ++ * The various btrees share a common format for the key - so as to avoid ++ * switching in fastpath lookup/comparison code - but define their own ++ * structures for the key values. ++ * ++ * The size of a key/value pair is stored as a u8 in units of u64s, so the max ++ * size is just under 2k. The common part also contains a type tag for the ++ * value, and a format field indicating whether the key is packed or not (and ++ * also meant to allow adding new key fields in the future, if desired). ++ * ++ * bkeys, when stored within a btree node, may also be packed. In that case, the ++ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can ++ * be generous with field sizes in the common part of the key format (64 bit ++ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define LE_BITMASK(_bits, name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (__le##_bits##_to_cpu(k->field) >> offset) & \ ++ ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ __u##_bits new = __le##_bits##_to_cpu(k->field); \ ++ \ ++ new &= ~(~(~0ULL << (end - offset)) << offset); \ ++ new |= (v & ~(~0ULL << (end - offset))) << offset; \ ++ k->field = __cpu_to_le##_bits(new); \ ++} ++ ++#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) ++#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) ++#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) ++ ++struct bkey_format { ++ __u8 key_u64s; ++ __u8 nr_fields; ++ /* One unused slot for now: */ ++ __u8 bits_per_field[6]; ++ __le64 field_offset[6]; ++}; ++ ++/* Btree keys - all units are in sectors */ ++ ++struct bpos { ++ /* ++ * Word order matches machine byte order - btree code treats a bpos as a ++ * single large integer, for search/comparison purposes ++ * ++ * Note that wherever a bpos is embedded in another on disk data ++ * structure, it has to be byte swabbed when reading in metadata that ++ * wasn't written in native endian order: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u32 snapshot; ++ __u64 offset; ++ __u64 inode; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u64 inode; ++ __u64 offset; /* Points to end of extent - sectors */ ++ __u32 snapshot; ++#else ++#error edit for your odd byteorder. 
++#endif ++} __attribute__((packed, aligned(4))); ++ ++#define KEY_INODE_MAX ((__u64)~0ULL) ++#define KEY_OFFSET_MAX ((__u64)~0ULL) ++#define KEY_SNAPSHOT_MAX ((__u32)~0U) ++#define KEY_SIZE_MAX ((__u32)~0U) ++ ++static inline struct bpos POS(__u64 inode, __u64 offset) ++{ ++ struct bpos ret; ++ ++ ret.inode = inode; ++ ret.offset = offset; ++ ret.snapshot = 0; ++ ++ return ret; ++} ++ ++#define POS_MIN POS(0, 0) ++#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) ++ ++/* Empty placeholder struct, for container_of() */ ++struct bch_val { ++ __u64 __nothing[0]; ++}; ++ ++struct bversion { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u64 lo; ++ __u32 hi; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u32 hi; ++ __u64 lo; ++#endif ++} __attribute__((packed, aligned(4))); ++ ++struct bkey { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#else ++#error edit for your odd byteorder. ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u8 pad[1]; ++ ++ struct bversion version; ++ __u32 size; /* extent size, in sectors */ ++ struct bpos p; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ struct bpos p; ++ __u32 size; /* extent size, in sectors */ ++ struct bversion version; ++ ++ __u8 pad[1]; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bkey_packed { ++ __u64 _data[0]; ++ ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++ ++ /* ++ * XXX: next incompat on disk format change, switch format and ++ * needs_whiteout - bkey_packed() will be cheaper if format is the high ++ * bits of the bitfield ++ */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ __u8 key_start[0]; ++ ++ /* ++ * We copy bkeys with struct assignment in various places, and while ++ * that shouldn't be done with packed bkeys we can't disallow it in C, ++ * and it's legal to cast a bkey to a bkey_packed - so padding it out ++ * to the same size as struct bkey should hopefully be safest. 
++ */ ++ __u8 pad[sizeof(struct bkey) - 3]; ++} __attribute__((packed, aligned(8))); ++ ++#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) ++#define BKEY_U64s_MAX U8_MAX ++#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) ++ ++#define KEY_PACKED_BITS_START 24 ++ ++#define KEY_FORMAT_LOCAL_BTREE 0 ++#define KEY_FORMAT_CURRENT 1 ++ ++enum bch_bkey_fields { ++ BKEY_FIELD_INODE, ++ BKEY_FIELD_OFFSET, ++ BKEY_FIELD_SNAPSHOT, ++ BKEY_FIELD_SIZE, ++ BKEY_FIELD_VERSION_HI, ++ BKEY_FIELD_VERSION_LO, ++ BKEY_NR_FIELDS, ++}; ++ ++#define bkey_format_field(name, field) \ ++ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) ++ ++#define BKEY_FORMAT_CURRENT \ ++((struct bkey_format) { \ ++ .key_u64s = BKEY_U64s, \ ++ .nr_fields = BKEY_NR_FIELDS, \ ++ .bits_per_field = { \ ++ bkey_format_field(INODE, p.inode), \ ++ bkey_format_field(OFFSET, p.offset), \ ++ bkey_format_field(SNAPSHOT, p.snapshot), \ ++ bkey_format_field(SIZE, size), \ ++ bkey_format_field(VERSION_HI, version.hi), \ ++ bkey_format_field(VERSION_LO, version.lo), \ ++ }, \ ++}) ++ ++/* bkey with inline value */ ++struct bkey_i { ++ __u64 _data[0]; ++ ++ union { ++ struct { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ }; ++ struct { ++ struct bkey k; ++ struct bch_val v; ++ }; ++ }; ++}; ++ ++#define KEY(_inode, _offset, _size) \ ++((struct bkey) { \ ++ .u64s = BKEY_U64s, \ ++ .format = KEY_FORMAT_CURRENT, \ ++ .p = POS(_inode, _offset), \ ++ .size = _size, \ ++}) ++ ++static inline void bkey_init(struct bkey *k) ++{ ++ *k = KEY(0, 0, 0); ++} ++ ++#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) ++ ++#define __BKEY_PADDED(key, pad) \ ++ struct { struct bkey_i key; __u64 key ## _pad[pad]; } ++ ++/* ++ * - DELETED keys are used internally to mark keys that should be ignored but ++ * override keys in composition order. Their version number is ignored. ++ * ++ * - DISCARDED keys indicate that the data is all 0s because it has been ++ * discarded. DISCARDs may have a version; if the version is nonzero the key ++ * will be persistent, otherwise the key will be dropped whenever the btree ++ * node is rewritten (like DELETED keys). ++ * ++ * - ERROR: any read of the data returns a read error, as the data was lost due ++ * to a failing device. Like DISCARDED keys, they can be removed (overridden) ++ * by new writes or cluster-wide GC. Node repair can also overwrite them with ++ * the same or a more recent version number, but not with an older version ++ * number. ++ * ++ * - WHITEOUT: for hash table btrees ++*/ ++#define BCH_BKEY_TYPES() \ ++ x(deleted, 0) \ ++ x(discard, 1) \ ++ x(error, 2) \ ++ x(cookie, 3) \ ++ x(whiteout, 4) \ ++ x(btree_ptr, 5) \ ++ x(extent, 6) \ ++ x(reservation, 7) \ ++ x(inode, 8) \ ++ x(inode_generation, 9) \ ++ x(dirent, 10) \ ++ x(xattr, 11) \ ++ x(alloc, 12) \ ++ x(quota, 13) \ ++ x(stripe, 14) \ ++ x(reflink_p, 15) \ ++ x(reflink_v, 16) \ ++ x(inline_data, 17) \ ++ x(btree_ptr_v2, 18) ++ ++enum bch_bkey_type { ++#define x(name, nr) KEY_TYPE_##name = nr, ++ BCH_BKEY_TYPES() ++#undef x ++ KEY_TYPE_MAX, ++}; ++ ++struct bch_cookie { ++ struct bch_val v; ++ __le64 cookie; ++}; ++ ++/* Extents */ ++ ++/* ++ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally ++ * preceded by checksum/compression information (bch_extent_crc32 or ++ * bch_extent_crc64). 
++ * ++ * One major determining factor in the format of extents is how we handle and ++ * represent extents that have been partially overwritten and thus trimmed: ++ * ++ * If an extent is not checksummed or compressed, when the extent is trimmed we ++ * don't have to remember the extent we originally allocated and wrote: we can ++ * merely adjust ptr->offset to point to the start of the data that is currently ++ * live. The size field in struct bkey records the current (live) size of the ++ * extent, and is also used to mean "size of region on disk that we point to" in ++ * this case. ++ * ++ * Thus an extent that is not checksummed or compressed will consist only of a ++ * list of bch_extent_ptrs, with none of the fields in ++ * bch_extent_crc32/bch_extent_crc64. ++ * ++ * When an extent is checksummed or compressed, it's not possible to read only ++ * the data that is currently live: we have to read the entire extent that was ++ * originally written, and then return only the part of the extent that is ++ * currently live. ++ * ++ * Thus, in addition to the current size of the extent in struct bkey, we need ++ * to store the size of the originally allocated space - this is the ++ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, ++ * when the extent is trimmed, instead of modifying the offset field of the ++ * pointer, we keep a second smaller offset field - "offset into the original ++ * extent of the currently live region". ++ * ++ * The other major determining factor is replication and data migration: ++ * ++ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated ++ * write, we will initially write all the replicas in the same format, with the ++ * same checksum type and compression format - however, when copygc runs later (or ++ * tiering/cache promotion, anything that moves data), it is not in general ++ * going to rewrite all the pointers at once - one of the replicas may be in a ++ * bucket on one device that has very little fragmentation while another lives ++ * in a bucket that has become heavily fragmented, and thus is being rewritten ++ * sooner than the rest. ++ * ++ * Thus it will only move a subset of the pointers (or in the case of ++ * tiering/cache promotion perhaps add a single pointer without dropping any ++ * current pointers), and if the extent has been partially overwritten it must ++ * write only the currently live portion (or copygc would not be able to reduce ++ * fragmentation!) - which necessitates a different bch_extent_crc format for ++ * the new pointer. ++ * ++ * But in the interests of space efficiency, we don't want to store one ++ * bch_extent_crc for each pointer if we don't have to. ++ * ++ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and ++ * bch_extent_ptrs appended arbitrarily one after the other. We determine the ++ * type of a given entry with a scheme similar to utf8 (except we're encoding a ++ * type, not a size), encoding the type in the position of the first set bit: ++ * ++ * bch_extent_crc32 - 0b1 ++ * bch_extent_ptr - 0b10 ++ * bch_extent_crc64 - 0b100 ++ * ++ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and ++ * bch_extent_crc64 is the least constrained). ++ * ++ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, ++ * until the next bch_extent_crc32/64. ++ * ++ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer ++ * is neither checksummed nor compressed. 
++ */ ++ ++/* 128 bits, sufficient for cryptographic MACs: */ ++struct bch_csum { ++ __le64 lo; ++ __le64 hi; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_EXTENT_ENTRY_TYPES() \ ++ x(ptr, 0) \ ++ x(crc32, 1) \ ++ x(crc64, 2) \ ++ x(crc128, 3) \ ++ x(stripe_ptr, 4) ++#define BCH_EXTENT_ENTRY_MAX 5 ++ ++enum bch_extent_entry_type { ++#define x(f, n) BCH_EXTENT_ENTRY_##f = n, ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++/* Compressed/uncompressed size are stored biased by 1: */ ++struct bch_extent_crc32 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u32 type:2, ++ _compressed_size:7, ++ _uncompressed_size:7, ++ offset:7, ++ _unused:1, ++ csum_type:4, ++ compression_type:4; ++ __u32 csum; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u32 csum; ++ __u32 compression_type:4, ++ csum_type:4, ++ _unused:1, ++ offset:7, ++ _uncompressed_size:7, ++ _compressed_size:7, ++ type:2; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++#define CRC32_SIZE_MAX (1U << 7) ++#define CRC32_NONCE_MAX 0 ++ ++struct bch_extent_crc64 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:3, ++ _compressed_size:9, ++ _uncompressed_size:9, ++ offset:9, ++ nonce:10, ++ csum_type:4, ++ compression_type:4, ++ csum_hi:16; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 csum_hi:16, ++ compression_type:4, ++ csum_type:4, ++ nonce:10, ++ offset:9, ++ _uncompressed_size:9, ++ _compressed_size:9, ++ type:3; ++#endif ++ __u64 csum_lo; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC64_SIZE_MAX (1U << 9) ++#define CRC64_NONCE_MAX ((1U << 10) - 1) ++ ++struct bch_extent_crc128 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:4, ++ _compressed_size:13, ++ _uncompressed_size:13, ++ offset:13, ++ nonce:13, ++ csum_type:4, ++ compression_type:4; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 compression_type:4, ++ csum_type:4, ++ nonce:13, ++ offset:13, ++ _uncompressed_size:13, ++ _compressed_size:13, ++ type:4; ++#endif ++ struct bch_csum csum; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC128_SIZE_MAX (1U << 13) ++#define CRC128_NONCE_MAX ((1U << 13) - 1) ++ ++/* ++ * @reservation - pointer hasn't been written to, just reserved ++ */ ++struct bch_extent_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:1, ++ cached:1, ++ unused:1, ++ reservation:1, ++ offset:44, /* 8 petabytes */ ++ dev:8, ++ gen:8; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 gen:8, ++ dev:8, ++ offset:44, ++ reservation:1, ++ unused:1, ++ cached:1, ++ type:1; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bch_extent_stripe_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:5, ++ block:8, ++ idx:51; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 idx:51, ++ block:8, ++ type:5; ++#endif ++}; ++ ++struct bch_extent_reservation { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:6, ++ unused:22, ++ replicas:4, ++ generation:32; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 generation:32, ++ replicas:4, ++ unused:22, ++ type:6; ++#endif ++}; ++ ++union bch_extent_entry { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 ++ unsigned long type; ++#elif __BITS_PER_LONG == 32 ++ struct { ++ unsigned long pad; ++ unsigned long type; ++ }; ++#else ++#error edit for your odd byteorder. 
++#endif ++ ++#define x(f, n) struct bch_extent_##f f; ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++struct bch_btree_ptr { ++ struct bch_val v; ++ ++ struct bch_extent_ptr start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_btree_ptr_v2 { ++ struct bch_val v; ++ ++ __u64 mem_ptr; ++ __le64 seq; ++ __le16 sectors_written; ++ /* In case we ever decide to do variable size btree nodes: */ ++ __le16 sectors; ++ struct bpos min_key; ++ struct bch_extent_ptr start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_extent { ++ struct bch_val v; ++ ++ union bch_extent_entry start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_reservation { ++ struct bch_val v; ++ ++ __le32 generation; ++ __u8 nr_replicas; ++ __u8 pad[3]; ++} __attribute__((packed, aligned(8))); ++ ++/* Maximum size (in u64s) a single pointer could be: */ ++#define BKEY_EXTENT_PTR_U64s_MAX\ ++ ((sizeof(struct bch_extent_crc128) + \ ++ sizeof(struct bch_extent_ptr)) / sizeof(u64)) ++ ++/* Maximum possible size of an entire extent value: */ ++#define BKEY_EXTENT_VAL_U64s_MAX \ ++ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) ++ ++#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) ++ ++/* * Maximum possible size of an entire extent, key + value: */ ++#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) ++ ++/* Btree pointers don't carry around checksums: */ ++#define BKEY_BTREE_PTR_VAL_U64s_MAX \ ++ ((sizeof(struct bch_btree_ptr_v2) + \ ++ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) ++#define BKEY_BTREE_PTR_U64s_MAX \ ++ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) ++ ++/* Inodes */ ++ ++#define BLOCKDEV_INODE_MAX 4096 ++ ++#define BCACHEFS_ROOT_INO 4096 ++ ++struct bch_inode { ++ struct bch_val v; ++ ++ __le64 bi_hash_seed; ++ __le32 bi_flags; ++ __le16 bi_mode; ++ __u8 fields[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_inode_generation { ++ struct bch_val v; ++ ++ __le32 bi_generation; ++ __le32 pad; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_INODE_FIELDS() \ ++ x(bi_atime, 64) \ ++ x(bi_ctime, 64) \ ++ x(bi_mtime, 64) \ ++ x(bi_otime, 64) \ ++ x(bi_size, 64) \ ++ x(bi_sectors, 64) \ ++ x(bi_uid, 32) \ ++ x(bi_gid, 32) \ ++ x(bi_nlink, 32) \ ++ x(bi_generation, 32) \ ++ x(bi_dev, 32) \ ++ x(bi_data_checksum, 8) \ ++ x(bi_compression, 8) \ ++ x(bi_project, 32) \ ++ x(bi_background_compression, 8) \ ++ x(bi_data_replicas, 8) \ ++ x(bi_promote_target, 16) \ ++ x(bi_foreground_target, 16) \ ++ x(bi_background_target, 16) \ ++ x(bi_erasure_code, 16) \ ++ x(bi_fields_set, 16) ++ ++/* subset of BCH_INODE_FIELDS */ ++#define BCH_INODE_OPTS() \ ++ x(data_checksum, 8) \ ++ x(compression, 8) \ ++ x(project, 32) \ ++ x(background_compression, 8) \ ++ x(data_replicas, 8) \ ++ x(promote_target, 16) \ ++ x(foreground_target, 16) \ ++ x(background_target, 16) \ ++ x(erasure_code, 16) ++ ++enum inode_opt_id { ++#define x(name, ...) 
\ ++ Inode_opt_##name, ++ BCH_INODE_OPTS() ++#undef x ++ Inode_opt_nr, ++}; ++ ++enum { ++ /* ++ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL ++ * flags) ++ */ ++ __BCH_INODE_SYNC = 0, ++ __BCH_INODE_IMMUTABLE = 1, ++ __BCH_INODE_APPEND = 2, ++ __BCH_INODE_NODUMP = 3, ++ __BCH_INODE_NOATIME = 4, ++ ++ __BCH_INODE_I_SIZE_DIRTY= 5, ++ __BCH_INODE_I_SECTORS_DIRTY= 6, ++ __BCH_INODE_UNLINKED = 7, ++ ++ /* bits 20+ reserved for packed fields below: */ ++}; ++ ++#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) ++#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) ++#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) ++#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) ++#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) ++#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) ++#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) ++#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) ++ ++LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); ++LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); ++ ++/* Dirents */ ++ ++/* ++ * Dirents (and xattrs) have to implement string lookups; since our b-tree ++ * doesn't support arbitrary length strings for the key, we instead index by a ++ * 64 bit hash (currently truncated sha1) of the string, stored in the offset ++ * field of the key - using linear probing to resolve hash collisions. This also ++ * provides us with the readdir cookie posix requires. ++ * ++ * Linear probing requires us to use whiteouts for deletions, in the event of a ++ * collision: ++ */ ++ ++struct bch_dirent { ++ struct bch_val v; ++ ++ /* Target inode number: */ ++ __le64 d_inum; ++ ++ /* ++ * Copy of mode bits 12-15 from the target inode - so userspace can get ++ * the filetype without having to do a stat() ++ */ ++ __u8 d_type; ++ ++ __u8 d_name[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ ++ sizeof(struct bkey) - \ ++ offsetof(struct bch_dirent, d_name)) ++ ++ ++/* Xattrs */ ++ ++#define KEY_TYPE_XATTR_INDEX_USER 0 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 ++#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 ++#define KEY_TYPE_XATTR_INDEX_SECURITY 4 ++ ++struct bch_xattr { ++ struct bch_val v; ++ __u8 x_type; ++ __u8 x_name_len; ++ __le16 x_val_len; ++ __u8 x_name[]; ++} __attribute__((packed, aligned(8))); ++ ++/* Bucket/allocation information: */ ++ ++struct bch_alloc { ++ struct bch_val v; ++ __u8 fields; ++ __u8 gen; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_FIELDS() \ ++ x(read_time, 16) \ ++ x(write_time, 16) \ ++ x(data_type, 8) \ ++ x(dirty_sectors, 16) \ ++ x(cached_sectors, 16) \ ++ x(oldest_gen, 8) ++ ++enum { ++#define x(name, bytes) BCH_ALLOC_FIELD_##name, ++ BCH_ALLOC_FIELDS() ++#undef x ++ BCH_ALLOC_FIELD_NR ++}; ++ ++static const unsigned BCH_ALLOC_FIELD_BYTES[] = { ++#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, ++ BCH_ALLOC_FIELDS() ++#undef x ++}; ++ ++#define x(name, bits) + (bits / 8) ++static const unsigned BKEY_ALLOC_VAL_U64s_MAX = ++ DIV_ROUND_UP(offsetof(struct bch_alloc, data) ++ BCH_ALLOC_FIELDS(), sizeof(u64)); ++#undef x ++ ++#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) ++ ++/* Quotas: */ ++ ++enum quota_types { ++ QTYP_USR = 0, ++ QTYP_GRP = 1, ++ QTYP_PRJ = 2, ++ QTYP_NR = 3, ++}; ++ ++enum quota_counters { ++ Q_SPC = 0, ++ Q_INO = 1, ++ Q_COUNTERS = 2, ++}; ++ ++struct bch_quota_counter { ++ __le64 
hardlimit; ++ __le64 softlimit; ++}; ++ ++struct bch_quota { ++ struct bch_val v; ++ struct bch_quota_counter c[Q_COUNTERS]; ++} __attribute__((packed, aligned(8))); ++ ++/* Erasure coding */ ++ ++struct bch_stripe { ++ struct bch_val v; ++ __le16 sectors; ++ __u8 algorithm; ++ __u8 nr_blocks; ++ __u8 nr_redundant; ++ ++ __u8 csum_granularity_bits; ++ __u8 csum_type; ++ __u8 pad; ++ ++ struct bch_extent_ptr ptrs[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* Reflink: */ ++ ++struct bch_reflink_p { ++ struct bch_val v; ++ __le64 idx; ++ ++ __le32 reservation_generation; ++ __u8 nr_replicas; ++ __u8 pad[3]; ++}; ++ ++struct bch_reflink_v { ++ struct bch_val v; ++ __le64 refcount; ++ union bch_extent_entry start[0]; ++ __u64 _data[0]; ++}; ++ ++/* Inline data */ ++ ++struct bch_inline_data { ++ struct bch_val v; ++ u8 data[0]; ++}; ++ ++/* Optional/variable size superblock sections: */ ++ ++struct bch_sb_field { ++ __u64 _data[0]; ++ __le32 u64s; ++ __le32 type; ++}; ++ ++#define BCH_SB_FIELDS() \ ++ x(journal, 0) \ ++ x(members, 1) \ ++ x(crypt, 2) \ ++ x(replicas_v0, 3) \ ++ x(quota, 4) \ ++ x(disk_groups, 5) \ ++ x(clean, 6) \ ++ x(replicas, 7) \ ++ x(journal_seq_blacklist, 8) ++ ++enum bch_sb_field_type { ++#define x(f, nr) BCH_SB_FIELD_##f = nr, ++ BCH_SB_FIELDS() ++#undef x ++ BCH_SB_FIELD_NR ++}; ++ ++/* BCH_SB_FIELD_journal: */ ++ ++struct bch_sb_field_journal { ++ struct bch_sb_field field; ++ __le64 buckets[0]; ++}; ++ ++/* BCH_SB_FIELD_members: */ ++ ++#define BCH_MIN_NR_NBUCKETS (1 << 6) ++ ++struct bch_member { ++ uuid_le uuid; ++ __le64 nbuckets; /* device size */ ++ __le16 first_bucket; /* index of first bucket used */ ++ __le16 bucket_size; /* sectors */ ++ __le32 pad; ++ __le64 last_mount; /* time_t */ ++ ++ __le64 flags[2]; ++}; ++ ++LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) ++/* 4-10 unused, was TIER, HAS_(META)DATA */ ++LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) ++LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) ++LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) ++LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) ++LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) ++ ++#define BCH_TIER_MAX 4U ++ ++#if 0 ++LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); ++LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); ++#endif ++ ++enum bch_member_state { ++ BCH_MEMBER_STATE_RW = 0, ++ BCH_MEMBER_STATE_RO = 1, ++ BCH_MEMBER_STATE_FAILED = 2, ++ BCH_MEMBER_STATE_SPARE = 3, ++ BCH_MEMBER_STATE_NR = 4, ++}; ++ ++enum cache_replacement { ++ CACHE_REPLACEMENT_LRU = 0, ++ CACHE_REPLACEMENT_FIFO = 1, ++ CACHE_REPLACEMENT_RANDOM = 2, ++ CACHE_REPLACEMENT_NR = 3, ++}; ++ ++struct bch_sb_field_members { ++ struct bch_sb_field field; ++ struct bch_member members[0]; ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++struct nonce { ++ __le32 d[4]; ++}; ++ ++struct bch_key { ++ __le64 key[4]; ++}; ++ ++#define BCH_KEY_MAGIC \ ++ (((u64) 'b' << 0)|((u64) 'c' << 8)| \ ++ ((u64) 'h' << 16)|((u64) '*' << 24)| \ ++ ((u64) '*' << 32)|((u64) 'k' << 40)| \ ++ ((u64) 'e' << 48)|((u64) 'y' << 56)) ++ ++struct bch_encrypted_key { ++ __le64 magic; ++ struct bch_key key; ++}; ++ ++/* ++ * If this field is present in the superblock, it stores an encryption key which ++ * is used encrypt all other data/metadata. 
The key will normally be encrypted ++ * with the key userspace provides, but if encryption has been turned off we'll ++ * just store the master key unencrypted in the superblock so we can access the ++ * previously encrypted data. ++ */ ++struct bch_sb_field_crypt { ++ struct bch_sb_field field; ++ ++ __le64 flags; ++ __le64 kdf_flags; ++ struct bch_encrypted_key key; ++}; ++ ++LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); ++ ++enum bch_kdf_types { ++ BCH_KDF_SCRYPT = 0, ++ BCH_KDF_NR = 1, ++}; ++ ++/* stored as base 2 log of scrypt params: */ ++LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); ++LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); ++LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); ++ ++/* BCH_SB_FIELD_replicas: */ ++ ++#define BCH_DATA_TYPES() \ ++ x(none, 0) \ ++ x(sb, 1) \ ++ x(journal, 2) \ ++ x(btree, 3) \ ++ x(user, 4) \ ++ x(cached, 5) ++ ++enum bch_data_type { ++#define x(t, n) BCH_DATA_##t, ++ BCH_DATA_TYPES() ++#undef x ++ BCH_DATA_NR ++}; ++ ++struct bch_replicas_entry_v0 { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 devs[0]; ++} __attribute__((packed)); ++ ++struct bch_sb_field_replicas_v0 { ++ struct bch_sb_field field; ++ struct bch_replicas_entry_v0 entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_entry { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 nr_required; ++ __u8 devs[0]; ++} __attribute__((packed)); ++ ++#define replicas_entry_bytes(_i) \ ++ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) ++ ++struct bch_sb_field_replicas { ++ struct bch_sb_field field; ++ struct bch_replicas_entry entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_quota: */ ++ ++struct bch_sb_quota_counter { ++ __le32 timelimit; ++ __le32 warnlimit; ++}; ++ ++struct bch_sb_quota_type { ++ __le64 flags; ++ struct bch_sb_quota_counter c[Q_COUNTERS]; ++}; ++ ++struct bch_sb_field_quota { ++ struct bch_sb_field field; ++ struct bch_sb_quota_type q[QTYP_NR]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_disk_groups: */ ++ ++#define BCH_SB_LABEL_SIZE 32 ++ ++struct bch_disk_group { ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 flags[2]; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) ++LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) ++LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) ++ ++struct bch_sb_field_disk_groups { ++ struct bch_sb_field field; ++ struct bch_disk_group entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* ++ * On clean shutdown, store btree roots and current journal sequence number in ++ * the superblock: ++ */ ++struct jset_entry { ++ __le16 u64s; ++ __u8 btree_id; ++ __u8 level; ++ __u8 type; /* designates what this jset holds */ ++ __u8 pad[3]; ++ ++ union { ++ struct bkey_i start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct bch_sb_field_clean { ++ struct bch_sb_field field; ++ ++ __le32 flags; ++ __le16 read_clock; ++ __le16 write_clock; ++ __le64 journal_seq; ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct journal_seq_blacklist_entry { ++ __le64 start; ++ __le64 end; ++}; ++ ++struct bch_sb_field_journal_seq_blacklist { ++ struct bch_sb_field field; ++ ++ union { ++ struct journal_seq_blacklist_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++/* Superblock: */ ++ ++/* ++ * New versioning scheme: ++ * One common version number 
for all on disk data structures - superblock, btree ++ * nodes, journal entries ++ */ ++#define BCH_JSET_VERSION_OLD 2 ++#define BCH_BSET_VERSION_OLD 3 ++ ++enum bcachefs_metadata_version { ++ bcachefs_metadata_version_min = 9, ++ bcachefs_metadata_version_new_versioning = 10, ++ bcachefs_metadata_version_bkey_renumber = 10, ++ bcachefs_metadata_version_inode_btree_change = 11, ++ bcachefs_metadata_version_max = 12, ++}; ++ ++#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) ++ ++#define BCH_SB_SECTOR 8 ++#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ ++ ++struct bch_sb_layout { ++ uuid_le magic; /* bcachefs superblock UUID */ ++ __u8 layout_type; ++ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ ++ __u8 nr_superblocks; ++ __u8 pad[5]; ++ __le64 sb_offset[61]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_SB_LAYOUT_SECTOR 7 ++ ++/* ++ * @offset - sector where this sb was written ++ * @version - on disk format version ++ * @version_min - Oldest metadata version this filesystem contains; so we can ++ * safely drop compatibility code and refuse to mount filesystems ++ * we'd need it for ++ * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) ++ * @seq - incremented each time superblock is written ++ * @uuid - used for generating various magic numbers and identifying ++ * member devices, never changes ++ * @user_uuid - user visible UUID, may be changed ++ * @label - filesystem label ++ * @seq - identifies most recent superblock, incremented each time ++ * superblock is written ++ * @features - enabled incompatible features ++ */ ++struct bch_sb { ++ struct bch_csum csum; ++ __le16 version; ++ __le16 version_min; ++ __le16 pad[2]; ++ uuid_le magic; ++ uuid_le uuid; ++ uuid_le user_uuid; ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 offset; ++ __le64 seq; ++ ++ __le16 block_size; ++ __u8 dev_idx; ++ __u8 nr_devices; ++ __le32 u64s; ++ ++ __le64 time_base_lo; ++ __le32 time_base_hi; ++ __le32 time_precision; ++ ++ __le64 flags[8]; ++ __le64 features[2]; ++ __le64 compat[2]; ++ ++ struct bch_sb_layout layout; ++ ++ union { ++ struct bch_sb_field start[0]; ++ __le64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++/* ++ * Flags: ++ * BCH_SB_INITALIZED - set on first mount ++ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect ++ * behaviour of mount/recovery path: ++ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits ++ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 ++ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides ++ * DATA/META_CSUM_TYPE. 
Also indicates encryption ++ * algorithm in use, if/when we get more than one ++ */ ++ ++LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); ++ ++LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); ++LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); ++LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); ++LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); ++ ++LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); ++ ++LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); ++LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); ++ ++LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); ++LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); ++ ++LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); ++LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); ++LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); ++LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); ++ ++LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); ++ ++LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); ++ ++/* 61-64 unused */ ++ ++LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); ++LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); ++LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); ++ ++LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); ++LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); ++ ++/* ++ * Max size of an extent that may require bouncing to read or write ++ * (checksummed, compressed): 64k ++ */ ++LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, ++ struct bch_sb, flags[1], 14, 20); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); ++ ++LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); ++LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); ++LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); ++ ++LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, ++ struct bch_sb, flags[2], 0, 4); ++LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); ++ ++LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); ++ ++/* ++ * Features: ++ * ++ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist ++ * reflink: gates KEY_TYPE_reflink ++ * inline_data: gates KEY_TYPE_inline_data ++ * new_siphash: gates BCH_STR_HASH_SIPHASH ++ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE ++ */ ++#define BCH_SB_FEATURES() \ ++ x(lz4, 0) \ ++ x(gzip, 1) \ ++ x(zstd, 2) \ ++ x(atomic_nlink, 3) \ ++ x(ec, 4) \ ++ x(journal_seq_blacklist_v3, 5) \ ++ x(reflink, 6) \ ++ x(new_siphash, 7) \ ++ x(inline_data, 8) \ ++ x(new_extent_overwrite, 9) \ ++ x(incompressible, 10) \ ++ x(btree_ptr_v2, 11) \ ++ x(extents_above_btree_updates, 12) \ ++ x(btree_updates_journalled, 13) ++ ++#define BCH_SB_FEATURES_ALL \ ++ ((1ULL << BCH_FEATURE_new_siphash)| \ ++ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ ++ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ ++ (1ULL << BCH_FEATURE_extents_above_btree_updates)) ++ ++enum bch_sb_feature { ++#define x(f, n) BCH_FEATURE_##f, ++ BCH_SB_FEATURES() ++#undef x ++ BCH_FEATURE_NR, ++}; 
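For reference, the feature-bits machinery above (BCH_SB_FEATURES(), enum bch_sb_feature, and the __le64 features[2] words in struct bch_sb) follows a plain x-macro pattern: one list macro is expanded several times to generate the enum of bit positions, any lookup tables, and masks such as BCH_SB_FEATURES_ALL, so they can never drift out of sync. The following standalone C sketch is illustrative only and is not part of the patch; it re-declares a miniature feature list (EXAMPLE_FEATURES, feature_set, etc. are made-up names) rather than using the bcachefs headers, and it uses a host-order uint64_t where the on-disk superblock would use __le64 words.

#include <stdint.h>
#include <stdio.h>

/* Miniature stand-in for BCH_SB_FEATURES(): name + bit position */
#define EXAMPLE_FEATURES()		\
	x(lz4,		0)		\
	x(gzip,		1)		\
	x(zstd,		2)		\
	x(atomic_nlink,	3)

/* Expansion 1: enum of bit positions, like enum bch_sb_feature */
enum example_feature {
#define x(f, n) EXAMPLE_FEATURE_##f = n,
	EXAMPLE_FEATURES()
#undef x
	EXAMPLE_FEATURE_NR
};

/* Expansion 2: a name table, handy when printing a features word */
static const char * const example_feature_names[] = {
#define x(f, n) [n] = #f,
	EXAMPLE_FEATURES()
#undef x
};

/* Test one feature bit in a features word (host order here;
 * the real superblock fields are little-endian __le64) */
static int feature_set(uint64_t features, enum example_feature f)
{
	return (features >> f) & 1;
}

int main(void)
{
	uint64_t features = (1ULL << EXAMPLE_FEATURE_lz4) |
			    (1ULL << EXAMPLE_FEATURE_zstd);
	int i;

	for (i = 0; i < EXAMPLE_FEATURE_NR; i++)
		printf("%-14s %s\n", example_feature_names[i],
		       feature_set(features, i) ? "set" : "unset");
	return 0;
}

The same single-list trick is what lets BCH_SB_FEATURES_ALL be written directly in terms of the generated BCH_FEATURE_* constants, and it is reused throughout this header (BCH_SB_FIELDS(), BCH_ALLOC_FIELDS(), BCH_JSET_ENTRY_TYPES(), BCH_BTREE_IDS()) for the same reason.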
++ ++enum bch_sb_compat { ++ BCH_COMPAT_FEAT_ALLOC_INFO = 0, ++ BCH_COMPAT_FEAT_ALLOC_METADATA = 1, ++}; ++ ++/* options: */ ++ ++#define BCH_REPLICAS_MAX 4U ++ ++enum bch_error_actions { ++ BCH_ON_ERROR_CONTINUE = 0, ++ BCH_ON_ERROR_RO = 1, ++ BCH_ON_ERROR_PANIC = 2, ++ BCH_NR_ERROR_ACTIONS = 3, ++}; ++ ++enum bch_str_hash_type { ++ BCH_STR_HASH_CRC32C = 0, ++ BCH_STR_HASH_CRC64 = 1, ++ BCH_STR_HASH_SIPHASH_OLD = 2, ++ BCH_STR_HASH_SIPHASH = 3, ++ BCH_STR_HASH_NR = 4, ++}; ++ ++enum bch_str_hash_opts { ++ BCH_STR_HASH_OPT_CRC32C = 0, ++ BCH_STR_HASH_OPT_CRC64 = 1, ++ BCH_STR_HASH_OPT_SIPHASH = 2, ++ BCH_STR_HASH_OPT_NR = 3, ++}; ++ ++enum bch_csum_type { ++ BCH_CSUM_NONE = 0, ++ BCH_CSUM_CRC32C_NONZERO = 1, ++ BCH_CSUM_CRC64_NONZERO = 2, ++ BCH_CSUM_CHACHA20_POLY1305_80 = 3, ++ BCH_CSUM_CHACHA20_POLY1305_128 = 4, ++ BCH_CSUM_CRC32C = 5, ++ BCH_CSUM_CRC64 = 6, ++ BCH_CSUM_NR = 7, ++}; ++ ++static const unsigned bch_crc_bytes[] = { ++ [BCH_CSUM_NONE] = 0, ++ [BCH_CSUM_CRC32C_NONZERO] = 4, ++ [BCH_CSUM_CRC32C] = 4, ++ [BCH_CSUM_CRC64_NONZERO] = 8, ++ [BCH_CSUM_CRC64] = 8, ++ [BCH_CSUM_CHACHA20_POLY1305_80] = 10, ++ [BCH_CSUM_CHACHA20_POLY1305_128] = 16, ++}; ++ ++static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) ++{ ++ switch (type) { ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++enum bch_csum_opts { ++ BCH_CSUM_OPT_NONE = 0, ++ BCH_CSUM_OPT_CRC32C = 1, ++ BCH_CSUM_OPT_CRC64 = 2, ++ BCH_CSUM_OPT_NR = 3, ++}; ++ ++#define BCH_COMPRESSION_TYPES() \ ++ x(none, 0) \ ++ x(lz4_old, 1) \ ++ x(gzip, 2) \ ++ x(lz4, 3) \ ++ x(zstd, 4) \ ++ x(incompressible, 5) ++ ++enum bch_compression_type { ++#define x(t, n) BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_TYPES() ++#undef x ++ BCH_COMPRESSION_TYPE_NR ++}; ++ ++#define BCH_COMPRESSION_OPTS() \ ++ x(none, 0) \ ++ x(lz4, 1) \ ++ x(gzip, 2) \ ++ x(zstd, 3) ++ ++enum bch_compression_opts { ++#define x(t, n) BCH_COMPRESSION_OPT_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++ BCH_COMPRESSION_OPT_NR ++}; ++ ++/* ++ * Magic numbers ++ * ++ * The various other data structures have their own magic numbers, which are ++ * xored with the first part of the cache set's UUID ++ */ ++ ++#define BCACHE_MAGIC \ ++ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ ++ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) ++ ++#define BCACHEFS_STATFS_MAGIC 0xca451a4e ++ ++#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) ++#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) ++ ++static inline __le64 __bch2_sb_magic(struct bch_sb *sb) ++{ ++ __le64 ret; ++ memcpy(&ret, &sb->uuid, sizeof(ret)); ++ return ret; ++} ++ ++static inline __u64 __jset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); ++} ++ ++static inline __u64 __bset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); ++} ++ ++/* Journal */ ++ ++#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) ++ ++#define BCH_JSET_ENTRY_TYPES() \ ++ x(btree_keys, 0) \ ++ x(btree_root, 1) \ ++ x(prio_ptrs, 2) \ ++ x(blacklist, 3) \ ++ x(blacklist_v2, 4) \ ++ x(usage, 5) \ ++ x(data_usage, 6) ++ ++enum { ++#define x(f, nr) BCH_JSET_ENTRY_##f = nr, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++ BCH_JSET_ENTRY_NR ++}; ++ ++/* ++ * Journal sequence numbers can be blacklisted: bsets record the max sequence ++ * number of all the journal entries they contain updates for, so that on ++ * recovery we can ignore those bsets that contain index updates 
newer that what ++ * made it into the journal. ++ * ++ * This means that we can't reuse that journal_seq - we have to skip it, and ++ * then record that we skipped it so that the next time we crash and recover we ++ * don't think there was a missing journal entry. ++ */ ++struct jset_entry_blacklist { ++ struct jset_entry entry; ++ __le64 seq; ++}; ++ ++struct jset_entry_blacklist_v2 { ++ struct jset_entry entry; ++ __le64 start; ++ __le64 end; ++}; ++ ++enum { ++ FS_USAGE_RESERVED = 0, ++ FS_USAGE_INODES = 1, ++ FS_USAGE_KEY_VERSION = 2, ++ FS_USAGE_NR = 3 ++}; ++ ++struct jset_entry_usage { ++ struct jset_entry entry; ++ __le64 v; ++} __attribute__((packed)); ++ ++struct jset_entry_data_usage { ++ struct jset_entry entry; ++ __le64 v; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++/* ++ * On disk format for a journal entry: ++ * seq is monotonically increasing; every journal entry has its own unique ++ * sequence number. ++ * ++ * last_seq is the oldest journal entry that still has keys the btree hasn't ++ * flushed to disk yet. ++ * ++ * version is for on disk format changes. ++ */ ++struct jset { ++ struct bch_csum csum; ++ ++ __le64 magic; ++ __le64 seq; ++ __le32 version; ++ __le32 flags; ++ ++ __le32 u64s; /* size of d[] in u64s */ ++ ++ __u8 encrypted_start[0]; ++ ++ __le16 read_clock; ++ __le16 write_clock; ++ ++ /* Sequence number of oldest dirty journal entry */ ++ __le64 last_seq; ++ ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); ++LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); ++ ++#define BCH_JOURNAL_BUCKETS_MIN 8 ++ ++/* Btree: */ ++ ++#define BCH_BTREE_IDS() \ ++ x(EXTENTS, 0, "extents") \ ++ x(INODES, 1, "inodes") \ ++ x(DIRENTS, 2, "dirents") \ ++ x(XATTRS, 3, "xattrs") \ ++ x(ALLOC, 4, "alloc") \ ++ x(QUOTAS, 5, "quotas") \ ++ x(EC, 6, "stripes") \ ++ x(REFLINK, 7, "reflink") ++ ++enum btree_id { ++#define x(kwd, val, name) BTREE_ID_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BTREE_ID_NR ++}; ++ ++#define BTREE_MAX_DEPTH 4U ++ ++/* Btree nodes */ ++ ++/* ++ * Btree nodes ++ * ++ * On disk a btree node is a list/log of these; within each set the keys are ++ * sorted ++ */ ++struct bset { ++ __le64 seq; ++ ++ /* ++ * Highest journal entry this bset contains keys for. ++ * If on recovery we don't see that journal entry, this bset is ignored: ++ * this allows us to preserve the order of all index updates after a ++ * crash, since the journal records a total order of all index updates ++ * and anything that didn't make it to the journal doesn't get used. 
++ */ ++ __le64 journal_seq; ++ ++ __le32 flags; ++ __le16 version; ++ __le16 u64s; /* count of d[] in u64s */ ++ ++ union { ++ struct bkey_packed start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); ++ ++LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); ++LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, ++ struct bset, flags, 5, 6); ++ ++struct btree_node { ++ struct bch_csum csum; ++ __le64 magic; ++ ++ /* this flags field is encrypted, unlike bset->flags: */ ++ __le64 flags; ++ ++ /* Closed interval: */ ++ struct bpos min_key; ++ struct bpos max_key; ++ struct bch_extent_ptr ptr; ++ struct bkey_format format; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); ++LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); ++LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, ++ struct btree_node, flags, 8, 9); ++/* 9-32 unused */ ++LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); ++ ++struct btree_node_entry { ++ struct bch_csum csum; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++#endif /* _BCACHEFS_FORMAT_H */ +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +new file mode 100644 +index 000000000000..d71157a3e073 +--- /dev/null ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -0,0 +1,332 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IOCTL_H ++#define _BCACHEFS_IOCTL_H ++ ++#include ++#include ++#include "bcachefs_format.h" ++ ++/* ++ * Flags common to multiple ioctls: ++ */ ++#define BCH_FORCE_IF_DATA_LOST (1 << 0) ++#define BCH_FORCE_IF_METADATA_LOST (1 << 1) ++#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) ++#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) ++ ++#define BCH_FORCE_IF_DEGRADED \ ++ (BCH_FORCE_IF_DATA_DEGRADED| \ ++ BCH_FORCE_IF_METADATA_DEGRADED) ++ ++/* ++ * If cleared, ioctl that refer to a device pass it as a pointer to a pathname ++ * (e.g. 
/dev/sda1); if set, the dev field is the device's index within the ++ * filesystem: ++ */ ++#define BCH_BY_INDEX (1 << 4) ++ ++/* ++ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem ++ * wide superblock: ++ */ ++#define BCH_READ_DEV (1 << 5) ++ ++/* global control dev: */ ++ ++/* These are currently broken, and probably unnecessary: */ ++#if 0 ++#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) ++#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) ++ ++struct bch_ioctl_assemble { ++ __u32 flags; ++ __u32 nr_devs; ++ __u64 pad; ++ __u64 devs[]; ++}; ++ ++struct bch_ioctl_incremental { ++ __u32 flags; ++ __u64 pad; ++ __u64 dev; ++}; ++#endif ++ ++/* filesystem ioctls: */ ++ ++#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) ++ ++/* These only make sense when we also have incremental assembly */ ++#if 0 ++#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) ++#define BCH_IOCTL_STOP _IO(0xbc, 3) ++#endif ++ ++#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) ++#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) ++#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) ++#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) ++#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) ++#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) ++#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) ++ ++/* ioctl below act on a particular file, not the filesystem as a whole: */ ++ ++#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) ++ ++/* ++ * BCH_IOCTL_QUERY_UUID: get filesystem UUID ++ * ++ * Returns user visible UUID, not internal UUID (which may not ever be changed); ++ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with ++ * this UUID. ++ */ ++struct bch_ioctl_query_uuid { ++ uuid_le uuid; ++}; ++ ++#if 0 ++struct bch_ioctl_start { ++ __u32 flags; ++ __u32 pad; ++}; ++#endif ++ ++/* ++ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem ++ * ++ * The specified device must not be open or in use. On success, the new device ++ * will be an online member of the filesystem just like any other member. ++ * ++ * The device must first be prepared by userspace by formatting with a bcachefs ++ * superblock, which is only used for passing in superblock options/parameters ++ * for that device (in struct bch_member). The new device's superblock should ++ * not claim to be a member of any existing filesystem - UUIDs on it will be ++ * ignored. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem ++ * ++ * Any data present on @dev will be permanently deleted, and @dev will be ++ * removed from its slot in the filesystem's list of member devices. The device ++ * may be either offline or offline. ++ * ++ * Will fail removing @dev would leave us with insufficient read write devices ++ * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are ++ * set. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem ++ * but is not open (e.g. 
because we started in degraded mode), bring it online ++ * ++ * all existing data on @dev will be available once the device is online, ++ * exactly as if @dev was present when the filesystem was first mounted ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that ++ * block device, without removing it from the filesystem (so it can be brought ++ * back online later) ++ * ++ * Data present on @dev will be unavailable while @dev is offline (unless ++ * replicated), but will still be intact and untouched if @dev is brought back ++ * online ++ * ++ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would ++ * leave us with insufficient read write devices or degraded/unavailable data, ++ * unless the approprate BCH_FORCE_IF_* flags are set. ++ */ ++ ++struct bch_ioctl_disk { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem ++ * ++ * @new_state - one of the bch_member_state states (rw, ro, failed, ++ * spare) ++ * ++ * Will refuse to change member state if we would then have insufficient devices ++ * to write to, or if it would result in degraded data (when @new_state is ++ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. ++ */ ++struct bch_ioctl_disk_set_state { ++ __u32 flags; ++ __u8 new_state; ++ __u8 pad[3]; ++ __u64 dev; ++}; ++ ++enum bch_data_ops { ++ BCH_DATA_OP_SCRUB = 0, ++ BCH_DATA_OP_REREPLICATE = 1, ++ BCH_DATA_OP_MIGRATE = 2, ++ BCH_DATA_OP_NR = 3, ++}; ++ ++/* ++ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. ++ * scrub, rereplicate, migrate). ++ * ++ * This ioctl kicks off a job in the background, and returns a file descriptor. ++ * Reading from the file descriptor returns a struct bch_ioctl_data_event, ++ * indicating current progress, and closing the file descriptor will stop the ++ * job. The file descriptor is O_CLOEXEC. ++ */ ++struct bch_ioctl_data { ++ __u32 op; ++ __u32 flags; ++ ++ struct bpos start; ++ struct bpos end; ++ ++ union { ++ struct { ++ __u32 dev; ++ __u32 pad; ++ } migrate; ++ struct { ++ __u64 pad[8]; ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++enum bch_data_event { ++ BCH_DATA_EVENT_PROGRESS = 0, ++ /* XXX: add an event for reporting errors */ ++ BCH_DATA_EVENT_NR = 1, ++}; ++ ++struct bch_ioctl_data_progress { ++ __u8 data_type; ++ __u8 btree_id; ++ __u8 pad[2]; ++ struct bpos pos; ++ ++ __u64 sectors_done; ++ __u64 sectors_total; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_ioctl_data_event { ++ __u8 type; ++ __u8 pad[7]; ++ union { ++ struct bch_ioctl_data_progress p; ++ __u64 pad2[15]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_usage { ++ __u64 sectors; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++static inline struct bch_replicas_usage * ++replicas_usage_next(struct bch_replicas_usage *u) ++{ ++ return (void *) u + replicas_entry_bytes(&u->r) + 8; ++} ++ ++/* ++ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage ++ * ++ * Returns disk space usage broken out by data type, number of replicas, and ++ * by component device ++ * ++ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries ++ * ++ * On success, @replica_entries_bytes will be changed to indicate the number of ++ * bytes actually used. 
++ * ++ * Returns -ERANGE if @replica_entries_bytes was too small ++ */ ++struct bch_ioctl_fs_usage { ++ __u64 capacity; ++ __u64 used; ++ __u64 online_reserved; ++ __u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ ++ __u32 replica_entries_bytes; ++ __u32 pad; ++ ++ struct bch_replicas_usage replicas[0]; ++}; ++ ++/* ++ * BCH_IOCTL_DEV_USAGE: query device disk space usage ++ * ++ * Returns disk space usage broken out by data type - both by buckets and ++ * sectors. ++ */ ++struct bch_ioctl_dev_usage { ++ __u64 dev; ++ __u32 flags; ++ __u8 state; ++ __u8 pad[7]; ++ ++ __u32 bucket_size; ++ __u64 nr_buckets; ++ __u64 available_buckets; ++ ++ __u64 buckets[BCH_DATA_NR]; ++ __u64 sectors[BCH_DATA_NR]; ++ ++ __u64 ec_buckets; ++ __u64 ec_sectors; ++}; ++ ++/* ++ * BCH_IOCTL_READ_SUPER: read filesystem superblock ++ * ++ * Equivalent to reading the superblock directly from the block device, except ++ * avoids racing with the kernel writing the superblock or having to figure out ++ * which block device to read ++ * ++ * @sb - buffer to read into ++ * @size - size of userspace allocated buffer ++ * @dev - device to read superblock for, if BCH_READ_DEV flag is ++ * specified ++ * ++ * Returns -ERANGE if buffer provided is too small ++ */ ++struct bch_ioctl_read_super { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 size; ++ __u64 sb; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to ++ * determine if disk is a (online) member - if so, returns device's index ++ * ++ * Returns -ENOENT if not found ++ */ ++struct bch_ioctl_disk_get_idx { ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device ++ * ++ * @dev - member to resize ++ * @nbuckets - new number of buckets ++ */ ++struct bch_ioctl_disk_resize { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 nbuckets; ++}; ++ ++#endif /* _BCACHEFS_IOCTL_H */ +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +new file mode 100644 +index 000000000000..4d0c9129cd4a +--- /dev/null ++++ b/fs/bcachefs/bkey.c +@@ -0,0 +1,1154 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "bset.h" ++#include "util.h" ++ ++#undef EBUG_ON ++ ++#ifdef DEBUG_BKEYS ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) ++#endif ++ ++const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) ++{ ++ unsigned bit = high_bit_offset, done = 0; ++ ++ while (1) { ++ while (bit < 64) { ++ if (done && !(done % 8)) ++ *out++ = ' '; ++ *out++ = *p & (1ULL << (63 - bit)) ? 
'1' : '0'; ++ bit++; ++ done++; ++ if (done == nr_bits) { ++ *out++ = '\0'; ++ return; ++ } ++ } ++ ++ p = next_word(p); ++ bit = 0; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) ++{ ++ struct bkey tmp; ++ ++ BUG_ON(bkeyp_val_u64s(format, packed) != ++ bkey_val_u64s(unpacked)); ++ ++ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); ++ ++ tmp = __bch2_bkey_unpack_key(format, packed); ++ ++ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { ++ char buf1[160], buf2[160]; ++ char buf3[160], buf4[160]; ++ ++ bch2_bkey_to_text(&PBUF(buf1), unpacked); ++ bch2_bkey_to_text(&PBUF(buf2), &tmp); ++ bch2_to_binary(buf3, (void *) unpacked, 80); ++ bch2_to_binary(buf4, high_word(format, packed), 80); ++ ++ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", ++ format->key_u64s, ++ format->bits_per_field[0], ++ format->bits_per_field[1], ++ format->bits_per_field[2], ++ format->bits_per_field[3], ++ format->bits_per_field[4], ++ buf1, buf2, buf3, buf4); ++ } ++} ++ ++#else ++static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) {} ++#endif ++ ++struct pack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct pack_state pack_state_init(const struct bkey_format *format, ++ struct bkey_packed *k) ++{ ++ u64 *p = high_word(format, k); ++ ++ return (struct pack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = 0, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static void pack_state_finish(struct pack_state *state, ++ struct bkey_packed *k) ++{ ++ EBUG_ON(state->p < k->_data); ++ EBUG_ON(state->p >= k->_data + state->format->key_u64s); ++ ++ *state->p = state->w; ++} ++ ++struct unpack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ const u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct unpack_state unpack_state_init(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(format, k); ++ ++ return (struct unpack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = *p << high_bit_offset, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static u64 get_inc_field(struct unpack_state *state, unsigned field) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (bits >= state->bits) { ++ v = state->w >> (64 - bits); ++ bits -= state->bits; ++ ++ state->p = next_word(state->p); ++ state->w = *state->p; ++ state->bits = 64; ++ } ++ ++ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ ++ v |= (state->w >> 1) >> (63 - bits); ++ state->w <<= bits; ++ state->bits -= bits; ++ ++ return v + offset; ++} ++ ++__always_inline ++static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (v < offset) ++ return false; ++ ++ v -= offset; ++ ++ if (fls64(v) > bits) ++ return false; ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ /* avoid shift by 64 if bits is 0 - bits is never 64 here: 
*/ ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return true; ++} ++ ++/* ++ * Note: does NOT set out->format (we don't know what it should be here!) ++ * ++ * Also: doesn't work on extents - it doesn't preserve the invariant that ++ * if k is packed bkey_start_pos(k) will successfully pack ++ */ ++static bool bch2_bkey_transform_key(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ struct pack_state out_s = pack_state_init(out_f, out); ++ struct unpack_state in_s = unpack_state_init(in_f, in); ++ unsigned i; ++ ++ out->_data[0] = 0; ++ ++ for (i = 0; i < BKEY_NR_FIELDS; i++) ++ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) ++ return false; ++ ++ /* Can't happen because the val would be too big to unpack: */ ++ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); ++ ++ pack_state_finish(&out_s, out); ++ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ return true; ++} ++ ++bool bch2_bkey_transform(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ if (!bch2_bkey_transform_key(out_f, out, in_f, in)) ++ return false; ++ ++ memcpy_u64s((u64 *) out + out_f->key_u64s, ++ (u64 *) in + in_f->key_u64s, ++ (in->u64s - in_f->key_u64s)); ++ return true; ++} ++ ++#define bkey_fields() \ ++ x(BKEY_FIELD_INODE, p.inode) \ ++ x(BKEY_FIELD_OFFSET, p.offset) \ ++ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ ++ x(BKEY_FIELD_SIZE, size) \ ++ x(BKEY_FIELD_VERSION_HI, version.hi) \ ++ x(BKEY_FIELD_VERSION_LO, version.lo) ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bkey out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); ++ ++ out.u64s = BKEY_U64s + in->u64s - format->key_u64s; ++ out.format = KEY_FORMAT_CURRENT; ++ out.needs_whiteout = in->needs_whiteout; ++ out.type = in->type; ++ out.pad[0] = 0; ++ ++#define x(id, field) out.field = get_inc_field(&state, id); ++ bkey_fields() ++#undef x ++ ++ return out; ++} ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bpos out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ ++ out.inode = get_inc_field(&state, BKEY_FIELD_INODE); ++ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); ++ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); ++ ++ return out; ++} ++#endif ++ ++/** ++ * bch2_bkey_pack_key -- pack just the key, not the value ++ */ ++bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, ++ const struct bkey_format *format) ++{ ++ struct pack_state state = pack_state_init(format, out); ++ ++ EBUG_ON((void *) in == (void *) out); ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->format != KEY_FORMAT_CURRENT); ++ ++ out->_data[0] = 0; ++ 
++#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; ++ bkey_fields() ++#undef x ++ ++ /* ++ * Extents - we have to guarantee that if an extent is packed, a trimmed ++ * version will also pack: ++ */ ++ if (bkey_start_offset(in) < ++ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) ++ return false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = format->key_u64s + in->u64s - BKEY_U64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ bch2_bkey_pack_verify(out, in, format); ++ return true; ++} ++ ++/** ++ * bch2_bkey_unpack -- unpack the key and the value ++ */ ++void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, ++ const struct bkey_packed *src) ++{ ++ __bkey_unpack_key(b, &dst->k, src); ++ ++ memcpy_u64s(&dst->v, ++ bkeyp_val(&b->format, src), ++ bkeyp_val_u64s(&b->format, src)); ++} ++ ++/** ++ * bch2_bkey_pack -- pack the key and the value ++ */ ++bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, ++ const struct bkey_format *format) ++{ ++ struct bkey_packed tmp; ++ ++ if (!bch2_bkey_pack_key(&tmp, &in->k, format)) ++ return false; ++ ++ memmove_u64s((u64 *) out + format->key_u64s, ++ &in->v, ++ bkey_val_u64s(&in->k)); ++ memcpy_u64s(out, &tmp, format->key_u64s); ++ ++ return true; ++} ++ ++__always_inline ++static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ bool ret = true; ++ ++ EBUG_ON(v < offset); ++ v -= offset; ++ ++ if (fls64(v) > bits) { ++ v = ~(~0ULL << bits); ++ ret = false; ++ } ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static bool bkey_packed_successor(struct bkey_packed *out, ++ const struct btree *b, ++ struct bkey_packed k) ++{ ++ const struct bkey_format *f = &b->format; ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned first_bit, offset; ++ u64 *p; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ if (!nr_key_bits) ++ return false; ++ ++ *out = k; ++ ++ first_bit = high_bit_offset + nr_key_bits - 1; ++ p = nth_word(high_word(f, out), first_bit >> 6); ++ offset = 63 - (first_bit & 63); ++ ++ while (nr_key_bits) { ++ unsigned bits = min(64 - offset, nr_key_bits); ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if ((*p & mask) != mask) { ++ *p += 1ULL << offset; ++ EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); ++ return true; ++ } ++ ++ *p &= ~mask; ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ offset = 0; ++ } ++ ++ return false; ++} ++#endif ++ ++/* ++ * Returns a packed key that compares <= in ++ * ++ * This is used in bset_search_tree(), where we need a packed pos in order to be ++ * able to compare against the keys in the auxiliary search tree - and it's ++ * legal to use a packed pos that isn't equivalent to the original pos, ++ * _provided_ it compares <= to the original pos. 
++ */ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, ++ struct bpos in, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct pack_state state = pack_state_init(f, out); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos orig = in; ++#endif ++ bool exact = true; ++ ++ out->_data[0] = 0; ++ ++ if (unlikely(in.snapshot < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { ++ if (!in.offset-- && ++ !in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.offset < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { ++ if (!in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.inode < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) ++ return BKEY_PACK_POS_FAIL; ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) ++ exact = false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = f->key_u64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->type = KEY_TYPE_deleted; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ if (exact) { ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); ++ } else { ++ struct bkey_packed successor; ++ ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); ++ BUG_ON(bkey_packed_successor(&successor, b, *out) && ++ bkey_cmp_left_packed(b, &successor, &orig) < 0); ++ } ++#endif ++ ++ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; ++} ++ ++void bch2_bkey_format_init(struct bkey_format_state *s) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) ++ s->field_min[i] = U64_MAX; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_max); i++) ++ s->field_max[i] = 0; ++ ++ /* Make sure we can store a size of 0: */ ++ s->field_min[BKEY_FIELD_SIZE] = 0; ++} ++ ++static void __bkey_format_add(struct bkey_format_state *s, ++ unsigned field, u64 v) ++{ ++ s->field_min[field] = min(s->field_min[field], v); ++ s->field_max[field] = max(s->field_max[field], v); ++} ++ ++/* ++ * Changes @format so that @k can be successfully packed with @format ++ */ ++void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) ++{ ++#define x(id, field) __bkey_format_add(s, id, k->field); ++ bkey_fields() ++#undef x ++ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); ++} ++ ++void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) ++{ ++ unsigned field = 0; ++ ++ __bkey_format_add(s, field++, p.inode); ++ __bkey_format_add(s, field++, p.offset); ++ __bkey_format_add(s, field++, p.snapshot); ++} ++ ++/* ++ * We don't want it to be possible for the packed format to represent fields ++ * bigger than a u64... that will cause confusion and issues (like with ++ * bkey_packed_successor()) ++ */ ++static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, ++ unsigned bits, u64 offset) ++{ ++ offset = bits == 64 ? 
0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); ++ ++ f->bits_per_field[i] = bits; ++ f->field_offset[i] = cpu_to_le64(offset); ++} ++ ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ struct bkey_format ret = { ++ .nr_fields = BKEY_NR_FIELDS, ++ }; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { ++ s->field_min[i] = min(s->field_min[i], s->field_max[i]); ++ ++ set_format_field(&ret, i, ++ fls64(s->field_max[i] - s->field_min[i]), ++ s->field_min[i]); ++ ++ bits += ret.bits_per_field[i]; ++ } ++ ++ /* allow for extent merging: */ ++ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { ++ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; ++ bits += 4; ++ } ++ ++ ret.key_u64s = DIV_ROUND_UP(bits, 64); ++ ++ /* if we have enough spare bits, round fields up to nearest byte */ ++ bits = ret.key_u64s * 64 - bits; ++ ++ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { ++ unsigned r = round_up(ret.bits_per_field[i], 8) - ++ ret.bits_per_field[i]; ++ ++ if (r <= bits) { ++ set_format_field(&ret, i, ++ ret.bits_per_field[i] + r, ++ le64_to_cpu(ret.field_offset[i])); ++ bits -= r; ++ } ++ } ++ ++ EBUG_ON(bch2_bkey_format_validate(&ret)); ++ return ret; ++} ++ ++const char *bch2_bkey_format_validate(struct bkey_format *f) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ ++ if (f->nr_fields != BKEY_NR_FIELDS) ++ return "incorrect number of fields"; ++ ++ for (i = 0; i < f->nr_fields; i++) { ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (f->bits_per_field[i] > 64) ++ return "field too large"; ++ ++ if (field_offset && ++ (f->bits_per_field[i] == 64 || ++ (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < ++ field_offset))) ++ return "offset + bits overflow"; ++ ++ bits += f->bits_per_field[i]; ++ } ++ ++ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) ++ return "incorrect key_u64s"; ++ ++ return NULL; ++} ++ ++/* ++ * Most significant differing bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, ++ const struct bkey_packed *l_k, ++ const struct bkey_packed *r_k) ++{ ++ const u64 *l = high_word(&b->format, l_k); ++ const u64 *r = high_word(&b->format, r_k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned word_bits = 64 - high_bit_offset; ++ u64 l_v, r_v; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ /* for big endian, skip past header */ ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (nr_key_bits) { ++ if (nr_key_bits < word_bits) { ++ l_v >>= word_bits - nr_key_bits; ++ r_v >>= word_bits - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= word_bits; ++ } ++ ++ if (l_v != r_v) ++ return fls64(l_v ^ r_v) - 1 + nr_key_bits; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ word_bits = 64; ++ } ++ ++ return 0; ++} ++ ++/* ++ * First set bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(&b->format, k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned ret = 0, offset; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ offset = nr_key_bits; ++ while (offset > 64) { ++ p = next_word(p); ++ offset -= 64; ++ } ++ ++ offset = 64 - offset; ++ ++ while (nr_key_bits) { ++ unsigned bits = nr_key_bits + offset < 64 ++ ? 
nr_key_bits ++ : 64 - offset; ++ ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if (*p & mask) ++ return ret + __ffs64(*p & mask) - offset; ++ ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ ret += bits; ++ offset = 0; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_X86_64 ++ ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ long d0, d1, d2, d3; ++ int cmp; ++ ++ /* we shouldn't need asm for this, but gcc is being retarded: */ ++ ++ asm(".intel_syntax noprefix;" ++ "xor eax, eax;" ++ "xor edx, edx;" ++ "1:;" ++ "mov r8, [rdi];" ++ "mov r9, [rsi];" ++ "sub ecx, 64;" ++ "jl 2f;" ++ ++ "cmp r8, r9;" ++ "jnz 3f;" ++ ++ "lea rdi, [rdi - 8];" ++ "lea rsi, [rsi - 8];" ++ "jmp 1b;" ++ ++ "2:;" ++ "not ecx;" ++ "shr r8, 1;" ++ "shr r9, 1;" ++ "shr r8, cl;" ++ "shr r9, cl;" ++ "cmp r8, r9;" ++ ++ "3:\n" ++ "seta al;" ++ "setb dl;" ++ "sub eax, edx;" ++ ".att_syntax prefix;" ++ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) ++ : "0" (l), "1" (r), "3" (nr_key_bits) ++ : "r8", "r9", "cc", "memory"); ++ ++ return cmp; ++} ++ ++#define I(_x) (*(out)++ = (_x)) ++#define I1(i0) I(i0) ++#define I2(i0, i1) (I1(i0), I(i1)) ++#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) ++#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) ++#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) ++ ++static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, ++ enum bch_bkey_fields field, ++ unsigned dst_offset, unsigned dst_size, ++ bool *eax_zeroed) ++{ ++ unsigned bits = format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(format->field_offset[field]); ++ unsigned i, byte, bit_offset, align, shl, shr; ++ ++ if (!bits && !offset) { ++ if (!*eax_zeroed) { ++ /* xor eax, eax */ ++ I2(0x31, 0xc0); ++ } ++ ++ *eax_zeroed = true; ++ goto set_field; ++ } ++ ++ if (!bits) { ++ /* just return offset: */ ++ ++ switch (dst_size) { ++ case 8: ++ if (offset > S32_MAX) { ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ ++ I3(0xc7, 0x47, dst_offset + 4); ++ memcpy(out, (void *) &offset + 4, 4); ++ out += 4; ++ } else { ++ /* mov [rdi + dst_offset], offset */ ++ /* sign extended */ ++ I4(0x48, 0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++ } ++ ++ bit_offset = format->key_u64s * 64; ++ for (i = 0; i <= field; i++) ++ bit_offset -= format->bits_per_field[i]; ++ ++ byte = bit_offset / 8; ++ bit_offset -= byte * 8; ++ ++ *eax_zeroed = false; ++ ++ if (bit_offset == 0 && bits == 8) { ++ /* movzx eax, BYTE PTR [rsi + imm8] */ ++ I4(0x0f, 0xb6, 0x46, byte); ++ } else if (bit_offset == 0 && bits == 16) { ++ /* movzx eax, WORD PTR [rsi + imm8] */ ++ I4(0x0f, 0xb7, 0x46, byte); ++ } else if (bit_offset + bits <= 32) { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 32); ++ ++ /* mov eax, [rsi + imm8] */ ++ I3(0x8b, 0x46, byte); ++ ++ if (bit_offset) { ++ /* shr eax, imm8 */ ++ I3(0xc1, 0xe8, bit_offset); ++ } ++ ++ if (bit_offset + bits < 32) { ++ unsigned mask = ~0U >> (32 - bits); ++ ++ /* and eax, imm32 */ ++ I1(0x25); ++ memcpy(out, &mask, 4); ++ out += 4; ++ } ++ } else if (bit_offset + bits <= 64) { ++ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); ++ byte -= align; ++ 
bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 64); ++ ++ /* mov rax, [rsi + imm8] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ shl = 64 - bit_offset - bits; ++ shr = bit_offset + shl; ++ ++ if (shl) { ++ /* shl rax, imm8 */ ++ I4(0x48, 0xc1, 0xe0, shl); ++ } ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } else { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 96); ++ ++ /* mov rax, [rsi + byte] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ /* mov edx, [rsi + byte + 8] */ ++ I3(0x8b, 0x56, byte + 8); ++ ++ /* bits from next word: */ ++ shr = bit_offset + bits - 64; ++ BUG_ON(shr > bit_offset); ++ ++ /* shr rax, bit_offset */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ ++ /* shl rdx, imm8 */ ++ I4(0x48, 0xc1, 0xe2, 64 - shr); ++ ++ /* or rax, rdx */ ++ I3(0x48, 0x09, 0xd0); ++ ++ shr = bit_offset - shr; ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } ++ ++ /* rax += offset: */ ++ if (offset > S32_MAX) { ++ /* mov rdx, imm64 */ ++ I2(0x48, 0xba); ++ memcpy(out, &offset, 8); ++ out += 8; ++ /* add %rdx, %rax */ ++ I3(0x48, 0x01, 0xd0); ++ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { ++ /* add rax, imm32 */ ++ I2(0x48, 0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } else if (offset) { ++ /* add eax, imm32 */ ++ I1(0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++set_field: ++ switch (dst_size) { ++ case 8: ++ /* mov [rdi + dst_offset], rax */ ++ I4(0x48, 0x89, 0x47, dst_offset); ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], eax */ ++ I3(0x89, 0x47, dst_offset); ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++} ++ ++int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) ++{ ++ bool eax_zeroed = false; ++ u8 *out = _out; ++ ++ /* ++ * rdi: dst - unpacked key ++ * rsi: src - packed key ++ */ ++ ++ /* k->u64s, k->format, k->type */ ++ ++ /* mov eax, [rsi] */ ++ I2(0x8b, 0x06); ++ ++ /* add eax, BKEY_U64s - format->key_u64s */ ++ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); ++ ++ /* and eax, imm32: mask out k->pad: */ ++ I5(0x25, 0xff, 0xff, 0xff, 0); ++ ++ /* mov [rdi], eax */ ++ I2(0x89, 0x07); ++ ++#define x(id, field) \ ++ out = compile_bkey_field(format, out, id, \ ++ offsetof(struct bkey, field), \ ++ sizeof(((struct bkey *) NULL)->field), \ ++ &eax_zeroed); ++ bkey_fields() ++#undef x ++ ++ /* retq */ ++ I1(0xc3); ++ ++ return (void *) out - _out; ++} ++ ++#else ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ u64 l_v, r_v; ++ ++ if (!nr_key_bits) ++ return 0; ++ ++ /* for big endian, skip past header */ ++ nr_key_bits += high_bit_offset; ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (1) { ++ if (nr_key_bits < 64) { ++ l_v >>= 64 - nr_key_bits; ++ r_v >>= 64 - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= 64; ++ } ++ ++ if (!nr_key_bits || l_v != r_v) ++ break; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ } ++ ++ return cmp_int(l_v, r_v); ++} ++#endif ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ int ret; ++ ++ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ ret = __bkey_cmp_bits(high_word(f, l), ++ high_word(f, r), ++ 
b->nr_key_bits); ++ ++ EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), ++ bkey_unpack_pos(b, r))); ++ return ret; ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_packed(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ struct bkey unpacked; ++ ++ if (likely(bkey_packed(l) && bkey_packed(r))) ++ return __bch2_bkey_cmp_packed_format_checked(l, r, b); ++ ++ if (bkey_packed(l)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, l); ++ l = (void*) &unpacked; ++ } else if (bkey_packed(r)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, r); ++ r = (void*) &unpacked; ++ } ++ ++ return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ const struct bkey *l_unpacked; ++ ++ return unlikely(l_unpacked = packed_to_bkey_c(l)) ++ ? bkey_cmp(l_unpacked->p, *r) ++ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++void bch2_bpos_swab(struct bpos *p) ++{ ++ u8 *l = (u8 *) p; ++ u8 *h = ((u8 *) &p[1]) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) ++{ ++ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; ++ u8 *l = k->key_start; ++ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void) ++{ ++ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); ++ struct bkey_packed p; ++ ++ struct bkey_format test_format = { ++ .key_u64s = 2, ++ .nr_fields = BKEY_NR_FIELDS, ++ .bits_per_field = { ++ 13, ++ 64, ++ }, ++ }; ++ ++ struct unpack_state in_s = ++ unpack_state_init(&bch2_bkey_format_current, (void *) &t); ++ struct pack_state out_s = pack_state_init(&test_format, &p); ++ unsigned i; ++ ++ for (i = 0; i < out_s.format->nr_fields; i++) { ++ u64 a, v = get_inc_field(&in_s, i); ++ ++ switch (i) { ++#define x(id, field) case id: a = t.field; break; ++ bkey_fields() ++#undef x ++ default: ++ BUG(); ++ } ++ ++ if (a != v) ++ panic("got %llu actual %llu i %u\n", v, a, i); ++ ++ if (!set_inc_field(&out_s, i, v)) ++ panic("failed at %u\n", i); ++ } ++ ++ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); ++} ++#endif +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +new file mode 100644 +index 000000000000..cbcfbd26bc58 +--- /dev/null ++++ b/fs/bcachefs/bkey.h +@@ -0,0 +1,605 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_H ++#define _BCACHEFS_BKEY_H ++ ++#include ++#include "bcachefs_format.h" ++ ++#include "util.h" ++#include "vstructs.h" ++ ++#ifdef CONFIG_X86_64 ++#define HAVE_BCACHEFS_COMPILED_UNPACK 1 ++#endif ++ ++void bch2_to_binary(char *, const u64 *, unsigned); ++ ++/* bkey with split value, const */ ++struct bkey_s_c { ++ const struct bkey *k; ++ const struct bch_val *v; ++}; ++ ++/* bkey with split value */ ++struct bkey_s { ++ union { ++ struct { ++ struct bkey *k; ++ struct bch_val *v; ++ }; ++ struct bkey_s_c s_c; ++ }; ++}; ++ ++#define bkey_next(_k) vstruct_next(_k) ++ ++static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, ++ struct bkey_packed *end) ++{ ++ k = bkey_next(k); 
++ ++ while (k != end && !k->u64s) ++ k = (void *) ((u64 *) k + 1); ++ return k; ++} ++ ++#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) ++ ++static inline size_t bkey_val_bytes(const struct bkey *k) ++{ ++ return bkey_val_u64s(k) * sizeof(u64); ++} ++ ++static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) ++{ ++ k->u64s = BKEY_U64s + val_u64s; ++} ++ ++static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) ++{ ++ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) ++ ++#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) ++ ++#define bkey_whiteout(_k) \ ++ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) ++ ++#define bkey_packed_typecheck(_k) \ ++({ \ ++ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ ++ !type_is(_k, struct bkey_packed *)); \ ++ type_is(_k, struct bkey_packed *); \ ++}) ++ ++enum bkey_lr_packed { ++ BKEY_PACKED_BOTH, ++ BKEY_PACKED_RIGHT, ++ BKEY_PACKED_LEFT, ++ BKEY_PACKED_NONE, ++}; ++ ++#define bkey_lr_packed_typecheck(_l, _r) \ ++ (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) ++ ++#define bkey_lr_packed(_l, _r) \ ++ ((_l)->format + ((_r)->format << 1)) ++ ++#define bkey_copy(_dst, _src) \ ++do { \ ++ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ ++ !type_is(_dst, struct bkey_packed *)); \ ++ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ ++ !type_is(_src, struct bkey_packed *)); \ ++ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ ++ (u64 *) (_dst) < (u64 *) (_src) + \ ++ ((struct bkey *) (_src))->u64s); \ ++ \ ++ memcpy_u64s_small((_dst), (_src), \ ++ ((struct bkey *) (_src))->u64s); \ ++} while (0) ++ ++struct btree; ++ ++struct bkey_format_state { ++ u64 field_min[BKEY_NR_FIELDS]; ++ u64 field_max[BKEY_NR_FIELDS]; ++}; ++ ++void bch2_bkey_format_init(struct bkey_format_state *); ++void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); ++void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); ++const char *bch2_bkey_format_validate(struct bkey_format *); ++ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++__pure ++unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, ++ const struct bkey_packed *, ++ const struct btree *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++__pure ++int __bch2_bkey_cmp_packed(const struct bkey_packed *, ++ const struct bkey_packed *, ++ const struct btree *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++static inline __pure ++int bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, const struct bpos *r) ++{ ++ return __bch2_bkey_cmp_left_packed(b, l, r); ++} ++ ++/* ++ * we prefer to pass bpos by ref, but it's often enough terribly convenient to ++ * pass it by by val... 
as much as I hate c++, const ref would be nice here: ++ */ ++__pure __flatten ++static inline int bkey_cmp_left_packed_byval(const struct btree *b, ++ const struct bkey_packed *l, ++ struct bpos r) ++{ ++ return bkey_cmp_left_packed(b, l, &r); ++} ++ ++/* ++ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to ++ * skip dispatching on k->format: ++ */ ++#define bkey_cmp_packed(_b, _l, _r) \ ++({ \ ++ int _cmp; \ ++ \ ++ switch (bkey_lr_packed_typecheck(_l, _r)) { \ ++ case BKEY_PACKED_NONE: \ ++ _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ ++ ((struct bkey *) (_r))->p); \ ++ break; \ ++ case BKEY_PACKED_LEFT: \ ++ _cmp = bkey_cmp_left_packed((_b), \ ++ (struct bkey_packed *) (_l), \ ++ &((struct bkey *) (_r))->p); \ ++ break; \ ++ case BKEY_PACKED_RIGHT: \ ++ _cmp = -bkey_cmp_left_packed((_b), \ ++ (struct bkey_packed *) (_r), \ ++ &((struct bkey *) (_l))->p); \ ++ break; \ ++ case BKEY_PACKED_BOTH: \ ++ _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ ++ (void *) (_r), (_b)); \ ++ break; \ ++ } \ ++ _cmp; \ ++}) ++ ++#if 1 ++static __always_inline int bkey_cmp(struct bpos l, struct bpos r) ++{ ++ if (l.inode != r.inode) ++ return l.inode < r.inode ? -1 : 1; ++ if (l.offset != r.offset) ++ return l.offset < r.offset ? -1 : 1; ++ if (l.snapshot != r.snapshot) ++ return l.snapshot < r.snapshot ? -1 : 1; ++ return 0; ++} ++#else ++int bkey_cmp(struct bpos l, struct bpos r); ++#endif ++ ++static inline struct bpos bpos_min(struct bpos l, struct bpos r) ++{ ++ return bkey_cmp(l, r) < 0 ? l : r; ++} ++ ++void bch2_bpos_swab(struct bpos *); ++void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); ++ ++static __always_inline int bversion_cmp(struct bversion l, struct bversion r) ++{ ++ return cmp_int(l.hi, r.hi) ?: ++ cmp_int(l.lo, r.lo); ++} ++ ++#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) ++#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) ++ ++static __always_inline int bversion_zero(struct bversion v) ++{ ++ return !bversion_cmp(v, ZERO_VERSION); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++/* statement expressions confusing unlikely()? */ ++#define bkey_packed(_k) \ ++ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ ++ (_k)->format != KEY_FORMAT_CURRENT; }) ++#else ++#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) ++#endif ++ ++/* ++ * It's safe to treat an unpacked bkey as a packed one, but not the reverse ++ */ ++static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) ++{ ++ return (struct bkey_packed *) k; ++} ++ ++static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) ++{ ++ return (const struct bkey_packed *) k; ++} ++ ++static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? NULL : (struct bkey_i *) k; ++} ++ ++static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? 
NULL : (const struct bkey *) k; ++} ++ ++static inline unsigned bkey_format_key_bits(const struct bkey_format *format) ++{ ++ return format->bits_per_field[BKEY_FIELD_INODE] + ++ format->bits_per_field[BKEY_FIELD_OFFSET] + ++ format->bits_per_field[BKEY_FIELD_SNAPSHOT]; ++} ++ ++static inline struct bpos bkey_successor(struct bpos p) ++{ ++ struct bpos ret = p; ++ ++ if (!++ret.offset) ++ BUG_ON(!++ret.inode); ++ ++ return ret; ++} ++ ++static inline struct bpos bkey_predecessor(struct bpos p) ++{ ++ struct bpos ret = p; ++ ++ if (!ret.offset--) ++ BUG_ON(!ret.inode--); ++ ++ return ret; ++} ++ ++static inline u64 bkey_start_offset(const struct bkey *k) ++{ ++ return k->p.offset - k->size; ++} ++ ++static inline struct bpos bkey_start_pos(const struct bkey *k) ++{ ++ return (struct bpos) { ++ .inode = k->p.inode, ++ .offset = bkey_start_offset(k), ++ .snapshot = k->p.snapshot, ++ }; ++} ++ ++/* Packed helpers */ ++ ++static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; ++ ++ EBUG_ON(k->u64s < ret); ++ return ret; ++} ++ ++static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_key_u64s(format, k) * sizeof(u64); ++} ++ ++static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return k->u64s - bkeyp_key_u64s(format, k); ++} ++ ++static inline size_t bkeyp_val_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_val_u64s(format, k) * sizeof(u64); ++} ++ ++static inline void set_bkeyp_val_u64s(const struct bkey_format *format, ++ struct bkey_packed *k, unsigned val_u64s) ++{ ++ k->u64s = bkeyp_key_u64s(format, k) + val_u64s; ++} ++ ++#define bkeyp_val(_format, _k) \ ++ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) ++ ++extern const struct bkey_format bch2_bkey_format_current; ++ ++bool bch2_bkey_transform(const struct bkey_format *, ++ struct bkey_packed *, ++ const struct bkey_format *, ++ const struct bkey_packed *); ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *, ++ const struct bkey_packed *); ++#endif ++ ++bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, ++ const struct bkey_format *); ++ ++enum bkey_pack_pos_ret { ++ BKEY_PACK_POS_EXACT, ++ BKEY_PACK_POS_SMALLER, ++ BKEY_PACK_POS_FAIL, ++}; ++ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, ++ const struct btree *); ++ ++static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, ++ const struct btree *b) ++{ ++ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; ++} ++ ++void bch2_bkey_unpack(const struct btree *, struct bkey_i *, ++ const struct bkey_packed *); ++bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, ++ const struct bkey_format *); ++ ++static inline u64 bkey_field_max(const struct bkey_format *f, ++ enum bch_bkey_fields nr) ++{ ++ return f->bits_per_field[nr] < 64 ++ ? 
(le64_to_cpu(f->field_offset[nr]) + ++ ~(~0ULL << f->bits_per_field[nr])) ++ : U64_MAX; ++} ++ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ ++int bch2_compile_bkey_format(const struct bkey_format *, void *); ++ ++#else ++ ++static inline int bch2_compile_bkey_format(const struct bkey_format *format, ++ void *out) { return 0; } ++ ++#endif ++ ++static inline void bkey_reassemble(struct bkey_i *dst, ++ struct bkey_s_c src) ++{ ++ dst->k = *src.k; ++ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); ++} ++ ++#define bkey_s_null ((struct bkey_s) { .k = NULL }) ++#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) ++ ++#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) ++#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) ++ ++static inline struct bkey_s bkey_to_s(struct bkey *k) ++{ ++ return (struct bkey_s) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) ++{ ++ return (struct bkey_s_c) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) ++{ ++ return (struct bkey_s) { .k = &k->k, .v = &k->v }; ++} ++ ++static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) ++{ ++ return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; ++} ++ ++/* ++ * For a given type of value (e.g. struct bch_extent), generates the types for ++ * bkey + bch_extent - inline, split, split const - and also all the conversion ++ * functions, which also check that the value is of the correct type. ++ * ++ * We use anonymous unions for upcasting - e.g. converting from e.g. a ++ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion ++ * functions. ++ */ ++#define BKEY_VAL_ACCESSORS(name) \ ++struct bkey_i_##name { \ ++ union { \ ++ struct bkey k; \ ++ struct bkey_i k_i; \ ++ }; \ ++ struct bch_##name v; \ ++}; \ ++ \ ++struct bkey_s_c_##name { \ ++ union { \ ++ struct { \ ++ const struct bkey *k; \ ++ const struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++struct bkey_s_##name { \ ++ union { \ ++ struct { \ ++ struct bkey *k; \ ++ struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c_##name c; \ ++ struct bkey_s s; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline const struct bkey_i_##name * \ ++bkey_i_to_##name##_c(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ ++{ \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++name##_i_to_s_c(const struct bkey_i_##name *k) \ ++{ \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name 
bkey_i_to_s_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++bkey_i_to_s_c_##name(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ ++{ \ ++ struct bkey_i_##name *k = \ ++ container_of(&_k->k, struct bkey_i_##name, k); \ ++ \ ++ bkey_init(&k->k); \ ++ memset(&k->v, 0, sizeof(k->v)); \ ++ k->k.type = KEY_TYPE_##name; \ ++ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ ++ \ ++ return k; \ ++} ++ ++BKEY_VAL_ACCESSORS(cookie); ++BKEY_VAL_ACCESSORS(btree_ptr); ++BKEY_VAL_ACCESSORS(extent); ++BKEY_VAL_ACCESSORS(reservation); ++BKEY_VAL_ACCESSORS(inode); ++BKEY_VAL_ACCESSORS(inode_generation); ++BKEY_VAL_ACCESSORS(dirent); ++BKEY_VAL_ACCESSORS(xattr); ++BKEY_VAL_ACCESSORS(alloc); ++BKEY_VAL_ACCESSORS(quota); ++BKEY_VAL_ACCESSORS(stripe); ++BKEY_VAL_ACCESSORS(reflink_p); ++BKEY_VAL_ACCESSORS(reflink_v); ++BKEY_VAL_ACCESSORS(inline_data); ++BKEY_VAL_ACCESSORS(btree_ptr_v2); ++ ++/* byte order helpers */ ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return f->key_u64s - 1; ++} ++ ++#define high_bit_offset 0 ++#define nth_word(p, n) ((p) - (n)) ++ ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return 0; ++} ++ ++#define high_bit_offset KEY_PACKED_BITS_START ++#define nth_word(p, n) ((p) + (n)) ++ ++#else ++#error edit for your odd byteorder. 
++#endif ++ ++#define high_word(f, k) ((k)->_data + high_word_offset(f)) ++#define next_word(p) nth_word(p, 1) ++#define prev_word(p) nth_word(p, -1) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void); ++#else ++static inline void bch2_bkey_pack_test(void) {} ++#endif ++ ++#endif /* _BCACHEFS_BKEY_H */ +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +new file mode 100644 +index 000000000000..36e0c5152b47 +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.c +@@ -0,0 +1,353 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "alloc_background.h" ++#include "dirent.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "quota.h" ++#include "reflink.h" ++#include "xattr.h" ++ ++const char * const bch2_bkey_types[] = { ++#define x(name, nr) #name, ++ BCH_BKEY_TYPES() ++#undef x ++ NULL ++}; ++ ++static const char *deleted_key_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_deleted (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++#define bch2_bkey_ops_discard (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k)) ++ return "value size should be zero"; ++ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_error (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static const char *key_type_cookie_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_cookie (struct bkey_ops) { \ ++ .key_invalid = key_type_cookie_invalid, \ ++} ++ ++#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static const char *key_type_inline_data_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ return NULL; ++} ++ ++static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); ++} ++ ++#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ ++ .key_invalid = key_type_inline_data_invalid, \ ++ .val_to_text = key_type_inline_data_to_text, \ ++} ++ ++static const struct bkey_ops bch2_bkey_ops[] = { ++#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, ++ BCH_BKEY_TYPES() ++#undef x ++}; ++ ++const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (k.k->type >= KEY_TYPE_MAX) ++ return "invalid type"; ++ ++ return bch2_bkey_ops[k.k->type].key_invalid(c, k); ++} ++ ++const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type) ++{ ++ if (k.k->u64s < BKEY_U64s) ++ return "u64s too small"; ++ ++ if (type == BKEY_TYPE_BTREE && ++ bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) ++ return "value too big"; ++ ++ if (btree_node_type_is_extents(type)) { ++ if ((k.k->size == 0) != bkey_deleted(k.k)) ++ return "bad size field"; ++ ++ if (k.k->size > k.k->p.offset) ++ return "size greater than offset"; ++ } else { ++ if (k.k->size) ++ return "nonzero size field"; ++ } ++ ++ if (k.k->p.snapshot) ++ return "nonzero snapshot"; ++ ++ if (type != BKEY_TYPE_BTREE && ++ !bkey_cmp(k.k->p, POS_MAX)) ++ return "POS_MAX key"; ++ ++ return NULL; ++} ++ ++const char *bch2_bkey_invalid(struct 
bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type) ++{ ++ return __bch2_bkey_invalid(c, k, type) ?: ++ bch2_bkey_val_invalid(c, k); ++} ++ ++const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) ++{ ++ if (bkey_cmp(k.k->p, b->data->min_key) < 0) ++ return "key before start of btree node"; ++ ++ if (bkey_cmp(k.k->p, b->data->max_key) > 0) ++ return "key past end of btree node"; ++ ++ return NULL; ++} ++ ++void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ const char *invalid; ++ ++ BUG_ON(!k.k->u64s); ++ ++ invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, k); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); ++ return; ++ } ++ ++ if (ops->key_debugcheck) ++ ops->key_debugcheck(c, k); ++} ++ ++void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) ++{ ++ if (!bkey_cmp(pos, POS_MIN)) ++ pr_buf(out, "POS_MIN"); ++ else if (!bkey_cmp(pos, POS_MAX)) ++ pr_buf(out, "POS_MAX"); ++ else ++ pr_buf(out, "%llu:%llu", pos.inode, pos.offset); ++} ++ ++void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) ++{ ++ if (k) { ++ pr_buf(out, "u64s %u type %s ", k->u64s, ++ bch2_bkey_types[k->type]); ++ ++ bch2_bpos_to_text(out, k->p); ++ ++ pr_buf(out, " snap %u len %u ver %llu", ++ k->p.snapshot, k->size, k->version.lo); ++ } else { ++ pr_buf(out, "(null)"); ++ } ++} ++ ++void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (likely(ops->val_to_text)) ++ ops->val_to_text(out, c, k); ++} ++ ++void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_to_text(out, k.k); ++ ++ if (k.k) { ++ pr_buf(out, ": "); ++ bch2_val_to_text(out, c, k); ++ } ++} ++ ++void bch2_bkey_swab_val(struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (ops->swab) ++ ops->swab(k); ++} ++ ++bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ return ops->key_normalize ++ ? 
ops->key_normalize(c, k) ++ : false; ++} ++ ++enum merge_result bch2_bkey_merge(struct bch_fs *c, ++ struct bkey_s l, struct bkey_s r) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; ++ enum merge_result ret; ++ ++ if (key_merging_disabled(c) || ++ !ops->key_merge || ++ l.k->type != r.k->type || ++ bversion_cmp(l.k->version, r.k->version) || ++ bkey_cmp(l.k->p, bkey_start_pos(r.k))) ++ return BCH_MERGE_NOMERGE; ++ ++ ret = ops->key_merge(c, l, r); ++ ++ if (ret != BCH_MERGE_NOMERGE) ++ l.k->needs_whiteout |= r.k->needs_whiteout; ++ return ret; ++} ++ ++static const struct old_bkey_type { ++ u8 btree_node_type; ++ u8 old; ++ u8 new; ++} bkey_renumber_table[] = { ++ {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, ++ {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, ++ {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, ++ {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, ++ {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, ++ {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, ++ {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, ++ {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, ++ {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, ++ {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, ++ {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, ++ {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, ++}; ++ ++void bch2_bkey_renumber(enum btree_node_type btree_node_type, ++ struct bkey_packed *k, ++ int write) ++{ ++ const struct old_bkey_type *i; ++ ++ for (i = bkey_renumber_table; ++ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); ++ i++) ++ if (btree_node_type == i->btree_node_type && ++ k->type == (write ? i->new : i->old)) { ++ k->type = write ? i->old : i->new; ++ break; ++ } ++} ++ ++void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ const struct bkey_ops *ops; ++ struct bkey uk; ++ struct bkey_s u; ++ int i; ++ ++ /* ++ * Do these operations in reverse order in the write path: ++ */ ++ ++ for (i = 0; i < 4; i++) ++ switch (!write ? 
i : 3 - i) { ++ case 0: ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_key(f, k); ++ break; ++ case 1: ++ if (version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); ++ break; ++ case 2: ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) { ++ if (!bkey_packed(k)) { ++ struct bkey_i *u = packed_to_bkey(k); ++ swap(u->k.p.inode, u->k.p.offset); ++ } else if (f->bits_per_field[BKEY_FIELD_INODE] && ++ f->bits_per_field[BKEY_FIELD_OFFSET]) { ++ struct bkey_format tmp = *f, *in = f, *out = &tmp; ++ ++ swap(tmp.bits_per_field[BKEY_FIELD_INODE], ++ tmp.bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(tmp.field_offset[BKEY_FIELD_INODE], ++ tmp.field_offset[BKEY_FIELD_OFFSET]); ++ ++ if (!write) ++ swap(in, out); ++ ++ uk = __bch2_bkey_unpack_key(in, k); ++ swap(uk.p.inode, uk.p.offset); ++ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); ++ } ++ } ++ break; ++ case 3: ++ if (!bkey_packed(k)) { ++ u = bkey_i_to_s(packed_to_bkey(k)); ++ } else { ++ uk = __bch2_bkey_unpack_key(f, k); ++ u.k = &uk; ++ u.v = bkeyp_val(f, k); ++ } ++ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_val(u); ++ ++ ops = &bch2_bkey_ops[k->type]; ++ ++ if (ops->compat) ++ ops->compat(btree_id, version, big_endian, write, u); ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +new file mode 100644 +index 000000000000..0bca725ae3b8 +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.h +@@ -0,0 +1,82 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_METHODS_H ++#define _BCACHEFS_BKEY_METHODS_H ++ ++#include "bkey.h" ++ ++struct bch_fs; ++struct btree; ++struct bkey; ++enum btree_node_type; ++ ++extern const char * const bch2_bkey_types[]; ++ ++enum merge_result { ++ BCH_MERGE_NOMERGE, ++ ++ /* ++ * The keys were mergeable, but would have overflowed size - so instead ++ * l was changed to the maximum size, and both keys were modified: ++ */ ++ BCH_MERGE_PARTIAL, ++ BCH_MERGE_MERGE, ++}; ++ ++struct bkey_ops { ++ /* Returns reason for being invalid if invalid, else NULL: */ ++ const char * (*key_invalid)(const struct bch_fs *, ++ struct bkey_s_c); ++ void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); ++ void (*val_to_text)(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ void (*swab)(struct bkey_s); ++ bool (*key_normalize)(struct bch_fs *, struct bkey_s); ++ enum merge_result (*key_merge)(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ void (*compat)(enum btree_id id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s); ++}; ++ ++const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); ++const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type); ++const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type); ++const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); ++ ++void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); ++ ++void bch2_bpos_to_text(struct printbuf *, struct bpos); ++void bch2_bkey_to_text(struct printbuf *, const struct bkey *); ++void bch2_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++void bch2_bkey_swab_val(struct bkey_s); ++ ++bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); ++ ++enum merge_result bch2_bkey_merge(struct bch_fs *, ++ struct bkey_s, struct 
bkey_s); ++ ++void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); ++ ++void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, ++ int, struct bkey_format *, struct bkey_packed *); ++ ++static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ if (version < bcachefs_metadata_version_current || ++ big_endian != CPU_BIG_ENDIAN) ++ __bch2_bkey_compat(level, btree_id, version, ++ big_endian, write, f, k); ++ ++} ++ ++#endif /* _BCACHEFS_BKEY_METHODS_H */ +diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h +new file mode 100644 +index 000000000000..f607a0cb37ed +--- /dev/null ++++ b/fs/bcachefs/bkey_on_stack.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_ON_STACK_H ++#define _BCACHEFS_BKEY_ON_STACK_H ++ ++#include "bcachefs.h" ++ ++struct bkey_on_stack { ++ struct bkey_i *k; ++ u64 onstack[12]; ++}; ++ ++static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, ++ struct bch_fs *c, unsigned u64s) ++{ ++ if (s->k == (void *) s->onstack && ++ u64s > ARRAY_SIZE(s->onstack)) { ++ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); ++ memcpy(s->k, s->onstack, sizeof(s->onstack)); ++ } ++} ++ ++static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, ++ struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bkey_on_stack_realloc(s, c, k.k->u64s); ++ bkey_reassemble(s->k, k); ++} ++ ++static inline void bkey_on_stack_init(struct bkey_on_stack *s) ++{ ++ s->k = (void *) s->onstack; ++} ++ ++static inline void bkey_on_stack_exit(struct bkey_on_stack *s, ++ struct bch_fs *c) ++{ ++ if (s->k != (void *) s->onstack) ++ mempool_free(s->k, &c->large_bkey_pool); ++ s->k = NULL; ++} ++ ++#endif /* _BCACHEFS_BKEY_ON_STACK_H */ +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +new file mode 100644 +index 000000000000..839e78d1dc35 +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.c +@@ -0,0 +1,515 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "bkey_sort.h" ++#include "bset.h" ++#include "extents.h" ++ ++typedef int (*sort_cmp_fn)(struct btree *, ++ struct bkey_packed *, ++ struct bkey_packed *); ++ ++static inline bool sort_iter_end(struct sort_iter *iter) ++{ ++ return !iter->used; ++} ++ ++static inline void __sort_iter_sift(struct sort_iter *iter, ++ unsigned from, ++ sort_cmp_fn cmp) ++{ ++ unsigned i; ++ ++ for (i = from; ++ i + 1 < iter->used && ++ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; ++ i++) ++ swap(iter->data[i], iter->data[i + 1]); ++} ++ ++static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ ++ __sort_iter_sift(iter, 0, cmp); ++} ++ ++static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ unsigned i = iter->used; ++ ++ while (i--) ++ __sort_iter_sift(iter, i, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) ++{ ++ return !sort_iter_end(iter) ? 
iter->data->k : NULL; ++} ++ ++static inline void __sort_iter_advance(struct sort_iter *iter, ++ unsigned idx, sort_cmp_fn cmp) ++{ ++ struct sort_iter_set *i = iter->data + idx; ++ ++ BUG_ON(idx >= iter->used); ++ ++ i->k = bkey_next_skip_noops(i->k, i->end); ++ ++ BUG_ON(i->k > i->end); ++ ++ if (i->k == i->end) ++ array_remove_item(iter->data, iter->used, idx); ++ else ++ __sort_iter_sift(iter, idx, cmp); ++} ++ ++static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ __sort_iter_advance(iter, 0, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, ++ sort_cmp_fn cmp) ++{ ++ struct bkey_packed *ret = sort_iter_peek(iter); ++ ++ if (ret) ++ sort_iter_advance(iter, cmp); ++ ++ return ret; ++} ++ ++/* ++ * If keys compare equal, compare by pointer order: ++ */ ++static inline int key_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ cmp_int((unsigned long) l, (unsigned long) r); ++} ++ ++static inline bool should_drop_next_key(struct sort_iter *iter) ++{ ++ /* ++ * key_sort_cmp() ensures that when keys compare equal the older key ++ * comes first; so if l->k compares equal to r->k then l->k is older ++ * and should be dropped. ++ */ ++ return iter->used >= 2 && ++ !bkey_cmp_packed(iter->b, ++ iter->data[0].k, ++ iter->data[1].k); ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) ++{ ++ struct bkey_packed *out = dst->start; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ sort_iter_sort(iter, key_sort_fix_overlapping_cmp); ++ ++ while ((k = sort_iter_peek(iter))) { ++ if (!bkey_whiteout(k) && ++ !should_drop_next_key(iter)) { ++ bkey_copy(out, k); ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ sort_iter_advance(iter, key_sort_fix_overlapping_cmp); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++static void extent_sort_append(struct bch_fs *c, ++ struct bkey_format *f, ++ struct btree_nr_keys *nr, ++ struct bkey_packed **out, ++ struct bkey_s k) ++{ ++ if (!bkey_whiteout(k.k)) { ++ if (!bch2_bkey_pack_key(*out, k.k, f)) ++ memcpy_u64s_small(*out, k.k, BKEY_U64s); ++ ++ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); ++ ++ btree_keys_account_key_add(nr, 0, *out); ++ *out = bkey_next(*out); ++ } ++} ++ ++/* Sort + repack in a new format: */ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *dst, struct btree *src, ++ struct btree_node_iter *src_iter, ++ struct bkey_format *out_f, ++ bool filter_whiteouts) ++{ ++ struct bkey_format *in_f = &src->format; ++ struct bkey_packed *in, *out = vstruct_last(dst); ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { ++ if (filter_whiteouts && bkey_whiteout(in)) ++ continue; ++ ++ if (bch2_bkey_transform(out_f, out, bkey_packed(in) ++ ? 
in_f : &bch2_bkey_format_current, in)) ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ else ++ bch2_bkey_unpack(src, (void *) out, in); ++ ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ ++struct btree_nr_keys ++bch2_sort_repack_merge(struct bch_fs *c, ++ struct bset *dst, struct btree *src, ++ struct btree_node_iter *iter, ++ struct bkey_format *out_f, ++ bool filter_whiteouts) ++{ ++ struct bkey_packed *out = vstruct_last(dst), *k_packed; ++ struct bkey_on_stack k; ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ bkey_on_stack_init(&k); ++ ++ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { ++ if (filter_whiteouts && bkey_whiteout(k_packed)) ++ continue; ++ ++ /* ++ * NOTE: ++ * bch2_bkey_normalize may modify the key we pass it (dropping ++ * stale pointers) and we don't have a write lock on the src ++ * node; we have to make a copy of the entire key before calling ++ * normalize ++ */ ++ bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); ++ bch2_bkey_unpack(src, k.k, k_packed); ++ ++ if (filter_whiteouts && ++ bch2_bkey_normalize(c, bkey_i_to_s(k.k))) ++ continue; ++ ++ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ bkey_on_stack_exit(&k, c); ++ return nr; ++} ++ ++static inline int sort_keys_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: ++ (int) l->needs_whiteout - (int) r->needs_whiteout; ++} ++ ++unsigned bch2_sort_keys(struct bkey_packed *dst, ++ struct sort_iter *iter, ++ bool filter_whiteouts) ++{ ++ const struct bkey_format *f = &iter->b->format; ++ struct bkey_packed *in, *next, *out = dst; ++ ++ sort_iter_sort(iter, sort_keys_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_keys_cmp))) { ++ bool needs_whiteout = false; ++ ++ if (bkey_whiteout(in) && ++ (filter_whiteouts || !in->needs_whiteout)) ++ continue; ++ ++ while ((next = sort_iter_peek(iter)) && ++ !bkey_cmp_packed(iter->b, in, next)) { ++ BUG_ON(in->needs_whiteout && ++ next->needs_whiteout); ++ needs_whiteout |= in->needs_whiteout; ++ in = sort_iter_next(iter, sort_keys_cmp); ++ } ++ ++ if (bkey_whiteout(in)) { ++ memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); ++ set_bkeyp_val_u64s(f, out, 0); ++ } else { ++ bkey_copy(out, in); ++ } ++ out->needs_whiteout |= needs_whiteout; ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} ++ ++/* Compat code for btree_node_old_extent_overwrite: */ ++ ++/* ++ * If keys compare equal, compare by pointer order: ++ * ++ * Necessary for sort_fix_overlapping() - if there are multiple keys that ++ * compare equal in different sets, we have to process them newest to oldest. 
++ */ ++static inline int extent_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ struct bkey ul = bkey_unpack_key(b, l); ++ struct bkey ur = bkey_unpack_key(b, r); ++ ++ return bkey_cmp(bkey_start_pos(&ul), ++ bkey_start_pos(&ur)) ?: ++ cmp_int((unsigned long) r, (unsigned long) l); ++} ++ ++/* ++ * The algorithm in extent_sort_fix_overlapping() relies on keys in the same ++ * bset being ordered by start offset - but 0 size whiteouts (which are always ++ * KEY_TYPE_deleted) break this ordering, so we need to skip over them: ++ */ ++static void extent_iter_advance(struct sort_iter *iter, unsigned idx) ++{ ++ struct sort_iter_set *i = iter->data + idx; ++ ++ do { ++ i->k = bkey_next_skip_noops(i->k, i->end); ++ } while (i->k != i->end && bkey_deleted(i->k)); ++ ++ if (i->k == i->end) ++ array_remove_item(iter->data, iter->used, idx); ++ else ++ __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); ++} ++ ++struct btree_nr_keys ++bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) ++{ ++ struct btree *b = iter->b; ++ struct bkey_format *f = &b->format; ++ struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; ++ struct bkey_packed *out = dst->start; ++ struct bkey l_unpacked, r_unpacked; ++ struct bkey_s l, r; ++ struct btree_nr_keys nr; ++ struct bkey_on_stack split; ++ unsigned i; ++ ++ memset(&nr, 0, sizeof(nr)); ++ bkey_on_stack_init(&split); ++ ++ sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); ++ for (i = 0; i < iter->used;) { ++ if (bkey_deleted(iter->data[i].k)) ++ __sort_iter_advance(iter, i, ++ extent_sort_fix_overlapping_cmp); ++ else ++ i++; ++ } ++ ++ while (!sort_iter_end(iter)) { ++ l = __bkey_disassemble(b, _l->k, &l_unpacked); ++ ++ if (iter->used == 1) { ++ extent_sort_append(c, f, &nr, &out, l); ++ extent_iter_advance(iter, 0); ++ continue; ++ } ++ ++ r = __bkey_disassemble(b, _r->k, &r_unpacked); ++ ++ /* If current key and next key don't overlap, just append */ ++ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { ++ extent_sort_append(c, f, &nr, &out, l); ++ extent_iter_advance(iter, 0); ++ continue; ++ } ++ ++ /* Skip 0 size keys */ ++ if (!r.k->size) { ++ extent_iter_advance(iter, 1); ++ continue; ++ } ++ ++ /* ++ * overlap: keep the newer key and trim the older key so they ++ * don't overlap. comparing pointers tells us which one is ++ * newer, since the bsets are appended one after the other. 
++ */ ++ ++ /* can't happen because of comparison func */ ++ BUG_ON(_l->k < _r->k && ++ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); ++ ++ if (_l->k > _r->k) { ++ /* l wins, trim r */ ++ if (bkey_cmp(l.k->p, r.k->p) >= 0) { ++ extent_iter_advance(iter, 1); ++ } else { ++ bch2_cut_front_s(l.k->p, r); ++ extent_save(b, _r->k, r.k); ++ __sort_iter_sift(iter, 1, ++ extent_sort_fix_overlapping_cmp); ++ } ++ } else if (bkey_cmp(l.k->p, r.k->p) > 0) { ++ ++ /* ++ * r wins, but it overlaps in the middle of l - split l: ++ */ ++ bkey_on_stack_reassemble(&split, c, l.s_c); ++ bch2_cut_back(bkey_start_pos(r.k), split.k); ++ ++ bch2_cut_front_s(r.k->p, l); ++ extent_save(b, _l->k, l.k); ++ ++ __sort_iter_sift(iter, 0, ++ extent_sort_fix_overlapping_cmp); ++ ++ extent_sort_append(c, f, &nr, &out, ++ bkey_i_to_s(split.k)); ++ } else { ++ bch2_cut_back_s(bkey_start_pos(r.k), l); ++ extent_save(b, _l->k, l.k); ++ } ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ ++ bkey_on_stack_exit(&split, c); ++ return nr; ++} ++ ++static inline int sort_extents_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ (int) bkey_deleted(l) - (int) bkey_deleted(r); ++} ++ ++unsigned bch2_sort_extents(struct bkey_packed *dst, ++ struct sort_iter *iter, ++ bool filter_whiteouts) ++{ ++ struct bkey_packed *in, *out = dst; ++ ++ sort_iter_sort(iter, sort_extents_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_extents_cmp))) { ++ if (bkey_deleted(in)) ++ continue; ++ ++ if (bkey_whiteout(in) && ++ (filter_whiteouts || !in->needs_whiteout)) ++ continue; ++ ++ bkey_copy(out, in); ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} ++ ++static inline int sort_extent_whiteouts_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ struct bkey ul = bkey_unpack_key(b, l); ++ struct bkey ur = bkey_unpack_key(b, r); ++ ++ return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); ++} ++ ++unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, ++ struct sort_iter *iter) ++{ ++ const struct bkey_format *f = &iter->b->format; ++ struct bkey_packed *in, *out = dst; ++ struct bkey_i l, r; ++ bool prev = false, l_packed = false; ++ u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); ++ u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); ++ u64 new_size; ++ ++ max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); ++ ++ sort_iter_sort(iter, sort_extent_whiteouts_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { ++ if (bkey_deleted(in)) ++ continue; ++ ++ EBUG_ON(bkeyp_val_u64s(f, in)); ++ EBUG_ON(in->type != KEY_TYPE_discard); ++ ++ r.k = bkey_unpack_key(iter->b, in); ++ ++ if (prev && ++ bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { ++ if (bkey_cmp(l.k.p, r.k.p) >= 0) ++ continue; ++ ++ new_size = l_packed ++ ? 
min(max_packed_size, max_packed_offset - ++ bkey_start_offset(&l.k)) ++ : KEY_SIZE_MAX; ++ ++ new_size = min(new_size, r.k.p.offset - ++ bkey_start_offset(&l.k)); ++ ++ BUG_ON(new_size < l.k.size); ++ ++ bch2_key_resize(&l.k, new_size); ++ ++ if (bkey_cmp(l.k.p, r.k.p) >= 0) ++ continue; ++ ++ bch2_cut_front(l.k.p, &r); ++ } ++ ++ if (prev) { ++ if (!bch2_bkey_pack(out, &l, f)) { ++ BUG_ON(l_packed); ++ bkey_copy(out, &l); ++ } ++ out = bkey_next(out); ++ } ++ ++ l = r; ++ prev = true; ++ l_packed = bkey_packed(in); ++ } ++ ++ if (prev) { ++ if (!bch2_bkey_pack(out, &l, f)) { ++ BUG_ON(l_packed); ++ bkey_copy(out, &l); ++ } ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} +diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h +new file mode 100644 +index 000000000000..458a051fdac5 +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_SORT_H ++#define _BCACHEFS_BKEY_SORT_H ++ ++struct sort_iter { ++ struct btree *b; ++ unsigned used; ++ unsigned size; ++ ++ struct sort_iter_set { ++ struct bkey_packed *k, *end; ++ } data[MAX_BSETS + 1]; ++}; ++ ++static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) ++{ ++ iter->b = b; ++ iter->used = 0; ++ iter->size = ARRAY_SIZE(iter->data); ++} ++ ++static inline void sort_iter_add(struct sort_iter *iter, ++ struct bkey_packed *k, ++ struct bkey_packed *end) ++{ ++ BUG_ON(iter->used >= iter->size); ++ ++ if (k != end) ++ iter->data[iter->used++] = (struct sort_iter_set) { k, end }; ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, ++ struct sort_iter *); ++struct btree_nr_keys ++bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, ++ struct sort_iter *); ++ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *, struct btree *, ++ struct btree_node_iter *, ++ struct bkey_format *, bool); ++struct btree_nr_keys ++bch2_sort_repack_merge(struct bch_fs *, ++ struct bset *, struct btree *, ++ struct btree_node_iter *, ++ struct bkey_format *, bool); ++ ++unsigned bch2_sort_keys(struct bkey_packed *, ++ struct sort_iter *, bool); ++unsigned bch2_sort_extents(struct bkey_packed *, ++ struct sort_iter *, bool); ++ ++unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, ++ struct sort_iter *); ++ ++#endif /* _BCACHEFS_BKEY_SORT_H */ +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +new file mode 100644 +index 000000000000..f7c2841ed8a7 +--- /dev/null ++++ b/fs/bcachefs/bset.c +@@ -0,0 +1,1742 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for working with individual keys, and sorted sets of keys with in a ++ * btree node ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "bset.h" ++#include "eytzinger.h" ++#include "util.h" ++ ++#include ++#include ++#include ++#include ++ ++/* hack.. 
*/ ++#include "alloc_types.h" ++#include ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, ++ struct btree *); ++ ++static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) ++{ ++ unsigned n = ARRAY_SIZE(iter->data); ++ ++ while (n && __btree_node_iter_set_end(iter, n - 1)) ++ --n; ++ ++ return n; ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) ++{ ++ unsigned offset = __btree_node_key_to_offset(b, k); ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (offset <= t->end_offset) { ++ EBUG_ON(offset < btree_bkey_first_offset(t)); ++ return t; ++ } ++ ++ BUG(); ++} ++ ++/* ++ * There are never duplicate live keys in the btree - but including keys that ++ * have been flagged as deleted (and will be cleaned up later) we _will_ see ++ * duplicates. ++ * ++ * Thus the sort order is: usual key comparison first, but for keys that compare ++ * equal the deleted key(s) come first, and the (at most one) live version comes ++ * last. ++ * ++ * The main reason for this is insertion: to handle overwrites, we first iterate ++ * over keys that compare equal to our insert key, and then insert immediately ++ * prior to the first key greater than the key we're inserting - our insert ++ * position will be after all keys that compare equal to our insert key, which ++ * by the time we actually do the insert will all be deleted. ++ */ ++ ++void bch2_dump_bset(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned set) ++{ ++ struct bkey_packed *_k, *_n; ++ struct bkey uk, n; ++ struct bkey_s_c k; ++ char buf[200]; ++ ++ if (!i->u64s) ++ return; ++ ++ for (_k = i->start; ++ _k < vstruct_last(i); ++ _k = _n) { ++ _n = bkey_next_skip_noops(_k, vstruct_last(i)); ++ ++ k = bkey_disassemble(b, _k, &uk); ++ if (c) ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ else ++ bch2_bkey_to_text(&PBUF(buf), k.k); ++ printk(KERN_ERR "block %u key %5zu: %s\n", set, ++ _k->_data - i->_data, buf); ++ ++ if (_n == vstruct_last(i)) ++ continue; ++ ++ n = bkey_unpack_key(b, _n); ++ ++ if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { ++ printk(KERN_ERR "Key skipped backwards\n"); ++ continue; ++ } ++ ++ if (!bkey_deleted(k.k) && ++ !bkey_cmp(n.p, k.k->p)) ++ printk(KERN_ERR "Duplicate keys\n"); ++ } ++} ++ ++void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ console_lock(); ++ for_each_bset(b, t) ++ bch2_dump_bset(c, b, bset(b, t), t - b->set); ++ console_unlock(); ++} ++ ++void bch2_dump_btree_node_iter(struct btree *b, ++ struct btree_node_iter *iter) ++{ ++ struct btree_node_iter_set *set; ++ ++ printk(KERN_ERR "btree node iter with %u/%u sets:\n", ++ __btree_node_iter_used(iter), b->nsets); ++ ++ btree_node_iter_for_each(iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk = bkey_unpack_key(b, k); ++ char buf[100]; ++ ++ bch2_bkey_to_text(&PBUF(buf), &uk); ++ printk(KERN_ERR "set %zu key %u: %s\n", ++ t - b->set, set->k, buf); ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ struct bset_tree *t; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr = { 0 }; ++ ++ for_each_bset(b, t) ++ bset_tree_for_each_key(b, t, k) ++ if (!bkey_whiteout(k)) ++ btree_keys_account_key_add(&nr, t - b->set, k); ++ ++ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); ++} ++ ++static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, ++ struct btree *b) ++{ ++ 
struct btree_node_iter iter = *_iter; ++ const struct bkey_packed *k, *n; ++ ++ k = bch2_btree_node_iter_peek_all(&iter, b); ++ __bch2_btree_node_iter_advance(&iter, b); ++ n = bch2_btree_node_iter_peek_all(&iter, b); ++ ++ bkey_unpack_key(b, k); ++ ++ if (n && ++ bkey_iter_cmp(b, k, n) > 0) { ++ struct btree_node_iter_set *set; ++ struct bkey ku = bkey_unpack_key(b, k); ++ struct bkey nu = bkey_unpack_key(b, n); ++ char buf1[80], buf2[80]; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&PBUF(buf1), &ku); ++ bch2_bkey_to_text(&PBUF(buf2), &nu); ++ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", ++ buf1, buf2); ++ printk(KERN_ERR "iter was:"); ++ ++ btree_node_iter_for_each(_iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ printk(" [%zi %zi]", t - b->set, ++ k->_data - bset(b, t)->_data); ++ } ++ panic("\n"); ++ } ++} ++ ++void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct btree_node_iter_set *set, *s2; ++ struct bkey_packed *k, *p; ++ struct bset_tree *t; ++ ++ if (bch2_btree_node_iter_end(iter)) ++ return; ++ ++ /* Verify no duplicates: */ ++ btree_node_iter_for_each(iter, set) ++ btree_node_iter_for_each(iter, s2) ++ BUG_ON(set != s2 && set->end == s2->end); ++ ++ /* Verify that set->end is correct: */ ++ btree_node_iter_for_each(iter, set) { ++ for_each_bset(b, t) ++ if (set->end == t->end_offset) ++ goto found; ++ BUG(); ++found: ++ BUG_ON(set->k < btree_bkey_first_offset(t) || ++ set->k >= t->end_offset); ++ } ++ ++ /* Verify iterator is sorted: */ ++ btree_node_iter_for_each(iter, set) ++ BUG_ON(set != iter->data && ++ btree_node_iter_cmp(b, set[-1], set[0]) > 0); ++ ++ k = bch2_btree_node_iter_peek_all(iter, b); ++ ++ for_each_bset(b, t) { ++ if (iter->data[0].end == t->end_offset) ++ continue; ++ ++ p = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ ++ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); ++ } ++} ++ ++void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, ++ struct bkey_packed *insert, unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); ++ struct bkey_packed *next = (void *) (where->_data + clobber_u64s); ++#if 0 ++ BUG_ON(prev && ++ bkey_iter_cmp(b, prev, insert) > 0); ++#else ++ if (prev && ++ bkey_iter_cmp(b, prev, insert) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, prev); ++ struct bkey k2 = bkey_unpack_key(b, insert); ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&PBUF(buf1), &k1); ++ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ ++ panic("prev > insert:\n" ++ "prev key %s\n" ++ "insert key %s\n", ++ buf1, buf2); ++ } ++#endif ++#if 0 ++ BUG_ON(next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0); ++#else ++ if (next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, insert); ++ struct bkey k2 = bkey_unpack_key(b, next); ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&PBUF(buf1), &k1); ++ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ ++ panic("insert > next:\n" ++ "insert key %s\n" ++ "next key %s\n", ++ buf1, buf2); ++ } ++#endif ++} ++ ++#else ++ ++static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, ++ struct btree *b) {} ++ ++#endif ++ ++/* Auxiliary search trees */ ++ ++#define 
BFLOAT_FAILED_UNPACKED U8_MAX ++#define BFLOAT_FAILED U8_MAX ++ ++struct bkey_float { ++ u8 exponent; ++ u8 key_offset; ++ u16 mantissa; ++}; ++#define BKEY_MANTISSA_BITS 16 ++ ++static unsigned bkey_float_byte_offset(unsigned idx) ++{ ++ return idx * sizeof(struct bkey_float); ++} ++ ++struct ro_aux_tree { ++ struct bkey_float f[0]; ++}; ++ ++struct rw_aux_tree { ++ u16 offset; ++ struct bpos k; ++}; ++ ++static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) ++{ ++ BUG_ON(t->aux_data_offset == U16_MAX); ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return t->aux_data_offset; ++ case BSET_RO_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + ++ t->size * sizeof(u8), 8); ++ case BSET_RW_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned bset_aux_tree_buf_start(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return t == b->set ++ ? DIV_ROUND_UP(b->unpack_fn_len, 8) ++ : bset_aux_tree_buf_end(t - 1); ++} ++ ++static void *__aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return b->aux_data + t->aux_data_offset * 8; ++} ++ ++static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++static u8 *ro_aux_tree_prev(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); ++} ++ ++static struct bkey_float *bkey_float(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned idx) ++{ ++ return ro_aux_tree_base(b, t)->f + idx; ++} ++ ++static void bset_aux_tree_verify(struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ if (t->aux_data_offset == U16_MAX) ++ continue; ++ ++ BUG_ON(t != b->set && ++ t[-1].aux_data_offset == U16_MAX); ++ ++ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); ++ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); ++ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); ++ } ++#endif ++} ++ ++void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) ++{ ++ unsigned i; ++ ++ b->nsets = 0; ++ memset(&b->nr, 0, sizeof(b->nr)); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ b->expensive_debug_checks = expensive_debug_checks; ++#endif ++ for (i = 0; i < MAX_BSETS; i++) ++ b->set[i].data_offset = U16_MAX; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++/* Binary tree stuff for auxiliary search trees */ ++ ++/* ++ * Cacheline/offset <-> bkey pointer arithmetic: ++ * ++ * t->tree is a binary search tree in an array; each node corresponds to a key ++ * in one cacheline in t->set (BSET_CACHELINE bytes). ++ * ++ * This means we don't have to store the full index of the key that a node in ++ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and ++ * then bkey_float->m gives us the offset within that cacheline, in units of 8 ++ * bytes. ++ * ++ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to ++ * make this work. ++ * ++ * To construct the bfloat for an arbitrary key we need to know what the key ++ * immediately preceding it is: we have to check if the two keys differ in the ++ * bits we're going to store in bkey_float->mantissa. 
t->prev[j] stores the size ++ * of the previous key so we can walk backwards to it from t->tree[j]'s key. ++ */ ++ ++static inline void *bset_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline) ++{ ++ return (void *) round_down((unsigned long) btree_bkey_first(b, t), ++ L1_CACHE_BYTES) + ++ cacheline * BSET_CACHELINE; ++} ++ ++static struct bkey_packed *cacheline_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ unsigned offset) ++{ ++ return bset_cacheline(b, t, cacheline) + offset * 8; ++} ++ ++static unsigned bkey_to_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ const struct bkey_packed *k) ++{ ++ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; ++} ++ ++static ssize_t __bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); ++} ++ ++static unsigned bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); ++ ++ EBUG_ON(m > U8_MAX); ++ return m; ++} ++ ++static inline struct bkey_packed *tree_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ return cacheline_to_bkey(b, t, ++ __eytzinger1_to_inorder(j, t->size, t->extra), ++ bkey_float(b, t, j)->key_offset); ++} ++ ++static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; ++ ++ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); ++} ++ ++static struct rw_aux_tree *rw_aux_tree(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++/* ++ * For the write set - the one we're currently inserting keys into - we don't ++ * maintain a full search tree, we just keep a simple lookup table in t->prev. 
++ */ ++static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, ++ struct bset_tree *t, ++ unsigned j) ++{ ++ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); ++} ++ ++static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, ++ unsigned j, struct bkey_packed *k) ++{ ++ EBUG_ON(k >= btree_bkey_last(b, t)); ++ ++ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { ++ .offset = __btree_node_key_to_offset(b, k), ++ .k = bkey_unpack_pos(b, k), ++ }; ++} ++ ++static void bch2_bset_verify_rw_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ struct bkey_packed *k = btree_bkey_first(b, t); ++ unsigned j = 0; ++ ++ if (!btree_keys_expensive_checks(b)) ++ return; ++ ++ BUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ BUG_ON(t->size < 1); ++ BUG_ON(rw_aux_to_bkey(b, t, j) != k); ++ ++ goto start; ++ while (1) { ++ if (rw_aux_to_bkey(b, t, j) == k) { ++ BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, ++ bkey_unpack_pos(b, k))); ++start: ++ if (++j == t->size) ++ break; ++ ++ BUG_ON(rw_aux_tree(b, t)[j].offset <= ++ rw_aux_tree(b, t)[j - 1].offset); ++ } ++ ++ k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ BUG_ON(k >= btree_bkey_last(b, t)); ++ } ++} ++ ++/* returns idx of first entry >= offset: */ ++static unsigned rw_aux_tree_bsearch(struct btree *b, ++ struct bset_tree *t, ++ unsigned offset) ++{ ++ unsigned bset_offs = offset - btree_bkey_first_offset(t); ++ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); ++ unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0; ++ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ EBUG_ON(!t->size); ++ EBUG_ON(idx > t->size); ++ ++ while (idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset) ++ idx++; ++ ++ while (idx && ++ rw_aux_tree(b, t)[idx - 1].offset >= offset) ++ idx--; ++ ++ EBUG_ON(idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset); ++ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); ++ EBUG_ON(idx + 1 < t->size && ++ rw_aux_tree(b, t)[idx].offset == ++ rw_aux_tree(b, t)[idx + 1].offset); ++ ++ return idx; ++} ++ ++static inline unsigned bkey_mantissa(const struct bkey_packed *k, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++ u64 v; ++ ++ EBUG_ON(!bkey_packed(k)); ++ ++ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); ++ ++ /* ++ * In little endian, we're shifting off low bits (and then the bits we ++ * want are at the low end), in big endian we're shifting off high bits ++ * (and then the bits we want are at the high end, so we shift them ++ * back down): ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ v >>= f->exponent & 7; ++#else ++ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; ++#endif ++ return (u16) v; ++} ++ ++static void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) ++{ ++ struct bkey_float *f = bkey_float(b, t, j); ++ struct bkey_packed *m = tree_to_bkey(b, t, j); ++ struct bkey_packed *l, *r; ++ unsigned mantissa; ++ int shift, exponent, high_bit; ++ ++ if (is_power_of_2(j)) { ++ l = min_key; ++ ++ if (!l->u64s) { ++ if (!bkey_pack_pos(l, b->data->min_key, b)) { ++ struct bkey_i tmp; ++ ++ bkey_init(&tmp.k); ++ tmp.k.p = b->data->min_key; ++ bkey_copy(l, &tmp); ++ } ++ } ++ } else { ++ l = tree_to_prev_bkey(b, t, j >> ffs(j)); ++ ++ EBUG_ON(m < l); ++ } ++ ++ if (is_power_of_2(j + 1)) { ++ r = max_key; ++ ++ if (!r->u64s) { ++ if (!bkey_pack_pos(r, t->max_key, b)) { ++ struct bkey_i 
tmp; ++ ++ bkey_init(&tmp.k); ++ tmp.k.p = t->max_key; ++ bkey_copy(r, &tmp); ++ } ++ } ++ } else { ++ r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); ++ ++ EBUG_ON(m > r); ++ } ++ ++ /* ++ * for failed bfloats, the lookup code falls back to comparing against ++ * the original key. ++ */ ++ ++ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || ++ !b->nr_key_bits) { ++ f->exponent = BFLOAT_FAILED_UNPACKED; ++ return; ++ } ++ ++ /* ++ * The greatest differing bit of l and r is the first bit we must ++ * include in the bfloat mantissa we're creating in order to do ++ * comparisons - that bit always becomes the high bit of ++ * bfloat->mantissa, and thus the exponent we're calculating here is ++ * the position of what will become the low bit in bfloat->mantissa: ++ * ++ * Note that this may be negative - we may be running off the low end ++ * of the key: we handle this later: ++ */ ++ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), ++ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); ++ exponent = high_bit - (BKEY_MANTISSA_BITS - 1); ++ ++ /* ++ * Then we calculate the actual shift value, from the start of the key ++ * (k->_data), to get the key bits starting at exponent: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; ++ ++ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); ++#else ++ shift = high_bit_offset + ++ b->nr_key_bits - ++ exponent - ++ BKEY_MANTISSA_BITS; ++ ++ EBUG_ON(shift < KEY_PACKED_BITS_START); ++#endif ++ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); ++ ++ f->exponent = shift; ++ mantissa = bkey_mantissa(m, f, j); ++ ++ /* ++ * If we've got garbage bits, set them to all 1s - it's legal for the ++ * bfloat to compare larger than the original key, but not smaller: ++ */ ++ if (exponent < 0) ++ mantissa |= ~(~0U << -exponent); ++ ++ f->mantissa = mantissa; ++} ++ ++/* bytes remaining - only valid for last bset: */ ++static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ bset_aux_tree_verify(b); ++ ++ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); ++} ++ ++static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / ++ (sizeof(struct bkey_float) + sizeof(u8)); ++} ++ ++static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); ++} ++ ++static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *k; ++ ++ t->size = 1; ++ t->extra = BSET_RW_AUX_TREE_VAL; ++ rw_aux_tree(b, t)[0].offset = ++ __btree_node_key_to_offset(b, btree_bkey_first(b, t)); ++ ++ bset_tree_for_each_key(b, t, k) { ++ if (t->size == bset_rw_tree_capacity(b, t)) ++ break; ++ ++ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > ++ L1_CACHE_BYTES) ++ rw_aux_tree_set(b, t, t->size++, k); ++ } ++} ++ ++static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); ++ struct bkey_packed min_key, max_key; ++ unsigned j, cacheline = 1; ++ ++ /* signal to make_bfloat() that they're uninitialized: */ ++ min_key.u64s = max_key.u64s = 0; ++ ++ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), ++ bset_ro_tree_capacity(b, t)); ++retry: ++ if (t->size < 2) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ return; ++ } ++ ++ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; 
++ ++ /* First we figure out where the first key in each cacheline is */ ++ eytzinger1_for_each(j, t->size) { ++ while (bkey_to_cacheline(b, t, k) < cacheline) ++ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ ++ if (k >= btree_bkey_last(b, t)) { ++ /* XXX: this path sucks */ ++ t->size--; ++ goto retry; ++ } ++ ++ ro_aux_tree_prev(b, t)[j] = prev->u64s; ++ bkey_float(b, t, j)->key_offset = ++ bkey_to_cacheline_offset(b, t, cacheline++, k); ++ ++ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); ++ EBUG_ON(tree_to_bkey(b, t, j) != k); ++ } ++ ++ while (k != btree_bkey_last(b, t)) ++ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ ++ t->max_key = bkey_unpack_pos(b, prev); ++ ++ /* Then we build the tree */ ++ eytzinger1_for_each(j, t->size) ++ make_bfloat(b, t, j, &min_key, &max_key); ++} ++ ++static void bset_alloc_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bset_tree *i; ++ ++ for (i = b->set; i != t; i++) ++ BUG_ON(bset_has_rw_aux_tree(i)); ++ ++ bch2_bset_set_no_aux_tree(b, t); ++ ++ /* round up to next cacheline: */ ++ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), ++ SMP_CACHE_BYTES / sizeof(u64)); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, ++ bool writeable) ++{ ++ if (writeable ++ ? bset_has_rw_aux_tree(t) ++ : bset_has_ro_aux_tree(t)) ++ return; ++ ++ bset_alloc_tree(b, t); ++ ++ if (!__bset_tree_capacity(b, t)) ++ return; ++ ++ if (writeable) ++ __build_rw_aux_tree(b, t); ++ else ++ __build_ro_aux_tree(b, t); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_init_first(struct btree *b, struct bset *i) ++{ ++ struct bset_tree *t; ++ ++ BUG_ON(b->nsets); ++ ++ memset(i, 0, sizeof(*i)); ++ get_random_bytes(&i->seq, sizeof(i->seq)); ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++void bch2_bset_init_next(struct bch_fs *c, struct btree *b, ++ struct btree_node_entry *bne) ++{ ++ struct bset *i = &bne->keys; ++ struct bset_tree *t; ++ ++ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); ++ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); ++ BUG_ON(b->nsets >= MAX_BSETS); ++ ++ memset(i, 0, sizeof(*i)); ++ i->seq = btree_bset_first(b)->seq; ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++/* ++ * find _some_ key in the same bset as @k that precedes @k - not necessarily the ++ * immediate predecessor: ++ */ ++static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct bkey_packed *p; ++ unsigned offset; ++ int j; ++ ++ EBUG_ON(k < btree_bkey_first(b, t) || ++ k > btree_bkey_last(b, t)); ++ ++ if (k == btree_bkey_first(b, t)) ++ return NULL; ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ p = btree_bkey_first(b, t); ++ break; ++ case BSET_RO_AUX_TREE: ++ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); ++ ++ do { ++ p = j ? tree_to_bkey(b, t, ++ __inorder_to_eytzinger1(j--, ++ t->size, t->extra)) ++ : btree_bkey_first(b, t); ++ } while (p >= k); ++ break; ++ case BSET_RW_AUX_TREE: ++ offset = __btree_node_key_to_offset(b, k); ++ j = rw_aux_tree_bsearch(b, t, offset); ++ p = j ? 
rw_aux_to_bkey(b, t, j - 1) ++ : btree_bkey_first(b, t); ++ break; ++ } ++ ++ return p; ++} ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k, ++ unsigned min_key_type) ++{ ++ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; ++ ++ while ((p = __bkey_prev(b, t, k)) && !ret) { ++ for (i = p; i != k; i = bkey_next_skip_noops(i, k)) ++ if (i->type >= min_key_type) ++ ret = i; ++ ++ k = p; ++ } ++ ++ if (btree_keys_expensive_checks(b)) { ++ BUG_ON(ret >= orig_k); ++ ++ for (i = ret ++ ? bkey_next_skip_noops(ret, orig_k) ++ : btree_bkey_first(b, t); ++ i != orig_k; ++ i = bkey_next_skip_noops(i, orig_k)) ++ BUG_ON(i->type >= min_key_type); ++ } ++ ++ return ret; ++} ++ ++/* Insert */ ++ ++static void rw_aux_tree_fix_invalidated_key(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ unsigned offset = __btree_node_key_to_offset(b, k); ++ unsigned j = rw_aux_tree_bsearch(b, t, offset); ++ ++ if (j < t->size && ++ rw_aux_tree(b, t)[j].offset == offset) ++ rw_aux_tree_set(b, t, j, k); ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++} ++ ++static void ro_aux_tree_fix_invalidated_key(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct bkey_packed min_key, max_key; ++ unsigned inorder, j; ++ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ /* signal to make_bfloat() that they're uninitialized: */ ++ min_key.u64s = max_key.u64s = 0; ++ ++ if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { ++ t->max_key = bkey_unpack_pos(b, k); ++ ++ for (j = 1; j < t->size; j = j * 2 + 1) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ ++ inorder = bkey_to_cacheline(b, t, k); ++ ++ if (inorder && ++ inorder < t->size) { ++ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ ++ if (k == tree_to_bkey(b, t, j)) { ++ /* Fix the node this key corresponds to */ ++ make_bfloat(b, t, j, &min_key, &max_key); ++ ++ /* Children for which this key is the right boundary */ ++ for (j = eytzinger1_left_child(j); ++ j < t->size; ++ j = eytzinger1_right_child(j)) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ } ++ ++ if (inorder + 1 < t->size) { ++ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); ++ ++ if (k == tree_to_prev_bkey(b, t, j)) { ++ make_bfloat(b, t, j, &min_key, &max_key); ++ ++ /* Children for which this key is the left boundary */ ++ for (j = eytzinger1_right_child(j); ++ j < t->size; ++ j = eytzinger1_left_child(j)) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ } ++} ++ ++/** ++ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been ++ * modified, fix any auxiliary search tree by remaking all the nodes in the ++ * auxiliary search tree that @k corresponds to ++ */ ++void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ break; ++ case BSET_RO_AUX_TREE: ++ ro_aux_tree_fix_invalidated_key(b, t, k); ++ break; ++ case BSET_RW_AUX_TREE: ++ rw_aux_tree_fix_invalidated_key(b, t, k); ++ break; ++ } ++} ++ ++static void bch2_bset_fix_lookup_table(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *_where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ int shift = new_u64s - clobber_u64s; ++ unsigned l, j, where = __btree_node_key_to_offset(b, _where); ++ ++ EBUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ /* returns first 
entry >= where */ ++ l = rw_aux_tree_bsearch(b, t, where); ++ ++ if (!l) /* never delete first entry */ ++ l++; ++ else if (l < t->size && ++ where < t->end_offset && ++ rw_aux_tree(b, t)[l].offset == where) ++ rw_aux_tree_set(b, t, l++, _where); ++ ++ /* l now > where */ ++ ++ for (j = l; ++ j < t->size && ++ rw_aux_tree(b, t)[j].offset < where + clobber_u64s; ++ j++) ++ ; ++ ++ if (j < t->size && ++ rw_aux_tree(b, t)[j].offset + shift == ++ rw_aux_tree(b, t)[l - 1].offset) ++ j++; ++ ++ memmove(&rw_aux_tree(b, t)[l], ++ &rw_aux_tree(b, t)[j], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[j]); ++ t->size -= j - l; ++ ++ for (j = l; j < t->size; j++) ++ rw_aux_tree(b, t)[j].offset += shift; ++ ++ EBUG_ON(l < t->size && ++ rw_aux_tree(b, t)[l].offset == ++ rw_aux_tree(b, t)[l - 1].offset); ++ ++ if (t->size < bset_rw_tree_capacity(b, t) && ++ (l < t->size ++ ? rw_aux_tree(b, t)[l].offset ++ : t->end_offset) - ++ rw_aux_tree(b, t)[l - 1].offset > ++ L1_CACHE_BYTES / sizeof(u64)) { ++ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); ++ struct bkey_packed *end = l < t->size ++ ? rw_aux_to_bkey(b, t, l) ++ : btree_bkey_last(b, t); ++ struct bkey_packed *k = start; ++ ++ while (1) { ++ k = bkey_next_skip_noops(k, end); ++ if (k == end) ++ break; ++ ++ if ((void *) k - (void *) start >= L1_CACHE_BYTES) { ++ memmove(&rw_aux_tree(b, t)[l + 1], ++ &rw_aux_tree(b, t)[l], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[l]); ++ t->size++; ++ rw_aux_tree_set(b, t, l, k); ++ break; ++ } ++ } ++ } ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_insert(struct btree *b, ++ struct btree_node_iter *iter, ++ struct bkey_packed *where, ++ struct bkey_i *insert, ++ unsigned clobber_u64s) ++{ ++ struct bkey_format *f = &b->format; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bkey_packed packed, *src = bkey_to_packed(insert); ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); ++ ++ if (bch2_bkey_pack_key(&packed, &insert->k, f)) ++ src = &packed; ++ ++ if (!bkey_whiteout(&insert->k)) ++ btree_keys_account_key_add(&b->nr, t - b->set, src); ++ ++ if (src->u64s != clobber_u64s) { ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data + src->u64s; ++ ++ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < ++ (int) clobber_u64s - src->u64s); ++ ++ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); ++ set_btree_bset_end(b, t); ++ } ++ ++ memcpy_u64s(where, src, ++ bkeyp_key_u64s(f, src)); ++ memcpy_u64s(bkeyp_val(f, where), &insert->v, ++ bkeyp_val_u64s(f, src)); ++ ++ if (src->u64s != clobber_u64s) ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_bset_delete(struct btree *b, ++ struct bkey_packed *where, ++ unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data; ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ ++ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); ++ ++ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); ++ set_btree_bset_end(b, t); ++ ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); ++} ++ ++/* Lookup */ ++ ++__flatten ++static struct bkey_packed *bset_search_write_set(const struct btree *b, ++ 
struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *packed_search) ++{ ++ unsigned l = 0, r = t->size; ++ ++ while (l + 1 != r) { ++ unsigned m = (l + r) >> 1; ++ ++ if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) ++ l = m; ++ else ++ r = m; ++ } ++ ++ return rw_aux_to_bkey(b, t, l); ++} ++ ++static inline void prefetch_four_cachelines(void *p) ++{ ++#ifdef CONFIG_X86_64 ++ asm(".intel_syntax noprefix;" ++ "prefetcht0 [%0 - 127 + 64 * 0];" ++ "prefetcht0 [%0 - 127 + 64 * 1];" ++ "prefetcht0 [%0 - 127 + 64 * 2];" ++ "prefetcht0 [%0 - 127 + 64 * 3];" ++ ".att_syntax prefix;" ++ : ++ : "r" (p + 127)); ++#else ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ prefetch(p + L1_CACHE_BYTES * 3); ++#endif ++} ++ ++static inline bool bkey_mantissa_bits_dropped(const struct btree *b, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; ++ ++ return f->exponent > key_bits_start; ++#else ++ unsigned key_bits_end = high_bit_offset + b->nr_key_bits; ++ ++ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; ++#endif ++} ++ ++__flatten ++static struct bkey_packed *bset_search_tree(const struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *packed_search) ++{ ++ struct ro_aux_tree *base = ro_aux_tree_base(b, t); ++ struct bkey_float *f; ++ struct bkey_packed *k; ++ unsigned inorder, n = 1, l, r; ++ int cmp; ++ ++ do { ++ if (likely(n << 4 < t->size)) ++ prefetch(&base->f[n << 4]); ++ ++ f = &base->f[n]; ++ ++ if (!unlikely(packed_search)) ++ goto slowpath; ++ if (unlikely(f->exponent >= BFLOAT_FAILED)) ++ goto slowpath; ++ ++ l = f->mantissa; ++ r = bkey_mantissa(packed_search, f, n); ++ ++ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) ++ goto slowpath; ++ ++ n = n * 2 + (l < r); ++ continue; ++slowpath: ++ k = tree_to_bkey(b, t, n); ++ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); ++ if (!cmp) ++ return k; ++ ++ n = n * 2 + (cmp < 0); ++ } while (n < t->size); ++ ++ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); ++ ++ /* ++ * n would have been the node we recursed to - the low bit tells us if ++ * we recursed left or recursed right. ++ */ ++ if (likely(!(n & 1))) { ++ --inorder; ++ if (unlikely(!inorder)) ++ return btree_bkey_first(b, t); ++ ++ f = &base->f[eytzinger1_prev(n >> 1, t->size)]; ++ } ++ ++ return cacheline_to_bkey(b, t, inorder, f->key_offset); ++} ++ ++static __always_inline __flatten ++struct bkey_packed *__bch2_bset_search(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *lossy_packed_search) ++{ ++ ++ /* ++ * First, we search for a cacheline, then lastly we do a linear search ++ * within that cacheline. ++ * ++ * To search for the cacheline, there's three different possibilities: ++ * * The set is too small to have a search tree, so we just do a linear ++ * search over the whole set. ++ * * The set is the one we're currently inserting into; keeping a full ++ * auxiliary search tree up to date would be too expensive, so we ++ * use a much simpler lookup table to do a binary search - ++ * bset_search_write_set(). 
++ * * Or we use the auxiliary search tree we constructed earlier - ++ * bset_search_tree() ++ */ ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return btree_bkey_first(b, t); ++ case BSET_RW_AUX_TREE: ++ return bset_search_write_set(b, t, search, lossy_packed_search); ++ case BSET_RO_AUX_TREE: ++ /* ++ * Each node in the auxiliary search tree covers a certain range ++ * of bits, and keys above and below the set it covers might ++ * differ outside those bits - so we have to special case the ++ * start and end - handle that here: ++ */ ++ ++ if (bkey_cmp(*search, t->max_key) > 0) ++ return btree_bkey_last(b, t); ++ ++ return bset_search_tree(b, t, search, lossy_packed_search); ++ default: ++ unreachable(); ++ } ++} ++ ++static __always_inline __flatten ++struct bkey_packed *bch2_bset_search_linear(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search, ++ struct bkey_packed *m) ++{ ++ if (lossy_packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_cmp_p_or_unp(b, m, ++ lossy_packed_search, search) < 0) ++ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); ++ ++ if (!packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_pos_cmp(b, m, search) < 0) ++ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); ++ ++ if (btree_keys_expensive_checks(b)) { ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); ++ ++ BUG_ON(prev && ++ bkey_iter_cmp_p_or_unp(b, prev, ++ packed_search, search) >= 0); ++ } ++ ++ return m; ++} ++ ++/* ++ * Returns the first key greater than or equal to @search ++ */ ++static __always_inline __flatten ++struct bkey_packed *bch2_bset_search(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search) ++{ ++ struct bkey_packed *m = __bch2_bset_search(b, t, search, ++ lossy_packed_search); ++ ++ return bch2_bset_search_linear(b, t, search, ++ packed_search, lossy_packed_search, m); ++} ++ ++/* Btree node iterator */ ++ ++static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ if (k != end) { ++ struct btree_node_iter_set *pos; ++ ++ btree_node_iter_for_each(iter, pos) ++ ; ++ ++ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); ++ *pos = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++} ++ ++void bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ __bch2_btree_node_iter_push(iter, b, k, end); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++noinline __flatten __attribute__((cold)) ++static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bset_tree *t; ++ ++ trace_bkey_pack_pos_fail(search); ++ ++ for_each_bset(b, t) ++ __bch2_btree_node_iter_push(iter, b, ++ bch2_bset_search(b, t, search, NULL, NULL), ++ btree_bkey_last(b, t)); ++ ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++/** ++ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a ++ * given position ++ * ++ * Main entry point to the lookup code for individual btree nodes: ++ * ++ * NOTE: ++ * ++ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate ++ * keys. 
This doesn't matter for most code, but it does matter for lookups. ++ * ++ * Some adjacent keys with a string of equal keys: ++ * i j k k k k l m ++ * ++ * If you search for k, the lookup code isn't guaranteed to return you any ++ * specific k. The lookup code is conceptually doing a binary search and ++ * iterating backwards is very expensive so if the pivot happens to land at the ++ * last k that's what you'll get. ++ * ++ * This works out ok, but it's something to be aware of: ++ * ++ * - For non extents, we guarantee that the live key comes last - see ++ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't ++ * see will only be deleted keys you don't care about. ++ * ++ * - For extents, deleted keys sort last (see the comment at the top of this ++ * file). But when you're searching for extents, you actually want the first ++ * key strictly greater than your search key - an extent that compares equal ++ * to the search key is going to have 0 sectors after the search key. ++ * ++ * But this does mean that we can't just search for ++ * bkey_successor(start_of_range) to get the first extent that overlaps with ++ * the range we want - if we're unlucky and there's an extent that ends ++ * exactly where we searched, then there could be a deleted key at the same ++ * position and we'd get that when we search instead of the preceding extent ++ * we needed. ++ * ++ * So we've got to search for start_of_range, then after the lookup iterate ++ * past any extents that compare equal to the position we searched for. ++ */ ++__flatten ++void bch2_btree_node_iter_init(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bkey_packed p, *packed_search = NULL; ++ struct btree_node_iter_set *pos = iter->data; ++ struct bkey_packed *k[MAX_BSETS]; ++ unsigned i; ++ ++ EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); ++ bset_aux_tree_verify(b); ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { ++ case BKEY_PACK_POS_EXACT: ++ packed_search = &p; ++ break; ++ case BKEY_PACK_POS_SMALLER: ++ packed_search = NULL; ++ break; ++ case BKEY_PACK_POS_FAIL: ++ btree_node_iter_init_pack_failed(iter, b, search); ++ return; ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ k[i] = __bch2_bset_search(b, b->set + i, search, &p); ++ prefetch_four_cachelines(k[i]); ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ struct bset_tree *t = b->set + i; ++ struct bkey_packed *end = btree_bkey_last(b, t); ++ ++ k[i] = bch2_bset_search_linear(b, t, search, ++ packed_search, &p, k[i]); ++ if (k[i] != end) ++ *pos++ = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k[i]), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++ ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ for_each_bset(b, t) ++ __bch2_btree_node_iter_push(iter, b, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) ++ return __btree_node_offset_to_key(b, set->k); ++ ++ return btree_bkey_last(b, t); ++} ++ ++static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned first) 
++{ ++ bool ret; ++ ++ if ((ret = (btree_node_iter_cmp(b, ++ iter->data[first], ++ iter->data[first + 1]) > 0))) ++ swap(iter->data[first], iter->data[first + 1]); ++ return ret; ++} ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ /* unrolled bubble sort: */ ++ ++ if (!__btree_node_iter_set_end(iter, 2)) { ++ btree_node_iter_sort_two(iter, b, 0); ++ btree_node_iter_sort_two(iter, b, 1); ++ } ++ ++ if (!__btree_node_iter_set_end(iter, 1)) ++ btree_node_iter_sort_two(iter, b, 0); ++} ++ ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, ++ struct btree_node_iter_set *set) ++{ ++ struct btree_node_iter_set *last = ++ iter->data + ARRAY_SIZE(iter->data) - 1; ++ ++ memmove(&set[0], &set[1], (void *) last - (void *) set); ++ *last = (struct btree_node_iter_set) { 0, 0 }; ++} ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; ++ ++ EBUG_ON(iter->data->k > iter->data->end); ++ ++ while (!__btree_node_iter_set_end(iter, 0) && ++ !__bch2_btree_node_iter_peek_all(iter, b)->u64s) ++ iter->data->k++; ++ ++ if (unlikely(__btree_node_iter_set_end(iter, 0))) { ++ bch2_btree_node_iter_set_drop(iter, iter->data); ++ return; ++ } ++ ++ if (__btree_node_iter_set_end(iter, 1)) ++ return; ++ ++ if (!btree_node_iter_sort_two(iter, b, 0)) ++ return; ++ ++ if (__btree_node_iter_set_end(iter, 2)) ++ return; ++ ++ btree_node_iter_sort_two(iter, b, 1); ++} ++ ++void bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ if (btree_keys_expensive_checks(b)) { ++ bch2_btree_node_iter_verify(iter, b); ++ bch2_btree_node_iter_next_check(iter, b); ++ } ++ ++ __bch2_btree_node_iter_advance(iter, b); ++} ++ ++/* ++ * Expensive: ++ */ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bkey_packed *k, *prev = NULL; ++ struct btree_node_iter_set *set; ++ struct bset_tree *t; ++ unsigned end = 0; ++ ++ if (btree_keys_expensive_checks(b)) ++ bch2_btree_node_iter_verify(iter, b); ++ ++ for_each_bset(b, t) { ++ k = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ if (k && ++ (!prev || bkey_iter_cmp(b, k, prev) > 0)) { ++ prev = k; ++ end = t->end_offset; ++ } ++ } ++ ++ if (!prev) ++ return NULL; ++ ++ /* ++ * We're manually memmoving instead of just calling sort() to ensure the ++ * prev we picked ends up in slot 0 - sort won't necessarily put it ++ * there because of duplicate deleted keys: ++ */ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == end) ++ goto found; ++ ++ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); ++found: ++ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); ++ ++ memmove(&iter->data[1], ++ &iter->data[0], ++ (void *) set - (void *) &iter->data[0]); ++ ++ iter->data[0].k = __btree_node_key_to_offset(b, prev); ++ iter->data[0].end = end; ++ ++ if (btree_keys_expensive_checks(b)) ++ bch2_btree_node_iter_verify(iter, b); ++ return prev; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned min_key_type) ++{ ++ struct bkey_packed *prev; ++ ++ do { ++ prev = bch2_btree_node_iter_prev_all(iter, b); ++ } while (prev && prev->type < min_key_type); ++ ++ return prev; ++} ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bkey *u) ++{ ++ struct bkey_packed *k = 
bch2_btree_node_iter_peek(iter, b); ++ ++ return k ? bkey_disassemble(b, k, u) : bkey_s_c_null; ++} ++ ++/* Mergesort */ ++ ++void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ enum bset_aux_tree_type type = bset_aux_tree_type(t); ++ size_t j; ++ ++ stats->sets[type].nr++; ++ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * ++ sizeof(u64); ++ ++ if (bset_has_ro_aux_tree(t)) { ++ stats->floats += t->size - 1; ++ ++ for (j = 1; j < t->size; j++) ++ stats->failed += ++ bkey_float(b, t, j)->exponent == ++ BFLOAT_FAILED; ++ } ++ } ++} ++ ++void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, ++ struct bkey_packed *k) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk; ++ unsigned j, inorder; ++ ++ if (out->pos != out->end) ++ *out->pos = '\0'; ++ ++ if (!bset_has_ro_aux_tree(t)) ++ return; ++ ++ inorder = bkey_to_cacheline(b, t, k); ++ if (!inorder || inorder >= t->size) ++ return; ++ ++ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ if (k != tree_to_bkey(b, t, j)) ++ return; ++ ++ switch (bkey_float(b, t, j)->exponent) { ++ case BFLOAT_FAILED: ++ uk = bkey_unpack_key(b, k); ++ pr_buf(out, ++ " failed unpacked at depth %u\n" ++ "\t%llu:%llu\n", ++ ilog2(j), ++ uk.p.inode, uk.p.offset); ++ break; ++ } ++} +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +new file mode 100644 +index 000000000000..5921cf689105 +--- /dev/null ++++ b/fs/bcachefs/bset.h +@@ -0,0 +1,661 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BSET_H ++#define _BCACHEFS_BSET_H ++ ++#include ++#include ++ ++#include "bcachefs_format.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "util.h" /* for time_stats */ ++#include "vstructs.h" ++ ++/* ++ * BKEYS: ++ * ++ * A bkey contains a key, a size field, a variable number of pointers, and some ++ * ancillary flag bits. ++ * ++ * We use two different functions for validating bkeys, bkey_invalid and ++ * bkey_deleted(). ++ * ++ * The one exception to the rule that ptr_invalid() filters out invalid keys is ++ * that it also filters out keys of size 0 - these are keys that have been ++ * completely overwritten. It'd be safe to delete these in memory while leaving ++ * them on disk, just unnecessary work - so we filter them out when resorting ++ * instead. ++ * ++ * We can't filter out stale keys when we're resorting, because garbage ++ * collection needs to find them to ensure bucket gens don't wrap around - ++ * unless we're rewriting the btree node those stale keys still exist on disk. ++ * ++ * We also implement functions here for removing some number of sectors from the ++ * front or the back of a bkey - this is mainly used for fixing overlapping ++ * extents, by removing the overlapping sectors from the older key. ++ * ++ * BSETS: ++ * ++ * A bset is an array of bkeys laid out contiguously in memory in sorted order, ++ * along with a header. A btree node is made up of a number of these, written at ++ * different times. ++ * ++ * There could be many of them on disk, but we never allow there to be more than ++ * 4 in memory - we lazily resort as needed. ++ * ++ * We implement code here for creating and maintaining auxiliary search trees ++ * (described below) for searching an individial bset, and on top of that we ++ * implement a btree iterator. 
++ * ++ * BTREE ITERATOR: ++ * ++ * Most of the code in bcache doesn't care about an individual bset - it needs ++ * to search entire btree nodes and iterate over them in sorted order. ++ * ++ * The btree iterator code serves both functions; it iterates through the keys ++ * in a btree node in sorted order, starting from either keys after a specific ++ * point (if you pass it a search key) or the start of the btree node. ++ * ++ * AUXILIARY SEARCH TREES: ++ * ++ * Since keys are variable length, we can't use a binary search on a bset - we ++ * wouldn't be able to find the start of the next key. But binary searches are ++ * slow anyways, due to terrible cache behaviour; bcache originally used binary ++ * searches and that code topped out at under 50k lookups/second. ++ * ++ * So we need to construct some sort of lookup table. Since we only insert keys ++ * into the last (unwritten) set, most of the keys within a given btree node are ++ * usually in sets that are mostly constant. We use two different types of ++ * lookup tables to take advantage of this. ++ * ++ * Both lookup tables share in common that they don't index every key in the ++ * set; they index one key every BSET_CACHELINE bytes, and then a linear search ++ * is used for the rest. ++ * ++ * For sets that have been written to disk and are no longer being inserted ++ * into, we construct a binary search tree in an array - traversing a binary ++ * search tree in an array gives excellent locality of reference and is very ++ * fast, since both children of any node are adjacent to each other in memory ++ * (and their grandchildren, and great grandchildren...) - this means ++ * prefetching can be used to great effect. ++ * ++ * It's quite useful performance wise to keep these nodes small - not just ++ * because they're more likely to be in L2, but also because we can prefetch ++ * more nodes on a single cacheline and thus prefetch more iterations in advance ++ * when traversing this tree. ++ * ++ * Nodes in the auxiliary search tree must contain both a key to compare against ++ * (we don't want to fetch the key from the set, that would defeat the purpose), ++ * and a pointer to the key. We use a few tricks to compress both of these. ++ * ++ * To compress the pointer, we take advantage of the fact that one node in the ++ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have ++ * a function (to_inorder()) that takes the index of a node in a binary tree and ++ * returns what its index would be in an inorder traversal, so we only have to ++ * store the low bits of the offset. ++ * ++ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To ++ * compress that, we take advantage of the fact that when we're traversing the ++ * search tree at every iteration we know that both our search key and the key ++ * we're looking for lie within some range - bounded by our previous ++ * comparisons. (We special case the start of a search so that this is true even ++ * at the root of the tree). ++ * ++ * So we know the key we're looking for is between a and b, and a and b don't ++ * differ higher than bit 50, we don't need to check anything higher than bit ++ * 50. ++ * ++ * We don't usually need the rest of the bits, either; we only need enough bits ++ * to partition the key range we're currently checking. Consider key n - the ++ * key our auxiliary search tree node corresponds to, and key p, the key ++ * immediately preceding n. 
The lowest bit we need to store in the auxiliary ++ * search tree is the highest bit that differs between n and p. ++ * ++ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the ++ * comparison. But we'd really like our nodes in the auxiliary search tree to be ++ * of fixed size. ++ * ++ * The solution is to make them fixed size, and when we're constructing a node ++ * check if p and n differed in the bits we needed them to. If they don't we ++ * flag that node, and when doing lookups we fallback to comparing against the ++ * real key. As long as this doesn't happen to often (and it seems to reliably ++ * happen a bit less than 1% of the time), we win - even on failures, that key ++ * is then more likely to be in cache than if we were doing binary searches all ++ * the way, since we're touching so much less memory. ++ * ++ * The keys in the auxiliary search tree are stored in (software) floating ++ * point, with an exponent and a mantissa. The exponent needs to be big enough ++ * to address all the bits in the original key, but the number of bits in the ++ * mantissa is somewhat arbitrary; more bits just gets us fewer failures. ++ * ++ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys ++ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. ++ * We need one node per 128 bytes in the btree node, which means the auxiliary ++ * search trees take up 3% as much memory as the btree itself. ++ * ++ * Constructing these auxiliary search trees is moderately expensive, and we ++ * don't want to be constantly rebuilding the search tree for the last set ++ * whenever we insert another key into it. For the unwritten set, we use a much ++ * simpler lookup table - it's just a flat array, so index i in the lookup table ++ * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing ++ * within each byte range works the same as with the auxiliary search trees. ++ * ++ * These are much easier to keep up to date when we insert a key - we do it ++ * somewhat lazily; when we shift a key up we usually just increment the pointer ++ * to it, only when it would overflow do we go to the trouble of finding the ++ * first key in that range of bytes again. ++ */ ++ ++extern bool bch2_expensive_debug_checks; ++ ++static inline bool btree_keys_expensive_checks(const struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ return bch2_expensive_debug_checks || *b->expensive_debug_checks; ++#else ++ return false; ++#endif ++} ++ ++enum bset_aux_tree_type { ++ BSET_NO_AUX_TREE, ++ BSET_RO_AUX_TREE, ++ BSET_RW_AUX_TREE, ++}; ++ ++#define BSET_TREE_NR_TYPES 3 ++ ++#define BSET_NO_AUX_TREE_VAL (U16_MAX) ++#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) ++ ++static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) ++{ ++ switch (t->extra) { ++ case BSET_NO_AUX_TREE_VAL: ++ EBUG_ON(t->size); ++ return BSET_NO_AUX_TREE; ++ case BSET_RW_AUX_TREE_VAL: ++ EBUG_ON(!t->size); ++ return BSET_RW_AUX_TREE; ++ default: ++ EBUG_ON(!t->size); ++ return BSET_RO_AUX_TREE; ++ } ++} ++ ++/* ++ * BSET_CACHELINE was originally intended to match the hardware cacheline size - ++ * it used to be 64, but I realized the lookup code would touch slightly less ++ * memory if it was 128. ++ * ++ * It definites the number of bytes (in struct bset) per struct bkey_float in ++ * the auxiliar search tree - when we're done searching the bset_float tree we ++ * have this many bytes left that we do a linear search over. 
++ * ++ * Since (after level 5) every level of the bset_tree is on a new cacheline, ++ * we're touching one fewer cacheline in the bset tree in exchange for one more ++ * cacheline in the linear search - but the linear search might stop before it ++ * gets to the second cacheline. ++ */ ++ ++#define BSET_CACHELINE 128 ++ ++static inline size_t btree_keys_cachelines(struct btree *b) ++{ ++ return (1U << b->byte_order) / BSET_CACHELINE; ++} ++ ++static inline size_t btree_aux_data_bytes(struct btree *b) ++{ ++ return btree_keys_cachelines(b) * 8; ++} ++ ++static inline size_t btree_aux_data_u64s(struct btree *b) ++{ ++ return btree_aux_data_bytes(b) / sizeof(u64); ++} ++ ++typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); ++ ++static inline void ++__bkey_unpack_key_format_checked(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ { ++ compiled_unpack_fn unpack_fn = b->aux_data; ++ unpack_fn(dst, src); ++ ++ if (btree_keys_expensive_checks(b)) { ++ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); ++ ++ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); ++ } ++ } ++#else ++ *dst = __bch2_bkey_unpack_key(&b->format, src); ++#endif ++} ++ ++static inline struct bkey ++bkey_unpack_key_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ struct bkey dst; ++ ++ __bkey_unpack_key_format_checked(b, &dst, src); ++ return dst; ++} ++ ++static inline void __bkey_unpack_key(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++ if (likely(bkey_packed(src))) ++ __bkey_unpack_key_format_checked(b, dst, src); ++ else ++ *dst = *packed_to_bkey_c(src); ++} ++ ++/** ++ * bkey_unpack_key -- unpack just the key, not the value ++ */ ++static inline struct bkey bkey_unpack_key(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? bkey_unpack_key_format_checked(b, src) ++ : *packed_to_bkey_c(src); ++} ++ ++static inline struct bpos ++bkey_unpack_pos_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ return bkey_unpack_key_format_checked(b, src).p; ++#else ++ return __bkey_unpack_pos(&b->format, src); ++#endif ++} ++ ++static inline struct bpos bkey_unpack_pos(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? 
bkey_unpack_pos_format_checked(b, src) ++ : packed_to_bkey_c(src)->p; ++} ++ ++/* Disassembled bkeys */ ++ ++static inline struct bkey_s_c bkey_disassemble(struct btree *b, ++ const struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; ++} ++ ++/* non const version: */ ++static inline struct bkey_s __bkey_disassemble(struct btree *b, ++ struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; ++} ++ ++#define for_each_bset(_b, _t) \ ++ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) ++ ++#define bset_tree_for_each_key(_b, _t, _k) \ ++ for (_k = btree_bkey_first(_b, _t); \ ++ _k != btree_bkey_last(_b, _t); \ ++ _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) ++ ++static inline bool bset_has_ro_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; ++} ++ ++static inline bool bset_has_rw_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; ++} ++ ++static inline void bch2_bset_set_no_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ BUG_ON(t < b->set); ++ ++ for (; t < b->set + ARRAY_SIZE(b->set); t++) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ t->aux_data_offset = U16_MAX; ++ } ++} ++ ++static inline void btree_node_set_format(struct btree *b, ++ struct bkey_format f) ++{ ++ int len; ++ ++ b->format = f; ++ b->nr_key_bits = bkey_format_key_bits(&f); ++ ++ len = bch2_compile_bkey_format(&b->format, b->aux_data); ++ BUG_ON(len < 0 || len > U8_MAX); ++ ++ b->unpack_fn_len = len; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++static inline struct bset *bset_next_set(struct btree *b, ++ unsigned block_bytes) ++{ ++ struct bset *i = btree_bset_last(b); ++ ++ EBUG_ON(!is_power_of_2(block_bytes)); ++ ++ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); ++} ++ ++void bch2_btree_keys_init(struct btree *, bool *); ++ ++void bch2_bset_init_first(struct btree *, struct bset *); ++void bch2_bset_init_next(struct bch_fs *, struct btree *, ++ struct btree_node_entry *); ++void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); ++void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); ++ ++void bch2_bset_insert(struct btree *, struct btree_node_iter *, ++ struct bkey_packed *, struct bkey_i *, unsigned); ++void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); ++ ++/* Bkey utility code */ ++ ++/* packed or unpacked */ ++static inline int bkey_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) ++{ ++ EBUG_ON(r_packed && !bkey_packed(r_packed)); ++ ++ if (unlikely(!bkey_packed(l))) ++ return bkey_cmp(packed_to_bkey_c(l)->p, *r); ++ ++ if (likely(r_packed)) ++ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); ++ ++ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, ++ struct bkey_packed *, unsigned); ++ ++static inline struct bkey_packed * ++bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ return bch2_bkey_prev_filter(b, t, k, 0); ++} ++ ++static inline struct bkey_packed * ++bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ 
return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); ++} ++ ++enum bch_extent_overlap { ++ BCH_EXTENT_OVERLAP_ALL = 0, ++ BCH_EXTENT_OVERLAP_BACK = 1, ++ BCH_EXTENT_OVERLAP_FRONT = 2, ++ BCH_EXTENT_OVERLAP_MIDDLE = 3, ++}; ++ ++/* Returns how k overlaps with m */ ++static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, ++ const struct bkey *m) ++{ ++ int cmp1 = bkey_cmp(k->p, m->p) < 0; ++ int cmp2 = bkey_cmp(bkey_start_pos(k), ++ bkey_start_pos(m)) > 0; ++ ++ return (cmp1 << 1) + cmp2; ++} ++ ++/* Btree key iteration */ ++ ++void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, ++ struct bpos *); ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, ++ struct btree *, ++ struct bset_tree *); ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *, ++ struct btree_node_iter_set *); ++void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); ++ ++#define btree_node_iter_for_each(_iter, _set) \ ++ for (_set = (_iter)->data; \ ++ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ ++ (_set)->k != (_set)->end; \ ++ _set++) ++ ++static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, ++ unsigned i) ++{ ++ return iter->data[i].k == iter->data[i].end; ++} ++ ++static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) ++{ ++ return __btree_node_iter_set_end(iter, 0); ++} ++ ++/* ++ * When keys compare equal, deleted keys compare first: ++ * ++ * XXX: only need to compare pointers for keys that are both within a ++ * btree_node_iterator - we need to break ties for prev() to work correctly ++ */ ++static inline int bkey_iter_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ++ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ++ ?: cmp_int(l, r); ++} ++ ++static inline int btree_node_iter_cmp(const struct btree *b, ++ struct btree_node_iter_set l, ++ struct btree_node_iter_set r) ++{ ++ return bkey_iter_cmp(b, ++ __btree_node_offset_to_key(b, l.k), ++ __btree_node_offset_to_key(b, r.k)); ++} ++ ++/* These assume r (the search key) is not a deleted key: */ ++static inline int bkey_iter_pos_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bkey_cmp_left_packed(b, l, r) ++ ?: -((int) bkey_deleted(l)); ++} ++ ++static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) ++{ ++ return bkey_cmp_p_or_unp(b, l, r_packed, r) ++ ?: -((int) bkey_deleted(l)); ++} ++ ++static inline struct bkey_packed * ++__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ return __btree_node_offset_to_key(b, iter->data->k); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned min_key_type) ++{ ++ while (!bch2_btree_node_iter_end(iter)) { ++ struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); ++ ++ if (k->type >= min_key_type) ++ return k; ++ ++ bch2_btree_node_iter_advance(iter, b); ++ } ++ ++ return NULL; ++} ++ ++static inline 
struct bkey_packed * ++bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ return bch2_btree_node_iter_peek_filter(iter, b, 0); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) ++{ ++ return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) ++{ ++ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); ++ ++ if (ret) ++ bch2_btree_node_iter_advance(iter, b); ++ ++ return ret; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, ++ struct btree *, unsigned); ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) ++{ ++ return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); ++} ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, ++ struct btree *, ++ struct bkey *); ++ ++#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ ++ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ ++ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ ++ bch2_btree_node_iter_advance(iter, b)) ++ ++/* Accounting: */ ++ ++static inline void btree_keys_account_key(struct btree_nr_keys *n, ++ unsigned bset, ++ struct bkey_packed *k, ++ int sign) ++{ ++ n->live_u64s += k->u64s * sign; ++ n->bset_u64s[bset] += k->u64s * sign; ++ ++ if (bkey_packed(k)) ++ n->packed_keys += sign; ++ else ++ n->unpacked_keys += sign; ++} ++ ++static inline void btree_keys_account_val_delta(struct btree *b, ++ struct bkey_packed *k, ++ int delta) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ ++ b->nr.live_u64s += delta; ++ b->nr.bset_u64s[t - b->set] += delta; ++} ++ ++#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, 1) ++#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, -1) ++ ++#define btree_account_key_add(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) ++#define btree_account_key_drop(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) ++ ++struct bset_stats { ++ struct { ++ size_t nr, bytes; ++ } sets[BSET_TREE_NR_TYPES]; ++ ++ size_t floats; ++ size_t failed; ++}; ++ ++void bch2_btree_keys_stats(struct btree *, struct bset_stats *); ++void bch2_bfloat_to_text(struct printbuf *, struct btree *, ++ struct bkey_packed *); ++ ++/* Debug stuff */ ++ ++void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); ++void bch2_dump_btree_node(struct bch_fs *, struct btree *); ++void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *); ++void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); ++void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, ++ struct bkey_packed *, unsigned); ++ ++#else ++ ++static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} ++static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) {} ++static inline void bch2_verify_insert_pos(struct btree *b, ++ struct bkey_packed *where, ++ struct 
bkey_packed *insert, ++ unsigned clobber_u64s) {} ++#endif ++ ++static inline void bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ if (btree_keys_expensive_checks(b)) ++ __bch2_verify_btree_nr_keys(b); ++} ++ ++#endif /* _BCACHEFS_BSET_H */ +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +new file mode 100644 +index 000000000000..736671112861 +--- /dev/null ++++ b/fs/bcachefs/btree_cache.c +@@ -0,0 +1,1057 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "debug.h" ++ ++#include ++#include ++#include ++ ++const char * const bch2_btree_ids[] = { ++#define x(kwd, val, name) name, ++ BCH_BTREE_IDS() ++#undef x ++ NULL ++}; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *c) ++{ ++ unsigned i, reserve = 16; ++ ++ if (!c->btree_roots[0].b) ++ reserve += 8; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ reserve += min_t(unsigned, 1, ++ c->btree_roots[i].b->c.level) * 8; ++ ++ c->btree_cache.reserve = reserve; ++} ++ ++static inline unsigned btree_cache_can_free(struct btree_cache *bc) ++{ ++ return max_t(int, 0, bc->used - bc->reserve); ++} ++ ++static void __btree_node_data_free(struct bch_fs *c, struct btree *b) ++{ ++ EBUG_ON(btree_node_write_in_flight(b)); ++ ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ vfree(b->aux_data); ++ b->aux_data = NULL; ++} ++ ++static void btree_node_data_free(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ __btree_node_data_free(c, b); ++ bc->used--; ++ list_move(&b->list, &bc->freed); ++} ++ ++static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct btree *b = obj; ++ const u64 *v = arg->key; ++ ++ return b->hash_val == *v ? 
0 : 1; ++} ++ ++static const struct rhashtable_params bch_btree_cache_params = { ++ .head_offset = offsetof(struct btree, hash), ++ .key_offset = offsetof(struct btree, hash_val), ++ .key_len = sizeof(u64), ++ .obj_cmpfn = bch2_btree_cache_cmp_fn, ++}; ++ ++static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++{ ++ BUG_ON(b->data || b->aux_data); ++ ++ b->data = kvpmalloc(btree_bytes(c), gfp); ++ if (!b->data) ++ return -ENOMEM; ++ ++ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); ++ if (!b->aux_data) { ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static struct btree *__btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); ++ if (!b) ++ return NULL; ++ ++ bkey_btree_ptr_init(&b->key); ++ six_lock_init(&b->c.lock); ++ INIT_LIST_HEAD(&b->list); ++ INIT_LIST_HEAD(&b->write_blocked); ++ b->byte_order = ilog2(btree_bytes(c)); ++ return b; ++} ++ ++static struct btree *btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b = __btree_node_mem_alloc(c); ++ if (!b) ++ return NULL; ++ ++ if (btree_node_data_alloc(c, b, GFP_KERNEL)) { ++ kfree(b); ++ return NULL; ++ } ++ ++ bc->used++; ++ list_add(&b->list, &bc->freeable); ++ return b; ++} ++ ++/* Btree in memory cache - hash table */ ++ ++void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) ++{ ++ rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); ++ ++ /* Cause future lookups for this node to fail: */ ++ b->hash_val = 0; ++ ++ six_lock_wakeup_all(&b->c.lock); ++} ++ ++int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) ++{ ++ BUG_ON(b->hash_val); ++ b->hash_val = btree_ptr_hash_val(&b->key); ++ ++ return rhashtable_lookup_insert_fast(&bc->table, &b->hash, ++ bch_btree_cache_params); ++} ++ ++int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, ++ unsigned level, enum btree_id id) ++{ ++ int ret; ++ ++ b->c.level = level; ++ b->c.btree_id = id; ++ ++ mutex_lock(&bc->lock); ++ ret = __bch2_btree_node_hash_insert(bc, b); ++ if (!ret) ++ list_add(&b->list, &bc->live); ++ mutex_unlock(&bc->lock); ++ ++ return ret; ++} ++ ++__flatten ++static inline struct btree *btree_cache_find(struct btree_cache *bc, ++ const struct bkey_i *k) ++{ ++ u64 v = btree_ptr_hash_val(k); ++ ++ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); ++} ++ ++/* ++ * this version is for btree nodes that have already been freed (we're not ++ * reaping a real btree node) ++ */ ++static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ int ret = 0; ++ ++ lockdep_assert_held(&bc->lock); ++ ++ if (!six_trylock_intent(&b->c.lock)) ++ return -ENOMEM; ++ ++ if (!six_trylock_write(&b->c.lock)) ++ goto out_unlock_intent; ++ ++ if (btree_node_noevict(b)) ++ goto out_unlock; ++ ++ if (!btree_node_may_write(b)) ++ goto out_unlock; ++ ++ if (btree_node_dirty(b) && ++ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ goto out_unlock; ++ ++ if (btree_node_dirty(b) || ++ btree_node_write_in_flight(b) || ++ btree_node_read_in_flight(b)) { ++ if (!flush) ++ goto out_unlock; ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ /* ++ * Using the underscore version because we don't want to compact ++ * bsets after the write, since this node is about to be evicted ++ * - unless btree verify 
mode is enabled, since it runs out of ++ * the post write cleanup: ++ */ ++ if (verify_btree_ondisk(c)) ++ bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ else ++ __bch2_btree_node_write(c, b, SIX_LOCK_read); ++ ++ /* wait for any in flight btree write */ ++ btree_node_wait_on_io(b); ++ } ++out: ++ if (b->hash_val && !ret) ++ trace_btree_node_reap(c, b); ++ return ret; ++out_unlock: ++ six_unlock_write(&b->c.lock); ++out_unlock_intent: ++ six_unlock_intent(&b->c.lock); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int btree_node_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, false); ++} ++ ++static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, true); ++} ++ ++static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b, *t; ++ unsigned long nr = sc->nr_to_scan; ++ unsigned long can_free; ++ unsigned long touched = 0; ++ unsigned long freed = 0; ++ unsigned i; ++ ++ if (btree_shrinker_disabled(c)) ++ return SHRINK_STOP; ++ ++ /* Return -1 if we can't do anything right now */ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ return -1; ++ ++ /* ++ * It's _really_ critical that we don't free too many btree nodes - we ++ * have to always leave ourselves a reserve. The reserve is how we ++ * guarantee that allocating memory for a new btree node can always ++ * succeed, so that inserting keys into the btree can always succeed and ++ * IO can always make forward progress: ++ */ ++ nr /= btree_pages(c); ++ can_free = btree_cache_can_free(bc); ++ nr = min_t(unsigned long, nr, can_free); ++ ++ i = 0; ++ list_for_each_entry_safe(b, t, &bc->freeable, list) { ++ touched++; ++ ++ if (freed >= nr) ++ break; ++ ++ if (++i > 3 && ++ !btree_node_reclaim(c, b)) { ++ btree_node_data_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ freed++; ++ } ++ } ++restart: ++ list_for_each_entry_safe(b, t, &bc->live, list) { ++ touched++; ++ ++ if (freed >= nr) { ++ /* Save position */ ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ break; ++ } ++ ++ if (!btree_node_accessed(b) && ++ !btree_node_reclaim(c, b)) { ++ /* can't call bch2_btree_node_hash_remove under lock */ ++ freed++; ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ ++ btree_node_data_free(c, b); ++ mutex_unlock(&bc->lock); ++ ++ bch2_btree_node_hash_remove(bc, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ if (freed >= nr) ++ goto out; ++ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ goto out; ++ goto restart; ++ } else ++ clear_btree_node_accessed(b); ++ } ++ ++ mutex_unlock(&bc->lock); ++out: ++ return (unsigned long) freed * btree_pages(c); ++} ++ ++static unsigned long bch2_btree_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (btree_shrinker_disabled(c)) ++ return 0; ++ ++ return btree_cache_can_free(bc) * btree_pages(c); ++} ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ unsigned i; ++ ++ if (bc->shrink.list.next) ++ 
unregister_shrinker(&bc->shrink); ++ ++ mutex_lock(&bc->lock); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ if (c->verify_data) ++ list_move(&c->verify_data->list, &bc->live); ++ ++ kvpfree(c->verify_ondisk, btree_bytes(c)); ++#endif ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ list_add(&c->btree_roots[i].b->list, &bc->live); ++ ++ list_splice(&bc->freeable, &bc->live); ++ ++ while (!list_empty(&bc->live)) { ++ b = list_first_entry(&bc->live, struct btree, list); ++ ++ BUG_ON(btree_node_read_in_flight(b) || ++ btree_node_write_in_flight(b)); ++ ++ if (btree_node_dirty(b)) ++ bch2_btree_complete_write(c, b, btree_current_write(b)); ++ clear_btree_node_dirty(b); ++ ++ btree_node_data_free(c, b); ++ } ++ ++ while (!list_empty(&bc->freed)) { ++ b = list_first_entry(&bc->freed, struct btree, list); ++ list_del(&b->list); ++ kfree(b); ++ } ++ ++ mutex_unlock(&bc->lock); ++ ++ if (bc->table_init_done) ++ rhashtable_destroy(&bc->table); ++} ++ ++int bch2_fs_btree_cache_init(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ unsigned i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ ret = rhashtable_init(&bc->table, &bch_btree_cache_params); ++ if (ret) ++ goto out; ++ ++ bc->table_init_done = true; ++ ++ bch2_recalc_btree_reserve(c); ++ ++ for (i = 0; i < bc->reserve; i++) ++ if (!btree_node_mem_alloc(c)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_splice_init(&bc->live, &bc->freeable); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ mutex_init(&c->verify_lock); ++ ++ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); ++ if (!c->verify_ondisk) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ c->verify_data = btree_node_mem_alloc(c); ++ if (!c->verify_data) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_del_init(&c->verify_data->list); ++#endif ++ ++ bc->shrink.count_objects = bch2_btree_cache_count; ++ bc->shrink.scan_objects = bch2_btree_cache_scan; ++ bc->shrink.seeks = 4; ++ bc->shrink.batch = btree_pages(c) * 2; ++ register_shrinker(&bc->shrink); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++void bch2_fs_btree_cache_init_early(struct btree_cache *bc) ++{ ++ mutex_init(&bc->lock); ++ INIT_LIST_HEAD(&bc->live); ++ INIT_LIST_HEAD(&bc->freeable); ++ INIT_LIST_HEAD(&bc->freed); ++} ++ ++/* ++ * We can only have one thread cannibalizing other cached btree nodes at a time, ++ * or we'll deadlock. We use an open coded mutex to ensure that, which a ++ * cannibalize_bucket() will take. This means every time we unlock the root of ++ * the btree, we need to release this lock if we have it held. 
++ */ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (bc->alloc_lock == current) { ++ trace_btree_node_cannibalize_unlock(c); ++ bc->alloc_lock = NULL; ++ closure_wake_up(&bc->alloc_wait); ++ } ++} ++ ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct task_struct *old; ++ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) ++ goto success; ++ ++ if (!cl) { ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -ENOMEM; ++ } ++ ++ closure_wait(&bc->alloc_wait, cl); ++ ++ /* Try again, after adding ourselves to waitlist */ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) { ++ /* We raced */ ++ closure_wake_up(&bc->alloc_wait); ++ goto success; ++ } ++ ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -EAGAIN; ++ ++success: ++ trace_btree_node_cannibalize_lock(c); ++ return 0; ++} ++ ++static struct btree *btree_node_cannibalize(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_reclaim(c, b)) ++ return b; ++ ++ while (1) { ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_write_and_reclaim(c, b)) ++ return b; ++ ++ /* ++ * Rare case: all nodes were intent-locked. ++ * Just busy-wait. ++ */ ++ WARN_ONCE(1, "btree cache cannibalize failed\n"); ++ cond_resched(); ++ } ++} ++ ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ u64 start_time = local_clock(); ++ unsigned flags; ++ ++ flags = memalloc_nofs_save(); ++ mutex_lock(&bc->lock); ++ ++ /* ++ * btree_free() doesn't free memory; it sticks the node on the end of ++ * the list. Check if there's any freed nodes there: ++ */ ++ list_for_each_entry(b, &bc->freeable, list) ++ if (!btree_node_reclaim(c, b)) ++ goto got_node; ++ ++ /* ++ * We never free struct btree itself, just the memory that holds the on ++ * disk node. 
Check the freed list before allocating a new one: ++ */ ++ list_for_each_entry(b, &bc->freed, list) ++ if (!btree_node_reclaim(c, b)) ++ goto got_node; ++ ++ b = NULL; ++got_node: ++ if (b) ++ list_del_init(&b->list); ++ mutex_unlock(&bc->lock); ++ ++ if (!b) { ++ b = __btree_node_mem_alloc(c); ++ if (!b) ++ goto err; ++ ++ BUG_ON(!six_trylock_intent(&b->c.lock)); ++ BUG_ON(!six_trylock_write(&b->c.lock)); ++ } ++ ++ if (!b->data) { ++ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) ++ goto err; ++ ++ mutex_lock(&bc->lock); ++ bc->used++; ++ mutex_unlock(&bc->lock); ++ } ++ ++ BUG_ON(btree_node_hashed(b)); ++ BUG_ON(btree_node_write_in_flight(b)); ++out: ++ b->flags = 0; ++ b->written = 0; ++ b->nsets = 0; ++ b->sib_u64s[0] = 0; ++ b->sib_u64s[1] = 0; ++ b->whiteout_u64s = 0; ++ bch2_btree_keys_init(b, &c->expensive_debug_checks); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], ++ start_time); ++ ++ memalloc_nofs_restore(flags); ++ return b; ++err: ++ mutex_lock(&bc->lock); ++ ++ if (b) { ++ list_add(&b->list, &bc->freed); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ /* Try to cannibalize another cached btree node: */ ++ if (bc->alloc_lock == current) { ++ b = btree_node_cannibalize(c); ++ list_del_init(&b->list); ++ mutex_unlock(&bc->lock); ++ ++ bch2_btree_node_hash_remove(bc, b); ++ ++ trace_btree_node_cannibalize(c); ++ goto out; ++ } ++ ++ mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); ++ return ERR_PTR(-ENOMEM); ++} ++ ++/* Slowpath, don't want it inlined into btree_iter_traverse() */ ++static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, ++ struct btree_iter *iter, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level, ++ enum six_lock_type lock_type, ++ bool sync) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ BUG_ON(level + 1 >= BTREE_MAX_DEPTH); ++ /* ++ * Parent node must be locked, else we could read in a btree node that's ++ * been freed: ++ */ ++ if (iter && !bch2_btree_node_relock(iter, level + 1)) ++ return ERR_PTR(-EINTR); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ if (IS_ERR(b)) ++ return b; ++ ++ bkey_copy(&b->key, k); ++ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { ++ /* raced with another fill: */ ++ ++ /* mark as unhashed... */ ++ b->hash_val = 0; ++ ++ mutex_lock(&bc->lock); ++ list_add(&b->list, &bc->freeable); ++ mutex_unlock(&bc->lock); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ return NULL; ++ } ++ ++ /* ++ * Unlock before doing IO: ++ * ++ * XXX: ideally should be dropping all btree node locks here ++ */ ++ if (iter && btree_node_read_locked(iter, level + 1)) ++ btree_node_unlock(iter, level + 1); ++ ++ bch2_btree_node_read(c, b, sync); ++ ++ six_unlock_write(&b->c.lock); ++ ++ if (!sync) { ++ six_unlock_intent(&b->c.lock); ++ return NULL; ++ } ++ ++ if (lock_type == SIX_LOCK_read) ++ six_lock_downgrade(&b->c.lock); ++ ++ return b; ++} ++ ++static int lock_node_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, c.lock); ++ const struct bkey_i *k = p; ++ ++ return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; ++} ++ ++/** ++ * bch_btree_node_get - find a btree node in the cache and lock it, reading it ++ * in from disk if necessary. ++ * ++ * If IO is necessary and running under generic_make_request, returns -EAGAIN. ++ * ++ * The btree node will have either a read or a write lock held, depending on ++ * the @write parameter. 
++ */ ++struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, ++ const struct bkey_i *k, unsigned level, ++ enum six_lock_type lock_type) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ /* ++ * We must have the parent locked to call bch2_btree_node_fill(), ++ * else we could read in a btree node from disk that's been ++ * freed: ++ */ ++ b = bch2_btree_node_fill(c, iter, k, iter->btree_id, ++ level, lock_type, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ return b; ++ } else { ++lock_node: ++ /* ++ * There's a potential deadlock with splits and insertions into ++ * interior nodes we have to avoid: ++ * ++ * The other thread might be holding an intent lock on the node ++ * we want, and they want to update its parent node so they're ++ * going to upgrade their intent lock on the parent node to a ++ * write lock. ++ * ++ * But if we're holding a read lock on the parent, and we're ++ * trying to get the intent lock they're holding, we deadlock. ++ * ++ * So to avoid this we drop the read locks on parent nodes when ++ * we're starting to take intent locks - and handle the race. ++ * ++ * The race is that they might be about to free the node we ++ * want, and dropping our read lock on the parent node lets them ++ * update the parent marking the node we want as freed, and then ++ * free it: ++ * ++ * To guard against this, btree nodes are evicted from the cache ++ * when they're freed - and b->hash_val is zeroed out, which we ++ * check for after we lock the node. 
++ * ++ * Then, bch2_btree_node_relock() on the parent will fail - because ++ * the parent was modified, when the pointer to the node we want ++ * was removed - and we'll bail out: ++ */ ++ if (btree_node_read_locked(iter, level + 1)) ++ btree_node_unlock(iter, level + 1); ++ ++ if (!btree_node_lock(b, k->k.p, level, iter, lock_type, ++ lock_node_check_fn, (void *) k)) { ++ if (b->hash_val != btree_ptr_hash_val(k)) ++ goto retry; ++ return ERR_PTR(-EINTR); ++ } ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->c.level != level || ++ race_fault())) { ++ six_unlock_type(&b->c.lock, lock_type); ++ if (bch2_btree_node_relock(iter, level + 1)) ++ goto retry; ++ ++ trace_trans_restart_btree_node_reused(iter->trans->ip); ++ return ERR_PTR(-EINTR); ++ } ++ } ++ ++ /* XXX: waiting on IO with btree locks held: */ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_type(&b->c.lock, lock_type); ++ return ERR_PTR(-EIO); ++ } ++ ++ EBUG_ON(b->c.btree_id != iter->btree_id || ++ BTREE_NODE_LEVEL(b->data) != level || ++ bkey_cmp(b->data->max_key, k->k.p)); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ int ret; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ b = bch2_btree_node_fill(c, NULL, k, btree_id, ++ level, SIX_LOCK_read, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ return b; ++ } else { ++lock_node: ++ ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); ++ if (ret) ++ goto retry; ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->c.btree_id != btree_id || ++ b->c.level != level)) { ++ six_unlock_read(&b->c.lock); ++ goto retry; ++ } ++ } ++ ++ /* XXX: waiting on IO with btree locks held: */ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_read(&b->c.lock); ++ return ERR_PTR(-EIO); ++ } ++ ++ EBUG_ON(b->c.btree_id != btree_id || ++ BTREE_NODE_LEVEL(b->data) != level || ++ bkey_cmp(b->data->max_key, k->k.p)); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, ++ struct btree_iter *iter, ++ struct btree *b, ++ enum btree_node_sibling sib) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *parent; ++ struct btree_node_iter node_iter; ++ struct bkey_packed *k; ++ BKEY_PADDED(k) tmp; ++ struct btree *ret = NULL; ++ unsigned level = b->c.level; ++ ++ parent = 
btree_iter_node(iter, level + 1); ++ if (!parent) ++ return NULL; ++ ++ /* ++ * There's a corner case where a btree_iter might have a node locked ++ * that is just outside its current pos - when ++ * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. ++ * ++ * But the lock ordering checks in __bch2_btree_node_lock() go off of ++ * iter->pos, not the node's key: so if the iterator is marked as ++ * needing to be traversed, we risk deadlock if we don't bail out here: ++ */ ++ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) ++ return ERR_PTR(-EINTR); ++ ++ if (!bch2_btree_node_relock(iter, level + 1)) { ++ ret = ERR_PTR(-EINTR); ++ goto out; ++ } ++ ++ node_iter = iter->l[parent->c.level].iter; ++ ++ k = bch2_btree_node_iter_peek_all(&node_iter, parent); ++ BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); ++ ++ k = sib == btree_prev_sib ++ ? bch2_btree_node_iter_prev(&node_iter, parent) ++ : (bch2_btree_node_iter_advance(&node_iter, parent), ++ bch2_btree_node_iter_peek(&node_iter, parent)); ++ if (!k) ++ goto out; ++ ++ bch2_bkey_unpack(parent, &tmp.k, k); ++ ++ ret = bch2_btree_node_get(c, iter, &tmp.k, level, ++ SIX_LOCK_intent); ++ ++ if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { ++ struct btree_iter *linked; ++ ++ if (!bch2_btree_node_relock(iter, level + 1)) ++ goto out; ++ ++ /* ++ * We might have got -EINTR because trylock failed, and we're ++ * holding other locks that would cause us to deadlock: ++ */ ++ trans_for_each_iter(trans, linked) ++ if (btree_iter_cmp(iter, linked) < 0) ++ __bch2_btree_iter_unlock(linked); ++ ++ if (sib == btree_prev_sib) ++ btree_node_unlock(iter, level); ++ ++ ret = bch2_btree_node_get(c, iter, &tmp.k, level, ++ SIX_LOCK_intent); ++ ++ /* ++ * before btree_iter_relock() calls btree_iter_verify_locks(): ++ */ ++ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, level + 1); ++ ++ if (!bch2_btree_node_relock(iter, level)) { ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ if (!IS_ERR(ret)) { ++ six_unlock_intent(&ret->c.lock); ++ ret = ERR_PTR(-EINTR); ++ } ++ } ++ ++ bch2_trans_relock(trans); ++ } ++out: ++ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, level + 1); ++ ++ if (PTR_ERR_OR_ZERO(ret) == -EINTR) ++ bch2_btree_iter_upgrade(iter, level + 2); ++ ++ BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); ++ ++ if (!IS_ERR_OR_NULL(ret)) { ++ struct btree *n1 = ret, *n2 = b; ++ ++ if (sib != btree_prev_sib) ++ swap(n1, n2); ++ ++ BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), ++ n2->data->min_key)); ++ } ++ ++ bch2_btree_trans_verify_locks(trans); ++ ++ return ret; ++} ++ ++void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, ++ const struct bkey_i *k, unsigned level) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ BUG_ON(!btree_node_locked(iter, level + 1)); ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_cache_find(bc, k); ++ if (b) ++ return; ++ ++ bch2_btree_node_fill(c, iter, k, iter->btree_id, ++ level, SIX_LOCK_read, false); ++} ++ ++void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bset_stats stats; ++ ++ memset(&stats, 0, sizeof(stats)); ++ ++ bch2_btree_keys_stats(b, &stats); ++ ++ pr_buf(out, ++ "l %u %llu:%llu - %llu:%llu:\n" ++ " ptrs: ", ++ b->c.level, ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ b->data->max_key.inode, ++ b->data->max_key.offset); ++ 
bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ pr_buf(out, "\n" ++ " format: u64s %u fields %u %u %u %u %u\n" ++ " unpack fn len: %u\n" ++ " bytes used %zu/%zu (%zu%% full)\n" ++ " sib u64s: %u, %u (merge threshold %zu)\n" ++ " nr packed keys %u\n" ++ " nr unpacked keys %u\n" ++ " floats %zu\n" ++ " failed unpacked %zu\n", ++ f->key_u64s, ++ f->bits_per_field[0], ++ f->bits_per_field[1], ++ f->bits_per_field[2], ++ f->bits_per_field[3], ++ f->bits_per_field[4], ++ b->unpack_fn_len, ++ b->nr.live_u64s * sizeof(u64), ++ btree_bytes(c) - sizeof(struct btree_node), ++ b->nr.live_u64s * 100 / btree_max_u64s(c), ++ b->sib_u64s[0], ++ b->sib_u64s[1], ++ BTREE_FOREGROUND_MERGE_THRESHOLD(c), ++ b->nr.packed_keys, ++ b->nr.unpacked_keys, ++ stats.floats, ++ stats.failed); ++} +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +new file mode 100644 +index 000000000000..d0d3a85bb8be +--- /dev/null ++++ b/fs/bcachefs/btree_cache.h +@@ -0,0 +1,104 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_CACHE_H ++#define _BCACHEFS_BTREE_CACHE_H ++ ++#include "bcachefs.h" ++#include "btree_types.h" ++ ++struct btree_iter; ++ ++extern const char * const bch2_btree_ids[]; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *); ++ ++void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); ++int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); ++int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, ++ unsigned, enum btree_id); ++ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); ++ ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); ++ ++struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, ++ const struct bkey_i *, unsigned, ++ enum six_lock_type); ++ ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, ++ enum btree_id, unsigned); ++ ++struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, ++ struct btree *, enum btree_node_sibling); ++ ++void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, ++ const struct bkey_i *, unsigned); ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *); ++int bch2_fs_btree_cache_init(struct bch_fs *); ++void bch2_fs_btree_cache_init_early(struct btree_cache *); ++ ++static inline u64 btree_ptr_hash_val(const struct bkey_i *k) ++{ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); ++ case KEY_TYPE_btree_ptr_v2: ++ return bkey_i_to_btree_ptr_v2_c(k)->v.seq; ++ default: ++ return 0; ++ } ++} ++ ++static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) ++{ ++ return k->k.type == KEY_TYPE_btree_ptr_v2 ++ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr ++ : NULL; ++} ++ ++/* is btree node in hash table? 
*/ ++static inline bool btree_node_hashed(struct btree *b) ++{ ++ return b->hash_val != 0; ++} ++ ++#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ ++ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ ++ &(_c)->btree_cache.table), \ ++ _iter = 0; _iter < (_tbl)->size; _iter++) \ ++ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) ++ ++static inline size_t btree_bytes(struct bch_fs *c) ++{ ++ return c->opts.btree_node_size << 9; ++} ++ ++static inline size_t btree_max_u64s(struct bch_fs *c) ++{ ++ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); ++} ++ ++static inline size_t btree_pages(struct bch_fs *c) ++{ ++ return btree_bytes(c) / PAGE_SIZE; ++} ++ ++static inline unsigned btree_blocks(struct bch_fs *c) ++{ ++ return c->opts.btree_node_size >> c->block_bits; ++} ++ ++#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) ++ ++#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) ++#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) ++ ++#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) ++ ++void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, ++ struct btree *); ++ ++#endif /* _BCACHEFS_BTREE_CACHE_H */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +new file mode 100644 +index 000000000000..4f581130270c +--- /dev/null ++++ b/fs/bcachefs/btree_gc.c +@@ -0,0 +1,1395 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * Copyright (C) 2014 Datera Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_locking.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ write_seqcount_begin(&c->gc_pos_lock); ++ c->gc_pos = new_pos; ++ write_seqcount_end(&c->gc_pos_lock); ++} ++ ++static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); ++ __gc_pos_set(c, new_pos); ++} ++ ++static int bch2_gc_check_topology(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bpos *expected_start, ++ struct bpos expected_end, ++ bool is_last) ++{ ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, ++ "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", ++ bp.v->min_key.inode, ++ bp.v->min_key.offset, ++ expected_start->inode, ++ expected_start->offset)) { ++ BUG(); ++ } ++ } ++ ++ *expected_start = bkey_cmp(k.k->p, POS_MAX) ++ ? 
bkey_successor(k.k->p) ++ : k.k->p; ++ ++ if (fsck_err_on(is_last && ++ bkey_cmp(k.k->p, expected_end), c, ++ "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", ++ k.k->p.inode, ++ k.k->p.offset, ++ expected_end.inode, ++ expected_end.offset)) { ++ BUG(); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* marking of btree keys/nodes: */ ++ ++static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, ++ u8 *max_stale, bool initial) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ unsigned flags = ++ BTREE_TRIGGER_GC| ++ (initial ? BTREE_TRIGGER_NOATOMIC : 0); ++ int ret = 0; ++ ++ if (initial) { ++ BUG_ON(journal_seq_verify(c) && ++ k.k->version.lo > journal_cur_seq(&c->journal)); ++ ++ /* XXX change to fsck check */ ++ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, ++ "key version number higher than recorded: %llu > %llu", ++ k.k->version.lo, ++ atomic64_read(&c->key_version))) ++ atomic64_set(&c->key_version, k.k->version.lo); ++ ++ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, ++ "superblock not marked as containing replicas (type %u)", ++ k.k->type)) { ++ ret = bch2_mark_bkey_replicas(c, k); ++ if (ret) ++ return ret; ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g2 = PTR_BUCKET(ca, ptr, false); ++ ++ if (mustfix_fsck_err_on(!g->gen_valid, c, ++ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k.k, ptr)], ++ ptr->gen)) { ++ g2->_mark.gen = g->_mark.gen = ptr->gen; ++ g2->gen_valid = g->gen_valid = true; ++ } ++ ++ if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, ++ "bucket %u:%zu data type %s ptr gen in the future: %u > %u", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k.k, ptr)], ++ ptr->gen, g->mark.gen)) { ++ g2->_mark.gen = g->_mark.gen = ptr->gen; ++ g2->gen_valid = g->gen_valid = true; ++ g2->_mark.data_type = 0; ++ g2->_mark.dirty_sectors = 0; ++ g2->_mark.cached_sectors = 0; ++ set_bit(BCH_FS_FIXED_GENS, &c->flags); ++ } ++ } ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ ++ if (gen_after(g->oldest_gen, ptr->gen)) ++ g->oldest_gen = ptr->gen; ++ ++ *max_stale = max(*max_stale, ptr_stale(ca, ptr)); ++ } ++ ++ bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); ++fsck_err: ++ return ret; ++} ++ ++static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, ++ bool initial) ++{ ++ struct bpos next_node_start = b->data->min_key; ++ struct btree_node_iter iter; ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ *max_stale = 0; ++ ++ if (!btree_node_type_needs_gc(btree_node_type(b))) ++ return 0; ++ ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ ++ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ++ bch2_bkey_debugcheck(c, b, k); ++ ++ ret = bch2_gc_mark_key(c, k, max_stale, initial); ++ if (ret) ++ break; ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (b->c.level) { ++ ret = bch2_gc_check_topology(c, k, ++ &next_node_start, ++ b->data->max_key, ++ bch2_btree_node_iter_end(&iter)); ++ if (ret) ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, ++ bool initial, 
bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned depth = metadata_only ? 1 ++ : expensive_debug_checks(c) ? 0 ++ : !btree_node_type_needs_gc(btree_id) ? 1 ++ : 0; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); ++ ++ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, ++ 0, depth, BTREE_ITER_PREFETCH, b) { ++ bch2_verify_btree_nr_keys(b); ++ ++ gc_pos_set(c, gc_pos_btree_node(b)); ++ ++ ret = btree_gc_mark_node(c, b, &max_stale, initial); ++ if (ret) ++ break; ++ ++ if (!initial) { ++ if (max_stale > 64) ++ bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_GC_LOCK_HELD); ++ else if (!btree_gc_rewrite_disabled(c) && ++ (btree_gc_always_rewrite(c) || max_stale > 16)) ++ bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_GC_LOCK_HELD); ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->btree_root_lock); ++ b = c->btree_roots[btree_id].b; ++ if (!btree_node_fake(b)) ++ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), ++ &max_stale, initial); ++ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); ++ mutex_unlock(&c->btree_root_lock); ++ ++ return ret; ++} ++ ++static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, ++ struct journal_keys *journal_keys, ++ unsigned target_depth) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ struct bpos next_node_start = b->data->min_key; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ bch2_bkey_debugcheck(c, b, k); ++ ++ BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); ++ ++ ret = bch2_gc_mark_key(c, k, &max_stale, true); ++ if (ret) ++ break; ++ ++ if (b->c.level) { ++ struct btree *child; ++ BKEY_PADDED(k) tmp; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ ret = bch2_gc_check_topology(c, k, ++ &next_node_start, ++ b->data->max_key, ++ !bch2_btree_and_journal_iter_peek(&iter).k); ++ if (ret) ++ break; ++ ++ if (b->c.level > target_depth) { ++ child = bch2_btree_node_get_noiter(c, &tmp.k, ++ b->c.btree_id, b->c.level - 1); ++ ret = PTR_ERR_OR_ZERO(child); ++ if (ret) ++ break; ++ ++ ret = bch2_gc_btree_init_recurse(c, child, ++ journal_keys, target_depth); ++ six_unlock_read(&child->c.lock); ++ ++ if (ret) ++ break; ++ } ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ } ++ ++ return ret; ++} ++ ++static int bch2_gc_btree_init(struct bch_fs *c, ++ struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ bool metadata_only) ++{ ++ struct btree *b; ++ unsigned target_depth = metadata_only ? 1 ++ : expensive_debug_checks(c) ? 0 ++ : !btree_node_type_needs_gc(btree_id) ? 
1 ++ : 0; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ b = c->btree_roots[btree_id].b; ++ ++ if (btree_node_fake(b)) ++ return 0; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, ++ "btree root with incorrect min_key: %llu:%llu", ++ b->data->min_key.inode, ++ b->data->min_key.offset)) { ++ BUG(); ++ } ++ ++ if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, ++ "btree root with incorrect min_key: %llu:%llu", ++ b->data->max_key.inode, ++ b->data->max_key.offset)) { ++ BUG(); ++ } ++ ++ if (b->c.level >= target_depth) ++ ret = bch2_gc_btree_init_recurse(c, b, ++ journal_keys, target_depth); ++ ++ if (!ret) ++ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), ++ &max_stale, true); ++fsck_err: ++ six_unlock_read(&b->c.lock); ++ ++ return ret; ++} ++ ++static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) ++{ ++ return (int) btree_id_to_gc_phase(l) - ++ (int) btree_id_to_gc_phase(r); ++} ++ ++static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, ++ bool initial, bool metadata_only) ++{ ++ enum btree_id ids[BTREE_ID_NR]; ++ unsigned i; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ ids[i] = i; ++ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ enum btree_id id = ids[i]; ++ int ret = initial ++ ? bch2_gc_btree_init(c, journal_keys, ++ id, metadata_only) ++ : bch2_gc_btree(c, id, initial, metadata_only); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, ++ u64 start, u64 end, ++ enum bch_data_type type, ++ unsigned flags) ++{ ++ u64 b = sector_to_bucket(ca, start); ++ ++ do { ++ unsigned sectors = ++ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; ++ ++ bch2_mark_metadata_bucket(c, ca, b, type, sectors, ++ gc_phase(GC_PHASE_SB), flags); ++ b++; ++ start += sectors; ++ } while (start < end); ++} ++ ++void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ++ unsigned flags) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ unsigned i; ++ u64 b; ++ ++ /* ++ * This conditional is kind of gross, but we may be called from the ++ * device add path, before the new device has actually been added to the ++ * running filesystem: ++ */ ++ if (c) { ++ lockdep_assert_held(&c->sb_lock); ++ percpu_down_read(&c->mark_lock); ++ } ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset == BCH_SB_SECTOR) ++ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, ++ BCH_DATA_sb, flags); ++ ++ mark_metadata_sectors(c, ca, offset, ++ offset + (1 << layout->sb_max_size_bits), ++ BCH_DATA_sb, flags); ++ } ++ ++ for (i = 0; i < ca->journal.nr; i++) { ++ b = ca->journal.buckets[i]; ++ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), flags); ++ } ++ ++ if (c) ++ percpu_up_read(&c->mark_lock); ++} ++ ++static void bch2_mark_superblocks(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&c->sb_lock); ++ gc_pos_set(c, gc_phase(GC_PHASE_SB)); ++ ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); ++ mutex_unlock(&c->sb_lock); ++} ++ ++#if 0 ++/* Also see bch2_pending_btree_node_free_insert_done() */ ++static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) ++{ ++ struct btree_update *as; ++ struct pending_btree_node_free *d; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ gc_pos_set(c, 
gc_phase(GC_PHASE_PENDING_DELETE)); ++ ++ for_each_pending_btree_node_free(c, as, d) ++ if (d->index_update_done) ++ bch2_mark_key(c, bkey_i_to_s_c(&d->key), ++ 0, 0, NULL, 0, ++ BTREE_TRIGGER_GC); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++#endif ++ ++static void bch2_mark_allocator_buckets(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct open_bucket *ob; ++ size_t i, j, iter; ++ unsigned ci; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ gc_pos_set(c, gc_pos_alloc(c, NULL)); ++ ++ for_each_member_device(ca, c, ci) { ++ fifo_for_each_entry(i, &ca->free_inc, iter) ++ bch2_mark_alloc_bucket(c, ca, i, true, ++ gc_pos_alloc(c, NULL), ++ BTREE_TRIGGER_GC); ++ ++ ++ ++ for (j = 0; j < RESERVE_NR; j++) ++ fifo_for_each_entry(i, &ca->free[j], iter) ++ bch2_mark_alloc_bucket(c, ca, i, true, ++ gc_pos_alloc(c, NULL), ++ BTREE_TRIGGER_GC); ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid) { ++ gc_pos_set(c, gc_pos_alloc(c, ob)); ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, ++ gc_pos_alloc(c, ob), ++ BTREE_TRIGGER_GC); ++ } ++ spin_unlock(&ob->lock); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++} ++ ++static void bch2_gc_free(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ genradix_free(&c->stripes[1]); ++ ++ for_each_member_device(ca, c, i) { ++ kvpfree(rcu_dereference_protected(ca->buckets[1], 1), ++ sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket)); ++ ca->buckets[1] = NULL; ++ ++ free_percpu(ca->usage[1]); ++ ca->usage[1] = NULL; ++ } ++ ++ free_percpu(c->usage_gc); ++ c->usage_gc = NULL; ++} ++ ++static int bch2_gc_done(struct bch_fs *c, ++ bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ bool verify = !metadata_only && ++ (!initial || ++ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); ++ unsigned i; ++ int ret = 0; ++ ++#define copy_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f) { \ ++ if (verify) \ ++ fsck_err(c, _msg ": got %llu, should be %llu" \ ++ , ##__VA_ARGS__, dst->_f, src->_f); \ ++ dst->_f = src->_f; \ ++ } ++#define copy_stripe_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f) { \ ++ if (verify) \ ++ fsck_err(c, "stripe %zu has wrong "_msg \ ++ ": got %u, should be %u", \ ++ dst_iter.pos, ##__VA_ARGS__, \ ++ dst->_f, src->_f); \ ++ dst->_f = src->_f; \ ++ dst->dirty = true; \ ++ } ++#define copy_bucket_field(_f) \ ++ if (dst->b[b].mark._f != src->b[b].mark._f) { \ ++ if (verify) \ ++ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ ++ ": got %u, should be %u", i, b, \ ++ dst->b[b].mark.gen, \ ++ bch2_data_types[dst->b[b].mark.data_type],\ ++ dst->b[b].mark._f, src->b[b].mark._f); \ ++ dst->b[b]._mark._f = src->b[b].mark._f; \ ++ } ++#define copy_dev_field(_f, _msg, ...) \ ++ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) ++#define copy_fs_field(_f, _msg, ...) 
\ ++ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) ++ ++ if (!metadata_only) { ++ struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); ++ struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); ++ struct stripe *dst, *src; ++ unsigned i; ++ ++ c->ec_stripes_heap.used = 0; ++ ++ while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && ++ (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { ++ BUG_ON(src_iter.pos != dst_iter.pos); ++ ++ copy_stripe_field(alive, "alive"); ++ copy_stripe_field(sectors, "sectors"); ++ copy_stripe_field(algorithm, "algorithm"); ++ copy_stripe_field(nr_blocks, "nr_blocks"); ++ copy_stripe_field(nr_redundant, "nr_redundant"); ++ copy_stripe_field(blocks_nonempty, ++ "blocks_nonempty"); ++ ++ for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) ++ copy_stripe_field(block_sectors[i], ++ "block_sectors[%u]", i); ++ ++ if (dst->alive) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_insert(c, dst, dst_iter.pos); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ ++ genradix_iter_advance(&dst_iter, &c->stripes[0]); ++ genradix_iter_advance(&src_iter, &c->stripes[1]); ++ } ++ } ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *dst = __bucket_array(ca, 0); ++ struct bucket_array *src = __bucket_array(ca, 1); ++ size_t b; ++ ++ for (b = 0; b < src->nbuckets; b++) { ++ copy_bucket_field(gen); ++ copy_bucket_field(data_type); ++ copy_bucket_field(owned_by_allocator); ++ copy_bucket_field(stripe); ++ copy_bucket_field(dirty_sectors); ++ copy_bucket_field(cached_sectors); ++ ++ dst->b[b].oldest_gen = src->b[b].oldest_gen; ++ } ++ }; ++ ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ ++ bch2_dev_usage_from_buckets(c); ++ ++ { ++ unsigned nr = fs_usage_u64s(c); ++ struct bch_fs_usage *dst = c->usage_base; ++ struct bch_fs_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) c->usage_gc, nr); ++ ++ copy_fs_field(hidden, "hidden"); ++ copy_fs_field(btree, "btree"); ++ ++ if (!metadata_only) { ++ copy_fs_field(data, "data"); ++ copy_fs_field(cached, "cached"); ++ copy_fs_field(reserved, "reserved"); ++ copy_fs_field(nr_inodes,"nr_inodes"); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ copy_fs_field(persistent_reserved[i], ++ "persistent_reserved[%i]", i); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ char buf[80]; ++ ++ if (metadata_only && ++ (e->data_type == BCH_DATA_user || ++ e->data_type == BCH_DATA_cached)) ++ continue; ++ ++ bch2_replicas_entry_to_text(&PBUF(buf), e); ++ ++ copy_fs_field(replicas[i], "%s", buf); ++ } ++ } ++ ++#undef copy_fs_field ++#undef copy_dev_field ++#undef copy_bucket_field ++#undef copy_stripe_field ++#undef copy_field ++fsck_err: ++ return ret; ++} ++ ++static int bch2_gc_start(struct bch_fs *c, ++ bool metadata_only) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ BUG_ON(c->usage_gc); ++ ++ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), ++ sizeof(u64), GFP_KERNEL); ++ if (!c->usage_gc) { ++ bch_err(c, "error allocating c->usage_gc"); ++ return -ENOMEM; ++ } ++ ++ for_each_member_device(ca, c, i) { ++ BUG_ON(ca->buckets[1]); ++ BUG_ON(ca->usage[1]); ++ ++ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO); ++ if (!ca->buckets[1]) { ++ percpu_ref_put(&ca->ref); ++ bch_err(c, "error allocating ca->buckets[gc]"); ++ return -ENOMEM; ++ } ++ ++ ca->usage[1] = 
alloc_percpu(struct bch_dev_usage); ++ if (!ca->usage[1]) { ++ bch_err(c, "error allocating ca->usage[gc]"); ++ percpu_ref_put(&ca->ref); ++ return -ENOMEM; ++ } ++ } ++ ++ ret = bch2_ec_mem_alloc(c, true); ++ if (ret) { ++ bch_err(c, "error allocating ec gc mem"); ++ return ret; ++ } ++ ++ percpu_down_write(&c->mark_lock); ++ ++ /* ++ * indicate to stripe code that we need to allocate for the gc stripes ++ * radix tree, too ++ */ ++ gc_pos_set(c, gc_phase(GC_PHASE_START)); ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *dst = __bucket_array(ca, 1); ++ struct bucket_array *src = __bucket_array(ca, 0); ++ size_t b; ++ ++ dst->first_bucket = src->first_bucket; ++ dst->nbuckets = src->nbuckets; ++ ++ for (b = 0; b < src->nbuckets; b++) { ++ struct bucket *d = &dst->b[b]; ++ struct bucket *s = &src->b[b]; ++ ++ d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; ++ d->gen_valid = s->gen_valid; ++ ++ if (metadata_only && ++ (s->mark.data_type == BCH_DATA_user || ++ s->mark.data_type == BCH_DATA_cached)) { ++ d->_mark = s->mark; ++ d->_mark.owned_by_allocator = 0; ++ } ++ } ++ }; ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return 0; ++} ++ ++/** ++ * bch2_gc - walk _all_ references to buckets, and recompute them: ++ * ++ * Order matters here: ++ * - Concurrent GC relies on the fact that we have a total ordering for ++ * everything that GC walks - see gc_will_visit_node(), ++ * gc_will_visit_root() ++ * ++ * - also, references move around in the course of index updates and ++ * various other crap: everything needs to agree on the ordering ++ * references are allowed to move around in - e.g., we're allowed to ++ * start with a reference owned by an open_bucket (the allocator) and ++ * move it to the btree, but not the reverse. ++ * ++ * This is necessary to ensure that gc doesn't miss references that ++ * move around - if references move backwards in the ordering GC ++ * uses, GC could skip past them ++ */ ++int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, ++ bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ u64 start_time = local_clock(); ++ unsigned i, iter = 0; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ trace_gc_start(c); ++ ++ down_write(&c->gc_lock); ++ ++ /* flush interior btree updates: */ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++again: ++ ret = bch2_gc_start(c, metadata_only); ++ if (ret) ++ goto out; ++ ++ bch2_mark_superblocks(c); ++ ++ ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); ++ if (ret) ++ goto out; ++ ++#if 0 ++ bch2_mark_pending_btree_node_frees(c); ++#endif ++ bch2_mark_allocator_buckets(c); ++ ++ c->gc_count++; ++out: ++ if (!ret && ++ (test_bit(BCH_FS_FIXED_GENS, &c->flags) || ++ (!iter && test_restart_gc(c)))) { ++ /* ++ * XXX: make sure gens we fixed got saved ++ */ ++ if (iter++ <= 2) { ++ bch_info(c, "Fixed gens, restarting mark and sweep:"); ++ clear_bit(BCH_FS_FIXED_GENS, &c->flags); ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ percpu_down_write(&c->mark_lock); ++ bch2_gc_free(c); ++ percpu_up_write(&c->mark_lock); ++ /* flush fsck errors, reset counters */ ++ bch2_flush_fsck_errs(c); ++ ++ goto again; ++ } ++ ++ bch_info(c, "Unable to fix bucket gens, looping"); ++ ret = -EINVAL; ++ } ++ ++ if (!ret) { ++ bch2_journal_block(&c->journal); ++ ++ percpu_down_write(&c->mark_lock); ++ ret = bch2_gc_done(c, initial, metadata_only); ++ ++ bch2_journal_unblock(&c->journal); ++ } else { ++ percpu_down_write(&c->mark_lock); ++ 
} ++ ++ /* Indicates that gc is no longer in progress: */ ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ bch2_gc_free(c); ++ percpu_up_write(&c->mark_lock); ++ ++ up_write(&c->gc_lock); ++ ++ trace_gc_end(c); ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); ++ ++ /* ++ * Wake up allocator in case it was waiting for buckets ++ * because of not being able to inc gens ++ */ ++ for_each_member_device(ca, c, i) ++ bch2_wake_allocator(ca); ++ ++ /* ++ * At startup, allocations can happen directly instead of via the ++ * allocator thread - issue wakeup in case they blocked on gc_lock: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ return ret; ++} ++ ++/* ++ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree ++ * node pointers currently never have cached pointers that can become stale: ++ */ ++static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, id, POS_MIN, BTREE_ITER_PREFETCH, k, ret) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ percpu_down_read(&c->mark_lock); ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ ++ if (gen_after(g->gc_gen, ptr->gen)) ++ g->gc_gen = ptr->gen; ++ ++ if (gen_after(g->mark.gen, ptr->gen) > 32) { ++ /* rewrite btree node */ ++ ++ } ++ } ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_gc_gens(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ unsigned i; ++ int ret; ++ ++ /* ++ * Ideally we would be using state_lock and not gc_lock here, but that ++ * introduces a deadlock in the RO path - we currently take the state ++ * lock at the start of going RO, thus the gc thread may get stuck: ++ */ ++ down_read(&c->gc_lock); ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->gc_gen = g->mark.gen; ++ up_read(&ca->bucket_lock); ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (btree_node_type_needs_gc(i)) { ++ ret = bch2_gc_btree_gens(c, i); ++ if (ret) { ++ bch_err(c, "error recalculating oldest_gen: %i", ret); ++ goto err; ++ } ++ } ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->oldest_gen = g->gc_gen; ++ up_read(&ca->bucket_lock); ++ } ++ ++ c->gc_count++; ++err: ++ up_read(&c->gc_lock); ++ return ret; ++} ++ ++/* Btree coalescing */ ++ ++static void recalc_packed_keys(struct btree *b) ++{ ++ struct bset *i = btree_bset_first(b); ++ struct bkey_packed *k; ++ ++ memset(&b->nr, 0, sizeof(b->nr)); ++ ++ BUG_ON(b->nsets != 1); ++ ++ vstruct_for_each(i, k) ++ btree_keys_account_key_add(&b->nr, 0, k); ++} ++ ++static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *old_nodes[GC_MERGE_NODES]) ++{ ++ struct btree *parent = btree_node_parent(iter, old_nodes[0]); ++ unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; ++ unsigned blocks = btree_blocks(c) * 2 / 3; ++ struct btree *new_nodes[GC_MERGE_NODES]; ++ struct btree_update *as; ++ struct keylist keylist; ++ struct bkey_format_state format_state; ++ struct bkey_format new_format; ++ ++ memset(new_nodes, 0, 
sizeof(new_nodes)); ++ bch2_keylist_init(&keylist, NULL); ++ ++ /* Count keys that are not deleted */ ++ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) ++ u64s += old_nodes[i]->nr.live_u64s; ++ ++ nr_old_nodes = nr_new_nodes = i; ++ ++ /* Check if all keys in @old_nodes could fit in one fewer node */ ++ if (nr_old_nodes <= 1 || ++ __vstruct_blocks(struct btree_node, c->block_bits, ++ DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) ++ return; ++ ++ /* Find a format that all keys in @old_nodes can pack into */ ++ bch2_bkey_format_init(&format_state); ++ ++ for (i = 0; i < nr_old_nodes; i++) ++ __bch2_btree_calc_format(&format_state, old_nodes[i]); ++ ++ new_format = bch2_bkey_format_done(&format_state); ++ ++ /* Check if repacking would make any nodes too big to fit */ ++ for (i = 0; i < nr_old_nodes; i++) ++ if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_FORMAT_FITS); ++ return; ++ } ++ ++ if (bch2_keylist_realloc(&keylist, NULL, 0, ++ (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); ++ return; ++ } ++ ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ btree_update_reserve_required(c, parent) + nr_old_nodes, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE, ++ NULL); ++ if (IS_ERR(as)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_RESERVE_GET); ++ bch2_keylist_free(&keylist, NULL); ++ return; ++ } ++ ++ trace_btree_gc_coalesce(c, old_nodes[0]); ++ ++ for (i = 0; i < nr_old_nodes; i++) ++ bch2_btree_interior_update_will_free_node(as, old_nodes[i]); ++ ++ /* Repack everything with @new_format and sort down to one bset */ ++ for (i = 0; i < nr_old_nodes; i++) ++ new_nodes[i] = ++ __bch2_btree_node_alloc_replacement(as, old_nodes[i], ++ new_format); ++ ++ /* ++ * Conceptually we concatenate the nodes together and slice them ++ * up at different boundaries. 
++ */ ++ for (i = nr_new_nodes - 1; i > 0; --i) { ++ struct btree *n1 = new_nodes[i]; ++ struct btree *n2 = new_nodes[i - 1]; ++ ++ struct bset *s1 = btree_bset_first(n1); ++ struct bset *s2 = btree_bset_first(n2); ++ struct bkey_packed *k, *last = NULL; ++ ++ /* Calculate how many keys from @n2 we could fit inside @n1 */ ++ u64s = 0; ++ ++ for (k = s2->start; ++ k < vstruct_last(s2) && ++ vstruct_blocks_plus(n1->data, c->block_bits, ++ u64s + k->u64s) <= blocks; ++ k = bkey_next_skip_noops(k, vstruct_last(s2))) { ++ last = k; ++ u64s += k->u64s; ++ } ++ ++ if (u64s == le16_to_cpu(s2->u64s)) { ++ /* n2 fits entirely in n1 */ ++ n1->key.k.p = n1->data->max_key = n2->data->max_key; ++ ++ memcpy_u64s(vstruct_last(s1), ++ s2->start, ++ le16_to_cpu(s2->u64s)); ++ le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); ++ ++ set_btree_bset_end(n1, n1->set); ++ ++ six_unlock_write(&n2->c.lock); ++ bch2_btree_node_free_never_inserted(c, n2); ++ six_unlock_intent(&n2->c.lock); ++ ++ memmove(new_nodes + i - 1, ++ new_nodes + i, ++ sizeof(new_nodes[0]) * (nr_new_nodes - i)); ++ new_nodes[--nr_new_nodes] = NULL; ++ } else if (u64s) { ++ /* move part of n2 into n1 */ ++ n1->key.k.p = n1->data->max_key = ++ bkey_unpack_pos(n1, last); ++ ++ n2->data->min_key = bkey_successor(n1->data->max_key); ++ ++ memcpy_u64s(vstruct_last(s1), ++ s2->start, u64s); ++ le16_add_cpu(&s1->u64s, u64s); ++ ++ memmove(s2->start, ++ vstruct_idx(s2, u64s), ++ (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); ++ s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); ++ ++ set_btree_bset_end(n1, n1->set); ++ set_btree_bset_end(n2, n2->set); ++ } ++ } ++ ++ for (i = 0; i < nr_new_nodes; i++) { ++ struct btree *n = new_nodes[i]; ++ ++ recalc_packed_keys(n); ++ btree_node_reset_sib_u64s(n); ++ ++ bch2_btree_build_aux_trees(n); ++ ++ bch2_btree_update_add_new_node(as, n); ++ six_unlock_write(&n->c.lock); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ } ++ ++ /* ++ * The keys for the old nodes get deleted. We don't want to insert keys ++ * that compare equal to the keys for the new nodes we'll also be ++ * inserting - we can't because keys on a keylist must be strictly ++ * greater than the previous keys, and we also don't need to since the ++ * key for the new node will serve the same purpose (overwriting the key ++ * for the old node). 
++ */ ++ for (i = 0; i < nr_old_nodes; i++) { ++ struct bkey_i delete; ++ unsigned j; ++ ++ for (j = 0; j < nr_new_nodes; j++) ++ if (!bkey_cmp(old_nodes[i]->key.k.p, ++ new_nodes[j]->key.k.p)) ++ goto next; ++ ++ bkey_init(&delete.k); ++ delete.k.p = old_nodes[i]->key.k.p; ++ bch2_keylist_add_in_order(&keylist, &delete); ++next: ++ i = i; ++ } ++ ++ /* ++ * Keys for the new nodes get inserted: bch2_btree_insert_keys() only ++ * does the lookup once and thus expects the keys to be in sorted order ++ * so we have to make sure the new keys are correctly ordered with ++ * respect to the deleted keys added in the previous loop ++ */ ++ for (i = 0; i < nr_new_nodes; i++) ++ bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); ++ ++ /* Insert the newly coalesced nodes */ ++ bch2_btree_insert_node(as, parent, iter, &keylist, 0); ++ ++ BUG_ON(!bch2_keylist_empty(&keylist)); ++ ++ BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); ++ ++ bch2_btree_iter_node_replace(iter, new_nodes[0]); ++ ++ for (i = 0; i < nr_new_nodes; i++) ++ bch2_btree_update_get_open_buckets(as, new_nodes[i]); ++ ++ /* Free the old nodes and update our sliding window */ ++ for (i = 0; i < nr_old_nodes; i++) { ++ bch2_btree_node_free_inmem(c, old_nodes[i], iter); ++ ++ /* ++ * the index update might have triggered a split, in which case ++ * the nodes we coalesced - the new nodes we just created - ++ * might not be sibling nodes anymore - don't add them to the ++ * sliding window (except the first): ++ */ ++ if (!i) { ++ old_nodes[i] = new_nodes[i]; ++ } else { ++ old_nodes[i] = NULL; ++ } ++ } ++ ++ for (i = 0; i < nr_new_nodes; i++) ++ six_unlock_intent(&new_nodes[i]->c.lock); ++ ++ bch2_btree_update_done(as); ++ bch2_keylist_free(&keylist, NULL); ++} ++ ++static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ unsigned i; ++ ++ /* Sliding window of adjacent btree nodes */ ++ struct btree *merge[GC_MERGE_NODES]; ++ u32 lock_seq[GC_MERGE_NODES]; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * XXX: We don't have a good way of positively matching on sibling nodes ++ * that have the same parent - this code works by handling the cases ++ * where they might not have the same parent, and is thus fragile. Ugh. ++ * ++ * Perhaps redo this to use multiple linked iterators? 
++ */ ++ memset(merge, 0, sizeof(merge)); ++ ++ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, ++ BTREE_MAX_DEPTH, 0, ++ BTREE_ITER_PREFETCH, b) { ++ memmove(merge + 1, merge, ++ sizeof(merge) - sizeof(merge[0])); ++ memmove(lock_seq + 1, lock_seq, ++ sizeof(lock_seq) - sizeof(lock_seq[0])); ++ ++ merge[0] = b; ++ ++ for (i = 1; i < GC_MERGE_NODES; i++) { ++ if (!merge[i] || ++ !six_relock_intent(&merge[i]->c.lock, lock_seq[i])) ++ break; ++ ++ if (merge[i]->c.level != merge[0]->c.level) { ++ six_unlock_intent(&merge[i]->c.lock); ++ break; ++ } ++ } ++ memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); ++ ++ bch2_coalesce_nodes(c, iter, merge); ++ ++ for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { ++ lock_seq[i] = merge[i]->c.lock.state.seq; ++ six_unlock_intent(&merge[i]->c.lock); ++ } ++ ++ lock_seq[0] = merge[0]->c.lock.state.seq; ++ ++ if (kthread && kthread_should_stop()) { ++ bch2_trans_exit(&trans); ++ return -ESHUTDOWN; ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ ++ /* ++ * If the parent node wasn't relocked, it might have been split ++ * and the nodes in our sliding window might not have the same ++ * parent anymore - blow away the sliding window: ++ */ ++ if (btree_iter_node(iter, iter->level + 1) && ++ !btree_node_intent_locked(iter, iter->level + 1)) ++ memset(merge + 1, 0, ++ (GC_MERGE_NODES - 1) * sizeof(merge[0])); ++ } ++ return bch2_trans_exit(&trans); ++} ++ ++/** ++ * bch_coalesce - coalesce adjacent nodes with low occupancy ++ */ ++void bch2_coalesce(struct bch_fs *c) ++{ ++ enum btree_id id; ++ ++ down_read(&c->gc_lock); ++ trace_gc_coalesce_start(c); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ int ret = c->btree_roots[id].b ++ ? bch2_coalesce_btree(c, id) ++ : 0; ++ ++ if (ret) { ++ if (ret != -ESHUTDOWN) ++ bch_err(c, "btree coalescing failed: %d", ret); ++ return; ++ } ++ } ++ ++ trace_gc_coalesce_end(c); ++ up_read(&c->gc_lock); ++} ++ ++static int bch2_gc_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ unsigned long last = atomic_long_read(&clock->now); ++ unsigned last_kick = atomic_read(&c->kick_gc); ++ int ret; ++ ++ set_freezable(); ++ ++ while (1) { ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ if (kthread_should_stop()) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (atomic_read(&c->kick_gc) != last_kick) ++ break; ++ ++ if (c->btree_gc_periodic) { ++ unsigned long next = last + c->capacity / 16; ++ ++ if (atomic_long_read(&clock->now) >= next) ++ break; ++ ++ bch2_io_clock_schedule_timeout(clock, next); ++ } else { ++ schedule(); ++ } ++ ++ try_to_freeze(); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ last = atomic_long_read(&clock->now); ++ last_kick = atomic_read(&c->kick_gc); ++ ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 ++ ret = bch2_gc(c, NULL, false, false); ++#else ++ ret = bch2_gc_gens(c); ++#endif ++ if (ret) ++ bch_err(c, "btree gc failed: %i", ret); ++ ++ debug_check_no_locks_held(); ++ } ++ ++ return 0; ++} ++ ++void bch2_gc_thread_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ p = c->gc_thread; ++ c->gc_thread = NULL; ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_gc_thread_start(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ BUG_ON(c->gc_thread); ++ ++ p = kthread_create(bch2_gc_thread, c, "bch_gc"); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ c->gc_thread = p; ++ wake_up_process(p); ++ return 0; ++} +diff 
--git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +new file mode 100644 +index 000000000000..3694a3df62a8 +--- /dev/null ++++ b/fs/bcachefs/btree_gc.h +@@ -0,0 +1,121 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_GC_H ++#define _BCACHEFS_BTREE_GC_H ++ ++#include "btree_types.h" ++ ++void bch2_coalesce(struct bch_fs *); ++ ++struct journal_keys; ++int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); ++int bch2_gc_gens(struct bch_fs *); ++void bch2_gc_thread_stop(struct bch_fs *); ++int bch2_gc_thread_start(struct bch_fs *); ++void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); ++ ++/* ++ * For concurrent mark and sweep (with other index updates), we define a total ++ * ordering of _all_ references GC walks: ++ * ++ * Note that some references will have the same GC position as others - e.g. ++ * everything within the same btree node; in those cases we're relying on ++ * whatever locking exists for where those references live, i.e. the write lock ++ * on a btree node. ++ * ++ * That locking is also required to ensure GC doesn't pass the updater in ++ * between the updater adding/removing the reference and updating the GC marks; ++ * without that, we would at best double count sometimes. ++ * ++ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ ++ * be held that prevents GC from passing the position the updater is at. ++ * ++ * (What about the start of gc, when we're clearing all the marks? GC clears the ++ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc ++ * position inside its cmpxchg loop, so crap magically works). ++ */ ++ ++/* Position of (the start of) a gc phase: */ ++static inline struct gc_pos gc_phase(enum gc_phase phase) ++{ ++ return (struct gc_pos) { ++ .phase = phase, ++ .pos = POS_MIN, ++ .level = 0, ++ }; ++} ++ ++static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) ++{ ++ if (l.phase != r.phase) ++ return l.phase < r.phase ? -1 : 1; ++ if (bkey_cmp(l.pos, r.pos)) ++ return bkey_cmp(l.pos, r.pos); ++ if (l.level != r.level) ++ return l.level < r.level ? -1 : 1; ++ return 0; ++} ++ ++static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) ++{ ++ switch (id) { ++#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; ++ BCH_BTREE_IDS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct gc_pos gc_pos_btree(enum btree_id id, ++ struct bpos pos, unsigned level) ++{ ++ return (struct gc_pos) { ++ .phase = btree_id_to_gc_phase(id), ++ .pos = pos, ++ .level = level, ++ }; ++} ++ ++/* ++ * GC position of the pointers within a btree node: note, _not_ for &b->key ++ * itself, that lives in the parent node: ++ */ ++static inline struct gc_pos gc_pos_btree_node(struct btree *b) ++{ ++ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); ++} ++ ++/* ++ * GC position of the pointer to a btree root: we don't use ++ * gc_pos_pointer_to_btree_node() here to avoid a potential race with ++ * btree_split() increasing the tree depth - the new root will have level > the ++ * old root and thus have a greater gc position than the old root, but that ++ * would be incorrect since once gc has marked the root it's not coming back. ++ */ ++static inline struct gc_pos gc_pos_btree_root(enum btree_id id) ++{ ++ return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); ++} ++ ++static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) ++{ ++ return (struct gc_pos) { ++ .phase = GC_PHASE_ALLOC, ++ .pos = POS(ob ? 
ob - c->open_buckets : 0, 0), ++ }; ++} ++ ++static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) ++{ ++ unsigned seq; ++ bool ret; ++ ++ do { ++ seq = read_seqcount_begin(&c->gc_pos_lock); ++ ret = gc_pos_cmp(pos, c->gc_pos) <= 0; ++ } while (read_seqcount_retry(&c->gc_pos_lock, seq)); ++ ++ return ret; ++} ++ ++#endif /* _BCACHEFS_BTREE_GC_H */ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +new file mode 100644 +index 000000000000..2f5097218f9c +--- /dev/null ++++ b/fs/bcachefs/btree_io.c +@@ -0,0 +1,1834 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static void verify_no_dups(struct btree *b, ++ struct bkey_packed *start, ++ struct bkey_packed *end, ++ bool extents) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bkey_packed *k, *p; ++ ++ if (start == end) ++ return; ++ ++ for (p = start, k = bkey_next_skip_noops(start, end); ++ k != end; ++ p = k, k = bkey_next_skip_noops(k, end)) { ++ struct bkey l = bkey_unpack_key(b, p); ++ struct bkey r = bkey_unpack_key(b, k); ++ ++ BUG_ON(extents ++ ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 ++ : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); ++ //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); ++ } ++#endif ++} ++ ++static void set_needs_whiteout(struct bset *i, int v) ++{ ++ struct bkey_packed *k; ++ ++ for (k = i->start; ++ k != vstruct_last(i); ++ k = bkey_next_skip_noops(k, vstruct_last(i))) ++ k->needs_whiteout = v; ++} ++ ++static void btree_bounce_free(struct bch_fs *c, size_t size, ++ bool used_mempool, void *p) ++{ ++ if (used_mempool) ++ mempool_free(p, &c->btree_bounce_pool); ++ else ++ vpfree(p, size); ++} ++ ++static void *btree_bounce_alloc(struct bch_fs *c, size_t size, ++ bool *used_mempool) ++{ ++ unsigned flags = memalloc_nofs_save(); ++ void *p; ++ ++ BUG_ON(size > btree_bytes(c)); ++ ++ *used_mempool = false; ++ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); ++ if (!p) { ++ *used_mempool = true; ++ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); ++ } ++ memalloc_nofs_restore(flags); ++ return p; ++} ++ ++static void sort_bkey_ptrs(const struct btree *bt, ++ struct bkey_packed **ptrs, unsigned nr) ++{ ++ unsigned n = nr, a = nr / 2, b, c, d; ++ ++ if (!a) ++ return; ++ ++ /* Heap sort: see lib/sort.c: */ ++ while (1) { ++ if (a) ++ a--; ++ else if (--n) ++ swap(ptrs[0], ptrs[n]); ++ else ++ break; ++ ++ for (b = a; c = 2 * b + 1, (d = c + 1) < n;) ++ b = bkey_cmp_packed(bt, ++ ptrs[c], ++ ptrs[d]) >= 0 ? 
c : d; ++ if (d == n) ++ b = c; ++ ++ while (b != a && ++ bkey_cmp_packed(bt, ++ ptrs[a], ++ ptrs[b]) >= 0) ++ b = (b - 1) / 2; ++ c = b; ++ while (b != a) { ++ b = (b - 1) / 2; ++ swap(ptrs[b], ptrs[c]); ++ } ++ } ++} ++ ++static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; ++ bool used_mempool = false; ++ size_t bytes = b->whiteout_u64s * sizeof(u64); ++ ++ if (!b->whiteout_u64s) ++ return; ++ ++ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ ptrs = ptrs_end = ((void *) new_whiteouts + bytes); ++ ++ for (k = unwritten_whiteouts_start(c, b); ++ k != unwritten_whiteouts_end(c, b); ++ k = bkey_next(k)) ++ *--ptrs = k; ++ ++ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); ++ ++ k = new_whiteouts; ++ ++ while (ptrs != ptrs_end) { ++ bkey_copy(k, *ptrs); ++ k = bkey_next(k); ++ ptrs++; ++ } ++ ++ verify_no_dups(b, new_whiteouts, ++ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), ++ btree_node_old_extent_overwrite(b)); ++ ++ memcpy_u64s(unwritten_whiteouts_start(c, b), ++ new_whiteouts, b->whiteout_u64s); ++ ++ btree_bounce_free(c, bytes, used_mempool, new_whiteouts); ++} ++ ++static bool should_compact_bset(struct btree *b, struct bset_tree *t, ++ bool compacting, enum compact_mode mode) ++{ ++ if (!bset_dead_u64s(b, t)) ++ return false; ++ ++ switch (mode) { ++ case COMPACT_LAZY: ++ return should_compact_bset_lazy(b, t) || ++ (compacting && !bset_written(b, bset(b, t))); ++ case COMPACT_ALL: ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_compact_extent_whiteouts(struct bch_fs *c, ++ struct btree *b, ++ enum compact_mode mode) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bset_tree *t; ++ struct bkey_packed *whiteouts = NULL; ++ struct bkey_packed *u_start, *u_pos; ++ struct sort_iter sort_iter; ++ unsigned bytes, whiteout_u64s = 0, u64s; ++ bool used_mempool, compacting = false; ++ ++ BUG_ON(!btree_node_is_extents(b)); ++ ++ for_each_bset(b, t) ++ if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) ++ whiteout_u64s += bset_dead_u64s(b, t); ++ ++ if (!whiteout_u64s) ++ return false; ++ ++ bch2_sort_whiteouts(c, b); ++ ++ sort_iter_init(&sort_iter, b); ++ ++ whiteout_u64s += b->whiteout_u64s; ++ bytes = whiteout_u64s * sizeof(u64); ++ ++ whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); ++ u_start = u_pos = whiteouts; ++ ++ memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), ++ b->whiteout_u64s); ++ u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); ++ ++ sort_iter_add(&sort_iter, u_start, u_pos); ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k, *n, *out, *start, *end; ++ struct btree_node_entry *src = NULL, *dst = NULL; ++ ++ if (t != b->set && !bset_written(b, i)) { ++ src = container_of(i, struct btree_node_entry, keys); ++ dst = max(write_block(b), ++ (void *) btree_bkey_last(b, t - 1)); ++ } ++ ++ if (src != dst) ++ compacting = true; ++ ++ if (!should_compact_bset(b, t, compacting, mode)) { ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src) + ++ le16_to_cpu(src->keys.u64s) * ++ sizeof(u64)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ continue; ++ } ++ ++ compacting = true; ++ u_start = u_pos; ++ start = i->start; ++ end = vstruct_last(i); ++ ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ ++ out = i->start; ++ ++ for (k = start; k != end; k = n) { ++ n = bkey_next_skip_noops(k, end); ++ ++ if (bkey_deleted(k)) ++ continue; 
++ ++ BUG_ON(bkey_whiteout(k) && ++ k->needs_whiteout && ++ bkey_written(b, k)); ++ ++ if (bkey_whiteout(k) && !k->needs_whiteout) ++ continue; ++ ++ if (bkey_whiteout(k)) { ++ memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); ++ set_bkeyp_val_u64s(f, u_pos, 0); ++ u_pos = bkey_next(u_pos); ++ } else { ++ bkey_copy(out, k); ++ out = bkey_next(out); ++ } ++ } ++ ++ sort_iter_add(&sort_iter, u_start, u_pos); ++ ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ } ++ ++ b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; ++ ++ BUG_ON((void *) unwritten_whiteouts_start(c, b) < ++ (void *) btree_bkey_last(b, bset_tree_last(b))); ++ ++ u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), ++ &sort_iter); ++ ++ BUG_ON(u64s > b->whiteout_u64s); ++ BUG_ON(u_pos != whiteouts && !u64s); ++ ++ if (u64s != b->whiteout_u64s) { ++ void *src = unwritten_whiteouts_start(c, b); ++ ++ b->whiteout_u64s = u64s; ++ memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); ++ } ++ ++ verify_no_dups(b, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b), ++ true); ++ ++ btree_bounce_free(c, bytes, used_mempool, whiteouts); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ bch_btree_keys_u64s_remaining(c, b); ++ bch2_verify_btree_nr_keys(b); ++ ++ return true; ++} ++ ++static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) ++{ ++ struct bset_tree *t; ++ bool ret = false; ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k, *n, *out, *start, *end; ++ struct btree_node_entry *src = NULL, *dst = NULL; ++ ++ if (t != b->set && !bset_written(b, i)) { ++ src = container_of(i, struct btree_node_entry, keys); ++ dst = max(write_block(b), ++ (void *) btree_bkey_last(b, t - 1)); ++ } ++ ++ if (src != dst) ++ ret = true; ++ ++ if (!should_compact_bset(b, t, ret, mode)) { ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src) + ++ le16_to_cpu(src->keys.u64s) * ++ sizeof(u64)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ continue; ++ } ++ ++ start = btree_bkey_first(b, t); ++ end = btree_bkey_last(b, t); ++ ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ ++ out = i->start; ++ ++ for (k = start; k != end; k = n) { ++ n = bkey_next_skip_noops(k, end); ++ ++ if (!bkey_whiteout(k)) { ++ bkey_copy(out, k); ++ out = bkey_next(out); ++ } else { ++ BUG_ON(k->needs_whiteout); ++ } ++ } ++ ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ ret = true; ++ } ++ ++ bch2_verify_btree_nr_keys(b); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return ret; ++} ++ ++bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, ++ enum compact_mode mode) ++{ ++ return !btree_node_old_extent_overwrite(b) ++ ? 
bch2_drop_whiteouts(b, mode) ++ : bch2_compact_extent_whiteouts(c, b, mode); ++} ++ ++static void btree_node_sort(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter, ++ unsigned start_idx, ++ unsigned end_idx, ++ bool filter_whiteouts) ++{ ++ struct btree_node *out; ++ struct sort_iter sort_iter; ++ struct bset_tree *t; ++ struct bset *start_bset = bset(b, &b->set[start_idx]); ++ bool used_mempool = false; ++ u64 start_time, seq = 0; ++ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; ++ bool sorting_entire_node = start_idx == 0 && ++ end_idx == b->nsets; ++ ++ sort_iter_init(&sort_iter, b); ++ ++ for (t = b->set + start_idx; ++ t < b->set + end_idx; ++ t++) { ++ u64s += le16_to_cpu(bset(b, t)->u64s); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ } ++ ++ bytes = sorting_entire_node ++ ? btree_bytes(c) ++ : __vstruct_bytes(struct btree_node, u64s); ++ ++ out = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ start_time = local_clock(); ++ ++ if (btree_node_old_extent_overwrite(b)) ++ filter_whiteouts = bset_written(b, start_bset); ++ ++ u64s = (btree_node_old_extent_overwrite(b) ++ ? bch2_sort_extents ++ : bch2_sort_keys)(out->keys.start, ++ &sort_iter, ++ filter_whiteouts); ++ ++ out->keys.u64s = cpu_to_le16(u64s); ++ ++ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); ++ ++ if (sorting_entire_node) ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ /* Make sure we preserve bset journal_seq: */ ++ for (t = b->set + start_idx; t < b->set + end_idx; t++) ++ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); ++ start_bset->journal_seq = cpu_to_le64(seq); ++ ++ if (sorting_entire_node) { ++ unsigned u64s = le16_to_cpu(out->keys.u64s); ++ ++ BUG_ON(bytes != btree_bytes(c)); ++ ++ /* ++ * Our temporary buffer is the same size as the btree node's ++ * buffer, we can just swap buffers instead of doing a big ++ * memcpy() ++ */ ++ *out = *b->data; ++ out->keys.u64s = cpu_to_le16(u64s); ++ swap(out, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ } else { ++ start_bset->u64s = out->keys.u64s; ++ memcpy_u64s(start_bset->start, ++ out->keys.start, ++ le16_to_cpu(out->keys.u64s)); ++ } ++ ++ for (i = start_idx + 1; i < end_idx; i++) ++ b->nr.bset_u64s[start_idx] += ++ b->nr.bset_u64s[i]; ++ ++ b->nsets -= shift; ++ ++ for (i = start_idx + 1; i < b->nsets; i++) { ++ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; ++ b->set[i] = b->set[i + shift]; ++ } ++ ++ for (i = b->nsets; i < MAX_BSETS; i++) ++ b->nr.bset_u64s[i] = 0; ++ ++ set_btree_bset_end(b, &b->set[start_idx]); ++ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); ++ ++ btree_bounce_free(c, bytes, used_mempool, out); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_btree_sort_into(struct bch_fs *c, ++ struct btree *dst, ++ struct btree *src) ++{ ++ struct btree_nr_keys nr; ++ struct btree_node_iter src_iter; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(dst->nsets != 1); ++ ++ bch2_bset_set_no_aux_tree(dst, dst->set); ++ ++ bch2_btree_node_iter_init_from_start(&src_iter, src); ++ ++ if (btree_node_is_extents(src)) ++ nr = bch2_sort_repack_merge(c, btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); ++ else ++ nr = bch2_sort_repack(btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ set_btree_bset_end(dst, dst->set); ++ ++ dst->nr.live_u64s += nr.live_u64s; ++ dst->nr.bset_u64s[0] += 
nr.bset_u64s[0]; ++ dst->nr.packed_keys += nr.packed_keys; ++ dst->nr.unpacked_keys += nr.unpacked_keys; ++ ++ bch2_verify_btree_nr_keys(dst); ++} ++ ++#define SORT_CRIT (4096 / sizeof(u64)) ++ ++/* ++ * We're about to add another bset to the btree node, so if there's currently ++ * too many bsets - sort some of them together: ++ */ ++static bool btree_node_compact(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ unsigned unwritten_idx; ++ bool ret = false; ++ ++ for (unwritten_idx = 0; ++ unwritten_idx < b->nsets; ++ unwritten_idx++) ++ if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) ++ break; ++ ++ if (b->nsets - unwritten_idx > 1) { ++ btree_node_sort(c, b, iter, unwritten_idx, ++ b->nsets, false); ++ ret = true; ++ } ++ ++ if (unwritten_idx > 1) { ++ btree_node_sort(c, b, iter, 0, unwritten_idx, false); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++void bch2_btree_build_aux_trees(struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ bch2_bset_build_aux_tree(b, t, ++ !bset_written(b, bset(b, t)) && ++ t == bset_tree_last(b)); ++} ++ ++/* ++ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be ++ * inserted into ++ * ++ * Safe to call if there already is an unwritten bset - will only add a new bset ++ * if @b doesn't already have one. ++ * ++ * Returns true if we sorted (i.e. invalidated iterators ++ */ ++void bch2_btree_init_next(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct btree_node_entry *bne; ++ bool did_sort; ++ ++ EBUG_ON(!(b->c.lock.state.seq & 1)); ++ EBUG_ON(iter && iter->l[b->c.level].b != b); ++ ++ did_sort = btree_node_compact(c, b, iter); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ if (iter && did_sort) ++ bch2_btree_iter_reinit_node(iter, b); ++} ++ ++static void btree_err_msg(struct printbuf *out, struct bch_fs *c, ++ struct btree *b, struct bset *i, ++ unsigned offset, int write) ++{ ++ pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" ++ "pos ", ++ write ? "before write " : "", ++ b->c.btree_id, b->c.level, ++ c->btree_roots[b->c.btree_id].level); ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ ++ pr_buf(out, " node offset %u", b->written); ++ if (i) ++ pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); ++} ++ ++enum btree_err_type { ++ BTREE_ERR_FIXABLE, ++ BTREE_ERR_WANT_RETRY, ++ BTREE_ERR_MUST_RETRY, ++ BTREE_ERR_FATAL, ++}; ++ ++enum btree_validate_ret { ++ BTREE_RETRY_READ = 64, ++}; ++ ++#define btree_err(type, c, b, i, msg, ...) 
\ ++({ \ ++ __label__ out; \ ++ char _buf[300]; \ ++ struct printbuf out = PBUF(_buf); \ ++ \ ++ btree_err_msg(&out, c, b, i, b->written, write); \ ++ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ ++ \ ++ if (type == BTREE_ERR_FIXABLE && \ ++ write == READ && \ ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ ++ mustfix_fsck_err(c, "%s", _buf); \ ++ goto out; \ ++ } \ ++ \ ++ switch (write) { \ ++ case READ: \ ++ bch_err(c, "%s", _buf); \ ++ \ ++ switch (type) { \ ++ case BTREE_ERR_FIXABLE: \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ case BTREE_ERR_WANT_RETRY: \ ++ if (have_retry) { \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case BTREE_ERR_MUST_RETRY: \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ case BTREE_ERR_FATAL: \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write: %s", _buf); \ ++ \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++out: \ ++ true; \ ++}) ++ ++#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) ++ ++static int validate_bset(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned sectors, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ const char *err; ++ int ret = 0; ++ ++ btree_err_on((version != BCH_BSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, ++ BTREE_ERR_FATAL, c, b, i, ++ "unsupported bset version"); ++ ++ if (btree_err_on(b->written + sectors > c->opts.btree_node_size, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "bset past end of btree node")) { ++ i->u64s = 0; ++ return 0; ++ } ++ ++ btree_err_on(b->written && !i->u64s, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "empty bset"); ++ ++ if (!b->written) { ++ struct btree_node *bn = ++ container_of(i, struct btree_node, keys); ++ /* These indicate that we read the wrong btree node: */ ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ /* XXX endianness */ ++ btree_err_on(bp->seq != bn->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "incorrect sequence number (wrong btree node)"); ++ } ++ ++ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect btree id"); ++ ++ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect level"); ++ ++ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { ++ u64 *p = (u64 *) &bn->ptr; ++ ++ *p = swab64(*p); ++ } ++ ++ if (!write) ++ compat_btree_node(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "incorrect min_key: got %llu:%llu should be %llu:%llu", ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ bp->min_key.inode, ++ bp->min_key.offset); ++ } ++ ++ btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect max key"); ++ ++ if (write) ++ compat_btree_node(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ ++ /* XXX: ideally we would be validating min_key too */ ++#if 0 ++ /* ++ * not correct anymore, due to btree node write error ++ * handling ++ * ++ * need to add bn->seq to btree keys and 
verify ++ * against that ++ */ ++ btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), ++ bn->ptr), ++ BTREE_ERR_FATAL, c, b, i, ++ "incorrect backpointer"); ++#endif ++ err = bch2_bkey_format_validate(&bn->format); ++ btree_err_on(err, ++ BTREE_ERR_FATAL, c, b, i, ++ "invalid bkey format: %s", err); ++ ++ compat_bformat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &bn->format); ++ } ++fsck_err: ++ return ret; ++} ++ ++static int validate_bset_keys(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned *whiteout_u64s, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ struct bkey_packed *k, *prev = NULL; ++ bool seen_non_whiteout = false; ++ int ret = 0; ++ ++ if (!BSET_SEPARATE_WHITEOUTS(i)) { ++ seen_non_whiteout = true; ++ *whiteout_u64s = 0; ++ } ++ ++ for (k = i->start; ++ k != vstruct_last(i);) { ++ struct bkey_s u; ++ struct bkey tmp; ++ const char *invalid; ++ ++ if (btree_err_on(bkey_next(k) > vstruct_last(i), ++ BTREE_ERR_FIXABLE, c, b, i, ++ "key extends past end of bset")) { ++ i->u64s = cpu_to_le16((u64 *) k - i->_data); ++ break; ++ } ++ ++ if (btree_err_on(k->format > KEY_FORMAT_CURRENT, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey format %u", k->format)) { ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ /* XXX: validate k->u64s */ ++ if (!write) ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); ++ ++ u = __bkey_disassemble(b, k, &tmp); ++ ++ invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, u.s_c) ?: ++ (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ btree_err(BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey:\n%s\n%s", invalid, buf); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ if (write) ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); ++ ++ /* ++ * with the separate whiteouts thing (used for extents), the ++ * second set of keys actually can have whiteouts too, so we ++ * can't solely go off bkey_whiteout()... 
++ */ ++ ++ if (!seen_non_whiteout && ++ (!bkey_whiteout(k) || ++ (prev && bkey_iter_cmp(b, prev, k) > 0))) { ++ *whiteout_u64s = k->_data - i->_data; ++ seen_non_whiteout = true; ++ } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { ++ char buf1[80]; ++ char buf2[80]; ++ struct bkey up = bkey_unpack_key(b, prev); ++ ++ bch2_bkey_to_text(&PBUF(buf1), &up); ++ bch2_bkey_to_text(&PBUF(buf2), u.k); ++ ++ bch2_dump_bset(c, b, i, 0); ++ btree_err(BTREE_ERR_FATAL, c, b, i, ++ "keys out of order: %s > %s", ++ buf1, buf2); ++ /* XXX: repair this */ ++ } ++ ++ prev = k; ++ k = bkey_next_skip_noops(k, vstruct_last(i)); ++ } ++fsck_err: ++ return ret; ++} ++ ++int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) ++{ ++ struct btree_node_entry *bne; ++ struct sort_iter *iter; ++ struct btree_node *sorted; ++ struct bkey_packed *k; ++ struct bch_extent_ptr *ptr; ++ struct bset *i; ++ bool used_mempool, blacklisted; ++ unsigned u64s; ++ int ret, retry_read = 0, write = READ; ++ ++ iter = mempool_alloc(&c->fill_iter, GFP_NOIO); ++ sort_iter_init(iter, b); ++ iter->size = (btree_blocks(c) + 1) * 2; ++ ++ if (bch2_meta_read_fault("btree")) ++ btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "dynamic fault"); ++ ++ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "bad magic"); ++ ++ btree_err_on(!b->data->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "bad btree header"); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ btree_err_on(b->data->keys.seq != bp->seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "got wrong btree node (seq %llx want %llx)", ++ b->data->keys.seq, bp->seq); ++ } ++ ++ while (b->written < c->opts.btree_node_size) { ++ unsigned sectors, whiteout_u64s = 0; ++ struct nonce nonce; ++ struct bch_csum csum; ++ bool first = !b->written; ++ ++ if (!b->written) { ++ i = &b->data->keys; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "unknown checksum type"); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); ++ ++ btree_err_on(bch2_crc_cmp(csum, b->data->csum), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "invalid checksum"); ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ if (btree_node_is_extents(b) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { ++ set_btree_node_old_extent_overwrite(b); ++ set_btree_node_need_rewrite(b); ++ } ++ ++ sectors = vstruct_sectors(b->data, c->block_bits); ++ } else { ++ bne = write_block(b); ++ i = &bne->keys; ++ ++ if (i->seq != b->data->keys.seq) ++ break; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "unknown checksum type"); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ btree_err_on(bch2_crc_cmp(csum, bne->csum), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "invalid checksum"); ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ sectors = vstruct_sectors(bne, c->block_bits); ++ } ++ ++ ret = validate_bset(c, b, i, sectors, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ if (!b->written) ++ btree_node_set_format(b, b->data->format); ++ ++ ret = validate_bset_keys(c, b, i, &whiteout_u64s, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ b->written += sectors; ++ ++ blacklisted = bch2_journal_seq_is_blacklisted(c, ++ 
le64_to_cpu(i->journal_seq), ++ true); ++ ++ btree_err_on(blacklisted && first, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "first btree node bset has blacklisted journal seq"); ++ if (blacklisted && !first) ++ continue; ++ ++ sort_iter_add(iter, i->start, ++ vstruct_idx(i, whiteout_u64s)); ++ ++ sort_iter_add(iter, ++ vstruct_idx(i, whiteout_u64s), ++ vstruct_last(i)); ++ } ++ ++ for (bne = write_block(b); ++ bset_byte_offset(b, bne) < btree_bytes(c); ++ bne = (void *) bne + block_bytes(c)) ++ btree_err_on(bne->keys.seq == b->data->keys.seq, ++ BTREE_ERR_WANT_RETRY, c, b, NULL, ++ "found bset signature after last bset"); ++ ++ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); ++ sorted->keys.u64s = 0; ++ ++ set_btree_bset(b, b->set, &b->data->keys); ++ ++ b->nr = (btree_node_old_extent_overwrite(b) ++ ? bch2_extent_sort_fix_overlapping ++ : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); ++ ++ u64s = le16_to_cpu(sorted->keys.u64s); ++ *sorted = *b->data; ++ sorted->keys.u64s = cpu_to_le16(u64s); ++ swap(sorted, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ b->nsets = 1; ++ ++ BUG_ON(b->nr.live_u64s != u64s); ++ ++ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); ++ ++ i = &b->data->keys; ++ for (k = i->start; k != vstruct_last(i);) { ++ struct bkey tmp; ++ struct bkey_s u = __bkey_disassemble(b, k, &tmp); ++ const char *invalid = bch2_bkey_val_invalid(c, u.s_c); ++ ++ if (invalid || ++ (inject_invalid_keys(c) && ++ !bversion_cmp(u.k->version, MAX_VERSION))) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ btree_err(BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey %s: %s", buf, invalid); ++ ++ btree_keys_account_key_drop(&b->nr, 0, k); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ set_btree_bset_end(b, b->set); ++ continue; ++ } ++ ++ if (u.k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); ++ ++ bp.v->mem_ptr = 0; ++ } ++ ++ k = bkey_next_skip_noops(k, vstruct_last(i)); ++ } ++ ++ bch2_bset_build_aux_tree(b, b->set, false); ++ ++ set_needs_whiteout(btree_bset_first(b), true); ++ ++ btree_node_reset_sib_u64s(b); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_RW) ++ set_btree_node_need_rewrite(b); ++ } ++out: ++ mempool_free(iter, &c->fill_iter); ++ return retry_read; ++fsck_err: ++ if (ret == BTREE_RETRY_READ) { ++ retry_read = 1; ++ } else { ++ bch2_inconsistent_error(c); ++ set_btree_node_read_error(b); ++ } ++ goto out; ++} ++ ++static void btree_node_read_work(struct work_struct *work) ++{ ++ struct btree_read_bio *rb = ++ container_of(work, struct btree_read_bio, work); ++ struct bch_fs *c = rb->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ struct btree *b = rb->bio.bi_private; ++ struct bio *bio = &rb->bio; ++ struct bch_io_failures failed = { .nr = 0 }; ++ bool can_retry; ++ ++ goto start; ++ while (1) { ++ bch_info(c, "retrying read"); ++ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ bio_reset(bio); ++ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; ++ bio->bi_iter.bi_sector = rb->pick.ptr.offset; ++ bio->bi_iter.bi_size = btree_bytes(c); ++ ++ if (rb->have_ioref) { ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ submit_bio_wait(bio); ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ } ++start: 
++ bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", ++ bch2_blk_status_to_str(bio->bi_status)); ++ if (rb->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ rb->have_ioref = false; ++ ++ bch2_mark_io_failure(&failed, &rb->pick); ++ ++ can_retry = bch2_bkey_pick_read_device(c, ++ bkey_i_to_s_c(&b->key), ++ &failed, &rb->pick) > 0; ++ ++ if (!bio->bi_status && ++ !bch2_btree_node_read_done(c, b, can_retry)) ++ break; ++ ++ if (!can_retry) { ++ set_btree_node_read_error(b); ++ break; ++ } ++ } ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], ++ rb->start_time); ++ bio_put(&rb->bio); ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++} ++ ++static void btree_node_read_endio(struct bio *bio) ++{ ++ struct btree_read_bio *rb = ++ container_of(bio, struct btree_read_bio, bio); ++ struct bch_fs *c = rb->c; ++ ++ if (rb->have_ioref) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ bch2_latency_acct(ca, rb->start_time, READ); ++ } ++ ++ queue_work(system_unbound_wq, &rb->work); ++} ++ ++void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ++ bool sync) ++{ ++ struct extent_ptr_decoded pick; ++ struct btree_read_bio *rb; ++ struct bch_dev *ca; ++ struct bio *bio; ++ int ret; ++ ++ trace_btree_read(c, b); ++ ++ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ++ NULL, &pick); ++ if (bch2_fs_fatal_err_on(ret <= 0, c, ++ "btree node read error: no device to read from")) { ++ set_btree_node_read_error(b); ++ return; ++ } ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, ++ btree_bytes(c)), ++ &c->btree_bio); ++ rb = container_of(bio, struct btree_read_bio, bio); ++ rb->c = c; ++ rb->start_time = local_clock(); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ rb->pick = pick; ++ INIT_WORK(&rb->work, btree_node_read_work); ++ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bio->bi_end_io = btree_node_read_endio; ++ bio->bi_private = b; ++ bch2_bio_map(bio, b->data, btree_bytes(c)); ++ ++ set_btree_node_read_in_flight(b); ++ ++ if (rb->have_ioref) { ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], ++ bio_sectors(bio)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ ++ if (sync) { ++ submit_bio_wait(bio); ++ ++ bio->bi_private = b; ++ btree_node_read_work(&rb->work); ++ } else { ++ submit_bio(bio); ++ } ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ ++ if (sync) ++ btree_node_read_work(&rb->work); ++ else ++ queue_work(system_unbound_wq, &rb->work); ++ ++ } ++} ++ ++int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, ++ const struct bkey_i *k, unsigned level) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ BUG_ON(IS_ERR(b)); ++ ++ bkey_copy(&b->key, k); ++ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); ++ ++ bch2_btree_node_read(c, b, true); ++ ++ if (btree_node_read_error(b)) { ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_btree_set_root_for_read(c, b); ++err: ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ return ret; ++} ++ 
++void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, ++ struct btree_write *w) ++{ ++ unsigned long old, new, v = READ_ONCE(b->will_make_reachable); ++ ++ do { ++ old = new = v; ++ if (!(old & 1)) ++ break; ++ ++ new &= ~1UL; ++ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); ++ ++ if (old & 1) ++ closure_put(&((struct btree_update *) new)->cl); ++ ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++} ++ ++static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_write *w = btree_prev_write(b); ++ ++ bch2_btree_complete_write(c, b, w); ++ btree_node_io_unlock(b); ++} ++ ++static void bch2_btree_node_write_error(struct bch_fs *c, ++ struct btree_write_bio *wbio) ++{ ++ struct btree *b = wbio->wbio.bio.bi_private; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ struct bch_extent_ptr *ptr; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, ++ BTREE_MAX_DEPTH, b->c.level, 0); ++retry: ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto err; ++ ++ /* has node been freed? */ ++ if (iter->l[b->c.level].b != b) { ++ /* node has been freed: */ ++ BUG_ON(!btree_node_dying(b)); ++ goto out; ++ } ++ ++ BUG_ON(!btree_node_hashed(b)); ++ ++ bkey_copy(&tmp.k, &b->key); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, ++ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); ++ ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) ++ goto err; ++ ++ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); ++ if (ret == -EINTR) ++ goto retry; ++ if (ret) ++ goto err; ++out: ++ bch2_trans_exit(&trans); ++ bio_put(&wbio->wbio.bio); ++ btree_node_write_done(c, b); ++ return; ++err: ++ set_btree_node_noevict(b); ++ bch2_fs_fatal_error(c, "fatal error writing btree node"); ++ goto out; ++} ++ ++void bch2_btree_write_error_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ btree_write_error_work); ++ struct bio *bio; ++ ++ while (1) { ++ spin_lock_irq(&c->btree_write_error_lock); ++ bio = bio_list_pop(&c->btree_write_error_list); ++ spin_unlock_irq(&c->btree_write_error_lock); ++ ++ if (!bio) ++ break; ++ ++ bch2_btree_node_write_error(c, ++ container_of(bio, struct btree_write_bio, wbio.bio)); ++ } ++} ++ ++static void btree_node_write_work(struct work_struct *work) ++{ ++ struct btree_write_bio *wbio = ++ container_of(work, struct btree_write_bio, work); ++ struct bch_fs *c = wbio->wbio.c; ++ struct btree *b = wbio->wbio.bio.bi_private; ++ ++ btree_bounce_free(c, ++ wbio->bytes, ++ wbio->wbio.used_mempool, ++ wbio->data); ++ ++ if (wbio->wbio.failed.nr) { ++ unsigned long flags; ++ ++ spin_lock_irqsave(&c->btree_write_error_lock, flags); ++ bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); ++ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); ++ ++ queue_work(c->wq, &c->btree_write_error_work); ++ return; ++ } ++ ++ bio_put(&wbio->wbio.bio); ++ btree_node_write_done(c, b); ++} ++ ++static void btree_node_write_endio(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; ++ struct bch_write_bio *orig = parent ?: wbio; ++ struct bch_fs *c = wbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ unsigned long flags; ++ ++ if (wbio->have_ioref) ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", ++ bch2_blk_status_to_str(bio->bi_status)) || ++ bch2_meta_write_fault("btree")) { ++ spin_lock_irqsave(&c->btree_write_error_lock, flags); ++ bch2_dev_list_add_dev(&orig->failed, wbio->dev); ++ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); ++ } ++ ++ if (wbio->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ ++ if (parent) { ++ bio_put(bio); ++ bio_endio(&parent->bio); ++ } else { ++ struct btree_write_bio *wb = ++ container_of(orig, struct btree_write_bio, wbio); ++ ++ INIT_WORK(&wb->work, btree_node_write_work); ++ queue_work(system_unbound_wq, &wb->work); ++ } ++} ++ ++static int validate_bset_for_write(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned sectors) ++{ ++ unsigned whiteout_u64s = 0; ++ int ret; ++ ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) ++ return -1; ++ ++ ret = validate_bset(c, b, i, sectors, WRITE, false) ?: ++ validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); ++ if (ret) ++ bch2_inconsistent_error(c); ++ ++ return ret; ++} ++ ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_type_held) ++{ ++ struct btree_write_bio *wbio; ++ struct bset_tree *t; ++ struct bset *i; ++ struct btree_node *bn = NULL; ++ struct btree_node_entry *bne = NULL; ++ BKEY_PADDED(key) k; ++ struct bch_extent_ptr *ptr; ++ struct sort_iter sort_iter; ++ struct nonce nonce; ++ unsigned bytes_to_write, sectors_to_write, bytes, u64s; ++ u64 seq = 0; ++ bool used_mempool; ++ unsigned long old, new; ++ bool validate_before_checksum = false; ++ void *data; ++ ++ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ return; ++ ++ /* ++ * We may only have a read lock on the btree node - the dirty bit is our ++ * "lock" against racing with other threads that may be trying to start ++ * a write, we do a write iff we clear the dirty bit. Since setting the ++ * dirty bit requires a write lock, we can't race with other threads ++ * redirtying it: ++ */ ++ do { ++ old = new = READ_ONCE(b->flags); ++ ++ if (!(old & (1 << BTREE_NODE_dirty))) ++ return; ++ ++ if (!btree_node_may_write(b)) ++ return; ++ ++ if (old & (1 << BTREE_NODE_write_in_flight)) { ++ btree_node_wait_on_io(b); ++ continue; ++ } ++ ++ new &= ~(1 << BTREE_NODE_dirty); ++ new &= ~(1 << BTREE_NODE_need_write); ++ new |= (1 << BTREE_NODE_write_in_flight); ++ new |= (1 << BTREE_NODE_just_written); ++ new ^= (1 << BTREE_NODE_write_idx); ++ } while (cmpxchg_acquire(&b->flags, old, new) != old); ++ ++ BUG_ON(btree_node_fake(b)); ++ BUG_ON((b->will_make_reachable != 0) != !b->written); ++ ++ BUG_ON(b->written >= c->opts.btree_node_size); ++ BUG_ON(b->written & (c->opts.block_size - 1)); ++ BUG_ON(bset_written(b, btree_bset_last(b))); ++ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); ++ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); ++ ++ bch2_sort_whiteouts(c, b); ++ ++ sort_iter_init(&sort_iter, b); ++ ++ bytes = !b->written ++ ? 
sizeof(struct btree_node) ++ : sizeof(struct btree_node_entry); ++ ++ bytes += b->whiteout_u64s * sizeof(u64); ++ ++ for_each_bset(b, t) { ++ i = bset(b, t); ++ ++ if (bset_written(b, i)) ++ continue; ++ ++ bytes += le16_to_cpu(i->u64s) * sizeof(u64); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ seq = max(seq, le64_to_cpu(i->journal_seq)); ++ } ++ ++ data = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ if (!b->written) { ++ bn = data; ++ *bn = *b->data; ++ i = &bn->keys; ++ } else { ++ bne = data; ++ bne->keys = b->data->keys; ++ i = &bne->keys; ++ } ++ ++ i->journal_seq = cpu_to_le64(seq); ++ i->u64s = 0; ++ ++ if (!btree_node_old_extent_overwrite(b)) { ++ sort_iter_add(&sort_iter, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b)); ++ SET_BSET_SEPARATE_WHITEOUTS(i, false); ++ } else { ++ memcpy_u64s(i->start, ++ unwritten_whiteouts_start(c, b), ++ b->whiteout_u64s); ++ i->u64s = cpu_to_le16(b->whiteout_u64s); ++ SET_BSET_SEPARATE_WHITEOUTS(i, true); ++ } ++ ++ b->whiteout_u64s = 0; ++ ++ u64s = btree_node_old_extent_overwrite(b) ++ ? bch2_sort_extents(vstruct_last(i), &sort_iter, false) ++ : bch2_sort_keys(i->start, &sort_iter, false); ++ le16_add_cpu(&i->u64s, u64s); ++ ++ set_needs_whiteout(i, false); ++ ++ /* do we have data to write? */ ++ if (b->written && !i->u64s) ++ goto nowrite; ++ ++ bytes_to_write = vstruct_end(i) - data; ++ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; ++ ++ memset(data + bytes_to_write, 0, ++ (sectors_to_write << 9) - bytes_to_write); ++ ++ BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); ++ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); ++ BUG_ON(i->seq != b->data->keys.seq); ++ ++ i->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ ? cpu_to_le16(BCH_BSET_VERSION_OLD) ++ : cpu_to_le16(c->sb.version); ++ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); ++ ++ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) ++ validate_before_checksum = true; ++ ++ /* validate_bset will be modifying: */ ++ if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) ++ validate_before_checksum = true; ++ ++ /* if we're going to be encrypting, check metadata validity first: */ ++ if (validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ ++ if (bn) ++ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); ++ else ++ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ /* if we're not encrypting, check metadata after checksumming: */ ++ if (!validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ /* ++ * We handle btree write errors by immediately halting the journal - ++ * after we've done that, we can't issue any subsequent btree writes ++ * because they might have pointers to new nodes that failed to write. 
++ * ++ * Furthermore, there's no point in doing any more btree writes because ++ * with the journal stopped, we're never going to update the journal to ++ * reflect that those writes were done and the data flushed from the ++ * journal: ++ * ++ * Also on journal error, the pending write may have updates that were ++ * never journalled (interior nodes, see btree_update_nodes_written()) - ++ * it's critical that we don't do the write in that case otherwise we ++ * will have updates visible that weren't in the journal: ++ * ++ * Make sure to update b->written so bch2_btree_init_next() doesn't ++ * break: ++ */ ++ if (bch2_journal_error(&c->journal) || ++ c->opts.nochanges) ++ goto err; ++ ++ trace_btree_write(b, bytes_to_write, sectors_to_write); ++ ++ wbio = container_of(bio_alloc_bioset(GFP_NOIO, ++ buf_pages(data, sectors_to_write << 9), ++ &c->btree_bio), ++ struct btree_write_bio, wbio.bio); ++ wbio_init(&wbio->wbio.bio); ++ wbio->data = data; ++ wbio->bytes = bytes; ++ wbio->wbio.used_mempool = used_mempool; ++ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; ++ wbio->wbio.bio.bi_end_io = btree_node_write_endio; ++ wbio->wbio.bio.bi_private = b; ++ ++ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); ++ ++ /* ++ * If we're appending to a leaf node, we don't technically need FUA - ++ * this write just needs to be persisted before the next journal write, ++ * which will be marked FLUSH|FUA. ++ * ++ * Similarly if we're writing a new btree root - the pointer is going to ++ * be in the next journal entry. ++ * ++ * But if we're writing a new btree node (that isn't a root) or ++ * appending to a non leaf btree node, we need either FUA or a flush ++ * when we write the parent with the new pointer. FUA is cheaper than a ++ * flush, and writes appending to leaf nodes aren't blocking anything so ++ * just make all btree node writes FUA to keep things sane. ++ */ ++ ++ bkey_copy(&k.key, &b->key); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) ++ ptr->offset += b->written; ++ ++ b->written += sectors_to_write; ++ ++ /* XXX: submitting IO with btree locks held: */ ++ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); ++ return; ++err: ++ set_btree_node_noevict(b); ++ b->written += sectors_to_write; ++nowrite: ++ btree_bounce_free(c, bytes, used_mempool, data); ++ btree_node_write_done(c, b); ++} ++ ++/* ++ * Work that must be done with write lock held: ++ */ ++bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) ++{ ++ bool invalidated_iter = false; ++ struct btree_node_entry *bne; ++ struct bset_tree *t; ++ ++ if (!btree_node_just_written(b)) ++ return false; ++ ++ BUG_ON(b->whiteout_u64s); ++ ++ clear_btree_node_just_written(b); ++ ++ /* ++ * Note: immediately after write, bset_written() doesn't work - the ++ * amount of data we had to write after compaction might have been ++ * smaller than the offset of the last bset. 
++ * ++ * However, we know that all bsets have been written here, as long as ++ * we're still holding the write lock: ++ */ ++ ++ /* ++ * XXX: decide if we really want to unconditionally sort down to a ++ * single bset: ++ */ ++ if (b->nsets > 1) { ++ btree_node_sort(c, b, NULL, 0, b->nsets, true); ++ invalidated_iter = true; ++ } else { ++ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); ++ } ++ ++ for_each_bset(b, t) ++ set_needs_whiteout(bset(b, t), true); ++ ++ bch2_btree_verify(c, b); ++ ++ /* ++ * If later we don't unconditionally sort down to a single bset, we have ++ * to ensure this is still true: ++ */ ++ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return invalidated_iter; ++} ++ ++/* ++ * Use this one if the node is intent locked: ++ */ ++void bch2_btree_node_write(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_type_held) ++{ ++ BUG_ON(lock_type_held == SIX_LOCK_write); ++ ++ if (lock_type_held == SIX_LOCK_intent || ++ six_lock_tryupgrade(&b->c.lock)) { ++ __bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ ++ /* don't cycle lock unnecessarily: */ ++ if (btree_node_just_written(b) && ++ six_trylock_write(&b->c.lock)) { ++ bch2_btree_post_write_cleanup(c, b); ++ six_unlock_write(&b->c.lock); ++ } ++ ++ if (lock_type_held == SIX_LOCK_read) ++ six_lock_downgrade(&b->c.lock); ++ } else { ++ __bch2_btree_node_write(c, b, SIX_LOCK_read); ++ } ++} ++ ++static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++restart: ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) ++ if (test_bit(flag, &b->flags)) { ++ rcu_read_unlock(); ++ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); ++ goto restart; ++ ++ } ++ rcu_read_unlock(); ++} ++ ++void bch2_btree_flush_all_reads(struct bch_fs *c) ++{ ++ __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); ++} ++ ++void bch2_btree_flush_all_writes(struct bch_fs *c) ++{ ++ __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); ++} ++ ++void bch2_btree_verify_flushed(struct bch_fs *c) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++ ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) { ++ unsigned long flags = READ_ONCE(b->flags); ++ ++ BUG_ON((flags & (1 << BTREE_NODE_dirty)) || ++ (flags & (1 << BTREE_NODE_write_in_flight))); ++ } ++ rcu_read_unlock(); ++} ++ ++void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++ ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) { ++ unsigned long flags = READ_ONCE(b->flags); ++ ++ if (!(flags & (1 << BTREE_NODE_dirty))) ++ continue; ++ ++ pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", ++ b, ++ (flags & (1 << BTREE_NODE_dirty)) != 0, ++ (flags & (1 << BTREE_NODE_need_write)) != 0, ++ b->c.level, ++ b->written, ++ !list_empty_careful(&b->write_blocked), ++ b->will_make_reachable != 0, ++ b->will_make_reachable & 1); ++ } ++ rcu_read_unlock(); ++} +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +new file mode 100644 +index 000000000000..626d0f071b70 +--- /dev/null ++++ b/fs/bcachefs/btree_io.h +@@ -0,0 +1,220 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_IO_H ++#define _BCACHEFS_BTREE_IO_H 
++ ++#include "bkey_methods.h" ++#include "bset.h" ++#include "btree_locking.h" ++#include "checksum.h" ++#include "extents.h" ++#include "io_types.h" ++ ++struct bch_fs; ++struct btree_write; ++struct btree; ++struct btree_iter; ++ ++struct btree_read_bio { ++ struct bch_fs *c; ++ u64 start_time; ++ unsigned have_ioref:1; ++ struct extent_ptr_decoded pick; ++ struct work_struct work; ++ struct bio bio; ++}; ++ ++struct btree_write_bio { ++ struct work_struct work; ++ void *data; ++ unsigned bytes; ++ struct bch_write_bio wbio; ++}; ++ ++static inline void btree_node_io_unlock(struct btree *b) ++{ ++ EBUG_ON(!btree_node_write_in_flight(b)); ++ clear_btree_node_write_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} ++ ++static inline void btree_node_io_lock(struct btree *b) ++{ ++ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++static inline void btree_node_wait_on_io(struct btree *b) ++{ ++ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++static inline bool btree_node_may_write(struct btree *b) ++{ ++ return list_empty_careful(&b->write_blocked) && ++ (!b->written || !b->will_make_reachable); ++} ++ ++enum compact_mode { ++ COMPACT_LAZY, ++ COMPACT_ALL, ++}; ++ ++bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, ++ enum compact_mode); ++ ++static inline bool should_compact_bset_lazy(struct btree *b, ++ struct bset_tree *t) ++{ ++ unsigned total_u64s = bset_u64s(t); ++ unsigned dead_u64s = bset_dead_u64s(b, t); ++ ++ return dead_u64s > 64 && dead_u64s * 3 > total_u64s; ++} ++ ++static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (should_compact_bset_lazy(b, t)) ++ return bch2_compact_whiteouts(c, b, COMPACT_LAZY); ++ ++ return false; ++} ++ ++static inline struct nonce btree_nonce(struct bset *i, unsigned offset) ++{ ++ return (struct nonce) {{ ++ [0] = cpu_to_le32(offset), ++ [1] = ((__le32 *) &i->seq)[0], ++ [2] = ((__le32 *) &i->seq)[1], ++ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, ++ }}; ++} ++ ++static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) ++{ ++ struct nonce nonce = btree_nonce(i, offset); ++ ++ if (!offset) { ++ struct btree_node *bn = container_of(i, struct btree_node, keys); ++ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; ++ ++ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, ++ bytes); ++ ++ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); ++ } ++ ++ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, ++ vstruct_end(i) - (void *) i->_data); ++} ++ ++void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); ++ ++void bch2_btree_build_aux_trees(struct btree *); ++void bch2_btree_init_next(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++ ++int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); ++void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); ++int bch2_btree_root_read(struct bch_fs *, enum btree_id, ++ const struct bkey_i *, unsigned); ++ ++void bch2_btree_complete_write(struct bch_fs *, struct btree *, ++ struct btree_write *); ++void bch2_btree_write_error_work(struct work_struct *); ++ ++void __bch2_btree_node_write(struct bch_fs *, struct btree *, ++ enum six_lock_type); ++bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); ++ ++void bch2_btree_node_write(struct bch_fs *, struct btree *, ++ enum six_lock_type); 
++ ++static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_held) ++{ ++ while (b->written && ++ btree_node_need_write(b) && ++ btree_node_may_write(b)) { ++ if (!btree_node_write_in_flight(b)) { ++ bch2_btree_node_write(c, b, lock_held); ++ break; ++ } ++ ++ six_unlock_type(&b->c.lock, lock_held); ++ btree_node_wait_on_io(b); ++ btree_node_lock_type(c, b, lock_held); ++ } ++} ++ ++#define bch2_btree_node_write_cond(_c, _b, cond) \ ++do { \ ++ unsigned long old, new, v = READ_ONCE((_b)->flags); \ ++ \ ++ do { \ ++ old = new = v; \ ++ \ ++ if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ ++ break; \ ++ \ ++ new |= (1 << BTREE_NODE_need_write); \ ++ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ ++ \ ++ btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ ++} while (0) ++ ++void bch2_btree_flush_all_reads(struct bch_fs *); ++void bch2_btree_flush_all_writes(struct bch_fs *); ++void bch2_btree_verify_flushed(struct bch_fs *); ++void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); ++ ++static inline void compat_bformat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bkey_format *f) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) { ++ swap(f->bits_per_field[BKEY_FIELD_INODE], ++ f->bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(f->field_offset[BKEY_FIELD_INODE], ++ f->field_offset[BKEY_FIELD_OFFSET]); ++ } ++} ++ ++static inline void compat_bpos(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bpos *p) ++{ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bpos_swab(p); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) ++ swap(p->inode, p->offset); ++} ++ ++static inline void compat_btree_node(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct btree_node *bn) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bn->min_key, POS_MIN) && ++ write) ++ bn->min_key = bkey_predecessor(bn->min_key); ++ ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bn->min_key, POS_MIN) && ++ !write) ++ bn->min_key = bkey_successor(bn->min_key); ++} ++ ++#endif /* _BCACHEFS_BTREE_IO_H */ +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +new file mode 100644 +index 000000000000..6fab76c3220c +--- /dev/null ++++ b/fs/bcachefs/btree_iter.c +@@ -0,0 +1,2445 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "debug.h" ++#include "extents.h" ++#include "journal.h" ++ ++#include ++#include ++ ++static inline bool is_btree_node(struct btree_iter *iter, unsigned l) ++{ ++ return l < BTREE_MAX_DEPTH && ++ (unsigned long) iter->l[l].b >= 128; ++} ++ ++static inline struct bpos btree_iter_search_key(struct btree_iter *iter) ++{ ++ struct bpos pos = iter->pos; ++ ++ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ bkey_cmp(pos, POS_MAX)) ++ pos = bkey_successor(pos); ++ return pos; ++} ++ ++static 
inline bool btree_iter_pos_before_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; ++} ++ ++static inline bool btree_iter_pos_after_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; ++} ++ ++static inline bool btree_iter_pos_in_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return iter->btree_id == b->c.btree_id && ++ !btree_iter_pos_before_node(iter, b) && ++ !btree_iter_pos_after_node(iter, b); ++} ++ ++/* Btree node locking: */ ++ ++void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) ++{ ++ bch2_btree_node_unlock_write_inlined(b, iter); ++} ++ ++void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ unsigned readers = 0; ++ ++ EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked->l[b->c.level].b == b && ++ btree_node_read_locked(linked, b->c.level)) ++ readers++; ++ ++ /* ++ * Must drop our read locks before calling six_lock_write() - ++ * six_unlock() won't do wakeups until the reader count ++ * goes to 0, and it's safe because we have the node intent ++ * locked: ++ */ ++ atomic64_sub(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); ++ atomic64_add(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++} ++ ++bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) ++{ ++ struct btree *b = btree_iter_node(iter, level); ++ int want = __btree_lock_want(iter, level); ++ ++ if (!is_btree_node(iter, level)) ++ return false; ++ ++ if (race_fault()) ++ return false; ++ ++ if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || ++ (btree_node_lock_seq_matches(iter, b, level) && ++ btree_node_lock_increment(iter->trans, b, level, want))) { ++ mark_btree_node_locked(iter, level, want); ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) ++{ ++ struct btree *b = iter->l[level].b; ++ ++ EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); ++ ++ if (!is_btree_node(iter, level)) ++ return false; ++ ++ if (btree_node_intent_locked(iter, level)) ++ return true; ++ ++ if (race_fault()) ++ return false; ++ ++ if (btree_node_locked(iter, level) ++ ? six_lock_tryupgrade(&b->c.lock) ++ : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) ++ goto success; ++ ++ if (btree_node_lock_seq_matches(iter, b, level) && ++ btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { ++ btree_node_unlock(iter, level); ++ goto success; ++ } ++ ++ return false; ++success: ++ mark_btree_node_intent_locked(iter, level); ++ return true; ++} ++ ++static inline bool btree_iter_get_locks(struct btree_iter *iter, ++ bool upgrade, bool trace) ++{ ++ unsigned l = iter->level; ++ int fail_idx = -1; ++ ++ do { ++ if (!btree_iter_node(iter, l)) ++ break; ++ ++ if (!(upgrade ++ ? bch2_btree_node_upgrade(iter, l) ++ : bch2_btree_node_relock(iter, l))) { ++ if (trace) ++ (upgrade ++ ? trace_node_upgrade_fail ++ : trace_node_relock_fail)(l, iter->l[l].lock_seq, ++ is_btree_node(iter, l) ++ ? 0 ++ : (unsigned long) iter->l[l].b, ++ is_btree_node(iter, l) ++ ? 
iter->l[l].b->c.lock.state.seq ++ : 0); ++ ++ fail_idx = l; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ } ++ ++ l++; ++ } while (l < iter->locks_want); ++ ++ /* ++ * When we fail to get a lock, we have to ensure that any child nodes ++ * can't be relocked so bch2_btree_iter_traverse has to walk back up to ++ * the node that we failed to relock: ++ */ ++ while (fail_idx >= 0) { ++ btree_node_unlock(iter, fail_idx); ++ iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ --fail_idx; ++ } ++ ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ return iter->uptodate < BTREE_ITER_NEED_RELOCK; ++} ++ ++static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, ++ enum btree_iter_type type) ++{ ++ return type != BTREE_ITER_CACHED ++ ? container_of(_b, struct btree, c)->key.k.p ++ : container_of(_b, struct bkey_cached, c)->key.pos; ++} ++ ++/* Slowpath: */ ++bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, ++ unsigned level, struct btree_iter *iter, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, ++ void *p) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_iter *linked; ++ u64 start_time = local_clock(); ++ bool ret = true; ++ ++ /* Check if it's safe to block: */ ++ trans_for_each_iter(trans, linked) { ++ if (!linked->nodes_locked) ++ continue; ++ ++ /* ++ * Can't block taking an intent lock if we have _any_ nodes read ++ * locked: ++ * ++ * - Our read lock blocks another thread with an intent lock on ++ * the same node from getting a write lock, and thus from ++ * dropping its intent lock ++ * ++ * - And the other thread may have multiple nodes intent locked: ++ * both the node we want to intent lock, and the node we ++ * already have read locked - deadlock: ++ */ ++ if (type == SIX_LOCK_intent && ++ linked->nodes_locked != linked->nodes_intent_locked) { ++ if (!(trans->nounlock)) { ++ linked->locks_want = max_t(unsigned, ++ linked->locks_want, ++ __fls(linked->nodes_locked) + 1); ++ if (!btree_iter_get_locks(linked, true, false)) ++ ret = false; ++ } else { ++ ret = false; ++ } ++ } ++ ++ /* ++ * Interior nodes must be locked before their descendants: if ++ * another iterator has possible descendants locked of the node ++ * we're about to lock, it must have the ancestors locked too: ++ */ ++ if (linked->btree_id == iter->btree_id && ++ level > __fls(linked->nodes_locked)) { ++ if (!(trans->nounlock)) { ++ linked->locks_want = ++ max(level + 1, max_t(unsigned, ++ linked->locks_want, ++ iter->locks_want)); ++ if (!btree_iter_get_locks(linked, true, false)) ++ ret = false; ++ } else { ++ ret = false; ++ } ++ } ++ ++ /* Must lock btree nodes in key order: */ ++ if ((cmp_int(iter->btree_id, linked->btree_id) ?: ++ -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) ++ ret = false; ++ ++ if (iter->btree_id == linked->btree_id && ++ btree_node_locked(linked, level) && ++ bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, ++ btree_iter_type(linked))) <= 0) ++ ret = false; ++ ++ /* ++ * Recheck if this is a node we already have locked - since one ++ * of the get_locks() calls might've successfully ++ * upgraded/relocked it: ++ */ ++ if (linked->l[level].b == b && ++ btree_node_locked_type(linked, level) >= type) { ++ six_lock_increment(&b->c.lock, type); ++ return true; ++ } ++ } ++ ++ if (unlikely(!ret)) { ++ trace_trans_restart_would_deadlock(iter->trans->ip); ++ return false; ++ } ++ ++ if 
(six_trylock_type(&b->c.lock, type)) ++ return true; ++ ++ if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) ++ return false; ++ ++ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], ++ start_time); ++ return true; ++} ++ ++/* Btree iterator locking: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static void bch2_btree_iter_verify_locks(struct btree_iter *iter) ++{ ++ unsigned l; ++ ++ if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { ++ BUG_ON(iter->nodes_locked); ++ return; ++ } ++ ++ for (l = 0; is_btree_node(iter, l); l++) { ++ if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && ++ !btree_node_locked(iter, l)) ++ continue; ++ ++ BUG_ON(btree_lock_want(iter, l) != ++ btree_node_locked_type(iter, l)); ++ } ++} ++ ++void bch2_btree_trans_verify_locks(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter_all(trans, iter) ++ bch2_btree_iter_verify_locks(iter); ++} ++#else ++static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} ++#endif ++ ++__flatten ++bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) ++{ ++ return btree_iter_get_locks(iter, false, trace); ++} ++ ++bool __bch2_btree_iter_upgrade(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ struct btree_iter *linked; ++ ++ EBUG_ON(iter->locks_want >= new_locks_want); ++ ++ iter->locks_want = new_locks_want; ++ ++ if (btree_iter_get_locks(iter, true, true)) ++ return true; ++ ++ /* ++ * Ancestor nodes must be locked before child nodes, so set locks_want ++ * on iterators that might lock ancestors before us to avoid getting ++ * -EINTR later: ++ */ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked != iter && ++ linked->btree_id == iter->btree_id && ++ linked->locks_want < new_locks_want) { ++ linked->locks_want = new_locks_want; ++ btree_iter_get_locks(linked, true, false); ++ } ++ ++ return false; ++} ++ ++bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ unsigned l = iter->level; ++ ++ EBUG_ON(iter->locks_want >= new_locks_want); ++ ++ iter->locks_want = new_locks_want; ++ ++ do { ++ if (!btree_iter_node(iter, l)) ++ break; ++ ++ if (!bch2_btree_node_upgrade(iter, l)) { ++ iter->locks_want = l; ++ return false; ++ } ++ ++ l++; ++ } while (l < iter->locks_want); ++ ++ return true; ++} ++ ++void __bch2_btree_iter_downgrade(struct btree_iter *iter, ++ unsigned downgrade_to) ++{ ++ unsigned l, new_locks_want = downgrade_to ?: ++ (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); ++ ++ if (iter->locks_want < downgrade_to) { ++ iter->locks_want = new_locks_want; ++ ++ while (iter->nodes_locked && ++ (l = __fls(iter->nodes_locked)) >= iter->locks_want) { ++ if (l > iter->level) { ++ btree_node_unlock(iter, l); ++ } else { ++ if (btree_node_intent_locked(iter, l)) { ++ six_lock_downgrade(&iter->l[l].b->c.lock); ++ iter->nodes_intent_locked ^= 1 << l; ++ } ++ break; ++ } ++ } ++ } ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++} ++ ++void bch2_trans_downgrade(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ bch2_btree_iter_downgrade(iter); ++} ++ ++/* Btree transaction locking: */ ++ ++bool bch2_trans_relock(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ bool ret = true; ++ ++ trans_for_each_iter(trans, iter) ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) ++ ret &= bch2_btree_iter_relock(iter, true); ++ ++ return ret; ++} ++ ++void bch2_trans_unlock(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ __bch2_btree_iter_unlock(iter); ++} ++ ++/* Btree iterator: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_btree_iter_verify_cached(struct btree_iter *iter) ++{ ++ struct bkey_cached *ck; ++ bool locked = btree_node_locked(iter, 0); ++ ++ if (!bch2_btree_node_relock(iter, 0)) ++ return; ++ ++ ck = (void *) iter->l[0].b; ++ BUG_ON(ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)); ++ ++ if (!locked) ++ btree_node_unlock(iter, 0); ++} ++ ++static void bch2_btree_iter_verify_level(struct btree_iter *iter, ++ unsigned level) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct btree_iter_level *l = &iter->l[level]; ++ struct btree_node_iter tmp = l->iter; ++ bool locked = btree_node_locked(iter, level); ++ struct bkey_packed *p, *k; ++ char buf1[100], buf2[100]; ++ const char *msg; ++ ++ if (!debug_check_iterators(iter->trans->c)) ++ return; ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ if (!level) ++ bch2_btree_iter_verify_cached(iter); ++ return; ++ } ++ ++ BUG_ON(iter->level < iter->min_depth); ++ ++ if (!btree_iter_node(iter, level)) ++ return; ++ ++ if (!bch2_btree_node_relock(iter, level)) ++ return; ++ ++ /* ++ * Ideally this invariant would always be true, and hopefully in the ++ * future it will be, but for now set_pos_same_leaf() breaks it: ++ */ ++ BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && ++ !btree_iter_pos_in_node(iter, l->b)); ++ ++ /* ++ * node iterators don't use leaf node iterator: ++ */ ++ if (btree_iter_type(iter) == BTREE_ITER_NODES && ++ level <= iter->min_depth) ++ goto unlock; ++ ++ bch2_btree_node_iter_verify(&l->iter, l->b); ++ ++ /* ++ * For interior nodes, the iterator will have skipped past ++ * deleted keys: ++ * ++ * For extents, the iterator may have skipped past deleted keys (but not ++ * whiteouts) ++ */ ++ p = level || btree_node_type_is_extents(iter->btree_id) ++ ? 
bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) ++ : bch2_btree_node_iter_prev_all(&tmp, l->b); ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ ++ if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { ++ msg = "before"; ++ goto err; ++ } ++ ++ if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { ++ msg = "after"; ++ goto err; ++ } ++unlock: ++ if (!locked) ++ btree_node_unlock(iter, level); ++ return; ++err: ++ strcpy(buf1, "(none)"); ++ strcpy(buf2, "(none)"); ++ ++ if (p) { ++ struct bkey uk = bkey_unpack_key(l->b, p); ++ bch2_bkey_to_text(&PBUF(buf1), &uk); ++ } ++ ++ if (k) { ++ struct bkey uk = bkey_unpack_key(l->b, k); ++ bch2_bkey_to_text(&PBUF(buf2), &uk); ++ } ++ ++ panic("iterator should be %s key at level %u:\n" ++ "iter pos %s %llu:%llu\n" ++ "prev key %s\n" ++ "cur key %s\n", ++ msg, level, ++ iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>", ++ iter->pos.inode, iter->pos.offset, ++ buf1, buf2); ++} ++ ++static void bch2_btree_iter_verify(struct btree_iter *iter) ++{ ++ unsigned i; ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ bch2_btree_iter_verify_level(iter, i); ++} ++ ++void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) ++{ ++ struct btree_iter *iter; ++ ++ if (!debug_check_iterators(trans->c)) ++ return; ++ ++ trans_for_each_iter_with_node(trans, b, iter) ++ bch2_btree_iter_verify_level(iter, b->c.level); ++} ++ ++#else ++ ++static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} ++static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} ++ ++#endif ++ ++static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) { ++ set->k = __btree_node_key_to_offset(b, k); ++ bch2_btree_node_iter_sort(iter, b); ++ return; ++ } ++ ++ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); ++} ++ ++static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_iter_level *l = &iter->l[b->c.level]; ++ struct bpos pos = btree_iter_search_key(iter); ++ ++ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) ++ return; ++ ++ if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) { ++ __bch2_btree_iter_fix_key_modified(linked, b, where); ++ bch2_btree_iter_verify_level(linked, b->c.level); ++ } ++} ++ ++static void __bch2_btree_node_iter_fix(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bset_tree *t, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ const struct bkey_packed *end = btree_bkey_last(b, t); ++ struct btree_node_iter_set *set; ++ unsigned offset = __btree_node_key_to_offset(b, where); ++ int shift = new_u64s - clobber_u64s; ++ unsigned old_end = t->end_offset - shift; ++ unsigned orig_iter_pos = node_iter->data[0].k; ++ bool iter_current_key_modified = ++ orig_iter_pos >= offset && ++ orig_iter_pos <= offset + clobber_u64s; ++ struct bpos iter_pos = 
btree_iter_search_key(iter); ++ ++ btree_node_iter_for_each(node_iter, set) ++ if (set->end == old_end) ++ goto found; ++ ++ /* didn't find the bset in the iterator - might have to readd it: */ ++ if (new_u64s && ++ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { ++ bch2_btree_node_iter_push(node_iter, b, where, end); ++ goto fixup_done; ++ } else { ++ /* Iterator is after key that changed */ ++ return; ++ } ++found: ++ set->end = t->end_offset; ++ ++ /* Iterator hasn't gotten to the key that changed yet: */ ++ if (set->k < offset) ++ return; ++ ++ if (new_u64s && ++ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { ++ set->k = offset; ++ } else if (set->k < offset + clobber_u64s) { ++ set->k = offset + new_u64s; ++ if (set->k == set->end) ++ bch2_btree_node_iter_set_drop(node_iter, set); ++ } else { ++ /* Iterator is after key that changed */ ++ set->k = (int) set->k + shift; ++ return; ++ } ++ ++ bch2_btree_node_iter_sort(node_iter, b); ++fixup_done: ++ if (node_iter->data[0].k != orig_iter_pos) ++ iter_current_key_modified = true; ++ ++ /* ++ * When a new key is added, and the node iterator now points to that ++ * key, the iterator might have skipped past deleted keys that should ++ * come after the key the iterator now points to. We have to rewind to ++ * before those deleted keys - otherwise ++ * bch2_btree_node_iter_prev_all() breaks: ++ */ ++ if (!bch2_btree_node_iter_end(node_iter) && ++ iter_current_key_modified && ++ (b->c.level || ++ btree_node_type_is_extents(iter->btree_id))) { ++ struct bset_tree *t; ++ struct bkey_packed *k, *k2, *p; ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ ++ for_each_bset(b, t) { ++ bool set_pos = false; ++ ++ if (node_iter->data[0].end == t->end_offset) ++ continue; ++ ++ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); ++ ++ while ((p = bch2_bkey_prev_all(b, t, k2)) && ++ bkey_iter_cmp(b, k, p) < 0) { ++ k2 = p; ++ set_pos = true; ++ } ++ ++ if (set_pos) ++ btree_node_iter_set_set_pos(node_iter, ++ b, t, k2); ++ } ++ } ++ ++ if (!b->c.level && ++ node_iter == &iter->l[0].iter && ++ iter_current_key_modified) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++void bch2_btree_node_iter_fix(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct btree_iter *linked; ++ ++ if (node_iter != &iter->l[b->c.level].iter) { ++ __bch2_btree_node_iter_fix(iter, b, node_iter, t, ++ where, clobber_u64s, new_u64s); ++ ++ if (debug_check_iterators(iter->trans->c)) ++ bch2_btree_node_iter_verify(node_iter, b); ++ } ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) { ++ __bch2_btree_node_iter_fix(linked, b, ++ &linked->l[b->c.level].iter, t, ++ where, clobber_u64s, new_u64s); ++ bch2_btree_iter_verify_level(linked, b->c.level); ++ } ++} ++ ++static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ struct bkey *u, ++ struct bkey_packed *k) ++{ ++ struct bkey_s_c ret; ++ ++ if (unlikely(!k)) { ++ /* ++ * signal to bch2_btree_iter_peek_slot() that we're currently at ++ * a hole ++ */ ++ u->type = KEY_TYPE_deleted; ++ return bkey_s_c_null; ++ } ++ ++ ret = bkey_disassemble(l->b, k, u); ++ ++ if (debug_check_bkeys(iter->trans->c)) ++ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); ++ ++ return ret; ++} ++ ++/* peek_all() doesn't skip deleted keys */ ++static inline struct bkey_s_c __btree_iter_peek_all(struct 
btree_iter *iter, ++ struct btree_iter_level *l, ++ struct bkey *u) ++{ ++ return __btree_iter_unpack(iter, l, u, ++ bch2_btree_node_iter_peek_all(&l->iter, l->b)); ++} ++ ++static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, ++ struct btree_iter_level *l) ++{ ++ return __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++} ++ ++static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, ++ struct btree_iter_level *l) ++{ ++ return __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_prev(&l->iter, l->b)); ++} ++ ++static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ int max_advance) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct bkey_packed *k; ++ int nr_advanced = 0; ++ ++ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && ++ bkey_iter_pos_cmp(l->b, k, &pos) < 0) { ++ if (max_advance > 0 && nr_advanced >= max_advance) ++ return false; ++ ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ nr_advanced++; ++ } ++ ++ return true; ++} ++ ++/* ++ * Verify that iterator for parent node points to child node: ++ */ ++static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter_level *l; ++ unsigned plevel; ++ bool parent_locked; ++ struct bkey_packed *k; ++ ++ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ return; ++ ++ plevel = b->c.level + 1; ++ if (!btree_iter_node(iter, plevel)) ++ return; ++ ++ parent_locked = btree_node_locked(iter, plevel); ++ ++ if (!bch2_btree_node_relock(iter, plevel)) ++ return; ++ ++ l = &iter->l[plevel]; ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ if (!k || ++ bkey_deleted(k) || ++ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { ++ char buf[100]; ++ struct bkey uk = bkey_unpack_key(b, k); ++ ++ bch2_bkey_to_text(&PBUF(buf), &uk); ++ panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", ++ buf, b->key.k.p.inode, b->key.k.p.offset); ++ } ++ ++ if (!parent_locked) ++ btree_node_unlock(iter, b->c.level + 1); ++} ++ ++static inline void __btree_iter_init(struct btree_iter *iter, ++ unsigned level) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct btree_iter_level *l = &iter->l[level]; ++ ++ bch2_btree_node_iter_init(&l->iter, l->b, &pos); ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++static inline void btree_iter_node_set(struct btree_iter *iter, ++ struct btree *b) ++{ ++ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); ++ ++ btree_iter_verify_new_node(iter, b); ++ ++ EBUG_ON(!btree_iter_pos_in_node(iter, b)); ++ EBUG_ON(b->c.lock.state.seq & 1); ++ ++ iter->l[b->c.level].lock_seq = b->c.lock.state.seq; ++ iter->l[b->c.level].b = b; ++ __btree_iter_init(iter, b->c.level); ++} ++ ++/* ++ * A btree node is being replaced - update the iterator to point to the new ++ * node: ++ */ ++void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) ++{ ++ enum btree_node_locked_type t; ++ struct btree_iter *linked; ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (btree_iter_type(linked) != BTREE_ITER_CACHED && ++ btree_iter_pos_in_node(linked, b)) { ++ /* ++ * bch2_btree_iter_node_drop() has already been called - ++ * the old node we're replacing has already been ++ * unlocked and the pointer invalidated ++ */ ++ BUG_ON(btree_node_locked(linked, b->c.level)); ++ ++ t = btree_lock_want(linked, b->c.level); ++ if (t != BTREE_NODE_UNLOCKED) { ++ six_lock_increment(&b->c.lock, t); ++ 
mark_btree_node_locked(linked, b->c.level, t); ++ } ++ ++ btree_iter_node_set(linked, b); ++ } ++} ++ ++void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter *linked; ++ unsigned level = b->c.level; ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked->l[level].b == b) { ++ __btree_node_unlock(linked, level); ++ linked->l[level].b = BTREE_ITER_NO_NODE_DROP; ++ } ++} ++ ++/* ++ * A btree node has been modified in such a way as to invalidate iterators - fix ++ * them: ++ */ ++void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ __btree_iter_init(linked, b->c.level); ++} ++ ++static int lock_root_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, c.lock); ++ struct btree **rootp = p; ++ ++ return b == *rootp ? 0 : -1; ++} ++ ++static inline int btree_iter_lock_root(struct btree_iter *iter, ++ unsigned depth_want) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; ++ enum six_lock_type lock_type; ++ unsigned i; ++ ++ EBUG_ON(iter->nodes_locked); ++ ++ while (1) { ++ b = READ_ONCE(*rootp); ++ iter->level = READ_ONCE(b->c.level); ++ ++ if (unlikely(iter->level < depth_want)) { ++ /* ++ * the root is at a lower depth than the depth we want: ++ * got to the end of the btree, or we're walking nodes ++ * greater than some depth and there are no nodes >= ++ * that depth ++ */ ++ iter->level = depth_want; ++ for (i = iter->level; i < BTREE_MAX_DEPTH; i++) ++ iter->l[i].b = NULL; ++ return 1; ++ } ++ ++ lock_type = __btree_lock_want(iter, iter->level); ++ if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, ++ iter, lock_type, ++ lock_root_check_fn, rootp))) ++ return -EINTR; ++ ++ if (likely(b == READ_ONCE(*rootp) && ++ b->c.level == iter->level && ++ !race_fault())) { ++ for (i = 0; i < iter->level; i++) ++ iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; ++ iter->l[iter->level].b = b; ++ for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) ++ iter->l[i].b = NULL; ++ ++ mark_btree_node_locked(iter, iter->level, lock_type); ++ btree_iter_node_set(iter, b); ++ return 0; ++ } ++ ++ six_unlock_type(&b->c.lock, lock_type); ++ } ++} ++ ++noinline ++static void btree_iter_prefetch(struct btree_iter *iter) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree_iter_level *l = &iter->l[iter->level]; ++ struct btree_node_iter node_iter = l->iter; ++ struct bkey_packed *k; ++ BKEY_PADDED(k) tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? (iter->level > 1 ? 0 : 2) ++ : (iter->level > 1 ? 
1 : 16); ++ bool was_locked = btree_node_locked(iter, iter->level); ++ ++ while (nr) { ++ if (!bch2_btree_node_relock(iter, iter->level)) ++ return; ++ ++ bch2_btree_node_iter_advance(&node_iter, l->b); ++ k = bch2_btree_node_iter_peek(&node_iter, l->b); ++ if (!k) ++ break; ++ ++ bch2_bkey_unpack(l->b, &tmp.k, k); ++ bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(iter, iter->level); ++} ++ ++static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, ++ unsigned plevel, struct btree *b) ++{ ++ struct btree_iter_level *l = &iter->l[plevel]; ++ bool locked = btree_node_locked(iter, plevel); ++ struct bkey_packed *k; ++ struct bch_btree_ptr_v2 *bp; ++ ++ if (!bch2_btree_node_relock(iter, plevel)) ++ return; ++ ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); ++ ++ bp = (void *) bkeyp_val(&l->b->format, k); ++ bp->mem_ptr = (unsigned long)b; ++ ++ if (!locked) ++ btree_node_unlock(iter, plevel); ++} ++ ++static __always_inline int btree_iter_down(struct btree_iter *iter) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree_iter_level *l = &iter->l[iter->level]; ++ struct btree *b; ++ unsigned level = iter->level - 1; ++ enum six_lock_type lock_type = __btree_lock_want(iter, level); ++ BKEY_PADDED(k) tmp; ++ ++ EBUG_ON(!btree_node_locked(iter, iter->level)); ++ ++ bch2_bkey_unpack(l->b, &tmp.k, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); ++ if (unlikely(IS_ERR(b))) ++ return PTR_ERR(b); ++ ++ mark_btree_node_locked(iter, level, lock_type); ++ btree_iter_node_set(iter, b); ++ ++ if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && ++ unlikely(b != btree_node_mem_ptr(&tmp.k))) ++ btree_node_mem_ptr_set(iter, level + 1, b); ++ ++ if (iter->flags & BTREE_ITER_PREFETCH) ++ btree_iter_prefetch(iter); ++ ++ iter->level = level; ++ ++ return 0; ++} ++ ++static void btree_iter_up(struct btree_iter *iter) ++{ ++ btree_node_unlock(iter, iter->level++); ++} ++ ++static int btree_iter_traverse_one(struct btree_iter *); ++ ++static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ u8 sorted[BTREE_ITER_MAX]; ++ unsigned i, nr_sorted = 0; ++ ++ if (trans->in_traverse_all) ++ return -EINTR; ++ ++ trans->in_traverse_all = true; ++retry_all: ++ nr_sorted = 0; ++ ++ trans_for_each_iter(trans, iter) ++ sorted[nr_sorted++] = iter->idx; ++ ++#define btree_iter_cmp_by_idx(_l, _r) \ ++ btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) ++ ++ bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); ++#undef btree_iter_cmp_by_idx ++ bch2_trans_unlock(trans); ++ ++ if (unlikely(ret == -ENOMEM)) { ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ } ++ ++ if (unlikely(ret == -EIO)) { ++ trans->error = true; ++ goto out; ++ } ++ ++ BUG_ON(ret && ret != -EINTR); ++ ++ /* Now, redo traversals in correct order: */ ++ for (i = 0; i < nr_sorted; i++) { ++ unsigned idx = sorted[i]; ++ ++ /* ++ * sucessfully traversing one iterator can cause another to be ++ * unlinked, in btree_key_cache_fill() ++ */ ++ if (!(trans->iters_linked & (1ULL << idx))) ++ continue; ++ ++ ret = btree_iter_traverse_one(&trans->iters[idx]); ++ if (ret) ++ goto retry_all; ++ } ++ ++ if (hweight64(trans->iters_live) > 1) ++ ret = -EINTR; ++ else ++ trans_for_each_iter(trans, iter) ++ 
if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { ++ ret = -EINTR; ++ break; ++ } ++out: ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ trans->in_traverse_all = false; ++ return ret; ++} ++ ++int bch2_btree_iter_traverse_all(struct btree_trans *trans) ++{ ++ return __btree_iter_traverse_all(trans, 0); ++} ++ ++static inline bool btree_iter_good_node(struct btree_iter *iter, ++ unsigned l, int check_pos) ++{ ++ if (!is_btree_node(iter, l) || ++ !bch2_btree_node_relock(iter, l)) ++ return false; ++ ++ if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) ++ return false; ++ if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) ++ return false; ++ return true; ++} ++ ++static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, ++ int check_pos) ++{ ++ unsigned l = iter->level; ++ ++ while (btree_iter_node(iter, l) && ++ !btree_iter_good_node(iter, l, check_pos)) { ++ btree_node_unlock(iter, l); ++ iter->l[l].b = BTREE_ITER_NO_NODE_UP; ++ l++; ++ } ++ ++ return l; ++} ++ ++/* ++ * This is the main state machine for walking down the btree - walks down to a ++ * specified depth ++ * ++ * Returns 0 on success, -EIO on error (error reading in a btree node). ++ * ++ * On error, caller (peek_node()/peek_key()) must return NULL; the error is ++ * stashed in the iterator and returned from bch2_trans_exit(). ++ */ ++static int btree_iter_traverse_one(struct btree_iter *iter) ++{ ++ unsigned depth_want = iter->level; ++ ++ /* ++ * if we need interior nodes locked, call btree_iter_relock() to make ++ * sure we walk back up enough that we lock them: ++ */ ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK || ++ iter->locks_want > 1) ++ bch2_btree_iter_relock(iter, false); ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) ++ return bch2_btree_iter_traverse_cached(iter); ++ ++ if (iter->uptodate < BTREE_ITER_NEED_RELOCK) ++ return 0; ++ ++ if (unlikely(iter->level >= BTREE_MAX_DEPTH)) ++ return 0; ++ ++ /* ++ * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos ++ * here unnecessary ++ */ ++ iter->level = btree_iter_up_until_good_node(iter, 0); ++ ++ /* ++ * If we've got a btree node locked (i.e. we aren't about to relock the ++ * root) - advance its node iterator if necessary: ++ * ++ * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary ++ */ ++ if (is_btree_node(iter, iter->level)) { ++ BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); ++ ++ btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); ++ } ++ ++ /* ++ * Note: iter->nodes[iter->level] may be temporarily NULL here - that ++ * would indicate to other code that we got to the end of the btree, ++ * here it indicates that relocking the root failed - it's critical that ++ * btree_iter_lock_root() comes next and that it can't fail ++ */ ++ while (iter->level > depth_want) { ++ int ret = btree_iter_node(iter, iter->level) ++ ? 
btree_iter_down(iter) ++ : btree_iter_lock_root(iter, depth_want); ++ if (unlikely(ret)) { ++ if (ret == 1) ++ return 0; ++ ++ iter->level = depth_want; ++ ++ if (ret == -EIO) { ++ iter->flags |= BTREE_ITER_ERROR; ++ iter->l[iter->level].b = ++ BTREE_ITER_NO_NODE_ERROR; ++ } else { ++ iter->l[iter->level].b = ++ BTREE_ITER_NO_NODE_DOWN; ++ } ++ return ret; ++ } ++ } ++ ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ ++ bch2_btree_iter_verify(iter); ++ return 0; ++} ++ ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ int ret; ++ ++ ret = bch2_trans_cond_resched(trans) ?: ++ btree_iter_traverse_one(iter); ++ if (unlikely(ret)) ++ ret = __btree_iter_traverse_all(trans, ret); ++ ++ return ret; ++} ++ ++static inline void bch2_btree_iter_checks(struct btree_iter *iter) ++{ ++ enum btree_iter_type type = btree_iter_type(iter); ++ ++ EBUG_ON(iter->btree_id >= BTREE_ID_NR); ++ ++ BUG_ON((type == BTREE_ITER_KEYS || ++ type == BTREE_ITER_CACHED) && ++ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || ++ bkey_cmp(iter->pos, iter->k.p) > 0)); ++ ++ bch2_btree_iter_verify_locks(iter); ++ bch2_btree_iter_verify_level(iter, iter->level); ++} ++ ++/* Iterate across nodes (leaf and interior nodes) */ ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) ++{ ++ struct btree *b; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return iter->l[iter->level].b; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ b = btree_iter_node(iter, iter->level); ++ if (!b) ++ return NULL; ++ ++ BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); ++ ++ iter->pos = b->key.k.p; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify(iter); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) ++{ ++ struct btree *b; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); ++ bch2_btree_iter_checks(iter); ++ ++ /* already got to end? */ ++ if (!btree_iter_node(iter, iter->level)) ++ return NULL; ++ ++ bch2_trans_cond_resched(iter->trans); ++ ++ btree_iter_up(iter); ++ ++ if (!bch2_btree_node_relock(iter, iter->level)) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ /* got to end? 
*/ ++ b = btree_iter_node(iter, iter->level); ++ if (!b) ++ return NULL; ++ ++ if (bkey_cmp(iter->pos, b->key.k.p) < 0) { ++ /* ++ * Haven't gotten to the end of the parent node: go back down to ++ * the next child node ++ */ ++ ++ /* ++ * We don't really want to be unlocking here except we can't ++ * directly tell btree_iter_traverse() "traverse to this level" ++ * except by setting iter->level, so we have to unlock so we ++ * don't screw up our lock invariants: ++ */ ++ if (btree_node_read_locked(iter, iter->level)) ++ btree_node_unlock(iter, iter->level); ++ ++ iter->pos = bkey_successor(iter->pos); ++ iter->level = iter->min_depth; ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ b = iter->l[iter->level].b; ++ } ++ ++ iter->pos = b->key.k.p; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify(iter); ++ ++ return b; ++} ++ ++/* Iterate across keys (in leaf nodes only) */ ++ ++void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ ++ EBUG_ON(iter->level != 0); ++ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); ++ EBUG_ON(!btree_node_locked(iter, 0)); ++ EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++ ++ btree_iter_advance_to_pos(iter, l, -1); ++ ++ /* ++ * XXX: ++ * keeping a node locked that's outside (even just outside) iter->pos ++ * breaks __bch2_btree_node_lock(). This seems to only affect ++ * bch2_btree_node_get_sibling so for now it's fixed there, but we ++ * should try to get rid of this corner case. ++ * ++ * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) ++ */ ++ ++ if (bch2_btree_node_iter_end(&l->iter) && ++ btree_iter_pos_after_node(iter, l->b)) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++} ++ ++static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) ++{ ++ unsigned l = iter->level; ++ ++ if (!cmp) ++ goto out; ++ ++ if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { ++ btree_node_unlock(iter, 0); ++ iter->l[0].b = BTREE_ITER_NO_NODE_UP; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ return; ++ } ++ ++ l = btree_iter_up_until_good_node(iter, cmp); ++ ++ if (btree_iter_node(iter, l)) { ++ /* ++ * We might have to skip over many keys, or just a few: try ++ * advancing the node iterator, and if we have to skip over too ++ * many keys just reinit it (or if we're rewinding, since that ++ * is expensive). ++ */ ++ if (cmp < 0 || ++ !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) ++ __btree_iter_init(iter, l); ++ ++ /* Don't leave it locked if we're not supposed to: */ ++ if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, l); ++ } ++out: ++ if (l != iter->level) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ else ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, ++ bool strictly_greater) ++{ ++ struct bpos old = btree_iter_search_key(iter); ++ int cmp; ++ ++ iter->flags &= ~BTREE_ITER_IS_EXTENTS; ++ iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; ++ ++ cmp = bkey_cmp(btree_iter_search_key(iter), old); ++ ++ btree_iter_pos_changed(iter, cmp); ++} ++ ++void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ int cmp = bkey_cmp(new_pos, iter->pos); ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; ++ ++ btree_iter_pos_changed(iter, cmp); ++} ++ ++static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ bool ret; ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = l->b->key.k.p; ++ ++ ret = bkey_cmp(iter->pos, POS_MAX) != 0; ++ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ iter->k.p = iter->pos = bkey_successor(iter->pos); ++ ++ btree_iter_pos_changed(iter, 1); ++ return ret; ++} ++ ++static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ bool ret; ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = l->b->data->min_key; ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ ++ ret = bkey_cmp(iter->pos, POS_MIN) != 0; ++ if (ret) { ++ iter->k.p = iter->pos = bkey_predecessor(iter->pos); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ iter->k.p = iter->pos = bkey_predecessor(iter->pos); ++ } ++ ++ btree_iter_pos_changed(iter, -1); ++ return ret; ++} ++ ++/** ++ * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key ++ * it currently points to ++ */ ++static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c ret = { .k = &iter->k }; ++ ++ if (!bkey_deleted(&iter->k)) { ++ struct bkey_packed *_k = ++ __bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ ++ ret.v = bkeyp_val(&l->b->format, _k); ++ ++ if (debug_check_iterators(iter->trans->c)) { ++ struct bkey k = bkey_unpack_key(l->b, _k); ++ ++ BUG_ON(memcmp(&k, &iter->k, sizeof(k))); ++ } ++ ++ if (debug_check_bkeys(iter->trans->c)) ++ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); ++ } ++ ++ return ret; ++} ++ ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE && ++ !bkey_deleted(&iter->k)) ++ return btree_iter_peek_uptodate(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __btree_iter_peek(iter, l); ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_next_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ /* ++ * iter->pos should always be equal to the key we just ++ * returned - except extents can straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || ++ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter->pos = bkey_start_pos(k.k); ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify_level(iter, 0); ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_next: returns first key greater than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & 
BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); ++ ++ return bch2_btree_iter_peek(iter); ++} ++ ++static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct btree_trans *trans = iter->trans; ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update2(trans, i) ++ if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: ++ bkey_cmp(pos, i->k->k.p)) <= 0) ++ break; ++ ++ return i < trans->updates2 + trans->nr_updates2 && ++ iter->btree_id == i->iter->btree_id ++ ? bkey_i_to_s_c(i->k) ++ : bkey_s_c_null; ++} ++ ++static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k = __btree_iter_peek(iter, l); ++ struct bkey_s_c u = __btree_trans_updates_peek(iter); ++ ++ if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) ++ return k; ++ if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { ++ iter->k = *u.k; ++ return u; ++ } ++ return bkey_s_c_null; ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __bch2_btree_iter_peek_with_updates(iter); ++ ++ if (k.k && bkey_deleted(k.k)) { ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); ++ continue; ++ } ++ ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_next_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ /* ++ * iter->pos should always be equal to the key we just ++ * returned - except extents can straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || ++ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter->pos = bkey_start_pos(k.k); ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ return k; ++} ++ ++struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? 
iter->k.p ++ : bkey_successor(iter->k.p)); ++ ++ return bch2_btree_iter_peek_with_updates(iter); ++} ++ ++/** ++ * bch2_btree_iter_peek_prev: returns first key less than or equal to ++ * iterator's current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) ++{ ++ struct bpos pos = iter->pos; ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE && ++ !bkey_deleted(&iter->k)) ++ return btree_iter_peek_uptodate(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __btree_iter_peek(iter, l); ++ if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) ++ k = __btree_iter_prev(iter, l); ++ ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_prev_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); ++ iter->pos = bkey_start_pos(k.k); ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_prev: returns first key less than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) ++{ ++ struct bpos pos = bkey_start_pos(&iter->k); ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (unlikely(!bkey_cmp(pos, POS_MIN))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); ++ ++ return bch2_btree_iter_peek_prev(iter); ++} ++ ++static inline struct bkey_s_c ++__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct btree_node_iter node_iter; ++ struct bkey_s_c k; ++ struct bkey n; ++ int ret; ++ ++ /* keys & holes can't span inode numbers: */ ++ if (iter->pos.offset == KEY_OFFSET_MAX) { ++ if (iter->pos.inode == KEY_INODE_MAX) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ } ++ ++ /* ++ * iterator is now at the correct position for inserting at iter->pos, ++ * but we need to keep iterating until we find the first non whiteout so ++ * we know how big a hole we have, if any: ++ */ ++ ++ node_iter = l->iter; ++ k = __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_peek(&node_iter, l->b)); ++ ++ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { ++ /* ++ * We're not setting iter->uptodate because the node iterator ++ * doesn't necessarily point at the key we're returning: ++ */ ++ ++ EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); ++ bch2_btree_iter_verify_level(iter, 0); ++ return k; ++ } ++ ++ /* hole */ ++ ++ if (!k.k) ++ k.k = &l->b->key.k; ++ ++ bkey_init(&n); ++ n.p = iter->pos; ++ bch2_key_resize(&n, ++ min_t(u64, KEY_SIZE_MAX, ++ (k.k->p.inode == n.p.inode ++ ? 
bkey_start_offset(k.k) ++ : KEY_OFFSET_MAX) - ++ n.p.offset)); ++ ++ EBUG_ON(!n.size); ++ ++ iter->k = n; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify_level(iter, 0); ++ return (struct bkey_s_c) { &iter->k, NULL }; ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return btree_iter_peek_uptodate(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return __bch2_btree_iter_peek_slot_extents(iter); ++ ++ k = __btree_iter_peek_all(iter, l, &iter->k); ++ ++ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); ++ ++ if (!k.k || bkey_cmp(iter->pos, k.k->p)) { ++ /* hole */ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos; ++ k = (struct bkey_s_c) { &iter->k, NULL }; ++ } ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ bch2_btree_iter_verify_level(iter, 0); ++ return k; ++} ++ ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) ++{ ++ struct bkey_cached *ck; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); ++ bch2_btree_iter_checks(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ ck = (void *) iter->l[0].b; ++ ++ EBUG_ON(iter->btree_id != ck->key.btree_id || ++ bkey_cmp(iter->pos, ck->key.pos)); ++ BUG_ON(!ck->valid); ++ ++ return bkey_i_to_s_c(ck->k); ++} ++ ++static inline void bch2_btree_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned i; ++ ++ if (btree_node_type_is_extents(btree_id) && ++ !(flags & BTREE_ITER_NODES)) ++ flags |= BTREE_ITER_IS_EXTENTS; ++ ++ iter->trans = trans; ++ iter->pos = pos; ++ bkey_init(&iter->k); ++ iter->k.p = pos; ++ iter->flags = flags; ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ iter->btree_id = btree_id; ++ iter->level = 0; ++ iter->min_depth = 0; ++ iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; ++ iter->nodes_locked = 0; ++ iter->nodes_intent_locked = 0; ++ for (i = 0; i < ARRAY_SIZE(iter->l); i++) ++ iter->l[i].b = BTREE_ITER_NO_NODE_INIT; ++ ++ prefetch(c->btree_roots[btree_id].b); ++} ++ ++/* new transactional stuff: */ ++ ++static inline void __bch2_trans_iter_free(struct btree_trans *trans, ++ unsigned idx) ++{ ++ __bch2_btree_iter_unlock(&trans->iters[idx]); ++ trans->iters_linked &= ~(1ULL << idx); ++ trans->iters_live &= ~(1ULL << idx); ++ trans->iters_touched &= ~(1ULL << idx); ++} ++ ++int bch2_trans_iter_put(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ int ret; ++ ++ if (IS_ERR_OR_NULL(iter)) ++ return 0; ++ ++ BUG_ON(trans->iters + iter->idx != iter); ++ ++ ret = btree_iter_err(iter); ++ ++ if (!(trans->iters_touched & (1ULL << iter->idx)) && ++ !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) ++ __bch2_trans_iter_free(trans, iter->idx); ++ ++ trans->iters_live &= ~(1ULL << iter->idx); ++ return ret; ++} ++ ++int bch2_trans_iter_free(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ if (IS_ERR_OR_NULL(iter)) ++ return 0; ++ ++ trans->iters_touched &= ~(1ULL << iter->idx); ++ ++ return bch2_trans_iter_put(trans, iter); ++} ++ ++static int bch2_trans_realloc_iters(struct btree_trans *trans, ++ unsigned new_size) ++{ ++ void *p, *new_iters, *new_updates, *new_updates2; ++ size_t iters_bytes; ++ size_t updates_bytes; ++ ++ new_size = roundup_pow_of_two(new_size); ++ ++ BUG_ON(new_size > BTREE_ITER_MAX); ++ ++ if (new_size <= trans->size) ++ return 0; ++ ++ BUG_ON(trans->used_mempool); ++ ++ bch2_trans_unlock(trans); ++ ++ iters_bytes = sizeof(struct btree_iter) * new_size; ++ updates_bytes = sizeof(struct btree_insert_entry) * new_size; ++ ++ p = kmalloc(iters_bytes + ++ updates_bytes + ++ updates_bytes, GFP_NOFS); ++ if (p) ++ goto success; ++ ++ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); ++ new_size = BTREE_ITER_MAX; ++ ++ trans->used_mempool = true; ++success: ++ new_iters = p; p += iters_bytes; ++ new_updates = p; p += updates_bytes; ++ new_updates2 = p; p += updates_bytes; ++ ++ memcpy(new_iters, trans->iters, ++ sizeof(struct btree_iter) * trans->nr_iters); ++ memcpy(new_updates, trans->updates, ++ sizeof(struct btree_insert_entry) * trans->nr_updates); ++ memcpy(new_updates2, trans->updates2, ++ sizeof(struct btree_insert_entry) * trans->nr_updates2); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ memset(trans->iters, POISON_FREE, ++ sizeof(struct btree_iter) * trans->nr_iters + ++ sizeof(struct btree_insert_entry) * trans->nr_iters); ++ ++ if (trans->iters != trans->iters_onstack) ++ kfree(trans->iters); ++ ++ trans->iters = new_iters; ++ trans->updates = new_updates; ++ trans->updates2 = new_updates2; ++ trans->size = new_size; ++ ++ if (trans->iters_live) { ++ trace_trans_restart_iters_realloced(trans->ip, trans->size); ++ return -EINTR; ++ } ++ ++ return 0; ++} ++ ++static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) ++{ ++ unsigned idx = __ffs64(~trans->iters_linked); ++ ++ if (idx < trans->nr_iters) ++ goto got_slot; ++ ++ if (trans->nr_iters == trans->size) { ++ int ret; ++ ++ if (trans->nr_iters >= BTREE_ITER_MAX) { ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) { ++ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", ++ bch2_btree_ids[iter->btree_id], ++ iter->pos.inode, ++ iter->pos.offset, ++ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", ++ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", ++ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", ++ (void *) iter->ip_allocated); ++ } ++ ++ panic("trans iter oveflow\n"); ++ } ++ ++ ret = bch2_trans_realloc_iters(trans, trans->size * 2); ++ if (ret) ++ return ERR_PTR(ret); ++ } ++ ++ idx = trans->nr_iters++; ++ BUG_ON(trans->nr_iters > trans->size); ++ ++ trans->iters[idx].idx = idx; ++got_slot: ++ BUG_ON(trans->iters_linked & (1ULL << idx)); ++ trans->iters_linked |= 1ULL << idx; ++ trans->iters[idx].flags = 0; ++ return &trans->iters[idx]; ++} ++ ++static inline void btree_iter_copy(struct btree_iter *dst, ++ struct btree_iter *src) ++{ ++ unsigned i, idx = dst->idx; ++ ++ *dst = *src; ++ dst->idx = idx; ++ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ if (btree_node_locked(dst, i)) ++ six_lock_increment(&dst->l[i].b->c.lock, ++ __btree_lock_want(dst, i)); ++ ++ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; ++} ++ ++static inline struct bpos bpos_diff(struct bpos l, struct bpos r) ++{ ++ if (bkey_cmp(l, r) > 0) ++ swap(l, r); ++ ++ return POS(r.inode - l.inode, r.offset - l.offset); ++} ++ ++static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, ++ unsigned btree_id, struct bpos pos, ++ unsigned flags) ++{ ++ struct btree_iter *iter, *best = NULL; ++ ++ BUG_ON(trans->nr_iters > BTREE_ITER_MAX); ++ ++ trans_for_each_iter(trans, iter) { ++ if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) ++ continue; ++ ++ if (iter->btree_id != btree_id) ++ continue; ++ ++ if (best && ++ bkey_cmp(bpos_diff(best->pos, pos), ++ bpos_diff(iter->pos, pos)) < 0) ++ continue; ++ ++ best = iter; ++ } ++ ++ if (!best) { ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ bch2_btree_iter_init(trans, iter, btree_id, pos, flags); ++ } else if ((trans->iters_live & (1ULL << best->idx)) || ++ (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ btree_iter_copy(iter, best); ++ } else { ++ iter = best; ++ } ++ ++ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ iter->flags &= ~BTREE_ITER_USER_FLAGS; ++ iter->flags |= flags & BTREE_ITER_USER_FLAGS; ++ ++ if (iter->flags & BTREE_ITER_INTENT) ++ bch2_btree_iter_upgrade(iter, 1); ++ else ++ bch2_btree_iter_downgrade(iter); ++ ++ BUG_ON(iter->btree_id != btree_id); ++ BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); ++ BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); ++ BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); ++ BUG_ON(trans->iters_live & (1ULL << iter->idx)); ++ ++ trans->iters_live |= 1ULL << iter->idx; ++ trans->iters_touched |= 1ULL << iter->idx; ++ ++ return iter; ++} ++ ++struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct btree_iter *iter = ++ __btree_trans_get_iter(trans, btree_id, pos, flags); ++ ++ if (!IS_ERR(iter)) ++ __bch2_btree_iter_set_pos(iter, pos, ++ btree_node_type_is_extents(btree_id)); ++ return iter; ++} ++ ++struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, ++ unsigned flags) ++{ ++ struct btree_iter *iter = ++ __btree_trans_get_iter(trans, btree_id, pos, ++ flags|BTREE_ITER_NODES); ++ unsigned i; ++ ++ BUG_ON(IS_ERR(iter)); ++ BUG_ON(bkey_cmp(iter->pos, pos)); ++ ++ iter->locks_want = locks_want; ++ iter->level = depth; ++ iter->min_depth = depth; ++ ++ for (i = 0; i < 
ARRAY_SIZE(iter->l); i++) ++ iter->l[i].b = NULL; ++ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; ++ ++ return iter; ++} ++ ++struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, ++ struct btree_iter *src) ++{ ++ struct btree_iter *iter; ++ ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ btree_iter_copy(iter, src); ++ ++ trans->iters_live |= 1ULL << iter->idx; ++ /* ++ * We don't need to preserve this iter since it's cheap to copy it ++ * again - this will cause trans_iter_put() to free it right away: ++ */ ++ trans->iters_touched &= ~(1ULL << iter->idx); ++ ++ return iter; ++} ++ ++static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) ++{ ++ if (size > trans->mem_bytes) { ++ size_t old_bytes = trans->mem_bytes; ++ size_t new_bytes = roundup_pow_of_two(size); ++ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); ++ ++ if (!new_mem) ++ return -ENOMEM; ++ ++ trans->mem = new_mem; ++ trans->mem_bytes = new_bytes; ++ ++ if (old_bytes) { ++ trace_trans_restart_mem_realloced(trans->ip, new_bytes); ++ return -EINTR; ++ } ++ } ++ ++ return 0; ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) ++{ ++ void *p; ++ int ret; ++ ++ ret = bch2_trans_preload_mem(trans, trans->mem_top + size); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ p = trans->mem + trans->mem_top; ++ trans->mem_top += size; ++ return p; ++} ++ ++inline void bch2_trans_unlink_iters(struct btree_trans *trans) ++{ ++ u64 iters = trans->iters_linked & ++ ~trans->iters_touched & ++ ~trans->iters_live; ++ ++ while (iters) { ++ unsigned idx = __ffs64(iters); ++ ++ iters &= ~(1ULL << idx); ++ __bch2_trans_iter_free(trans, idx); ++ } ++} ++ ++void bch2_trans_reset(struct btree_trans *trans, unsigned flags) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| ++ BTREE_ITER_SET_POS_AFTER_COMMIT); ++ ++ bch2_trans_unlink_iters(trans); ++ ++ trans->iters_touched &= trans->iters_live; ++ ++ trans->need_reset = 0; ++ trans->nr_updates = 0; ++ trans->nr_updates2 = 0; ++ trans->mem_top = 0; ++ ++ trans->extra_journal_entries = NULL; ++ trans->extra_journal_entry_u64s = 0; ++ ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset(&trans->fs_usage_deltas->memset_start, 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } ++ ++ if (!(flags & TRANS_RESET_NOTRAVERSE)) ++ bch2_btree_iter_traverse_all(trans); ++} ++ ++void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, ++ unsigned expected_nr_iters, ++ size_t expected_mem_bytes) ++{ ++ memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); ++ ++ /* ++ * reallocating iterators currently completely breaks ++ * bch2_trans_iter_put(): ++ */ ++ expected_nr_iters = BTREE_ITER_MAX; ++ ++ trans->c = c; ++ trans->ip = _RET_IP_; ++ trans->size = ARRAY_SIZE(trans->iters_onstack); ++ trans->iters = trans->iters_onstack; ++ trans->updates = trans->updates_onstack; ++ trans->updates2 = trans->updates2_onstack; ++ trans->fs_usage_deltas = NULL; ++ ++ if (expected_nr_iters > trans->size) ++ bch2_trans_realloc_iters(trans, expected_nr_iters); ++ ++ if (expected_mem_bytes) ++ bch2_trans_preload_mem(trans, expected_mem_bytes); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->pid = current->pid; ++ mutex_lock(&c->btree_trans_lock); ++ list_add(&trans->list, &c->btree_trans_list); ++ mutex_unlock(&c->btree_trans_lock); ++#endif ++} ++ ++int 
bch2_trans_exit(struct btree_trans *trans) ++{ ++ bch2_trans_unlock(trans); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ mutex_lock(&trans->c->btree_trans_lock); ++ list_del(&trans->list); ++ mutex_unlock(&trans->c->btree_trans_lock); ++#endif ++ ++ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ ++ kfree(trans->fs_usage_deltas); ++ kfree(trans->mem); ++ if (trans->used_mempool) ++ mempool_free(trans->iters, &trans->c->btree_iters_pool); ++ else if (trans->iters != trans->iters_onstack) ++ kfree(trans->iters); ++ trans->mem = (void *) 0x1; ++ trans->iters = (void *) 0x1; ++ ++ return trans->error ? -EIO : 0; ++} ++ ++static void bch2_btree_iter_node_to_text(struct printbuf *out, ++ struct btree_bkey_cached_common *_b, ++ enum btree_iter_type type) ++{ ++ pr_buf(out, " %px l=%u %s:", ++ _b, _b->level, bch2_btree_ids[_b->btree_id]); ++ bch2_bpos_to_text(out, btree_node_pos(_b, type)); ++} ++ ++void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree_trans *trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned l; ++ ++ mutex_lock(&c->btree_trans_lock); ++ list_for_each_entry(trans, &c->btree_trans_list, list) { ++ pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); ++ ++ trans_for_each_iter(trans, iter) { ++ if (!iter->nodes_locked) ++ continue; ++ ++ pr_buf(out, " iter %u %s:", ++ iter->idx, ++ bch2_btree_ids[iter->btree_id]); ++ bch2_bpos_to_text(out, iter->pos); ++ pr_buf(out, "\n"); ++ ++ for (l = 0; l < BTREE_MAX_DEPTH; l++) { ++ if (btree_node_locked(iter, l)) { ++ pr_buf(out, " %s l=%u ", ++ btree_node_intent_locked(iter, l) ? "i" : "r", l); ++ bch2_btree_iter_node_to_text(out, ++ (void *) iter->l[l].b, ++ btree_iter_type(iter)); ++ pr_buf(out, "\n"); ++ } ++ } ++ } ++ ++ b = READ_ONCE(trans->locking); ++ if (b) { ++ pr_buf(out, " locking iter %u l=%u %s:", ++ trans->locking_iter_idx, ++ trans->locking_level, ++ bch2_btree_ids[trans->locking_btree_id]); ++ bch2_bpos_to_text(out, trans->locking_pos); ++ ++ ++ pr_buf(out, " node "); ++ bch2_btree_iter_node_to_text(out, ++ (void *) b, ++ btree_iter_type(&trans->iters[trans->locking_iter_idx])); ++ pr_buf(out, "\n"); ++ } ++ } ++ mutex_unlock(&c->btree_trans_lock); ++#endif ++} ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *c) ++{ ++ mempool_exit(&c->btree_iters_pool); ++} ++ ++int bch2_fs_btree_iter_init(struct bch_fs *c) ++{ ++ unsigned nr = BTREE_ITER_MAX; ++ ++ INIT_LIST_HEAD(&c->btree_trans_list); ++ mutex_init(&c->btree_trans_lock); ++ ++ return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, ++ sizeof(struct btree_iter) * nr + ++ sizeof(struct btree_insert_entry) * nr + ++ sizeof(struct btree_insert_entry) * nr); ++} +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +new file mode 100644 +index 000000000000..bd9ec3ec9a92 +--- /dev/null ++++ b/fs/bcachefs/btree_iter.h +@@ -0,0 +1,314 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_ITER_H ++#define _BCACHEFS_BTREE_ITER_H ++ ++#include "bset.h" ++#include "btree_types.h" ++ ++static inline void btree_iter_set_dirty(struct btree_iter *iter, ++ enum btree_iter_uptodate u) ++{ ++ iter->uptodate = max_t(unsigned, iter->uptodate, u); ++} ++ ++static inline struct btree *btree_iter_node(struct btree_iter *iter, ++ unsigned level) ++{ ++ return level < BTREE_MAX_DEPTH ? 
iter->l[level].b : NULL; ++} ++ ++static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, ++ const struct btree *b, unsigned level) ++{ ++ /* ++ * We don't compare the low bits of the lock sequence numbers because ++ * @iter might have taken a write lock on @b, and we don't want to skip ++ * the linked iterator if the sequence numbers were equal before taking ++ * that write lock. The lock sequence number is incremented by taking ++ * and releasing write locks and is even when unlocked: ++ */ ++ return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; ++} ++ ++static inline struct btree *btree_node_parent(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return btree_iter_node(iter, b->c.level + 1); ++} ++ ++static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) ++{ ++ return hweight64(trans->iters_linked) > 1; ++} ++ ++static inline int btree_iter_err(const struct btree_iter *iter) ++{ ++ return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; ++} ++ ++/* Iterate over iters within a transaction: */ ++ ++#define trans_for_each_iter_all(_trans, _iter) \ ++ for (_iter = (_trans)->iters; \ ++ _iter < (_trans)->iters + (_trans)->nr_iters; \ ++ _iter++) ++ ++static inline struct btree_iter * ++__trans_next_iter(struct btree_trans *trans, unsigned idx) ++{ ++ EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); ++ ++ for (; idx < trans->nr_iters; idx++) ++ if (trans->iters_linked & (1ULL << idx)) ++ return &trans->iters[idx]; ++ ++ return NULL; ++} ++ ++#define trans_for_each_iter(_trans, _iter) \ ++ for (_iter = __trans_next_iter((_trans), 0); \ ++ (_iter); \ ++ _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) ++ ++static inline bool __iter_has_node(const struct btree_iter *iter, ++ const struct btree *b) ++{ ++ return iter->l[b->c.level].b == b && ++ btree_node_lock_seq_matches(iter, b, b->c.level); ++} ++ ++static inline struct btree_iter * ++__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, ++ unsigned idx) ++{ ++ struct btree_iter *iter = __trans_next_iter(trans, idx); ++ ++ while (iter && !__iter_has_node(iter, b)) ++ iter = __trans_next_iter(trans, iter->idx + 1); ++ ++ return iter; ++} ++ ++#define trans_for_each_iter_with_node(_trans, _b, _iter) \ ++ for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ ++ (_iter); \ ++ _iter = __trans_next_iter_with_node((_trans), (_b), \ ++ (_iter)->idx + 1)) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); ++void bch2_btree_trans_verify_locks(struct btree_trans *); ++#else ++static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, ++ struct btree *b) {} ++static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} ++#endif ++ ++void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, ++ struct bkey_packed *); ++void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, ++ struct btree_node_iter *, struct bkey_packed *, ++ unsigned, unsigned); ++ ++bool bch2_btree_iter_relock(struct btree_iter *, bool); ++bool bch2_trans_relock(struct btree_trans *); ++void bch2_trans_unlock(struct btree_trans *); ++ ++bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); ++bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); ++ ++static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); ++ ++ return iter->locks_want < 
new_locks_want ++ ? (!iter->trans->nounlock ++ ? __bch2_btree_iter_upgrade(iter, new_locks_want) ++ : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) ++ : iter->uptodate <= BTREE_ITER_NEED_PEEK; ++} ++ ++void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); ++ ++static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) ++{ ++ if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) ++ __bch2_btree_iter_downgrade(iter, 0); ++} ++ ++void bch2_trans_downgrade(struct btree_trans *); ++ ++void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); ++void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); ++ ++void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); ++ ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *); ++ ++static inline int __must_check ++bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ return iter->uptodate >= BTREE_ITER_NEED_RELOCK ++ ? __bch2_btree_iter_traverse(iter) ++ : 0; ++} ++ ++int bch2_btree_iter_traverse_all(struct btree_trans *); ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *); ++struct btree *bch2_btree_iter_next_node(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); ++ ++void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); ++void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); ++void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); ++ ++static inline int btree_iter_cmp(const struct btree_iter *l, ++ const struct btree_iter *r) ++{ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: ++ bkey_cmp(l->pos, r->pos); ++} ++ ++/* ++ * Unlocks before scheduling ++ * Note: does not revalidate iterator ++ */ ++static inline int bch2_trans_cond_resched(struct btree_trans *trans) ++{ ++ if (need_resched() || race_fault()) { ++ bch2_trans_unlock(trans); ++ schedule(); ++ return bch2_trans_relock(trans) ? 0 : -EINTR; ++ } else { ++ return 0; ++ } ++} ++ ++#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _locks_want, _depth, _flags, _b) \ ++ for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ ++ _start, _locks_want, _depth, _flags), \ ++ _b = bch2_btree_iter_peek_node(_iter); \ ++ (_b); \ ++ (_b) = bch2_btree_iter_next_node(_iter)) ++ ++#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _flags, _b) \ ++ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ 0, 0, _flags, _b) ++ ++static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, ++ unsigned flags) ++{ ++ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) ++ return bch2_btree_iter_peek_cached(iter); ++ else ++ return flags & BTREE_ITER_SLOTS ++ ? bch2_btree_iter_peek_slot(iter) ++ : bch2_btree_iter_peek(iter); ++} ++ ++static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, ++ unsigned flags) ++{ ++ return flags & BTREE_ITER_SLOTS ++ ? 
bch2_btree_iter_next_slot(iter) ++ : bch2_btree_iter_next(iter); ++} ++ ++static inline int bkey_err(struct bkey_s_c k) ++{ ++ return PTR_ERR_OR_ZERO(k.k); ++} ++ ++#define for_each_btree_key(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _ret) \ ++ for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ ++ bch2_trans_get_iter((_trans), (_btree_id), \ ++ (_start), (_flags))) ?: \ ++ PTR_ERR_OR_ZERO(((_k) = \ ++ __bch2_btree_iter_peek(_iter, _flags)).k); \ ++ !_ret && (_k).k; \ ++ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ ++ __bch2_btree_iter_next(_iter, _flags)).k)) ++ ++#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ ++ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ (_k) = __bch2_btree_iter_next(_iter, _flags)) ++ ++/* new multiple iterator interface: */ ++ ++int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); ++int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); ++ ++void bch2_trans_unlink_iters(struct btree_trans *); ++ ++struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, ++ struct bpos, unsigned); ++ ++static inline struct btree_iter * ++bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct btree_iter *iter = ++ __bch2_trans_get_iter(trans, btree_id, pos, flags); ++ ++ if (!IS_ERR(iter)) ++ iter->ip_allocated = _THIS_IP_; ++ return iter; ++} ++ ++struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, ++ struct btree_iter *); ++static inline struct btree_iter * ++bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) ++{ ++ struct btree_iter *iter = ++ __bch2_trans_copy_iter(trans, src); ++ ++ if (!IS_ERR(iter)) ++ iter->ip_allocated = _THIS_IP_; ++ return iter; ++ ++} ++ ++struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, ++ enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned); ++ ++#define TRANS_RESET_NOTRAVERSE (1 << 0) ++ ++void bch2_trans_reset(struct btree_trans *, unsigned); ++ ++static inline void bch2_trans_begin(struct btree_trans *trans) ++{ ++ return bch2_trans_reset(trans, 0); ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *, size_t); ++void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); ++int bch2_trans_exit(struct btree_trans *); ++ ++void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *); ++int bch2_fs_btree_iter_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_ITER_H */ +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +new file mode 100644 +index 000000000000..61662750dfc0 +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.c +@@ -0,0 +1,519 @@ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++ ++#include ++ ++static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct bkey_cached *ck = obj; ++ const struct bkey_cached_key *key = arg->key; ++ ++ return cmp_int(ck->key.btree_id, key->btree_id) ?: ++ bkey_cmp(ck->key.pos, key->pos); ++} ++ ++static const struct rhashtable_params bch2_btree_key_cache_params = { ++ .head_offset = offsetof(struct bkey_cached, hash), ++ .key_offset = offsetof(struct bkey_cached, key), ++ .key_len = sizeof(struct bkey_cached_key), ++ .obj_cmpfn = 
bch2_btree_key_cache_cmp_fn, ++}; ++ ++__flatten ++static inline struct bkey_cached * ++btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) ++{ ++ struct bkey_cached_key key = { ++ .btree_id = btree_id, ++ .pos = pos, ++ }; ++ ++ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, ++ bch2_btree_key_cache_params); ++} ++ ++static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) ++{ ++ if (!six_trylock_intent(&ck->c.lock)) ++ return false; ++ ++ if (!six_trylock_write(&ck->c.lock)) { ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void bkey_cached_evict(struct btree_key_cache *c, ++ struct bkey_cached *ck) ++{ ++ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, ++ bch2_btree_key_cache_params)); ++ memset(&ck->key, ~0, sizeof(ck->key)); ++} ++ ++static void bkey_cached_free(struct btree_key_cache *c, ++ struct bkey_cached *ck) ++{ ++ list_move(&ck->list, &c->freed); ++ ++ kfree(ck->k); ++ ck->k = NULL; ++ ck->u64s = 0; ++ ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++} ++ ++static struct bkey_cached * ++bkey_cached_alloc(struct btree_key_cache *c) ++{ ++ struct bkey_cached *ck; ++ ++ list_for_each_entry(ck, &c->freed, list) ++ if (bkey_cached_lock_for_evict(ck)) ++ return ck; ++ ++ list_for_each_entry(ck, &c->clean, list) ++ if (bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(c, ck); ++ return ck; ++ } ++ ++ ck = kzalloc(sizeof(*ck), GFP_NOFS); ++ if (!ck) ++ return NULL; ++ ++ INIT_LIST_HEAD(&ck->list); ++ six_lock_init(&ck->c.lock); ++ BUG_ON(!six_trylock_intent(&ck->c.lock)); ++ BUG_ON(!six_trylock_write(&ck->c.lock)); ++ ++ return ck; ++} ++ ++static struct bkey_cached * ++btree_key_cache_create(struct btree_key_cache *c, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct bkey_cached *ck; ++ ++ ck = bkey_cached_alloc(c); ++ if (!ck) ++ return ERR_PTR(-ENOMEM); ++ ++ ck->c.level = 0; ++ ck->c.btree_id = btree_id; ++ ck->key.btree_id = btree_id; ++ ck->key.pos = pos; ++ ck->valid = false; ++ ++ BUG_ON(ck->flags); ++ ++ if (rhashtable_lookup_insert_fast(&c->table, ++ &ck->hash, ++ bch2_btree_key_cache_params)) { ++ /* We raced with another fill: */ ++ bkey_cached_free(c, ck); ++ return NULL; ++ } ++ ++ list_move(&ck->list, &c->clean); ++ six_unlock_write(&ck->c.lock); ++ ++ return ck; ++} ++ ++static int btree_key_cache_fill(struct btree_trans *trans, ++ struct btree_iter *ck_iter, ++ struct bkey_cached *ck) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ unsigned new_u64s = 0; ++ struct bkey_i *new_k = NULL; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, ck->key.btree_id, ++ ck->key.pos, BTREE_ITER_SLOTS); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++ } ++ ++ if (!bch2_btree_node_relock(ck_iter, 0)) { ++ bch2_trans_iter_put(trans, iter); ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ return -EINTR; ++ } ++ ++ if (k.k->u64s > ck->u64s) { ++ new_u64s = roundup_pow_of_two(k.k->u64s); ++ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) { ++ bch2_trans_iter_put(trans, iter); ++ return -ENOMEM; ++ } ++ } ++ ++ bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); ++ if (new_k) { ++ kfree(ck->k); ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ } ++ ++ 
bkey_reassemble(ck->k, k); ++ ck->valid = true; ++ bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); ++ ++ /* We're not likely to need this iterator again: */ ++ bch2_trans_iter_free(trans, iter); ++ ++ return 0; ++} ++ ++static int bkey_cached_check_fn(struct six_lock *lock, void *p) ++{ ++ struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); ++ const struct btree_iter *iter = p; ++ ++ return ck->key.btree_id == iter->btree_id && ++ !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; ++} ++ ++int bch2_btree_iter_traverse_cached(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck; ++ int ret = 0; ++ ++ BUG_ON(iter->level); ++ ++ if (btree_node_locked(iter, 0)) { ++ ck = (void *) iter->l[0].b; ++ goto fill; ++ } ++retry: ++ ck = btree_key_cache_find(c, iter->btree_id, iter->pos); ++ if (!ck) { ++ if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { ++ iter->l[0].b = NULL; ++ return 0; ++ } ++ ++ mutex_lock(&c->btree_key_cache.lock); ++ ck = btree_key_cache_create(&c->btree_key_cache, ++ iter->btree_id, iter->pos); ++ mutex_unlock(&c->btree_key_cache.lock); ++ ++ ret = PTR_ERR_OR_ZERO(ck); ++ if (ret) ++ goto err; ++ if (!ck) ++ goto retry; ++ ++ mark_btree_node_locked(iter, 0, SIX_LOCK_intent); ++ iter->locks_want = 1; ++ } else { ++ enum six_lock_type lock_want = __btree_lock_want(iter, 0); ++ ++ if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, ++ bkey_cached_check_fn, iter)) { ++ if (ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)) { ++ goto retry; ++ } ++ ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ ret = -EINTR; ++ goto err; ++ } ++ ++ if (ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)) { ++ six_unlock_type(&ck->c.lock, lock_want); ++ goto retry; ++ } ++ ++ mark_btree_node_locked(iter, 0, lock_want); ++ } ++ ++ iter->l[0].lock_seq = ck->c.lock.state.seq; ++ iter->l[0].b = (void *) ck; ++fill: ++ if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { ++ if (!btree_node_intent_locked(iter, 0)) ++ bch2_btree_iter_upgrade(iter, 1); ++ if (!btree_node_intent_locked(iter, 0)) { ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ ret = -EINTR; ++ goto err; ++ } ++ ++ ret = btree_key_cache_fill(trans, iter, ck); ++ if (ret) ++ goto err; ++ } ++ ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ bch2_btree_iter_downgrade(iter); ++ return ret; ++err: ++ if (ret != -EINTR) { ++ btree_node_unlock(iter, 0); ++ iter->flags |= BTREE_ITER_ERROR; ++ iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; ++ } ++ return ret; ++} ++ ++static int btree_key_cache_flush_pos(struct btree_trans *trans, ++ struct bkey_cached_key key, ++ u64 journal_seq, ++ bool evict) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct btree_iter *c_iter = NULL, *b_iter = NULL; ++ struct bkey_cached *ck; ++ int ret; ++ ++ b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(b_iter); ++ if (ret) ++ goto out; ++ ++ c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_CACHED_NOCREATE| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(c_iter); ++ if (ret) ++ goto out; ++retry: ++ ret = bch2_btree_iter_traverse(c_iter); ++ if (ret) ++ goto err; ++ ++ ck = (void *) c_iter->l[0].b; ++ if (!ck || ++ (journal_seq && ck->journal.seq != journal_seq)) ++ goto out; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, 
&ck->flags)) { ++ if (!evict) ++ goto out; ++ goto evict; ++ } ++ ++ ret = bch2_btree_iter_traverse(b_iter) ?: ++ bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_JOURNAL_RESERVED| ++ BTREE_INSERT_JOURNAL_RECLAIM); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ BUG_ON(ret && !bch2_journal_error(j)); ++ ++ if (ret) ++ goto out; ++ ++ bch2_journal_pin_drop(j, &ck->journal); ++ bch2_journal_preres_put(j, &ck->res); ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ ++ if (!evict) { ++ mutex_lock(&c->btree_key_cache.lock); ++ list_move_tail(&ck->list, &c->btree_key_cache.clean); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } else { ++evict: ++ BUG_ON(!btree_node_intent_locked(c_iter, 0)); ++ ++ mark_btree_node_unlocked(c_iter, 0); ++ c_iter->l[0].b = NULL; ++ ++ six_lock_write(&ck->c.lock, NULL, NULL); ++ ++ mutex_lock(&c->btree_key_cache.lock); ++ bkey_cached_evict(&c->btree_key_cache, ck); ++ bkey_cached_free(&c->btree_key_cache, ck); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } ++out: ++ bch2_trans_iter_put(trans, b_iter); ++ bch2_trans_iter_put(trans, c_iter); ++ return ret; ++} ++ ++static void btree_key_cache_journal_flush(struct journal *j, ++ struct journal_entry_pin *pin, ++ u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bkey_cached *ck = ++ container_of(pin, struct bkey_cached, journal); ++ struct bkey_cached_key key; ++ struct btree_trans trans; ++ ++ six_lock_read(&ck->c.lock, NULL, NULL); ++ key = ck->key; ++ ++ if (ck->journal.seq != seq || ++ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_read(&ck->c.lock); ++ return; ++ } ++ six_unlock_read(&ck->c.lock); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ btree_key_cache_flush_pos(&trans, key, seq, false); ++ bch2_trans_exit(&trans); ++} ++ ++/* ++ * Flush and evict a key from the key cache: ++ */ ++int bch2_btree_key_cache_flush(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached_key key = { id, pos }; ++ ++ /* Fastpath - assume it won't be found: */ ++ if (!btree_key_cache_find(c, id, pos)) ++ return 0; ++ ++ return btree_key_cache_flush_pos(trans, key, 0, true); ++} ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ ++ BUG_ON(insert->u64s > ck->u64s); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ int difference; ++ ++ BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); ++ ++ difference = jset_u64s(insert->u64s) - ck->res.u64s; ++ if (difference > 0) { ++ trans->journal_preres.u64s -= difference; ++ ck->res.u64s += difference; ++ } ++ } ++ ++ bkey_copy(ck->k, insert); ++ ck->valid = true; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ mutex_lock(&c->btree_key_cache.lock); ++ list_del_init(&ck->list); ++ ++ set_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } ++ ++ bch2_journal_pin_update(&c->journal, trans->journal_res.seq, ++ &ck->journal, btree_key_cache_journal_flush); ++ return true; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) ++{ ++ 
BUG_ON(btree_key_cache_find(trans->c, id, pos)); ++} ++#endif ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) ++{ ++ struct bkey_cached *ck, *n; ++ ++ mutex_lock(&c->lock); ++ list_for_each_entry_safe(ck, n, &c->clean, list) { ++ kfree(ck->k); ++ kfree(ck); ++ } ++ list_for_each_entry_safe(ck, n, &c->freed, list) ++ kfree(ck); ++ mutex_unlock(&c->lock); ++ ++ rhashtable_destroy(&c->table); ++} ++ ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) ++{ ++ mutex_init(&c->lock); ++ INIT_LIST_HEAD(&c->freed); ++ INIT_LIST_HEAD(&c->clean); ++} ++ ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) ++{ ++ return rhashtable_init(&c->table, &bch2_btree_key_cache_params); ++} ++ ++void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) ++{ ++ struct bucket_table *tbl; ++ struct bkey_cached *ck; ++ struct rhash_head *pos; ++ size_t i; ++ ++ mutex_lock(&c->lock); ++ tbl = rht_dereference_rcu(c->table.tbl, &c->table); ++ ++ for (i = 0; i < tbl->size; i++) { ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ pr_buf(out, "%s:", ++ bch2_btree_ids[ck->key.btree_id]); ++ bch2_bpos_to_text(out, ck->key.pos); ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) ++ pr_buf(out, " journal seq %llu", ck->journal.seq); ++ pr_buf(out, "\n"); ++ } ++ } ++ mutex_unlock(&c->lock); ++} +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +new file mode 100644 +index 000000000000..b1756c6c622c +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.h +@@ -0,0 +1,25 @@ ++#ifndef _BCACHEFS_BTREE_KEY_CACHE_H ++#define _BCACHEFS_BTREE_KEY_CACHE_H ++ ++int bch2_btree_iter_traverse_cached(struct btree_iter *); ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *, ++ struct btree_iter *, struct bkey_i *); ++int bch2_btree_key_cache_flush(struct btree_trans *, ++ enum btree_id, struct bpos); ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_key_cache_verify_clean(struct btree_trans *, ++ enum btree_id, struct bpos); ++#else ++static inline void ++bch2_btree_key_cache_verify_clean(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) {} ++#endif ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *); ++ ++void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); ++ ++#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +new file mode 100644 +index 000000000000..81fbf3e18647 +--- /dev/null ++++ b/fs/bcachefs/btree_locking.h +@@ -0,0 +1,257 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_LOCKING_H ++#define _BCACHEFS_BTREE_LOCKING_H ++ ++/* ++ * Only for internal btree use: ++ * ++ * The btree iterator tracks what locks it wants to take, and what locks it ++ * currently has - here we have wrappers for locking/unlocking btree nodes and ++ * updating the iterator state ++ */ ++ ++#include ++ ++#include "btree_iter.h" ++ ++/* matches six lock types */ ++enum btree_node_locked_type { ++ BTREE_NODE_UNLOCKED = -1, ++ BTREE_NODE_READ_LOCKED = SIX_LOCK_read, ++ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, ++}; ++ ++static inline int btree_node_locked_type(struct btree_iter *iter, ++ unsigned level) ++{ ++ /* ++ * We're relying on the fact that if nodes_intent_locked is set ++ * nodes_locked must be set as well, so that we can compute without ++ * branches: ++ */ ++ return BTREE_NODE_UNLOCKED + 
++ ((iter->nodes_locked >> level) & 1) + ++ ((iter->nodes_intent_locked >> level) & 1); ++} ++ ++static inline bool btree_node_intent_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; ++} ++ ++static inline bool btree_node_read_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; ++} ++ ++static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) ++{ ++ return iter->nodes_locked & (1 << level); ++} ++ ++static inline void mark_btree_node_unlocked(struct btree_iter *iter, ++ unsigned level) ++{ ++ iter->nodes_locked &= ~(1 << level); ++ iter->nodes_intent_locked &= ~(1 << level); ++} ++ ++static inline void mark_btree_node_locked(struct btree_iter *iter, ++ unsigned level, ++ enum six_lock_type type) ++{ ++ /* relying on this to avoid a branch */ ++ BUILD_BUG_ON(SIX_LOCK_read != 0); ++ BUILD_BUG_ON(SIX_LOCK_intent != 1); ++ ++ iter->nodes_locked |= 1 << level; ++ iter->nodes_intent_locked |= type << level; ++} ++ ++static inline void mark_btree_node_intent_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ mark_btree_node_locked(iter, level, SIX_LOCK_intent); ++} ++ ++static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) ++{ ++ return level < iter->locks_want ++ ? SIX_LOCK_intent ++ : SIX_LOCK_read; ++} ++ ++static inline enum btree_node_locked_type ++btree_lock_want(struct btree_iter *iter, int level) ++{ ++ if (level < iter->level) ++ return BTREE_NODE_UNLOCKED; ++ if (level < iter->locks_want) ++ return BTREE_NODE_INTENT_LOCKED; ++ if (level == iter->level) ++ return BTREE_NODE_READ_LOCKED; ++ return BTREE_NODE_UNLOCKED; ++} ++ ++static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) ++{ ++ int lock_type = btree_node_locked_type(iter, level); ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ if (lock_type != BTREE_NODE_UNLOCKED) ++ six_unlock_type(&iter->l[level].b->c.lock, lock_type); ++ mark_btree_node_unlocked(iter, level); ++} ++ ++static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) ++{ ++ EBUG_ON(!level && iter->trans->nounlock); ++ ++ __btree_node_unlock(iter, level); ++} ++ ++static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) ++{ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ while (iter->nodes_locked) ++ btree_node_unlock(iter, __ffs(iter->nodes_locked)); ++} ++ ++static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) ++{ ++ switch (type) { ++ case SIX_LOCK_read: ++ return BCH_TIME_btree_lock_contended_read; ++ case SIX_LOCK_intent: ++ return BCH_TIME_btree_lock_contended_intent; ++ case SIX_LOCK_write: ++ return BCH_TIME_btree_lock_contended_write; ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * wrapper around six locks that just traces lock contended time ++ */ ++static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, ++ enum six_lock_type type) ++{ ++ u64 start_time = local_clock(); ++ ++ six_lock_type(&b->c.lock, type, NULL, NULL); ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++} ++ ++static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, ++ enum six_lock_type type) ++{ ++ if (!six_trylock_type(&b->c.lock, type)) ++ __btree_node_lock_type(c, b, type); ++} ++ ++/* ++ * Lock a btree node if we already have it locked on one of our linked ++ * iterators: ++ */ ++static inline bool 
btree_node_lock_increment(struct btree_trans *trans, ++ struct btree *b, unsigned level, ++ enum btree_node_locked_type want) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ if (iter->l[level].b == b && ++ btree_node_locked_type(iter, level) >= want) { ++ six_lock_increment(&b->c.lock, want); ++ return true; ++ } ++ ++ return false; ++} ++ ++bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, ++ struct btree_iter *, enum six_lock_type, ++ six_lock_should_sleep_fn, void *); ++ ++static inline bool btree_node_lock(struct btree *b, ++ struct bpos pos, unsigned level, ++ struct btree_iter *iter, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ struct btree_trans *trans = iter->trans; ++ bool ret; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->locking = b; ++ trans->locking_iter_idx = iter->idx; ++ trans->locking_pos = pos; ++ trans->locking_btree_id = iter->btree_id; ++ trans->locking_level = level; ++#endif ++ ret = likely(six_trylock_type(&b->c.lock, type)) || ++ btree_node_lock_increment(trans, b, level, type) || ++ __bch2_btree_node_lock(b, pos, level, iter, type, ++ should_sleep_fn, p); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->locking = NULL; ++#endif ++ return ret; ++} ++ ++bool __bch2_btree_node_relock(struct btree_iter *, unsigned); ++ ++static inline bool bch2_btree_node_relock(struct btree_iter *iter, ++ unsigned level) ++{ ++ EBUG_ON(btree_node_locked(iter, level) && ++ btree_node_locked_type(iter, level) != ++ __btree_lock_want(iter, level)); ++ ++ return likely(btree_node_locked(iter, level)) || ++ __bch2_btree_node_relock(iter, level); ++} ++ ++/* ++ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will ++ * succeed: ++ */ ++static inline void ++bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ ++ EBUG_ON(iter->l[b->c.level].b != b); ++ EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ linked->l[b->c.level].lock_seq += 2; ++ ++ six_unlock_write(&b->c.lock); ++} ++ ++void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); ++ ++void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); ++ ++static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) ++{ ++ EBUG_ON(iter->l[b->c.level].b != b); ++ EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); ++ ++ if (unlikely(!six_trylock_write(&b->c.lock))) ++ __bch2_btree_node_lock_write(b, iter); ++} ++ ++#endif /* _BCACHEFS_BTREE_LOCKING_H */ ++ ++ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +new file mode 100644 +index 000000000000..683b416ef427 +--- /dev/null ++++ b/fs/bcachefs/btree_types.h +@@ -0,0 +1,664 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_TYPES_H ++#define _BCACHEFS_BTREE_TYPES_H ++ ++#include ++#include ++#include ++ ++#include "bkey_methods.h" ++#include "buckets_types.h" ++#include "journal_types.h" ++ ++struct open_bucket; ++struct btree_update; ++struct btree_trans; ++ ++#define MAX_BSETS 3U ++ ++struct btree_nr_keys { ++ ++ /* ++ * Amount of live metadata (i.e. 
size of node after a compaction) in ++ * units of u64s ++ */ ++ u16 live_u64s; ++ u16 bset_u64s[MAX_BSETS]; ++ ++ /* live keys only: */ ++ u16 packed_keys; ++ u16 unpacked_keys; ++}; ++ ++struct bset_tree { ++ /* ++ * We construct a binary tree in an array as if the array ++ * started at 1, so that things line up on the same cachelines ++ * better: see comments in bset.c at cacheline_to_bkey() for ++ * details ++ */ ++ ++ /* size of the binary tree and prev array */ ++ u16 size; ++ ++ /* function of size - precalculated for to_inorder() */ ++ u16 extra; ++ ++ u16 data_offset; ++ u16 aux_data_offset; ++ u16 end_offset; ++ ++ struct bpos max_key; ++}; ++ ++struct btree_write { ++ struct journal_entry_pin journal; ++}; ++ ++struct btree_alloc { ++ struct open_buckets ob; ++ BKEY_PADDED(k); ++}; ++ ++struct btree_bkey_cached_common { ++ struct six_lock lock; ++ u8 level; ++ u8 btree_id; ++}; ++ ++struct btree { ++ struct btree_bkey_cached_common c; ++ ++ struct rhash_head hash; ++ u64 hash_val; ++ ++ unsigned long flags; ++ u16 written; ++ u8 nsets; ++ u8 nr_key_bits; ++ ++ struct bkey_format format; ++ ++ struct btree_node *data; ++ void *aux_data; ++ ++ /* ++ * Sets of sorted keys - the real btree node - plus a binary search tree ++ * ++ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point ++ * to the memory we have allocated for this btree node. Additionally, ++ * set[0]->data points to the entire btree node as it exists on disk. ++ */ ++ struct bset_tree set[MAX_BSETS]; ++ ++ struct btree_nr_keys nr; ++ u16 sib_u64s[2]; ++ u16 whiteout_u64s; ++ u8 byte_order; ++ u8 unpack_fn_len; ++ ++ /* ++ * XXX: add a delete sequence number, so when bch2_btree_node_relock() ++ * fails because the lock sequence number has changed - i.e. the ++ * contents were modified - we can still relock the node if it's still ++ * the one we want, without redoing the traversal ++ */ ++ ++ /* ++ * For asynchronous splits/interior node updates: ++ * When we do a split, we allocate new child nodes and update the parent ++ * node to point to them: we update the parent in memory immediately, ++ * but then we must wait until the children have been written out before ++ * the update to the parent can be written - this is a list of the ++ * btree_updates that are blocking this node from being ++ * written: ++ */ ++ struct list_head write_blocked; ++ ++ /* ++ * Also for asynchronous splits/interior node updates: ++ * If a btree node isn't reachable yet, we don't want to kick off ++ * another write - because that write also won't yet be reachable and ++ * marking it as completed before it's reachable would be incorrect: ++ */ ++ unsigned long will_make_reachable; ++ ++ struct open_buckets ob; ++ ++ /* lru list */ ++ struct list_head list; ++ ++ struct btree_write writes[2]; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ bool *expensive_debug_checks; ++#endif ++ ++ /* Key/pointer for this btree node */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++}; ++ ++struct btree_cache { ++ struct rhashtable table; ++ bool table_init_done; ++ /* ++ * We never free a struct btree, except on shutdown - we just put it on ++ * the btree_cache_freed list and reuse it later. This simplifies the ++ * code, and it doesn't cost us much memory as the memory usage is ++ * dominated by buffers that hold the actual btree node data and those ++ * can be freed - and the number of struct btrees allocated is ++ * effectively bounded. 
++ * ++ * btree_cache_freeable effectively is a small cache - we use it because ++ * high order page allocations can be rather expensive, and it's quite ++ * common to delete and allocate btree nodes in quick succession. It ++ * should never grow past ~2-3 nodes in practice. ++ */ ++ struct mutex lock; ++ struct list_head live; ++ struct list_head freeable; ++ struct list_head freed; ++ ++ /* Number of elements in live + freeable lists */ ++ unsigned used; ++ unsigned reserve; ++ struct shrinker shrink; ++ ++ /* ++ * If we need to allocate memory for a new btree node and that ++ * allocation fails, we can cannibalize another node in the btree cache ++ * to satisfy the allocation - lock to guarantee only one thread does ++ * this at a time: ++ */ ++ struct task_struct *alloc_lock; ++ struct closure_waitlist alloc_wait; ++}; ++ ++struct btree_node_iter { ++ struct btree_node_iter_set { ++ u16 k, end; ++ } data[MAX_BSETS]; ++}; ++ ++enum btree_iter_type { ++ BTREE_ITER_KEYS, ++ BTREE_ITER_NODES, ++ BTREE_ITER_CACHED, ++}; ++ ++#define BTREE_ITER_TYPE ((1 << 2) - 1) ++ ++/* ++ * Iterate over all possible positions, synthesizing deleted keys for holes: ++ */ ++#define BTREE_ITER_SLOTS (1 << 2) ++/* ++ * Indicates that intent locks should be taken on leaf nodes, because we expect ++ * to be doing updates: ++ */ ++#define BTREE_ITER_INTENT (1 << 3) ++/* ++ * Causes the btree iterator code to prefetch additional btree nodes from disk: ++ */ ++#define BTREE_ITER_PREFETCH (1 << 4) ++/* ++ * Indicates that this iterator should not be reused until transaction commit, ++ * either because a pending update references it or because the update depends ++ * on that particular key being locked (e.g. by the str_hash code, for hash ++ * table consistency) ++ */ ++#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) ++/* ++ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for ++ * @pos or the first key strictly greater than @pos ++ */ ++#define BTREE_ITER_IS_EXTENTS (1 << 6) ++#define BTREE_ITER_ERROR (1 << 7) ++#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) ++#define BTREE_ITER_CACHED_NOFILL (1 << 9) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 10) ++ ++#define BTREE_ITER_USER_FLAGS \ ++ (BTREE_ITER_SLOTS \ ++ |BTREE_ITER_INTENT \ ++ |BTREE_ITER_PREFETCH \ ++ |BTREE_ITER_CACHED_NOFILL \ ++ |BTREE_ITER_CACHED_NOCREATE) ++ ++enum btree_iter_uptodate { ++ BTREE_ITER_UPTODATE = 0, ++ BTREE_ITER_NEED_PEEK = 1, ++ BTREE_ITER_NEED_RELOCK = 2, ++ BTREE_ITER_NEED_TRAVERSE = 3, ++}; ++ ++#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) ++#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) ++#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) ++#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) ++#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) ++#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) ++#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) ++ ++/* ++ * @pos - iterator's current position ++ * @level - current btree depth ++ * @locks_want - btree level below which we start taking intent locks ++ * @nodes_locked - bitmask indicating which nodes in @nodes are locked ++ * @nodes_intent_locked - bitmask indicating which locks are intent locks ++ */ ++struct btree_iter { ++ struct btree_trans *trans; ++ struct bpos pos; ++ struct bpos pos_after_commit; ++ ++ u16 flags; ++ u8 idx; ++ ++ enum btree_id btree_id:4; ++ enum btree_iter_uptodate uptodate:4; ++ unsigned level:4, ++ min_depth:4, ++ locks_want:4, ++ nodes_locked:4, ++ nodes_intent_locked:4; ++ ++ struct btree_iter_level 
{ ++ struct btree *b; ++ struct btree_node_iter iter; ++ u32 lock_seq; ++ } l[BTREE_MAX_DEPTH]; ++ ++ /* ++ * Current unpacked key - so that bch2_btree_iter_next()/ ++ * bch2_btree_iter_next_slot() can correctly advance pos. ++ */ ++ struct bkey k; ++ unsigned long ip_allocated; ++}; ++ ++static inline enum btree_iter_type ++btree_iter_type(const struct btree_iter *iter) ++{ ++ return iter->flags & BTREE_ITER_TYPE; ++} ++ ++static inline struct btree_iter_level *iter_l(struct btree_iter *iter) ++{ ++ return iter->l + iter->level; ++} ++ ++struct btree_key_cache { ++ struct mutex lock; ++ struct rhashtable table; ++ struct list_head freed; ++ struct list_head clean; ++}; ++ ++struct bkey_cached_key { ++ u32 btree_id; ++ struct bpos pos; ++} __attribute__((packed, aligned(4))); ++ ++#define BKEY_CACHED_DIRTY 0 ++ ++struct bkey_cached { ++ struct btree_bkey_cached_common c; ++ ++ unsigned long flags; ++ u8 u64s; ++ bool valid; ++ struct bkey_cached_key key; ++ ++ struct rhash_head hash; ++ struct list_head list; ++ ++ struct journal_preres res; ++ struct journal_entry_pin journal; ++ ++ struct bkey_i *k; ++}; ++ ++struct btree_insert_entry { ++ unsigned trigger_flags; ++ unsigned trans_triggers_run:1; ++ struct bkey_i *k; ++ struct btree_iter *iter; ++}; ++ ++#ifndef CONFIG_LOCKDEP ++#define BTREE_ITER_MAX 64 ++#else ++#define BTREE_ITER_MAX 32 ++#endif ++ ++struct btree_trans { ++ struct bch_fs *c; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct list_head list; ++ struct btree *locking; ++ unsigned locking_iter_idx; ++ struct bpos locking_pos; ++ u8 locking_btree_id; ++ u8 locking_level; ++ pid_t pid; ++#endif ++ unsigned long ip; ++ ++ u64 iters_linked; ++ u64 iters_live; ++ u64 iters_touched; ++ ++ u8 nr_iters; ++ u8 nr_updates; ++ u8 nr_updates2; ++ u8 size; ++ unsigned used_mempool:1; ++ unsigned error:1; ++ unsigned nounlock:1; ++ unsigned need_reset:1; ++ unsigned in_traverse_all:1; ++ ++ unsigned mem_top; ++ unsigned mem_bytes; ++ void *mem; ++ ++ struct btree_iter *iters; ++ struct btree_insert_entry *updates; ++ struct btree_insert_entry *updates2; ++ ++ /* update path: */ ++ struct jset_entry *extra_journal_entries; ++ unsigned extra_journal_entry_u64s; ++ struct journal_entry_pin *journal_pin; ++ ++ struct journal_res journal_res; ++ struct journal_preres journal_preres; ++ u64 *journal_seq; ++ struct disk_reservation *disk_res; ++ unsigned flags; ++ unsigned journal_u64s; ++ unsigned journal_preres_u64s; ++ struct replicas_delta_list *fs_usage_deltas; ++ ++ struct btree_iter iters_onstack[2]; ++ struct btree_insert_entry updates_onstack[2]; ++ struct btree_insert_entry updates2_onstack[2]; ++}; ++ ++#define BTREE_FLAG(flag) \ ++static inline bool btree_node_ ## flag(struct btree *b) \ ++{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void set_btree_node_ ## flag(struct btree *b) \ ++{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void clear_btree_node_ ## flag(struct btree *b) \ ++{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } ++ ++enum btree_flags { ++ BTREE_NODE_read_in_flight, ++ BTREE_NODE_read_error, ++ BTREE_NODE_dirty, ++ BTREE_NODE_need_write, ++ BTREE_NODE_noevict, ++ BTREE_NODE_write_idx, ++ BTREE_NODE_accessed, ++ BTREE_NODE_write_in_flight, ++ BTREE_NODE_just_written, ++ BTREE_NODE_dying, ++ BTREE_NODE_fake, ++ BTREE_NODE_old_extent_overwrite, ++ BTREE_NODE_need_rewrite, ++}; ++ ++BTREE_FLAG(read_in_flight); ++BTREE_FLAG(read_error); ++BTREE_FLAG(dirty); ++BTREE_FLAG(need_write); ++BTREE_FLAG(noevict); ++BTREE_FLAG(write_idx); 
++BTREE_FLAG(accessed); ++BTREE_FLAG(write_in_flight); ++BTREE_FLAG(just_written); ++BTREE_FLAG(dying); ++BTREE_FLAG(fake); ++BTREE_FLAG(old_extent_overwrite); ++BTREE_FLAG(need_rewrite); ++ ++static inline struct btree_write *btree_current_write(struct btree *b) ++{ ++ return b->writes + btree_node_write_idx(b); ++} ++ ++static inline struct btree_write *btree_prev_write(struct btree *b) ++{ ++ return b->writes + (btree_node_write_idx(b) ^ 1); ++} ++ ++static inline struct bset_tree *bset_tree_last(struct btree *b) ++{ ++ EBUG_ON(!b->nsets); ++ return b->set + b->nsets - 1; ++} ++ ++static inline void * ++__btree_node_offset_to_ptr(const struct btree *b, u16 offset) ++{ ++ return (void *) ((u64 *) b->data + 1 + offset); ++} ++ ++static inline u16 ++__btree_node_ptr_to_offset(const struct btree *b, const void *p) ++{ ++ u16 ret = (u64 *) p - 1 - (u64 *) b->data; ++ ++ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); ++ return ret; ++} ++ ++static inline struct bset *bset(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return __btree_node_offset_to_ptr(b, t->data_offset); ++} ++ ++static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) ++{ ++ t->end_offset = ++ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); ++} ++ ++static inline void set_btree_bset(struct btree *b, struct bset_tree *t, ++ const struct bset *i) ++{ ++ t->data_offset = __btree_node_ptr_to_offset(b, i); ++ set_btree_bset_end(b, t); ++} ++ ++static inline struct bset *btree_bset_first(struct btree *b) ++{ ++ return bset(b, b->set); ++} ++ ++static inline struct bset *btree_bset_last(struct btree *b) ++{ ++ return bset(b, bset_tree_last(b)); ++} ++ ++static inline u16 ++__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) ++{ ++ return __btree_node_ptr_to_offset(b, k); ++} ++ ++static inline struct bkey_packed * ++__btree_node_offset_to_key(const struct btree *b, u16 k) ++{ ++ return __btree_node_offset_to_ptr(b, k); ++} ++ ++static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) ++{ ++ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); ++} ++ ++#define btree_bkey_first(_b, _t) \ ++({ \ ++ EBUG_ON(bset(_b, _t)->start != \ ++ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ ++ \ ++ bset(_b, _t)->start; \ ++}) ++ ++#define btree_bkey_last(_b, _t) \ ++({ \ ++ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ ++ vstruct_last(bset(_b, _t))); \ ++ \ ++ __btree_node_offset_to_key(_b, (_t)->end_offset); \ ++}) ++ ++static inline unsigned bset_u64s(struct bset_tree *t) ++{ ++ return t->end_offset - t->data_offset - ++ sizeof(struct bset) / sizeof(u64); ++} ++ ++static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) ++{ ++ return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; ++} ++ ++static inline unsigned bset_byte_offset(struct btree *b, void *i) ++{ ++ return i - (void *) b->data; ++} ++ ++enum btree_node_type { ++#define x(kwd, val, name) BKEY_TYPE_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BKEY_TYPE_BTREE, ++}; ++ ++/* Type of a key in btree @id at level @level: */ ++static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) ++{ ++ return level ? 
BKEY_TYPE_BTREE : (enum btree_node_type) id; ++} ++ ++/* Type of keys @b contains: */ ++static inline enum btree_node_type btree_node_type(struct btree *b) ++{ ++ return __btree_node_type(b->c.level, b->c.btree_id); ++} ++ ++static inline bool btree_node_type_is_extents(enum btree_node_type type) ++{ ++ switch (type) { ++ case BKEY_TYPE_EXTENTS: ++ case BKEY_TYPE_REFLINK: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool btree_node_is_extents(struct btree *b) ++{ ++ return btree_node_type_is_extents(btree_node_type(b)); ++} ++ ++static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) ++{ ++ return __btree_node_type(iter->level, iter->btree_id); ++} ++ ++static inline bool btree_iter_is_extents(struct btree_iter *iter) ++{ ++ return btree_node_type_is_extents(btree_iter_key_type(iter)); ++} ++ ++#define BTREE_NODE_TYPE_HAS_TRIGGERS \ ++ ((1U << BKEY_TYPE_EXTENTS)| \ ++ (1U << BKEY_TYPE_ALLOC)| \ ++ (1U << BKEY_TYPE_INODES)| \ ++ (1U << BKEY_TYPE_REFLINK)| \ ++ (1U << BKEY_TYPE_EC)| \ ++ (1U << BKEY_TYPE_BTREE)) ++ ++#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ++ ((1U << BKEY_TYPE_EXTENTS)| \ ++ (1U << BKEY_TYPE_INODES)| \ ++ (1U << BKEY_TYPE_REFLINK)) ++ ++enum btree_trigger_flags { ++ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ ++ ++ __BTREE_TRIGGER_INSERT, ++ __BTREE_TRIGGER_OVERWRITE, ++ __BTREE_TRIGGER_OVERWRITE_SPLIT, ++ ++ __BTREE_TRIGGER_GC, ++ __BTREE_TRIGGER_BUCKET_INVALIDATE, ++ __BTREE_TRIGGER_ALLOC_READ, ++ __BTREE_TRIGGER_NOATOMIC, ++}; ++ ++#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) ++ ++#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) ++#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) ++#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) ++ ++#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) ++#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) ++#define BTREE_TRIGGER_ALLOC_READ (1U << __BTREE_TRIGGER_ALLOC_READ) ++#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) ++ ++static inline bool btree_node_type_needs_gc(enum btree_node_type type) ++{ ++ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++} ++ ++struct btree_root { ++ struct btree *b; ++ ++ /* On disk root - see async splits: */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ u8 level; ++ u8 alive; ++ s8 error; ++}; ++ ++/* ++ * Optional hook that will be called just prior to a btree node update, when ++ * we're holding the write lock and we know what key is about to be overwritten: ++ */ ++ ++enum btree_insert_ret { ++ BTREE_INSERT_OK, ++ /* leaf node needs to be split */ ++ BTREE_INSERT_BTREE_NODE_FULL, ++ BTREE_INSERT_ENOSPC, ++ BTREE_INSERT_NEED_MARK_REPLICAS, ++ BTREE_INSERT_NEED_JOURNAL_RES, ++}; ++ ++enum btree_gc_coalesce_fail_reason { ++ BTREE_GC_COALESCE_FAIL_RESERVE_GET, ++ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, ++ BTREE_GC_COALESCE_FAIL_FORMAT_FITS, ++}; ++ ++enum btree_node_sibling { ++ btree_prev_sib, ++ btree_next_sib, ++}; ++ ++typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, ++ struct btree *, ++ struct btree_node_iter *); ++ ++#endif /* _BCACHEFS_BTREE_TYPES_H */ +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +new file mode 100644 +index 000000000000..e0b1bde37484 +--- /dev/null ++++ b/fs/bcachefs/btree_update.h +@@ -0,0 +1,144 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_H ++#define _BCACHEFS_BTREE_UPDATE_H ++ ++#include "btree_iter.h" 
++#include "journal.h" ++ ++struct bch_fs; ++struct btree; ++ ++void bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, ++ struct btree_node_iter *, struct bkey_i *); ++void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); ++ ++enum btree_insert_flags { ++ __BTREE_INSERT_NOUNLOCK, ++ __BTREE_INSERT_NOFAIL, ++ __BTREE_INSERT_NOCHECK_RW, ++ __BTREE_INSERT_LAZY_RW, ++ __BTREE_INSERT_USE_RESERVE, ++ __BTREE_INSERT_USE_ALLOC_RESERVE, ++ __BTREE_INSERT_JOURNAL_REPLAY, ++ __BTREE_INSERT_JOURNAL_RESERVED, ++ __BTREE_INSERT_JOURNAL_RECLAIM, ++ __BTREE_INSERT_NOWAIT, ++ __BTREE_INSERT_GC_LOCK_HELD, ++ __BCH_HASH_SET_MUST_CREATE, ++ __BCH_HASH_SET_MUST_REPLACE, ++}; ++ ++/* ++ * Don't drop locks _after_ successfully updating btree: ++ */ ++#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) ++ ++/* Don't check for -ENOSPC: */ ++#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) ++ ++#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) ++#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) ++ ++/* for copygc, or when merging btree nodes */ ++#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) ++#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) ++ ++/* Insert is for journal replay - don't get journal reservations: */ ++#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) ++ ++/* Indicates that we have pre-reserved space in the journal: */ ++#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) ++ ++/* Insert is being called from journal reclaim path: */ ++#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) ++ ++/* Don't block on allocation failure (for new btree nodes: */ ++#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) ++#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) ++ ++#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) ++#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) ++ ++int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); ++ ++int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); ++int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, ++ struct disk_reservation *, u64 *, int flags); ++ ++int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64 *); ++int bch2_btree_delete_range(struct bch_fs *, enum btree_id, ++ struct bpos, struct bpos, u64 *); ++ ++int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, ++ __le64, unsigned); ++int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, ++ struct btree *, struct bkey_i *); ++ ++int bch2_trans_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_trigger_flags); ++int __bch2_trans_commit(struct btree_trans *); ++ ++/** ++ * bch2_trans_commit - insert keys at given iterator positions ++ * ++ * This is main entry point for btree updates. ++ * ++ * Return values: ++ * -EINTR: locking changed, this function should be called again. 
++ * -EROFS: filesystem read only ++ * -EIO: journal or btree node IO error ++ */ ++static inline int bch2_trans_commit(struct btree_trans *trans, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ unsigned flags) ++{ ++ trans->disk_res = disk_res; ++ trans->journal_seq = journal_seq; ++ trans->flags = flags; ++ ++ return __bch2_trans_commit(trans); ++} ++ ++#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++({ \ ++ int _ret; \ ++ \ ++ while (1) { \ ++ _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ ++ (_journal_seq), (_flags)); \ ++ if (_ret != -EINTR) \ ++ break; \ ++ bch2_trans_reset(_trans, 0); \ ++ } \ ++ \ ++ _ret; \ ++}) ++ ++#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret, _ret2; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ ++ _do); \ ++ _ret2 = bch2_trans_exit(&trans); \ ++ \ ++ _ret ?: _ret2; \ ++}) ++ ++#define trans_for_each_update(_trans, _i) \ ++ for ((_i) = (_trans)->updates; \ ++ (_i) < (_trans)->updates + (_trans)->nr_updates; \ ++ (_i)++) ++ ++#define trans_for_each_update2(_trans, _i) \ ++ for ((_i) = (_trans)->updates2; \ ++ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ ++ (_i)++) ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_H */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +new file mode 100644 +index 000000000000..a2604b0ce2d8 +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.c +@@ -0,0 +1,2075 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "extents.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++/* Debug code: */ ++ ++/* ++ * Verify that child nodes correctly span parent node's range: ++ */ ++static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos next_node = b->data->min_key; ++ struct btree_node_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_btree_ptr_v2 bp; ++ struct bkey unpacked; ++ ++ BUG_ON(!b->c.level); ++ ++ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) ++ return; ++ ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ ++ while (1) { ++ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); ++ if (k.k->type != KEY_TYPE_btree_ptr_v2) ++ break; ++ bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ BUG_ON(bkey_cmp(next_node, bp.v->min_key)); ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (bch2_btree_node_iter_end(&iter)) { ++ BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); ++ break; ++ } ++ ++ next_node = bkey_successor(k.k->p); ++ } ++#endif ++} ++ ++/* Calculate ideal packed bkey format for new btree nodes: */ ++ ++void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) ++{ ++ struct bkey_packed *k; ++ struct bset_tree *t; ++ struct bkey uk; ++ ++ bch2_bkey_format_add_pos(s, b->data->min_key); ++ ++ for_each_bset(b, t) ++ bset_tree_for_each_key(b, t, k) ++ if (!bkey_whiteout(k)) { ++ uk = bkey_unpack_key(b, k); ++ bch2_bkey_format_add_key(s, &uk); ++ } ++} ++ ++static struct bkey_format bch2_btree_calc_format(struct 
btree *b) ++{ ++ struct bkey_format_state s; ++ ++ bch2_bkey_format_init(&s); ++ __bch2_btree_calc_format(&s, b); ++ ++ return bch2_bkey_format_done(&s); ++} ++ ++static size_t btree_node_u64s_with_format(struct btree *b, ++ struct bkey_format *new_f) ++{ ++ struct bkey_format *old_f = &b->format; ++ ++ /* stupid integer promotion rules */ ++ ssize_t delta = ++ (((int) new_f->key_u64s - old_f->key_u64s) * ++ (int) b->nr.packed_keys) + ++ (((int) new_f->key_u64s - BKEY_U64s) * ++ (int) b->nr.unpacked_keys); ++ ++ BUG_ON(delta + b->nr.live_u64s < 0); ++ ++ return b->nr.live_u64s + delta; ++} ++ ++/** ++ * btree_node_format_fits - check if we could rewrite node with a new format ++ * ++ * This assumes all keys can pack with the new format -- it just checks if ++ * the re-packed keys would fit inside the node itself. ++ */ ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, ++ struct bkey_format *new_f) ++{ ++ size_t u64s = btree_node_u64s_with_format(b, new_f); ++ ++ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); ++} ++ ++/* Btree node freeing/allocation: */ ++ ++static void __btree_node_free(struct bch_fs *c, struct btree *b) ++{ ++ trace_btree_node_free(c, b); ++ ++ BUG_ON(btree_node_dirty(b)); ++ BUG_ON(btree_node_need_write(b)); ++ BUG_ON(b == btree_node_root(c, b)); ++ BUG_ON(b->ob.nr); ++ BUG_ON(!list_empty(&b->write_blocked)); ++ BUG_ON(b->will_make_reachable); ++ ++ clear_btree_node_noevict(b); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++} ++ ++void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) ++{ ++ struct open_buckets ob = b->ob; ++ ++ b->ob.nr = 0; ++ ++ clear_btree_node_dirty(b); ++ ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ ++ bch2_open_buckets_put(c, &ob); ++} ++ ++void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter(iter->trans, linked) ++ BUG_ON(linked->l[b->c.level].b == b); ++ ++ six_lock_write(&b->c.lock, NULL, NULL); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, ++ struct disk_reservation *res, ++ struct closure *cl, ++ unsigned flags) ++{ ++ struct write_point *wp; ++ struct btree *b; ++ BKEY_PADDED(k) tmp; ++ struct open_buckets ob = { .nr = 0 }; ++ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; ++ unsigned nr_reserve; ++ enum alloc_reserve alloc_reserve; ++ ++ if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { ++ nr_reserve = 0; ++ alloc_reserve = RESERVE_ALLOC; ++ } else if (flags & BTREE_INSERT_USE_RESERVE) { ++ nr_reserve = BTREE_NODE_RESERVE / 2; ++ alloc_reserve = RESERVE_BTREE; ++ } else { ++ nr_reserve = BTREE_NODE_RESERVE; ++ alloc_reserve = RESERVE_NONE; ++ } ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ if (c->btree_reserve_cache_nr > nr_reserve) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ ob = a->ob; ++ bkey_copy(&tmp.k, &a->k); ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ goto mem_alloc; ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++retry: ++ wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, ++ writepoint_ptr(&c->btree_write_point), ++ &devs_have, ++ res->nr_replicas, ++ 
c->opts.metadata_replicas_required, ++ alloc_reserve, 0, cl); ++ if (IS_ERR(wp)) ++ return ERR_CAST(wp); ++ ++ if (wp->sectors_free < c->opts.btree_node_size) { ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ob->sectors_free < c->opts.btree_node_size) ++ ob->sectors_free = 0; ++ ++ bch2_alloc_sectors_done(c, wp); ++ goto retry; ++ } ++ ++ if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) ++ bkey_btree_ptr_v2_init(&tmp.k); ++ else ++ bkey_btree_ptr_init(&tmp.k); ++ ++ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); ++ ++ bch2_open_bucket_get(c, wp, &ob); ++ bch2_alloc_sectors_done(c, wp); ++mem_alloc: ++ b = bch2_btree_node_mem_alloc(c); ++ ++ /* we hold cannibalize_lock: */ ++ BUG_ON(IS_ERR(b)); ++ BUG_ON(b->ob.nr); ++ ++ bkey_copy(&b->key, &tmp.k); ++ b->ob = ob; ++ ++ return b; ++} ++ ++static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ int ret; ++ ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ BUG_ON(!as->nr_prealloc_nodes); ++ ++ b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ ++ set_btree_node_accessed(b); ++ set_btree_node_dirty(b); ++ set_btree_node_need_write(b); ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ b->c.level = level; ++ b->c.btree_id = as->btree_id; ++ ++ memset(&b->nr, 0, sizeof(b->nr)); ++ b->data->magic = cpu_to_le64(bset_magic(c)); ++ b->data->flags = 0; ++ SET_BTREE_NODE_ID(b->data, as->btree_id); ++ SET_BTREE_NODE_LEVEL(b->data, level); ++ b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); ++ ++ bp->v.mem_ptr = 0; ++ bp->v.seq = b->data->keys.seq; ++ bp->v.sectors_written = 0; ++ bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); ++ } ++ ++ if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) ++ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); ++ ++ if (btree_node_is_extents(b) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { ++ set_btree_node_old_extent_overwrite(b); ++ set_btree_node_need_rewrite(b); ++ } ++ ++ bch2_btree_build_aux_trees(b); ++ ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); ++ BUG_ON(ret); ++ ++ trace_btree_node_alloc(c, b); ++ return b; ++} ++ ++static void btree_set_min(struct btree *b, struct bpos pos) ++{ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; ++ b->data->min_key = pos; ++} ++ ++static void btree_set_max(struct btree *b, struct bpos pos) ++{ ++ b->key.k.p = pos; ++ b->data->max_key = pos; ++} ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b, ++ struct bkey_format format) ++{ ++ struct btree *n; ++ ++ n = bch2_btree_node_alloc(as, b->c.level); ++ ++ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); ++ ++ btree_set_min(n, b->data->min_key); ++ btree_set_max(n, b->data->max_key); ++ ++ n->data->format = format; ++ btree_node_set_format(n, format); ++ ++ bch2_btree_sort_into(as->c, n, b); ++ ++ btree_node_reset_sib_u64s(n); ++ ++ n->key.k.p = b->key.k.p; ++ return n; ++} ++ ++static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bkey_format new_f = bch2_btree_calc_format(b); ++ ++ /* ++ * The keys might expand with the new format - if they wouldn't fit in ++ * the btree node anymore, use the old format for now: ++ */ 
++ if (!bch2_btree_node_format_fits(as->c, b, &new_f)) ++ new_f = b->format; ++ ++ return __bch2_btree_node_alloc_replacement(as, b, new_f); ++} ++ ++static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) ++{ ++ struct btree *b = bch2_btree_node_alloc(as, level); ++ ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, POS_MAX); ++ b->data->format = bch2_btree_calc_format(b); ++ ++ btree_node_set_format(b, b->data->format); ++ bch2_btree_build_aux_trees(b); ++ ++ bch2_btree_update_add_new_node(as, b); ++ six_unlock_write(&b->c.lock); ++ ++ return b; ++} ++ ++static void bch2_btree_reserve_put(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ ++ while (as->nr_prealloc_nodes) { ++ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ ++ six_unlock_write(&b->c.lock); ++ ++ if (c->btree_reserve_cache_nr < ++ ARRAY_SIZE(c->btree_reserve_cache)) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; ++ ++ a->ob = b->ob; ++ b->ob.nr = 0; ++ bkey_copy(&a->k, &b->key); ++ } else { ++ bch2_open_buckets_put(c, &b->ob); ++ } ++ ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ mutex_unlock(&c->btree_reserve_cache_lock); ++} ++ ++static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, ++ unsigned flags, struct closure *cl) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ int ret; ++ ++ BUG_ON(nr_nodes > BTREE_RESERVE_MAX); ++ ++ /* ++ * Protects reaping from the btree node cache and using the btree node ++ * open bucket reserve: ++ */ ++ ret = bch2_btree_cache_cannibalize_lock(c, cl); ++ if (ret) ++ return ret; ++ ++ while (as->nr_prealloc_nodes < nr_nodes) { ++ b = __bch2_btree_node_alloc(c, &as->disk_res, ++ flags & BTREE_INSERT_NOWAIT ++ ? 
NULL : cl, flags); ++ if (IS_ERR(b)) { ++ ret = PTR_ERR(b); ++ goto err_free; ++ } ++ ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); ++ if (ret) ++ goto err_free; ++ ++ as->prealloc_nodes[as->nr_prealloc_nodes++] = b; ++ } ++ ++ bch2_btree_cache_cannibalize_unlock(c); ++ return 0; ++err_free: ++ bch2_btree_cache_cannibalize_unlock(c); ++ trace_btree_reserve_get_fail(c, nr_nodes, cl); ++ return ret; ++} ++ ++/* Asynchronous interior node update machinery */ ++ ++static void bch2_btree_update_free(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ bch2_journal_pin_flush(&c->journal, &as->journal); ++ bch2_disk_reservation_put(c, &as->disk_res); ++ bch2_btree_reserve_put(as); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_del(&as->unwritten_list); ++ list_del(&as->list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ closure_debug_destroy(&as->cl); ++ mempool_free(as, &c->btree_interior_update_pool); ++ ++ closure_wake_up(&c->btree_interior_update_wait); ++} ++ ++static void btree_update_will_delete_key(struct btree_update *as, ++ struct bkey_i *k) ++{ ++ BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > ++ ARRAY_SIZE(as->_old_keys)); ++ bch2_keylist_add(&as->old_keys, k); ++} ++ ++static void btree_update_will_add_key(struct btree_update *as, ++ struct bkey_i *k) ++{ ++ BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > ++ ARRAY_SIZE(as->_new_keys)); ++ bch2_keylist_add(&as->new_keys, k); ++} ++ ++/* ++ * The transactional part of an interior btree node update, where we journal the ++ * update we did to the interior node and update alloc info: ++ */ ++static int btree_update_nodes_written_trans(struct btree_trans *trans, ++ struct btree_update *as) ++{ ++ struct bkey_i *k; ++ int ret; ++ ++ trans->extra_journal_entries = (void *) &as->journal_entries[0]; ++ trans->extra_journal_entry_u64s = as->journal_u64s; ++ trans->journal_pin = &as->journal; ++ ++ for_each_keylist_key(&as->new_keys, k) { ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), ++ 0, 0, BTREE_TRIGGER_INSERT); ++ if (ret) ++ return ret; ++ } ++ ++ for_each_keylist_key(&as->old_keys, k) { ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), ++ 0, 0, BTREE_TRIGGER_OVERWRITE); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void btree_update_nodes_written(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b = as->b; ++ u64 journal_seq = 0; ++ unsigned i; ++ int ret; ++ ++ /* ++ * We did an update to a parent node where the pointers we added pointed ++ * to child nodes that weren't written yet: now, the child nodes have ++ * been written so we can write out the update to the interior node. ++ */ ++ ++ /* ++ * We can't call into journal reclaim here: we'd block on the journal ++ * reclaim lock, but we may need to release the open buckets we have ++ * pinned in order for other btree updates to make forward progress, and ++ * journal reclaim does btree updates when flushing bkey_cached entries, ++ * which may require allocations as well. 
++ */ ++ ret = bch2_trans_do(c, &as->disk_res, &journal_seq, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_JOURNAL_RECLAIM| ++ BTREE_INSERT_JOURNAL_RESERVED, ++ btree_update_nodes_written_trans(&trans, as)); ++ BUG_ON(ret && !bch2_journal_error(&c->journal)); ++ ++ if (b) { ++ /* ++ * @b is the node we did the final insert into: ++ * ++ * On failure to get a journal reservation, we still have to ++ * unblock the write and allow most of the write path to happen ++ * so that shutdown works, but the i->journal_seq mechanism ++ * won't work to prevent the btree write from being visible (we ++ * didn't get a journal sequence number) - instead ++ * __bch2_btree_node_write() doesn't do the actual write if ++ * we're in journal error state: ++ */ ++ ++ btree_node_lock_type(c, b, SIX_LOCK_intent); ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ list_del(&as->write_blocked_list); ++ ++ if (!ret && as->b == b) { ++ struct bset *i = btree_bset_last(b); ++ ++ BUG_ON(!b->c.level); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ i->journal_seq = cpu_to_le64( ++ max(journal_seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, journal_seq); ++ } ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ six_unlock_write(&b->c.lock); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_intent); ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; ++ ++ BUG_ON(b->will_make_reachable != (unsigned long) as); ++ b->will_make_reachable = 0; ++ } ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; ++ ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ btree_node_write_if_need(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->c.lock); ++ } ++ ++ for (i = 0; i < as->nr_open_buckets; i++) ++ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); ++ ++ bch2_btree_update_free(as); ++} ++ ++static void btree_interior_update_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, btree_interior_update_work); ++ struct btree_update *as; ++ ++ while (1) { ++ mutex_lock(&c->btree_interior_update_lock); ++ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, ++ struct btree_update, unwritten_list); ++ if (as && !as->nodes_written) ++ as = NULL; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (!as) ++ break; ++ ++ btree_update_nodes_written(as); ++ } ++} ++ ++static void btree_update_set_nodes_written(struct closure *cl) ++{ ++ struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ as->nodes_written = true; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); ++} ++ ++/* ++ * We're updating @b with pointers to nodes that haven't finished writing yet: ++ * block @b from being written until @as completes ++ */ ++static void btree_update_updated_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ ++ 
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_NODE; ++ as->b = b; ++ list_add(&as->write_blocked_list, &b->write_blocked); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void btree_update_reparent(struct btree_update *as, ++ struct btree_update *child) ++{ ++ struct bch_fs *c = as->c; ++ ++ lockdep_assert_held(&c->btree_interior_update_lock); ++ ++ child->b = NULL; ++ child->mode = BTREE_INTERIOR_UPDATING_AS; ++ ++ /* ++ * When we write a new btree root, we have to drop our journal pin ++ * _before_ the new nodes are technically reachable; see ++ * btree_update_nodes_written(). ++ * ++ * This goes for journal pins that are recursively blocked on us - so, ++ * just transfer the journal pin to the new interior update so ++ * btree_update_nodes_written() can drop it. ++ */ ++ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &child->journal); ++} ++ ++static void btree_update_updated_root(struct btree_update *as, struct btree *b) ++{ ++ struct bkey_i *insert = &b->key; ++ struct bch_fs *c = as->c; ++ ++ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ insert, insert->k.u64s); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_ROOT; ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++/* ++ * bch2_btree_update_add_new_node: ++ * ++ * This causes @as to wait on @b to be written, before it gets to ++ * bch2_btree_update_nodes_written ++ * ++ * Additionally, it sets b->will_make_reachable to prevent any additional writes ++ * to @b from happening besides the first until @b is reachable on disk ++ * ++ * And it adds @b to the list of @as's new nodes, so that we can update sector ++ * counts in bch2_btree_update_nodes_written: ++ */ ++void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ closure_get(&as->cl); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); ++ BUG_ON(b->will_make_reachable); ++ ++ as->new_nodes[as->nr_new_nodes++] = b; ++ b->will_make_reachable = 1UL|(unsigned long) as; ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ btree_update_will_add_key(as, &b->key); ++} ++ ++/* ++ * returns true if @b was a new node ++ */ ++static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_update *as; ++ unsigned long v; ++ unsigned i; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ /* ++ * When b->will_make_reachable != 0, it owns a ref on as->cl that's ++ * dropped when it gets written by bch2_btree_complete_write - the ++ * xchg() is for synchronization with bch2_btree_complete_write: ++ */ ++ v = xchg(&b->will_make_reachable, 0); ++ as = (struct btree_update *) (v & ~1UL); ++ ++ if (!as) { ++ mutex_unlock(&c->btree_interior_update_lock); ++ return; ++ } ++ ++ for (i = 0; i < as->nr_new_nodes; i++) ++ if (as->new_nodes[i] == b) ++ goto found; ++ ++ BUG(); ++found: ++ array_remove_item(as->new_nodes, as->nr_new_nodes, i); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (v & 1) ++ 
closure_put(&as->cl); ++} ++ ++void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) ++{ ++ while (b->ob.nr) ++ as->open_buckets[as->nr_open_buckets++] = ++ b->ob.v[--b->ob.nr]; ++} ++ ++/* ++ * @b is being split/rewritten: it may have pointers to not-yet-written btree ++ * nodes and thus outstanding btree_updates - redirect @b's ++ * btree_updates to point to this btree_update: ++ */ ++void bch2_btree_interior_update_will_free_node(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct btree_update *p, *n; ++ struct btree_write *w; ++ ++ set_btree_node_dying(b); ++ ++ if (btree_node_fake(b)) ++ return; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ /* ++ * Does this node have any btree_update operations preventing ++ * it from being written? ++ * ++ * If so, redirect them to point to this btree_update: we can ++ * write out our new nodes, but we won't make them visible until those ++ * operations complete ++ */ ++ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { ++ list_del_init(&p->write_blocked_list); ++ btree_update_reparent(as, p); ++ ++ /* ++ * for flush_held_btree_writes() waiting on updates to flush or ++ * nodes to be writeable: ++ */ ++ closure_wake_up(&c->btree_interior_update_wait); ++ } ++ ++ clear_btree_node_dirty(b); ++ clear_btree_node_need_write(b); ++ ++ /* ++ * Does this node have unwritten data that has a pin on the journal? ++ * ++ * If so, transfer that pin to the btree_update operation - ++ * note that if we're freeing multiple nodes, we only need to keep the ++ * oldest pin of any of the nodes we're freeing. We'll release the pin ++ * when the new nodes are persistent and reachable on disk: ++ */ ++ w = btree_current_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ w = btree_prev_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * Is this a node that isn't reachable on disk yet? ++ * ++ * Nodes that aren't reachable yet have writes blocked until they're ++ * reachable - now that we've cancelled any pending writes and moved ++ * things waiting on that write to wait on this update, we can drop this ++ * node from the list of nodes that the other update is making ++ * reachable, prior to freeing it: ++ */ ++ btree_update_drop_new_node(c, b); ++ ++ btree_update_will_delete_key(as, &b->key); ++} ++ ++void bch2_btree_update_done(struct btree_update *as) ++{ ++ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); ++ ++ bch2_btree_reserve_put(as); ++ ++ continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); ++} ++ ++struct btree_update * ++bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, ++ unsigned nr_nodes, unsigned flags, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_update *as; ++ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ++ ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) ++ ? 
JOURNAL_RES_GET_RECLAIM : 0; ++ int ret = 0; ++ ++ /* ++ * This check isn't necessary for correctness - it's just to potentially ++ * prevent us from doing a lot of work that'll end up being wasted: ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); ++ memset(as, 0, sizeof(*as)); ++ closure_init(&as->cl, NULL); ++ as->c = c; ++ as->mode = BTREE_INTERIOR_NO_UPDATE; ++ as->btree_id = id; ++ INIT_LIST_HEAD(&as->list); ++ INIT_LIST_HEAD(&as->unwritten_list); ++ INIT_LIST_HEAD(&as->write_blocked_list); ++ bch2_keylist_init(&as->old_keys, as->_old_keys); ++ bch2_keylist_init(&as->new_keys, as->_new_keys); ++ bch2_keylist_init(&as->parent_keys, as->inline_keys); ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags|JOURNAL_RES_GET_NONBLOCK); ++ if (ret == -EAGAIN) { ++ if (flags & BTREE_INSERT_NOUNLOCK) ++ return ERR_PTR(-EINTR); ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ if (!bch2_trans_relock(trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ ret = bch2_disk_reservation_get(c, &as->disk_res, ++ nr_nodes * c->opts.btree_node_size, ++ c->opts.metadata_replicas, ++ disk_res_flags); ++ if (ret) ++ goto err; ++ ++ ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->list, &c->btree_interior_update_list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return as; ++err: ++ bch2_btree_update_free(as); ++ return ERR_PTR(ret); ++} ++ ++/* Btree root updates: */ ++ ++static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) ++{ ++ /* Root nodes cannot be reaped */ ++ mutex_lock(&c->btree_cache.lock); ++ list_del_init(&b->list); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ mutex_lock(&c->btree_root_lock); ++ BUG_ON(btree_node_root(c, b) && ++ (b->c.level < btree_node_root(c, b)->c.level || ++ !btree_node_dying(btree_node_root(c, b)))); ++ ++ btree_node_root(c, b) = b; ++ mutex_unlock(&c->btree_root_lock); ++ ++ bch2_recalc_btree_reserve(c); ++} ++ ++/** ++ * bch_btree_set_root - update the root in memory and on disk ++ * ++ * To ensure forward progress, the current task must not be holding any ++ * btree node write locks. However, you must hold an intent lock on the ++ * old root. ++ * ++ * Note: This allocates a journal entry but doesn't add any keys to ++ * it. All the btree roots are part of every journal write, so there ++ * is nothing new to be done. This just guarantees that there is a ++ * journal write. ++ */ ++static void bch2_btree_set_root(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *old; ++ ++ trace_btree_set_root(c, b); ++ BUG_ON(!b->written && ++ !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); ++ ++ old = btree_node_root(c, b); ++ ++ /* ++ * Ensure no one is using the old root while we switch to the ++ * new root: ++ */ ++ bch2_btree_node_lock_write(old, iter); ++ ++ bch2_btree_set_root_inmem(c, b); ++ ++ btree_update_updated_root(as, b); ++ ++ /* ++ * Unlock old root after new root is visible: ++ * ++ * The new root isn't persistent, but that's ok: we still have ++ * an intent lock on the new root, and any updates that would ++ * depend on the new root would have to update the new root. 
++ */ ++ bch2_btree_node_unlock_write(old, iter); ++} ++ ++/* Interior node updates: */ ++ ++static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct btree_node_iter *node_iter) ++{ ++ struct bkey_packed *k; ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_keys, ++ b->c.btree_id, b->c.level, ++ insert, insert->k.u64s); ++ ++ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && ++ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) ++ bch2_btree_node_iter_advance(node_iter, b); ++ ++ bch2_btree_bset_insert_key(iter, b, node_iter, insert); ++ set_btree_node_dirty(b); ++ set_btree_node_need_write(b); ++} ++ ++/* ++ * Move keys from n1 (original replacement node, now lower node) to n2 (higher ++ * node) ++ */ ++static struct btree *__btree_split_node(struct btree_update *as, ++ struct btree *n1, ++ struct btree_iter *iter) ++{ ++ size_t nr_packed = 0, nr_unpacked = 0; ++ struct btree *n2; ++ struct bset *set1, *set2; ++ struct bkey_packed *k, *prev = NULL; ++ ++ n2 = bch2_btree_node_alloc(as, n1->c.level); ++ bch2_btree_update_add_new_node(as, n2); ++ ++ n2->data->max_key = n1->data->max_key; ++ n2->data->format = n1->format; ++ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); ++ n2->key.k.p = n1->key.k.p; ++ ++ btree_node_set_format(n2, n2->data->format); ++ ++ set1 = btree_bset_first(n1); ++ set2 = btree_bset_first(n2); ++ ++ /* ++ * Has to be a linear search because we don't have an auxiliary ++ * search tree yet ++ */ ++ k = set1->start; ++ while (1) { ++ struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); ++ ++ if (n == vstruct_last(set1)) ++ break; ++ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) ++ break; ++ ++ if (bkey_packed(k)) ++ nr_packed++; ++ else ++ nr_unpacked++; ++ ++ prev = k; ++ k = n; ++ } ++ ++ BUG_ON(!prev); ++ ++ btree_set_max(n1, bkey_unpack_pos(n1, prev)); ++ btree_set_min(n2, bkey_successor(n1->key.k.p)); ++ ++ set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); ++ set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); ++ ++ set_btree_bset_end(n1, n1->set); ++ set_btree_bset_end(n2, n2->set); ++ ++ n2->nr.live_u64s = le16_to_cpu(set2->u64s); ++ n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); ++ n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; ++ n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; ++ ++ n1->nr.live_u64s = le16_to_cpu(set1->u64s); ++ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); ++ n1->nr.packed_keys = nr_packed; ++ n1->nr.unpacked_keys = nr_unpacked; ++ ++ BUG_ON(!set1->u64s); ++ BUG_ON(!set2->u64s); ++ ++ memcpy_u64s(set2->start, ++ vstruct_end(set1), ++ le16_to_cpu(set2->u64s)); ++ ++ btree_node_reset_sib_u64s(n1); ++ btree_node_reset_sib_u64s(n2); ++ ++ bch2_verify_btree_nr_keys(n1); ++ bch2_verify_btree_nr_keys(n2); ++ ++ if (n1->c.level) { ++ btree_node_interior_verify(as->c, n1); ++ btree_node_interior_verify(as->c, n2); ++ } ++ ++ return n2; ++} ++ ++/* ++ * For updates to interior nodes, we've got to do the insert before we split ++ * because the stuff we're inserting has to be inserted atomically. Post split, ++ * the keys might have to go in different nodes and the split would no longer be ++ * atomic. 
++ * ++ * Worse, if the insert is from btree node coalescing, if we do the insert after ++ * we do the split (and pick the pivot) - the pivot we pick might be between ++ * nodes that were coalesced, and thus in the middle of a child node post ++ * coalescing: ++ */ ++static void btree_split_insert_keys(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, ++ struct keylist *keys) ++{ ++ struct btree_node_iter node_iter; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct bkey_packed *src, *dst, *n; ++ struct bset *i; ++ ++ BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); ++ ++ bch2_btree_node_iter_init(&node_iter, b, &k->k.p); ++ ++ while (!bch2_keylist_empty(keys)) { ++ k = bch2_keylist_front(keys); ++ ++ bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); ++ bch2_keylist_pop_front(keys); ++ } ++ ++ /* ++ * We can't tolerate whiteouts here - with whiteouts there can be ++ * duplicate keys, and it would be rather bad if we picked a duplicate ++ * for the pivot: ++ */ ++ i = btree_bset_first(b); ++ src = dst = i->start; ++ while (src != vstruct_last(i)) { ++ n = bkey_next_skip_noops(src, vstruct_last(i)); ++ if (!bkey_deleted(src)) { ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ src = n; ++ } ++ ++ i->u64s = cpu_to_le16((u64 *) dst - i->_data); ++ set_btree_bset_end(b, b->set); ++ ++ BUG_ON(b->nsets != 1 || ++ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); ++ ++ btree_node_interior_verify(as->c, b); ++} ++ ++static void btree_split(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys, ++ unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *parent = btree_node_parent(iter, b); ++ struct btree *n1, *n2 = NULL, *n3 = NULL; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(!parent && (b != btree_node_root(c, b))); ++ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n1 = bch2_btree_node_alloc_replacement(as, b); ++ bch2_btree_update_add_new_node(as, n1); ++ ++ if (keys) ++ btree_split_insert_keys(as, n1, iter, keys); ++ ++ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { ++ trace_btree_split(c, b); ++ ++ n2 = __btree_split_node(as, n1, iter); ++ ++ bch2_btree_build_aux_trees(n2); ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n2->c.lock); ++ six_unlock_write(&n1->c.lock); ++ ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent); ++ ++ /* ++ * Note that on recursive parent_keys == keys, so we ++ * can't start adding new keys to parent_keys before emptying it ++ * out (which we did with btree_split_insert_keys() above) ++ */ ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ bch2_keylist_add(&as->parent_keys, &n2->key); ++ ++ if (!parent) { ++ /* Depth increases, make a new root */ ++ n3 = __btree_root_alloc(as, b->c.level + 1); ++ ++ n3->sib_u64s[0] = U16_MAX; ++ n3->sib_u64s[1] = U16_MAX; ++ ++ btree_split_insert_keys(as, n3, iter, &as->parent_keys); ++ ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent); ++ } ++ } else { ++ trace_btree_compact(c, b); ++ ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n1->c.lock); ++ ++ if (parent) ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ } ++ ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent); ++ ++ /* New nodes all written, now make them visible: */ ++ ++ if (parent) { ++ /* Split a non root node */ ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ } else if (n3) { ++ bch2_btree_set_root(as, n3, iter); ++ } else 
{ ++ /* Root filled up but didn't need to be split */ ++ bch2_btree_set_root(as, n1, iter); ++ } ++ ++ bch2_btree_update_get_open_buckets(as, n1); ++ if (n2) ++ bch2_btree_update_get_open_buckets(as, n2); ++ if (n3) ++ bch2_btree_update_get_open_buckets(as, n3); ++ ++ /* Successful split, update the iterator to point to the new nodes: */ ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ if (n3) ++ bch2_btree_iter_node_replace(iter, n3); ++ if (n2) ++ bch2_btree_iter_node_replace(iter, n2); ++ bch2_btree_iter_node_replace(iter, n1); ++ ++ /* ++ * The old node must be freed (in memory) _before_ unlocking the new ++ * nodes - else another thread could re-acquire a read lock on the old ++ * node after another thread has locked and updated the new node, thus ++ * seeing stale data: ++ */ ++ bch2_btree_node_free_inmem(c, b, iter); ++ ++ if (n3) ++ six_unlock_intent(&n3->c.lock); ++ if (n2) ++ six_unlock_intent(&n2->c.lock); ++ six_unlock_intent(&n1->c.lock); ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], ++ start_time); ++} ++ ++static void ++bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys) ++{ ++ struct btree_iter *linked; ++ struct btree_node_iter node_iter; ++ struct bkey_i *insert = bch2_keylist_front(keys); ++ struct bkey_packed *k; ++ ++ /* Don't screw up @iter's position: */ ++ node_iter = iter->l[b->c.level].iter; ++ ++ /* ++ * btree_split(), btree_gc_coalesce() will insert keys before ++ * the iterator's current position - they know the keys go in ++ * the node the iterator points to: ++ */ ++ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && ++ (bkey_cmp_packed(b, k, &insert->k) >= 0)) ++ ; ++ ++ for_each_keylist_key(keys, insert) ++ bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); ++ ++ btree_update_updated_node(as, b); ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); ++ ++ bch2_btree_trans_verify_iters(iter->trans, b); ++} ++ ++/** ++ * bch_btree_insert_node - insert bkeys into a given btree node ++ * ++ * @iter: btree iterator ++ * @keys: list of keys to insert ++ * @hook: insert callback ++ * @persistent: if not null, @persistent will wait on journal write ++ * ++ * Inserts as many keys as it can into a given btree node, splitting it if full. ++ * If a split occurred, this function will return early. This can only happen ++ * for leaf nodes -- inserts into interior nodes have to be atomic. 
++ */ ++void bch2_btree_insert_node(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys, ++ unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); ++ BUG_ON(!b->c.level); ++ BUG_ON(!as || as->b); ++ bch2_verify_keylist_sorted(keys); ++ ++ if (as->must_rewrite) ++ goto split; ++ ++ bch2_btree_node_lock_for_insert(c, b, iter); ++ ++ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { ++ bch2_btree_node_unlock_write(b, iter); ++ goto split; ++ } ++ ++ bch2_btree_insert_keys_interior(as, b, iter, keys); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ bch2_btree_node_unlock_write(b, iter); ++ ++ btree_node_interior_verify(c, b); ++ ++ /* ++ * when called from the btree_split path the new nodes aren't added to ++ * the btree iterator yet, so the merge path's unlock/wait/relock dance ++ * won't work: ++ */ ++ bch2_foreground_maybe_merge(c, iter, b->c.level, ++ flags|BTREE_INSERT_NOUNLOCK); ++ return; ++split: ++ btree_split(as, b, iter, keys, flags); ++} ++ ++int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, ++ unsigned flags) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b = iter_l(iter)->b; ++ struct btree_update *as; ++ struct closure cl; ++ int ret = 0; ++ struct btree_insert_entry *i; ++ ++ /* ++ * We already have a disk reservation and open buckets pinned; this ++ * allocation must not block: ++ */ ++ trans_for_each_update(trans, i) ++ if (btree_node_type_needs_gc(i->iter->btree_id)) ++ flags |= BTREE_INSERT_USE_RESERVE; ++ ++ closure_init_stack(&cl); ++ ++ /* Hack, because gc and splitting nodes doesn't mix yet: */ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && ++ !down_read_trylock(&c->gc_lock)) { ++ if (flags & BTREE_INSERT_NOUNLOCK) { ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ return -EINTR; ++ } ++ ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(trans)) ++ ret = -EINTR; ++ } ++ ++ /* ++ * XXX: figure out how far we might need to split, ++ * instead of locking/reserving all the way to the root: ++ */ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ++ trace_trans_restart_iter_upgrade(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ ++ as = bch2_btree_update_start(trans, iter->btree_id, ++ btree_update_reserve_required(c, b), flags, ++ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ if (ret == -EAGAIN) { ++ BUG_ON(flags & BTREE_INSERT_NOUNLOCK); ++ bch2_trans_unlock(trans); ++ ret = -EINTR; ++ ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ } ++ goto out; ++ } ++ ++ btree_split(as, b, iter, NULL, flags); ++ bch2_btree_update_done(as); ++ ++ /* ++ * We haven't successfully inserted yet, so don't downgrade all the way ++ * back to read locks; ++ */ ++ __bch2_btree_iter_downgrade(iter, 1); ++out: ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ return ret; ++} ++ ++void __bch2_foreground_maybe_merge(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, ++ unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_update *as; ++ struct bkey_format_state new_s; ++ struct bkey_format new_f; ++ struct bkey_i delete; ++ struct btree *b, *m, *n, *prev, *next, *parent; ++ struct closure cl; ++ size_t sib_u64s; ++ int ret = 0; ++ ++ BUG_ON(!btree_node_locked(iter, level)); ++ ++ closure_init_stack(&cl); ++retry: ++ BUG_ON(!btree_node_locked(iter, level)); ++ ++ b = iter->l[level].b; ++ ++ parent = btree_node_parent(iter, b); ++ if (!parent) ++ goto out; ++ ++ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) ++ goto out; ++ ++ /* XXX: can't be holding read locks */ ++ m = bch2_btree_node_get_sibling(c, iter, b, sib); ++ if (IS_ERR(m)) { ++ ret = PTR_ERR(m); ++ goto err; ++ } ++ ++ /* NULL means no sibling: */ ++ if (!m) { ++ b->sib_u64s[sib] = U16_MAX; ++ goto out; ++ } ++ ++ if (sib == btree_prev_sib) { ++ prev = m; ++ next = b; ++ } else { ++ prev = b; ++ next = m; ++ } ++ ++ bch2_bkey_format_init(&new_s); ++ __bch2_btree_calc_format(&new_s, b); ++ __bch2_btree_calc_format(&new_s, m); ++ new_f = bch2_bkey_format_done(&new_s); ++ ++ sib_u64s = btree_node_u64s_with_format(b, &new_f) + ++ btree_node_u64s_with_format(m, &new_f); ++ ++ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { ++ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ sib_u64s /= 2; ++ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ } ++ ++ sib_u64s = min(sib_u64s, btree_max_u64s(c)); ++ b->sib_u64s[sib] = sib_u64s; ++ ++ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { ++ six_unlock_intent(&m->c.lock); ++ goto out; ++ } ++ ++ /* We're changing btree topology, doesn't mix with gc: */ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && ++ !down_read_trylock(&c->gc_lock)) ++ goto err_cycle_gc_lock; ++ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ++ ret = -EINTR; ++ goto err_unlock; ++ } ++ ++ as = bch2_btree_update_start(trans, iter->btree_id, ++ btree_update_reserve_required(c, parent) + 1, ++ flags| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE, ++ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ goto err_unlock; ++ } ++ ++ trace_btree_merge(c, b); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ bch2_btree_interior_update_will_free_node(as, m); ++ ++ n = bch2_btree_node_alloc(as, b->c.level); ++ bch2_btree_update_add_new_node(as, n); ++ ++ btree_set_min(n, prev->data->min_key); ++ btree_set_max(n, next->data->max_key); ++ n->data->format = new_f; ++ ++ btree_node_set_format(n, new_f); ++ ++ bch2_btree_sort_into(c, n, prev); ++ bch2_btree_sort_into(c, n, next); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->c.lock); ++ ++ bkey_init(&delete.k); ++ delete.k.p = prev->key.k.p; ++ bch2_keylist_add(&as->parent_keys, &delete); ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ ++ bch2_btree_update_get_open_buckets(as, n); ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ bch2_btree_iter_node_drop(iter, m); ++ ++ bch2_btree_iter_node_replace(iter, n); ++ ++ bch2_btree_trans_verify_iters(trans, n); ++ ++ bch2_btree_node_free_inmem(c, b, iter); ++ bch2_btree_node_free_inmem(c, m, iter); ++ ++ six_unlock_intent(&n->c.lock); ++ ++ bch2_btree_update_done(as); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++out: ++ bch2_btree_trans_verify_locks(trans); ++ ++ /* ++ * Don't downgrade locks here: we're called after successful insert, ++ * and the caller will downgrade locks after a successful insert ++ * anyways (in case e.g. a split was required first) ++ * ++ * And we're also called when inserting into interior nodes in the ++ * split path, and downgrading to read locks in there is potentially ++ * confusing: ++ */ ++ closure_sync(&cl); ++ return; ++ ++err_cycle_gc_lock: ++ six_unlock_intent(&m->c.lock); ++ ++ if (flags & BTREE_INSERT_NOUNLOCK) ++ goto out; ++ ++ bch2_trans_unlock(trans); ++ ++ down_read(&c->gc_lock); ++ up_read(&c->gc_lock); ++ ret = -EINTR; ++ goto err; ++ ++err_unlock: ++ six_unlock_intent(&m->c.lock); ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++err: ++ BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); ++ ++ if ((ret == -EAGAIN || ret == -EINTR) && ++ !(flags & BTREE_INSERT_NOUNLOCK)) { ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto out; ++ ++ goto retry; ++ } ++ ++ goto out; ++} ++ ++static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *b, unsigned flags, ++ struct closure *cl) ++{ ++ struct btree *n, *parent = btree_node_parent(iter, b); ++ struct btree_update *as; ++ ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ (parent ++ ? 
btree_update_reserve_required(c, parent) ++ : 0) + 1, ++ flags, cl); ++ if (IS_ERR(as)) { ++ trace_btree_gc_rewrite_node_fail(c, b); ++ return PTR_ERR(as); ++ } ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n = bch2_btree_node_alloc_replacement(as, b); ++ bch2_btree_update_add_new_node(as, n); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->c.lock); ++ ++ trace_btree_gc_rewrite_node(c, b); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ ++ if (parent) { ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ } else { ++ bch2_btree_set_root(as, n, iter); ++ } ++ ++ bch2_btree_update_get_open_buckets(as, n); ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ bch2_btree_iter_node_replace(iter, n); ++ bch2_btree_node_free_inmem(c, b, iter); ++ six_unlock_intent(&n->c.lock); ++ ++ bch2_btree_update_done(as); ++ return 0; ++} ++ ++/** ++ * bch_btree_node_rewrite - Rewrite/move a btree node ++ * ++ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. ++ * btree_check_reserve() has to wait) ++ */ ++int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ++ __le64 seq, unsigned flags) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ flags |= BTREE_INSERT_NOFAIL; ++ ++ closure_init_stack(&cl); ++ ++ bch2_btree_iter_upgrade(iter, U8_MAX); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { ++ if (!down_read_trylock(&c->gc_lock)) { ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ } ++ } ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ break; ++ ++ b = bch2_btree_iter_peek_node(iter); ++ if (!b || b->data->keys.seq != seq) ++ break; ++ ++ ret = __btree_node_rewrite(c, iter, b, flags, &cl); ++ if (ret != -EAGAIN && ++ ret != -EINTR) ++ break; ++ ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ } ++ ++ bch2_btree_iter_downgrade(iter); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++ ++ closure_sync(&cl); ++ return ret; ++} ++ ++static void __bch2_btree_node_update_key(struct bch_fs *c, ++ struct btree_update *as, ++ struct btree_iter *iter, ++ struct btree *b, struct btree *new_hash, ++ struct bkey_i *new_key) ++{ ++ struct btree *parent; ++ int ret; ++ ++ btree_update_will_delete_key(as, &b->key); ++ btree_update_will_add_key(as, new_key); ++ ++ parent = btree_node_parent(iter, b); ++ if (parent) { ++ if (new_hash) { ++ bkey_copy(&new_hash->key, new_key); ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, ++ new_hash, b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ } ++ ++ bch2_keylist_add(&as->parent_keys, new_key); ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); ++ ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new_key); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ } else { ++ bkey_copy(&b->key, new_key); ++ } ++ } else { ++ BUG_ON(btree_node_root(c, b) != b); ++ ++ bch2_btree_node_lock_write(b, iter); ++ bkey_copy(&b->key, new_key); ++ ++ if (btree_ptr_hash_val(&b->key) != b->hash_val) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ 
mutex_unlock(&c->btree_cache.lock); ++ } ++ ++ btree_update_updated_root(as, b); ++ bch2_btree_node_unlock_write(b, iter); ++ } ++ ++ bch2_btree_update_done(as); ++} ++ ++int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_i *new_key) ++{ ++ struct btree *parent = btree_node_parent(iter, b); ++ struct btree_update *as = NULL; ++ struct btree *new_hash = NULL; ++ struct closure cl; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) ++ return -EINTR; ++ ++ if (!down_read_trylock(&c->gc_lock)) { ++ bch2_trans_unlock(iter->trans); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(iter->trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ /* ++ * check btree_ptr_hash_val() after @b is locked by ++ * btree_iter_traverse(): ++ */ ++ if (btree_ptr_hash_val(new_key) != b->hash_val) { ++ /* bch2_btree_reserve_get will unlock */ ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ if (ret) { ++ bch2_trans_unlock(iter->trans); ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(iter->trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ new_hash = bch2_btree_node_mem_alloc(c); ++ } ++retry: ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ parent ? btree_update_reserve_required(c, parent) : 0, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE, ++ &cl); ++ ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ if (ret == -EAGAIN) ++ ret = -EINTR; ++ ++ if (ret == -EINTR) { ++ bch2_trans_unlock(iter->trans); ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ down_read(&c->gc_lock); ++ ++ if (bch2_trans_relock(iter->trans)) ++ goto retry; ++ } ++ ++ goto err; ++ } ++ ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); ++ if (ret) ++ goto err_free_update; ++ ++ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); ++ ++ bch2_btree_iter_downgrade(iter); ++err: ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&new_hash->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ six_unlock_write(&new_hash->c.lock); ++ six_unlock_intent(&new_hash->c.lock); ++ } ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ return ret; ++err_free_update: ++ bch2_btree_update_free(as); ++ goto err; ++} ++ ++/* Init code: */ ++ ++/* ++ * Only for filesystem bringup, when first reading the btree roots or allocating ++ * btree roots when initializing a new filesystem: ++ */ ++void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) ++{ ++ BUG_ON(btree_node_root(c, b)); ++ ++ bch2_btree_set_root_inmem(c, b); ++} ++ ++void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ set_btree_node_fake(b); ++ set_btree_node_need_rewrite(b); ++ b->c.level = 0; ++ b->c.btree_id = id; ++ ++ bkey_btree_ptr_init(&b->key); ++ b->key.k.p = POS_MAX; ++ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ bch2_btree_build_aux_trees(b); ++ ++ b->data->flags = 0; ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, POS_MAX); ++ b->data->format = bch2_btree_calc_format(b); ++ btree_node_set_format(b, b->data->format); ++ ++ ret = 
bch2_btree_node_hash_insert(&c->btree_cache, b, ++ b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ ++ bch2_btree_set_root_inmem(c, b); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct btree_update *as; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_for_each_entry(as, &c->btree_interior_update_list, list) ++ pr_buf(out, "%p m %u w %u r %u j %llu\n", ++ as, ++ as->mode, ++ as->nodes_written, ++ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, ++ as->journal.seq); ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) ++{ ++ size_t ret = 0; ++ struct list_head *i; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_for_each(i, &c->btree_interior_update_list) ++ ret++; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return ret; ++} ++ ++void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) ++{ ++ struct btree_root *r; ++ struct jset_entry *entry; ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ vstruct_for_each(jset, entry) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) { ++ r = &c->btree_roots[entry->btree_id]; ++ r->level = entry->level; ++ r->alive = true; ++ bkey_copy(&r->key, &entry->start[0]); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++} ++ ++struct jset_entry * ++bch2_btree_roots_to_journal_entries(struct bch_fs *c, ++ struct jset_entry *start, ++ struct jset_entry *end) ++{ ++ struct jset_entry *entry; ++ unsigned long have = 0; ++ unsigned i; ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) ++ __set_bit(entry->btree_id, &have); ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].alive && !test_bit(i, &have)) { ++ journal_entry_set(end, ++ BCH_JSET_ENTRY_btree_root, ++ i, c->btree_roots[i].level, ++ &c->btree_roots[i].key, ++ c->btree_roots[i].key.u64s); ++ end = vstruct_next(end); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++ ++ return end; ++} ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *c) ++{ ++ if (c->btree_interior_update_worker) ++ destroy_workqueue(c->btree_interior_update_worker); ++ mempool_exit(&c->btree_interior_update_pool); ++} ++ ++int bch2_fs_btree_interior_update_init(struct bch_fs *c) ++{ ++ mutex_init(&c->btree_reserve_cache_lock); ++ INIT_LIST_HEAD(&c->btree_interior_update_list); ++ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); ++ mutex_init(&c->btree_interior_update_lock); ++ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); ++ ++ c->btree_interior_update_worker = ++ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); ++ if (!c->btree_interior_update_worker) ++ return -ENOMEM; ++ ++ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, ++ sizeof(struct btree_update)); ++} +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +new file mode 100644 +index 000000000000..7668225e72c6 +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.h +@@ -0,0 +1,331 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++ ++#include "btree_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++ ++void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct 
btree *, ++ struct bkey_format *); ++ ++#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) ++ ++#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) ++ ++/* ++ * Tracks an in progress split/rewrite of a btree node and the update to the ++ * parent node: ++ * ++ * When we split/rewrite a node, we do all the updates in memory without ++ * waiting for any writes to complete - we allocate the new node(s) and update ++ * the parent node, possibly recursively up to the root. ++ * ++ * The end result is that we have one or more new nodes being written - ++ * possibly several, if there were multiple splits - and then a write (updating ++ * an interior node) which will make all these new nodes visible. ++ * ++ * Additionally, as we split/rewrite nodes we free the old nodes - but the old ++ * nodes can't be freed (their space on disk can't be reclaimed) until the ++ * update to the interior node that makes the new node visible completes - ++ * until then, the old nodes are still reachable on disk. ++ * ++ */ ++struct btree_update { ++ struct closure cl; ++ struct bch_fs *c; ++ ++ struct list_head list; ++ struct list_head unwritten_list; ++ ++ /* What kind of update are we doing? */ ++ enum { ++ BTREE_INTERIOR_NO_UPDATE, ++ BTREE_INTERIOR_UPDATING_NODE, ++ BTREE_INTERIOR_UPDATING_ROOT, ++ BTREE_INTERIOR_UPDATING_AS, ++ } mode; ++ ++ unsigned must_rewrite:1; ++ unsigned nodes_written:1; ++ ++ enum btree_id btree_id; ++ ++ struct disk_reservation disk_res; ++ struct journal_preres journal_preres; ++ ++ /* ++ * BTREE_INTERIOR_UPDATING_NODE: ++ * The update that made the new nodes visible was a regular update to an ++ * existing interior node - @b. We can't write out the update to @b ++ * until the new nodes we created are finished writing, so we block @b ++ * from writing by putting this btree_interior update on the ++ * @b->write_blocked list with @write_blocked_list: ++ */ ++ struct btree *b; ++ struct list_head write_blocked_list; ++ ++ /* ++ * We may be freeing nodes that were dirty, and thus had journal entries ++ * pinned: we need to transfer the oldest of those pins to the ++ * btree_update operation, and release it when the new node(s) ++ * are all persistent and reachable: ++ */ ++ struct journal_entry_pin journal; ++ ++ /* Preallocated nodes we reserve when we start the update: */ ++ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_prealloc_nodes; ++ ++ /* Nodes being freed: */ ++ struct keylist old_keys; ++ u64 _old_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ ++ /* Nodes being added: */ ++ struct keylist new_keys; ++ u64 _new_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ ++ /* New nodes, that will be made reachable by this update: */ ++ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_new_nodes; ++ ++ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * ++ BCH_REPLICAS_MAX]; ++ open_bucket_idx_t nr_open_buckets; ++ ++ unsigned journal_u64s; ++ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; ++ ++ /* Only here to reduce stack usage on recursive splits: */ ++ struct keylist parent_keys; ++ /* ++ * Enough room for btree_split's keys without realloc - btree node ++ * pointers never have crc/compression info, so we only need to acount ++ * for the pointers for three keys ++ */ ++ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; ++}; ++ ++void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++void 
bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); ++ ++void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, ++ struct btree *, ++ struct bkey_format); ++ ++void bch2_btree_update_done(struct btree_update *); ++struct btree_update * ++bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, ++ unsigned, struct closure *); ++ ++void bch2_btree_interior_update_will_free_node(struct btree_update *, ++ struct btree *); ++void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); ++ ++void bch2_btree_insert_node(struct btree_update *, struct btree *, ++ struct btree_iter *, struct keylist *, ++ unsigned); ++int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); ++ ++void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, ++ unsigned, unsigned, enum btree_node_sibling); ++ ++static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct btree *b; ++ ++ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) ++ return; ++ ++ if (!bch2_btree_node_relock(iter, level)) ++ return; ++ ++ b = iter->l[level].b; ++ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) ++ return; ++ ++ __bch2_foreground_maybe_merge(c, iter, level, flags, sib); ++} ++ ++static inline void bch2_foreground_maybe_merge(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, ++ unsigned flags) ++{ ++ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ btree_prev_sib); ++ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ btree_next_sib); ++} ++ ++void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); ++void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); ++ ++static inline unsigned btree_update_reserve_required(struct bch_fs *c, ++ struct btree *b) ++{ ++ unsigned depth = btree_node_root(c, b)->c.level + 1; ++ ++ /* ++ * Number of nodes we might have to allocate in a worst case btree ++ * split operation - we split all the way up to the root, then allocate ++ * a new root, unless we're already at max depth: ++ */ ++ if (depth < BTREE_MAX_DEPTH) ++ return (depth - b->c.level) * 2 + 1; ++ else ++ return (depth - b->c.level) * 2 - 1; ++} ++ ++static inline void btree_node_reset_sib_u64s(struct btree *b) ++{ ++ b->sib_u64s[0] = b->nr.live_u64s; ++ b->sib_u64s[1] = b->nr.live_u64s; ++} ++ ++static inline void *btree_data_end(struct bch_fs *c, struct btree *b) ++{ ++ return (void *) b->data + btree_bytes(c); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, ++ struct btree *b) ++{ ++ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, ++ struct btree *b) ++{ ++ return btree_data_end(c, b); ++} ++ ++static inline void *write_block(struct btree *b) ++{ ++ return (void *) b->data + (b->written << 9); ++} ++ ++static inline bool __btree_addr_written(struct btree *b, void *p) ++{ ++ return p < write_block(b); ++} ++ ++static inline bool bset_written(struct btree *b, struct bset *i) ++{ ++ return __btree_addr_written(b, i); ++} ++ ++static inline bool bkey_written(struct btree *b, struct bkey_packed *k) ++{ ++ return __btree_addr_written(b, k); ++} ++ ++static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, ++ struct btree *b, ++ void *end) 
++{ ++ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + ++ b->whiteout_u64s; ++ ssize_t total = c->opts.btree_node_size << 6; ++ ++ return total - used; ++} ++ ++static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, ++ struct btree *b) ++{ ++ ssize_t remaining = __bch_btree_u64s_remaining(c, b, ++ btree_bkey_last(b, bset_tree_last(b))); ++ ++ BUG_ON(remaining < 0); ++ ++ if (bset_written(b, btree_bset_last(b))) ++ return 0; ++ ++ return remaining; ++} ++ ++static inline unsigned btree_write_set_buffer(struct btree *b) ++{ ++ /* ++ * Could buffer up larger amounts of keys for btrees with larger keys, ++ * pending benchmarking: ++ */ ++ return 4 << 10; ++} ++ ++static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, ++ struct btree *b) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ struct btree_node_entry *bne = max(write_block(b), ++ (void *) btree_bkey_last(b, bset_tree_last(b))); ++ ssize_t remaining_space = ++ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); ++ ++ if (unlikely(bset_written(b, bset(b, t)))) { ++ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) ++ return bne; ++ } else { ++ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && ++ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) ++ return bne; ++ } ++ ++ return NULL; ++} ++ ++static inline void push_whiteout(struct bch_fs *c, struct btree *b, ++ struct bpos pos) ++{ ++ struct bkey_packed k; ++ ++ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); ++ ++ if (!bkey_pack_pos(&k, pos, b)) { ++ struct bkey *u = (void *) &k; ++ ++ bkey_init(u); ++ u->p = pos; ++ } ++ ++ k.needs_whiteout = true; ++ ++ b->whiteout_u64s += k.u64s; ++ bkey_copy(unwritten_whiteouts_start(c, b), &k); ++} ++ ++/* ++ * write lock must be held on @b (else the dirty bset that we were going to ++ * insert into could be written out from under us) ++ */ ++static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, ++ struct btree *b, unsigned u64s) ++{ ++ if (unlikely(btree_node_need_rewrite(b))) ++ return false; ++ ++ return u64s <= bch_btree_keys_u64s_remaining(c, b); ++} ++ ++void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); ++ ++size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); ++ ++void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); ++struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, ++ struct jset_entry *, struct jset_entry *); ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *); ++int bch2_fs_btree_interior_update_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +new file mode 100644 +index 000000000000..cd699c257244 +--- /dev/null ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -0,0 +1,1171 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "debug.h" ++#include "error.h" ++#include "extent_update.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "replicas.h" ++ ++#include ++#include ++#include ++ ++static inline bool same_leaf_as_prev(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i != trans->updates2 && ++ iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; ++} ++ 
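As a reading aid for the header just shown: btree_update_reserve_required() sizes interior-update reservations for the worst-case split, and the small standalone sketch below restates that arithmetic outside the patch. It is not taken from the bcachefs code; the helper name and parameters (sketch_split_reserve, root_level, node_level, max_depth) are hypothetical, and only the formula mirrors the header above.

/*
 * Reading aid, not part of the bcachefs patch: a standalone restatement of
 * the worst-case split reservation from btree_update_reserve_required().
 * Names and parameters here are hypothetical; only the formula is the same.
 *
 * Worked example: with the root at level 2 (depth 3), splitting a leaf at
 * level 0 while max_depth = 4 may allocate (3 - 0) * 2 + 1 = 7 new nodes:
 * two halves per level up to and including the root, plus one new root.
 */
static inline unsigned sketch_split_reserve(unsigned root_level,
					    unsigned node_level,
					    unsigned max_depth)
{
	unsigned depth = root_level + 1;

	return depth < max_depth
		? (depth - node_level) * 2 + 1	/* room left to grow a new root */
		: (depth - node_level) * 2 - 1;	/* already at max depth: no new root */
}

The leaf-commit code that follows never calls such a helper directly; the sketch only spells out the bound the interior-update machinery reserves against, which makes the BTREE_INSERT_BTREE_NODE_FULL / split handling below easier to follow.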
++inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ bch2_btree_node_lock_write(b, iter); ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) ++ return; ++ ++ if (unlikely(btree_node_just_written(b)) && ++ bch2_btree_post_write_cleanup(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ /* ++ * If the last bset has been written, or if it's gotten too big - start ++ * a new bset to insert into: ++ */ ++ if (want_new_bset(c, b)) ++ bch2_btree_init_next(c, b, iter); ++} ++ ++/* Inserting into a given leaf node (last stage of insert): */ ++ ++/* Handle overwrites and do insert, for non extents: */ ++bool bch2_btree_bset_insert_key(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) ++{ ++ struct bkey_packed *k; ++ unsigned clobber_u64s = 0, new_u64s = 0; ++ ++ EBUG_ON(btree_node_just_written(b)); ++ EBUG_ON(bset_written(b, btree_bset_last(b))); ++ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); ++ EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && ++ bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_predecessor(b->data->min_key)) < 0); ++ EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); ++ EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); ++ EBUG_ON(insert->k.u64s > ++ bch_btree_keys_u64s_remaining(iter->trans->c, b)); ++ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ if (k && bkey_cmp_packed(b, k, &insert->k)) ++ k = NULL; ++ ++ /* @k is the key being overwritten/deleted, if any: */ ++ EBUG_ON(k && bkey_whiteout(k)); ++ ++ /* Deleting, but not found? nothing to do: */ ++ if (bkey_whiteout(&insert->k) && !k) ++ return false; ++ ++ if (bkey_whiteout(&insert->k)) { ++ /* Deleting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ if (k->needs_whiteout) ++ push_whiteout(iter->trans->c, b, insert->k.p); ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ bch2_bset_delete(b, k, clobber_u64s); ++ goto fix_iter; ++ } else { ++ bch2_btree_iter_fix_key_modified(iter, b, k); ++ } ++ ++ return true; ++ } ++ ++ if (k) { ++ /* Overwriting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ insert->k.needs_whiteout = k->needs_whiteout; ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ goto overwrite; ++ } else { ++ bch2_btree_iter_fix_key_modified(iter, b, k); ++ } ++ } ++ ++ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); ++overwrite: ++ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); ++ new_u64s = k->u64s; ++fix_iter: ++ if (clobber_u64s != new_u64s) ++ bch2_btree_node_iter_fix(iter, b, node_iter, k, ++ clobber_u64s, new_u64s); ++ return true; ++} ++ ++static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, ++ unsigned i, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct btree_write *w = container_of(pin, struct btree_write, journal); ++ struct btree *b = container_of(w, struct btree, writes[i]); ++ ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ bch2_btree_node_write_cond(c, b, ++ (btree_current_write(b) == w && w->journal.seq == seq)); ++ six_unlock_read(&b->c.lock); ++} ++ ++static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 0, seq); ++} ++ ++static void btree_node_flush1(struct journal *j, 
struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 1, seq); ++} ++ ++inline void bch2_btree_add_journal_pin(struct bch_fs *c, ++ struct btree *b, u64 seq) ++{ ++ struct btree_write *w = btree_current_write(b); ++ ++ bch2_journal_pin_add(&c->journal, seq, &w->journal, ++ btree_node_write_idx(b) == 0 ++ ? btree_node_flush0 ++ : btree_node_flush1); ++} ++ ++/** ++ * btree_insert_key - insert a key one key into a leaf node ++ */ ++static bool btree_insert_key_leaf(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = iter_l(iter)->b; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bset *i = bset(b, t); ++ int old_u64s = bset_u64s(t); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ EBUG_ON(!iter->level && ++ !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); ++ ++ if (unlikely(!bch2_btree_bset_insert_key(iter, b, ++ &iter_l(iter)->iter, insert))) ++ return false; ++ ++ i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); ++ ++ if (unlikely(!btree_node_dirty(b))) ++ set_btree_node_dirty(b); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) bset_u64s(t) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ trace_btree_insert_key(c, b, insert); ++ return true; ++} ++ ++/* Cached btree updates: */ ++ ++/* Normal update interface: */ ++ ++static inline void btree_insert_entry_checks(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ ++ BUG_ON(bkey_cmp(insert->k.p, iter->pos)); ++ BUG_ON(debug_check_bkeys(c) && ++ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), ++ __btree_node_type(iter->level, iter->btree_id))); ++} ++ ++static noinline int ++bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, ++ &trans->journal_preres, u64s, 0); ++ if (ret) ++ return ret; ++ ++ if (!bch2_trans_relock(trans)) { ++ trace_trans_restart_journal_preres_get(trans->ip); ++ return -EINTR; ++ } ++ ++ return 0; ++} ++ ++static inline int bch2_trans_journal_res_get(struct btree_trans *trans, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) ++ flags |= JOURNAL_RES_GET_RESERVED; ++ ++ ret = bch2_journal_res_get(&c->journal, &trans->journal_res, ++ trans->journal_u64s, flags); ++ ++ return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; ++} ++ ++static enum btree_insert_ret ++btree_key_can_insert(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = iter_l(iter)->b; ++ ++ if (!bch2_btree_node_insert_fits(c, b, u64s)) ++ return BTREE_INSERT_BTREE_NODE_FULL; ++ ++ return BTREE_INSERT_OK; ++} ++ ++static enum btree_insert_ret ++btree_key_can_insert_cached(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned u64s) ++{ ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ unsigned new_u64s; ++ struct bkey_i *new_k; ++ ++ BUG_ON(iter->level); ++ ++ if (u64s <= ck->u64s) ++ return BTREE_INSERT_OK; ++ ++ new_u64s = roundup_pow_of_two(u64s); ++ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) ++ return -ENOMEM; ++ ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ return BTREE_INSERT_OK; ++} ++ ++static inline void do_btree_insert_one(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ bool did_work; ++ ++ EBUG_ON(trans->journal_res.ref != ++ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); ++ ++ insert->k.needs_whiteout = false; ++ ++ did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) ++ ? btree_insert_key_leaf(trans, iter, insert) ++ : bch2_btree_insert_key_cached(trans, iter, insert); ++ if (!did_work) ++ return; ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ bch2_journal_add_keys(j, &trans->journal_res, ++ iter->btree_id, insert); ++ ++ bch2_journal_set_has_inode(j, &trans->journal_res, ++ insert->k.p.inode); ++ ++ if (trans->journal_seq) ++ *trans->journal_seq = trans->journal_res.seq; ++ } ++} ++ ++static inline bool iter_has_trans_triggers(struct btree_iter *iter) ++{ ++ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); ++} ++ ++static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) ++{ ++ return (BTREE_NODE_TYPE_HAS_TRIGGERS & ++ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) & ++ (1U << iter->btree_id); ++} ++ ++static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) ++{ ++ __bch2_btree_iter_unlock(iter); ++} ++ ++static noinline void bch2_trans_mark_gc(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) { ++ /* ++ * XXX: synchronization of cached update triggers with gc ++ */ ++ BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); ++ ++ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) ++ bch2_mark_update(trans, i->iter, i->k, NULL, ++ i->trigger_flags|BTREE_TRIGGER_GC); ++ } ++} ++ ++static inline int ++bch2_trans_commit_write_locked(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_fs_usage *fs_usage = NULL; ++ struct btree_insert_entry *i; ++ unsigned u64s = 0; ++ bool marking = false; ++ int ret; ++ ++ if (race_fault()) { ++ trace_trans_restart_fault_inject(trans->ip); ++ return -EINTR; ++ } ++ ++ /* ++ * Check if the insert will fit in the leaf node with the write lock ++ * held, otherwise another thread could write the node changing the ++ * amount of space available: ++ */ ++ ++ prefetch(&trans->c->journal.flags); ++ ++ trans_for_each_update2(trans, i) { ++ /* Multiple inserts might go to same leaf: */ ++ if (!same_leaf_as_prev(trans, i)) ++ u64s = 0; ++ ++ u64s += i->k->k.u64s; ++ ret = btree_iter_type(i->iter) != BTREE_ITER_CACHED ++ ? 
btree_key_can_insert(trans, i->iter, u64s) ++ : btree_key_can_insert_cached(trans, i->iter, u64s); ++ if (ret) { ++ *stopped_at = i; ++ return ret; ++ } ++ ++ if (btree_node_type_needs_gc(i->iter->btree_id)) ++ marking = true; ++ } ++ ++ if (marking) { ++ percpu_down_read(&c->mark_lock); ++ fs_usage = bch2_fs_usage_scratch_get(c); ++ } ++ ++ /* ++ * Don't get journal reservation until after we know insert will ++ * succeed: ++ */ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ ret = bch2_trans_journal_res_get(trans, ++ JOURNAL_RES_GET_NONBLOCK); ++ if (ret) ++ goto err; ++ } else { ++ trans->journal_res.seq = c->journal.replay_journal_seq; ++ } ++ ++ if (unlikely(trans->extra_journal_entry_u64s)) { ++ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), ++ trans->extra_journal_entries, ++ trans->extra_journal_entry_u64s); ++ ++ trans->journal_res.offset += trans->extra_journal_entry_u64s; ++ trans->journal_res.u64s -= trans->extra_journal_entry_u64s; ++ } ++ ++ /* ++ * Not allowed to fail after we've gotten our journal reservation - we ++ * have to use it: ++ */ ++ ++ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { ++ if (journal_seq_verify(c)) ++ trans_for_each_update2(trans, i) ++ i->k->k.version.lo = trans->journal_res.seq; ++ else if (inject_invalid_keys(c)) ++ trans_for_each_update2(trans, i) ++ i->k->k.version = MAX_VERSION; ++ } ++ ++ /* Must be called under mark_lock: */ ++ if (marking && trans->fs_usage_deltas && ++ bch2_replicas_delta_list_apply(c, fs_usage, ++ trans->fs_usage_deltas)) { ++ ret = BTREE_INSERT_NEED_MARK_REPLICAS; ++ goto err; ++ } ++ ++ trans_for_each_update(trans, i) ++ if (iter_has_nontrans_triggers(i->iter)) ++ bch2_mark_update(trans, i->iter, i->k, ++ fs_usage, i->trigger_flags); ++ ++ if (marking) ++ bch2_trans_fs_usage_apply(trans, fs_usage); ++ ++ if (unlikely(c->gc_pos.phase)) ++ bch2_trans_mark_gc(trans); ++ ++ trans_for_each_update2(trans, i) ++ do_btree_insert_one(trans, i->iter, i->k); ++err: ++ if (marking) { ++ bch2_fs_usage_scratch_put(c, fs_usage); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ return ret; ++} ++ ++/* ++ * Get journal reservation, take write locks, and attempt to do btree update(s): ++ */ ++static inline int do_bch2_trans_commit(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at) ++{ ++ struct btree_insert_entry *i; ++ struct btree_iter *iter; ++ int ret; ++ ++ trans_for_each_update2(trans, i) ++ BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); ++ ++ ret = bch2_journal_preres_get(&trans->c->journal, ++ &trans->journal_preres, trans->journal_preres_u64s, ++ JOURNAL_RES_GET_NONBLOCK| ++ ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) ++ ? 
JOURNAL_RES_GET_RECLAIM : 0)); ++ if (unlikely(ret == -EAGAIN)) ++ ret = bch2_trans_journal_preres_get_cold(trans, ++ trans->journal_preres_u64s); ++ if (unlikely(ret)) ++ return ret; ++ ++ /* ++ * Can't be holding any read locks when we go to take write locks: ++ * ++ * note - this must be done after bch2_trans_journal_preres_get_cold() ++ * or anything else that might call bch2_trans_relock(), since that ++ * would just retake the read locks: ++ */ ++ trans_for_each_iter(trans, iter) { ++ if (iter->nodes_locked != iter->nodes_intent_locked) { ++ EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); ++ EBUG_ON(trans->iters_live & (1ULL << iter->idx)); ++ bch2_btree_iter_unlock_noinline(iter); ++ } ++ } ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ trans_for_each_update2(trans, i) ++ btree_insert_entry_checks(trans, i->iter, i->k); ++ bch2_btree_trans_verify_locks(trans); ++ ++ trans_for_each_update2(trans, i) ++ if (!same_leaf_as_prev(trans, i)) ++ bch2_btree_node_lock_for_insert(trans->c, ++ iter_l(i->iter)->b, i->iter); ++ ++ ret = bch2_trans_commit_write_locked(trans, stopped_at); ++ ++ trans_for_each_update2(trans, i) ++ if (!same_leaf_as_prev(trans, i)) ++ bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, ++ i->iter); ++ ++ if (!ret && trans->journal_pin) ++ bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, ++ trans->journal_pin, NULL); ++ ++ /* ++ * Drop journal reservation after dropping write locks, since dropping ++ * the journal reservation may kick off a journal write: ++ */ ++ bch2_journal_res_put(&trans->c->journal, &trans->journal_res); ++ ++ if (unlikely(ret)) ++ return ret; ++ ++ if (trans->flags & BTREE_INSERT_NOUNLOCK) ++ trans->nounlock = true; ++ ++ trans_for_each_update2(trans, i) ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && ++ !same_leaf_as_prev(trans, i)) ++ bch2_foreground_maybe_merge(trans->c, i->iter, ++ 0, trans->flags); ++ ++ trans->nounlock = false; ++ ++ bch2_trans_downgrade(trans); ++ ++ return 0; ++} ++ ++static noinline ++int bch2_trans_commit_error(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ int ret) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned flags = trans->flags; ++ ++ /* ++ * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree ++ * update; if we haven't done anything yet it doesn't apply ++ */ ++ flags &= ~BTREE_INSERT_NOUNLOCK; ++ ++ switch (ret) { ++ case BTREE_INSERT_BTREE_NODE_FULL: ++ ret = bch2_btree_split_leaf(c, i->iter, flags); ++ ++ /* ++ * if the split succeeded without dropping locks the insert will ++ * still be atomic (what the caller peeked() and is overwriting ++ * won't have changed) ++ */ ++#if 0 ++ /* ++ * XXX: ++ * split -> btree node merging (of parent node) might still drop ++ * locks when we're not passing it BTREE_INSERT_NOUNLOCK ++ * ++ * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that ++ * will inhibit merging - but we don't have a reliable way yet ++ * (do we?) 
of checking if we dropped locks in this path ++ */ ++ if (!ret) ++ goto retry; ++#endif ++ ++ /* ++ * don't care if we got ENOSPC because we told split it ++ * couldn't block: ++ */ ++ if (!ret || ++ ret == -EINTR || ++ (flags & BTREE_INSERT_NOUNLOCK)) { ++ trace_trans_restart_btree_node_split(trans->ip); ++ ret = -EINTR; ++ } ++ break; ++ case BTREE_INSERT_ENOSPC: ++ ret = -ENOSPC; ++ break; ++ case BTREE_INSERT_NEED_MARK_REPLICAS: ++ bch2_trans_unlock(trans); ++ ++ trans_for_each_update(trans, i) { ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); ++ if (ret) ++ return ret; ++ } ++ ++ if (bch2_trans_relock(trans)) ++ return 0; ++ ++ trace_trans_restart_mark_replicas(trans->ip); ++ ret = -EINTR; ++ break; ++ case BTREE_INSERT_NEED_JOURNAL_RES: ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); ++ if (ret) ++ return ret; ++ ++ if (bch2_trans_relock(trans)) ++ return 0; ++ ++ trace_trans_restart_journal_res_get(trans->ip); ++ ret = -EINTR; ++ break; ++ default: ++ BUG_ON(ret >= 0); ++ break; ++ } ++ ++ if (ret == -EINTR) { ++ int ret2 = bch2_btree_iter_traverse_all(trans); ++ ++ if (ret2) { ++ trace_trans_restart_traverse(trans->ip); ++ return ret2; ++ } ++ ++ trace_trans_restart_atomic(trans->ip); ++ } ++ ++ return ret; ++} ++ ++static noinline int ++bch2_trans_commit_get_rw_cold(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) ++ return -EROFS; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_fs_read_write_early(c); ++ if (ret) ++ return ret; ++ ++ percpu_ref_get(&c->writes); ++ return 0; ++} ++ ++static void bch2_trans_update2(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct btree_insert_entry *i, n = (struct btree_insert_entry) { ++ .iter = iter, .k = insert ++ }; ++ ++ btree_insert_entry_checks(trans, n.iter, n.k); ++ ++ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ ++ EBUG_ON(trans->nr_updates2 >= trans->nr_iters); ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ trans_for_each_update2(trans, i) { ++ if (btree_iter_cmp(n.iter, i->iter) == 0) { ++ *i = n; ++ return; ++ } ++ ++ if (btree_iter_cmp(n.iter, i->iter) <= 0) ++ break; ++ } ++ ++ array_insert_item(trans->updates2, trans->nr_updates2, ++ i - trans->updates2, n); ++} ++ ++static int extent_update_to_keys(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ ret = bch2_extent_can_insert(trans, orig_iter, insert); ++ if (ret) ++ return ret; ++ ++ if (bkey_deleted(&insert->k)) ++ return 0; ++ ++ iter = bch2_trans_copy_iter(trans, orig_iter); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ iter->flags |= BTREE_ITER_INTENT; ++ __bch2_btree_iter_set_pos(iter, insert->k.p, false); ++ bch2_trans_update2(trans, iter, insert); ++ bch2_trans_iter_put(trans, iter); ++ return 0; ++} ++ ++static int extent_handle_overwrites(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos start, struct bpos end) ++{ ++ struct btree_iter *iter = NULL, *update_iter; ++ struct bkey_i *update; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_with_updates(iter); ++ ++ while (k.k && !(ret = bkey_err(k))) { ++ if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) 
{ ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ bch2_cut_back(start, update); ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } ++ ++ if (bkey_cmp(k.k->p, end) > 0) { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ bch2_cut_front(end, update); ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } else { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ update->k = *k.k; ++ set_bkey_val_u64s(&update->k, 0); ++ update->k.type = KEY_TYPE_deleted; ++ update->k.size = 0; ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } ++ ++ k = bch2_btree_iter_next_with_updates(iter); ++ } ++err: ++ if (!IS_ERR_OR_NULL(iter)) ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int __bch2_trans_commit(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL; ++ struct btree_iter *iter; ++ bool trans_trigger_run; ++ unsigned u64s; ++ int ret = 0; ++ ++ BUG_ON(trans->need_reset); ++ ++ if (!trans->nr_updates) ++ goto out_noupdates; ++ ++ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&trans->c->gc_lock); ++ ++ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); ++ ++ trans->journal_u64s = trans->extra_journal_entry_u64s; ++ trans->journal_preres_u64s = 0; ++ ++ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && ++ unlikely(!percpu_ref_tryget(&trans->c->writes))) { ++ ret = bch2_trans_commit_get_rw_cold(trans); ++ if (ret) ++ return ret; ++ } ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && ++ !(i->trigger_flags & BTREE_TRIGGER_NORUN)) ++ bch2_btree_key_cache_verify_clean(trans, ++ i->iter->btree_id, i->iter->pos); ++#endif ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ trans_for_each_update(trans, i) { ++ if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && ++ (ret = bch2_btree_iter_traverse(i->iter)))) { ++ trace_trans_restart_traverse(trans->ip); ++ goto out; ++ } ++ ++ /* ++ * We're not using bch2_btree_iter_upgrade here because ++ * we know trans->nounlock can't be set: ++ */ ++ if (unlikely(i->iter->locks_want < 1 && ++ !__bch2_btree_iter_upgrade(i->iter, 1))) { ++ trace_trans_restart_upgrade(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ ++ if (iter_has_trans_triggers(i->iter) && ++ !i->trans_triggers_run) { ++ i->trans_triggers_run = true; ++ trans_trigger_run = true; ++ ++ ret = bch2_trans_mark_update(trans, i->iter, i->k, ++ i->trigger_flags); ++ if (unlikely(ret)) { ++ if (ret == -EINTR) ++ trace_trans_restart_mark(trans->ip); 
++ goto out; ++ } ++ } ++ } ++ } while (trans_trigger_run); ++ ++ /* Turn extents updates into keys: */ ++ trans_for_each_update(trans, i) ++ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ++ struct bpos start = bkey_start_pos(&i->k->k); ++ ++ while (i + 1 < trans->updates + trans->nr_updates && ++ i[0].iter->btree_id == i[1].iter->btree_id && ++ !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) ++ i++; ++ ++ ret = extent_handle_overwrites(trans, i->iter->btree_id, ++ start, i->k->k.p); ++ if (ret) ++ goto out; ++ } ++ ++ trans_for_each_update(trans, i) { ++ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ++ ret = extent_update_to_keys(trans, i->iter, i->k); ++ if (ret) ++ goto out; ++ } else { ++ bch2_trans_update2(trans, i->iter, i->k); ++ } ++ } ++ ++ trans_for_each_update2(trans, i) { ++ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); ++ BUG_ON(i->iter->locks_want < 1); ++ ++ u64s = jset_u64s(i->k->k.u64s); ++ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && ++ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) ++ trans->journal_preres_u64s += u64s; ++ trans->journal_u64s += u64s; ++ } ++retry: ++ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ++ ++ ret = do_bch2_trans_commit(trans, &i); ++ ++ /* make sure we didn't drop or screw up locks: */ ++ bch2_btree_trans_verify_locks(trans); ++ ++ if (ret) ++ goto err; ++ ++ trans_for_each_iter(trans, iter) ++ if ((trans->iters_live & (1ULL << iter->idx)) && ++ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { ++ if (trans->flags & BTREE_INSERT_NOUNLOCK) ++ bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); ++ else ++ bch2_btree_iter_set_pos(iter, iter->pos_after_commit); ++ } ++out: ++ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) ++ percpu_ref_put(&trans->c->writes); ++out_noupdates: ++ bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); ++ ++ return ret; ++err: ++ ret = bch2_trans_commit_error(trans, i, ret); ++ if (ret) ++ goto out; ++ ++ goto retry; ++} ++ ++int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_trigger_flags flags) ++{ ++ struct btree_insert_entry *i, n = (struct btree_insert_entry) { ++ .trigger_flags = flags, .iter = iter, .k = k ++ }; ++ ++ EBUG_ON(bkey_cmp(iter->pos, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? 
bkey_start_pos(&k->k) ++ : k->k.p)); ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ if (btree_node_type_is_extents(iter->btree_id)) { ++ iter->pos_after_commit = k->k.p; ++ iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; ++ } ++ ++ /* ++ * Pending updates are kept sorted: first, find position of new update: ++ */ ++ trans_for_each_update(trans, i) ++ if (btree_iter_cmp(iter, i->iter) <= 0) ++ break; ++ ++ /* ++ * Now delete/trim any updates the new update overwrites: ++ */ ++ if (i > trans->updates && ++ i[-1].iter->btree_id == iter->btree_id && ++ bkey_cmp(iter->pos, i[-1].k->k.p) < 0) ++ bch2_cut_back(n.iter->pos, i[-1].k); ++ ++ while (i < trans->updates + trans->nr_updates && ++ iter->btree_id == i->iter->btree_id && ++ bkey_cmp(n.k->k.p, i->k->k.p) >= 0) ++ array_remove_item(trans->updates, trans->nr_updates, ++ i - trans->updates); ++ ++ if (i < trans->updates + trans->nr_updates && ++ iter->btree_id == i->iter->btree_id && ++ bkey_cmp(n.k->k.p, i->iter->pos) > 0) { ++ /* ++ * When we have an extent that overwrites the start of another ++ * update, trimming that extent will mean the iterator's ++ * position has to change since the iterator position has to ++ * match the extent's start pos - but we don't want to change ++ * the iterator pos if some other code is using it, so we may ++ * need to clone it: ++ */ ++ if (trans->iters_live & (1ULL << i->iter->idx)) { ++ i->iter = bch2_trans_copy_iter(trans, i->iter); ++ if (IS_ERR(i->iter)) { ++ trans->need_reset = true; ++ return PTR_ERR(i->iter); ++ } ++ ++ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ bch2_trans_iter_put(trans, i->iter); ++ } ++ ++ bch2_cut_front(n.k->k.p, i->k); ++ bch2_btree_iter_set_pos(i->iter, n.k->k.p); ++ } ++ ++ EBUG_ON(trans->nr_updates >= trans->nr_iters); ++ ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, n); ++ return 0; ++} ++ ++int __bch2_btree_insert(struct btree_trans *trans, ++ enum btree_id id, struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, k, 0); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++/** ++ * bch2_btree_insert - insert keys into the extent btree ++ * @c: pointer to struct bch_fs ++ * @id: btree to insert into ++ * @insert_keys: list of keys to insert ++ * @hook: insert callback ++ */ ++int bch2_btree_insert(struct bch_fs *c, enum btree_id id, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, int flags) ++{ ++ return bch2_trans_do(c, disk_res, journal_seq, flags, ++ __bch2_btree_insert(&trans, id, k)); ++} ++ ++int bch2_btree_delete_at_range(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos end, ++ u64 *journal_seq) ++{ ++ struct bkey_s_c k; ++ int ret = 0; ++retry: ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(iter->pos, end) < 0) { ++ struct bkey_i delete; ++ ++ bch2_trans_begin(trans); ++ ++ bkey_init(&delete.k); ++ ++ /* ++ * For extents, iter.pos won't necessarily be the same as ++ * bkey_start_pos(k.k) (for non extents they always will be the ++ * same). It's important that we delete starting from iter.pos ++ * because the range we want to delete could start in the middle ++ * of k. ++ * ++ * (bch2_btree_iter_peek() does guarantee that iter.pos >= ++ * bkey_start_pos(k.k)). 
++ */ ++ delete.k.p = iter->pos; ++ ++ if (btree_node_type_is_extents(iter->btree_id)) { ++ unsigned max_sectors = ++ KEY_SIZE_MAX & (~0 << trans->c->block_bits); ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete); ++ ++ ret = bch2_extent_trim_atomic(&delete, iter); ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_update(trans, iter, &delete, 0); ++ ret = bch2_trans_commit(trans, NULL, journal_seq, ++ BTREE_INSERT_NOFAIL); ++ if (ret) ++ break; ++ ++ bch2_trans_cond_resched(trans); ++ } ++ ++ if (ret == -EINTR) { ++ ret = 0; ++ goto retry; ++ } ++ ++ return ret; ++ ++} ++ ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned flags) ++{ ++ struct bkey_i k; ++ ++ bkey_init(&k.k); ++ k.k.p = iter->pos; ++ ++ bch2_trans_update(trans, iter, &k, 0); ++ return bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE|flags); ++} ++ ++/* ++ * bch_btree_delete_range - delete everything within a given range ++ * ++ * Range is a half open interval - [start, end) ++ */ ++int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, ++ struct bpos start, struct bpos end, ++ u64 *journal_seq) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret = 0; ++ ++ /* ++ * XXX: whether we need mem/more iters depends on whether this btree id ++ * has triggers ++ */ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); ++ ++ iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ BUG_ON(ret == -EINTR); ++ return ret; ++} +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +new file mode 100644 +index 000000000000..97a8af31ded1 +--- /dev/null ++++ b/fs/bcachefs/buckets.c +@@ -0,0 +1,2145 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. ++ * ++ * Bucket states: ++ * - free bucket: mark == 0 ++ * The bucket contains no data and will not be read ++ * ++ * - allocator bucket: owned_by_allocator == 1 ++ * The bucket is on a free list, or it is an open bucket ++ * ++ * - cached bucket: owned_by_allocator == 0 && ++ * dirty_sectors == 0 && ++ * cached_sectors > 0 ++ * The bucket contains data but may be safely discarded as there are ++ * enough replicas of the data on other cache devices, or it has been ++ * written back to the backing device ++ * ++ * - dirty bucket: owned_by_allocator == 0 && ++ * dirty_sectors > 0 ++ * The bucket contains data that we must not discard (either only copy, ++ * or one of the 'main copies' for data requiring multiple replicas) ++ * ++ * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 ++ * This is a btree node, journal or gen/prio bucket ++ * ++ * Lifecycle: ++ * ++ * bucket invalidated => bucket on freelist => open bucket => ++ * [dirty bucket =>] cached bucket => bucket invalidated => ... ++ * ++ * Note that cache promotion can skip the dirty bucket step, as data ++ * is copied from a deeper tier to a shallower tier, onto a cached ++ * bucket. ++ * Note also that a cached bucket can spontaneously become dirty -- ++ * see below. ++ * ++ * Only a traversal of the key space can determine whether a bucket is ++ * truly dirty or cached. 
++ * ++ * Transitions: ++ * ++ * - free => allocator: bucket was invalidated ++ * - cached => allocator: bucket was invalidated ++ * ++ * - allocator => dirty: open bucket was filled up ++ * - allocator => cached: open bucket was filled up ++ * - allocator => metadata: metadata was allocated ++ * ++ * - dirty => cached: dirty sectors were copied to a deeper tier ++ * - dirty => free: dirty sectors were overwritten or moved (copy gc) ++ * - cached => free: cached sectors were overwritten ++ * ++ * - metadata => free: metadata was freed ++ * ++ * Oddities: ++ * - cached => dirty: a device was removed so formerly replicated data ++ * is no longer sufficiently replicated ++ * - free => cached: cannot happen ++ * - free => dirty: cannot happen ++ * - free => metadata: cannot happen ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "ec.h" ++#include "error.h" ++#include "movinggc.h" ++#include "replicas.h" ++ ++#include ++#include ++ ++/* ++ * Clear journal_seq_valid for buckets for which it's not needed, to prevent ++ * wraparound: ++ */ ++void bch2_bucket_seq_cleanup(struct bch_fs *c) ++{ ++ u64 journal_seq = atomic64_read(&c->journal.seq); ++ u16 last_seq_ondisk = c->journal.last_seq_ondisk; ++ struct bch_dev *ca; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ struct bucket_mark m; ++ unsigned i; ++ ++ if (journal_seq - c->last_bucket_seq_cleanup < ++ (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) ++ return; ++ ++ c->last_bucket_seq_cleanup = journal_seq; ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) { ++ bucket_cmpxchg(g, m, ({ ++ if (!m.journal_seq_valid || ++ bucket_needs_journal_commit(m, last_seq_ondisk)) ++ break; ++ ++ m.journal_seq_valid = 0; ++ })); ++ } ++ up_read(&ca->bucket_lock); ++ } ++} ++ ++void bch2_fs_usage_initialize(struct bch_fs *c) ++{ ++ struct bch_fs_usage *usage; ++ unsigned i; ++ ++ percpu_down_write(&c->mark_lock); ++ usage = c->usage_base; ++ ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ usage->reserved += usage->persistent_reserved[i]; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ switch (e->data_type) { ++ case BCH_DATA_btree: ++ usage->btree += usage->replicas[i]; ++ break; ++ case BCH_DATA_user: ++ usage->data += usage->replicas[i]; ++ break; ++ case BCH_DATA_cached: ++ usage->cached += usage->replicas[i]; ++ break; ++ } ++ } ++ ++ percpu_up_write(&c->mark_lock); ++} ++ ++void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) ++{ ++ if (fs_usage == c->usage_scratch) ++ mutex_unlock(&c->usage_scratch_lock); ++ else ++ kfree(fs_usage); ++} ++ ++struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) ++{ ++ struct bch_fs_usage *ret; ++ unsigned bytes = fs_usage_u64s(c) * sizeof(u64); ++ ++ ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); ++ if (ret) ++ return ret; ++ ++ if (mutex_trylock(&c->usage_scratch_lock)) ++ goto out_pool; ++ ++ ret = kzalloc(bytes, GFP_NOFS); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->usage_scratch_lock); ++out_pool: ++ ret = c->usage_scratch; ++ memset(ret, 0, bytes); ++ return ret; ++} ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) ++{ ++ struct bch_dev_usage ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ 
acc_u64s_percpu((u64 *) &ret, ++ (u64 __percpu *) ca->usage[0], ++ sizeof(ret) / sizeof(u64)); ++ ++ return ret; ++} ++ ++static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, ++ unsigned journal_seq, ++ bool gc) ++{ ++ return this_cpu_ptr(gc ++ ? c->usage_gc ++ : c->usage[journal_seq & 1]); ++} ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) ++{ ++ ssize_t offset = v - (u64 *) c->usage_base; ++ unsigned seq; ++ u64 ret; ++ ++ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ ret = *v + ++ percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + ++ percpu_u64_get((u64 __percpu *) c->usage[1] + offset); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) ++{ ++ struct bch_fs_usage *ret; ++ unsigned seq, v, u64s = fs_usage_u64s(c); ++retry: ++ ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); ++ if (unlikely(!ret)) ++ return NULL; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ v = fs_usage_u64s(c); ++ if (unlikely(u64s != v)) { ++ u64s = v; ++ percpu_up_read(&c->mark_lock); ++ kfree(ret); ++ goto retry; ++ } ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ memcpy(ret, c->usage_base, u64s * sizeof(u64)); ++ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); ++ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) ++{ ++ unsigned u64s = fs_usage_u64s(c); ++ ++ BUG_ON(idx >= 2); ++ ++ write_seqcount_begin(&c->usage_lock); ++ ++ acc_u64s_percpu((u64 *) c->usage_base, ++ (u64 __percpu *) c->usage[idx], u64s); ++ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); ++ ++ write_seqcount_end(&c->usage_lock); ++} ++ ++void bch2_fs_usage_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_fs_usage *fs_usage) ++{ ++ unsigned i; ++ ++ pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); ++ ++ pr_buf(out, "hidden:\t\t\t\t%llu\n", ++ fs_usage->hidden); ++ pr_buf(out, "data:\t\t\t\t%llu\n", ++ fs_usage->data); ++ pr_buf(out, "cached:\t\t\t\t%llu\n", ++ fs_usage->cached); ++ pr_buf(out, "reserved:\t\t\t%llu\n", ++ fs_usage->reserved); ++ pr_buf(out, "nr_inodes:\t\t\t%llu\n", ++ fs_usage->nr_inodes); ++ pr_buf(out, "online reserved:\t\t%llu\n", ++ fs_usage->online_reserved); ++ ++ for (i = 0; ++ i < ARRAY_SIZE(fs_usage->persistent_reserved); ++ i++) { ++ pr_buf(out, "%u replicas:\n", i + 1); ++ pr_buf(out, "\treserved:\t\t%llu\n", ++ fs_usage->persistent_reserved[i]); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ pr_buf(out, "\t"); ++ bch2_replicas_entry_to_text(out, e); ++ pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); ++ } ++} ++ ++#define RESERVE_FACTOR 6 ++ ++static u64 reserve_factor(u64 r) ++{ ++ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); ++} ++ ++static u64 avail_factor(u64 r) ++{ ++ return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); ++} ++ ++u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) ++{ ++ return min(fs_usage->hidden + ++ fs_usage->btree + ++ fs_usage->data + ++ reserve_factor(fs_usage->reserved + ++ fs_usage->online_reserved), ++ c->capacity); ++} ++ ++static struct bch_fs_usage_short ++__bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct 
bch_fs_usage_short ret; ++ u64 data, reserved; ++ ++ ret.capacity = c->capacity - ++ bch2_fs_usage_read_one(c, &c->usage_base->hidden); ++ ++ data = bch2_fs_usage_read_one(c, &c->usage_base->data) + ++ bch2_fs_usage_read_one(c, &c->usage_base->btree); ++ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + ++ bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); ++ ++ ret.used = min(ret.capacity, data + reserve_factor(reserved)); ++ ret.free = ret.capacity - ret.used; ++ ++ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); ++ ++ return ret; ++} ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct bch_fs_usage_short ret; ++ ++ percpu_down_read(&c->mark_lock); ++ ret = __bch2_fs_usage_read_short(c); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++static inline int is_unavailable_bucket(struct bucket_mark m) ++{ ++ return !is_available_bucket(m); ++} ++ ++static inline int is_fragmented_bucket(struct bucket_mark m, ++ struct bch_dev *ca) ++{ ++ if (!m.owned_by_allocator && ++ m.data_type == BCH_DATA_user && ++ bucket_sectors_used(m)) ++ return max_t(int, 0, (int) ca->mi.bucket_size - ++ bucket_sectors_used(m)); ++ return 0; ++} ++ ++static inline int bucket_stripe_sectors(struct bucket_mark m) ++{ ++ return m.stripe ? m.dirty_sectors : 0; ++} ++ ++static inline enum bch_data_type bucket_type(struct bucket_mark m) ++{ ++ return m.cached_sectors && !m.dirty_sectors ++ ? BCH_DATA_cached ++ : m.data_type; ++} ++ ++static bool bucket_became_unavailable(struct bucket_mark old, ++ struct bucket_mark new) ++{ ++ return is_available_bucket(old) && ++ !is_available_bucket(new); ++} ++ ++int bch2_fs_usage_apply(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct disk_reservation *disk_res, ++ unsigned journal_seq) ++{ ++ s64 added = fs_usage->data + fs_usage->reserved; ++ s64 should_not_have_added; ++ int ret = 0; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ /* ++ * Not allowed to reduce sectors_available except by getting a ++ * reservation: ++ */ ++ should_not_have_added = added - (s64) (disk_res ? 
disk_res->sectors : 0); ++ if (WARN_ONCE(should_not_have_added > 0, ++ "disk usage increased by %lli without a reservation", ++ should_not_have_added)) { ++ atomic64_sub(should_not_have_added, &c->sectors_available); ++ added -= should_not_have_added; ++ ret = -1; ++ } ++ ++ if (added > 0) { ++ disk_res->sectors -= added; ++ fs_usage->online_reserved -= added; ++ } ++ ++ preempt_disable(); ++ acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), ++ (u64 *) fs_usage, fs_usage_u64s(c)); ++ preempt_enable(); ++ ++ return ret; ++} ++ ++static inline void account_bucket(struct bch_fs_usage *fs_usage, ++ struct bch_dev_usage *dev_usage, ++ enum bch_data_type type, ++ int nr, s64 size) ++{ ++ if (type == BCH_DATA_sb || type == BCH_DATA_journal) ++ fs_usage->hidden += size; ++ ++ dev_usage->buckets[type] += nr; ++} ++ ++static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ++ struct bch_fs_usage *fs_usage, ++ struct bucket_mark old, struct bucket_mark new, ++ bool gc) ++{ ++ struct bch_dev_usage *u; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ preempt_disable(); ++ u = this_cpu_ptr(ca->usage[gc]); ++ ++ if (bucket_type(old)) ++ account_bucket(fs_usage, u, bucket_type(old), ++ -1, -ca->mi.bucket_size); ++ ++ if (bucket_type(new)) ++ account_bucket(fs_usage, u, bucket_type(new), ++ 1, ca->mi.bucket_size); ++ ++ u->buckets_alloc += ++ (int) new.owned_by_allocator - (int) old.owned_by_allocator; ++ u->buckets_unavailable += ++ is_unavailable_bucket(new) - is_unavailable_bucket(old); ++ ++ u->buckets_ec += (int) new.stripe - (int) old.stripe; ++ u->sectors_ec += bucket_stripe_sectors(new) - ++ bucket_stripe_sectors(old); ++ ++ u->sectors[old.data_type] -= old.dirty_sectors; ++ u->sectors[new.data_type] += new.dirty_sectors; ++ u->sectors[BCH_DATA_cached] += ++ (int) new.cached_sectors - (int) old.cached_sectors; ++ u->sectors_fragmented += ++ is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); ++ preempt_enable(); ++ ++ if (!is_available_bucket(old) && is_available_bucket(new)) ++ bch2_wake_allocator(ca); ++} ++ ++void bch2_dev_usage_from_buckets(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct bucket_mark old = { .v.counter = 0 }; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ unsigned i; ++ int cpu; ++ ++ c->usage_base->hidden = 0; ++ ++ for_each_member_device(ca, c, i) { ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(ca->usage[0], cpu), 0, ++ sizeof(*ca->usage[0])); ++ ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ bch2_dev_usage_update(c, ca, c->usage_base, ++ old, g->mark, false); ++ } ++} ++ ++static inline int update_replicas(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ int idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) ++ return -1; ++ ++ if (!fs_usage) ++ return 0; ++ ++ switch (r->data_type) { ++ case BCH_DATA_btree: ++ fs_usage->btree += sectors; ++ break; ++ case BCH_DATA_user: ++ fs_usage->data += sectors; ++ break; ++ case BCH_DATA_cached: ++ fs_usage->cached += sectors; ++ break; ++ } ++ fs_usage->replicas[idx] += sectors; ++ return 0; ++} ++ ++static inline void update_cached_sectors(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ unsigned dev, s64 sectors) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ update_replicas(c, fs_usage, &r.e, sectors); ++} ++ ++static struct replicas_delta_list * ++replicas_deltas_realloc(struct btree_trans *trans, unsigned more) ++{ ++ struct replicas_delta_list 
*d = trans->fs_usage_deltas; ++ unsigned new_size = d ? (d->size + more) * 2 : 128; ++ ++ if (!d || d->used + more > d->size) { ++ d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); ++ BUG_ON(!d); ++ ++ d->size = new_size; ++ trans->fs_usage_deltas = d; ++ } ++ return d; ++} ++ ++static inline void update_replicas_list(struct btree_trans *trans, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ struct replicas_delta_list *d; ++ struct replicas_delta *n; ++ unsigned b; ++ ++ if (!sectors) ++ return; ++ ++ b = replicas_entry_bytes(r) + 8; ++ d = replicas_deltas_realloc(trans, b); ++ ++ n = (void *) d->d + d->used; ++ n->delta = sectors; ++ memcpy(&n->r, r, replicas_entry_bytes(r)); ++ d->used += b; ++} ++ ++static inline void update_cached_sectors_list(struct btree_trans *trans, ++ unsigned dev, s64 sectors) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ update_replicas_list(trans, &r.e, sectors); ++} ++ ++static inline struct replicas_delta * ++replicas_delta_next(struct replicas_delta *d) ++{ ++ return (void *) d + replicas_entry_bytes(&d->r) + 8; ++} ++ ++int bch2_replicas_delta_list_apply(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct replicas_delta_list *r) ++{ ++ struct replicas_delta *d = r->d; ++ struct replicas_delta *top = (void *) r->d + r->used; ++ unsigned i; ++ ++ for (d = r->d; d != top; d = replicas_delta_next(d)) ++ if (update_replicas(c, fs_usage, &d->r, d->delta)) { ++ top = d; ++ goto unwind; ++ } ++ ++ if (!fs_usage) ++ return 0; ++ ++ fs_usage->nr_inodes += r->nr_inodes; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ fs_usage->reserved += r->persistent_reserved[i]; ++ fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; ++ } ++ ++ return 0; ++unwind: ++ for (d = r->d; d != top; d = replicas_delta_next(d)) ++ update_replicas(c, fs_usage, &d->r, -d->delta); ++ return -1; ++} ++ ++#define do_mark_fn(fn, c, pos, flags, ...) 
\ ++({ \ ++ int gc, ret = 0; \ ++ \ ++ percpu_rwsem_assert_held(&c->mark_lock); \ ++ \ ++ for (gc = 0; gc < 2 && !ret; gc++) \ ++ if (!gc == !(flags & BTREE_TRIGGER_GC) || \ ++ (gc && gc_visited(c, pos))) \ ++ ret = fn(c, __VA_ARGS__, gc); \ ++ ret; \ ++}) ++ ++static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark *ret, ++ bool gc) ++{ ++ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ BUG_ON(!is_available_bucket(new)); ++ ++ new.owned_by_allocator = true; ++ new.data_type = 0; ++ new.cached_sectors = 0; ++ new.dirty_sectors = 0; ++ new.gen++; ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ if (old.cached_sectors) ++ update_cached_sectors(c, fs_usage, ca->dev_idx, ++ -((s64) old.cached_sectors)); ++ ++ if (!gc) ++ *ret = old; ++ return 0; ++} ++ ++void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark *old) ++{ ++ do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, ++ ca, b, old); ++ ++ if (!old->owned_by_allocator && old->cached_sectors) ++ trace_invalidate(ca, bucket_to_sector(ca, b), ++ old->cached_sectors); ++} ++ ++static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, bool owned_by_allocator, ++ bool gc) ++{ ++ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.owned_by_allocator = owned_by_allocator; ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ BUG_ON(!gc && ++ !owned_by_allocator && !old.owned_by_allocator); ++ ++ return 0; ++} ++ ++void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, bool owned_by_allocator, ++ struct gc_pos pos, unsigned flags) ++{ ++ preempt_disable(); ++ ++ do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, ++ ca, b, owned_by_allocator); ++ ++ preempt_enable(); ++} ++ ++static int bch2_mark_alloc(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct bkey_alloc_unpacked u; ++ struct bch_dev *ca; ++ struct bucket *g; ++ struct bucket_mark old_m, m; ++ ++ /* We don't do anything for deletions - do we?: */ ++ if (new.k->type != KEY_TYPE_alloc) ++ return 0; ++ ++ /* ++ * alloc btree is read in by bch2_alloc_read, not gc: ++ */ ++ if ((flags & BTREE_TRIGGER_GC) && ++ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, new.k->p.inode); ++ ++ if (new.k->p.offset >= ca->mi.nbuckets) ++ return 0; ++ ++ g = __bucket(ca, new.k->p.offset, gc); ++ u = bch2_alloc_unpack(new); ++ ++ old_m = bucket_cmpxchg(g, m, ({ ++ m.gen = u.gen; ++ m.data_type = u.data_type; ++ m.dirty_sectors = u.dirty_sectors; ++ m.cached_sectors = u.cached_sectors; ++ ++ if (journal_seq) { ++ m.journal_seq_valid = 1; ++ m.journal_seq = journal_seq; ++ } ++ })); ++ ++ if (!(flags & BTREE_TRIGGER_ALLOC_READ)) ++ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); ++ ++ g->io_time[READ] = u.read_time; ++ g->io_time[WRITE] = u.write_time; ++ g->oldest_gen = u.oldest_gen; ++ g->gen_valid = 1; ++ ++ /* ++ * need to know if we're getting called from the invalidate path or ++ * not: ++ */ ++ ++ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && ++ old_m.cached_sectors) { ++ 
update_cached_sectors(c, fs_usage, ca->dev_idx, ++ -old_m.cached_sectors); ++ trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), ++ old_m.cached_sectors); ++ } ++ ++ return 0; ++} ++ ++#define checked_add(a, b) \ ++({ \ ++ unsigned _res = (unsigned) (a) + (b); \ ++ bool overflow = _res > U16_MAX; \ ++ if (overflow) \ ++ _res = U16_MAX; \ ++ (a) = _res; \ ++ overflow; \ ++}) ++ ++static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type data_type, ++ unsigned sectors, bool gc) ++{ ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ bool overflow; ++ ++ BUG_ON(data_type != BCH_DATA_sb && ++ data_type != BCH_DATA_journal); ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.data_type = data_type; ++ overflow = checked_add(new.dirty_sectors, sectors); ++ })); ++ ++ bch2_fs_inconsistent_on(old.data_type && ++ old.data_type != data_type, c, ++ "different types of data in same bucket: %s, %s", ++ bch2_data_types[old.data_type], ++ bch2_data_types[data_type]); ++ ++ bch2_fs_inconsistent_on(overflow, c, ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", ++ ca->dev_idx, b, new.gen, ++ bch2_data_types[old.data_type ?: data_type], ++ old.dirty_sectors, sectors); ++ ++ if (c) ++ bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), ++ old, new, gc); ++ ++ return 0; ++} ++ ++void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type type, ++ unsigned sectors, struct gc_pos pos, ++ unsigned flags) ++{ ++ BUG_ON(type != BCH_DATA_sb && ++ type != BCH_DATA_journal); ++ ++ preempt_disable(); ++ ++ if (likely(c)) { ++ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, ++ ca, b, type, sectors); ++ } else { ++ __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); ++ } ++ ++ preempt_enable(); ++} ++ ++static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) ++{ ++ return DIV_ROUND_UP(sectors * n, d); ++} ++ ++static s64 __ptr_disk_sectors_delta(unsigned old_size, ++ unsigned offset, s64 delta, ++ unsigned flags, ++ unsigned n, unsigned d) ++{ ++ BUG_ON(!n || !d); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { ++ BUG_ON(offset + -delta > old_size); ++ ++ return -disk_sectors_scaled(n, d, old_size) + ++ disk_sectors_scaled(n, d, offset) + ++ disk_sectors_scaled(n, d, old_size - offset + delta); ++ } else if (flags & BTREE_TRIGGER_OVERWRITE) { ++ BUG_ON(offset + -delta > old_size); ++ ++ return -disk_sectors_scaled(n, d, old_size) + ++ disk_sectors_scaled(n, d, old_size + delta); ++ } else { ++ return disk_sectors_scaled(n, d, delta); ++ } ++} ++ ++static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, ++ unsigned offset, s64 delta, ++ unsigned flags) ++{ ++ return __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, delta, flags, ++ p.crc.compressed_size, ++ p.crc.uncompressed_size); ++} ++ ++static void bucket_set_stripe(struct bch_fs *c, ++ const struct bch_extent_ptr *ptr, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, ++ unsigned flags, ++ bool enabled) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, gc); ++ struct bucket_mark new, old; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.stripe = enabled; ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; ++ } ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ /* ++ * XXX write repair code for these, flag stripe as possibly bad 
++ */ ++ if (old.gen != ptr->gen) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "stripe with stale pointer"); ++#if 0 ++ /* ++ * We'd like to check for these, but these checks don't work ++ * yet: ++ */ ++ if (old.stripe && enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "multiple stripes using same bucket"); ++ ++ if (!old.stripe && !enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "deleting stripe but bucket not marked as stripe bucket"); ++#endif ++} ++ ++static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 bucket_gen, u8 *bucket_data_type, ++ u16 *dirty_sectors, u16 *cached_sectors) ++{ ++ u16 *dst_sectors = !p.ptr.cached ++ ? dirty_sectors ++ : cached_sectors; ++ u16 orig_sectors = *dst_sectors; ++ char buf[200]; ++ ++ if (gen_after(p.ptr.gen, bucket_gen)) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (gen_cmp(bucket_gen, p.ptr.gen) >= 96U) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (bucket_gen != p.ptr.gen && !p.ptr.cached) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ p.ptr.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (bucket_gen != p.ptr.gen) ++ return 1; ++ ++ if (*bucket_data_type && *bucket_data_type != ptr_data_type) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type], ++ bch2_data_types[ptr_data_type], ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (checked_add(*dst_sectors, sectors)) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" ++ "while marking %s", ++ p.ptr.dev, PTR_BUCKET_NR(bch_dev_bkey_exists(c, p.ptr.dev), &p.ptr), ++ bucket_gen, ++ bch2_data_types[*bucket_data_type ?: ptr_data_type], ++ orig_sectors, sectors, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ *bucket_data_type = *dirty_sectors || *cached_sectors ++ ? 
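/*
 * Sketch of the generation comparisons used by __mark_pointer() above: bucket
 * and pointer generations are 8-bit counters that wrap, so they are compared
 * via a signed 8-bit difference (gen_cmp()/gen_after() in buckets.h further
 * down) rather than with a plain '<'. Standalone model, assuming the usual
 * two's-complement truncation the kernel code also relies on:
 */
#include <stdint.h>
#include <stdio.h>

static int gen_cmp(uint8_t a, uint8_t b)
{
    return (int8_t) (a - b);
}

int main(void)
{
    /* bucket gen 2 is 8 generations *newer* than pointer gen 250: */
    printf("%d\n", gen_cmp(2, 250));    /* 8 */
    printf("%d\n", gen_cmp(250, 2));    /* -8 */
    /* __mark_pointer() treats gen_cmp(bucket_gen, ptr_gen) >= 96 as "too stale" */
    return 0;
}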
ptr_data_type : 0; ++ return 0; ++} ++ ++static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct bucket_mark old, new; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); ++ u8 bucket_data_type; ++ u64 v; ++ int ret; ++ ++ v = atomic64_read(&g->_mark.v); ++ do { ++ new.v.counter = old.v.counter = v; ++ bucket_data_type = new.data_type; ++ ++ ret = __mark_pointer(c, k, p, sectors, data_type, new.gen, ++ &bucket_data_type, ++ &new.dirty_sectors, ++ &new.cached_sectors); ++ if (ret) ++ return ret; ++ ++ new.data_type = bucket_data_type; ++ ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; ++ } ++ ++ if (flags & BTREE_TRIGGER_NOATOMIC) { ++ g->_mark = new; ++ break; ++ } ++ } while ((v = atomic64_cmpxchg(&g->_mark.v, ++ old.v.counter, ++ new.v.counter)) != old.v.counter); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ BUG_ON(!gc && bucket_became_unavailable(old, new)); ++ ++ return 0; ++} ++ ++static int bch2_mark_stripe_ptr(struct bch_fs *c, ++ struct bch_extent_stripe_ptr p, ++ enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ s64 sectors, unsigned flags, ++ struct bch_replicas_padded *r, ++ unsigned *nr_data, ++ unsigned *nr_parity) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct stripe *m; ++ unsigned i, blocks_nonempty = 0; ++ ++ m = genradix_ptr(&c->stripes[gc], p.idx); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ if (!m || !m->alive) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ return -EIO; ++ } ++ ++ BUG_ON(m->r.e.data_type != data_type); ++ ++ *nr_data = m->nr_blocks - m->nr_redundant; ++ *nr_parity = m->nr_redundant; ++ *r = m->r; ++ ++ m->block_sectors[p.block] += sectors; ++ ++ for (i = 0; i < m->nr_blocks; i++) ++ blocks_nonempty += m->block_sectors[i] != 0; ++ ++ if (m->blocks_nonempty != blocks_nonempty) { ++ m->blocks_nonempty = blocks_nonempty; ++ if (!gc) ++ bch2_stripes_heap_update(c, m, p.idx); ++ } ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ return 0; ++} ++ ++static int bch2_mark_extent(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned offset, s64 sectors, ++ enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ unsigned journal_seq, unsigned flags) ++{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ BUG_ON(!sectors); ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = data_type == BCH_DATA_btree ++ ? 
sectors ++ : ptr_disk_sectors_delta(p, offset, sectors, flags); ++ ++ ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, ++ fs_usage, journal_seq, flags); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) ++ update_cached_sectors(c, fs_usage, p.ptr.dev, ++ disk_sectors); ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ struct bch_replicas_padded ec_r; ++ unsigned nr_data, nr_parity; ++ s64 parity_sectors; ++ ++ ret = bch2_mark_stripe_ptr(c, p.ec, data_type, ++ fs_usage, disk_sectors, flags, ++ &ec_r, &nr_data, &nr_parity); ++ if (ret) ++ return ret; ++ ++ parity_sectors = ++ __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, sectors, flags, ++ p.crc.compressed_size * nr_parity, ++ p.crc.uncompressed_size * nr_data); ++ ++ update_replicas(c, fs_usage, &ec_r.e, ++ disk_sectors + parity_sectors); ++ ++ /* ++ * There may be other dirty pointers in this extent, but ++ * if so they're not required for mounting if we have an ++ * erasure coded pointer in this extent: ++ */ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) ++ update_replicas(c, fs_usage, &r.e, dirty_sectors); ++ ++ return 0; ++} ++ ++static int bch2_mark_stripe(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ size_t idx = new.k->p.offset; ++ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ++ ? bkey_s_c_to_stripe(old).v : NULL; ++ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ++ ? bkey_s_c_to_stripe(new).v : NULL; ++ struct stripe *m = genradix_ptr(&c->stripes[gc], idx); ++ unsigned i; ++ ++ if (!m || (old_s && !m->alive)) { ++ bch_err_ratelimited(c, "error marking nonexistent stripe %zu", ++ idx); ++ return -1; ++ } ++ ++ if (!new_s) { ++ /* Deleting: */ ++ for (i = 0; i < old_s->nr_blocks; i++) ++ bucket_set_stripe(c, old_s->ptrs + i, fs_usage, ++ journal_seq, flags, false); ++ ++ if (!gc && m->on_heap) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_del(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ ++ memset(m, 0, sizeof(*m)); ++ } else { ++ BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); ++ BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); ++ ++ for (i = 0; i < new_s->nr_blocks; i++) { ++ if (!old_s || ++ memcmp(new_s->ptrs + i, ++ old_s->ptrs + i, ++ sizeof(struct bch_extent_ptr))) { ++ ++ if (old_s) ++ bucket_set_stripe(c, old_s->ptrs + i, fs_usage, ++ journal_seq, flags, false); ++ bucket_set_stripe(c, new_s->ptrs + i, fs_usage, ++ journal_seq, flags, true); ++ } ++ } ++ ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->algorithm = new_s->algorithm; ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ ++ bch2_bkey_to_replicas(&m->r.e, new); ++ ++ /* gc recalculates these fields: */ ++ if (!(flags & BTREE_TRIGGER_GC)) { ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) { ++ m->block_sectors[i] = ++ stripe_blockcount_get(new_s, i); ++ m->blocks_nonempty += !!m->block_sectors[i]; ++ } ++ } ++ ++ if (!gc) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_update(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ } ++ ++ return 0; ++} ++ ++static int bch2_mark_key_locked(struct bch_fs *c, ++ struct bkey_s_c old, ++ struct bkey_s_c new, ++ unsigned offset, s64 sectors, ++ struct bch_fs_usage *fs_usage, 
++ u64 journal_seq, unsigned flags) ++{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; ++ int ret = 0; ++ ++ BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); ++ ++ preempt_disable(); ++ ++ if (!fs_usage || (flags & BTREE_TRIGGER_GC)) ++ fs_usage = fs_usage_ptr(c, journal_seq, ++ flags & BTREE_TRIGGER_GC); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_alloc: ++ ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ++ ? c->opts.btree_node_size ++ : -c->opts.btree_node_size; ++ ++ ret = bch2_mark_extent(c, old, new, offset, sectors, ++ BCH_DATA_btree, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ ret = bch2_mark_extent(c, old, new, offset, sectors, ++ BCH_DATA_user, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_stripe: ++ ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_inode: ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) ++ fs_usage->nr_inodes++; ++ else ++ fs_usage->nr_inodes--; ++ break; ++ case KEY_TYPE_reservation: { ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ ++ sectors *= replicas; ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(fs_usage->persistent_reserved)); ++ ++ fs_usage->reserved += sectors; ++ fs_usage->persistent_reserved[replicas - 1] += sectors; ++ break; ++ } ++ } ++ ++ preempt_enable(); ++ ++ return ret; ++} ++ ++int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, ++ unsigned offset, s64 sectors, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ struct bkey deleted; ++ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; ++ int ret; ++ ++ bkey_init(&deleted); ++ ++ percpu_down_read(&c->mark_lock); ++ ret = bch2_mark_key_locked(c, old, new, offset, sectors, ++ fs_usage, journal_seq, ++ BTREE_TRIGGER_INSERT|flags); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++int bch2_mark_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *new, ++ struct bch_fs_usage *fs_usage, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = iter_l(iter)->b; ++ struct btree_node_iter node_iter = iter_l(iter)->iter; ++ struct bkey_packed *_old; ++ struct bkey_s_c old; ++ struct bkey unpacked; ++ int ret = 0; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(iter->btree_id)) ++ return 0; ++ ++ bkey_init(&unpacked); ++ old = (struct bkey_s_c) { &unpacked, NULL }; ++ ++ if (!btree_node_type_is_extents(iter->btree_id)) { ++ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { ++ _old = bch2_btree_node_iter_peek(&node_iter, b); ++ if (_old) ++ old = bkey_disassemble(b, _old, &unpacked); ++ } else { ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ ++ if (ck->valid) ++ old = bkey_i_to_s_c(ck->k); ++ } ++ ++ if (old.k->type == new->k.type) { ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ ++ } else { ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|flags); ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ } else { ++ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); ++ bch2_mark_key_locked(c, old, 
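/*
 * Worked example (sketch) of the KEY_TYPE_reservation case above: a
 * reservation key for 8 sectors with nr_replicas = 2 consumes 8 * 2 = 16 raw
 * sectors, added both to fs_usage->reserved and to persistent_reserved[2 - 1].
 * Standalone model, assuming BCH_REPLICAS_MAX == 4 purely for illustration:
 */
#include <stdio.h>

#define BCH_REPLICAS_MAX 4

int main(void)
{
    unsigned long long persistent_reserved[BCH_REPLICAS_MAX] = { 0 };
    long long sectors = 8;
    unsigned replicas = 2;

    sectors *= replicas;                /* 16 raw (replicated) sectors */

    /* clamp_t(unsigned, replicas, 1, ARRAY_SIZE(persistent_reserved)) */
    if (replicas < 1)
        replicas = 1;
    if (replicas > BCH_REPLICAS_MAX)
        replicas = BCH_REPLICAS_MAX;

    persistent_reserved[replicas - 1] += sectors;
    printf("%llu\n", persistent_reserved[1]);   /* 16 */
    return 0;
}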
bkey_i_to_s_c(new), ++ 0, new->k.size, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|flags); ++ ++ while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { ++ unsigned offset = 0; ++ s64 sectors; ++ ++ old = bkey_disassemble(b, _old, &unpacked); ++ sectors = -((s64) old.k->size); ++ ++ flags |= BTREE_TRIGGER_OVERWRITE; ++ ++ if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) ++ return 0; ++ ++ switch (bch2_extent_overlap(&new->k, old.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ offset = 0; ++ sectors = -((s64) old.k->size); ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = bkey_start_offset(&new->k) - ++ old.k->p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_FRONT: ++ offset = 0; ++ sectors = bkey_start_offset(old.k) - ++ new->k.p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = -((s64) new->k.size); ++ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; ++ break; ++ } ++ ++ BUG_ON(sectors >= 0); ++ ++ ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), ++ offset, sectors, fs_usage, ++ trans->journal_res.seq, flags) ?: 1; ++ if (ret <= 0) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ } ++ ++ return ret; ++} ++ ++void bch2_trans_fs_usage_apply(struct btree_trans *trans, ++ struct bch_fs_usage *fs_usage) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ static int warned_disk_usage = 0; ++ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; ++ char buf[200]; ++ ++ if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, ++ trans->journal_res.seq) || ++ warned_disk_usage || ++ xchg(&warned_disk_usage, 1)) ++ return; ++ ++ bch_err(c, "disk usage increased more than %llu sectors reserved", ++ disk_res_sectors); ++ ++ trans_for_each_update(trans, i) { ++ pr_err("while inserting"); ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); ++ pr_err("%s", buf); ++ pr_err("overlapping with"); ++ ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { ++ struct btree *b = iter_l(i->iter)->b; ++ struct btree_node_iter node_iter = iter_l(i->iter)->iter; ++ struct bkey_packed *_k; ++ ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ ++ pr_info("_k %px format %u", _k, _k->format); ++ k = bkey_disassemble(b, _k, &unpacked); ++ ++ if (btree_node_is_extents(b) ++ ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 ++ : bkey_cmp(i->k->k.p, k.k->p)) ++ break; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ pr_err("%s", buf); ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ } else { ++ struct bkey_cached *ck = (void *) i->iter->l[0].b; ++ ++ if (ck->valid) { ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); ++ pr_err("%s", buf); ++ } ++ } ++ } ++} ++ ++/* trans_mark: */ ++ ++static struct btree_iter *trans_get_update(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ struct bkey_s_c *k) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ if (i->iter->btree_id == btree_id && ++ (btree_node_type_is_extents(btree_id) ++ ? 
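/*
 * Worked example (sketch) for the BCH_EXTENT_OVERLAP_* cases above. Say the
 * existing extent 'old' covers sectors [64, 128) and the insert 'new' covers
 * [96, 160): that is OVERLAP_BACK, so
 *
 *   offset  = start(new) - start(old) = 96 - 64  =  32
 *   sectors = start(new) - end(old)   = 96 - 128 = -32
 *
 * i.e. the trigger is told that the last 32 sectors of 'old', beginning 32
 * sectors into it, are being overwritten. Standalone check of the arithmetic:
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t old_start = 64, old_end = 128, new_start = 96;

    unsigned offset  = new_start - old_start;
    int64_t  sectors = (int64_t) new_start - (int64_t) old_end;

    assert(offset == 32 && sectors == -32);
    return 0;
}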
bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && ++ bkey_cmp(pos, i->k->k.p) < 0 ++ : !bkey_cmp(pos, i->iter->pos))) { ++ *k = bkey_i_to_s_c(i->k); ++ return i->iter; ++ } ++ ++ return NULL; ++} ++ ++static int trans_get_key(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ struct btree_iter **iter, ++ struct bkey_s_c *k) ++{ ++ unsigned flags = btree_id != BTREE_ID_ALLOC ++ ? BTREE_ITER_SLOTS ++ : BTREE_ITER_CACHED; ++ int ret; ++ ++ *iter = trans_get_update(trans, btree_id, pos, k); ++ if (*iter) ++ return 1; ++ ++ *iter = bch2_trans_get_iter(trans, btree_id, pos, ++ flags|BTREE_ITER_INTENT); ++ if (IS_ERR(*iter)) ++ return PTR_ERR(*iter); ++ ++ *k = __bch2_btree_iter_peek(*iter, flags); ++ ret = bkey_err(*k); ++ if (ret) ++ bch2_trans_iter_put(trans, *iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bpos pos = POS(p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr)); ++ struct btree_iter *iter; ++ struct bkey_s_c k_a; ++ struct bkey_alloc_unpacked u; ++ struct bkey_i_alloc *a; ++ struct bucket *g; ++ int ret; ++ ++ iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k_a); ++ if (iter) { ++ u = bch2_alloc_unpack(k_a); ++ } else { ++ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto out; ++ ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, pos.offset); ++ u = alloc_mem_to_key(g, READ_ONCE(g->mark)); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ ret = __mark_pointer(c, k, p, sectors, data_type, u.gen, &u.data_type, ++ &u.dirty_sectors, &u.cached_sectors); ++ if (ret) ++ goto out; ++ ++ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ bkey_alloc_init(&a->k_i); ++ a->k.p = pos; ++ bch2_alloc_pack(a, u); ++ bch2_trans_update(trans, iter, &a->k_i, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, ++ struct bch_extent_stripe_ptr p, ++ s64 sectors, enum bch_data_type data_type, ++ struct bch_replicas_padded *r, ++ unsigned *nr_data, ++ unsigned *nr_parity) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_i_stripe *s; ++ int ret = 0; ++ ++ ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); ++ if (ret < 0) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_stripe) { ++ bch2_fs_inconsistent(c, ++ "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ ret = -EIO; ++ goto out; ++ } ++ ++ s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ goto out; ++ ++ bkey_reassemble(&s->k_i, k); ++ ++ stripe_blockcount_set(&s->v, p.block, ++ stripe_blockcount_get(&s->v, p.block) + ++ sectors); ++ ++ *nr_data = s->v.nr_blocks - s->v.nr_redundant; ++ *nr_parity = s->v.nr_redundant; ++ bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); ++ bch2_trans_update(trans, iter, &s->k_i, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c k, unsigned offset, ++ s64 sectors, unsigned flags, ++ enum bch_data_type data_type) ++{ ++ 
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ BUG_ON(!sectors); ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = data_type == BCH_DATA_btree ++ ? sectors ++ : ptr_disk_sectors_delta(p, offset, sectors, flags); ++ ++ ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, ++ data_type); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) ++ update_cached_sectors_list(trans, p.ptr.dev, ++ disk_sectors); ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ struct bch_replicas_padded ec_r; ++ unsigned nr_data, nr_parity; ++ s64 parity_sectors; ++ ++ ret = bch2_trans_mark_stripe_ptr(trans, p.ec, ++ disk_sectors, data_type, ++ &ec_r, &nr_data, &nr_parity); ++ if (ret) ++ return ret; ++ ++ parity_sectors = ++ __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, sectors, flags, ++ p.crc.compressed_size * nr_parity, ++ p.crc.uncompressed_size * nr_data); ++ ++ update_replicas_list(trans, &ec_r.e, ++ disk_sectors + parity_sectors); ++ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) ++ update_replicas_list(trans, &r.e, dirty_sectors); ++ ++ return 0; ++} ++ ++static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 idx, unsigned sectors, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_i_reflink_v *r_v; ++ s64 ret; ++ ++ ret = trans_get_key(trans, BTREE_ID_REFLINK, ++ POS(0, idx), &iter, &k); ++ if (ret < 0) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_reflink_v) { ++ bch2_fs_inconsistent(c, ++ "%llu:%llu len %u points to nonexistent indirect extent %llu", ++ p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if ((flags & BTREE_TRIGGER_OVERWRITE) && ++ (bkey_start_offset(k.k) < idx || ++ k.k->p.offset > idx + sectors)) ++ goto out; ++ ++ sectors = k.k->p.offset - idx; ++ ++ r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(r_v); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&r_v->k_i, k); ++ ++ le64_add_cpu(&r_v->v.refcount, ++ !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1); ++ ++ if (!r_v->v.refcount) { ++ r_v->k.type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(&r_v->k, 0); ++ } ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); ++ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ ++ bch2_trans_update(trans, iter, &r_v->k_i, 0); ++out: ++ ret = sectors; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, unsigned offset, ++ s64 sectors, unsigned flags) ++{ ++ u64 idx = le64_to_cpu(p.v->idx) + offset; ++ s64 ret = 0; ++ ++ sectors = abs(sectors); ++ BUG_ON(offset + sectors > p.k->size); ++ ++ while (sectors) { ++ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); ++ if (ret < 0) ++ break; ++ ++ idx += ret; ++ sectors = max_t(s64, 0LL, sectors - ret); ++ ret = 0; ++ } ++ ++ return ret; ++} ++ ++int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, ++ unsigned offset, s64 sectors, unsigned flags) ++{ ++ struct replicas_delta_list *d; ++ struct bch_fs *c = trans->c; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ++ ? c->opts.btree_node_size ++ : -c->opts.btree_node_size; ++ ++ return bch2_trans_mark_extent(trans, k, offset, sectors, ++ flags, BCH_DATA_btree); ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return bch2_trans_mark_extent(trans, k, offset, sectors, ++ flags, BCH_DATA_user); ++ case KEY_TYPE_inode: ++ d = replicas_deltas_realloc(trans, 0); ++ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) ++ d->nr_inodes++; ++ else ++ d->nr_inodes--; ++ return 0; ++ case KEY_TYPE_reservation: { ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ ++ d = replicas_deltas_realloc(trans, 0); ++ ++ sectors *= replicas; ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(d->persistent_reserved)); ++ ++ d->persistent_reserved[replicas - 1] += sectors; ++ return 0; ++ } ++ case KEY_TYPE_reflink_p: ++ return bch2_trans_mark_reflink_p(trans, ++ bkey_s_c_to_reflink_p(k), ++ offset, sectors, flags); ++ default: ++ return 0; ++ } ++} ++ ++int bch2_trans_mark_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ unsigned flags) ++{ ++ struct btree *b = iter_l(iter)->b; ++ struct btree_node_iter node_iter = iter_l(iter)->iter; ++ struct bkey_packed *_k; ++ int ret; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(iter->btree_id)) ++ return 0; ++ ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), ++ 0, insert->k.size, BTREE_TRIGGER_INSERT); ++ if (ret) ++ return ret; ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ ++ return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), ++ 0, 0, BTREE_TRIGGER_OVERWRITE); ++ } ++ ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ unsigned offset = 0; ++ s64 sectors = 0; ++ unsigned flags = BTREE_TRIGGER_OVERWRITE; ++ ++ k = bkey_disassemble(b, _k, &unpacked); ++ ++ if (btree_node_is_extents(b) ++ ? 
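/*
 * Sketch of the loop in bch2_trans_mark_reflink_p() above: each call to
 * __bch2_trans_mark_reflink_p() adjusts the refcount of the indirect extent
 * containing 'idx' and returns how many sectors that extent covers from 'idx'
 * onwards; the caller then advances by that amount until the range is done.
 * Standalone model with made-up extent boundaries ([1000, 1060) and
 * [1060, 1200)):
 */
#include <stdint.h>
#include <stdio.h>

static int64_t mark_one(uint64_t idx)
{
    /* pretend the refcount bump happens here; return sectors to extent end */
    return idx < 1060 ? 1060 - idx : 1200 - idx;
}

int main(void)
{
    uint64_t idx = 1000;
    int64_t sectors = 100, ret;

    while (sectors) {
        ret = mark_one(idx);
        printf("marked %lld sectors at %llu\n",
               (long long) ret, (unsigned long long) idx);
        idx += ret;
        sectors = sectors - ret > 0 ? sectors - ret : 0;
    }
    return 0;
}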
bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 ++ : bkey_cmp(insert->k.p, k.k->p)) ++ break; ++ ++ if (btree_node_is_extents(b)) { ++ switch (bch2_extent_overlap(&insert->k, k.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ offset = 0; ++ sectors = -((s64) k.k->size); ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ sectors = bkey_start_offset(&insert->k) - ++ k.k->p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_FRONT: ++ offset = 0; ++ sectors = bkey_start_offset(k.k) - ++ insert->k.p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ sectors = -((s64) insert->k.size); ++ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; ++ break; ++ } ++ ++ BUG_ON(sectors >= 0); ++ } ++ ++ ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); ++ if (ret) ++ return ret; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return 0; ++} ++ ++/* Disk reservations: */ ++ ++static u64 bch2_recalc_sectors_available(struct bch_fs *c) ++{ ++ percpu_u64_set(&c->pcpu->sectors_available, 0); ++ ++ return avail_factor(__bch2_fs_usage_read_short(c).free); ++} ++ ++void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) ++{ ++ percpu_down_read(&c->mark_lock); ++ this_cpu_sub(c->usage[0]->online_reserved, ++ res->sectors); ++ percpu_up_read(&c->mark_lock); ++ ++ res->sectors = 0; ++} ++ ++#define SECTORS_CACHE 1024 ++ ++int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, ++ unsigned sectors, int flags) ++{ ++ struct bch_fs_pcpu *pcpu; ++ u64 old, v, get; ++ s64 sectors_available; ++ int ret; ++ ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ pcpu = this_cpu_ptr(c->pcpu); ++ ++ if (sectors <= pcpu->sectors_available) ++ goto out; ++ ++ v = atomic64_read(&c->sectors_available); ++ do { ++ old = v; ++ get = min((u64) sectors + SECTORS_CACHE, old); ++ ++ if (get < sectors) { ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ goto recalculate; ++ } ++ } while ((v = atomic64_cmpxchg(&c->sectors_available, ++ old, old - get)) != old); ++ ++ pcpu->sectors_available += get; ++ ++out: ++ pcpu->sectors_available -= sectors; ++ this_cpu_add(c->usage[0]->online_reserved, sectors); ++ res->sectors += sectors; ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ return 0; ++ ++recalculate: ++ percpu_down_write(&c->mark_lock); ++ ++ sectors_available = bch2_recalc_sectors_available(c); ++ ++ if (sectors <= sectors_available || ++ (flags & BCH_DISK_RESERVATION_NOFAIL)) { ++ atomic64_set(&c->sectors_available, ++ max_t(s64, 0, sectors_available - sectors)); ++ this_cpu_add(c->usage[0]->online_reserved, sectors); ++ res->sectors += sectors; ++ ret = 0; ++ } else { ++ atomic64_set(&c->sectors_available, sectors_available); ++ ret = -ENOSPC; ++ } ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return ret; ++} ++ ++/* Startup/shutdown: */ ++ ++static void buckets_free_rcu(struct rcu_head *rcu) ++{ ++ struct bucket_array *buckets = ++ container_of(rcu, struct bucket_array, rcu); ++ ++ kvpfree(buckets, ++ sizeof(struct bucket_array) + ++ buckets->nbuckets * sizeof(struct bucket)); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bucket_array *buckets = NULL, *old_buckets = NULL; ++ unsigned long *buckets_nouse = NULL; ++ alloc_fifo free[RESERVE_NR]; ++ alloc_fifo free_inc; ++ alloc_heap alloc_heap; ++ ++ size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ++ 
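/*
 * Sketch of the fast path in bch2_disk_reservation_add() above: each CPU
 * keeps a small private pool of already-reserved sectors and only touches the
 * shared counter when that pool runs dry, grabbing SECTORS_CACHE extra each
 * time. Simplified single-threaded model in standard C (illustrative only;
 * the real code uses percpu counters and falls back to a recalculation path):
 */
#include <stdatomic.h>
#include <stdio.h>

#define SECTORS_CACHE 1024

static atomic_ullong sectors_available = 100000;
static unsigned long long pcpu_cache;   /* stand-in for this CPU's cache */

static int reserve(unsigned long long sectors)
{
    unsigned long long old, get;

    if (sectors <= pcpu_cache)
        goto out;

    old = atomic_load(&sectors_available);
    do {
        get = sectors + SECTORS_CACHE < old ? sectors + SECTORS_CACHE : old;
        if (get < sectors)
            return -1;  /* real code: recalculate, maybe -ENOSPC */
    } while (!atomic_compare_exchange_weak(&sectors_available, &old, old - get));

    pcpu_cache += get;
out:
    pcpu_cache -= sectors;
    return 0;
}

int main(void)
{
    int ret = reserve(10);

    printf("%d %llu\n", ret, pcpu_cache);   /* 0 1024: fills the per-CPU cache */
    ret = reserve(10);
    printf("%d %llu\n", ret, pcpu_cache);   /* 0 1014: served from the cache */
    return 0;
}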
ca->mi.bucket_size / c->opts.btree_node_size); ++ /* XXX: these should be tunable */ ++ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); ++ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); ++ size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), ++ btree_reserve * 2); ++ bool resize = ca->buckets[0] != NULL; ++ int ret = -ENOMEM; ++ unsigned i; ++ ++ memset(&free, 0, sizeof(free)); ++ memset(&free_inc, 0, sizeof(free_inc)); ++ memset(&alloc_heap, 0, sizeof(alloc_heap)); ++ ++ if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + ++ nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO)) || ++ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * ++ sizeof(unsigned long), ++ GFP_KERNEL|__GFP_ZERO)) || ++ !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || ++ !init_fifo(&free[RESERVE_MOVINGGC], ++ copygc_reserve, GFP_KERNEL) || ++ !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || ++ !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || ++ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) ++ goto err; ++ ++ buckets->first_bucket = ca->mi.first_bucket; ++ buckets->nbuckets = nbuckets; ++ ++ bch2_copygc_stop(c); ++ ++ if (resize) { ++ down_write(&c->gc_lock); ++ down_write(&ca->bucket_lock); ++ percpu_down_write(&c->mark_lock); ++ } ++ ++ old_buckets = bucket_array(ca); ++ ++ if (resize) { ++ size_t n = min(buckets->nbuckets, old_buckets->nbuckets); ++ ++ memcpy(buckets->b, ++ old_buckets->b, ++ n * sizeof(struct bucket)); ++ memcpy(buckets_nouse, ++ ca->buckets_nouse, ++ BITS_TO_LONGS(n) * sizeof(unsigned long)); ++ } ++ ++ rcu_assign_pointer(ca->buckets[0], buckets); ++ buckets = old_buckets; ++ ++ swap(ca->buckets_nouse, buckets_nouse); ++ ++ if (resize) { ++ percpu_up_write(&c->mark_lock); ++ up_write(&c->gc_lock); ++ } ++ ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) { ++ fifo_move(&free[i], &ca->free[i]); ++ swap(ca->free[i], free[i]); ++ } ++ fifo_move(&free_inc, &ca->free_inc); ++ swap(ca->free_inc, free_inc); ++ spin_unlock(&c->freelist_lock); ++ ++ /* with gc lock held, alloc_heap can't be in use: */ ++ swap(ca->alloc_heap, alloc_heap); ++ ++ nbuckets = ca->mi.nbuckets; ++ ++ if (resize) ++ up_write(&ca->bucket_lock); ++ ++ ret = 0; ++err: ++ free_heap(&alloc_heap); ++ free_fifo(&free_inc); ++ for (i = 0; i < RESERVE_NR; i++) ++ free_fifo(&free[i]); ++ kvpfree(buckets_nouse, ++ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); ++ if (buckets) ++ call_rcu(&old_buckets->rcu, buckets_free_rcu); ++ ++ return ret; ++} ++ ++void bch2_dev_buckets_free(struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ free_heap(&ca->alloc_heap); ++ free_fifo(&ca->free_inc); ++ for (i = 0; i < RESERVE_NR; i++) ++ free_fifo(&ca->free[i]); ++ kvpfree(ca->buckets_nouse, ++ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); ++ kvpfree(rcu_dereference_protected(ca->buckets[0], 1), ++ sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket)); ++ ++ free_percpu(ca->usage[0]); ++} ++ ++int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) ++ return -ENOMEM; ++ ++ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; ++} +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +new file mode 100644 +index 000000000000..653f6761862e +--- /dev/null ++++ b/fs/bcachefs/buckets.h +@@ -0,0 +1,324 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. 
++ */ ++ ++#ifndef _BUCKETS_H ++#define _BUCKETS_H ++ ++#include "buckets_types.h" ++#include "super.h" ++ ++#define for_each_bucket(_b, _buckets) \ ++ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ ++ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) ++ ++#define bucket_cmpxchg(g, new, expr) \ ++({ \ ++ struct bucket *_g = g; \ ++ u64 _v = atomic64_read(&(g)->_mark.v); \ ++ struct bucket_mark _old; \ ++ \ ++ do { \ ++ (new).v.counter = _old.v.counter = _v; \ ++ expr; \ ++ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ ++ _old.v.counter, \ ++ (new).v.counter)) != _old.v.counter);\ ++ _old; \ ++}) ++ ++static inline struct bucket_array *__bucket_array(struct bch_dev *ca, ++ bool gc) ++{ ++ return rcu_dereference_check(ca->buckets[gc], ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ lockdep_is_held(&ca->bucket_lock)); ++} ++ ++static inline struct bucket_array *bucket_array(struct bch_dev *ca) ++{ ++ return __bucket_array(ca, false); ++} ++ ++static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) ++{ ++ struct bucket_array *buckets = __bucket_array(ca, gc); ++ ++ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); ++ return buckets->b + b; ++} ++ ++static inline struct bucket *bucket(struct bch_dev *ca, size_t b) ++{ ++ return __bucket(ca, b, false); ++} ++ ++static inline void bucket_io_clock_reset(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, int rw) ++{ ++ bucket(ca, b)->io_time[rw] = c->bucket_clock[rw].hand; ++} ++ ++static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) ++{ ++ return c->bucket_clock[rw].hand - g->io_time[rw]; ++} ++ ++/* ++ * bucket_gc_gen() returns the difference between the bucket's current gen and ++ * the oldest gen of any pointer into that bucket in the btree. ++ */ ++ ++static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) ++{ ++ struct bucket *g = bucket(ca, b); ++ ++ return g->mark.gen - g->oldest_gen; ++} ++ ++static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return sector_to_bucket(ca, ptr->offset); ++} ++ ++static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr, ++ bool gc) ++{ ++ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); ++} ++ ++static inline enum bch_data_type ptr_data_type(const struct bkey *k, ++ const struct bch_extent_ptr *ptr) ++{ ++ if (k->type == KEY_TYPE_btree_ptr || ++ k->type == KEY_TYPE_btree_ptr_v2) ++ return BCH_DATA_btree; ++ ++ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; ++} ++ ++static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ struct bucket_mark m; ++ ++ rcu_read_lock(); ++ m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); ++ rcu_read_unlock(); ++ ++ return m; ++} ++ ++static inline int gen_cmp(u8 a, u8 b) ++{ ++ return (s8) (a - b); ++} ++ ++static inline int gen_after(u8 a, u8 b) ++{ ++ int r = gen_cmp(a, b); ++ ++ return r > 0 ? r : 0; ++} ++ ++/** ++ * ptr_stale() - check if a pointer points into a bucket that has been ++ * invalidated. ++ */ ++static inline u8 ptr_stale(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); ++} ++ ++static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, ++ unsigned live_size) ++{ ++ return live_size && p.crc.compression_type ++ ? 
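/*
 * Worked example (sketch) for the scaling expression that follows: a
 * compressed extent that is 128 sectors uncompressed but 32 sectors on disk,
 * of which only 64 uncompressed sectors are still live, is charged
 *
 *   DIV_ROUND_UP(64 * 32, 128) = 16
 *
 * disk sectors, i.e. live data is charged at the extent's compression ratio,
 * with a floor of 1 sector while any of it is live.
 */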
max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, ++ p.crc.uncompressed_size)) ++ : live_size; ++} ++ ++static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) ++{ ++ return __ptr_disk_sectors(p, p.crc.live_size); ++} ++ ++/* bucket gc marks */ ++ ++static inline unsigned bucket_sectors_used(struct bucket_mark mark) ++{ ++ return mark.dirty_sectors + mark.cached_sectors; ++} ++ ++static inline bool bucket_unused(struct bucket_mark mark) ++{ ++ return !mark.owned_by_allocator && ++ !mark.data_type && ++ !bucket_sectors_used(mark); ++} ++ ++static inline bool is_available_bucket(struct bucket_mark mark) ++{ ++ return (!mark.owned_by_allocator && ++ !mark.dirty_sectors && ++ !mark.stripe); ++} ++ ++static inline bool bucket_needs_journal_commit(struct bucket_mark m, ++ u16 last_seq_ondisk) ++{ ++ return m.journal_seq_valid && ++ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); ++} ++ ++/* Device usage: */ ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); ++ ++void bch2_dev_usage_from_buckets(struct bch_fs *); ++ ++static inline u64 __dev_buckets_available(struct bch_dev *ca, ++ struct bch_dev_usage stats) ++{ ++ u64 total = ca->mi.nbuckets - ca->mi.first_bucket; ++ ++ if (WARN_ONCE(stats.buckets_unavailable > total, ++ "buckets_unavailable overflow (%llu > %llu)\n", ++ stats.buckets_unavailable, total)) ++ return 0; ++ ++ return total - stats.buckets_unavailable; ++} ++ ++/* ++ * Number of reclaimable buckets - only for use by the allocator thread: ++ */ ++static inline u64 dev_buckets_available(struct bch_dev *ca) ++{ ++ return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); ++} ++ ++static inline u64 __dev_buckets_free(struct bch_dev *ca, ++ struct bch_dev_usage stats) ++{ ++ return __dev_buckets_available(ca, stats) + ++ fifo_used(&ca->free[RESERVE_NONE]) + ++ fifo_used(&ca->free_inc); ++} ++ ++static inline u64 dev_buckets_free(struct bch_dev *ca) ++{ ++ return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); ++} ++ ++/* Filesystem usage: */ ++ ++static inline unsigned fs_usage_u64s(struct bch_fs *c) ++{ ++ ++ return sizeof(struct bch_fs_usage) / sizeof(u64) + ++ READ_ONCE(c->replicas.nr); ++} ++ ++void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); ++struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); ++ ++struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); ++ ++void bch2_fs_usage_to_text(struct printbuf *, ++ struct bch_fs *, struct bch_fs_usage *); ++ ++u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *); ++ ++/* key/bucket marking: */ ++ ++void bch2_bucket_seq_cleanup(struct bch_fs *); ++void bch2_fs_usage_initialize(struct bch_fs *); ++ ++void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, struct bucket_mark *); ++void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, bool, struct gc_pos, unsigned); ++void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, enum bch_data_type, unsigned, ++ struct gc_pos, unsigned); ++ ++int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, ++ s64, struct bch_fs_usage *, u64, unsigned); ++int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, ++ struct disk_reservation *, unsigned); ++ ++int bch2_mark_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct bch_fs_usage *, 
unsigned); ++ ++int bch2_replicas_delta_list_apply(struct bch_fs *, ++ struct bch_fs_usage *, ++ struct replicas_delta_list *); ++int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, ++ unsigned, s64, unsigned); ++int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, ++ struct bkey_i *insert, unsigned); ++void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); ++ ++/* disk reservations: */ ++ ++void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); ++ ++static inline void bch2_disk_reservation_put(struct bch_fs *c, ++ struct disk_reservation *res) ++{ ++ if (res->sectors) ++ __bch2_disk_reservation_put(c, res); ++} ++ ++#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) ++ ++int bch2_disk_reservation_add(struct bch_fs *, ++ struct disk_reservation *, ++ unsigned, int); ++ ++static inline struct disk_reservation ++bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) ++{ ++ return (struct disk_reservation) { ++ .sectors = 0, ++#if 0 ++ /* not used yet: */ ++ .gen = c->capacity_gen, ++#endif ++ .nr_replicas = nr_replicas, ++ }; ++} ++ ++static inline int bch2_disk_reservation_get(struct bch_fs *c, ++ struct disk_reservation *res, ++ unsigned sectors, ++ unsigned nr_replicas, ++ int flags) ++{ ++ *res = bch2_disk_reservation_init(c, nr_replicas); ++ ++ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); ++void bch2_dev_buckets_free(struct bch_dev *); ++int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); ++ ++#endif /* _BUCKETS_H */ +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +new file mode 100644 +index 000000000000..d5215b14d7d9 +--- /dev/null ++++ b/fs/bcachefs/buckets_types.h +@@ -0,0 +1,135 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_TYPES_H ++#define _BUCKETS_TYPES_H ++ ++#include "bcachefs_format.h" ++#include "util.h" ++ ++#define BUCKET_JOURNAL_SEQ_BITS 16 ++ ++struct bucket_mark { ++ union { ++ atomic64_t v; ++ ++ struct { ++ u8 gen; ++ u8 data_type:3, ++ owned_by_allocator:1, ++ journal_seq_valid:1, ++ stripe:1; ++ u16 dirty_sectors; ++ u16 cached_sectors; ++ ++ /* ++ * low bits of journal sequence number when this bucket was most ++ * recently modified: if journal_seq_valid is set, this bucket can't be ++ * reused until the journal sequence number written to disk is >= the ++ * bucket's journal sequence number: ++ */ ++ u16 journal_seq; ++ }; ++ }; ++}; ++ ++struct bucket { ++ union { ++ struct bucket_mark _mark; ++ const struct bucket_mark mark; ++ }; ++ ++ u16 io_time[2]; ++ u8 oldest_gen; ++ u8 gc_gen; ++ unsigned gen_valid:1; ++}; ++ ++struct bucket_array { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ struct bucket b[]; ++}; ++ ++struct bch_dev_usage { ++ u64 buckets[BCH_DATA_NR]; ++ u64 buckets_alloc; ++ u64 buckets_unavailable; ++ ++ /* _compressed_ sectors: */ ++ u64 sectors[BCH_DATA_NR]; ++ u64 sectors_fragmented; ++ ++ u64 buckets_ec; ++ u64 sectors_ec; ++}; ++ ++struct bch_fs_usage { ++ /* all fields are in units of 512 byte sectors: */ ++ ++ u64 online_reserved; ++ ++ /* fields after online_reserved are cleared/recalculated by gc: */ ++ u64 gc_start[0]; ++ ++ u64 hidden; ++ u64 btree; ++ u64 data; ++ u64 cached; ++ u64 reserved; ++ u64 nr_inodes; ++ ++ /* XXX: add stats for compression ratio */ ++#if 0 ++ u64 uncompressed; ++ u64 compressed; ++#endif ++ ++ /* broken out: */ ++ ++ u64 
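/*
 * Sketch: struct bucket_mark above overlays all of its fields on one 64-bit
 * word so that bucket_cmpxchg() in buckets.h can update the generation, data
 * type, flags and both sector counts in a single compare-and-swap. Standalone
 * size check (uint8_t bit-fields, as used here, are a GCC/Clang extension the
 * kernel relies on):
 */
#include <stdint.h>

struct mark {
    union {
        uint64_t v;
        struct {
            uint8_t  gen;
            uint8_t  data_type:3,
                     owned_by_allocator:1,
                     journal_seq_valid:1,
                     stripe:1;
            uint16_t dirty_sectors;
            uint16_t cached_sectors;
            uint16_t journal_seq;
        };
    };
};

_Static_assert(sizeof(struct mark) == sizeof(uint64_t),
               "bucket_mark must fit in one atomic64");

int main(void) { return 0; }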
persistent_reserved[BCH_REPLICAS_MAX]; ++ u64 replicas[]; ++}; ++ ++struct bch_fs_usage_short { ++ u64 capacity; ++ u64 used; ++ u64 free; ++ u64 nr_inodes; ++}; ++ ++struct replicas_delta { ++ s64 delta; ++ struct bch_replicas_entry r; ++} __packed; ++ ++struct replicas_delta_list { ++ unsigned size; ++ unsigned used; ++ ++ struct {} memset_start; ++ u64 nr_inodes; ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ struct {} memset_end; ++ struct replicas_delta d[0]; ++}; ++ ++/* ++ * A reservation for space on disk: ++ */ ++struct disk_reservation { ++ u64 sectors; ++ u32 gen; ++ unsigned nr_replicas; ++}; ++ ++struct copygc_heap_entry { ++ u8 dev; ++ u8 gen; ++ u16 fragmentation; ++ u32 sectors; ++ u64 offset; ++}; ++ ++typedef HEAP(struct copygc_heap_entry) copygc_heap; ++ ++#endif /* _BUCKETS_TYPES_H */ +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +new file mode 100644 +index 000000000000..0377f9018d27 +--- /dev/null ++++ b/fs/bcachefs/chardev.c +@@ -0,0 +1,704 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_CHARDEV ++ ++#include "bcachefs.h" ++#include "bcachefs_ioctl.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "move.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* returns with ref on ca->ref */ ++static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, ++ unsigned flags) ++{ ++ struct bch_dev *ca; ++ ++ if (flags & BCH_BY_INDEX) { ++ if (dev >= c->sb.nr_devices) ++ return ERR_PTR(-EINVAL); ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ if (!ca) ++ return ERR_PTR(-EINVAL); ++ } else { ++ char *path; ++ ++ path = strndup_user((const char __user *) ++ (unsigned long) dev, PATH_MAX); ++ if (IS_ERR(path)) ++ return ERR_CAST(path); ++ ++ ca = bch2_dev_lookup(c, path); ++ kfree(path); ++ } ++ ++ return ca; ++} ++ ++#if 0 ++static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) ++{ ++ struct bch_ioctl_assemble arg; ++ struct bch_fs *c; ++ u64 *user_devs = NULL; ++ char **devs = NULL; ++ unsigned i; ++ int ret = -EFAULT; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); ++ if (!user_devs) ++ return -ENOMEM; ++ ++ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); ++ ++ if (copy_from_user(user_devs, user_arg->devs, ++ sizeof(u64) * arg.nr_devs)) ++ goto err; ++ ++ for (i = 0; i < arg.nr_devs; i++) { ++ devs[i] = strndup_user((const char __user *)(unsigned long) ++ user_devs[i], ++ PATH_MAX); ++ if (!devs[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); ++ ret = PTR_ERR_OR_ZERO(c); ++ if (!ret) ++ closure_put(&c->cl); ++err: ++ if (devs) ++ for (i = 0; i < arg.nr_devs; i++) ++ kfree(devs[i]); ++ kfree(devs); ++ return ret; ++} ++ ++static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) ++{ ++ struct bch_ioctl_incremental arg; ++ const char *err; ++ char *path; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ err = bch2_fs_open_incremental(path); ++ 
kfree(path); ++ ++ if (err) { ++ pr_err("Could not register bcachefs devices: %s", err); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++#endif ++ ++static long bch2_global_ioctl(unsigned cmd, void __user *arg) ++{ ++ switch (cmd) { ++#if 0 ++ case BCH_IOCTL_ASSEMBLE: ++ return bch2_ioctl_assemble(arg); ++ case BCH_IOCTL_INCREMENTAL: ++ return bch2_ioctl_incremental(arg); ++#endif ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static long bch2_ioctl_query_uuid(struct bch_fs *c, ++ struct bch_ioctl_query_uuid __user *user_arg) ++{ ++ return copy_to_user(&user_arg->uuid, ++ &c->sb.user_uuid, ++ sizeof(c->sb.user_uuid)); ++} ++ ++#if 0 ++static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) ++{ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ return bch2_fs_start(c); ++} ++ ++static long bch2_ioctl_stop(struct bch_fs *c) ++{ ++ bch2_fs_stop(c); ++ return 0; ++} ++#endif ++ ++static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ char *path; ++ int ret; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_add(c, path); ++ kfree(path); ++ ++ return ret; ++} ++ ++static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ return bch2_dev_remove(c, ca, arg.flags); ++} ++ ++static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ char *path; ++ int ret; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_online(c, path); ++ kfree(path); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_offline(c, ca, arg.flags); ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_set_state(struct bch_fs *c, ++ struct bch_ioctl_disk_set_state arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad[0] || arg.pad[1] || arg.pad[2]) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++struct bch_data_ctx { ++ struct bch_fs *c; ++ struct bch_ioctl_data arg; ++ struct bch_move_stats stats; ++ ++ int ret; ++ ++ struct task_struct *thread; ++}; ++ ++static int bch2_data_thread(void *arg) ++{ ++ struct bch_data_ctx *ctx = arg; ++ ++ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); ++ ++ ctx->stats.data_type = U8_MAX; ++ return 0; ++} ++ ++static int bch2_data_job_release(struct inode *inode, struct file *file) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ ++ 
kthread_stop(ctx->thread); ++ put_task_struct(ctx->thread); ++ kfree(ctx); ++ return 0; ++} ++ ++static ssize_t bch2_data_job_read(struct file *file, char __user *buf, ++ size_t len, loff_t *ppos) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ struct bch_fs *c = ctx->c; ++ struct bch_ioctl_data_event e = { ++ .type = BCH_DATA_EVENT_PROGRESS, ++ .p.data_type = ctx->stats.data_type, ++ .p.btree_id = ctx->stats.btree_id, ++ .p.pos = ctx->stats.pos, ++ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), ++ .p.sectors_total = bch2_fs_usage_read_short(c).used, ++ }; ++ ++ if (len < sizeof(e)) ++ return -EINVAL; ++ ++ return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); ++} ++ ++static const struct file_operations bcachefs_data_ops = { ++ .release = bch2_data_job_release, ++ .read = bch2_data_job_read, ++ .llseek = no_llseek, ++}; ++ ++static long bch2_ioctl_data(struct bch_fs *c, ++ struct bch_ioctl_data arg) ++{ ++ struct bch_data_ctx *ctx = NULL; ++ struct file *file = NULL; ++ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; ++ int ret, fd = -1; ++ ++ if (arg.op >= BCH_DATA_OP_NR || arg.flags) ++ return -EINVAL; ++ ++ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); ++ if (!ctx) ++ return -ENOMEM; ++ ++ ctx->c = c; ++ ctx->arg = arg; ++ ++ ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); ++ if (IS_ERR(ctx->thread)) { ++ ret = PTR_ERR(ctx->thread); ++ goto err; ++ } ++ ++ ret = get_unused_fd_flags(flags); ++ if (ret < 0) ++ goto err; ++ fd = ret; ++ ++ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); ++ if (IS_ERR(file)) { ++ ret = PTR_ERR(file); ++ goto err; ++ } ++ ++ fd_install(fd, file); ++ ++ get_task_struct(ctx->thread); ++ wake_up_process(ctx->thread); ++ ++ return fd; ++err: ++ if (fd >= 0) ++ put_unused_fd(fd); ++ if (!IS_ERR_OR_NULL(ctx->thread)) ++ kthread_stop(ctx->thread); ++ kfree(ctx); ++ return ret; ++} ++ ++static long bch2_ioctl_fs_usage(struct bch_fs *c, ++ struct bch_ioctl_fs_usage __user *user_arg) ++{ ++ struct bch_ioctl_fs_usage *arg = NULL; ++ struct bch_replicas_usage *dst_e, *dst_end; ++ struct bch_fs_usage *src; ++ u32 replica_entries_bytes; ++ unsigned i; ++ int ret = 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) ++ return -EFAULT; ++ ++ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); ++ if (!arg) ++ return -ENOMEM; ++ ++ src = bch2_fs_usage_read(c); ++ if (!src) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ arg->capacity = c->capacity; ++ arg->used = bch2_fs_sectors_used(c, src); ++ arg->online_reserved = src->online_reserved; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ arg->persistent_reserved[i] = src->persistent_reserved[i]; ++ ++ dst_e = arg->replicas; ++ dst_end = (void *) arg->replicas + replica_entries_bytes; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *src_e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ if (replicas_usage_next(dst_e) > dst_end) { ++ ret = -ERANGE; ++ break; ++ } ++ ++ dst_e->sectors = src->replicas[i]; ++ dst_e->r = *src_e; ++ ++ /* recheck after setting nr_devs: */ ++ if (replicas_usage_next(dst_e) > dst_end) { ++ ret = -ERANGE; ++ break; ++ } ++ ++ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); ++ ++ dst_e = replicas_usage_next(dst_e); ++ } ++ ++ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; ++ ++ percpu_up_read(&c->mark_lock); ++ kfree(src); ++ ++ if (!ret) ++ ret = copy_to_user(user_arg, arg, ++ sizeof(*arg) 
+ arg->replica_entries_bytes); ++err: ++ kfree(arg); ++ return ret; ++} ++ ++static long bch2_ioctl_dev_usage(struct bch_fs *c, ++ struct bch_ioctl_dev_usage __user *user_arg) ++{ ++ struct bch_ioctl_dev_usage arg; ++ struct bch_dev_usage src; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad[0] || ++ arg.pad[1] || ++ arg.pad[2]) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ src = bch2_dev_usage_read(ca); ++ ++ arg.state = ca->mi.state; ++ arg.bucket_size = ca->mi.bucket_size; ++ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; ++ arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; ++ arg.ec_buckets = src.buckets_ec; ++ arg.ec_sectors = src.sectors_ec; ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ arg.buckets[i] = src.buckets[i]; ++ arg.sectors[i] = src.sectors[i]; ++ } ++ ++ percpu_ref_put(&ca->ref); ++ ++ return copy_to_user(user_arg, &arg, sizeof(arg)); ++} ++ ++static long bch2_ioctl_read_super(struct bch_fs *c, ++ struct bch_ioctl_read_super arg) ++{ ++ struct bch_dev *ca = NULL; ++ struct bch_sb *sb; ++ int ret = 0; ++ ++ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || ++ arg.pad) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (arg.flags & BCH_READ_DEV) { ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ ++ if (IS_ERR(ca)) { ++ ret = PTR_ERR(ca); ++ goto err; ++ } ++ ++ sb = ca->disk_sb.sb; ++ } else { ++ sb = c->disk_sb.sb; ++ } ++ ++ if (vstruct_bytes(sb) > arg.size) { ++ ret = -ERANGE; ++ goto err; ++ } ++ ++ ret = copy_to_user((void __user *)(unsigned long)arg.sb, ++ sb, vstruct_bytes(sb)); ++err: ++ if (ca) ++ percpu_ref_put(&ca->ref); ++ mutex_unlock(&c->sb_lock); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_get_idx(struct bch_fs *c, ++ struct bch_ioctl_disk_get_idx arg) ++{ ++ dev_t dev = huge_decode_dev(arg.dev); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ if (ca->disk_sb.bdev->bd_dev == dev) { ++ percpu_ref_put(&ca->io_ref); ++ return i; ++ } ++ ++ return -ENOENT; ++} ++ ++static long bch2_ioctl_disk_resize(struct bch_fs *c, ++ struct bch_ioctl_disk_resize arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_resize(c, ca, arg.nbuckets); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++#define BCH_IOCTL(_name, _argtype) \ ++do { \ ++ _argtype i; \ ++ \ ++ if (copy_from_user(&i, arg, sizeof(i))) \ ++ return -EFAULT; \ ++ return bch2_ioctl_##_name(c, i); \ ++} while (0) ++ ++long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) ++{ ++ /* ioctls that don't require admin cap: */ ++ switch (cmd) { ++ case BCH_IOCTL_QUERY_UUID: ++ return bch2_ioctl_query_uuid(c, arg); ++ case BCH_IOCTL_FS_USAGE: ++ return bch2_ioctl_fs_usage(c, arg); ++ case BCH_IOCTL_DEV_USAGE: ++ return bch2_ioctl_dev_usage(c, arg); ++ } ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ switch (cmd) { ++#if 0 ++ case BCH_IOCTL_START: ++ BCH_IOCTL(start, struct bch_ioctl_start); ++ case BCH_IOCTL_STOP: ++ return bch2_ioctl_stop(c); ++#endif ++ case BCH_IOCTL_READ_SUPER: ++ BCH_IOCTL(read_super, struct bch_ioctl_read_super); ++ case BCH_IOCTL_DISK_GET_IDX: ++ 
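/*
 * Userspace sketch for the ioctl dispatch above (assumptions: the request
 * macros and struct layouts come from the bcachefs_ioctl.h header included at
 * the top of this file, and the per-filesystem control node created below by
 * bch2_fs_chardev_init() shows up as /dev/bcachefs<N>-ctl):
 */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include "bcachefs_ioctl.h" /* BCH_IOCTL_QUERY_UUID, struct bch_ioctl_query_uuid */

int main(void)
{
    struct bch_ioctl_query_uuid u;
    int fd = open("/dev/bcachefs0-ctl", O_RDONLY);

    if (fd < 0)
        return 1;

    /* no CAP_SYS_ADMIN needed: QUERY_UUID is in the unprivileged switch */
    if (ioctl(fd, BCH_IOCTL_QUERY_UUID, &u)) {
        close(fd);
        return 1;
    }

    /* u.uuid now holds the filesystem's user-visible UUID */
    close(fd);
    return 0;
}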
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); ++ } ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ /* ioctls that do require admin cap: */ ++ switch (cmd) { ++ case BCH_IOCTL_DISK_ADD: ++ BCH_IOCTL(disk_add, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_REMOVE: ++ BCH_IOCTL(disk_remove, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_ONLINE: ++ BCH_IOCTL(disk_online, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_OFFLINE: ++ BCH_IOCTL(disk_offline, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_SET_STATE: ++ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); ++ case BCH_IOCTL_DATA: ++ BCH_IOCTL(data, struct bch_ioctl_data); ++ case BCH_IOCTL_DISK_RESIZE: ++ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); ++ ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static DEFINE_IDR(bch_chardev_minor); ++ ++static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) ++{ ++ unsigned minor = iminor(file_inode(filp)); ++ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; ++ void __user *arg = (void __user *) v; ++ ++ return c ++ ? bch2_fs_ioctl(c, cmd, arg) ++ : bch2_global_ioctl(cmd, arg); ++} ++ ++static const struct file_operations bch_chardev_fops = { ++ .owner = THIS_MODULE, ++ .unlocked_ioctl = bch2_chardev_ioctl, ++ .open = nonseekable_open, ++}; ++ ++static int bch_chardev_major; ++static struct class *bch_chardev_class; ++static struct device *bch_chardev; ++ ++void bch2_fs_chardev_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->chardev)) ++ device_unregister(c->chardev); ++ if (c->minor >= 0) ++ idr_remove(&bch_chardev_minor, c->minor); ++} ++ ++int bch2_fs_chardev_init(struct bch_fs *c) ++{ ++ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); ++ if (c->minor < 0) ++ return c->minor; ++ ++ c->chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, c->minor), c, ++ "bcachefs%u-ctl", c->minor); ++ if (IS_ERR(c->chardev)) ++ return PTR_ERR(c->chardev); ++ ++ return 0; ++} ++ ++void bch2_chardev_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ device_destroy(bch_chardev_class, ++ MKDEV(bch_chardev_major, U8_MAX)); ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ class_destroy(bch_chardev_class); ++ if (bch_chardev_major > 0) ++ unregister_chrdev(bch_chardev_major, "bcachefs"); ++} ++ ++int __init bch2_chardev_init(void) ++{ ++ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); ++ if (bch_chardev_major < 0) ++ return bch_chardev_major; ++ ++ bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); ++ if (IS_ERR(bch_chardev_class)) ++ return PTR_ERR(bch_chardev_class); ++ ++ bch_chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, U8_MAX), ++ NULL, "bcachefs-ctl"); ++ if (IS_ERR(bch_chardev)) ++ return PTR_ERR(bch_chardev); ++ ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_CHARDEV */ +diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h +new file mode 100644 +index 000000000000..3a4890d39ff9 +--- /dev/null ++++ b/fs/bcachefs/chardev.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHARDEV_H ++#define _BCACHEFS_CHARDEV_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); ++ ++void bch2_fs_chardev_exit(struct bch_fs *); ++int bch2_fs_chardev_init(struct bch_fs *); ++ ++void bch2_chardev_exit(void); ++int __init bch2_chardev_init(void); ++ ++#else ++ ++static inline long bch2_fs_ioctl(struct bch_fs *c, ++ unsigned cmd, void __user * 
arg) ++{ ++ return -ENOSYS; ++} ++ ++static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} ++static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } ++ ++static inline void bch2_chardev_exit(void) {} ++static inline int __init bch2_chardev_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_CHARDEV_H */ +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +new file mode 100644 +index 000000000000..3d88719ba86c +--- /dev/null ++++ b/fs/bcachefs/checksum.c +@@ -0,0 +1,618 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static u64 bch2_checksum_init(unsigned type) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ return U32_MAX; ++ case BCH_CSUM_CRC64_NONZERO: ++ return U64_MAX; ++ case BCH_CSUM_CRC32C: ++ return 0; ++ case BCH_CSUM_CRC64: ++ return 0; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 bch2_checksum_final(unsigned type, u64 crc) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ return crc ^ U32_MAX; ++ case BCH_CSUM_CRC64_NONZERO: ++ return crc ^ U64_MAX; ++ case BCH_CSUM_CRC32C: ++ return crc; ++ case BCH_CSUM_CRC64: ++ return crc; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC32C: ++ return crc32c(crc, data, len); ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC64: ++ return crc64_be(crc, data, len); ++ default: ++ BUG(); ++ } ++} ++ ++static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ struct scatterlist *sg, size_t len) ++{ ++ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); ++ int ret; ++ ++ skcipher_request_set_sync_tfm(req, tfm); ++ skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ++ ++ ret = crypto_skcipher_encrypt(req); ++ BUG_ON(ret); ++} ++ ++static inline void do_encrypt(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ void *buf, size_t len) ++{ ++ struct scatterlist sg; ++ ++ sg_init_one(&sg, buf, len); ++ do_encrypt_sg(tfm, nonce, &sg, len); ++} ++ ++int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, ++ void *buf, size_t len) ++{ ++ struct crypto_sync_skcipher *chacha20 = ++ crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ int ret; ++ ++ if (!chacha20) { ++ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); ++ return PTR_ERR(chacha20); ++ } ++ ++ ret = crypto_skcipher_setkey(&chacha20->base, ++ (void *) key, sizeof(*key)); ++ if (ret) { ++ pr_err("crypto_skcipher_setkey() error: %i", ret); ++ goto err; ++ } ++ ++ do_encrypt(chacha20, nonce, buf, len); ++err: ++ crypto_free_sync_skcipher(chacha20); ++ return ret; ++} ++ ++static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, ++ struct nonce nonce) ++{ ++ u8 key[POLY1305_KEY_SIZE]; ++ ++ nonce.d[3] ^= BCH_NONCE_POLY; ++ ++ memset(key, 0, sizeof(key)); ++ do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ ++ desc->tfm = c->poly1305; ++ crypto_shash_init(desc); ++ crypto_shash_update(desc, key, sizeof(key)); ++} ++ ++struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, ++ struct nonce nonce, const void *data, size_t len) ++{ ++ switch (type) { ++ case 
BCH_CSUM_NONE: ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: { ++ u64 crc = bch2_checksum_init(type); ++ ++ crc = bch2_checksum_update(type, crc, data, len); ++ crc = bch2_checksum_final(type, crc); ++ ++ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; ++ } ++ ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++ crypto_shash_update(desc, data, len); ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_encrypt(struct bch_fs *c, unsigned type, ++ struct nonce nonce, void *data, size_t len) ++{ ++ if (!bch2_csum_type_is_encryption(type)) ++ return; ++ ++ do_encrypt(c->chacha20, nonce, data, len); ++} ++ ++static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio, ++ struct bvec_iter *iter) ++{ ++ struct bio_vec bv; ++ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return (struct bch_csum) { 0 }; ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: { ++ u64 crc = bch2_checksum_init(type); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ crc = bch2_checksum_update(type, ++ crc, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ crc = bch2_checksum_update(type, crc, ++ page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ crc = bch2_checksum_final(type, crc); ++ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; ++ } ++ ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ ++ crypto_shash_update(desc, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ crypto_shash_update(desc, ++ page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ ++ return __bch2_checksum_bio(c, type, nonce, bio, &iter); ++} ++ ++void bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ struct scatterlist sgl[16], *sg = sgl; ++ size_t bytes = 0; ++ ++ if (!bch2_csum_type_is_encryption(type)) ++ return; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ ++ bio_for_each_segment(bv, bio, iter) { ++ if (sg == sgl + ARRAY_SIZE(sgl)) { ++ sg_mark_end(sg - 1); ++ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ ++ nonce = nonce_add(nonce, bytes); ++ bytes = 0; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ sg = sgl; ++ } ++ ++ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); ++ bytes += bv.bv_len; ++ } ++ ++ sg_mark_end(sg - 1); ++ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++} ++ 
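The CRC paths above build each checksum with an init/update/final sequence (bch2_checksum_init(), bch2_checksum_update(), bch2_checksum_final()), and bch2_checksum_merge() just below combines the checksums of two adjacent byte ranges without re-reading the data, relying on CRC linearity. A standalone userspace sketch of the same idea, with zlib's crc32() and crc32_combine() standing in for the in-kernel crc32c()/crc64_be() helpers (the buffers and build command are illustrative assumptions, not part of the patch):

/*
 * Sketch: combine crc(A) and crc(B) into crc(A||B) from the lengths alone,
 * mirroring what bch2_checksum_merge() does for the mergeable CRC types.
 * Build: cc crc_merge.c -lz
 */
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
        const char *a_buf = "hello ", *b_buf = "world";

        /* init/update pattern, as in bch2_checksum_update(): */
        uLong a = crc32(crc32(0L, Z_NULL, 0), (const Bytef *) a_buf, strlen(a_buf));
        uLong b = crc32(crc32(0L, Z_NULL, 0), (const Bytef *) b_buf, strlen(b_buf));

        /* crc(A||B) derived from crc(A), crc(B) and len(B) only: */
        uLong merged = crc32_combine(a, b, strlen(b_buf));

        /* same result as streaming the concatenation directly: */
        assert(merged == crc32(a, (const Bytef *) b_buf, strlen(b_buf)));
        printf("merged crc32 = %08lx\n", merged);
        return 0;
}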
++struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, ++ struct bch_csum b, size_t b_len) ++{ ++ BUG_ON(!bch2_checksum_mergeable(type)); ++ ++ while (b_len) { ++ unsigned b = min_t(unsigned, b_len, PAGE_SIZE); ++ ++ a.lo = bch2_checksum_update(type, a.lo, ++ page_address(ZERO_PAGE(0)), b); ++ b_len -= b; ++ } ++ ++ a.lo ^= b.lo; ++ a.hi ^= b.hi; ++ return a; ++} ++ ++int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc_old, ++ struct bch_extent_crc_unpacked *crc_a, ++ struct bch_extent_crc_unpacked *crc_b, ++ unsigned len_a, unsigned len_b, ++ unsigned new_csum_type) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ struct nonce nonce = extent_nonce(version, crc_old); ++ struct bch_csum merged = { 0 }; ++ struct crc_split { ++ struct bch_extent_crc_unpacked *crc; ++ unsigned len; ++ unsigned csum_type; ++ struct bch_csum csum; ++ } splits[3] = { ++ { crc_a, len_a, new_csum_type }, ++ { crc_b, len_b, new_csum_type }, ++ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, ++ }, *i; ++ bool mergeable = crc_old.csum_type == new_csum_type && ++ bch2_checksum_mergeable(new_csum_type); ++ unsigned crc_nonce = crc_old.nonce; ++ ++ BUG_ON(len_a + len_b > bio_sectors(bio)); ++ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); ++ BUG_ON(crc_is_compressed(crc_old)); ++ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)); ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ iter.bi_size = i->len << 9; ++ if (mergeable || i->crc) ++ i->csum = __bch2_checksum_bio(c, i->csum_type, ++ nonce, bio, &iter); ++ else ++ bio_advance_iter(bio, &iter, i->len << 9); ++ nonce = nonce_add(nonce, i->len << 9); ++ } ++ ++ if (mergeable) ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) ++ merged = bch2_checksum_merge(new_csum_type, merged, ++ i->csum, i->len << 9); ++ else ++ merged = bch2_checksum_bio(c, crc_old.csum_type, ++ extent_nonce(version, crc_old), bio); ++ ++ if (bch2_crc_cmp(merged, crc_old.csum)) ++ return -EIO; ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ if (i->crc) ++ *i->crc = (struct bch_extent_crc_unpacked) { ++ .csum_type = i->csum_type, ++ .compression_type = crc_old.compression_type, ++ .compressed_size = i->len, ++ .uncompressed_size = i->len, ++ .offset = 0, ++ .live_size = i->len, ++ .nonce = crc_nonce, ++ .csum = i->csum, ++ }; ++ ++ if (bch2_csum_type_is_encryption(new_csum_type)) ++ crc_nonce += i->len; ++ } ++ ++ return 0; ++} ++ ++#ifdef __KERNEL__ ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ char key_description[60]; ++ struct key *keyring_key; ++ const struct user_key_payload *ukp; ++ int ret; ++ ++ snprintf(key_description, sizeof(key_description), ++ "bcachefs:%pUb", &sb->user_uuid); ++ ++ keyring_key = request_key(&key_type_logon, key_description, NULL); ++ if (IS_ERR(keyring_key)) ++ return PTR_ERR(keyring_key); ++ ++ down_read(&keyring_key->sem); ++ ukp = dereference_key_locked(keyring_key); ++ if (ukp->datalen == sizeof(*key)) { ++ memcpy(key, ukp->data, ukp->datalen); ++ ret = 0; ++ } else { ++ ret = -EINVAL; ++ } ++ up_read(&keyring_key->sem); ++ key_put(keyring_key); ++ ++ return ret; ++} ++#else ++#include ++#include ++ ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ key_serial_t key_id; ++ char key_description[60]; ++ char uuid[40]; ++ ++ uuid_unparse_lower(sb->user_uuid.b, uuid); ++ sprintf(key_description, "bcachefs:%s", uuid); ++ ++ 
key_id = request_key("user", key_description, NULL, ++ KEY_SPEC_USER_KEYRING); ++ if (key_id < 0) ++ return -errno; ++ ++ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) ++ return -1; ++ ++ return 0; ++} ++#endif ++ ++int bch2_decrypt_sb_key(struct bch_fs *c, ++ struct bch_sb_field_crypt *crypt, ++ struct bch_key *key) ++{ ++ struct bch_encrypted_key sb_key = crypt->key; ++ struct bch_key user_key; ++ int ret = 0; ++ ++ /* is key encrypted? */ ++ if (!bch2_key_is_encrypted(&sb_key)) ++ goto out; ++ ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %i", ret); ++ goto err; ++ } ++ ++ /* decrypt real key: */ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &sb_key, sizeof(sb_key)); ++ if (ret) ++ goto err; ++ ++ if (bch2_key_is_encrypted(&sb_key)) { ++ bch_err(c, "incorrect encryption key"); ++ ret = -EINVAL; ++ goto err; ++ } ++out: ++ *key = sb_key.key; ++err: ++ memzero_explicit(&sb_key, sizeof(sb_key)); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ return ret; ++} ++ ++static int bch2_alloc_ciphers(struct bch_fs *c) ++{ ++ if (!c->chacha20) ++ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ if (IS_ERR(c->chacha20)) { ++ bch_err(c, "error requesting chacha20 module: %li", ++ PTR_ERR(c->chacha20)); ++ return PTR_ERR(c->chacha20); ++ } ++ ++ if (!c->poly1305) ++ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); ++ if (IS_ERR(c->poly1305)) { ++ bch_err(c, "error requesting poly1305 module: %li", ++ PTR_ERR(c->poly1305)); ++ return PTR_ERR(c->poly1305); ++ } ++ ++ return 0; ++} ++ ++int bch2_disable_encryption(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ /* is key encrypted? */ ++ ret = 0; ++ if (bch2_key_is_encrypted(&crypt->key)) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ crypt->key.magic = BCH_KEY_MAGIC; ++ crypt->key.key = key; ++ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_enable_encryption(struct bch_fs *c, bool keyed) ++{ ++ struct bch_encrypted_key key; ++ struct bch_key user_key; ++ struct bch_sb_field_crypt *crypt; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ /* Do we already have an encryption key? 
*/ ++ if (bch2_sb_get_crypt(c->disk_sb.sb)) ++ goto err; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto err; ++ ++ key.magic = BCH_KEY_MAGIC; ++ get_random_bytes(&key.key, sizeof(key.key)); ++ ++ if (keyed) { ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %i", ret); ++ goto err; ++ } ++ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &key, sizeof(key)); ++ if (ret) ++ goto err; ++ } ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto err; ++ ++ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); ++ if (!crypt) { ++ ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ ++ goto err; ++ } ++ ++ crypt->key = key; ++ ++ /* write superblock */ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); ++ bch2_write_super(c); ++err: ++ mutex_unlock(&c->sb_lock); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ memzero_explicit(&key, sizeof(key)); ++ return ret; ++} ++ ++void bch2_fs_encryption_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->poly1305)) ++ crypto_free_shash(c->poly1305); ++ if (!IS_ERR_OR_NULL(c->chacha20)) ++ crypto_free_sync_skcipher(c->chacha20); ++ if (!IS_ERR_OR_NULL(c->sha256)) ++ crypto_free_shash(c->sha256); ++} ++ ++int bch2_fs_encryption_init(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->sha256 = crypto_alloc_shash("sha256", 0, 0); ++ if (IS_ERR(c->sha256)) { ++ bch_err(c, "error requesting sha256 module"); ++ ret = PTR_ERR(c->sha256); ++ goto out; ++ } ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto out; ++out: ++ memzero_explicit(&key, sizeof(key)); ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +new file mode 100644 +index 000000000000..24dee8039d57 +--- /dev/null ++++ b/fs/bcachefs/checksum.h +@@ -0,0 +1,202 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHECKSUM_H ++#define _BCACHEFS_CHECKSUM_H ++ ++#include "bcachefs.h" ++#include "extents_types.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static inline bool bch2_checksum_mergeable(unsigned type) ++{ ++ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, ++ struct bch_csum, size_t); ++ ++#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) ++#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) ++#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) ++#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) ++#define BCH_NONCE_POLY cpu_to_le32(1 << 31) ++ ++struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, ++ const void *, size_t); ++ ++/* ++ * This is used for various on disk data structures - bch_sb, prio_set, bset, ++ * jset: The checksum is _always_ the first field of these structs ++ */ ++#define csum_vstruct(_c, _type, _nonce, _i) \ ++({ \ ++ const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ ++ const void *end = vstruct_end(_i); \ ++ \ ++ bch2_checksum(_c, _type, _nonce, start, end - start); \ ++}) 
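csum_vstruct() checksums an on-disk structure starting just past its leading csum field, so the stored checksum never covers itself; the real macro uses vstruct_end() because these structures are variable-length. A fixed-size userspace sketch of that convention, with zlib's crc32() as a stand-in and a made-up struct layout (both are illustrative assumptions):

#include <stdint.h>
#include <string.h>
#include <zlib.h>

/* stand-in for an on-disk struct whose first field is its own checksum */
struct demo_sb {
        uint64_t csum;          /* must be first; excluded from the checksum */
        uint32_t version;
        char     label[32];
};

static uint64_t demo_csum(const struct demo_sb *sb)
{
        const unsigned char *start = (const unsigned char *) sb + sizeof(sb->csum);
        size_t len = sizeof(*sb) - sizeof(sb->csum);

        /* same shape as csum_vstruct(): checksum from just past ->csum to the end */
        return crc32(crc32(0L, Z_NULL, 0), start, len);
}

int main(void)
{
        struct demo_sb sb = { .version = 1 };

        memcpy(sb.label, "demo", 5);
        sb.csum = demo_csum(&sb);               /* write side */
        return sb.csum == demo_csum(&sb) ? 0 : 1; /* verify side */
}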
++ ++int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); ++int bch2_request_key(struct bch_sb *, struct bch_key *); ++ ++void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, ++ void *data, size_t); ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, ++ struct bch_extent_crc_unpacked, ++ struct bch_extent_crc_unpacked *, ++ struct bch_extent_crc_unpacked *, ++ unsigned, unsigned, unsigned); ++ ++void bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, ++ struct bch_key *); ++ ++int bch2_disable_encryption(struct bch_fs *); ++int bch2_enable_encryption(struct bch_fs *, bool); ++ ++void bch2_fs_encryption_exit(struct bch_fs *); ++int bch2_fs_encryption_init(struct bch_fs *); ++ ++static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, ++ bool data) ++{ ++ switch (type) { ++ case BCH_CSUM_OPT_NONE: ++ return BCH_CSUM_NONE; ++ case BCH_CSUM_OPT_CRC32C: ++ return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; ++ case BCH_CSUM_OPT_CRC64: ++ return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; ++ default: ++ BUG(); ++ } ++} ++ ++static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, ++ unsigned opt) ++{ ++ if (c->sb.encryption_type) ++ return c->opts.wide_macs ++ ? BCH_CSUM_CHACHA20_POLY1305_128 ++ : BCH_CSUM_CHACHA20_POLY1305_80; ++ ++ return bch2_csum_opt_to_type(opt, true); ++} ++ ++static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) ++{ ++ if (c->sb.encryption_type) ++ return BCH_CSUM_CHACHA20_POLY1305_128; ++ ++ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); ++} ++ ++static const unsigned bch2_compression_opt_to_type[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++static inline bool bch2_checksum_type_valid(const struct bch_fs *c, ++ unsigned type) ++{ ++ if (type >= BCH_CSUM_NR) ++ return false; ++ ++ if (bch2_csum_type_is_encryption(type) && !c->chacha20) ++ return false; ++ ++ return true; ++} ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) ++{ ++ /* ++ * XXX: need some way of preventing the compiler from optimizing this ++ * into a form that isn't constant time.. ++ */ ++ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; ++} ++ ++/* for skipping ahead and encrypting/decrypting at an offset: */ ++static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) ++{ ++ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); ++ ++ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); ++ return nonce; ++} ++ ++static inline struct nonce null_nonce(void) ++{ ++ struct nonce ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ return ret; ++} ++ ++static inline struct nonce extent_nonce(struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ unsigned compression_type = crc_is_compressed(crc) ++ ? crc.compression_type ++ : 0; ++ unsigned size = compression_type ? 
crc.uncompressed_size : 0; ++ struct nonce nonce = (struct nonce) {{ ++ [0] = cpu_to_le32(size << 22), ++ [1] = cpu_to_le32(version.lo), ++ [2] = cpu_to_le32(version.lo >> 32), ++ [3] = cpu_to_le32(version.hi| ++ (compression_type << 24))^BCH_NONCE_EXTENT, ++ }}; ++ ++ return nonce_add(nonce, crc.nonce << 9); ++} ++ ++static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) ++{ ++ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; ++} ++ ++static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) ++{ ++ __le64 magic = __bch2_sb_magic(sb); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) ++{ ++ __le64 magic = bch2_sb_magic(c); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++#endif /* _BCACHEFS_CHECKSUM_H */ +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +new file mode 100644 +index 000000000000..1d1590de55e8 +--- /dev/null ++++ b/fs/bcachefs/clock.c +@@ -0,0 +1,191 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "clock.h" ++ ++#include ++#include ++#include ++ ++static inline long io_timer_cmp(io_timer_heap *h, ++ struct io_timer *l, ++ struct io_timer *r) ++{ ++ return l->expire - r->expire; ++} ++ ++void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (time_after_eq((unsigned long) atomic_long_read(&clock->now), ++ timer->expire)) { ++ spin_unlock(&clock->timer_lock); ++ timer->fn(timer); ++ return; ++ } ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) ++ goto out; ++ ++ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); ++out: ++ spin_unlock(&clock->timer_lock); ++} ++ ++void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) { ++ heap_del(&clock->timers, i, io_timer_cmp, NULL); ++ break; ++ } ++ ++ spin_unlock(&clock->timer_lock); ++} ++ ++struct io_clock_wait { ++ struct io_timer io_timer; ++ struct timer_list cpu_timer; ++ struct task_struct *task; ++ int expired; ++}; ++ ++static void io_clock_wait_fn(struct io_timer *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, io_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++static void io_clock_cpu_timeout(struct timer_list *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, cpu_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) ++{ ++ struct io_clock_wait wait; ++ ++ /* XXX: calculate sleep time rigorously */ ++ wait.io_timer.expire = until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ bch2_io_timer_add(clock, &wait.io_timer); ++ ++ schedule(); ++ ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++void bch2_kthread_io_clock_wait(struct io_clock *clock, ++ unsigned long io_until, ++ unsigned long cpu_timeout) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct io_clock_wait wait; ++ ++ wait.io_timer.expire = io_until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ 
bch2_io_timer_add(clock, &wait.io_timer); ++ ++ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); ++ ++ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) ++ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ if (wait.expired) ++ break; ++ ++ schedule(); ++ try_to_freeze(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ del_singleshot_timer_sync(&wait.cpu_timer); ++ destroy_timer_on_stack(&wait.cpu_timer); ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++static struct io_timer *get_expired_timer(struct io_clock *clock, ++ unsigned long now) ++{ ++ struct io_timer *ret = NULL; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (clock->timers.used && ++ time_after_eq(now, clock->timers.data[0]->expire)) ++ heap_pop(&clock->timers, ret, io_timer_cmp, NULL); ++ ++ spin_unlock(&clock->timer_lock); ++ ++ return ret; ++} ++ ++void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) ++{ ++ struct io_timer *timer; ++ unsigned long now = atomic_long_add_return(sectors, &clock->now); ++ ++ while ((timer = get_expired_timer(clock, now))) ++ timer->fn(timer); ++} ++ ++void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) ++{ ++ unsigned long now; ++ unsigned i; ++ ++ spin_lock(&clock->timer_lock); ++ now = atomic_long_read(&clock->now); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ pr_buf(out, "%ps:\t%li\n", ++ clock->timers.data[i]->fn, ++ clock->timers.data[i]->expire - now); ++ spin_unlock(&clock->timer_lock); ++} ++ ++void bch2_io_clock_exit(struct io_clock *clock) ++{ ++ free_heap(&clock->timers); ++ free_percpu(clock->pcpu_buf); ++} ++ ++int bch2_io_clock_init(struct io_clock *clock) ++{ ++ atomic_long_set(&clock->now, 0); ++ spin_lock_init(&clock->timer_lock); ++ ++ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); ++ ++ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); ++ if (!clock->pcpu_buf) ++ return -ENOMEM; ++ ++ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h +new file mode 100644 +index 000000000000..70a0f7436c84 +--- /dev/null ++++ b/fs/bcachefs/clock.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_H ++#define _BCACHEFS_CLOCK_H ++ ++void bch2_io_timer_add(struct io_clock *, struct io_timer *); ++void bch2_io_timer_del(struct io_clock *, struct io_timer *); ++void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, ++ unsigned long); ++ ++void __bch2_increment_clock(struct io_clock *, unsigned); ++ ++static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, ++ int rw) ++{ ++ struct io_clock *clock = &c->io_clock[rw]; ++ ++ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= ++ IO_CLOCK_PCPU_SECTORS)) ++ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); ++ ++#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ ++({ \ ++ long __ret = timeout; \ ++ might_sleep(); \ ++ if (!___wait_cond_timeout(condition)) \ ++ __ret = __wait_event_timeout(wq, condition, timeout); \ ++ __ret; \ ++}) ++ ++void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); ++ ++void bch2_io_clock_exit(struct io_clock *); ++int bch2_io_clock_init(struct io_clock *); ++ ++#endif /* _BCACHEFS_CLOCK_H */ +diff --git a/fs/bcachefs/clock_types.h 
b/fs/bcachefs/clock_types.h +new file mode 100644 +index 000000000000..92c740a47565 +--- /dev/null ++++ b/fs/bcachefs/clock_types.h +@@ -0,0 +1,37 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_TYPES_H ++#define _BCACHEFS_CLOCK_TYPES_H ++ ++#include "util.h" ++ ++#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) ++ ++/* ++ * Clocks/timers in units of sectors of IO: ++ * ++ * Note - they use percpu batching, so they're only approximate. ++ */ ++ ++struct io_timer; ++typedef void (*io_timer_fn)(struct io_timer *); ++ ++struct io_timer { ++ io_timer_fn fn; ++ unsigned long expire; ++}; ++ ++/* Amount to buffer up on a percpu counter */ ++#define IO_CLOCK_PCPU_SECTORS 128 ++ ++typedef HEAP(struct io_timer *) io_timer_heap; ++ ++struct io_clock { ++ atomic_long_t now; ++ u16 __percpu *pcpu_buf; ++ unsigned max_slop; ++ ++ spinlock_t timer_lock; ++ io_timer_heap timers; ++}; ++ ++#endif /* _BCACHEFS_CLOCK_TYPES_H */ +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +new file mode 100644 +index 000000000000..b50d2b0d5fd3 +--- /dev/null ++++ b/fs/bcachefs/compress.c +@@ -0,0 +1,629 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "compress.h" ++#include "extents.h" ++#include "io.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++ ++/* Bounce buffer: */ ++struct bbuf { ++ void *b; ++ enum { ++ BB_NONE, ++ BB_VMAP, ++ BB_KMALLOC, ++ BB_MEMPOOL, ++ } type; ++ int rw; ++}; ++ ++static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) ++{ ++ void *b; ++ ++ BUG_ON(size > c->sb.encoded_extent_max << 9); ++ ++ b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; ++ ++ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; ++ ++ BUG(); ++} ++ ++static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ void *expected_start = NULL; ++ ++ __bio_for_each_bvec(bv, bio, iter, start) { ++ if (expected_start && ++ expected_start != page_address(bv.bv_page) + bv.bv_offset) ++ return false; ++ ++ expected_start = page_address(bv.bv_page) + ++ bv.bv_offset + bv.bv_len; ++ } ++ ++ return true; ++} ++ ++static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, ++ struct bvec_iter start, int rw) ++{ ++ struct bbuf ret; ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ unsigned nr_pages = 0; ++ struct page *stack_pages[16]; ++ struct page **pages = NULL; ++ void *data; ++ ++ BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); ++ ++ if (!IS_ENABLED(CONFIG_HIGHMEM) && ++ bio_phys_contig(bio, start)) ++ return (struct bbuf) { ++ .b = page_address(bio_iter_page(bio, start)) + ++ bio_iter_offset(bio, start), ++ .type = BB_NONE, .rw = rw ++ }; ++ ++ /* check if we can map the pages contiguously: */ ++ __bio_for_each_segment(bv, bio, iter, start) { ++ if (iter.bi_size != start.bi_size && ++ bv.bv_offset) ++ goto bounce; ++ ++ if (bv.bv_len < iter.bi_size && ++ bv.bv_offset + bv.bv_len < PAGE_SIZE) ++ goto bounce; ++ ++ nr_pages++; ++ } ++ ++ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); ++ ++ pages = nr_pages > ARRAY_SIZE(stack_pages) ++ ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) ++ : stack_pages; ++ if (!pages) ++ goto bounce; ++ ++ nr_pages = 0; ++ __bio_for_each_segment(bv, bio, iter, start) ++ pages[nr_pages++] = bv.bv_page; ++ ++ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); ++ if (pages != stack_pages) ++ kfree(pages); ++ ++ if (data) ++ return (struct bbuf) { ++ .b = data + bio_iter_offset(bio, start), ++ .type = BB_VMAP, .rw = rw ++ }; ++bounce: ++ ret = __bounce_alloc(c, start.bi_size, rw); ++ ++ if (rw == READ) ++ memcpy_from_bio(ret.b, bio, start); ++ ++ return ret; ++} ++ ++static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) ++{ ++ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); ++} ++ ++static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) ++{ ++ switch (buf.type) { ++ case BB_NONE: ++ break; ++ case BB_VMAP: ++ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); ++ break; ++ case BB_KMALLOC: ++ kfree(buf.b); ++ break; ++ case BB_MEMPOOL: ++ mempool_free(buf.b, &c->compression_bounce[buf.rw]); ++ break; ++ } ++} ++ ++static inline void zlib_set_workspace(z_stream *strm, void *workspace) ++{ ++#ifdef __KERNEL__ ++ strm->workspace = workspace; ++#endif ++} ++ ++static int __bio_uncompress(struct bch_fs *c, struct bio *src, ++ void *dst_data, struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf src_data = { NULL }; ++ size_t src_len = src->bi_iter.bi_size; ++ size_t dst_len = crc.uncompressed_size << 9; ++ void *workspace; ++ int ret; ++ ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ switch (crc.compression_type) { ++ case BCH_COMPRESSION_TYPE_lz4_old: ++ case BCH_COMPRESSION_TYPE_lz4: ++ ret = LZ4_decompress_safe_partial(src_data.b, dst_data, ++ src_len, dst_len, dst_len); ++ if (ret != dst_len) ++ goto err; ++ break; ++ case BCH_COMPRESSION_TYPE_gzip: { ++ z_stream strm = { ++ .next_in = src_data.b, ++ .avail_in = src_len, ++ .next_out = dst_data, ++ .avail_out = dst_len, ++ }; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_inflateInit2(&strm, -MAX_WBITS); ++ ret = zlib_inflate(&strm, Z_FINISH); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != Z_STREAM_END) ++ goto err; ++ break; ++ } ++ case BCH_COMPRESSION_TYPE_zstd: { ++ ZSTD_DCtx *ctx; ++ size_t real_src_len = le32_to_cpup(src_data.b); ++ ++ if (real_src_len > src_len - 4) ++ goto err; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound()); ++ ++ ret = ZSTD_decompressDCtx(ctx, ++ dst_data, dst_len, ++ src_data.b + 4, real_src_len); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != dst_len) ++ goto err; ++ break; ++ } ++ default: ++ BUG(); ++ } ++ ret = 0; ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ return ret; ++err: ++ ret = -EIO; ++ goto out; ++} ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, ++ struct bch_extent_crc_unpacked *crc) ++{ ++ struct bbuf data = { NULL }; ++ size_t dst_len = crc->uncompressed_size << 9; ++ ++ /* bio must own its pages: */ ++ BUG_ON(!bio->bi_vcnt); ++ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); ++ ++ if (crc->uncompressed_size > c->sb.encoded_extent_max || ++ crc->compressed_size > c->sb.encoded_extent_max) { ++ bch_err(c, "error rewriting existing data: extent too big"); ++ return -EIO; ++ } ++ ++ data = __bounce_alloc(c, dst_len, WRITE); ++ ++ if (__bio_uncompress(c, bio, data.b, *crc)) { ++ 
bch_err(c, "error rewriting existing data: decompression error"); ++ bio_unmap_or_unbounce(c, data); ++ return -EIO; ++ } ++ ++ /* ++ * XXX: don't have a good way to assert that the bio was allocated with ++ * enough space, we depend on bch2_move_extent doing the right thing ++ */ ++ bio->bi_iter.bi_size = crc->live_size << 9; ++ ++ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); ++ ++ crc->csum_type = 0; ++ crc->compression_type = 0; ++ crc->compressed_size = crc->live_size; ++ crc->uncompressed_size = crc->live_size; ++ crc->offset = 0; ++ crc->csum = (struct bch_csum) { 0, 0 }; ++ ++ bio_unmap_or_unbounce(c, data); ++ return 0; ++} ++ ++int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, ++ struct bio *dst, struct bvec_iter dst_iter, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf dst_data = { NULL }; ++ size_t dst_len = crc.uncompressed_size << 9; ++ int ret = -ENOMEM; ++ ++ if (crc.uncompressed_size > c->sb.encoded_extent_max || ++ crc.compressed_size > c->sb.encoded_extent_max) ++ return -EIO; ++ ++ dst_data = dst_len == dst_iter.bi_size ++ ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) ++ : __bounce_alloc(c, dst_len, WRITE); ++ ++ ret = __bio_uncompress(c, src, dst_data.b, crc); ++ if (ret) ++ goto err; ++ ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) ++ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); ++err: ++ bio_unmap_or_unbounce(c, dst_data); ++ return ret; ++} ++ ++static int attempt_compress(struct bch_fs *c, ++ void *workspace, ++ void *dst, size_t dst_len, ++ void *src, size_t src_len, ++ enum bch_compression_type compression_type) ++{ ++ switch (compression_type) { ++ case BCH_COMPRESSION_TYPE_lz4: { ++ int len = src_len; ++ int ret = LZ4_compress_destSize( ++ src, dst, ++ &len, dst_len, ++ workspace); ++ ++ if (len < src_len) ++ return -len; ++ ++ return ret; ++ } ++ case BCH_COMPRESSION_TYPE_gzip: { ++ z_stream strm = { ++ .next_in = src, ++ .avail_in = src_len, ++ .next_out = dst, ++ .avail_out = dst_len, ++ }; ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, ++ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, ++ Z_DEFAULT_STRATEGY); ++ ++ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) ++ return 0; ++ ++ if (zlib_deflateEnd(&strm) != Z_OK) ++ return 0; ++ ++ return strm.total_out; ++ } ++ case BCH_COMPRESSION_TYPE_zstd: { ++ ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, ++ ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); ++ ++ size_t len = ZSTD_compressCCtx(ctx, ++ dst + 4, dst_len - 4, ++ src, src_len, ++ c->zstd_params); ++ if (ZSTD_isError(len)) ++ return 0; ++ ++ *((__le32 *) dst) = cpu_to_le32(len); ++ return len + 4; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned __bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ enum bch_compression_type compression_type) ++{ ++ struct bbuf src_data = { NULL }, dst_data = { NULL }; ++ void *workspace; ++ unsigned pad; ++ int ret = 0; ++ ++ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); ++ BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); ++ ++ /* If it's only one block, don't bother trying to compress: */ ++ if (bio_sectors(src) <= c->opts.block_size) ++ return 0; ++ ++ dst_data = bio_map_or_bounce(c, dst, WRITE); ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); ++ ++ *src_len = src->bi_iter.bi_size; ++ *dst_len = dst->bi_iter.bi_size; ++ ++ /* ++ * XXX: 
this algorithm sucks when the compression code doesn't tell us ++ * how much would fit, like LZ4 does: ++ */ ++ while (1) { ++ if (*src_len <= block_bytes(c)) { ++ ret = -1; ++ break; ++ } ++ ++ ret = attempt_compress(c, workspace, ++ dst_data.b, *dst_len, ++ src_data.b, *src_len, ++ compression_type); ++ if (ret > 0) { ++ *dst_len = ret; ++ ret = 0; ++ break; ++ } ++ ++ /* Didn't fit: should we retry with a smaller amount? */ ++ if (*src_len <= *dst_len) { ++ ret = -1; ++ break; ++ } ++ ++ /* ++ * If ret is negative, it's a hint as to how much data would fit ++ */ ++ BUG_ON(-ret >= *src_len); ++ ++ if (ret < 0) ++ *src_len = -ret; ++ else ++ *src_len -= (*src_len - *dst_len) / 2; ++ *src_len = round_down(*src_len, block_bytes(c)); ++ } ++ ++ mempool_free(workspace, &c->compress_workspace[compression_type]); ++ ++ if (ret) ++ goto err; ++ ++ /* Didn't get smaller: */ ++ if (round_up(*dst_len, block_bytes(c)) >= *src_len) ++ goto err; ++ ++ pad = round_up(*dst_len, block_bytes(c)) - *dst_len; ++ ++ memset(dst_data.b + *dst_len, 0, pad); ++ *dst_len += pad; ++ ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) ++ memcpy_to_bio(dst, dst->bi_iter, dst_data.b); ++ ++ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); ++ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); ++ BUG_ON(*dst_len & (block_bytes(c) - 1)); ++ BUG_ON(*src_len & (block_bytes(c) - 1)); ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ bio_unmap_or_unbounce(c, dst_data); ++ return compression_type; ++err: ++ compression_type = BCH_COMPRESSION_TYPE_incompressible; ++ goto out; ++} ++ ++unsigned bch2_bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ unsigned compression_type) ++{ ++ unsigned orig_dst = dst->bi_iter.bi_size; ++ unsigned orig_src = src->bi_iter.bi_size; ++ ++ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ ++ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, ++ c->sb.encoded_extent_max << 9); ++ /* Don't generate a bigger output than input: */ ++ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ ++ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) ++ compression_type = BCH_COMPRESSION_TYPE_lz4; ++ ++ compression_type = ++ __bio_compress(c, dst, dst_len, src, src_len, compression_type); ++ ++ dst->bi_iter.bi_size = orig_dst; ++ src->bi_iter.bi_size = orig_src; ++ return compression_type; ++} ++ ++static int __bch2_fs_compress_init(struct bch_fs *, u64); ++ ++#define BCH_FEATURE_none 0 ++ ++static const unsigned bch2_compression_opt_to_feature[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++#undef BCH_FEATURE_none ++ ++static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) ++{ ++ int ret = 0; ++ ++ if ((c->sb.features & f) == f) ++ return 0; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if ((c->sb.features & f) == f) { ++ mutex_unlock(&c->sb_lock); ++ return 0; ++ } ++ ++ ret = __bch2_fs_compress_init(c, c->sb.features|f); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ret; ++ } ++ ++ c->disk_sb.sb->features[0] |= cpu_to_le64(f); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *c, ++ unsigned compression_type) ++{ ++ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); ++ ++ return compression_type ++ ? 
__bch2_check_set_has_compressed_data(c, ++ 1ULL << bch2_compression_opt_to_feature[compression_type]) ++ : 0; ++} ++ ++void bch2_fs_compress_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ mempool_exit(&c->decompress_workspace); ++ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) ++ mempool_exit(&c->compress_workspace[i]); ++ mempool_exit(&c->compression_bounce[WRITE]); ++ mempool_exit(&c->compression_bounce[READ]); ++} ++ ++static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ++{ ++ size_t max_extent = c->sb.encoded_extent_max << 9; ++ size_t decompress_workspace_size = 0; ++ bool decompress_workspace_needed; ++ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0); ++ struct { ++ unsigned feature; ++ unsigned type; ++ size_t compress_workspace; ++ size_t decompress_workspace; ++ } compression_types[] = { ++ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, ++ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, ++ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), ++ zlib_inflate_workspacesize(), }, ++ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, ++ ZSTD_CCtxWorkspaceBound(params.cParams), ++ ZSTD_DCtxWorkspaceBound() }, ++ }, *i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->zstd_params = params; ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) ++ if (features & (1 << i->feature)) ++ goto have_compressed; ++ ++ goto out; ++have_compressed: ++ ++ if (!mempool_initialized(&c->compression_bounce[READ])) { ++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], ++ 1, max_extent); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->compression_bounce[WRITE])) { ++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], ++ 1, max_extent); ++ if (ret) ++ goto out; ++ } ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) { ++ decompress_workspace_size = ++ max(decompress_workspace_size, i->decompress_workspace); ++ ++ if (!(features & (1 << i->feature))) ++ continue; ++ ++ if (i->decompress_workspace) ++ decompress_workspace_needed = true; ++ ++ if (mempool_initialized(&c->compress_workspace[i->type])) ++ continue; ++ ++ ret = mempool_init_kvpmalloc_pool( ++ &c->compress_workspace[i->type], ++ 1, i->compress_workspace); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->decompress_workspace)) { ++ ret = mempool_init_kvpmalloc_pool( ++ &c->decompress_workspace, ++ 1, decompress_workspace_size); ++ if (ret) ++ goto out; ++ } ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++int bch2_fs_compress_init(struct bch_fs *c) ++{ ++ u64 f = c->sb.features; ++ ++ if (c->opts.compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; ++ ++ if (c->opts.background_compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; ++ ++ return __bch2_fs_compress_init(c, f); ++ ++} +diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h +new file mode 100644 +index 000000000000..4bab1f61b3b5 +--- /dev/null ++++ b/fs/bcachefs/compress.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_COMPRESS_H ++#define _BCACHEFS_COMPRESS_H ++ ++#include "extents_types.h" ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, ++ struct bch_extent_crc_unpacked *); ++int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, ++ struct bvec_iter, struct bch_extent_crc_unpacked); ++unsigned 
bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, ++ struct bio *, size_t *, unsigned); ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); ++void bch2_fs_compress_exit(struct bch_fs *); ++int bch2_fs_compress_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_COMPRESS_H */ +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +new file mode 100644 +index 000000000000..aa10591a3b1a +--- /dev/null ++++ b/fs/bcachefs/debug.c +@@ -0,0 +1,432 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Assorted bcachefs debug code ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "super.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++static struct dentry *bch_debug; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ struct btree *v = c->verify_data; ++ struct btree_node *n_ondisk, *n_sorted, *n_inmemory; ++ struct bset *sorted, *inmemory; ++ struct extent_ptr_decoded pick; ++ struct bch_dev *ca; ++ struct bio *bio; ++ ++ if (c->opts.nochanges) ++ return; ++ ++ btree_node_io_lock(b); ++ mutex_lock(&c->verify_lock); ++ ++ n_ondisk = c->verify_ondisk; ++ n_sorted = c->verify_data->data; ++ n_inmemory = b->data; ++ ++ bkey_copy(&v->key, &b->key); ++ v->written = 0; ++ v->c.level = b->c.level; ++ v->c.btree_id = b->c.btree_id; ++ bch2_btree_keys_init(v, &c->expensive_debug_checks); ++ ++ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ++ NULL, &pick) <= 0) ++ return; ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ if (!bch2_dev_get_ioref(ca, READ)) ++ return; ++ ++ bio = bio_alloc_bioset(GFP_NOIO, ++ buf_pages(n_sorted, btree_bytes(c)), ++ &c->btree_bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_opf = REQ_OP_READ|REQ_META; ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bch2_bio_map(bio, n_sorted, btree_bytes(c)); ++ ++ submit_bio_wait(bio); ++ ++ bio_put(bio); ++ percpu_ref_put(&ca->io_ref); ++ ++ memcpy(n_ondisk, n_sorted, btree_bytes(c)); ++ ++ if (bch2_btree_node_read_done(c, v, false)) ++ goto out; ++ ++ n_sorted = c->verify_data->data; ++ sorted = &n_sorted->keys; ++ inmemory = &n_inmemory->keys; ++ ++ if (inmemory->u64s != sorted->u64s || ++ memcmp(inmemory->start, ++ sorted->start, ++ vstruct_end(inmemory) - (void *) inmemory->start)) { ++ unsigned offset = 0, sectors; ++ struct bset *i; ++ unsigned j; ++ ++ console_lock(); ++ ++ printk(KERN_ERR "*** in memory:\n"); ++ bch2_dump_bset(c, b, inmemory, 0); ++ ++ printk(KERN_ERR "*** read back in:\n"); ++ bch2_dump_bset(c, v, sorted, 0); ++ ++ while (offset < b->written) { ++ if (!offset ) { ++ i = &n_ondisk->keys; ++ sectors = vstruct_blocks(n_ondisk, c->block_bits) << ++ c->block_bits; ++ } else { ++ struct btree_node_entry *bne = ++ (void *) n_ondisk + (offset << 9); ++ i = &bne->keys; ++ ++ sectors = vstruct_blocks(bne, c->block_bits) << ++ c->block_bits; ++ } ++ ++ printk(KERN_ERR "*** on disk block %u:\n", offset); ++ bch2_dump_bset(c, b, i, offset); ++ ++ offset += sectors; ++ } ++ ++ printk(KERN_ERR "*** block %u/%u not written\n", ++ offset >> c->block_bits, btree_blocks(c)); ++ ++ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) ++ if (inmemory->_data[j] != sorted->_data[j]) 
++ break; ++ ++ printk(KERN_ERR "b->written %u\n", b->written); ++ ++ console_unlock(); ++ panic("verify failed at %u\n", j); ++ } ++out: ++ mutex_unlock(&c->verify_lock); ++ btree_node_io_unlock(b); ++} ++ ++#endif ++ ++#ifdef CONFIG_DEBUG_FS ++ ++/* XXX: bch_fs refcounting */ ++ ++struct dump_iter { ++ struct bpos from; ++ struct bch_fs *c; ++ enum btree_id id; ++ ++ char buf[PAGE_SIZE]; ++ size_t bytes; /* what's currently in buf */ ++ ++ char __user *ubuf; /* destination user buffer */ ++ size_t size; /* size of requested read */ ++ ssize_t ret; /* bytes read so far */ ++}; ++ ++static int flush_buf(struct dump_iter *i) ++{ ++ if (i->bytes) { ++ size_t bytes = min(i->bytes, i->size); ++ int err = copy_to_user(i->ubuf, i->buf, bytes); ++ ++ if (err) ++ return err; ++ ++ i->ret += bytes; ++ i->ubuf += bytes; ++ i->size -= bytes; ++ i->bytes -= bytes; ++ memmove(i->buf, i->buf + bytes, i->bytes); ++ } ++ ++ return 0; ++} ++ ++static int bch2_dump_open(struct inode *inode, struct file *file) ++{ ++ struct btree_debug *bd = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ if (!i) ++ return -ENOMEM; ++ ++ file->private_data = i; ++ i->from = POS_MIN; ++ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); ++ i->id = bd->id; ++ ++ return 0; ++} ++ ++static int bch2_dump_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static ssize_t bch2_read_btree(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); ++ k = bch2_btree_iter_peek(iter); ++ ++ while (k.k && !(err = bkey_err(k))) { ++ bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); ++ i->bytes = strlen(i->buf); ++ BUG_ON(i->bytes >= PAGE_SIZE); ++ i->buf[i->bytes] = '\n'; ++ i->bytes++; ++ ++ k = bch2_btree_iter_next(iter); ++ i->from = iter->pos; ++ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? err : i->ret; ++} ++ ++static const struct file_operations btree_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree, ++}; ++ ++static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size || !bkey_cmp(POS_MAX, i->from)) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { ++ bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ /* ++ * can't easily correctly restart a btree node traversal across ++ * all nodes, meh ++ */ ++ i->from = bkey_cmp(POS_MAX, b->key.k.p) ++ ? bkey_successor(b->key.k.p) ++ : b->key.k.p; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? 
err : i->ret; ++} ++ ++static const struct file_operations btree_format_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree_formats, ++}; ++ ++static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct btree *prev_node = NULL; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(err = bkey_err(k))) { ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_packed *_k = ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++ ++ if (l->b != prev_node) { ++ bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ } ++ prev_node = l->b; ++ ++ bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ bch2_btree_iter_next(iter); ++ i->from = iter->pos; ++ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? err : i->ret; ++} ++ ++static const struct file_operations bfloat_failed_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_bfloat_failed, ++}; ++ ++void bch2_fs_debug_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->debug)) ++ debugfs_remove_recursive(c->debug); ++} ++ ++void bch2_fs_debug_init(struct bch_fs *c) ++{ ++ struct btree_debug *bd; ++ char name[100]; ++ ++ if (IS_ERR_OR_NULL(bch_debug)) ++ return; ++ ++ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); ++ c->debug = debugfs_create_dir(name, bch_debug); ++ if (IS_ERR_OR_NULL(c->debug)) ++ return; ++ ++ for (bd = c->btree_debug; ++ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); ++ bd++) { ++ bd->id = bd - c->btree_debug; ++ bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], ++ 0400, c->debug, bd, ++ &btree_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-formats", ++ bch2_btree_ids[bd->id]); ++ ++ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, ++ &btree_format_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-bfloat-failed", ++ bch2_btree_ids[bd->id]); ++ ++ bd->failed = debugfs_create_file(name, 0400, c->debug, bd, ++ &bfloat_failed_debug_ops); ++ } ++} ++ ++#endif ++ ++void bch2_debug_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_debug)) ++ debugfs_remove_recursive(bch_debug); ++} ++ ++int __init bch2_debug_init(void) ++{ ++ int ret = 0; ++ ++ bch_debug = debugfs_create_dir("bcachefs", NULL); ++ return ret; ++} +diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h +new file mode 100644 +index 000000000000..56c2d1ab5f63 +--- /dev/null ++++ b/fs/bcachefs/debug.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DEBUG_H ++#define _BCACHEFS_DEBUG_H ++ ++#include "bcachefs.h" ++ ++struct bio; ++struct btree; ++struct bch_fs; ++ ++#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) \ ++ { return bch2_##name 
|| c->name; } ++BCH_DEBUG_PARAMS_ALWAYS() ++#undef BCH_DEBUG_PARAM ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) \ ++ { return bch2_##name || c->name; } ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++ ++void __bch2_btree_verify(struct bch_fs *, struct btree *); ++ ++#define bypass_torture_test(d) ((d)->bypass_torture_test) ++ ++#else /* DEBUG */ ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) { return false; } ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++ ++static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} ++ ++#define bypass_torture_test(d) 0 ++ ++#endif ++ ++static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ if (verify_btree_ondisk(c)) ++ __bch2_btree_verify(c, b); ++} ++ ++#ifdef CONFIG_DEBUG_FS ++void bch2_fs_debug_exit(struct bch_fs *); ++void bch2_fs_debug_init(struct bch_fs *); ++#else ++static inline void bch2_fs_debug_exit(struct bch_fs *c) {} ++static inline void bch2_fs_debug_init(struct bch_fs *c) {} ++#endif ++ ++void bch2_debug_exit(void); ++int bch2_debug_init(void); ++ ++#endif /* _BCACHEFS_DEBUG_H */ +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +new file mode 100644 +index 000000000000..f34bfda8ab0d +--- /dev/null ++++ b/fs/bcachefs/dirent.c +@@ -0,0 +1,385 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "dirent.h" ++#include "fs.h" ++#include "keylist.h" ++#include "str_hash.h" ++ ++#include ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) ++{ ++ unsigned len = bkey_val_bytes(d.k) - ++ offsetof(struct bch_dirent, d_name); ++ ++ return strnlen(d.v->d_name, len); ++} ++ ++static u64 bch2_dirent_hash(const struct bch_hash_info *info, ++ const struct qstr *name) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, name->name, name->len); ++ ++ /* [0,2) reserved for dots */ ++ return max_t(u64, bch2_str_hash_end(&ctx, info), 2); ++} ++ ++static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_dirent_hash(info, key); ++} ++ ++static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); ++ ++ return bch2_dirent_hash(info, &name); ++} ++ ++static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ int len = bch2_dirent_name_bytes(l); ++ const struct qstr *r = _r; ++ ++ return len - r->len ?: memcmp(l.v->d_name, r->name, len); ++} ++ ++static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); ++ int l_len = bch2_dirent_name_bytes(l); ++ int r_len = bch2_dirent_name_bytes(r); ++ ++ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); ++} ++ ++const struct bch_hash_desc bch2_dirent_hash_desc = { ++ .btree_id = BTREE_ID_DIRENTS, ++ .key_type = KEY_TYPE_dirent, ++ .hash_key = dirent_hash_key, ++ .hash_bkey = dirent_hash_bkey, ++ .cmp_key = dirent_cmp_key, ++ .cmp_bkey = dirent_cmp_bkey, ++}; ++ ++const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = 
bkey_s_c_to_dirent(k); ++ unsigned len; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) ++ return "value too small"; ++ ++ len = bch2_dirent_name_bytes(d); ++ if (!len) ++ return "empty name"; ++ ++ /* ++ * older versions of bcachefs were buggy and creating dirent ++ * keys that were bigger than necessary: ++ */ ++ if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7)) ++ return "value too big"; ++ ++ if (len > BCH_NAME_MAX) ++ return "dirent name too big"; ++ ++ return NULL; ++} ++ ++void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ ++ bch_scnmemcpy(out, d.v->d_name, ++ bch2_dirent_name_bytes(d)); ++ pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); ++} ++ ++static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, ++ u8 type, const struct qstr *name, u64 dst) ++{ ++ struct bkey_i_dirent *dirent; ++ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); ++ ++ if (name->len > BCH_NAME_MAX) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ BUG_ON(u64s > U8_MAX); ++ ++ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(dirent)) ++ return dirent; ++ ++ bkey_dirent_init(&dirent->k_i); ++ dirent->k.u64s = u64s; ++ dirent->v.d_inum = cpu_to_le64(dst); ++ dirent->v.d_type = type; ++ ++ memcpy(dirent->v.d_name, name->name, name->len); ++ memset(dirent->v.d_name + name->len, 0, ++ bkey_val_bytes(&dirent->k) - ++ offsetof(struct bch_dirent, d_name) - ++ name->len); ++ ++ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); ++ ++ return dirent; ++} ++ ++int bch2_dirent_create(struct btree_trans *trans, ++ u64 dir_inum, const struct bch_hash_info *hash_info, ++ u8 type, const struct qstr *name, u64 dst_inum, ++ int flags) ++{ ++ struct bkey_i_dirent *dirent; ++ int ret; ++ ++ dirent = dirent_create_key(trans, type, name, dst_inum); ++ ret = PTR_ERR_OR_ZERO(dirent); ++ if (ret) ++ return ret; ++ ++ return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, ++ dir_inum, &dirent->k_i, flags); ++} ++ ++static void dirent_copy_target(struct bkey_i_dirent *dst, ++ struct bkey_s_c_dirent src) ++{ ++ dst->v.d_inum = src.v->d_inum; ++ dst->v.d_type = src.v->d_type; ++} ++ ++int bch2_dirent_rename(struct btree_trans *trans, ++ u64 src_dir, struct bch_hash_info *src_hash, ++ u64 dst_dir, struct bch_hash_info *dst_hash, ++ const struct qstr *src_name, u64 *src_inum, ++ const struct qstr *dst_name, u64 *dst_inum, ++ enum bch_rename_mode mode) ++{ ++ struct btree_iter *src_iter = NULL, *dst_iter = NULL; ++ struct bkey_s_c old_src, old_dst; ++ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; ++ struct bpos dst_pos = ++ POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); ++ int ret = 0; ++ ++ *src_inum = *dst_inum = 0; ++ ++ /* ++ * Lookup dst: ++ * ++ * Note that in BCH_RENAME mode, we're _not_ checking if ++ * the target already exists - we're relying on the VFS ++ * to do that check for us for correctness: ++ */ ++ dst_iter = mode == BCH_RENAME ++ ? 
bch2_hash_hole(trans, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name) ++ : bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dst_iter); ++ if (ret) ++ goto out; ++ ++ old_dst = bch2_btree_iter_peek_slot(dst_iter); ++ ++ if (mode != BCH_RENAME) ++ *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); ++ ++ /* Lookup src: */ ++ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ src_hash, src_dir, src_name, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(src_iter); ++ if (ret) ++ goto out; ++ ++ old_src = bch2_btree_iter_peek_slot(src_iter); ++ *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); ++ ++ /* Create new dst key: */ ++ new_dst = dirent_create_key(trans, 0, dst_name, 0); ++ ret = PTR_ERR_OR_ZERO(new_dst); ++ if (ret) ++ goto out; ++ ++ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); ++ new_dst->k.p = dst_iter->pos; ++ ++ /* Create new src key: */ ++ if (mode == BCH_RENAME_EXCHANGE) { ++ new_src = dirent_create_key(trans, 0, src_name, 0); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ ++ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); ++ new_src->k.p = src_iter->pos; ++ } else { ++ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ ++ bkey_init(&new_src->k); ++ new_src->k.p = src_iter->pos; ++ ++ if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && ++ bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { ++ /* ++ * We have a hash collision for the new dst key, ++ * and new_src - the key we're deleting - is between ++ * new_dst's hashed slot and the slot we're going to be ++ * inserting it into - oops. This will break the hash ++ * table if we don't deal with it: ++ */ ++ if (mode == BCH_RENAME) { ++ /* ++ * If we're not overwriting, we can just insert ++ * new_dst at the src position: ++ */ ++ new_dst->k.p = src_iter->pos; ++ bch2_trans_update(trans, src_iter, ++ &new_dst->k_i, 0); ++ goto out; ++ } else { ++ /* If we're overwriting, we can't insert new_dst ++ * at a different slot because it has to ++ * overwrite old_dst - just make sure to use a ++ * whiteout when deleting src: ++ */ ++ new_src->k.type = KEY_TYPE_whiteout; ++ } ++ } else { ++ /* Check if we need a whiteout to delete src: */ ++ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, ++ src_hash, src_iter); ++ if (ret < 0) ++ goto out; ++ ++ if (ret) ++ new_src->k.type = KEY_TYPE_whiteout; ++ } ++ } ++ ++ bch2_trans_update(trans, src_iter, &new_src->k_i, 0); ++ bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); ++out: ++ bch2_trans_iter_put(trans, src_iter); ++ bch2_trans_iter_put(trans, dst_iter); ++ return ret; ++} ++ ++int bch2_dirent_delete_at(struct btree_trans *trans, ++ const struct bch_hash_info *hash_info, ++ struct btree_iter *iter) ++{ ++ return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ hash_info, iter); ++} ++ ++struct btree_iter * ++__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, unsigned flags) ++{ ++ return bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ hash_info, dir_inum, name, flags); ++} ++ ++u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 inum = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = 
__bch2_dirent_lookup_trans(&trans, dir_inum, ++ hash_info, name, 0); ++ if (IS_ERR(iter)) { ++ BUG_ON(PTR_ERR(iter) == -EINTR); ++ goto out; ++ } ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); ++out: ++ bch2_trans_exit(&trans); ++ return inum; ++} ++ ++int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, ++ POS(dir_inum, 0), 0, k, ret) { ++ if (k.k->p.inode > dir_inum) ++ break; ++ ++ if (k.k->type == KEY_TYPE_dirent) { ++ ret = -ENOTEMPTY; ++ break; ++ } ++ } ++ bch2_trans_iter_put(trans, iter); ++ ++ return ret; ++} ++ ++int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent dirent; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, ++ POS(inum, ctx->pos), 0, k, ret) { ++ if (k.k->p.inode > inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ dirent = bkey_s_c_to_dirent(k); ++ ++ /* ++ * XXX: dir_emit() can fault and block, while we're holding ++ * locks ++ */ ++ ctx->pos = dirent.k->p.offset; ++ if (!dir_emit(ctx, dirent.v->d_name, ++ bch2_dirent_name_bytes(dirent), ++ le64_to_cpu(dirent.v->d_inum), ++ dirent.v->d_type)) ++ break; ++ ctx->pos = dirent.k->p.offset + 1; ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ return ret; ++} +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +new file mode 100644 +index 000000000000..34769371dd13 +--- /dev/null ++++ b/fs/bcachefs/dirent.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DIRENT_H ++#define _BCACHEFS_DIRENT_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_dirent_hash_desc; ++ ++const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_dirent (struct bkey_ops) { \ ++ .key_invalid = bch2_dirent_invalid, \ ++ .val_to_text = bch2_dirent_to_text, \ ++} ++ ++struct qstr; ++struct file; ++struct dir_context; ++struct bch_fs; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); ++ ++static inline unsigned dirent_val_u64s(unsigned len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, ++ sizeof(u64)); ++} ++ ++int bch2_dirent_create(struct btree_trans *, u64, ++ const struct bch_hash_info *, u8, ++ const struct qstr *, u64, int); ++ ++int bch2_dirent_delete_at(struct btree_trans *, ++ const struct bch_hash_info *, ++ struct btree_iter *); ++ ++enum bch_rename_mode { ++ BCH_RENAME, ++ BCH_RENAME_OVERWRITE, ++ BCH_RENAME_EXCHANGE, ++}; ++ ++int bch2_dirent_rename(struct btree_trans *, ++ u64, struct bch_hash_info *, ++ u64, struct bch_hash_info *, ++ const struct qstr *, u64 *, ++ const struct qstr *, u64 *, ++ enum bch_rename_mode); ++ ++struct btree_iter * ++__bch2_dirent_lookup_trans(struct btree_trans *, u64, ++ const struct bch_hash_info *, ++ const struct qstr *, unsigned); ++u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, ++ const struct qstr *); ++ ++int bch2_empty_dir_trans(struct btree_trans *, u64); ++int bch2_readdir(struct bch_fs *, u64, struct dir_context *); ++ ++#endif /* _BCACHEFS_DIRENT_H */ +diff --git a/fs/bcachefs/disk_groups.c 
b/fs/bcachefs/disk_groups.c +new file mode 100644 +index 000000000000..c52b6faac9b4 +--- /dev/null ++++ b/fs/bcachefs/disk_groups.c +@@ -0,0 +1,486 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "disk_groups.h" ++#include "super-io.h" ++ ++#include ++ ++static int group_cmp(const void *_l, const void *_r) ++{ ++ const struct bch_disk_group *l = _l; ++ const struct bch_disk_group *r = _r; ++ ++ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - ++ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: ++ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - ++ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: ++ strncmp(l->label, r->label, sizeof(l->label)); ++} ++ ++static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g, *sorted = NULL; ++ struct bch_sb_field_members *mi; ++ struct bch_member *m; ++ unsigned i, nr_groups, len; ++ const char *err = NULL; ++ ++ mi = bch2_sb_get_members(sb); ++ groups = bch2_sb_get_disk_groups(sb); ++ nr_groups = disk_groups_nr(groups); ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) { ++ unsigned g; ++ ++ if (!BCH_MEMBER_GROUP(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (g >= nr_groups || ++ BCH_GROUP_DELETED(&groups->entries[g])) ++ return "disk has invalid group"; ++ } ++ ++ if (!nr_groups) ++ return NULL; ++ ++ for (g = groups->entries; ++ g < groups->entries + nr_groups; ++ g++) { ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ len = strnlen(g->label, sizeof(g->label)); ++ if (!len) { ++ err = "group with empty label"; ++ goto err; ++ } ++ } ++ ++ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); ++ if (!sorted) ++ return "cannot allocate memory"; ++ ++ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); ++ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); ++ ++ for (i = 0; i + 1 < nr_groups; i++) ++ if (!BCH_GROUP_DELETED(sorted + i) && ++ !group_cmp(sorted + i, sorted + i + 1)) { ++ err = "duplicate groups"; ++ goto err; ++ } ++ ++ err = NULL; ++err: ++ kfree(sorted); ++ return err; ++} ++ ++static void bch2_sb_disk_groups_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g; ++ unsigned nr_groups = disk_groups_nr(groups); ++ ++ for (g = groups->entries; ++ g < groups->entries + nr_groups; ++ g++) { ++ if (g != groups->entries) ++ pr_buf(out, " "); ++ ++ if (BCH_GROUP_DELETED(g)) ++ pr_buf(out, "[deleted]"); ++ else ++ pr_buf(out, "[parent %llu name %s]", ++ BCH_GROUP_PARENT(g), g->label); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { ++ .validate = bch2_sb_disk_groups_validate, ++ .to_text = bch2_sb_disk_groups_to_text ++}; ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field_disk_groups *groups; ++ struct bch_disk_groups_cpu *cpu_g, *old_g; ++ unsigned i, g, nr_groups; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ groups = bch2_sb_get_disk_groups(c->disk_sb.sb); ++ nr_groups = disk_groups_nr(groups); ++ ++ if (!groups) ++ return 0; ++ ++ cpu_g = kzalloc(sizeof(*cpu_g) + ++ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); ++ if (!cpu_g) ++ return -ENOMEM; ++ ++ cpu_g->nr = nr_groups; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct 
bch_disk_group *src = &groups->entries[i]; ++ struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; ++ ++ dst->deleted = BCH_GROUP_DELETED(src); ++ dst->parent = BCH_GROUP_PARENT(src); ++ } ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ struct bch_disk_group_cpu *dst = ++ &cpu_g->entries[BCH_MEMBER_GROUP(m)]; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m); ++ while (g) { ++ dst = &cpu_g->entries[g - 1]; ++ __set_bit(i, dst->devs.d); ++ g = dst->parent; ++ } ++ } ++ ++ old_g = rcu_dereference_protected(c->disk_groups, ++ lockdep_is_held(&c->sb_lock)); ++ rcu_assign_pointer(c->disk_groups, cpu_g); ++ if (old_g) ++ kfree_rcu(old_g, rcu); ++ ++ return 0; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return NULL; ++ case TARGET_DEV: { ++ struct bch_dev *ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ return ca ? &ca->self : NULL; ++ } ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); ++ ++ return g && t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return false; ++ case TARGET_DEV: ++ return dev == t.dev; ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g; ++ const struct bch_devs_mask *m; ++ bool ret; ++ ++ rcu_read_lock(); ++ g = rcu_dereference(c->disk_groups); ++ m = g && t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ ++ ret = m ? 
test_bit(dev, m->d) : false; ++ rcu_read_unlock(); ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, ++ unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct bch_disk_group *g = groups->entries + i; ++ ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ if (!BCH_GROUP_DELETED(g) && ++ BCH_GROUP_PARENT(g) == parent && ++ strnlen(g->label, sizeof(g->label)) == namelen && ++ !memcmp(name, g->label, namelen)) ++ return i; ++ } ++ ++ return -1; ++} ++ ++static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ struct bch_disk_group *g; ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; ++ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); ++ i++) ++ ; ++ ++ if (i == nr_groups) { ++ unsigned u64s = ++ (sizeof(struct bch_sb_field_disk_groups) + ++ sizeof(struct bch_disk_group) * (nr_groups + 1)) / ++ sizeof(u64); ++ ++ groups = bch2_sb_resize_disk_groups(sb, u64s); ++ if (!groups) ++ return -ENOSPC; ++ ++ nr_groups = disk_groups_nr(groups); ++ } ++ ++ BUG_ON(i >= nr_groups); ++ ++ g = &groups->entries[i]; ++ ++ memcpy(g->label, name, namelen); ++ if (namelen < sizeof(g->label)) ++ g->label[namelen] = '\0'; ++ SET_BCH_GROUP_DELETED(g, 0); ++ SET_BCH_GROUP_PARENT(g, parent); ++ SET_BCH_GROUP_DATA_ALLOWED(g, ~0); ++ ++ return i; ++} ++ ++int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ v = __bch2_disk_group_find(groups, v + 1, name, len); ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups; ++ unsigned parent = 0; ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ groups = bch2_sb_get_disk_groups(sb->sb); ++ ++ v = __bch2_disk_group_find(groups, parent, name, len); ++ if (v < 0) ++ v = __bch2_disk_group_add(sb, parent, name, len); ++ if (v < 0) ++ return v; ++ ++ parent = v + 1; ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++void bch2_disk_path_to_text(struct printbuf *out, ++ struct bch_sb_handle *sb, ++ unsigned v) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ struct bch_disk_group *g; ++ unsigned nr = 0; ++ u16 path[32]; ++ ++ while (1) { ++ if (nr == ARRAY_SIZE(path)) ++ goto inval; ++ ++ if (v >= disk_groups_nr(groups)) ++ goto inval; ++ ++ g = groups->entries + v; ++ ++ if (BCH_GROUP_DELETED(g)) ++ goto inval; ++ ++ path[nr++] = v; ++ ++ if (!BCH_GROUP_PARENT(g)) ++ break; ++ ++ v = BCH_GROUP_PARENT(g) - 1; ++ } ++ ++ while (nr) { ++ v = path[--nr]; ++ g = groups->entries + v; ++ ++ bch_scnmemcpy(out, g->label, ++ strnlen(g->label, sizeof(g->label))); ++ ++ if (nr) ++ pr_buf(out, "."); ++ } ++ return; ++inval: ++ pr_buf(out, "invalid group %u", v); ++} ++ ++int 
bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) ++{ ++ struct bch_member *mi; ++ int v = -1; ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (!strlen(name) || !strcmp(name, "none")) ++ goto write_sb; ++ ++ v = bch2_disk_path_find_or_create(&c->disk_sb, name); ++ if (v < 0) { ++ mutex_unlock(&c->sb_lock); ++ return v; ++ } ++ ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ goto unlock; ++write_sb: ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ SET_BCH_MEMBER_GROUP(mi, v + 1); ++ ++ bch2_write_super(c); ++unlock: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) ++{ ++ struct bch_dev *ca; ++ int g; ++ ++ if (!strlen(buf) || !strcmp(buf, "none")) { ++ *v = 0; ++ return 0; ++ } ++ ++ /* Is it a device? */ ++ ca = bch2_dev_lookup(c, buf); ++ if (!IS_ERR(ca)) { ++ *v = dev_to_target(ca->dev_idx); ++ percpu_ref_put(&ca->ref); ++ return 0; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ g = bch2_disk_path_find(&c->disk_sb, buf); ++ mutex_unlock(&c->sb_lock); ++ ++ if (g >= 0) { ++ *v = group_to_target(g); ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) ++{ ++ struct target t = target_decode(v); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ pr_buf(out, "none"); ++ break; ++ case TARGET_DEV: { ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ ++ if (ca && percpu_ref_tryget(&ca->io_ref)) { ++ char b[BDEVNAME_SIZE]; ++ ++ pr_buf(out, "/dev/%s", ++ bdevname(ca->disk_sb.bdev, b)); ++ percpu_ref_put(&ca->io_ref); ++ } else if (ca) { ++ pr_buf(out, "offline device %u", t.dev); ++ } else { ++ pr_buf(out, "invalid device %u", t.dev); ++ } ++ ++ rcu_read_unlock(); ++ break; ++ } ++ case TARGET_GROUP: ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, &c->disk_sb, t.group); ++ mutex_unlock(&c->sb_lock); ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h +new file mode 100644 +index 000000000000..3d84f23c34ed +--- /dev/null ++++ b/fs/bcachefs/disk_groups.h +@@ -0,0 +1,91 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DISK_GROUPS_H ++#define _BCACHEFS_DISK_GROUPS_H ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; ++ ++static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) ++{ ++ return groups ++ ? 
(vstruct_end(&groups->field) - ++ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) ++ : 0; ++} ++ ++struct target { ++ enum { ++ TARGET_NULL, ++ TARGET_DEV, ++ TARGET_GROUP, ++ } type; ++ union { ++ unsigned dev; ++ unsigned group; ++ }; ++}; ++ ++#define TARGET_DEV_START 1 ++#define TARGET_GROUP_START (256 + TARGET_DEV_START) ++ ++static inline u16 dev_to_target(unsigned dev) ++{ ++ return TARGET_DEV_START + dev; ++} ++ ++static inline u16 group_to_target(unsigned group) ++{ ++ return TARGET_GROUP_START + group; ++} ++ ++static inline struct target target_decode(unsigned target) ++{ ++ if (target >= TARGET_GROUP_START) ++ return (struct target) { ++ .type = TARGET_GROUP, ++ .group = target - TARGET_GROUP_START ++ }; ++ ++ if (target >= TARGET_DEV_START) ++ return (struct target) { ++ .type = TARGET_DEV, ++ .group = target - TARGET_DEV_START ++ }; ++ ++ return (struct target) { .type = TARGET_NULL }; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); ++ ++static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, ++ enum bch_data_type data_type, ++ u16 target) ++{ ++ struct bch_devs_mask devs = c->rw_devs[data_type]; ++ const struct bch_devs_mask *t = bch2_target_to_mask(c, target); ++ ++ if (t) ++ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); ++ return devs; ++} ++ ++bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); ++ ++int bch2_disk_path_find(struct bch_sb_handle *, const char *); ++ ++/* Exported for userspace bcachefs-tools: */ ++int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); ++ ++void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, ++ unsigned); ++ ++int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); ++void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *); ++ ++int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); ++ ++const char *bch2_sb_validate_disk_groups(struct bch_sb *, ++ struct bch_sb_field *); ++ ++#endif /* _BCACHEFS_DISK_GROUPS_H */ +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +new file mode 100644 +index 000000000000..5514f65378ad +--- /dev/null ++++ b/fs/bcachefs/ec.c +@@ -0,0 +1,1639 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++/* erasure coding */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "keylist.h" ++#include "recovery.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++ ++static void raid5_recov(unsigned disks, unsigned failed_idx, ++ size_t size, void **data) ++{ ++ unsigned i = 2, nr; ++ ++ BUG_ON(failed_idx >= disks); ++ ++ swap(data[0], data[failed_idx]); ++ memcpy(data[0], data[1], size); ++ ++ while (i < disks) { ++ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); ++ xor_blocks(nr, size, data[0], data + i); ++ i += nr; ++ } ++ ++ swap(data[0], data[failed_idx]); ++} ++ ++static void raid_gen(int nd, int np, size_t size, void **v) ++{ ++ if (np >= 1) ++ raid5_recov(nd + np, nd, size, v); ++ if (np >= 2) ++ raid6_call.gen_syndrome(nd + np, size, v); ++ BUG_ON(np > 2); ++} ++ ++static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) ++{ ++ switch (nr) { ++ case 0: ++ break; ++ case 1: ++ if (ir[0] < nd + 1) ++ raid5_recov(nd + 
1, ir[0], size, v); ++ else ++ raid6_call.gen_syndrome(nd + np, size, v); ++ break; ++ case 2: ++ if (ir[1] < nd) { ++ /* data+data failure. */ ++ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); ++ } else if (ir[0] < nd) { ++ /* data + p/q failure */ ++ ++ if (ir[1] == nd) /* data + p failure */ ++ raid6_datap_recov(nd + np, size, ir[0], v); ++ else { /* data + q failure */ ++ raid5_recov(nd + 1, ir[0], size, v); ++ raid6_call.gen_syndrome(nd + np, size, v); ++ } ++ } else { ++ raid_gen(nd, np, size, v); ++ } ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++#else ++ ++#include ++ ++#endif ++ ++struct ec_bio { ++ struct bch_dev *ca; ++ struct ec_stripe_buf *buf; ++ size_t idx; ++ struct bio bio; ++}; ++ ++/* Stripes btree keys: */ ++ ++const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ ++ if (k.k->p.inode) ++ return "invalid stripe key"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*s)) ++ return "incorrect value size"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*s) || ++ bkey_val_u64s(k.k) < stripe_val_u64s(s)) ++ return "incorrect value size"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ unsigned i; ++ ++ pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", ++ s->algorithm, ++ le16_to_cpu(s->sectors), ++ s->nr_blocks - s->nr_redundant, ++ s->nr_redundant, ++ s->csum_type, ++ 1U << s->csum_granularity_bits); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, ++ (u64) s->ptrs[i].offset, ++ stripe_blockcount_get(s, i)); ++} ++ ++static int ptr_matches_stripe(struct bch_fs *c, ++ struct bch_stripe *v, ++ const struct bch_extent_ptr *ptr) ++{ ++ unsigned i; ++ ++ for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { ++ const struct bch_extent_ptr *ptr2 = v->ptrs + i; ++ ++ if (ptr->dev == ptr2->dev && ++ ptr->gen == ptr2->gen && ++ ptr->offset >= ptr2->offset && ++ ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) ++ return i; ++ } ++ ++ return -1; ++} ++ ++static int extent_matches_stripe(struct bch_fs *c, ++ struct bch_stripe *v, ++ struct bkey_s_c k) ++{ ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const struct bch_extent_ptr *ptr; ++ int idx; ++ ++ extent_for_each_ptr(e, ptr) { ++ idx = ptr_matches_stripe(c, v, ptr); ++ if (idx >= 0) ++ return idx; ++ } ++ break; ++ } ++ } ++ ++ return -1; ++} ++ ++static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ ++ extent_for_each_entry(e, entry) ++ if (extent_entry_type(entry) == ++ BCH_EXTENT_ENTRY_stripe_ptr && ++ entry->stripe_ptr.idx == idx) ++ return true; ++ ++ break; ++ } ++ } ++ ++ return false; ++} ++ ++/* Checksumming: */ ++ ++static void ec_generate_checksums(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned csums_per_device = stripe_csums_per_device(v); ++ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; ++ unsigned i, j; ++ ++ if (!csum_bytes) ++ return; ++ ++ BUG_ON(buf->offset); ++ BUG_ON(buf->size != le16_to_cpu(v->sectors)); ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ for (j = 0; j < csums_per_device; j++) { ++ unsigned offset = j << 
v->csum_granularity_bits; ++ unsigned len = min(csum_granularity, buf->size - offset); ++ ++ struct bch_csum csum = ++ bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[i] + (offset << 9), ++ len << 9); ++ ++ memcpy(stripe_csum(v, i, j), &csum, csum_bytes); ++ } ++ } ++} ++ ++static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; ++ unsigned i; ++ ++ if (!csum_bytes) ++ return; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ unsigned offset = buf->offset; ++ unsigned end = buf->offset + buf->size; ++ ++ if (!test_bit(i, buf->valid)) ++ continue; ++ ++ while (offset < end) { ++ unsigned j = offset >> v->csum_granularity_bits; ++ unsigned len = min(csum_granularity, end - offset); ++ struct bch_csum csum; ++ ++ BUG_ON(offset & (csum_granularity - 1)); ++ BUG_ON(offset + len != le16_to_cpu(v->sectors) && ++ ((offset + len) & (csum_granularity - 1))); ++ ++ csum = bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[i] + ((offset - buf->offset) << 9), ++ len << 9); ++ ++ if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { ++ __bcache_io_error(c, ++ "checksum error while doing reconstruct read (%u:%u)", ++ i, j); ++ clear_bit(i, buf->valid); ++ break; ++ } ++ ++ offset += len; ++ } ++ } ++} ++ ++/* Erasure coding: */ ++ ++static void ec_generate_ec(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = le16_to_cpu(v->sectors) << 9; ++ ++ raid_gen(nr_data, v->nr_redundant, bytes, buf->data); ++} ++ ++static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) ++{ ++ return nr - bitmap_weight(buf->valid, nr); ++} ++ ++static unsigned ec_nr_failed(struct ec_stripe_buf *buf) ++{ ++ return __ec_nr_failed(buf, buf->key.v.nr_blocks); ++} ++ ++static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = buf->size << 9; ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ return -1; ++ } ++ ++ for (i = 0; i < nr_data; i++) ++ if (!test_bit(i, buf->valid)) ++ failed[nr_failed++] = i; ++ ++ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); ++ return 0; ++} ++ ++/* IO: */ ++ ++static void ec_block_endio(struct bio *bio) ++{ ++ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); ++ struct bch_dev *ca = ec_bio->ca; ++ struct closure *cl = bio->bi_private; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", ++ bio_data_dir(bio) ? 
"write" : "read", ++ bch2_blk_status_to_str(bio->bi_status))) ++ clear_bit(ec_bio->idx, ec_bio->buf->valid); ++ ++ bio_put(&ec_bio->bio); ++ percpu_ref_put(&ca->io_ref); ++ closure_put(cl); ++} ++ ++static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ++ unsigned rw, unsigned idx, struct closure *cl) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned offset = 0, bytes = buf->size << 9; ++ struct bch_extent_ptr *ptr = &v->ptrs[idx]; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (!bch2_dev_get_ioref(ca, rw)) { ++ clear_bit(idx, buf->valid); ++ return; ++ } ++ ++ while (offset < bytes) { ++ unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, ++ DIV_ROUND_UP(bytes, PAGE_SIZE)); ++ unsigned b = min_t(size_t, bytes - offset, ++ nr_iovecs << PAGE_SHIFT); ++ struct ec_bio *ec_bio; ++ ++ ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs, ++ &c->ec_bioset), ++ struct ec_bio, bio); ++ ++ ec_bio->ca = ca; ++ ec_bio->buf = buf; ++ ec_bio->idx = idx; ++ ++ bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev); ++ bio_set_op_attrs(&ec_bio->bio, rw, 0); ++ ++ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ++ ec_bio->bio.bi_end_io = ec_block_endio; ++ ec_bio->bio.bi_private = cl; ++ ++ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); ++ ++ closure_get(cl); ++ percpu_ref_get(&ca->io_ref); ++ ++ submit_bio(&ec_bio->bio); ++ ++ offset += b; ++ } ++ ++ percpu_ref_put(&ca->io_ref); ++} ++ ++/* recovery read path: */ ++int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct ec_stripe_buf *buf; ++ struct closure cl; ++ struct bkey_s_c k; ++ struct bch_stripe *v; ++ unsigned stripe_idx; ++ unsigned offset, end; ++ unsigned i, nr_data, csum_granularity; ++ int ret = 0, idx; ++ ++ closure_init_stack(&cl); ++ ++ BUG_ON(!rbio->pick.has_ec); ++ ++ stripe_idx = rbio->pick.ec.idx; ++ ++ buf = kzalloc(sizeof(*buf), GFP_NOIO); ++ if (!buf) ++ return -ENOMEM; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, ++ POS(0, stripe_idx), ++ BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: stripe not found"); ++ kfree(buf); ++ return bch2_trans_exit(&trans) ?: -EIO; ++ } ++ ++ bkey_reassemble(&buf->key.k_i, k); ++ bch2_trans_exit(&trans); ++ ++ v = &buf->key.v; ++ ++ nr_data = v->nr_blocks - v->nr_redundant; ++ ++ idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); ++ BUG_ON(idx < 0); ++ ++ csum_granularity = 1U << v->csum_granularity_bits; ++ ++ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; ++ end = offset + bio_sectors(&rbio->bio); ++ ++ BUG_ON(end > le16_to_cpu(v->sectors)); ++ ++ buf->offset = round_down(offset, csum_granularity); ++ buf->size = min_t(unsigned, le16_to_cpu(v->sectors), ++ round_up(end, csum_granularity)) - buf->offset; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); ++ if (!buf->data[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ memset(buf->valid, 0xFF, sizeof(buf->valid)); ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ struct bch_extent_ptr *ptr = v->ptrs + i; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ptr_stale(ca, ptr)) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: stale pointer"); ++ clear_bit(i, buf->valid); ++ continue; ++ } ++ ++ ec_block_io(c, buf, REQ_OP_READ, i, &cl); ++ } ++ ++ 
closure_sync(&cl); ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ec_validate_checksums(c, buf); ++ ++ ret = ec_do_recov(c, buf); ++ if (ret) ++ goto err; ++ ++ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, ++ buf->data[idx] + ((offset - buf->offset) << 9)); ++err: ++ for (i = 0; i < v->nr_blocks; i++) ++ kfree(buf->data[i]); ++ kfree(buf); ++ return ret; ++} ++ ++/* stripe bucket accounting: */ ++ ++static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) ++{ ++ ec_stripes_heap n, *h = &c->ec_stripes_heap; ++ ++ if (idx >= h->size) { ++ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) ++ return -ENOMEM; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ if (n.size > h->size) { ++ memcpy(n.data, h->data, h->used * sizeof(h->data[0])); ++ n.used = h->used; ++ swap(*h, n); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ free_heap(&n); ++ } ++ ++ if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) ++ return -ENOMEM; ++ ++ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && ++ !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int ec_stripe_mem_alloc(struct bch_fs *c, ++ struct btree_iter *iter) ++{ ++ size_t idx = iter->pos.offset; ++ int ret = 0; ++ ++ if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) ++ return ret; ++ ++ bch2_trans_unlock(iter->trans); ++ ret = -EINTR; ++ ++ if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) ++ return ret; ++ ++ return -ENOMEM; ++} ++ ++static ssize_t stripe_idx_to_delete(struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ ++ return h->used && h->data[0].blocks_nonempty == 0 ++ ? h->data[0].idx : -1; ++} ++ ++static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, ++ struct ec_stripe_heap_entry l, ++ struct ec_stripe_heap_entry r) ++{ ++ return ((l.blocks_nonempty > r.blocks_nonempty) - ++ (l.blocks_nonempty < r.blocks_nonempty)); ++} ++ ++static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, ++ size_t i) ++{ ++ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); ++ ++ genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; ++} ++ ++static void heap_verify_backpointer(struct bch_fs *c, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m = genradix_ptr(&c->stripes[0], idx); ++ ++ BUG_ON(!m->alive); ++ BUG_ON(m->heap_idx >= h->used); ++ BUG_ON(h->data[m->heap_idx].idx != idx); ++} ++ ++void bch2_stripes_heap_del(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ if (!m->on_heap) ++ return; ++ ++ m->on_heap = false; ++ ++ heap_verify_backpointer(c, idx); ++ ++ heap_del(&c->ec_stripes_heap, m->heap_idx, ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++} ++ ++void bch2_stripes_heap_insert(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ if (m->on_heap) ++ return; ++ ++ BUG_ON(heap_full(&c->ec_stripes_heap)); ++ ++ m->on_heap = true; ++ ++ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { ++ .idx = idx, ++ .blocks_nonempty = m->blocks_nonempty, ++ }), ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++} ++ ++void bch2_stripes_heap_update(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ size_t i; ++ ++ if (!m->on_heap) ++ return; ++ ++ heap_verify_backpointer(c, idx); ++ ++ h->data[m->heap_idx].blocks_nonempty = 
m->blocks_nonempty; ++ ++ i = m->heap_idx; ++ heap_sift_up(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ heap_sift_down(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++ ++ if (stripe_idx_to_delete(c) >= 0 && ++ !percpu_ref_is_dying(&c->writes)) ++ schedule_work(&c->ec_stripe_delete_work); ++} ++ ++/* stripe deletion */ ++ ++static int ec_stripe_delete(struct bch_fs *c, size_t idx) ++{ ++ //pr_info("deleting stripe %zu", idx); ++ return bch2_btree_delete_range(c, BTREE_ID_EC, ++ POS(0, idx), ++ POS(0, idx + 1), ++ NULL); ++} ++ ++static void ec_stripe_delete_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, ec_stripe_delete_work); ++ ssize_t idx; ++ ++ while (1) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ idx = stripe_idx_to_delete(c); ++ if (idx < 0) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ break; ++ } ++ ++ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ if (ec_stripe_delete(c, idx)) ++ break; ++ } ++} ++ ++/* stripe creation: */ ++ ++static int ec_stripe_bkey_insert(struct bch_fs *c, ++ struct bkey_i_stripe *stripe) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bpos start_pos = POS(0, c->ec_stripe_hint); ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { ++ if (start_pos.offset) { ++ start_pos = POS_MIN; ++ bch2_btree_iter_set_pos(iter, start_pos); ++ continue; ++ } ++ ++ ret = -ENOSPC; ++ break; ++ } ++ ++ if (bkey_deleted(k.k)) ++ goto found_slot; ++ } ++ ++ goto err; ++found_slot: ++ start_pos = iter->pos; ++ ++ ret = ec_stripe_mem_alloc(c, iter); ++ if (ret) ++ goto err; ++ ++ stripe->k.p = iter->pos; ++ ++ bch2_trans_update(&trans, iter, &stripe->k_i, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ bch2_trans_iter_put(&trans, iter); ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ c->ec_stripe_hint = ret ? 
start_pos.offset : start_pos.offset + 1; ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static void extent_stripe_ptr_add(struct bkey_s_extent e, ++ struct ec_stripe_buf *s, ++ struct bch_extent_ptr *ptr, ++ unsigned block) ++{ ++ struct bch_extent_stripe_ptr *dst = (void *) ptr; ++ union bch_extent_entry *end = extent_entry_last(e); ++ ++ memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); ++ e.k->u64s += sizeof(*dst) / sizeof(u64); ++ ++ *dst = (struct bch_extent_stripe_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, ++ .block = block, ++ .idx = s->key.k.p.offset, ++ }; ++} ++ ++static int ec_stripe_update_ptrs(struct bch_fs *c, ++ struct ec_stripe_buf *s, ++ struct bkey *pos) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_extent e; ++ struct bkey_on_stack sk; ++ int ret = 0, dev, idx; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ /* XXX this doesn't support the reflink btree */ ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ bkey_start_pos(pos), ++ BTREE_ITER_INTENT); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { ++ struct bch_extent_ptr *ptr, *ec_ptr = NULL; ++ ++ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ idx = extent_matches_stripe(c, &s->key.v, k); ++ if (idx < 0) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ dev = s->key.v.ptrs[idx].dev; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ e = bkey_i_to_s_extent(sk.k); ++ ++ bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); ++ ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); ++ BUG_ON(!ec_ptr); ++ ++ extent_stripe_ptr_add(e, s, ec_ptr, idx); ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); ++ bch2_trans_update(&trans, iter, sk.k, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ ++ return ret; ++} ++ ++/* ++ * data buckets of new stripe all written: create the stripe ++ */ ++static void ec_stripe_create(struct ec_stripe_new *s) ++{ ++ struct bch_fs *c = s->c; ++ struct open_bucket *ob; ++ struct bkey_i *k; ++ struct stripe *m; ++ struct bch_stripe *v = &s->stripe.key.v; ++ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; ++ struct closure cl; ++ int ret; ++ ++ BUG_ON(s->h->s == s); ++ ++ closure_init_stack(&cl); ++ ++ if (s->err) { ++ if (s->err != -EROFS) ++ bch_err(c, "error creating stripe: error writing data buckets"); ++ goto err; ++ } ++ ++ BUG_ON(!s->allocated); ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ goto err; ++ ++ BUG_ON(bitmap_weight(s->blocks_allocated, ++ s->blocks.nr) != s->blocks.nr); ++ ++ ec_generate_ec(&s->stripe); ++ ++ ec_generate_checksums(&s->stripe); ++ ++ /* write p/q: */ ++ for (i = nr_data; i < v->nr_blocks; i++) ++ ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); ++ ++ closure_sync(&cl); ++ ++ for (i = nr_data; i < v->nr_blocks; i++) ++ if (!test_bit(i, s->stripe.valid)) { ++ bch_err(c, "error creating stripe: error writing redundancy buckets"); ++ goto err_put_writes; ++ } ++ ++ ret = s->existing_stripe ++ ? 
bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, ++ NULL, NULL, BTREE_INSERT_NOFAIL) ++ : ec_stripe_bkey_insert(c, &s->stripe.key); ++ if (ret) { ++ bch_err(c, "error creating stripe: error creating stripe key"); ++ goto err_put_writes; ++ } ++ ++ for_each_keylist_key(&s->keys, k) { ++ ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); ++ if (ret) { ++ bch_err(c, "error creating stripe: error updating pointers"); ++ break; ++ } ++ } ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); ++#if 0 ++ pr_info("created a %s stripe %llu", ++ s->existing_stripe ? "existing" : "new", ++ s->stripe.key.k.p.offset); ++#endif ++ BUG_ON(m->on_heap); ++ bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++err_put_writes: ++ percpu_ref_put(&c->writes); ++err: ++ open_bucket_for_each(c, &s->blocks, ob, i) { ++ ob->ec = NULL; ++ __bch2_open_bucket_put(c, ob); ++ } ++ ++ bch2_open_buckets_put(c, &s->parity); ++ ++ bch2_keylist_free(&s->keys, s->inline_keys); ++ ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) ++ kvpfree(s->stripe.data[i], s->stripe.size << 9); ++ kfree(s); ++} ++ ++static void ec_stripe_create_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, ++ struct bch_fs, ec_stripe_create_work); ++ struct ec_stripe_new *s, *n; ++restart: ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) ++ if (!atomic_read(&s->pin)) { ++ list_del(&s->list); ++ mutex_unlock(&c->ec_stripe_new_lock); ++ ec_stripe_create(s); ++ goto restart; ++ } ++ mutex_unlock(&c->ec_stripe_new_lock); ++} ++ ++static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) ++{ ++ BUG_ON(atomic_read(&s->pin) <= 0); ++ ++ if (atomic_dec_and_test(&s->pin)) { ++ BUG_ON(!s->pending); ++ queue_work(system_long_wq, &c->ec_stripe_create_work); ++ } ++} ++ ++static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s = h->s; ++ ++ BUG_ON(!s->allocated && !s->err); ++ ++ h->s = NULL; ++ s->pending = true; ++ ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_add(&s->list, &c->ec_stripe_new_list); ++ mutex_unlock(&c->ec_stripe_new_lock); ++ ++ ec_stripe_new_put(c, s); ++} ++ ++/* have a full bucket - hand it off to be erasure coded: */ ++void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ if (ob->sectors_free) ++ s->err = -1; ++ ++ ec_stripe_new_put(c, s); ++} ++ ++void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ s->err = -EIO; ++} ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); ++ struct bch_dev *ca; ++ unsigned offset; ++ ++ if (!ob) ++ return NULL; ++ ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ offset = ca->mi.bucket_size - ob->sectors_free; ++ ++ return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); ++} ++ ++void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, ++ struct bpos pos, unsigned sectors) ++{ ++ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); ++ struct ec_stripe_new *ec; ++ ++ if (!ob) ++ return; ++ ++ //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); ++ ++ ec = ob->ec; ++ mutex_lock(&ec->lock); ++ ++ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, ++ ARRAY_SIZE(ec->inline_keys), ++ BKEY_U64s)) { ++ BUG(); ++ } ++ ++ 
bkey_init(&ec->keys.top->k); ++ ec->keys.top->k.p = pos; ++ bch2_key_resize(&ec->keys.top->k, sectors); ++ bch2_keylist_push(&ec->keys); ++ ++ mutex_unlock(&ec->lock); ++} ++ ++static int unsigned_cmp(const void *_l, const void *_r) ++{ ++ unsigned l = *((const unsigned *) _l); ++ unsigned r = *((const unsigned *) _r); ++ ++ return cmp_int(l, r); ++} ++ ++/* pick most common bucket size: */ ++static unsigned pick_blocksize(struct bch_fs *c, ++ struct bch_devs_mask *devs) ++{ ++ struct bch_dev *ca; ++ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; ++ struct { ++ unsigned nr, size; ++ } cur = { 0, 0 }, best = { 0, 0 }; ++ ++ for_each_member_device_rcu(ca, c, i, devs) ++ sizes[nr++] = ca->mi.bucket_size; ++ ++ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); ++ ++ for (i = 0; i < nr; i++) { ++ if (sizes[i] != cur.size) { ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ cur.nr = 0; ++ cur.size = sizes[i]; ++ } ++ ++ cur.nr++; ++ } ++ ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ return best.size; ++} ++ ++static bool may_create_new_stripe(struct bch_fs *c) ++{ ++ return false; ++} ++ ++static void ec_stripe_key_init(struct bch_fs *c, ++ struct bkey_i_stripe *s, ++ unsigned nr_data, ++ unsigned nr_parity, ++ unsigned stripe_size) ++{ ++ unsigned u64s; ++ ++ bkey_stripe_init(&s->k_i); ++ s->v.sectors = cpu_to_le16(stripe_size); ++ s->v.algorithm = 0; ++ s->v.nr_blocks = nr_data + nr_parity; ++ s->v.nr_redundant = nr_parity; ++ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); ++ s->v.csum_type = BCH_CSUM_CRC32C; ++ s->v.pad = 0; ++ ++ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { ++ BUG_ON(1 << s->v.csum_granularity_bits >= ++ le16_to_cpu(s->v.sectors) || ++ s->v.csum_granularity_bits == U8_MAX); ++ s->v.csum_granularity_bits++; ++ } ++ ++ set_bkey_val_u64s(&s->k, u64s); ++} ++ ++static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s; ++ unsigned i; ++ ++ lockdep_assert_held(&h->lock); ++ ++ s = kzalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ mutex_init(&s->lock); ++ atomic_set(&s->pin, 1); ++ s->c = c; ++ s->h = h; ++ s->nr_data = min_t(unsigned, h->nr_active_devs, ++ EC_STRIPE_MAX) - h->redundancy; ++ s->nr_parity = h->redundancy; ++ ++ bch2_keylist_init(&s->keys, s->inline_keys); ++ ++ s->stripe.offset = 0; ++ s->stripe.size = h->blocksize; ++ memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); ++ ++ ec_stripe_key_init(c, &s->stripe.key, s->nr_data, ++ s->nr_parity, h->blocksize); ++ ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { ++ s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); ++ if (!s->stripe.data[i]) ++ goto err; ++ } ++ ++ h->s = s; ++ ++ return 0; ++err: ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) ++ kvpfree(s->stripe.data[i], s->stripe.size << 9); ++ kfree(s); ++ return -ENOMEM; ++} ++ ++static struct ec_stripe_head * ++ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, ++ unsigned algo, unsigned redundancy) ++{ ++ struct ec_stripe_head *h; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ h = kzalloc(sizeof(*h), GFP_KERNEL); ++ if (!h) ++ return NULL; ++ ++ mutex_init(&h->lock); ++ mutex_lock(&h->lock); ++ ++ h->target = target; ++ h->algo = algo; ++ h->redundancy = redundancy; ++ ++ rcu_read_lock(); ++ h->devs = target_rw_devs(c, BCH_DATA_user, target); ++ ++ for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (!ca->mi.durability) ++ __clear_bit(i, h->devs.d); ++ ++ h->blocksize = pick_blocksize(c, &h->devs); ++ ++ 
for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (ca->mi.bucket_size == h->blocksize) ++ h->nr_active_devs++; ++ ++ rcu_read_unlock(); ++ list_add(&h->list, &c->ec_stripe_head_list); ++ return h; ++} ++ ++void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ if (h->s && ++ h->s->allocated && ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->blocks.nr) == h->s->blocks.nr) ++ ec_stripe_set_pending(c, h); ++ ++ mutex_unlock(&h->lock); ++} ++ ++struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy) ++{ ++ struct ec_stripe_head *h; ++ ++ if (!redundancy) ++ return NULL; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) ++ if (h->target == target && ++ h->algo == algo && ++ h->redundancy == redundancy) { ++ mutex_lock(&h->lock); ++ goto found; ++ } ++ ++ h = ec_new_stripe_head_alloc(c, target, algo, redundancy); ++found: ++ mutex_unlock(&c->ec_stripe_head_lock); ++ return h; ++} ++ ++/* ++ * XXX: use a higher watermark for allocating open buckets here: ++ */ ++static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ unsigned i, nr_have, nr_data = ++ min_t(unsigned, h->nr_active_devs, ++ EC_STRIPE_MAX) - h->redundancy; ++ bool have_cache = true; ++ int ret = 0; ++ ++ devs = h->devs; ++ ++ for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { ++ __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); ++ --nr_data; ++ } ++ ++ BUG_ON(h->s->blocks.nr > nr_data); ++ BUG_ON(h->s->parity.nr > h->redundancy); ++ ++ open_bucket_for_each(c, &h->s->parity, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ open_bucket_for_each(c, &h->s->blocks, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ ++ percpu_down_read(&c->mark_lock); ++ rcu_read_lock(); ++ ++ if (h->s->parity.nr < h->redundancy) { ++ nr_have = h->s->parity.nr; ++ ++ ret = bch2_bucket_alloc_set(c, &h->s->parity, ++ &h->parity_stripe, ++ &devs, ++ h->redundancy, ++ &nr_have, ++ &have_cache, ++ RESERVE_NONE, ++ 0, ++ NULL); ++ if (ret) ++ goto err; ++ } ++ ++ if (h->s->blocks.nr < nr_data) { ++ nr_have = h->s->blocks.nr; ++ ++ ret = bch2_bucket_alloc_set(c, &h->s->blocks, ++ &h->block_stripe, ++ &devs, ++ nr_data, ++ &nr_have, ++ &have_cache, ++ RESERVE_NONE, ++ 0, ++ NULL); ++ if (ret) ++ goto err; ++ } ++err: ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ return ret; ++} ++ ++/* XXX: doesn't obey target: */ ++static s64 get_existing_stripe(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m; ++ size_t heap_idx; ++ u64 stripe_idx; ++ ++ if (may_create_new_stripe(c)) ++ return -1; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ for (heap_idx = 0; heap_idx < h->used; heap_idx++) { ++ if (!h->data[heap_idx].blocks_nonempty) ++ continue; ++ ++ stripe_idx = h->data[heap_idx].idx; ++ m = genradix_ptr(&c->stripes[0], stripe_idx); ++ ++ if (m->algorithm == algo && ++ m->nr_redundant == redundancy && ++ m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { ++ bch2_stripes_heap_del(c, m, stripe_idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ return stripe_idx; ++ } ++ } ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ return -1; ++} ++ ++static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ 
++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (!ret) ++ bkey_reassemble(&stripe->key.k_i, k); ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy) ++{ ++ struct closure cl; ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ unsigned i, data_idx = 0; ++ s64 idx; ++ ++ closure_init_stack(&cl); ++ ++ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); ++ if (!h) ++ return NULL; ++ ++ if (!h->s && ec_new_stripe_alloc(c, h)) { ++ bch2_ec_stripe_head_put(c, h); ++ return NULL; ++ } ++ ++ if (!h->s->allocated) { ++ if (!h->s->existing_stripe && ++ (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { ++ //pr_info("got existing stripe %llu", idx); ++ ++ h->s->existing_stripe = true; ++ h->s->existing_stripe_idx = idx; ++ if (get_stripe_key(c, idx, &h->s->stripe)) { ++ /* btree error */ ++ BUG(); ++ } ++ ++ for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) ++ if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { ++ __set_bit(i, h->s->blocks_allocated); ++ ec_block_io(c, &h->s->stripe, READ, i, &cl); ++ } ++ } ++ ++ if (new_stripe_alloc_buckets(c, h)) { ++ bch2_ec_stripe_head_put(c, h); ++ h = NULL; ++ goto out; ++ } ++ ++ open_bucket_for_each(c, &h->s->blocks, ob, i) { ++ data_idx = find_next_zero_bit(h->s->blocks_allocated, ++ h->s->nr_data, data_idx); ++ BUG_ON(data_idx >= h->s->nr_data); ++ ++ h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; ++ h->s->data_block_idx[i] = data_idx; ++ data_idx++; ++ } ++ ++ open_bucket_for_each(c, &h->s->parity, ob, i) ++ h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; ++ ++ //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); ++ h->s->allocated = true; ++ } ++out: ++ closure_sync(&cl); ++ return h; ++} ++ ++void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { ++ ++ mutex_lock(&h->lock); ++ if (!h->s) ++ goto unlock; ++ ++ open_bucket_for_each(c, &h->s->blocks, ob, i) ++ if (ob->ptr.dev == ca->dev_idx) ++ goto found; ++ open_bucket_for_each(c, &h->s->parity, ob, i) ++ if (ob->ptr.dev == ca->dev_idx) ++ goto found; ++ goto unlock; ++found: ++ h->s->err = -EROFS; ++ ec_stripe_set_pending(c, h); ++unlock: ++ mutex_unlock(&h->lock); ++ } ++ mutex_unlock(&c->ec_stripe_head_lock); ++} ++ ++static int __bch2_stripe_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct stripe *m, ++ size_t idx, ++ struct bkey_i_stripe *new_key) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ unsigned i; ++ int ret; ++ ++ bch2_btree_iter_set_pos(iter, POS(0, idx)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_stripe) ++ return -EIO; ++ ++ bkey_reassemble(&new_key->k_i, k); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ for (i = 0; i < new_key->v.nr_blocks; i++) ++ stripe_blockcount_set(&new_key->v, i, ++ m->block_sectors[i]); ++ m->dirty = false; ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ bch2_trans_update(trans, iter, &new_key->k_i, 0); ++ return 0; ++} ++ ++int bch2_stripes_write(struct bch_fs *c, unsigned flags, bool *wrote) ++{ ++ struct btree_trans trans; ++ struct 
btree_iter *iter; ++ struct genradix_iter giter; ++ struct bkey_i_stripe *new_key; ++ struct stripe *m; ++ int ret = 0; ++ ++ new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); ++ BUG_ON(!new_key); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ genradix_for_each(&c->stripes[0], giter, m) { ++ if (!m->dirty) ++ continue; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|flags, ++ __bch2_stripe_write_key(&trans, iter, m, ++ giter.pos, new_key)); ++ ++ if (ret) ++ break; ++ ++ *wrote = true; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ kfree(new_key); ++ ++ return ret; ++} ++ ++static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k) ++{ ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_stripe) { ++ struct stripe *m; ++ ++ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: ++ bch2_mark_key(c, k, 0, 0, NULL, 0, ++ BTREE_TRIGGER_ALLOC_READ| ++ BTREE_TRIGGER_NOATOMIC); ++ if (ret) ++ return ret; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ m = genradix_ptr(&c->stripes[0], k.k->p.offset); ++ bch2_stripes_heap_insert(c, m, k.k->p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ ++ return ret; ++} ++ ++int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) ++{ ++ int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, ++ NULL, bch2_stripes_read_fn); ++ if (ret) ++ bch_err(c, "error reading stripes: %i", ret); ++ ++ return ret; ++} ++ ++int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ size_t i, idx = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); ++ ++ k = bch2_btree_iter_prev(iter); ++ if (!IS_ERR_OR_NULL(k.k)) ++ idx = k.k->p.offset + 1; ++ ret = bch2_trans_exit(&trans); ++ if (ret) ++ return ret; ++ ++ if (!idx) ++ return 0; ++ ++ if (!gc && ++ !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), ++ GFP_KERNEL)) ++ return -ENOMEM; ++#if 0 ++ ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); ++#else ++ for (i = 0; i < idx; i++) ++ if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) ++ return -ENOMEM; ++#endif ++ return 0; ++} ++ ++void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m; ++ size_t i; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ for (i = 0; i < min(h->used, 20UL); i++) { ++ m = genradix_ptr(&c->stripes[0], h->data[i].idx); ++ ++ pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, ++ h->data[i].blocks_nonempty, ++ m->nr_blocks - m->nr_redundant, ++ m->nr_redundant); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++} ++ ++void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ struct ec_stripe_new *s; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { ++ pr_buf(out, "target %u algo %u redundancy %u:\n", ++ h->target, h->algo, h->redundancy); ++ ++ if (h->s) ++ pr_buf(out, "\tpending: blocks %u allocated %u\n", ++ h->s->blocks.nr, ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->blocks.nr)); ++ } ++ mutex_unlock(&c->ec_stripe_head_lock); ++ ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry(s, &c->ec_stripe_new_list, list) { ++ pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", ++ 
s->blocks.nr, ++ bitmap_weight(s->blocks_allocated, ++ s->blocks.nr), ++ atomic_read(&s->pin)); ++ } ++ mutex_unlock(&c->ec_stripe_new_lock); ++} ++ ++void bch2_fs_ec_exit(struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ ++ while (1) { ++ mutex_lock(&c->ec_stripe_head_lock); ++ h = list_first_entry_or_null(&c->ec_stripe_head_list, ++ struct ec_stripe_head, list); ++ if (h) ++ list_del(&h->list); ++ mutex_unlock(&c->ec_stripe_head_lock); ++ if (!h) ++ break; ++ ++ BUG_ON(h->s); ++ kfree(h); ++ } ++ ++ BUG_ON(!list_empty(&c->ec_stripe_new_list)); ++ ++ free_heap(&c->ec_stripes_heap); ++ genradix_free(&c->stripes[0]); ++ bioset_exit(&c->ec_bioset); ++} ++ ++int bch2_fs_ec_init(struct bch_fs *c) ++{ ++ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); ++ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); ++ ++ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), ++ BIOSET_NEED_BVECS); ++} +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +new file mode 100644 +index 000000000000..f8fc3d616cd7 +--- /dev/null ++++ b/fs/bcachefs/ec.h +@@ -0,0 +1,169 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_H ++#define _BCACHEFS_EC_H ++ ++#include "ec_types.h" ++#include "keylist_types.h" ++ ++const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++#define bch2_bkey_ops_stripe (struct bkey_ops) { \ ++ .key_invalid = bch2_stripe_invalid, \ ++ .val_to_text = bch2_stripe_to_text, \ ++ .swab = bch2_ptr_swab, \ ++} ++ ++static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(le16_to_cpu(s->sectors), ++ 1 << s->csum_granularity_bits); ++} ++ ++static inline unsigned stripe_csum_offset(const struct bch_stripe *s, ++ unsigned dev, unsigned csum_idx) ++{ ++ unsigned csum_bytes = bch_crc_bytes[s->csum_type]; ++ ++ return sizeof(struct bch_stripe) + ++ sizeof(struct bch_extent_ptr) * s->nr_blocks + ++ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; ++} ++ ++static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return stripe_csum_offset(s, s->nr_blocks, 0) + ++ sizeof(u16) * idx; ++} ++ ++static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); ++} ++ ++static inline void stripe_blockcount_set(struct bch_stripe *s, ++ unsigned idx, unsigned v) ++{ ++ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); ++ ++ *p = cpu_to_le16(v); ++} ++ ++static inline unsigned stripe_val_u64s(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), ++ sizeof(u64)); ++} ++ ++static inline void *stripe_csum(struct bch_stripe *s, ++ unsigned dev, unsigned csum_idx) ++{ ++ return (void *) s + stripe_csum_offset(s, dev, csum_idx); ++} ++ ++struct bch_read_bio; ++ ++struct ec_stripe_buf { ++ /* might not be buffering the entire stripe: */ ++ unsigned offset; ++ unsigned size; ++ unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; ++ ++ void *data[EC_STRIPE_MAX]; ++ ++ union { ++ struct bkey_i_stripe key; ++ u64 pad[255]; ++ }; ++}; ++ ++struct ec_stripe_head; ++ ++struct ec_stripe_new { ++ struct bch_fs *c; ++ struct ec_stripe_head *h; ++ struct mutex lock; ++ struct list_head list; ++ ++ /* counts in flight writes, stripe is created when pin == 0 */ ++ atomic_t pin; ++ ++ int err; ++ ++ u8 nr_data; ++ u8 nr_parity; ++ bool 
allocated; ++ bool pending; ++ bool existing_stripe; ++ u64 existing_stripe_idx; ++ ++ unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; ++ ++ struct open_buckets blocks; ++ u8 data_block_idx[EC_STRIPE_MAX]; ++ struct open_buckets parity; ++ ++ struct keylist keys; ++ u64 inline_keys[BKEY_U64s * 8]; ++ ++ struct ec_stripe_buf stripe; ++}; ++ ++struct ec_stripe_head { ++ struct list_head list; ++ struct mutex lock; ++ ++ unsigned target; ++ unsigned algo; ++ unsigned redundancy; ++ ++ struct bch_devs_mask devs; ++ unsigned nr_active_devs; ++ ++ unsigned blocksize; ++ ++ struct dev_stripe_state block_stripe; ++ struct dev_stripe_state parity_stripe; ++ ++ struct ec_stripe_new *s; ++}; ++ ++int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); ++void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, ++ struct bpos, unsigned); ++ ++void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); ++void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); ++ ++int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); ++ ++void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, ++ unsigned, unsigned); ++ ++void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); ++void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); ++void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); ++ ++void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); ++ ++void bch2_ec_flush_new_stripes(struct bch_fs *); ++ ++struct journal_keys; ++int bch2_stripes_read(struct bch_fs *, struct journal_keys *); ++int bch2_stripes_write(struct bch_fs *, unsigned, bool *); ++ ++int bch2_ec_mem_alloc(struct bch_fs *, bool); ++ ++void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); ++void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_fs_ec_exit(struct bch_fs *); ++int bch2_fs_ec_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_EC_H */ +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +new file mode 100644 +index 000000000000..e4d633fca5bf +--- /dev/null ++++ b/fs/bcachefs/ec_types.h +@@ -0,0 +1,39 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_TYPES_H ++#define _BCACHEFS_EC_TYPES_H ++ ++#include ++ ++#define EC_STRIPE_MAX 16 ++ ++struct bch_replicas_padded { ++ struct bch_replicas_entry e; ++ u8 pad[EC_STRIPE_MAX]; ++}; ++ ++struct stripe { ++ size_t heap_idx; ++ ++ u16 sectors; ++ u8 algorithm; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; ++ unsigned dirty:1; ++ unsigned on_heap:1; ++ u8 blocks_nonempty; ++ u16 block_sectors[EC_STRIPE_MAX]; ++ ++ struct bch_replicas_padded r; ++}; ++ ++struct ec_stripe_heap_entry { ++ size_t idx; ++ unsigned blocks_nonempty; ++}; ++ ++typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; ++ ++#endif /* _BCACHEFS_EC_TYPES_H */ +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +new file mode 100644 +index 000000000000..cd46706fb6f5 +--- /dev/null ++++ b/fs/bcachefs/error.c +@@ -0,0 +1,172 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "error.h" ++#include "io.h" ++#include "super.h" ++ ++#define FSCK_ERR_RATELIMIT_NR 10 ++ ++bool bch2_inconsistent_error(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_ERROR, &c->flags); ++ ++ switch (c->opts.errors) { ++ case BCH_ON_ERROR_CONTINUE: ++ return false; ++ 
case BCH_ON_ERROR_RO: ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only"); ++ return true; ++ case BCH_ON_ERROR_PANIC: ++ panic(bch2_fmt(c, "panic after error")); ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_fatal_error(struct bch_fs *c) ++{ ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only"); ++} ++ ++void bch2_io_error_work(struct work_struct *work) ++{ ++ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); ++ struct bch_fs *c = ca->fs; ++ bool dev; ++ ++ down_write(&c->state_lock); ++ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, ++ BCH_FORCE_IF_DEGRADED); ++ if (dev ++ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, ++ BCH_FORCE_IF_DEGRADED) ++ : bch2_fs_emergency_read_only(c)) ++ bch_err(ca, ++ "too many IO errors, setting %s RO", ++ dev ? "device" : "filesystem"); ++ up_write(&c->state_lock); ++} ++ ++void bch2_io_error(struct bch_dev *ca) ++{ ++ //queue_work(system_long_wq, &ca->io_error_work); ++} ++ ++#ifdef __KERNEL__ ++#define ask_yn() false ++#else ++#include "tools-util.h" ++#endif ++ ++enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, ++ const char *fmt, ...) ++{ ++ struct fsck_err_state *s = NULL; ++ va_list args; ++ bool fix = false, print = true, suppressing = false; ++ char _buf[sizeof(s->buf)], *buf = _buf; ++ ++ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { ++ va_start(args, fmt); ++ vprintk(fmt, args); ++ va_end(args); ++ ++ return bch2_inconsistent_error(c) ++ ? FSCK_ERR_EXIT ++ : FSCK_ERR_FIX; ++ } ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry(s, &c->fsck_errors, list) ++ if (s->fmt == fmt) ++ goto found; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS); ++ if (!s) { ++ if (!c->fsck_alloc_err) ++ bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); ++ c->fsck_alloc_err = true; ++ buf = _buf; ++ goto print; ++ } ++ ++ INIT_LIST_HEAD(&s->list); ++ s->fmt = fmt; ++found: ++ list_move(&s->list, &c->fsck_errors); ++ s->nr++; ++ if (c->opts.ratelimit_errors && ++ s->nr >= FSCK_ERR_RATELIMIT_NR) { ++ if (s->nr == FSCK_ERR_RATELIMIT_NR) ++ suppressing = true; ++ else ++ print = false; ++ } ++ buf = s->buf; ++print: ++ va_start(args, fmt); ++ vscnprintf(buf, sizeof(_buf), fmt, args); ++ va_end(args); ++ ++ if (c->opts.fix_errors == FSCK_OPT_EXIT) { ++ bch_err(c, "%s, exiting", buf); ++ } else if (flags & FSCK_CAN_FIX) { ++ if (c->opts.fix_errors == FSCK_OPT_ASK) { ++ printk(KERN_ERR "%s: fix?", buf); ++ fix = ask_yn(); ++ } else if (c->opts.fix_errors == FSCK_OPT_YES || ++ (c->opts.nochanges && ++ !(flags & FSCK_CAN_IGNORE))) { ++ if (print) ++ bch_err(c, "%s, fixing", buf); ++ fix = true; ++ } else { ++ if (print) ++ bch_err(c, "%s, not fixing", buf); ++ fix = false; ++ } ++ } else if (flags & FSCK_NEED_FSCK) { ++ if (print) ++ bch_err(c, "%s (run fsck to correct)", buf); ++ } else { ++ if (print) ++ bch_err(c, "%s (repair unimplemented)", buf); ++ } ++ ++ if (suppressing) ++ bch_err(c, "Ratelimiting new instances of previous error"); ++ ++ mutex_unlock(&c->fsck_error_lock); ++ ++ if (fix) { ++ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); ++ return FSCK_ERR_FIX; ++ } else { ++ set_bit(BCH_FS_ERROR, &c->flags); ++ return c->opts.fix_errors == FSCK_OPT_EXIT || ++ !(flags & FSCK_CAN_IGNORE) ++ ? 
FSCK_ERR_EXIT ++ : FSCK_ERR_IGNORE; ++ } ++} ++ ++void bch2_flush_fsck_errs(struct bch_fs *c) ++{ ++ struct fsck_err_state *s, *n; ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { ++ if (s->ratelimited) ++ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); ++ ++ list_del(&s->list); ++ kfree(s); ++ } ++ ++ mutex_unlock(&c->fsck_error_lock); ++} +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +new file mode 100644 +index 000000000000..94b53312fbbd +--- /dev/null ++++ b/fs/bcachefs/error.h +@@ -0,0 +1,211 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERROR_H ++#define _BCACHEFS_ERROR_H ++ ++#include ++#include ++ ++struct bch_dev; ++struct bch_fs; ++struct work_struct; ++ ++/* ++ * XXX: separate out errors that indicate on disk data is inconsistent, and flag ++ * superblock as such ++ */ ++ ++/* Error messages: */ ++ ++/* ++ * Inconsistency errors: The on disk data is inconsistent. If these occur during ++ * initial recovery, they don't indicate a bug in the running code - we walk all ++ * the metadata before modifying anything. If they occur at runtime, they ++ * indicate either a bug in the running code or (less likely) data is being ++ * silently corrupted under us. ++ * ++ * XXX: audit all inconsistent errors and make sure they're all recoverable, in ++ * BCH_ON_ERROR_CONTINUE mode ++ */ ++ ++bool bch2_inconsistent_error(struct bch_fs *); ++ ++#define bch2_fs_inconsistent(c, ...) \ ++({ \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_inconsistent_error(c); \ ++}) ++ ++#define bch2_fs_inconsistent_on(cond, c, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_fs_inconsistent(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Later we might want to mark only the particular device inconsistent, not the ++ * entire filesystem: ++ */ ++ ++#define bch2_dev_inconsistent(ca, ...) \ ++do { \ ++ bch_err(ca, __VA_ARGS__); \ ++ bch2_inconsistent_error((ca)->fs); \ ++} while (0) ++ ++#define bch2_dev_inconsistent_on(cond, ca, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_inconsistent(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Fsck errors: inconsistency errors we detect at mount time, and should ideally ++ * be able to repair: ++ */ ++ ++enum { ++ BCH_FSCK_OK = 0, ++ BCH_FSCK_ERRORS_NOT_FIXED = 1, ++ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, ++ BCH_FSCK_REPAIR_IMPOSSIBLE = 3, ++ BCH_FSCK_UNKNOWN_VERSION = 4, ++}; ++ ++enum fsck_err_opts { ++ FSCK_OPT_EXIT, ++ FSCK_OPT_YES, ++ FSCK_OPT_NO, ++ FSCK_OPT_ASK, ++}; ++ ++enum fsck_err_ret { ++ FSCK_ERR_IGNORE = 0, ++ FSCK_ERR_FIX = 1, ++ FSCK_ERR_EXIT = 2, ++}; ++ ++struct fsck_err_state { ++ struct list_head list; ++ const char *fmt; ++ u64 nr; ++ bool ratelimited; ++ char buf[512]; ++}; ++ ++#define FSCK_CAN_FIX (1 << 0) ++#define FSCK_CAN_IGNORE (1 << 1) ++#define FSCK_NEED_FSCK (1 << 2) ++ ++__printf(3, 4) __cold ++enum fsck_err_ret bch2_fsck_err(struct bch_fs *, ++ unsigned, const char *, ...); ++void bch2_flush_fsck_errs(struct bch_fs *); ++ ++#define __fsck_err(c, _flags, msg, ...) \ ++({ \ ++ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ ++ \ ++ if (_fix == FSCK_ERR_EXIT) { \ ++ bch_err(c, "Unable to continue, halting"); \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ \ ++ _fix; \ ++}) ++ ++/* These macros return true if error should be fixed: */ ++ ++/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ ++ ++#define __fsck_err_on(cond, c, _flags, ...) \ ++ ((cond) ? 
__fsck_err(c, _flags, ##__VA_ARGS__) : false) ++ ++#define need_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define need_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++#define fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++/* ++ * Fatal errors: these don't indicate a bug, but we can't continue running in RW ++ * mode - pretty much just due to metadata IO errors: ++ */ ++ ++void bch2_fatal_error(struct bch_fs *); ++ ++#define bch2_fs_fatal_error(c, ...) \ ++do { \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_fatal_error(c); \ ++} while (0) ++ ++#define bch2_fs_fatal_err_on(cond, c, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_fs_fatal_error(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * IO errors: either recoverable metadata IO (because we have replicas), or data ++ * IO - we need to log it and print out a message, but we don't (necessarily) ++ * want to shut down the fs: ++ */ ++ ++void bch2_io_error_work(struct work_struct *); ++ ++/* Does the error handling without logging a message */ ++void bch2_io_error(struct bch_dev *); ++ ++/* Logs message and handles the error: */ ++#define bch2_dev_io_error(ca, fmt, ...) \ ++do { \ ++ printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ ++ "IO error on %s for " fmt), \ ++ (ca)->name, ##__VA_ARGS__); \ ++ bch2_io_error(ca); \ ++} while (0) ++ ++#define bch2_dev_io_err_on(cond, ca, ...) \ ++({ \ ++ bool _ret = (cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_io_error(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* kill? */ ++ ++#define __bcache_io_error(c, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt(c, \ ++ "IO error: " fmt), ##__VA_ARGS__) ++ ++#define bcache_io_error(c, bio, fmt, ...) 
\ ++do { \ ++ __bcache_io_error(c, fmt, ##__VA_ARGS__); \ ++ (bio)->bi_status = BLK_STS_IOERR; \ ++} while (0) ++ ++#endif /* _BCACHEFS_ERROR_H */ +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +new file mode 100644 +index 000000000000..fd011df3cb99 +--- /dev/null ++++ b/fs/bcachefs/extent_update.c +@@ -0,0 +1,229 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "debug.h" ++#include "extents.h" ++#include "extent_update.h" ++ ++/* ++ * This counts the number of iterators to the alloc & ec btrees we'll need ++ * inserting/removing this extent: ++ */ ++static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ unsigned ret = 0; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ret++; ++ } ++ } ++ ++ return ret; ++} ++ ++static int count_iters_for_insert(struct btree_trans *trans, ++ struct bkey_s_c k, ++ unsigned offset, ++ struct bpos *end, ++ unsigned *nr_iters, ++ unsigned max_iters) ++{ ++ int ret = 0, ret2 = 0; ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ break; ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx = le64_to_cpu(p.v->idx); ++ unsigned sectors = bpos_min(*end, p.k->p).offset - ++ bkey_start_offset(p.k); ++ struct btree_iter *iter; ++ struct bkey_s_c r_k; ++ ++ for_each_btree_key(trans, iter, ++ BTREE_ID_REFLINK, POS(0, idx + offset), ++ BTREE_ITER_SLOTS, r_k, ret2) { ++ if (bkey_cmp(bkey_start_pos(r_k.k), ++ POS(0, idx + sectors)) >= 0) ++ break; ++ ++ /* extent_update_to_keys(), for the reflink_v update */ ++ *nr_iters += 1; ++ ++ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); ++ ++ if (*nr_iters >= max_iters) { ++ struct bpos pos = bkey_start_pos(k.k); ++ pos.offset += min_t(u64, k.k->size, ++ r_k.k->p.offset - idx); ++ ++ *end = bpos_min(*end, pos); ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ break; ++ } ++ } ++ ++ return ret2 ?: ret; ++} ++ ++#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) ++ ++int bch2_extent_atomic_end(struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bpos *end) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey_packed *_k; ++ unsigned nr_iters = 0; ++ int ret; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ b = iter->l[0].b; ++ node_iter = iter->l[0].iter; ++ ++ BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && ++ bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_predecessor(b->data->min_key)) < 0); ++ ++ *end = bpos_min(insert->k.p, b->key.k.p); ++ ++ /* extent_update_to_keys(): */ ++ nr_iters += 1; ++ ++ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, ++ &nr_iters, EXTENT_ITERS_MAX / 2); ++ if (ret < 0) ++ return ret; ++ ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ++ unsigned offset = 0; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), *end) 
>= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_start_pos(k.k)) > 0) ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ ++ /* extent_handle_overwrites(): */ ++ switch (bch2_extent_overlap(&insert->k, k.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ case BCH_EXTENT_OVERLAP_FRONT: ++ nr_iters += 1; ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ nr_iters += 2; ++ break; ++ } ++ ++ ret = count_iters_for_insert(trans, k, offset, end, ++ &nr_iters, EXTENT_ITERS_MAX); ++ if (ret) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return ret < 0 ? ret : 0; ++} ++ ++int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(iter, k, &end); ++ if (ret) ++ return ret; ++ ++ bch2_cut_back(end, k); ++ return 0; ++} ++ ++int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(iter, k, &end); ++ if (ret) ++ return ret; ++ ++ return !bkey_cmp(end, k->k.p); ++} ++ ++enum btree_insert_ret ++bch2_extent_can_insert(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct btree_node_iter node_iter = l->iter; ++ struct bkey_packed *_k; ++ struct bkey_s_c k; ++ struct bkey unpacked; ++ int sectors; ++ ++ _k = bch2_btree_node_iter_peek(&node_iter, l->b); ++ if (!_k) ++ return BTREE_INSERT_OK; ++ ++ k = bkey_disassemble(l->b, _k, &unpacked); ++ ++ /* Check if we're splitting a compressed extent: */ ++ ++ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && ++ bkey_cmp(insert->k.p, k.k->p) < 0 && ++ (sectors = bch2_bkey_sectors_compressed(k))) { ++ int flags = trans->flags & BTREE_INSERT_NOFAIL ++ ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ ++ switch (bch2_disk_reservation_add(trans->c, trans->disk_res, ++ sectors, flags)) { ++ case 0: ++ break; ++ case -ENOSPC: ++ return BTREE_INSERT_ENOSPC; ++ default: ++ BUG(); ++ } ++ } ++ ++ return BTREE_INSERT_OK; ++} +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +new file mode 100644 +index 000000000000..38dc084627d2 +--- /dev/null ++++ b/fs/bcachefs/extent_update.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENT_UPDATE_H ++#define _BCACHEFS_EXTENT_UPDATE_H ++ ++#include "bcachefs.h" ++ ++int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, ++ struct bpos *); ++int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); ++int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); ++ ++enum btree_insert_ret ++bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *); ++ ++#endif /* _BCACHEFS_EXTENT_UPDATE_H */ +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +new file mode 100644 +index 000000000000..568f039edcff +--- /dev/null ++++ b/fs/bcachefs/extents.c +@@ -0,0 +1,1258 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * ++ * Code for managing the extent btree and dynamically updating the writeback ++ * dirty sector count. 
++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++static unsigned bch2_crc_field_size_max[] = { ++ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, ++}; ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *, ++ struct bch_extent_crc_unpacked, ++ enum bch_extent_entry_type); ++ ++static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, ++ unsigned dev) ++{ ++ struct bch_dev_io_failures *i; ++ ++ for (i = f->devs; i < f->devs + f->nr; i++) ++ if (i->dev == dev) ++ return i; ++ ++ return NULL; ++} ++ ++void bch2_mark_io_failure(struct bch_io_failures *failed, ++ struct extent_ptr_decoded *p) ++{ ++ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); ++ ++ if (!f) { ++ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); ++ ++ f = &failed->devs[failed->nr++]; ++ f->dev = p->ptr.dev; ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else if (p->idx != f->idx) { ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else { ++ f->nr_failed++; ++ } ++} ++ ++/* ++ * returns true if p1 is better than p2: ++ */ ++static inline bool ptr_better(struct bch_fs *c, ++ const struct extent_ptr_decoded p1, ++ const struct extent_ptr_decoded p2) ++{ ++ if (likely(!p1.idx && !p2.idx)) { ++ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); ++ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); ++ ++ u64 l1 = atomic64_read(&dev1->cur_latency[READ]); ++ u64 l2 = atomic64_read(&dev2->cur_latency[READ]); ++ ++ /* Pick at random, biased in favor of the faster device: */ ++ ++ return bch2_rand_range(l1 + l2) > l1; ++ } ++ ++ if (force_reconstruct_read(c)) ++ return p1.idx > p2.idx; ++ ++ return p1.idx < p2.idx; ++} ++ ++/* ++ * This picks a non-stale pointer, preferably from a device other than @avoid. ++ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to ++ * other devices, it will still pick a pointer from avoid. ++ */ ++int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_io_failures *failed, ++ struct extent_ptr_decoded *pick) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_dev_io_failures *f; ++ struct bch_dev *ca; ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_error) ++ return -EIO; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ /* ++ * If there are any dirty pointers it's an error if we can't ++ * read: ++ */ ++ if (!ret && !p.ptr.cached) ++ ret = -EIO; ++ ++ if (p.ptr.cached && ptr_stale(ca, &p.ptr)) ++ continue; ++ ++ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; ++ if (f) ++ p.idx = f->nr_failed < f->nr_retries ++ ? 
f->idx ++ : f->idx + 1; ++ ++ if (!p.idx && ++ !bch2_dev_is_readable(ca)) ++ p.idx++; ++ ++ if (force_reconstruct_read(c) && ++ !p.idx && p.has_ec) ++ p.idx++; ++ ++ if (p.idx >= (unsigned) p.has_ec + 1) ++ continue; ++ ++ if (ret > 0 && !ptr_better(c, p, *pick)) ++ continue; ++ ++ *pick = p; ++ ret = 1; ++ } ++ ++ return ret; ++} ++ ++/* KEY_TYPE_btree_ptr: */ ++ ++const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) ++ return "value too big"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ const char *err; ++ char buf[160]; ++ struct bucket_mark mark; ++ struct bch_dev *ca; ++ ++ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; ++ ++ if (!percpu_down_read_trylock(&c->mark_lock)) ++ return; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ mark = ptr_bucket_mark(ca, ptr); ++ ++ err = "stale"; ++ if (gen_after(mark.gen, ptr->gen)) ++ goto err; ++ ++ err = "inconsistent"; ++ if (mark.data_type != BCH_DATA_btree || ++ mark.dirty_sectors < c->opts.btree_node_size) ++ goto err; ++ } ++out: ++ percpu_up_read(&c->mark_lock); ++ return; ++err: ++ bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", ++ err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ PTR_BUCKET_NR(ca, ptr), ++ mark.gen, (unsigned) mark.v.counter); ++ goto out; ++} ++ ++void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ pr_buf(out, "seq %llx sectors %u written %u min_key ", ++ le64_to_cpu(bp.v->seq), ++ le16_to_cpu(bp.v->sectors), ++ le16_to_cpu(bp.v->sectors_written)); ++ ++ bch2_bpos_to_text(out, bp.v->min_key); ++ pr_buf(out, " "); ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s k) ++{ ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); ++ ++ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bp.v->min_key, POS_MIN)) ++ bp.v->min_key = write ++ ? bkey_predecessor(bp.v->min_key) ++ : bkey_successor(bp.v->min_key); ++} ++ ++/* KEY_TYPE_extent: */ ++ ++const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ char buf[160]; ++ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; ++ ++ if (!percpu_down_read_trylock(&c->mark_lock)) ++ return; ++ ++ extent_for_each_ptr_decode(e, p, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); ++ unsigned stale = gen_after(mark.gen, p.ptr.gen); ++ unsigned disk_sectors = ptr_disk_sectors(p); ++ unsigned mark_sectors = p.ptr.cached ++ ? 
mark.cached_sectors ++ : mark.dirty_sectors; ++ ++ bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, ++ "stale dirty pointer (ptr gen %u bucket %u", ++ p.ptr.gen, mark.gen); ++ ++ bch2_fs_inconsistent_on(stale > 96, c, ++ "key too stale: %i", stale); ++ ++ bch2_fs_inconsistent_on(!stale && ++ (mark.data_type != BCH_DATA_user || ++ mark_sectors < disk_sectors), c, ++ "extent pointer not marked: %s:\n" ++ "type %u sectors %u < %u", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), ++ mark.data_type, ++ mark_sectors, disk_sectors); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++} ++ ++void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++enum merge_result bch2_extent_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_extent l = bkey_s_to_extent(_l); ++ struct bkey_s_extent r = bkey_s_to_extent(_r); ++ union bch_extent_entry *en_l = l.v->start; ++ union bch_extent_entry *en_r = r.v->start; ++ struct bch_extent_crc_unpacked crc_l, crc_r; ++ ++ if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) ++ return BCH_MERGE_NOMERGE; ++ ++ crc_l = bch2_extent_crc_unpack(l.k, NULL); ++ ++ extent_for_each_entry(l, en_l) { ++ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ ++ if (extent_entry_type(en_l) != extent_entry_type(en_r)) ++ return BCH_MERGE_NOMERGE; ++ ++ switch (extent_entry_type(en_l)) { ++ case BCH_EXTENT_ENTRY_ptr: { ++ const struct bch_extent_ptr *lp = &en_l->ptr; ++ const struct bch_extent_ptr *rp = &en_r->ptr; ++ struct bch_dev *ca; ++ ++ if (lp->offset + crc_l.compressed_size != rp->offset || ++ lp->dev != rp->dev || ++ lp->gen != rp->gen) ++ return BCH_MERGE_NOMERGE; ++ ++ /* We don't allow extents to straddle buckets: */ ++ ca = bch_dev_bkey_exists(c, lp->dev); ++ ++ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) ++ return BCH_MERGE_NOMERGE; ++ ++ break; ++ } ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || ++ en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) ++ return BCH_MERGE_NOMERGE; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ if (crc_l.csum_type != crc_r.csum_type || ++ crc_l.compression_type != crc_r.compression_type || ++ crc_l.nonce != crc_r.nonce) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || ++ crc_r.offset) ++ return BCH_MERGE_NOMERGE; ++ ++ if (!bch2_checksum_mergeable(crc_l.csum_type)) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_is_compressed(crc_l)) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.csum_type && ++ crc_l.uncompressed_size + ++ crc_r.uncompressed_size > c->sb.encoded_extent_max) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.uncompressed_size + crc_r.uncompressed_size > ++ bch2_crc_field_size_max[extent_entry_type(en_l)]) ++ return BCH_MERGE_NOMERGE; ++ ++ break; ++ default: ++ return BCH_MERGE_NOMERGE; ++ } ++ } ++ ++ extent_for_each_entry(l, en_l) { ++ struct bch_extent_crc_unpacked crc_l, crc_r; ++ ++ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ ++ if (!extent_entry_is_crc(en_l)) ++ continue; ++ ++ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, ++ crc_l.csum, ++ crc_r.csum, ++ crc_r.uncompressed_size << 9); ++ ++ 
crc_l.uncompressed_size += crc_r.uncompressed_size; ++ crc_l.compressed_size += crc_r.compressed_size; ++ ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, ++ extent_entry_type(en_l)); ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) ++ return "incorrect value size"; ++ ++ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) ++ return "invalid nr_replicas"; ++ ++ return NULL; ++} ++ ++void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ pr_buf(out, "generation %u replicas %u", ++ le32_to_cpu(r.v->generation), ++ r.v->nr_replicas); ++} ++ ++enum merge_result bch2_reservation_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_reservation l = bkey_s_to_reservation(_l); ++ struct bkey_s_reservation r = bkey_s_to_reservation(_r); ++ ++ if (l.v->generation != r.v->generation || ++ l.v->nr_replicas != r.v->nr_replicas) ++ return BCH_MERGE_NOMERGE; ++ ++ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { ++ bch2_key_resize(l.k, KEY_SIZE_MAX); ++ bch2_cut_front_s(l.k->p, r.s); ++ return BCH_MERGE_PARTIAL; ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* Extent checksum entries: */ ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, ++ struct bch_extent_crc_unpacked r) ++{ ++ return (l.csum_type != r.csum_type || ++ l.compression_type != r.compression_type || ++ l.compressed_size != r.compressed_size || ++ l.uncompressed_size != r.uncompressed_size || ++ l.offset != r.offset || ++ l.live_size != r.live_size || ++ l.nonce != r.nonce || ++ bch2_crc_cmp(l.csum, r.csum)); ++} ++ ++static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, ++ struct bch_extent_crc_unpacked n) ++{ ++ return !crc_is_compressed(u) && ++ u.csum_type && ++ u.uncompressed_size > u.live_size && ++ bch2_csum_type_is_encryption(u.csum_type) == ++ bch2_csum_type_is_encryption(n.csum_type); ++} ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, ++ struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ if (!n.csum_type) ++ return false; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (can_narrow_crc(crc, n)) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * We're writing another replica for this extent, so while we've got the data in ++ * memory we'll be computing a new checksum for the currently live data. 
++ * ++ * If there are other replicas we aren't moving, and they are checksummed but ++ * not compressed, we can modify them to point to only the data that is ++ * currently live (so that readers won't have to bounce) while we've got the ++ * checksum we need: ++ */ ++bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked u; ++ struct extent_ptr_decoded p; ++ union bch_extent_entry *i; ++ bool ret = false; ++ ++ /* Find a checksum entry that covers only live data: */ ++ if (!n.csum_type) { ++ bkey_for_each_crc(&k->k, ptrs, u, i) ++ if (!crc_is_compressed(u) && ++ u.csum_type && ++ u.live_size == u.uncompressed_size) { ++ n = u; ++ goto found; ++ } ++ return false; ++ } ++found: ++ BUG_ON(crc_is_compressed(n)); ++ BUG_ON(n.offset); ++ BUG_ON(n.live_size != k->k.size); ++ ++restart_narrow_pointers: ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ ++ bkey_for_each_ptr_decode(&k->k, ptrs, p, i) ++ if (can_narrow_crc(p.crc, n)) { ++ bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); ++ p.ptr.offset += p.crc.offset; ++ p.crc = n; ++ bch2_extent_ptr_decoded_append(k, &p); ++ ret = true; ++ goto restart_narrow_pointers; ++ } ++ ++ return ret; ++} ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *dst, ++ struct bch_extent_crc_unpacked src, ++ enum bch_extent_entry_type type) ++{ ++#define set_common_fields(_dst, _src) \ ++ _dst.type = 1 << type; \ ++ _dst.csum_type = _src.csum_type, \ ++ _dst.compression_type = _src.compression_type, \ ++ _dst._compressed_size = _src.compressed_size - 1, \ ++ _dst._uncompressed_size = _src.uncompressed_size - 1, \ ++ _dst.offset = _src.offset ++ ++ switch (type) { ++ case BCH_EXTENT_ENTRY_crc32: ++ set_common_fields(dst->crc32, src); ++ dst->crc32.csum = *((__le32 *) &src.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ set_common_fields(dst->crc64, src); ++ dst->crc64.nonce = src.nonce; ++ dst->crc64.csum_lo = src.csum.lo; ++ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ set_common_fields(dst->crc128, src); ++ dst->crc128.nonce = src.nonce; ++ dst->crc128.csum = src.csum; ++ break; ++ default: ++ BUG(); ++ } ++#undef set_common_fields ++} ++ ++void bch2_extent_crc_append(struct bkey_i *k, ++ struct bch_extent_crc_unpacked new) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ union bch_extent_crc *crc = (void *) ptrs.end; ++ enum bch_extent_entry_type type; ++ ++ if (bch_crc_bytes[new.csum_type] <= 4 && ++ new.uncompressed_size <= CRC32_SIZE_MAX && ++ new.nonce <= CRC32_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc32; ++ else if (bch_crc_bytes[new.csum_type] <= 10 && ++ new.uncompressed_size <= CRC64_SIZE_MAX && ++ new.nonce <= CRC64_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc64; ++ else if (bch_crc_bytes[new.csum_type] <= 16 && ++ new.uncompressed_size <= CRC128_SIZE_MAX && ++ new.nonce <= CRC128_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc128; ++ else ++ BUG(); ++ ++ bch2_extent_crc_pack(crc, new, type); ++ ++ k->k.u64s += extent_entry_u64s(ptrs.end); ++ ++ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); ++} ++ ++/* Generic code for keys with pointers: */ ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) ++{ ++ return bch2_bkey_devs(k).nr; ++} ++ ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) ++{ ++ return k.k->type == KEY_TYPE_reservation ++ ? 
bkey_s_c_to_reservation(k).v->nr_replicas ++ : bch2_bkey_dirty_devs(k).nr; ++} ++ ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) ++{ ++ unsigned ret = 0; ++ ++ if (k.k->type == KEY_TYPE_reservation) { ++ ret = bkey_s_c_to_reservation(k).v->nr_replicas; ++ } else { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ ret += !p.ptr.cached && !crc_is_compressed(p.crc); ++ } ++ ++ return ret; ++} ++ ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned ret = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && crc_is_compressed(p.crc)) ++ ret += p.crc.compressed_size; ++ ++ return ret; ++} ++ ++bool bch2_bkey_is_incompressible(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, entry) ++ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) ++ return true; ++ return false; ++} ++ ++bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, ++ unsigned nr_replicas) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bpos end = pos; ++ struct bkey_s_c k; ++ bool ret = true; ++ int err; ++ ++ end.offset += size; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, ++ BTREE_ITER_SLOTS, k, err) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { ++ ret = false; ++ break; ++ } ++ } ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static unsigned bch2_extent_ptr_durability(struct bch_fs *c, ++ struct extent_ptr_decoded p) ++{ ++ unsigned durability = 0; ++ struct bch_dev *ca; ++ ++ if (p.ptr.cached) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_FAILED) ++ durability = max_t(unsigned, durability, ca->mi.durability); ++ ++ if (p.has_ec) { ++ struct stripe *s = ++ genradix_ptr(&c->stripes[0], p.ec.idx); ++ ++ if (WARN_ON(!s)) ++ goto out; ++ ++ durability += s->nr_redundant; ++ } ++out: ++ return durability; ++} ++ ++unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned durability = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ durability += bch2_extent_ptr_durability(c, p); ++ ++ return durability; ++} ++ ++void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, ++ unsigned target, ++ unsigned nr_desired_replicas) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; ++ ++ if (target && extra > 0) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int n = bch2_extent_ptr_durability(c, p); ++ ++ if (n && n <= extra && ++ !bch2_dev_in_target(c, p.ptr.dev, target)) { ++ entry->ptr.cached = true; ++ extra -= n; ++ } ++ } ++ ++ if (extra > 0) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int n = bch2_extent_ptr_durability(c, p); ++ ++ if (n && n <= extra) { ++ entry->ptr.cached = true; ++ extra -= n; ++ } ++ } ++} ++ ++void 
bch2_bkey_append_ptr(struct bkey_i *k, ++ struct bch_extent_ptr ptr) ++{ ++ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); ++ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ case KEY_TYPE_extent: ++ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); ++ ++ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ ++ memcpy((void *) &k->v + bkey_val_bytes(&k->k), ++ &ptr, ++ sizeof(ptr)); ++ k->u64s++; ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void __extent_entry_insert(struct bkey_i *k, ++ union bch_extent_entry *dst, ++ union bch_extent_entry *new) ++{ ++ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); ++ ++ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), ++ dst, (u64 *) end - (u64 *) dst); ++ k->k.u64s += extent_entry_u64s(new); ++ memcpy(dst, new, extent_entry_bytes(new)); ++} ++ ++void bch2_extent_ptr_decoded_append(struct bkey_i *k, ++ struct extent_ptr_decoded *p) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked crc = ++ bch2_extent_crc_unpack(&k->k, NULL); ++ union bch_extent_entry *pos; ++ ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = ptrs.start; ++ goto found; ++ } ++ ++ bkey_for_each_crc(&k->k, ptrs, crc, pos) ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = extent_entry_next(pos); ++ goto found; ++ } ++ ++ bch2_extent_crc_append(k, p->crc); ++ pos = bkey_val_end(bkey_i_to_s(k)); ++found: ++ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ptr)); ++ ++ if (p->has_ec) { ++ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ec)); ++ } ++} ++ ++static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, ++ union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *i = ptrs.start; ++ ++ if (i == entry) ++ return NULL; ++ ++ while (extent_entry_next(i) != entry) ++ i = extent_entry_next(i); ++ return i; ++} ++ ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *dst, *src, *prev; ++ bool drop_crc = true; ++ ++ EBUG_ON(ptr < &ptrs.start->ptr || ++ ptr >= &ptrs.end->ptr); ++ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); ++ ++ src = extent_entry_next(to_entry(ptr)); ++ if (src != ptrs.end && ++ !extent_entry_is_crc(src)) ++ drop_crc = false; ++ ++ dst = to_entry(ptr); ++ while ((prev = extent_entry_prev(ptrs, dst))) { ++ if (extent_entry_is_ptr(prev)) ++ break; ++ ++ if (extent_entry_is_crc(prev)) { ++ if (drop_crc) ++ dst = prev; ++ break; ++ } ++ ++ dst = prev; ++ } ++ ++ memmove_u64s_down(dst, src, ++ (u64 *) ptrs.end - (u64 *) src); ++ k.k->u64s -= (u64 *) src - (u64 *) dst; ++ ++ return dst; ++} ++ ++void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); ++} ++ ++const struct bch_extent_ptr * ++bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (ptr->dev == dev) ++ return ptr; ++ ++ return NULL; ++} ++ ++bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (bch2_dev_in_target(c, ptr->dev, target) && ++ (!ptr->cached || ++ 
!ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) ++ return true; ++ ++ return false; ++} ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_extent_ptr m, u64 offset) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (p.ptr.dev == m.dev && ++ p.ptr.gen == m.gen && ++ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == ++ (s64) m.offset - offset) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * bch_extent_normalize - clean up an extent, dropping stale pointers etc. ++ * ++ * Returns true if @k should be dropped entirely ++ * ++ * For existing keys, only called when btree nodes are being rewritten, not when ++ * they're merely being compacted/resorted in memory. ++ */ ++bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ++ ptr->cached && ++ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); ++ ++ /* will only happen if all pointers were cached: */ ++ if (!bch2_bkey_nr_ptrs(k.s_c)) ++ k.k->type = KEY_TYPE_discard; ++ ++ return bkey_whiteout(k.k); ++} ++ ++void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ const struct bch_extent_ptr *ptr; ++ const struct bch_extent_stripe_ptr *ec; ++ struct bch_dev *ca; ++ bool first = true; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (!first) ++ pr_buf(out, " "); ++ ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ ptr = entry_to_ptr(entry); ++ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ? bch_dev_bkey_exists(c, ptr->dev) ++ : NULL; ++ ++ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, ++ (u64) ptr->offset, ptr->gen, ++ ptr->cached ? " cached" : "", ++ ca && ptr_stale(ca, ptr) ++ ? 
" stale" : ""); ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", ++ crc.compressed_size, ++ crc.uncompressed_size, ++ crc.offset, crc.nonce, ++ crc.csum_type, ++ crc.compression_type); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ec = &entry->stripe_ptr; ++ ++ pr_buf(out, "ec: idx %llu block %u", ++ (u64) ec->idx, ec->block); ++ break; ++ default: ++ pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); ++ return; ++ } ++ ++ first = false; ++ } ++} ++ ++static const char *extent_ptr_invalid(const struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ unsigned size_ondisk, ++ bool metadata) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr2; ++ struct bch_dev *ca; ++ ++ if (!bch2_dev_exists2(c, ptr->dev)) ++ return "pointer to invalid device"; ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!ca) ++ return "pointer to invalid device"; ++ ++ bkey_for_each_ptr(ptrs, ptr2) ++ if (ptr != ptr2 && ptr->dev == ptr2->dev) ++ return "multiple pointers to same device"; ++ ++ if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) ++ return "offset past end of device"; ++ ++ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) ++ return "offset before first bucket"; ++ ++ if (bucket_remainder(ca, ptr->offset) + ++ size_ondisk > ca->mi.bucket_size) ++ return "spans multiple buckets"; ++ ++ return NULL; ++} ++ ++const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ unsigned size_ondisk = k.k->size; ++ const char *reason; ++ unsigned nonce = UINT_MAX; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr) ++ size_ondisk = c->opts.btree_node_size; ++ if (k.k->type == KEY_TYPE_btree_ptr_v2) ++ size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) ++ return "invalid extent entry type"; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr && ++ !extent_entry_is_ptr(entry)) ++ return "has non ptr field"; ++ ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ reason = extent_ptr_invalid(c, k, &entry->ptr, ++ size_ondisk, false); ++ if (reason) ++ return reason; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ if (crc.offset + crc.live_size > ++ crc.uncompressed_size) ++ return "checksum offset + key size > uncompressed size"; ++ ++ size_ondisk = crc.compressed_size; ++ ++ if (!bch2_checksum_type_valid(c, crc.csum_type)) ++ return "invalid checksum type"; ++ ++ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) ++ return "invalid compression type"; ++ ++ if (bch2_csum_type_is_encryption(crc.csum_type)) { ++ if (nonce == UINT_MAX) ++ nonce = crc.offset + crc.nonce; ++ else if (nonce != crc.offset + crc.nonce) ++ return "incorrect nonce"; ++ } ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++ ++ return NULL; ++} ++ ++void bch2_ptr_swab(struct bkey_s k) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ u64 *d; ++ ++ for (d = (u64 *) ptrs.start; ++ d != (u64 *) 
ptrs.end; ++ d++) ++ *d = swab64(*d); ++ ++ for (entry = ptrs.start; ++ entry < ptrs.end; ++ entry = extent_entry_next(entry)) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.csum = swab32(entry->crc32.csum); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); ++ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.csum.hi = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.hi); ++ entry->crc128.csum.lo = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++} ++ ++/* Generic extent code: */ ++ ++int bch2_cut_front_s(struct bpos where, struct bkey_s k) ++{ ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 sub; ++ ++ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) ++ return 0; ++ ++ EBUG_ON(bkey_cmp(where, k.k->p) > 0); ++ ++ sub = where.offset - bkey_start_offset(k.k); ++ ++ k.k->size -= sub; ++ ++ if (!k.k->size) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: { ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ bool seen_crc = false; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ if (!seen_crc) ++ entry->ptr.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ ++ if (extent_entry_is_crc(entry)) ++ seen_crc = true; ++ } ++ ++ break; ++ } ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); ++ ++ le64_add_cpu(&p.v->idx, sub); ++ break; ++ } ++ case KEY_TYPE_inline_data: { ++ struct bkey_s_inline_data d = bkey_s_to_inline_data(k); ++ ++ sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); ++ ++ memmove(d.v->data, ++ d.v->data + sub, ++ bkey_val_bytes(d.k) - sub); ++ ++ new_val_u64s -= sub >> 3; ++ break; ++ } ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; ++} ++ ++int bch2_cut_back_s(struct bpos where, struct bkey_s k) ++{ ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 len = 0; ++ ++ if (bkey_cmp(where, k.k->p) >= 0) ++ return 0; ++ ++ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); ++ ++ len = where.offset - bkey_start_offset(k.k); ++ ++ k.k->p = where; ++ k.k->size = len; ++ ++ if (!len) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_inline_data: ++ new_val_u64s = min(new_val_u64s, k.k->size << 6); ++ break; ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; ++} +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +new file mode 100644 +index 000000000000..29b15365d19c +--- /dev/null ++++ b/fs/bcachefs/extents.h +@@ -0,0 +1,603 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef 
_BCACHEFS_EXTENTS_H ++#define _BCACHEFS_EXTENTS_H ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "extents_types.h" ++ ++struct bch_fs; ++struct btree_trans; ++ ++/* extent entries: */ ++ ++#define extent_entry_last(_e) \ ++ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) ++ ++#define entry_to_ptr(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ ++ \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const struct bch_extent_ptr *) (_entry), \ ++ (struct bch_extent_ptr *) (_entry)); \ ++}) ++ ++/* downcast, preserves const */ ++#define to_entry(_entry) \ ++({ \ ++ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ ++ !type_is(_entry, struct bch_extent_ptr *) && \ ++ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ ++ \ ++ __builtin_choose_expr( \ ++ (type_is_exact(_entry, const union bch_extent_crc *) || \ ++ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ ++ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ ++ (const union bch_extent_entry *) (_entry), \ ++ (union bch_extent_entry *) (_entry)); \ ++}) ++ ++#define extent_entry_next(_entry) \ ++ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) ++ ++static inline unsigned ++__extent_entry_type(const union bch_extent_entry *e) ++{ ++ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; ++} ++ ++static inline enum bch_extent_entry_type ++extent_entry_type(const union bch_extent_entry *e) ++{ ++ int ret = __ffs(e->type); ++ ++ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); ++ ++ return ret; ++} ++ ++static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) ++{ ++ switch (extent_entry_type(entry)) { ++#define x(f, n) \ ++ case BCH_EXTENT_ENTRY_##f: \ ++ return sizeof(struct bch_extent_##f); ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) ++{ ++ return extent_entry_bytes(entry) / sizeof(u64); ++} ++ ++static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) ++{ ++ switch (extent_entry_type(e)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool extent_entry_is_crc(const union bch_extent_entry *e) ++{ ++ switch (extent_entry_type(e)) { ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++union bch_extent_crc { ++ u8 type; ++ struct bch_extent_crc32 crc32; ++ struct bch_extent_crc64 crc64; ++ struct bch_extent_crc128 crc128; ++}; ++ ++#define __entry_to_crc(_entry) \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const union bch_extent_crc *) (_entry), \ ++ (union bch_extent_crc *) (_entry)) ++ ++#define entry_to_crc(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ ++ \ ++ __entry_to_crc(_entry); \ ++}) ++ ++static inline struct bch_extent_crc_unpacked ++bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) ++{ ++#define common_fields(_crc) \ ++ .csum_type = _crc.csum_type, \ ++ .compression_type = _crc.compression_type, \ ++ .compressed_size = _crc._compressed_size + 1, \ ++ .uncompressed_size = _crc._uncompressed_size + 1, \ ++ .offset = _crc.offset, \ ++ .live_size = k->size ++ ++ if (!crc) ++ return (struct bch_extent_crc_unpacked) { ++ .compressed_size = k->size, ++ .uncompressed_size = k->size, ++ .live_size = k->size, ++ }; ++ 
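++ /*
++ * Unpack whichever packed checksum variant this entry uses
++ * (crc32/crc64/crc128) into the common in-memory form; the narrower
++ * variants copy their checksum into the low bits of ret.csum:
++ */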
++ switch (extent_entry_type(to_entry(crc))) { ++ case BCH_EXTENT_ENTRY_crc32: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc32), ++ }; ++ ++ *((__le32 *) &ret.csum.lo) = crc->crc32.csum; ++ ++ memcpy(&ret.csum.lo, &crc->crc32.csum, ++ sizeof(crc->crc32.csum)); ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc64: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc64), ++ .nonce = crc->crc64.nonce, ++ .csum.lo = (__force __le64) crc->crc64.csum_lo, ++ }; ++ ++ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc128: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc128), ++ .nonce = crc->crc128.nonce, ++ .csum = crc->crc128.csum, ++ }; ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++#undef common_fields ++} ++ ++static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) ++{ ++ return (crc.compression_type != BCH_COMPRESSION_TYPE_none && ++ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); ++} ++ ++/* bkey_ptrs: generically over any key type that has ptrs */ ++ ++struct bkey_ptrs_c { ++ const union bch_extent_entry *start; ++ const union bch_extent_entry *end; ++}; ++ ++struct bkey_ptrs { ++ union bch_extent_entry *start; ++ union bch_extent_entry *end; ++}; ++ ++static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: { ++ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ return (struct bkey_ptrs_c) { ++ e.v->start, ++ extent_entry_last(e) ++ }; ++ } ++ case KEY_TYPE_stripe: { ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&s.v->ptrs[0]), ++ to_entry(&s.v->ptrs[s.v->nr_blocks]), ++ }; ++ } ++ case KEY_TYPE_reflink_v: { ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ return (struct bkey_ptrs_c) { ++ r.v->start, ++ bkey_val_end(r), ++ }; ++ } ++ case KEY_TYPE_btree_ptr_v2: { ++ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ default: ++ return (struct bkey_ptrs_c) { NULL, NULL }; ++ } ++} ++ ++static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ++{ ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); ++ ++ return (struct bkey_ptrs) { ++ (void *) p.start, ++ (void *) p.end ++ }; ++} ++ ++#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ ++ for ((_entry) = (_start); \ ++ (_entry) < (_end); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define __bkey_ptr_next(_ptr, _end) \ ++({ \ ++ typeof(_end) _entry; \ ++ \ ++ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ ++ if (extent_entry_is_ptr(_entry)) \ ++ break; \ ++ \ ++ _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ ++}) ++ ++#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) ++ ++#define bkey_extent_entry_for_each(_p, _entry) \ ++ bkey_extent_entry_for_each_from(_p, _entry, _p.start) ++ ++#define __bkey_for_each_ptr(_start, _end, _ptr) \ ++ for ((_ptr) = (_start); \ ++ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ ++ (_ptr)++) ++ ++#define bkey_ptr_next(_p, _ptr) \ ++ __bkey_ptr_next(_ptr, (_p).end) ++ ++#define bkey_for_each_ptr(_p, _ptr) \ ++ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) ++ ++#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ ++({ \ ++ __label__ out; \ ++ \ ++ (_ptr).idx = 0; \ ++ (_ptr).has_ec = false; \ ++ \ ++ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ ++ switch (extent_entry_type(_entry)) { \ ++ case BCH_EXTENT_ENTRY_ptr: \ ++ (_ptr).ptr = _entry->ptr; \ ++ goto out; \ ++ case BCH_EXTENT_ENTRY_crc32: \ ++ case BCH_EXTENT_ENTRY_crc64: \ ++ case BCH_EXTENT_ENTRY_crc128: \ ++ (_ptr).crc = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_entry)); \ ++ break; \ ++ case BCH_EXTENT_ENTRY_stripe_ptr: \ ++ (_ptr).ec = _entry->stripe_ptr; \ ++ (_ptr).has_ec = true; \ ++ break; \ ++ } \ ++out: \ ++ _entry < (_end); \ ++}) ++ ++#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ ++ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ ++ (_entry) = _start; \ ++ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ ++ _ptr, _entry) ++ ++#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ ++({ \ ++ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ ++ if (extent_entry_is_crc(_iter)) { \ ++ (_crc) = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_iter)); \ ++ break; \ ++ } \ ++ \ ++ (_iter) < (_end); \ ++}) ++ ++#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ ++ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ ++ (_iter) = (_start); \ ++ bkey_crc_next(_k, _start, _end, _crc, _iter); \ ++ (_iter) = extent_entry_next(_iter)) ++ ++#define bkey_for_each_crc(_k, _p, _crc, _iter) \ ++ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) ++ ++/* Iterate over pointers in KEY_TYPE_extent: */ ++ ++#define extent_for_each_entry_from(_e, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, \ ++ extent_entry_last(_e),_entry) ++ ++#define extent_for_each_entry(_e, _entry) \ ++ extent_for_each_entry_from(_e, _entry, (_e).v->start) ++ ++#define extent_ptr_next(_e, _ptr) \ ++ __bkey_ptr_next(_ptr, extent_entry_last(_e)) ++ ++#define extent_for_each_ptr(_e, _ptr) \ ++ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) ++ ++#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ ++ extent_entry_last(_e), _ptr, _entry) ++ ++/* utility code common to all keys with pointers: */ ++ ++void bch2_mark_io_failure(struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++ ++/* KEY_TYPE_btree_ptr: */ ++ ++const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); ++void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++void bch2_btree_ptr_v2_to_text(struct printbuf *, 
struct bch_fs *, ++ struct bkey_s_c); ++void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, ++ int, struct bkey_s); ++ ++#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_invalid, \ ++ .key_debugcheck = bch2_btree_ptr_debugcheck, \ ++ .val_to_text = bch2_btree_ptr_to_text, \ ++ .swab = bch2_ptr_swab, \ ++} ++ ++#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_invalid, \ ++ .key_debugcheck = bch2_btree_ptr_debugcheck, \ ++ .val_to_text = bch2_btree_ptr_v2_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .compat = bch2_btree_ptr_v2_compat, \ ++} ++ ++/* KEY_TYPE_extent: */ ++ ++const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); ++void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++enum merge_result bch2_extent_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_extent (struct bkey_ops) { \ ++ .key_invalid = bch2_extent_invalid, \ ++ .key_debugcheck = bch2_extent_debugcheck, \ ++ .val_to_text = bch2_extent_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .key_normalize = bch2_extent_normalize, \ ++ .key_merge = bch2_extent_merge, \ ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++enum merge_result bch2_reservation_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_reservation (struct bkey_ops) { \ ++ .key_invalid = bch2_reservation_invalid, \ ++ .val_to_text = bch2_reservation_to_text, \ ++ .key_merge = bch2_reservation_merge, \ ++} ++ ++/* Extent checksum entries: */ ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c, ++ struct bch_extent_crc_unpacked); ++bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); ++void bch2_extent_crc_append(struct bkey_i *, ++ struct bch_extent_crc_unpacked); ++ ++/* Generic code for keys with pointers: */ ++ ++static inline bool bkey_extent_is_direct_data(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool bkey_extent_is_data(const struct bkey *k) ++{ ++ return bkey_extent_is_direct_data(k) || ++ k->type == KEY_TYPE_inline_data || ++ k->type == KEY_TYPE_reflink_p; ++} ++ ++/* ++ * Should extent be counted under inode->i_sectors? 
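++ *
++ * Reservations, reflink pointers/values and inline data all consume space
++ * in the file, so they are counted alongside ordinary extents: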
++ */ ++static inline bool bkey_extent_is_allocation(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reservation: ++ case KEY_TYPE_reflink_p: ++ case KEY_TYPE_reflink_v: ++ case KEY_TYPE_inline_data: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (!ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); ++bool bch2_bkey_is_incompressible(struct bkey_s_c); ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); ++bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); ++unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); ++ ++void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, ++ unsigned, unsigned); ++ ++void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); ++void bch2_extent_ptr_decoded_append(struct bkey_i *, ++ struct extent_ptr_decoded *); ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, ++ struct bch_extent_ptr *); ++ ++#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ ++do { \ ++ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ ++ \ ++ _ptr = &_ptrs.start->ptr; \ ++ \ ++ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ ++ if (_cond) { \ ++ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ ++ _ptrs = bch2_bkey_ptrs(_k); \ ++ continue; \ ++ } \ ++ \ ++ (_ptr)++; \ ++ } \ ++} while (0) ++ ++void bch2_bkey_drop_device(struct bkey_s, unsigned); ++const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); ++bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, ++ struct bch_extent_ptr, u64); ++ ++bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); ++void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); ++ ++void bch2_ptr_swab(struct bkey_s); ++ ++/* Generic extent code: */ ++ ++int bch2_cut_front_s(struct bpos, struct bkey_s); ++int bch2_cut_back_s(struct bpos, struct bkey_s); ++ ++static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_front_s(where, bkey_i_to_s(k)); ++} ++ ++static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_back_s(where, bkey_i_to_s(k)); ++} ++ ++/** ++ * bch_key_resize - adjust size of @k ++ * ++ * bkey_start_offset(k) will be preserved, modifies where the extent ends ++ */ ++static inline void 
bch2_key_resize(struct bkey *k, unsigned new_size) ++{ ++ k->p.offset -= k->size; ++ k->p.offset += new_size; ++ k->size = new_size; ++} ++ ++/* ++ * In extent_sort_fix_overlapping(), insert_fixup_extent(), ++ * extent_merge_inline() - we're modifying keys in place that are packed. To do ++ * that we have to unpack the key, modify the unpacked key - then this ++ * copies/repacks the unpacked to the original as necessary. ++ */ ++static inline void extent_save(struct btree *b, struct bkey_packed *dst, ++ struct bkey *src) ++{ ++ struct bkey_format *f = &b->format; ++ struct bkey_i *dst_unpacked; ++ ++ if ((dst_unpacked = packed_to_bkey(dst))) ++ dst_unpacked->k = *src; ++ else ++ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); ++} ++ ++#endif /* _BCACHEFS_EXTENTS_H */ +diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h +new file mode 100644 +index 000000000000..43d6c341ecca +--- /dev/null ++++ b/fs/bcachefs/extents_types.h +@@ -0,0 +1,40 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENTS_TYPES_H ++#define _BCACHEFS_EXTENTS_TYPES_H ++ ++#include "bcachefs_format.h" ++ ++struct bch_extent_crc_unpacked { ++ u32 compressed_size; ++ u32 uncompressed_size; ++ u32 live_size; ++ ++ u8 csum_type; ++ u8 compression_type; ++ ++ u16 offset; ++ ++ u16 nonce; ++ ++ struct bch_csum csum; ++}; ++ ++struct extent_ptr_decoded { ++ unsigned idx; ++ bool has_ec; ++ struct bch_extent_crc_unpacked crc; ++ struct bch_extent_ptr ptr; ++ struct bch_extent_stripe_ptr ec; ++}; ++ ++struct bch_io_failures { ++ u8 nr; ++ struct bch_dev_io_failures { ++ u8 dev; ++ u8 idx; ++ u8 nr_failed; ++ u8 nr_retries; ++ } devs[BCH_REPLICAS_MAX]; ++}; ++ ++#endif /* _BCACHEFS_EXTENTS_TYPES_H */ +diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h +new file mode 100644 +index 000000000000..26d5cad7e6a5 +--- /dev/null ++++ b/fs/bcachefs/eytzinger.h +@@ -0,0 +1,285 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _EYTZINGER_H ++#define _EYTZINGER_H ++ ++#include ++#include ++ ++#include "util.h" ++ ++/* ++ * Traversal for trees in eytzinger layout - a full binary tree layed out in an ++ * array ++ */ ++ ++/* ++ * One based indexing version: ++ * ++ * With one based indexing each level of the tree starts at a power of two - ++ * good for cacheline alignment: ++ * ++ * Size parameter is treated as if we were using 0 based indexing, however: ++ * valid nodes, and inorder indices, are in the range [1..size) - that is, there ++ * are actually size - 1 elements ++ */ ++ ++static inline unsigned eytzinger1_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + child; ++} ++ ++static inline unsigned eytzinger1_left_child(unsigned i) ++{ ++ return eytzinger1_child(i, 0); ++} ++ ++static inline unsigned eytzinger1_right_child(unsigned i) ++{ ++ return eytzinger1_child(i, 1); ++} ++ ++static inline unsigned eytzinger1_first(unsigned size) ++{ ++ return rounddown_pow_of_two(size - 1); ++} ++ ++static inline unsigned eytzinger1_last(unsigned size) ++{ ++ return rounddown_pow_of_two(size) - 1; ++} ++ ++/* ++ * eytzinger1_next() and eytzinger1_prev() have the nice properties that ++ * ++ * eytzinger1_next(0) == eytzinger1_first()) ++ * eytzinger1_prev(0) == eytzinger1_last()) ++ * ++ * eytzinger1_prev(eytzinger1_first()) == 0 ++ * eytzinger1_next(eytzinger1_last()) == 0 ++ */ ++ ++static inline unsigned eytzinger1_next(unsigned i, unsigned size) ++{ ++ EBUG_ON(i >= size); ++ ++ if (eytzinger1_right_child(i) < size) { ++ i = eytzinger1_right_child(i); ++ ++ i <<= 
__fls(size) - __fls(i); ++ i >>= i >= size; ++ } else { ++ i >>= ffz(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_prev(unsigned i, unsigned size) ++{ ++ EBUG_ON(i >= size); ++ ++ if (eytzinger1_left_child(i) < size) { ++ i = eytzinger1_left_child(i) + 1; ++ ++ i <<= __fls(size) - __fls(i); ++ i -= 1; ++ i >>= i >= size; ++ } else { ++ i >>= __ffs(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_extra(unsigned size) ++{ ++ return (size - rounddown_pow_of_two(size - 1)) << 1; ++} ++ ++static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ unsigned b = __fls(i); ++ unsigned shift = __fls(size - 1) - b; ++ int s; ++ ++ EBUG_ON(!i || i >= size); ++ ++ i ^= 1U << b; ++ i <<= 1; ++ i |= 1; ++ i <<= shift; ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i -= (i - extra) >> 1; ++ */ ++ s = extra - i; ++ i += (s >> 1) & (s >> 31); ++ ++ return i; ++} ++ ++static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ unsigned shift; ++ int s; ++ ++ EBUG_ON(!i || i >= size); ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i += i - extra; ++ */ ++ s = extra - i; ++ i -= s & (s >> 31); ++ ++ shift = __ffs(i); ++ ++ i >>= shift + 1; ++ i |= 1U << (__fls(size - 1) - shift); ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); ++} ++ ++#define eytzinger1_for_each(_i, _size) \ ++ for ((_i) = eytzinger1_first((_size)); \ ++ (_i) != 0; \ ++ (_i) = eytzinger1_next((_i), (_size))) ++ ++/* Zero based indexing version: */ ++ ++static inline unsigned eytzinger0_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + 1 + child; ++} ++ ++static inline unsigned eytzinger0_left_child(unsigned i) ++{ ++ return eytzinger0_child(i, 0); ++} ++ ++static inline unsigned eytzinger0_right_child(unsigned i) ++{ ++ return eytzinger0_child(i, 1); ++} ++ ++static inline unsigned eytzinger0_first(unsigned size) ++{ ++ return eytzinger1_first(size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_last(unsigned size) ++{ ++ return eytzinger1_last(size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_next(unsigned i, unsigned size) ++{ ++ return eytzinger1_next(i + 1, size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_prev(unsigned i, unsigned size) ++{ ++ return eytzinger1_prev(i + 1, size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_extra(unsigned size) ++{ ++ return eytzinger1_extra(size + 1); ++} ++ ++static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; ++} ++ ++static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; ++} ++ ++static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); ++} ++ ++#define eytzinger0_for_each(_i, _size) \ ++ for ((_i) = eytzinger0_first((_size)); \ ++ (_i) != -1; \ ++ (_i) = 
eytzinger0_next((_i), (_size))) ++ ++typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); ++ ++/* return greatest node <= @search, or -1 if not found */ ++static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, ++ eytzinger_cmp_fn cmp, const void *search) ++{ ++ unsigned i, n = 0; ++ ++ if (!nr) ++ return -1; ++ ++ do { ++ i = n; ++ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); ++ } while (n < nr); ++ ++ if (n & 1) { ++ /* @i was greater than @search, return previous node: */ ++ ++ if (i == eytzinger0_first(nr)) ++ return -1; ++ ++ return eytzinger0_prev(i, nr); ++ } else { ++ return i; ++ } ++} ++ ++#define eytzinger0_find(base, nr, size, _cmp, search) \ ++({ \ ++ void *_base = (base); \ ++ void *_search = (search); \ ++ size_t _nr = (nr); \ ++ size_t _size = (size); \ ++ size_t _i = 0; \ ++ int _res; \ ++ \ ++ while (_i < _nr && \ ++ (_res = _cmp(_search, _base + _i * _size, _size))) \ ++ _i = eytzinger0_child(_i, _res > 0); \ ++ _i; \ ++}) ++ ++void eytzinger0_sort(void *, size_t, size_t, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++#endif /* _EYTZINGER_H */ +diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h +new file mode 100644 +index 000000000000..cdb272708a4b +--- /dev/null ++++ b/fs/bcachefs/fifo.h +@@ -0,0 +1,127 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FIFO_H ++#define _BCACHEFS_FIFO_H ++ ++#include "util.h" ++ ++#define FIFO(type) \ ++struct { \ ++ size_t front, back, size, mask; \ ++ type *data; \ ++} ++ ++#define DECLARE_FIFO(type, name) FIFO(type) name ++ ++#define fifo_buf_size(fifo) \ ++ ((fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ ++ : 0) ++ ++#define init_fifo(fifo, _size, _gfp) \ ++({ \ ++ (fifo)->front = (fifo)->back = 0; \ ++ (fifo)->size = (_size); \ ++ (fifo)->mask = (fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) - 1 \ ++ : 0; \ ++ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ ++}) ++ ++#define free_fifo(fifo) \ ++do { \ ++ kvpfree((fifo)->data, fifo_buf_size(fifo)); \ ++ (fifo)->data = NULL; \ ++} while (0) ++ ++#define fifo_swap(l, r) \ ++do { \ ++ swap((l)->front, (r)->front); \ ++ swap((l)->back, (r)->back); \ ++ swap((l)->size, (r)->size); \ ++ swap((l)->mask, (r)->mask); \ ++ swap((l)->data, (r)->data); \ ++} while (0) ++ ++#define fifo_move(dest, src) \ ++do { \ ++ typeof(*((dest)->data)) _t; \ ++ while (!fifo_full(dest) && \ ++ fifo_pop(src, _t)) \ ++ fifo_push(dest, _t); \ ++} while (0) ++ ++#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) ++#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) ++ ++#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) ++#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) ++ ++#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) ++#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) ++ ++#define fifo_entry_idx_abs(fifo, p) \ ++ ((((p) >= &fifo_peek_front(fifo) \ ++ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ ++ (((p) - (fifo)->data))) ++ ++#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) ++#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] ++ ++#define fifo_push_back_ref(f) \ ++ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) ++ ++#define fifo_push_front_ref(f) \ ++ (fifo_full((f)) ? 
NULL : &(f)->data[--(f)->front & (f)->mask]) ++ ++#define fifo_push_back(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_push_front(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_pop_front(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_pop_back(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) ++#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) ++#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) ++#define fifo_peek(fifo) fifo_peek_front(fifo) ++ ++#define fifo_for_each_entry(_entry, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#endif /* _BCACHEFS_FIFO_H */ +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +new file mode 100644 +index 000000000000..878419d40992 +--- /dev/null ++++ b/fs/bcachefs/fs-common.c +@@ -0,0 +1,317 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "fs-common.h" ++#include "inode.h" ++#include "xattr.h" ++ ++#include ++ ++int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, ++ struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *new_inode, ++ const struct qstr *name, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct posix_acl *default_acl, ++ struct posix_acl *acl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *dir_iter = NULL; ++ struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); ++ u64 now = bch2_current_time(trans->c); ++ int ret; ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); ++ ++ if (!name) ++ new_inode->bi_flags |= BCH_INODE_UNLINKED; ++ ++ ret = bch2_inode_create(trans, new_inode, ++ BLOCKDEV_INODE_MAX, 0, ++ &c->unused_inode_hint); ++ if (ret) ++ goto err; ++ ++ if (default_acl) { ++ ret = bch2_set_acl_trans(trans, new_inode, &hash, ++ default_acl, ACL_TYPE_DEFAULT); ++ if (ret) ++ goto err; ++ } ++ ++ if (acl) { ++ ret = bch2_set_acl_trans(trans, new_inode, &hash, ++ acl, ACL_TYPE_ACCESS); ++ if (ret) ++ goto err; ++ } ++ ++ if (name) { ++ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ if (S_ISDIR(new_inode->bi_mode)) ++ dir_u->bi_nlink++; ++ ++ ret = bch2_inode_write(trans, dir_iter, dir_u); ++ if (ret) ++ goto err; ++ ++ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, ++ mode_to_type(new_inode->bi_mode), ++ name, new_inode->bi_inum, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ goto err; ++ } ++err: ++ bch2_trans_iter_put(trans, dir_iter); ++ return ret; ++} ++ ++int bch2_link_trans(struct 
btree_trans *trans, u64 dir_inum, ++ u64 inum, struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *inode_u, const struct qstr *name) ++{ ++ struct btree_iter *dir_iter = NULL, *inode_iter = NULL; ++ struct bch_hash_info dir_hash; ++ u64 now = bch2_current_time(trans->c); ++ int ret; ++ ++ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto err; ++ ++ inode_u->bi_ctime = now; ++ bch2_inode_nlink_inc(inode_u); ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ dir_hash = bch2_hash_info_init(trans->c, dir_u); ++ ++ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, ++ mode_to_type(inode_u->bi_mode), ++ name, inum, BCH_HASH_SET_MUST_CREATE) ?: ++ bch2_inode_write(trans, dir_iter, dir_u) ?: ++ bch2_inode_write(trans, inode_iter, inode_u); ++err: ++ bch2_trans_iter_put(trans, dir_iter); ++ bch2_trans_iter_put(trans, inode_iter); ++ return ret; ++} ++ ++int bch2_unlink_trans(struct btree_trans *trans, ++ u64 dir_inum, struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *inode_u, ++ const struct qstr *name) ++{ ++ struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, ++ *inode_iter = NULL; ++ struct bch_hash_info dir_hash; ++ u64 inum, now = bch2_current_time(trans->c); ++ struct bkey_s_c k; ++ int ret; ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; ++ ++ dir_hash = bch2_hash_info_init(trans->c, dir_u); ++ ++ dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, ++ name, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dirent_iter); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek_slot(dirent_iter); ++ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); ++ ++ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto err; ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; ++ dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); ++ bch2_inode_nlink_dec(inode_u); ++ ++ ret = (S_ISDIR(inode_u->bi_mode) ++ ? 
bch2_empty_dir_trans(trans, inum) ++ : 0) ?: ++ bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: ++ bch2_inode_write(trans, dir_iter, dir_u) ?: ++ bch2_inode_write(trans, inode_iter, inode_u); ++err: ++ bch2_trans_iter_put(trans, inode_iter); ++ bch2_trans_iter_put(trans, dirent_iter); ++ bch2_trans_iter_put(trans, dir_iter); ++ return ret; ++} ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, ++ struct bch_inode_unpacked *src_u) ++{ ++ u64 src, dst; ++ unsigned id; ++ bool ret = false; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ if (dst_u->bi_fields_set & (1 << id)) ++ continue; ++ ++ src = bch2_inode_opt_get(src_u, id); ++ dst = bch2_inode_opt_get(dst_u, id); ++ ++ if (src == dst) ++ continue; ++ ++ bch2_inode_opt_set(dst_u, id, src); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++int bch2_rename_trans(struct btree_trans *trans, ++ u64 src_dir, struct bch_inode_unpacked *src_dir_u, ++ u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, ++ struct bch_inode_unpacked *src_inode_u, ++ struct bch_inode_unpacked *dst_inode_u, ++ const struct qstr *src_name, ++ const struct qstr *dst_name, ++ enum bch_rename_mode mode) ++{ ++ struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; ++ struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; ++ struct bch_hash_info src_hash, dst_hash; ++ u64 src_inode, dst_inode, now = bch2_current_time(trans->c); ++ int ret; ++ ++ src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(src_dir_iter); ++ if (ret) ++ goto err; ++ ++ src_hash = bch2_hash_info_init(trans->c, src_dir_u); ++ ++ if (dst_dir != src_dir) { ++ dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dst_dir_iter); ++ if (ret) ++ goto err; ++ ++ dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); ++ } else { ++ dst_dir_u = src_dir_u; ++ dst_hash = src_hash; ++ } ++ ++ ret = bch2_dirent_rename(trans, ++ src_dir, &src_hash, ++ dst_dir, &dst_hash, ++ src_name, &src_inode, ++ dst_name, &dst_inode, ++ mode); ++ if (ret) ++ goto err; ++ ++ src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(src_inode_iter); ++ if (ret) ++ goto err; ++ ++ if (dst_inode) { ++ dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dst_inode_iter); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ if (S_ISDIR(src_inode_u->bi_mode) != ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -ENOTDIR; ++ goto err; ++ } ++ ++ if (S_ISDIR(dst_inode_u->bi_mode) && ++ bch2_empty_dir_trans(trans, dst_inode)) { ++ ret = -ENOTEMPTY; ++ goto err; ++ } ++ } ++ ++ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && ++ S_ISDIR(src_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ ++ if (S_ISDIR(src_inode_u->bi_mode)) { ++ src_dir_u->bi_nlink--; ++ dst_dir_u->bi_nlink++; ++ } ++ ++ if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { ++ dst_dir_u->bi_nlink--; ++ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) ++ bch2_inode_nlink_dec(dst_inode_u); ++ ++ src_dir_u->bi_mtime = now; ++ src_dir_u->bi_ctime = now; ++ ++ if (src_dir != dst_dir) { ++ dst_dir_u->bi_mtime = now; ++ dst_dir_u->bi_ctime = now; ++ } ++ ++ src_inode_u->bi_ctime = now; ++ ++ if 
(dst_inode) ++ dst_inode_u->bi_ctime = now; ++ ++ ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: ++ (src_dir != dst_dir ++ ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) ++ : 0 ) ?: ++ bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: ++ (dst_inode ++ ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) ++ : 0 ); ++err: ++ bch2_trans_iter_put(trans, dst_inode_iter); ++ bch2_trans_iter_put(trans, src_inode_iter); ++ bch2_trans_iter_put(trans, dst_dir_iter); ++ bch2_trans_iter_put(trans, src_dir_iter); ++ return ret; ++} +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +new file mode 100644 +index 000000000000..2273b7961c9b +--- /dev/null ++++ b/fs/bcachefs/fs-common.h +@@ -0,0 +1,37 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_COMMON_H ++#define _BCACHEFS_FS_COMMON_H ++ ++struct posix_acl; ++ ++int bch2_create_trans(struct btree_trans *, u64, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct posix_acl *, ++ struct posix_acl *); ++ ++int bch2_link_trans(struct btree_trans *, u64, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *); ++ ++int bch2_unlink_trans(struct btree_trans *, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *); ++ ++int bch2_rename_trans(struct btree_trans *, ++ u64, struct bch_inode_unpacked *, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ const struct qstr *, ++ enum bch_rename_mode); ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *); ++ ++#endif /* _BCACHEFS_FS_COMMON_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +new file mode 100644 +index 000000000000..55004998536d +--- /dev/null ++++ b/fs/bcachefs/fs-io.c +@@ -0,0 +1,3133 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "error.h" ++#include "extents.h" ++#include "extent_update.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "journal.h" ++#include "io.h" ++#include "keylist.h" ++#include "quota.h" ++#include "reflink.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++struct quota_res { ++ u64 sectors; ++}; ++ ++struct bch_writepage_io { ++ struct closure cl; ++ struct bch_inode_info *inode; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_write { ++ struct completion done; ++ struct kiocb *req; ++ struct mm_struct *mm; ++ unsigned loop:1, ++ sync:1, ++ free_iov:1; ++ struct quota_res quota_res; ++ u64 written; ++ ++ struct iov_iter iter; ++ struct iovec inline_vecs[2]; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_read { ++ struct closure cl; ++ struct kiocb *req; ++ long ret; ++ struct bch_read_bio rbio; ++}; ++ ++/* pagecache_block must be held */ ++static int write_invalidate_inode_pages_range(struct address_space *mapping, ++ loff_t start, loff_t end) ++{ ++ int ret; ++ ++ /* ++ * XXX: the way this is currently implemented, we can spin if a process ++ * is continually redirtying a specific page ++ */ ++ do { ++ if (!mapping->nrpages && ++ !mapping->nrexceptional) ++ return 0; ++ 
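++ /*
++ * Write the range back first, then try to drop it from the page
++ * cache; -EBUSY from invalidation means a page was redirtied or is
++ * still in use underneath us, so go around again:
++ */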
++ ret = filemap_write_and_wait_range(mapping, start, end); ++ if (ret) ++ break; ++ ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = invalidate_inode_pages2_range(mapping, ++ start >> PAGE_SHIFT, ++ end >> PAGE_SHIFT); ++ } while (ret == -EBUSY); ++ ++ return ret; ++} ++ ++/* quotas */ ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++ if (!res->sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ BUG_ON(res->sectors > inode->ei_quota_reserved); ++ ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, ++ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); ++ inode->ei_quota_reserved -= res->sectors; ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ res->sectors = 0; ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ int ret; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, ++ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); ++ if (likely(!ret)) { ++ inode->ei_quota_reserved += sectors; ++ res->sectors += sectors; ++ } ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++#else ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ return 0; ++} ++ ++#endif ++ ++/* i_size updates: */ ++ ++struct inode_new_size { ++ loff_t new_size; ++ u64 now; ++ unsigned fields; ++}; ++ ++static int inode_set_size(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_new_size *s = p; ++ ++ bi->bi_size = s->new_size; ++ if (s->fields & ATTR_ATIME) ++ bi->bi_atime = s->now; ++ if (s->fields & ATTR_MTIME) ++ bi->bi_mtime = s->now; ++ if (s->fields & ATTR_CTIME) ++ bi->bi_ctime = s->now; ++ ++ return 0; ++} ++ ++int __must_check bch2_write_inode_size(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ loff_t new_size, unsigned fields) ++{ ++ struct inode_new_size s = { ++ .new_size = new_size, ++ .now = bch2_current_time(c), ++ .fields = fields, ++ }; ++ ++ return bch2_write_inode(c, inode, inode_set_size, &s, fields); ++} ++ ++static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, ++ struct quota_res *quota_res, s64 sectors) ++{ ++ if (!sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++#ifdef CONFIG_BCACHEFS_QUOTA ++ if (quota_res && sectors > 0) { ++ BUG_ON(sectors > quota_res->sectors); ++ BUG_ON(sectors > inode->ei_quota_reserved); ++ ++ quota_res->sectors -= sectors; ++ inode->ei_quota_reserved -= sectors; ++ } else { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); ++ } ++#endif ++ inode->v.i_blocks += sectors; ++ mutex_unlock(&inode->ei_quota_lock); ++} ++ ++/* page state: */ ++ ++/* stored in page->private: */ ++ ++struct bch_page_sector { ++ /* Uncompressed, fully allocated replicas: */ ++ unsigned nr_replicas:3; ++ ++ /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ ++ unsigned replicas_reserved:3; ++ ++ /* i_sectors: */ ++ enum { ++ SECTOR_UNALLOCATED, ++ SECTOR_RESERVED, ++ SECTOR_DIRTY, ++ SECTOR_ALLOCATED, ++ } state:2; ++}; ++ ++struct bch_page_state { ++ spinlock_t lock; ++ atomic_t write_count; ++ struct bch_page_sector 
s[PAGE_SECTORS]; ++}; ++ ++static inline struct bch_page_state *__bch2_page_state(struct page *page) ++{ ++ return page_has_private(page) ++ ? (struct bch_page_state *) page_private(page) ++ : NULL; ++} ++ ++static inline struct bch_page_state *bch2_page_state(struct page *page) ++{ ++ EBUG_ON(!PageLocked(page)); ++ ++ return __bch2_page_state(page); ++} ++ ++/* for newly allocated pages: */ ++static void __bch2_page_state_release(struct page *page) ++{ ++ struct bch_page_state *s = __bch2_page_state(page); ++ ++ if (!s) ++ return; ++ ++ ClearPagePrivate(page); ++ set_page_private(page, 0); ++ put_page(page); ++ kfree(s); ++} ++ ++static void bch2_page_state_release(struct page *page) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ ++ if (!s) ++ return; ++ ++ ClearPagePrivate(page); ++ set_page_private(page, 0); ++ put_page(page); ++ kfree(s); ++} ++ ++/* for newly allocated pages: */ ++static struct bch_page_state *__bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ struct bch_page_state *s; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS|gfp); ++ if (!s) ++ return NULL; ++ ++ spin_lock_init(&s->lock); ++ /* ++ * migrate_page_move_mapping() assumes that pages with private data ++ * have their count elevated by 1. ++ */ ++ get_page(page); ++ set_page_private(page, (unsigned long) s); ++ SetPagePrivate(page); ++ return s; ++} ++ ++static struct bch_page_state *bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); ++} ++ ++static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) ++{ ++ /* XXX: this should not be open coded */ ++ return inode->ei_inode.bi_data_replicas ++ ? inode->ei_inode.bi_data_replicas - 1 ++ : c->opts.data_replicas; ++} ++ ++static inline unsigned sectors_to_reserve(struct bch_page_sector *s, ++ unsigned nr_replicas) ++{ ++ return max(0, (int) nr_replicas - ++ s->nr_replicas - ++ s->replicas_reserved); ++} ++ ++static int bch2_get_page_disk_reservation(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct page *page, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned nr_replicas = inode_nr_replicas(c, inode); ++ struct disk_reservation disk_res = { 0 }; ++ unsigned i, disk_res_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ if (!disk_res_sectors) ++ return 0; ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ disk_res_sectors, 1, ++ !check_enospc ++ ? 
BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ s->s[i].replicas_reserved += ++ sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ return 0; ++} ++ ++struct bch2_page_reservation { ++ struct disk_reservation disk; ++ struct quota_res quota; ++}; ++ ++static void bch2_page_reservation_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ memset(res, 0, sizeof(*res)); ++ ++ res->disk.nr_replicas = inode_nr_replicas(c, inode); ++} ++ ++static void bch2_page_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ bch2_disk_reservation_put(c, &res->disk); ++ bch2_quota_reservation_put(c, inode, &res->quota); ++} ++ ++static int bch2_page_reservation_get(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned i, disk_sectors = 0, quota_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ disk_sectors += sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; ++ } ++ ++ if (disk_sectors) { ++ ret = bch2_disk_reservation_add(c, &res->disk, ++ disk_sectors, ++ !check_enospc ++ ? BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ if (quota_sectors) { ++ ret = bch2_quota_reservation_add(c, inode, &res->quota, ++ quota_sectors, ++ check_enospc); ++ if (unlikely(ret)) { ++ struct disk_reservation tmp = { ++ .sectors = disk_sectors ++ }; ++ ++ bch2_disk_reservation_put(c, &tmp); ++ res->disk.sectors -= disk_sectors; ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_clear_page_bits(struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_page_state *s = bch2_page_state(page); ++ struct disk_reservation disk_res = { 0 }; ++ int i, dirty_sectors = 0; ++ ++ if (!s) ++ return; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(PageWriteback(page)); ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) { ++ disk_res.sectors += s->s[i].replicas_reserved; ++ s->s[i].replicas_reserved = 0; ++ ++ if (s->s[i].state == SECTOR_DIRTY) { ++ dirty_sectors++; ++ s->s[i].state = SECTOR_UNALLOCATED; ++ } ++ } ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (dirty_sectors) ++ i_sectors_acct(c, inode, NULL, -dirty_sectors); ++ ++ bch2_page_state_release(page); ++} ++ ++static void bch2_set_page_dirty(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i, dirty_sectors = 0; ++ ++ WARN_ON((u64) page_offset(page) + offset + len > ++ round_up((u64) i_size_read(&inode->v), block_bytes(c))); ++ ++ spin_lock(&s->lock); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ unsigned sectors = sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ ++ /* ++ * This can happen if we race with the error path in ++ * bch2_writepage_io_done(): ++ */ ++ sectors = min_t(unsigned, sectors, res->disk.sectors); ++ ++ s->s[i].replicas_reserved += 
sectors; ++ res->disk.sectors -= sectors; ++ ++ if (s->s[i].state == SECTOR_UNALLOCATED) ++ dirty_sectors++; ++ ++ s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); ++ } ++ ++ spin_unlock(&s->lock); ++ ++ if (dirty_sectors) ++ i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ ++ if (!PageDirty(page)) ++ __set_page_dirty_nobuffers(page); ++} ++ ++vm_fault_t bch2_page_fault(struct vm_fault *vmf) ++{ ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ int ret; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ret = filemap_fault(vmf); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return ret; ++} ++ ++vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) ++{ ++ struct page *page = vmf->page; ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation res; ++ unsigned len; ++ loff_t isize; ++ int ret = VM_FAULT_LOCKED; ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ sb_start_pagefault(inode->v.i_sb); ++ file_update_time(file); ++ ++ /* ++ * Not strictly necessary, but helps avoid dio writes livelocking in ++ * write_invalidate_inode_pages_range() - can drop this if/when we get ++ * a write_invalidate_inode_pages_range() that works without dropping ++ * page lock before invalidating page ++ */ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ lock_page(page); ++ isize = i_size_read(&inode->v); ++ ++ if (page->mapping != mapping || page_offset(page) >= isize) { ++ unlock_page(page); ++ ret = VM_FAULT_NOPAGE; ++ goto out; ++ } ++ ++ len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); ++ ++ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { ++ unlock_page(page); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ ++ bch2_set_page_dirty(c, inode, page, &res, 0, len); ++ bch2_page_reservation_put(c, inode, &res); ++ ++ wait_for_stable_page(page); ++out: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ sb_end_pagefault(inode->v.i_sb); ++ ++ return ret; ++} ++ ++void bch2_invalidatepage(struct page *page, unsigned int offset, ++ unsigned int length) ++{ ++ if (offset || length < PAGE_SIZE) ++ return; ++ ++ bch2_clear_page_bits(page); ++} ++ ++int bch2_releasepage(struct page *page, gfp_t gfp_mask) ++{ ++ if (PageDirty(page)) ++ return 0; ++ ++ bch2_clear_page_bits(page); ++ return 1; ++} ++ ++#ifdef CONFIG_MIGRATION ++int bch2_migrate_page(struct address_space *mapping, struct page *newpage, ++ struct page *page, enum migrate_mode mode) ++{ ++ int ret; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(!PageLocked(newpage)); ++ ++ ret = migrate_page_move_mapping(mapping, newpage, page, 0); ++ if (ret != MIGRATEPAGE_SUCCESS) ++ return ret; ++ ++ if (PagePrivate(page)) { ++ ClearPagePrivate(page); ++ get_page(newpage); ++ set_page_private(newpage, page_private(page)); ++ set_page_private(page, 0); ++ put_page(page); ++ SetPagePrivate(newpage); ++ } ++ ++ if (mode != MIGRATE_SYNC_NO_COPY) ++ migrate_page_copy(newpage, page); ++ else ++ migrate_page_states(newpage, page); ++ return MIGRATEPAGE_SUCCESS; ++} ++#endif ++ ++/* readpage(s): */ ++ ++static void bch2_readpages_end_io(struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) { ++ struct page *page = bv->bv_page; ++ ++ if (!bio->bi_status) { ++ SetPageUptodate(page); ++ } else { ++ 
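++ /* Read error: clear uptodate and flag the error on this page: */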
ClearPageUptodate(page); ++ SetPageError(page); ++ } ++ unlock_page(page); ++ } ++ ++ bio_put(bio); ++} ++ ++static inline void page_state_init_for_read(struct page *page) ++{ ++ SetPagePrivate(page); ++ page->private = 0; ++} ++ ++struct readpages_iter { ++ struct address_space *mapping; ++ struct page **pages; ++ unsigned nr_pages; ++ unsigned nr_added; ++ unsigned idx; ++ pgoff_t offset; ++}; ++ ++static int readpages_iter_init(struct readpages_iter *iter, ++ struct address_space *mapping, ++ struct list_head *pages, unsigned nr_pages) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->mapping = mapping; ++ iter->offset = list_last_entry(pages, struct page, lru)->index; ++ ++ iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); ++ if (!iter->pages) ++ return -ENOMEM; ++ ++ while (!list_empty(pages)) { ++ struct page *page = list_last_entry(pages, struct page, lru); ++ ++ __bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ iter->pages[iter->nr_pages++] = page; ++ list_del(&page->lru); ++ } ++ ++ return 0; ++} ++ ++static inline struct page *readpage_iter_next(struct readpages_iter *iter) ++{ ++ struct page *page; ++ unsigned i; ++ int ret; ++ ++ BUG_ON(iter->idx > iter->nr_added); ++ BUG_ON(iter->nr_added > iter->nr_pages); ++ ++ if (iter->idx < iter->nr_added) ++ goto out; ++ ++ while (1) { ++ if (iter->idx == iter->nr_pages) ++ return NULL; ++ ++ ret = add_to_page_cache_lru_vec(iter->mapping, ++ iter->pages + iter->nr_added, ++ iter->nr_pages - iter->nr_added, ++ iter->offset + iter->nr_added, ++ GFP_NOFS); ++ if (ret > 0) ++ break; ++ ++ page = iter->pages[iter->nr_added]; ++ iter->idx++; ++ iter->nr_added++; ++ ++ __bch2_page_state_release(page); ++ put_page(page); ++ } ++ ++ iter->nr_added += ret; ++ ++ for (i = iter->idx; i < iter->nr_added; i++) ++ put_page(iter->pages[i]); ++out: ++ EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); ++ ++ return iter->pages[iter->idx]; ++} ++ ++static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) ++{ ++ struct bvec_iter iter; ++ struct bio_vec bv; ++ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ++ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = k.k->type == KEY_TYPE_reservation ++ ? 
SECTOR_RESERVED ++ : SECTOR_ALLOCATED; ++ ++ bio_for_each_segment(bv, bio, iter) { ++ struct bch_page_state *s = bch2_page_state(bv.bv_page); ++ unsigned i; ++ ++ for (i = bv.bv_offset >> 9; ++ i < (bv.bv_offset + bv.bv_len) >> 9; ++ i++) { ++ s->s[i].nr_replicas = nr_ptrs; ++ s->s[i].state = state; ++ } ++ } ++} ++ ++static bool extent_partial_reads_expensive(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (crc.csum_type || crc.compression_type) ++ return true; ++ return false; ++} ++ ++static void readpage_bio_extend(struct readpages_iter *iter, ++ struct bio *bio, ++ unsigned sectors_this_extent, ++ bool get_more) ++{ ++ while (bio_sectors(bio) < sectors_this_extent && ++ bio->bi_vcnt < bio->bi_max_vecs) { ++ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; ++ struct page *page = readpage_iter_next(iter); ++ int ret; ++ ++ if (page) { ++ if (iter->offset + iter->idx != page_offset) ++ break; ++ ++ iter->idx++; ++ } else { ++ if (!get_more) ++ break; ++ ++ page = xa_load(&iter->mapping->i_pages, page_offset); ++ if (page && !xa_is_value(page)) ++ break; ++ ++ page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); ++ if (!page) ++ break; ++ ++ if (!__bch2_page_state_create(page, 0)) { ++ put_page(page); ++ break; ++ } ++ ++ ret = add_to_page_cache_lru(page, iter->mapping, ++ page_offset, GFP_NOFS); ++ if (ret) { ++ __bch2_page_state_release(page); ++ put_page(page); ++ break; ++ } ++ ++ put_page(page); ++ } ++ ++ BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); ++ } ++} ++ ++static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, ++ struct bch_read_bio *rbio, u64 inum, ++ struct readpages_iter *readpages_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_on_stack sk; ++ int flags = BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE; ++ int ret = 0; ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ ++ bkey_on_stack_init(&sk); ++retry: ++ while (1) { ++ struct bkey_s_c k; ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bch2_btree_iter_set_pos(iter, ++ POS(inum, rbio->bio.bi_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(trans, ++ &offset_into_extent, &sk); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bch2_trans_unlock(trans); ++ ++ if (readpages_iter) ++ readpage_bio_extend(readpages_iter, &rbio->bio, sectors, ++ extent_partial_reads_expensive(k)); ++ ++ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ ++ if (rbio->bio.bi_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ if (bkey_extent_is_allocation(k.k)) ++ bch2_add_page_sectors(&rbio->bio, k); ++ ++ bch2_read_extent(c, rbio, k, offset_into_extent, flags); ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ bio_advance(&rbio->bio, bytes); ++ } ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ if (ret) { ++ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); ++ bio_endio(&rbio->bio); ++ } ++ ++ bkey_on_stack_exit(&sk, c); ++} ++ ++int bch2_readpages(struct file *file, struct address_space *mapping, 
++ struct list_head *pages, unsigned nr_pages) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct page *page; ++ struct readpages_iter readpages_iter; ++ int ret; ++ ++ ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); ++ BUG_ON(ret); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS); ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ while ((page = readpage_iter_next(&readpages_iter))) { ++ pgoff_t index = readpages_iter.offset + readpages_iter.idx; ++ unsigned n = min_t(unsigned, ++ readpages_iter.nr_pages - ++ readpages_iter.idx, ++ BIO_MAX_PAGES); ++ struct bch_read_bio *rbio = ++ rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), ++ opts); ++ ++ readpages_iter.idx++; ++ ++ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); ++ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bchfs_read(&trans, iter, rbio, inode->v.i_ino, ++ &readpages_iter); ++ } ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_trans_exit(&trans); ++ kfree(readpages_iter.pages); ++ ++ return 0; ++} ++ ++static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, ++ u64 inum, struct page *page) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ ++ bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); ++ rbio->bio.bi_iter.bi_sector = ++ (sector_t) page->index << PAGE_SECTOR_SHIFT; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS); ++ ++ bchfs_read(&trans, iter, rbio, inum, NULL); ++ ++ bch2_trans_exit(&trans); ++} ++ ++int bch2_readpage(struct file *file, struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct bch_read_bio *rbio; ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ ++ __bchfs_readpage(c, rbio, inode->v.i_ino, page); ++ return 0; ++} ++ ++static void bch2_read_single_page_end_io(struct bio *bio) ++{ ++ complete(bio->bi_private); ++} ++ ++static int bch2_read_single_page(struct page *page, ++ struct address_space *mapping) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_read_bio *rbio; ++ int ret; ++ DECLARE_COMPLETION_ONSTACK(done); ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), ++ io_opts(c, &inode->ei_inode)); ++ rbio->bio.bi_private = &done; ++ rbio->bio.bi_end_io = bch2_read_single_page_end_io; ++ ++ __bchfs_readpage(c, rbio, inode->v.i_ino, page); ++ wait_for_completion(&done); ++ ++ ret = blk_status_to_errno(rbio->bio.bi_status); ++ bio_put(&rbio->bio); ++ ++ if (ret < 0) ++ return ret; ++ ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* writepages: */ ++ ++struct bch_writepage_state { ++ struct bch_writepage_io *io; ++ struct bch_io_opts opts; ++}; ++ ++static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, 
++ struct bch_inode_info *inode) ++{ ++ return (struct bch_writepage_state) { ++ .opts = io_opts(c, &inode->ei_inode) ++ }; ++} ++ ++static void bch2_writepage_io_free(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ ++ bio_put(&io->op.wbio.bio); ++} ++ ++static void bch2_writepage_io_done(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ struct bch_fs *c = io->op.c; ++ struct bio *bio = &io->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bvec; ++ unsigned i; ++ ++ if (io->op.error) { ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ SetPageError(bvec->bv_page); ++ mapping_set_error(bvec->bv_page->mapping, -EIO); ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ /* ++ * racing with fallocate can cause us to add fewer sectors than ++ * expected - but we shouldn't add more sectors than expected: ++ */ ++ BUG_ON(io->op.i_sectors_delta > 0); ++ ++ /* ++ * (error (due to going RO) halfway through a page can screw that up ++ * slightly) ++ * XXX wtf? ++ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); ++ */ ++ ++ /* ++ * PageWriteback is effectively our ref on the inode - fixup i_blocks ++ * before calling end_page_writeback: ++ */ ++ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); ++ ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s = __bch2_page_state(bvec->bv_page); ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(bvec->bv_page); ++ } ++ ++ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); ++} ++ ++static void bch2_writepage_do_io(struct bch_writepage_state *w) ++{ ++ struct bch_writepage_io *io = w->io; ++ ++ w->io = NULL; ++ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); ++ continue_at(&io->cl, bch2_writepage_io_done, NULL); ++} ++ ++/* ++ * Get a bch_writepage_io and add @page to it - appending to an existing one if ++ * possible, else allocating a new one: ++ */ ++static void bch2_writepage_io_alloc(struct bch_fs *c, ++ struct writeback_control *wbc, ++ struct bch_writepage_state *w, ++ struct bch_inode_info *inode, ++ u64 sector, ++ unsigned nr_replicas) ++{ ++ struct bch_write_op *op; ++ ++ w->io = container_of(bio_alloc_bioset(GFP_NOFS, ++ BIO_MAX_PAGES, ++ &c->writepage_bioset), ++ struct bch_writepage_io, op.wbio.bio); ++ ++ closure_init(&w->io->cl, NULL); ++ w->io->inode = inode; ++ ++ op = &w->io->op; ++ bch2_write_op_init(op, c, w->opts); ++ op->target = w->opts.foreground_target; ++ op_journal_seq_set(op, &inode->ei_journal_seq); ++ op->nr_replicas = nr_replicas; ++ op->res.nr_replicas = nr_replicas; ++ op->write_point = writepoint_hashed(inode->ei_last_dirtied); ++ op->pos = POS(inode->v.i_ino, sector); ++ op->wbio.bio.bi_iter.bi_sector = sector; ++ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); ++} ++ ++static int __bch2_writepage(struct page *page, ++ struct writeback_control *wbc, ++ void *data) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; 
++ struct bch_writepage_state *w = data; ++ struct bch_page_state *s, orig; ++ unsigned i, offset, nr_replicas_this_write = U32_MAX; ++ loff_t i_size = i_size_read(&inode->v); ++ pgoff_t end_index = i_size >> PAGE_SHIFT; ++ int ret; ++ ++ EBUG_ON(!PageUptodate(page)); ++ ++ /* Is the page fully inside i_size? */ ++ if (page->index < end_index) ++ goto do_io; ++ ++ /* Is the page fully outside i_size? (truncate in progress) */ ++ offset = i_size & (PAGE_SIZE - 1); ++ if (page->index > end_index || !offset) { ++ unlock_page(page); ++ return 0; ++ } ++ ++ /* ++ * The page straddles i_size. It must be zeroed out on each and every ++ * writepage invocation because it may be mmapped. "A file is mapped ++ * in multiples of the page size. For a file that is not a multiple of ++ * the page size, the remaining memory is zeroed when mapped, and ++ * writes to that region are not written out to the file." ++ */ ++ zero_user_segment(page, offset, PAGE_SIZE); ++do_io: ++ s = bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ ret = bch2_get_page_disk_reservation(c, inode, page, true); ++ if (ret) { ++ SetPageError(page); ++ mapping_set_error(page->mapping, ret); ++ unlock_page(page); ++ return 0; ++ } ++ ++ /* Before unlocking the page, get copy of reservations: */ ++ orig = *s; ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ nr_replicas_this_write = ++ min_t(unsigned, nr_replicas_this_write, ++ s->s[i].nr_replicas + ++ s->s[i].replicas_reserved); ++ } ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ s->s[i].nr_replicas = w->opts.compression ++ ? 0 : nr_replicas_this_write; ++ ++ s->s[i].replicas_reserved = 0; ++ s->s[i].state = SECTOR_ALLOCATED; ++ } ++ ++ BUG_ON(atomic_read(&s->write_count)); ++ atomic_set(&s->write_count, 1); ++ ++ BUG_ON(PageWriteback(page)); ++ set_page_writeback(page); ++ ++ unlock_page(page); ++ ++ offset = 0; ++ while (1) { ++ unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; ++ u64 sector; ++ ++ while (offset < PAGE_SECTORS && ++ orig.s[offset].state < SECTOR_DIRTY) ++ offset++; ++ ++ if (offset == PAGE_SECTORS) ++ break; ++ ++ sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; ++ ++ while (offset + sectors < PAGE_SECTORS && ++ orig.s[offset + sectors].state >= SECTOR_DIRTY) ++ sectors++; ++ ++ for (i = offset; i < offset + sectors; i++) { ++ reserved_sectors += orig.s[i].replicas_reserved; ++ dirty_sectors += orig.s[i].state == SECTOR_DIRTY; ++ } ++ ++ if (w->io && ++ (w->io->op.res.nr_replicas != nr_replicas_this_write || ++ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || ++ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= ++ (BIO_MAX_PAGES * PAGE_SIZE) || ++ bio_end_sector(&w->io->op.wbio.bio) != sector)) ++ bch2_writepage_do_io(w); ++ ++ if (!w->io) ++ bch2_writepage_io_alloc(c, wbc, w, inode, sector, ++ nr_replicas_this_write); ++ ++ atomic_inc(&s->write_count); ++ ++ BUG_ON(inode != w->io->inode); ++ BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, ++ sectors << 9, offset << 9)); ++ ++ /* Check for writing past i_size: */ ++ WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > ++ round_up(i_size, block_bytes(c))); ++ ++ w->io->op.res.sectors += reserved_sectors; ++ w->io->op.i_sectors_delta -= dirty_sectors; ++ w->io->op.new_i_size = i_size; ++ ++ offset += sectors; ++ } ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(page); ++ ++ return 0; ++} ++ ++int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) 
++{ ++ struct bch_fs *c = mapping->host->i_sb->s_fs_info; ++ struct bch_writepage_state w = ++ bch_writepage_state_init(c, to_bch_ei(mapping->host)); ++ struct blk_plug plug; ++ int ret; ++ ++ blk_start_plug(&plug); ++ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); ++ if (w.io) ++ bch2_writepage_do_io(&w); ++ blk_finish_plug(&plug); ++ return ret; ++} ++ ++int bch2_writepage(struct page *page, struct writeback_control *wbc) ++{ ++ struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; ++ struct bch_writepage_state w = ++ bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); ++ int ret; ++ ++ ret = __bch2_writepage(page, wbc, &w); ++ if (w.io) ++ bch2_writepage_do_io(&w); ++ ++ return ret; ++} ++ ++/* buffered writes: */ ++ ++int bch2_write_begin(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res; ++ pgoff_t index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ struct page *page; ++ int ret = -ENOMEM; ++ ++ res = kmalloc(sizeof(*res), GFP_KERNEL); ++ if (!res) ++ return -ENOMEM; ++ ++ bch2_page_reservation_init(c, inode, res); ++ *fsdata = res; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ page = grab_cache_page_write_begin(mapping, index, flags); ++ if (!page) ++ goto err_unlock; ++ ++ if (PageUptodate(page)) ++ goto out; ++ ++ /* If we're writing entire page, don't need to read it in first: */ ++ if (len == PAGE_SIZE) ++ goto out; ++ ++ if (!offset && pos + len >= inode->v.i_size) { ++ zero_user_segment(page, len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++ ++ if (index > inode->v.i_size >> PAGE_SHIFT) { ++ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++readpage: ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto err; ++out: ++ ret = bch2_page_reservation_get(c, inode, page, res, ++ offset, len, true); ++ if (ret) { ++ if (!PageUptodate(page)) { ++ /* ++ * If the page hasn't been read in, we won't know if we ++ * actually need a reservation - we don't actually need ++ * to read here, we just need to check if the page is ++ * fully backed by uncompressed data: ++ */ ++ goto readpage; ++ } ++ ++ goto err; ++ } ++ ++ *pagep = page; ++ return 0; ++err: ++ unlock_page(page); ++ put_page(page); ++ *pagep = NULL; ++err_unlock: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ kfree(res); ++ *fsdata = NULL; ++ return ret; ++} ++ ++int bch2_write_end(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res = fsdata; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ if (unlikely(copied < len && !PageUptodate(page))) { ++ /* ++ * The page needs to be read in, but that would destroy ++ * our partial write - simplest thing is to just force ++ * userspace to redo the write: ++ */ ++ zero_user(page, 0, PAGE_SIZE); ++ flush_dcache_page(page); ++ copied = 0; ++ } ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ if (copied) { ++ if 
(!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, res, offset, copied); ++ ++ inode->ei_last_dirtied = (unsigned long) current; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_page_reservation_put(c, inode, res); ++ kfree(res); ++ ++ return copied; ++} ++ ++#define WRITE_BATCH_PAGES 32 ++ ++static int __bch2_buffered_write(struct bch_inode_info *inode, ++ struct address_space *mapping, ++ struct iov_iter *iter, ++ loff_t pos, unsigned len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct page *pages[WRITE_BATCH_PAGES]; ++ struct bch2_page_reservation res; ++ unsigned long index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); ++ unsigned i, reserved = 0, set_dirty = 0; ++ unsigned copied = 0, nr_pages_copied = 0; ++ int ret = 0; ++ ++ BUG_ON(!len); ++ BUG_ON(nr_pages > ARRAY_SIZE(pages)); ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ for (i = 0; i < nr_pages; i++) { ++ pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); ++ if (!pages[i]) { ++ nr_pages = i; ++ if (!i) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ len = min_t(unsigned, len, ++ nr_pages * PAGE_SIZE - offset); ++ break; ++ } ++ } ++ ++ if (offset && !PageUptodate(pages[0])) { ++ ret = bch2_read_single_page(pages[0], mapping); ++ if (ret) ++ goto out; ++ } ++ ++ if ((pos + len) & (PAGE_SIZE - 1) && ++ !PageUptodate(pages[nr_pages - 1])) { ++ if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { ++ zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); ++ } else { ++ ret = bch2_read_single_page(pages[nr_pages - 1], mapping); ++ if (ret) ++ goto out; ++ } ++ } ++ ++ while (reserved < len) { ++ struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - reserved, ++ PAGE_SIZE - pg_offset); ++retry_reservation: ++ ret = bch2_page_reservation_get(c, inode, page, &res, ++ pg_offset, pg_len, true); ++ ++ if (ret && !PageUptodate(page)) { ++ ret = bch2_read_single_page(page, mapping); ++ if (!ret) ++ goto retry_reservation; ++ } ++ ++ if (ret) ++ goto out; ++ ++ reserved += pg_len; ++ } ++ ++ if (mapping_writably_mapped(mapping)) ++ for (i = 0; i < nr_pages; i++) ++ flush_dcache_page(pages[i]); ++ ++ while (copied < len) { ++ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - copied, ++ PAGE_SIZE - pg_offset); ++ unsigned pg_copied = iov_iter_copy_from_user_atomic(page, ++ iter, pg_offset, pg_len); ++ ++ if (!pg_copied) ++ break; ++ ++ if (!PageUptodate(page) && ++ pg_copied != PAGE_SIZE && ++ pos + copied + pg_copied < inode->v.i_size) { ++ zero_user(page, 0, PAGE_SIZE); ++ break; ++ } ++ ++ flush_dcache_page(page); ++ iov_iter_advance(iter, pg_copied); ++ copied += pg_copied; ++ ++ if (pg_copied != pg_len) ++ break; ++ } ++ ++ if (!copied) ++ goto out; ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ while (set_dirty < copied) { ++ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, copied - set_dirty, ++ PAGE_SIZE - pg_offset); ++ ++ if (!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ 
bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); ++ unlock_page(page); ++ put_page(page); ++ ++ set_dirty += pg_len; ++ } ++ ++ nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); ++ inode->ei_last_dirtied = (unsigned long) current; ++out: ++ for (i = nr_pages_copied; i < nr_pages; i++) { ++ unlock_page(pages[i]); ++ put_page(pages[i]); ++ } ++ ++ bch2_page_reservation_put(c, inode, &res); ++ ++ return copied ?: ret; ++} ++ ++static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ loff_t pos = iocb->ki_pos; ++ ssize_t written = 0; ++ int ret = 0; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ do { ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE * WRITE_BATCH_PAGES - offset); ++again: ++ /* ++ * Bring in the user page that we will copy from _first_. ++ * Otherwise there's a nasty deadlock on copying from the ++ * same page as we're writing to, without it being marked ++ * up-to-date. ++ * ++ * Not only is this an optimisation, but it is also required ++ * to check that the address is actually valid, when atomic ++ * usercopies are used, below. ++ */ ++ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { ++ bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE - offset); ++ ++ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { ++ ret = -EFAULT; ++ break; ++ } ++ } ++ ++ if (unlikely(fatal_signal_pending(current))) { ++ ret = -EINTR; ++ break; ++ } ++ ++ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); ++ if (unlikely(ret < 0)) ++ break; ++ ++ cond_resched(); ++ ++ if (unlikely(ret == 0)) { ++ /* ++ * If we were unable to copy any data at all, we must ++ * fall back to a single segment length write. ++ * ++ * If we didn't fallback here, we could livelock ++ * because not all segments in the iov can be copied at ++ * once without a pagefault. ++ */ ++ bytes = min_t(unsigned long, PAGE_SIZE - offset, ++ iov_iter_single_seg_count(iter)); ++ goto again; ++ } ++ pos += ret; ++ written += ret; ++ ret = 0; ++ ++ balance_dirty_pages_ratelimited(mapping); ++ } while (iov_iter_count(iter)); ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return written ? 
written : ret; ++} ++ ++/* O_DIRECT reads */ ++ ++static void bch2_dio_read_complete(struct closure *cl) ++{ ++ struct dio_read *dio = container_of(cl, struct dio_read, cl); ++ ++ dio->req->ki_complete(dio->req, dio->ret, 0); ++ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ ++} ++ ++static void bch2_direct_IO_read_endio(struct bio *bio) ++{ ++ struct dio_read *dio = bio->bi_private; ++ ++ if (bio->bi_status) ++ dio->ret = blk_status_to_errno(bio->bi_status); ++ ++ closure_put(&dio->cl); ++} ++ ++static void bch2_direct_IO_read_split_endio(struct bio *bio) ++{ ++ bch2_direct_IO_read_endio(bio); ++ bio_check_pages_dirty(bio); /* transfers ownership */ ++} ++ ++static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct dio_read *dio; ++ struct bio *bio; ++ loff_t offset = req->ki_pos; ++ bool sync = is_sync_kiocb(req); ++ size_t shorten; ++ ssize_t ret; ++ ++ if ((offset|iter->count) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ ret = min_t(loff_t, iter->count, ++ max_t(loff_t, 0, i_size_read(&inode->v) - offset)); ++ ++ if (!ret) ++ return ret; ++ ++ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); ++ iter->count -= shorten; ++ ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_PAGES), ++ &c->dio_read_bioset); ++ ++ bio->bi_end_io = bch2_direct_IO_read_endio; ++ ++ dio = container_of(bio, struct dio_read, rbio.bio); ++ closure_init(&dio->cl, NULL); ++ ++ /* ++ * this is a _really_ horrible hack just to avoid an atomic sub at the ++ * end: ++ */ ++ if (!sync) { ++ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); ++ atomic_set(&dio->cl.remaining, ++ CLOSURE_REMAINING_INITIALIZER - ++ CLOSURE_RUNNING + ++ CLOSURE_DESTRUCTOR); ++ } else { ++ atomic_set(&dio->cl.remaining, ++ CLOSURE_REMAINING_INITIALIZER + 1); ++ } ++ ++ dio->req = req; ++ dio->ret = ret; ++ ++ goto start; ++ while (iter->count) { ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_PAGES), ++ &c->bio_read); ++ bio->bi_end_io = bch2_direct_IO_read_split_endio; ++start: ++ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); ++ bio->bi_iter.bi_sector = offset >> 9; ++ bio->bi_private = dio; ++ ++ ret = bio_iov_iter_get_pages(bio, iter); ++ if (ret < 0) { ++ /* XXX: fault inject this path */ ++ bio->bi_status = BLK_STS_RESOURCE; ++ bio_endio(bio); ++ break; ++ } ++ ++ offset += bio->bi_iter.bi_size; ++ bio_set_pages_dirty(bio); ++ ++ if (iter->count) ++ closure_get(&dio->cl); ++ ++ bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); ++ } ++ ++ iter->count += shorten; ++ ++ if (sync) { ++ closure_sync(&dio->cl); ++ closure_debug_destroy(&dio->cl); ++ ret = dio->ret; ++ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ ++ return ret; ++ } else { ++ return -EIOCBQUEUED; ++ } ++} ++ ++ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ size_t count = iov_iter_count(iter); ++ ssize_t ret; ++ ++ if (!count) ++ return 0; /* skip atime */ ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ struct blk_plug plug; ++ ++ ret = filemap_write_and_wait_range(mapping, ++ iocb->ki_pos, ++ iocb->ki_pos + count - 1); ++ if (ret < 0) ++ return ret; ++ ++ file_accessed(file); ++ ++ 
blk_start_plug(&plug); ++ ret = bch2_direct_IO_read(iocb, iter); ++ blk_finish_plug(&plug); ++ ++ if (ret >= 0) ++ iocb->ki_pos += ret; ++ } else { ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ret = generic_file_read_iter(iocb, iter); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ } ++ ++ return ret; ++} ++ ++/* O_DIRECT writes */ ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *); ++ ++static long bch2_dio_write_loop(struct dio_write *dio) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct kiocb *req = dio->req; ++ struct address_space *mapping = req->ki_filp->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bio *bio = &dio->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ unsigned unaligned; ++ bool sync = dio->sync; ++ long ret; ++ ++ if (dio->loop) ++ goto loop; ++ ++ while (1) { ++ if (kthread) ++ kthread_use_mm(dio->mm); ++ BUG_ON(current->faults_disabled_mapping); ++ current->faults_disabled_mapping = mapping; ++ ++ ret = bio_iov_iter_get_pages(bio, &dio->iter); ++ ++ current->faults_disabled_mapping = NULL; ++ if (kthread) ++ kthread_unuse_mm(dio->mm); ++ ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); ++ bio->bi_iter.bi_size -= unaligned; ++ iov_iter_revert(&dio->iter, unaligned); ++ ++ if (!bio->bi_iter.bi_size) { ++ /* ++ * bio_iov_iter_get_pages was only able to get < ++ * blocksize worth of pages: ++ */ ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ ret = -EFAULT; ++ goto err; ++ } ++ ++ bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); ++ dio->op.end_io = bch2_dio_write_loop_async; ++ dio->op.target = dio->op.opts.foreground_target; ++ op_journal_seq_set(&dio->op, &inode->ei_journal_seq); ++ dio->op.write_point = writepoint_hashed((unsigned long) current); ++ dio->op.nr_replicas = dio->op.opts.data_replicas; ++ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); ++ ++ if ((req->ki_flags & IOCB_DSYNC) && ++ !c->opts.journal_flush_disabled) ++ dio->op.flags |= BCH_WRITE_FLUSH; ++ ++ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), ++ dio->op.opts.data_replicas, 0); ++ if (unlikely(ret) && ++ !bch2_check_range_allocated(c, dio->op.pos, ++ bio_sectors(bio), dio->op.opts.data_replicas)) ++ goto err; ++ ++ task_io_account_write(bio->bi_iter.bi_size); ++ ++ if (!dio->sync && !dio->loop && dio->iter.count) { ++ struct iovec *iov = dio->inline_vecs; ++ ++ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { ++ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), ++ GFP_KERNEL); ++ if (unlikely(!iov)) { ++ dio->sync = sync = true; ++ goto do_io; ++ } ++ ++ dio->free_iov = true; ++ } ++ ++ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); ++ dio->iter.iov = iov; ++ } ++do_io: ++ dio->loop = true; ++ closure_call(&dio->op.cl, bch2_write, NULL, NULL); ++ ++ if (sync) ++ wait_for_completion(&dio->done); ++ else ++ return -EIOCBQUEUED; ++loop: ++ i_sectors_acct(c, inode, &dio->quota_res, ++ dio->op.i_sectors_delta); ++ req->ki_pos += (u64) dio->op.written << 9; ++ dio->written += dio->op.written; ++ ++ spin_lock(&inode->v.i_lock); ++ if (req->ki_pos > inode->v.i_size) ++ i_size_write(&inode->v, req->ki_pos); ++ spin_unlock(&inode->v.i_lock); ++ ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ if (!dio->iter.count || dio->op.error) ++ break; ++ ++ bio_reset(bio); ++ 
reinit_completion(&dio->done); ++ } ++ ++ ret = dio->op.error ?: ((long) dio->written << 9); ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ ++ if (dio->free_iov) ++ kfree(dio->iter.iov); ++ ++ bio_put(bio); ++ ++ /* inode->i_dio_count is our ref on inode and thus bch_fs */ ++ inode_dio_end(&inode->v); ++ ++ if (!sync) { ++ req->ki_complete(req, ret, 0); ++ ret = -EIOCBQUEUED; ++ } ++ return ret; ++} ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *op) ++{ ++ struct dio_write *dio = container_of(op, struct dio_write, op); ++ ++ if (dio->sync) ++ complete(&dio->done); ++ else ++ bch2_dio_write_loop(dio); ++} ++ ++static noinline ++ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct dio_write *dio; ++ struct bio *bio; ++ bool locked = true, extending; ++ ssize_t ret; ++ ++ prefetch(&c->opts); ++ prefetch((void *) &c->opts + 64); ++ prefetch(&inode->ei_inode); ++ prefetch((void *) &inode->ei_inode + 64); ++ ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(req, iter); ++ if (unlikely(ret <= 0)) ++ goto err; ++ ++ ret = file_remove_privs(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = file_update_time(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) ++ goto err; ++ ++ inode_dio_begin(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ extending = req->ki_pos + iter->count > inode->v.i_size; ++ if (!extending) { ++ inode_unlock(&inode->v); ++ locked = false; ++ } ++ ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_PAGES), ++ &c->dio_write_bioset); ++ dio = container_of(bio, struct dio_write, op.wbio.bio); ++ init_completion(&dio->done); ++ dio->req = req; ++ dio->mm = current->mm; ++ dio->loop = false; ++ dio->sync = is_sync_kiocb(req) || extending; ++ dio->free_iov = false; ++ dio->quota_res.sectors = 0; ++ dio->written = 0; ++ dio->iter = *iter; ++ ++ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, ++ iter->count >> 9, true); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter->count - 1); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = bch2_dio_write_loop(dio); ++err: ++ if (locked) ++ inode_unlock(&inode->v); ++ return ret; ++err_put_bio: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ bio_put(bio); ++ inode_dio_end(&inode->v); ++ goto err; ++} ++ ++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) ++ return bch2_direct_write(iocb, from); ++ ++ /* We can write back this queue in page reclaim */ ++ current->backing_dev_info = inode_to_bdi(&inode->v); ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto unlock; ++ ++ ret = file_remove_privs(file); ++ if (ret) ++ goto unlock; ++ ++ ret = file_update_time(file); ++ if (ret) ++ goto unlock; ++ ++ ret = bch2_buffered_write(iocb, from); ++ if (likely(ret > 0)) ++ iocb->ki_pos += ret; ++unlock: ++ inode_unlock(&inode->v); ++ 
current->backing_dev_info = NULL; ++ ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++ ++ return ret; ++} ++ ++/* fsync: */ ++ ++int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret, ret2; ++ ++ ret = file_write_and_wait_range(file, start, end); ++ if (ret) ++ return ret; ++ ++ if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) ++ goto out; ++ ++ ret = sync_inode_metadata(&inode->v, 1); ++ if (ret) ++ return ret; ++out: ++ if (!c->opts.journal_flush_disabled) ++ ret = bch2_journal_flush_seq(&c->journal, ++ inode->ei_journal_seq); ++ ret2 = file_check_and_advance_wb_err(file); ++ ++ return ret ?: ret2; ++} ++ ++/* truncate: */ ++ ++static inline int range_has_data(struct bch_fs *c, ++ struct bpos start, ++ struct bpos end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (bkey_extent_is_data(k.k)) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++static int __bch2_truncate_page(struct bch_inode_info *inode, ++ pgoff_t index, loff_t start, loff_t end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_page_state *s; ++ unsigned start_offset = start & (PAGE_SIZE - 1); ++ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; ++ unsigned i; ++ struct page *page; ++ int ret = 0; ++ ++ /* Page boundary? Nothing to do */ ++ if (!((index == start >> PAGE_SHIFT && start_offset) || ++ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) ++ return 0; ++ ++ /* Above i_size? */ ++ if (index << PAGE_SHIFT >= inode->v.i_size) ++ return 0; ++ ++ page = find_lock_page(mapping, index); ++ if (!page) { ++ /* ++ * XXX: we're doing two index lookups when we end up reading the ++ * page ++ */ ++ ret = range_has_data(c, ++ POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), ++ POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); ++ if (ret <= 0) ++ return ret; ++ ++ page = find_or_create_page(mapping, index, GFP_KERNEL); ++ if (unlikely(!page)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ s = bch2_page_state_create(page, 0); ++ if (!s) { ++ ret = -ENOMEM; ++ goto unlock; ++ } ++ ++ if (!PageUptodate(page)) { ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto unlock; ++ } ++ ++ if (index != start >> PAGE_SHIFT) ++ start_offset = 0; ++ if (index != end >> PAGE_SHIFT) ++ end_offset = PAGE_SIZE; ++ ++ for (i = round_up(start_offset, block_bytes(c)) >> 9; ++ i < round_down(end_offset, block_bytes(c)) >> 9; ++ i++) { ++ s->s[i].nr_replicas = 0; ++ s->s[i].state = SECTOR_UNALLOCATED; ++ } ++ ++ zero_user_segment(page, start_offset, end_offset); ++ ++ /* ++ * Bit of a hack - we don't want truncate to fail due to -ENOSPC. ++ * ++ * XXX: because we aren't currently tracking whether the page has actual ++ * data in it (vs. just 0s, or only partially written) this wrong. ick. 
++ */ ++ ret = bch2_get_page_disk_reservation(c, inode, page, false); ++ BUG_ON(ret); ++ ++ __set_page_dirty_nobuffers(page); ++unlock: ++ unlock_page(page); ++ put_page(page); ++out: ++ return ret; ++} ++ ++static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) ++{ ++ return __bch2_truncate_page(inode, from >> PAGE_SHIFT, ++ from, round_up(from, PAGE_SIZE)); ++} ++ ++static int bch2_extend(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *inode_u, ++ struct iattr *iattr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ int ret; ++ ++ /* ++ * sync appends: ++ * ++ * this has to be done _before_ extending i_size: ++ */ ++ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); ++ if (ret) ++ return ret; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ setattr_copy(&inode->v, iattr); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, inode->v.i_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static int bch2_truncate_finish_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); ++ return 0; ++} ++ ++static int bch2_truncate_start_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, void *p) ++{ ++ u64 *new_i_size = p; ++ ++ bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; ++ bi->bi_size = *new_i_size; ++ return 0; ++} ++ ++int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_inode_unpacked inode_u; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ u64 new_i_size = iattr->ia_size; ++ s64 i_sectors_delta = 0; ++ int ret = 0; ++ ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ /* ++ * fetch current on disk i_size: inode is locked, i_size can only ++ * increase underneath us: ++ */ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); ++ ret = PTR_ERR_OR_ZERO(iter); ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ goto err; ++ ++ /* ++ * check this before next assertion; on filesystem error our normal ++ * invariants are a bit broken (truncate has to truncate the page cache ++ * before the inode). ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ ++ BUG_ON(inode->v.i_size < inode_u.bi_size); ++ ++ if (iattr->ia_size > inode->v.i_size) { ++ ret = bch2_extend(inode, &inode_u, iattr); ++ goto err; ++ } ++ ++ ret = bch2_truncate_page(inode, iattr->ia_size); ++ if (unlikely(ret)) ++ goto err; ++ ++ /* ++ * When extending, we're going to write the new i_size to disk ++ * immediately so we need to flush anything above the current on disk ++ * i_size first: ++ * ++ * Also, when extending we need to flush the page that i_size currently ++ * straddles - if it's mapped to userspace, we need to ensure that ++ * userspace has to redirty it and call .mkwrite -> set_page_dirty ++ * again to allocate the part of the page that was extended. 
++ */ ++ if (iattr->ia_size > inode_u.bi_size) ++ ret = filemap_write_and_wait_range(mapping, ++ inode_u.bi_size, ++ iattr->ia_size - 1); ++ else if (iattr->ia_size & (PAGE_SIZE - 1)) ++ ret = filemap_write_and_wait_range(mapping, ++ round_down(iattr->ia_size, PAGE_SIZE), ++ iattr->ia_size - 1); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, ++ &new_i_size, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ ++ ret = bch2_fpunch(c, inode->v.i_ino, ++ round_up(iattr->ia_size, block_bytes(c)) >> 9, ++ U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ setattr_copy(&inode->v, iattr); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ return ret; ++} ++ ++/* fallocate: */ ++ ++static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ u64 discard_start = round_up(offset, block_bytes(c)) >> 9; ++ u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; ++ int ret = 0; ++ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ ret = __bch2_truncate_page(inode, ++ offset >> PAGE_SHIFT, ++ offset, offset + len); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (offset >> PAGE_SHIFT != ++ (offset + len) >> PAGE_SHIFT) { ++ ret = __bch2_truncate_page(inode, ++ (offset + len) >> PAGE_SHIFT, ++ offset, offset + len); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ truncate_pagecache_range(&inode->v, offset, offset + len - 1); ++ ++ if (discard_start < discard_end) { ++ s64 i_sectors_delta = 0; ++ ++ ret = bch2_fpunch(c, inode->v.i_ino, ++ discard_start, discard_end, ++ &inode->ei_journal_seq, ++ &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ } ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ ++ return ret; ++} ++ ++static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ++ loff_t offset, loff_t len, ++ bool insert) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bkey_on_stack copy; ++ struct btree_trans trans; ++ struct btree_iter *src, *dst; ++ loff_t shift, new_size; ++ u64 src_start; ++ int ret; ++ ++ if ((offset | len) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ bkey_on_stack_init(©); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); ++ ++ /* ++ * We need i_mutex to keep the page cache consistent with the extents ++ * btree, and the btree consistent with i_size - we don't need outside ++ * locking for the extents btree itself, because we're using linked ++ * iterators ++ */ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ if (insert) { ++ ret = -EFBIG; ++ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) ++ goto err; ++ ++ ret = -EINVAL; ++ if (offset >= inode->v.i_size) ++ goto err; ++ ++ src_start = U64_MAX; ++ shift = len; ++ } else { ++ ret = -EINVAL; ++ if (offset + len >= inode->v.i_size) ++ goto err; ++ ++ src_start = offset + len; ++ shift = -len; ++ } ++ ++ new_size = 
inode->v.i_size + shift; ++ ++ ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); ++ if (ret) ++ goto err; ++ ++ if (insert) { ++ i_size_write(&inode->v, new_size); ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, new_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ } else { ++ s64 i_sectors_delta = 0; ++ ++ ret = bch2_fpunch(c, inode->v.i_ino, ++ offset >> 9, (offset + len) >> 9, ++ &inode->ei_journal_seq, ++ &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ if (ret) ++ goto err; ++ } ++ ++ src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, src_start >> 9), ++ BTREE_ITER_INTENT); ++ BUG_ON(IS_ERR_OR_NULL(src)); ++ ++ dst = bch2_trans_copy_iter(&trans, src); ++ BUG_ON(IS_ERR_OR_NULL(dst)); ++ ++ while (1) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i delete; ++ struct bkey_s_c k; ++ struct bpos next_pos; ++ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); ++ struct bpos atomic_end; ++ unsigned trigger_flags = 0; ++ ++ k = insert ++ ? bch2_btree_iter_peek_prev(src) ++ : bch2_btree_iter_peek(src); ++ if ((ret = bkey_err(k))) ++ goto bkey_err; ++ ++ if (!k.k || k.k->p.inode != inode->v.i_ino) ++ break; ++ ++ BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); ++ ++ if (insert && ++ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) ++ break; ++reassemble: ++ bkey_on_stack_reassemble(©, c, k); ++ ++ if (insert && ++ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) ++ bch2_cut_front(move_pos, copy.k); ++ ++ copy.k->k.p.offset += shift >> 9; ++ bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); ++ ++ ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); ++ if (ret) ++ goto bkey_err; ++ ++ if (bkey_cmp(atomic_end, copy.k->k.p)) { ++ if (insert) { ++ move_pos = atomic_end; ++ move_pos.offset -= shift >> 9; ++ goto reassemble; ++ } else { ++ bch2_cut_back(atomic_end, copy.k); ++ } ++ } ++ ++ bkey_init(&delete.k); ++ delete.k.p = copy.k->k.p; ++ delete.k.size = copy.k->k.size; ++ delete.k.p.offset -= shift >> 9; ++ ++ next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; ++ ++ if (copy.k->k.size == k.k->size) { ++ /* ++ * If we're moving the entire extent, we can skip ++ * running triggers: ++ */ ++ trigger_flags |= BTREE_TRIGGER_NORUN; ++ } else { ++ /* We might end up splitting compressed extents: */ ++ unsigned nr_ptrs = ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ copy.k->k.size, nr_ptrs, ++ BCH_DISK_RESERVATION_NOFAIL); ++ BUG_ON(ret); ++ } ++ ++ bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); ++ ++ ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: ++ bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: ++ bch2_trans_commit(&trans, &disk_res, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(c, &disk_res); ++bkey_err: ++ if (!ret) ++ bch2_btree_iter_set_pos(src, next_pos); ++ ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ goto err; ++ ++ bch2_trans_cond_resched(&trans); ++ } ++ bch2_trans_unlock(&trans); ++ ++ if (!insert) { ++ i_size_write(&inode->v, new_size); ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, new_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ } ++err: ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(©, c); ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ return ret; ++} ++ ++static long bchfs_fallocate(struct bch_inode_info *inode, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bpos end_pos; ++ loff_t end = offset + len; ++ loff_t block_start = round_down(offset, block_bytes(c)); ++ loff_t block_end = round_up(end, block_bytes(c)); ++ unsigned sectors; ++ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ++ ret = inode_newsize_ok(&inode->v, end); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode & FALLOC_FL_ZERO_RANGE) { ++ ret = __bch2_truncate_page(inode, ++ offset >> PAGE_SHIFT, ++ offset, end); ++ ++ if (!ret && ++ offset >> PAGE_SHIFT != end >> PAGE_SHIFT) ++ ret = __bch2_truncate_page(inode, ++ end >> PAGE_SHIFT, ++ offset, end); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ truncate_pagecache_range(&inode->v, offset, end - 1); ++ } ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, block_start >> 9), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ end_pos = POS(inode->v.i_ino, block_end >> 9); ++ ++ while (bkey_cmp(iter->pos, end_pos) < 0) { ++ s64 i_sectors_delta = 0; ++ struct disk_reservation disk_res = { 0 }; ++ struct quota_res quota_res = { 0 }; ++ struct bkey_i_reservation reservation; ++ struct bkey_s_c k; ++ ++ bch2_trans_begin(&trans); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ if ((ret = bkey_err(k))) ++ goto bkey_err; ++ ++ /* already reserved */ ++ if (k.k->type == KEY_TYPE_reservation && ++ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { ++ bch2_btree_iter_next_slot(iter); ++ continue; ++ } ++ ++ if (bkey_extent_is_data(k.k) && ++ !(mode & FALLOC_FL_ZERO_RANGE)) { ++ bch2_btree_iter_next_slot(iter); ++ continue; ++ } ++ ++ bkey_reservation_init(&reservation.k_i); ++ reservation.k.type = 
KEY_TYPE_reservation; ++ reservation.k.p = k.k->p; ++ reservation.k.size = k.k->size; ++ ++ bch2_cut_front(iter->pos, &reservation.k_i); ++ bch2_cut_back(end_pos, &reservation.k_i); ++ ++ sectors = reservation.k.size; ++ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); ++ ++ if (!bkey_extent_is_allocation(k.k)) { ++ ret = bch2_quota_reservation_add(c, inode, ++ "a_res, ++ sectors, true); ++ if (unlikely(ret)) ++ goto bkey_err; ++ } ++ ++ if (reservation.v.nr_replicas < replicas || ++ bch2_bkey_sectors_compressed(k)) { ++ ret = bch2_disk_reservation_get(c, &disk_res, sectors, ++ replicas, 0); ++ if (unlikely(ret)) ++ goto bkey_err; ++ ++ reservation.v.nr_replicas = disk_res.nr_replicas; ++ } ++ ++ ret = bch2_extent_update(&trans, iter, &reservation.k_i, ++ &disk_res, &inode->ei_journal_seq, ++ 0, &i_sectors_delta); ++ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++bkey_err: ++ bch2_quota_reservation_put(c, inode, "a_res); ++ bch2_disk_reservation_put(c, &disk_res); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ goto err; ++ } ++ ++ /* ++ * Do we need to extend the file? ++ * ++ * If we zeroed up to the end of the file, we dropped whatever writes ++ * were going to write out the current i_size, so we have to extend ++ * manually even if FL_KEEP_SIZE was set: ++ */ ++ if (end >= inode->v.i_size && ++ (!(mode & FALLOC_FL_KEEP_SIZE) || ++ (mode & FALLOC_FL_ZERO_RANGE))) { ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ ++ do { ++ bch2_trans_begin(&trans); ++ inode_iter = bch2_inode_peek(&trans, &inode_u, ++ inode->v.i_ino, 0); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ } while (ret == -EINTR); ++ ++ bch2_trans_unlock(&trans); ++ ++ if (ret) ++ goto err; ++ ++ /* ++ * Sync existing appends before extending i_size, ++ * as in bch2_extend(): ++ */ ++ ret = filemap_write_and_wait_range(mapping, ++ inode_u.bi_size, S64_MAX); ++ if (ret) ++ goto err; ++ ++ if (mode & FALLOC_FL_KEEP_SIZE) ++ end = inode->v.i_size; ++ else ++ i_size_write(&inode->v, end); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, end, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ } ++err: ++ bch2_trans_exit(&trans); ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ return ret; ++} ++ ++long bch2_fallocate_dispatch(struct file *file, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ long ret; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return -EROFS; ++ ++ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ++ ret = bchfs_fallocate(inode, mode, offset, len); ++ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) ++ ret = bchfs_fpunch(inode, offset, len); ++ else if (mode == FALLOC_FL_INSERT_RANGE) ++ ret = bchfs_fcollapse_finsert(inode, offset, len, true); ++ else if (mode == FALLOC_FL_COLLAPSE_RANGE) ++ ret = bchfs_fcollapse_finsert(inode, offset, len, false); ++ else ++ ret = -EOPNOTSUPP; ++ ++ percpu_ref_put(&c->writes); ++ ++ return ret; ++} ++ ++static void mark_range_unallocated(struct bch_inode_info *inode, ++ loff_t start, loff_t end) ++{ ++ pgoff_t index = start >> PAGE_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SHIFT; ++ struct pagevec pvec; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ if (nr_pages == 0) ++ break; ++ ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = 
pvec.pages[i]; ++ struct bch_page_state *s; ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = 0; j < PAGE_SECTORS; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++} ++ ++loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, ++ struct file *file_dst, loff_t pos_dst, ++ loff_t len, unsigned remap_flags) ++{ ++ struct bch_inode_info *src = file_bch_inode(file_src); ++ struct bch_inode_info *dst = file_bch_inode(file_dst); ++ struct bch_fs *c = src->v.i_sb->s_fs_info; ++ s64 i_sectors_delta = 0; ++ u64 aligned_len; ++ loff_t ret = 0; ++ ++ if (!c->opts.reflink) ++ return -EOPNOTSUPP; ++ ++ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) ++ return -EINVAL; ++ ++ if (remap_flags & REMAP_FILE_DEDUP) ++ return -EOPNOTSUPP; ++ ++ if ((pos_src & (block_bytes(c) - 1)) || ++ (pos_dst & (block_bytes(c) - 1))) ++ return -EINVAL; ++ ++ if (src == dst && ++ abs(pos_src - pos_dst) < len) ++ return -EINVAL; ++ ++ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ file_update_time(file_dst); ++ ++ inode_dio_wait(&src->v); ++ inode_dio_wait(&dst->v); ++ ++ ret = generic_remap_file_range_prep(file_src, pos_src, ++ file_dst, pos_dst, ++ &len, remap_flags); ++ if (ret < 0 || len == 0) ++ goto err; ++ ++ aligned_len = round_up((u64) len, block_bytes(c)); ++ ++ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, ++ pos_dst, pos_dst + len - 1); ++ if (ret) ++ goto err; ++ ++ mark_range_unallocated(src, pos_src, pos_src + aligned_len); ++ ++ ret = bch2_remap_range(c, ++ POS(dst->v.i_ino, pos_dst >> 9), ++ POS(src->v.i_ino, pos_src >> 9), ++ aligned_len >> 9, ++ &dst->ei_journal_seq, ++ pos_dst + len, &i_sectors_delta); ++ if (ret < 0) ++ goto err; ++ ++ /* ++ * due to alignment, we might have remapped slightly more than requsted ++ */ ++ ret = min((u64) ret << 9, (u64) len); ++ ++ /* XXX get a quota reservation */ ++ i_sectors_acct(c, dst, NULL, i_sectors_delta); ++ ++ spin_lock(&dst->v.i_lock); ++ if (pos_dst + ret > dst->v.i_size) ++ i_size_write(&dst->v, pos_dst + ret); ++ spin_unlock(&dst->v.i_lock); ++err: ++ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ return ret; ++} ++ ++/* fseek: */ ++ ++static int page_data_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (s) ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state >= SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t bch2_seek_pagecache_data(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ struct page *page; ++ pgoff_t start_index = start_offset >> PAGE_SHIFT; ++ pgoff_t end_index = end_offset >> PAGE_SHIFT; ++ pgoff_t index = start_index; ++ loff_t ret; ++ int offset; ++ ++ while (index <= end_index) { ++ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { ++ lock_page(page); ++ ++ offset = page_data_offset(page, ++ page->index == start_index ++ ? 
start_offset & (PAGE_SIZE - 1) ++ : 0); ++ if (offset >= 0) { ++ ret = clamp(((loff_t) page->index << PAGE_SHIFT) + ++ offset, ++ start_offset, end_offset); ++ unlock_page(page); ++ put_page(page); ++ return ret; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ } else { ++ break; ++ } ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_data(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 isize, next_data = MAX_LFS_FILESIZE; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, offset >> 9), 0, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ break; ++ } else if (bkey_extent_is_data(k.k)) { ++ next_data = max(offset, bkey_start_offset(k.k) << 9); ++ break; ++ } else if (k.k->p.offset >> 9 > isize) ++ break; ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ if (next_data > offset) ++ next_data = bch2_seek_pagecache_data(&inode->v, ++ offset, next_data); ++ ++ if (next_data >= isize) ++ return -ENXIO; ++ ++ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); ++} ++ ++static int __page_hole_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (!s) ++ return 0; ++ ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state < SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) ++{ ++ pgoff_t index = offset >> PAGE_SHIFT; ++ struct page *page; ++ int pg_offset; ++ loff_t ret = -1; ++ ++ page = find_lock_entry(mapping, index); ++ if (!page || xa_is_value(page)) ++ return offset; ++ ++ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); ++ if (pg_offset >= 0) ++ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; ++ ++ unlock_page(page); ++ ++ return ret; ++} ++ ++static loff_t bch2_seek_pagecache_hole(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ loff_t offset = start_offset, hole; ++ ++ while (offset < end_offset) { ++ hole = page_hole_offset(mapping, offset); ++ if (hole >= 0 && hole <= end_offset) ++ return max(start_offset, hole); ++ ++ offset += PAGE_SIZE; ++ offset &= PAGE_MASK; ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_hole(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 isize, next_hole = MAX_LFS_FILESIZE; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, offset >> 9), ++ BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ offset, MAX_LFS_FILESIZE); ++ break; ++ } else if (!bkey_extent_is_data(k.k)) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ max(offset, bkey_start_offset(k.k) << 9), ++ k.k->p.offset << 9); ++ ++ if (next_hole < k.k->p.offset << 9) ++ break; ++ } else { ++ offset = max(offset, bkey_start_offset(k.k) << 9); 
++ } ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ if (next_hole > isize) ++ next_hole = isize; ++ ++ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); ++} ++ ++loff_t bch2_llseek(struct file *file, loff_t offset, int whence) ++{ ++ switch (whence) { ++ case SEEK_SET: ++ case SEEK_CUR: ++ case SEEK_END: ++ return generic_file_llseek(file, offset, whence); ++ case SEEK_DATA: ++ return bch2_seek_data(file, offset); ++ case SEEK_HOLE: ++ return bch2_seek_hole(file, offset); ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_fs_fsio_exit(struct bch_fs *c) ++{ ++ bioset_exit(&c->dio_write_bioset); ++ bioset_exit(&c->dio_read_bioset); ++ bioset_exit(&c->writepage_bioset); ++} ++ ++int bch2_fs_fsio_init(struct bch_fs *c) ++{ ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bioset_init(&c->writepage_bioset, ++ 4, offsetof(struct bch_writepage_io, op.wbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_read_bioset, ++ 4, offsetof(struct dio_read, rbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_write_bioset, ++ 4, offsetof(struct dio_write, op.wbio.bio), ++ BIOSET_NEED_BVECS)) ++ ret = -ENOMEM; ++ ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h +new file mode 100644 +index 000000000000..7063556d289b +--- /dev/null ++++ b/fs/bcachefs/fs-io.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_H ++#define _BCACHEFS_FS_IO_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++#include "buckets.h" ++#include "io_types.h" ++ ++#include ++ ++struct quota_res; ++ ++int __must_check bch2_write_inode_size(struct bch_fs *, ++ struct bch_inode_info *, ++ loff_t, unsigned); ++ ++int bch2_writepage(struct page *, struct writeback_control *); ++int bch2_readpage(struct file *, struct page *); ++ ++int bch2_writepages(struct address_space *, struct writeback_control *); ++int bch2_readpages(struct file *, struct address_space *, ++ struct list_head *, unsigned); ++ ++int bch2_write_begin(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page **, void **); ++int bch2_write_end(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page *, void *); ++ ++ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); ++ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); ++ ++int bch2_fsync(struct file *, loff_t, loff_t, int); ++ ++int bch2_truncate(struct bch_inode_info *, struct iattr *); ++long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); ++ ++loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, ++ loff_t, loff_t, unsigned); ++ ++loff_t bch2_llseek(struct file *, loff_t, int); ++ ++vm_fault_t bch2_page_fault(struct vm_fault *); ++vm_fault_t bch2_page_mkwrite(struct vm_fault *); ++void bch2_invalidatepage(struct page *, unsigned int, unsigned int); ++int bch2_releasepage(struct page *, gfp_t); ++int bch2_migrate_page(struct address_space *, struct page *, ++ struct page *, enum migrate_mode); ++ ++void bch2_fs_fsio_exit(struct bch_fs *); ++int bch2_fs_fsio_init(struct bch_fs *); ++#else ++static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} ++static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } ++#endif ++ ++#endif /* _BCACHEFS_FS_IO_H */ +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +new file mode 100644 +index 000000000000..0873d2f0928c +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.c +@@ -0,0 +1,312 @@ ++// 
SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-ioctl.h" ++#include "quota.h" ++ ++#include ++#include ++ ++#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) ++ ++struct flags_set { ++ unsigned mask; ++ unsigned flags; ++ ++ unsigned projid; ++}; ++ ++static int bch2_inode_flags_set(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ /* ++ * We're relying on btree locking here for exclusion with other ioctl ++ * calls - use the flags in the btree (@bi), not inode->i_flags: ++ */ ++ struct flags_set *s = p; ++ unsigned newflags = s->flags; ++ unsigned oldflags = bi->bi_flags & s->mask; ++ ++ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && ++ !capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ ++ if (!S_ISREG(bi->bi_mode) && ++ !S_ISDIR(bi->bi_mode) && ++ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) ++ return -EINVAL; ++ ++ bi->bi_flags &= ~s->mask; ++ bi->bi_flags |= newflags; ++ ++ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); ++ return 0; ++} ++ ++static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) ++{ ++ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); ++ ++ return put_user(flags, arg); ++} ++ ++static int bch2_ioc_setflags(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ void __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; ++ unsigned uflags; ++ int ret; ++ ++ if (get_user(uflags, (int __user *) arg)) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_uflags, uflags); ++ if (uflags) ++ return -EOPNOTSUPP; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if (!inode_owner_or_capable(&inode->v)) { ++ ret = -EACCES; ++ goto setflags_out; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ++ ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++setflags_out: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct fsxattr fa = { 0 }; ++ ++ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); ++ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; ++ ++ return copy_to_user(arg, &fa, sizeof(fa)); ++} ++ ++static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct flags_set *s = p; ++ ++ if (s->projid != bi->bi_project) { ++ bi->bi_fields_set |= 1U << Inode_opt_project; ++ bi->bi_project = s->projid; ++ } ++ ++ return bch2_inode_flags_set(inode, bi, p); ++} ++ ++static int bch2_ioc_fssetxattr(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; ++ struct fsxattr fa; ++ int ret; ++ ++ if (copy_from_user(&fa, arg, sizeof(fa))) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); ++ if (fa.fsx_xflags) ++ return -EOPNOTSUPP; ++ ++ if (fa.fsx_projid >= U32_MAX) ++ return -EINVAL; ++ ++ /* ++ * inode fields accessible via the xattr interface are stored with a +1 ++ * bias, so that 0 means unset: ++ */ ++ s.projid = fa.fsx_projid + 
1; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if (!inode_owner_or_capable(&inode->v)) { ++ ret = -EACCES; ++ goto err; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_set_projid(c, inode, fa.fsx_projid); ++ if (ret) ++ goto err_unlock; ++ ++ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ++ ATTR_CTIME); ++err_unlock: ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_inode_info *dir = p; ++ ++ return !bch2_reinherit_attrs(bi, &dir->ei_inode); ++} ++ ++static int bch2_ioc_reinherit_attrs(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *src, ++ const char __user *name) ++{ ++ struct bch_inode_info *dst; ++ struct inode *vinode = NULL; ++ char *kname = NULL; ++ struct qstr qstr; ++ int ret = 0; ++ u64 inum; ++ ++ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); ++ if (!kname) ++ return -ENOMEM; ++ ++ ret = strncpy_from_user(kname, name, BCH_NAME_MAX); ++ if (unlikely(ret < 0)) ++ goto err1; ++ ++ qstr.len = ret; ++ qstr.name = kname; ++ ++ ret = -ENOENT; ++ inum = bch2_dirent_lookup(c, src->v.i_ino, ++ &src->ei_str_hash, ++ &qstr); ++ if (!inum) ++ goto err1; ++ ++ vinode = bch2_vfs_inode_get(c, inum); ++ ret = PTR_ERR_OR_ZERO(vinode); ++ if (ret) ++ goto err1; ++ ++ dst = to_bch_ei(vinode); ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ goto err2; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ if (inode_attr_changing(src, dst, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst, ++ src->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err3; ++ } ++ ++ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); ++err3: ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ /* return true if we did work */ ++ if (ret >= 0) ++ ret = !ret; ++ ++ mnt_drop_write_file(file); ++err2: ++ iput(vinode); ++err1: ++ kfree(kname); ++ ++ return ret; ++} ++ ++long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct super_block *sb = inode->v.i_sb; ++ struct bch_fs *c = sb->s_fs_info; ++ ++ switch (cmd) { ++ case FS_IOC_GETFLAGS: ++ return bch2_ioc_getflags(inode, (int __user *) arg); ++ ++ case FS_IOC_SETFLAGS: ++ return bch2_ioc_setflags(c, file, inode, (int __user *) arg); ++ ++ case FS_IOC_FSGETXATTR: ++ return bch2_ioc_fsgetxattr(inode, (void __user *) arg); ++ case FS_IOC_FSSETXATTR: ++ return bch2_ioc_fssetxattr(c, file, inode, ++ (void __user *) arg); ++ ++ case BCHFS_IOC_REINHERIT_ATTRS: ++ return bch2_ioc_reinherit_attrs(c, file, inode, ++ (void __user *) arg); ++ ++ case FS_IOC_GETVERSION: ++ return -ENOTTY; ++ case FS_IOC_SETVERSION: ++ return -ENOTTY; ++ ++ case FS_IOC_GOINGDOWN: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ down_write(&sb->s_umount); ++ sb->s_flags |= SB_RDONLY; ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only due to ioctl"); ++ up_write(&sb->s_umount); ++ return 0; ++ ++ default: ++ return bch2_fs_ioctl(c, cmd, (void __user *) arg); ++ } ++} ++ ++#ifdef CONFIG_COMPAT ++long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ /* These are just misnamed, they actually get/put from/to user an int */ ++ switch (cmd) { ++ case FS_IOC_GETFLAGS: ++ cmd = FS_IOC_GETFLAGS; 
++ break; ++ case FS_IOC32_SETFLAGS: ++ cmd = FS_IOC_SETFLAGS; ++ break; ++ default: ++ return -ENOIOCTLCMD; ++ } ++ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); ++} ++#endif ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h +new file mode 100644 +index 000000000000..f201980ef2c3 +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.h +@@ -0,0 +1,81 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IOCTL_H ++#define _BCACHEFS_FS_IOCTL_H ++ ++/* Inode flags: */ ++ ++/* bcachefs inode flags -> vfs inode flags: */ ++static const unsigned bch_flags_to_vfs[] = { ++ [__BCH_INODE_SYNC] = S_SYNC, ++ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, ++ [__BCH_INODE_APPEND] = S_APPEND, ++ [__BCH_INODE_NOATIME] = S_NOATIME, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ ++static const unsigned bch_flags_to_uflags[] = { ++ [__BCH_INODE_SYNC] = FS_SYNC_FL, ++ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, ++ [__BCH_INODE_APPEND] = FS_APPEND_FL, ++ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, ++ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ ++static const unsigned bch_flags_to_xflags[] = { ++ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, ++ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, ++ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, ++ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, ++ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, ++ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; ++}; ++ ++#define set_flags(_map, _in, _out) \ ++do { \ ++ unsigned _i; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & (1 << _i)) \ ++ (_out) |= _map[_i]; \ ++ else \ ++ (_out) &= ~_map[_i]; \ ++} while (0) ++ ++#define map_flags(_map, _in) \ ++({ \ ++ unsigned _out = 0; \ ++ \ ++ set_flags(_map, _in, _out); \ ++ _out; \ ++}) ++ ++#define map_flags_rev(_map, _in) \ ++({ \ ++ unsigned _i, _out = 0; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & _map[_i]) { \ ++ (_out) |= 1 << _i; \ ++ (_in) &= ~_map[_i]; \ ++ } \ ++ (_out); \ ++}) ++ ++#define map_defined(_map) \ ++({ \ ++ unsigned _in = ~0; \ ++ \ ++ map_flags_rev(_map, _in); \ ++}) ++ ++/* Set VFS inode flags from bcachefs inode: */ ++static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) ++{ ++ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); ++} ++ ++long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); ++long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); ++ ++#endif /* _BCACHEFS_FS_IOCTL_H */ +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +new file mode 100644 +index 000000000000..e504e6b19abe +--- /dev/null ++++ b/fs/bcachefs/fs.c +@@ -0,0 +1,1628 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "extents.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-io.h" ++#include "fs-ioctl.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "quota.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct kmem_cache *bch2_inode_cache; ++ ++static void bch2_vfs_inode_init(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *); ++ ++static void journal_seq_copy(struct 
bch_inode_info *dst, ++ u64 journal_seq) ++{ ++ u64 old, v = READ_ONCE(dst->ei_journal_seq); ++ ++ do { ++ old = v; ++ ++ if (old >= journal_seq) ++ break; ++ } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); ++} ++ ++static void __pagecache_lock_put(struct pagecache_lock *lock, long i) ++{ ++ BUG_ON(atomic_long_read(&lock->v) == 0); ++ ++ if (atomic_long_sub_return_release(i, &lock->v) == 0) ++ wake_up_all(&lock->wait); ++} ++ ++static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) ++{ ++ long v = atomic_long_read(&lock->v), old; ++ ++ do { ++ old = v; ++ ++ if (i > 0 ? v < 0 : v > 0) ++ return false; ++ } while ((v = atomic_long_cmpxchg_acquire(&lock->v, ++ old, old + i)) != old); ++ return true; ++} ++ ++static void __pagecache_lock_get(struct pagecache_lock *lock, long i) ++{ ++ wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, 1); ++} ++ ++void bch2_pagecache_add_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, 1); ++} ++ ++void bch2_pagecache_block_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, -1); ++} ++ ++void bch2_pagecache_block_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, -1); ++} ++ ++void bch2_inode_update_after_write(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ unsigned fields) ++{ ++ set_nlink(&inode->v, bch2_inode_nlink_get(bi)); ++ i_uid_write(&inode->v, bi->bi_uid); ++ i_gid_write(&inode->v, bi->bi_gid); ++ inode->v.i_mode = bi->bi_mode; ++ ++ if (fields & ATTR_ATIME) ++ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); ++ if (fields & ATTR_MTIME) ++ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); ++ if (fields & ATTR_CTIME) ++ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); ++ ++ inode->ei_inode = *bi; ++ ++ bch2_inode_flags_to_vfs(inode); ++} ++ ++int __must_check bch2_write_inode(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ inode_set_fn set, ++ void *p, unsigned fields) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_inode_unpacked inode_u; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter) ?: ++ (set ? set(inode, &inode_u, p) : 0) ?: ++ bch2_inode_write(&trans, iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++ ++ /* ++ * the btree node lock protects inode->ei_inode, not ei_update_lock; ++ * this is important for inode updates via bchfs_write_index_update ++ */ ++ if (!ret) ++ bch2_inode_update_after_write(c, inode, &inode_u, fields); ++ ++ bch2_trans_iter_put(&trans, iter); ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ ++int bch2_fs_quota_transfer(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_qid new_qid, ++ unsigned qtypes, ++ enum quota_acct_mode mode) ++{ ++ unsigned i; ++ int ret; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ for (i = 0; i < QTYP_NR; i++) ++ if (new_qid.q[i] == inode->ei_qid.q[i]) ++ qtypes &= ~(1U << i); ++ ++ if (!qtypes) ++ return 0; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ++ ret = bch2_quota_transfer(c, qtypes, new_qid, ++ inode->ei_qid, ++ inode->v.i_blocks + ++ inode->ei_quota_reserved, ++ mode); ++ if (!ret) ++ for (i = 0; i < QTYP_NR; i++) ++ if (qtypes & (1 << i)) ++ inode->ei_qid.q[i] = new_qid.q[i]; ++ ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) ++{ ++ struct bch_inode_unpacked inode_u; ++ struct bch_inode_info *inode; ++ int ret; ++ ++ inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); ++ if (unlikely(!inode)) ++ return ERR_PTR(-ENOMEM); ++ if (!(inode->v.i_state & I_NEW)) ++ return &inode->v; ++ ++ ret = bch2_inode_find_by_inum(c, inum, &inode_u); ++ if (ret) { ++ iget_failed(&inode->v); ++ return ERR_PTR(ret); ++ } ++ ++ bch2_vfs_inode_init(c, inode, &inode_u); ++ ++ inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); ++ ++ unlock_new_inode(&inode->v); ++ ++ return &inode->v; ++} ++ ++static struct bch_inode_info * ++__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, ++ umode_t mode, dev_t rdev, bool tmpfile) ++{ ++ struct bch_fs *c = dir->v.i_sb->s_fs_info; ++ struct user_namespace *ns = dir->v.i_sb->s_user_ns; ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u; ++ struct bch_inode_info *inode, *old; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *default_acl = NULL, *acl = NULL; ++ u64 journal_seq = 0; ++ int ret; ++ ++ /* ++ * preallocate acls + vfs inode before btree transaction, so that ++ * nothing can fail after the transaction succeeds: ++ */ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); ++ if (ret) ++ return ERR_PTR(ret); ++#endif ++ inode = to_bch_ei(new_inode(c->vfs_sb)); ++ if (unlikely(!inode)) { ++ inode = ERR_PTR(-ENOMEM); ++ goto err; ++ } ++ ++ bch2_inode_init_early(c, &inode_u); ++ ++ if (!tmpfile) ++ mutex_lock(&dir->ei_update_lock); ++ ++ bch2_trans_init(&trans, c, 8, 1024); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, ++ !tmpfile ? 
&dentry->d_name : NULL, ++ from_kuid(ns, current_fsuid()), ++ from_kgid(ns, current_fsgid()), ++ mode, rdev, ++ default_acl, acl) ?: ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (unlikely(ret)) ++ goto err_before_quota; ++ ++ ret = bch2_trans_commit(&trans, NULL, &journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++ if (unlikely(ret)) { ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++err_before_quota: ++ if (ret == -EINTR) ++ goto retry; ++ goto err_trans; ++ } ++ ++ if (!tmpfile) { ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(dir, journal_seq); ++ mutex_unlock(&dir->ei_update_lock); ++ } ++ ++ bch2_vfs_inode_init(c, inode, &inode_u); ++ journal_seq_copy(inode, journal_seq); ++ ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); ++ ++ /* ++ * we must insert the new inode into the inode cache before calling ++ * bch2_trans_exit() and dropping locks, else we could race with another ++ * thread pulling the inode in and modifying it: ++ */ ++ ++ old = to_bch_ei(insert_inode_locked2(&inode->v)); ++ if (unlikely(old)) { ++ /* ++ * We raced, another process pulled the new inode into cache ++ * before us: ++ */ ++ journal_seq_copy(old, journal_seq); ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ ++ inode = old; ++ } else { ++ /* ++ * we really don't want insert_inode_locked2() to be setting ++ * I_NEW... ++ */ ++ unlock_new_inode(&inode->v); ++ } ++ ++ bch2_trans_exit(&trans); ++err: ++ posix_acl_release(default_acl); ++ posix_acl_release(acl); ++ return inode; ++err_trans: ++ if (!tmpfile) ++ mutex_unlock(&dir->ei_update_lock); ++ ++ bch2_trans_exit(&trans); ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ inode = ERR_PTR(ret); ++ goto err; ++} ++ ++/* methods */ ++ ++static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, ++ unsigned int flags) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct inode *vinode = NULL; ++ u64 inum; ++ ++ inum = bch2_dirent_lookup(c, dir->v.i_ino, ++ &dir->ei_str_hash, ++ &dentry->d_name); ++ ++ if (inum) ++ vinode = bch2_vfs_inode_get(c, inum); ++ ++ return d_splice_alias(vinode, dentry); ++} ++ ++static int bch2_mknod(struct inode *vdir, struct dentry *dentry, ++ umode_t mode, dev_t rdev) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_create(struct inode *vdir, struct dentry *dentry, ++ umode_t mode, bool excl) ++{ ++ return bch2_mknod(vdir, dentry, mode|S_IFREG, 0); ++} ++ ++static int __bch2_link(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_info *dir, ++ struct dentry *dentry) ++{ ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u, inode_u; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ret = bch2_link_trans(&trans, ++ dir->v.i_ino, ++ inode->v.i_ino, &dir_u, &inode_u, ++ &dentry->d_name) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++ } while (ret == -EINTR); ++ ++ if (likely(!ret)) { ++ BUG_ON(inode_u.bi_inum != inode->v.i_ino); ++ ++ journal_seq_copy(inode, dir->ei_journal_seq); ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ 
ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); ++ } ++ ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ return ret; ++} ++ ++static int bch2_link(struct dentry *old_dentry, struct inode *vdir, ++ struct dentry *dentry) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ return ret; ++ ++ ihold(&inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct bch_inode_unpacked dir_u, inode_u; ++ struct btree_trans trans; ++ int ret; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_unlink_trans(&trans, ++ dir->v.i_ino, &dir_u, ++ &inode_u, &dentry->d_name) ?: ++ bch2_trans_commit(&trans, NULL, ++ &dir->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++ } while (ret == -EINTR); ++ ++ if (likely(!ret)) { ++ BUG_ON(inode_u.bi_inum != inode->v.i_ino); ++ ++ journal_seq_copy(inode, dir->ei_journal_seq); ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(c, inode, &inode_u, ++ ATTR_MTIME); ++ } ++ ++ bch2_trans_exit(&trans); ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ ++ return ret; ++} ++ ++static int bch2_symlink(struct inode *vdir, struct dentry *dentry, ++ const char *symname) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir), *inode; ++ int ret; ++ ++ inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); ++ if (unlikely(IS_ERR(inode))) ++ return PTR_ERR(inode); ++ ++ inode_lock(&inode->v); ++ ret = page_symlink(&inode->v, symname, strlen(symname) + 1); ++ inode_unlock(&inode->v); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); ++ if (unlikely(ret)) ++ goto err; ++ ++ journal_seq_copy(dir, inode->ei_journal_seq); ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ goto err; ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++err: ++ iput(&inode->v); ++ return ret; ++} ++ ++static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0); ++} ++ ++static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, ++ struct inode *dst_vdir, struct dentry *dst_dentry, ++ unsigned flags) ++{ ++ struct bch_fs *c = src_vdir->i_sb->s_fs_info; ++ struct bch_inode_info *src_dir = to_bch_ei(src_vdir); ++ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); ++ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); ++ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); ++ struct bch_inode_unpacked dst_dir_u, src_dir_u; ++ struct bch_inode_unpacked src_inode_u, dst_inode_u; ++ struct btree_trans trans; ++ enum bch_rename_mode mode = flags & RENAME_EXCHANGE ++ ? BCH_RENAME_EXCHANGE ++ : dst_dentry->d_inode ++ ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; ++ u64 journal_seq = 0; ++ int ret; ++ ++ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) ++ return -EINVAL; ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ ret = filemap_write_and_wait_range(src_inode->v.i_mapping, ++ 0, LLONG_MAX); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_trans_init(&trans, c, 8, 2048); ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, src_inode, ++ dst_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst_inode, ++ src_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++retry: ++ bch2_trans_begin(&trans); ++ ret = bch2_rename_trans(&trans, ++ src_dir->v.i_ino, &src_dir_u, ++ dst_dir->v.i_ino, &dst_dir_u, ++ &src_inode_u, ++ &dst_inode_u, ++ &src_dentry->d_name, ++ &dst_dentry->d_name, ++ mode) ?: ++ bch2_trans_commit(&trans, NULL, ++ &journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err; ++ ++ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); ++ BUG_ON(dst_inode && ++ dst_inode->v.i_ino != dst_inode_u.bi_inum); ++ ++ bch2_inode_update_after_write(c, src_dir, &src_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(src_dir, journal_seq); ++ ++ if (src_dir != dst_dir) { ++ bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(dst_dir, journal_seq); ++ } ++ ++ bch2_inode_update_after_write(c, src_inode, &src_inode_u, ++ ATTR_CTIME); ++ journal_seq_copy(src_inode, journal_seq); ++ ++ if (dst_inode) { ++ bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ++ ATTR_CTIME); ++ journal_seq_copy(dst_inode, journal_seq); ++ } ++err: ++ bch2_trans_exit(&trans); ++ ++ bch2_fs_quota_transfer(c, src_inode, ++ bch_qid(&src_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ if (dst_inode) ++ bch2_fs_quota_transfer(c, dst_inode, ++ bch_qid(&dst_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ return ret; ++} ++ ++void bch2_setattr_copy(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ unsigned int ia_valid = attr->ia_valid; ++ ++ if (ia_valid & ATTR_UID) ++ bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid); ++ if (ia_valid & ATTR_GID) ++ bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid); ++ ++ if (ia_valid & ATTR_ATIME) ++ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); ++ if (ia_valid & ATTR_MTIME) ++ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); ++ if (ia_valid & ATTR_CTIME) ++ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); ++ ++ if (ia_valid & ATTR_MODE) { ++ umode_t mode = attr->ia_mode; ++ kgid_t gid = ia_valid & ATTR_GID ++ ? 
attr->ia_gid ++ : inode->v.i_gid; ++ ++ if (!in_group_p(gid) && ++ !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) ++ mode &= ~S_ISGID; ++ bi->bi_mode = mode; ++ } ++} ++ ++static int bch2_setattr_nonsize(struct bch_inode_info *inode, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_qid qid; ++ struct btree_trans trans; ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl = NULL; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ++ qid = inode->ei_qid; ++ ++ if (attr->ia_valid & ATTR_UID) ++ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); ++ ++ if (attr->ia_valid & ATTR_GID) ++ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); ++ ++ ret = bch2_fs_quota_transfer(c, inode, qid, ~0, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ kfree(acl); ++ acl = NULL; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto btree_err; ++ ++ bch2_setattr_copy(inode, &inode_u, attr); ++ ++ if (attr->ia_valid & ATTR_MODE) { ++ ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); ++ if (ret) ++ goto btree_err; ++ } ++ ++ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++btree_err: ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err_trans; ++ ++ bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); ++ ++ if (acl) ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++err_trans: ++ bch2_trans_exit(&trans); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static int bch2_getattr(const struct path *path, struct kstat *stat, ++ u32 request_mask, unsigned query_flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ stat->dev = inode->v.i_sb->s_dev; ++ stat->ino = inode->v.i_ino; ++ stat->mode = inode->v.i_mode; ++ stat->nlink = inode->v.i_nlink; ++ stat->uid = inode->v.i_uid; ++ stat->gid = inode->v.i_gid; ++ stat->rdev = inode->v.i_rdev; ++ stat->size = i_size_read(&inode->v); ++ stat->atime = inode->v.i_atime; ++ stat->mtime = inode->v.i_mtime; ++ stat->ctime = inode->v.i_ctime; ++ stat->blksize = block_bytes(c); ++ stat->blocks = inode->v.i_blocks; ++ ++ if (request_mask & STATX_BTIME) { ++ stat->result_mask |= STATX_BTIME; ++ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); ++ } ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) ++ stat->attributes |= STATX_ATTR_IMMUTABLE; ++ stat->attributes_mask |= STATX_ATTR_IMMUTABLE; ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) ++ stat->attributes |= STATX_ATTR_APPEND; ++ stat->attributes_mask |= STATX_ATTR_APPEND; ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) ++ stat->attributes |= STATX_ATTR_NODUMP; ++ stat->attributes_mask |= STATX_ATTR_NODUMP; ++ ++ return 0; ++} ++ ++static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) ++{ ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = setattr_prepare(dentry, iattr); ++ if (ret) ++ return ret; ++ ++ return iattr->ia_valid & ATTR_SIZE ++ ? 
bch2_truncate(inode, iattr) ++ : bch2_setattr_nonsize(inode, iattr); ++} ++ ++static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_mark_tmpfile(dentry, &inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_fill_extent(struct bch_fs *c, ++ struct fiemap_extent_info *info, ++ struct bkey_s_c k, unsigned flags) ++{ ++ if (bkey_extent_is_data(k.k)) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int ret; ++ ++ if (k.k->type == KEY_TYPE_reflink_v) ++ flags |= FIEMAP_EXTENT_SHARED; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int flags2 = 0; ++ u64 offset = p.ptr.offset; ++ ++ if (p.crc.compression_type) ++ flags2 |= FIEMAP_EXTENT_ENCODED; ++ else ++ offset += p.crc.offset; ++ ++ if ((offset & (c->opts.block_size - 1)) || ++ (k.k->size & (c->opts.block_size - 1))) ++ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ++ ++ ret = fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ offset << 9, ++ k.k->size << 9, flags|flags2); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++ } else if (k.k->type == KEY_TYPE_reservation) { ++ return fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ 0, k.k->size << 9, ++ flags| ++ FIEMAP_EXTENT_DELALLOC| ++ FIEMAP_EXTENT_UNWRITTEN); ++ } else { ++ BUG(); ++ } ++} ++ ++static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, ++ u64 start, u64 len) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *ei = to_bch_ei(vinode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_on_stack cur, prev; ++ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); ++ unsigned offset_into_extent, sectors; ++ bool have_extent = false; ++ int ret = 0; ++ ++ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); ++ if (ret) ++ return ret; ++ ++ if (start + len < start) ++ return -EINVAL; ++ ++ bkey_on_stack_init(&cur); ++ bkey_on_stack_init(&prev); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(ei->v.i_ino, start >> 9), 0); ++retry: ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(iter->pos, end) < 0) { ++ if (!bkey_extent_is_data(k.k) && ++ k.k->type != KEY_TYPE_reservation) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ bkey_on_stack_realloc(&cur, c, k.k->u64s); ++ bkey_on_stack_realloc(&prev, c, k.k->u64s); ++ bkey_reassemble(cur.k, k); ++ k = bkey_i_to_s_c(cur.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &cur); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ if (offset_into_extent) ++ bch2_cut_front(POS(k.k->p.inode, ++ bkey_start_offset(k.k) + ++ offset_into_extent), ++ cur.k); ++ bch2_key_resize(&cur.k->k, sectors); ++ cur.k->k.p = iter->pos; ++ cur.k->k.p.offset += cur.k->k.size; ++ ++ if (have_extent) { ++ ret = bch2_fill_extent(c, info, ++ bkey_i_to_s_c(prev.k), 0); ++ if (ret) ++ break; ++ } ++ ++ bkey_copy(prev.k, cur.k); ++ have_extent = true; ++ ++ if (k.k->type == KEY_TYPE_reflink_v) ++ bch2_btree_iter_set_pos(iter, k.k->p); ++ else ++ bch2_btree_iter_next(iter); ++ } ++ 
++ if (ret == -EINTR) ++ goto retry; ++ ++ if (!ret && have_extent) ++ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), ++ FIEMAP_EXTENT_LAST); ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&cur, c); ++ bkey_on_stack_exit(&prev, c); ++ return ret < 0 ? ret : 0; ++} ++ ++static const struct vm_operations_struct bch_vm_ops = { ++ .fault = bch2_page_fault, ++ .map_pages = filemap_map_pages, ++ .page_mkwrite = bch2_page_mkwrite, ++}; ++ ++static int bch2_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ file_accessed(file); ++ ++ vma->vm_ops = &bch_vm_ops; ++ return 0; ++} ++ ++/* Directories: */ ++ ++static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) ++{ ++ return generic_file_llseek_size(file, offset, whence, ++ S64_MAX, S64_MAX); ++} ++ ++static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ if (!dir_emit_dots(file, ctx)) ++ return 0; ++ ++ return bch2_readdir(c, inode->v.i_ino, ctx); ++} ++ ++static const struct file_operations bch_file_operations = { ++ .llseek = bch2_llseek, ++ .read_iter = bch2_read_iter, ++ .write_iter = bch2_write_iter, ++ .mmap = bch2_mmap, ++ .open = generic_file_open, ++ .fsync = bch2_fsync, ++ .splice_read = generic_file_splice_read, ++ /* ++ * Broken, on v5.3: ++ .splice_write = iter_file_splice_write, ++ */ ++ .fallocate = bch2_fallocate_dispatch, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++ .remap_file_range = bch2_remap_file_range, ++}; ++ ++static const struct inode_operations bch_file_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .fiemap = bch2_fiemap, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_dir_inode_operations = { ++ .lookup = bch2_lookup, ++ .create = bch2_create, ++ .link = bch2_link, ++ .unlink = bch2_unlink, ++ .symlink = bch2_symlink, ++ .mkdir = bch2_mkdir, ++ .rmdir = bch2_unlink, ++ .mknod = bch2_mknod, ++ .rename = bch2_rename2, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .tmpfile = bch2_tmpfile, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct file_operations bch_dir_file_operations = { ++ .llseek = bch2_dir_llseek, ++ .read = generic_read_dir, ++ .iterate_shared = bch2_vfs_readdir, ++ .fsync = bch2_fsync, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++}; ++ ++static const struct inode_operations bch_symlink_inode_operations = { ++ .get_link = page_get_link, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_special_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct address_space_operations bch_address_space_operations = { ++ .writepage = bch2_writepage, ++ .readpage = bch2_readpage, ++ .writepages = bch2_writepages, ++ .readpages = bch2_readpages, 
++ .set_page_dirty = __set_page_dirty_nobuffers, ++ .write_begin = bch2_write_begin, ++ .write_end = bch2_write_end, ++ .invalidatepage = bch2_invalidatepage, ++ .releasepage = bch2_releasepage, ++ .direct_IO = noop_direct_IO, ++#ifdef CONFIG_MIGRATION ++ .migratepage = bch2_migrate_page, ++#endif ++ .error_remove_page = generic_error_remove_page, ++}; ++ ++static struct inode *bch2_nfs_get_inode(struct super_block *sb, ++ u64 ino, u32 generation) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct inode *vinode; ++ ++ if (ino < BCACHEFS_ROOT_INO) ++ return ERR_PTR(-ESTALE); ++ ++ vinode = bch2_vfs_inode_get(c, ino); ++ if (IS_ERR(vinode)) ++ return ERR_CAST(vinode); ++ if (generation && vinode->i_generation != generation) { ++ /* we didn't find the right inode.. */ ++ iput(vinode); ++ return ERR_PTR(-ESTALE); ++ } ++ return vinode; ++} ++ ++static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ++ bch2_nfs_get_inode); ++} ++ ++static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ return generic_fh_to_parent(sb, fid, fh_len, fh_type, ++ bch2_nfs_get_inode); ++} ++ ++static const struct export_operations bch_export_ops = { ++ .fh_to_dentry = bch2_fh_to_dentry, ++ .fh_to_parent = bch2_fh_to_parent, ++ //.get_parent = bch2_get_parent, ++}; ++ ++static void bch2_vfs_inode_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi) ++{ ++ bch2_inode_update_after_write(c, inode, bi, ~0); ++ ++ inode->v.i_blocks = bi->bi_sectors; ++ inode->v.i_ino = bi->bi_inum; ++ inode->v.i_rdev = bi->bi_dev; ++ inode->v.i_generation = bi->bi_generation; ++ inode->v.i_size = bi->bi_size; ++ ++ inode->ei_journal_seq = 0; ++ inode->ei_quota_reserved = 0; ++ inode->ei_str_hash = bch2_hash_info_init(c, bi); ++ inode->ei_qid = bch_qid(bi); ++ ++ inode->v.i_mapping->a_ops = &bch_address_space_operations; ++ ++ switch (inode->v.i_mode & S_IFMT) { ++ case S_IFREG: ++ inode->v.i_op = &bch_file_inode_operations; ++ inode->v.i_fop = &bch_file_operations; ++ break; ++ case S_IFDIR: ++ inode->v.i_op = &bch_dir_inode_operations; ++ inode->v.i_fop = &bch_dir_file_operations; ++ break; ++ case S_IFLNK: ++ inode_nohighmem(&inode->v); ++ inode->v.i_op = &bch_symlink_inode_operations; ++ break; ++ default: ++ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); ++ inode->v.i_op = &bch_special_inode_operations; ++ break; ++ } ++} ++ ++static struct inode *bch2_alloc_inode(struct super_block *sb) ++{ ++ struct bch_inode_info *inode; ++ ++ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); ++ if (!inode) ++ return NULL; ++ ++ inode_init_once(&inode->v); ++ mutex_init(&inode->ei_update_lock); ++ pagecache_lock_init(&inode->ei_pagecache_lock); ++ mutex_init(&inode->ei_quota_lock); ++ inode->ei_journal_seq = 0; ++ ++ return &inode->v; ++} ++ ++static void bch2_i_callback(struct rcu_head *head) ++{ ++ struct inode *vinode = container_of(head, struct inode, i_rcu); ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ kmem_cache_free(bch2_inode_cache, inode); ++} ++ ++static void bch2_destroy_inode(struct inode *vinode) ++{ ++ call_rcu(&vinode->i_rcu, bch2_i_callback); ++} ++ ++static int inode_update_times_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); ++ bi->bi_mtime 
= timespec_to_bch2_time(c, inode->v.i_mtime); ++ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); ++ ++ return 0; ++} ++ ++static int bch2_vfs_write_inode(struct inode *vinode, ++ struct writeback_control *wbc) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static void bch2_evict_inode(struct inode *vinode) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ truncate_inode_pages_final(&inode->v.i_data); ++ ++ clear_inode(&inode->v); ++ ++ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); ++ ++ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), ++ KEY_TYPE_QUOTA_WARN); ++ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++ bch2_inode_rm(c, inode->v.i_ino); ++ } ++} ++ ++static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ struct super_block *sb = dentry->d_sb; ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); ++ unsigned shift = sb->s_blocksize_bits - 9; ++ u64 fsid; ++ ++ buf->f_type = BCACHEFS_STATFS_MAGIC; ++ buf->f_bsize = sb->s_blocksize; ++ buf->f_blocks = usage.capacity >> shift; ++ buf->f_bfree = (usage.capacity - usage.used) >> shift; ++ buf->f_bavail = buf->f_bfree; ++ buf->f_files = 0; ++ buf->f_ffree = 0; ++ ++ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ ++ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); ++ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; ++ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; ++ buf->f_namelen = BCH_NAME_MAX; ++ ++ return 0; ++} ++ ++static int bch2_sync_fs(struct super_block *sb, int wait) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (c->opts.journal_flush_disabled) ++ return 0; ++ ++ if (!wait) { ++ bch2_journal_flush_async(&c->journal, NULL); ++ return 0; ++ } ++ ++ return bch2_journal_flush(&c->journal); ++} ++ ++static struct bch_fs *bch2_path_to_fs(const char *dev) ++{ ++ struct bch_fs *c; ++ struct block_device *bdev = lookup_bdev(dev); ++ ++ if (IS_ERR(bdev)) ++ return ERR_CAST(bdev); ++ ++ c = bch2_bdev_to_fs(bdev); ++ bdput(bdev); ++ return c ?: ERR_PTR(-ENOENT); ++} ++ ++static struct bch_fs *__bch2_open_as_blockdevs(const char *dev_name, char * const *devs, ++ unsigned nr_devs, struct bch_opts opts) ++{ ++ struct bch_fs *c, *c1, *c2; ++ size_t i; ++ ++ if (!nr_devs) ++ return ERR_PTR(-EINVAL); ++ ++ c = bch2_fs_open(devs, nr_devs, opts); ++ ++ if (IS_ERR(c) && PTR_ERR(c) == -EBUSY) { ++ /* ++ * Already open? 
++ * Look up each block device, make sure they all belong to a ++ * filesystem and they all belong to the _same_ filesystem ++ */ ++ ++ c1 = bch2_path_to_fs(devs[0]); ++ if (IS_ERR(c1)) ++ return c; ++ ++ for (i = 1; i < nr_devs; i++) { ++ c2 = bch2_path_to_fs(devs[i]); ++ if (!IS_ERR(c2)) ++ closure_put(&c2->cl); ++ ++ if (c1 != c2) { ++ closure_put(&c1->cl); ++ return c; ++ } ++ } ++ ++ c = c1; ++ } ++ ++ if (IS_ERR(c)) ++ return c; ++ ++ down_write(&c->state_lock); ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) { ++ up_write(&c->state_lock); ++ closure_put(&c->cl); ++ pr_err("err mounting %s: incomplete filesystem", dev_name); ++ return ERR_PTR(-EINVAL); ++ } ++ ++ up_write(&c->state_lock); ++ ++ set_bit(BCH_FS_BDEV_MOUNTED, &c->flags); ++ return c; ++} ++ ++static struct bch_fs *bch2_open_as_blockdevs(const char *_dev_name, ++ struct bch_opts opts) ++{ ++ char *dev_name = NULL, **devs = NULL, *s; ++ struct bch_fs *c = ERR_PTR(-ENOMEM); ++ size_t i, nr_devs = 0; ++ ++ dev_name = kstrdup(_dev_name, GFP_KERNEL); ++ if (!dev_name) ++ goto err; ++ ++ for (s = dev_name; s; s = strchr(s + 1, ':')) ++ nr_devs++; ++ ++ devs = kcalloc(nr_devs, sizeof(const char *), GFP_KERNEL); ++ if (!devs) ++ goto err; ++ ++ for (i = 0, s = dev_name; ++ s; ++ (s = strchr(s, ':')) && (*s++ = '\0')) ++ devs[i++] = s; ++ ++ c = __bch2_open_as_blockdevs(_dev_name, devs, nr_devs, opts); ++err: ++ kfree(devs); ++ kfree(dev_name); ++ return c; ++} ++ ++static int bch2_remount(struct super_block *sb, int *flags, char *data) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_opts opts = bch2_opts_empty(); ++ int ret; ++ ++ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(&opts, data); ++ if (ret) ++ return ret; ++ ++ if (opts.read_only != c->opts.read_only) { ++ down_write(&c->state_lock); ++ ++ if (opts.read_only) { ++ bch2_fs_read_only(c); ++ ++ sb->s_flags |= SB_RDONLY; ++ } else { ++ ret = bch2_fs_read_write(c); ++ if (ret) { ++ bch_err(c, "error going rw: %i", ret); ++ up_write(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ sb->s_flags &= ~SB_RDONLY; ++ } ++ ++ c->opts.read_only = opts.read_only; ++ ++ up_write(&c->state_lock); ++ } ++ ++ if (opts.errors >= 0) ++ c->opts.errors = opts.errors; ++ ++ return ret; ++} ++ ++static int bch2_show_devname(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ struct bch_dev *ca; ++ unsigned i; ++ bool first = true; ++ ++ for_each_online_member(ca, c, i) { ++ if (!first) ++ seq_putc(seq, ':'); ++ first = false; ++ seq_puts(seq, "/dev/"); ++ seq_puts(seq, ca->name); ++ } ++ ++ return 0; ++} ++ ++static int bch2_show_options(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ enum bch_opt_id i; ++ char buf[512]; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->mode & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ bch2_opt_to_text(&PBUF(buf), c, opt, v, ++ OPT_SHOW_MOUNT_STYLE); ++ seq_putc(seq, ','); ++ seq_puts(seq, buf); ++ } ++ ++ return 0; ++} ++ ++static const struct super_operations bch_super_operations = { ++ .alloc_inode = bch2_alloc_inode, ++ .destroy_inode = bch2_destroy_inode, ++ .write_inode = bch2_vfs_write_inode, ++ .evict_inode = bch2_evict_inode, ++ .sync_fs = bch2_sync_fs, ++ .statfs = bch2_statfs, ++ .show_devname = bch2_show_devname, ++ .show_options = bch2_show_options, 
++ .remount_fs = bch2_remount, ++#if 0 ++ .put_super = bch2_put_super, ++ .freeze_fs = bch2_freeze, ++ .unfreeze_fs = bch2_unfreeze, ++#endif ++}; ++ ++static int bch2_test_super(struct super_block *s, void *data) ++{ ++ return s->s_fs_info == data; ++} ++ ++static int bch2_set_super(struct super_block *s, void *data) ++{ ++ s->s_fs_info = data; ++ return 0; ++} ++ ++static struct dentry *bch2_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, void *data) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ struct super_block *sb; ++ struct inode *vinode; ++ struct bch_opts opts = bch2_opts_empty(); ++ unsigned i; ++ int ret; ++ ++ opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(&opts, data); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ c = bch2_open_as_blockdevs(dev_name, opts); ++ if (IS_ERR(c)) ++ return ERR_CAST(c); ++ ++ sb = sget(fs_type, bch2_test_super, bch2_set_super, flags|SB_NOSEC, c); ++ if (IS_ERR(sb)) { ++ closure_put(&c->cl); ++ return ERR_CAST(sb); ++ } ++ ++ BUG_ON(sb->s_fs_info != c); ++ ++ if (sb->s_root) { ++ closure_put(&c->cl); ++ ++ if ((flags ^ sb->s_flags) & SB_RDONLY) { ++ ret = -EBUSY; ++ goto err_put_super; ++ } ++ goto out; ++ } ++ ++ sb->s_blocksize = block_bytes(c); ++ sb->s_blocksize_bits = ilog2(block_bytes(c)); ++ sb->s_maxbytes = MAX_LFS_FILESIZE; ++ sb->s_op = &bch_super_operations; ++ sb->s_export_op = &bch_export_ops; ++#ifdef CONFIG_BCACHEFS_QUOTA ++ sb->s_qcop = &bch2_quotactl_operations; ++ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; ++#endif ++ sb->s_xattr = bch2_xattr_handlers; ++ sb->s_magic = BCACHEFS_STATFS_MAGIC; ++ sb->s_time_gran = c->sb.time_precision; ++ c->vfs_sb = sb; ++ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); ++ ++ ret = super_setup_bdi(sb); ++ if (ret) ++ goto err_put_super; ++ ++ sb->s_bdi->congested_fn = bch2_congested; ++ sb->s_bdi->congested_data = c; ++ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; ++ ++ for_each_online_member(ca, c, i) { ++ struct block_device *bdev = ca->disk_sb.bdev; ++ ++ /* XXX: create an anonymous device for multi device filesystems */ ++ sb->s_bdev = bdev; ++ sb->s_dev = bdev->bd_dev; ++ percpu_ref_put(&ca->io_ref); ++ break; ++ } ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ if (c->opts.acl) ++ sb->s_flags |= SB_POSIXACL; ++#endif ++ ++ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); ++ if (IS_ERR(vinode)) { ++ bch_err(c, "error mounting: error getting root inode %i", ++ (int) PTR_ERR(vinode)); ++ ret = PTR_ERR(vinode); ++ goto err_put_super; ++ } ++ ++ sb->s_root = d_make_root(vinode); ++ if (!sb->s_root) { ++ bch_err(c, "error mounting: error allocating root dentry"); ++ ret = -ENOMEM; ++ goto err_put_super; ++ } ++ ++ sb->s_flags |= SB_ACTIVE; ++out: ++ return dget(sb->s_root); ++ ++err_put_super: ++ deactivate_locked_super(sb); ++ return ERR_PTR(ret); ++} ++ ++static void bch2_kill_sb(struct super_block *sb) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ generic_shutdown_super(sb); ++ ++ if (test_bit(BCH_FS_BDEV_MOUNTED, &c->flags)) ++ bch2_fs_stop(c); ++ else ++ closure_put(&c->cl); ++} ++ ++static struct file_system_type bcache_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "bcachefs", ++ .mount = bch2_mount, ++ .kill_sb = bch2_kill_sb, ++ .fs_flags = FS_REQUIRES_DEV, ++}; ++ ++MODULE_ALIAS_FS("bcachefs"); ++ ++void bch2_vfs_exit(void) ++{ ++ unregister_filesystem(&bcache_fs_type); ++ if (bch2_inode_cache) ++ kmem_cache_destroy(bch2_inode_cache); ++} ++ ++int __init bch2_vfs_init(void) ++{ ++ int ret = -ENOMEM; ++ ++ 
bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); ++ if (!bch2_inode_cache) ++ goto err; ++ ++ ret = register_filesystem(&bcache_fs_type); ++ if (ret) ++ goto err; ++ ++ return 0; ++err: ++ bch2_vfs_exit(); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +new file mode 100644 +index 000000000000..eda903a45325 +--- /dev/null ++++ b/fs/bcachefs/fs.h +@@ -0,0 +1,174 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_H ++#define _BCACHEFS_FS_H ++ ++#include "inode.h" ++#include "opts.h" ++#include "str_hash.h" ++#include "quota_types.h" ++ ++#include ++#include ++ ++/* ++ * Two-state lock - can be taken for add or block - both states are shared, ++ * like read side of rwsem, but conflict with other state: ++ */ ++struct pagecache_lock { ++ atomic_long_t v; ++ wait_queue_head_t wait; ++}; ++ ++static inline void pagecache_lock_init(struct pagecache_lock *lock) ++{ ++ atomic_long_set(&lock->v, 0); ++ init_waitqueue_head(&lock->wait); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *); ++void bch2_pagecache_add_get(struct pagecache_lock *); ++void bch2_pagecache_block_put(struct pagecache_lock *); ++void bch2_pagecache_block_get(struct pagecache_lock *); ++ ++struct bch_inode_info { ++ struct inode v; ++ ++ struct mutex ei_update_lock; ++ u64 ei_journal_seq; ++ u64 ei_quota_reserved; ++ unsigned long ei_last_dirtied; ++ ++ struct pagecache_lock ei_pagecache_lock; ++ ++ struct mutex ei_quota_lock; ++ struct bch_qid ei_qid; ++ ++ struct bch_hash_info ei_str_hash; ++ ++ /* copy of inode in btree: */ ++ struct bch_inode_unpacked ei_inode; ++}; ++ ++#define to_bch_ei(_inode) \ ++ container_of_or_null(_inode, struct bch_inode_info, v) ++ ++static inline int ptrcmp(void *l, void *r) ++{ ++ return cmp_int(l, r); ++} ++ ++enum bch_inode_lock_op { ++ INODE_LOCK = (1U << 0), ++ INODE_PAGECACHE_BLOCK = (1U << 1), ++ INODE_UPDATE_LOCK = (1U << 2), ++}; ++ ++#define bch2_lock_inodes(_locks, ...) \ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ down_write_nested(&a[i]->v.i_rwsem, i); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_lock_nested(&a[i]->ei_update_lock, i);\ ++ } \ ++} while (0) ++ ++#define bch2_unlock_inodes(_locks, ...) 
\ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ up_write(&a[i]->v.i_rwsem); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_unlock(&a[i]->ei_update_lock); \ ++ } \ ++} while (0) ++ ++static inline struct bch_inode_info *file_bch_inode(struct file *file) ++{ ++ return to_bch_ei(file_inode(file)); ++} ++ ++static inline bool inode_attr_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode, ++ enum inode_opt_id id) ++{ ++ return !(inode->ei_inode.bi_fields_set & (1 << id)) && ++ bch2_inode_opt_get(&dir->ei_inode, id) != ++ bch2_inode_opt_get(&inode->ei_inode, id); ++} ++ ++static inline bool inode_attrs_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode) ++{ ++ unsigned id; ++ ++ for (id = 0; id < Inode_opt_nr; id++) ++ if (inode_attr_changing(dir, inode, id)) ++ return true; ++ ++ return false; ++} ++ ++struct bch_inode_unpacked; ++ ++#ifndef NO_BCACHEFS_FS ++ ++int bch2_fs_quota_transfer(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_qid, ++ unsigned, ++ enum quota_acct_mode); ++ ++static inline int bch2_set_projid(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ u32 projid) ++{ ++ struct bch_qid qid = inode->ei_qid; ++ ++ qid.q[QTYP_PRJ] = projid; ++ ++ return bch2_fs_quota_transfer(c, inode, qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); ++ ++/* returns 0 if we want to do the update, or error is passed up */ ++typedef int (*inode_set_fn)(struct bch_inode_info *, ++ struct bch_inode_unpacked *, void *); ++ ++void bch2_inode_update_after_write(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *, ++ unsigned); ++int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, ++ inode_set_fn, void *, unsigned); ++ ++void bch2_vfs_exit(void); ++int bch2_vfs_init(void); ++ ++#else ++ ++static inline void bch2_vfs_exit(void) {} ++static inline int bch2_vfs_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_FS_H */ +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +new file mode 100644 +index 000000000000..5a6df3d1973a +--- /dev/null ++++ b/fs/bcachefs/fsck.c +@@ -0,0 +1,1502 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "inode.h" ++#include "keylist.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include /* struct qstr */ ++#include ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 sectors = 0; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, ++ POS(inum, 0), 0, k, ret) { ++ if (k.k->p.inode != inum) ++ break; ++ ++ if (bkey_extent_is_allocation(k.k)) ++ sectors += k.k->size; ++ } ++ ++ bch2_trans_iter_free(trans, iter); ++ ++ return ret ?: sectors; ++} ++ ++static int __remove_dirent(struct btree_trans *trans, ++ struct bkey_s_c_dirent dirent) ++{ ++ struct bch_fs *c = trans->c; ++ struct qstr name; ++ struct bch_inode_unpacked dir_inode; ++ struct bch_hash_info 
dir_hash_info; ++ u64 dir_inum = dirent.k->p.inode; ++ int ret; ++ char *buf; ++ ++ name.len = bch2_dirent_name_bytes(dirent); ++ buf = bch2_trans_kmalloc(trans, name.len + 1); ++ if (IS_ERR(buf)) ++ return PTR_ERR(buf); ++ ++ memcpy(buf, dirent.v->d_name, name.len); ++ buf[name.len] = '\0'; ++ name.name = buf; ++ ++ ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); ++ if (ret && ret != -EINTR) ++ bch_err(c, "remove_dirent: err %i looking up directory inode", ret); ++ if (ret) ++ return ret; ++ ++ dir_hash_info = bch2_hash_info_init(c, &dir_inode); ++ ++ ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, ++ &dir_hash_info, dir_inum, &name); ++ if (ret && ret != -EINTR) ++ bch_err(c, "remove_dirent: err %i deleting dirent", ret); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++static int remove_dirent(struct btree_trans *trans, ++ struct bkey_s_c_dirent dirent) ++{ ++ return __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ __remove_dirent(trans, dirent)); ++} ++ ++static int reattach_inode(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ u64 inum) ++{ ++ struct bch_inode_unpacked dir_u, inode_u; ++ char name_buf[20]; ++ struct qstr name; ++ int ret; ++ ++ snprintf(name_buf, sizeof(name_buf), "%llu", inum); ++ name = (struct qstr) QSTR(name_buf); ++ ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_LAZY_RW, ++ bch2_link_trans(&trans, lostfound_inode->bi_inum, ++ inum, &dir_u, &inode_u, &name)); ++ if (ret) ++ bch_err(c, "error %i reattaching inode %llu", ret, inum); ++ ++ return ret; ++} ++ ++struct inode_walker { ++ bool first_this_inode; ++ bool have_inode; ++ u64 cur_inum; ++ struct bch_inode_unpacked inode; ++}; ++ ++static struct inode_walker inode_walker_init(void) ++{ ++ return (struct inode_walker) { ++ .cur_inum = -1, ++ .have_inode = false, ++ }; ++} ++ ++static int walk_inode(struct btree_trans *trans, ++ struct inode_walker *w, u64 inum) ++{ ++ if (inum != w->cur_inum) { ++ int ret = bch2_inode_find_by_inum_trans(trans, inum, ++ &w->inode); ++ ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ w->have_inode = !ret; ++ w->cur_inum = inum; ++ w->first_this_inode = true; ++ } else { ++ w->first_this_inode = false; ++ } ++ ++ return 0; ++} ++ ++struct hash_check { ++ struct bch_hash_info info; ++ ++ /* start of current chain of hash collisions: */ ++ struct btree_iter *chain; ++ ++ /* next offset in current chain of hash collisions: */ ++ u64 chain_end; ++}; ++ ++static void hash_check_init(struct hash_check *h) ++{ ++ h->chain = NULL; ++ h->chain_end = 0; ++} ++ ++static void hash_stop_chain(struct btree_trans *trans, ++ struct hash_check *h) ++{ ++ if (h->chain) ++ bch2_trans_iter_free(trans, h->chain); ++ h->chain = NULL; ++} ++ ++static void hash_check_set_inode(struct btree_trans *trans, ++ struct hash_check *h, ++ const struct bch_inode_unpacked *bi) ++{ ++ h->info = bch2_hash_info_init(trans->c, bi); ++ hash_stop_chain(trans, h); ++} ++ ++static int hash_redo_key(const struct bch_hash_desc desc, ++ struct btree_trans *trans, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k, ++ u64 hashed) ++{ ++ struct bkey_i delete; ++ struct bkey_i *tmp; ++ ++ tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if (IS_ERR(tmp)) ++ return PTR_ERR(tmp); ++ ++ bkey_reassemble(tmp, k); ++ ++ bkey_init(&delete.k); ++ delete.k.p = k_iter->pos; ++ bch2_trans_update(trans, k_iter, &delete, 0); ++ ++ return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, ++ tmp, BCH_HASH_SET_MUST_CREATE); 
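
The inode_walker/walk_inode pair above is a small memoization helper: the fsck passes visit btree keys grouped by inode number, so the walker only re-reads the inode when that number changes and remembers whether the lookup succeeded. A standalone sketch of the same pattern, with a function-pointer stand-in for bch2_inode_find_by_inum_trans() (all toy_* names are invented):

#include <stdbool.h>
#include <stdint.h>

struct toy_walker {
	uint64_t	cur_inum;
	bool		have_inode;
	bool		first_this_inode;
};

static struct toy_walker toy_walker_init(void)
{
	/* -1 so the first key always triggers a lookup */
	return (struct toy_walker) { .cur_inum = UINT64_MAX };
}

static void toy_walk_inode(struct toy_walker *w, uint64_t inum,
			   bool (*lookup)(uint64_t inum))
{
	if (inum != w->cur_inum) {
		w->have_inode	    = lookup(inum);
		w->cur_inum	    = inum;
		w->first_this_inode = true;
	} else {
		w->first_this_inode = false;
	}
}
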
++} ++ ++static int fsck_hash_delete_at(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *info, ++ struct btree_iter *iter) ++{ ++ int ret; ++retry: ++ ret = bch2_hash_delete_at(trans, desc, info, iter) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret == -EINTR) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (!ret) ++ goto retry; ++ } ++ ++ return ret; ++} ++ ++static int hash_check_duplicates(struct btree_trans *trans, ++ const struct bch_hash_desc desc, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k2; ++ char buf[200]; ++ int ret = 0; ++ ++ if (!bkey_cmp(h->chain->pos, k_iter->pos)) ++ return 0; ++ ++ iter = bch2_trans_copy_iter(trans, h->chain); ++ BUG_ON(IS_ERR(iter)); ++ ++ for_each_btree_key_continue(iter, 0, k2, ret) { ++ if (bkey_cmp(k2.k->p, k.k->p) >= 0) ++ break; ++ ++ if (fsck_err_on(k2.k->type == desc.key_type && ++ !desc.cmp_bkey(k, k2), c, ++ "duplicate hash table keys:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); ++ if (ret) ++ return ret; ++ ret = 1; ++ break; ++ } ++ } ++fsck_err: ++ bch2_trans_iter_free(trans, iter); ++ return ret; ++} ++ ++static void hash_set_chain_start(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ bool hole = (k.k->type != KEY_TYPE_whiteout && ++ k.k->type != desc.key_type); ++ ++ if (hole || k.k->p.offset > h->chain_end + 1) ++ hash_stop_chain(trans, h); ++ ++ if (!hole) { ++ if (!h->chain) { ++ h->chain = bch2_trans_copy_iter(trans, k_iter); ++ BUG_ON(IS_ERR(h->chain)); ++ } ++ ++ h->chain_end = k.k->p.offset; ++ } ++} ++ ++static bool key_has_correct_hash(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ u64 hash; ++ ++ hash_set_chain_start(trans, desc, h, k_iter, k); ++ ++ if (k.k->type != desc.key_type) ++ return true; ++ ++ hash = desc.hash_bkey(&h->info, k); ++ ++ return hash >= h->chain->pos.offset && ++ hash <= k.k->p.offset; ++} ++ ++static int hash_check_key(struct btree_trans *trans, ++ const struct bch_hash_desc desc, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ char buf[200]; ++ u64 hashed; ++ int ret = 0; ++ ++ hash_set_chain_start(trans, desc, h, k_iter, k); ++ ++ if (k.k->type != desc.key_type) ++ return 0; ++ ++ hashed = desc.hash_bkey(&h->info, k); ++ ++ if (fsck_err_on(hashed < h->chain->pos.offset || ++ hashed > k.k->p.offset, c, ++ "hash table key at wrong offset: btree %u, %llu, " ++ "hashed to %llu chain starts at %llu\n%s", ++ desc.btree_id, k.k->p.offset, ++ hashed, h->chain->pos.offset, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ hash_redo_key(desc, trans, h, k_iter, k, hashed)); ++ if (ret) { ++ bch_err(c, "hash_redo_key err %i", ret); ++ return ret; ++ } ++ return 1; ++ } ++ ++ ret = hash_check_duplicates(trans, desc, h, k_iter, k); ++fsck_err: ++ return ret; ++} ++ ++static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, ++ struct btree_iter *iter, struct bkey_s_c *k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i_dirent *d = NULL; ++ int ret = -EINVAL; ++ char 
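
key_has_correct_hash() and hash_check_key() enforce the usual linear-probing invariant for the hashed btrees (dirents, xattrs): a key may only sit at or after the offset it hashes to, with no hole in between, otherwise a lookup starting from the hash would never reach it; offenders are deleted and reinserted by hash_redo_key(). A toy in-memory analogue of that reachability check, not bcachefs code (a modular table stands in for the hash-ordered btree):

#include <stdbool.h>
#include <stddef.h>

struct toy_slot {
	bool		used;
	unsigned long	key;
};

static size_t toy_hash(unsigned long key, size_t nr)
{
	return key % nr;
}

/*
 * A key stored at slot `pos` is only findable if every slot from
 * hash(key) up to pos is occupied, i.e. pos lies inside the collision
 * chain that starts at the key's hash bucket.
 */
static bool toy_key_reachable(const struct toy_slot *tbl, size_t nr, size_t pos)
{
	for (size_t i = toy_hash(tbl[pos].key, nr); i != pos; i = (i + 1) % nr)
		if (!tbl[i].used)
			return false;	/* hole before the key: lookups miss it */
	return true;
}
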
buf[200]; ++ unsigned len; ++ u64 hash; ++ ++ if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) ++ return 0; ++ ++ len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); ++ BUG_ON(!len); ++ ++ memcpy(buf, bkey_s_c_to_dirent(*k).v->d_name, len); ++ buf[len] = '\0'; ++ ++ d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); ++ if (!d) { ++ bch_err(c, "memory allocation failure"); ++ return -ENOMEM; ++ } ++ ++ bkey_reassemble(&d->k_i, *k); ++ ++ do { ++ --len; ++ if (!len) ++ goto err_redo; ++ ++ d->k.u64s = BKEY_U64s + dirent_val_u64s(len); ++ ++ BUG_ON(bkey_val_bytes(&d->k) < ++ offsetof(struct bch_dirent, d_name) + len); ++ ++ memset(d->v.d_name + len, 0, ++ bkey_val_bytes(&d->k) - ++ offsetof(struct bch_dirent, d_name) - len); ++ ++ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, ++ bkey_i_to_s_c(&d->k_i)); ++ } while (hash < h->chain->pos.offset || ++ hash > k->k->p.offset); ++ ++ if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", ++ buf, strlen(buf), d->v.d_name, len)) { ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); ++ if (ret) ++ goto err; ++ ++ *k = bch2_btree_iter_peek(iter); ++ ++ BUG_ON(k->k->type != KEY_TYPE_dirent); ++ } ++err: ++fsck_err: ++ kfree(d); ++ return ret; ++err_redo: ++ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); ++ ++ if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" ++ "hash table key at wrong offset: btree %u, offset %llu, " ++ "hashed to %llu chain starts at %llu\n%s", ++ buf, strlen(buf), BTREE_ID_DIRENTS, ++ k->k->p.offset, hash, h->chain->pos.offset, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ *k), buf))) { ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ hash_redo_key(bch2_dirent_hash_desc, trans, ++ h, iter, *k, hash)); ++ if (ret) ++ bch_err(c, "hash_redo_key err %i", ret); ++ else ++ ret = 1; ++ } ++ ++ goto err; ++} ++ ++static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) ++{ ++ return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ POS(inode_nr, round_up(new_size, block_bytes(c)) >> 9), ++ POS(inode_nr + 1, 0), NULL); ++} ++ ++static int bch2_fix_overlapping_extent(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, struct bpos cut_at) ++{ ++ struct btree_iter *u_iter; ++ struct bkey_i *u; ++ int ret; ++ ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(u, k); ++ bch2_cut_front(cut_at, u); ++ ++ u_iter = bch2_trans_copy_iter(trans, iter); ++ ret = PTR_ERR_OR_ZERO(u_iter); ++ if (ret) ++ return ret; ++ ++ /* ++ * We don't want to go through the ++ * extent_handle_overwrites path: ++ */ ++ __bch2_btree_iter_set_pos(u_iter, u->k.p, false); ++ ++ /* ++ * XXX: this is going to leave disk space ++ * accounting slightly wrong ++ */ ++ ret = bch2_trans_update(trans, u_iter, u, 0); ++ bch2_trans_iter_put(trans, u_iter); ++ return ret; ++} ++ ++/* ++ * Walk extents: verify that extents have a corresponding S_ISREG inode, and ++ * that i_size an i_sectors are consistent ++ */ ++noinline_for_stack ++static int check_extents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_on_stack prev; ++ u64 i_sectors; ++ int ret = 0; ++ ++ bkey_on_stack_init(&prev); ++ prev.k->k = KEY(0, 0, 0); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 
0); ++ ++ bch_verbose(c, "checking extents"); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ char buf1[200]; ++ char buf2[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, k); ++ ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_fix_overlapping_extent(&trans, ++ iter, k, prev.k->k.p)); ++ if (ret) ++ goto err; ++ } ++ } ++ bkey_on_stack_reassemble(&prev, c, k); ++ ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "extent type %u for missing inode %llu", ++ k.k->type, k.k->p.inode) || ++ fsck_err_on(w.have_inode && ++ !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, ++ "extent type %u for non regular file, inode %llu mode %o", ++ k.k->type, k.k->p.inode, w.inode.bi_mode)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = bch2_inode_truncate(c, k.k->p.inode, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(w.first_this_inode && ++ w.have_inode && ++ !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && ++ w.inode.bi_sectors != ++ (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), ++ c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", ++ w.inode.bi_inum, ++ w.inode.bi_sectors, i_sectors)) { ++ struct bkey_inode_buf p; ++ ++ w.inode.bi_sectors = i_sectors; ++ ++ bch2_trans_unlock(&trans); ++ ++ bch2_inode_pack(&p, &w.inode); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_INODES, ++ &p.inode.k_i, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret) { ++ bch_err(c, "error in fsck: error %i updating inode", ret); ++ goto err; ++ } ++ ++ /* revalidate iterator: */ ++ k = bch2_btree_iter_peek(iter); ++ } ++ ++ if (fsck_err_on(w.have_inode && ++ !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ k.k->type != KEY_TYPE_reservation && ++ k.k->p.offset > round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, ++ "extent type %u offset %llu past end of inode %llu, i_size %llu", ++ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = bch2_inode_truncate(c, k.k->p.inode, ++ w.inode.bi_size); ++ if (ret) ++ goto err; ++ continue; ++ } ++ } ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ bkey_on_stack_exit(&prev, c); ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* ++ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, ++ * validate d_type ++ */ ++noinline_for_stack ++static int check_dirents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct hash_check h; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ unsigned name_len; ++ char buf[200]; ++ int ret = 0; ++ ++ bch_verbose(c, "checking dirents"); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ hash_check_init(&h); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, ++ POS(BCACHEFS_ROOT_INO, 0), 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ struct bkey_s_c_dirent d; ++ struct bch_inode_unpacked target; ++ bool have_target; ++ u64 d_inum; ++ ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "dirent in nonexisting directory:\n%s", ++ 
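
The i_sectors check in check_extents() is, at heart, a sum-and-compare: add up the sizes of all allocated extents belonging to an inode (reservations don't count) and compare against the count stored in the inode itself. A self-contained toy model of just that arithmetic, with invented toy_* types and example numbers:

#include <stdint.h>
#include <stdio.h>

struct toy_extent { uint64_t inum; uint64_t sectors; int allocated; };
struct toy_inode  { uint64_t inum; uint64_t i_sectors; };

static uint64_t toy_count_inode_sectors(const struct toy_extent *e, size_t nr,
					uint64_t inum)
{
	uint64_t sectors = 0;

	for (size_t i = 0; i < nr; i++)
		if (e[i].inum == inum && e[i].allocated)
			sectors += e[i].sectors;
	return sectors;
}

int main(void)
{
	struct toy_extent extents[] = {
		{ .inum = 4096, .sectors =  8, .allocated = 1 },
		{ .inum = 4096, .sectors =  8, .allocated = 1 },
		{ .inum = 4096, .sectors = 16, .allocated = 0 },  /* reservation */
	};
	struct toy_inode inode = { .inum = 4096, .i_sectors = 24 };
	uint64_t real = toy_count_inode_sectors(extents,
				sizeof(extents) / sizeof(extents[0]), inode.inum);

	if (inode.i_sectors != real)
		printf("inode %llu has incorrect i_sectors: got %llu, should be %llu\n",
		       (unsigned long long) inode.inum,
		       (unsigned long long) inode.i_sectors,
		       (unsigned long long) real);
	return 0;
}

In the real pass the repair is to rewrite the inode with the recomputed value; here the mismatch (24 vs 16) is only reported.
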
(bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf)) || ++ fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, ++ "dirent in non directory inode type %u:\n%s", ++ mode_to_type(w.inode.bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (w.first_this_inode && w.have_inode) ++ hash_check_set_inode(&trans, &h, &w.inode); ++ ++ ret = check_dirent_hash(&trans, &h, iter, &k); ++ if (ret > 0) { ++ ret = 0; ++ continue; ++ } ++ if (ret) ++ goto fsck_err; ++ ++ if (ret) ++ goto fsck_err; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ d = bkey_s_c_to_dirent(k); ++ d_inum = le64_to_cpu(d.v->d_inum); ++ ++ name_len = bch2_dirent_name_bytes(d); ++ ++ if (fsck_err_on(!name_len, c, "empty dirent") || ++ fsck_err_on(name_len == 1 && ++ !memcmp(d.v->d_name, ".", 1), c, ++ ". dirent") || ++ fsck_err_on(name_len == 2 && ++ !memcmp(d.v->d_name, "..", 2), c, ++ ".. dirent") || ++ fsck_err_on(name_len == 2 && ++ !memcmp(d.v->d_name, "..", 2), c, ++ ".. dirent") || ++ fsck_err_on(memchr(d.v->d_name, '/', name_len), c, ++ "dirent name has invalid chars")) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(d_inum == d.k->p.inode, c, ++ "dirent points to own directory:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); ++ if (ret && ret != -ENOENT) ++ break; ++ ++ have_target = !ret; ++ ret = 0; ++ ++ if (fsck_err_on(!have_target, c, ++ "dirent points to missing inode:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(have_target && ++ d.v->d_type != ++ mode_to_type(target.bi_mode), c, ++ "incorrect d_type: should be %u:\n%s", ++ mode_to_type(target.bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ struct bkey_i_dirent *n; ++ ++ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_type = mode_to_type(target.bi_mode); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); ++ kfree(n); ++ if (ret) ++ goto err; ++ ++ } ++ } ++ ++ hash_stop_chain(&trans, &h); ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* ++ * Walk xattrs: verify that they all have a corresponding inode ++ */ ++noinline_for_stack ++static int check_xattrs(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct hash_check h; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch_verbose(c, "checking xattrs"); ++ ++ hash_check_init(&h); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ POS(BCACHEFS_ROOT_INO, 0), 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "xattr for missing inode %llu", ++ k.k->p.inode)) { ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (w.first_this_inode && w.have_inode) ++ hash_check_set_inode(&trans, &h, &w.inode); ++ ++ ret = 
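
check_dirents() rejects a handful of structurally invalid names before anything else: empty names, ".", "..", and names containing a '/' byte; such dirents are simply removed. The predicate boils down to the following standalone restatement (not the kernel helper itself):

#include <stdbool.h>
#include <string.h>

static bool toy_dirent_name_ok(const char *name, size_t len)
{
	if (!len)
		return false;				/* empty dirent */
	if (len == 1 && !memcmp(name, ".", 1))
		return false;				/* "." dirent */
	if (len == 2 && !memcmp(name, "..", 2))
		return false;				/* ".." dirent */
	if (memchr(name, '/', len))
		return false;				/* invalid chars */
	return true;
}
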
hash_check_key(&trans, bch2_xattr_hash_desc, ++ &h, iter, k); ++ if (ret) ++ goto fsck_err; ++ } ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* Get root directory, create if it doesn't exist: */ ++static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) ++{ ++ struct bkey_inode_buf packed; ++ int ret; ++ ++ bch_verbose(c, "checking root directory"); ++ ++ ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (fsck_err_on(ret, c, "root directory missing")) ++ goto create_root; ++ ++ if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, ++ "root inode not a directory")) ++ goto create_root; ++ ++ return 0; ++fsck_err: ++ return ret; ++create_root: ++ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, ++ 0, NULL); ++ root_inode->bi_inum = BCACHEFS_ROOT_INO; ++ ++ bch2_inode_pack(&packed, root_inode); ++ ++ return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, ++ NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++} ++ ++/* Get lost+found, create if it doesn't exist: */ ++static int check_lostfound(struct bch_fs *c, ++ struct bch_inode_unpacked *root_inode, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ struct qstr lostfound = QSTR("lost+found"); ++ struct bch_hash_info root_hash_info = ++ bch2_hash_info_init(c, root_inode); ++ u64 inum; ++ int ret; ++ ++ bch_verbose(c, "checking lost+found"); ++ ++ inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, ++ &lostfound); ++ if (!inum) { ++ bch_notice(c, "creating lost+found"); ++ goto create_lostfound; ++ } ++ ++ ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (fsck_err_on(ret, c, "lost+found missing")) ++ goto create_lostfound; ++ ++ if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, ++ "lost+found inode not a directory")) ++ goto create_lostfound; ++ ++ return 0; ++fsck_err: ++ return ret; ++create_lostfound: ++ bch2_inode_init_early(c, lostfound_inode); ++ ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_create_trans(&trans, ++ BCACHEFS_ROOT_INO, root_inode, ++ lostfound_inode, &lostfound, ++ 0, 0, S_IFDIR|0700, 0, NULL, NULL)); ++ if (ret) ++ bch_err(c, "error creating lost+found: %i", ret); ++ ++ return ret; ++} ++ ++struct inode_bitmap { ++ unsigned long *bits; ++ size_t size; ++}; ++ ++static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) ++{ ++ return nr < b->size ? 
test_bit(nr, b->bits) : false; ++} ++ ++static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) ++{ ++ if (nr >= b->size) { ++ size_t new_size = max_t(size_t, max_t(size_t, ++ PAGE_SIZE * 8, ++ b->size * 2), ++ nr + 1); ++ void *n; ++ ++ new_size = roundup_pow_of_two(new_size); ++ n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); ++ if (!n) { ++ return -ENOMEM; ++ } ++ ++ b->bits = n; ++ b->size = new_size; ++ } ++ ++ __set_bit(nr, b->bits); ++ return 0; ++} ++ ++struct pathbuf { ++ size_t nr; ++ size_t size; ++ ++ struct pathbuf_entry { ++ u64 inum; ++ u64 offset; ++ } *entries; ++}; ++ ++static int path_down(struct pathbuf *p, u64 inum) ++{ ++ if (p->nr == p->size) { ++ size_t new_size = max_t(size_t, 256UL, p->size * 2); ++ void *n = krealloc(p->entries, ++ new_size * sizeof(p->entries[0]), ++ GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ p->entries = n; ++ p->size = new_size; ++ }; ++ ++ p->entries[p->nr++] = (struct pathbuf_entry) { ++ .inum = inum, ++ .offset = 0, ++ }; ++ return 0; ++} ++ ++noinline_for_stack ++static int check_directory_structure(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ struct inode_bitmap dirs_done = { NULL, 0 }; ++ struct pathbuf path = { 0, 0, NULL }; ++ struct pathbuf_entry *e; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent dirent; ++ bool had_unreachable; ++ u64 d_inum; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ bch_verbose(c, "checking directory structure"); ++ ++ /* DFS: */ ++restart_dfs: ++ had_unreachable = false; ++ ++ ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); ++ if (ret) { ++ bch_err(c, "memory allocation failure in inode_bitmap_set()"); ++ goto err; ++ } ++ ++ ret = path_down(&path, BCACHEFS_ROOT_INO); ++ if (ret) ++ goto err; ++ ++ while (path.nr) { ++next: ++ e = &path.entries[path.nr - 1]; ++ ++ if (e->offset == U64_MAX) ++ goto up; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, ++ POS(e->inum, e->offset + 1), 0, k, ret) { ++ if (k.k->p.inode != e->inum) ++ break; ++ ++ e->offset = k.k->p.offset; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ dirent = bkey_s_c_to_dirent(k); ++ ++ if (dirent.v->d_type != DT_DIR) ++ continue; ++ ++ d_inum = le64_to_cpu(dirent.v->d_inum); ++ ++ if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, ++ "directory %llu has multiple hardlinks", ++ d_inum)) { ++ ret = remove_dirent(&trans, dirent); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ ret = inode_bitmap_set(&dirs_done, d_inum); ++ if (ret) { ++ bch_err(c, "memory allocation failure in inode_bitmap_set()"); ++ goto err; ++ } ++ ++ ret = path_down(&path, d_inum); ++ if (ret) { ++ goto err; ++ } ++ ++ ret = bch2_trans_iter_free(&trans, iter); ++ if (ret) { ++ bch_err(c, "btree error %i in fsck", ret); ++ goto err; ++ } ++ goto next; ++ } ++ ret = bch2_trans_iter_free(&trans, iter) ?: ret; ++ if (ret) { ++ bch_err(c, "btree error %i in fsck", ret); ++ goto err; ++ } ++up: ++ path.nr--; ++ } ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) ++ continue; ++ ++ ret = bch2_empty_dir_trans(&trans, k.k->p.inode); ++ if (ret == -EINTR) ++ goto retry; ++ if (!ret) ++ continue; ++ ++ if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, ++ "unreachable directory found (inum %llu)", ++ 
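
The directory-structure pass does an explicit-stack DFS over directories and marks every one it reaches in the inode_bitmap above; anything S_ISDIR left unmarked afterwards is unreachable and gets reattached under lost+found. The bitmap itself is just a grow-on-demand bitset; a user-space sketch of that part, using realloc in place of krealloc (toy_* names are invented):

#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct toy_bitmap {
	unsigned long	*bits;
	size_t		size;		/* in bits, always a power of two */
};

static bool toy_bitmap_test(const struct toy_bitmap *b, size_t nr)
{
	return nr < b->size &&
	       (b->bits[nr / (8 * sizeof(long))] >> (nr % (8 * sizeof(long)))) & 1;
}

static int toy_bitmap_set(struct toy_bitmap *b, size_t nr)
{
	if (nr >= b->size) {
		size_t new_bits = b->size ? b->size : 4096;
		unsigned long *n;

		while (new_bits <= nr)
			new_bits *= 2;

		n = realloc(b->bits, new_bits / 8);
		if (!n)
			return -ENOMEM;
		/* zero only the newly added tail */
		memset((char *) n + b->size / 8, 0, (new_bits - b->size) / 8);
		b->bits = n;
		b->size = new_bits;
	}
	b->bits[nr / (8 * sizeof(long))] |= 1UL << (nr % (8 * sizeof(long)));
	return 0;
}
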
k.k->p.offset)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = reattach_inode(c, lostfound_inode, k.k->p.offset); ++ if (ret) { ++ goto err; ++ } ++ ++ had_unreachable = true; ++ } ++ } ++ bch2_trans_iter_free(&trans, iter); ++ if (ret) ++ goto err; ++ ++ if (had_unreachable) { ++ bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); ++ kfree(dirs_done.bits); ++ kfree(path.entries); ++ memset(&dirs_done, 0, sizeof(dirs_done)); ++ memset(&path, 0, sizeof(path)); ++ goto restart_dfs; ++ } ++err: ++fsck_err: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ kfree(dirs_done.bits); ++ kfree(path.entries); ++ return ret; ++} ++ ++struct nlink { ++ u32 count; ++ u32 dir_count; ++}; ++ ++typedef GENRADIX(struct nlink) nlink_table; ++ ++static void inc_link(struct bch_fs *c, nlink_table *links, ++ u64 range_start, u64 *range_end, ++ u64 inum, bool dir) ++{ ++ struct nlink *link; ++ ++ if (inum < range_start || inum >= *range_end) ++ return; ++ ++ link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); ++ if (!link) { ++ bch_verbose(c, "allocation failed during fsck - will need another pass"); ++ *range_end = inum; ++ return; ++ } ++ ++ if (dir) ++ link->dir_count++; ++ else ++ link->count++; ++} ++ ++noinline_for_stack ++static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, ++ u64 range_start, u64 *range_end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ u64 d_inum; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { ++ switch (k.k->type) { ++ case KEY_TYPE_dirent: ++ d = bkey_s_c_to_dirent(k); ++ d_inum = le64_to_cpu(d.v->d_inum); ++ ++ if (d.v->d_type == DT_DIR) ++ inc_link(c, links, range_start, range_end, ++ d.k->p.inode, true); ++ ++ inc_link(c, links, range_start, range_end, ++ d_inum, false); ++ ++ break; ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ bch_err(c, "error in fsck: btree error %i while walking dirents", ret); ++ ++ return ret; ++} ++ ++static int check_inode_nlink(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ struct bch_inode_unpacked *u, ++ struct nlink *link, ++ bool *do_update) ++{ ++ u32 i_nlink = bch2_inode_nlink_get(u); ++ u32 real_i_nlink = ++ link->count * nlink_bias(u->bi_mode) + ++ link->dir_count; ++ int ret = 0; ++ ++ /* ++ * These should have been caught/fixed by earlier passes, we don't ++ * repair them here: ++ */ ++ if (S_ISDIR(u->bi_mode) && link->count > 1) { ++ need_fsck_err(c, "directory %llu with multiple hardlinks: %u", ++ u->bi_inum, link->count); ++ return 0; ++ } ++ ++ if (S_ISDIR(u->bi_mode) && !link->count) { ++ need_fsck_err(c, "unreachable directory found (inum %llu)", ++ u->bi_inum); ++ return 0; ++ } ++ ++ if (!S_ISDIR(u->bi_mode) && link->dir_count) { ++ need_fsck_err(c, "non directory with subdirectories (inum %llu)", ++ u->bi_inum); ++ return 0; ++ } ++ ++ if (!link->count && ++ !(u->bi_flags & BCH_INODE_UNLINKED) && ++ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { ++ if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", ++ u->bi_inum, mode_to_type(u->bi_mode)) == ++ FSCK_ERR_IGNORE) ++ return 0; ++ ++ ret = reattach_inode(c, lostfound_inode, u->bi_inum); ++ if (ret) ++ return ret; ++ ++ link->count = 1; ++ real_i_nlink = nlink_bias(u->bi_mode) + 
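
The nlink check is a two-pass affair: bch2_gc_walk_dirents() tallies, for every inode, how many dirents point at it (count) and, for directories, how many child directories it has (dir_count, one per child's ".." back-reference); check_inode_nlink() then compares i_nlink against count * bias + dir_count, where the bias is 2 for directories ("." plus the entry in the parent) and 1 otherwise. A toy restatement of that bookkeeping (links[] is indexed directly by inode number here, standing in for the genradix table):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct toy_dirent { uint64_t parent; uint64_t target; bool is_dir; };
struct toy_link   { uint32_t count; uint32_t dir_count; };

static unsigned toy_nlink_bias(bool is_dir)
{
	return is_dir ? 2 : 1;
}

static void toy_tally_links(const struct toy_dirent *d, size_t nr,
			    struct toy_link *links)
{
	for (size_t i = 0; i < nr; i++) {
		if (d[i].is_dir)
			links[d[i].parent].dir_count++;	/* child's ".." */
		links[d[i].target].count++;		/* the dirent itself */
	}
}

static uint32_t toy_expected_nlink(const struct toy_link *l, bool is_dir)
{
	return l->count * toy_nlink_bias(is_dir) + l->dir_count;
}
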
link->dir_count; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink < link->count) { ++ if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", ++ u->bi_inum, i_nlink, link->count, ++ mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink != real_i_nlink && ++ c->sb.clean) { ++ if (fsck_err(c, "filesystem marked clean, " ++ "but inode %llu has wrong i_nlink " ++ "(type %u i_nlink %u, should be %u)", ++ u->bi_inum, mode_to_type(u->bi_mode), ++ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink != real_i_nlink && ++ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { ++ if (fsck_err(c, "inode %llu has wrong i_nlink " ++ "(type %u i_nlink %u, should be %u)", ++ u->bi_inum, mode_to_type(u->bi_mode), ++ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (real_i_nlink && i_nlink != real_i_nlink) ++ bch_verbose(c, "setting inode %llu nlink from %u to %u", ++ u->bi_inum, i_nlink, real_i_nlink); ++set_i_nlink: ++ if (i_nlink != real_i_nlink) { ++ bch2_inode_nlink_set(u, real_i_nlink); ++ *do_update = true; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int check_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *lostfound_inode, ++ struct btree_iter *iter, ++ struct bkey_s_c_inode inode, ++ struct nlink *link) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ bool do_update = false; ++ int ret = 0; ++ ++ ret = bch2_inode_unpack(inode, &u); ++ ++ bch2_trans_unlock(trans); ++ ++ if (bch2_fs_inconsistent_on(ret, c, ++ "error unpacking inode %llu in fsck", ++ inode.k->p.inode)) ++ return ret; ++ ++ if (link) { ++ ret = check_inode_nlink(c, lostfound_inode, &u, link, ++ &do_update); ++ if (ret) ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_UNLINKED && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", ++ u.bi_inum))) { ++ bch_verbose(c, "deleting inode %llu", u.bi_inum); ++ ++ bch2_fs_lazy_rw(c); ++ ++ ret = bch2_inode_rm(c, u.bi_inum); ++ if (ret) ++ bch_err(c, "error in fsck: error %i while deleting inode", ret); ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", ++ u.bi_inum))) { ++ bch_verbose(c, "truncating inode %llu", u.bi_inum); ++ ++ bch2_fs_lazy_rw(c); ++ ++ /* ++ * XXX: need to truncate partial blocks too here - or ideally ++ * just switch units to bytes and that issue goes away ++ */ ++ ++ ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); ++ if (ret) { ++ bch_err(c, "error in fsck: error %i truncating inode", ret); ++ return ret; ++ } ++ ++ /* ++ * We truncated without our normal sector accounting hook, just ++ * make sure we recalculate it: ++ */ ++ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; ++ ++ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ do_update = true; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", ++ u.bi_inum))) { ++ s64 sectors; ++ ++ bch_verbose(c, "recounting sectors for inode %llu", ++ u.bi_inum); ++ ++ sectors = bch2_count_inode_sectors(trans, u.bi_inum); ++ if (sectors < 0) { ++ bch_err(c, "error in fsck: error %i recounting inode sectors", ++ (int) sectors); ++ return sectors; ++ } ++ ++ u.bi_sectors = sectors; ++ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; ++ do_update = true; ++ } ++ ++ if (do_update) { ++ struct bkey_inode_buf p; ++ ++ 
bch2_inode_pack(&p, &u); ++ ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); ++ if (ret) ++ bch_err(c, "error in fsck: error %i " ++ "updating inode", ret); ++ } ++fsck_err: ++ return ret; ++} ++ ++noinline_for_stack ++static int bch2_gc_walk_inodes(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ nlink_table *links, ++ u64 range_start, u64 range_end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct nlink *link, zero_links = { 0, 0 }; ++ struct genradix_iter nlinks_iter; ++ int ret = 0, ret2 = 0; ++ u64 nlinks_pos; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, ++ POS(0, range_start), 0); ++ nlinks_iter = genradix_iter_init(links, 0); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret2 = bkey_err(k))) { ++peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); ++ ++ if (!link && (!k.k || iter->pos.offset >= range_end)) ++ break; ++ ++ nlinks_pos = range_start + nlinks_iter.pos; ++ if (iter->pos.offset > nlinks_pos) { ++ /* Should have been caught by dirents pass: */ ++ need_fsck_err_on(link && link->count, c, ++ "missing inode %llu (nlink %u)", ++ nlinks_pos, link->count); ++ genradix_iter_advance(&nlinks_iter, links); ++ goto peek_nlinks; ++ } ++ ++ if (iter->pos.offset < nlinks_pos || !link) ++ link = &zero_links; ++ ++ if (k.k && k.k->type == KEY_TYPE_inode) { ++ ret = check_inode(&trans, lostfound_inode, iter, ++ bkey_s_c_to_inode(k), link); ++ BUG_ON(ret == -EINTR); ++ if (ret) ++ break; ++ } else { ++ /* Should have been caught by dirents pass: */ ++ need_fsck_err_on(link->count, c, ++ "missing inode %llu (nlink %u)", ++ nlinks_pos, link->count); ++ } ++ ++ if (nlinks_pos == iter->pos.offset) ++ genradix_iter_advance(&nlinks_iter, links); ++ ++ bch2_btree_iter_next(iter); ++ bch2_trans_cond_resched(&trans); ++ } ++fsck_err: ++ bch2_trans_exit(&trans); ++ ++ if (ret2) ++ bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); ++ ++ return ret ?: ret2; ++} ++ ++noinline_for_stack ++static int check_inode_nlinks(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ nlink_table links; ++ u64 this_iter_range_start, next_iter_range_start = 0; ++ int ret = 0; ++ ++ bch_verbose(c, "checking inode nlinks"); ++ ++ genradix_init(&links); ++ ++ do { ++ this_iter_range_start = next_iter_range_start; ++ next_iter_range_start = U64_MAX; ++ ++ ret = bch2_gc_walk_dirents(c, &links, ++ this_iter_range_start, ++ &next_iter_range_start); ++ if (ret) ++ break; ++ ++ ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, ++ this_iter_range_start, ++ next_iter_range_start); ++ if (ret) ++ break; ++ ++ genradix_free(&links); ++ } while (next_iter_range_start != U64_MAX); ++ ++ genradix_free(&links); ++ ++ return ret; ++} ++ ++/* ++ * Checks for inconsistencies that shouldn't happen, unless we have a bug. 
++ * Doesn't fix them yet, mainly because they haven't yet been observed: ++ */ ++int bch2_fsck_full(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ ++ return check_extents(c) ?: ++ check_dirents(c) ?: ++ check_xattrs(c) ?: ++ check_root(c, &root_inode) ?: ++ check_lostfound(c, &root_inode, &lostfound_inode) ?: ++ check_directory_structure(c, &lostfound_inode) ?: ++ check_inode_nlinks(c, &lostfound_inode); ++} ++ ++int bch2_fsck_inode_nlink(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ ++ return check_root(c, &root_inode) ?: ++ check_lostfound(c, &root_inode, &lostfound_inode) ?: ++ check_inode_nlinks(c, &lostfound_inode); ++} ++ ++int bch2_fsck_walk_inodes_only(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_inode inode; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ inode = bkey_s_c_to_inode(k); ++ ++ if (inode.v->bi_flags & ++ (BCH_INODE_I_SIZE_DIRTY| ++ BCH_INODE_I_SECTORS_DIRTY| ++ BCH_INODE_UNLINKED)) { ++ ret = check_inode(&trans, NULL, iter, inode, NULL); ++ BUG_ON(ret == -EINTR); ++ if (ret) ++ break; ++ } ++ } ++ BUG_ON(ret == -EINTR); ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} +diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h +new file mode 100644 +index 000000000000..9e4af02bde1e +--- /dev/null ++++ b/fs/bcachefs/fsck.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FSCK_H ++#define _BCACHEFS_FSCK_H ++ ++int bch2_fsck_full(struct bch_fs *); ++int bch2_fsck_inode_nlink(struct bch_fs *); ++int bch2_fsck_walk_inodes_only(struct bch_fs *); ++ ++#endif /* _BCACHEFS_FSCK_H */ +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +new file mode 100644 +index 000000000000..7d20f082ad45 +--- /dev/null ++++ b/fs/bcachefs/inode.c +@@ -0,0 +1,554 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "str_hash.h" ++ ++#include ++ ++#include ++ ++const char * const bch2_inode_opts[] = { ++#define x(name, ...) #name, ++ BCH_INODE_OPTS() ++#undef x ++ NULL, ++}; ++ ++static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; ++static const u8 bits_table[8] = { ++ 1 * 8 - 1, ++ 2 * 8 - 2, ++ 3 * 8 - 3, ++ 4 * 8 - 4, ++ 6 * 8 - 5, ++ 8 * 8 - 6, ++ 10 * 8 - 7, ++ 13 * 8 - 8, ++}; ++ ++static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) ++{ ++ __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; ++ unsigned shift, bytes, bits = likely(!hi) ++ ? 
fls64(lo) ++ : fls64(hi) + 64; ++ ++ for (shift = 1; shift <= 8; shift++) ++ if (bits < bits_table[shift - 1]) ++ goto got_shift; ++ ++ BUG(); ++got_shift: ++ bytes = byte_table[shift - 1]; ++ ++ BUG_ON(out + bytes > end); ++ ++ memcpy(out, (u8 *) in + 16 - bytes, bytes); ++ *out |= (1 << 8) >> shift; ++ ++ return bytes; ++} ++ ++static int inode_decode_field(const u8 *in, const u8 *end, ++ u64 out[2], unsigned *out_bits) ++{ ++ __be64 be[2] = { 0, 0 }; ++ unsigned bytes, shift; ++ u8 *p; ++ ++ if (in >= end) ++ return -1; ++ ++ if (!*in) ++ return -1; ++ ++ /* ++ * position of highest set bit indicates number of bytes: ++ * shift = number of bits to remove in high byte: ++ */ ++ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ ++ bytes = byte_table[shift - 1]; ++ ++ if (in + bytes > end) ++ return -1; ++ ++ p = (u8 *) be + 16 - bytes; ++ memcpy(p, in, bytes); ++ *p ^= (1 << 8) >> shift; ++ ++ out[0] = be64_to_cpu(be[0]); ++ out[1] = be64_to_cpu(be[1]); ++ *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]); ++ ++ return bytes; ++} ++ ++void bch2_inode_pack(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ u8 *out = packed->inode.v.fields; ++ u8 *end = (void *) &packed[1]; ++ u8 *last_nonzero_field = out; ++ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; ++ unsigned bytes; ++ ++ bkey_inode_init(&packed->inode.k_i); ++ packed->inode.k.p.offset = inode->bi_inum; ++ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; ++ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); ++ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ ++#define x(_name, _bits) \ ++ out += inode_encode_field(out, end, 0, inode->_name); \ ++ nr_fields++; \ ++ \ ++ if (inode->_name) { \ ++ last_nonzero_field = out; \ ++ last_nonzero_fieldnr = nr_fields; \ ++ } ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ out = last_nonzero_field; ++ nr_fields = last_nonzero_fieldnr; ++ ++ bytes = out - (u8 *) &packed->inode.v; ++ set_bkey_val_bytes(&packed->inode.k, bytes); ++ memset_u64s_tail(&packed->inode.v, 0, bytes); ++ ++ SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ struct bch_inode_unpacked unpacked; ++ ++ int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), ++ &unpacked); ++ BUG_ON(ret); ++ BUG_ON(unpacked.bi_inum != inode->bi_inum); ++ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); ++ BUG_ON(unpacked.bi_mode != inode->bi_mode); ++ ++#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); ++ BCH_INODE_FIELDS() ++#undef x ++ } ++} ++ ++int bch2_inode_unpack(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) ++{ ++ const u8 *in = inode.v->fields; ++ const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); ++ u64 field[2]; ++ unsigned fieldnr = 0, field_bits; ++ int ret; ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++#define x(_name, _bits) \ ++ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ ++ memset(&unpacked->_name, 0, \ ++ sizeof(*unpacked) - \ ++ offsetof(struct bch_inode_unpacked, _name)); \ ++ return 0; \ ++ } \ ++ \ ++ ret = inode_decode_field(in, end, field, &field_bits); \ ++ if (ret < 0) \ ++ return ret; \ ++ \ ++ if (field_bits > sizeof(unpacked->_name) * 8) \ ++ return -1; \ ++ \ ++ unpacked->_name = field[1]; \ ++ in += ret; ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? 
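
inode_encode_field()/inode_decode_field() store each inode field as a prefix-length varint: the position of the highest set bit in the first byte tells the decoder how many bytes the field occupies, so the small values that dominate inode fields cost a single byte. The kernel version handles up to 96-bit (hi,lo) values via byte_table/bits_table; the sketch below is a deliberately simplified single-u64 variant of the same idea (one marker bit per length step, 1..8 bytes, values below 2^56 only), just to show the encode/decode round trip:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* encode v into out[], return the number of bytes written (1..8) */
static int toy_encode(uint8_t *out, uint64_t v)
{
	for (int bytes = 1; bytes <= 8; bytes++) {
		int payload_bits = bytes * 8 - bytes;	/* top `bytes` bits form the marker */

		if (v < (1ULL << payload_bits)) {
			uint64_t x = v | (1ULL << payload_bits);	/* length marker */

			for (int i = bytes - 1; i >= 0; i--) {
				out[i] = x & 0xff;
				x >>= 8;
			}
			return bytes;
		}
	}
	return -1;	/* v >= 2^56: the real encoding grows to 13 bytes for this */
}

/* position of the highest set bit in the first byte gives the length */
static int toy_decode(const uint8_t *in, uint64_t *v)
{
	int bytes = 1;
	uint64_t x = 0;

	assert(in[0]);					/* 0 is never a valid first byte */
	while (!(in[0] & (0x80 >> (bytes - 1))))
		bytes++;

	for (int i = 0; i < bytes; i++)
		x = (x << 8) | in[i];

	*v = x ^ (1ULL << (bytes * 8 - bytes));		/* strip the marker bit */
	return bytes;
}

int main(void)
{
	uint64_t vals[] = { 0, 1, 127, 128, 300, 1ULL << 40 }, out;
	uint8_t buf[8];

	for (size_t i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
		int n = toy_encode(buf, vals[i]);

		assert(n > 0 && toy_decode(buf, &out) == n && out == vals[i]);
	}
	return 0;
}
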
*/ ++ ++ return 0; ++} ++ ++struct btree_iter *bch2_inode_peek(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u64 inum, unsigned flags) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), ++ BTREE_ITER_SLOTS|flags); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); ++ if (ret) ++ goto err; ++ ++ return iter; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ERR_PTR(ret); ++} ++ ++int bch2_inode_write(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode) ++{ ++ struct bkey_inode_buf *inode_p; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return PTR_ERR(inode_p); ++ ++ bch2_inode_pack(inode_p, inode); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++ return 0; ++} ++ ++const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ struct bch_inode_unpacked unpacked; ++ ++ if (k.k->p.inode) ++ return "nonzero k.p.inode"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) ++ return "incorrect value size"; ++ ++ if (k.k->p.offset < BLOCKDEV_INODE_MAX) ++ return "fs inode in blockdev range"; ++ ++ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) ++ return "invalid str hash type"; ++ ++ if (bch2_inode_unpack(inode, &unpacked)) ++ return "invalid variable length fields"; ++ ++ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) ++ return "invalid data checksum type"; ++ ++ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) ++ return "invalid data checksum type"; ++ ++ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && ++ unpacked.bi_nlink != 0) ++ return "flagged as unlinked but bi_nlink != 0"; ++ ++ return NULL; ++} ++ ++void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ struct bch_inode_unpacked unpacked; ++ ++ if (bch2_inode_unpack(inode, &unpacked)) { ++ pr_buf(out, "(unpack error)"); ++ return; ++ } ++ ++#define x(_name, _bits) \ ++ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); ++ BCH_INODE_FIELDS() ++#undef x ++} ++ ++const char *bch2_inode_generation_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ if (k.k->p.inode) ++ return "nonzero k.p.inode"; ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); ++ ++ pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); ++} ++ ++void bch2_inode_init_early(struct bch_fs *c, ++ struct bch_inode_unpacked *inode_u) ++{ ++ enum bch_str_hash_type str_hash = ++ bch2_str_hash_opt_to_type(c, c->opts.str_hash); ++ ++ memset(inode_u, 0, sizeof(*inode_u)); ++ ++ /* ick */ ++ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; ++ get_random_bytes(&inode_u->bi_hash_seed, ++ sizeof(inode_u->bi_hash_seed)); ++} ++ ++void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ inode_u->bi_mode = 
mode; ++ inode_u->bi_uid = uid; ++ inode_u->bi_gid = gid; ++ inode_u->bi_dev = rdev; ++ inode_u->bi_atime = now; ++ inode_u->bi_mtime = now; ++ inode_u->bi_ctime = now; ++ inode_u->bi_otime = now; ++ ++ if (parent && parent->bi_mode & S_ISGID) { ++ inode_u->bi_gid = parent->bi_gid; ++ if (S_ISDIR(mode)) ++ inode_u->bi_mode |= S_ISGID; ++ } ++ ++ if (parent) { ++#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ } ++} ++ ++void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ bch2_inode_init_early(c, inode_u); ++ bch2_inode_init_late(inode_u, bch2_current_time(c), ++ uid, gid, mode, rdev, parent); ++} ++ ++static inline u32 bkey_generation(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ BUG(); ++ case KEY_TYPE_inode_generation: ++ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); ++ default: ++ return 0; ++ } ++} ++ ++int bch2_inode_create(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ u64 min, u64 max, u64 *hint) ++{ ++ struct bkey_inode_buf *inode_p; ++ struct btree_iter *iter = NULL; ++ struct bkey_s_c k; ++ u64 start; ++ int ret; ++ ++ if (!max) ++ max = ULLONG_MAX; ++ ++ if (trans->c->opts.inodes_32bit) ++ max = min_t(u64, max, U32_MAX); ++ ++ start = READ_ONCE(*hint); ++ ++ if (start >= max || start < min) ++ start = min; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return PTR_ERR(inode_p); ++again: ++ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(iter->pos, POS(0, max)) > 0) ++ break; ++ ++ if (k.k->type != KEY_TYPE_inode) ++ goto found_slot; ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ ++ if (ret) ++ return ret; ++ ++ if (start != min) { ++ /* Retry from start */ ++ start = min; ++ goto again; ++ } ++ ++ return -ENOSPC; ++found_slot: ++ *hint = k.k->p.offset; ++ inode_u->bi_inum = k.k->p.offset; ++ inode_u->bi_generation = bkey_generation(k); ++ ++ bch2_inode_pack(inode_p, inode_u); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++ bch2_trans_iter_put(trans, iter); ++ return 0; ++} ++ ++int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_inode_generation delete; ++ struct bpos start = POS(inode_nr, 0); ++ struct bpos end = POS(inode_nr + 1, 0); ++ int ret; ++ ++ /* ++ * If this was a directory, there shouldn't be any real dirents left - ++ * but there could be whiteouts (from hash collisions) that we should ++ * delete: ++ * ++ * XXX: the dirent could ideally would delete whiteouts when they're no ++ * longer needed ++ */ ++ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_DIRENTS, ++ start, end, NULL); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ do { ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ u32 bi_generation = 0; ++ ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, ++ "inode %llu not found when deleting", ++ inode_nr); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: { ++ struct 
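
bch2_inode_create() picks a new inode number by scanning forward from a cached hint for the first slot that isn't an inode, and if it runs off the top of the allowed range it retries once from the bottom before giving up with -ENOSPC. The strategy in miniature, over a plain used[] array instead of the btree (toy_* names invented):

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>

static int toy_inode_create(bool *used, uint64_t min, uint64_t max,
			    uint64_t *hint, uint64_t *inum_out)
{
	uint64_t start = (*hint >= min && *hint < max) ? *hint : min;
	uint64_t i = start;
	bool retried = false;
again:
	for (; i < max; i++)
		if (!used[i]) {
			used[i] = true;
			*hint = *inum_out = i;
			return 0;
		}

	if (!retried && start != min) {
		retried = true;		/* exhausted [start, max): retry from min */
		i = min;
		goto again;
	}
	return -ENOSPC;
}
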
bch_inode_unpacked inode_u; ++ ++ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) ++ bi_generation = inode_u.bi_generation + 1; ++ break; ++ } ++ case KEY_TYPE_inode_generation: { ++ struct bkey_s_c_inode_generation g = ++ bkey_s_c_to_inode_generation(k); ++ bi_generation = le32_to_cpu(g.v->bi_generation); ++ break; ++ } ++ } ++ ++ if (!bi_generation) { ++ bkey_init(&delete.k); ++ delete.k.p.offset = inode_nr; ++ } else { ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p.offset = inode_nr; ++ delete.v.bi_generation = cpu_to_le32(bi_generation); ++ } ++ ++ bch2_trans_update(&trans, iter, &delete.k_i, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++ } while (ret == -EINTR); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, ++ POS(0, inode_nr), BTREE_ITER_SLOTS); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = k.k->type == KEY_TYPE_inode ++ ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) ++ : -ENOENT; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_inode_pack_test(void) ++{ ++ struct bch_inode_unpacked *u, test_inodes[] = { ++ { ++ .bi_atime = U64_MAX, ++ .bi_ctime = U64_MAX, ++ .bi_mtime = U64_MAX, ++ .bi_otime = U64_MAX, ++ .bi_size = U64_MAX, ++ .bi_sectors = U64_MAX, ++ .bi_uid = U32_MAX, ++ .bi_gid = U32_MAX, ++ .bi_nlink = U32_MAX, ++ .bi_generation = U32_MAX, ++ .bi_dev = U32_MAX, ++ }, ++ }; ++ ++ for (u = test_inodes; ++ u < test_inodes + ARRAY_SIZE(test_inodes); ++ u++) { ++ struct bkey_inode_buf p; ++ ++ bch2_inode_pack(&p, u); ++ } ++} ++#endif +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +new file mode 100644 +index 000000000000..bb759a46dc41 +--- /dev/null ++++ b/fs/bcachefs/inode.h +@@ -0,0 +1,177 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_INODE_H ++#define _BCACHEFS_INODE_H ++ ++#include "opts.h" ++ ++extern const char * const bch2_inode_opts[]; ++ ++const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++} ++ ++const char *bch2_inode_generation_invalid(const struct bch_fs *, ++ struct bkey_s_c); ++void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_generation_invalid, \ ++ .val_to_text = bch2_inode_generation_to_text, \ ++} ++ ++struct bch_inode_unpacked { ++ u64 bi_inum; ++ __le64 bi_hash_seed; ++ u32 bi_flags; ++ u16 bi_mode; ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_FIELDS() ++#undef x ++}; ++ ++struct bkey_inode_buf { ++ struct bkey_i_inode inode; ++ ++#define x(_name, _bits) + 8 + _bits / 8 ++ u8 _pad[0 + BCH_INODE_FIELDS()]; ++#undef x ++} __attribute__((packed, aligned(8))); ++ ++void bch2_inode_pack(struct 
bkey_inode_buf *, const struct bch_inode_unpacked *); ++int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); ++ ++struct btree_iter *bch2_inode_peek(struct btree_trans *, ++ struct bch_inode_unpacked *, u64, unsigned); ++int bch2_inode_write(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *); ++ ++void bch2_inode_init_early(struct bch_fs *, ++ struct bch_inode_unpacked *); ++void bch2_inode_init_late(struct bch_inode_unpacked *, u64, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++ ++int bch2_inode_create(struct btree_trans *, ++ struct bch_inode_unpacked *, ++ u64, u64, u64 *); ++ ++int bch2_inode_rm(struct bch_fs *, u64); ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, ++ struct bch_inode_unpacked *); ++int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); ++ ++static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts ret = { 0 }; ++ ++#define x(_name, _bits) \ ++ if (inode->bi_##_name) \ ++ opt_set(ret, _name, inode->bi_##_name - 1); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Inode_opt_##_name: \ ++ inode->bi_##_name = v; \ ++ break; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Inode_opt_##_name: \ ++ return inode->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct bch_io_opts ++io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); ++ ++ bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); ++ return opts; ++} ++ ++static inline u8 mode_to_type(umode_t mode) ++{ ++ return (mode >> 12) & 15; ++} ++ ++/* i_nlink: */ ++ ++static inline unsigned nlink_bias(umode_t mode) ++{ ++ return S_ISDIR(mode) ? 2 : 1; ++} ++ ++static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) ++{ ++ if (bi->bi_flags & BCH_INODE_UNLINKED) ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ else ++ bi->bi_nlink++; ++} ++ ++static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) ++{ ++ BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); ++ if (bi->bi_nlink) ++ bi->bi_nlink--; ++ else ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++} ++ ++static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) ++{ ++ return bi->bi_flags & BCH_INODE_UNLINKED ++ ? 
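
mode_to_type() relies on the S_IFMT layout: shifting a mode right by 12 and masking with 15 yields exactly the DT_* value that belongs in a dirent's d_type, which is what check_dirents() compares against. A quick self-check of that claim on Linux (assumes the usual glibc DT_* definitions in <dirent.h>):

#include <assert.h>
#include <dirent.h>
#include <sys/stat.h>

static unsigned char toy_mode_to_type(unsigned mode)
{
	return (mode >> 12) & 15;
}

int main(void)
{
	assert(toy_mode_to_type(S_IFDIR | 0755) == DT_DIR);	/* 4 */
	assert(toy_mode_to_type(S_IFREG | 0644) == DT_REG);	/* 8 */
	assert(toy_mode_to_type(S_IFLNK | 0777) == DT_LNK);	/* 10 */
	return 0;
}
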
0 ++ : bi->bi_nlink + nlink_bias(bi->bi_mode); ++} ++ ++static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, ++ unsigned nlink) ++{ ++ if (nlink) { ++ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ } else { ++ bi->bi_nlink = 0; ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_inode_pack_test(void); ++#else ++static inline void bch2_inode_pack_test(void) {} ++#endif ++ ++#endif /* _BCACHEFS_INODE_H */ +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +new file mode 100644 +index 000000000000..5c9c3cf54edd +--- /dev/null ++++ b/fs/bcachefs/io.c +@@ -0,0 +1,2387 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Some low level IO code, and hacks for various block layer limitations ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "bset.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "compress.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "extent_update.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "rebalance.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++ ++#include ++ ++const char *bch2_blk_status_to_str(blk_status_t status) ++{ ++ if (status == BLK_STS_REMOVED) ++ return "device removed"; ++ return blk_status_to_str(status); ++} ++ ++static bool bch2_target_congested(struct bch_fs *c, u16 target) ++{ ++ const struct bch_devs_mask *devs; ++ unsigned d, nr = 0, total = 0; ++ u64 now = local_clock(), last; ++ s64 congested; ++ struct bch_dev *ca; ++ ++ if (!target) ++ return false; ++ ++ rcu_read_lock(); ++ devs = bch2_target_to_mask(c, target) ?: ++ &c->rw_devs[BCH_DATA_user]; ++ ++ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ++ ca = rcu_dereference(c->devs[d]); ++ if (!ca) ++ continue; ++ ++ congested = atomic_read(&ca->congested); ++ last = READ_ONCE(ca->congested_last); ++ if (time_after64(now, last)) ++ congested -= (now - last) >> 12; ++ ++ total += max(congested, 0LL); ++ nr++; ++ } ++ rcu_read_unlock(); ++ ++ return bch2_rand_range(nr * CONGESTED_MAX) < total; ++} ++ ++static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, ++ u64 now, int rw) ++{ ++ u64 latency_capable = ++ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; ++ /* ideally we'd be taking into account the device's variance here: */ ++ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); ++ s64 latency_over = io_latency - latency_threshold; ++ ++ if (latency_threshold && latency_over > 0) { ++ /* ++ * bump up congested by approximately latency_over * 4 / ++ * latency_threshold - we don't need much accuracy here so don't ++ * bother with the divide: ++ */ ++ if (atomic_read(&ca->congested) < CONGESTED_MAX) ++ atomic_add(latency_over >> ++ max_t(int, ilog2(latency_threshold) - 2, 0), ++ &ca->congested); ++ ++ ca->congested_last = now; ++ } else if (atomic_read(&ca->congested) > 0) { ++ atomic_dec(&ca->congested); ++ } ++} ++ ++void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) ++{ ++ atomic64_t *latency = &ca->cur_latency[rw]; ++ u64 now = local_clock(); ++ u64 io_latency = time_after64(now, submit_time) ++ ? 
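
The congestion bump in bch2_congested_acct() wants roughly latency_over * 4 / latency_threshold but skips the division, shifting by ilog2(latency_threshold) - 2 instead; that rounds the threshold down to a power of two, so the result can overshoot by up to 2x, which is fine for a heuristic. A small standalone comparison of the exact and shifted forms (toy numbers, home-grown ilog2 for portability):

#include <stdint.h>
#include <stdio.h>

static int toy_ilog2(uint64_t v)
{
	int r = -1;

	while (v) {
		v >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	uint64_t threshold = 3000, over = 10000;
	int shift = toy_ilog2(threshold) - 2;			/* ilog2(3000) = 11 -> 9 */
	uint64_t exact  = over * 4 / threshold;			/* 13 */
	uint64_t approx = over >> (shift > 0 ? shift : 0);	/* 10000 >> 9 = 19 */

	printf("exact %llu, shifted approximation %llu\n",
	       (unsigned long long) exact, (unsigned long long) approx);
	return 0;
}
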
now - submit_time ++ : 0; ++ u64 old, new, v = atomic64_read(latency); ++ ++ do { ++ old = v; ++ ++ /* ++ * If the io latency was reasonably close to the current ++ * latency, skip doing the update and atomic operation - most of ++ * the time: ++ */ ++ if (abs((int) (old - io_latency)) < (old >> 1) && ++ now & ~(~0 << 5)) ++ break; ++ ++ new = ewma_add(old, io_latency, 5); ++ } while ((v = atomic64_cmpxchg(latency, old, new)) != old); ++ ++ bch2_congested_acct(ca, io_latency, now, rw); ++ ++ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); ++} ++ ++/* Allocate, free from mempool: */ ++ ++void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) ++ if (bv->bv_page != ZERO_PAGE(0)) ++ mempool_free(bv->bv_page, &c->bio_bounce_pages); ++ bio->bi_vcnt = 0; ++} ++ ++static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) ++{ ++ struct page *page; ++ ++ if (likely(!*using_mempool)) { ++ page = alloc_page(GFP_NOIO); ++ if (unlikely(!page)) { ++ mutex_lock(&c->bio_bounce_pages_lock); ++ *using_mempool = true; ++ goto pool_alloc; ++ ++ } ++ } else { ++pool_alloc: ++ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); ++ } ++ ++ return page; ++} ++ ++void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, ++ size_t size) ++{ ++ bool using_mempool = false; ++ ++ while (size) { ++ struct page *page = __bio_alloc_page_pool(c, &using_mempool); ++ unsigned len = min(PAGE_SIZE, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, 0)); ++ size -= len; ++ } ++ ++ if (using_mempool) ++ mutex_unlock(&c->bio_bounce_pages_lock); ++} ++ ++/* Extent update path: */ ++ ++static int sum_sector_overwrites(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct bkey_i *new, ++ bool may_allocate, ++ bool *maybe_extending, ++ s64 *delta) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c old; ++ int ret = 0; ++ ++ *maybe_extending = true; ++ *delta = 0; ++ ++ iter = bch2_trans_copy_iter(trans, extent_iter); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { ++ if (!may_allocate && ++ bch2_bkey_nr_ptrs_fully_allocated(old) < ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { ++ ret = -ENOSPC; ++ break; ++ } ++ ++ *delta += (min(new->k.p.offset, ++ old.k->p.offset) - ++ max(bkey_start_offset(&new->k), ++ bkey_start_offset(old.k))) * ++ (bkey_extent_is_allocation(&new->k) - ++ bkey_extent_is_allocation(old.k)); ++ ++ if (bkey_cmp(old.k->p, new->k.p) >= 0) { ++ /* ++ * Check if there's already data above where we're ++ * going to be writing to - this means we're definitely ++ * not extending the file: ++ * ++ * Note that it's not sufficient to check if there's ++ * data up to the sector offset we're going to be ++ * writing to, because i_size could be up to one block ++ * less: ++ */ ++ if (!bkey_cmp(old.k->p, new->k.p)) ++ old = bch2_btree_iter_next(iter); ++ ++ if (old.k && !bkey_err(old) && ++ old.k->p.inode == extent_iter->pos.inode && ++ bkey_extent_is_data(old.k)) ++ *maybe_extending = false; ++ ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int bch2_extent_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ u64 new_i_size, ++ s64 *i_sectors_delta) ++{ ++ /* this must live until after bch2_trans_commit(): */ ++ struct bkey_inode_buf inode_p; ++ bool 
extending = false; ++ s64 delta = 0; ++ int ret; ++ ++ ret = bch2_extent_trim_atomic(k, iter); ++ if (ret) ++ return ret; ++ ++ ret = sum_sector_overwrites(trans, iter, k, ++ disk_res && disk_res->sectors != 0, ++ &extending, &delta); ++ if (ret) ++ return ret; ++ ++ new_i_size = extending ++ ? min(k->k.p.offset << 9, new_i_size) ++ : 0; ++ ++ if (delta || new_i_size) { ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ ++ inode_iter = bch2_inode_peek(trans, &inode_u, ++ k->k.p.inode, BTREE_ITER_INTENT); ++ if (IS_ERR(inode_iter)) ++ return PTR_ERR(inode_iter); ++ ++ /* ++ * XXX: ++ * writeback can race a bit with truncate, because truncate ++ * first updates the inode then truncates the pagecache. This is ++ * ugly, but lets us preserve the invariant that the in memory ++ * i_size is always >= the on disk i_size. ++ * ++ BUG_ON(new_i_size > inode_u.bi_size && ++ (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); ++ */ ++ BUG_ON(new_i_size > inode_u.bi_size && !extending); ++ ++ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ new_i_size > inode_u.bi_size) ++ inode_u.bi_size = new_i_size; ++ else ++ new_i_size = 0; ++ ++ inode_u.bi_sectors += delta; ++ ++ if (delta || new_i_size) { ++ bch2_inode_pack(&inode_p, &inode_u); ++ bch2_trans_update(trans, inode_iter, ++ &inode_p.inode.k_i, 0); ++ } ++ ++ bch2_trans_iter_put(trans, inode_iter); ++ } ++ ++ bch2_trans_update(trans, iter, k, 0); ++ ++ ret = bch2_trans_commit(trans, disk_res, journal_seq, ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE); ++ if (!ret && i_sectors_delta) ++ *i_sectors_delta += delta; ++ ++ return ret; ++} ++ ++int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos end, u64 *journal_seq, ++ s64 *i_sectors_delta) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); ++ struct bkey_s_c k; ++ int ret = 0, ret2 = 0; ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ bkey_cmp(iter->pos, end) < 0) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i delete; ++ ++ bch2_trans_begin(trans); ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto btree_err; ++ ++ bkey_init(&delete.k); ++ delete.k.p = iter->pos; ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete); ++ ++ ret = bch2_extent_update(trans, iter, &delete, ++ &disk_res, journal_seq, ++ 0, i_sectors_delta); ++ bch2_disk_reservation_put(c, &disk_res); ++btree_err: ++ if (ret == -EINTR) { ++ ret2 = ret; ++ ret = 0; ++ } ++ if (ret) ++ break; ++ } ++ ++ if (bkey_cmp(iter->pos, end) > 0) { ++ bch2_btree_iter_set_pos(iter, end); ++ ret = bch2_btree_iter_traverse(iter); ++ } ++ ++ return ret ?: ret2; ++} ++ ++int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, ++ u64 *journal_seq, s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inum, start), ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_fpunch_at(&trans, iter, POS(inum, end), ++ journal_seq, i_sectors_delta); ++ bch2_trans_exit(&trans); ++ ++ if (ret == -EINTR) ++ ret = 0; ++ ++ return ret; ++} ++ ++int bch2_write_index_default(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_on_stack sk; ++ struct keylist *keys = &op->insert_keys; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct btree_trans 
trans; ++ struct btree_iter *iter; ++ int ret; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ++ k = bch2_keylist_front(keys); ++ ++ bkey_on_stack_realloc(&sk, c, k->k.u64s); ++ bkey_copy(sk.k, k); ++ bch2_cut_front(iter->pos, sk.k); ++ ++ ret = bch2_extent_update(&trans, iter, sk.k, ++ &op->res, op_journal_seq(op), ++ op->new_i_size, &op->i_sectors_delta); ++ if (ret == -EINTR) ++ continue; ++ if (ret) ++ break; ++ ++ if (bkey_cmp(iter->pos, k->k.p) >= 0) ++ bch2_keylist_pop_front(keys); ++ } while (!bch2_keylist_empty(keys)); ++ ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ ++ return ret; ++} ++ ++/* Writes */ ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, ++ enum bch_data_type type, ++ const struct bkey_i *k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); ++ const struct bch_extent_ptr *ptr; ++ struct bch_write_bio *n; ++ struct bch_dev *ca; ++ ++ BUG_ON(c->opts.nochanges); ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || ++ !c->devs[ptr->dev]); ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (to_entry(ptr + 1) < ptrs.end) { ++ n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, ++ &ca->replica_set)); ++ ++ n->bio.bi_end_io = wbio->bio.bi_end_io; ++ n->bio.bi_private = wbio->bio.bi_private; ++ n->parent = wbio; ++ n->split = true; ++ n->bounce = false; ++ n->put_bio = true; ++ n->bio.bi_opf = wbio->bio.bi_opf; ++ bio_inc_remaining(&wbio->bio); ++ } else { ++ n = wbio; ++ n->split = false; ++ } ++ ++ n->c = c; ++ n->dev = ptr->dev; ++ n->have_ioref = bch2_dev_get_ioref(ca, ++ type == BCH_DATA_btree ? 
READ : WRITE); ++ n->submit_time = local_clock(); ++ n->bio.bi_iter.bi_sector = ptr->offset; ++ ++ if (!journal_flushes_device(ca)) ++ n->bio.bi_opf |= REQ_FUA; ++ ++ if (likely(n->have_ioref)) { ++ this_cpu_add(ca->io_done->sectors[WRITE][type], ++ bio_sectors(&n->bio)); ++ ++ bio_set_dev(&n->bio, ca->disk_sb.bdev); ++ submit_bio(&n->bio); ++ } else { ++ n->bio.bi_status = BLK_STS_REMOVED; ++ bio_endio(&n->bio); ++ } ++ } ++} ++ ++static void __bch2_write(struct closure *); ++ ++static void bch2_write_done(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) ++ op->error = bch2_journal_error(&c->journal); ++ ++ bch2_disk_reservation_put(c, &op->res); ++ percpu_ref_put(&c->writes); ++ bch2_keylist_free(&op->insert_keys, op->inline_keys); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); ++ ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ up(&c->io_in_flight); ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); ++ op->end_io(op); ++ } else { ++ closure_return(cl); ++ } ++} ++ ++/** ++ * bch_write_index - after a write, update index to point to new data ++ */ ++static void __bch2_write_index(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct keylist *keys = &op->insert_keys; ++ struct bch_extent_ptr *ptr; ++ struct bkey_i *src, *dst = keys->keys, *n, *k; ++ unsigned dev; ++ int ret; ++ ++ for (src = keys->keys; src != keys->top; src = n) { ++ n = bkey_next(src); ++ ++ if (bkey_extent_is_direct_data(&src->k)) { ++ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, ++ test_bit(ptr->dev, op->failed.d)); ++ ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { ++ ret = -EIO; ++ goto err; ++ } ++ } ++ ++ if (dst != src) ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ ++ keys->top = dst; ++ ++ /* ++ * probably not the ideal place to hook this in, but I don't ++ * particularly want to plumb io_opts all the way through the btree ++ * update stack right now ++ */ ++ for_each_keylist_key(keys, k) { ++ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); ++ ++ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) ++ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); ++ ++ } ++ ++ if (!bch2_keylist_empty(keys)) { ++ u64 sectors_start = keylist_sectors(keys); ++ int ret = op->index_update_fn(op); ++ ++ BUG_ON(ret == -EINTR); ++ BUG_ON(keylist_sectors(keys) && !ret); ++ ++ op->written += sectors_start - keylist_sectors(keys); ++ ++ if (ret) { ++ __bcache_io_error(c, "btree IO error %i", ret); ++ op->error = ret; ++ } ++ } ++out: ++ /* If some a bucket wasn't written, we can't erasure code it: */ ++ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) ++ bch2_open_bucket_write_error(c, &op->open_buckets, dev); ++ ++ bch2_open_buckets_put(c, &op->open_buckets); ++ return; ++err: ++ keys->top = keys->keys; ++ op->error = ret; ++ goto out; ++} ++ ++static void bch2_write_index(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ __bch2_write_index(op); ++ ++ if (!(op->flags & BCH_WRITE_DONE)) { ++ continue_at(cl, __bch2_write, index_update_wq(op)); ++ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { ++ bch2_journal_flush_seq_async(&c->journal, ++ *op_journal_seq(op), ++ cl); ++ continue_at(cl, bch2_write_done, index_update_wq(op)); ++ } else { ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ } 
++} ++ ++static void bch2_write_endio(struct bio *bio) ++{ ++ struct closure *cl = bio->bi_private; ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; ++ struct bch_fs *c = wbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", ++ bch2_blk_status_to_str(bio->bi_status))) ++ set_bit(wbio->dev, op->failed.d); ++ ++ if (wbio->have_ioref) { ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (wbio->bounce) ++ bch2_bio_free_pages_pool(c, bio); ++ ++ if (wbio->put_bio) ++ bio_put(bio); ++ ++ if (parent) ++ bio_endio(&parent->bio); ++ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) ++ closure_put(cl); ++ else ++ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); ++} ++ ++static void init_append_extent(struct bch_write_op *op, ++ struct write_point *wp, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_i_extent *e; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ BUG_ON(crc.compressed_size > wp->sectors_free); ++ wp->sectors_free -= crc.compressed_size; ++ op->pos.offset += crc.uncompressed_size; ++ ++ e = bkey_extent_init(op->insert_keys.top); ++ e->k.p = op->pos; ++ e->k.size = crc.uncompressed_size; ++ e->k.version = version; ++ ++ if (crc.csum_type || ++ crc.compression_type || ++ crc.nonce) ++ bch2_extent_crc_append(&e->k_i, crc); ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ union bch_extent_entry *end = ++ bkey_val_end(bkey_i_to_s(&e->k_i)); ++ ++ end->ptr = ob->ptr; ++ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ end->ptr.cached = !ca->mi.durability || ++ (op->flags & BCH_WRITE_CACHED) != 0; ++ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; ++ ++ e->k.u64s++; ++ ++ BUG_ON(crc.compressed_size > ob->sectors_free); ++ ob->sectors_free -= crc.compressed_size; ++ } ++ ++ bch2_keylist_push(&op->insert_keys); ++} ++ ++static struct bio *bch2_write_bio_alloc(struct bch_fs *c, ++ struct write_point *wp, ++ struct bio *src, ++ bool *page_alloc_failed, ++ void *buf) ++{ ++ struct bch_write_bio *wbio; ++ struct bio *bio; ++ unsigned output_available = ++ min(wp->sectors_free << 9, src->bi_iter.bi_size); ++ unsigned pages = DIV_ROUND_UP(output_available + ++ (buf ++ ? 
((unsigned long) buf & (PAGE_SIZE - 1)) ++ : 0), PAGE_SIZE); ++ ++ bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); ++ wbio = wbio_init(bio); ++ wbio->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ wbio->bio.bi_opf = src->bi_opf; ++ ++ if (buf) { ++ bch2_bio_map(bio, buf, output_available); ++ return bio; ++ } ++ ++ wbio->bounce = true; ++ ++ /* ++ * We can't use mempool for more than c->sb.encoded_extent_max ++ * worth of pages, but we'd like to allocate more if we can: ++ */ ++ bch2_bio_alloc_pages_pool(c, bio, ++ min_t(unsigned, output_available, ++ c->sb.encoded_extent_max << 9)); ++ ++ if (bio->bi_iter.bi_size < output_available) ++ *page_alloc_failed = ++ bch2_bio_alloc_pages(bio, ++ output_available - ++ bio->bi_iter.bi_size, ++ GFP_NOFS) != 0; ++ ++ return bio; ++} ++ ++static int bch2_write_rechecksum(struct bch_fs *c, ++ struct bch_write_op *op, ++ unsigned new_csum_type) ++{ ++ struct bio *bio = &op->wbio.bio; ++ struct bch_extent_crc_unpacked new_crc; ++ int ret; ++ ++ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ ++ ++ if (bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)) ++ new_csum_type = op->crc.csum_type; ++ ++ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, ++ NULL, &new_crc, ++ op->crc.offset, op->crc.live_size, ++ new_csum_type); ++ if (ret) ++ return ret; ++ ++ bio_advance(bio, op->crc.offset << 9); ++ bio->bi_iter.bi_size = op->crc.live_size << 9; ++ op->crc = new_crc; ++ return 0; ++} ++ ++static int bch2_write_decrypt(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct nonce nonce = extent_nonce(op->version, op->crc); ++ struct bch_csum csum; ++ ++ if (!bch2_csum_type_is_encryption(op->crc.csum_type)) ++ return 0; ++ ++ /* ++ * If we need to decrypt data in the write path, we'll no longer be able ++ * to verify the existing checksum (poly1305 mac, in this case) after ++ * it's decrypted - this is the last point we'll be able to reverify the ++ * checksum: ++ */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return -EIO; ++ ++ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ op->crc.csum_type = 0; ++ op->crc.csum = (struct bch_csum) { 0, 0 }; ++ return 0; ++} ++ ++static enum prep_encoded_ret { ++ PREP_ENCODED_OK, ++ PREP_ENCODED_ERR, ++ PREP_ENCODED_CHECKSUM_ERR, ++ PREP_ENCODED_DO_WRITE, ++} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *bio = &op->wbio.bio; ++ ++ if (!(op->flags & BCH_WRITE_DATA_ENCODED)) ++ return PREP_ENCODED_OK; ++ ++ BUG_ON(bio_sectors(bio) != op->crc.compressed_size); ++ ++ /* Can we just write the entire extent as is? 
*/ ++ if (op->crc.uncompressed_size == op->crc.live_size && ++ op->crc.compressed_size <= wp->sectors_free && ++ (op->crc.compression_type == op->compression_type || ++ op->incompressible)) { ++ if (!crc_is_compressed(op->crc) && ++ op->csum_type != op->crc.csum_type && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_DO_WRITE; ++ } ++ ++ /* ++ * If the data is compressed and we couldn't write the entire extent as ++ * is, we have to decompress it: ++ */ ++ if (crc_is_compressed(op->crc)) { ++ struct bch_csum csum; ++ ++ if (bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* Last point we can still verify checksum: */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, ++ extent_nonce(op->version, op->crc), ++ bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) ++ return PREP_ENCODED_ERR; ++ } ++ ++ /* ++ * No longer have compressed data after this point - data might be ++ * encrypted: ++ */ ++ ++ /* ++ * If the data is checksummed and we're only writing a subset, ++ * rechecksum and adjust bio to point to currently live data: ++ */ ++ if ((op->crc.live_size != op->crc.uncompressed_size || ++ op->crc.csum_type != op->csum_type) && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* ++ * If we want to compress the data, it has to be decrypted: ++ */ ++ if ((op->compression_type || ++ bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(op->csum_type)) && ++ bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_OK; ++} ++ ++static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ++ struct bio **_dst) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *src = &op->wbio.bio, *dst = src; ++ struct bvec_iter saved_iter; ++ void *ec_buf; ++ struct bpos ec_pos = op->pos; ++ unsigned total_output = 0, total_input = 0; ++ bool bounce = false; ++ bool page_alloc_failed = false; ++ int ret, more = 0; ++ ++ BUG_ON(!bio_sectors(src)); ++ ++ ec_buf = bch2_writepoint_ec_buf(c, wp); ++ ++ switch (bch2_write_prep_encoded_data(op, wp)) { ++ case PREP_ENCODED_OK: ++ break; ++ case PREP_ENCODED_ERR: ++ ret = -EIO; ++ goto err; ++ case PREP_ENCODED_CHECKSUM_ERR: ++ BUG(); ++ goto csum_err; ++ case PREP_ENCODED_DO_WRITE: ++ /* XXX look for bug here */ ++ if (ec_buf) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bio_copy_data(dst, src); ++ bounce = true; ++ } ++ init_append_extent(op, wp, op->version, op->crc); ++ goto do_write; ++ } ++ ++ if (ec_buf || ++ op->compression_type || ++ (op->csum_type && ++ !(op->flags & BCH_WRITE_PAGES_STABLE)) || ++ (bch2_csum_type_is_encryption(op->csum_type) && ++ !(op->flags & BCH_WRITE_PAGES_OWNED))) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bounce = true; ++ } ++ ++ saved_iter = dst->bi_iter; ++ ++ do { ++ struct bch_extent_crc_unpacked crc = ++ (struct bch_extent_crc_unpacked) { 0 }; ++ struct bversion version = op->version; ++ size_t dst_len, src_len; ++ ++ if (page_alloc_failed && ++ bio_sectors(dst) < wp->sectors_free && ++ bio_sectors(dst) < c->sb.encoded_extent_max) ++ break; ++ ++ BUG_ON(op->compression_type && ++ (op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_csum_type_is_encryption(op->crc.csum_type)); ++ BUG_ON(op->compression_type && !bounce); ++ ++ crc.compression_type = op->incompressible ++ ? 
BCH_COMPRESSION_TYPE_incompressible ++ : op->compression_type ++ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, ++ op->compression_type) ++ : 0; ++ if (!crc_is_compressed(crc)) { ++ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); ++ ++ if (op->csum_type) ++ dst_len = min_t(unsigned, dst_len, ++ c->sb.encoded_extent_max << 9); ++ ++ if (bounce) { ++ swap(dst->bi_iter.bi_size, dst_len); ++ bio_copy_data(dst, src); ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ src_len = dst_len; ++ } ++ ++ BUG_ON(!src_len || !dst_len); ++ ++ if (bch2_csum_type_is_encryption(op->csum_type)) { ++ if (bversion_zero(version)) { ++ version.lo = atomic64_inc_return(&c->key_version); ++ } else { ++ crc.nonce = op->nonce; ++ op->nonce += src_len >> 9; ++ } ++ } ++ ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ !crc_is_compressed(crc) && ++ bch2_csum_type_is_encryption(op->crc.csum_type) == ++ bch2_csum_type_is_encryption(op->csum_type)) { ++ /* ++ * Note: when we're using rechecksum(), we need to be ++ * checksumming @src because it has all the data our ++ * existing checksum covers - if we bounced (because we ++ * were trying to compress), @dst will only have the ++ * part of the data the new checksum will cover. ++ * ++ * But normally we want to be checksumming post bounce, ++ * because part of the reason for bouncing is so the ++ * data can't be modified (by userspace) while it's in ++ * flight. ++ */ ++ if (bch2_rechecksum_bio(c, src, version, op->crc, ++ &crc, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->csum_type)) ++ goto csum_err; ++ } else { ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_rechecksum_bio(c, src, version, op->crc, ++ NULL, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->crc.csum_type)) ++ goto csum_err; ++ ++ crc.compressed_size = dst_len >> 9; ++ crc.uncompressed_size = src_len >> 9; ++ crc.live_size = src_len >> 9; ++ ++ swap(dst->bi_iter.bi_size, dst_len); ++ bch2_encrypt_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ crc.csum = bch2_checksum_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ crc.csum_type = op->csum_type; ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ init_append_extent(op, wp, version, crc); ++ ++ if (dst != src) ++ bio_advance(dst, dst_len); ++ bio_advance(src, src_len); ++ total_output += dst_len; ++ total_input += src_len; ++ } while (dst->bi_iter.bi_size && ++ src->bi_iter.bi_size && ++ wp->sectors_free && ++ !bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)); ++ ++ more = src->bi_iter.bi_size != 0; ++ ++ dst->bi_iter = saved_iter; ++ ++ if (dst == src && more) { ++ BUG_ON(total_output != total_input); ++ ++ dst = bio_split(src, total_input >> 9, ++ GFP_NOIO, &c->bio_write); ++ wbio_init(dst)->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ dst->bi_opf = src->bi_opf; ++ } ++ ++ dst->bi_iter.bi_size = total_output; ++do_write: ++ /* might have done a realloc... 
*/ ++ bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); ++ ++ *_dst = dst; ++ return more; ++csum_err: ++ bch_err(c, "error verifying existing checksum while " ++ "rewriting existing data (memory corruption?)"); ++ ret = -EIO; ++err: ++ if (to_wbio(dst)->bounce) ++ bch2_bio_free_pages_pool(c, dst); ++ if (to_wbio(dst)->put_bio) ++ bio_put(dst); ++ ++ return ret; ++} ++ ++static void __bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ struct write_point *wp; ++ struct bio *bio; ++ bool skip_put = true; ++ unsigned nofs_flags; ++ int ret; ++ ++ nofs_flags = memalloc_nofs_save(); ++again: ++ memset(&op->failed, 0, sizeof(op->failed)); ++ ++ do { ++ struct bkey_i *key_to_write; ++ unsigned key_to_write_offset = op->insert_keys.top_p - ++ op->insert_keys.keys_p; ++ ++ /* +1 for possible cache device: */ ++ if (op->open_buckets.nr + op->nr_replicas + 1 > ++ ARRAY_SIZE(op->open_buckets.v)) ++ goto flush_io; ++ ++ if (bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)) ++ goto flush_io; ++ ++ if ((op->flags & BCH_WRITE_FROM_INTERNAL) && ++ percpu_ref_is_dying(&c->writes)) { ++ ret = -EROFS; ++ goto err; ++ } ++ ++ /* ++ * The copygc thread is now global, which means it's no longer ++ * freeing up space on specific disks, which means that ++ * allocations for specific disks may hang arbitrarily long: ++ */ ++ wp = bch2_alloc_sectors_start(c, ++ op->target, ++ op->opts.erasure_code, ++ op->write_point, ++ &op->devs_have, ++ op->nr_replicas, ++ op->nr_replicas_required, ++ op->alloc_reserve, ++ op->flags, ++ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| ++ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl); ++ EBUG_ON(!wp); ++ ++ if (unlikely(IS_ERR(wp))) { ++ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { ++ ret = PTR_ERR(wp); ++ goto err; ++ } ++ ++ goto flush_io; ++ } ++ ++ /* ++ * It's possible for the allocator to fail, put us on the ++ * freelist waitlist, and then succeed in one of various retry ++ * paths: if that happens, we need to disable the skip_put ++ * optimization because otherwise there won't necessarily be a ++ * barrier before we free the bch_write_op: ++ */ ++ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) ++ skip_put = false; ++ ++ bch2_open_bucket_get(c, wp, &op->open_buckets); ++ ret = bch2_write_extent(op, wp, &bio); ++ bch2_alloc_sectors_done(c, wp); ++ ++ if (ret < 0) ++ goto err; ++ ++ if (ret) { ++ skip_put = false; ++ } else { ++ /* ++ * for the skip_put optimization this has to be set ++ * before we submit the bio: ++ */ ++ op->flags |= BCH_WRITE_DONE; ++ } ++ ++ bio->bi_end_io = bch2_write_endio; ++ bio->bi_private = &op->cl; ++ bio->bi_opf |= REQ_OP_WRITE; ++ ++ if (!skip_put) ++ closure_get(bio->bi_private); ++ else ++ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; ++ ++ key_to_write = (void *) (op->insert_keys.keys_p + ++ key_to_write_offset); ++ ++ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, ++ key_to_write); ++ } while (ret); ++ ++ if (!skip_put) ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++out: ++ memalloc_nofs_restore(nofs_flags); ++ return; ++err: ++ op->error = ret; ++ op->flags |= BCH_WRITE_DONE; ++ ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ goto out; ++flush_io: ++ /* ++ * If the write can't all be submitted at once, we generally want to ++ * block synchronously as that signals backpressure to the caller. 
++ * ++ * However, if we're running out of a workqueue, we can't block here ++ * because we'll be blocking other work items from completing: ++ */ ++ if (current->flags & PF_WQ_WORKER) { ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ goto out; ++ } ++ ++ closure_sync(cl); ++ ++ if (!bch2_keylist_empty(&op->insert_keys)) { ++ __bch2_write_index(op); ++ ++ if (op->error) { ++ op->flags |= BCH_WRITE_DONE; ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ goto out; ++ } ++ } ++ ++ goto again; ++} ++ ++static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) ++{ ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->wbio.bio; ++ struct bvec_iter iter; ++ struct bkey_i_inline_data *id; ++ unsigned sectors; ++ int ret; ++ ++ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ++ ++ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_U64s + DIV_ROUND_UP(data_len, 8)); ++ if (ret) { ++ op->error = ret; ++ goto err; ++ } ++ ++ sectors = bio_sectors(bio); ++ op->pos.offset += sectors; ++ ++ id = bkey_inline_data_init(op->insert_keys.top); ++ id->k.p = op->pos; ++ id->k.version = op->version; ++ id->k.size = sectors; ++ ++ iter = bio->bi_iter; ++ iter.bi_size = data_len; ++ memcpy_from_bio(id->v.data, bio, iter); ++ ++ while (data_len & 7) ++ id->v.data[data_len++] = '\0'; ++ set_bkey_val_bytes(&id->k, data_len); ++ bch2_keylist_push(&op->insert_keys); ++ ++ op->flags |= BCH_WRITE_WROTE_DATA_INLINE; ++ op->flags |= BCH_WRITE_DONE; ++ ++ continue_at_nobarrier(cl, bch2_write_index, NULL); ++ return; ++err: ++ bch2_write_done(&op->cl); ++} ++ ++/** ++ * bch_write - handle a write to a cache device or flash only volume ++ * ++ * This is the starting point for any data to end up in a cache device; it could ++ * be from a normal write, or a writeback write, or a write to a flash only ++ * volume - it's also used by the moving garbage collector to compact data in ++ * mostly empty buckets. ++ * ++ * It first writes the data to the cache, creating a list of keys to be inserted ++ * (if the data won't fit in a single open bucket, there will be multiple keys); ++ * after the data is written it calls bch_journal, and after the keys have been ++ * added to the next journal write they're inserted into the btree. ++ * ++ * If op->discard is true, instead of inserting the data it invalidates the ++ * region of the cache represented by op->bio and op->inode. 
++ */ ++void bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bio *bio = &op->wbio.bio; ++ struct bch_fs *c = op->c; ++ unsigned data_len; ++ ++ BUG_ON(!op->nr_replicas); ++ BUG_ON(!op->write_point.v); ++ BUG_ON(!bkey_cmp(op->pos, POS_MAX)); ++ ++ op->start_time = local_clock(); ++ bch2_keylist_init(&op->insert_keys, op->inline_keys); ++ wbio_init(bio)->put_bio = false; ++ ++ if (bio_sectors(bio) & (c->opts.block_size - 1)) { ++ __bcache_io_error(c, "misaligned write"); ++ op->error = -EIO; ++ goto err; ++ } ++ ++ if (c->opts.nochanges || ++ !percpu_ref_tryget(&c->writes)) { ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ __bcache_io_error(c, "read only"); ++ op->error = -EROFS; ++ goto err; ++ } ++ ++ /* ++ * Can't ratelimit copygc - we'd deadlock: ++ */ ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ down(&c->io_in_flight); ++ ++ bch2_increment_clock(c, bio_sectors(bio), WRITE); ++ ++ data_len = min_t(u64, bio->bi_iter.bi_size, ++ op->new_i_size - (op->pos.offset << 9)); ++ ++ if (c->opts.inline_data && ++ data_len <= min(block_bytes(c) / 2, 1024U)) { ++ bch2_write_data_inline(op, data_len); ++ return; ++ } ++ ++ continue_at_nobarrier(cl, __bch2_write, NULL); ++ return; ++err: ++ bch2_disk_reservation_put(c, &op->res); ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); ++ op->end_io(op); ++ } else { ++ closure_return(cl); ++ } ++} ++ ++/* Cache promotion on read */ ++ ++struct promote_op { ++ struct closure cl; ++ struct rcu_head rcu; ++ u64 start_time; ++ ++ struct rhash_head hash; ++ struct bpos pos; ++ ++ struct migrate_write write; ++ struct bio_vec bi_inline_vecs[0]; /* must be last */ ++}; ++ ++static const struct rhashtable_params bch_promote_params = { ++ .head_offset = offsetof(struct promote_op, hash), ++ .key_offset = offsetof(struct promote_op, pos), ++ .key_len = sizeof(struct bpos), ++}; ++ ++static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, ++ struct bpos pos, ++ struct bch_io_opts opts, ++ unsigned flags) ++{ ++ if (!(flags & BCH_READ_MAY_PROMOTE)) ++ return false; ++ ++ if (!opts.promote_target) ++ return false; ++ ++ if (bch2_bkey_has_target(c, k, opts.promote_target)) ++ return false; ++ ++ if (bch2_target_congested(c, opts.promote_target)) { ++ /* XXX trace this */ ++ return false; ++ } ++ ++ if (rhashtable_lookup_fast(&c->promote_table, &pos, ++ bch_promote_params)) ++ return false; ++ ++ return true; ++} ++ ++static void promote_free(struct bch_fs *c, struct promote_op *op) ++{ ++ int ret; ++ ++ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params); ++ BUG_ON(ret); ++ percpu_ref_put(&c->writes); ++ kfree_rcu(op, rcu); ++} ++ ++static void promote_done(struct closure *cl) ++{ ++ struct promote_op *op = ++ container_of(cl, struct promote_op, cl); ++ struct bch_fs *c = op->write.op.c; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], ++ op->start_time); ++ ++ bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); ++ promote_free(c, op); ++} ++ ++static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) ++{ ++ struct bch_fs *c = rbio->c; ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->write.op.wbio.bio; ++ ++ trace_promote(&rbio->bio); ++ ++ /* we now own pages: */ ++ BUG_ON(!rbio->bounce); ++ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); ++ ++ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, ++ sizeof(struct bio_vec) * rbio->bio.bi_vcnt); ++ swap(bio->bi_vcnt, rbio->bio.bi_vcnt); ++ ++ 
bch2_migrate_read_done(&op->write, rbio); ++ ++ closure_init(cl, NULL); ++ closure_call(&op->write.op.cl, bch2_write, c->wq, cl); ++ closure_return_with_destructor(cl, promote_done); ++} ++ ++static struct promote_op *__promote_alloc(struct bch_fs *c, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ struct bpos pos, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned sectors, ++ struct bch_read_bio **rbio) ++{ ++ struct promote_op *op = NULL; ++ struct bio *bio; ++ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ int ret; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return NULL; ++ ++ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); ++ if (!op) ++ goto err; ++ ++ op->start_time = local_clock(); ++ op->pos = pos; ++ ++ /* ++ * We don't use the mempool here because extents that aren't ++ * checksummed or compressed can be too big for the mempool: ++ */ ++ *rbio = kzalloc(sizeof(struct bch_read_bio) + ++ sizeof(struct bio_vec) * pages, ++ GFP_NOIO); ++ if (!*rbio) ++ goto err; ++ ++ rbio_init(&(*rbio)->bio, opts); ++ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); ++ ++ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, ++ GFP_NOIO)) ++ goto err; ++ ++ (*rbio)->bounce = true; ++ (*rbio)->split = true; ++ (*rbio)->kmalloc = true; ++ ++ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, ++ bch_promote_params)) ++ goto err; ++ ++ bio = &op->write.op.wbio.bio; ++ bio_init(bio, bio->bi_inline_vecs, pages); ++ ++ ret = bch2_migrate_write_init(c, &op->write, ++ writepoint_hashed((unsigned long) current), ++ opts, ++ DATA_PROMOTE, ++ (struct data_opts) { ++ .target = opts.promote_target ++ }, ++ btree_id, k); ++ BUG_ON(ret); ++ ++ return op; ++err: ++ if (*rbio) ++ bio_free_pages(&(*rbio)->bio); ++ kfree(*rbio); ++ *rbio = NULL; ++ kfree(op); ++ percpu_ref_put(&c->writes); ++ return NULL; ++} ++ ++noinline ++static struct promote_op *promote_alloc(struct bch_fs *c, ++ struct bvec_iter iter, ++ struct bkey_s_c k, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned flags, ++ struct bch_read_bio **rbio, ++ bool *bounce, ++ bool *read_full) ++{ ++ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); ++ /* data might have to be decompressed in the write path: */ ++ unsigned sectors = promote_full ++ ? max(pick->crc.compressed_size, pick->crc.live_size) ++ : bvec_iter_sectors(iter); ++ struct bpos pos = promote_full ++ ? bkey_start_pos(k.k) ++ : POS(k.k->p.inode, iter.bi_sector); ++ struct promote_op *promote; ++ ++ if (!should_promote(c, k, pos, opts, flags)) ++ return NULL; ++ ++ promote = __promote_alloc(c, ++ k.k->type == KEY_TYPE_reflink_v ++ ? BTREE_ID_REFLINK ++ : BTREE_ID_EXTENTS, ++ k, pos, pick, opts, sectors, rbio); ++ if (!promote) ++ return NULL; ++ ++ *bounce = true; ++ *read_full = promote_full; ++ return promote; ++} ++ ++/* Read */ ++ ++#define READ_RETRY_AVOID 1 ++#define READ_RETRY 2 ++#define READ_ERR 3 ++ ++enum rbio_context { ++ RBIO_CONTEXT_NULL, ++ RBIO_CONTEXT_HIGHPRI, ++ RBIO_CONTEXT_UNBOUND, ++}; ++ ++static inline struct bch_read_bio * ++bch2_rbio_parent(struct bch_read_bio *rbio) ++{ ++ return rbio->split ? 
rbio->parent : rbio; ++} ++ ++__always_inline ++static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, ++ enum rbio_context context, ++ struct workqueue_struct *wq) ++{ ++ if (context <= rbio->context) { ++ fn(&rbio->work); ++ } else { ++ rbio->work.func = fn; ++ rbio->context = context; ++ queue_work(wq, &rbio->work); ++ } ++} ++ ++static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) ++{ ++ BUG_ON(rbio->bounce && !rbio->split); ++ ++ if (rbio->promote) ++ promote_free(rbio->c, rbio->promote); ++ rbio->promote = NULL; ++ ++ if (rbio->bounce) ++ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); ++ ++ if (rbio->split) { ++ struct bch_read_bio *parent = rbio->parent; ++ ++ if (rbio->kmalloc) ++ kfree(rbio); ++ else ++ bio_put(&rbio->bio); ++ ++ rbio = parent; ++ } ++ ++ return rbio; ++} ++ ++/* ++ * Only called on a top level bch_read_bio to complete an entire read request, ++ * not a split: ++ */ ++static void bch2_rbio_done(struct bch_read_bio *rbio) ++{ ++ if (rbio->start_time) ++ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], ++ rbio->start_time); ++ bio_endio(&rbio->bio); ++} ++ ++static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, u64 inode, ++ struct bch_io_failures *failed, ++ unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_on_stack sk; ++ struct bkey_s_c k; ++ int ret; ++ ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ rbio->pos, BTREE_ITER_SLOTS); ++retry: ++ rbio->bio.bi_status = 0; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k)) ++ goto err; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ bch2_trans_unlock(&trans); ++ ++ if (!bch2_bkey_matches_ptr(c, k, ++ rbio->pick.ptr, ++ rbio->pos.offset - ++ rbio->pick.crc.offset)) { ++ /* extent we wanted to read no longer exists: */ ++ rbio->hole = true; ++ goto out; ++ } ++ ++ ret = __bch2_read_extent(c, rbio, bvec_iter, k, 0, failed, flags); ++ if (ret == READ_RETRY) ++ goto retry; ++ if (ret) ++ goto err; ++out: ++ bch2_rbio_done(rbio); ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ return; ++err: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ goto out; ++} ++ ++static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, u64 inode, ++ struct bch_io_failures *failed, unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_on_stack sk; ++ struct bkey_s_c k; ++ int ret; ++ ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode, bvec_iter.bi_sector), ++ BTREE_ITER_SLOTS, k, ret) { ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &sk); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bch2_trans_unlock(&trans); ++ ++ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; ++ swap(bvec_iter.bi_size, bytes); ++ ++ ret = 
__bch2_read_extent(c, rbio, bvec_iter, k, ++ offset_into_extent, failed, flags); ++ switch (ret) { ++ case READ_RETRY: ++ goto retry; ++ case READ_ERR: ++ goto err; ++ }; ++ ++ if (bytes == bvec_iter.bi_size) ++ goto out; ++ ++ swap(bvec_iter.bi_size, bytes); ++ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); ++ } ++ ++ if (ret == -EINTR) ++ goto retry; ++ /* ++ * If we get here, it better have been because there was an error ++ * reading a btree node ++ */ ++ BUG_ON(!ret); ++ __bcache_io_error(c, "btree IO error: %i", ret); ++err: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++out: ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ bch2_rbio_done(rbio); ++} ++ ++static void bch2_rbio_retry(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bvec_iter iter = rbio->bvec_iter; ++ unsigned flags = rbio->flags; ++ u64 inode = rbio->pos.inode; ++ struct bch_io_failures failed = { .nr = 0 }; ++ ++ trace_read_retry(&rbio->bio); ++ ++ if (rbio->retry == READ_RETRY_AVOID) ++ bch2_mark_io_failure(&failed, &rbio->pick); ++ ++ rbio->bio.bi_status = 0; ++ ++ rbio = bch2_rbio_free(rbio); ++ ++ flags |= BCH_READ_IN_RETRY; ++ flags &= ~BCH_READ_MAY_PROMOTE; ++ ++ if (flags & BCH_READ_NODECODE) ++ bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); ++ else ++ bch2_read_retry(c, rbio, iter, inode, &failed, flags); ++} ++ ++static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, ++ blk_status_t error) ++{ ++ rbio->retry = retry; ++ ++ if (rbio->flags & BCH_READ_IN_RETRY) ++ return; ++ ++ if (retry == READ_ERR) { ++ rbio = bch2_rbio_free(rbio); ++ ++ rbio->bio.bi_status = error; ++ bch2_rbio_done(rbio); ++ } else { ++ bch2_rbio_punt(rbio, bch2_rbio_retry, ++ RBIO_CONTEXT_UNBOUND, system_unbound_wq); ++ } ++} ++ ++static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, ++ struct bch_read_bio *rbio) ++{ ++ struct bch_fs *c = rbio->c; ++ u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; ++ struct bch_extent_crc_unpacked new_crc; ++ struct btree_iter *iter = NULL; ++ struct bkey_i *new; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ if (crc_is_compressed(rbio->pick.crc)) ++ return 0; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ if ((ret = PTR_ERR_OR_ZERO(iter))) ++ goto out; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ if ((ret = bkey_err(k))) ++ goto out; ++ ++ /* ++ * going to be temporarily appending another checksum entry: ++ */ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + ++ BKEY_EXTENT_U64s_MAX * 8); ++ if ((ret = PTR_ERR_OR_ZERO(new))) ++ goto out; ++ ++ bkey_reassemble(new, k); ++ k = bkey_i_to_s_c(new); ++ ++ if (bversion_cmp(k.k->version, rbio->version) || ++ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) ++ goto out; ++ ++ /* Extent was merged? 
*/ ++ if (bkey_start_offset(k.k) < data_offset || ++ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) ++ goto out; ++ ++ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, ++ rbio->pick.crc, NULL, &new_crc, ++ bkey_start_offset(k.k) - data_offset, k.k->size, ++ rbio->pick.crc.csum_type)) { ++ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); ++ ret = 0; ++ goto out; ++ } ++ ++ if (!bch2_bkey_narrow_crcs(new, new_crc)) ++ goto out; ++ ++ bch2_trans_update(trans, iter, new, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) ++{ ++ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ __bch2_rbio_narrow_crcs(&trans, rbio)); ++} ++ ++/* Inner part that may run in process context */ ++static void __bch2_read_endio(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct bio *src = &rbio->bio; ++ struct bio *dst = &bch2_rbio_parent(rbio)->bio; ++ struct bvec_iter dst_iter = rbio->bvec_iter; ++ struct bch_extent_crc_unpacked crc = rbio->pick.crc; ++ struct nonce nonce = extent_nonce(rbio->version, crc); ++ struct bch_csum csum; ++ ++ /* Reset iterator for checksumming and copying bounced data: */ ++ if (rbio->bounce) { ++ src->bi_iter.bi_size = crc.compressed_size << 9; ++ src->bi_iter.bi_idx = 0; ++ src->bi_iter.bi_bvec_done = 0; ++ } else { ++ src->bi_iter = rbio->bvec_iter; ++ } ++ ++ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); ++ if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) ++ goto csum_err; ++ ++ if (unlikely(rbio->narrow_crcs)) ++ bch2_rbio_narrow_crcs(rbio); ++ ++ if (rbio->flags & BCH_READ_NODECODE) ++ goto nodecode; ++ ++ /* Adjust crc to point to subset of data we want: */ ++ crc.offset += rbio->offset_into_extent; ++ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); ++ ++ if (crc_is_compressed(crc)) { ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) ++ goto decompression_err; ++ } else { ++ /* don't need to decrypt the entire bio: */ ++ nonce = nonce_add(nonce, crc.offset << 9); ++ bio_advance(src, crc.offset << 9); ++ ++ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); ++ src->bi_iter.bi_size = dst_iter.bi_size; ++ ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ++ if (rbio->bounce) { ++ struct bvec_iter src_iter = src->bi_iter; ++ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); ++ } ++ } ++ ++ if (rbio->promote) { ++ /* ++ * Re encrypt data we decrypted, so it's consistent with ++ * rbio->crc: ++ */ ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ promote_start(rbio->promote, rbio); ++ rbio->promote = NULL; ++ } ++nodecode: ++ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { ++ rbio = bch2_rbio_free(rbio); ++ bch2_rbio_done(rbio); ++ } ++ return; ++csum_err: ++ /* ++ * Checksum error: if the bio wasn't bounced, we may have been ++ * reading into buffers owned by userspace (that userspace can ++ * scribble over) - retry the read, bouncing it this time: ++ */ ++ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { ++ rbio->flags |= BCH_READ_MUST_BOUNCE; ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); ++ return; ++ } ++ ++ bch2_dev_io_error(ca, ++ "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", ++ rbio->pos.inode, (u64) 
rbio->bvec_iter.bi_sector, ++ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, ++ csum.hi, csum.lo, crc.csum_type); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ return; ++decompression_err: ++ __bcache_io_error(c, "decompression error, inode %llu offset %llu", ++ rbio->pos.inode, ++ (u64) rbio->bvec_iter.bi_sector); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ return; ++} ++ ++static void bch2_read_endio(struct bio *bio) ++{ ++ struct bch_read_bio *rbio = ++ container_of(bio, struct bch_read_bio, bio); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct workqueue_struct *wq = NULL; ++ enum rbio_context context = RBIO_CONTEXT_NULL; ++ ++ if (rbio->have_ioref) { ++ bch2_latency_acct(ca, rbio->submit_time, READ); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (!rbio->split) ++ rbio->bio.bi_end_io = rbio->end_io; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", ++ bch2_blk_status_to_str(bio->bi_status))) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); ++ return; ++ } ++ ++ if (rbio->pick.ptr.cached && ++ (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ ptr_stale(ca, &rbio->pick.ptr))) { ++ atomic_long_inc(&c->read_realloc_races); ++ ++ if (rbio->flags & BCH_READ_RETRY_IF_STALE) ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); ++ else ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); ++ return; ++ } ++ ++ if (rbio->narrow_crcs || ++ crc_is_compressed(rbio->pick.crc) || ++ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) ++ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; ++ else if (rbio->pick.crc.csum_type) ++ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; ++ ++ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); ++} ++ ++int __bch2_read_indirect_extent(struct btree_trans *trans, ++ unsigned *offset_into_extent, ++ struct bkey_on_stack *orig_k) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 reflink_offset; ++ int ret; ++ ++ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + ++ *offset_into_extent; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, ++ POS(0, reflink_offset), ++ BTREE_ITER_SLOTS); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_reflink_v) { ++ __bcache_io_error(trans->c, ++ "pointer to nonexistent indirect extent"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); ++ bkey_on_stack_reassemble(orig_k, trans->c, k); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int __bch2_read_extent(struct bch_fs *c, struct bch_read_bio *orig, ++ struct bvec_iter iter, struct bkey_s_c k, ++ unsigned offset_into_extent, ++ struct bch_io_failures *failed, unsigned flags) ++{ ++ struct extent_ptr_decoded pick; ++ struct bch_read_bio *rbio = NULL; ++ struct bch_dev *ca; ++ struct promote_op *promote = NULL; ++ bool bounce = false, read_full = false, narrow_crcs = false; ++ struct bpos pos = bkey_start_pos(k.k); ++ int pick_ret; ++ ++ if (k.k->type == KEY_TYPE_inline_data) { ++ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); ++ unsigned bytes = min_t(unsigned, iter.bi_size, ++ bkey_val_bytes(d.k)); ++ ++ swap(iter.bi_size, bytes); ++ memcpy_to_bio(&orig->bio, iter, d.v->data); ++ swap(iter.bi_size, bytes); ++ bio_advance_iter(&orig->bio, &iter, bytes); ++ zero_fill_bio_iter(&orig->bio, 
iter); ++ goto out_read_done; ++ } ++ ++ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); ++ ++ /* hole or reservation - just zero fill: */ ++ if (!pick_ret) ++ goto hole; ++ ++ if (pick_ret < 0) { ++ __bcache_io_error(c, "no device to read from"); ++ goto err; ++ } ++ ++ if (pick_ret > 0) ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ if (flags & BCH_READ_NODECODE) { ++ /* ++ * can happen if we retry, and the extent we were going to read ++ * has been merged in the meantime: ++ */ ++ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) ++ goto hole; ++ ++ iter.bi_size = pick.crc.compressed_size << 9; ++ goto get_bio; ++ } ++ ++ if (!(flags & BCH_READ_LAST_FRAGMENT) || ++ bio_flagged(&orig->bio, BIO_CHAIN)) ++ flags |= BCH_READ_MUST_CLONE; ++ ++ narrow_crcs = !(flags & BCH_READ_IN_RETRY) && ++ bch2_can_narrow_extent_crcs(k, pick.crc); ++ ++ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) ++ flags |= BCH_READ_MUST_BOUNCE; ++ ++ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); ++ ++ if (crc_is_compressed(pick.crc) || ++ (pick.crc.csum_type != BCH_CSUM_NONE && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ (bch2_csum_type_is_encryption(pick.crc.csum_type) && ++ (flags & BCH_READ_USER_MAPPED)) || ++ (flags & BCH_READ_MUST_BOUNCE)))) { ++ read_full = true; ++ bounce = true; ++ } ++ ++ if (orig->opts.promote_target) ++ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, ++ &rbio, &bounce, &read_full); ++ ++ if (!read_full) { ++ EBUG_ON(crc_is_compressed(pick.crc)); ++ EBUG_ON(pick.crc.csum_type && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ bvec_iter_sectors(iter) != pick.crc.live_size || ++ pick.crc.offset || ++ offset_into_extent)); ++ ++ pos.offset += offset_into_extent; ++ pick.ptr.offset += pick.crc.offset + ++ offset_into_extent; ++ offset_into_extent = 0; ++ pick.crc.compressed_size = bvec_iter_sectors(iter); ++ pick.crc.uncompressed_size = bvec_iter_sectors(iter); ++ pick.crc.offset = 0; ++ pick.crc.live_size = bvec_iter_sectors(iter); ++ offset_into_extent = 0; ++ } ++get_bio: ++ if (rbio) { ++ /* ++ * promote already allocated bounce rbio: ++ * promote needs to allocate a bio big enough for uncompressing ++ * data in the write path, but we're not going to use it all ++ * here: ++ */ ++ EBUG_ON(rbio->bio.bi_iter.bi_size < ++ pick.crc.compressed_size << 9); ++ rbio->bio.bi_iter.bi_size = ++ pick.crc.compressed_size << 9; ++ } else if (bounce) { ++ unsigned sectors = pick.crc.compressed_size; ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, ++ DIV_ROUND_UP(sectors, PAGE_SECTORS), ++ &c->bio_read_split), ++ orig->opts); ++ ++ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); ++ rbio->bounce = true; ++ rbio->split = true; ++ } else if (flags & BCH_READ_MUST_CLONE) { ++ /* ++ * Have to clone if there were any splits, due to error ++ * reporting issues (if a split errored, and retrying didn't ++ * work, when it reports the error to its parent (us) we don't ++ * know if the error was from our bio, and we should retry, or ++ * from the whole bio, in which case we don't want to retry and ++ * lose the error) ++ */ ++ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, ++ &c->bio_read_split), ++ orig->opts); ++ rbio->bio.bi_iter = iter; ++ rbio->split = true; ++ } else { ++ rbio = orig; ++ rbio->bio.bi_iter = iter; ++ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); ++ } ++ ++ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); ++ ++ rbio->c = c; ++ rbio->submit_time = local_clock(); ++ 
if (rbio->split) ++ rbio->parent = orig; ++ else ++ rbio->end_io = orig->bio.bi_end_io; ++ rbio->bvec_iter = iter; ++ rbio->offset_into_extent= offset_into_extent; ++ rbio->flags = flags; ++ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); ++ rbio->narrow_crcs = narrow_crcs; ++ rbio->hole = 0; ++ rbio->retry = 0; ++ rbio->context = 0; ++ /* XXX: only initialize this if needed */ ++ rbio->devs_have = bch2_bkey_devs(k); ++ rbio->pick = pick; ++ rbio->pos = pos; ++ rbio->version = k.k->version; ++ rbio->promote = promote; ++ INIT_WORK(&rbio->work, NULL); ++ ++ rbio->bio.bi_opf = orig->bio.bi_opf; ++ rbio->bio.bi_iter.bi_sector = pick.ptr.offset; ++ rbio->bio.bi_end_io = bch2_read_endio; ++ ++ if (rbio->bounce) ++ trace_read_bounce(&rbio->bio); ++ ++ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); ++ ++ rcu_read_lock(); ++ bucket_io_clock_reset(c, ca, PTR_BUCKET_NR(ca, &pick.ptr), READ); ++ rcu_read_unlock(); ++ ++ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { ++ bio_inc_remaining(&orig->bio); ++ trace_read_split(&orig->bio); ++ } ++ ++ if (!rbio->pick.idx) { ++ if (!rbio->have_ioref) { ++ __bcache_io_error(c, "no device to read from"); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], ++ bio_sectors(&rbio->bio)); ++ bio_set_dev(&rbio->bio, ca->disk_sb.bdev); ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ submit_bio(&rbio->bio); ++ else ++ submit_bio_wait(&rbio->bio); ++ } else { ++ /* Attempting reconstruct read: */ ++ if (bch2_ec_read_extent(c, rbio)) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ bio_endio(&rbio->bio); ++ } ++out: ++ if (likely(!(flags & BCH_READ_IN_RETRY))) { ++ return 0; ++ } else { ++ int ret; ++ ++ rbio->context = RBIO_CONTEXT_UNBOUND; ++ bch2_read_endio(&rbio->bio); ++ ++ ret = rbio->retry; ++ rbio = bch2_rbio_free(rbio); ++ ++ if (ret == READ_RETRY_AVOID) { ++ bch2_mark_io_failure(failed, &pick); ++ ret = READ_RETRY; ++ } ++ ++ return ret; ++ } ++ ++err: ++ if (flags & BCH_READ_IN_RETRY) ++ return READ_ERR; ++ ++ orig->bio.bi_status = BLK_STS_IOERR; ++ goto out_read_done; ++ ++hole: ++ /* ++ * won't normally happen in the BCH_READ_NODECODE ++ * (bch2_move_extent()) path, but if we retry and the extent we wanted ++ * to read no longer exists we have to signal that: ++ */ ++ if (flags & BCH_READ_NODECODE) ++ orig->hole = true; ++ ++ zero_fill_bio_iter(&orig->bio, iter); ++out_read_done: ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ bch2_rbio_done(orig); ++ return 0; ++} ++ ++void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_on_stack sk; ++ struct bkey_s_c k; ++ unsigned flags = BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE| ++ BCH_READ_USER_MAPPED; ++ int ret; ++ ++ BUG_ON(rbio->_state); ++ BUG_ON(flags & BCH_READ_NODECODE); ++ BUG_ON(flags & BCH_READ_IN_RETRY); ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inode, rbio->bio.bi_iter.bi_sector), ++ BTREE_ITER_SLOTS); ++ while (1) { ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bch2_btree_iter_set_pos(iter, ++ POS(inode, rbio->bio.bi_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ 
goto err; ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &sk); ++ if (ret) ++ goto err; ++ ++ /* ++ * With indirect extents, the amount of data to read is the min ++ * of the original extent and the indirect extent: ++ */ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ /* ++ * Unlock the iterator while the btree node's lock is still in ++ * cache, before doing the IO: ++ */ ++ bch2_trans_unlock(&trans); ++ ++ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ ++ if (rbio->bio.bi_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ bch2_read_extent(c, rbio, k, offset_into_extent, flags); ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ bio_advance(&rbio->bio, bytes); ++ } ++out: ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ return; ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); ++ bch2_rbio_done(rbio); ++ goto out; ++} ++ ++void bch2_fs_io_exit(struct bch_fs *c) ++{ ++ if (c->promote_table.tbl) ++ rhashtable_destroy(&c->promote_table); ++ mempool_exit(&c->bio_bounce_pages); ++ bioset_exit(&c->bio_write); ++ bioset_exit(&c->bio_read_split); ++ bioset_exit(&c->bio_read); ++} ++ ++int bch2_fs_io_init(struct bch_fs *c) ++{ ++ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), ++ BIOSET_NEED_BVECS) || ++ mempool_init_page_pool(&c->bio_bounce_pages, ++ max_t(unsigned, ++ c->opts.btree_node_size, ++ c->sb.encoded_extent_max) / ++ PAGE_SECTORS, 0) || ++ rhashtable_init(&c->promote_table, &bch_promote_params)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +new file mode 100644 +index 000000000000..ded468d70f09 +--- /dev/null ++++ b/fs/bcachefs/io.h +@@ -0,0 +1,169 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IO_H ++#define _BCACHEFS_IO_H ++ ++#include "checksum.h" ++#include "bkey_on_stack.h" ++#include "io_types.h" ++ ++#define to_wbio(_bio) \ ++ container_of((_bio), struct bch_write_bio, bio) ++ ++#define to_rbio(_bio) \ ++ container_of((_bio), struct bch_read_bio, bio) ++ ++void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); ++void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); ++ ++void bch2_latency_acct(struct bch_dev *, u64, int); ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, ++ enum bch_data_type, const struct bkey_i *); ++ ++#define BLK_STS_REMOVED ((__force blk_status_t)128) ++ ++const char *bch2_blk_status_to_str(blk_status_t); ++ ++enum bch_write_flags { ++ BCH_WRITE_ALLOC_NOWAIT = (1 << 0), ++ BCH_WRITE_CACHED = (1 << 1), ++ BCH_WRITE_FLUSH = (1 << 2), ++ BCH_WRITE_DATA_ENCODED = (1 << 3), ++ BCH_WRITE_PAGES_STABLE = (1 << 4), ++ BCH_WRITE_PAGES_OWNED = (1 << 5), ++ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), ++ BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), ++ BCH_WRITE_FROM_INTERNAL = (1 << 8), ++ ++ /* Internal: */ ++ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), ++ BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), ++ BCH_WRITE_DONE = (1 << 11), ++}; ++ ++static inline u64 
*op_journal_seq(struct bch_write_op *op) ++{ ++ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) ++ ? op->journal_seq_p : &op->journal_seq; ++} ++ ++static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) ++{ ++ op->journal_seq_p = journal_seq; ++ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; ++} ++ ++static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) ++{ ++ return op->alloc_reserve == RESERVE_MOVINGGC ++ ? op->c->copygc_wq ++ : op->c->wq; ++} ++ ++int bch2_extent_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct disk_reservation *, ++ u64 *, u64, s64 *); ++int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64 *, s64 *); ++int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); ++ ++int bch2_write_index_default(struct bch_write_op *); ++ ++static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, ++ struct bch_io_opts opts) ++{ ++ op->c = c; ++ op->end_io = NULL; ++ op->flags = 0; ++ op->written = 0; ++ op->error = 0; ++ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); ++ op->compression_type = bch2_compression_opt_to_type[opts.compression]; ++ op->nr_replicas = 0; ++ op->nr_replicas_required = c->opts.data_replicas_required; ++ op->alloc_reserve = RESERVE_NONE; ++ op->incompressible = 0; ++ op->open_buckets.nr = 0; ++ op->devs_have.nr = 0; ++ op->target = 0; ++ op->opts = opts; ++ op->pos = POS_MAX; ++ op->version = ZERO_VERSION; ++ op->write_point = (struct write_point_specifier) { 0 }; ++ op->res = (struct disk_reservation) { 0 }; ++ op->journal_seq = 0; ++ op->new_i_size = U64_MAX; ++ op->i_sectors_delta = 0; ++ op->index_update_fn = bch2_write_index_default; ++} ++ ++void bch2_write(struct closure *); ++ ++static inline struct bch_write_bio *wbio_init(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ ++ memset(wbio, 0, offsetof(struct bch_write_bio, bio)); ++ return wbio; ++} ++ ++struct bch_devs_mask; ++struct cache_promote_op; ++struct extent_ptr_decoded; ++ ++int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, ++ struct bkey_on_stack *); ++ ++static inline int bch2_read_indirect_extent(struct btree_trans *trans, ++ unsigned *offset_into_extent, ++ struct bkey_on_stack *k) ++{ ++ return k->k->k.type == KEY_TYPE_reflink_p ++ ? 
__bch2_read_indirect_extent(trans, offset_into_extent, k) ++ : 0; ++} ++ ++enum bch_read_flags { ++ BCH_READ_RETRY_IF_STALE = 1 << 0, ++ BCH_READ_MAY_PROMOTE = 1 << 1, ++ BCH_READ_USER_MAPPED = 1 << 2, ++ BCH_READ_NODECODE = 1 << 3, ++ BCH_READ_LAST_FRAGMENT = 1 << 4, ++ ++ /* internal: */ ++ BCH_READ_MUST_BOUNCE = 1 << 5, ++ BCH_READ_MUST_CLONE = 1 << 6, ++ BCH_READ_IN_RETRY = 1 << 7, ++}; ++ ++int __bch2_read_extent(struct bch_fs *, struct bch_read_bio *, ++ struct bvec_iter, struct bkey_s_c, unsigned, ++ struct bch_io_failures *, unsigned); ++ ++static inline void bch2_read_extent(struct bch_fs *c, ++ struct bch_read_bio *rbio, ++ struct bkey_s_c k, ++ unsigned offset_into_extent, ++ unsigned flags) ++{ ++ __bch2_read_extent(c, rbio, rbio->bio.bi_iter, k, ++ offset_into_extent, NULL, flags); ++} ++ ++void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); ++ ++static inline struct bch_read_bio *rbio_init(struct bio *bio, ++ struct bch_io_opts opts) ++{ ++ struct bch_read_bio *rbio = to_rbio(bio); ++ ++ rbio->_state = 0; ++ rbio->promote = NULL; ++ rbio->opts = opts; ++ return rbio; ++} ++ ++void bch2_fs_io_exit(struct bch_fs *); ++int bch2_fs_io_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_IO_H */ +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +new file mode 100644 +index 000000000000..b23727d212b9 +--- /dev/null ++++ b/fs/bcachefs/io_types.h +@@ -0,0 +1,148 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IO_TYPES_H ++#define _BCACHEFS_IO_TYPES_H ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "extents_types.h" ++#include "keylist_types.h" ++#include "opts.h" ++#include "super_types.h" ++ ++#include ++#include ++ ++struct bch_read_bio { ++ struct bch_fs *c; ++ u64 start_time; ++ u64 submit_time; ++ ++ /* ++ * Reads will often have to be split, and if the extent being read from ++ * was checksummed or compressed we'll also have to allocate bounce ++ * buffers and copy the data back into the original bio. ++ * ++ * If we didn't have to split, we have to save and restore the original ++ * bi_end_io - @split below indicates which: ++ */ ++ union { ++ struct bch_read_bio *parent; ++ bio_end_io_t *end_io; ++ }; ++ ++ /* ++ * Saved copy of bio->bi_iter, from submission time - allows us to ++ * resubmit on IO error, and also to copy data back to the original bio ++ * when we're bouncing: ++ */ ++ struct bvec_iter bvec_iter; ++ ++ unsigned offset_into_extent; ++ ++ u16 flags; ++ union { ++ struct { ++ u16 bounce:1, ++ split:1, ++ kmalloc:1, ++ have_ioref:1, ++ narrow_crcs:1, ++ hole:1, ++ retry:2, ++ context:2; ++ }; ++ u16 _state; ++ }; ++ ++ struct bch_devs_list devs_have; ++ ++ struct extent_ptr_decoded pick; ++ /* start pos of data we read (may not be pos of data we want) */ ++ struct bpos pos; ++ struct bversion version; ++ ++ struct promote_op *promote; ++ ++ struct bch_io_opts opts; ++ ++ struct work_struct work; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_bio { ++ struct bch_fs *c; ++ struct bch_write_bio *parent; ++ ++ u64 submit_time; ++ ++ struct bch_devs_list failed; ++ u8 dev; ++ ++ unsigned split:1, ++ bounce:1, ++ put_bio:1, ++ have_ioref:1, ++ used_mempool:1; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_op { ++ struct closure cl; ++ struct bch_fs *c; ++ void (*end_io)(struct bch_write_op *); ++ u64 start_time; ++ ++ unsigned written; /* sectors */ ++ u16 flags; ++ s16 error; /* dio write path expects it to hold -ERESTARTSYS... 
*/ ++ ++ unsigned csum_type:4; ++ unsigned compression_type:4; ++ unsigned nr_replicas:4; ++ unsigned nr_replicas_required:4; ++ unsigned alloc_reserve:3; ++ unsigned incompressible:1; ++ ++ struct bch_devs_list devs_have; ++ u16 target; ++ u16 nonce; ++ struct bch_io_opts opts; ++ ++ struct bpos pos; ++ struct bversion version; ++ ++ /* For BCH_WRITE_DATA_ENCODED: */ ++ struct bch_extent_crc_unpacked crc; ++ ++ struct write_point_specifier write_point; ++ ++ struct disk_reservation res; ++ ++ struct open_buckets open_buckets; ++ ++ /* ++ * If caller wants to flush but hasn't passed us a journal_seq ptr, we ++ * still need to stash the journal_seq somewhere: ++ */ ++ union { ++ u64 *journal_seq_p; ++ u64 journal_seq; ++ }; ++ u64 new_i_size; ++ s64 i_sectors_delta; ++ ++ int (*index_update_fn)(struct bch_write_op *); ++ ++ struct bch_devs_mask failed; ++ ++ struct keylist insert_keys; ++ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; ++ ++ /* Must be last: */ ++ struct bch_write_bio wbio; ++}; ++ ++#endif /* _BCACHEFS_IO_TYPES_H */ +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +new file mode 100644 +index 000000000000..210ad1b0c469 +--- /dev/null ++++ b/fs/bcachefs/journal.c +@@ -0,0 +1,1248 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs journalling code, for btree insertions ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++#include ++ ++static bool __journal_entry_is_open(union journal_res_state state) ++{ ++ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; ++} ++ ++static bool journal_entry_is_open(struct journal *j) ++{ ++ return __journal_entry_is_open(j->reservations); ++} ++ ++static void journal_pin_new_entry(struct journal *j, int count) ++{ ++ struct journal_entry_pin_list *p; ++ ++ /* ++ * The fifo_push() needs to happen at the same time as j->seq is ++ * incremented for journal_last_seq() to be calculated correctly ++ */ ++ atomic64_inc(&j->seq); ++ p = fifo_push_ref(&j->pin); ++ ++ INIT_LIST_HEAD(&p->list); ++ INIT_LIST_HEAD(&p->flushed); ++ atomic_set(&p->count, count); ++ p->devs.nr = 0; ++} ++ ++static void bch2_journal_buf_init(struct journal *j) ++{ ++ struct journal_buf *buf = journal_cur_buf(j); ++ ++ memset(buf->has_inode, 0, sizeof(buf->has_inode)); ++ ++ memset(buf->data, 0, sizeof(*buf->data)); ++ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ++ buf->data->u64s = 0; ++} ++ ++void bch2_journal_halt(struct journal *j) ++{ ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ ++ do { ++ old.v = new.v = v; ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) ++ return; ++ ++ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ journal_wake(j); ++ closure_wake_up(&journal_cur_buf(j)->wait); ++} ++ ++/* journal entry close/open: */ ++ ++void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) ++{ ++ if (!need_write_just_set && ++ test_bit(JOURNAL_NEED_WRITE, &j->flags)) ++ bch2_time_stats_update(j->delay_time, ++ j->need_write_time); ++ ++ clear_bit(JOURNAL_NEED_WRITE, &j->flags); ++ ++ closure_call(&j->io, bch2_journal_write, system_highpri_wq, NULL); ++} ++ ++/* ++ * Returns true if journal entry is now closed: ++ */ 
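__journal_entry_close() below, like journal_entry_open() and journal_res_get_fast() further down, updates the packed 64-bit reservation state with the same lock-free pattern: snapshot the word, compute the new value, and retry via atomic64_cmpxchg() if another thread changed it in the meantime. A minimal userspace sketch of that retry loop, assuming C11 atomics and a made-up two-field state word rather than the real union journal_res_state:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* illustrative packed state: low 32 bits = entry offset, high 32 bits = refcount */
union state {
	uint64_t v;
	struct { uint32_t cur_entry_offset; uint32_t refs; };
};

#define ENTRY_CLOSED UINT32_MAX

static _Atomic uint64_t reservations;

/* close the current entry unless it still has references */
static int entry_close(void)
{
	union state old, new;
	uint64_t v = atomic_load(&reservations);

	do {
		old.v = new.v = v;

		if (old.cur_entry_offset == ENTRY_CLOSED)
			return 1;		/* already closed */
		if (old.refs)
			return 0;		/* still in use, can't close yet */

		new.cur_entry_offset = ENTRY_CLOSED;
		/* on CAS failure, v is refreshed and the loop recomputes */
	} while (!atomic_compare_exchange_weak(&reservations, &v, new.v));

	return 1;
}

int main(void)
{
	union state s;

	s.cur_entry_offset = 128;
	s.refs = 0;
	atomic_store(&reservations, s.v);

	printf("closed: %d\n", entry_close());
	return 0;
}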
++static bool __journal_entry_close(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf = journal_cur_buf(j); ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ bool set_need_write = false; ++ unsigned sectors; ++ ++ lockdep_assert_held(&j->lock); ++ ++ do { ++ old.v = new.v = v; ++ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) ++ return true; ++ ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { ++ /* this entry will never be written: */ ++ closure_wake_up(&buf->wait); ++ return true; ++ } ++ ++ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { ++ set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ j->need_write_time = local_clock(); ++ set_need_write = true; ++ } ++ ++ if (new.prev_buf_unwritten) ++ return false; ++ ++ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; ++ new.idx++; ++ new.prev_buf_unwritten = 1; ++ ++ BUG_ON(journal_state_count(new, new.idx)); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); ++ ++ sectors = vstruct_blocks_plus(buf->data, c->block_bits, ++ buf->u64s_reserved) << c->block_bits; ++ BUG_ON(sectors > buf->sectors); ++ buf->sectors = sectors; ++ ++ bkey_extent_init(&buf->key); ++ ++ /* ++ * We have to set last_seq here, _before_ opening a new journal entry: ++ * ++ * A threads may replace an old pin with a new pin on their current ++ * journal reservation - the expectation being that the journal will ++ * contain either what the old pin protected or what the new pin ++ * protects. ++ * ++ * After the old pin is dropped journal_last_seq() won't include the old ++ * pin, so we can only write the updated last_seq on the entry that ++ * contains whatever the new pin protects. ++ * ++ * Restated, we can _not_ update last_seq for a given entry if there ++ * could be a newer entry open with reservations/pins that have been ++ * taken against it. 
++ * ++ * Hence, we want update/set last_seq on the current journal entry right ++ * before we open a new one: ++ */ ++ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); ++ ++ if (journal_entry_empty(buf->data)) ++ clear_bit(JOURNAL_NOT_EMPTY, &j->flags); ++ else ++ set_bit(JOURNAL_NOT_EMPTY, &j->flags); ++ ++ journal_pin_new_entry(j, 1); ++ ++ bch2_journal_buf_init(j); ++ ++ cancel_delayed_work(&j->write_work); ++ ++ bch2_journal_space_available(j); ++ ++ bch2_journal_buf_put(j, old.idx, set_need_write); ++ return true; ++} ++ ++static bool journal_entry_close(struct journal *j) ++{ ++ bool ret; ++ ++ spin_lock(&j->lock); ++ ret = __journal_entry_close(j); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* ++ * should _only_ called from journal_res_get() - when we actually want a ++ * journal reservation - journal entry is open means journal is dirty: ++ * ++ * returns: ++ * 0: success ++ * -ENOSPC: journal currently full, must invoke reclaim ++ * -EAGAIN: journal blocked, must wait ++ * -EROFS: insufficient rw devices or journal error ++ */ ++static int journal_entry_open(struct journal *j) ++{ ++ struct journal_buf *buf = journal_cur_buf(j); ++ union journal_res_state old, new; ++ int u64s; ++ u64 v; ++ ++ lockdep_assert_held(&j->lock); ++ BUG_ON(journal_entry_is_open(j)); ++ ++ if (j->blocked) ++ return -EAGAIN; ++ ++ if (j->cur_entry_error) ++ return j->cur_entry_error; ++ ++ BUG_ON(!j->cur_entry_sectors); ++ ++ buf->u64s_reserved = j->entry_u64s_reserved; ++ buf->disk_sectors = j->cur_entry_sectors; ++ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); ++ ++ u64s = (int) (buf->sectors << 9) / sizeof(u64) - ++ journal_entry_overhead(j); ++ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); ++ ++ if (u64s <= le32_to_cpu(buf->data->u64s)) ++ return -ENOSPC; ++ ++ /* ++ * Must be set before marking the journal entry as open: ++ */ ++ j->cur_entry_u64s = u64s; ++ ++ v = atomic64_read(&j->reservations.counter); ++ do { ++ old.v = new.v = v; ++ ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) ++ return -EROFS; ++ ++ /* Handle any already added entries */ ++ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); ++ ++ EBUG_ON(journal_state_count(new, new.idx)); ++ journal_state_inc(&new); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ if (j->res_get_blocked_start) ++ bch2_time_stats_update(j->blocked_time, ++ j->res_get_blocked_start); ++ j->res_get_blocked_start = 0; ++ ++ mod_delayed_work(system_freezable_wq, ++ &j->write_work, ++ msecs_to_jiffies(j->write_delay_ms)); ++ journal_wake(j); ++ return 0; ++} ++ ++static bool journal_quiesced(struct journal *j) ++{ ++ union journal_res_state state = READ_ONCE(j->reservations); ++ bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); ++ ++ if (!ret) ++ journal_entry_close(j); ++ return ret; ++} ++ ++static void journal_quiesce(struct journal *j) ++{ ++ wait_event(j->wait, journal_quiesced(j)); ++} ++ ++static void journal_write_work(struct work_struct *work) ++{ ++ struct journal *j = container_of(work, struct journal, write_work.work); ++ ++ journal_entry_close(j); ++} ++ ++/* ++ * Given an inode number, if that inode number has data in the journal that ++ * hasn't yet been flushed, return the journal sequence number that needs to be ++ * flushed: ++ */ ++u64 bch2_inode_journal_seq(struct journal *j, u64 inode) ++{ ++ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); ++ u64 seq = 0; ++ ++ if (!test_bit(h, j->buf[0].has_inode) && 
++ !test_bit(h, j->buf[1].has_inode)) ++ return 0; ++ ++ spin_lock(&j->lock); ++ if (test_bit(h, journal_cur_buf(j)->has_inode)) ++ seq = journal_cur_seq(j); ++ else if (test_bit(h, journal_prev_buf(j)->has_inode)) ++ seq = journal_cur_seq(j) - 1; ++ spin_unlock(&j->lock); ++ ++ return seq; ++} ++ ++static int __journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf; ++ bool can_discard; ++ int ret; ++retry: ++ if (journal_res_get_fast(j, res, flags)) ++ return 0; ++ ++ if (bch2_journal_error(j)) ++ return -EROFS; ++ ++ spin_lock(&j->lock); ++ ++ /* ++ * Recheck after taking the lock, so we don't race with another thread ++ * that just did journal_entry_open() and call journal_entry_close() ++ * unnecessarily ++ */ ++ if (journal_res_get_fast(j, res, flags)) { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ ++ if (!(flags & JOURNAL_RES_GET_RESERVED) && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ /* ++ * Don't want to close current journal entry, just need to ++ * invoke reclaim: ++ */ ++ ret = -ENOSPC; ++ goto unlock; ++ } ++ ++ /* ++ * If we couldn't get a reservation because the current buf filled up, ++ * and we had room for a bigger entry on disk, signal that we want to ++ * realloc the journal bufs: ++ */ ++ buf = journal_cur_buf(j); ++ if (journal_entry_is_open(j) && ++ buf->buf_size >> 9 < buf->disk_sectors && ++ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) ++ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); ++ ++ if (journal_entry_is_open(j) && ++ !__journal_entry_close(j)) { ++ /* ++ * We failed to get a reservation on the current open journal ++ * entry because it's full, and we can't close it because ++ * there's still a previous one in flight: ++ */ ++ trace_journal_entry_full(c); ++ ret = -EAGAIN; ++ } else { ++ ret = journal_entry_open(j); ++ } ++unlock: ++ if ((ret == -EAGAIN || ret == -ENOSPC) && ++ !j->res_get_blocked_start) ++ j->res_get_blocked_start = local_clock() ?: 1; ++ ++ can_discard = j->can_discard; ++ spin_unlock(&j->lock); ++ ++ if (!ret) ++ goto retry; ++ ++ if (ret == -ENOSPC) { ++ WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), ++ "JOURNAL_RES_GET_RESERVED set but journal full"); ++ ++ /* ++ * Journal is full - can't rely on reclaim from work item due to ++ * freezing: ++ */ ++ trace_journal_full(c); ++ ++ if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { ++ if (can_discard) { ++ bch2_journal_do_discards(j); ++ goto retry; ++ } ++ ++ if (mutex_trylock(&j->reclaim_lock)) { ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++ } ++ } ++ ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Essentially the entry function to the journaling code. When bcachefs is doing ++ * a btree insert, it calls this function to get the current journal write. ++ * Journal write is the structure used set up journal writes. The calling ++ * function will then add its keys to the structure, queuing them for the next ++ * write. ++ * ++ * To ensure forward progress, the current task must not be holding any ++ * btree node write locks. 
++ */ ++int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->async_wait, ++ (ret = __journal_res_get(j, res, flags)) != -EAGAIN || ++ (flags & JOURNAL_RES_GET_NONBLOCK)); ++ return ret; ++} ++ ++/* journal_preres: */ ++ ++static bool journal_preres_available(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); ++ ++ if (!ret) ++ bch2_journal_reclaim_work(&j->reclaim_work.work); ++ ++ return ret; ++} ++ ++int __bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->preres_wait, ++ (ret = bch2_journal_error(j)) || ++ journal_preres_available(j, res, new_u64s, flags)); ++ return ret; ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *j, ++ struct journal_entry_res *res, ++ unsigned new_u64s) ++{ ++ union journal_res_state state; ++ int d = new_u64s - res->u64s; ++ ++ spin_lock(&j->lock); ++ ++ j->entry_u64s_reserved += d; ++ if (d <= 0) ++ goto out; ++ ++ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); ++ smp_mb(); ++ state = READ_ONCE(j->reservations); ++ ++ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && ++ state.cur_entry_offset > j->cur_entry_u64s) { ++ j->cur_entry_u64s += d; ++ /* ++ * Not enough room in current journal entry, have to flush it: ++ */ ++ __journal_entry_close(j); ++ } else { ++ journal_cur_buf(j)->u64s_reserved += d; ++ } ++out: ++ spin_unlock(&j->lock); ++ res->u64s += d; ++} ++ ++/* journal flushing: */ ++ ++u64 bch2_journal_last_unwritten_seq(struct journal *j) ++{ ++ u64 seq; ++ ++ spin_lock(&j->lock); ++ seq = journal_cur_seq(j); ++ if (j->reservations.prev_buf_unwritten) ++ seq--; ++ spin_unlock(&j->lock); ++ ++ return seq; ++} ++ ++/** ++ * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't ++ * open yet, or wait if we cannot ++ * ++ * used by the btree interior update machinery, when it needs to write a new ++ * btree root - every journal entry contains the roots of all the btrees, so it ++ * doesn't need to bother with getting a journal reservation ++ */ ++int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ int ret; ++ ++ spin_lock(&j->lock); ++ ++ /* ++ * Can't try to open more than one sequence number ahead: ++ */ ++ BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); ++ ++ if (journal_cur_seq(j) > seq || ++ journal_entry_is_open(j)) { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ ++ if (journal_cur_seq(j) < seq && ++ !__journal_entry_close(j)) { ++ /* haven't finished writing out the previous one: */ ++ trace_journal_entry_full(c); ++ ret = -EAGAIN; ++ } else { ++ BUG_ON(journal_cur_seq(j) != seq); ++ ++ ret = journal_entry_open(j); ++ } ++ ++ if ((ret == -EAGAIN || ret == -ENOSPC) && ++ !j->res_get_blocked_start) ++ j->res_get_blocked_start = local_clock() ?: 1; ++ ++ if (ret == -EAGAIN || ret == -ENOSPC) ++ closure_wait(&j->async_wait, cl); ++ ++ spin_unlock(&j->lock); ++ ++ if (ret == -ENOSPC) { ++ trace_journal_full(c); ++ bch2_journal_reclaim_work(&j->reclaim_work.work); ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++ ++static int journal_seq_error(struct journal *j, u64 seq) ++{ ++ union journal_res_state state = READ_ONCE(j->reservations); ++ ++ if (seq == 
journal_cur_seq(j)) ++ return bch2_journal_error(j); ++ ++ if (seq + 1 == journal_cur_seq(j) && ++ !state.prev_buf_unwritten && ++ seq > j->seq_ondisk) ++ return -EIO; ++ ++ return 0; ++} ++ ++static inline struct journal_buf * ++journal_seq_to_buf(struct journal *j, u64 seq) ++{ ++ /* seq should be for a journal entry that has been opened: */ ++ BUG_ON(seq > journal_cur_seq(j)); ++ BUG_ON(seq == journal_cur_seq(j) && ++ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); ++ ++ if (seq == journal_cur_seq(j)) ++ return journal_cur_buf(j); ++ if (seq + 1 == journal_cur_seq(j) && ++ j->reservations.prev_buf_unwritten) ++ return journal_prev_buf(j); ++ return NULL; ++} ++ ++/** ++ * bch2_journal_wait_on_seq - wait for a journal entry to be written ++ * ++ * does _not_ cause @seq to be written immediately - if there is no other ++ * activity to cause the relevant journal entry to be filled up or flushed it ++ * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is ++ * configurable). ++ */ ++void bch2_journal_wait_on_seq(struct journal *j, u64 seq, ++ struct closure *parent) ++{ ++ struct journal_buf *buf; ++ ++ spin_lock(&j->lock); ++ ++ if ((buf = journal_seq_to_buf(j, seq))) { ++ if (!closure_wait(&buf->wait, parent)) ++ BUG(); ++ ++ if (seq == journal_cur_seq(j)) { ++ smp_mb(); ++ if (bch2_journal_error(j)) ++ closure_wake_up(&buf->wait); ++ } ++ } ++ ++ spin_unlock(&j->lock); ++} ++ ++/** ++ * bch2_journal_flush_seq_async - wait for a journal entry to be written ++ * ++ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if ++ * necessary ++ */ ++void bch2_journal_flush_seq_async(struct journal *j, u64 seq, ++ struct closure *parent) ++{ ++ struct journal_buf *buf; ++ ++ spin_lock(&j->lock); ++ ++ if (parent && ++ (buf = journal_seq_to_buf(j, seq))) ++ if (!closure_wait(&buf->wait, parent)) ++ BUG(); ++ ++ if (seq == journal_cur_seq(j)) ++ __journal_entry_close(j); ++ spin_unlock(&j->lock); ++} ++ ++static int journal_seq_flushed(struct journal *j, u64 seq) ++{ ++ int ret; ++ ++ spin_lock(&j->lock); ++ ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); ++ ++ if (seq == journal_cur_seq(j)) ++ __journal_entry_close(j); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++int bch2_journal_flush_seq(struct journal *j, u64 seq) ++{ ++ u64 start_time = local_clock(); ++ int ret, ret2; ++ ++ ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); ++ ++ bch2_time_stats_update(j->flush_seq_time, start_time); ++ ++ return ret ?: ret2 < 0 ? 
ret2 : 0; ++} ++ ++/** ++ * bch2_journal_meta_async - force a journal entry to be written ++ */ ++void bch2_journal_meta_async(struct journal *j, struct closure *parent) ++{ ++ struct journal_res res; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ bch2_journal_res_put(j, &res); ++ ++ bch2_journal_flush_seq_async(j, res.seq, parent); ++} ++ ++int bch2_journal_meta(struct journal *j) ++{ ++ struct journal_res res; ++ int ret; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ if (ret) ++ return ret; ++ ++ bch2_journal_res_put(j, &res); ++ ++ return bch2_journal_flush_seq(j, res.seq); ++} ++ ++/* ++ * bch2_journal_flush_async - if there is an open journal entry, or a journal ++ * still being written, write it and wait for the write to complete ++ */ ++void bch2_journal_flush_async(struct journal *j, struct closure *parent) ++{ ++ u64 seq, journal_seq; ++ ++ spin_lock(&j->lock); ++ journal_seq = journal_cur_seq(j); ++ ++ if (journal_entry_is_open(j)) { ++ seq = journal_seq; ++ } else if (journal_seq) { ++ seq = journal_seq - 1; ++ } else { ++ spin_unlock(&j->lock); ++ return; ++ } ++ spin_unlock(&j->lock); ++ ++ bch2_journal_flush_seq_async(j, seq, parent); ++} ++ ++int bch2_journal_flush(struct journal *j) ++{ ++ u64 seq, journal_seq; ++ ++ spin_lock(&j->lock); ++ journal_seq = journal_cur_seq(j); ++ ++ if (journal_entry_is_open(j)) { ++ seq = journal_seq; ++ } else if (journal_seq) { ++ seq = journal_seq - 1; ++ } else { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ spin_unlock(&j->lock); ++ ++ return bch2_journal_flush_seq(j, seq); ++} ++ ++/* block/unlock the journal: */ ++ ++void bch2_journal_unblock(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked--; ++ spin_unlock(&j->lock); ++ ++ journal_wake(j); ++} ++ ++void bch2_journal_block(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked++; ++ spin_unlock(&j->lock); ++ ++ journal_quiesce(j); ++} ++ ++/* allocate journal on a device: */ ++ ++static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ++ bool new_fs, struct closure *cl) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal *journal_buckets; ++ u64 *new_bucket_seq = NULL, *new_buckets = NULL; ++ int ret = 0; ++ ++ /* don't handle reducing nr of buckets yet: */ ++ if (nr <= ja->nr) ++ return 0; ++ ++ ret = -ENOMEM; ++ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ if (!new_buckets || !new_bucket_seq) ++ goto err; ++ ++ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, ++ nr + sizeof(*journal_buckets) / sizeof(u64)); ++ if (!journal_buckets) ++ goto err; ++ ++ /* ++ * We may be called from the device add path, before the new device has ++ * actually been added to the running filesystem: ++ */ ++ if (c) ++ spin_lock(&c->journal.lock); ++ ++ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); ++ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ++ if (c) ++ spin_unlock(&c->journal.lock); ++ ++ while (ja->nr < nr) { ++ struct open_bucket *ob = NULL; ++ unsigned pos; ++ long bucket; ++ ++ if (new_fs) { ++ bucket = bch2_bucket_alloc_new_fs(ca); ++ if (bucket < 0) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ } else { ++ ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, ++ false, cl); ++ if (IS_ERR(ob)) { ++ ret = cl ? 
-EAGAIN : -ENOSPC; ++ goto err; ++ } ++ ++ bucket = sector_to_bucket(ca, ob->ptr.offset); ++ } ++ ++ if (c) { ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->journal.lock); ++ } ++ ++ pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; ++ __array_insert_item(ja->buckets, ja->nr, pos); ++ __array_insert_item(ja->bucket_seq, ja->nr, pos); ++ __array_insert_item(journal_buckets->buckets, ja->nr, pos); ++ ja->nr++; ++ ++ ja->buckets[pos] = bucket; ++ ja->bucket_seq[pos] = 0; ++ journal_buckets->buckets[pos] = cpu_to_le64(bucket); ++ ++ if (pos <= ja->discard_idx) ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ if (pos <= ja->dirty_idx_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ if (pos <= ja->dirty_idx) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ if (pos <= ja->cur_idx) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ++ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), ++ 0); ++ ++ if (c) { ++ spin_unlock(&c->journal.lock); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ if (!new_fs) ++ bch2_open_bucket_put(c, ob); ++ } ++ ++ ret = 0; ++err: ++ kfree(new_bucket_seq); ++ kfree(new_buckets); ++ ++ return ret; ++} ++ ++/* ++ * Allocate more journal space at runtime - not currently making use if it, but ++ * the code works: ++ */ ++int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, ++ unsigned nr) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct closure cl; ++ unsigned current_nr; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ struct disk_reservation disk_res = { 0, 0 }; ++ ++ closure_sync(&cl); ++ ++ mutex_lock(&c->sb_lock); ++ current_nr = ja->nr; ++ ++ /* ++ * note: journal buckets aren't really counted as _sectors_ used yet, so ++ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c ++ * when space used goes up without a reservation - but we do need the ++ * reservation to ensure we'll actually be able to allocate: ++ */ ++ ++ if (bch2_disk_reservation_get(c, &disk_res, ++ bucket_to_sector(ca, nr - ja->nr), 1, 0)) { ++ mutex_unlock(&c->sb_lock); ++ return -ENOSPC; ++ } ++ ++ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (ja->nr != current_nr) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } while (ret == -EAGAIN); ++ ++ return ret; ++} ++ ++int bch2_dev_journal_alloc(struct bch_dev *ca) ++{ ++ unsigned nr; ++ ++ if (dynamic_fault("bcachefs:add:journal_alloc")) ++ return -ENOMEM; ++ ++ /* ++ * clamp journal size to 1024 buckets or 512MB (in sectors), whichever ++ * is smaller: ++ */ ++ nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, ++ BCH_JOURNAL_BUCKETS_MIN, ++ min(1 << 10, ++ (1 << 20) / ca->mi.bucket_size)); ++ ++ return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); ++} ++ ++/* startup/shutdown: */ ++ ++static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) ++{ ++ union journal_res_state state; ++ struct journal_buf *w; ++ bool ret; ++ ++ spin_lock(&j->lock); ++ state = READ_ONCE(j->reservations); ++ w = j->buf + !state.idx; ++ ++ ret = state.prev_buf_unwritten && ++ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) ++{ ++ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); ++} ++ ++void bch2_fs_journal_stop(struct journal *j) ++{ ++ bch2_journal_flush_all_pins(j); ++ ++ wait_event(j->wait, 
journal_entry_close(j)); ++ ++ /* do we need to write another journal entry? */ ++ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) ++ bch2_journal_meta(j); ++ ++ journal_quiesce(j); ++ ++ BUG_ON(!bch2_journal_error(j) && ++ test_bit(JOURNAL_NOT_EMPTY, &j->flags)); ++ ++ cancel_delayed_work_sync(&j->write_work); ++ cancel_delayed_work_sync(&j->reclaim_work); ++} ++ ++int bch2_fs_journal_start(struct journal *j, u64 cur_seq, ++ struct list_head *journal_entries) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ struct journal_replay *i; ++ u64 last_seq = cur_seq, nr, seq; ++ ++ if (!list_empty(journal_entries)) ++ last_seq = le64_to_cpu(list_last_entry(journal_entries, ++ struct journal_replay, list)->j.last_seq); ++ ++ nr = cur_seq - last_seq; ++ ++ if (nr + 1 > j->pin.size) { ++ free_fifo(&j->pin); ++ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); ++ if (!j->pin.data) { ++ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); ++ return -ENOMEM; ++ } ++ } ++ ++ j->replay_journal_seq = last_seq; ++ j->replay_journal_seq_end = cur_seq; ++ j->last_seq_ondisk = last_seq; ++ j->pin.front = last_seq; ++ j->pin.back = cur_seq; ++ atomic64_set(&j->seq, cur_seq - 1); ++ ++ fifo_for_each_entry_ptr(p, &j->pin, seq) { ++ INIT_LIST_HEAD(&p->list); ++ INIT_LIST_HEAD(&p->flushed); ++ atomic_set(&p->count, 1); ++ p->devs.nr = 0; ++ } ++ ++ list_for_each_entry(i, journal_entries, list) { ++ seq = le64_to_cpu(i->j.seq); ++ BUG_ON(seq >= cur_seq); ++ ++ if (seq < last_seq) ++ continue; ++ ++ journal_seq_pin(j, seq)->devs = i->devs; ++ } ++ ++ spin_lock(&j->lock); ++ ++ set_bit(JOURNAL_STARTED, &j->flags); ++ ++ journal_pin_new_entry(j, 1); ++ bch2_journal_buf_init(j); ++ ++ c->last_bucket_seq_cleanup = journal_cur_seq(j); ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ return 0; ++} ++ ++/* init/exit: */ ++ ++void bch2_dev_journal_exit(struct bch_dev *ca) ++{ ++ kfree(ca->journal.bio); ++ kfree(ca->journal.buckets); ++ kfree(ca->journal.bucket_seq); ++ ++ ca->journal.bio = NULL; ++ ca->journal.buckets = NULL; ++ ca->journal.bucket_seq = NULL; ++} ++ ++int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(sb); ++ unsigned i; ++ ++ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ ++ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->bucket_seq) ++ return -ENOMEM; ++ ++ ca->journal.bio = bio_kmalloc(GFP_KERNEL, ++ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); ++ if (!ca->journal.bio) ++ return -ENOMEM; ++ ++ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->buckets) ++ return -ENOMEM; ++ ++ for (i = 0; i < ja->nr; i++) ++ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ ++ return 0; ++} ++ ++void bch2_fs_journal_exit(struct journal *j) ++{ ++ kvpfree(j->buf[1].data, j->buf[1].buf_size); ++ kvpfree(j->buf[0].data, j->buf[0].buf_size); ++ free_fifo(&j->pin); ++} ++ ++int bch2_fs_journal_init(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ static struct lock_class_key res_key; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ spin_lock_init(&j->lock); ++ spin_lock_init(&j->err_lock); ++ init_waitqueue_head(&j->wait); ++ INIT_DELAYED_WORK(&j->write_work, journal_write_work); ++ INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); ++ 
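The pin fifo that bch2_fs_journal_init() sets up just below is the "fifo of refcounts" described in the journal.h comment further down: each open journal entry gets a count, journal_last_seq() is the sequence number at the front of the fifo, and the front only advances once the oldest entries have dropped to zero. A minimal userspace sketch of that bookkeeping, using invented names and a fixed-size ring in place of the kernel fifo:

#include <stdio.h>

#define RING 8				/* illustrative capacity */

static unsigned counts[RING];		/* one refcount per journal sequence number */
static unsigned long front = 1, back = 1;	/* [front, back) are the dirty entries */

static void open_entry(void)	{ counts[back++ % RING] = 1; }

static unsigned long last_seq(void) { return front; }		/* oldest dirty entry */
static unsigned long cur_seq(void)  { return back - 1; }

/* drop the pin on @seq; advance last_seq past fully released entries */
static void put_pin(unsigned long seq)
{
	if (--counts[seq % RING] == 0)
		while (front < back && counts[front % RING] == 0)
			front++;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		open_entry();			/* seqs 1, 2, 3 are dirty */

	printf("last_seq %lu cur_seq %lu\n", last_seq(), cur_seq());	/* 1, 3 */

	put_pin(2);				/* middle entry released first */
	printf("last_seq %lu\n", last_seq());	/* still 1 */

	put_pin(1);				/* now 1 and 2 are both clean */
	printf("last_seq %lu\n", last_seq());	/* 3 */
	return 0;
}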
init_waitqueue_head(&j->pin_flush_wait); ++ mutex_init(&j->reclaim_lock); ++ mutex_init(&j->discard_lock); ++ ++ lockdep_init_map(&j->res_map, "journal res", &res_key, 0); ++ ++ j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->write_delay_ms = 1000; ++ j->reclaim_delay_ms = 100; ++ ++ /* Btree roots: */ ++ j->entry_u64s_reserved += ++ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); ++ ++ atomic64_set(&j->reservations.counter, ++ ((union journal_res_state) ++ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); ++ ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || ++ !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || ++ !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ j->pin.front = j->pin.back = 1; ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++/* debug: */ ++ ++void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ union journal_res_state s; ++ struct bch_dev *ca; ++ unsigned iter; ++ ++ rcu_read_lock(); ++ spin_lock(&j->lock); ++ s = READ_ONCE(j->reservations); ++ ++ pr_buf(out, ++ "active journal entries:\t%llu\n" ++ "seq:\t\t\t%llu\n" ++ "last_seq:\t\t%llu\n" ++ "last_seq_ondisk:\t%llu\n" ++ "prereserved:\t\t%u/%u\n" ++ "current entry sectors:\t%u\n" ++ "current entry:\t\t", ++ fifo_used(&j->pin), ++ journal_cur_seq(j), ++ journal_last_seq(j), ++ j->last_seq_ondisk, ++ j->prereserved.reserved, ++ j->prereserved.remaining, ++ j->cur_entry_sectors); ++ ++ switch (s.cur_entry_offset) { ++ case JOURNAL_ENTRY_ERROR_VAL: ++ pr_buf(out, "error\n"); ++ break; ++ case JOURNAL_ENTRY_CLOSED_VAL: ++ pr_buf(out, "closed\n"); ++ break; ++ default: ++ pr_buf(out, "%u/%u\n", ++ s.cur_entry_offset, ++ j->cur_entry_u64s); ++ break; ++ } ++ ++ pr_buf(out, ++ "current entry refs:\t%u\n" ++ "prev entry unwritten:\t", ++ journal_state_count(s, s.idx)); ++ ++ if (s.prev_buf_unwritten) ++ pr_buf(out, "yes, ref %u sectors %u\n", ++ journal_state_count(s, !s.idx), ++ journal_prev_buf(j)->sectors); ++ else ++ pr_buf(out, "no\n"); ++ ++ pr_buf(out, ++ "need write:\t\t%i\n" ++ "replay done:\t\t%i\n", ++ test_bit(JOURNAL_NEED_WRITE, &j->flags), ++ test_bit(JOURNAL_REPLAY_DONE, &j->flags)); ++ ++ for_each_member_device_rcu(ca, c, iter, ++ &c->rw_devs[BCH_DATA_journal]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!ja->nr) ++ continue; ++ ++ pr_buf(out, ++ "dev %u:\n" ++ "\tnr\t\t%u\n" ++ "\tavailable\t%u:%u\n" ++ "\tdiscard_idx\t\t%u\n" ++ "\tdirty_idx_ondisk\t%u (seq %llu)\n" ++ "\tdirty_idx\t\t%u (seq %llu)\n" ++ "\tcur_idx\t\t%u (seq %llu)\n", ++ iter, ja->nr, ++ bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ++ ja->sectors_free, ++ ja->discard_idx, ++ ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], ++ ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], ++ ja->cur_idx, ja->bucket_seq[ja->cur_idx]); ++ } ++ ++ spin_unlock(&j->lock); ++ rcu_read_unlock(); ++} ++ ++void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *pin; ++ u64 i; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(pin_list, &j->pin, i) { ++ pr_buf(out, "%llu: count %u\n", ++ i, atomic_read(&pin_list->count)); ++ ++ list_for_each_entry(pin, &pin_list->list, list) ++ pr_buf(out, "\t%px %ps\n", ++ pin, pin->flush); ++ ++ if (!list_empty(&pin_list->flushed)) ++ 
pr_buf(out, "flushed:\n"); ++ ++ list_for_each_entry(pin, &pin_list->flushed, list) ++ pr_buf(out, "\t%px %ps\n", ++ pin, pin->flush); ++ } ++ spin_unlock(&j->lock); ++} +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +new file mode 100644 +index 000000000000..56438840efd7 +--- /dev/null ++++ b/fs/bcachefs/journal.h +@@ -0,0 +1,519 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_H ++#define _BCACHEFS_JOURNAL_H ++ ++/* ++ * THE JOURNAL: ++ * ++ * The primary purpose of the journal is to log updates (insertions) to the ++ * b-tree, to avoid having to do synchronous updates to the b-tree on disk. ++ * ++ * Without the journal, the b-tree is always internally consistent on ++ * disk - and in fact, in the earliest incarnations bcache didn't have a journal ++ * but did handle unclean shutdowns by doing all index updates synchronously ++ * (with coalescing). ++ * ++ * Updates to interior nodes still happen synchronously and without the journal ++ * (for simplicity) - this may change eventually but updates to interior nodes ++ * are rare enough it's not a huge priority. ++ * ++ * This means the journal is relatively separate from the b-tree; it consists of ++ * just a list of keys and journal replay consists of just redoing those ++ * insertions in same order that they appear in the journal. ++ * ++ * PERSISTENCE: ++ * ++ * For synchronous updates (where we're waiting on the index update to hit ++ * disk), the journal entry will be written out immediately (or as soon as ++ * possible, if the write for the previous journal entry was still in flight). ++ * ++ * Synchronous updates are specified by passing a closure (@flush_cl) to ++ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter ++ * down to the journalling code. That closure will will wait on the journal ++ * write to complete (via closure_wait()). ++ * ++ * If the index update wasn't synchronous, the journal entry will be ++ * written out after 10 ms have elapsed, by default (the delay_ms field ++ * in struct journal). ++ * ++ * JOURNAL ENTRIES: ++ * ++ * A journal entry is variable size (struct jset), it's got a fixed length ++ * header and then a variable number of struct jset_entry entries. ++ * ++ * Journal entries are identified by monotonically increasing 64 bit sequence ++ * numbers - jset->seq; other places in the code refer to this sequence number. ++ * ++ * A jset_entry entry contains one or more bkeys (which is what gets inserted ++ * into the b-tree). We need a container to indicate which b-tree the key is ++ * for; also, the roots of the various b-trees are stored in jset_entry entries ++ * (one for each b-tree) - this lets us add new b-tree types without changing ++ * the on disk format. ++ * ++ * We also keep some things in the journal header that are logically part of the ++ * superblock - all the things that are frequently updated. This is for future ++ * bcache on raw flash support; the superblock (which will become another ++ * journal) can't be moved or wear leveled, so it contains just enough ++ * information to find the main journal, and the superblock only has to be ++ * rewritten when we want to move/wear level the main journal. 
++ * ++ * JOURNAL LAYOUT ON DISK: ++ * ++ * The journal is written to a ringbuffer of buckets (which is kept in the ++ * superblock); the individual buckets are not necessarily contiguous on disk ++ * which means that journal entries are not allowed to span buckets, but also ++ * that we can resize the journal at runtime if desired (unimplemented). ++ * ++ * The journal buckets exist in the same pool as all the other buckets that are ++ * managed by the allocator and garbage collection - garbage collection marks ++ * the journal buckets as metadata buckets. ++ * ++ * OPEN/DIRTY JOURNAL ENTRIES: ++ * ++ * Open/dirty journal entries are journal entries that contain b-tree updates ++ * that have not yet been written out to the b-tree on disk. We have to track ++ * which journal entries are dirty, and we also have to avoid wrapping around ++ * the journal and overwriting old but still dirty journal entries with new ++ * journal entries. ++ * ++ * On disk, this is represented with the "last_seq" field of struct jset; ++ * last_seq is the first sequence number that journal replay has to replay. ++ * ++ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in ++ * journal_device->seq) of for each journal bucket, the highest sequence number ++ * any journal entry it contains. Then, by comparing that against last_seq we ++ * can determine whether that journal bucket contains dirty journal entries or ++ * not. ++ * ++ * To track which journal entries are dirty, we maintain a fifo of refcounts ++ * (where each entry corresponds to a specific sequence number) - when a ref ++ * goes to 0, that journal entry is no longer dirty. ++ * ++ * Journalling of index updates is done at the same time as the b-tree itself is ++ * being modified (see btree_insert_key()); when we add the key to the journal ++ * the pending b-tree write takes a ref on the journal entry the key was added ++ * to. If a pending b-tree write would need to take refs on multiple dirty ++ * journal entries, it only keeps the ref on the oldest one (since a newer ++ * journal entry will still be replayed if an older entry was dirty). ++ * ++ * JOURNAL FILLING UP: ++ * ++ * There are two ways the journal could fill up; either we could run out of ++ * space to write to, or we could have too many open journal entries and run out ++ * of room in the fifo of refcounts. Since those refcounts are decremented ++ * without any locking we can't safely resize that fifo, so we handle it the ++ * same way. ++ * ++ * If the journal fills up, we start flushing dirty btree nodes until we can ++ * allocate space for a journal write again - preferentially flushing btree ++ * nodes that are pinning the oldest journal entries first. 
++ */ ++ ++#include ++ ++#include "journal_types.h" ++ ++struct bch_fs; ++ ++static inline void journal_wake(struct journal *j) ++{ ++ wake_up(&j->wait); ++ closure_wake_up(&j->async_wait); ++ closure_wake_up(&j->preres_wait); ++} ++ ++static inline struct journal_buf *journal_cur_buf(struct journal *j) ++{ ++ return j->buf + j->reservations.idx; ++} ++ ++static inline struct journal_buf *journal_prev_buf(struct journal *j) ++{ ++ return j->buf + !j->reservations.idx; ++} ++ ++/* Sequence number of oldest dirty journal entry */ ++ ++static inline u64 journal_last_seq(struct journal *j) ++{ ++ return j->pin.front; ++} ++ ++static inline u64 journal_cur_seq(struct journal *j) ++{ ++ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); ++ ++ return j->pin.back - 1; ++} ++ ++u64 bch2_inode_journal_seq(struct journal *, u64); ++ ++static inline int journal_state_count(union journal_res_state s, int idx) ++{ ++ return idx == 0 ? s.buf0_count : s.buf1_count; ++} ++ ++static inline void journal_state_inc(union journal_res_state *s) ++{ ++ s->buf0_count += s->idx == 0; ++ s->buf1_count += s->idx == 1; ++} ++ ++static inline void bch2_journal_set_has_inode(struct journal *j, ++ struct journal_res *res, ++ u64 inum) ++{ ++ struct journal_buf *buf = &j->buf[res->idx]; ++ unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); ++ ++ /* avoid atomic op if possible */ ++ if (unlikely(!test_bit(bit, buf->has_inode))) ++ set_bit(bit, buf->has_inode); ++} ++ ++/* ++ * Amount of space that will be taken up by some keys in the journal (i.e. ++ * including the jset header) ++ */ ++static inline unsigned jset_u64s(unsigned u64s) ++{ ++ return u64s + sizeof(struct jset_entry) / sizeof(u64); ++} ++ ++static inline int journal_entry_overhead(struct journal *j) ++{ ++ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; ++} ++ ++static inline struct jset_entry * ++bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) ++{ ++ struct jset *jset = buf->data; ++ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); ++ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ ++ le32_add_cpu(&jset->u64s, jset_u64s(u64s)); ++ ++ return entry; ++} ++ ++static inline struct jset_entry * ++journal_res_entry(struct journal *j, struct journal_res *res) ++{ ++ return vstruct_idx(j->buf[res->idx].data, res->offset); ++} ++ ++static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, ++ enum btree_id id, unsigned level, ++ const void *data, unsigned u64s) ++{ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ entry->type = type; ++ entry->btree_id = id; ++ entry->level = level; ++ memcpy_u64s_small(entry->_data, data, u64s); ++ ++ return jset_u64s(u64s); ++} ++ ++static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, ++ unsigned type, enum btree_id id, ++ unsigned level, ++ const void *data, unsigned u64s) ++{ ++ unsigned actual = journal_entry_set(journal_res_entry(j, res), ++ type, id, level, data, u64s); ++ ++ EBUG_ON(!res->ref); ++ EBUG_ON(actual > res->u64s); ++ ++ res->offset += actual; ++ res->u64s -= actual; ++} ++ ++static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, ++ enum btree_id id, const struct bkey_i *k) ++{ ++ bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, ++ id, 0, k, k->k.u64s); ++} ++ ++static inline bool journal_entry_empty(struct jset *j) ++{ ++ struct jset_entry *i; ++ ++ if (j->seq != j->last_seq) ++ 
return false; ++ ++ vstruct_for_each(j, i) ++ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) ++ return false; ++ return true; ++} ++ ++void __bch2_journal_buf_put(struct journal *, bool); ++ ++static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, ++ bool need_write_just_set) ++{ ++ union journal_res_state s; ++ ++ s.v = atomic64_sub_return(((union journal_res_state) { ++ .buf0_count = idx == 0, ++ .buf1_count = idx == 1, ++ }).v, &j->reservations.counter); ++ if (!journal_state_count(s, idx)) { ++ EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); ++ __bch2_journal_buf_put(j, need_write_just_set); ++ } ++} ++ ++/* ++ * This function releases the journal write structure so other threads can ++ * then proceed to add their keys as well. ++ */ ++static inline void bch2_journal_res_put(struct journal *j, ++ struct journal_res *res) ++{ ++ if (!res->ref) ++ return; ++ ++ lock_release(&j->res_map, _THIS_IP_); ++ ++ while (res->u64s) ++ bch2_journal_add_entry(j, res, ++ BCH_JSET_ENTRY_btree_keys, ++ 0, 0, NULL, 0); ++ ++ bch2_journal_buf_put(j, res->idx, false); ++ ++ res->ref = 0; ++} ++ ++int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, ++ unsigned); ++ ++#define JOURNAL_RES_GET_NONBLOCK (1 << 0) ++#define JOURNAL_RES_GET_CHECK (1 << 1) ++#define JOURNAL_RES_GET_RESERVED (1 << 2) ++#define JOURNAL_RES_GET_RECLAIM (1 << 3) ++ ++static inline int journal_res_get_fast(struct journal *j, ++ struct journal_res *res, ++ unsigned flags) ++{ ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ ++ do { ++ old.v = new.v = v; ++ ++ /* ++ * Check if there is still room in the current journal ++ * entry: ++ */ ++ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) ++ return 0; ++ ++ EBUG_ON(!journal_state_count(new, new.idx)); ++ ++ if (!(flags & JOURNAL_RES_GET_RESERVED) && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_CHECK) ++ return 1; ++ ++ new.cur_entry_offset += res->u64s; ++ journal_state_inc(&new); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ res->ref = true; ++ res->idx = old.idx; ++ res->offset = old.cur_entry_offset; ++ res->seq = le64_to_cpu(j->buf[old.idx].data->seq); ++ return 1; ++} ++ ++static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned u64s, unsigned flags) ++{ ++ int ret; ++ ++ EBUG_ON(res->ref); ++ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ ++ res->u64s = u64s; ++ ++ if (journal_res_get_fast(j, res, flags)) ++ goto out; ++ ++ ret = bch2_journal_res_get_slowpath(j, res, flags); ++ if (ret) ++ return ret; ++out: ++ if (!(flags & JOURNAL_RES_GET_CHECK)) { ++ lock_acquire_shared(&j->res_map, 0, ++ (flags & JOURNAL_RES_GET_NONBLOCK) != 0, ++ NULL, _THIS_IP_); ++ EBUG_ON(!res->ref); ++ } ++ return 0; ++} ++ ++/* journal_preres: */ ++ ++static inline bool journal_check_may_get_unreserved(struct journal *j) ++{ ++ union journal_preres_state s = READ_ONCE(j->prereserved); ++ bool ret = s.reserved <= s.remaining && ++ fifo_free(&j->pin) > 8; ++ ++ lockdep_assert_held(&j->lock); ++ ++ if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ if (ret) { ++ set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); ++ journal_wake(j); ++ } else { ++ clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); ++ } ++ } ++ return ret; ++} ++ ++static inline void bch2_journal_preres_put(struct journal *j, ++ struct journal_preres *res) ++{ ++ union journal_preres_state s = { 
.reserved = res->u64s }; ++ ++ if (!res->u64s) ++ return; ++ ++ s.v = atomic64_sub_return(s.v, &j->prereserved.counter); ++ res->u64s = 0; ++ closure_wake_up(&j->preres_wait); ++ ++ if (s.reserved <= s.remaining && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ spin_lock(&j->lock); ++ journal_check_may_get_unreserved(j); ++ spin_unlock(&j->lock); ++ } ++} ++ ++int __bch2_journal_preres_get(struct journal *, ++ struct journal_preres *, unsigned, unsigned); ++ ++static inline int bch2_journal_preres_get_fast(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ int d = new_u64s - res->u64s; ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ ++ do { ++ old.v = new.v = v; ++ ++ new.reserved += d; ++ ++ /* ++ * If we're being called from the journal reclaim path, we have ++ * to unconditionally give out the pre-reservation, there's ++ * nothing else sensible we can do - otherwise we'd recurse back ++ * into the reclaim path and deadlock: ++ */ ++ ++ if (!(flags & JOURNAL_RES_GET_RECLAIM) && ++ new.reserved > new.remaining) ++ return 0; ++ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++ ++ res->u64s += d; ++ return 1; ++} ++ ++static inline int bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ if (new_u64s <= res->u64s) ++ return 0; ++ ++ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_NONBLOCK) ++ return -EAGAIN; ++ ++ return __bch2_journal_preres_get(j, res, new_u64s, flags); ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *, ++ struct journal_entry_res *, ++ unsigned); ++ ++u64 bch2_journal_last_unwritten_seq(struct journal *); ++int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); ++ ++void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); ++void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); ++void bch2_journal_flush_async(struct journal *, struct closure *); ++void bch2_journal_meta_async(struct journal *, struct closure *); ++ ++int bch2_journal_flush_seq(struct journal *, u64); ++int bch2_journal_flush(struct journal *); ++int bch2_journal_meta(struct journal *); ++ ++void bch2_journal_halt(struct journal *); ++ ++static inline int bch2_journal_error(struct journal *j) ++{ ++ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ++ ? 
-EIO : 0; ++} ++ ++struct bch_dev; ++ ++static inline bool journal_flushes_device(struct bch_dev *ca) ++{ ++ return true; ++} ++ ++static inline void bch2_journal_set_replay_done(struct journal *j) ++{ ++ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ set_bit(JOURNAL_REPLAY_DONE, &j->flags); ++} ++ ++void bch2_journal_unblock(struct journal *); ++void bch2_journal_block(struct journal *); ++ ++void bch2_journal_debug_to_text(struct printbuf *, struct journal *); ++void bch2_journal_pins_to_text(struct printbuf *, struct journal *); ++ ++int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, ++ unsigned nr); ++int bch2_dev_journal_alloc(struct bch_dev *); ++ ++void bch2_dev_journal_stop(struct journal *, struct bch_dev *); ++ ++void bch2_fs_journal_stop(struct journal *); ++int bch2_fs_journal_start(struct journal *, u64, struct list_head *); ++ ++void bch2_dev_journal_exit(struct bch_dev *); ++int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); ++void bch2_fs_journal_exit(struct journal *); ++int bch2_fs_journal_init(struct journal *); ++ ++#endif /* _BCACHEFS_JOURNAL_H */ +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +new file mode 100644 +index 000000000000..bd0e6b371701 +--- /dev/null ++++ b/fs/bcachefs/journal_io.c +@@ -0,0 +1,1183 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_io.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++ ++#include ++ ++struct journal_list { ++ struct closure cl; ++ struct mutex lock; ++ struct list_head *head; ++ int ret; ++}; ++ ++#define JOURNAL_ENTRY_ADD_OK 0 ++#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 ++ ++/* ++ * Given a journal entry we just read, add it to the list of journal entries to ++ * be replayed: ++ */ ++static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, ++ struct journal_list *jlist, struct jset *j, ++ bool bad) ++{ ++ struct journal_replay *i, *pos; ++ struct bch_devs_list devs = { .nr = 0 }; ++ struct list_head *where; ++ size_t bytes = vstruct_bytes(j); ++ __le64 last_seq; ++ int ret; ++ ++ last_seq = !list_empty(jlist->head) ++ ? list_last_entry(jlist->head, struct journal_replay, ++ list)->j.last_seq ++ : 0; ++ ++ if (!c->opts.read_entire_journal) { ++ /* Is this entry older than the range we need? */ ++ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { ++ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; ++ goto out; ++ } ++ ++ /* Drop entries we don't need anymore */ ++ list_for_each_entry_safe(i, pos, jlist->head, list) { ++ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) ++ break; ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } ++ } ++ ++ list_for_each_entry_reverse(i, jlist->head, list) { ++ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { ++ where = &i->list; ++ goto add; ++ } ++ } ++ ++ where = jlist->head; ++add: ++ i = where->next != jlist->head ++ ? container_of(where->next, struct journal_replay, list) ++ : NULL; ++ ++ /* ++ * Duplicate journal entries? 
If so we want the one that didn't have a ++ * checksum error: ++ */ ++ if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { ++ if (i->bad) { ++ devs = i->devs; ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } else if (bad) { ++ goto found; ++ } else { ++ fsck_err_on(bytes != vstruct_bytes(&i->j) || ++ memcmp(j, &i->j, bytes), c, ++ "found duplicate but non identical journal entries (seq %llu)", ++ le64_to_cpu(j->seq)); ++ goto found; ++ } ++ ++ } ++ ++ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); ++ if (!i) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_add(&i->list, where); ++ i->devs = devs; ++ i->bad = bad; ++ memcpy(&i->j, j, bytes); ++found: ++ if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) ++ bch2_dev_list_add_dev(&i->devs, ca->dev_idx); ++ else ++ fsck_err_on(1, c, "duplicate journal entries on same device"); ++ ret = JOURNAL_ENTRY_ADD_OK; ++out: ++fsck_err: ++ return ret; ++} ++ ++static struct nonce journal_nonce(const struct jset *jset) ++{ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = ((__le32 *) &jset->seq)[0], ++ [2] = ((__le32 *) &jset->seq)[1], ++ [3] = BCH_NONCE_JOURNAL, ++ }}; ++} ++ ++/* this fills in a range with empty jset_entries: */ ++static void journal_entry_null_range(void *start, void *end) ++{ ++ struct jset_entry *entry; ++ ++ for (entry = start; entry != end; entry = vstruct_next(entry)) ++ memset(entry, 0, sizeof(*entry)); ++} ++ ++#define JOURNAL_ENTRY_REREAD 5 ++#define JOURNAL_ENTRY_NONE 6 ++#define JOURNAL_ENTRY_BAD 7 ++ ++#define journal_entry_err(c, msg, ...) \ ++({ \ ++ switch (write) { \ ++ case READ: \ ++ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write:\n" \ ++ msg, ##__VA_ARGS__); \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++ true; \ ++}) ++ ++#define journal_entry_err_on(cond, c, msg, ...) \ ++ ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) ++ ++static int journal_validate_key(struct bch_fs *c, struct jset *jset, ++ struct jset_entry *entry, ++ unsigned level, enum btree_id btree_id, ++ struct bkey_i *k, ++ const char *type, int write) ++{ ++ void *next = vstruct_next(entry); ++ const char *invalid; ++ unsigned version = le32_to_cpu(jset->version); ++ int ret = 0; ++ ++ if (journal_entry_err_on(!k->k.u64s, c, ++ "invalid %s in journal: k->u64s 0", type)) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (journal_entry_err_on((void *) bkey_next(k) > ++ (void *) vstruct_next(entry), c, ++ "invalid %s in journal: extends past end of journal entry", ++ type)) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, ++ "invalid %s in journal: bad format %u", ++ type, k->k.format)) { ++ le16_add_cpu(&entry->u64s, -k->k.u64s); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (!write) ++ bch2_bkey_compat(level, btree_id, version, ++ JSET_BIG_ENDIAN(jset), write, ++ NULL, bkey_to_packed(k)); ++ ++ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), ++ __btree_node_type(level, btree_id)); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); ++ mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", ++ type, invalid, buf); ++ ++ le16_add_cpu(&entry->u64s, -k->k.u64s); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (write) ++ bch2_bkey_compat(level, btree_id, version, ++ JSET_BIG_ENDIAN(jset), write, ++ NULL, bkey_to_packed(k)); ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_btree_keys(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct bkey_i *k; ++ ++ vstruct_for_each(entry, k) { ++ int ret = journal_validate_key(c, jset, entry, ++ entry->level, ++ entry->btree_id, ++ k, "key", write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int journal_entry_validate_btree_root(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct bkey_i *k = entry->start; ++ int ret = 0; ++ ++ if (journal_entry_err_on(!entry->u64s || ++ le16_to_cpu(entry->u64s) != k->k.u64s, c, ++ "invalid btree root journal entry: wrong number of keys")) { ++ void *next = vstruct_next(entry); ++ /* ++ * we don't want to null out this jset_entry, ++ * just the contents, so that later we can tell ++ * we were _supposed_ to have a btree root ++ */ ++ entry->u64s = 0; ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, ++ "btree root", write); ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_prio_ptrs(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ /* obsolete, don't care: */ ++ return 0; ++} ++ ++static int journal_entry_validate_blacklist(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, 
vstruct_next(entry)); ++ } ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_blacklist_v2(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_blacklist_v2 *bl_entry; ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ goto out; ++ } ++ ++ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > ++ le64_to_cpu(bl_entry->end), c, ++ "invalid journal seq blacklist entry: start > end")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ } ++out: ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_usage(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < sizeof(*u), ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_data_usage(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < sizeof(*u) || ++ bytes < sizeof(*u) + u->r.nr_devs, ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++struct jset_entry_ops { ++ int (*validate)(struct bch_fs *, struct jset *, ++ struct jset_entry *, int); ++}; ++ ++static const struct jset_entry_ops bch2_jset_entry_ops[] = { ++#define x(f, nr) \ ++ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ ++ .validate = journal_entry_validate_##f, \ ++ }, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++}; ++ ++static int journal_entry_validate(struct bch_fs *c, struct jset *jset, ++ struct jset_entry *entry, int write) ++{ ++ return entry->type < BCH_JSET_ENTRY_NR ++ ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, ++ entry, write) ++ : 0; ++} ++ ++static int jset_validate_entries(struct bch_fs *c, struct jset *jset, ++ int write) ++{ ++ struct jset_entry *entry; ++ int ret = 0; ++ ++ vstruct_for_each(jset, entry) { ++ if (journal_entry_err_on(vstruct_next(entry) > ++ vstruct_last(jset), c, ++ "journal entry extends past end of jset")) { ++ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); ++ break; ++ } ++ ++ ret = journal_entry_validate(c, jset, entry, write); ++ if (ret) ++ break; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int jset_validate(struct bch_fs *c, ++ struct bch_dev *ca, ++ struct jset *jset, u64 sector, ++ unsigned bucket_sectors_left, ++ unsigned sectors_read, ++ int write) ++{ ++ size_t bytes = vstruct_bytes(jset); ++ struct bch_csum csum; ++ unsigned version; ++ int ret = 0; ++ ++ if (le64_to_cpu(jset->magic) != jset_magic(c)) ++ return JOURNAL_ENTRY_NONE; ++ ++ version = le32_to_cpu(jset->version); ++ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, c, ++ "%s sector %llu seq %llu: unknown journal entry version %u", ++ ca->name, sector, le64_to_cpu(jset->seq), ++ version)) { ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, ++ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ++ ca->name, sector, le64_to_cpu(jset->seq), bytes)) { ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ if (bytes > sectors_read << 9) ++ return JOURNAL_ENTRY_REREAD; ++ ++ if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, ++ "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ++ ca->name, sector, le64_to_cpu(jset->seq), ++ JSET_CSUM_TYPE(jset))) ++ return JOURNAL_ENTRY_BAD; ++ ++ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); ++ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, ++ "%s sector %llu seq %llu: journal checksum bad", ++ ca->name, sector, le64_to_cpu(jset->seq))) { ++ /* XXX: retry IO, when we start retrying checksum errors */ ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ ++ if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, ++ "invalid journal entry: last_seq > seq")) { ++ jset->last_seq = jset->seq; ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ return 0; ++fsck_err: ++ return ret; ++} ++ ++struct journal_read_buf { ++ void *data; ++ size_t size; ++}; ++ ++static int journal_read_buf_realloc(struct journal_read_buf *b, ++ size_t new_size) ++{ ++ void *n; ++ ++ /* the bios are sized for this many pages, max: */ ++ if (new_size > JOURNAL_ENTRY_SIZE_MAX) ++ return -ENOMEM; ++ ++ new_size = roundup_pow_of_two(new_size); ++ n = kvpmalloc(new_size, GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ kvpfree(b->data, b->size); ++ b->data = n; ++ b->size = new_size; ++ return 0; ++} ++ ++static int journal_read_bucket(struct bch_dev *ca, ++ struct journal_read_buf *buf, ++ struct journal_list *jlist, ++ unsigned bucket) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ struct jset *j = NULL; ++ unsigned sectors, sectors_read = 0; ++ u64 offset = 
bucket_to_sector(ca, ja->buckets[bucket]), ++ end = offset + ca->mi.bucket_size; ++ bool saw_bad = false; ++ int ret = 0; ++ ++ pr_debug("reading %u", bucket); ++ ++ while (offset < end) { ++ if (!sectors_read) { ++ struct bio *bio; ++reread: ++ sectors_read = min_t(unsigned, ++ end - offset, buf->size >> 9); ++ ++ bio = bio_kmalloc(GFP_KERNEL, ++ buf_pages(buf->data, ++ sectors_read << 9)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = offset; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch2_bio_map(bio, buf->data, sectors_read << 9); ++ ++ ret = submit_bio_wait(bio); ++ bio_put(bio); ++ ++ if (bch2_dev_io_err_on(ret, ca, ++ "journal read from sector %llu", ++ offset) || ++ bch2_meta_read_fault("journal")) ++ return -EIO; ++ ++ j = buf->data; ++ } ++ ++ ret = jset_validate(c, ca, j, offset, ++ end - offset, sectors_read, ++ READ); ++ switch (ret) { ++ case BCH_FSCK_OK: ++ sectors = vstruct_sectors(j, c->block_bits); ++ break; ++ case JOURNAL_ENTRY_REREAD: ++ if (vstruct_bytes(j) > buf->size) { ++ ret = journal_read_buf_realloc(buf, ++ vstruct_bytes(j)); ++ if (ret) ++ return ret; ++ } ++ goto reread; ++ case JOURNAL_ENTRY_NONE: ++ if (!saw_bad) ++ return 0; ++ sectors = c->opts.block_size; ++ goto next_block; ++ case JOURNAL_ENTRY_BAD: ++ saw_bad = true; ++ /* ++ * On checksum error we don't really trust the size ++ * field of the journal entry we read, so try reading ++ * again at next block boundary: ++ */ ++ sectors = c->opts.block_size; ++ break; ++ default: ++ return ret; ++ } ++ ++ /* ++ * This happens sometimes if we don't have discards on - ++ * when we've partially overwritten a bucket with new ++ * journal entries. We don't need the rest of the ++ * bucket: ++ */ ++ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) ++ return 0; ++ ++ ja->bucket_seq[bucket] = le64_to_cpu(j->seq); ++ ++ mutex_lock(&jlist->lock); ++ ret = journal_entry_add(c, ca, jlist, j, ret != 0); ++ mutex_unlock(&jlist->lock); ++ ++ switch (ret) { ++ case JOURNAL_ENTRY_ADD_OK: ++ break; ++ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: ++ break; ++ default: ++ return ret; ++ } ++next_block: ++ pr_debug("next"); ++ offset += sectors; ++ sectors_read -= sectors; ++ j = ((void *) j) + (sectors << 9); ++ } ++ ++ return 0; ++} ++ ++static void bch2_journal_read_device(struct closure *cl) ++{ ++ struct journal_device *ja = ++ container_of(cl, struct journal_device, read); ++ struct bch_dev *ca = container_of(ja, struct bch_dev, journal); ++ struct journal_list *jlist = ++ container_of(cl->parent, struct journal_list, cl); ++ struct journal_read_buf buf = { NULL, 0 }; ++ u64 min_seq = U64_MAX; ++ unsigned i; ++ int ret; ++ ++ if (!ja->nr) ++ goto out; ++ ++ ret = journal_read_buf_realloc(&buf, PAGE_SIZE); ++ if (ret) ++ goto err; ++ ++ pr_debug("%u journal buckets", ja->nr); ++ ++ for (i = 0; i < ja->nr; i++) { ++ ret = journal_read_bucket(ca, &buf, jlist, i); ++ if (ret) ++ goto err; ++ } ++ ++ /* Find the journal bucket with the highest sequence number: */ ++ for (i = 0; i < ja->nr; i++) { ++ if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) ++ ja->cur_idx = i; ++ ++ min_seq = min(ja->bucket_seq[i], min_seq); ++ } ++ ++ /* ++ * If there's duplicate journal entries in multiple buckets (which ++ * definitely isn't supposed to happen, but...) 
- make sure to start ++ * cur_idx at the last of those buckets, so we don't deadlock trying to ++ * allocate ++ */ ++ while (ja->bucket_seq[ja->cur_idx] > min_seq && ++ ja->bucket_seq[ja->cur_idx] > ++ ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ++ ja->sectors_free = 0; ++ ++ /* ++ * Set dirty_idx to indicate the entire journal is full and needs to be ++ * reclaimed - journal reclaim will immediately reclaim whatever isn't ++ * pinned when it first runs: ++ */ ++ ja->discard_idx = ja->dirty_idx_ondisk = ++ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; ++out: ++ kvpfree(buf.data, buf.size); ++ percpu_ref_put(&ca->io_ref); ++ closure_return(cl); ++ return; ++err: ++ mutex_lock(&jlist->lock); ++ jlist->ret = ret; ++ mutex_unlock(&jlist->lock); ++ goto out; ++} ++ ++int bch2_journal_read(struct bch_fs *c, struct list_head *list) ++{ ++ struct journal_list jlist; ++ struct journal_replay *i; ++ struct bch_dev *ca; ++ unsigned iter; ++ size_t keys = 0, entries = 0; ++ bool degraded = false; ++ int ret = 0; ++ ++ closure_init_stack(&jlist.cl); ++ mutex_init(&jlist.lock); ++ jlist.head = list; ++ jlist.ret = 0; ++ ++ for_each_member_device(ca, c, iter) { ++ if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) ++ continue; ++ ++ if ((ca->mi.state == BCH_MEMBER_STATE_RW || ++ ca->mi.state == BCH_MEMBER_STATE_RO) && ++ percpu_ref_tryget(&ca->io_ref)) ++ closure_call(&ca->journal.read, ++ bch2_journal_read_device, ++ system_unbound_wq, ++ &jlist.cl); ++ else ++ degraded = true; ++ } ++ ++ closure_sync(&jlist.cl); ++ ++ if (jlist.ret) ++ return jlist.ret; ++ ++ list_for_each_entry(i, list, list) { ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct bch_replicas_padded replicas; ++ char buf[80]; ++ ++ ret = jset_validate_entries(c, &i->j, READ); ++ if (ret) ++ goto fsck_err; ++ ++ /* ++ * If we're mounting in degraded mode - if we didn't read all ++ * the devices - this is wrong: ++ */ ++ ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); ++ ++ if (!degraded && ++ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, ++ "superblock not marked as containing replicas %s", ++ (bch2_replicas_entry_to_text(&PBUF(buf), ++ &replicas.e), buf)))) { ++ ret = bch2_mark_replicas(c, &replicas.e); ++ if (ret) ++ return ret; ++ } ++ ++ for_each_jset_key(k, _n, entry, &i->j) ++ keys++; ++ entries++; ++ } ++ ++ if (!list_empty(list)) { ++ i = list_last_entry(list, struct journal_replay, list); ++ ++ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", ++ keys, entries, le64_to_cpu(i->j.seq)); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* journal write: */ ++ ++static void __journal_write_alloc(struct journal *j, ++ struct journal_buf *w, ++ struct dev_alloc_list *devs_sorted, ++ unsigned sectors, ++ unsigned *replicas, ++ unsigned replicas_want) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (*replicas >= replicas_want) ++ return; ++ ++ for (i = 0; i < devs_sorted->nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ /* ++ * Check that we can use this device, and aren't already using ++ * it: ++ */ ++ if (!ca->mi.durability || ++ ca->mi.state != BCH_MEMBER_STATE_RW || ++ !ja->nr || ++ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), ++ ca->dev_idx) || ++ sectors > 
ja->sectors_free) ++ continue; ++ ++ bch2_dev_stripe_increment(ca, &j->wp.stripe); ++ ++ bch2_bkey_append_ptr(&w->key, ++ (struct bch_extent_ptr) { ++ .offset = bucket_to_sector(ca, ++ ja->buckets[ja->cur_idx]) + ++ ca->mi.bucket_size - ++ ja->sectors_free, ++ .dev = ca->dev_idx, ++ }); ++ ++ ja->sectors_free -= sectors; ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ ++ *replicas += ca->mi.durability; ++ ++ if (*replicas >= replicas_want) ++ break; ++ } ++} ++ ++/** ++ * journal_next_bucket - move on to the next journal bucket if possible ++ */ ++static int journal_write_alloc(struct journal *j, struct journal_buf *w, ++ unsigned sectors) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ struct dev_alloc_list devs_sorted; ++ unsigned i, replicas = 0, replicas_want = ++ READ_ONCE(c->opts.metadata_replicas); ++ ++ rcu_read_lock(); ++ ++ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, ++ &c->rw_devs[BCH_DATA_journal]); ++ ++ __journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++ ++ if (replicas >= replicas_want) ++ goto done; ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ if (sectors > ja->sectors_free && ++ sectors <= ca->mi.bucket_size && ++ bch2_journal_dev_buckets_available(j, ja, ++ journal_space_discarded)) { ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ /* ++ * ja->bucket_seq[ja->cur_idx] must always have ++ * something sensible: ++ */ ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ } ++ } ++ ++ __journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++done: ++ rcu_read_unlock(); ++ ++ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; ++} ++ ++static void journal_write_compact(struct jset *jset) ++{ ++ struct jset_entry *i, *next, *prev = NULL; ++ ++ /* ++ * Simple compaction, dropping empty jset_entries (from journal ++ * reservations that weren't fully used) and merging jset_entries that ++ * can be. ++ * ++ * If we wanted to be really fancy here, we could sort all the keys in ++ * the jset and drop keys that were overwritten - probably not worth it: ++ */ ++ vstruct_for_each_safe(jset, i, next) { ++ unsigned u64s = le16_to_cpu(i->u64s); ++ ++ /* Empty entry: */ ++ if (!u64s) ++ continue; ++ ++ /* Can we merge with previous entry? */ ++ if (prev && ++ i->btree_id == prev->btree_id && ++ i->level == prev->level && ++ i->type == prev->type && ++ i->type == BCH_JSET_ENTRY_btree_keys && ++ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { ++ memmove_u64s_down(vstruct_next(prev), ++ i->_data, ++ u64s); ++ le16_add_cpu(&prev->u64s, u64s); ++ continue; ++ } ++ ++ /* Couldn't merge, move i into new position (after prev): */ ++ prev = prev ? vstruct_next(prev) : jset->start; ++ if (i != prev) ++ memmove_u64s_down(prev, i, jset_u64s(u64s)); ++ } ++ ++ prev = prev ? 
vstruct_next(prev) : jset->start; ++ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); ++} ++ ++static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) ++{ ++ /* we aren't holding j->lock: */ ++ unsigned new_size = READ_ONCE(j->buf_size_want); ++ void *new_buf; ++ ++ if (buf->buf_size >= new_size) ++ return; ++ ++ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); ++ if (!new_buf) ++ return; ++ ++ memcpy(new_buf, buf->data, buf->buf_size); ++ kvpfree(buf->data, buf->buf_size); ++ buf->data = new_buf; ++ buf->buf_size = new_size; ++} ++ ++static void journal_write_done(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *w = journal_prev_buf(j); ++ struct bch_devs_list devs = ++ bch2_bkey_devs(bkey_i_to_s_c(&w->key)); ++ struct bch_replicas_padded replicas; ++ u64 seq = le64_to_cpu(w->data->seq); ++ u64 last_seq = le64_to_cpu(w->data->last_seq); ++ ++ bch2_time_stats_update(j->write_time, j->write_start_time); ++ ++ if (!devs.nr) { ++ bch_err(c, "unable to write journal to sufficient devices"); ++ goto err; ++ } ++ ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); ++ ++ if (bch2_mark_replicas(c, &replicas.e)) ++ goto err; ++ ++ spin_lock(&j->lock); ++ if (seq >= j->pin.front) ++ journal_seq_pin(j, seq)->devs = devs; ++ ++ j->seq_ondisk = seq; ++ j->last_seq_ondisk = last_seq; ++ bch2_journal_space_available(j); ++ ++ /* ++ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard ++ * more buckets: ++ * ++ * Must come before signaling write completion, for ++ * bch2_fs_journal_stop(): ++ */ ++ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); ++out: ++ /* also must come before signalling write completion: */ ++ closure_debug_destroy(cl); ++ ++ BUG_ON(!j->reservations.prev_buf_unwritten); ++ atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, ++ &j->reservations.counter); ++ ++ closure_wake_up(&w->wait); ++ journal_wake(j); ++ ++ if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) ++ mod_delayed_work(system_freezable_wq, &j->write_work, 0); ++ spin_unlock(&j->lock); ++ return; ++err: ++ bch2_fatal_error(c); ++ spin_lock(&j->lock); ++ goto out; ++} ++ ++static void journal_write_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ struct journal *j = &ca->fs->journal; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", ++ bch2_blk_status_to_str(bio->bi_status)) || ++ bch2_meta_write_fault("journal")) { ++ struct journal_buf *w = journal_prev_buf(j); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&j->err_lock, flags); ++ bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); ++ spin_unlock_irqrestore(&j->err_lock, flags); ++ } ++ ++ closure_put(&j->io); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++void bch2_journal_write(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_buf *w = journal_prev_buf(j); ++ struct jset_entry *start, *end; ++ struct jset *jset; ++ struct bio *bio; ++ struct bch_extent_ptr *ptr; ++ bool validate_before_checksum = false; ++ unsigned i, sectors, bytes, u64s; ++ int ret; ++ ++ bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); ++ ++ journal_buf_realloc(j, w); ++ jset = w->data; ++ ++ j->write_start_time = local_clock(); ++ ++ /* ++ * New btree roots are set by journalling them; when the journal entry 
++ * gets written we have to propagate them to c->btree_roots ++ * ++ * But, every journal entry we write has to contain all the btree roots ++ * (at least for now); so after we copy btree roots to c->btree_roots we ++ * have to get any missing btree roots and add them to this journal ++ * entry: ++ */ ++ ++ bch2_journal_entries_to_btree_roots(c, jset); ++ ++ start = end = vstruct_last(jset); ++ ++ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); ++ ++ end = bch2_journal_super_entries_add_common(c, end, ++ le64_to_cpu(jset->seq)); ++ u64s = (u64 *) end - (u64 *) start; ++ BUG_ON(u64s > j->entry_u64s_reserved); ++ ++ le32_add_cpu(&jset->u64s, u64s); ++ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); ++ ++ journal_write_compact(jset); ++ ++ jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); ++ jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); ++ jset->magic = cpu_to_le64(jset_magic(c)); ++ ++ jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ ? cpu_to_le32(BCH_JSET_VERSION_OLD) ++ : cpu_to_le32(c->sb.version); ++ ++ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); ++ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); ++ ++ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) ++ validate_before_checksum = true; ++ ++ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) ++ validate_before_checksum = true; ++ ++ if (validate_before_checksum && ++ jset_validate_entries(c, jset, WRITE)) ++ goto err; ++ ++ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ ++ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), ++ journal_nonce(jset), jset); ++ ++ if (!validate_before_checksum && ++ jset_validate_entries(c, jset, WRITE)) ++ goto err; ++ ++ sectors = vstruct_sectors(jset, c->block_bits); ++ BUG_ON(sectors > w->sectors); ++ ++ bytes = vstruct_bytes(jset); ++ memset((void *) jset + bytes, 0, (sectors << 9) - bytes); ++ ++retry_alloc: ++ spin_lock(&j->lock); ++ ret = journal_write_alloc(j, w, sectors); ++ ++ if (ret && j->can_discard) { ++ spin_unlock(&j->lock); ++ bch2_journal_do_discards(j); ++ goto retry_alloc; ++ } ++ ++ /* ++ * write is allocated, no longer need to account for it in ++ * bch2_journal_space_available(): ++ */ ++ w->sectors = 0; ++ ++ /* ++ * journal entry has been compacted and allocated, recalculate space ++ * available: ++ */ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ if (ret) { ++ bch_err(c, "Unable to allocate journal write"); ++ bch2_fatal_error(c); ++ continue_at(cl, journal_write_done, system_highpri_wq); ++ return; ++ } ++ ++ /* ++ * XXX: we really should just disable the entire journal in nochanges ++ * mode ++ */ ++ if (c->opts.nochanges) ++ goto no_io; ++ ++ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ /* XXX: fix this */ ++ bch_err(c, "missing device for journal write\n"); ++ continue; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], ++ sectors); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = ptr->offset; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_WRITE, ++ REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); ++ bch2_bio_map(bio, jset, sectors << 9); ++ ++ trace_journal_write(bio); ++ closure_bio_submit(bio, cl); ++ ++ 
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); ++ } ++ ++ for_each_rw_member(ca, c, i) ++ if (journal_flushes_device(ca) && ++ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { ++ percpu_ref_get(&ca->io_ref); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_opf = REQ_OP_FLUSH; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ closure_bio_submit(bio, cl); ++ } ++ ++no_io: ++ bch2_bucket_seq_cleanup(c); ++ ++ continue_at(cl, journal_write_done, system_highpri_wq); ++ return; ++err: ++ bch2_inconsistent_error(c); ++ continue_at(cl, journal_write_done, system_highpri_wq); ++} +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +new file mode 100644 +index 000000000000..6958ee0f8cf2 +--- /dev/null ++++ b/fs/bcachefs/journal_io.h +@@ -0,0 +1,44 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_IO_H ++#define _BCACHEFS_JOURNAL_IO_H ++ ++/* ++ * Only used for holding the journal entries we read in btree_journal_read() ++ * during cache_registration ++ */ ++struct journal_replay { ++ struct list_head list; ++ struct bch_devs_list devs; ++ /* checksum error, but we may want to try using it anyways: */ ++ bool bad; ++ /* must be last: */ ++ struct jset j; ++}; ++ ++static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, ++ struct jset_entry *entry, unsigned type) ++{ ++ while (entry < vstruct_last(jset)) { ++ if (entry->type == type) ++ return entry; ++ ++ entry = vstruct_next(entry); ++ } ++ ++ return NULL; ++} ++ ++#define for_each_jset_entry_type(entry, jset, type) \ ++ for (entry = (jset)->start; \ ++ (entry = __jset_entry_type_next(jset, entry, type)); \ ++ entry = vstruct_next(entry)) ++ ++#define for_each_jset_key(k, _n, entry, jset) \ ++ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ ++ vstruct_for_each_safe(entry, k, _n) ++ ++int bch2_journal_read(struct bch_fs *, struct list_head *); ++ ++void bch2_journal_write(struct closure *); ++ ++#endif /* _BCACHEFS_JOURNAL_IO_H */ +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +new file mode 100644 +index 000000000000..57591983eebd +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.c +@@ -0,0 +1,644 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++#include "super.h" ++ ++/* Free space calculations: */ ++ ++static unsigned journal_space_from(struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ switch (from) { ++ case journal_space_discarded: ++ return ja->discard_idx; ++ case journal_space_clean_ondisk: ++ return ja->dirty_idx_ondisk; ++ case journal_space_clean: ++ return ja->dirty_idx; ++ default: ++ BUG(); ++ } ++} ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *j, ++ struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ unsigned available = (journal_space_from(ja, from) - ++ ja->cur_idx - 1 + ja->nr) % ja->nr; ++ ++ /* ++ * Don't use the last bucket unless writing the new last_seq ++ * will make another bucket available: ++ */ ++ if (available && ja->dirty_idx_ondisk == ja->dirty_idx) ++ --available; ++ ++ return available; ++} ++ ++static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) ++{ ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ ++ do { ++ old.v = new.v = v; ++ new.remaining = u64s_remaining; ++ } while ((v = 
atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++} ++ ++static struct journal_space { ++ unsigned next_entry; ++ unsigned remaining; ++} __journal_space_available(struct journal *j, unsigned nr_devs_want, ++ enum journal_space_from from) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned sectors_next_entry = UINT_MAX; ++ unsigned sectors_total = UINT_MAX; ++ unsigned i, nr_devs = 0; ++ unsigned unwritten_sectors = j->reservations.prev_buf_unwritten ++ ? journal_prev_buf(j)->sectors ++ : 0; ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ struct journal_device *ja = &ca->journal; ++ unsigned buckets_this_device, sectors_this_device; ++ ++ if (!ja->nr) ++ continue; ++ ++ buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); ++ sectors_this_device = ja->sectors_free; ++ ++ /* ++ * Note that we don't allocate the space for a journal entry ++ * until we write it out - thus, account for it here: ++ */ ++ if (unwritten_sectors >= sectors_this_device) { ++ if (!buckets_this_device) ++ continue; ++ ++ buckets_this_device--; ++ sectors_this_device = ca->mi.bucket_size; ++ } ++ ++ sectors_this_device -= unwritten_sectors; ++ ++ if (sectors_this_device < ca->mi.bucket_size && ++ buckets_this_device) { ++ buckets_this_device--; ++ sectors_this_device = ca->mi.bucket_size; ++ } ++ ++ if (!sectors_this_device) ++ continue; ++ ++ sectors_next_entry = min(sectors_next_entry, ++ sectors_this_device); ++ ++ sectors_total = min(sectors_total, ++ buckets_this_device * ca->mi.bucket_size + ++ sectors_this_device); ++ ++ nr_devs++; ++ } ++ rcu_read_unlock(); ++ ++ if (nr_devs < nr_devs_want) ++ return (struct journal_space) { 0, 0 }; ++ ++ return (struct journal_space) { ++ .next_entry = sectors_next_entry, ++ .remaining = max_t(int, 0, sectors_total - sectors_next_entry), ++ }; ++} ++ ++void bch2_journal_space_available(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_space discarded, clean_ondisk, clean; ++ unsigned overhead, u64s_remaining = 0; ++ unsigned max_entry_size = min(j->buf[0].buf_size >> 9, ++ j->buf[1].buf_size >> 9); ++ unsigned i, nr_online = 0, nr_devs_want; ++ bool can_discard = false; ++ int ret = 0; ++ ++ lockdep_assert_held(&j->lock); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!ja->nr) ++ continue; ++ ++ while (ja->dirty_idx != ja->cur_idx && ++ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ ++ while (ja->dirty_idx_ondisk != ja->dirty_idx && ++ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ ++ if (ja->discard_idx != ja->dirty_idx_ondisk) ++ can_discard = true; ++ ++ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); ++ nr_online++; ++ } ++ rcu_read_unlock(); ++ ++ j->can_discard = can_discard; ++ ++ if (nr_online < c->opts.metadata_replicas_required) { ++ ret = -EROFS; ++ goto out; ++ } ++ ++ if (!fifo_free(&j->pin)) { ++ ret = -ENOSPC; ++ goto out; ++ } ++ ++ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); ++ ++ discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); ++ clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); ++ clean = 
__journal_space_available(j, nr_devs_want, journal_space_clean); ++ ++ if (!discarded.next_entry) ++ ret = -ENOSPC; ++ ++ overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * ++ journal_entry_overhead(j); ++ u64s_remaining = clean.remaining << 6; ++ u64s_remaining = max_t(int, 0, u64s_remaining - overhead); ++ u64s_remaining /= 4; ++out: ++ j->cur_entry_sectors = !ret ? discarded.next_entry : 0; ++ j->cur_entry_error = ret; ++ journal_set_remaining(j, u64s_remaining); ++ journal_check_may_get_unreserved(j); ++ ++ if (!ret) ++ journal_wake(j); ++} ++ ++/* Discards - last part of journal reclaim: */ ++ ++static bool should_discard_bucket(struct journal *j, struct journal_device *ja) ++{ ++ bool ret; ++ ++ spin_lock(&j->lock); ++ ret = ja->discard_idx != ja->dirty_idx_ondisk; ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* ++ * Advance ja->discard_idx as long as it points to buckets that are no longer ++ * dirty, issuing discards if necessary: ++ */ ++void bch2_journal_do_discards(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned iter; ++ ++ mutex_lock(&j->discard_lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ ++ while (should_discard_bucket(j, ja)) { ++ if (ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, ++ ja->buckets[ja->discard_idx]), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++ ++ spin_lock(&j->lock); ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ } ++ } ++ ++ mutex_unlock(&j->discard_lock); ++} ++ ++/* ++ * Journal entry pinning - machinery for holding a reference on a given journal ++ * entry, holding it open to ensure it gets replayed during recovery: ++ */ ++ ++static void bch2_journal_reclaim_fast(struct journal *j) ++{ ++ struct journal_entry_pin_list temp; ++ bool popped = false; ++ ++ lockdep_assert_held(&j->lock); ++ ++ /* ++ * Unpin journal entries whose reference counts reached zero, meaning ++ * all btree nodes got written out ++ */ ++ while (!fifo_empty(&j->pin) && ++ !atomic_read(&fifo_peek_front(&j->pin).count)) { ++ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); ++ BUG_ON(!fifo_pop(&j->pin, temp)); ++ popped = true; ++ } ++ ++ if (popped) ++ bch2_journal_space_available(j); ++} ++ ++void bch2_journal_pin_put(struct journal *j, u64 seq) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ if (atomic_dec_and_test(&pin_list->count)) { ++ spin_lock(&j->lock); ++ bch2_journal_reclaim_fast(j); ++ spin_unlock(&j->lock); ++ } ++} ++ ++static inline void __journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ struct journal_entry_pin_list *pin_list; ++ ++ if (!journal_pin_active(pin)) ++ return; ++ ++ pin_list = journal_seq_pin(j, pin->seq); ++ pin->seq = 0; ++ list_del_init(&pin->list); ++ ++ /* ++ * Unpinning a journal entry may make journal_next_bucket() succeed, if ++ * writing a new last_seq will now make another bucket available: ++ */ ++ if (atomic_dec_and_test(&pin_list->count) && ++ pin_list == &fifo_peek_front(&j->pin)) ++ bch2_journal_reclaim_fast(j); ++ else if (fifo_used(&j->pin) == 1 && ++ atomic_read(&pin_list->count) == 1) ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ spin_lock(&j->lock); ++ __journal_pin_drop(j, pin); ++ spin_unlock(&j->lock); ++} ++ 
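++/* ++ * Pin @pin to journal sequence number @seq: any existing pin is dropped first, ++ * a ref is taken on the pin list for @seq, and the pin is added to ->list if ++ * it has a flush callback, otherwise to ->flushed: ++ */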
++static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ __journal_pin_drop(j, pin); ++ ++ BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); ++ ++ atomic_inc(&pin_list->count); ++ pin->seq = seq; ++ pin->flush = flush_fn; ++ ++ list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); ++} ++ ++void __bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ spin_lock(&j->lock); ++ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); ++ spin_unlock(&j->lock); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_update(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (journal_pin_active(pin) && pin->seq < seq) ++ return; ++ ++ spin_lock(&j->lock); ++ ++ if (pin->seq != seq) { ++ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); ++ } else { ++ struct journal_entry_pin_list *pin_list = ++ journal_seq_pin(j, seq); ++ ++ /* ++ * If the pin is already pinning the right sequence number, it ++ * still might've already been flushed: ++ */ ++ list_move(&pin->list, &pin_list->list); ++ } ++ ++ spin_unlock(&j->lock); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_copy(struct journal *j, ++ struct journal_entry_pin *dst, ++ struct journal_entry_pin *src, ++ journal_pin_flush_fn flush_fn) ++{ ++ spin_lock(&j->lock); ++ ++ if (journal_pin_active(src) && ++ (!journal_pin_active(dst) || src->seq < dst->seq)) ++ bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); ++ ++ spin_unlock(&j->lock); ++} ++ ++/** ++ * bch2_journal_pin_flush: ensure journal pin callback is no longer running ++ */ ++void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) ++{ ++ BUG_ON(journal_pin_active(pin)); ++ ++ wait_event(j->pin_flush_wait, j->flush_in_progress != pin); ++} ++ ++/* ++ * Journal reclaim: flush references to open journal entries to reclaim space in ++ * the journal ++ * ++ * May be done by the journal code in the background as needed to free up space ++ * for more journal entries, or as part of doing a clean shutdown, or to migrate ++ * data off of a specific device: ++ */ ++ ++static struct journal_entry_pin * ++journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *ret = NULL; ++ ++ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) ++ return NULL; ++ ++ spin_lock(&j->lock); ++ ++ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) ++ if (*seq > max_seq || ++ (ret = list_first_entry_or_null(&pin_list->list, ++ struct journal_entry_pin, list))) ++ break; ++ ++ if (ret) { ++ list_move(&ret->list, &pin_list->flushed); ++ BUG_ON(j->flush_in_progress); ++ j->flush_in_progress = ret; ++ j->last_flushed = jiffies; ++ } ++ ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* returns true if we did work */ ++static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, ++ unsigned min_nr) ++{ ++ struct journal_entry_pin *pin; ++ bool ret = false; ++ u64 seq; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ while ((pin = journal_get_next_pin(j, min_nr ++ ? 
U64_MAX : seq_to_flush, &seq))) { ++ if (min_nr) ++ min_nr--; ++ ++ pin->flush(j, pin, seq); ++ ++ BUG_ON(j->flush_in_progress != pin); ++ j->flush_in_progress = NULL; ++ wake_up(&j->pin_flush_wait); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++/** ++ * bch2_journal_reclaim - free up journal buckets ++ * ++ * Background journal reclaim writes out btree nodes. It should be run ++ * early enough so that we never completely run out of journal buckets. ++ * ++ * High watermarks for triggering background reclaim: ++ * - FIFO has fewer than 512 entries left ++ * - fewer than 25% journal buckets free ++ * ++ * Background reclaim runs until low watermarks are reached: ++ * - FIFO has more than 1024 entries left ++ * - more than 50% journal buckets free ++ * ++ * As long as a reclaim can complete in the time it takes to fill up ++ * 512 journal entries or 25% of all journal buckets, then ++ * journal_next_bucket() should not stall. ++ */ ++void bch2_journal_reclaim(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned iter, min_nr = 0; ++ u64 seq_to_flush = 0; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ bch2_journal_do_discards(j); ++ ++ spin_lock(&j->lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ unsigned nr_buckets, bucket_to_flush; ++ ++ if (!ja->nr) ++ continue; ++ ++ /* Try to keep the journal at most half full: */ ++ nr_buckets = ja->nr / 2; ++ ++ /* And include pre-reservations: */ ++ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, ++ (ca->mi.bucket_size << 6) - ++ journal_entry_overhead(j)); ++ ++ nr_buckets = min(nr_buckets, ja->nr); ++ ++ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; ++ seq_to_flush = max(seq_to_flush, ++ ja->bucket_seq[bucket_to_flush]); ++ } ++ ++ /* Also flush if the pin fifo is more than half full */ ++ seq_to_flush = max_t(s64, seq_to_flush, ++ (s64) journal_cur_seq(j) - ++ (j->pin.size >> 1)); ++ spin_unlock(&j->lock); ++ ++ /* ++ * If it's been longer than j->reclaim_delay_ms since we last flushed, ++ * make sure to flush at least one journal pin: ++ */ ++ if (time_after(jiffies, j->last_flushed + ++ msecs_to_jiffies(j->reclaim_delay_ms))) ++ min_nr = 1; ++ ++ if (j->prereserved.reserved * 2 > j->prereserved.remaining) { ++ seq_to_flush = max(seq_to_flush, journal_last_seq(j)); ++ min_nr = 1; ++ } ++ ++ journal_flush_pins(j, seq_to_flush, min_nr); ++ ++ if (!bch2_journal_error(j)) ++ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, ++ msecs_to_jiffies(j->reclaim_delay_ms)); ++} ++ ++void bch2_journal_reclaim_work(struct work_struct *work) ++{ ++ struct journal *j = container_of(to_delayed_work(work), ++ struct journal, reclaim_work); ++ ++ mutex_lock(&j->reclaim_lock); ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++} ++ ++static int journal_flush_done(struct journal *j, u64 seq_to_flush, ++ bool *did_work) ++{ ++ int ret; ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&j->reclaim_lock); ++ ++ *did_work = journal_flush_pins(j, seq_to_flush, 0); ++ ++ spin_lock(&j->lock); ++ /* ++ * If journal replay hasn't completed, the unreplayed journal entries ++ * hold refs on their corresponding sequence numbers ++ */ ++ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || ++ journal_last_seq(j) > seq_to_flush || ++ (fifo_used(&j->pin) == 1 && ++ atomic_read(&fifo_peek_front(&j->pin).count) == 1); ++ ++ spin_unlock(&j->lock); ++ mutex_unlock(&j->reclaim_lock); ++ ++ return ret; ++} ++ 
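++/* ++ * Flush journal pins up to and including @seq_to_flush and wait for the ++ * flushing to complete; returns true if any work was done: ++ */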
++bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) ++{ ++ bool did_work = false; ++ ++ if (!test_bit(JOURNAL_STARTED, &j->flags)) ++ return false; ++ ++ closure_wait_event(&j->async_wait, ++ journal_flush_done(j, seq_to_flush, &did_work)); ++ ++ return did_work; ++} ++ ++int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ u64 iter, seq = 0; ++ int ret = 0; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(p, &j->pin, iter) ++ if (dev_idx >= 0 ++ ? bch2_dev_list_has_dev(p->devs, dev_idx) ++ : p->devs.nr < c->opts.metadata_replicas) ++ seq = iter; ++ spin_unlock(&j->lock); ++ ++ bch2_journal_flush_pins(j, seq); ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->replicas_gc_lock); ++ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); ++ ++ seq = 0; ++ ++ spin_lock(&j->lock); ++ while (!ret && seq < j->pin.back) { ++ struct bch_replicas_padded replicas; ++ ++ seq = max(seq, journal_last_seq(j)); ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, ++ journal_seq_pin(j, seq)->devs); ++ seq++; ++ ++ spin_unlock(&j->lock); ++ ret = bch2_mark_replicas(c, &replicas.e); ++ spin_lock(&j->lock); ++ } ++ spin_unlock(&j->lock); ++ ++ ret = bch2_replicas_gc_end(c, ret); ++ mutex_unlock(&c->replicas_gc_lock); ++ ++ return ret; ++} +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +new file mode 100644 +index 000000000000..8128907a7623 +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.h +@@ -0,0 +1,69 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_RECLAIM_H ++#define _BCACHEFS_JOURNAL_RECLAIM_H ++ ++#define JOURNAL_PIN (32 * 1024) ++ ++enum journal_space_from { ++ journal_space_discarded, ++ journal_space_clean_ondisk, ++ journal_space_clean, ++}; ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *, ++ struct journal_device *, ++ enum journal_space_from); ++void bch2_journal_space_available(struct journal *); ++ ++static inline bool journal_pin_active(struct journal_entry_pin *pin) ++{ ++ return pin->seq != 0; ++} ++ ++static inline struct journal_entry_pin_list * ++journal_seq_pin(struct journal *j, u64 seq) ++{ ++ EBUG_ON(seq < j->pin.front || seq >= j->pin.back); ++ ++ return &j->pin.data[seq & j->pin.mask]; ++} ++ ++void bch2_journal_pin_put(struct journal *, u64); ++void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); ++ ++void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++static inline void bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) ++ __bch2_journal_pin_add(j, seq, pin, flush_fn); ++} ++ ++void bch2_journal_pin_update(struct journal *, u64, ++ struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++void bch2_journal_pin_copy(struct journal *, ++ struct journal_entry_pin *, ++ struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); ++ ++void bch2_journal_do_discards(struct journal *); ++void bch2_journal_reclaim(struct journal *); ++void bch2_journal_reclaim_work(struct work_struct *); ++ ++bool bch2_journal_flush_pins(struct journal *, u64); ++ ++static inline bool bch2_journal_flush_all_pins(struct journal *j) ++{ ++ return bch2_journal_flush_pins(j, U64_MAX); ++} 
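++/* ++ * bch2_journal_flush_device_pins(): flush all journal pins that reference ++ * @dev_idx, or - when @dev_idx is negative - all entries written to fewer than ++ * metadata_replicas devices: ++ */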
++ ++int bch2_journal_flush_device_pins(struct journal *, int); ++ ++#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +new file mode 100644 +index 000000000000..d0f1bbf8f6a7 +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -0,0 +1,309 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_iter.h" ++#include "eytzinger.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++/* ++ * journal_seq_blacklist machinery: ++ * ++ * To guarantee order of btree updates after a crash, we need to detect when a ++ * btree node entry (bset) is newer than the newest journal entry that was ++ * successfully written, and ignore it - effectively ignoring any btree updates ++ * that didn't make it into the journal. ++ * ++ * If we didn't do this, we might have two btree nodes, a and b, both with ++ * updates that weren't written to the journal yet: if b was updated after a, ++ * but b was flushed and not a - oops; on recovery we'll find that the updates ++ * to b happened, but not the updates to a that happened before it. ++ * ++ * Ignoring bsets that are newer than the newest journal entry is always safe, ++ * because everything they contain will also have been journalled - and must ++ * still be present in the journal on disk until a journal entry has been ++ * written _after_ that bset was written. ++ * ++ * To accomplish this, bsets record the newest journal sequence number they ++ * contain updates for; then, on startup, the btree code queries the journal ++ * code to ask "Is this sequence number newer than the newest journal entry? If ++ * so, ignore it." ++ * ++ * When this happens, we must blacklist that journal sequence number: the ++ * journal must not write any entries with that sequence number, and it must ++ * record that it was blacklisted so that a) on recovery we don't think we have ++ * missing journal entries and b) so that the btree code continues to ignore ++ * that bset, until that btree node is rewritten. 
++ */ ++ ++static unsigned sb_blacklist_u64s(unsigned nr) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ ++ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); ++} ++ ++static struct bch_sb_field_journal_seq_blacklist * ++blacklist_entry_try_merge(struct bch_fs *c, ++ struct bch_sb_field_journal_seq_blacklist *bl, ++ unsigned i) ++{ ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ if (le64_to_cpu(bl->start[i].end) >= ++ le64_to_cpu(bl->start[i + 1].start)) { ++ bl->start[i].end = bl->start[i + 1].end; ++ --nr; ++ memmove(&bl->start[i], ++ &bl->start[i + 1], ++ sizeof(bl->start[0]) * (nr - i)); ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr)); ++ BUG_ON(!bl); ++ } ++ ++ return bl; ++} ++ ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ unsigned i, nr; ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ nr = blacklist_nr_entries(bl); ++ ++ if (bl) { ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = ++ bl->start + i; ++ ++ if (start == le64_to_cpu(e->start) && ++ end == le64_to_cpu(e->end)) ++ goto out; ++ ++ if (start <= le64_to_cpu(e->start) && ++ end >= le64_to_cpu(e->end)) { ++ e->start = cpu_to_le64(start); ++ e->end = cpu_to_le64(end); ++ ++ if (i + 1 < nr) ++ bl = blacklist_entry_try_merge(c, ++ bl, i); ++ if (i) ++ bl = blacklist_entry_try_merge(c, ++ bl, i - 1); ++ goto out_write_sb; ++ } ++ } ++ } ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr + 1)); ++ if (!bl) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ bl->start[nr].start = cpu_to_le64(start); ++ bl->start[nr].end = cpu_to_le64(end); ++out_write_sb: ++ c->disk_sb.sb->features[0] |= ++ 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; ++ ++ ret = bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static int journal_seq_blacklist_table_cmp(const void *_l, ++ const void *_r, size_t size) ++{ ++ const struct journal_seq_blacklist_table_entry *l = _l; ++ const struct journal_seq_blacklist_table_entry *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, ++ bool dirty) ++{ ++ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; ++ struct journal_seq_blacklist_table_entry search = { .start = seq }; ++ int idx; ++ ++ if (!t) ++ return false; ++ ++ idx = eytzinger0_find_le(t->entries, t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ &search); ++ if (idx < 0) ++ return false; ++ ++ BUG_ON(t->entries[idx].start > seq); ++ ++ if (seq >= t->entries[idx].end) ++ return false; ++ ++ if (dirty) ++ t->entries[idx].dirty = true; ++ return true; ++} ++ ++int bch2_blacklist_table_initialize(struct bch_fs *c) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ struct journal_seq_blacklist_table *t; ++ unsigned i, nr = blacklist_nr_entries(bl); ++ ++ BUG_ON(c->journal_seq_blacklist_table); ++ ++ if (!bl) ++ return 0; ++ ++ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, ++ GFP_KERNEL); ++ if (!t) ++ return -ENOMEM; ++ ++ t->nr = nr; ++ ++ for (i = 0; i < nr; i++) { ++ t->entries[i].start = le64_to_cpu(bl->start[i].start); ++ t->entries[i].end = le64_to_cpu(bl->start[i].end); ++ } ++ ++ eytzinger0_sort(t->entries, ++ t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ NULL); 
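++ /* Kept in eytzinger order so bch2_journal_seq_is_blacklisted() can search it with eytzinger0_find_le(): */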
++ ++ c->journal_seq_blacklist_table = t; ++ return 0; ++} ++ ++static const char * ++bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ struct journal_seq_blacklist_entry *i; ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ for (i = bl->start; i < bl->start + nr; i++) { ++ if (le64_to_cpu(i->start) >= ++ le64_to_cpu(i->end)) ++ return "entry start >= end"; ++ ++ if (i + 1 < bl->start + nr && ++ le64_to_cpu(i[0].end) > ++ le64_to_cpu(i[1].start)) ++ return "entries out of order"; ++ } ++ ++ return NULL; ++} ++ ++static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ struct journal_seq_blacklist_entry *i; ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ for (i = bl->start; i < bl->start + nr; i++) { ++ if (i != bl->start) ++ pr_buf(out, " "); ++ ++ pr_buf(out, "%llu-%llu", ++ le64_to_cpu(i->start), ++ le64_to_cpu(i->end)); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { ++ .validate = bch2_sb_journal_seq_blacklist_validate, ++ .to_text = bch2_sb_journal_seq_blacklist_to_text ++}; ++ ++void bch2_blacklist_entries_gc(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ journal_seq_blacklist_gc_work); ++ struct journal_seq_blacklist_table *t; ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ struct journal_seq_blacklist_entry *src, *dst; ++ struct btree_trans trans; ++ unsigned i, nr, new_nr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_iter *iter; ++ struct btree *b; ++ ++ for_each_btree_node(&trans, iter, i, POS_MIN, ++ BTREE_ITER_PREFETCH, b) ++ if (test_bit(BCH_FS_STOPPING, &c->flags)) { ++ bch2_trans_exit(&trans); ++ return; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ ret = bch2_trans_exit(&trans); ++ if (ret) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ if (!bl) ++ goto out; ++ ++ nr = blacklist_nr_entries(bl); ++ dst = bl->start; ++ ++ t = c->journal_seq_blacklist_table; ++ BUG_ON(nr != t->nr); ++ ++ for (src = bl->start, i = eytzinger0_first(t->nr); ++ src < bl->start + nr; ++ src++, i = eytzinger0_next(i, nr)) { ++ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); ++ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); ++ ++ if (t->entries[i].dirty) ++ *dst++ = *src; ++ } ++ ++ new_nr = dst - bl->start; ++ ++ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); ++ ++ if (new_nr != nr) { ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ new_nr ? sb_blacklist_u64s(new_nr) : 0); ++ BUG_ON(new_nr && !bl); ++ ++ if (!new_nr) ++ c->disk_sb.sb->features[0] &= ++ ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ++ ++ bch2_write_super(c); ++ } ++out: ++ mutex_unlock(&c->sb_lock); ++} +diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h +new file mode 100644 +index 000000000000..afb886ec8e25 +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.h +@@ -0,0 +1,22 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++ ++static inline unsigned ++blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) ++{ ++ return bl ++ ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / ++ sizeof(struct journal_seq_blacklist_entry)) ++ : 0; ++} ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); ++int bch2_blacklist_table_initialize(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; ++ ++void bch2_blacklist_entries_gc(struct work_struct *); ++ ++#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +new file mode 100644 +index 000000000000..154b51b891d3 +--- /dev/null ++++ b/fs/bcachefs/journal_types.h +@@ -0,0 +1,277 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_TYPES_H ++#define _BCACHEFS_JOURNAL_TYPES_H ++ ++#include ++#include ++ ++#include "alloc_types.h" ++#include "super_types.h" ++#include "fifo.h" ++ ++struct journal_res; ++ ++/* ++ * We put two of these in struct journal; we used them for writes to the ++ * journal that are being staged or in flight. ++ */ ++struct journal_buf { ++ struct jset *data; ++ ++ BKEY_PADDED(key); ++ ++ struct closure_waitlist wait; ++ ++ unsigned buf_size; /* size in bytes of @data */ ++ unsigned sectors; /* maximum size for current entry */ ++ unsigned disk_sectors; /* maximum size entry could have been, if ++ buf_size was bigger */ ++ unsigned u64s_reserved; ++ /* bloom filter: */ ++ unsigned long has_inode[1024 / sizeof(unsigned long)]; ++}; ++ ++/* ++ * Something that makes a journal entry dirty - i.e. a btree node that has to be ++ * flushed: ++ */ ++ ++struct journal_entry_pin_list { ++ struct list_head list; ++ struct list_head flushed; ++ atomic_t count; ++ struct bch_devs_list devs; ++}; ++ ++struct journal; ++struct journal_entry_pin; ++typedef void (*journal_pin_flush_fn)(struct journal *j, ++ struct journal_entry_pin *, u64); ++ ++struct journal_entry_pin { ++ struct list_head list; ++ journal_pin_flush_fn flush; ++ u64 seq; ++}; ++ ++struct journal_res { ++ bool ref; ++ u8 idx; ++ u16 u64s; ++ u32 offset; ++ u64 seq; ++}; ++ ++/* ++ * For reserving space in the journal prior to getting a reservation on a ++ * particular journal entry: ++ */ ++struct journal_preres { ++ unsigned u64s; ++}; ++ ++union journal_res_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u64 cur_entry_offset:20, ++ idx:1, ++ prev_buf_unwritten:1, ++ buf0_count:21, ++ buf1_count:21; ++ }; ++}; ++ ++union journal_preres_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u32 reserved; ++ u32 remaining; ++ }; ++}; ++ ++/* bytes: */ ++#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ ++#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ ++ ++/* ++ * We stash some journal state as sentinal values in cur_entry_offset: ++ * note - cur_entry_offset is in units of u64s ++ */ ++#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) ++ ++#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) ++#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) ++ ++/* ++ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, ++ * either because something's waiting on the write to complete or because it's ++ * been dirty too long and the timer's expired. 
++ */ ++ ++enum { ++ JOURNAL_REPLAY_DONE, ++ JOURNAL_STARTED, ++ JOURNAL_RECLAIM_STARTED, ++ JOURNAL_NEED_WRITE, ++ JOURNAL_NOT_EMPTY, ++ JOURNAL_MAY_GET_UNRESERVED, ++}; ++ ++/* Embedded in struct bch_fs */ ++struct journal { ++ /* Fastpath stuff up front: */ ++ ++ unsigned long flags; ++ ++ union journal_res_state reservations; ++ ++ /* Max size of current journal entry */ ++ unsigned cur_entry_u64s; ++ unsigned cur_entry_sectors; ++ ++ /* ++ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if ++ * insufficient devices: ++ */ ++ int cur_entry_error; ++ ++ union journal_preres_state prereserved; ++ ++ /* Reserved space in journal entry to be used just prior to write */ ++ unsigned entry_u64s_reserved; ++ ++ unsigned buf_size_want; ++ ++ /* ++ * Two journal entries -- one is currently open for new entries, the ++ * other is possibly being written out. ++ */ ++ struct journal_buf buf[2]; ++ ++ spinlock_t lock; ++ ++ /* if nonzero, we may not open a new journal entry: */ ++ unsigned blocked; ++ ++ /* Used when waiting because the journal was full */ ++ wait_queue_head_t wait; ++ struct closure_waitlist async_wait; ++ struct closure_waitlist preres_wait; ++ ++ struct closure io; ++ struct delayed_work write_work; ++ ++ /* Sequence number of most recent journal entry (last entry in @pin) */ ++ atomic64_t seq; ++ ++ /* seq, last_seq from the most recent journal entry successfully written */ ++ u64 seq_ondisk; ++ u64 last_seq_ondisk; ++ ++ /* ++ * FIFO of journal entries whose btree updates have not yet been ++ * written out. ++ * ++ * Each entry is a reference count. The position in the FIFO is the ++ * entry's sequence number relative to @seq. ++ * ++ * The journal entry itself holds a reference count, put when the ++ * journal entry is written out. Each btree node modified by the journal ++ * entry also holds a reference count, put when the btree node is ++ * written. ++ * ++ * When a reference count reaches zero, the journal entry is no longer ++ * needed. When all journal entries in the oldest journal bucket are no ++ * longer needed, the bucket can be discarded and reused. ++ */ ++ struct { ++ u64 front, back, size, mask; ++ struct journal_entry_pin_list *data; ++ } pin; ++ ++ u64 replay_journal_seq; ++ u64 replay_journal_seq_end; ++ ++ struct write_point wp; ++ spinlock_t err_lock; ++ ++ struct delayed_work reclaim_work; ++ struct mutex reclaim_lock; ++ unsigned long last_flushed; ++ struct journal_entry_pin *flush_in_progress; ++ wait_queue_head_t pin_flush_wait; ++ ++ /* protects advancing ja->discard_idx: */ ++ struct mutex discard_lock; ++ bool can_discard; ++ ++ unsigned write_delay_ms; ++ unsigned reclaim_delay_ms; ++ ++ u64 res_get_blocked_start; ++ u64 need_write_time; ++ u64 write_start_time; ++ ++ struct time_stats *write_time; ++ struct time_stats *delay_time; ++ struct time_stats *blocked_time; ++ struct time_stats *flush_seq_time; ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map res_map; ++#endif ++}; ++ ++/* ++ * Embedded in struct bch_dev. First three fields refer to the array of journal ++ * buckets, in bch_sb. ++ */ ++struct journal_device { ++ /* ++ * For each journal bucket, contains the max sequence number of the ++ * journal writes it contains - so we know when a bucket can be reused. 
++ */ ++ u64 *bucket_seq; ++ ++ unsigned sectors_free; ++ ++ /* ++ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: ++ */ ++ unsigned discard_idx; /* Next bucket to discard */ ++ unsigned dirty_idx_ondisk; ++ unsigned dirty_idx; ++ unsigned cur_idx; /* Journal bucket we're currently writing to */ ++ unsigned nr; ++ ++ u64 *buckets; ++ ++ /* Bio for journal reads/writes to this device */ ++ struct bio *bio; ++ ++ /* for bch_journal_read_device */ ++ struct closure read; ++}; ++ ++/* ++ * journal_entry_res - reserve space in every journal entry: ++ */ ++struct journal_entry_res { ++ unsigned u64s; ++}; ++ ++#endif /* _BCACHEFS_JOURNAL_TYPES_H */ +diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c +new file mode 100644 +index 000000000000..864dfaa67b7a +--- /dev/null ++++ b/fs/bcachefs/keylist.c +@@ -0,0 +1,67 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "keylist.h" ++ ++int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, ++ size_t nr_inline_u64s, size_t new_u64s) ++{ ++ size_t oldsize = bch2_keylist_u64s(l); ++ size_t newsize = oldsize + new_u64s; ++ u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; ++ u64 *new_keys; ++ ++ newsize = roundup_pow_of_two(newsize); ++ ++ if (newsize <= nr_inline_u64s || ++ (old_buf && roundup_pow_of_two(oldsize) == newsize)) ++ return 0; ++ ++ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); ++ if (!new_keys) ++ return -ENOMEM; ++ ++ if (!old_buf) ++ memcpy_u64s(new_keys, inline_u64s, oldsize); ++ ++ l->keys_p = new_keys; ++ l->top_p = new_keys + oldsize; ++ ++ return 0; ++} ++ ++void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) ++{ ++ struct bkey_i *where; ++ ++ for_each_keylist_key(l, where) ++ if (bkey_cmp(insert->k.p, where->k.p) < 0) ++ break; ++ ++ memmove_u64s_up((u64 *) where + insert->k.u64s, ++ where, ++ ((u64 *) l->top) - ((u64 *) where)); ++ ++ l->top_p += insert->k.u64s; ++ bkey_copy(where, insert); ++} ++ ++void bch2_keylist_pop_front(struct keylist *l) ++{ ++ l->top_p -= bch2_keylist_front(l)->k.u64s; ++ ++ memmove_u64s_down(l->keys, ++ bkey_next(l->keys), ++ bch2_keylist_u64s(l)); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *l) ++{ ++ struct bkey_i *k; ++ ++ for_each_keylist_key(l, k) ++ BUG_ON(bkey_next(k) != l->top && ++ bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); ++} ++#endif +diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h +new file mode 100644 +index 000000000000..195799bb20bc +--- /dev/null ++++ b/fs/bcachefs/keylist.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_H ++#define _BCACHEFS_KEYLIST_H ++ ++#include "keylist_types.h" ++ ++int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); ++void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); ++void bch2_keylist_pop_front(struct keylist *); ++ ++static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) ++{ ++ l->top_p = l->keys_p = inline_keys; ++} ++ ++static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) ++{ ++ if (l->keys_p != inline_keys) ++ kfree(l->keys_p); ++ bch2_keylist_init(l, inline_keys); ++} ++ ++static inline void bch2_keylist_push(struct keylist *l) ++{ ++ l->top = bkey_next(l->top); ++} ++ ++static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) ++{ ++ bkey_copy(l->top, k); ++ bch2_keylist_push(l); ++} ++ ++static inline bool bch2_keylist_empty(struct keylist *l) ++{ ++ return l->top == 
l->keys; ++} ++ ++static inline size_t bch2_keylist_u64s(struct keylist *l) ++{ ++ return l->top_p - l->keys_p; ++} ++ ++static inline size_t bch2_keylist_bytes(struct keylist *l) ++{ ++ return bch2_keylist_u64s(l) * sizeof(u64); ++} ++ ++static inline struct bkey_i *bch2_keylist_front(struct keylist *l) ++{ ++ return l->keys; ++} ++ ++#define for_each_keylist_key(_keylist, _k) \ ++ for (_k = (_keylist)->keys; \ ++ _k != (_keylist)->top; \ ++ _k = bkey_next(_k)) ++ ++static inline u64 keylist_sectors(struct keylist *keys) ++{ ++ struct bkey_i *k; ++ u64 ret = 0; ++ ++ for_each_keylist_key(keys, k) ++ ret += k->k.size; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *); ++#else ++static inline void bch2_verify_keylist_sorted(struct keylist *l) {} ++#endif ++ ++#endif /* _BCACHEFS_KEYLIST_H */ +diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h +new file mode 100644 +index 000000000000..4b3ff7d8a875 +--- /dev/null ++++ b/fs/bcachefs/keylist_types.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_TYPES_H ++#define _BCACHEFS_KEYLIST_TYPES_H ++ ++struct keylist { ++ union { ++ struct bkey_i *keys; ++ u64 *keys_p; ++ }; ++ union { ++ struct bkey_i *top; ++ u64 *top_p; ++ }; ++}; ++ ++#endif /* _BCACHEFS_KEYLIST_TYPES_H */ +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +new file mode 100644 +index 000000000000..96c8690adc5b +--- /dev/null ++++ b/fs/bcachefs/migrate.c +@@ -0,0 +1,170 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for moving data off a device. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "extents.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "migrate.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, ++ unsigned dev_idx, int flags, bool metadata) ++{ ++ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; ++ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; ++ unsigned degraded = metadata ? 
BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; ++ unsigned nr_good; ++ ++ bch2_bkey_drop_device(k, dev_idx); ++ ++ nr_good = bch2_bkey_durability(c, k.s_c); ++ if ((!nr_good && !(flags & lost)) || ++ (nr_good < replicas && !(flags & degraded))) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, ++ enum btree_id btree_id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_on_stack sk; ++ int ret = 0; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k))) { ++ if (!bch2_bkey_has_device(k, dev_idx)) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), ++ dev_idx, flags, false); ++ if (ret) ++ break; ++ ++ /* ++ * If the new extent no longer has any pointers, bch2_extent_normalize() ++ * will do the appropriate thing with it (turning it into a ++ * KEY_TYPE_error key, or just a discard if it was a cached extent) ++ */ ++ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); ++ ++ bch2_trans_update(&trans, iter, sk.k, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++ ++ /* ++ * don't want to leave ret == -EINTR, since if we raced and ++ * something else overwrote the key we could spuriously return ++ * -EINTR below: ++ */ ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&sk, c); ++ ++ BUG_ON(ret == -EINTR); ++ ++ return ret; ++} ++ ++static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: ++ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); ++} ++ ++static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct closure cl; ++ struct btree *b; ++ unsigned id; ++ int ret; ++ ++ /* don't handle this yet: */ ++ if (flags & BCH_FORCE_IF_METADATA_LOST) ++ return -EINVAL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ closure_init_stack(&cl); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ for_each_btree_node(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH, b) { ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++retry: ++ if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), ++ dev_idx)) ++ continue; ++ ++ bkey_copy(&tmp.k, &b->key); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), ++ dev_idx, flags, true); ++ if (ret) { ++ bch_err(c, "Cannot drop device without losing data"); ++ goto err; ++ } ++ ++ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); ++ if (ret == -EINTR) { ++ b = bch2_btree_iter_peek_node(iter); ++ goto retry; ++ } ++ if (ret) { ++ bch_err(c, "Error updating btree node key: %i", ret); ++ goto err; ++ } ++ } ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ /* flush relevant btree updates */ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ ++ ret = 0; ++err: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ BUG_ON(ret == -EINTR); ++ ++ return ret; ++} ++ ++int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ return bch2_dev_usrdata_drop(c, 
dev_idx, flags) ?: ++ bch2_dev_metadata_drop(c, dev_idx, flags); ++} +diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h +new file mode 100644 +index 000000000000..027efaa0d575 +--- /dev/null ++++ b/fs/bcachefs/migrate.h +@@ -0,0 +1,7 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MIGRATE_H ++#define _BCACHEFS_MIGRATE_H ++ ++int bch2_dev_data_drop(struct bch_fs *, unsigned, int); ++ ++#endif /* _BCACHEFS_MIGRATE_H */ +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +new file mode 100644 +index 000000000000..2f3be487ef65 +--- /dev/null ++++ b/fs/bcachefs/move.c +@@ -0,0 +1,819 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "inode.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "keylist.h" ++ ++#include ++#include ++ ++#include ++ ++#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 ++ ++struct moving_io { ++ struct list_head list; ++ struct closure cl; ++ bool read_completed; ++ ++ unsigned read_sectors; ++ unsigned write_sectors; ++ ++ struct bch_read_bio rbio; ++ ++ struct migrate_write write; ++ /* Must be last since it is variable size */ ++ struct bio_vec bi_inline_vecs[0]; ++}; ++ ++struct moving_context { ++ /* Closure for waiting on all reads and writes to complete */ ++ struct closure cl; ++ ++ struct bch_move_stats *stats; ++ ++ struct list_head reads; ++ ++ /* in flight sectors: */ ++ atomic_t read_sectors; ++ atomic_t write_sectors; ++ ++ wait_queue_head_t wait; ++}; ++ ++static int bch2_migrate_index_update(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct migrate_write *m = ++ container_of(op, struct migrate_write, op); ++ struct keylist *keys = &op->insert_keys; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, m->btree_id, ++ bkey_start_pos(&bch2_keylist_front(keys)->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ while (1) { ++ struct bkey_s_c k; ++ struct bkey_i *insert; ++ struct bkey_i_extent *new; ++ BKEY_PADDED(k) _new, _insert; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ bool did_work = false; ++ int nr; ++ ++ bch2_trans_reset(&trans, 0); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ if (ret == -EINTR) ++ continue; ++ break; ++ } ++ ++ new = bkey_i_to_extent(bch2_keylist_front(keys)); ++ ++ if (bversion_cmp(k.k->version, new->k.version) || ++ !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) ++ goto nomatch; ++ ++ if (m->data_cmd == DATA_REWRITE && ++ !bch2_bkey_has_device(k, m->data_opts.rewrite_dev)) ++ goto nomatch; ++ ++ bkey_reassemble(&_insert.k, k); ++ insert = &_insert.k; ++ ++ bkey_copy(&_new.k, bch2_keylist_front(keys)); ++ new = bkey_i_to_extent(&_new.k); ++ bch2_cut_front(iter->pos, &new->k_i); ++ ++ bch2_cut_front(iter->pos, insert); ++ bch2_cut_back(new->k.p, insert); ++ bch2_cut_back(insert->k.p, &new->k_i); ++ ++ if (m->data_cmd == DATA_REWRITE) ++ bch2_bkey_drop_device(bkey_i_to_s(insert), ++ m->data_opts.rewrite_dev); ++ ++ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) { ++ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { ++ /* ++ * raced with another move op? 
extent already ++ * has a pointer to the device we just wrote ++ * data to ++ */ ++ continue; ++ } ++ ++ bch2_extent_ptr_decoded_append(insert, &p); ++ did_work = true; ++ } ++ ++ if (!did_work) ++ goto nomatch; ++ ++ bch2_bkey_narrow_crcs(insert, ++ (struct bch_extent_crc_unpacked) { 0 }); ++ bch2_extent_normalize(c, bkey_i_to_s(insert)); ++ bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), ++ op->opts.background_target, ++ op->opts.data_replicas); ++ ++ /* ++ * If we're not fully overwriting @k, and it's compressed, we ++ * need a reservation for all the pointers in @insert ++ */ ++ nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - ++ m->nr_ptrs_reserved; ++ ++ if (insert->k.size < k.k->size && ++ bch2_bkey_sectors_compressed(k) && ++ nr > 0) { ++ ret = bch2_disk_reservation_add(c, &op->res, ++ keylist_sectors(keys) * nr, 0); ++ if (ret) ++ goto out; ++ ++ m->nr_ptrs_reserved += nr; ++ goto next; ++ } ++ ++ bch2_trans_update(&trans, iter, insert, 0); ++ ++ ret = bch2_trans_commit(&trans, &op->res, ++ op_journal_seq(op), ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ m->data_opts.btree_insert_flags); ++ if (!ret) ++ atomic_long_inc(&c->extent_migrate_done); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++next: ++ while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { ++ bch2_keylist_pop_front(keys); ++ if (bch2_keylist_empty(keys)) ++ goto out; ++ } ++ continue; ++nomatch: ++ if (m->ctxt) { ++ BUG_ON(k.k->p.offset <= iter->pos.offset); ++ atomic64_inc(&m->ctxt->stats->keys_raced); ++ atomic64_add(k.k->p.offset - iter->pos.offset, ++ &m->ctxt->stats->sectors_raced); ++ } ++ atomic_long_inc(&c->extent_migrate_raced); ++ trace_move_race(&new->k); ++ bch2_btree_iter_next_slot(iter); ++ goto next; ++ } ++out: ++ bch2_trans_exit(&trans); ++ BUG_ON(ret == -EINTR); ++ return ret; ++} ++ ++void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) ++{ ++ /* write bio must own pages: */ ++ BUG_ON(!m->op.wbio.bio.bi_vcnt); ++ ++ m->ptr = rbio->pick.ptr; ++ m->offset = rbio->pos.offset - rbio->pick.crc.offset; ++ m->op.devs_have = rbio->devs_have; ++ m->op.pos = rbio->pos; ++ m->op.version = rbio->version; ++ m->op.crc = rbio->pick.crc; ++ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; ++ ++ if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { ++ m->op.nonce = m->op.crc.nonce + m->op.crc.offset; ++ m->op.csum_type = m->op.crc.csum_type; ++ } ++ ++ if (m->data_cmd == DATA_REWRITE) ++ bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); ++} ++ ++int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, ++ struct write_point_specifier wp, ++ struct bch_io_opts io_opts, ++ enum data_cmd data_cmd, ++ struct data_opts data_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int ret; ++ ++ m->btree_id = btree_id; ++ m->data_cmd = data_cmd; ++ m->data_opts = data_opts; ++ m->nr_ptrs_reserved = 0; ++ ++ bch2_write_op_init(&m->op, c, io_opts); ++ ++ if (!bch2_bkey_is_incompressible(k)) ++ m->op.compression_type = ++ bch2_compression_opt_to_type[io_opts.background_compression ?: ++ io_opts.compression]; ++ else ++ m->op.incompressible = true; ++ ++ m->op.target = data_opts.target, ++ m->op.write_point = wp; ++ ++ if (m->data_opts.btree_insert_flags & BTREE_INSERT_USE_RESERVE) { ++ m->op.alloc_reserve = RESERVE_MOVINGGC; ++ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; ++ } 
else { ++ /* XXX: this should probably be passed in */ ++ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; ++ } ++ ++ m->op.flags |= BCH_WRITE_PAGES_STABLE| ++ BCH_WRITE_PAGES_OWNED| ++ BCH_WRITE_DATA_ENCODED| ++ BCH_WRITE_FROM_INTERNAL; ++ ++ m->op.nr_replicas = 1; ++ m->op.nr_replicas_required = 1; ++ m->op.index_update_fn = bch2_migrate_index_update; ++ ++ switch (data_cmd) { ++ case DATA_ADD_REPLICAS: { ++ /* ++ * DATA_ADD_REPLICAS is used for moving data to a different ++ * device in the background, and due to compression the new copy ++ * might take up more space than the old copy: ++ */ ++#if 0 ++ int nr = (int) io_opts.data_replicas - ++ bch2_bkey_nr_ptrs_allocated(k); ++#endif ++ int nr = (int) io_opts.data_replicas; ++ ++ if (nr > 0) { ++ m->op.nr_replicas = m->nr_ptrs_reserved = nr; ++ ++ ret = bch2_disk_reservation_get(c, &m->op.res, ++ k.k->size, m->op.nr_replicas, 0); ++ if (ret) ++ return ret; ++ } ++ break; ++ } ++ case DATA_REWRITE: { ++ unsigned compressed_sectors = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ crc_is_compressed(p.crc) && ++ bch2_dev_in_target(c, p.ptr.dev, data_opts.target)) ++ compressed_sectors += p.crc.compressed_size; ++ ++ if (compressed_sectors) { ++ ret = bch2_disk_reservation_add(c, &m->op.res, ++ compressed_sectors, ++ BCH_DISK_RESERVATION_NOFAIL); ++ if (ret) ++ return ret; ++ } ++ break; ++ } ++ case DATA_PROMOTE: ++ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; ++ m->op.flags |= BCH_WRITE_CACHED; ++ break; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++static void move_free(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ struct moving_context *ctxt = io->write.ctxt; ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); ++ ++ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) ++ if (bv->bv_page) ++ __free_page(bv->bv_page); ++ ++ wake_up(&ctxt->wait); ++ ++ kfree(io); ++} ++ ++static void move_write_done(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ ++ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); ++ closure_return_with_destructor(cl, move_free); ++} ++ ++static void move_write(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ ++ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { ++ closure_return_with_destructor(cl, move_free); ++ return; ++ } ++ ++ bch2_migrate_read_done(&io->write, &io->rbio); ++ ++ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); ++ closure_call(&io->write.op.cl, bch2_write, NULL, cl); ++ continue_at(cl, move_write_done, NULL); ++} ++ ++static inline struct moving_io *next_pending_write(struct moving_context *ctxt) ++{ ++ struct moving_io *io = ++ list_first_entry_or_null(&ctxt->reads, struct moving_io, list); ++ ++ return io && io->read_completed ? 
io : NULL; ++} ++ ++static void move_read_endio(struct bio *bio) ++{ ++ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); ++ struct moving_context *ctxt = io->write.ctxt; ++ ++ atomic_sub(io->read_sectors, &ctxt->read_sectors); ++ io->read_completed = true; ++ ++ if (next_pending_write(ctxt)) ++ wake_up(&ctxt->wait); ++ ++ closure_put(&ctxt->cl); ++} ++ ++static void do_pending_writes(struct moving_context *ctxt) ++{ ++ struct moving_io *io; ++ ++ while ((io = next_pending_write(ctxt))) { ++ list_del(&io->list); ++ closure_call(&io->cl, move_write, NULL, &ctxt->cl); ++ } ++} ++ ++#define move_ctxt_wait_event(_ctxt, _cond) \ ++do { \ ++ do_pending_writes(_ctxt); \ ++ \ ++ if (_cond) \ ++ break; \ ++ __wait_event((_ctxt)->wait, \ ++ next_pending_write(_ctxt) || (_cond)); \ ++} while (1) ++ ++static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) ++{ ++ unsigned sectors_pending = atomic_read(&ctxt->write_sectors); ++ ++ move_ctxt_wait_event(ctxt, ++ !atomic_read(&ctxt->write_sectors) || ++ atomic_read(&ctxt->write_sectors) != sectors_pending); ++} ++ ++static int bch2_move_extent(struct bch_fs *c, ++ struct moving_context *ctxt, ++ struct write_point_specifier wp, ++ struct bch_io_opts io_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ enum data_cmd data_cmd, ++ struct data_opts data_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct moving_io *io; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned sectors = k.k->size, pages; ++ int ret = -ENOMEM; ++ ++ move_ctxt_wait_event(ctxt, ++ atomic_read(&ctxt->write_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ move_ctxt_wait_event(ctxt, ++ atomic_read(&ctxt->read_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ /* write path might have to decompress data: */ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); ++ ++ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ io = kzalloc(sizeof(struct moving_io) + ++ sizeof(struct bio_vec) * pages, GFP_KERNEL); ++ if (!io) ++ goto err; ++ ++ io->write.ctxt = ctxt; ++ io->read_sectors = k.k->size; ++ io->write_sectors = k.k->size; ++ ++ bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); ++ bio_set_prio(&io->write.op.wbio.bio, ++ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ ++ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, ++ GFP_KERNEL)) ++ goto err_free; ++ ++ io->rbio.c = c; ++ io->rbio.opts = io_opts; ++ bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); ++ io->rbio.bio.bi_vcnt = pages; ++ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ io->rbio.bio.bi_iter.bi_size = sectors << 9; ++ ++ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); ++ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); ++ io->rbio.bio.bi_end_io = move_read_endio; ++ ++ ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, ++ data_cmd, data_opts, btree_id, k); ++ if (ret) ++ goto err_free_pages; ++ ++ atomic64_inc(&ctxt->stats->keys_moved); ++ atomic64_add(k.k->size, &ctxt->stats->sectors_moved); ++ ++ trace_move_extent(k.k); ++ ++ atomic_add(io->read_sectors, &ctxt->read_sectors); ++ list_add_tail(&io->list, &ctxt->reads); ++ ++ /* ++ * dropped by move_read_endio() - guards against use after free of ++ * ctxt when doing wakeup ++ */ ++ closure_get(&ctxt->cl); ++ bch2_read_extent(c, &io->rbio, k, 0, ++ BCH_READ_NODECODE| ++ BCH_READ_LAST_FRAGMENT); ++ return 0; ++err_free_pages: ++ bio_free_pages(&io->write.op.wbio.bio); 
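/*
 * Aside (illustrative, not part of the patch): bch2_move_extent() above
 * throttles itself with move_ctxt_wait_event(), refusing to queue another
 * extent until the sectors currently in flight for reads/writes drop below
 * SECTORS_IN_FLIGHT_PER_DEVICE; move_read_endio()/move_write_done() then
 * decrement the counters and wake the waiter. Below is a rough userspace
 * analogue of that pattern, assuming a mutex/condition variable in place of
 * the kernel waitqueue and atomics; all names in the sketch are hypothetical.
 */
#include <pthread.h>

struct inflight_limit {
	pthread_mutex_t lock;
	pthread_cond_t	cond;
	unsigned	in_flight;	/* e.g. sectors currently queued */
	unsigned	max;		/* cap, like SECTORS_IN_FLIGHT_PER_DEVICE */
};

/* submission path: block until @nr more units fit under the cap */
void inflight_get(struct inflight_limit *l, unsigned nr)
{
	pthread_mutex_lock(&l->lock);
	while (l->in_flight + nr > l->max)
		pthread_cond_wait(&l->cond, &l->lock);
	l->in_flight += nr;
	pthread_mutex_unlock(&l->lock);
}

/* completion path (cf. move_read_endio()): release and wake submitters */
void inflight_put(struct inflight_limit *l, unsigned nr)
{
	pthread_mutex_lock(&l->lock);
	l->in_flight -= nr;
	pthread_cond_broadcast(&l->cond);
	pthread_mutex_unlock(&l->lock);
}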
++err_free: ++ kfree(io); ++err: ++ trace_move_alloc_fail(k.k); ++ return ret; ++} ++ ++static int __bch2_move_data(struct bch_fs *c, ++ struct moving_context *ctxt, ++ struct bch_ratelimit *rate, ++ struct write_point_specifier wp, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ struct bch_move_stats *stats, ++ enum btree_id btree_id) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct bkey_on_stack sk; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct data_opts data_opts; ++ enum data_cmd data_cmd; ++ u64 delay, cur_inum = U64_MAX; ++ int ret = 0, ret2; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ stats->data_type = BCH_DATA_user; ++ stats->btree_id = btree_id; ++ stats->pos = POS_MIN; ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, start, ++ BTREE_ITER_PREFETCH); ++ ++ if (rate) ++ bch2_ratelimit_reset(rate); ++ ++ while (1) { ++ do { ++ delay = rate ? bch2_ratelimit_delay(rate) : 0; ++ ++ if (delay) { ++ bch2_trans_unlock(&trans); ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ ++ if (kthread && (ret = kthread_should_stop())) { ++ __set_current_state(TASK_RUNNING); ++ goto out; ++ } ++ ++ if (delay) ++ schedule_timeout(delay); ++ ++ if (unlikely(freezing(current))) { ++ bch2_trans_unlock(&trans); ++ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); ++ try_to_freeze(); ++ } ++ } while (delay); ++peek: ++ k = bch2_btree_iter_peek(iter); ++ ++ stats->pos = iter->pos; ++ ++ if (!k.k) ++ break; ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (!bkey_extent_is_direct_data(k.k)) ++ goto next_nondata; ++ ++ if (btree_id == BTREE_ID_EXTENTS && ++ cur_inum != k.k->p.inode) { ++ struct bch_inode_unpacked inode; ++ ++ /* don't hold btree locks while looking up inode: */ ++ bch2_trans_unlock(&trans); ++ ++ io_opts = bch2_opts_to_inode_opts(c->opts); ++ if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) ++ bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); ++ cur_inum = k.k->p.inode; ++ goto peek; ++ } ++ ++ switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { ++ case DATA_SKIP: ++ goto next; ++ case DATA_SCRUB: ++ BUG(); ++ case DATA_ADD_REPLICAS: ++ case DATA_REWRITE: ++ case DATA_PROMOTE: ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* unlock before doing IO: */ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ bch2_trans_unlock(&trans); ++ ++ ret2 = bch2_move_extent(c, ctxt, wp, io_opts, btree_id, k, ++ data_cmd, data_opts); ++ if (ret2) { ++ if (ret2 == -ENOMEM) { ++ /* memory allocation failure, wait for some IO to finish */ ++ bch2_move_ctxt_wait_for_io(ctxt); ++ continue; ++ } ++ ++ /* XXX signal failure */ ++ goto next; ++ } ++ ++ if (rate) ++ bch2_ratelimit_increment(rate, k.k->size); ++next: ++ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), ++ &stats->sectors_seen); ++next_nondata: ++ bch2_btree_iter_next(iter); ++ bch2_trans_cond_resched(&trans); ++ } ++out: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&sk, c); ++ ++ return ret; ++} ++ ++int bch2_move_data(struct bch_fs *c, ++ struct bch_ratelimit *rate, ++ struct write_point_specifier wp, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ struct bch_move_stats *stats) ++{ ++ struct moving_context ctxt = { .stats = stats }; ++ int ret; ++ ++ closure_init_stack(&ctxt.cl); ++ 
INIT_LIST_HEAD(&ctxt.reads); ++ init_waitqueue_head(&ctxt.wait); ++ ++ stats->data_type = BCH_DATA_user; ++ ++ ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, ++ pred, arg, stats, BTREE_ID_EXTENTS) ?: ++ __bch2_move_data(c, &ctxt, rate, wp, start, end, ++ pred, arg, stats, BTREE_ID_REFLINK); ++ ++ move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); ++ closure_sync(&ctxt.cl); ++ ++ EBUG_ON(atomic_read(&ctxt.write_sectors)); ++ ++ trace_move_data(c, ++ atomic64_read(&stats->sectors_moved), ++ atomic64_read(&stats->keys_moved)); ++ ++ return ret; ++} ++ ++static int bch2_move_btree(struct bch_fs *c, ++ move_pred_fn pred, ++ void *arg, ++ struct bch_move_stats *stats) ++{ ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned id; ++ struct data_opts data_opts; ++ enum data_cmd cmd; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ stats->data_type = BCH_DATA_btree; ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ stats->btree_id = id; ++ ++ for_each_btree_node(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH, b) { ++ stats->pos = iter->pos; ++ ++ switch ((cmd = pred(c, arg, ++ bkey_i_to_s_c(&b->key), ++ &io_opts, &data_opts))) { ++ case DATA_SKIP: ++ goto next; ++ case DATA_SCRUB: ++ BUG(); ++ case DATA_ADD_REPLICAS: ++ case DATA_REWRITE: ++ break; ++ default: ++ BUG(); ++ } ++ ++ ret = bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, 0) ?: ret; ++next: ++ bch2_trans_cond_resched(&trans); ++ } ++ ++ ret = bch2_trans_iter_free(&trans, iter) ?: ret; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++#if 0 ++static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ return DATA_SCRUB; ++} ++#endif ++ ++static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ unsigned nr_good = bch2_bkey_durability(c, k); ++ unsigned replicas = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ replicas = c->opts.metadata_replicas; ++ break; ++ case KEY_TYPE_extent: ++ replicas = io_opts->data_replicas; ++ break; ++ } ++ ++ if (!nr_good || nr_good >= replicas) ++ return DATA_SKIP; ++ ++ data_opts->target = 0; ++ data_opts->btree_insert_flags = 0; ++ return DATA_ADD_REPLICAS; ++} ++ ++static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ struct bch_ioctl_data *op = arg; ++ ++ if (!bch2_bkey_has_device(k, op->migrate.dev)) ++ return DATA_SKIP; ++ ++ data_opts->target = 0; ++ data_opts->btree_insert_flags = 0; ++ data_opts->rewrite_dev = op->migrate.dev; ++ return DATA_REWRITE; ++} ++ ++int bch2_data_job(struct bch_fs *c, ++ struct bch_move_stats *stats, ++ struct bch_ioctl_data op) ++{ ++ int ret = 0; ++ ++ switch (op.op) { ++ case BCH_DATA_OP_REREPLICATE: ++ stats->data_type = BCH_DATA_journal; ++ ret = bch2_journal_flush_device_pins(&c->journal, -1); ++ ++ ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; ++ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, NULL, ++ writepoint_hashed((unsigned long) current), ++ op.start, ++ op.end, ++ rereplicate_pred, c, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ case 
BCH_DATA_OP_MIGRATE: ++ if (op.migrate.dev >= c->sb.nr_devices) ++ return -EINVAL; ++ ++ stats->data_type = BCH_DATA_journal; ++ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ++ ++ ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, NULL, ++ writepoint_hashed((unsigned long) current), ++ op.start, ++ op.end, ++ migrate_pred, &op, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +new file mode 100644 +index 000000000000..0acd1720d4f8 +--- /dev/null ++++ b/fs/bcachefs/move.h +@@ -0,0 +1,64 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_H ++#define _BCACHEFS_MOVE_H ++ ++#include "btree_iter.h" ++#include "buckets.h" ++#include "io_types.h" ++#include "move_types.h" ++ ++struct bch_read_bio; ++struct moving_context; ++ ++enum data_cmd { ++ DATA_SKIP, ++ DATA_SCRUB, ++ DATA_ADD_REPLICAS, ++ DATA_REWRITE, ++ DATA_PROMOTE, ++}; ++ ++struct data_opts { ++ u16 target; ++ unsigned rewrite_dev; ++ int btree_insert_flags; ++}; ++ ++struct migrate_write { ++ enum btree_id btree_id; ++ enum data_cmd data_cmd; ++ struct data_opts data_opts; ++ ++ unsigned nr_ptrs_reserved; ++ ++ struct moving_context *ctxt; ++ ++ /* what we read: */ ++ struct bch_extent_ptr ptr; ++ u64 offset; ++ ++ struct bch_write_op op; ++}; ++ ++void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); ++int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, ++ struct write_point_specifier, ++ struct bch_io_opts, ++ enum data_cmd, struct data_opts, ++ enum btree_id, struct bkey_s_c); ++ ++typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, ++ struct bkey_s_c, ++ struct bch_io_opts *, struct data_opts *); ++ ++int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, ++ struct write_point_specifier, ++ struct bpos, struct bpos, ++ move_pred_fn, void *, ++ struct bch_move_stats *); ++ ++int bch2_data_job(struct bch_fs *, ++ struct bch_move_stats *, ++ struct bch_ioctl_data); ++ ++#endif /* _BCACHEFS_MOVE_H */ +diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h +new file mode 100644 +index 000000000000..fc0de165af9f +--- /dev/null ++++ b/fs/bcachefs/move_types.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_TYPES_H ++#define _BCACHEFS_MOVE_TYPES_H ++ ++struct bch_move_stats { ++ enum bch_data_type data_type; ++ enum btree_id btree_id; ++ struct bpos pos; ++ ++ atomic64_t keys_moved; ++ atomic64_t keys_raced; ++ atomic64_t sectors_moved; ++ atomic64_t sectors_seen; ++ atomic64_t sectors_raced; ++}; ++ ++#endif /* _BCACHEFS_MOVE_TYPES_H */ +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +new file mode 100644 +index 000000000000..de0a7974ec9f +--- /dev/null ++++ b/fs/bcachefs/movinggc.c +@@ -0,0 +1,359 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Moving/copying garbage collector ++ * ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "error.h" ++#include "extents.h" ++#include "eytzinger.h" ++#include "io.h" ++#include "keylist.h" ++#include "move.h" ++#include "movinggc.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * We can't use the entire copygc reserve in one iteration of copygc: we may ++ * need the buckets we're freeing up to go back into the copygc reserve to make ++ * forward progress, but if the copygc reserve is full they'll be available for ++ * any allocation - and it's possible that in a given iteration, we free up most ++ * of the buckets we're going to free before we allocate most of the buckets ++ * we're going to allocate. ++ * ++ * If we only use half of the reserve per iteration, then in steady state we'll ++ * always have room in the reserve for the buckets we're going to need in the ++ * next iteration: ++ */ ++#define COPYGC_BUCKETS_PER_ITER(ca) \ ++ ((ca)->free[RESERVE_MOVINGGC].size / 2) ++ ++static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) ++{ ++ const struct copygc_heap_entry *l = _l; ++ const struct copygc_heap_entry *r = _r; ++ ++ return cmp_int(l->dev, r->dev) ?: ++ cmp_int(l->offset, r->offset); ++} ++ ++static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct copygc_heap_entry search = { ++ .dev = ptr->dev, ++ .offset = ptr->offset ++ }; ++ ++ ssize_t i = eytzinger0_find_le(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, &search); ++#if 0 ++ /* eytzinger search verify code: */ ++ ssize_t j = -1, k; ++ ++ for (k = 0; k < h->used; k++) ++ if (h->data[k].offset <= ptr->offset && ++ (j < 0 || h->data[k].offset > h->data[j].offset)) ++ j = k; ++ ++ BUG_ON(i != j); ++#endif ++ if (i >= 0 && ++ ptr->offset < h->data[i].offset + ca->mi.bucket_size && ++ ptr->gen == h->data[i].gen) ++ return ptr->dev; ++ } ++ ++ return -1; ++} ++ ++static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ int dev_idx = __copygc_pred(c, k); ++ if (dev_idx < 0) ++ return DATA_SKIP; ++ ++ data_opts->target = io_opts->background_target; ++ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; ++ data_opts->rewrite_dev = dev_idx; ++ return DATA_REWRITE; ++} ++ ++static bool have_copygc_reserve(struct bch_dev *ca) ++{ ++ bool ret; ++ ++ spin_lock(&ca->fs->freelist_lock); ++ ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || ++ ca->allocator_state != ALLOCATOR_RUNNING; ++ spin_unlock(&ca->fs->freelist_lock); ++ ++ return ret; ++} ++ ++static inline int fragmentation_cmp(copygc_heap *heap, ++ struct copygc_heap_entry l, ++ struct copygc_heap_entry r) ++{ ++ return cmp_int(l.fragmentation, r.fragmentation); ++} ++ ++static int bch2_copygc(struct bch_fs *c) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct copygc_heap_entry e, *i; ++ struct bucket_array *buckets; ++ struct bch_move_stats move_stats; ++ u64 sectors_to_move = 0, sectors_not_moved = 0; ++ u64 sectors_reserved = 0; ++ u64 buckets_to_move, buckets_not_moved = 0; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ size_t b, 
heap_size = 0; ++ int ret; ++ ++ memset(&move_stats, 0, sizeof(move_stats)); ++ /* ++ * Find buckets with lowest sector counts, skipping completely ++ * empty buckets, by building a maxheap sorted by sector count, ++ * and repeatedly replacing the maximum element until all ++ * buckets have been visited. ++ */ ++ h->used = 0; ++ ++ for_each_rw_member(ca, c, dev_idx) ++ heap_size += ca->mi.nbuckets >> 7; ++ ++ if (h->size < heap_size) { ++ free_heap(&c->copygc_heap); ++ if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { ++ bch_err(c, "error allocating copygc heap"); ++ return 0; ++ } ++ } ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); ++ ++ spin_lock(&ca->fs->freelist_lock); ++ sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; ++ spin_unlock(&ca->fs->freelist_lock); ++ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ struct copygc_heap_entry e; ++ ++ if (m.owned_by_allocator || ++ m.data_type != BCH_DATA_user || ++ !bucket_sectors_used(m) || ++ bucket_sectors_used(m) >= ca->mi.bucket_size) ++ continue; ++ ++ e = (struct copygc_heap_entry) { ++ .dev = dev_idx, ++ .gen = m.gen, ++ .fragmentation = bucket_sectors_used(m) * (1U << 15) ++ / ca->mi.bucket_size, ++ .sectors = bucket_sectors_used(m), ++ .offset = bucket_to_sector(ca, b), ++ }; ++ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); ++ } ++ up_read(&ca->bucket_lock); ++ } ++ ++ if (!sectors_reserved) { ++ bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); ++ return -1; ++ } ++ ++ for (i = h->data; i < h->data + h->used; i++) ++ sectors_to_move += i->sectors; ++ ++ while (sectors_to_move > sectors_reserved) { ++ BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); ++ sectors_to_move -= e.sectors; ++ } ++ ++ buckets_to_move = h->used; ++ ++ if (!buckets_to_move) ++ return 0; ++ ++ eytzinger0_sort(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, NULL); ++ ++ ret = bch2_move_data(c, &c->copygc_pd.rate, ++ writepoint_ptr(&c->copygc_write_point), ++ POS_MIN, POS_MAX, ++ copygc_pred, NULL, ++ &move_stats); ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ for (i = h->data; i < h->data + h->used; i++) { ++ struct bucket_mark m; ++ size_t b; ++ ++ if (i->dev != dev_idx) ++ continue; ++ ++ b = sector_to_bucket(ca, i->offset); ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (i->gen == m.gen && ++ bucket_sectors_used(m)) { ++ sectors_not_moved += bucket_sectors_used(m); ++ buckets_not_moved++; ++ } ++ } ++ up_read(&ca->bucket_lock); ++ } ++ ++ if (sectors_not_moved && !ret) ++ bch_warn_ratelimited(c, ++ "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", ++ sectors_not_moved, sectors_to_move, ++ buckets_not_moved, buckets_to_move, ++ atomic64_read(&move_stats.sectors_moved), ++ atomic64_read(&move_stats.keys_raced), ++ atomic64_read(&move_stats.sectors_raced)); ++ ++ trace_copygc(c, ++ atomic64_read(&move_stats.sectors_moved), sectors_not_moved, ++ buckets_to_move, buckets_not_moved); ++ return 0; ++} ++ ++/* ++ * Copygc runs when the amount of fragmented data is above some arbitrary ++ * threshold: ++ * ++ * The threshold at the limit - when the device is full - is the amount of space ++ * we reserved in bch2_recalc_capacity; we can't have more 
than that amount of ++ * disk space stranded due to fragmentation and store everything we have ++ * promised to store. ++ * ++ * But we don't want to be running copygc unnecessarily when the device still ++ * has plenty of free space - rather, we want copygc to smoothly run every so ++ * often and continually reduce the amount of fragmented space as the device ++ * fills up. So, we increase the threshold by half the current free space. ++ */ ++unsigned long bch2_copygc_wait_amount(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ u64 fragmented_allowed = c->copygc_threshold; ++ u64 fragmented = 0; ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ struct bch_dev_usage usage = bch2_dev_usage_read(ca); ++ ++ fragmented_allowed += ((__dev_buckets_available(ca, usage) * ++ ca->mi.bucket_size) >> 1); ++ fragmented += usage.sectors_fragmented; ++ } ++ ++ return max_t(s64, 0, fragmented_allowed - fragmented); ++} ++ ++static int bch2_copygc_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ unsigned long last, wait; ++ ++ set_freezable(); ++ ++ while (!kthread_should_stop()) { ++ if (kthread_wait_freezable(c->copy_gc_enabled)) ++ break; ++ ++ last = atomic_long_read(&clock->now); ++ wait = bch2_copygc_wait_amount(c); ++ ++ if (wait > clock->max_slop) { ++ bch2_kthread_io_clock_wait(clock, last + wait, ++ MAX_SCHEDULE_TIMEOUT); ++ continue; ++ } ++ ++ if (bch2_copygc(c)) ++ break; ++ } ++ ++ return 0; ++} ++ ++void bch2_copygc_stop(struct bch_fs *c) ++{ ++ c->copygc_pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&c->copygc_pd.rate); ++ ++ if (c->copygc_thread) { ++ kthread_stop(c->copygc_thread); ++ put_task_struct(c->copygc_thread); ++ } ++ c->copygc_thread = NULL; ++} ++ ++int bch2_copygc_start(struct bch_fs *c) ++{ ++ struct task_struct *t; ++ ++ if (c->copygc_thread) ++ return 0; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ if (bch2_fs_init_fault("copygc_start")) ++ return -ENOMEM; ++ ++ t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); ++ if (IS_ERR(t)) ++ return PTR_ERR(t); ++ ++ get_task_struct(t); ++ ++ c->copygc_thread = t; ++ wake_up_process(c->copygc_thread); ++ ++ return 0; ++} ++ ++void bch2_fs_copygc_init(struct bch_fs *c) ++{ ++ bch2_pd_controller_init(&c->copygc_pd); ++ c->copygc_pd.d_term = 0; ++} +diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h +new file mode 100644 +index 000000000000..922738247d03 +--- /dev/null ++++ b/fs/bcachefs/movinggc.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVINGGC_H ++#define _BCACHEFS_MOVINGGC_H ++ ++void bch2_copygc_stop(struct bch_fs *); ++int bch2_copygc_start(struct bch_fs *); ++void bch2_fs_copygc_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_MOVINGGC_H */ +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +new file mode 100644 +index 000000000000..afe25cd26c06 +--- /dev/null ++++ b/fs/bcachefs/opts.c +@@ -0,0 +1,437 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++ ++#include "bcachefs.h" ++#include "compress.h" ++#include "disk_groups.h" ++#include "opts.h" ++#include "super-io.h" ++#include "util.h" ++ ++const char * const bch2_error_actions[] = { ++ "continue", ++ "remount-ro", ++ "panic", ++ NULL ++}; ++ ++const char * const bch2_sb_features[] = { ++#define x(f, n) #f, ++ BCH_SB_FEATURES() ++#undef x ++ NULL ++}; ++ ++const char * const bch2_csum_opts[] = { ++ "none", ++ "crc32c", ++ "crc64", ++ NULL ++}; ++ ++const char * const bch2_compression_opts[] = { ++#define x(t, n) #t, ++ BCH_COMPRESSION_OPTS() 
++#undef x ++ NULL ++}; ++ ++const char * const bch2_str_hash_types[] = { ++ "crc32c", ++ "crc64", ++ "siphash", ++ NULL ++}; ++ ++const char * const bch2_data_types[] = { ++#define x(t, n) #t, ++ BCH_DATA_TYPES() ++#undef x ++ NULL ++}; ++ ++const char * const bch2_cache_replacement_policies[] = { ++ "lru", ++ "fifo", ++ "random", ++ NULL ++}; ++ ++/* Default is -1; we skip past it for struct cached_dev's cache mode */ ++const char * const bch2_cache_modes[] = { ++ "default", ++ "writethrough", ++ "writeback", ++ "writearound", ++ "none", ++ NULL ++}; ++ ++const char * const bch2_dev_state[] = { ++ "readwrite", ++ "readonly", ++ "failed", ++ "spare", ++ NULL ++}; ++ ++void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) ++{ ++#define x(_name, ...) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ ++ BCH_OPTS() ++#undef x ++} ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opt_defined(*opts, _name); ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opts->_name; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ opt_set(*opts, _name, v); \ ++ break; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Initial options from superblock - here we don't want any options undefined, ++ * any options the superblock doesn't specify are set to 0: ++ */ ++struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ ++#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ ++ if (_sb_opt != NO_SB_OPT) \ ++ opt_set(opts, _name, _sb_opt(sb)); ++ BCH_OPTS() ++#undef x ++ ++ return opts; ++} ++ ++const struct bch_option bch2_opt_table[] = { ++#define OPT_BOOL() .type = BCH_OPT_BOOL ++#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max ++#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max ++#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices ++#define OPT_FN(_fn) .type = BCH_OPT_FN, \ ++ .parse = _fn##_parse, \ ++ .to_text = _fn##_to_text ++ ++#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ ++ [Opt_##_name] = { \ ++ .attr = { \ ++ .name = #_name, \ ++ .mode = (_mode) & OPT_RUNTIME ? 
0644 : 0444, \ ++ }, \ ++ .mode = _mode, \ ++ .hint = _hint, \ ++ .help = _help, \ ++ .set_sb = SET_##_sb_opt, \ ++ _type \ ++ }, ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++int bch2_opt_lookup(const char *name) ++{ ++ const struct bch_option *i; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); ++ i++) ++ if (!strcmp(name, i->attr.name)) ++ return i - bch2_opt_table; ++ ++ return -1; ++} ++ ++struct synonym { ++ const char *s1, *s2; ++}; ++ ++static const struct synonym bch_opt_synonyms[] = { ++ { "quota", "usrquota" }, ++}; ++ ++static int bch2_mount_opt_lookup(const char *name) ++{ ++ const struct synonym *i; ++ ++ for (i = bch_opt_synonyms; ++ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); ++ i++) ++ if (!strcmp(name, i->s1)) ++ name = i->s2; ++ ++ return bch2_opt_lookup(name); ++} ++ ++int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, ++ const char *val, u64 *res) ++{ ++ ssize_t ret; ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ ret = kstrtou64(val, 10, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res > 1) ++ return -ERANGE; ++ break; ++ case BCH_OPT_UINT: ++ ret = kstrtou64(val, 10, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res < opt->min || *res >= opt->max) ++ return -ERANGE; ++ break; ++ case BCH_OPT_SECTORS: ++ ret = bch2_strtou64_h(val, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res & 511) ++ return -EINVAL; ++ ++ *res >>= 9; ++ ++ if (*res < opt->min || *res >= opt->max) ++ return -ERANGE; ++ break; ++ case BCH_OPT_STR: ++ ret = match_string(opt->choices, -1, val); ++ if (ret < 0) ++ return ret; ++ ++ *res = ret; ++ break; ++ case BCH_OPT_FN: ++ if (!c) ++ return -EINVAL; ++ ++ return opt->parse(c, val, res); ++ } ++ ++ return 0; ++} ++ ++void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, ++ const struct bch_option *opt, u64 v, ++ unsigned flags) ++{ ++ if (flags & OPT_SHOW_MOUNT_STYLE) { ++ if (opt->type == BCH_OPT_BOOL) { ++ pr_buf(out, "%s%s", ++ v ? 
"" : "no", ++ opt->attr.name); ++ return; ++ } ++ ++ pr_buf(out, "%s=", opt->attr.name); ++ } ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ case BCH_OPT_UINT: ++ pr_buf(out, "%lli", v); ++ break; ++ case BCH_OPT_SECTORS: ++ bch2_hprint(out, v); ++ break; ++ case BCH_OPT_STR: ++ if (flags & OPT_SHOW_FULL_LIST) ++ bch2_string_opt_to_text(out, opt->choices, v); ++ else ++ pr_buf(out, opt->choices[v]); ++ break; ++ case BCH_OPT_FN: ++ opt->to_text(out, c, v); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) ++{ ++ int ret = 0; ++ ++ switch (id) { ++ case Opt_compression: ++ case Opt_background_compression: ++ ret = bch2_check_set_has_compressed_data(c, v); ++ break; ++ case Opt_erasure_code: ++ if (v) ++ bch2_check_set_feature(c, BCH_FEATURE_ec); ++ break; ++ } ++ ++ return ret; ++} ++ ++int bch2_opts_check_may_set(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ ret = bch2_opt_check_may_set(c, i, ++ bch2_opt_get_by_id(&c->opts, i)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_parse_mount_opts(struct bch_opts *opts, char *options) ++{ ++ char *opt, *name, *val; ++ int ret, id; ++ u64 v; ++ ++ while ((opt = strsep(&options, ",")) != NULL) { ++ name = strsep(&opt, "="); ++ val = opt; ++ ++ if (val) { ++ id = bch2_mount_opt_lookup(name); ++ if (id < 0) ++ goto bad_opt; ++ ++ ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); ++ if (ret < 0) ++ goto bad_val; ++ } else { ++ id = bch2_mount_opt_lookup(name); ++ v = 1; ++ ++ if (id < 0 && ++ !strncmp("no", name, 2)) { ++ id = bch2_mount_opt_lookup(name + 2); ++ v = 0; ++ } ++ ++ if (id < 0) ++ goto bad_opt; ++ ++ if (bch2_opt_table[id].type != BCH_OPT_BOOL) ++ goto no_val; ++ } ++ ++ if (!(bch2_opt_table[id].mode & OPT_MOUNT)) ++ goto bad_opt; ++ ++ if (id == Opt_acl && ++ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) ++ goto bad_opt; ++ ++ if ((id == Opt_usrquota || ++ id == Opt_grpquota) && ++ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) ++ goto bad_opt; ++ ++ bch2_opt_set_by_id(opts, id, v); ++ } ++ ++ return 0; ++bad_opt: ++ pr_err("Bad mount option %s", name); ++ return -1; ++bad_val: ++ pr_err("Invalid value %s for mount option %s", val, name); ++ return -1; ++no_val: ++ pr_err("Mount option %s requires a value", name); ++ return -1; ++} ++ ++/* io opts: */ ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) ++{ ++ struct bch_io_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) ++{ ++ struct bch_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) ++{ ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++} ++ ++bool bch2_opt_is_inode_opt(enum bch_opt_id id) ++{ ++ static const enum bch_opt_id inode_opt_list[] = { ++#define x(_name, _bits) Opt_##_name, ++ BCH_INODE_OPTS() ++#undef x ++ }; ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) ++ if (inode_opt_list[i] == id) ++ return true; ++ ++ return false; ++} +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +new file mode 100644 +index 000000000000..014c608ca0c6 +--- /dev/null ++++ 
b/fs/bcachefs/opts.h +@@ -0,0 +1,440 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_OPTS_H ++#define _BCACHEFS_OPTS_H ++ ++#include ++#include ++#include ++#include ++#include "bcachefs_format.h" ++ ++extern const char * const bch2_error_actions[]; ++extern const char * const bch2_sb_features[]; ++extern const char * const bch2_csum_opts[]; ++extern const char * const bch2_compression_opts[]; ++extern const char * const bch2_str_hash_types[]; ++extern const char * const bch2_data_types[]; ++extern const char * const bch2_cache_replacement_policies[]; ++extern const char * const bch2_cache_modes[]; ++extern const char * const bch2_dev_state[]; ++ ++/* ++ * Mount options; we also store defaults in the superblock. ++ * ++ * Also exposed via sysfs: if an option is writeable, and it's also stored in ++ * the superblock, changing it via sysfs (currently? might change this) also ++ * updates the superblock. ++ * ++ * We store options as signed integers, where -1 means undefined. This means we ++ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only ++ * apply the options from that struct that are defined. ++ */ ++ ++/* dummy option, for options that aren't stored in the superblock */ ++LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); ++ ++/* When can be set: */ ++enum opt_mode { ++ OPT_FORMAT = (1 << 0), ++ OPT_MOUNT = (1 << 1), ++ OPT_RUNTIME = (1 << 2), ++ OPT_INODE = (1 << 3), ++ OPT_DEVICE = (1 << 4), ++}; ++ ++enum opt_type { ++ BCH_OPT_BOOL, ++ BCH_OPT_UINT, ++ BCH_OPT_SECTORS, ++ BCH_OPT_STR, ++ BCH_OPT_FN, ++}; ++ ++/** ++ * x(name, shortopt, type, in mem type, mode, sb_opt) ++ * ++ * @name - name of mount option, sysfs attribute, and struct bch_opts ++ * member ++ * ++ * @mode - when opt may be set ++ * ++ * @sb_option - name of corresponding superblock option ++ * ++ * @type - one of OPT_BOOL, OPT_UINT, OPT_STR ++ */ ++ ++/* ++ * XXX: add fields for ++ * - default value ++ * - helptext ++ */ ++ ++#ifdef __KERNEL__ ++#define RATELIMIT_ERRORS true ++#else ++#define RATELIMIT_ERRORS false ++#endif ++ ++#define BCH_OPTS() \ ++ x(block_size, u16, \ ++ OPT_FORMAT, \ ++ OPT_SECTORS(1, 128), \ ++ BCH_SB_BLOCK_SIZE, 8, \ ++ "size", NULL) \ ++ x(btree_node_size, u16, \ ++ OPT_FORMAT, \ ++ OPT_SECTORS(1, 512), \ ++ BCH_SB_BTREE_NODE_SIZE, 512, \ ++ "size", "Btree node size, default 256k") \ ++ x(errors, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_error_actions), \ ++ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ ++ NULL, "Action to take on filesystem error") \ ++ x(metadata_replicas, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_WANT, 1, \ ++ "#", "Number of metadata replicas") \ ++ x(data_replicas, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_WANT, 1, \ ++ "#", "Number of data replicas") \ ++ x(metadata_replicas_required, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(data_replicas_required, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(metadata_checksum, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_csum_opts), \ ++ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ ++ NULL, NULL) \ ++ x(data_checksum, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_csum_opts), \ ++ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ ++ NULL, 
NULL) \ ++ x(compression, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ ++ NULL, NULL) \ ++ x(background_compression, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ ++ NULL, NULL) \ ++ x(str_hash, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_str_hash_types), \ ++ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ ++ NULL, "Hash function for directory entries and xattrs")\ ++ x(foreground_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_FOREGROUND_TARGET, 0, \ ++ "(target)", "Device or disk group for foreground writes") \ ++ x(background_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_BACKGROUND_TARGET, 0, \ ++ "(target)", "Device or disk group to move data to in the background")\ ++ x(promote_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_PROMOTE_TARGET, 0, \ ++ "(target)", "Device or disk group to promote data to on read")\ ++ x(erasure_code, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_BOOL(), \ ++ BCH_SB_ERASURE_CODE, false, \ ++ NULL, "Enable erasure coding (DO NOT USE YET)") \ ++ x(inodes_32bit, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_INODE_32BIT, false, \ ++ NULL, "Constrain inode numbers to 32 bits") \ ++ x(gc_reserve_percent, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(5, 21), \ ++ BCH_SB_GC_RESERVE, 8, \ ++ "%", "Percentage of disk space to reserve for copygc")\ ++ x(gc_reserve_bytes, u64, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_SECTORS(0, U64_MAX), \ ++ BCH_SB_GC_RESERVE_BYTES, 0, \ ++ "%", "Amount of disk space to reserve for copygc\n" \ ++ "Takes precedence over gc_reserve_percent if set")\ ++ x(root_reserve_percent, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(0, 100), \ ++ BCH_SB_ROOT_RESERVE, 0, \ ++ "%", "Percentage of disk space to reserve for superuser")\ ++ x(wide_macs, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_128_BIT_MACS, false, \ ++ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ ++ x(inline_data, u8, \ ++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Enable inline data extents") \ ++ x(acl, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_POSIX_ACL, true, \ ++ NULL, "Enable POSIX acls") \ ++ x(usrquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_USRQUOTA, false, \ ++ NULL, "Enable user quotas") \ ++ x(grpquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_GRPQUOTA, false, \ ++ NULL, "Enable group quotas") \ ++ x(prjquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_PRJQUOTA, false, \ ++ NULL, "Enable project quotas") \ ++ x(reflink, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_REFLINK, true, \ ++ NULL, "Enable reflink support") \ ++ x(degraded, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Allow mounting in degraded mode") \ ++ x(discard, u8, \ ++ OPT_MOUNT|OPT_DEVICE, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Enable discard/TRIM support") \ ++ x(verbose, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Extra debugging information during mount/recovery")\ ++ x(journal_flush_disabled, u8, \ 
++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Disable journal flush on sync/fsync\n" \ ++ "If enabled, writes can be lost, but only since the\n"\ ++ "last journal write (default 1 second)") \ ++ x(fsck, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Run fsck on mount") \ ++ x(fix_errors, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Fix errors during fsck without asking") \ ++ x(ratelimit_errors, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, RATELIMIT_ERRORS, \ ++ NULL, "Ratelimit error messages during fsck") \ ++ x(nochanges, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Super read only mode - no writes at all will be issued,\n"\ ++ "even if we have to replay the journal") \ ++ x(norecovery, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't replay the journal") \ ++ x(rebuild_replicas, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Rebuild the superblock replicas section") \ ++ x(keep_journal, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't free journal entries/keys after startup")\ ++ x(read_entire_journal, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Read all journal entries, not just dirty ones")\ ++ x(noexcl, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't open device in exclusive mode") \ ++ x(sb, u64, \ ++ OPT_MOUNT, \ ++ OPT_UINT(0, S64_MAX), \ ++ NO_SB_OPT, BCH_SB_SECTOR, \ ++ "offset", "Sector offset of superblock") \ ++ x(read_only, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(nostart, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don\'t start filesystem, only open devices") \ ++ x(reconstruct_alloc, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Reconstruct alloc btree") \ ++ x(version_upgrade, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Set superblock to latest version,\n" \ ++ "allowing any new features to be used") \ ++ x(project, u8, \ ++ OPT_INODE, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(fs_size, u64, \ ++ OPT_DEVICE, \ ++ OPT_SECTORS(0, S64_MAX), \ ++ NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(bucket, u32, \ ++ OPT_DEVICE, \ ++ OPT_SECTORS(0, S64_MAX), \ ++ NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(durability, u8, \ ++ OPT_DEVICE, \ ++ OPT_UINT(0, BCH_REPLICAS_MAX), \ ++ NO_SB_OPT, 1, \ ++ "n", "Data written to this device will be considered\n"\ ++ "to have already been replicated n times") ++ ++struct bch_opts { ++#define x(_name, _bits, ...) unsigned _name##_defined:1; ++ BCH_OPTS() ++#undef x ++ ++#define x(_name, _bits, ...) _bits _name; ++ BCH_OPTS() ++#undef x ++}; ++ ++static const struct bch_opts bch2_opts_default = { ++#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ++ ._name##_defined = true, \ ++ ._name = _default, \ ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++#define opt_defined(_opts, _name) ((_opts)._name##_defined) ++ ++#define opt_get(_opts, _name) \ ++ (opt_defined(_opts, _name) ? 
(_opts)._name : bch2_opts_default._name) ++ ++#define opt_set(_opts, _name, _v) \ ++do { \ ++ (_opts)._name##_defined = true; \ ++ (_opts)._name = _v; \ ++} while (0) ++ ++static inline struct bch_opts bch2_opts_empty(void) ++{ ++ return (struct bch_opts) { 0 }; ++} ++ ++void bch2_opts_apply(struct bch_opts *, struct bch_opts); ++ ++enum bch_opt_id { ++#define x(_name, ...) Opt_##_name, ++ BCH_OPTS() ++#undef x ++ bch2_opts_nr ++}; ++ ++struct bch_fs; ++struct printbuf; ++ ++struct bch_option { ++ struct attribute attr; ++ void (*set_sb)(struct bch_sb *, u64); ++ enum opt_mode mode; ++ enum opt_type type; ++ ++ union { ++ struct { ++ u64 min, max; ++ }; ++ struct { ++ const char * const *choices; ++ }; ++ struct { ++ int (*parse)(struct bch_fs *, const char *, u64 *); ++ void (*to_text)(struct printbuf *, struct bch_fs *, u64); ++ }; ++ }; ++ ++ const char *hint; ++ const char *help; ++ ++}; ++ ++extern const struct bch_option bch2_opt_table[]; ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); ++u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); ++void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); ++ ++struct bch_opts bch2_opts_from_sb(struct bch_sb *); ++ ++int bch2_opt_lookup(const char *); ++int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); ++ ++#define OPT_SHOW_FULL_LIST (1 << 0) ++#define OPT_SHOW_MOUNT_STYLE (1 << 1) ++ ++void bch2_opt_to_text(struct printbuf *, struct bch_fs *, ++ const struct bch_option *, u64, unsigned); ++ ++int bch2_opt_check_may_set(struct bch_fs *, int, u64); ++int bch2_opts_check_may_set(struct bch_fs *); ++int bch2_parse_mount_opts(struct bch_opts *, char *); ++ ++/* inode opts: */ ++ ++struct bch_io_opts { ++#define x(_name, _bits) unsigned _name##_defined:1; ++ BCH_INODE_OPTS() ++#undef x ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_OPTS() ++#undef x ++}; ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); ++void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); ++bool bch2_opt_is_inode_opt(enum bch_opt_id); ++ ++#endif /* _BCACHEFS_OPTS_H */ +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +new file mode 100644 +index 000000000000..d3032a46e7f3 +--- /dev/null ++++ b/fs/bcachefs/quota.c +@@ -0,0 +1,783 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "inode.h" ++#include "quota.h" ++#include "super-io.h" ++ ++static const char *bch2_sb_validate_quota(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ ++ if (vstruct_bytes(&q->field) != sizeof(*q)) ++ return "invalid field quota: wrong size"; ++ ++ return NULL; ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_quota = { ++ .validate = bch2_sb_validate_quota, ++}; ++ ++const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (k.k->p.inode >= QTYP_NR) ++ return "invalid quota type"; ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++static const char * const bch2_quota_counters[] = { ++ "space", ++ "inodes", ++}; ++ ++void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); ++ unsigned i; ++ ++ for (i = 0; i < Q_COUNTERS; i++) ++ pr_buf(out, "%s hardlimit %llu softlimit %llu", ++ bch2_quota_counters[i], ++ 
le64_to_cpu(dq.v->c[i].hardlimit), ++ le64_to_cpu(dq.v->c[i].softlimit)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++#include ++#include ++#include ++ ++static inline unsigned __next_qtype(unsigned i, unsigned qtypes) ++{ ++ qtypes >>= i; ++ return qtypes ? i + __ffs(qtypes) : QTYP_NR; ++} ++ ++#define for_each_set_qtype(_c, _i, _q, _qtypes) \ ++ for (_i = 0; \ ++ (_i = __next_qtype(_i, _qtypes), \ ++ _q = &(_c)->quotas[_i], \ ++ _i < QTYP_NR); \ ++ _i++) ++ ++static bool ignore_hardlimit(struct bch_memquota_type *q) ++{ ++ if (capable(CAP_SYS_RESOURCE)) ++ return true; ++#if 0 ++ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; ++ ++ return capable(CAP_SYS_RESOURCE) && ++ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || ++ !(info->dqi_flags & DQF_ROOT_SQUASH)); ++#endif ++ return false; ++} ++ ++enum quota_msg { ++ SOFTWARN, /* Softlimit reached */ ++ SOFTLONGWARN, /* Grace time expired */ ++ HARDWARN, /* Hardlimit reached */ ++ ++ HARDBELOW, /* Usage got below inode hardlimit */ ++ SOFTBELOW, /* Usage got below inode softlimit */ ++}; ++ ++static int quota_nl[][Q_COUNTERS] = { ++ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, ++ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, ++ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, ++ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, ++ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, ++ ++ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, ++ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, ++ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, ++ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, ++ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, ++}; ++ ++struct quota_msgs { ++ u8 nr; ++ struct { ++ u8 qtype; ++ u8 msg; ++ } m[QTYP_NR * Q_COUNTERS]; ++}; ++ ++static void prepare_msg(unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); ++ ++ msgs->m[msgs->nr].qtype = qtype; ++ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; ++ msgs->nr++; ++} ++ ++static void prepare_warning(struct memquota_counter *qc, ++ unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ if (qc->warning_issued & (1 << msg_type)) ++ return; ++ ++ prepare_msg(qtype, counter, msgs, msg_type); ++} ++ ++static void flush_warnings(struct bch_qid qid, ++ struct super_block *sb, ++ struct quota_msgs *msgs) ++{ ++ unsigned i; ++ ++ for (i = 0; i < msgs->nr; i++) ++ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), ++ sb->s_dev, msgs->m[i].msg); ++} ++ ++static int bch2_quota_check_limit(struct bch_fs *c, ++ unsigned qtype, ++ struct bch_memquota *mq, ++ struct quota_msgs *msgs, ++ enum quota_counters counter, ++ s64 v, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q = &c->quotas[qtype]; ++ struct memquota_counter *qc = &mq->c[counter]; ++ u64 n = qc->v + v; ++ ++ BUG_ON((s64) n < 0); ++ ++ if (mode == KEY_TYPE_QUOTA_NOCHECK) ++ return 0; ++ ++ if (v <= 0) { ++ if (n < qc->hardlimit && ++ (qc->warning_issued & (1 << HARDWARN))) { ++ qc->warning_issued &= ~(1 << HARDWARN); ++ prepare_msg(qtype, counter, msgs, HARDBELOW); ++ } ++ ++ if (n < qc->softlimit && ++ (qc->warning_issued & (1 << SOFTWARN))) { ++ qc->warning_issued &= ~(1 << SOFTWARN); ++ prepare_msg(qtype, counter, msgs, SOFTBELOW); ++ } ++ ++ qc->warning_issued = 0; ++ return 0; ++ } ++ ++ if (qc->hardlimit && ++ qc->hardlimit < n && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, 
HARDWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer && ++ ktime_get_real_seconds() >= qc->timer && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer == 0) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); ++ ++ /* XXX is this the right one? */ ++ qc->timer = ktime_get_real_seconds() + ++ q->limits[counter].warnlimit; ++ } ++ ++ return 0; ++} ++ ++int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ unsigned qtypes = enabled_qtypes(c); ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq[QTYP_NR]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); ++ if (!mq[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mq[i]->c[counter].v += v; ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(qid, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static void __bch2_quota_transfer(struct bch_memquota *src_q, ++ struct bch_memquota *dst_q, ++ enum quota_counters counter, s64 v) ++{ ++ BUG_ON(v > src_q->c[counter].v); ++ BUG_ON(v + dst_q->c[counter].v < v); ++ ++ src_q->c[counter].v -= v; ++ dst_q->c[counter].v += v; ++} ++ ++int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q; ++ struct bch_memquota *src_q[3], *dst_q[3]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); ++ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); ++ ++ if (!src_q[i] || !dst_q[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, ++ dst_q[i]->c[Q_SPC].v + space, ++ mode); ++ if (ret) ++ goto err; ++ ++ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, ++ dst_q[i]->c[Q_INO].v + 1, ++ mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); ++ } ++ ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(dst, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq; ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq; ++ unsigned i; ++ ++ BUG_ON(k.k->p.inode >= QTYP_NR); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_quota: ++ dq = bkey_s_c_to_quota(k); ++ q = &c->quotas[k.k->p.inode]; ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); ++ if (!mq) { ++ mutex_unlock(&q->lock); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i 
< Q_COUNTERS; i++) { ++ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); ++ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); ++ } ++ ++ mutex_unlock(&q->lock); ++ } ++ ++ return 0; ++} ++ ++static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->p.inode != type) ++ break; ++ ++ ret = __bch2_quota_set(c, k); ++ if (ret) ++ break; ++ } ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++void bch2_fs_quota_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ genradix_free(&c->quotas[i].table); ++} ++ ++void bch2_fs_quota_init(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ mutex_init(&c->quotas[i].lock); ++} ++ ++static void bch2_sb_quota_read(struct bch_fs *c) ++{ ++ struct bch_sb_field_quota *sb_quota; ++ unsigned i, j; ++ ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) ++ return; ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ struct bch_memquota_type *q = &c->quotas[i]; ++ ++ for (j = 0; j < Q_COUNTERS; j++) { ++ q->limits[j].timelimit = ++ le32_to_cpu(sb_quota->q[i].c[j].timelimit); ++ q->limits[j].warnlimit = ++ le32_to_cpu(sb_quota->q[i].c[j].warnlimit); ++ } ++ } ++} ++ ++int bch2_fs_quota_read(struct bch_fs *c) ++{ ++ unsigned i, qtypes = enabled_qtypes(c); ++ struct bch_memquota_type *q; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_inode_unpacked u; ++ struct bkey_s_c k; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ bch2_sb_quota_read(c); ++ mutex_unlock(&c->sb_lock); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ ret = bch2_quota_init_type(c, i); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); ++ if (ret) ++ return ret; ++ ++ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, ++ KEY_TYPE_QUOTA_NOCHECK); ++ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, ++ KEY_TYPE_QUOTA_NOCHECK); ++ } ++ } ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* Enable/disable/delete quotas for an entire filesystem: */ ++ ++static int bch2_quota_enable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ /* Accounting must be enabled at mount time: */ ++ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) ++ return -EINVAL; ++ ++ /* Can't enable enforcement without accounting: */ ++ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) ++ return -EINVAL; ++ ++ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) ++ return -EINVAL; ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_disable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ 
if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (uflags & FS_USER_QUOTA) { ++ if (c->opts.usrquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_USR, 0), ++ POS(QTYP_USR + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_GROUP_QUOTA) { ++ if (c->opts.grpquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_GRP, 0), ++ POS(QTYP_GRP + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_PROJ_QUOTA) { ++ if (c->opts.prjquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_PRJ, 0), ++ POS(QTYP_PRJ + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Return quota status information, such as enforcements, quota file inode ++ * numbers etc. ++ */ ++static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ unsigned qtypes = enabled_qtypes(c); ++ unsigned i; ++ ++ memset(state, 0, sizeof(*state)); ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ state->s_state[i].flags |= QCI_SYSFILE; ++ ++ if (!(qtypes & (1 << i))) ++ continue; ++ ++ state->s_state[i].flags |= QCI_ACCT_ENABLED; ++ ++ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; ++ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; ++ ++ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; ++ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Adjust quota timers & warnings ++ */ ++static int bch2_quota_set_info(struct super_block *sb, int type, ++ struct qc_info *info) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_sb_field_quota *sb_quota; ++ struct bch_memquota_type *q; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (type >= QTYP_NR) ++ return -EINVAL; ++ ++ if (!((1 << type) & enabled_qtypes(c))) ++ return -ESRCH; ++ ++ if (info->i_fieldmask & ++ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) ++ return -EINVAL; ++ ++ q = &c->quotas[type]; ++ ++ mutex_lock(&c->sb_lock); ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) { ++ sb_quota = bch2_sb_resize_quota(&c->disk_sb, ++ sizeof(*sb_quota) / sizeof(u64)); ++ if (!sb_quota) ++ return -ENOSPC; ++ } ++ ++ if (info->i_fieldmask & QC_SPC_TIMER) ++ sb_quota->q[type].c[Q_SPC].timelimit = ++ cpu_to_le32(info->i_spc_timelimit); ++ ++ if (info->i_fieldmask & QC_SPC_WARNS) ++ sb_quota->q[type].c[Q_SPC].warnlimit = ++ cpu_to_le32(info->i_spc_warnlimit); ++ ++ if (info->i_fieldmask & QC_INO_TIMER) ++ sb_quota->q[type].c[Q_INO].timelimit = ++ cpu_to_le32(info->i_ino_timelimit); ++ ++ if (info->i_fieldmask & QC_INO_WARNS) ++ sb_quota->q[type].c[Q_INO].warnlimit = ++ cpu_to_le32(info->i_ino_warnlimit); ++ ++ bch2_sb_quota_read(c); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++/* Get/set individual quotas: */ ++ ++static void 
__bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) ++{ ++ dst->d_space = src->c[Q_SPC].v << 9; ++ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; ++ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; ++ dst->d_spc_timer = src->c[Q_SPC].timer; ++ dst->d_spc_warns = src->c[Q_SPC].warns; ++ ++ dst->d_ino_count = src->c[Q_INO].v; ++ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; ++ dst->d_ino_softlimit = src->c[Q_INO].softlimit; ++ dst->d_ino_timer = src->c[Q_INO].timer; ++ dst->d_ino_warns = src->c[Q_INO].warns; ++} ++ ++static int bch2_get_quota(struct super_block *sb, struct kqid kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid.type]; ++ qid_t qid = from_kqid(&init_user_ns, kqid); ++ struct bch_memquota *mq; ++ ++ memset(qdq, 0, sizeof(*qdq)); ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr(&q->table, qid); ++ if (mq) ++ __bch2_quota_get(qdq, mq); ++ mutex_unlock(&q->lock); ++ ++ return 0; ++} ++ ++static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid->type]; ++ qid_t qid = from_kqid(&init_user_ns, *kqid); ++ struct genradix_iter iter; ++ struct bch_memquota *mq; ++ int ret = 0; ++ ++ mutex_lock(&q->lock); ++ ++ genradix_for_each_from(&q->table, iter, mq, qid) ++ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { ++ __bch2_quota_get(qdq, mq); ++ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); ++ goto found; ++ } ++ ++ ret = -ENOENT; ++found: ++ mutex_unlock(&q->lock); ++ return ret; ++} ++ ++static int bch2_set_quota_trans(struct btree_trans *trans, ++ struct bkey_i_quota *new_quota, ++ struct qc_dqblk *qdq) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ++ ret = bkey_err(k); ++ if (unlikely(ret)) ++ return ret; ++ ++ if (k.k->type == KEY_TYPE_quota) ++ new_quota->v = *bkey_s_c_to_quota(k).v; ++ ++ if (qdq->d_fieldmask & QC_SPC_SOFT) ++ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); ++ if (qdq->d_fieldmask & QC_SPC_HARD) ++ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); ++ ++ if (qdq->d_fieldmask & QC_INO_SOFT) ++ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); ++ if (qdq->d_fieldmask & QC_INO_HARD) ++ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); ++ ++ return bch2_trans_update(trans, iter, &new_quota->k_i, 0); ++} ++ ++static int bch2_set_quota(struct super_block *sb, struct kqid qid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct btree_trans trans; ++ struct bkey_i_quota new_quota; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ bkey_quota_init(&new_quota.k_i); ++ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, ++ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: ++ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++const struct quotactl_ops bch2_quotactl_operations = { ++ .quota_enable = bch2_quota_enable, ++ .quota_disable = bch2_quota_disable, ++ .rm_xquota = bch2_quota_remove, ++ ++ .get_state = bch2_quota_get_state, ++ .set_info = 
bch2_quota_set_info, ++ ++ .get_dqblk = bch2_get_quota, ++ .get_nextdqblk = bch2_get_next_quota, ++ .set_dqblk = bch2_set_quota, ++}; ++ ++#endif /* CONFIG_BCACHEFS_QUOTA */ +diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h +new file mode 100644 +index 000000000000..51e4f9713ef0 +--- /dev/null ++++ b/fs/bcachefs/quota.h +@@ -0,0 +1,71 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_H ++#define _BCACHEFS_QUOTA_H ++ ++#include "inode.h" ++#include "quota_types.h" ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_quota; ++ ++const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_quota (struct bkey_ops) { \ ++ .key_invalid = bch2_quota_invalid, \ ++ .val_to_text = bch2_quota_to_text, \ ++} ++ ++static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) ++{ ++ return (struct bch_qid) { ++ .q[QTYP_USR] = u->bi_uid, ++ .q[QTYP_GRP] = u->bi_gid, ++ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, ++ }; ++} ++ ++static inline unsigned enabled_qtypes(struct bch_fs *c) ++{ ++ return ((c->opts.usrquota << QTYP_USR)| ++ (c->opts.grpquota << QTYP_GRP)| ++ (c->opts.prjquota << QTYP_PRJ)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, ++ s64, enum quota_acct_mode); ++ ++int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, ++ struct bch_qid, u64, enum quota_acct_mode); ++ ++void bch2_fs_quota_exit(struct bch_fs *); ++void bch2_fs_quota_init(struct bch_fs *); ++int bch2_fs_quota_read(struct bch_fs *); ++ ++extern const struct quotactl_ops bch2_quotactl_operations; ++ ++#else ++ ++static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline void bch2_fs_quota_exit(struct bch_fs *c) {} ++static inline void bch2_fs_quota_init(struct bch_fs *c) {} ++static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } ++ ++#endif ++ ++#endif /* _BCACHEFS_QUOTA_H */ +diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h +new file mode 100644 +index 000000000000..6a136083d389 +--- /dev/null ++++ b/fs/bcachefs/quota_types.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_TYPES_H ++#define _BCACHEFS_QUOTA_TYPES_H ++ ++#include ++ ++struct bch_qid { ++ u32 q[QTYP_NR]; ++}; ++ ++enum quota_acct_mode { ++ KEY_TYPE_QUOTA_PREALLOC, ++ KEY_TYPE_QUOTA_WARN, ++ KEY_TYPE_QUOTA_NOCHECK, ++}; ++ ++struct memquota_counter { ++ u64 v; ++ u64 hardlimit; ++ u64 softlimit; ++ s64 timer; ++ int warns; ++ int warning_issued; ++}; ++ ++struct bch_memquota { ++ struct memquota_counter c[Q_COUNTERS]; ++}; ++ ++typedef GENRADIX(struct bch_memquota) bch_memquota_table; ++ ++struct quota_limit { ++ u32 timelimit; ++ u32 warnlimit; ++}; ++ ++struct bch_memquota_type { ++ struct quota_limit limits[Q_COUNTERS]; ++ bch_memquota_table table; ++ struct mutex lock; ++}; ++ ++#endif /* _BCACHEFS_QUOTA_TYPES_H */ +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +new file mode 100644 +index 000000000000..56a1f761271f +--- /dev/null ++++ b/fs/bcachefs/rebalance.c +@@ -0,0 +1,331 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include 
"bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "extents.h" ++#include "io.h" ++#include "move.h" ++#include "rebalance.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Check if an extent should be moved: ++ * returns -1 if it should not be moved, or ++ * device of pointer that should be moved, if known, or INT_MAX if unknown ++ */ ++static int __bch2_rebalance_pred(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ if (io_opts->background_compression && ++ !bch2_bkey_is_incompressible(k)) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ p.crc.compression_type != ++ bch2_compression_opt_to_type[io_opts->background_compression]) ++ return p.ptr.dev; ++ ++ if (io_opts->background_target) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) ++ return p.ptr.dev; ++ ++ return -1; ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts) ++{ ++ atomic64_t *counter; ++ int dev; ++ ++ dev = __bch2_rebalance_pred(c, k, io_opts); ++ if (dev < 0) ++ return; ++ ++ counter = dev < INT_MAX ++ ? &bch_dev_bkey_exists(c, dev)->rebalance_work ++ : &c->rebalance.work_unknown_dev; ++ ++ if (atomic64_add_return(k.k->size, counter) == k.k->size) ++ rebalance_wakeup(c); ++} ++ ++static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { ++ data_opts->target = io_opts->background_target; ++ data_opts->btree_insert_flags = 0; ++ return DATA_ADD_REPLICAS; ++ } else { ++ return DATA_SKIP; ++ } ++} ++ ++void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) ++{ ++ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == ++ sectors) ++ rebalance_wakeup(c); ++} ++ ++struct rebalance_work { ++ int dev_most_full_idx; ++ unsigned dev_most_full_percent; ++ u64 dev_most_full_work; ++ u64 dev_most_full_capacity; ++ u64 total_work; ++}; ++ ++static void rebalance_work_accumulate(struct rebalance_work *w, ++ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) ++{ ++ unsigned percent_full; ++ u64 work = dev_work + unknown_dev; ++ ++ if (work < dev_work || work < unknown_dev) ++ work = U64_MAX; ++ work = min(work, capacity); ++ ++ percent_full = div64_u64(work * 100, capacity); ++ ++ if (percent_full >= w->dev_most_full_percent) { ++ w->dev_most_full_idx = idx; ++ w->dev_most_full_percent = percent_full; ++ w->dev_most_full_work = work; ++ w->dev_most_full_capacity = capacity; ++ } ++ ++ if (w->total_work + dev_work >= w->total_work && ++ w->total_work + dev_work >= dev_work) ++ w->total_work += dev_work; ++} ++ ++static struct rebalance_work rebalance_work(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct rebalance_work ret = { .dev_most_full_idx = -1 }; ++ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ rebalance_work_accumulate(&ret, ++ atomic64_read(&ca->rebalance_work), ++ unknown_dev, ++ bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket), ++ i); ++ ++ rebalance_work_accumulate(&ret, ++ unknown_dev, 0, c->capacity, -1); 
++ ++ return ret; ++} ++ ++static void rebalance_work_reset(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ atomic64_set(&ca->rebalance_work, 0); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, 0); ++} ++ ++static unsigned long curr_cputime(void) ++{ ++ u64 utime, stime; ++ ++ task_cputime_adjusted(current, &utime, &stime); ++ return nsecs_to_jiffies(utime + stime); ++} ++ ++static int bch2_rebalance_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ struct rebalance_work w, p; ++ unsigned long start, prev_start; ++ unsigned long prev_run_time, prev_run_cputime; ++ unsigned long cputime, prev_cputime; ++ unsigned long io_start; ++ long throttle; ++ ++ set_freezable(); ++ ++ io_start = atomic_long_read(&clock->now); ++ p = rebalance_work(c); ++ prev_start = jiffies; ++ prev_cputime = curr_cputime(); ++ ++ while (!kthread_wait_freezable(r->enabled)) { ++ cond_resched(); ++ ++ start = jiffies; ++ cputime = curr_cputime(); ++ ++ prev_run_time = start - prev_start; ++ prev_run_cputime = cputime - prev_cputime; ++ ++ w = rebalance_work(c); ++ BUG_ON(!w.dev_most_full_capacity); ++ ++ if (!w.total_work) { ++ r->state = REBALANCE_WAITING; ++ kthread_wait_freezable(rebalance_work(c).total_work); ++ continue; ++ } ++ ++ /* ++ * If there isn't much work to do, throttle cpu usage: ++ */ ++ throttle = prev_run_cputime * 100 / ++ max(1U, w.dev_most_full_percent) - ++ prev_run_time; ++ ++ if (w.dev_most_full_percent < 20 && throttle > 0) { ++ r->throttled_until_iotime = io_start + ++ div_u64(w.dev_most_full_capacity * ++ (20 - w.dev_most_full_percent), ++ 50); ++ ++ if (atomic_long_read(&clock->now) + clock->max_slop < ++ r->throttled_until_iotime) { ++ r->throttled_until_cputime = start + throttle; ++ r->state = REBALANCE_THROTTLED; ++ ++ bch2_kthread_io_clock_wait(clock, ++ r->throttled_until_iotime, ++ throttle); ++ continue; ++ } ++ } ++ ++ /* minimum 1 mb/sec: */ ++ r->pd.rate.rate = ++ max_t(u64, 1 << 11, ++ r->pd.rate.rate * ++ max(p.dev_most_full_percent, 1U) / ++ max(w.dev_most_full_percent, 1U)); ++ ++ io_start = atomic_long_read(&clock->now); ++ p = w; ++ prev_start = start; ++ prev_cputime = cputime; ++ ++ r->state = REBALANCE_RUNNING; ++ memset(&r->move_stats, 0, sizeof(r->move_stats)); ++ rebalance_work_reset(c); ++ ++ bch2_move_data(c, ++ /* ratelimiting disabled for now */ ++ NULL, /* &r->pd.rate, */ ++ writepoint_ptr(&c->rebalance_write_point), ++ POS_MIN, POS_MAX, ++ rebalance_pred, NULL, ++ &r->move_stats); ++ } ++ ++ return 0; ++} ++ ++void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct rebalance_work w = rebalance_work(c); ++ char h1[21], h2[21]; ++ ++ bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); ++ bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); ++ pr_buf(out, "fullest_dev (%i):\t%s/%s\n", ++ w.dev_most_full_idx, h1, h2); ++ ++ bch2_hprint(&PBUF(h1), w.total_work << 9); ++ bch2_hprint(&PBUF(h2), c->capacity << 9); ++ pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); ++ ++ pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); ++ ++ switch (r->state) { ++ case REBALANCE_WAITING: ++ pr_buf(out, "waiting\n"); ++ break; ++ case REBALANCE_THROTTLED: ++ bch2_hprint(&PBUF(h1), ++ (r->throttled_until_iotime - ++ atomic_long_read(&c->io_clock[WRITE].now)) << 9); ++ pr_buf(out, "throttled for %lu sec or %s io\n", ++ (r->throttled_until_cputime - jiffies) / 
HZ, ++ h1); ++ break; ++ case REBALANCE_RUNNING: ++ pr_buf(out, "running\n"); ++ pr_buf(out, "pos %llu:%llu\n", ++ r->move_stats.pos.inode, ++ r->move_stats.pos.offset); ++ break; ++ } ++} ++ ++void bch2_rebalance_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ c->rebalance.pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&c->rebalance.pd.rate); ++ ++ p = rcu_dereference_protected(c->rebalance.thread, 1); ++ c->rebalance.thread = NULL; ++ ++ if (p) { ++ /* for sychronizing with rebalance_wakeup() */ ++ synchronize_rcu(); ++ ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_rebalance_start(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ rcu_assign_pointer(c->rebalance.thread, p); ++ wake_up_process(p); ++ return 0; ++} ++ ++void bch2_fs_rebalance_init(struct bch_fs *c) ++{ ++ bch2_pd_controller_init(&c->rebalance.pd); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); ++} +diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h +new file mode 100644 +index 000000000000..7ade0bb81cce +--- /dev/null ++++ b/fs/bcachefs/rebalance.h +@@ -0,0 +1,28 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_H ++#define _BCACHEFS_REBALANCE_H ++ ++#include "rebalance_types.h" ++ ++static inline void rebalance_wakeup(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = rcu_dereference(c->rebalance.thread); ++ if (p) ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_opts *); ++void bch2_rebalance_add_work(struct bch_fs *, u64); ++ ++void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_rebalance_stop(struct bch_fs *); ++int bch2_rebalance_start(struct bch_fs *); ++void bch2_fs_rebalance_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REBALANCE_H */ +diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h +new file mode 100644 +index 000000000000..192c6be20ced +--- /dev/null ++++ b/fs/bcachefs/rebalance_types.h +@@ -0,0 +1,27 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_TYPES_H ++#define _BCACHEFS_REBALANCE_TYPES_H ++ ++#include "move_types.h" ++ ++enum rebalance_state { ++ REBALANCE_WAITING, ++ REBALANCE_THROTTLED, ++ REBALANCE_RUNNING, ++}; ++ ++struct bch_fs_rebalance { ++ struct task_struct __rcu *thread; ++ struct bch_pd_controller pd; ++ ++ atomic64_t work_unknown_dev; ++ ++ enum rebalance_state state; ++ unsigned long throttled_until_iotime; ++ unsigned long throttled_until_cputime; ++ struct bch_move_stats move_stats; ++ ++ unsigned enabled:1; ++}; ++ ++#endif /* _BCACHEFS_REBALANCE_TYPES_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +new file mode 100644 +index 000000000000..6e829bf0a31f +--- /dev/null ++++ b/fs/bcachefs/recovery.c +@@ -0,0 +1,1330 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "buckets.h" ++#include "dirent.h" ++#include "ec.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "quota.h" ++#include "recovery.h" ++#include "replicas.h" ++#include 
"super-io.h" ++ ++#include ++#include ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++/* iterate over keys read from the journal: */ ++ ++static struct journal_key *journal_key_search(struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ size_t l = 0, r = journal_keys->nr, m; ++ ++ while (l < r) { ++ m = l + ((r - l) >> 1); ++ if ((cmp_int(id, journal_keys->d[m].btree_id) ?: ++ cmp_int(level, journal_keys->d[m].level) ?: ++ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ BUG_ON(l < journal_keys->nr && ++ (cmp_int(id, journal_keys->d[l].btree_id) ?: ++ cmp_int(level, journal_keys->d[l].level) ?: ++ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); ++ ++ BUG_ON(l && ++ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: ++ cmp_int(level, journal_keys->d[l - 1].level) ?: ++ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); ++ ++ return l < journal_keys->nr ? journal_keys->d + l : NULL; ++} ++ ++static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) ++{ ++ if (iter->k && ++ iter->k < iter->keys->d + iter->keys->nr && ++ iter->k->btree_id == iter->btree_id && ++ iter->k->level == iter->level) ++ return iter->k->k; ++ ++ iter->k = NULL; ++ return NULL; ++} ++ ++static void bch2_journal_iter_advance(struct journal_iter *iter) ++{ ++ if (iter->k) ++ iter->k++; ++} ++ ++static void bch2_journal_iter_init(struct journal_iter *iter, ++ struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ iter->btree_id = id; ++ iter->level = level; ++ iter->keys = journal_keys; ++ iter->k = journal_key_search(journal_keys, id, level, pos); ++} ++ ++static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) ++{ ++ return iter->btree ++ ? bch2_btree_iter_peek(iter->btree) ++ : bch2_btree_node_iter_peek_unpack(&iter->node_iter, ++ iter->b, &iter->unpacked); ++} ++ ++static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) ++{ ++ if (iter->btree) ++ bch2_btree_iter_next(iter->btree); ++ else ++ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); ++} ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) ++{ ++ switch (iter->last) { ++ case none: ++ break; ++ case btree: ++ bch2_journal_iter_advance_btree(iter); ++ break; ++ case journal: ++ bch2_journal_iter_advance(&iter->journal); ++ break; ++ } ++ ++ iter->last = none; ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) ++{ ++ struct bkey_s_c ret; ++ ++ while (1) { ++ struct bkey_s_c btree_k = ++ bch2_journal_iter_peek_btree(iter); ++ struct bkey_s_c journal_k = ++ bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); ++ ++ if (btree_k.k && journal_k.k) { ++ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); ++ ++ if (!cmp) ++ bch2_journal_iter_advance_btree(iter); ++ ++ iter->last = cmp < 0 ? btree : journal; ++ } else if (btree_k.k) { ++ iter->last = btree; ++ } else if (journal_k.k) { ++ iter->last = journal; ++ } else { ++ iter->last = none; ++ return bkey_s_c_null; ++ } ++ ++ ret = iter->last == journal ? 
journal_k : btree_k; ++ ++ if (iter->b && ++ bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { ++ iter->journal.k = NULL; ++ iter->last = none; ++ return bkey_s_c_null; ++ } ++ ++ if (!bkey_deleted(ret.k)) ++ break; ++ ++ bch2_btree_and_journal_iter_advance(iter); ++ } ++ ++ return ret; ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) ++{ ++ bch2_btree_and_journal_iter_advance(iter); ++ ++ return bch2_btree_and_journal_iter_peek(iter); ++} ++ ++void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, ++ struct btree_trans *trans, ++ struct journal_keys *journal_keys, ++ enum btree_id id, struct bpos pos) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->btree = bch2_trans_get_iter(trans, id, pos, 0); ++ bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); ++} ++ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct journal_keys *journal_keys, ++ struct btree *b) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->b = b; ++ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); ++ bch2_journal_iter_init(&iter->journal, journal_keys, ++ b->c.btree_id, b->c.level, b->data->min_key); ++} ++ ++/* Walk btree, overlaying keys from the journal: */ ++ ++static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, ++ struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ btree_walk_node_fn node_fn, ++ btree_walk_key_fn key_fn) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ ret = key_fn(c, btree_id, b->c.level, k); ++ if (ret) ++ break; ++ ++ if (b->c.level) { ++ struct btree *child; ++ BKEY_PADDED(k) tmp; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ if (b->c.level > 0) { ++ child = bch2_btree_node_get_noiter(c, &tmp.k, ++ b->c.btree_id, b->c.level - 1); ++ ret = PTR_ERR_OR_ZERO(child); ++ if (ret) ++ break; ++ ++ ret = (node_fn ? node_fn(c, b) : 0) ?: ++ bch2_btree_and_journal_walk_recurse(c, child, ++ journal_keys, btree_id, node_fn, key_fn); ++ six_unlock_read(&child->c.lock); ++ ++ if (ret) ++ break; ++ } ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ } ++ ++ return ret; ++} ++ ++int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ btree_walk_node_fn node_fn, ++ btree_walk_key_fn key_fn) ++{ ++ struct btree *b = c->btree_roots[btree_id].b; ++ int ret = 0; ++ ++ if (btree_node_fake(b)) ++ return 0; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ ret = (node_fn ? 
node_fn(c, b) : 0) ?: ++ bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, ++ node_fn, key_fn) ?: ++ key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); ++ six_unlock_read(&b->c.lock); ++ ++ return ret; ++} ++ ++/* sort and dedup all keys in the journal: */ ++ ++void bch2_journal_entries_free(struct list_head *list) ++{ ++ ++ while (!list_empty(list)) { ++ struct journal_replay *i = ++ list_first_entry(list, struct journal_replay, list); ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } ++} ++ ++/* ++ * When keys compare equal, oldest compares first: ++ */ ++static int journal_sort_key_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ cmp_int(l->level, r->level) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->journal_offset, r->journal_offset); ++} ++ ++void bch2_journal_keys_free(struct journal_keys *keys) ++{ ++ kvfree(keys->d); ++ keys->d = NULL; ++ keys->nr = 0; ++} ++ ++static struct journal_keys journal_keys_sort(struct list_head *journal_entries) ++{ ++ struct journal_replay *p; ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct journal_keys keys = { NULL }; ++ struct journal_key *src, *dst; ++ size_t nr_keys = 0; ++ ++ if (list_empty(journal_entries)) ++ return keys; ++ ++ keys.journal_seq_base = ++ le64_to_cpu(list_last_entry(journal_entries, ++ struct journal_replay, list)->j.last_seq); ++ ++ list_for_each_entry(p, journal_entries, list) { ++ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) ++ continue; ++ ++ for_each_jset_key(k, _n, entry, &p->j) ++ nr_keys++; ++ } ++ ++ ++ keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); ++ if (!keys.d) ++ goto err; ++ ++ list_for_each_entry(p, journal_entries, list) { ++ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) ++ continue; ++ ++ for_each_jset_key(k, _n, entry, &p->j) ++ keys.d[keys.nr++] = (struct journal_key) { ++ .btree_id = entry->btree_id, ++ .level = entry->level, ++ .k = k, ++ .journal_seq = le64_to_cpu(p->j.seq) - ++ keys.journal_seq_base, ++ .journal_offset = k->_data - p->j._data, ++ }; ++ } ++ ++ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); ++ ++ src = dst = keys.d; ++ while (src < keys.d + keys.nr) { ++ while (src + 1 < keys.d + keys.nr && ++ src[0].btree_id == src[1].btree_id && ++ src[0].level == src[1].level && ++ !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) ++ src++; ++ ++ *dst++ = *src++; ++ } ++ ++ keys.nr = dst - keys.d; ++err: ++ return keys; ++} ++ ++/* journal replay: */ ++ ++static void replay_now_at(struct journal *j, u64 seq) ++{ ++ BUG_ON(seq < j->replay_journal_seq); ++ BUG_ON(seq > j->replay_journal_seq_end); ++ ++ while (j->replay_journal_seq < seq) ++ bch2_journal_pin_put(j, j->replay_journal_seq++); ++} ++ ++static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, ++ struct bkey_i *k) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter, *split_iter; ++ /* ++ * We might cause compressed extents to be split, so we need to pass in ++ * a disk_reservation: ++ */ ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i *split; ++ struct bpos atomic_end; ++ /* ++ * Some extents aren't equivalent - w.r.t. 
what the triggers do ++ * - if they're split: ++ */ ++ bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || ++ k->k.type == KEY_TYPE_reflink_p; ++ bool remark = false; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ ++ do { ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto err; ++ ++ atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); ++ ++ split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); ++ ret = PTR_ERR_OR_ZERO(split); ++ if (ret) ++ goto err; ++ ++ if (!remark && ++ remark_if_split && ++ bkey_cmp(atomic_end, k->k.p) < 0) { ++ ret = bch2_disk_reservation_add(c, &disk_res, ++ k->k.size * ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), ++ BCH_DISK_RESERVATION_NOFAIL); ++ BUG_ON(ret); ++ ++ remark = true; ++ } ++ ++ bkey_copy(split, k); ++ bch2_cut_front(iter->pos, split); ++ bch2_cut_back(atomic_end, split); ++ ++ split_iter = bch2_trans_copy_iter(&trans, iter); ++ ret = PTR_ERR_OR_ZERO(split_iter); ++ if (ret) ++ goto err; ++ ++ /* ++ * It's important that we don't go through the ++ * extent_handle_overwrites() and extent_update_to_keys() path ++ * here: journal replay is supposed to treat extents like ++ * regular keys ++ */ ++ __bch2_btree_iter_set_pos(split_iter, split->k.p, false); ++ bch2_trans_update(&trans, split_iter, split, ++ BTREE_TRIGGER_NORUN); ++ ++ bch2_btree_iter_set_pos(iter, split->k.p); ++ ++ if (remark) { ++ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), ++ 0, split->k.size, ++ BTREE_TRIGGER_INSERT); ++ if (ret) ++ goto err; ++ } ++ } while (bkey_cmp(iter->pos, k->k.p) < 0); ++ ++ if (remark) { ++ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), ++ 0, -((s64) k->k.size), ++ BTREE_TRIGGER_OVERWRITE); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_trans_commit(&trans, &disk_res, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++static int __bch2_journal_replay_key(struct btree_trans *trans, ++ enum btree_id id, unsigned level, ++ struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_node_iter(trans, id, k->k.p, ++ BTREE_MAX_DEPTH, level, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ /* ++ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run ++ * extent_handle_overwrites() and extent_update_to_keys() - but we don't ++ * want that here, journal replay is supposed to treat extents like ++ * regular keys: ++ */ ++ __bch2_btree_iter_set_pos(iter, k->k.p, false); ++ ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY, ++ __bch2_journal_replay_key(&trans, id, level, k)); ++} ++ ++static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter) ?: ++ 
bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) ++{ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY, ++ __bch2_alloc_replay_key(&trans, k)); ++} ++ ++static int journal_sort_seq_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return cmp_int(r->level, l->level) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->btree_id, r->btree_id) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p); ++} ++ ++static int bch2_journal_replay(struct bch_fs *c, ++ struct journal_keys keys) ++{ ++ struct journal *j = &c->journal; ++ struct journal_key *i; ++ u64 seq; ++ int ret; ++ ++ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); ++ ++ if (keys.nr) ++ replay_now_at(j, keys.journal_seq_base); ++ ++ seq = j->replay_journal_seq; ++ ++ /* ++ * First replay updates to the alloc btree - these will only update the ++ * btree key cache: ++ */ ++ for_each_journal_key(keys, i) { ++ cond_resched(); ++ ++ if (!i->level && i->btree_id == BTREE_ID_ALLOC) { ++ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; ++ ret = bch2_alloc_replay_key(c, i->k); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ /* ++ * Next replay updates to interior btree nodes: ++ */ ++ for_each_journal_key(keys, i) { ++ cond_resched(); ++ ++ if (i->level) { ++ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; ++ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ /* ++ * Now that the btree is in a consistent state, we can start journal ++ * reclaim (which will be flushing entries from the btree key cache back ++ * to the btree: ++ */ ++ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); ++ set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); ++ ++ j->replay_journal_seq = seq; ++ ++ /* ++ * Now replay leaf node updates: ++ */ ++ for_each_journal_key(keys, i) { ++ cond_resched(); ++ ++ if (i->level || i->btree_id == BTREE_ID_ALLOC) ++ continue; ++ ++ replay_now_at(j, keys.journal_seq_base + i->journal_seq); ++ ++ ret = i->k->k.size ++ ? bch2_extent_replay_key(c, i->btree_id, i->k) ++ : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ if (ret) ++ goto err; ++ } ++ ++ replay_now_at(j, j->replay_journal_seq_end); ++ j->replay_journal_seq = 0; ++ ++ bch2_journal_set_replay_done(j); ++ bch2_journal_flush_all_pins(j); ++ return bch2_journal_error(j); ++err: ++ bch_err(c, "journal replay: error %d while replaying key", ret); ++ return ret; ++} ++ ++static bool journal_empty(struct list_head *journal) ++{ ++ return list_empty(journal) || ++ journal_entry_empty(&list_last_entry(journal, ++ struct journal_replay, list)->j); ++} ++ ++static int ++verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, ++ struct list_head *journal) ++{ ++ struct journal_replay *i = ++ list_last_entry(journal, struct journal_replay, list); ++ u64 start_seq = le64_to_cpu(i->j.last_seq); ++ u64 end_seq = le64_to_cpu(i->j.seq); ++ u64 seq = start_seq; ++ int ret = 0; ++ ++ list_for_each_entry(i, journal, list) { ++ if (le64_to_cpu(i->j.seq) < start_seq) ++ continue; ++ ++ fsck_err_on(seq != le64_to_cpu(i->j.seq), c, ++ "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", ++ seq, le64_to_cpu(i->j.seq) - 1, ++ start_seq, end_seq); ++ ++ seq = le64_to_cpu(i->j.seq); ++ ++ fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, ++ "found blacklisted journal entry %llu", seq); ++ ++ do { ++ seq++; ++ } while (bch2_journal_seq_is_blacklisted(c, seq, false)); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* journal replay early: */ ++ ++static int journal_replay_entry_early(struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ int ret = 0; ++ ++ switch (entry->type) { ++ case BCH_JSET_ENTRY_btree_root: { ++ struct btree_root *r; ++ ++ if (entry->btree_id >= BTREE_ID_NR) { ++ bch_err(c, "filesystem has unknown btree type %u", ++ entry->btree_id); ++ return -EINVAL; ++ } ++ ++ r = &c->btree_roots[entry->btree_id]; ++ ++ if (entry->u64s) { ++ r->level = entry->level; ++ bkey_copy(&r->key, &entry->start[0]); ++ r->error = 0; ++ } else { ++ r->error = -EIO; ++ } ++ r->alive = true; ++ break; ++ } ++ case BCH_JSET_ENTRY_usage: { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ switch (entry->btree_id) { ++ case FS_USAGE_RESERVED: ++ if (entry->level < BCH_REPLICAS_MAX) ++ c->usage_base->persistent_reserved[entry->level] = ++ le64_to_cpu(u->v); ++ break; ++ case FS_USAGE_INODES: ++ c->usage_base->nr_inodes = le64_to_cpu(u->v); ++ break; ++ case FS_USAGE_KEY_VERSION: ++ atomic64_set(&c->key_version, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ ++ break; ++ } ++ case BCH_JSET_ENTRY_data_usage: { ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ret = bch2_replicas_set_usage(c, &u->r, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist: { ++ struct jset_entry_blacklist *bl_entry = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->seq), ++ le64_to_cpu(bl_entry->seq) + 1); ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist_v2: { ++ struct jset_entry_blacklist_v2 *bl_entry = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->start), ++ le64_to_cpu(bl_entry->end) + 1); ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static int journal_replay_early(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct list_head *journal) ++{ ++ struct jset_entry *entry; ++ int ret; ++ ++ if (clean) { ++ c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); ++ c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } else { ++ struct journal_replay *i = ++ list_last_entry(journal, struct journal_replay, list); ++ ++ c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); ++ c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); ++ ++ list_for_each_entry(i, journal, list) ++ vstruct_for_each(&i->j, entry) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ bch2_fs_usage_initialize(c); ++ ++ return 0; ++} ++ ++/* sb clean section: */ ++ ++static struct bkey_i *btree_root_find(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct jset *j, ++ enum btree_id id, unsigned *level) ++{ ++ struct bkey_i *k; ++ struct jset_entry *entry, *start, *end; ++ ++ if (clean) { ++ start = clean->start; ++ end = vstruct_end(&clean->field); ++ } else { ++ 
start = j->start; ++ end = vstruct_last(j); ++ } ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root && ++ entry->btree_id == id) ++ goto found; ++ ++ return NULL; ++found: ++ if (!entry->u64s) ++ return ERR_PTR(-EINVAL); ++ ++ k = entry->start; ++ *level = entry->level; ++ return k; ++} ++ ++static int verify_superblock_clean(struct bch_fs *c, ++ struct bch_sb_field_clean **cleanp, ++ struct jset *j) ++{ ++ unsigned i; ++ struct bch_sb_field_clean *clean = *cleanp; ++ int ret = 0; ++ ++ if (!c->sb.clean || !j) ++ return 0; ++ ++ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, ++ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", ++ le64_to_cpu(clean->journal_seq), ++ le64_to_cpu(j->seq))) { ++ kfree(clean); ++ *cleanp = NULL; ++ return 0; ++ } ++ ++ mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, ++ "superblock read clock doesn't match journal after clean shutdown"); ++ mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, ++ "superblock read clock doesn't match journal after clean shutdown"); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ char buf1[200], buf2[200]; ++ struct bkey_i *k1, *k2; ++ unsigned l1 = 0, l2 = 0; ++ ++ k1 = btree_root_find(c, clean, NULL, i, &l1); ++ k2 = btree_root_find(c, NULL, j, i, &l2); ++ ++ if (!k1 && !k2) ++ continue; ++ ++ mustfix_fsck_err_on(!k1 || !k2 || ++ IS_ERR(k1) || ++ IS_ERR(k2) || ++ k1->k.u64s != k2->k.u64s || ++ memcmp(k1, k2, bkey_bytes(k1)) || ++ l1 != l2, c, ++ "superblock btree root %u doesn't match journal after clean shutdown\n" ++ "sb: l=%u %s\n" ++ "journal: l=%u %s\n", i, ++ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), ++ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); ++ } ++fsck_err: ++ return ret; ++} ++ ++static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *clean, *sb_clean; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); ++ ++ if (fsck_err_on(!sb_clean, c, ++ "superblock marked clean but clean section not present")) { ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ mutex_unlock(&c->sb_lock); ++ return NULL; ++ } ++ ++ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), ++ GFP_KERNEL); ++ if (!clean) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ if (le16_to_cpu(c->disk_sb.sb->version) < ++ bcachefs_metadata_version_bkey_renumber) ++ bch2_sb_clean_renumber(clean, READ); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return clean; ++fsck_err: ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++} ++ ++static int read_btree_roots(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret = 0; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_root *r = &c->btree_roots[i]; ++ ++ if (!r->alive) ++ continue; ++ ++ if (i == BTREE_ID_ALLOC && ++ c->opts.reconstruct_alloc) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ continue; ++ } ++ ++ ++ if (r->error) { ++ __fsck_err(c, i == BTREE_ID_ALLOC ++ ? FSCK_CAN_IGNORE : 0, ++ "invalid btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_ALLOC) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ } ++ ++ ret = bch2_btree_root_read(c, i, &r->key, r->level); ++ if (ret) { ++ __fsck_err(c, i == BTREE_ID_ALLOC ++ ? 
FSCK_CAN_IGNORE : 0, ++ "error reading btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_ALLOC) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ } ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (!c->btree_roots[i].b) ++ bch2_btree_root_alloc(c, i); ++fsck_err: ++ return ret; ++} ++ ++int bch2_fs_recovery(struct bch_fs *c) ++{ ++ const char *err = "cannot allocate memory"; ++ struct bch_sb_field_clean *clean = NULL; ++ u64 journal_seq; ++ bool wrote = false, write_sb = false; ++ int ret; ++ ++ if (c->sb.clean) ++ clean = read_superblock_clean(c); ++ ret = PTR_ERR_OR_ZERO(clean); ++ if (ret) ++ goto err; ++ ++ if (c->sb.clean) ++ bch_info(c, "recovering from clean shutdown, journal seq %llu", ++ le64_to_cpu(clean->journal_seq)); ++ ++ if (!c->replicas.entries || ++ c->opts.rebuild_replicas) { ++ bch_info(c, "building replicas info"); ++ set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); ++ } ++ ++ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { ++ struct jset *j; ++ ++ ret = bch2_journal_read(c, &c->journal_entries); ++ if (ret) ++ goto err; ++ ++ if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, ++ "filesystem marked clean but journal not empty")) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ } ++ ++ if (!c->sb.clean && list_empty(&c->journal_entries)) { ++ bch_err(c, "no journal entries found"); ++ ret = BCH_FSCK_REPAIR_IMPOSSIBLE; ++ goto err; ++ } ++ ++ c->journal_keys = journal_keys_sort(&c->journal_entries); ++ if (!c->journal_keys.d) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ j = &list_last_entry(&c->journal_entries, ++ struct journal_replay, list)->j; ++ ++ ret = verify_superblock_clean(c, &clean, j); ++ if (ret) ++ goto err; ++ ++ journal_seq = le64_to_cpu(j->seq) + 1; ++ } else { ++ journal_seq = le64_to_cpu(clean->journal_seq) + 1; ++ } ++ ++ if (!c->sb.clean && ++ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { ++ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = journal_replay_early(c, clean, &c->journal_entries); ++ if (ret) ++ goto err; ++ ++ if (!c->sb.clean) { ++ ret = bch2_journal_seq_blacklist_add(c, ++ journal_seq, ++ journal_seq + 4); ++ if (ret) { ++ bch_err(c, "error creating new journal seq blacklist entry"); ++ goto err; ++ } ++ ++ journal_seq += 4; ++ ++ /* ++ * The superblock needs to be written before we do any btree ++ * node writes: it will be in the read_write() path ++ */ ++ } ++ ++ ret = bch2_blacklist_table_initialize(c); ++ ++ if (!list_empty(&c->journal_entries)) { ++ ret = verify_journal_entries_not_blacklisted_or_missing(c, ++ &c->journal_entries); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_fs_journal_start(&c->journal, journal_seq, ++ &c->journal_entries); ++ if (ret) ++ goto err; ++ ++ ret = read_btree_roots(c); ++ if (ret) ++ goto err; ++ ++ bch_verbose(c, "starting alloc read"); ++ err = "error reading allocation information"; ++ ret = bch2_alloc_read(c, &c->journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "alloc read done"); ++ ++ bch_verbose(c, "starting stripes_read"); ++ err = "error reading stripes"; ++ ret = bch2_stripes_read(c, &c->journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "stripes_read done"); ++ ++ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ++ ++ if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && ++ !(c->sb.compat & (1ULL << 
BCH_COMPAT_FEAT_ALLOC_METADATA))) { ++ /* ++ * interior btree node updates aren't consistent with the ++ * journal; after an unclean shutdown we have to walk all ++ * pointers to metadata: ++ */ ++ bch_info(c, "starting metadata mark and sweep"); ++ err = "error in mark and sweep"; ++ ret = bch2_gc(c, &c->journal_keys, true, true); ++ if (ret) ++ goto err; ++ bch_verbose(c, "mark and sweep done"); ++ } ++ ++ if (c->opts.fsck || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || ++ test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { ++ bch_info(c, "starting mark and sweep"); ++ err = "error in mark and sweep"; ++ ret = bch2_gc(c, &c->journal_keys, true, false); ++ if (ret) ++ goto err; ++ bch_verbose(c, "mark and sweep done"); ++ } ++ ++ clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ ++ /* ++ * Skip past versions that might have possibly been used (as nonces), ++ * but hadn't had their pointers written: ++ */ ++ if (c->sb.encryption_type && !c->sb.clean) ++ atomic64_add(1 << 16, &c->key_version); ++ ++ if (c->opts.norecovery) ++ goto out; ++ ++ bch_verbose(c, "starting journal replay"); ++ err = "journal replay failed"; ++ ret = bch2_journal_replay(c, c->journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "journal replay done"); ++ ++ if (!c->opts.nochanges) { ++ /* ++ * note that even when filesystem was clean there might be work ++ * to do here, if we ran gc (because of fsck) which recalculated ++ * oldest_gen: ++ */ ++ bch_verbose(c, "writing allocation info"); ++ err = "error writing out alloc info"; ++ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW, &wrote) ?: ++ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW, &wrote); ++ if (ret) { ++ bch_err(c, "error writing alloc info"); ++ goto err; ++ } ++ bch_verbose(c, "alloc write done"); ++ ++ set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); ++ } ++ ++ if (!c->sb.clean) { ++ if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { ++ bch_info(c, "checking inode link counts"); ++ err = "error in recovery"; ++ ret = bch2_fsck_inode_nlink(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); ++ ++ } else { ++ bch_verbose(c, "checking for deleted inodes"); ++ err = "error in recovery"; ++ ret = bch2_fsck_walk_inodes_only(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); ++ } ++ } ++ ++ if (c->opts.fsck) { ++ bch_info(c, "starting fsck"); ++ err = "error in fsck"; ++ ret = bch2_fsck_full(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "fsck done"); ++ } ++ ++ if (enabled_qtypes(c)) { ++ bch_verbose(c, "reading quotas"); ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "quotas done"); ++ } ++ ++ mutex_lock(&c->sb_lock); ++ if (c->opts.version_upgrade) { ++ if (c->sb.version < bcachefs_metadata_version_new_versioning) ++ c->disk_sb.sb->version_min = ++ le16_to_cpu(bcachefs_metadata_version_min); ++ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ write_sb = true; ++ } ++ ++ if (!test_bit(BCH_FS_ERROR, &c->flags)) { ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; ++ write_sb = true; ++ } ++ ++ if (c->opts.fsck && ++ !test_bit(BCH_FS_ERROR, &c->flags)) { ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); ++ write_sb = true; ++ } ++ ++ if (write_sb) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (c->journal_seq_blacklist_table && ++ c->journal_seq_blacklist_table->nr > 
128) ++ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); ++out: ++ ret = 0; ++err: ++fsck_err: ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ bch2_flush_fsck_errs(c); ++ ++ if (!c->opts.keep_journal) { ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(&c->journal_entries); ++ } ++ kfree(clean); ++ if (ret) ++ bch_err(c, "Error in recovery: %s (%i)", err, ret); ++ else ++ bch_verbose(c, "ret %i", ret); ++ return ret; ++} ++ ++int bch2_fs_initialize(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ struct bkey_inode_buf packed_inode; ++ struct qstr lostfound = QSTR("lost+found"); ++ const char *err = "cannot allocate memory"; ++ struct bch_dev *ca; ++ LIST_HEAD(journal); ++ unsigned i; ++ int ret; ++ ++ bch_notice(c, "initializing new filesystem"); ++ ++ mutex_lock(&c->sb_lock); ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, 0); ++ mutex_unlock(&c->sb_lock); ++ ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version = c->disk_sb.sb->version_min = ++ le16_to_cpu(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ bch2_btree_root_alloc(c, i); ++ ++ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); ++ set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); ++ ++ err = "unable to allocate journal buckets"; ++ for_each_online_member(ca, c, i) { ++ ret = bch2_dev_journal_alloc(ca); ++ if (ret) { ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ } ++ ++ /* ++ * journal_res_get() will crash if called before this has ++ * set up the journal.pin FIFO and journal.cur pointer: ++ */ ++ bch2_fs_journal_start(&c->journal, 1, &journal); ++ bch2_journal_set_replay_done(&c->journal); ++ ++ bch2_inode_init(c, &root_inode, 0, 0, ++ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); ++ root_inode.bi_inum = BCACHEFS_ROOT_INO; ++ bch2_inode_pack(&packed_inode, &root_inode); ++ ++ err = "error creating root directory"; ++ ret = bch2_btree_insert(c, BTREE_ID_INODES, ++ &packed_inode.inode.k_i, ++ NULL, NULL, BTREE_INSERT_LAZY_RW); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init_early(c, &lostfound_inode); ++ ++ err = "error creating lost+found"; ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_create_trans(&trans, BCACHEFS_ROOT_INO, ++ &root_inode, &lostfound_inode, ++ &lostfound, ++ 0, 0, S_IFDIR|0700, 0, ++ NULL, NULL)); ++ if (ret) ++ goto err; ++ ++ if (enabled_qtypes(c)) { ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ } ++ ++ err = "error writing first journal entry"; ++ ret = bch2_journal_meta(&c->journal); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++err: ++ pr_err("Error initializing new filesystem: %s (%i)", err, ret); ++ return ret; ++} +diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +new file mode 100644 +index 000000000000..a66827c9addf +--- /dev/null ++++ b/fs/bcachefs/recovery.h +@@ -0,0 +1,60 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_RECOVERY_H ++#define _BCACHEFS_RECOVERY_H ++ ++#define for_each_journal_key(keys, i) \ ++ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) ++ ++struct journal_iter 
{ ++ enum btree_id btree_id; ++ unsigned level; ++ struct journal_keys *keys; ++ struct journal_key *k; ++}; ++ ++/* ++ * Iterate over keys in the btree, with keys from the journal overlaid on top: ++ */ ++ ++struct btree_and_journal_iter { ++ struct btree_iter *btree; ++ ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey unpacked; ++ ++ struct journal_iter journal; ++ ++ enum last_key_returned { ++ none, ++ btree, ++ journal, ++ } last; ++}; ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); ++ ++void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, ++ struct btree_trans *, ++ struct journal_keys *, ++ enum btree_id, struct bpos); ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct journal_keys *, ++ struct btree *); ++ ++typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); ++typedef int (*btree_walk_key_fn)(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k); ++ ++int bch2_btree_and_journal_walk(struct bch_fs *, struct journal_keys *, enum btree_id, ++ btree_walk_node_fn, btree_walk_key_fn); ++ ++void bch2_journal_keys_free(struct journal_keys *); ++void bch2_journal_entries_free(struct list_head *); ++ ++int bch2_fs_recovery(struct bch_fs *); ++int bch2_fs_initialize(struct bch_fs *); ++ ++#endif /* _BCACHEFS_RECOVERY_H */ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +new file mode 100644 +index 000000000000..3c473f1380a6 +--- /dev/null ++++ b/fs/bcachefs/reflink.c +@@ -0,0 +1,303 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "inode.h" ++#include "io.h" ++#include "reflink.h" ++ ++#include ++ ++/* reflink pointers */ ++ ++const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ if (bkey_val_bytes(p.k) != sizeof(*p.v)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); ++} ++ ++enum merge_result bch2_reflink_p_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); ++ struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); ++ ++ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) ++ return BCH_MERGE_NOMERGE; ++ ++ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { ++ bch2_key_resize(l.k, KEY_SIZE_MAX); ++ bch2_cut_front_s(l.k->p, _r); ++ return BCH_MERGE_PARTIAL; ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* indirect extents */ ++ ++const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ if (bkey_val_bytes(r.k) < sizeof(*r.v)) ++ return "incorrect value size"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); ++ ++ 
bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++static int bch2_make_extent_indirect(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct bkey_i_extent *e) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *reflink_iter; ++ struct bkey_s_c k; ++ struct bkey_i_reflink_v *r_v; ++ struct bkey_i_reflink_p *r_p; ++ int ret; ++ ++ for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, ++ POS(0, c->reflink_hint), ++ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { ++ if (reflink_iter->pos.inode) { ++ bch2_btree_iter_set_pos(reflink_iter, POS_MIN); ++ continue; ++ } ++ ++ if (bkey_deleted(k.k) && e->k.size <= k.k->size) ++ break; ++ } ++ ++ if (ret) ++ goto err; ++ ++ /* rewind iter to start of hole, if necessary: */ ++ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); ++ ++ r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); ++ ret = PTR_ERR_OR_ZERO(r_v); ++ if (ret) ++ goto err; ++ ++ bkey_reflink_v_init(&r_v->k_i); ++ r_v->k.p = reflink_iter->pos; ++ bch2_key_resize(&r_v->k, e->k.size); ++ r_v->k.version = e->k.version; ++ ++ set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + ++ bkey_val_u64s(&e->k)); ++ r_v->v.refcount = 0; ++ memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); ++ ++ bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); ++ ++ r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); ++ if (IS_ERR(r_p)) ++ return PTR_ERR(r_p); ++ ++ e->k.type = KEY_TYPE_reflink_p; ++ r_p = bkey_i_to_reflink_p(&e->k_i); ++ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); ++ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ++ ++ bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); ++err: ++ if (!IS_ERR(reflink_iter)) ++ c->reflink_hint = reflink_iter->pos.offset; ++ bch2_trans_iter_put(trans, reflink_iter); ++ ++ return ret; ++} ++ ++static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) ++{ ++ struct bkey_s_c k = bch2_btree_iter_peek(iter); ++ int ret; ++ ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (bkey_cmp(iter->pos, end) >= 0) ++ return bkey_s_c_null; ++ ++ if (k.k->type == KEY_TYPE_extent || ++ k.k->type == KEY_TYPE_reflink_p) ++ break; ++ } ++ ++ return k; ++} ++ ++s64 bch2_remap_range(struct bch_fs *c, ++ struct bpos dst_start, struct bpos src_start, ++ u64 remap_sectors, u64 *journal_seq, ++ u64 new_i_size, s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter *dst_iter, *src_iter; ++ struct bkey_s_c src_k; ++ BKEY_PADDED(k) new_dst; ++ struct bkey_on_stack new_src; ++ struct bpos dst_end = dst_start, src_end = src_start; ++ struct bpos dst_want, src_want; ++ u64 src_done, dst_done; ++ int ret = 0, ret2 = 0; ++ ++ if (!c->opts.reflink) ++ return -EOPNOTSUPP; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return -EROFS; ++ ++ bch2_check_set_feature(c, BCH_FEATURE_reflink); ++ ++ dst_end.offset += remap_sectors; ++ src_end.offset += remap_sectors; ++ ++ bkey_on_stack_init(&new_src); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); ++ ++ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, ++ BTREE_ITER_INTENT); ++ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, dst_start, ++ BTREE_ITER_INTENT); ++ ++ while (1) { ++ bch2_trans_begin(&trans); ++ ++ trans.mem_top = 0; ++ ++ if (fatal_signal_pending(current)) { ++ ret = -EINTR; ++ goto err; ++ } ++ ++ src_k = get_next_src(src_iter, src_end); ++ ret = bkey_err(src_k); ++ if (ret) ++ goto btree_err; ++ ++ src_done = bpos_min(src_iter->pos, src_end).offset - ++ src_start.offset; ++ dst_want = POS(dst_start.inode, 
dst_start.offset + src_done); ++ ++ if (bkey_cmp(dst_iter->pos, dst_want) < 0) { ++ ret = bch2_fpunch_at(&trans, dst_iter, dst_want, ++ journal_seq, i_sectors_delta); ++ if (ret) ++ goto btree_err; ++ continue; ++ } ++ ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); ++ ++ if (!bkey_cmp(dst_iter->pos, dst_end)) ++ break; ++ ++ if (src_k.k->type == KEY_TYPE_extent) { ++ bkey_on_stack_reassemble(&new_src, c, src_k); ++ src_k = bkey_i_to_s_c(new_src.k); ++ ++ bch2_cut_front(src_iter->pos, new_src.k); ++ bch2_cut_back(src_end, new_src.k); ++ ++ ret = bch2_make_extent_indirect(&trans, src_iter, ++ bkey_i_to_extent(new_src.k)); ++ if (ret) ++ goto btree_err; ++ ++ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); ++ } ++ ++ if (src_k.k->type == KEY_TYPE_reflink_p) { ++ struct bkey_s_c_reflink_p src_p = ++ bkey_s_c_to_reflink_p(src_k); ++ struct bkey_i_reflink_p *dst_p = ++ bkey_reflink_p_init(&new_dst.k); ++ ++ u64 offset = le64_to_cpu(src_p.v->idx) + ++ (src_iter->pos.offset - ++ bkey_start_offset(src_k.k)); ++ ++ dst_p->v.idx = cpu_to_le64(offset); ++ } else { ++ BUG(); ++ } ++ ++ new_dst.k.k.p = dst_iter->pos; ++ bch2_key_resize(&new_dst.k.k, ++ min(src_k.k->p.offset - src_iter->pos.offset, ++ dst_end.offset - dst_iter->pos.offset)); ++ ++ ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, ++ NULL, journal_seq, ++ new_i_size, i_sectors_delta); ++ if (ret) ++ goto btree_err; ++ ++ dst_done = dst_iter->pos.offset - dst_start.offset; ++ src_want = POS(src_start.inode, src_start.offset + dst_done); ++ bch2_btree_iter_set_pos(src_iter, src_want); ++btree_err: ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ goto err; ++ } ++ ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); ++err: ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); ++ ++ dst_done = dst_iter->pos.offset - dst_start.offset; ++ new_i_size = min(dst_iter->pos.offset << 9, new_i_size); ++ ++ bch2_trans_begin(&trans); ++ ++ do { ++ struct bch_inode_unpacked inode_u; ++ struct btree_iter *inode_iter; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, ++ dst_start.inode, BTREE_ITER_INTENT); ++ ret2 = PTR_ERR_OR_ZERO(inode_iter); ++ ++ if (!ret2 && ++ inode_u.bi_size < new_i_size) { ++ inode_u.bi_size = new_i_size; ++ ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, journal_seq, 0); ++ } ++ } while (ret2 == -EINTR); ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&new_src, c); ++ ++ percpu_ref_put(&c->writes); ++ ++ return dst_done ?: ret ?: ret2; ++} +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +new file mode 100644 +index 000000000000..5445c1cf0797 +--- /dev/null ++++ b/fs/bcachefs/reflink.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REFLINK_H ++#define _BCACHEFS_REFLINK_H ++ ++const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++enum merge_result bch2_reflink_p_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_p_invalid, \ ++ .val_to_text = bch2_reflink_p_to_text, \ ++ .key_merge = bch2_reflink_p_merge, \ ++} ++ ++const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++ ++#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_v_invalid, \ ++ .val_to_text = bch2_reflink_v_to_text, \ ++ .swab = 
bch2_ptr_swab, \ ++} ++ ++s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, ++ u64, u64 *, u64, s64 *); ++ ++#endif /* _BCACHEFS_REFLINK_H */ +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +new file mode 100644 +index 000000000000..6b6506c68609 +--- /dev/null ++++ b/fs/bcachefs/replicas.c +@@ -0,0 +1,1059 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, ++ struct bch_replicas_cpu *); ++ ++/* Replicas tracking - in memory: */ ++ ++static inline int u8_cmp(u8 l, u8 r) ++{ ++ return cmp_int(l, r); ++} ++ ++static void verify_replicas_entry(struct bch_replicas_entry *e) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned i; ++ ++ BUG_ON(e->data_type >= BCH_DATA_NR); ++ BUG_ON(!e->nr_devs); ++ BUG_ON(e->nr_required > 1 && ++ e->nr_required >= e->nr_devs); ++ ++ for (i = 0; i + 1 < e->nr_devs; i++) ++ BUG_ON(e->devs[i] >= e->devs[i + 1]); ++#endif ++} ++ ++static void replicas_entry_sort(struct bch_replicas_entry *e) ++{ ++ bubble_sort(e->devs, e->nr_devs, u8_cmp); ++} ++ ++static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) ++{ ++ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); ++} ++ ++void bch2_replicas_entry_to_text(struct printbuf *out, ++ struct bch_replicas_entry *e) ++{ ++ unsigned i; ++ ++ pr_buf(out, "%s: %u/%u [", ++ bch2_data_types[e->data_type], ++ e->nr_required, ++ e->nr_devs); ++ ++ for (i = 0; i < e->nr_devs; i++) ++ pr_buf(out, i ? " %u" : "%u", e->devs[i]); ++ pr_buf(out, "]"); ++} ++ ++void bch2_cpu_replicas_to_text(struct printbuf *out, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_cpu_replicas_entry(r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++} ++ ++static void extent_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ r->nr_required = 1; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ continue; ++ ++ if (!p.has_ec) ++ r->devs[r->nr_devs++] = p.ptr.dev; ++ else ++ r->nr_required = 0; ++ } ++} ++ ++static void stripe_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ const struct bch_extent_ptr *ptr; ++ ++ r->nr_required = s.v->nr_blocks - s.v->nr_redundant; ++ ++ for (ptr = s.v->ptrs; ++ ptr < s.v->ptrs + s.v->nr_blocks; ++ ptr++) ++ r->devs[r->nr_devs++] = ptr->dev; ++} ++ ++void bch2_bkey_to_replicas(struct bch_replicas_entry *e, ++ struct bkey_s_c k) ++{ ++ e->nr_devs = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ e->data_type = BCH_DATA_btree; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ e->data_type = BCH_DATA_user; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_stripe: ++ e->data_type = BCH_DATA_user; ++ stripe_to_replicas(k, e); ++ break; ++ } ++ ++ replicas_entry_sort(e); ++} ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *e, ++ enum bch_data_type data_type, ++ struct bch_devs_list devs) ++{ ++ unsigned i; ++ ++ BUG_ON(!data_type || ++ data_type == BCH_DATA_sb || ++ data_type >= BCH_DATA_NR); ++ ++ e->data_type = data_type; ++ e->nr_devs = 0; ++ 
e->nr_required = 1; ++ ++ for (i = 0; i < devs.nr; i++) ++ e->devs[e->nr_devs++] = devs.devs[i]; ++ ++ replicas_entry_sort(e); ++} ++ ++static struct bch_replicas_cpu ++cpu_replicas_add_entry(struct bch_replicas_cpu *old, ++ struct bch_replicas_entry *new_entry) ++{ ++ unsigned i; ++ struct bch_replicas_cpu new = { ++ .nr = old->nr + 1, ++ .entry_size = max_t(unsigned, old->entry_size, ++ replicas_entry_bytes(new_entry)), ++ }; ++ ++ BUG_ON(!new_entry->data_type); ++ verify_replicas_entry(new_entry); ++ ++ new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); ++ if (!new.entries) ++ return new; ++ ++ for (i = 0; i < old->nr; i++) ++ memcpy(cpu_replicas_entry(&new, i), ++ cpu_replicas_entry(old, i), ++ old->entry_size); ++ ++ memcpy(cpu_replicas_entry(&new, old->nr), ++ new_entry, ++ replicas_entry_bytes(new_entry)); ++ ++ bch2_cpu_replicas_sort(&new); ++ return new; ++} ++ ++static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ int idx, entry_size = replicas_entry_bytes(search); ++ ++ if (unlikely(entry_size > r->entry_size)) ++ return -1; ++ ++ verify_replicas_entry(search); ++ ++#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) ++ idx = eytzinger0_find(r->entries, r->nr, r->entry_size, ++ entry_cmp, search); ++#undef entry_cmp ++ ++ return idx < r->nr ? idx : -1; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *c, ++ struct bch_replicas_entry *search) ++{ ++ replicas_entry_sort(search); ++ ++ return __replicas_entry_idx(&c->replicas, search); ++} ++ ++static bool __replicas_has_entry(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ return __replicas_entry_idx(r, search) >= 0; ++} ++ ++bool bch2_replicas_marked(struct bch_fs *c, ++ struct bch_replicas_entry *search) ++{ ++ bool marked; ++ ++ if (!search->nr_devs) ++ return true; ++ ++ verify_replicas_entry(search); ++ ++ percpu_down_read(&c->mark_lock); ++ marked = __replicas_has_entry(&c->replicas, search) && ++ (likely((!c->replicas_gc.entries)) || ++ __replicas_has_entry(&c->replicas_gc, search)); ++ percpu_up_read(&c->mark_lock); ++ ++ return marked; ++} ++ ++static void __replicas_table_update(struct bch_fs_usage *dst, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage *src, ++ struct bch_replicas_cpu *src_r) ++{ ++ int src_idx, dst_idx; ++ ++ *dst = *src; ++ ++ for (src_idx = 0; src_idx < src_r->nr; src_idx++) { ++ if (!src->replicas[src_idx]) ++ continue; ++ ++ dst_idx = __replicas_entry_idx(dst_r, ++ cpu_replicas_entry(src_r, src_idx)); ++ BUG_ON(dst_idx < 0); ++ ++ dst->replicas[dst_idx] = src->replicas[src_idx]; ++ } ++} ++ ++static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage __percpu *src_p, ++ struct bch_replicas_cpu *src_r) ++{ ++ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; ++ struct bch_fs_usage *dst, *src = (void *) ++ bch2_acc_percpu_u64s((void *) src_p, src_nr); ++ ++ preempt_disable(); ++ dst = this_cpu_ptr(dst_p); ++ preempt_enable(); ++ ++ __replicas_table_update(dst, dst_r, src, src_r); ++} ++ ++/* ++ * Resize filesystem accounting: ++ */ ++static int replicas_table_update(struct bch_fs *c, ++ struct bch_replicas_cpu *new_r) ++{ ++ struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; ++ struct bch_fs_usage *new_scratch = NULL; ++ struct bch_fs_usage __percpu *new_gc = NULL; ++ struct bch_fs_usage *new_base = NULL; ++ unsigned bytes = sizeof(struct bch_fs_usage) + ++ sizeof(u64) * new_r->nr; ++ 
int ret = -ENOMEM; ++ ++ if (!(new_base = kzalloc(bytes, GFP_NOIO)) || ++ !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), ++ GFP_NOIO)) || ++ !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), ++ GFP_NOIO)) || ++ !(new_scratch = kmalloc(bytes, GFP_NOIO)) || ++ (c->usage_gc && ++ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { ++ bch_err(c, "error updating replicas table: memory allocation failure"); ++ goto err; ++ } ++ ++ if (c->usage_base) ++ __replicas_table_update(new_base, new_r, ++ c->usage_base, &c->replicas); ++ if (c->usage[0]) ++ __replicas_table_update_pcpu(new_usage[0], new_r, ++ c->usage[0], &c->replicas); ++ if (c->usage[1]) ++ __replicas_table_update_pcpu(new_usage[1], new_r, ++ c->usage[1], &c->replicas); ++ if (c->usage_gc) ++ __replicas_table_update_pcpu(new_gc, new_r, ++ c->usage_gc, &c->replicas); ++ ++ swap(c->usage_base, new_base); ++ swap(c->usage[0], new_usage[0]); ++ swap(c->usage[1], new_usage[1]); ++ swap(c->usage_scratch, new_scratch); ++ swap(c->usage_gc, new_gc); ++ swap(c->replicas, *new_r); ++ ret = 0; ++err: ++ free_percpu(new_gc); ++ kfree(new_scratch); ++ free_percpu(new_usage[1]); ++ free_percpu(new_usage[0]); ++ kfree(new_base); ++ return ret; ++} ++ ++static unsigned reserve_journal_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ unsigned journal_res_u64s = 0; ++ ++ /* nr_inodes: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* key_version: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* persistent_reserved: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * ++ BCH_REPLICAS_MAX; ++ ++ for_each_cpu_replicas_entry(r, e) ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + ++ e->nr_devs, sizeof(u64)); ++ return journal_res_u64s; ++} ++ ++noinline ++static int bch2_mark_replicas_slowpath(struct bch_fs *c, ++ struct bch_replicas_entry *new_entry) ++{ ++ struct bch_replicas_cpu new_r, new_gc; ++ int ret = 0; ++ ++ verify_replicas_entry(new_entry); ++ ++ memset(&new_r, 0, sizeof(new_r)); ++ memset(&new_gc, 0, sizeof(new_gc)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (c->replicas_gc.entries && ++ !__replicas_has_entry(&c->replicas_gc, new_entry)) { ++ new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); ++ if (!new_gc.entries) ++ goto err; ++ } ++ ++ if (!__replicas_has_entry(&c->replicas, new_entry)) { ++ new_r = cpu_replicas_add_entry(&c->replicas, new_entry); ++ if (!new_r.entries) ++ goto err; ++ ++ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); ++ if (ret) ++ goto err; ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->replicas_journal_res, ++ reserve_journal_replicas(c, &new_r)); ++ } ++ ++ if (!new_r.entries && ++ !new_gc.entries) ++ goto out; ++ ++ /* allocations done, now commit: */ ++ ++ if (new_r.entries) ++ bch2_write_super(c); ++ ++ /* don't update in memory replicas until changes are persistent */ ++ percpu_down_write(&c->mark_lock); ++ if (new_r.entries) ++ ret = replicas_table_update(c, &new_r); ++ if (new_gc.entries) ++ swap(new_gc, c->replicas_gc); ++ percpu_up_write(&c->mark_lock); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ kfree(new_r.entries); ++ kfree(new_gc.entries); ++ ++ return ret; ++err: ++ bch_err(c, "error adding replicas entry: memory allocation failure"); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int __bch2_mark_replicas(struct bch_fs *c, ++ struct bch_replicas_entry *r, 
++ bool check) ++{ ++ return likely(bch2_replicas_marked(c, r)) ? 0 ++ : check ? -1 ++ : bch2_mark_replicas_slowpath(c, r); ++} ++ ++int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) ++{ ++ return __bch2_mark_replicas(c, r, false); ++} ++ ++static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, ++ bool check) ++{ ++ struct bch_replicas_padded search; ++ struct bch_devs_list cached = bch2_bkey_cached_devs(k); ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < cached.nr; i++) { ++ bch2_replicas_entry_cached(&search.e, cached.devs[i]); ++ ++ ret = __bch2_mark_replicas(c, &search.e, check); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_bkey_to_replicas(&search.e, k); ++ ++ return __bch2_mark_replicas(c, &search.e, check); ++} ++ ++bool bch2_bkey_replicas_marked(struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ return __bch2_mark_bkey_replicas(c, k, true) == 0; ++} ++ ++int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) ++{ ++ return __bch2_mark_bkey_replicas(c, k, false); ++} ++ ++int bch2_replicas_gc_end(struct bch_fs *c, int ret) ++{ ++ unsigned i; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ /* ++ * this is kind of crappy; the replicas gc mechanism needs to be ripped ++ * out ++ */ ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct bch_replicas_cpu n; ++ ++ if (!__replicas_has_entry(&c->replicas_gc, e) && ++ (c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i]))) { ++ n = cpu_replicas_add_entry(&c->replicas_gc, e); ++ if (!n.entries) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ swap(n, c->replicas_gc); ++ kfree(n.entries); ++ } ++ } ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &c->replicas_gc); ++err: ++ kfree(c->replicas_gc.entries); ++ c->replicas_gc.entries = NULL; ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) ++{ ++ struct bch_replicas_entry *e; ++ unsigned i = 0; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ BUG_ON(c->replicas_gc.entries); ++ ++ c->replicas_gc.nr = 0; ++ c->replicas_gc.entry_size = 0; ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) { ++ c->replicas_gc.nr++; ++ c->replicas_gc.entry_size = ++ max_t(unsigned, c->replicas_gc.entry_size, ++ replicas_entry_bytes(e)); ++ } ++ ++ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, ++ c->replicas_gc.entry_size, ++ GFP_NOIO); ++ if (!c->replicas_gc.entries) { ++ mutex_unlock(&c->sb_lock); ++ bch_err(c, "error allocating c->replicas_gc"); ++ return -ENOMEM; ++ } ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) ++ memcpy(cpu_replicas_entry(&c->replicas_gc, i++), ++ e, c->replicas_gc.entry_size); ++ ++ bch2_cpu_replicas_sort(&c->replicas_gc); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_replicas_gc2(struct bch_fs *c) ++{ ++ struct bch_replicas_cpu new = { 0 }; ++ unsigned i, nr; ++ int ret = 0; ++ ++ bch2_journal_meta(&c->journal); ++retry: ++ nr = READ_ONCE(c->replicas.nr); ++ new.entry_size = READ_ONCE(c->replicas.entry_size); ++ new.entries = kcalloc(nr, new.entry_size, 
GFP_KERNEL); ++ if (!new.entries) { ++ bch_err(c, "error allocating c->replicas_gc"); ++ return -ENOMEM; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ if (nr != c->replicas.nr || ++ new.entry_size != c->replicas.entry_size) { ++ percpu_up_write(&c->mark_lock); ++ mutex_unlock(&c->sb_lock); ++ kfree(new.entries); ++ goto retry; ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ if (e->data_type == BCH_DATA_journal || ++ c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i])) ++ memcpy(cpu_replicas_entry(&new, new.nr++), ++ e, new.entry_size); ++ } ++ ++ bch2_cpu_replicas_sort(&new); ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &new); ++err: ++ kfree(new.entries); ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_set_usage(struct bch_fs *c, ++ struct bch_replicas_entry *r, ++ u64 sectors) ++{ ++ int ret, idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) { ++ struct bch_replicas_cpu n; ++ ++ n = cpu_replicas_add_entry(&c->replicas, r); ++ if (!n.entries) ++ return -ENOMEM; ++ ++ ret = replicas_table_update(c, &n); ++ if (ret) ++ return ret; ++ ++ kfree(n.entries); ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ BUG_ON(ret < 0); ++ } ++ ++ c->usage_base->replicas[idx] = sectors; ++ ++ return 0; ++} ++ ++/* Replicas tracking - superblock: */ ++ ++static int ++__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry *e, *dst; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ dst = cpu_replicas_entry(cpu_r, idx++); ++ memcpy(dst, e, replicas_entry_bytes(e)); ++ replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++static int ++__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry_v0 *e; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ entry_size += sizeof(struct bch_replicas_entry) - ++ sizeof(struct bch_replicas_entry_v0); ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ struct bch_replicas_entry *dst = ++ cpu_replicas_entry(cpu_r, idx++); ++ ++ dst->data_type = e->data_type; ++ dst->nr_devs = e->nr_devs; ++ dst->nr_required = 1; ++ memcpy(dst->devs, e->devs, e->nr_devs); ++ replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) ++{ ++ struct bch_sb_field_replicas *sb_v1; ++ struct bch_sb_field_replicas_v0 *sb_v0; ++ struct bch_replicas_cpu new_r = { 0, 0, NULL }; ++ int ret = 0; ++ ++ if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); ++ else if 
((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); ++ ++ if (ret) ++ return -ENOMEM; ++ ++ bch2_cpu_replicas_sort(&new_r); ++ ++ percpu_down_write(&c->mark_lock); ++ ++ ret = replicas_table_update(c, &new_r); ++ percpu_up_write(&c->mark_lock); ++ ++ kfree(new_r.entries); ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r; ++ struct bch_replicas_entry_v0 *dst; ++ struct bch_replicas_entry *src; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) ++ bytes += replicas_entry_bytes(src) - 1; ++ ++ sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); ++ sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ dst->data_type = src->data_type; ++ dst->nr_devs = src->nr_devs; ++ memcpy(dst->devs, src->devs, src->nr_devs); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas *sb_r; ++ struct bch_replicas_entry *dst, *src; ++ bool need_v1 = false; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) { ++ bytes += replicas_entry_bytes(src); ++ if (src->nr_required != 1) ++ need_v1 = true; ++ } ++ ++ if (!need_v1) ++ return bch2_cpu_replicas_to_sb_replicas_v0(c, r); ++ ++ sb_r = bch2_sb_resize_replicas(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); ++ sb_r = bch2_sb_get_replicas(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ memcpy(dst, src, replicas_entry_bytes(src)); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) ++{ ++ unsigned i; ++ ++ sort_cmp_size(cpu_r->entries, ++ cpu_r->nr, ++ cpu_r->entry_size, ++ memcmp, NULL); ++ ++ for (i = 0; i + 1 < cpu_r->nr; i++) { ++ struct bch_replicas_entry *l = ++ cpu_replicas_entry(cpu_r, i); ++ struct bch_replicas_entry *r = ++ cpu_replicas_entry(cpu_r, i + 1); ++ ++ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); ++ ++ if (!memcmp(l, r, cpu_r->entry_size)) ++ return "duplicate replicas entry"; ++ } ++ ++ return NULL; ++} ++ ++static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_replicas_cpu cpu_r = { .entries = NULL }; ++ struct bch_replicas_entry *e; ++ const char *err; ++ unsigned i; ++ ++ for_each_replicas_entry(sb_r, e) { ++ err = "invalid replicas entry: invalid data type"; ++ if (e->data_type >= BCH_DATA_NR) ++ goto err; ++ ++ err = "invalid replicas entry: no devices"; ++ if (!e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas 
entry: bad nr_required"; ++ if (e->nr_required > 1 && ++ e->nr_required >= e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: invalid device"; ++ for (i = 0; i < e->nr_devs; i++) ++ if (!bch2_dev_exists(sb, mi, e->devs[i])) ++ goto err; ++ } ++ ++ err = "cannot allocate memory"; ++ if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) ++ goto err; ++ ++ err = check_dup_replicas_entries(&cpu_r); ++err: ++ kfree(cpu_r.entries); ++ return err; ++} ++ ++static void bch2_sb_replicas_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas *r = field_to_type(f, replicas); ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_replicas_entry(r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas = { ++ .validate = bch2_sb_validate_replicas, ++ .to_text = bch2_sb_replicas_to_text, ++}; ++ ++static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_replicas_cpu cpu_r = { .entries = NULL }; ++ struct bch_replicas_entry_v0 *e; ++ const char *err; ++ unsigned i; ++ ++ for_each_replicas_entry_v0(sb_r, e) { ++ err = "invalid replicas entry: invalid data type"; ++ if (e->data_type >= BCH_DATA_NR) ++ goto err; ++ ++ err = "invalid replicas entry: no devices"; ++ if (!e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: invalid device"; ++ for (i = 0; i < e->nr_devs; i++) ++ if (!bch2_dev_exists(sb, mi, e->devs[i])) ++ goto err; ++ } ++ ++ err = "cannot allocate memory"; ++ if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) ++ goto err; ++ ++ err = check_dup_replicas_entries(&cpu_r); ++err: ++ kfree(cpu_r.entries); ++ return err; ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { ++ .validate = bch2_sb_validate_replicas_v0, ++}; ++ ++/* Query replicas: */ ++ ++struct replicas_status __bch2_replicas_status(struct bch_fs *c, ++ struct bch_devs_mask online_devs) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_replicas_entry *e; ++ unsigned i, nr_online, nr_offline; ++ struct replicas_status ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ ++ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) ++ ret.replicas[i].redundancy = INT_MAX; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ percpu_down_read(&c->mark_lock); ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) { ++ if (e->data_type >= ARRAY_SIZE(ret.replicas)) ++ panic("e %p data_type %u\n", e, e->data_type); ++ ++ nr_online = nr_offline = 0; ++ ++ for (i = 0; i < e->nr_devs; i++) { ++ BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, ++ e->devs[i])); ++ ++ if (test_bit(e->devs[i], online_devs.d)) ++ nr_online++; ++ else ++ nr_offline++; ++ } ++ ++ ret.replicas[e->data_type].redundancy = ++ min(ret.replicas[e->data_type].redundancy, ++ (int) nr_online - (int) e->nr_required); ++ ++ ret.replicas[e->data_type].nr_offline = ++ max(ret.replicas[e->data_type].nr_offline, ++ nr_offline); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++ ++ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) ++ if (ret.replicas[i].redundancy == INT_MAX) ++ ret.replicas[i].redundancy = 0; ++ ++ return ret; ++} ++ ++struct replicas_status bch2_replicas_status(struct bch_fs *c) ++{ ++ return __bch2_replicas_status(c, bch2_online_devs(c)); ++} ++ ++static bool 
have_enough_devs(struct replicas_status s, ++ enum bch_data_type type, ++ bool force_if_degraded, ++ bool force_if_lost) ++{ ++ return (!s.replicas[type].nr_offline || force_if_degraded) && ++ (s.replicas[type].redundancy >= 0 || force_if_lost); ++} ++ ++bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) ++{ ++ return (have_enough_devs(s, BCH_DATA_journal, ++ flags & BCH_FORCE_IF_METADATA_DEGRADED, ++ flags & BCH_FORCE_IF_METADATA_LOST) && ++ have_enough_devs(s, BCH_DATA_btree, ++ flags & BCH_FORCE_IF_METADATA_DEGRADED, ++ flags & BCH_FORCE_IF_METADATA_LOST) && ++ have_enough_devs(s, BCH_DATA_user, ++ flags & BCH_FORCE_IF_DATA_DEGRADED, ++ flags & BCH_FORCE_IF_DATA_LOST)); ++} ++ ++int bch2_replicas_online(struct bch_fs *c, bool meta) ++{ ++ struct replicas_status s = bch2_replicas_status(c); ++ ++ return (meta ++ ? min(s.replicas[BCH_DATA_journal].redundancy, ++ s.replicas[BCH_DATA_btree].redundancy) ++ : s.replicas[BCH_DATA_user].redundancy) + 1; ++} ++ ++unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_replicas_entry *e; ++ unsigned i, ret = 0; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ for (i = 0; i < e->nr_devs; i++) ++ if (e->devs[i] == ca->dev_idx) ++ ret |= 1 << e->data_type; ++ ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++int bch2_fs_replicas_init(struct bch_fs *c) ++{ ++ c->journal.entry_u64s_reserved += ++ reserve_journal_replicas(c, &c->replicas); ++ ++ return replicas_table_update(c, &c->replicas); ++} +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +new file mode 100644 +index 000000000000..8b95164fbb56 +--- /dev/null ++++ b/fs/bcachefs/replicas.h +@@ -0,0 +1,91 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REPLICAS_H ++#define _BCACHEFS_REPLICAS_H ++ ++#include "eytzinger.h" ++#include "replicas_types.h" ++ ++void bch2_replicas_entry_to_text(struct printbuf *, ++ struct bch_replicas_entry *); ++void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); ++ ++static inline struct bch_replicas_entry * ++cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) ++{ ++ return (void *) r->entries + r->entry_size * i; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *, ++ enum bch_data_type, ++ struct bch_devs_list); ++bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); ++int bch2_mark_replicas(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); ++bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); ++int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); ++ ++static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, ++ unsigned dev) ++{ ++ e->data_type = BCH_DATA_cached; ++ e->nr_devs = 1; ++ e->nr_required = 1; ++ e->devs[0] = dev; ++} ++ ++struct replicas_status { ++ struct { ++ int redundancy; ++ unsigned nr_offline; ++ } replicas[BCH_DATA_NR]; ++}; ++ ++struct replicas_status __bch2_replicas_status(struct bch_fs *, ++ struct bch_devs_mask); ++struct replicas_status bch2_replicas_status(struct bch_fs *); ++bool bch2_have_enough_devs(struct replicas_status, unsigned); ++ ++int bch2_replicas_online(struct bch_fs *, bool); ++unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); ++ ++int bch2_replicas_gc_end(struct bch_fs *, int); ++int bch2_replicas_gc_start(struct bch_fs 
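A minimal sketch of how the replicas status computed above is meant to be consumed (illustrative only, not part of the patch: the helper name and the log message are invented, everything else uses only declarations visible in replicas.h):

        static bool example_can_continue_degraded(struct bch_fs *c, unsigned opt_flags)
        {
                struct replicas_status s = bch2_replicas_status(c);

                /*
                 * For each data type, redundancy is min(nr_online - nr_required)
                 * over that type's replicas entries (0 if there are none):
                 * 0 means no copy to spare, negative means some data of that
                 * type currently has no online copy.
                 */
                if (s.replicas[BCH_DATA_user].redundancy < 0)
                        pr_info("some user data currently has no online copy");

                /*
                 * opt_flags would carry the BCH_FORCE_IF_{DATA,METADATA}_{DEGRADED,LOST}
                 * bits, exactly as bch2_have_enough_devs() interprets them above.
                 */
                return bch2_have_enough_devs(s, opt_flags);
        }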
*, unsigned); ++int bch2_replicas_gc2(struct bch_fs *); ++ ++int bch2_replicas_set_usage(struct bch_fs *, ++ struct bch_replicas_entry *, ++ u64); ++ ++#define for_each_cpu_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ ++ _i = (void *) (_i) + (_r)->entry_size) ++ ++/* iterate over superblock replicas - used by userspace tools: */ ++ ++#define replicas_entry_next(_i) \ ++ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) ++ ++#define for_each_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++#define for_each_replicas_entry_v0(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; ++ ++int bch2_fs_replicas_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REPLICAS_H */ +diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h +new file mode 100644 +index 000000000000..0535b1d3760e +--- /dev/null ++++ b/fs/bcachefs/replicas_types.h +@@ -0,0 +1,10 @@ ++#ifndef _BCACHEFS_REPLICAS_TYPES_H ++#define _BCACHEFS_REPLICAS_TYPES_H ++ ++struct bch_replicas_cpu { ++ unsigned nr; ++ unsigned entry_size; ++ struct bch_replicas_entry *entries; ++}; ++ ++#endif /* _BCACHEFS_REPLICAS_TYPES_H */ +diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c +new file mode 100644 +index 000000000000..c062edb3fbc2 +--- /dev/null ++++ b/fs/bcachefs/siphash.c +@@ -0,0 +1,173 @@ ++// SPDX-License-Identifier: BSD-3-Clause ++/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ ++ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. 
++ */ ++ ++/* ++ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d ++ * are the number of compression rounds and the number of finalization rounds. ++ * A compression round is identical to a finalization round and this round ++ * function is called SipRound. Given a 128-bit key k and a (possibly empty) ++ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). ++ * ++ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, ++ * by Jean-Philippe Aumasson and Daniel J. Bernstein, ++ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa ++ * https://131002.net/siphash/siphash.pdf ++ * https://131002.net/siphash/ ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "siphash.h" ++ ++static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) ++{ ++ while (rounds--) { ++ ctx->v[0] += ctx->v[1]; ++ ctx->v[2] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 13); ++ ctx->v[3] = rol64(ctx->v[3], 16); ++ ++ ctx->v[1] ^= ctx->v[0]; ++ ctx->v[3] ^= ctx->v[2]; ++ ctx->v[0] = rol64(ctx->v[0], 32); ++ ++ ctx->v[2] += ctx->v[1]; ++ ctx->v[0] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 17); ++ ctx->v[3] = rol64(ctx->v[3], 21); ++ ++ ctx->v[1] ^= ctx->v[2]; ++ ctx->v[3] ^= ctx->v[0]; ++ ctx->v[2] = rol64(ctx->v[2], 32); ++ } ++} ++ ++static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) ++{ ++ u64 m = get_unaligned_le64(ptr); ++ ++ ctx->v[3] ^= m; ++ SipHash_Rounds(ctx, rounds); ++ ctx->v[0] ^= m; ++} ++ ++void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) ++{ ++ u64 k0, k1; ++ ++ k0 = le64_to_cpu(key->k0); ++ k1 = le64_to_cpu(key->k1); ++ ++ ctx->v[0] = 0x736f6d6570736575ULL ^ k0; ++ ctx->v[1] = 0x646f72616e646f6dULL ^ k1; ++ ctx->v[2] = 0x6c7967656e657261ULL ^ k0; ++ ctx->v[3] = 0x7465646279746573ULL ^ k1; ++ ++ memset(ctx->buf, 0, sizeof(ctx->buf)); ++ ctx->bytes = 0; ++} ++ ++void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, ++ const void *src, size_t len) ++{ ++ const u8 *ptr = src; ++ size_t left, used; ++ ++ if (len == 0) ++ return; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ ctx->bytes += len; ++ ++ if (used > 0) { ++ left = sizeof(ctx->buf) - used; ++ ++ if (len >= left) { ++ memcpy(&ctx->buf[used], ptr, left); ++ SipHash_CRounds(ctx, ctx->buf, rc); ++ len -= left; ++ ptr += left; ++ } else { ++ memcpy(&ctx->buf[used], ptr, len); ++ return; ++ } ++ } ++ ++ while (len >= sizeof(ctx->buf)) { ++ SipHash_CRounds(ctx, ptr, rc); ++ len -= sizeof(ctx->buf); ++ ptr += sizeof(ctx->buf); ++ } ++ ++ if (len > 0) ++ memcpy(&ctx->buf[used], ptr, len); ++} ++ ++void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ ++ r = SipHash_End(ctx, rc, rf); ++ ++ *((__le64 *) dst) = cpu_to_le64(r); ++} ++ ++u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ size_t left, used; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ left = sizeof(ctx->buf) - used; ++ memset(&ctx->buf[used], 0, left - 1); ++ ctx->buf[7] = ctx->bytes; ++ ++ SipHash_CRounds(ctx, ctx->buf, rc); ++ ctx->v[2] ^= 0xff; ++ SipHash_Rounds(ctx, rf); ++ ++ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); ++ memset(ctx, 0, sizeof(*ctx)); ++ return (r); ++} ++ ++u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) ++{ ++ SIPHASH_CTX ctx; ++ ++ SipHash_Init(&ctx, key); ++ SipHash_Update(&ctx, rc, rf, src, len); ++ return SipHash_End(&ctx, rc, rf); ++} +diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h +new file mode 100644 +index 000000000000..3dfaf34a43b2 +--- 
/dev/null ++++ b/fs/bcachefs/siphash.h +@@ -0,0 +1,87 @@ ++/* SPDX-License-Identifier: BSD-3-Clause */ ++/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ * ++ * $FreeBSD$ ++ */ ++ ++/* ++ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) ++ * optimized for speed on short messages returning a 64bit hash/digest value. 
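Because the usage sketch further down in this comment still shows a SipHash_SetKey() call that this header does not declare, here is a minimal example matching the prototypes and SipHash24_*() wrappers actually declared below (illustrative only, not part of the patch; the key bytes are arbitrary, and the usual kernel headers for u64/cpu_to_le64 are assumed):

        #include "siphash.h"

        /* Hash an arbitrary buffer with SipHash-2-4. */
        static u64 example_siphash24(const void *buf, size_t len)
        {
                SIPHASH_KEY key = {
                        .k0 = cpu_to_le64(0x0706050403020100ULL),
                        .k1 = cpu_to_le64(0x0f0e0d0c0b0a0908ULL),
                };
                SIPHASH_CTX ctx;
                u64 one_shot, incremental;

                /* One-shot helper: */
                one_shot = SipHash24(&key, buf, len);

                /* Equivalent incremental interface; the key is passed to _Init(): */
                SipHash24_Init(&ctx, &key);
                SipHash24_Update(&ctx, buf, len);
                incremental = SipHash24_End(&ctx);

                return one_shot == incremental ? one_shot : 0; /* always equal */
        }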
++ * ++ * The number of rounds is defined during the initialization: ++ * SipHash24_Init() for the fast and resonable strong version ++ * SipHash48_Init() for the strong version (half as fast) ++ * ++ * struct SIPHASH_CTX ctx; ++ * SipHash24_Init(&ctx); ++ * SipHash_SetKey(&ctx, "16bytes long key"); ++ * SipHash_Update(&ctx, pointer_to_string, length_of_string); ++ * SipHash_Final(output, &ctx); ++ */ ++ ++#ifndef _SIPHASH_H_ ++#define _SIPHASH_H_ ++ ++#include ++ ++#define SIPHASH_BLOCK_LENGTH 8 ++#define SIPHASH_KEY_LENGTH 16 ++#define SIPHASH_DIGEST_LENGTH 8 ++ ++typedef struct _SIPHASH_CTX { ++ u64 v[4]; ++ u8 buf[SIPHASH_BLOCK_LENGTH]; ++ u32 bytes; ++} SIPHASH_CTX; ++ ++typedef struct { ++ __le64 k0; ++ __le64 k1; ++} SIPHASH_KEY; ++ ++void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); ++void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); ++u64 SipHash_End(SIPHASH_CTX *, int, int); ++void SipHash_Final(void *, SIPHASH_CTX *, int, int); ++u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); ++ ++#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) ++#define SipHash24_End(_d) SipHash_End((_d), 2, 4) ++#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) ++#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) ++ ++#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) ++#define SipHash48_End(_d) SipHash_End((_d), 4, 8) ++#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) ++#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) ++ ++#endif /* _SIPHASH_H_ */ +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +new file mode 100644 +index 000000000000..dea9b7252b88 +--- /dev/null ++++ b/fs/bcachefs/str_hash.h +@@ -0,0 +1,336 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_STR_HASH_H ++#define _BCACHEFS_STR_HASH_H ++ ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "checksum.h" ++#include "error.h" ++#include "inode.h" ++#include "siphash.h" ++#include "super.h" ++ ++#include ++#include ++#include ++ ++static inline enum bch_str_hash_type ++bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) ++{ ++ switch (opt) { ++ case BCH_STR_HASH_OPT_CRC32C: ++ return BCH_STR_HASH_CRC32C; ++ case BCH_STR_HASH_OPT_CRC64: ++ return BCH_STR_HASH_CRC64; ++ case BCH_STR_HASH_OPT_SIPHASH: ++ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) ++ ? 
BCH_STR_HASH_SIPHASH ++ : BCH_STR_HASH_SIPHASH_OLD; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_info { ++ u8 type; ++ union { ++ __le64 crc_key; ++ SIPHASH_KEY siphash_key; ++ }; ++}; ++ ++static inline struct bch_hash_info ++bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) ++{ ++ /* XXX ick */ ++ struct bch_hash_info info = { ++ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & ++ ~(~0U << INODE_STR_HASH_BITS), ++ .crc_key = bi->bi_hash_seed, ++ }; ++ ++ if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { ++ SHASH_DESC_ON_STACK(desc, c->sha256); ++ u8 digest[SHA256_DIGEST_SIZE]; ++ ++ desc->tfm = c->sha256; ++ ++ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, ++ sizeof(bi->bi_hash_seed), digest); ++ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); ++ } ++ ++ return info; ++} ++ ++struct bch_str_hash_ctx { ++ union { ++ u32 crc32c; ++ u64 crc64; ++ SIPHASH_CTX siphash; ++ }; ++}; ++ ++static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); ++ break; ++ case BCH_STR_HASH_CRC64: ++ ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); ++ break; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ SipHash24_Init(&ctx->siphash, &info->siphash_key); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info, ++ const void *data, size_t len) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ ctx->crc32c = crc32c(ctx->crc32c, data, len); ++ break; ++ case BCH_STR_HASH_CRC64: ++ ctx->crc64 = crc64_be(ctx->crc64, data, len); ++ break; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ SipHash24_Update(&ctx->siphash, data, len); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ return ctx->crc32c; ++ case BCH_STR_HASH_CRC64: ++ return ctx->crc64 >> 1; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ return SipHash24_End(&ctx->siphash) >> 1; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_desc { ++ enum btree_id btree_id; ++ u8 key_type; ++ ++ u64 (*hash_key)(const struct bch_hash_info *, const void *); ++ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); ++ bool (*cmp_key)(struct bkey_s_c, const void *); ++ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); ++}; ++ ++static __always_inline struct btree_iter * ++bch2_hash_lookup(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, const void *key, ++ unsigned flags) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_key(info, key)), ++ BTREE_ITER_SLOTS|flags, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type == desc.key_type) { ++ if (!desc.cmp_key(k, key)) ++ return iter; ++ } else if (k.k->type == KEY_TYPE_whiteout) { ++ ; ++ } else { ++ /* hole, not found */ ++ break; ++ } ++ } ++ bch2_trans_iter_put(trans, iter); ++ ++ return ERR_PTR(ret ?: -ENOENT); ++} ++ ++static __always_inline struct btree_iter * ++bch2_hash_hole(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info 
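A minimal sketch of the string-hash helpers above (illustrative only, not part of the patch): this is the shape a bch_hash_desc::hash_key implementation takes, with an invented function name and no assumptions beyond the declarations in this header:

        static u64 example_name_hash(const struct bch_hash_info *info,
                                     const char *name, size_t len)
        {
                struct bch_str_hash_ctx ctx;

                /* Seeds from info->crc_key or info->siphash_key, per info->type: */
                bch2_str_hash_init(&ctx, info);
                bch2_str_hash_update(&ctx, info, name, len);

                /* crc64/siphash results are right-shifted by one bit above: */
                return bch2_str_hash_end(&ctx, info);
        }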
*info, ++ u64 inode, const void *key) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_key(info, key)), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type != desc.key_type) ++ return iter; ++ } ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ bch2_trans_iter_put(trans, iter); ++ ++ return ERR_PTR(ret ?: -ENOSPC); ++} ++ ++static __always_inline ++int bch2_hash_needs_whiteout(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *start) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_copy_iter(trans, start); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ bch2_btree_iter_next_slot(iter); ++ ++ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->type != desc.key_type && ++ k.k->type != KEY_TYPE_whiteout) ++ break; ++ ++ if (k.k->type == desc.key_type && ++ desc.hash_bkey(info, k) <= start->pos.offset) { ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static __always_inline ++int bch2_hash_set(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, struct bkey_i *insert, int flags) ++{ ++ struct btree_iter *iter, *slot = NULL; ++ struct bkey_s_c k; ++ bool found = false; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type == desc.key_type) { ++ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) ++ goto found; ++ ++ /* hash collision: */ ++ continue; ++ } ++ ++ if (!slot && ++ !(flags & BCH_HASH_SET_MUST_REPLACE)) { ++ slot = bch2_trans_copy_iter(trans, iter); ++ if (IS_ERR(slot)) ++ return PTR_ERR(slot); ++ } ++ ++ if (k.k->type != KEY_TYPE_whiteout) ++ goto not_found; ++ } ++ ++ if (!ret) ++ ret = -ENOSPC; ++out: ++ bch2_trans_iter_put(trans, slot); ++ bch2_trans_iter_put(trans, iter); ++ ++ return ret; ++found: ++ found = true; ++not_found: ++ ++ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { ++ ret = -ENOENT; ++ } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ++ ret = -EEXIST; ++ } else { ++ if (!found && slot) ++ swap(iter, slot); ++ ++ insert->k.p = iter->pos; ++ bch2_trans_update(trans, iter, insert, 0); ++ } ++ ++ goto out; ++} ++ ++static __always_inline ++int bch2_hash_delete_at(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *iter) ++{ ++ struct bkey_i *delete; ++ int ret; ++ ++ ret = bch2_hash_needs_whiteout(trans, desc, info, iter); ++ if (ret < 0) ++ return ret; ++ ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ if (IS_ERR(delete)) ++ return PTR_ERR(delete); ++ ++ bkey_init(&delete->k); ++ delete->k.p = iter->pos; ++ delete->k.type = ret ? 
KEY_TYPE_whiteout : KEY_TYPE_deleted; ++ ++ bch2_trans_update(trans, iter, delete, 0); ++ return 0; ++} ++ ++static __always_inline ++int bch2_hash_delete(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, const void *key) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_hash_lookup(trans, desc, info, inode, key, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ ret = bch2_hash_delete_at(trans, desc, info, iter); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++#endif /* _BCACHEFS_STR_HASH_H */ +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +new file mode 100644 +index 000000000000..cee6cc938734 +--- /dev/null ++++ b/fs/bcachefs/super-io.c +@@ -0,0 +1,1158 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_seq_blacklist.h" ++#include "replicas.h" ++#include "quota.h" ++#include "super-io.h" ++#include "super.h" ++#include "vstructs.h" ++ ++#include ++#include ++ ++const char * const bch2_sb_fields[] = { ++#define x(name, nr) #name, ++ BCH_SB_FIELDS() ++#undef x ++ NULL ++}; ++ ++static const char *bch2_sb_field_validate(struct bch_sb *, ++ struct bch_sb_field *); ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f; ++ ++ /* XXX: need locking around superblock to access optional fields */ ++ ++ vstruct_for_each(sb, f) ++ if (le32_to_cpu(f->type) == type) ++ return f; ++ return NULL; ++} ++ ++static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, ++ struct bch_sb_field *f, ++ unsigned u64s) ++{ ++ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; ++ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; ++ ++ BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > ++ sb->page_order); ++ ++ if (!f && !u64s) { ++ /* nothing to do: */ ++ } else if (!f) { ++ f = vstruct_last(sb->sb); ++ memset(f, 0, sizeof(u64) * u64s); ++ f->u64s = cpu_to_le32(u64s); ++ f->type = 0; ++ } else { ++ void *src, *dst; ++ ++ src = vstruct_end(f); ++ ++ if (u64s) { ++ f->u64s = cpu_to_le32(u64s); ++ dst = vstruct_end(f); ++ } else { ++ dst = f; ++ } ++ ++ memmove(dst, src, vstruct_end(sb->sb) - src); ++ ++ if (dst > src) ++ memset(src, 0, dst - src); ++ } ++ ++ sb->sb->u64s = cpu_to_le32(sb_u64s); ++ ++ return u64s ? 
f : NULL; ++} ++ ++void bch2_sb_field_delete(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ++ if (f) ++ __bch2_sb_field_resize(sb, f, 0); ++} ++ ++/* Superblock realloc/free: */ ++ ++void bch2_free_super(struct bch_sb_handle *sb) ++{ ++ if (sb->bio) ++ bio_put(sb->bio); ++ if (!IS_ERR_OR_NULL(sb->bdev)) ++ blkdev_put(sb->bdev, sb->mode); ++ ++ free_pages((unsigned long) sb->sb, sb->page_order); ++ memset(sb, 0, sizeof(*sb)); ++} ++ ++int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) ++{ ++ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); ++ unsigned order = get_order(new_bytes); ++ struct bch_sb *new_sb; ++ struct bio *bio; ++ ++ if (sb->sb && sb->page_order >= order) ++ return 0; ++ ++ if (sb->have_layout) { ++ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; ++ ++ if (new_bytes > max_bytes) { ++ char buf[BDEVNAME_SIZE]; ++ ++ pr_err("%s: superblock too big: want %zu but have %llu", ++ bdevname(sb->bdev, buf), new_bytes, max_bytes); ++ return -ENOSPC; ++ } ++ } ++ ++ if (sb->page_order >= order && sb->sb) ++ return 0; ++ ++ if (dynamic_fault("bcachefs:add:super_realloc")) ++ return -ENOMEM; ++ ++ if (sb->have_bio) { ++ bio = bio_kmalloc(GFP_KERNEL, 1 << order); ++ if (!bio) ++ return -ENOMEM; ++ ++ if (sb->bio) ++ bio_put(sb->bio); ++ sb->bio = bio; ++ } ++ ++ new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); ++ if (!new_sb) ++ return -ENOMEM; ++ ++ if (sb->sb) ++ memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); ++ ++ free_pages((unsigned long) sb->sb, sb->page_order); ++ sb->sb = new_sb; ++ ++ sb->page_order = order; ++ ++ return 0; ++} ++ ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type, ++ unsigned u64s) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; ++ ssize_t d = -old_u64s + u64s; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) ++ return NULL; ++ ++ if (sb->fs_sb) { ++ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ /* XXX: we're not checking that offline device have enough space */ ++ ++ for_each_online_member(ca, c, i) { ++ struct bch_sb_handle *sb = &ca->disk_sb; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { ++ percpu_ref_put(&ca->ref); ++ return NULL; ++ } ++ } ++ } ++ ++ f = bch2_sb_field_get(sb->sb, type); ++ f = __bch2_sb_field_resize(sb, f, u64s); ++ if (f) ++ f->type = cpu_to_le32(type); ++ return f; ++} ++ ++/* Superblock validate: */ ++ ++static inline void __bch2_sb_layout_size_assert(void) ++{ ++ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); ++} ++ ++static const char *validate_sb_layout(struct bch_sb_layout *layout) ++{ ++ u64 offset, prev_offset, max_sectors; ++ unsigned i; ++ ++ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) ++ return "Not a bcachefs superblock layout"; ++ ++ if (layout->layout_type != 0) ++ return "Invalid superblock layout type"; ++ ++ if (!layout->nr_superblocks) ++ return "Invalid superblock layout: no superblocks"; ++ ++ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) ++ return "Invalid superblock layout: too many superblocks"; ++ ++ max_sectors = 1 << layout->sb_max_size_bits; ++ ++ prev_offset = le64_to_cpu(layout->sb_offset[0]); ++ ++ for (i = 1; i < layout->nr_superblocks; i++) { ++ offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset < prev_offset + max_sectors) ++ return "Invalid superblock layout: superblocks overlap"; ++ prev_offset = offset; ++ } ++ ++ return NULL; ++} ++ ++const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) ++{ ++ struct bch_sb *sb = disk_sb->sb; ++ struct bch_sb_field *f; ++ struct bch_sb_field_members *mi; ++ const char *err; ++ u32 version, version_min; ++ u16 block_size; ++ ++ version = le16_to_cpu(sb->version); ++ version_min = version >= bcachefs_metadata_version_new_versioning ++ ? 
le16_to_cpu(sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max || ++ version_min < bcachefs_metadata_version_min) ++ return "Unsupported superblock version"; ++ ++ if (version_min > version) ++ return "Bad minimum version"; ++ ++ if (sb->features[1] || ++ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) ++ return "Filesystem has incompatible features"; ++ ++ block_size = le16_to_cpu(sb->block_size); ++ ++ if (!is_power_of_2(block_size) || ++ block_size > PAGE_SECTORS) ++ return "Bad block size"; ++ ++ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) ++ return "Bad user UUID"; ++ ++ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) ++ return "Bad internal UUID"; ++ ++ if (!sb->nr_devices || ++ sb->nr_devices <= sb->dev_idx || ++ sb->nr_devices > BCH_SB_MEMBERS_MAX) ++ return "Bad number of member devices"; ++ ++ if (!BCH_SB_META_REPLICAS_WANT(sb) || ++ BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of metadata replicas"; ++ ++ if (!BCH_SB_META_REPLICAS_REQ(sb) || ++ BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of metadata replicas"; ++ ++ if (!BCH_SB_DATA_REPLICAS_WANT(sb) || ++ BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of data replicas"; ++ ++ if (!BCH_SB_DATA_REPLICAS_REQ(sb) || ++ BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of data replicas"; ++ ++ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) ++ return "Invalid metadata checksum type"; ++ ++ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) ++ return "Invalid metadata checksum type"; ++ ++ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) ++ return "Invalid compression type"; ++ ++ if (!BCH_SB_BTREE_NODE_SIZE(sb)) ++ return "Btree node size not set"; ++ ++ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) ++ return "Btree node size not a power of two"; ++ ++ if (BCH_SB_GC_RESERVE(sb) < 5) ++ return "gc reserve percentage too small"; ++ ++ if (!sb->time_precision || ++ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) ++ return "invalid time precision"; ++ ++ /* validate layout */ ++ err = validate_sb_layout(&sb->layout); ++ if (err) ++ return err; ++ ++ vstruct_for_each(sb, f) { ++ if (!f->u64s) ++ return "Invalid superblock: invalid optional field"; ++ ++ if (vstruct_next(f) > vstruct_last(sb)) ++ return "Invalid superblock: invalid optional field"; ++ } ++ ++ /* members must be validated first: */ ++ mi = bch2_sb_get_members(sb); ++ if (!mi) ++ return "Invalid superblock: member info area missing"; ++ ++ err = bch2_sb_field_validate(sb, &mi->field); ++ if (err) ++ return err; ++ ++ vstruct_for_each(sb, f) { ++ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) ++ continue; ++ ++ err = bch2_sb_field_validate(sb, f); ++ if (err) ++ return err; ++ } ++ ++ return NULL; ++} ++ ++/* device open: */ ++ ++static void bch2_sb_update(struct bch_fs *c) ++{ ++ struct bch_sb *src = c->disk_sb.sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(src); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ c->sb.uuid = src->uuid; ++ c->sb.user_uuid = src->user_uuid; ++ c->sb.version = le16_to_cpu(src->version); ++ c->sb.nr_devices = src->nr_devices; ++ c->sb.clean = BCH_SB_CLEAN(src); ++ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); ++ c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); ++ c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); ++ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); ++ 
c->sb.time_precision = le32_to_cpu(src->time_precision); ++ c->sb.features = le64_to_cpu(src->features[0]); ++ c->sb.compat = le64_to_cpu(src->compat[0]); ++ ++ for_each_member_device(ca, c, i) ++ ca->mi = bch2_mi_to_cpu(mi->members + i); ++} ++ ++/* doesn't copy member info */ ++static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) ++{ ++ struct bch_sb_field *src_f, *dst_f; ++ struct bch_sb *dst = dst_handle->sb; ++ unsigned i; ++ ++ dst->version = src->version; ++ dst->version_min = src->version_min; ++ dst->seq = src->seq; ++ dst->uuid = src->uuid; ++ dst->user_uuid = src->user_uuid; ++ memcpy(dst->label, src->label, sizeof(dst->label)); ++ ++ dst->block_size = src->block_size; ++ dst->nr_devices = src->nr_devices; ++ ++ dst->time_base_lo = src->time_base_lo; ++ dst->time_base_hi = src->time_base_hi; ++ dst->time_precision = src->time_precision; ++ ++ memcpy(dst->flags, src->flags, sizeof(dst->flags)); ++ memcpy(dst->features, src->features, sizeof(dst->features)); ++ memcpy(dst->compat, src->compat, sizeof(dst->compat)); ++ ++ for (i = 0; i < BCH_SB_FIELD_NR; i++) { ++ if (i == BCH_SB_FIELD_journal) ++ continue; ++ ++ src_f = bch2_sb_field_get(src, i); ++ dst_f = bch2_sb_field_get(dst, i); ++ dst_f = __bch2_sb_field_resize(dst_handle, dst_f, ++ src_f ? le32_to_cpu(src_f->u64s) : 0); ++ ++ if (src_f) ++ memcpy(dst_f, src_f, vstruct_bytes(src_f)); ++ } ++} ++ ++int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) ++{ ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(src); ++ unsigned journal_u64s = journal_buckets ++ ? le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ int ret; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ ret = bch2_sb_realloc(&c->disk_sb, ++ le32_to_cpu(src->u64s) - journal_u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&c->disk_sb, src); ++ ++ ret = bch2_sb_replicas_to_cpu_replicas(c); ++ if (ret) ++ return ret; ++ ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ return ret; ++ ++ bch2_sb_update(c); ++ return 0; ++} ++ ++int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(dst); ++ unsigned journal_u64s = journal_buckets ++ ? 
le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; ++ int ret; ++ ++ ret = bch2_sb_realloc(&ca->disk_sb, u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&ca->disk_sb, src); ++ return 0; ++} ++ ++/* read superblock: */ ++ ++static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) ++{ ++ struct bch_csum csum; ++ size_t bytes; ++reread: ++ bio_reset(sb->bio); ++ bio_set_dev(sb->bio, sb->bdev); ++ sb->bio->bi_iter.bi_sector = offset; ++ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order); ++ ++ if (submit_bio_wait(sb->bio)) ++ return "IO error"; ++ ++ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) ++ return "Not a bcachefs superblock"; ++ ++ if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || ++ le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) ++ return "Unsupported superblock version"; ++ ++ bytes = vstruct_bytes(sb->sb); ++ ++ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) ++ return "Bad superblock: too big"; ++ ++ if (get_order(bytes) > sb->page_order) { ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) ++ return "cannot allocate memory"; ++ goto reread; ++ } ++ ++ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) ++ return "unknown csum type"; ++ ++ /* XXX: verify MACs */ ++ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), ++ null_nonce(), sb->sb); ++ ++ if (bch2_crc_cmp(csum, sb->sb->csum)) ++ return "bad checksum reading superblock"; ++ ++ sb->seq = le64_to_cpu(sb->sb->seq); ++ ++ return NULL; ++} ++ ++int bch2_read_super(const char *path, struct bch_opts *opts, ++ struct bch_sb_handle *sb) ++{ ++ u64 offset = opt_get(*opts, sb); ++ struct bch_sb_layout layout; ++ const char *err; ++ __le64 *i; ++ int ret; ++ ++ pr_verbose_init(*opts, ""); ++ ++ memset(sb, 0, sizeof(*sb)); ++ sb->mode = FMODE_READ; ++ sb->have_bio = true; ++ ++ if (!opt_get(*opts, noexcl)) ++ sb->mode |= FMODE_EXCL; ++ ++ if (!opt_get(*opts, nochanges)) ++ sb->mode |= FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (IS_ERR(sb->bdev) && ++ PTR_ERR(sb->bdev) == -EACCES && ++ opt_get(*opts, read_only)) { ++ sb->mode &= ~FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (!IS_ERR(sb->bdev)) ++ opt_set(*opts, nochanges, true); ++ } ++ ++ if (IS_ERR(sb->bdev)) { ++ ret = PTR_ERR(sb->bdev); ++ goto out; ++ } ++ ++ err = "cannot allocate memory"; ++ ret = bch2_sb_realloc(sb, 0); ++ if (ret) ++ goto err; ++ ++ ret = -EFAULT; ++ err = "dynamic fault"; ++ if (bch2_fs_init_fault("read_super")) ++ goto err; ++ ++ ret = -EINVAL; ++ err = read_one_super(sb, offset); ++ if (!err) ++ goto got_super; ++ ++ if (opt_defined(*opts, sb)) ++ goto err; ++ ++ pr_err("error reading default superblock: %s", err); ++ ++ /* ++ * Error reading primary superblock - read location of backup ++ * superblocks: ++ */ ++ bio_reset(sb->bio); ++ bio_set_dev(sb->bio, sb->bdev); ++ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; ++ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ /* ++ * use sb buffer to read layout, since sb buffer is page aligned but ++ * layout won't be: ++ */ ++ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); ++ ++ err = "IO error"; ++ if (submit_bio_wait(sb->bio)) ++ goto err; ++ ++ memcpy(&layout, sb->sb, sizeof(layout)); ++ err = validate_sb_layout(&layout); ++ if (err) ++ goto err; ++ ++ for (i = layout.sb_offset; ++ i < layout.sb_offset + layout.nr_superblocks; i++) { ++ 
offset = le64_to_cpu(*i); ++ ++ if (offset == opt_get(*opts, sb)) ++ continue; ++ ++ err = read_one_super(sb, offset); ++ if (!err) ++ goto got_super; ++ } ++ ++ ret = -EINVAL; ++ goto err; ++ ++got_super: ++ err = "Superblock block size smaller than device block size"; ++ ret = -EINVAL; ++ if (le16_to_cpu(sb->sb->block_size) << 9 < ++ bdev_logical_block_size(sb->bdev)) ++ goto err; ++ ++ if (sb->mode & FMODE_WRITE) ++ bdev_get_queue(sb->bdev)->backing_dev_info->capabilities ++ |= BDI_CAP_STABLE_WRITES; ++ ret = 0; ++ sb->have_layout = true; ++out: ++ pr_verbose_init(*opts, "ret %i", ret); ++ return ret; ++err: ++ bch2_free_super(sb); ++ pr_err("error reading superblock: %s", err); ++ goto out; ++} ++ ++/* write superblock: */ ++ ++static void write_super_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ ++ /* XXX: return errors directly */ ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", ++ bch2_blk_status_to_str(bio->bi_status))) ++ ca->sb_write_error = 1; ++ ++ closure_put(&ca->fs->sb_write); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++static void read_back_super(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ sb->offset = sb->layout.sb_offset[idx]; ++ ++ SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); ++ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), ++ null_nonce(), sb); ++ ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); ++ bch2_bio_map(bio, sb, ++ roundup((size_t) vstruct_bytes(sb), ++ bdev_logical_block_size(ca->disk_sb.bdev))); ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++int bch2_write_super(struct bch_fs *c) ++{ ++ struct closure *cl = &c->sb_write; ++ struct bch_dev *ca; ++ unsigned i, sb = 0, nr_wrote; ++ const char *err; ++ struct bch_devs_mask sb_written; ++ bool wrote, can_mount_without_written, can_mount_with_written; ++ int ret = 0; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ closure_init_stack(cl); ++ memset(&sb_written, 0, sizeof(sb_written)); ++ ++ le64_add_cpu(&c->disk_sb.sb->seq, 1); ++ ++ if (test_bit(BCH_FS_ERROR, &c->flags)) ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ for_each_online_member(ca, c, i) { ++ err = bch2_sb_validate(&ca->disk_sb); ++ if (err) { ++ bch2_fs_inconsistent(c, "sb invalid before write: %s", err); ++ ret = -1; ++ goto out; ++ } ++ } ++ ++ if (c->opts.nochanges) ++ goto out; ++ ++ for_each_online_member(ca, c, i) { ++ __set_bit(ca->dev_idx, sb_written.d); ++ ca->sb_write_error = 0; ++ } ++ ++ for_each_online_member(ca, c, i) 
++ read_back_super(c, ca); ++ closure_sync(cl); ++ ++ for_each_online_member(ca, c, i) { ++ if (!ca->sb_write_error && ++ ca->disk_sb.seq != ++ le64_to_cpu(ca->sb_read_scratch->seq)) { ++ bch2_fs_fatal_error(c, ++ "Superblock modified by another process"); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ } ++ ++ do { ++ wrote = false; ++ for_each_online_member(ca, c, i) ++ if (!ca->sb_write_error && ++ sb < ca->disk_sb.sb->layout.nr_superblocks) { ++ write_one_super(c, ca, sb); ++ wrote = true; ++ } ++ closure_sync(cl); ++ sb++; ++ } while (wrote); ++ ++ for_each_online_member(ca, c, i) { ++ if (ca->sb_write_error) ++ __clear_bit(ca->dev_idx, sb_written.d); ++ else ++ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); ++ } ++ ++ nr_wrote = dev_mask_nr(&sb_written); ++ ++ can_mount_with_written = ++ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), ++ BCH_FORCE_IF_DEGRADED); ++ ++ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) ++ sb_written.d[i] = ~sb_written.d[i]; ++ ++ can_mount_without_written = ++ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), ++ BCH_FORCE_IF_DEGRADED); ++ ++ /* ++ * If we would be able to mount _without_ the devices we successfully ++ * wrote superblocks to, we weren't able to write to enough devices: ++ * ++ * Exception: if we can mount without the successes because we haven't ++ * written anything (new filesystem), we continue if we'd be able to ++ * mount with the devices we did successfully write to: ++ */ ++ if (bch2_fs_fatal_err_on(!nr_wrote || ++ (can_mount_without_written && ++ !can_mount_with_written), c, ++ "Unable to write superblock to sufficient devices")) ++ ret = -1; ++out: ++ /* Make new options visible after they're persistent: */ ++ bch2_sb_update(c); ++ return ret; ++} ++ ++void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ mutex_lock(&c->sb_lock); ++ if (!(c->sb.features & (1ULL << feat))) { ++ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); ++ ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++} ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static int u64_cmp(const void *_l, const void *_r) ++{ ++ u64 l = *((const u64 *) _l), r = *((const u64 *) _r); ++ ++ return l < r ? -1 : l > r ? 
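The "Unable to write superblock to sufficient devices" condition above is subtle enough to be worth restating; the following is an illustrative reduction only (not part of the patch), with an invented helper name:

        static bool example_sb_write_sufficient(unsigned nr_wrote,
                                                bool can_mount_with_written,
                                                bool can_mount_without_written)
        {
                /* Writing no superblocks at all is always a failure. */
                if (!nr_wrote)
                        return false;
                /*
                 * If the devices we failed to reach would suffice to mount on
                 * their own, the devices we did reach must also form a
                 * mountable set -- otherwise a writable copy was missed.
                 */
                if (can_mount_without_written && !can_mount_with_written)
                        return false;
                return true;
        }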
1 : 0; ++} ++ ++static const char *bch2_sb_validate_journal(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ const char *err; ++ unsigned nr; ++ unsigned i; ++ u64 *b; ++ ++ journal = bch2_sb_get_journal(sb); ++ if (!journal) ++ return NULL; ++ ++ nr = bch2_nr_journal_buckets(journal); ++ if (!nr) ++ return NULL; ++ ++ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ if (!b) ++ return "cannot allocate memory"; ++ ++ for (i = 0; i < nr; i++) ++ b[i] = le64_to_cpu(journal->buckets[i]); ++ ++ sort(b, nr, sizeof(u64), u64_cmp, NULL); ++ ++ err = "journal bucket at sector 0"; ++ if (!b[0]) ++ goto err; ++ ++ err = "journal bucket before first bucket"; ++ if (m && b[0] < le16_to_cpu(m->first_bucket)) ++ goto err; ++ ++ err = "journal bucket past end of device"; ++ if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) ++ goto err; ++ ++ err = "duplicate journal buckets"; ++ for (i = 0; i + 1 < nr; i++) ++ if (b[i] == b[i + 1]) ++ goto err; ++ ++ err = NULL; ++err: ++ kfree(b); ++ return err; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_journal = { ++ .validate = bch2_sb_validate_journal, ++}; ++ ++/* BCH_SB_FIELD_members: */ ++ ++static const char *bch2_sb_validate_members(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_member *m; ++ ++ if ((void *) (mi->members + sb->nr_devices) > ++ vstruct_end(&mi->field)) ++ return "Invalid superblock: bad member info"; ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) { ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) ++ return "Too many buckets"; ++ ++ if (le64_to_cpu(m->nbuckets) - ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) ++ return "Not enough buckets"; ++ ++ if (le16_to_cpu(m->bucket_size) < ++ le16_to_cpu(sb->block_size)) ++ return "bucket size smaller than block size"; ++ ++ if (le16_to_cpu(m->bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(sb)) ++ return "bucket size smaller than btree node size"; ++ } ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_members = { ++ .validate = bch2_sb_validate_members, ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++static const char *bch2_sb_validate_crypt(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) ++ return "invalid field crypt: wrong size"; ++ ++ if (BCH_CRYPT_KDF_TYPE(crypt)) ++ return "invalid field crypt: bad kdf type"; ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { ++ .validate = bch2_sb_validate_crypt, ++}; ++ ++/* BCH_SB_FIELD_clean: */ ++ ++void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) ++{ ++ struct jset_entry *entry; ++ ++ for (entry = clean->start; ++ entry < (struct jset_entry *) vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) ++ bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); ++} ++ ++int bch2_fs_mark_dirty(struct bch_fs *c) ++{ ++ int ret; ++ ++ /* ++ * Unconditionally write superblock, to verify it hasn't changed before ++ * we go rw: ++ */ ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; ++ c->disk_sb.sb->features[0] |= 1ULL << 
BCH_FEATURE_extents_above_btree_updates; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; ++ ret = bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static void ++entry_init_u64s(struct jset_entry *entry, unsigned u64s) ++{ ++ memset(entry, 0, u64s * sizeof(u64)); ++ ++ /* ++ * The u64s field counts from the start of data, ignoring the shared ++ * fields. ++ */ ++ entry->u64s = u64s - 1; ++} ++ ++static void ++entry_init_size(struct jset_entry *entry, size_t size) ++{ ++ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); ++ entry_init_u64s(entry, u64s); ++} ++ ++struct jset_entry * ++bch2_journal_super_entries_add_common(struct bch_fs *c, ++ struct jset_entry *entry, ++ u64 journal_seq) ++{ ++ unsigned i; ++ ++ percpu_down_write(&c->mark_lock); ++ ++ if (!journal_seq) { ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ } else { ++ bch2_fs_usage_acc_to_base(c, journal_seq & 1); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_INODES; ++ u->v = cpu_to_le64(c->usage_base->nr_inodes); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_KEY_VERSION; ++ u->v = cpu_to_le64(atomic64_read(&c->key_version)); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_RESERVED; ++ u->entry.level = i; ++ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u) + e->nr_devs); ++ u->entry.type = BCH_JSET_ENTRY_data_usage; ++ u->v = cpu_to_le64(c->usage_base->replicas[i]); ++ memcpy(&u->r, e, replicas_entry_bytes(e)); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return entry; ++} ++ ++void bch2_fs_mark_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *sb_clean; ++ struct jset_entry *entry; ++ unsigned u64s; ++ ++ mutex_lock(&c->sb_lock); ++ if (BCH_SB_CLEAN(c->disk_sb.sb)) ++ goto out; ++ ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); ++ ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; ++ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); ++ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); ++ ++ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; ++ ++ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); ++ if (!sb_clean) { ++ bch_err(c, "error resizing superblock while setting filesystem clean"); ++ goto out; ++ } ++ ++ sb_clean->flags = 0; ++ sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); ++ sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); ++ sb_clean->journal_seq = 
cpu_to_le64(journal_cur_seq(&c->journal) - 1); ++ ++ /* Trying to catch outstanding bug: */ ++ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); ++ ++ entry = sb_clean->start; ++ entry = bch2_journal_super_entries_add_common(c, entry, 0); ++ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); ++ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); ++ ++ memset(entry, 0, ++ vstruct_end(&sb_clean->field) - (void *) entry); ++ ++ if (le16_to_cpu(c->disk_sb.sb->version) < ++ bcachefs_metadata_version_bkey_renumber) ++ bch2_sb_clean_renumber(sb_clean, WRITE); ++ ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++} ++ ++static const char *bch2_sb_validate_clean(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) ++ return "invalid field crypt: wrong size"; ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_clean = { ++ .validate = bch2_sb_validate_clean, ++}; ++ ++static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { ++#define x(f, nr) \ ++ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, ++ BCH_SB_FIELDS() ++#undef x ++}; ++ ++static const char *bch2_sb_field_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ ++ return type < BCH_SB_FIELD_NR ++ ? bch2_sb_field_ops[type]->validate(sb, f) ++ : NULL; ++} ++ ++void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR ++ ? bch2_sb_field_ops[type] : NULL; ++ ++ if (ops) ++ pr_buf(out, "%s", bch2_sb_fields[type]); ++ else ++ pr_buf(out, "(unknown field %u)", type); ++ ++ pr_buf(out, " (size %llu):", vstruct_bytes(f)); ++ ++ if (ops && ops->to_text) ++ bch2_sb_field_ops[type]->to_text(out, sb, f); ++} +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +new file mode 100644 +index 000000000000..7a068158efca +--- /dev/null ++++ b/fs/bcachefs/super-io.h +@@ -0,0 +1,137 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_IO_H ++#define _BCACHEFS_SUPER_IO_H ++ ++#include "extents.h" ++#include "eytzinger.h" ++#include "super_types.h" ++#include "super.h" ++ ++#include ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, ++ enum bch_sb_field_type, unsigned); ++void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); ++ ++#define field_to_type(_f, _name) \ ++ container_of_or_null(_f, struct bch_sb_field_##_name, field) ++ ++#define x(_name, _nr) \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_get_##_name(struct bch_sb *sb) \ ++{ \ ++ return field_to_type(bch2_sb_field_get(sb, \ ++ BCH_SB_FIELD_##_name), _name); \ ++} \ ++ \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ ++{ \ ++ return field_to_type(bch2_sb_field_resize(sb, \ ++ BCH_SB_FIELD_##_name, u64s), _name); \ ++} ++ ++BCH_SB_FIELDS() ++#undef x ++ ++extern const char * const bch2_sb_fields[]; ++ ++struct bch_sb_field_ops { ++ const char * (*validate)(struct bch_sb *, struct bch_sb_field *); ++ void (*to_text)(struct printbuf *, struct bch_sb *, ++ struct bch_sb_field *); ++}; ++ ++static inline __le64 bch2_sb_magic(struct bch_fs *c) ++{ ++ __le64 ret; ++ memcpy(&ret, &c->sb.uuid, sizeof(ret)); ++ return 
ret; ++} ++ ++static inline __u64 jset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); ++} ++ ++static inline __u64 bset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); ++} ++ ++int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); ++int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); ++ ++void bch2_free_super(struct bch_sb_handle *); ++int bch2_sb_realloc(struct bch_sb_handle *, unsigned); ++ ++const char *bch2_sb_validate(struct bch_sb_handle *); ++ ++int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); ++int bch2_write_super(struct bch_fs *); ++void __bch2_check_set_feature(struct bch_fs *, unsigned); ++ ++static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ if (!(c->sb.features & (1ULL << feat))) ++ __bch2_check_set_feature(c, feat); ++} ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) ++{ ++ return j ++ ? (__le64 *) vstruct_end(&j->field) - j->buckets ++ : 0; ++} ++ ++/* BCH_SB_FIELD_members: */ ++ ++static inline bool bch2_member_exists(struct bch_member *m) ++{ ++ return !bch2_is_zero(m->uuid.b, sizeof(uuid_le)); ++} ++ ++static inline bool bch2_dev_exists(struct bch_sb *sb, ++ struct bch_sb_field_members *mi, ++ unsigned dev) ++{ ++ return dev < sb->nr_devices && ++ bch2_member_exists(&mi->members[dev]); ++} ++ ++static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ++{ ++ return (struct bch_member_cpu) { ++ .nbuckets = le64_to_cpu(mi->nbuckets), ++ .first_bucket = le16_to_cpu(mi->first_bucket), ++ .bucket_size = le16_to_cpu(mi->bucket_size), ++ .group = BCH_MEMBER_GROUP(mi), ++ .state = BCH_MEMBER_STATE(mi), ++ .replacement = BCH_MEMBER_REPLACEMENT(mi), ++ .discard = BCH_MEMBER_DISCARD(mi), ++ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), ++ .durability = BCH_MEMBER_DURABILITY(mi) ++ ? BCH_MEMBER_DURABILITY(mi) - 1 ++ : 1, ++ .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), ++ }; ++} ++ ++/* BCH_SB_FIELD_clean: */ ++ ++struct jset_entry * ++bch2_journal_super_entries_add_common(struct bch_fs *, ++ struct jset_entry *, u64); ++ ++void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); ++ ++int bch2_fs_mark_dirty(struct bch_fs *); ++void bch2_fs_mark_clean(struct bch_fs *); ++ ++void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, ++ struct bch_sb_field *); ++ ++#endif /* _BCACHEFS_SUPER_IO_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +new file mode 100644 +index 000000000000..30be083b09bf +--- /dev/null ++++ b/fs/bcachefs/super.c +@@ -0,0 +1,2062 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs setup/teardown code, and some metadata io - read a superblock and ++ * figure out what to do with it. ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_key_cache.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "chardev.h" ++#include "checksum.h" ++#include "clock.h" ++#include "compress.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "move.h" ++#include "migrate.h" ++#include "movinggc.h" ++#include "quota.h" ++#include "rebalance.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++#include "sysfs.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Kent Overstreet "); ++ ++#define KTYPE(type) \ ++struct kobj_type type ## _ktype = { \ ++ .release = type ## _release, \ ++ .sysfs_ops = &type ## _sysfs_ops, \ ++ .default_attrs = type ## _files \ ++} ++ ++static void bch2_fs_release(struct kobject *); ++static void bch2_dev_release(struct kobject *); ++ ++static void bch2_fs_internal_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_opts_dir_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_time_stats_release(struct kobject *k) ++{ ++} ++ ++static KTYPE(bch2_fs); ++static KTYPE(bch2_fs_internal); ++static KTYPE(bch2_fs_opts_dir); ++static KTYPE(bch2_fs_time_stats); ++static KTYPE(bch2_dev); ++ ++static struct kset *bcachefs_kset; ++static LIST_HEAD(bch_fs_list); ++static DEFINE_MUTEX(bch_fs_list_lock); ++ ++static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); ++ ++static void bch2_dev_free(struct bch_dev *); ++static int bch2_dev_alloc(struct bch_fs *, unsigned); ++static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); ++static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); ++ ++struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&bch_fs_list_lock); ++ rcu_read_lock(); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ for_each_member_device_rcu(ca, c, i, NULL) ++ if (ca->disk_sb.bdev == bdev) { ++ closure_get(&c->cl); ++ goto found; ++ } ++ c = NULL; ++found: ++ rcu_read_unlock(); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) ++ return c; ++ ++ return NULL; ++} ++ ++struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ mutex_lock(&bch_fs_list_lock); ++ c = __bch2_uuid_to_fs(uuid); ++ if (c) ++ closure_get(&c->cl); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++int bch2_congested(void *data, int bdi_bits) ++{ ++ struct bch_fs *c = data; ++ struct backing_dev_info *bdi; ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ rcu_read_lock(); ++ if (bdi_bits & (1 << WB_sync_congested)) { ++ /* Reads - check all devices: */ ++ for_each_readable_member(ca, c, i) { ++ bdi = ca->disk_sb.bdev->bd_bdi; ++ ++ if (bdi_congested(bdi, bdi_bits)) { ++ ret = 1; ++ break; ++ } ++ } ++ } else { ++ const 
struct bch_devs_mask *devs = ++ bch2_target_to_mask(c, c->opts.foreground_target) ?: ++ &c->rw_devs[BCH_DATA_user]; ++ ++ for_each_member_device_rcu(ca, c, i, devs) { ++ bdi = ca->disk_sb.bdev->bd_bdi; ++ ++ if (bdi_congested(bdi, bdi_bits)) { ++ ret = 1; ++ break; ++ } ++ } ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++/* Filesystem RO/RW: */ ++ ++/* ++ * For startup/shutdown of RW stuff, the dependencies are: ++ * ++ * - foreground writes depend on copygc and rebalance (to free up space) ++ * ++ * - copygc and rebalance depend on mark and sweep gc (they actually probably ++ * don't because they either reserve ahead of time or don't block if ++ * allocations fail, but allocations can require mark and sweep gc to run ++ * because of generation number wraparound) ++ * ++ * - all of the above depends on the allocator threads ++ * ++ * - allocator depends on the journal (when it rewrites prios and gens) ++ */ ++ ++static void __bch2_fs_read_only(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ bool wrote = false; ++ unsigned i, clean_passes = 0; ++ int ret; ++ ++ bch2_rebalance_stop(c); ++ bch2_copygc_stop(c); ++ bch2_gc_thread_stop(c); ++ ++ /* ++ * Flush journal before stopping allocators, because flushing journal ++ * blacklist entries involves allocating new btree nodes: ++ */ ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ /* ++ * If the allocator threads didn't all start up, the btree updates to ++ * write out alloc info aren't going to work: ++ */ ++ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) ++ goto nowrote_alloc; ++ ++ bch_verbose(c, "writing alloc info"); ++ /* ++ * This should normally just be writing the bucket read/write clocks: ++ */ ++ ret = bch2_stripes_write(c, BTREE_INSERT_NOCHECK_RW, &wrote) ?: ++ bch2_alloc_write(c, BTREE_INSERT_NOCHECK_RW, &wrote); ++ bch_verbose(c, "writing alloc info complete"); ++ ++ if (ret && !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) ++ bch2_fs_inconsistent(c, "error writing out alloc info %i", ret); ++ ++ if (ret) ++ goto nowrote_alloc; ++ ++ bch_verbose(c, "flushing journal and stopping allocators"); ++ ++ bch2_journal_flush_all_pins(&c->journal); ++ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); ++ ++ do { ++ clean_passes++; ++ ++ if (bch2_journal_flush_all_pins(&c->journal)) ++ clean_passes = 0; ++ ++ /* ++ * In flight interior btree updates will generate more journal ++ * updates and btree updates (alloc btree): ++ */ ++ if (bch2_btree_interior_updates_nr_pending(c)) { ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ clean_passes = 0; ++ } ++ flush_work(&c->btree_interior_update_work); ++ ++ if (bch2_journal_flush_all_pins(&c->journal)) ++ clean_passes = 0; ++ } while (clean_passes < 2); ++ bch_verbose(c, "flushing journal and stopping allocators complete"); ++ ++ set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); ++nowrote_alloc: ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ flush_work(&c->btree_interior_update_work); ++ ++ for_each_member_device(ca, c, i) ++ bch2_dev_allocator_stop(ca); ++ ++ clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); ++ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); ++ ++ bch2_fs_journal_stop(&c->journal); ++ ++ /* ++ * the journal kicks off btree writes via reclaim - wait for in flight ++ * writes after stopping journal: ++ */ ++ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) ++ bch2_btree_flush_all_writes(c); ++ else ++ bch2_btree_verify_flushed(c); ++ ++ /* ++ * After stopping journal: ++ */ ++ 
for_each_member_device(ca, c, i) ++ bch2_dev_allocator_remove(c, ca); ++} ++ ++static void bch2_writes_disabled(struct percpu_ref *writes) ++{ ++ struct bch_fs *c = container_of(writes, struct bch_fs, writes); ++ ++ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ wake_up(&bch_read_only_wait); ++} ++ ++void bch2_fs_read_only(struct bch_fs *c) ++{ ++ if (!test_bit(BCH_FS_RW, &c->flags)) { ++ cancel_delayed_work_sync(&c->journal.reclaim_work); ++ return; ++ } ++ ++ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ /* ++ * Block new foreground-end write operations from starting - any new ++ * writes will return -EROFS: ++ * ++ * (This is really blocking new _allocations_, writes to previously ++ * allocated space can still happen until stopping the allocator in ++ * bch2_dev_allocator_stop()). ++ */ ++ percpu_ref_kill(&c->writes); ++ ++ cancel_work_sync(&c->ec_stripe_delete_work); ++ cancel_delayed_work(&c->pd_controllers_update); ++ ++ /* ++ * If we're not doing an emergency shutdown, we want to wait on ++ * outstanding writes to complete so they don't see spurious errors due ++ * to shutting down the allocator: ++ * ++ * If we are doing an emergency shutdown outstanding writes may ++ * hang until we shutdown the allocator so we don't want to wait ++ * on outstanding writes before shutting everything down - but ++ * we do need to wait on them before returning and signalling ++ * that going RO is complete: ++ */ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || ++ test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); ++ ++ __bch2_fs_read_only(c); ++ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ ++ if (!bch2_journal_error(&c->journal) && ++ !test_bit(BCH_FS_ERROR, &c->flags) && ++ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && ++ test_bit(BCH_FS_STARTED, &c->flags) && ++ test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && ++ !c->opts.norecovery) { ++ bch_verbose(c, "marking filesystem clean"); ++ bch2_fs_mark_clean(c); ++ } ++ ++ clear_bit(BCH_FS_RW, &c->flags); ++} ++ ++static void bch2_fs_read_only_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, read_only_work); ++ ++ down_write(&c->state_lock); ++ bch2_fs_read_only(c); ++ up_write(&c->state_lock); ++} ++ ++static void bch2_fs_read_only_async(struct bch_fs *c) ++{ ++ queue_work(system_long_wq, &c->read_only_work); ++} ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *c) ++{ ++ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); ++ ++ bch2_journal_halt(&c->journal); ++ bch2_fs_read_only_async(c); ++ ++ wake_up(&bch_read_only_wait); ++ return ret; ++} ++ ++static int bch2_fs_read_write_late(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_gc_thread_start(c); ++ if (ret) { ++ bch_err(c, "error starting gc thread"); ++ return ret; ++ } ++ ++ ret = bch2_copygc_start(c); ++ if (ret) { ++ bch_err(c, "error starting copygc thread"); ++ return ret; ++ } ++ ++ ret = bch2_rebalance_start(c); ++ if (ret) { ++ bch_err(c, "error starting rebalance thread"); ++ return ret; ++ } ++ ++ schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); ++ ++ schedule_work(&c->ec_stripe_delete_work); ++ ++ return 0; ++} ++ ++static int __bch2_fs_read_write(struct bch_fs *c, bool early) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ if (test_bit(BCH_FS_RW, &c->flags)) ++ return 0; ++ ++ /* ++ * nochanges is used for fsck -n mode - we have to allow going 
rw ++ * during recovery for that to work: ++ */ ++ if (c->opts.norecovery || ++ (c->opts.nochanges && ++ (!early || c->opts.read_only))) ++ return -EROFS; ++ ++ ret = bch2_fs_mark_dirty(c); ++ if (ret) ++ goto err; ++ ++ /* ++ * We need to write out a journal entry before we start doing btree ++ * updates, to ensure that on unclean shutdown new journal blacklist ++ * entries are created: ++ */ ++ bch2_journal_meta(&c->journal); ++ ++ clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ for_each_rw_member(ca, c, i) { ++ ret = bch2_dev_allocator_start(ca); ++ if (ret) { ++ bch_err(c, "error starting allocator threads"); ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ } ++ ++ set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); ++ ++ if (!early) { ++ ret = bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ percpu_ref_reinit(&c->writes); ++ set_bit(BCH_FS_RW, &c->flags); ++ ++ queue_delayed_work(c->journal_reclaim_wq, ++ &c->journal.reclaim_work, 0); ++ return 0; ++err: ++ __bch2_fs_read_only(c); ++ return ret; ++} ++ ++int bch2_fs_read_write(struct bch_fs *c) ++{ ++ return __bch2_fs_read_write(c, false); ++} ++ ++int bch2_fs_read_write_early(struct bch_fs *c) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ return __bch2_fs_read_write(c, true); ++} ++ ++/* Filesystem startup/shutdown: */ ++ ++static void bch2_fs_free(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_exit(&c->times[i]); ++ ++ bch2_fs_quota_exit(c); ++ bch2_fs_fsio_exit(c); ++ bch2_fs_ec_exit(c); ++ bch2_fs_encryption_exit(c); ++ bch2_fs_io_exit(c); ++ bch2_fs_btree_interior_update_exit(c); ++ bch2_fs_btree_iter_exit(c); ++ bch2_fs_btree_key_cache_exit(&c->btree_key_cache); ++ bch2_fs_btree_cache_exit(c); ++ bch2_fs_journal_exit(&c->journal); ++ bch2_io_clock_exit(&c->io_clock[WRITE]); ++ bch2_io_clock_exit(&c->io_clock[READ]); ++ bch2_fs_compress_exit(c); ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(&c->journal_entries); ++ percpu_free_rwsem(&c->mark_lock); ++ kfree(c->usage_scratch); ++ free_percpu(c->usage[1]); ++ free_percpu(c->usage[0]); ++ kfree(c->usage_base); ++ free_percpu(c->pcpu); ++ mempool_exit(&c->large_bkey_pool); ++ mempool_exit(&c->btree_bounce_pool); ++ bioset_exit(&c->btree_bio); ++ mempool_exit(&c->fill_iter); ++ percpu_ref_exit(&c->writes); ++ kfree(c->replicas.entries); ++ kfree(c->replicas_gc.entries); ++ kfree(rcu_dereference_protected(c->disk_groups, 1)); ++ kfree(c->journal_seq_blacklist_table); ++ free_heap(&c->copygc_heap); ++ ++ if (c->journal_reclaim_wq) ++ destroy_workqueue(c->journal_reclaim_wq); ++ if (c->copygc_wq) ++ destroy_workqueue(c->copygc_wq); ++ if (c->wq) ++ destroy_workqueue(c->wq); ++ ++ free_pages((unsigned long) c->disk_sb.sb, ++ c->disk_sb.page_order); ++ kvpfree(c, sizeof(*c)); ++ module_put(THIS_MODULE); ++} ++ ++static void bch2_fs_release(struct kobject *kobj) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ bch2_fs_free(c); ++} ++ ++void bch2_fs_stop(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ bch_verbose(c, "shutting down"); ++ ++ set_bit(BCH_FS_STOPPING, &c->flags); ++ ++ cancel_work_sync(&c->journal_seq_blacklist_gc_work); ++ ++ down_write(&c->state_lock); ++ bch2_fs_read_only(c); ++ up_write(&c->state_lock); ++ ++ for_each_member_device(ca, c, i) ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ 
sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, ++ "bcachefs"); ++ ++ if (c->kobj.state_in_sysfs) ++ kobject_del(&c->kobj); ++ ++ bch2_fs_debug_exit(c); ++ bch2_fs_chardev_exit(c); ++ ++ kobject_put(&c->time_stats); ++ kobject_put(&c->opts_dir); ++ kobject_put(&c->internal); ++ ++ mutex_lock(&bch_fs_list_lock); ++ list_del(&c->list); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ closure_sync(&c->cl); ++ closure_debug_destroy(&c->cl); ++ ++ /* btree prefetch might have kicked off reads in the background: */ ++ bch2_btree_flush_all_reads(c); ++ ++ for_each_member_device(ca, c, i) ++ cancel_work_sync(&ca->io_error_work); ++ ++ cancel_work_sync(&c->btree_write_error_work); ++ cancel_delayed_work_sync(&c->pd_controllers_update); ++ cancel_work_sync(&c->read_only_work); ++ ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (c->devs[i]) ++ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); ++ ++ bch_verbose(c, "shutdown complete"); ++ ++ kobject_put(&c->kobj); ++} ++ ++static const char *bch2_fs_online(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ const char *err = NULL; ++ unsigned i; ++ int ret; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ if (!list_empty(&c->list)) ++ return NULL; ++ ++ if (__bch2_uuid_to_fs(c->sb.uuid)) ++ return "filesystem UUID already open"; ++ ++ ret = bch2_fs_chardev_init(c); ++ if (ret) ++ return "error creating character device"; ++ ++ bch2_fs_debug_init(c); ++ ++ if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || ++ kobject_add(&c->internal, &c->kobj, "internal") || ++ kobject_add(&c->opts_dir, &c->kobj, "options") || ++ kobject_add(&c->time_stats, &c->kobj, "time_stats") || ++ bch2_opts_create_sysfs_files(&c->opts_dir)) ++ return "error creating sysfs objects"; ++ ++ down_write(&c->state_lock); ++ ++ err = "error creating sysfs objects"; ++ __for_each_member_device(ca, c, i, NULL) ++ if (bch2_dev_sysfs_online(c, ca)) ++ goto err; ++ ++ list_add(&c->list, &bch_fs_list); ++ err = NULL; ++err: ++ up_write(&c->state_lock); ++ return err; ++} ++ ++static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_fs *c; ++ unsigned i, iter_size; ++ const char *err; ++ ++ pr_verbose_init(opts, ""); ++ ++ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); ++ if (!c) ++ goto out; ++ ++ __module_get(THIS_MODULE); ++ ++ c->minor = -1; ++ c->disk_sb.fs_sb = true; ++ ++ init_rwsem(&c->state_lock); ++ mutex_init(&c->sb_lock); ++ mutex_init(&c->replicas_gc_lock); ++ mutex_init(&c->btree_root_lock); ++ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); ++ ++ init_rwsem(&c->gc_lock); ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_init(&c->times[i]); ++ ++ bch2_fs_copygc_init(c); ++ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); ++ bch2_fs_allocator_background_init(c); ++ bch2_fs_allocator_foreground_init(c); ++ bch2_fs_rebalance_init(c); ++ bch2_fs_quota_init(c); ++ ++ INIT_LIST_HEAD(&c->list); ++ ++ mutex_init(&c->usage_scratch_lock); ++ ++ mutex_init(&c->bio_bounce_pages_lock); ++ ++ bio_list_init(&c->btree_write_error_list); ++ spin_lock_init(&c->btree_write_error_lock); ++ INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); ++ ++ INIT_WORK(&c->journal_seq_blacklist_gc_work, ++ bch2_blacklist_entries_gc); ++ ++ INIT_LIST_HEAD(&c->journal_entries); ++ ++ INIT_LIST_HEAD(&c->fsck_errors); ++ mutex_init(&c->fsck_error_lock); ++ ++ INIT_LIST_HEAD(&c->ec_stripe_head_list); ++ mutex_init(&c->ec_stripe_head_lock); ++ ++ 
INIT_LIST_HEAD(&c->ec_stripe_new_list); ++ mutex_init(&c->ec_stripe_new_lock); ++ ++ spin_lock_init(&c->ec_stripes_heap_lock); ++ ++ seqcount_init(&c->gc_pos_lock); ++ ++ seqcount_init(&c->usage_lock); ++ ++ sema_init(&c->io_in_flight, 64); ++ ++ c->copy_gc_enabled = 1; ++ c->rebalance.enabled = 1; ++ c->promote_whole_extents = true; ++ ++ c->journal.write_time = &c->times[BCH_TIME_journal_write]; ++ c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; ++ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; ++ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; ++ ++ bch2_fs_btree_cache_init_early(&c->btree_cache); ++ ++ if (percpu_init_rwsem(&c->mark_lock)) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (bch2_sb_to_fs(c, sb)) { ++ mutex_unlock(&c->sb_lock); ++ goto err; ++ } ++ ++ mutex_unlock(&c->sb_lock); ++ ++ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); ++ ++ c->opts = bch2_opts_default; ++ bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); ++ bch2_opts_apply(&c->opts, opts); ++ ++ c->block_bits = ilog2(c->opts.block_size); ++ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); ++ ++ if (bch2_fs_init_fault("fs_alloc")) ++ goto err; ++ ++ iter_size = sizeof(struct sort_iter) + ++ (btree_blocks(c) + 1) * 2 * ++ sizeof(struct sort_iter_set); ++ ++ if (!(c->wq = alloc_workqueue("bcachefs", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->copygc_wq = alloc_workqueue("bcache_copygc", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || ++ percpu_ref_init(&c->writes, bch2_writes_disabled, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || ++ bioset_init(&c->btree_bio, 1, ++ max(offsetof(struct btree_read_bio, bio), ++ offsetof(struct btree_write_bio, wbio.bio)), ++ BIOSET_NEED_BVECS) || ++ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || ++ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, ++ btree_bytes(c)) || ++ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || ++ bch2_io_clock_init(&c->io_clock[READ]) || ++ bch2_io_clock_init(&c->io_clock[WRITE]) || ++ bch2_fs_journal_init(&c->journal) || ++ bch2_fs_replicas_init(c) || ++ bch2_fs_btree_cache_init(c) || ++ bch2_fs_btree_key_cache_init(&c->btree_key_cache) || ++ bch2_fs_btree_iter_init(c) || ++ bch2_fs_btree_interior_update_init(c) || ++ bch2_fs_io_init(c) || ++ bch2_fs_encryption_init(c) || ++ bch2_fs_compress_init(c) || ++ bch2_fs_ec_init(c) || ++ bch2_fs_fsio_init(c)) ++ goto err; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (bch2_dev_exists(c->disk_sb.sb, mi, i) && ++ bch2_dev_alloc(c, i)) ++ goto err; ++ ++ /* ++ * Now that all allocations have succeeded, init various refcounty ++ * things that let us shutdown: ++ */ ++ closure_init(&c->cl, NULL); ++ ++ c->kobj.kset = bcachefs_kset; ++ kobject_init(&c->kobj, &bch2_fs_ktype); ++ kobject_init(&c->internal, &bch2_fs_internal_ktype); ++ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); ++ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); ++ ++ mutex_lock(&bch_fs_list_lock); ++ err = bch2_fs_online(c); ++ mutex_unlock(&bch_fs_list_lock); ++ if (err) { ++ bch_err(c, "bch2_fs_online() error: %s", err); ++ goto err; ++ } ++out: ++ pr_verbose_init(opts, "ret %i", c ? 
0 : -ENOMEM); ++ return c; ++err: ++ bch2_fs_free(c); ++ c = NULL; ++ goto out; ++} ++ ++noinline_for_stack ++static void print_mount_opts(struct bch_fs *c) ++{ ++ enum bch_opt_id i; ++ char buf[512]; ++ struct printbuf p = PBUF(buf); ++ bool first = true; ++ ++ strcpy(buf, "(null)"); ++ ++ if (c->opts.read_only) { ++ pr_buf(&p, "ro"); ++ first = false; ++ } ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->mode & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ if (!first) ++ pr_buf(&p, ","); ++ first = false; ++ bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); ++ } ++ ++ bch_info(c, "mounted with opts: %s", buf); ++} ++ ++int bch2_fs_start(struct bch_fs *c) ++{ ++ const char *err = "cannot allocate memory"; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ time64_t now = ktime_get_real_seconds(); ++ unsigned i; ++ int ret = -EINVAL; ++ ++ down_write(&c->state_lock); ++ ++ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for_each_online_member(ca, c, i) ++ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ++ ? bch2_fs_recovery(c) ++ : bch2_fs_initialize(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_opts_check_may_set(c); ++ if (ret) ++ goto err; ++ ++ err = "dynamic fault"; ++ ret = -EINVAL; ++ if (bch2_fs_init_fault("fs_start")) ++ goto err; ++ ++ set_bit(BCH_FS_STARTED, &c->flags); ++ ++ if (c->opts.read_only || c->opts.nochanges) { ++ bch2_fs_read_only(c); ++ } else { ++ err = "error going read write"; ++ ret = !test_bit(BCH_FS_RW, &c->flags) ++ ? 
bch2_fs_read_write(c) ++ : bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ print_mount_opts(c); ++ ret = 0; ++out: ++ up_write(&c->state_lock); ++ return ret; ++err: ++ switch (ret) { ++ case BCH_FSCK_ERRORS_NOT_FIXED: ++ bch_err(c, "filesystem contains errors: please report this to the developers"); ++ pr_cont("mount with -o fix_errors to repair\n"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_REPAIR_UNIMPLEMENTED: ++ bch_err(c, "filesystem contains errors: please report this to the developers"); ++ pr_cont("repair unimplemented: inform the developers so that it can be added\n"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_REPAIR_IMPOSSIBLE: ++ bch_err(c, "filesystem contains errors, but repair impossible"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_UNKNOWN_VERSION: ++ err = "unknown metadata version";; ++ break; ++ case -ENOMEM: ++ err = "cannot allocate memory"; ++ break; ++ case -EIO: ++ err = "IO error"; ++ break; ++ } ++ ++ if (ret >= 0) ++ ret = -EIO; ++ goto out; ++} ++ ++static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) ++{ ++ struct bch_sb_field_members *sb_mi; ++ ++ sb_mi = bch2_sb_get_members(sb); ++ if (!sb_mi) ++ return "Invalid superblock: member info area missing"; ++ ++ if (le16_to_cpu(sb->block_size) != c->opts.block_size) ++ return "mismatched block size"; ++ ++ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) ++ return "new cache bucket size is too small"; ++ ++ return NULL; ++} ++ ++static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) ++{ ++ struct bch_sb *newest = ++ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(newest); ++ ++ if (uuid_le_cmp(fs->uuid, sb->uuid)) ++ return "device not a member of filesystem"; ++ ++ if (!bch2_dev_exists(newest, mi, sb->dev_idx)) ++ return "device has been removed"; ++ ++ if (fs->block_size != sb->block_size) ++ return "mismatched block size"; ++ ++ return NULL; ++} ++ ++/* Device startup/shutdown: */ ++ ++static void bch2_dev_release(struct kobject *kobj) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ ++ kfree(ca); ++} ++ ++static void bch2_dev_free(struct bch_dev *ca) ++{ ++ cancel_work_sync(&ca->io_error_work); ++ ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, ++ "bcachefs"); ++ ++ if (ca->kobj.state_in_sysfs) ++ kobject_del(&ca->kobj); ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); ++ ++ free_percpu(ca->io_done); ++ bioset_exit(&ca->replica_set); ++ bch2_dev_buckets_free(ca); ++ free_page((unsigned long) ca->sb_read_scratch); ++ ++ bch2_time_stats_exit(&ca->io_latency[WRITE]); ++ bch2_time_stats_exit(&ca->io_latency[READ]); ++ ++ percpu_ref_exit(&ca->io_ref); ++ percpu_ref_exit(&ca->ref); ++ kobject_put(&ca->kobj); ++} ++ ++static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) ++{ ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (percpu_ref_is_zero(&ca->io_ref)) ++ return; ++ ++ __bch2_dev_read_only(c, ca); ++ ++ reinit_completion(&ca->io_ref_completion); ++ percpu_ref_kill(&ca->io_ref); ++ wait_for_completion(&ca->io_ref_completion); ++ ++ if (ca->kobj.state_in_sysfs) { ++ struct kobject *block = ++ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; ++ ++ sysfs_remove_link(block, "bcachefs"); ++ sysfs_remove_link(&ca->kobj, "block"); ++ } ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); 
++} ++ ++static void bch2_dev_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, ref); ++ ++ complete(&ca->ref_completion); ++} ++ ++static void bch2_dev_io_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); ++ ++ complete(&ca->io_ref_completion); ++} ++ ++static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) ++{ ++ int ret; ++ ++ if (!c->kobj.state_in_sysfs) ++ return 0; ++ ++ if (!ca->kobj.state_in_sysfs) { ++ ret = kobject_add(&ca->kobj, &c->kobj, ++ "dev-%u", ca->dev_idx); ++ if (ret) ++ return ret; ++ } ++ ++ if (ca->disk_sb.bdev) { ++ struct kobject *block = ++ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; ++ ++ ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); ++ if (ret) ++ return ret; ++ ret = sysfs_create_link(&ca->kobj, block, "block"); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ++ struct bch_member *member) ++{ ++ struct bch_dev *ca; ++ ++ ca = kzalloc(sizeof(*ca), GFP_KERNEL); ++ if (!ca) ++ return NULL; ++ ++ kobject_init(&ca->kobj, &bch2_dev_ktype); ++ init_completion(&ca->ref_completion); ++ init_completion(&ca->io_ref_completion); ++ ++ init_rwsem(&ca->bucket_lock); ++ ++ INIT_WORK(&ca->io_error_work, bch2_io_error_work); ++ ++ bch2_time_stats_init(&ca->io_latency[READ]); ++ bch2_time_stats_init(&ca->io_latency[WRITE]); ++ ++ ca->mi = bch2_mi_to_cpu(member); ++ ca->uuid = member->uuid; ++ ++ if (opt_defined(c->opts, discard)) ++ ca->mi.discard = opt_get(c->opts, discard); ++ ++ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, ++ 0, GFP_KERNEL) || ++ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || ++ bch2_dev_buckets_alloc(c, ca) || ++ bioset_init(&ca->replica_set, 4, ++ offsetof(struct bch_write_bio, bio), 0) || ++ !(ca->io_done = alloc_percpu(*ca->io_done))) ++ goto err; ++ ++ return ca; ++err: ++ bch2_dev_free(ca); ++ return NULL; ++} ++ ++static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, ++ unsigned dev_idx) ++{ ++ ca->dev_idx = dev_idx; ++ __set_bit(ca->dev_idx, ca->self.d); ++ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ++ ++ ca->fs = c; ++ rcu_assign_pointer(c->devs[ca->dev_idx], ca); ++ ++ if (bch2_dev_sysfs_online(c, ca)) ++ pr_warn("error creating sysfs objects"); ++} ++ ++static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ++{ ++ struct bch_member *member = ++ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; ++ struct bch_dev *ca = NULL; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bch2_fs_init_fault("dev_alloc")) ++ goto err; ++ ++ ca = __bch2_dev_alloc(c, member); ++ if (!ca) ++ goto err; ++ ++ bch2_dev_attach(c, ca, dev_idx); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ++{ ++ unsigned ret; ++ ++ if (bch2_dev_is_online(ca)) { ++ bch_err(ca, "already have device online in slot %u", ++ sb->sb->dev_idx); ++ return -EINVAL; ++ } ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * ca->mi.nbuckets) { ++ bch_err(ca, "cannot online: device too small"); ++ return -EINVAL; ++ } ++ ++ BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * 
ca->mi.nbuckets) { ++ bch_err(ca, "device too small"); ++ return -EINVAL; ++ } ++ ++ ret = bch2_dev_journal_init(ca, sb->sb); ++ if (ret) ++ return ret; ++ ++ /* Commit: */ ++ ca->disk_sb = *sb; ++ if (sb->mode & FMODE_EXCL) ++ ca->disk_sb.bdev->bd_holder = ca; ++ memset(sb, 0, sizeof(*sb)); ++ ++ percpu_ref_reinit(&ca->io_ref); ++ ++ return 0; ++} ++ ++static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (le64_to_cpu(sb->sb->seq) > ++ le64_to_cpu(c->disk_sb.sb->seq)) ++ bch2_sb_to_fs(c, sb->sb); ++ ++ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || ++ !c->devs[sb->sb->dev_idx]); ++ ++ ca = bch_dev_locked(c, sb->sb->dev_idx); ++ ++ ret = __bch2_dev_attach_bdev(ca, sb); ++ if (ret) ++ return ret; ++ ++ if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && ++ !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { ++ mutex_lock(&c->sb_lock); ++ bch2_mark_dev_superblock(ca->fs, ca, 0); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ bch2_dev_sysfs_online(c, ca); ++ ++ if (c->sb.nr_devices == 1) ++ bdevname(ca->disk_sb.bdev, c->name); ++ bdevname(ca->disk_sb.bdev, ca->name); ++ ++ rebalance_wakeup(c); ++ return 0; ++} ++ ++/* Device management: */ ++ ++/* ++ * Note: this function is also used by the error paths - when a particular ++ * device sees an error, we call it to determine whether we can just set the ++ * device RO, or - if this function returns false - we'll set the whole ++ * filesystem RO: ++ * ++ * XXX: maybe we should be more explicit about whether we're changing state ++ * because we got an error or what have you? ++ */ ++bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_devs_mask new_online_devs; ++ struct replicas_status s; ++ struct bch_dev *ca2; ++ int i, nr_rw = 0, required; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ switch (new_state) { ++ case BCH_MEMBER_STATE_RW: ++ return true; ++ case BCH_MEMBER_STATE_RO: ++ if (ca->mi.state != BCH_MEMBER_STATE_RW) ++ return true; ++ ++ /* do we have enough devices to write to? */ ++ for_each_member_device(ca2, c, i) ++ if (ca2 != ca) ++ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; ++ ++ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ++ ? c->opts.metadata_replicas ++ : c->opts.metadata_replicas_required, ++ !(flags & BCH_FORCE_IF_DATA_DEGRADED) ++ ? c->opts.data_replicas ++ : c->opts.data_replicas_required); ++ ++ return nr_rw >= required; ++ case BCH_MEMBER_STATE_FAILED: ++ case BCH_MEMBER_STATE_SPARE: ++ if (ca->mi.state != BCH_MEMBER_STATE_RW && ++ ca->mi.state != BCH_MEMBER_STATE_RO) ++ return true; ++ ++ /* do we have enough devices to read from? */ ++ new_online_devs = bch2_online_devs(c); ++ __clear_bit(ca->dev_idx, new_online_devs.d); ++ ++ s = __bch2_replicas_status(c, new_online_devs); ++ ++ return bch2_have_enough_devs(s, flags); ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_fs_may_start(struct bch_fs *c) ++{ ++ struct replicas_status s; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned i, flags = c->opts.degraded ++ ? 
BCH_FORCE_IF_DEGRADED ++ : 0; ++ ++ if (!c->opts.degraded) { ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) ++ continue; ++ ++ ca = bch_dev_locked(c, i); ++ ++ if (!bch2_dev_is_online(ca) && ++ (ca->mi.state == BCH_MEMBER_STATE_RW || ++ ca->mi.state == BCH_MEMBER_STATE_RO)) { ++ mutex_unlock(&c->sb_lock); ++ return false; ++ } ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ s = bch2_replicas_status(c); ++ ++ return bch2_have_enough_devs(s, flags); ++} ++ ++static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) ++{ ++ /* ++ * Device going read only means the copygc reserve get smaller, so we ++ * don't want that happening while copygc is in progress: ++ */ ++ bch2_copygc_stop(c); ++ ++ /* ++ * The allocator thread itself allocates btree nodes, so stop it first: ++ */ ++ bch2_dev_allocator_stop(ca); ++ bch2_dev_allocator_remove(c, ca); ++ bch2_dev_journal_stop(&c->journal, ca); ++ ++ bch2_copygc_start(c); ++} ++ ++static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); ++ ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ if (bch2_dev_allocator_start(ca)) ++ return "error starting allocator thread"; ++ ++ return NULL; ++} ++ ++int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ int ret = 0; ++ ++ if (ca->mi.state == new_state) ++ return 0; ++ ++ if (!bch2_dev_state_allowed(c, ca, new_state, flags)) ++ return -EINVAL; ++ ++ if (new_state != BCH_MEMBER_STATE_RW) ++ __bch2_dev_read_only(c, ca); ++ ++ bch_notice(ca, "%s", bch2_dev_state[new_state]); ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (new_state == BCH_MEMBER_STATE_RW && ++ __bch2_dev_read_write(c, ca)) ++ ret = -ENOMEM; ++ ++ rebalance_wakeup(c); ++ ++ return ret; ++} ++ ++int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ int ret; ++ ++ down_write(&c->state_lock); ++ ret = __bch2_dev_set_state(c, ca, new_state, flags); ++ up_write(&c->state_lock); ++ ++ return ret; ++} ++ ++/* Device add/removal: */ ++ ++int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ size_t i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < ca->mi.nbuckets; i++) { ++ ret = bch2_btree_key_cache_flush(&trans, ++ BTREE_ID_ALLOC, POS(ca->dev_idx, i)); ++ if (ret) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return ret; ++ ++ return bch2_btree_delete_range(c, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, 0), ++ POS(ca->dev_idx + 1, 0), ++ NULL); ++} ++ ++int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ unsigned dev_idx = ca->dev_idx, data; ++ int ret = -EINVAL; ++ ++ down_write(&c->state_lock); ++ ++ /* ++ * We consume a reference to ca->ref, regardless of whether we succeed ++ * or fail: ++ */ ++ percpu_ref_put(&ca->ref); ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { ++ bch_err(ca, "Cannot remove without losing data"); ++ goto err; ++ } ++ ++ __bch2_dev_read_only(c, ca); ++ ++ ret = bch2_dev_data_drop(c, 
ca->dev_idx, flags); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i dropping data", ret); ++ goto err; ++ } ++ ++ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i flushing journal", ret); ++ goto err; ++ } ++ ++ ret = bch2_dev_remove_alloc(c, ca); ++ if (ret) { ++ bch_err(ca, "Remove failed, error deleting alloc info"); ++ goto err; ++ } ++ ++ /* ++ * must flush all existing journal entries, they might have ++ * (overwritten) keys that point to the device we're removing: ++ */ ++ bch2_journal_flush_all_pins(&c->journal); ++ /* ++ * hack to ensure bch2_replicas_gc2() clears out entries to this device ++ */ ++ bch2_journal_meta(&c->journal); ++ ret = bch2_journal_error(&c->journal); ++ if (ret) { ++ bch_err(ca, "Remove failed, journal error"); ++ goto err; ++ } ++ ++ ret = bch2_replicas_gc2(c); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i from replicas gc", ret); ++ goto err; ++ } ++ ++ data = bch2_dev_has_data(c, ca); ++ if (data) { ++ char data_has_str[100]; ++ ++ bch2_flags_to_text(&PBUF(data_has_str), ++ bch2_data_types, data); ++ bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ++ ret = -EBUSY; ++ goto err; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ mutex_lock(&c->sb_lock); ++ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); ++ mutex_unlock(&c->sb_lock); ++ ++ percpu_ref_kill(&ca->ref); ++ wait_for_completion(&ca->ref_completion); ++ ++ bch2_dev_free(ca); ++ ++ /* ++ * Free this device's slot in the bch_member array - all pointers to ++ * this device must be gone: ++ */ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); ++ ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ up_write(&c->state_lock); ++ return 0; ++err: ++ if (ca->mi.state == BCH_MEMBER_STATE_RW && ++ !percpu_ref_is_zero(&ca->io_ref)) ++ __bch2_dev_read_write(c, ca); ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++static void dev_usage_clear(struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ ++ percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); ++ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); ++ up_read(&ca->bucket_lock); ++} ++ ++/* Add new device to running filesystem: */ ++int bch2_dev_add(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb; ++ const char *err; ++ struct bch_dev *ca = NULL; ++ struct bch_sb_field_members *mi; ++ struct bch_member dev_mi; ++ unsigned dev_idx, nr_devices, u64s; ++ int ret; ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) ++ return ret; ++ ++ err = bch2_sb_validate(&sb); ++ if (err) ++ return -EINVAL; ++ ++ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; ++ ++ err = bch2_dev_may_add(sb.sb, c); ++ if (err) ++ return -EINVAL; ++ ++ ca = __bch2_dev_alloc(c, &dev_mi); ++ if (!ca) { ++ bch2_free_super(&sb); ++ return -ENOMEM; ++ } ++ ++ ret = __bch2_dev_attach_bdev(ca, &sb); ++ if (ret) { ++ bch2_dev_free(ca); ++ return ret; ++ } ++ ++ /* ++ * We want to allocate journal on the new device before adding the new ++ * device to the filesystem because allocating after we attach requires ++ * spinning up the allocator thread, and the allocator thread requires ++ * doing btree writes, which if the existing devices are RO isn't going ++ * to work ++ * ++ * So we have to mark where the superblocks are, but 
marking allocated ++ * data normally updates the filesystem usage too, so we have to mark, ++ * allocate the journal, reset all the marks, then remark after we ++ * attach... ++ */ ++ bch2_mark_dev_superblock(ca->fs, ca, 0); ++ ++ err = "journal alloc failed"; ++ ret = bch2_dev_journal_alloc(ca); ++ if (ret) ++ goto err; ++ ++ dev_usage_clear(ca); ++ ++ down_write(&c->state_lock); ++ mutex_lock(&c->sb_lock); ++ ++ err = "insufficient space in new superblock"; ++ ret = bch2_sb_from_fs(c, ca); ++ if (ret) ++ goto err_unlock; ++ ++ mi = bch2_sb_get_members(ca->disk_sb.sb); ++ ++ if (!bch2_sb_resize_members(&ca->disk_sb, ++ le32_to_cpu(mi->field.u64s) + ++ sizeof(dev_mi) / sizeof(u64))) { ++ ret = -ENOSPC; ++ goto err_unlock; ++ } ++ ++ if (dynamic_fault("bcachefs:add:no_slot")) ++ goto no_slot; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) ++ goto have_slot; ++no_slot: ++ err = "no slots available in superblock"; ++ ret = -ENOSPC; ++ goto err_unlock; ++ ++have_slot: ++ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); ++ u64s = (sizeof(struct bch_sb_field_members) + ++ sizeof(struct bch_member) * nr_devices) / sizeof(u64); ++ ++ err = "no space in superblock for member info"; ++ ret = -ENOSPC; ++ ++ mi = bch2_sb_resize_members(&c->disk_sb, u64s); ++ if (!mi) ++ goto err_unlock; ++ ++ /* success: */ ++ ++ mi->members[dev_idx] = dev_mi; ++ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); ++ c->disk_sb.sb->nr_devices = nr_devices; ++ ++ ca->disk_sb.sb->dev_idx = dev_idx; ++ bch2_dev_attach(c, ca, dev_idx); ++ ++ bch2_mark_dev_superblock(c, ca, 0); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_RW) { ++ err = __bch2_dev_read_write(c, ca); ++ if (err) ++ goto err_late; ++ } ++ ++ up_write(&c->state_lock); ++ return 0; ++ ++err_unlock: ++ mutex_unlock(&c->sb_lock); ++ up_write(&c->state_lock); ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ bch2_free_super(&sb); ++ bch_err(c, "Unable to add device: %s", err); ++ return ret; ++err_late: ++ bch_err(c, "Error going rw after adding device: %s", err); ++ return -EINVAL; ++} ++ ++/* Hot add existing device to running filesystem: */ ++int bch2_dev_online(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb = { NULL }; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ const char *err; ++ int ret; ++ ++ down_write(&c->state_lock); ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) { ++ up_write(&c->state_lock); ++ return ret; ++ } ++ ++ dev_idx = sb.sb->dev_idx; ++ ++ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); ++ if (err) ++ goto err; ++ ++ if (bch2_dev_attach_bdev(c, &sb)) { ++ err = "bch2_dev_attach_bdev() error"; ++ goto err; ++ } ++ ++ ca = bch_dev_locked(c, dev_idx); ++ if (ca->mi.state == BCH_MEMBER_STATE_RW) { ++ err = __bch2_dev_read_write(c, ca); ++ if (err) ++ goto err; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ mi->members[ca->dev_idx].last_mount = ++ cpu_to_le64(ktime_get_real_seconds()); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ up_write(&c->state_lock); ++ return 0; ++err: ++ up_write(&c->state_lock); ++ bch2_free_super(&sb); ++ bch_err(c, "error bringing %s online: %s", path, err); ++ return -EINVAL; ++} ++ ++int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ 
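++	/*
++	 * Take an online member offline: stop I/O to it and release its
++	 * superblock and journal, without removing it from the filesystem.
++	 */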
down_write(&c->state_lock); ++ ++ if (!bch2_dev_is_online(ca)) { ++ bch_err(ca, "Already offline"); ++ up_write(&c->state_lock); ++ return 0; ++ } ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { ++ bch_err(ca, "Cannot offline required disk"); ++ up_write(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ up_write(&c->state_lock); ++ return 0; ++} ++ ++int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bch_member *mi; ++ int ret = 0; ++ ++ down_write(&c->state_lock); ++ ++ if (nbuckets < ca->mi.nbuckets) { ++ bch_err(ca, "Cannot shrink yet"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (bch2_dev_is_online(ca) && ++ get_capacity(ca->disk_sb.bdev->bd_disk) < ++ ca->mi.bucket_size * nbuckets) { ++ bch_err(ca, "New size larger than device"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = bch2_dev_buckets_resize(c, ca, nbuckets); ++ if (ret) { ++ bch_err(ca, "Resize error: %i", ret); ++ goto err; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ mi->nbuckets = cpu_to_le64(nbuckets); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch2_recalc_capacity(c); ++err: ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++/* return with ref on ca->ref: */ ++struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) ++{ ++ struct block_device *bdev = lookup_bdev(path); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (IS_ERR(bdev)) ++ return ERR_CAST(bdev); ++ ++ for_each_member_device(ca, c, i) ++ if (ca->disk_sb.bdev == bdev) ++ goto found; ++ ++ ca = ERR_PTR(-ENOENT); ++found: ++ bdput(bdev); ++ return ca; ++} ++ ++/* Filesystem open: */ ++ ++struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, ++ struct bch_opts opts) ++{ ++ struct bch_sb_handle *sb = NULL; ++ struct bch_fs *c = NULL; ++ struct bch_sb_field_members *mi; ++ unsigned i, best_sb = 0; ++ const char *err; ++ int ret = -ENOMEM; ++ ++ pr_verbose_init(opts, ""); ++ ++ if (!nr_devices) { ++ c = ERR_PTR(-EINVAL); ++ goto out2; ++ } ++ ++ if (!try_module_get(THIS_MODULE)) { ++ c = ERR_PTR(-ENODEV); ++ goto out2; ++ } ++ ++ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); ++ if (!sb) ++ goto err; ++ ++ for (i = 0; i < nr_devices; i++) { ++ ret = bch2_read_super(devices[i], &opts, &sb[i]); ++ if (ret) ++ goto err; ++ ++ err = bch2_sb_validate(&sb[i]); ++ if (err) ++ goto err_print; ++ } ++ ++ for (i = 1; i < nr_devices; i++) ++ if (le64_to_cpu(sb[i].sb->seq) > ++ le64_to_cpu(sb[best_sb].sb->seq)) ++ best_sb = i; ++ ++ mi = bch2_sb_get_members(sb[best_sb].sb); ++ ++ i = 0; ++ while (i < nr_devices) { ++ if (i != best_sb && ++ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { ++ char buf[BDEVNAME_SIZE]; ++ pr_info("%s has been removed, skipping", ++ bdevname(sb[i].bdev, buf)); ++ bch2_free_super(&sb[i]); ++ array_remove_item(sb, nr_devices, i); ++ continue; ++ } ++ ++ err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); ++ if (err) ++ goto err_print; ++ i++; ++ } ++ ++ ret = -ENOMEM; ++ c = bch2_fs_alloc(sb[best_sb].sb, opts); ++ if (!c) ++ goto err; ++ ++ err = "bch2_dev_online() error"; ++ down_write(&c->state_lock); ++ for (i = 0; i < nr_devices; i++) ++ if (bch2_dev_attach_bdev(c, &sb[i])) { ++ up_write(&c->state_lock); ++ goto err_print; ++ } ++ up_write(&c->state_lock); ++ ++ err = "insufficient devices"; ++ if (!bch2_fs_may_start(c)) ++ goto err_print; ++ ++ if (!c->opts.nostart) { ++ ret = bch2_fs_start(c); ++ if (ret) ++ goto err; ++ } ++out: ++ 
kfree(sb); ++ module_put(THIS_MODULE); ++out2: ++ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); ++ return c; ++err_print: ++ pr_err("bch_fs_open err opening %s: %s", ++ devices[0], err); ++ ret = -EINVAL; ++err: ++ if (c) ++ bch2_fs_stop(c); ++ for (i = 0; i < nr_devices; i++) ++ bch2_free_super(&sb[i]); ++ c = ERR_PTR(ret); ++ goto out; ++} ++ ++static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, ++ struct bch_opts opts) ++{ ++ const char *err; ++ struct bch_fs *c; ++ bool allocated_fs = false; ++ int ret; ++ ++ err = bch2_sb_validate(sb); ++ if (err) ++ return err; ++ ++ mutex_lock(&bch_fs_list_lock); ++ c = __bch2_uuid_to_fs(sb->sb->uuid); ++ if (c) { ++ closure_get(&c->cl); ++ ++ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); ++ if (err) ++ goto err; ++ } else { ++ c = bch2_fs_alloc(sb->sb, opts); ++ err = "cannot allocate memory"; ++ if (!c) ++ goto err; ++ ++ allocated_fs = true; ++ } ++ ++ err = "bch2_dev_online() error"; ++ ++ mutex_lock(&c->sb_lock); ++ if (bch2_dev_attach_bdev(c, sb)) { ++ mutex_unlock(&c->sb_lock); ++ goto err; ++ } ++ mutex_unlock(&c->sb_lock); ++ ++ if (!c->opts.nostart && bch2_fs_may_start(c)) { ++ err = "error starting filesystem"; ++ ret = bch2_fs_start(c); ++ if (ret) ++ goto err; ++ } ++ ++ closure_put(&c->cl); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return NULL; ++err: ++ mutex_unlock(&bch_fs_list_lock); ++ ++ if (allocated_fs) ++ bch2_fs_stop(c); ++ else if (c) ++ closure_put(&c->cl); ++ ++ return err; ++} ++ ++const char *bch2_fs_open_incremental(const char *path) ++{ ++ struct bch_sb_handle sb; ++ struct bch_opts opts = bch2_opts_empty(); ++ const char *err; ++ ++ if (bch2_read_super(path, &opts, &sb)) ++ return "error reading superblock"; ++ ++ err = __bch2_fs_open_incremental(&sb, opts); ++ bch2_free_super(&sb); ++ ++ return err; ++} ++ ++/* Global interfaces/init */ ++ ++static void bcachefs_exit(void) ++{ ++ bch2_debug_exit(); ++ bch2_vfs_exit(); ++ bch2_chardev_exit(); ++ if (bcachefs_kset) ++ kset_unregister(bcachefs_kset); ++} ++ ++static int __init bcachefs_init(void) ++{ ++ bch2_bkey_pack_test(); ++ bch2_inode_pack_test(); ++ ++ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || ++ bch2_chardev_init() || ++ bch2_vfs_init() || ++ bch2_debug_init()) ++ goto err; ++ ++ return 0; ++err: ++ bcachefs_exit(); ++ return -ENOMEM; ++} ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ bool bch2_##name; \ ++ module_param_named(name, bch2_##name, bool, 0644); \ ++ MODULE_PARM_DESC(name, description); ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++module_exit(bcachefs_exit); ++module_init(bcachefs_init); +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +new file mode 100644 +index 000000000000..fffee96726ce +--- /dev/null ++++ b/fs/bcachefs/super.h +@@ -0,0 +1,240 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_H ++#define _BCACHEFS_SUPER_H ++ ++#include "extents.h" ++ ++#include "bcachefs_ioctl.h" ++ ++#include ++ ++static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) ++{ ++ return div_u64(s, ca->mi.bucket_size); ++} ++ ++static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) ++{ ++ return ((sector_t) b) * ca->mi.bucket_size; ++} ++ ++static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) ++{ ++ u32 remainder; ++ ++ div_u64_rem(s, ca->mi.bucket_size, &remainder); ++ return remainder; ++} ++ ++static inline bool bch2_dev_is_online(struct bch_dev *ca) ++{ ++ return !percpu_ref_is_zero(&ca->io_ref); ++} ++ 
++static inline bool bch2_dev_is_readable(struct bch_dev *ca) ++{ ++ return bch2_dev_is_online(ca) && ++ ca->mi.state != BCH_MEMBER_STATE_FAILED; ++} ++ ++static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) ++{ ++ if (!percpu_ref_tryget(&ca->io_ref)) ++ return false; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_RW || ++ (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) ++ return true; ++ ++ percpu_ref_put(&ca->io_ref); ++ return false; ++} ++ ++static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) ++{ ++ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); ++} ++ ++static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs.nr; i++) ++ if (devs.devs[i] == dev) ++ return true; ++ ++ return false; ++} ++ ++static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs->nr; i++) ++ if (devs->devs[i] == dev) { ++ array_remove_item(devs->devs, devs->nr, i); ++ return; ++ } ++} ++ ++static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ BUG_ON(bch2_dev_list_has_dev(*devs, dev)); ++ BUG_ON(devs->nr >= BCH_REPLICAS_MAX); ++ devs->devs[devs->nr++] = dev; ++} ++ ++static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) ++{ ++ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; ++} ++ ++static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, ++ const struct bch_devs_mask *mask) ++{ ++ struct bch_dev *ca = NULL; ++ ++ while ((*iter = mask ++ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) ++ : *iter) < c->sb.nr_devices && ++ !(ca = rcu_dereference_check(c->devs[*iter], ++ lockdep_is_held(&c->state_lock)))) ++ (*iter)++; ++ ++ return ca; ++} ++ ++#define __for_each_member_device(ca, c, iter, mask) \ ++ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) ++ ++#define for_each_member_device_rcu(ca, c, iter, mask) \ ++ __for_each_member_device(ca, c, iter, mask) ++ ++static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ if ((ca = __bch2_next_dev(c, iter, NULL))) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++/* ++ * If you break early, you must drop your ref on the current device ++ */ ++#define for_each_member_device(ca, c, iter) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_dev(c, &(iter))); \ ++ percpu_ref_put(&ca->ref), (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, ++ unsigned *iter, ++ int state_mask) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ while ((ca = __bch2_next_dev(c, iter, NULL)) && ++ (!((1 << ca->mi.state) & state_mask) || ++ !percpu_ref_tryget(&ca->io_ref))) ++ (*iter)++; ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++#define __for_each_online_member(ca, c, iter, state_mask) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ ++ percpu_ref_put(&ca->io_ref), (iter)++) ++ ++#define for_each_online_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, ~0) ++ ++#define for_each_rw_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) ++ ++#define for_each_readable_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, \ ++ (1 << BCH_MEMBER_STATE_RW)|(1 << BCH_MEMBER_STATE_RO)) ++ ++/* ++ * If a key exists that references a device, the device won't be going away and ++ * we can 
omit rcu_read_lock(): ++ */ ++static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_check(c->devs[idx], 1); ++} ++ ++static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_protected(c->devs[idx], ++ lockdep_is_held(&c->sb_lock) || ++ lockdep_is_held(&c->state_lock)); ++} ++ ++/* XXX kill, move to struct bch_fs */ ++static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) ++{ ++ struct bch_devs_mask devs; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ memset(&devs, 0, sizeof(devs)); ++ for_each_online_member(ca, c, i) ++ __set_bit(ca->dev_idx, devs.d); ++ return devs; ++} ++ ++struct bch_fs *bch2_bdev_to_fs(struct block_device *); ++struct bch_fs *bch2_uuid_to_fs(uuid_le); ++int bch2_congested(void *, int); ++ ++bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++ ++int bch2_dev_fail(struct bch_dev *, int); ++int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_add(struct bch_fs *, const char *); ++int bch2_dev_online(struct bch_fs *, const char *); ++int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); ++struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *); ++void bch2_fs_read_only(struct bch_fs *); ++ ++int bch2_fs_read_write(struct bch_fs *); ++int bch2_fs_read_write_early(struct bch_fs *); ++ ++/* ++ * Only for use in the recovery/fsck path: ++ */ ++static inline void bch2_fs_lazy_rw(struct bch_fs *c) ++{ ++ if (percpu_ref_is_zero(&c->writes)) ++ bch2_fs_read_write_early(c); ++} ++ ++void bch2_fs_stop(struct bch_fs *); ++ ++int bch2_fs_start(struct bch_fs *); ++struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); ++const char *bch2_fs_open_incremental(const char *path); ++ ++#endif /* _BCACHEFS_SUPER_H */ +diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h +new file mode 100644 +index 000000000000..20406ebd6f5b +--- /dev/null ++++ b/fs/bcachefs/super_types.h +@@ -0,0 +1,51 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_TYPES_H ++#define _BCACHEFS_SUPER_TYPES_H ++ ++struct bch_sb_handle { ++ struct bch_sb *sb; ++ struct block_device *bdev; ++ struct bio *bio; ++ unsigned page_order; ++ fmode_t mode; ++ unsigned have_layout:1; ++ unsigned have_bio:1; ++ unsigned fs_sb:1; ++ u64 seq; ++}; ++ ++struct bch_devs_mask { ++ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; ++}; ++ ++struct bch_devs_list { ++ u8 nr; ++ u8 devs[BCH_REPLICAS_MAX + 1]; ++}; ++ ++struct bch_member_cpu { ++ u64 nbuckets; /* device size */ ++ u16 first_bucket; /* index of first bucket used */ ++ u16 bucket_size; /* sectors */ ++ u16 group; ++ u8 state; ++ u8 replacement; ++ u8 discard; ++ u8 data_allowed; ++ u8 durability; ++ u8 valid; ++}; ++ ++struct bch_disk_group_cpu { ++ bool deleted; ++ u16 parent; ++ struct bch_devs_mask devs; ++}; ++ ++struct bch_disk_groups_cpu { ++ struct rcu_head rcu; ++ unsigned nr; ++ struct bch_disk_group_cpu entries[]; ++}; ++ ++#endif /* _BCACHEFS_SUPER_TYPES_H */ +diff --git a/fs/bcachefs/sysfs.c 
b/fs/bcachefs/sysfs.c +new file mode 100644 +index 000000000000..0cb29f43d99d +--- /dev/null ++++ b/fs/bcachefs/sysfs.c +@@ -0,0 +1,1074 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcache sysfs interfaces ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "sysfs.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "inode.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "opts.h" ++#include "rebalance.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "tests.h" ++ ++#include ++#include ++#include ++ ++#include "util.h" ++ ++#define SYSFS_OPS(type) \ ++struct sysfs_ops type ## _sysfs_ops = { \ ++ .show = type ## _show, \ ++ .store = type ## _store \ ++} ++ ++#define SHOW(fn) \ ++static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ ++ char *buf) \ ++ ++#define STORE(fn) \ ++static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ ++ const char *buf, size_t size) \ ++ ++#define __sysfs_attribute(_name, _mode) \ ++ static struct attribute sysfs_##_name = \ ++ { .name = #_name, .mode = _mode } ++ ++#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) ++#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) ++#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) ++ ++#define sysfs_printf(file, fmt, ...) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ ++} while (0) ++ ++#define sysfs_print(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return snprint(buf, PAGE_SIZE, var); \ ++} while (0) ++ ++#define sysfs_hprint(file, val) \ ++do { \ ++ if (attr == &sysfs_ ## file) { \ ++ bch2_hprint(&out, val); \ ++ pr_buf(&out, "\n"); \ ++ return out.pos - buf; \ ++ } \ ++} while (0) ++ ++#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) ++#define var_print(_var) sysfs_print(_var, var(_var)) ++#define var_hprint(_var) sysfs_hprint(_var, var(_var)) ++ ++#define sysfs_strtoul(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe(buf, var) ?: (ssize_t) size; \ ++} while (0) ++ ++#define sysfs_strtoul_clamp(file, var, min, max) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe_clamp(buf, var, min, max) \ ++ ?: (ssize_t) size; \ ++} while (0) ++ ++#define strtoul_or_return(cp) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define strtoul_restrict_or_return(cp, min, max) \ ++({ \ ++ unsigned long __v = 0; \ ++ int _r = strtoul_safe_restrict(cp, __v, min, max); \ ++ if (_r) \ ++ return _r; \ ++ __v; \ ++}) ++ ++#define strtoi_h_or_return(cp) \ ++({ \ ++ u64 _v; \ ++ int _r = strtoi_h(cp, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define sysfs_hatoi(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoi_h(buf, &var) ?: (ssize_t) size; \ ++} while (0) ++ ++write_attribute(trigger_journal_flush); ++write_attribute(trigger_btree_coalesce); ++write_attribute(trigger_gc); ++write_attribute(prune_cache); ++rw_attribute(btree_gc_periodic); ++ ++read_attribute(uuid); ++read_attribute(minor); ++read_attribute(bucket_size); 
++read_attribute(block_size); ++read_attribute(btree_node_size); ++read_attribute(first_bucket); ++read_attribute(nbuckets); ++read_attribute(durability); ++read_attribute(iodone); ++ ++read_attribute(io_latency_read); ++read_attribute(io_latency_write); ++read_attribute(io_latency_stats_read); ++read_attribute(io_latency_stats_write); ++read_attribute(congested); ++ ++read_attribute(bucket_quantiles_last_read); ++read_attribute(bucket_quantiles_last_write); ++read_attribute(bucket_quantiles_fragmentation); ++read_attribute(bucket_quantiles_oldest_gen); ++ ++read_attribute(reserve_stats); ++read_attribute(btree_cache_size); ++read_attribute(compression_stats); ++read_attribute(journal_debug); ++read_attribute(journal_pins); ++read_attribute(btree_updates); ++read_attribute(dirty_btree_nodes); ++read_attribute(btree_key_cache); ++read_attribute(btree_transactions); ++read_attribute(stripes_heap); ++ ++read_attribute(internal_uuid); ++ ++read_attribute(has_data); ++read_attribute(alloc_debug); ++write_attribute(wake_allocator); ++ ++read_attribute(read_realloc_races); ++read_attribute(extent_migrate_done); ++read_attribute(extent_migrate_raced); ++ ++rw_attribute(journal_write_delay_ms); ++rw_attribute(journal_reclaim_delay_ms); ++ ++rw_attribute(discard); ++rw_attribute(cache_replacement_policy); ++rw_attribute(label); ++ ++rw_attribute(copy_gc_enabled); ++sysfs_pd_controller_attribute(copy_gc); ++ ++rw_attribute(rebalance_enabled); ++sysfs_pd_controller_attribute(rebalance); ++read_attribute(rebalance_work); ++rw_attribute(promote_whole_extents); ++ ++read_attribute(new_stripes); ++ ++rw_attribute(pd_controllers_update_seconds); ++ ++read_attribute(meta_replicas_have); ++read_attribute(data_replicas_have); ++ ++read_attribute(io_timers_read); ++read_attribute(io_timers_write); ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++write_attribute(perf_test); ++#endif /* CONFIG_BCACHEFS_TESTS */ ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ rw_attribute(name); ++ ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#define x(_name) \ ++ static struct attribute sysfs_time_stat_##_name = \ ++ { .name = #_name, .mode = S_IRUGO }; ++ BCH_TIME_STATS() ++#undef x ++ ++static struct attribute sysfs_state_rw = { ++ .name = "state", ++ .mode = S_IRUGO ++}; ++ ++static size_t bch2_btree_cache_size(struct bch_fs *c) ++{ ++ size_t ret = 0; ++ struct btree *b; ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_for_each_entry(b, &c->btree_cache.live, list) ++ ret += btree_bytes(c); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ return ret; ++} ++ ++static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); ++ ++ if (!fs_usage) ++ return -ENOMEM; ++ ++ bch2_fs_usage_to_text(out, c, fs_usage); ++ ++ percpu_up_read(&c->mark_lock); ++ ++ kfree(fs_usage); ++ return 0; ++} ++ ++static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, ++ nr_compressed_extents = 0, ++ compressed_sectors_compressed = 0, ++ compressed_sectors_uncompressed = 0; ++ int ret; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) ++ if (k.k->type == KEY_TYPE_extent) { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ struct 
extent_ptr_decoded p; ++ ++ extent_for_each_ptr_decode(e, p, entry) { ++ if (!crc_is_compressed(p.crc)) { ++ nr_uncompressed_extents++; ++ uncompressed_sectors += e.k->size; ++ } else { ++ nr_compressed_extents++; ++ compressed_sectors_compressed += ++ p.crc.compressed_size; ++ compressed_sectors_uncompressed += ++ p.crc.uncompressed_size; ++ } ++ ++ /* only looking at the first ptr */ ++ break; ++ } ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ pr_buf(out, ++ "uncompressed data:\n" ++ " nr extents: %llu\n" ++ " size (bytes): %llu\n" ++ "compressed data:\n" ++ " nr extents: %llu\n" ++ " compressed size (bytes): %llu\n" ++ " uncompressed size (bytes): %llu\n", ++ nr_uncompressed_extents, ++ uncompressed_sectors << 9, ++ nr_compressed_extents, ++ compressed_sectors_compressed << 9, ++ compressed_sectors_uncompressed << 9); ++ return 0; ++} ++ ++SHOW(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ sysfs_print(minor, c->minor); ++ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); ++ ++ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); ++ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); ++ ++ sysfs_print(block_size, block_bytes(c)); ++ sysfs_print(btree_node_size, btree_bytes(c)); ++ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); ++ ++ sysfs_print(read_realloc_races, ++ atomic_long_read(&c->read_realloc_races)); ++ sysfs_print(extent_migrate_done, ++ atomic_long_read(&c->extent_migrate_done)); ++ sysfs_print(extent_migrate_raced, ++ atomic_long_read(&c->extent_migrate_raced)); ++ ++ sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); ++ ++ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); ++ ++ sysfs_print(pd_controllers_update_seconds, ++ c->pd_controllers_update_seconds); ++ ++ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); ++ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ ++ sysfs_pd_controller_show(copy_gc, &c->copygc_pd); ++ ++ if (attr == &sysfs_rebalance_work) { ++ bch2_rebalance_work_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ sysfs_print(promote_whole_extents, c->promote_whole_extents); ++ ++ sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); ++ sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); ++ ++ /* Debugging: */ ++ ++ if (attr == &sysfs_alloc_debug) ++ return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; ++ ++ if (attr == &sysfs_journal_debug) { ++ bch2_journal_debug_to_text(&out, &c->journal); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_journal_pins) { ++ bch2_journal_pins_to_text(&out, &c->journal); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_btree_updates) { ++ bch2_btree_updates_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_dirty_btree_nodes) { ++ bch2_dirty_btree_nodes_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_btree_key_cache) { ++ bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_btree_transactions) { ++ bch2_btree_trans_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_stripes_heap) { ++ bch2_stripes_heap_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_compression_stats) { ++ bch2_compression_stats_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_new_stripes) { ++ bch2_new_stripes_to_text(&out, 
c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_io_timers_read) { ++ bch2_io_timers_to_text(&out, &c->io_clock[READ]); ++ return out.pos - buf; ++ } ++ if (attr == &sysfs_io_timers_write) { ++ bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); ++ return out.pos - buf; ++ } ++ ++#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ return 0; ++} ++ ++STORE(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); ++ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); ++ ++ if (attr == &sysfs_btree_gc_periodic) { ++ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ++ ?: (ssize_t) size; ++ ++ wake_up_process(c->gc_thread); ++ return ret; ++ } ++ ++ if (attr == &sysfs_copy_gc_enabled) { ++ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ++ ?: (ssize_t) size; ++ ++ if (c->copygc_thread) ++ wake_up_process(c->copygc_thread); ++ return ret; ++ } ++ ++ if (attr == &sysfs_rebalance_enabled) { ++ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) ++ ?: (ssize_t) size; ++ ++ rebalance_wakeup(c); ++ return ret; ++ } ++ ++ sysfs_strtoul(pd_controllers_update_seconds, ++ c->pd_controllers_update_seconds); ++ sysfs_pd_controller_store(rebalance, &c->rebalance.pd); ++ sysfs_pd_controller_store(copy_gc, &c->copygc_pd); ++ ++ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); ++ ++ /* Debugging: */ ++ ++#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ /* Debugging: */ ++ ++ if (attr == &sysfs_trigger_journal_flush) ++ bch2_journal_meta_async(&c->journal, NULL); ++ ++ if (attr == &sysfs_trigger_btree_coalesce) ++ bch2_coalesce(c); ++ ++ if (attr == &sysfs_trigger_gc) { ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 ++ down_read(&c->state_lock); ++ bch2_gc(c, NULL, false, false); ++ up_read(&c->state_lock); ++#else ++ bch2_gc_gens(c); ++#endif ++ } ++ ++ if (attr == &sysfs_prune_cache) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = strtoul_or_return(buf); ++ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); ++ } ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ if (attr == &sysfs_perf_test) { ++ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; ++ char *test = strsep(&p, " \t\n"); ++ char *nr_str = strsep(&p, " \t\n"); ++ char *threads_str = strsep(&p, " \t\n"); ++ unsigned threads; ++ u64 nr; ++ int ret = -EINVAL; ++ ++ if (threads_str && ++ !(ret = kstrtouint(threads_str, 10, &threads)) && ++ !(ret = bch2_strtoull_h(nr_str, &nr))) ++ bch2_btree_perf_test(c, test, nr, threads); ++ else ++ size = ret; ++ kfree(tmp); ++ } ++#endif ++ return size; ++} ++SYSFS_OPS(bch2_fs); ++ ++struct attribute *bch2_fs_files[] = { ++ &sysfs_minor, ++ &sysfs_block_size, ++ &sysfs_btree_node_size, ++ &sysfs_btree_cache_size, ++ ++ &sysfs_meta_replicas_have, ++ &sysfs_data_replicas_have, ++ ++ &sysfs_journal_write_delay_ms, ++ &sysfs_journal_reclaim_delay_ms, ++ ++ &sysfs_promote_whole_extents, ++ ++ &sysfs_compression_stats, ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ &sysfs_perf_test, ++#endif ++ NULL ++}; ++ ++/* internal dir - just a wrapper */ ++ ++SHOW(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_show(&c->kobj, attr, buf); ++} ++ 
++STORE(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_store(&c->kobj, attr, buf, size); ++} ++SYSFS_OPS(bch2_fs_internal); ++ ++struct attribute *bch2_fs_internal_files[] = { ++ &sysfs_alloc_debug, ++ &sysfs_journal_debug, ++ &sysfs_journal_pins, ++ &sysfs_btree_updates, ++ &sysfs_dirty_btree_nodes, ++ &sysfs_btree_key_cache, ++ &sysfs_btree_transactions, ++ &sysfs_stripes_heap, ++ ++ &sysfs_read_realloc_races, ++ &sysfs_extent_migrate_done, ++ &sysfs_extent_migrate_raced, ++ ++ &sysfs_trigger_journal_flush, ++ &sysfs_trigger_btree_coalesce, ++ &sysfs_trigger_gc, ++ &sysfs_prune_cache, ++ ++ &sysfs_copy_gc_enabled, ++ ++ &sysfs_rebalance_enabled, ++ &sysfs_rebalance_work, ++ sysfs_pd_controller_files(rebalance), ++ sysfs_pd_controller_files(copy_gc), ++ ++ &sysfs_new_stripes, ++ ++ &sysfs_io_timers_read, ++ &sysfs_io_timers_write, ++ ++ &sysfs_internal_uuid, ++ ++#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ NULL ++}; ++ ++/* options */ ++ ++SHOW(bch2_fs_opts_dir) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int id = opt - bch2_opt_table; ++ u64 v = bch2_opt_get_by_id(&c->opts, id); ++ ++ bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); ++ pr_buf(&out, "\n"); ++ ++ return out.pos - buf; ++} ++ ++STORE(bch2_fs_opts_dir) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int ret, id = opt - bch2_opt_table; ++ char *tmp; ++ u64 v; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ ret = bch2_opt_parse(c, opt, strim(tmp), &v); ++ kfree(tmp); ++ ++ if (ret < 0) ++ return ret; ++ ++ ret = bch2_opt_check_may_set(c, id, v); ++ if (ret < 0) ++ return ret; ++ ++ if (opt->set_sb != SET_NO_SB_OPT) { ++ mutex_lock(&c->sb_lock); ++ opt->set_sb(c->disk_sb.sb, v); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ bch2_opt_set_by_id(&c->opts, id, v); ++ ++ if ((id == Opt_background_target || ++ id == Opt_background_compression) && v) { ++ bch2_rebalance_add_work(c, S64_MAX); ++ rebalance_wakeup(c); ++ } ++ ++ return size; ++} ++SYSFS_OPS(bch2_fs_opts_dir); ++ ++struct attribute *bch2_fs_opts_dir_files[] = { NULL }; ++ ++int bch2_opts_create_sysfs_files(struct kobject *kobj) ++{ ++ const struct bch_option *i; ++ int ret; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + bch2_opts_nr; ++ i++) { ++ if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) ++ continue; ++ ++ ret = sysfs_create_file(kobj, &i->attr); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* time stats */ ++ ++SHOW(bch2_fs_time_stats) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++#define x(name) \ ++ if (attr == &sysfs_time_stat_##name) { \ ++ bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ ++ return out.pos - buf; \ ++ } ++ BCH_TIME_STATS() ++#undef x ++ ++ return 0; ++} ++ ++STORE(bch2_fs_time_stats) ++{ ++ return size; ++} ++SYSFS_OPS(bch2_fs_time_stats); ++ ++struct attribute *bch2_fs_time_stats_files[] = { ++#define x(name) \ ++ &sysfs_time_stat_##name, ++ BCH_TIME_STATS() ++#undef x ++ NULL ++}; ++ ++typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, ++ size_t, void *); ++ ++static unsigned 
bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ int rw = (private ? 1 : 0); ++ ++ return bucket_last_io(c, bucket(ca, b), rw); ++} ++ ++static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ struct bucket *g = bucket(ca, b); ++ return bucket_sectors_used(g->mark); ++} ++ ++static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ return bucket_gc_gen(ca, b); ++} ++ ++static int unsigned_cmp(const void *_l, const void *_r) ++{ ++ const unsigned *l = _l; ++ const unsigned *r = _r; ++ ++ return cmp_int(*l, *r); ++} ++ ++static int quantiles_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bch_dev *ca, ++ bucket_map_fn *fn, void *private) ++{ ++ size_t i, n; ++ /* Compute 31 quantiles */ ++ unsigned q[31], *p; ++ ++ down_read(&ca->bucket_lock); ++ n = ca->mi.nbuckets; ++ ++ p = vzalloc(n * sizeof(unsigned)); ++ if (!p) { ++ up_read(&ca->bucket_lock); ++ return -ENOMEM; ++ } ++ ++ for (i = ca->mi.first_bucket; i < n; i++) ++ p[i] = fn(c, ca, i, private); ++ ++ sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); ++ up_read(&ca->bucket_lock); ++ ++ while (n && ++ !p[n - 1]) ++ --n; ++ ++ for (i = 0; i < ARRAY_SIZE(q); i++) ++ q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; ++ ++ vfree(p); ++ ++ for (i = 0; i < ARRAY_SIZE(q); i++) ++ pr_buf(out, "%u ", q[i]); ++ pr_buf(out, "\n"); ++ return 0; ++} ++ ++static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ enum alloc_reserve i; ++ ++ spin_lock(&ca->fs->freelist_lock); ++ ++ pr_buf(out, "free_inc:\t%zu\t%zu\n", ++ fifo_used(&ca->free_inc), ++ ca->free_inc.size); ++ ++ for (i = 0; i < RESERVE_NR; i++) ++ pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, ++ fifo_used(&ca->free[i]), ++ ca->free[i].size); ++ ++ spin_unlock(&ca->fs->freelist_lock); ++} ++ ++static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_dev_usage stats = bch2_dev_usage_read(ca); ++ unsigned i, nr[BCH_DATA_NR]; ++ ++ memset(nr, 0, sizeof(nr)); ++ ++ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) ++ nr[c->open_buckets[i].type]++; ++ ++ pr_buf(out, ++ "free_inc: %zu/%zu\n" ++ "free[RESERVE_BTREE]: %zu/%zu\n" ++ "free[RESERVE_MOVINGGC]: %zu/%zu\n" ++ "free[RESERVE_NONE]: %zu/%zu\n" ++ "buckets:\n" ++ " capacity: %llu\n" ++ " alloc: %llu\n" ++ " sb: %llu\n" ++ " journal: %llu\n" ++ " meta: %llu\n" ++ " user: %llu\n" ++ " cached: %llu\n" ++ " erasure coded: %llu\n" ++ " available: %lli\n" ++ "sectors:\n" ++ " sb: %llu\n" ++ " journal: %llu\n" ++ " meta: %llu\n" ++ " user: %llu\n" ++ " cached: %llu\n" ++ " erasure coded: %llu\n" ++ " fragmented: %llu\n" ++ " copygc threshold: %llu\n" ++ "freelist_wait: %s\n" ++ "open buckets: %u/%u (reserved %u)\n" ++ "open_buckets_wait: %s\n" ++ "open_buckets_btree: %u\n" ++ "open_buckets_user: %u\n" ++ "btree reserve cache: %u\n", ++ fifo_used(&ca->free_inc), ca->free_inc.size, ++ fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, ++ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, ++ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ++ ca->mi.nbuckets - ca->mi.first_bucket, ++ stats.buckets_alloc, ++ stats.buckets[BCH_DATA_sb], ++ stats.buckets[BCH_DATA_journal], ++ stats.buckets[BCH_DATA_btree], ++ stats.buckets[BCH_DATA_user], ++ stats.buckets[BCH_DATA_cached], ++ stats.buckets_ec, ++ __dev_buckets_available(ca, stats), ++ stats.sectors[BCH_DATA_sb], 
++ stats.sectors[BCH_DATA_journal], ++ stats.sectors[BCH_DATA_btree], ++ stats.sectors[BCH_DATA_user], ++ stats.sectors[BCH_DATA_cached], ++ stats.sectors_ec, ++ stats.sectors_fragmented, ++ c->copygc_threshold, ++ c->freelist_wait.list.first ? "waiting" : "empty", ++ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, ++ BTREE_NODE_OPEN_BUCKET_RESERVE, ++ c->open_buckets_wait.list.first ? "waiting" : "empty", ++ nr[BCH_DATA_btree], ++ nr[BCH_DATA_user], ++ c->btree_reserve_cache_nr); ++} ++ ++static const char * const bch2_rw[] = { ++ "read", ++ "write", ++ NULL ++}; ++ ++static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ int rw, i; ++ ++ for (rw = 0; rw < 2; rw++) { ++ pr_buf(out, "%s:\n", bch2_rw[rw]); ++ ++ for (i = 1; i < BCH_DATA_NR; i++) ++ pr_buf(out, "%-12s:%12llu\n", ++ bch2_data_types[i], ++ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); ++ } ++} ++ ++SHOW(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ sysfs_printf(uuid, "%pU\n", ca->uuid.b); ++ ++ sysfs_print(bucket_size, bucket_bytes(ca)); ++ sysfs_print(block_size, block_bytes(c)); ++ sysfs_print(first_bucket, ca->mi.first_bucket); ++ sysfs_print(nbuckets, ca->mi.nbuckets); ++ sysfs_print(durability, ca->mi.durability); ++ sysfs_print(discard, ca->mi.discard); ++ ++ if (attr == &sysfs_label) { ++ if (ca->mi.group) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(&out, &c->disk_sb, ++ ca->mi.group - 1); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_has_data) { ++ bch2_flags_to_text(&out, bch2_data_types, ++ bch2_dev_has_data(c, ca)); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_cache_replacement_policy) { ++ bch2_string_opt_to_text(&out, ++ bch2_cache_replacement_policies, ++ ca->mi.replacement); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_state_rw) { ++ bch2_string_opt_to_text(&out, bch2_dev_state, ++ ca->mi.state); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_iodone) { ++ dev_iodone_to_text(&out, ca); ++ return out.pos - buf; ++ } ++ ++ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); ++ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); ++ ++ if (attr == &sysfs_io_latency_stats_read) { ++ bch2_time_stats_to_text(&out, &ca->io_latency[READ]); ++ return out.pos - buf; ++ } ++ if (attr == &sysfs_io_latency_stats_write) { ++ bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); ++ return out.pos - buf; ++ } ++ ++ sysfs_printf(congested, "%u%%", ++ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) ++ * 100 / CONGESTED_MAX); ++ ++ if (attr == &sysfs_bucket_quantiles_last_read) ++ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; ++ if (attr == &sysfs_bucket_quantiles_last_write) ++ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; ++ if (attr == &sysfs_bucket_quantiles_fragmentation) ++ return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; ++ if (attr == &sysfs_bucket_quantiles_oldest_gen) ++ return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; ++ ++ if (attr == &sysfs_reserve_stats) { ++ reserve_stats_to_text(&out, ca); ++ return out.pos - buf; ++ } ++ if (attr == &sysfs_alloc_debug) { ++ dev_alloc_debug_to_text(&out, ca); ++ return 
out.pos - buf; ++ } ++ ++ return 0; ++} ++ ++STORE(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ struct bch_member *mi; ++ ++ if (attr == &sysfs_discard) { ++ bool v = strtoul_or_return(buf); ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if (v != BCH_MEMBER_DISCARD(mi)) { ++ SET_BCH_MEMBER_DISCARD(mi, v); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (attr == &sysfs_cache_replacement_policy) { ++ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); ++ ++ if (v < 0) ++ return v; ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { ++ SET_BCH_MEMBER_REPLACEMENT(mi, v); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (attr == &sysfs_label) { ++ char *tmp; ++ int ret; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ ret = bch2_dev_group_set(c, ca, strim(tmp)); ++ kfree(tmp); ++ if (ret) ++ return ret; ++ } ++ ++ if (attr == &sysfs_wake_allocator) ++ bch2_wake_allocator(ca); ++ ++ return size; ++} ++SYSFS_OPS(bch2_dev); ++ ++struct attribute *bch2_dev_files[] = { ++ &sysfs_uuid, ++ &sysfs_bucket_size, ++ &sysfs_block_size, ++ &sysfs_first_bucket, ++ &sysfs_nbuckets, ++ &sysfs_durability, ++ ++ /* settings: */ ++ &sysfs_discard, ++ &sysfs_cache_replacement_policy, ++ &sysfs_state_rw, ++ &sysfs_label, ++ ++ &sysfs_has_data, ++ &sysfs_iodone, ++ ++ &sysfs_io_latency_read, ++ &sysfs_io_latency_write, ++ &sysfs_io_latency_stats_read, ++ &sysfs_io_latency_stats_write, ++ &sysfs_congested, ++ ++ /* alloc info - other stats: */ ++ &sysfs_bucket_quantiles_last_read, ++ &sysfs_bucket_quantiles_last_write, ++ &sysfs_bucket_quantiles_fragmentation, ++ &sysfs_bucket_quantiles_oldest_gen, ++ ++ &sysfs_reserve_stats, ++ ++ /* debug: */ ++ &sysfs_alloc_debug, ++ &sysfs_wake_allocator, ++ NULL ++}; ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h +new file mode 100644 +index 000000000000..525fd05d91f7 +--- /dev/null ++++ b/fs/bcachefs/sysfs.h +@@ -0,0 +1,44 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SYSFS_H_ ++#define _BCACHEFS_SYSFS_H_ ++ ++#include ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++struct attribute; ++struct sysfs_ops; ++ ++extern struct attribute *bch2_fs_files[]; ++extern struct attribute *bch2_fs_internal_files[]; ++extern struct attribute *bch2_fs_opts_dir_files[]; ++extern struct attribute *bch2_fs_time_stats_files[]; ++extern struct attribute *bch2_dev_files[]; ++ ++extern struct sysfs_ops bch2_fs_sysfs_ops; ++extern struct sysfs_ops bch2_fs_internal_sysfs_ops; ++extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++extern struct sysfs_ops bch2_dev_sysfs_ops; ++ ++int bch2_opts_create_sysfs_files(struct kobject *); ++ ++#else ++ ++static struct attribute *bch2_fs_files[] = {}; ++static struct attribute *bch2_fs_internal_files[] = {}; ++static struct attribute *bch2_fs_opts_dir_files[] = {}; ++static struct attribute *bch2_fs_time_stats_files[] = {}; ++static struct attribute *bch2_dev_files[] = {}; ++ ++static const struct sysfs_ops bch2_fs_sysfs_ops; ++static const struct sysfs_ops bch2_fs_internal_sysfs_ops; ++static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++static const struct sysfs_ops 
bch2_dev_sysfs_ops; ++ ++static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } ++ ++#endif /* NO_BCACHEFS_SYSFS */ ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +new file mode 100644 +index 000000000000..4dcace650416 +--- /dev/null ++++ b/fs/bcachefs/tests.c +@@ -0,0 +1,725 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_TESTS ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "journal_reclaim.h" ++#include "tests.h" ++ ++#include "linux/kthread.h" ++#include "linux/random.h" ++ ++static void delete_test_keys(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++} ++ ++/* unit tests */ ++ ++static void test_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ BUG_ON(ret); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ BUG_ON(ret); ++ ++ pr_info("deleting once"); ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ pr_info("deleting twice"); ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_delete_written(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ BUG_ON(ret); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ BUG_ON(ret); ++ ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, ++ POS_MIN, 0, k, ret) { ++ if (k.k->p.inode) ++ break; ++ ++ BUG_ON(k.k->p.offset != i++); ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) ++ BUG_ON(k.k->p.offset != --i); ++ ++ BUG_ON(i); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test extents"); ++ ++ for (i = 0; i < nr; i += 8) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ 
k.k.p.offset = i + 8; ++ k.k.size = 8; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS_MIN, 0, k, ret) { ++ BUG_ON(bkey_start_offset(k.k) != i); ++ i = k.k->p.offset; ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { ++ BUG_ON(k.k->p.offset != i); ++ i = bkey_start_offset(k.k); ++ } ++ ++ BUG_ON(i); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_slots(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i * 2; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ 0, k, ret) { ++ if (k.k->p.inode) ++ break; ++ ++ BUG_ON(k.k->p.offset != i); ++ i += 2; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ ++ BUG_ON(i != nr * 2); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ BTREE_ITER_SLOTS, k, ret) { ++ BUG_ON(k.k->p.offset != i); ++ BUG_ON(bkey_deleted(k.k) != (i & 1)); ++ ++ i++; ++ if (i == nr * 2) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i += 16) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 16; ++ k.k.size = 8; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, ++ 0, k, ret) { ++ BUG_ON(bkey_start_offset(k.k) != i + 8); ++ BUG_ON(k.k->size != 8); ++ i += 16; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS, k, ret) { ++ BUG_ON(bkey_deleted(k.k) != !(i % 16)); ++ ++ BUG_ON(bkey_start_offset(k.k) != i); ++ BUG_ON(k.k->size != 8); ++ i = k.k->p.offset; ++ ++ if (i == nr) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++/* ++ * XXX: we really want to make sure we've got a btree with depth > 0 for these ++ * tests ++ */ ++static void test_peek_end(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_peek_end_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, 
BTREE_ID_EXTENTS, POS_MIN, 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ bch2_trans_exit(&trans); ++} ++ ++/* extent unit tests */ ++ ++u64 test_version; ++ ++static void insert_test_extent(struct bch_fs *c, ++ u64 start, u64 end) ++{ ++ struct bkey_i_cookie k; ++ int ret; ++ ++ //pr_info("inserting %llu-%llu v %llu", start, end, test_version); ++ ++ bkey_cookie_init(&k.k_i); ++ k.k_i.k.p.offset = end; ++ k.k_i.k.size = end - start; ++ k.k_i.k.version.lo = test_version++; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++} ++ ++static void __test_extent_overwrite(struct bch_fs *c, ++ u64 e1_start, u64 e1_end, ++ u64 e2_start, u64 e2_end) ++{ ++ insert_test_extent(c, e1_start, e1_end); ++ insert_test_extent(c, e2_start, e2_end); ++ ++ delete_test_keys(c); ++} ++ ++static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 0, 32); ++ __test_extent_overwrite(c, 8, 64, 0, 32); ++} ++ ++static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 32, 64); ++ __test_extent_overwrite(c, 0, 64, 32, 72); ++} ++ ++static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 32, 40); ++} ++ ++static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 32, 64, 0, 64); ++ __test_extent_overwrite(c, 32, 64, 0, 128); ++ __test_extent_overwrite(c, 32, 64, 32, 64); ++ __test_extent_overwrite(c, 32, 64, 32, 128); ++} ++ ++/* perf tests */ ++ ++static u64 test_rand(void) ++{ ++ u64 v; ++#if 0 ++ v = prandom_u32(); ++#else ++ prandom_bytes(&v, sizeof(v)); ++#endif ++ return v; ++} ++ ++static void rand_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct bkey_i_cookie k; ++ int ret; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = test_rand(); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); ++ ++ BUG_ON(ret); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void rand_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ POS(0, test_rand()), 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void rand_mixed(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ POS(0, test_rand()), 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ ++ if (!(i & 3) && k.k) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p = iter->pos; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ ++ BUG_ON(ret); ++ } ++ ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static int __do_delete(struct btree_trans *trans, struct bpos pos) ++{ ++ struct btree_iter *iter; ++ struct bkey_i delete; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, ++ 
BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ bkey_init(&delete.k); ++ delete.k.p = k.k->p; ++ ++ bch2_trans_update(trans, iter, &delete, 0); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static void rand_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ int ret; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ struct bpos pos = POS(0, test_rand()); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __do_delete(&trans, pos)); ++ BUG_ON(ret); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie insert; ++ int ret; ++ u64 i = 0; ++ ++ bkey_cookie_init(&insert.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ insert.k.p = iter->pos; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &insert.k_i, 0)); ++ ++ BUG_ON(ret); ++ ++ if (++i == nr) ++ break; ++ } ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) ++ ; ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_overwrite(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ BTREE_ITER_INTENT, k, ret) { ++ struct bkey_i_cookie u; ++ ++ bkey_reassemble(&u.k_i, k); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &u.k_i, 0)); ++ ++ BUG_ON(ret); ++ } ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_delete(struct bch_fs *c, u64 nr) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++} ++ ++typedef void (*perf_test_fn)(struct bch_fs *, u64); ++ ++struct test_job { ++ struct bch_fs *c; ++ u64 nr; ++ unsigned nr_threads; ++ perf_test_fn fn; ++ ++ atomic_t ready; ++ wait_queue_head_t ready_wait; ++ ++ atomic_t done; ++ struct completion done_completion; ++ ++ u64 start; ++ u64 finish; ++}; ++ ++static int btree_perf_test_thread(void *data) ++{ ++ struct test_job *j = data; ++ ++ if (atomic_dec_and_test(&j->ready)) { ++ wake_up(&j->ready_wait); ++ j->start = sched_clock(); ++ } else { ++ wait_event(j->ready_wait, !atomic_read(&j->ready)); ++ } ++ ++ j->fn(j->c, j->nr / j->nr_threads); ++ ++ if (atomic_dec_and_test(&j->done)) { ++ j->finish = sched_clock(); ++ complete(&j->done_completion); ++ } ++ ++ return 0; ++} ++ ++void bch2_btree_perf_test(struct bch_fs *c, const char *testname, ++ u64 nr, unsigned nr_threads) ++{ ++ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; ++ char name_buf[20], nr_buf[20], per_sec_buf[20]; ++ unsigned i; ++ u64 time; ++ ++ atomic_set(&j.ready, nr_threads); ++ init_waitqueue_head(&j.ready_wait); ++ ++ atomic_set(&j.done, nr_threads); ++ init_completion(&j.done_completion); ++ ++#define perf_test(_test) \ ++ if (!strcmp(testname, #_test)) j.fn = _test ++ ++ perf_test(rand_insert); ++ 
perf_test(rand_lookup); ++ perf_test(rand_mixed); ++ perf_test(rand_delete); ++ ++ perf_test(seq_insert); ++ perf_test(seq_lookup); ++ perf_test(seq_overwrite); ++ perf_test(seq_delete); ++ ++ /* a unit test, not a perf test: */ ++ perf_test(test_delete); ++ perf_test(test_delete_written); ++ perf_test(test_iterate); ++ perf_test(test_iterate_extents); ++ perf_test(test_iterate_slots); ++ perf_test(test_iterate_slots_extents); ++ perf_test(test_peek_end); ++ perf_test(test_peek_end_extents); ++ ++ perf_test(test_extent_overwrite_front); ++ perf_test(test_extent_overwrite_back); ++ perf_test(test_extent_overwrite_middle); ++ perf_test(test_extent_overwrite_all); ++ ++ if (!j.fn) { ++ pr_err("unknown test %s", testname); ++ return; ++ } ++ ++ //pr_info("running test %s:", testname); ++ ++ if (nr_threads == 1) ++ btree_perf_test_thread(&j); ++ else ++ for (i = 0; i < nr_threads; i++) ++ kthread_run(btree_perf_test_thread, &j, ++ "bcachefs perf test[%u]", i); ++ ++ while (wait_for_completion_interruptible(&j.done_completion)) ++ ; ++ ++ time = j.finish - j.start; ++ ++ scnprintf(name_buf, sizeof(name_buf), "%s:", testname); ++ bch2_hprint(&PBUF(nr_buf), nr); ++ bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); ++ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", ++ name_buf, nr_buf, nr_threads, ++ time / NSEC_PER_SEC, ++ time * nr_threads / nr, ++ per_sec_buf); ++} ++ ++#endif /* CONFIG_BCACHEFS_TESTS */ +diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h +new file mode 100644 +index 000000000000..551d0764225e +--- /dev/null ++++ b/fs/bcachefs/tests.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_TEST_H ++#define _BCACHEFS_TEST_H ++ ++struct bch_fs; ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ ++void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); ++ ++#else ++ ++#endif /* CONFIG_BCACHEFS_TESTS */ ++ ++#endif /* _BCACHEFS_TEST_H */ +diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c +new file mode 100644 +index 000000000000..59e8dfa3d245 +--- /dev/null ++++ b/fs/bcachefs/trace.c +@@ -0,0 +1,12 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_types.h" ++#include "buckets.h" ++#include "btree_types.h" ++#include "keylist.h" ++ ++#include ++#include "keylist.h" ++ ++#define CREATE_TRACE_POINTS ++#include +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +new file mode 100644 +index 000000000000..fd4044a6a08f +--- /dev/null ++++ b/fs/bcachefs/util.c +@@ -0,0 +1,907 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * random utiility code, for bcache but in theory not specific to bcache ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "eytzinger.h" ++#include "util.h" ++ ++static const char si_units[] = "?kMGTPEZY"; ++ ++static int __bch2_strtoh(const char *cp, u64 *res, ++ u64 t_max, bool t_signed) ++{ ++ bool positive = *cp != '-'; ++ unsigned u; ++ u64 v = 0; ++ ++ if (*cp == '+' || *cp == '-') ++ cp++; ++ ++ if (!isdigit(*cp)) ++ return -EINVAL; ++ ++ do { ++ if (v > U64_MAX / 10) ++ return -ERANGE; ++ v *= 10; ++ if (v > U64_MAX - (*cp - '0')) ++ return -ERANGE; ++ v += *cp - '0'; ++ cp++; ++ } while (isdigit(*cp)); ++ ++ for (u = 1; u < strlen(si_units); u++) ++ if (*cp == si_units[u]) { ++ cp++; ++ goto got_unit; ++ } ++ u = 0; ++got_unit: ++ if (*cp == '\n') ++ cp++; ++ if (*cp) ++ return -EINVAL; ++ ++ if (fls64(v) + u * 10 > 64) ++ return -ERANGE; ++ ++ v <<= u * 10; ++ ++ if (positive) { ++ if (v > t_max) ++ return -ERANGE; ++ } else { ++ if (v && !t_signed) ++ return -ERANGE; ++ ++ if (v > t_max + 1) ++ return -ERANGE; ++ v = -v; ++ } ++ ++ *res = v; ++ return 0; ++} ++ ++#define STRTO_H(name, type) \ ++int bch2_ ## name ## _h(const char *cp, type *res) \ ++{ \ ++ u64 v; \ ++ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ ++ ANYSINT_MAX(type) != ((type) ~0ULL)); \ ++ *res = v; \ ++ return ret; \ ++} ++ ++STRTO_H(strtoint, int) ++STRTO_H(strtouint, unsigned int) ++STRTO_H(strtoll, long long) ++STRTO_H(strtoull, unsigned long long) ++STRTO_H(strtou64, u64) ++ ++void bch2_hprint(struct printbuf *buf, s64 v) ++{ ++ int u, t = 0; ++ ++ for (u = 0; v >= 1024 || v <= -1024; u++) { ++ t = v & ~(~0U << 10); ++ v >>= 10; ++ } ++ ++ pr_buf(buf, "%lli", v); ++ ++ /* ++ * 103 is magic: t is in the range [-1023, 1023] and we want ++ * to turn it into [-9, 9] ++ */ ++ if (u && v < 100 && v > -100) ++ pr_buf(buf, ".%i", t / 103); ++ if (u) ++ pr_buf(buf, "%c", si_units[u]); ++} ++ ++void bch2_string_opt_to_text(struct printbuf *out, ++ const char * const list[], ++ size_t selected) ++{ ++ size_t i; ++ ++ for (i = 0; list[i]; i++) ++ pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); ++} ++ ++void bch2_flags_to_text(struct printbuf *out, ++ const char * const list[], u64 flags) ++{ ++ unsigned bit, nr = 0; ++ bool first = true; ++ ++ if (out->pos != out->end) ++ *out->pos = '\0'; ++ ++ while (list[nr]) ++ nr++; ++ ++ while (flags && (bit = __ffs(flags)) < nr) { ++ if (!first) ++ pr_buf(out, ","); ++ first = false; ++ pr_buf(out, "%s", list[bit]); ++ flags ^= 1 << bit; ++ } ++} ++ ++u64 bch2_read_flag_list(char *opt, const char * const list[]) ++{ ++ u64 ret = 0; ++ char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); ++ ++ if (!d) ++ return -ENOMEM; ++ ++ s = strim(d); ++ ++ while ((p = strsep(&s, ","))) { ++ int flag = match_string(list, -1, p); ++ if (flag < 0) { ++ ret = -1; ++ break; ++ } ++ ++ ret |= 1 << flag; ++ } ++ ++ kfree(d); ++ ++ return ret; ++} ++ ++bool bch2_is_zero(const void *_p, size_t n) ++{ ++ const char *p = _p; ++ size_t i; ++ ++ for (i = 0; i < n; i++) ++ if (p[i]) ++ return false; ++ return true; ++} ++ ++static void bch2_quantiles_update(struct quantiles *q, u64 v) ++{ ++ unsigned i = 0; ++ ++ while (i < ARRAY_SIZE(q->entries)) { ++ struct quantile_entry *e = q->entries + i; ++ ++ if (unlikely(!e->step)) { ++ e->m = v; ++ e->step = max_t(unsigned, v / 2, 1024); ++ } else if (e->m > v) { ++ e->m = e->m >= e->step ++ ? 
e->m - e->step ++ : 0; ++ } else if (e->m < v) { ++ e->m = e->m + e->step > e->m ++ ? e->m + e->step ++ : U32_MAX; ++ } ++ ++ if ((e->m > v ? e->m - v : v - e->m) < e->step) ++ e->step = max_t(unsigned, e->step / 2, 1); ++ ++ if (v >= e->m) ++ break; ++ ++ i = eytzinger0_child(i, v > e->m); ++ } ++} ++ ++/* time stats: */ ++ ++static void bch2_time_stats_update_one(struct time_stats *stats, ++ u64 start, u64 end) ++{ ++ u64 duration, freq; ++ ++ duration = time_after64(end, start) ++ ? end - start : 0; ++ freq = time_after64(end, stats->last_event) ++ ? end - stats->last_event : 0; ++ ++ stats->count++; ++ ++ stats->average_duration = stats->average_duration ++ ? ewma_add(stats->average_duration, duration, 6) ++ : duration; ++ ++ stats->average_frequency = stats->average_frequency ++ ? ewma_add(stats->average_frequency, freq, 6) ++ : freq; ++ ++ stats->max_duration = max(stats->max_duration, duration); ++ ++ stats->last_event = end; ++ ++ bch2_quantiles_update(&stats->quantiles, duration); ++} ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) ++{ ++ unsigned long flags; ++ ++ if (!stats->buffer) { ++ spin_lock_irqsave(&stats->lock, flags); ++ bch2_time_stats_update_one(stats, start, end); ++ ++ if (stats->average_frequency < 32 && ++ stats->count > 1024) ++ stats->buffer = ++ alloc_percpu_gfp(struct time_stat_buffer, ++ GFP_ATOMIC); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ } else { ++ struct time_stat_buffer_entry *i; ++ struct time_stat_buffer *b; ++ ++ preempt_disable(); ++ b = this_cpu_ptr(stats->buffer); ++ ++ BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); ++ b->entries[b->nr++] = (struct time_stat_buffer_entry) { ++ .start = start, ++ .end = end ++ }; ++ ++ if (b->nr == ARRAY_SIZE(b->entries)) { ++ spin_lock_irqsave(&stats->lock, flags); ++ for (i = b->entries; ++ i < b->entries + ARRAY_SIZE(b->entries); ++ i++) ++ bch2_time_stats_update_one(stats, i->start, i->end); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ ++ b->nr = 0; ++ } ++ ++ preempt_enable(); ++ } ++} ++ ++static const struct time_unit { ++ const char *name; ++ u32 nsecs; ++} time_units[] = { ++ { "ns", 1 }, ++ { "us", NSEC_PER_USEC }, ++ { "ms", NSEC_PER_MSEC }, ++ { "sec", NSEC_PER_SEC }, ++}; ++ ++static const struct time_unit *pick_time_units(u64 ns) ++{ ++ const struct time_unit *u; ++ ++ for (u = time_units; ++ u + 1 < time_units + ARRAY_SIZE(time_units) && ++ ns >= u[1].nsecs << 1; ++ u++) ++ ; ++ ++ return u; ++} ++ ++static void pr_time_units(struct printbuf *out, u64 ns) ++{ ++ const struct time_unit *u = pick_time_units(ns); ++ ++ pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); ++} ++ ++void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) ++{ ++ const struct time_unit *u; ++ u64 freq = READ_ONCE(stats->average_frequency); ++ u64 q, last_q = 0; ++ int i; ++ ++ pr_buf(out, "count:\t\t%llu\n", ++ stats->count); ++ pr_buf(out, "rate:\t\t%llu/sec\n", ++ freq ? 
div64_u64(NSEC_PER_SEC, freq) : 0); ++ ++ pr_buf(out, "frequency:\t"); ++ pr_time_units(out, freq); ++ ++ pr_buf(out, "\navg duration:\t"); ++ pr_time_units(out, stats->average_duration); ++ ++ pr_buf(out, "\nmax duration:\t"); ++ pr_time_units(out, stats->max_duration); ++ ++ i = eytzinger0_first(NR_QUANTILES); ++ u = pick_time_units(stats->quantiles.entries[i].m); ++ ++ pr_buf(out, "\nquantiles (%s):\t", u->name); ++ eytzinger0_for_each(i, NR_QUANTILES) { ++ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; ++ ++ q = max(stats->quantiles.entries[i].m, last_q); ++ pr_buf(out, "%llu%s", ++ div_u64(q, u->nsecs), ++ is_last ? "\n" : " "); ++ last_q = q; ++ } ++} ++ ++void bch2_time_stats_exit(struct time_stats *stats) ++{ ++ free_percpu(stats->buffer); ++} ++ ++void bch2_time_stats_init(struct time_stats *stats) ++{ ++ memset(stats, 0, sizeof(*stats)); ++ spin_lock_init(&stats->lock); ++} ++ ++/* ratelimit: */ ++ ++/** ++ * bch2_ratelimit_delay() - return how long to delay until the next time to do ++ * some work ++ * ++ * @d - the struct bch_ratelimit to update ++ * ++ * Returns the amount of time to delay by, in jiffies ++ */ ++u64 bch2_ratelimit_delay(struct bch_ratelimit *d) ++{ ++ u64 now = local_clock(); ++ ++ return time_after64(d->next, now) ++ ? nsecs_to_jiffies(d->next - now) ++ : 0; ++} ++ ++/** ++ * bch2_ratelimit_increment() - increment @d by the amount of work done ++ * ++ * @d - the struct bch_ratelimit to update ++ * @done - the amount of work done, in arbitrary units ++ */ ++void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) ++{ ++ u64 now = local_clock(); ++ ++ d->next += div_u64(done * NSEC_PER_SEC, d->rate); ++ ++ if (time_before64(now + NSEC_PER_SEC, d->next)) ++ d->next = now + NSEC_PER_SEC; ++ ++ if (time_after64(now - NSEC_PER_SEC * 2, d->next)) ++ d->next = now - NSEC_PER_SEC * 2; ++} ++ ++/* pd controller: */ ++ ++/* ++ * Updates pd_controller. Attempts to scale inputed values to units per second. ++ * @target: desired value ++ * @actual: current value ++ * ++ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing ++ * it makes actual go down. 
++ */ ++void bch2_pd_controller_update(struct bch_pd_controller *pd, ++ s64 target, s64 actual, int sign) ++{ ++ s64 proportional, derivative, change; ++ ++ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; ++ ++ if (seconds_since_update == 0) ++ return; ++ ++ pd->last_update = jiffies; ++ ++ proportional = actual - target; ++ proportional *= seconds_since_update; ++ proportional = div_s64(proportional, pd->p_term_inverse); ++ ++ derivative = actual - pd->last_actual; ++ derivative = div_s64(derivative, seconds_since_update); ++ derivative = ewma_add(pd->smoothed_derivative, derivative, ++ (pd->d_term / seconds_since_update) ?: 1); ++ derivative = derivative * pd->d_term; ++ derivative = div_s64(derivative, pd->p_term_inverse); ++ ++ change = proportional + derivative; ++ ++ /* Don't increase rate if not keeping up */ ++ if (change > 0 && ++ pd->backpressure && ++ time_after64(local_clock(), ++ pd->rate.next + NSEC_PER_MSEC)) ++ change = 0; ++ ++ change *= (sign * -1); ++ ++ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, ++ 1, UINT_MAX); ++ ++ pd->last_actual = actual; ++ pd->last_derivative = derivative; ++ pd->last_proportional = proportional; ++ pd->last_change = change; ++ pd->last_target = target; ++} ++ ++void bch2_pd_controller_init(struct bch_pd_controller *pd) ++{ ++ pd->rate.rate = 1024; ++ pd->last_update = jiffies; ++ pd->p_term_inverse = 6000; ++ pd->d_term = 30; ++ pd->d_smooth = pd->d_term; ++ pd->backpressure = 1; ++} ++ ++size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) ++{ ++ /* 2^64 - 1 is 20 digits, plus null byte */ ++ char rate[21]; ++ char actual[21]; ++ char target[21]; ++ char proportional[21]; ++ char derivative[21]; ++ char change[21]; ++ s64 next_io; ++ ++ bch2_hprint(&PBUF(rate), pd->rate.rate); ++ bch2_hprint(&PBUF(actual), pd->last_actual); ++ bch2_hprint(&PBUF(target), pd->last_target); ++ bch2_hprint(&PBUF(proportional), pd->last_proportional); ++ bch2_hprint(&PBUF(derivative), pd->last_derivative); ++ bch2_hprint(&PBUF(change), pd->last_change); ++ ++ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); ++ ++ return sprintf(buf, ++ "rate:\t\t%s/sec\n" ++ "target:\t\t%s\n" ++ "actual:\t\t%s\n" ++ "proportional:\t%s\n" ++ "derivative:\t%s\n" ++ "change:\t\t%s/sec\n" ++ "next io:\t%llims\n", ++ rate, target, actual, proportional, ++ derivative, change, next_io); ++} ++ ++/* misc: */ ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t size) ++{ ++ while (size) { ++ struct page *page = is_vmalloc_addr(base) ++ ? 
vmalloc_to_page(base) ++ : virt_to_page(base); ++ unsigned offset = offset_in_page(base); ++ unsigned len = min_t(size_t, PAGE_SIZE - offset, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, offset)); ++ size -= len; ++ base += len; ++ } ++} ++ ++int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) ++{ ++ while (size) { ++ struct page *page = alloc_page(gfp_mask); ++ unsigned len = min(PAGE_SIZE, size); ++ ++ if (!page) ++ return -ENOMEM; ++ ++ BUG_ON(!bio_add_page(bio, page, len, 0)); ++ size -= len; ++ } ++ ++ return 0; ++} ++ ++size_t bch2_rand_range(size_t max) ++{ ++ size_t rand; ++ ++ if (!max) ++ return 0; ++ ++ do { ++ rand = get_random_long(); ++ rand &= roundup_pow_of_two(max) - 1; ++ } while (rand >= max); ++ ++ return rand; ++} ++ ++void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, dst, iter, dst_iter) { ++ void *dstp = kmap_atomic(bv.bv_page); ++ memcpy(dstp + bv.bv_offset, src, bv.bv_len); ++ kunmap_atomic(dstp); ++ ++ src += bv.bv_len; ++ } ++} ++ ++void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, src, iter, src_iter) { ++ void *srcp = kmap_atomic(bv.bv_page); ++ memcpy(dst, srcp + bv.bv_offset, bv.bv_len); ++ kunmap_atomic(srcp); ++ ++ dst += bv.bv_len; ++ } ++} ++ ++void bch_scnmemcpy(struct printbuf *out, ++ const char *src, size_t len) ++{ ++ size_t n = printbuf_remaining(out); ++ ++ if (n) { ++ n = min(n - 1, len); ++ memcpy(out->pos, src, n); ++ out->pos += n; ++ *out->pos = '\0'; ++ } ++} ++ ++#include "eytzinger.h" ++ ++static int alignment_ok(const void *base, size_t align) ++{ ++ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || ++ ((unsigned long)base & (align - 1)) == 0; ++} ++ ++static void u32_swap(void *a, void *b, size_t size) ++{ ++ u32 t = *(u32 *)a; ++ *(u32 *)a = *(u32 *)b; ++ *(u32 *)b = t; ++} ++ ++static void u64_swap(void *a, void *b, size_t size) ++{ ++ u64 t = *(u64 *)a; ++ *(u64 *)a = *(u64 *)b; ++ *(u64 *)b = t; ++} ++ ++static void generic_swap(void *a, void *b, size_t size) ++{ ++ char t; ++ ++ do { ++ t = *(char *)a; ++ *(char *)a++ = *(char *)b; ++ *(char *)b++ = t; ++ } while (--size > 0); ++} ++ ++static inline int do_cmp(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ size_t l, size_t r) ++{ ++ return cmp_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++static inline void do_swap(void *base, size_t n, size_t size, ++ void (*swap_func)(void *, void *, size_t), ++ size_t l, size_t r) ++{ ++ swap_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++void eytzinger0_sort(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)) ++{ ++ int i, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for (i = n / 2 - 1; i >= 0; --i) { ++ for (r = i; r * 2 + 1 < n; r = c) { ++ c = r * 2 + 1; ++ ++ if (c + 1 < n && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, 
c); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - 1; i > 0; --i) { ++ do_swap(base, n, size, swap_func, 0, i); ++ ++ for (r = 0; r * 2 + 1 < i; r = c) { ++ c = r * 2 + 1; ++ ++ if (c + 1 < i && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, c); ++ } ++ } ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t size)) ++{ ++ /* pre-scale counters for performance */ ++ int i = (num/2 - 1) * size, n = num * size, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for ( ; i >= 0; i -= size) { ++ for (r = i; r * 2 + size < n; r = c) { ++ c = r * 2 + size; ++ if (c < n - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - size; i > 0; i -= size) { ++ swap_func(base, base + i, size); ++ for (r = 0; r * 2 + size < i; r = c) { ++ c = r * 2 + size; ++ if (c < i - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++} ++ ++static void mempool_free_vp(void *element, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ vpfree(element, size); ++} ++ ++static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ return vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) ++{ ++ return size < PAGE_SIZE ++ ? 
mempool_init_kmalloc_pool(pool, min_nr, size) ++ : mempool_init(pool, min_nr, mempool_alloc_vp, ++ mempool_free_vp, (void *) size); ++} ++ ++#if 0 ++void eytzinger1_test(void) ++{ ++ unsigned inorder, eytz, size; ++ ++ pr_info("1 based eytzinger test:"); ++ ++ for (size = 2; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger1_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); ++ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); ++ ++ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); ++ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); ++ ++ inorder = 1; ++ eytzinger1_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger1_last(size) && ++ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++void eytzinger0_test(void) ++{ ++ ++ unsigned inorder, eytz, size; ++ ++ pr_info("0 based eytzinger test:"); ++ ++ for (size = 1; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger0_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); ++ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); ++ ++ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); ++ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); ++ ++ inorder = 0; ++ eytzinger0_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger0_last(size) && ++ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++static inline int cmp_u16(const void *_l, const void *_r, size_t size) ++{ ++ const u16 *l = _l, *r = _r; ++ ++ return (*l > *r) - (*r - *l); ++} ++ ++static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) ++{ ++ int i, c1 = -1, c2 = -1; ++ ssize_t r; ++ ++ r = eytzinger0_find_le(test_array, nr, ++ sizeof(test_array[0]), ++ cmp_u16, &search); ++ if (r >= 0) ++ c1 = test_array[r]; ++ ++ for (i = 0; i < nr; i++) ++ if (test_array[i] <= search && test_array[i] > c2) ++ c2 = test_array[i]; ++ ++ if (c1 != c2) { ++ eytzinger0_for_each(i, nr) ++ pr_info("[%3u] = %12u", i, test_array[i]); ++ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", ++ i, r, c1, c2); ++ } ++} ++ ++void eytzinger0_find_test(void) ++{ ++ unsigned i, nr, allocated = 1 << 12; ++ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); ++ ++ for (nr = 1; nr < allocated; nr++) { ++ pr_info("testing %u elems", nr); ++ ++ get_random_bytes(test_array, nr * sizeof(test_array[0])); ++ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); ++ ++ /* verify array is sorted correctly: */ ++ eytzinger0_for_each(i, nr) ++ BUG_ON(i != eytzinger0_last(nr) && ++ test_array[i] > test_array[eytzinger0_next(i, nr)]); ++ ++ for (i = 0; i < U16_MAX; i += 1 << 12) ++ eytzinger0_find_test_val(test_array, nr, i); ++ ++ for (i = 0; i < nr; i++) { ++ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); ++ eytzinger0_find_test_val(test_array, nr, test_array[i]); ++ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); ++ } ++ } ++ ++ kfree(test_array); ++} ++#endif ++ ++/* ++ * Accumulate percpu counters onto one cpu's copy - only valid when access ++ * 
against any percpu counter is guarded against ++ */ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) ++{ ++ u64 *ret; ++ int cpu; ++ ++ preempt_disable(); ++ ret = this_cpu_ptr(p); ++ preempt_enable(); ++ ++ for_each_possible_cpu(cpu) { ++ u64 *i = per_cpu_ptr(p, cpu); ++ ++ if (i != ret) { ++ acc_u64s(ret, i, nr); ++ memset(i, 0, nr * sizeof(u64)); ++ } ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +new file mode 100644 +index 000000000000..f48c6380684f +--- /dev/null ++++ b/fs/bcachefs/util.h +@@ -0,0 +1,761 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_UTIL_H ++#define _BCACHEFS_UTIL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) ++#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT) ++ ++struct closure; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++#define EBUG_ON(cond) BUG_ON(cond) ++#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) ++#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) ++#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) ++#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) ++#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) ++#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) ++#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) ++#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) ++#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) ++#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) ++ ++#define memcpy(dst, src, len) \ ++({ \ ++ void *_dst = (dst); \ ++ const void *_src = (src); \ ++ size_t _len = (len); \ ++ \ ++ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ ++ (void *) (_dst) + (_len) <= (void *) (_src))); \ ++ memcpy(_dst, _src, _len); \ ++}) ++ ++#else /* DEBUG */ ++ ++#define EBUG_ON(cond) ++#define atomic_dec_bug(v) atomic_dec(v) ++#define atomic_inc_bug(v, i) atomic_inc(v) ++#define atomic_sub_bug(i, v) atomic_sub(i, v) ++#define atomic_add_bug(i, v) atomic_add(i, v) ++#define atomic_long_dec_bug(v) atomic_long_dec(v) ++#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) ++#define atomic64_dec_bug(v) atomic64_dec(v) ++#define atomic64_inc_bug(v, i) atomic64_inc(v) ++#define atomic64_sub_bug(i, v) atomic64_sub(i, v) ++#define atomic64_add_bug(i, v) atomic64_add(i, v) ++ ++#endif ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++#define CPU_BIG_ENDIAN 0 ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++#define CPU_BIG_ENDIAN 1 ++#endif ++ ++/* type hackery */ ++ ++#define type_is_exact(_val, _type) \ ++ __builtin_types_compatible_p(typeof(_val), _type) ++ ++#define type_is(_val, _type) \ ++ (__builtin_types_compatible_p(typeof(_val), _type) || \ ++ __builtin_types_compatible_p(typeof(_val), const _type)) ++ ++/* Userspace doesn't align allocations as nicely as the kernel allocators: */ ++static inline size_t buf_pages(void *p, size_t len) ++{ ++ return DIV_ROUND_UP(len + ++ ((unsigned long) p & (PAGE_SIZE - 1)), ++ PAGE_SIZE); ++} ++ ++static inline void vpfree(void *p, size_t size) ++{ ++ if (is_vmalloc_addr(p)) ++ vfree(p); ++ else ++ free_pages((unsigned long) p, get_order(size)); ++} ++ ++static inline void *vpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, ++ get_order(size)) ?: ++ __vmalloc(size, gfp_mask); ++} ++ 
++static inline void kvpfree(void *p, size_t size) ++{ ++ if (size < PAGE_SIZE) ++ kfree(p); ++ else ++ vpfree(p, size); ++} ++ ++static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return size < PAGE_SIZE ++ ? kmalloc(size, gfp_mask) ++ : vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); ++ ++#define HEAP(type) \ ++struct { \ ++ size_t size, used; \ ++ type *data; \ ++} ++ ++#define DECLARE_HEAP(type, name) HEAP(type) name ++ ++#define init_heap(heap, _size, gfp) \ ++({ \ ++ (heap)->used = 0; \ ++ (heap)->size = (_size); \ ++ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ ++ (gfp)); \ ++}) ++ ++#define free_heap(heap) \ ++do { \ ++ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ ++ (heap)->data = NULL; \ ++} while (0) ++ ++#define heap_set_backpointer(h, i, _fn) \ ++do { \ ++ void (*fn)(typeof(h), size_t) = _fn; \ ++ if (fn) \ ++ fn(h, i); \ ++} while (0) ++ ++#define heap_swap(h, i, j, set_backpointer) \ ++do { \ ++ swap((h)->data[i], (h)->data[j]); \ ++ heap_set_backpointer(h, i, set_backpointer); \ ++ heap_set_backpointer(h, j, set_backpointer); \ ++} while (0) ++ ++#define heap_peek(h) \ ++({ \ ++ EBUG_ON(!(h)->used); \ ++ (h)->data[0]; \ ++}) ++ ++#define heap_full(h) ((h)->used == (h)->size) ++ ++#define heap_sift_down(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _c, _j = i; \ ++ \ ++ for (; _j * 2 + 1 < (h)->used; _j = _c) { \ ++ _c = _j * 2 + 1; \ ++ if (_c + 1 < (h)->used && \ ++ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ ++ _c++; \ ++ \ ++ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ ++ break; \ ++ heap_swap(h, _c, _j, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_sift_up(h, i, cmp, set_backpointer) \ ++do { \ ++ while (i) { \ ++ size_t p = (i - 1) / 2; \ ++ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ ++ break; \ ++ heap_swap(h, i, p, set_backpointer); \ ++ i = p; \ ++ } \ ++} while (0) ++ ++#define __heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ size_t _i = (h)->used++; \ ++ (h)->data[_i] = d; \ ++ heap_set_backpointer(h, _i, set_backpointer); \ ++ \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ _i; \ ++}) ++ ++#define heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = !heap_full(h); \ ++ if (_r) \ ++ __heap_add(h, d, cmp, set_backpointer); \ ++ _r; \ ++}) ++ ++#define heap_add_or_replace(h, new, cmp, set_backpointer) \ ++do { \ ++ if (!heap_add(h, new, cmp, set_backpointer) && \ ++ cmp(h, new, heap_peek(h)) >= 0) { \ ++ (h)->data[0] = new; \ ++ heap_set_backpointer(h, 0, set_backpointer); \ ++ heap_sift_down(h, 0, cmp, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_del(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _i = (i); \ ++ \ ++ BUG_ON(_i >= (h)->used); \ ++ (h)->used--; \ ++ heap_swap(h, _i, (h)->used, set_backpointer); \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ heap_sift_down(h, _i, cmp, set_backpointer); \ ++} while (0) ++ ++#define heap_pop(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = (h)->used; \ ++ if (_r) { \ ++ (d) = (h)->data[0]; \ ++ heap_del(h, 0, cmp, set_backpointer); \ ++ } \ ++ _r; \ ++}) ++ ++#define heap_resort(heap, cmp, set_backpointer) \ ++do { \ ++ ssize_t _i; \ ++ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ ++ heap_sift_down(heap, _i, cmp, set_backpointer); \ ++} while (0) ++ ++#define ANYSINT_MAX(t) \ ++ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) ++ ++struct printbuf { ++ char *pos; ++ char *end; ++}; ++ ++static inline size_t printbuf_remaining(struct 
printbuf *buf) ++{ ++ return buf->end - buf->pos; ++} ++ ++#define _PBUF(_buf, _len) \ ++ ((struct printbuf) { \ ++ .pos = _buf, \ ++ .end = _buf + _len, \ ++ }) ++ ++#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) ++ ++#define pr_buf(_out, ...) \ ++do { \ ++ (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ ++ __VA_ARGS__); \ ++} while (0) ++ ++void bch_scnmemcpy(struct printbuf *, const char *, size_t); ++ ++int bch2_strtoint_h(const char *, int *); ++int bch2_strtouint_h(const char *, unsigned int *); ++int bch2_strtoll_h(const char *, long long *); ++int bch2_strtoull_h(const char *, unsigned long long *); ++int bch2_strtou64_h(const char *, u64 *); ++ ++static inline int bch2_strtol_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtoint_h(cp, (int *) res); ++#else ++ return bch2_strtoll_h(cp, (long long *) res); ++#endif ++} ++ ++static inline int bch2_strtoul_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtouint_h(cp, (unsigned int *) res); ++#else ++ return bch2_strtoull_h(cp, (unsigned long long *) res); ++#endif ++} ++ ++#define strtoi_h(cp, res) \ ++ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ ++ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ ++ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ ++ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\ ++ : -EINVAL) ++ ++#define strtoul_safe(cp, var) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = _v; \ ++ _r; \ ++}) ++ ++#define strtoul_safe_clamp(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = clamp_t(typeof(var), _v, min, max); \ ++ _r; \ ++}) ++ ++#define strtoul_safe_restrict(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r && _v >= min && _v <= max) \ ++ var = _v; \ ++ else \ ++ _r = -EINVAL; \ ++ _r; \ ++}) ++ ++#define snprint(buf, size, var) \ ++ snprintf(buf, size, \ ++ type_is(var, int) ? "%i\n" \ ++ : type_is(var, unsigned) ? "%u\n" \ ++ : type_is(var, long) ? "%li\n" \ ++ : type_is(var, unsigned long) ? "%lu\n" \ ++ : type_is(var, s64) ? "%lli\n" \ ++ : type_is(var, u64) ? "%llu\n" \ ++ : type_is(var, char *) ? 
"%s\n" \ ++ : "%i\n", var) ++ ++void bch2_hprint(struct printbuf *, s64); ++ ++bool bch2_is_zero(const void *, size_t); ++ ++void bch2_string_opt_to_text(struct printbuf *, ++ const char * const [], size_t); ++ ++void bch2_flags_to_text(struct printbuf *, const char * const[], u64); ++u64 bch2_read_flag_list(char *, const char * const[]); ++ ++#define NR_QUANTILES 15 ++#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) ++#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) ++#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) ++ ++struct quantiles { ++ struct quantile_entry { ++ u64 m; ++ u64 step; ++ } entries[NR_QUANTILES]; ++}; ++ ++struct time_stat_buffer { ++ unsigned nr; ++ struct time_stat_buffer_entry { ++ u64 start; ++ u64 end; ++ } entries[32]; ++}; ++ ++struct time_stats { ++ spinlock_t lock; ++ u64 count; ++ /* all fields are in nanoseconds */ ++ u64 average_duration; ++ u64 average_frequency; ++ u64 max_duration; ++ u64 last_event; ++ struct quantiles quantiles; ++ ++ struct time_stat_buffer __percpu *buffer; ++}; ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64, u64); ++ ++static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) ++{ ++ __bch2_time_stats_update(stats, start, local_clock()); ++} ++ ++void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); ++ ++void bch2_time_stats_exit(struct time_stats *); ++void bch2_time_stats_init(struct time_stats *); ++ ++#define ewma_add(ewma, val, weight) \ ++({ \ ++ typeof(ewma) _ewma = (ewma); \ ++ typeof(weight) _weight = (weight); \ ++ \ ++ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ ++}) ++ ++struct bch_ratelimit { ++ /* Next time we want to do some work, in nanoseconds */ ++ u64 next; ++ ++ /* ++ * Rate at which we want to do work, in units per nanosecond ++ * The units here correspond to the units passed to ++ * bch2_ratelimit_increment() ++ */ ++ unsigned rate; ++}; ++ ++static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) ++{ ++ d->next = local_clock(); ++} ++ ++u64 bch2_ratelimit_delay(struct bch_ratelimit *); ++void bch2_ratelimit_increment(struct bch_ratelimit *, u64); ++ ++struct bch_pd_controller { ++ struct bch_ratelimit rate; ++ unsigned long last_update; ++ ++ s64 last_actual; ++ s64 smoothed_derivative; ++ ++ unsigned p_term_inverse; ++ unsigned d_smooth; ++ unsigned d_term; ++ ++ /* for exporting to sysfs (no effect on behavior) */ ++ s64 last_derivative; ++ s64 last_proportional; ++ s64 last_change; ++ s64 last_target; ++ ++ /* If true, the rate will not increase if bch2_ratelimit_delay() ++ * is not being called often enough. 
*/ ++ bool backpressure; ++}; ++ ++void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); ++void bch2_pd_controller_init(struct bch_pd_controller *); ++size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); ++ ++#define sysfs_pd_controller_attribute(name) \ ++ rw_attribute(name##_rate); \ ++ rw_attribute(name##_rate_bytes); \ ++ rw_attribute(name##_rate_d_term); \ ++ rw_attribute(name##_rate_p_term_inverse); \ ++ read_attribute(name##_rate_debug) ++ ++#define sysfs_pd_controller_files(name) \ ++ &sysfs_##name##_rate, \ ++ &sysfs_##name##_rate_bytes, \ ++ &sysfs_##name##_rate_d_term, \ ++ &sysfs_##name##_rate_p_term_inverse, \ ++ &sysfs_##name##_rate_debug ++ ++#define sysfs_pd_controller_show(name, var) \ ++do { \ ++ sysfs_hprint(name##_rate, (var)->rate.rate); \ ++ sysfs_print(name##_rate_bytes, (var)->rate.rate); \ ++ sysfs_print(name##_rate_d_term, (var)->d_term); \ ++ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ ++ \ ++ if (attr == &sysfs_##name##_rate_debug) \ ++ return bch2_pd_controller_print_debug(var, buf); \ ++} while (0) ++ ++#define sysfs_pd_controller_store(name, var) \ ++do { \ ++ sysfs_strtoul_clamp(name##_rate, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul_clamp(name##_rate_bytes, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ ++ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ ++ (var)->p_term_inverse, 1, INT_MAX); \ ++} while (0) ++ ++#define container_of_or_null(ptr, type, member) \ ++({ \ ++ typeof(ptr) _ptr = ptr; \ ++ _ptr ? container_of(_ptr, type, member) : NULL; \ ++}) ++ ++/* Does linear interpolation between powers of two */ ++static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) ++{ ++ unsigned fract = x & ~(~0 << fract_bits); ++ ++ x >>= fract_bits; ++ x = 1 << x; ++ x += (x * fract) >> fract_bits; ++ ++ return x; ++} ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t); ++int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); ++ ++static inline sector_t bdev_sectors(struct block_device *bdev) ++{ ++ return bdev->bd_inode->i_size >> 9; ++} ++ ++#define closure_bio_submit(bio, cl) \ ++do { \ ++ closure_get(cl); \ ++ submit_bio(bio); \ ++} while (0) ++ ++#define kthread_wait_freezable(cond) \ ++({ \ ++ int _ret = 0; \ ++ while (1) { \ ++ set_current_state(TASK_INTERRUPTIBLE); \ ++ if (kthread_should_stop()) { \ ++ _ret = -1; \ ++ break; \ ++ } \ ++ \ ++ if (cond) \ ++ break; \ ++ \ ++ schedule(); \ ++ try_to_freeze(); \ ++ } \ ++ set_current_state(TASK_RUNNING); \ ++ _ret; \ ++}) ++ ++size_t bch2_rand_range(size_t); ++ ++void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); ++void memcpy_from_bio(void *, struct bio *, struct bvec_iter); ++ ++static inline void memcpy_u64s_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++} ++ ++static inline void __memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("rep ; movsq" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++#endif ++} ++ ++static inline void memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(!(dst >= src + u64s * sizeof(u64) || ++ dst + u64s * sizeof(u64) <= src)); ++ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_down(void 
*dst, const void *src, ++ unsigned u64s) ++{ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void memmove_u64s_down(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst > src); ++ ++ __memmove_u64s_down(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up_small(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s; ++ u64 *src = (u64 *) _src + u64s; ++ ++ while (u64s--) ++ *--dst = *--src; ++} ++ ++static inline void memmove_u64s_up_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up_small(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s - 1; ++ u64 *src = (u64 *) _src + u64s - 1; ++ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("std ;\n" ++ "rep ; movsq\n" ++ "cld ;\n" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ while (u64s--) ++ *dst-- = *src--; ++#endif ++} ++ ++static inline void memmove_u64s_up(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++static inline void memmove_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ if (dst < src) ++ __memmove_u64s_down(dst, src, u64s); ++ else ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */ ++static inline void memset_u64s_tail(void *s, int c, unsigned bytes) ++{ ++ unsigned rem = round_up(bytes, sizeof(u64)) - bytes; ++ ++ memset(s + bytes, c, rem); ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++/* just the memmove, doesn't update @_nr */ ++#define __array_insert_item(_array, _nr, _pos) \ ++ memmove(&(_array)[(_pos) + 1], \ ++ &(_array)[(_pos)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))) ++ ++#define array_insert_item(_array, _nr, _pos, _new_item) \ ++do { \ ++ __array_insert_item(_array, _nr, _pos); \ ++ (_nr)++; \ ++ (_array)[(_pos)] = (_new_item); \ ++} while (0) ++ ++#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ ++do { \ ++ (_nr) -= (_nr_to_remove); \ ++ memmove(&(_array)[(_pos)], \ ++ &(_array)[(_pos) + (_nr_to_remove)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))); \ ++} while (0) ++ ++#define array_remove_item(_array, _nr, _pos) \ ++ array_remove_items(_array, _nr, _pos, 1) ++ ++#define bubble_sort(_base, _nr, _cmp) \ ++do { \ ++ ssize_t _i, _end; \ ++ bool _swapped = true; \ ++ \ ++ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ ++ _swapped = false; \ ++ for (_i = 0; _i < _end; _i++) \ ++ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ ++ swap((_base)[_i], (_base)[_i + 1]); \ ++ _swapped = true; \ ++ } \ ++ } \ ++} while (0) ++ ++static inline u64 percpu_u64_get(u64 __percpu *src) ++{ ++ u64 ret = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ ret += *per_cpu_ptr(src, cpu); ++ return ret; ++} ++ ++static inline void percpu_u64_set(u64 __percpu *dst, u64 src) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ *per_cpu_ptr(dst, cpu) = 0; ++ ++ preempt_disable(); ++ *this_cpu_ptr(dst) = src; ++ preempt_enable(); ++} ++ ++static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) ++{ ++ unsigned i; ++ ++ for (i = 0; i < nr; i++) ++ acc[i] += src[i]; ++} ++ ++static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, 
++ unsigned nr) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ acc_u64s(acc, per_cpu_ptr(src, cpu), nr); ++} ++ ++static inline void percpu_memset(void __percpu *p, int c, size_t bytes) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(p, cpu), c, bytes); ++} ++ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); ++ ++#define cmp_int(l, r) ((l > r) - (l < r)) ++ ++#endif /* _BCACHEFS_UTIL_H */ +diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h +new file mode 100644 +index 000000000000..c099cdc0605f +--- /dev/null ++++ b/fs/bcachefs/vstructs.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _VSTRUCTS_H ++#define _VSTRUCTS_H ++ ++#include "util.h" ++ ++/* ++ * NOTE: we can't differentiate between __le64 and u64 with type_is - this ++ * assumes u64 is little endian: ++ */ ++#define __vstruct_u64s(_s) \ ++({ \ ++ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ ++ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ ++ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \ ++ : ((__force u8) ((_s)->u64s))); \ ++}) ++ ++#define __vstruct_bytes(_type, _u64s) \ ++({ \ ++ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ ++ \ ++ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ ++}) ++ ++#define vstruct_bytes(_s) \ ++ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) ++ ++#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ ++ (round_up(__vstruct_bytes(_type, _u64s), \ ++ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) ++ ++#define vstruct_blocks(_s, _sector_block_bits) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) ++ ++#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ ++ __vstruct_u64s(_s) + (_u64s)) ++ ++#define vstruct_sectors(_s, _sector_block_bits) \ ++ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) ++ ++#define vstruct_next(_s) \ ++ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_last(_s) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_end(_s) \ ++ ((void *) ((_s)->_data + __vstruct_u64s(_s))) ++ ++#define vstruct_for_each(_s, _i) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s); \ ++ _i = vstruct_next(_i)) ++ ++#define vstruct_for_each_safe(_s, _i, _t) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ ++ _i = _t) ++ ++#define vstruct_idx(_s, _idx) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) ++ ++#endif /* _VSTRUCTS_H */ +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +new file mode 100644 +index 000000000000..21f64cb7e402 +--- /dev/null ++++ b/fs/bcachefs/xattr.c +@@ -0,0 +1,586 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "fs.h" ++#include "rebalance.h" ++#include "str_hash.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); ++ ++static u64 bch2_xattr_hash(const struct bch_hash_info *info, ++ const struct xattr_search_key *key) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); ++ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); ++ ++ return bch2_str_hash_end(&ctx, info); ++} ++ ++static u64 
xattr_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_xattr_hash(info, key); ++} ++ ++static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); ++ ++ return bch2_xattr_hash(info, ++ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); ++} ++ ++static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ const struct xattr_search_key *r = _r; ++ ++ return l.v->x_type != r->type || ++ l.v->x_name_len != r->name.len || ++ memcmp(l.v->x_name, r->name.name, r->name.len); ++} ++ ++static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); ++ ++ return l.v->x_type != r.v->x_type || ++ l.v->x_name_len != r.v->x_name_len || ++ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); ++} ++ ++const struct bch_hash_desc bch2_xattr_hash_desc = { ++ .btree_id = BTREE_ID_XATTRS, ++ .key_type = KEY_TYPE_xattr, ++ .hash_key = xattr_hash_key, ++ .hash_bkey = xattr_hash_bkey, ++ .cmp_key = xattr_cmp_key, ++ .cmp_bkey = xattr_cmp_bkey, ++}; ++ ++const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) ++ return "value too small"; ++ ++ if (bkey_val_u64s(k.k) < ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len))) ++ return "value too small"; ++ ++ if (bkey_val_u64s(k.k) > ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len) + 4)) ++ return "value too big"; ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (!handler) ++ return "invalid type"; ++ ++ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) ++ return "xattr name has invalid characters"; ++ ++ return NULL; ++} ++ ++void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (handler && handler->prefix) ++ pr_buf(out, "%s", handler->prefix); ++ else if (handler) ++ pr_buf(out, "(type %u)", xattr.v->x_type); ++ else ++ pr_buf(out, "(unknown type %u)", xattr.v->x_type); ++ ++ bch_scnmemcpy(out, xattr.v->x_name, ++ xattr.v->x_name_len); ++ pr_buf(out, ":"); ++ bch_scnmemcpy(out, xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++} ++ ++int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, ++ const char *name, void *buffer, size_t size, int type) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(type, name, strlen(name)), ++ 0); ++ if (IS_ERR(iter)) { ++ bch2_trans_exit(&trans); ++ BUG_ON(PTR_ERR(iter) == -EINTR); ++ ++ return PTR_ERR(iter) == -ENOENT ? 
-ENODATA : PTR_ERR(iter); ++ } ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ret = le16_to_cpu(xattr.v->x_val_len); ++ if (buffer) { ++ if (ret > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, xattr_val(xattr.v), ret); ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_xattr_set(struct btree_trans *trans, u64 inum, ++ const struct bch_hash_info *hash_info, ++ const char *name, const void *value, size_t size, ++ int type, int flags) ++{ ++ int ret; ++ ++ if (value) { ++ struct bkey_i_xattr *xattr; ++ unsigned namelen = strlen(name); ++ unsigned u64s = BKEY_U64s + ++ xattr_val_u64s(namelen, size); ++ ++ if (u64s > U8_MAX) ++ return -ERANGE; ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = type; ++ xattr->v.x_name_len = namelen; ++ xattr->v.x_val_len = cpu_to_le16(size); ++ memcpy(xattr->v.x_name, name, namelen); ++ memcpy(xattr_val(&xattr->v), value, size); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, ++ inum, &xattr->k_i, ++ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| ++ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(type, name, strlen(name)); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, ++ hash_info, inum, &search); ++ } ++ ++ if (ret == -ENOENT) ++ ret = flags & XATTR_REPLACE ? -ENODATA : 0; ++ ++ return ret; ++} ++ ++struct xattr_buf { ++ char *buf; ++ size_t len; ++ size_t used; ++}; ++ ++static int __bch2_xattr_emit(const char *prefix, ++ const char *name, size_t name_len, ++ struct xattr_buf *buf) ++{ ++ const size_t prefix_len = strlen(prefix); ++ const size_t total_len = prefix_len + name_len + 1; ++ ++ if (buf->buf) { ++ if (buf->used + total_len > buf->len) ++ return -ERANGE; ++ ++ memcpy(buf->buf + buf->used, prefix, prefix_len); ++ memcpy(buf->buf + buf->used + prefix_len, ++ name, name_len); ++ buf->buf[buf->used + prefix_len + name_len] = '\0'; ++ } ++ ++ buf->used += total_len; ++ return 0; ++} ++ ++static int bch2_xattr_emit(struct dentry *dentry, ++ const struct bch_xattr *xattr, ++ struct xattr_buf *buf) ++{ ++ const struct xattr_handler *handler = ++ bch2_xattr_type_to_handler(xattr->x_type); ++ ++ return handler && (!handler->list || handler->list(dentry)) ++ ? __bch2_xattr_emit(handler->prefix ?: handler->name, ++ xattr->x_name, xattr->x_name_len, buf) ++ : 0; ++} ++ ++static int bch2_xattr_list_bcachefs(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct xattr_buf *buf, ++ bool all) ++{ ++ const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; ++ unsigned id; ++ int ret = 0; ++ u64 v; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ v = bch2_inode_opt_get(&inode->ei_inode, id); ++ if (!v) ++ continue; ++ ++ if (!all && ++ !(inode->ei_inode.bi_fields_set & (1 << id))) ++ continue; ++ ++ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], ++ strlen(bch2_inode_opts[id]), buf); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) ++{ ++ struct bch_fs *c = dentry->d_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; ++ u64 inum = dentry->d_inode->i_ino; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, ++ POS(inum, 0), 0, k, ret) { ++ BUG_ON(k.k->p.inode < inum); ++ ++ if (k.k->p.inode > inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_xattr) ++ continue; ++ ++ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); ++ if (ret) ++ break; ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); ++ if (ret) ++ return ret; ++ ++ return buf.used; ++} ++ ++static int bch2_xattr_get_handler(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); ++} ++ ++static int bch2_xattr_set_handler(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, ++ bch2_xattr_set(&trans, inode->v.i_ino, ++ &inode->ei_str_hash, ++ name, value, size, ++ handler->flags, flags)); ++} ++ ++static const struct xattr_handler bch_xattr_user_handler = { ++ .prefix = XATTR_USER_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_USER, ++}; ++ ++static bool bch2_xattr_trusted_list(struct dentry *dentry) ++{ ++ return capable(CAP_SYS_ADMIN); ++} ++ ++static const struct xattr_handler bch_xattr_trusted_handler = { ++ .prefix = XATTR_TRUSTED_PREFIX, ++ .list = bch2_xattr_trusted_list, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, ++}; ++ ++static const struct xattr_handler bch_xattr_security_handler = { ++ .prefix = XATTR_SECURITY_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_SECURITY, ++}; ++ ++#ifndef NO_BCACHEFS_FS ++ ++static int opt_to_inode_opt(int id) ++{ ++ switch (id) { ++#define x(name, ...) 
\ ++ case Opt_##name: return Inode_opt_##name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ return -1; ++ } ++} ++ ++static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size, ++ bool all) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_opts opts = ++ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); ++ const struct bch_option *opt; ++ int id, inode_opt_id; ++ char buf[512]; ++ struct printbuf out = PBUF(buf); ++ unsigned val_len; ++ u64 v; ++ ++ id = bch2_opt_lookup(name); ++ if (id < 0 || !bch2_opt_is_inode_opt(id)) ++ return -EINVAL; ++ ++ inode_opt_id = opt_to_inode_opt(id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + id; ++ ++ if (!bch2_opt_defined_by_id(&opts, id)) ++ return -ENODATA; ++ ++ if (!all && ++ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) ++ return -ENODATA; ++ ++ v = bch2_opt_get_by_id(&opts, id); ++ bch2_opt_to_text(&out, c, opt, v, 0); ++ ++ val_len = out.pos - buf; ++ ++ if (buffer && val_len > size) ++ return -ERANGE; ++ ++ if (buffer) ++ memcpy(buffer, buf, val_len); ++ return val_len; ++} ++ ++static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, false); ++} ++ ++struct inode_opt_set { ++ int id; ++ u64 v; ++ bool defined; ++}; ++ ++static int inode_opt_set_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_opt_set *s = p; ++ ++ if (s->defined) ++ bi->bi_fields_set |= 1U << s->id; ++ else ++ bi->bi_fields_set &= ~(1U << s->id); ++ ++ bch2_inode_opt_set(bi, s->id, s->v); ++ ++ return 0; ++} ++ ++static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ const struct bch_option *opt; ++ char *buf; ++ struct inode_opt_set s; ++ int opt_id, inode_opt_id, ret; ++ ++ opt_id = bch2_opt_lookup(name); ++ if (opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + opt_id; ++ ++ inode_opt_id = opt_to_inode_opt(opt_id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ s.id = inode_opt_id; ++ ++ if (value) { ++ u64 v = 0; ++ ++ buf = kmalloc(size + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ memcpy(buf, value, size); ++ buf[size] = '\0'; ++ ++ ret = bch2_opt_parse(c, opt, buf, &v); ++ kfree(buf); ++ ++ if (ret < 0) ++ return ret; ++ ++ ret = bch2_opt_check_may_set(c, opt_id, v); ++ if (ret < 0) ++ return ret; ++ ++ s.v = v + 1; ++ s.defined = true; ++ } else { ++ if (!IS_ROOT(dentry)) { ++ struct bch_inode_info *dir = ++ to_bch_ei(d_inode(dentry->d_parent)); ++ ++ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); ++ } else { ++ s.v = 0; ++ } ++ ++ s.defined = false; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ if (inode_opt_id == Inode_opt_project) { ++ /* ++ * inode fields accessible via the xattr interface are stored ++ * with a +1 bias, so that 0 means unset: ++ */ ++ ret = bch2_set_projid(c, inode, s.v ? 
s.v - 1 : 0); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (value && ++ (opt_id == Opt_background_compression || ++ opt_id == Opt_background_target)) ++ bch2_rebalance_add_work(c, inode->v.i_blocks); ++ ++ return ret; ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_handler = { ++ .prefix = "bcachefs.", ++ .get = bch2_xattr_bcachefs_get, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++static int bch2_xattr_bcachefs_get_effective( ++ const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, true); ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { ++ .prefix = "bcachefs_effective.", ++ .get = bch2_xattr_bcachefs_get_effective, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++const struct xattr_handler *bch2_xattr_handlers[] = { ++ &bch_xattr_user_handler, ++ &posix_acl_access_xattr_handler, ++ &posix_acl_default_xattr_handler, ++ &bch_xattr_trusted_handler, ++ &bch_xattr_security_handler, ++#ifndef NO_BCACHEFS_FS ++ &bch_xattr_bcachefs_handler, ++ &bch_xattr_bcachefs_effective_handler, ++#endif ++ NULL ++}; ++ ++static const struct xattr_handler *bch_xattr_handler_map[] = { ++ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = ++ &posix_acl_access_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = ++ &posix_acl_default_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, ++ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, ++}; ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) ++{ ++ return type < ARRAY_SIZE(bch_xattr_handler_map) ++ ? 
bch_xattr_handler_map[type] ++ : NULL; ++} +diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h +new file mode 100644 +index 000000000000..4151065ab853 +--- /dev/null ++++ b/fs/bcachefs/xattr.h +@@ -0,0 +1,49 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_XATTR_H ++#define _BCACHEFS_XATTR_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_xattr_hash_desc; ++ ++const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_xattr (struct bkey_ops) { \ ++ .key_invalid = bch2_xattr_invalid, \ ++ .val_to_text = bch2_xattr_to_text, \ ++} ++ ++static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + ++ name_len + val_len, sizeof(u64)); ++} ++ ++#define xattr_val(_xattr) \ ++ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) ++ ++struct xattr_search_key { ++ u8 type; ++ struct qstr name; ++}; ++ ++#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ ++ { .type = _type, .name = QSTR_INIT(_name, _len) }) ++ ++struct dentry; ++struct xattr_handler; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, ++ const char *, void *, size_t, int); ++ ++int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, ++ const char *, const void *, size_t, int, int); ++ ++ssize_t bch2_xattr_list(struct dentry *, char *, size_t); ++ ++extern const struct xattr_handler *bch2_xattr_handlers[]; ++ ++#endif /* _BCACHEFS_XATTR_H */ +diff --git a/fs/cifs/file.c b/fs/cifs/file.c +index be46fab4c96d..a17a21181e18 100644 +--- a/fs/cifs/file.c ++++ b/fs/cifs/file.c +@@ -4296,20 +4296,12 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + + page = lru_to_page(page_list); + +- /* +- * Lock the page and put it in the cache. Since no one else +- * should have access to this page, we're safe to simply set +- * PG_locked without checking it first. 
+- */ +- __SetPageLocked(page); +- rc = add_to_page_cache_locked(page, mapping, +- page->index, gfp); ++ rc = add_to_page_cache(page, mapping, ++ page->index, gfp); + + /* give up if we can't stick it in the cache */ +- if (rc) { +- __ClearPageLocked(page); ++ if (rc) + return rc; +- } + + /* move first page to the tmplist */ + *offset = (loff_t)page->index << PAGE_SHIFT; +@@ -4328,12 +4320,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + if (*bytes + PAGE_SIZE > rsize) + break; + +- __SetPageLocked(page); +- rc = add_to_page_cache_locked(page, mapping, page->index, gfp); +- if (rc) { +- __ClearPageLocked(page); ++ rc = add_to_page_cache(page, mapping, page->index, gfp); ++ if (rc) + break; +- } + list_move_tail(&page->lru, tmplist); + (*bytes) += PAGE_SIZE; + expected_index++; +diff --git a/fs/dcache.c b/fs/dcache.c +index 361ea7ab30ea..6fbf68e60326 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -3132,9 +3132,8 @@ void d_genocide(struct dentry *parent) + + EXPORT_SYMBOL(d_genocide); + +-void d_tmpfile(struct dentry *dentry, struct inode *inode) ++void d_mark_tmpfile(struct dentry *dentry, struct inode *inode) + { +- inode_dec_link_count(inode); + BUG_ON(dentry->d_name.name != dentry->d_iname || + !hlist_unhashed(&dentry->d_u.d_alias) || + !d_unlinked(dentry)); +@@ -3144,6 +3143,13 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) + (unsigned long long)inode->i_ino); + spin_unlock(&dentry->d_lock); + spin_unlock(&dentry->d_parent->d_lock); ++} ++EXPORT_SYMBOL(d_mark_tmpfile); ++ ++void d_tmpfile(struct dentry *dentry, struct inode *inode) ++{ ++ inode_dec_link_count(inode); ++ d_mark_tmpfile(dentry, inode); + d_instantiate(dentry, inode); + } + EXPORT_SYMBOL(d_tmpfile); +diff --git a/fs/inode.c b/fs/inode.c +index 72c4c347afb7..e70ad3d2d01c 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -1578,6 +1578,46 @@ int insert_inode_locked(struct inode *inode) + } + EXPORT_SYMBOL(insert_inode_locked); + ++struct inode *insert_inode_locked2(struct inode *inode) ++{ ++ struct super_block *sb = inode->i_sb; ++ ino_t ino = inode->i_ino; ++ struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ ++ while (1) { ++ struct inode *old = NULL; ++ spin_lock(&inode_hash_lock); ++ hlist_for_each_entry(old, head, i_hash) { ++ if (old->i_ino != ino) ++ continue; ++ if (old->i_sb != sb) ++ continue; ++ spin_lock(&old->i_lock); ++ if (old->i_state & (I_FREEING|I_WILL_FREE)) { ++ spin_unlock(&old->i_lock); ++ continue; ++ } ++ break; ++ } ++ if (likely(!old)) { ++ spin_lock(&inode->i_lock); ++ inode->i_state |= I_NEW | I_CREATING; ++ hlist_add_head(&inode->i_hash, head); ++ spin_unlock(&inode->i_lock); ++ spin_unlock(&inode_hash_lock); ++ return NULL; ++ } ++ __iget(old); ++ spin_unlock(&old->i_lock); ++ spin_unlock(&inode_hash_lock); ++ wait_on_inode(old); ++ if (unlikely(!inode_unhashed(old))) ++ return old; ++ iput(old); ++ } ++} ++EXPORT_SYMBOL(insert_inode_locked2); ++ + int insert_inode_locked4(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 57241417ff2f..e080ccb4fdf1 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -908,6 +908,7 @@ extern const char *blk_op_str(unsigned int op); + + int blk_status_to_errno(blk_status_t status); + blk_status_t errno_to_blk_status(int errno); ++const char *blk_status_to_str(blk_status_t status); + + int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); + +diff --git 
a/include/linux/closure.h b/include/linux/closure.h +new file mode 100644 +index 000000000000..36b4a83f9b77 +--- /dev/null ++++ b/include/linux/closure.h +@@ -0,0 +1,399 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _LINUX_CLOSURE_H ++#define _LINUX_CLOSURE_H ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Closure is perhaps the most overused and abused term in computer science, but ++ * since I've been unable to come up with anything better you're stuck with it ++ * again. ++ * ++ * What are closures? ++ * ++ * They embed a refcount. The basic idea is they count "things that are in ++ * progress" - in flight bios, some other thread that's doing something else - ++ * anything you might want to wait on. ++ * ++ * The refcount may be manipulated with closure_get() and closure_put(). ++ * closure_put() is where many of the interesting things happen, when it causes ++ * the refcount to go to 0. ++ * ++ * Closures can be used to wait on things both synchronously and asynchronously, ++ * and synchronous and asynchronous use can be mixed without restriction. To ++ * wait synchronously, use closure_sync() - you will sleep until your closure's ++ * refcount hits 1. ++ * ++ * To wait asynchronously, use ++ * continue_at(cl, next_function, workqueue); ++ * ++ * passing it, as you might expect, the function to run when nothing is pending ++ * and the workqueue to run that function out of. ++ * ++ * continue_at() also, critically, requires a 'return' immediately following the ++ * location where this macro is referenced, to return to the calling function. ++ * There's good reason for this. ++ * ++ * To use safely closures asynchronously, they must always have a refcount while ++ * they are running owned by the thread that is running them. Otherwise, suppose ++ * you submit some bios and wish to have a function run when they all complete: ++ * ++ * foo_endio(struct bio *bio) ++ * { ++ * closure_put(cl); ++ * } ++ * ++ * closure_init(cl); ++ * ++ * do_stuff(); ++ * closure_get(cl); ++ * bio1->bi_endio = foo_endio; ++ * bio_submit(bio1); ++ * ++ * do_more_stuff(); ++ * closure_get(cl); ++ * bio2->bi_endio = foo_endio; ++ * bio_submit(bio2); ++ * ++ * continue_at(cl, complete_some_read, system_wq); ++ * ++ * If closure's refcount started at 0, complete_some_read() could run before the ++ * second bio was submitted - which is almost always not what you want! More ++ * importantly, it wouldn't be possible to say whether the original thread or ++ * complete_some_read()'s thread owned the closure - and whatever state it was ++ * associated with! ++ * ++ * So, closure_init() initializes a closure's refcount to 1 - and when a ++ * closure_fn is run, the refcount will be reset to 1 first. ++ * ++ * Then, the rule is - if you got the refcount with closure_get(), release it ++ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount ++ * on a closure because you called closure_init() or you were run out of a ++ * closure - _always_ use continue_at(). Doing so consistently will help ++ * eliminate an entire class of particularly pernicious races. ++ * ++ * Lastly, you might have a wait list dedicated to a specific event, and have no ++ * need for specifying the condition - you just want to wait until someone runs ++ * closure_wake_up() on the appropriate wait list. In that case, just use ++ * closure_wait(). It will return either true or false, depending on whether the ++ * closure was already on a wait list or not - a closure can only be on one wait ++ * list at a time. 
++ * ++ * Parents: ++ * ++ * closure_init() takes two arguments - it takes the closure to initialize, and ++ * a (possibly null) parent. ++ * ++ * If parent is non null, the new closure will have a refcount for its lifetime; ++ * a closure is considered to be "finished" when its refcount hits 0 and the ++ * function to run is null. Hence ++ * ++ * continue_at(cl, NULL, NULL); ++ * ++ * returns up the (spaghetti) stack of closures, precisely like normal return ++ * returns up the C stack. continue_at() with non null fn is better thought of ++ * as doing a tail call. ++ * ++ * All this implies that a closure should typically be embedded in a particular ++ * struct (which its refcount will normally control the lifetime of), and that ++ * struct can very much be thought of as a stack frame. ++ */ ++ ++struct closure; ++struct closure_syncer; ++typedef void (closure_fn) (struct closure *); ++extern struct dentry *bcache_debug; ++ ++struct closure_waitlist { ++ struct llist_head list; ++}; ++ ++enum closure_state { ++ /* ++ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by ++ * the thread that owns the closure, and cleared by the thread that's ++ * waking up the closure. ++ * ++ * The rest are for debugging and don't affect behaviour: ++ * ++ * CLOSURE_RUNNING: Set when a closure is running (i.e. by ++ * closure_init() and when closure_put() runs then next function), and ++ * must be cleared before remaining hits 0. Primarily to help guard ++ * against incorrect usage and accidentally transferring references. ++ * continue_at() and closure_return() clear it for you, if you're doing ++ * something unusual you can use closure_set_dead() which also helps ++ * annotate where references are being transferred. ++ */ ++ ++ CLOSURE_BITS_START = (1U << 26), ++ CLOSURE_DESTRUCTOR = (1U << 26), ++ CLOSURE_WAITING = (1U << 28), ++ CLOSURE_RUNNING = (1U << 30), ++}; ++ ++#define CLOSURE_GUARD_MASK \ ++ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) ++ ++#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) ++#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) ++ ++struct closure { ++ union { ++ struct { ++ struct workqueue_struct *wq; ++ struct closure_syncer *s; ++ struct llist_node list; ++ closure_fn *fn; ++ }; ++ struct work_struct work; ++ }; ++ ++ struct closure *parent; ++ ++ atomic_t remaining; ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++#define CLOSURE_MAGIC_DEAD 0xc054dead ++#define CLOSURE_MAGIC_ALIVE 0xc054a11e ++ ++ unsigned int magic; ++ struct list_head all; ++ unsigned long ip; ++ unsigned long waiting_on; ++#endif ++}; ++ ++void closure_sub(struct closure *cl, int v); ++void closure_put(struct closure *cl); ++void __closure_wake_up(struct closure_waitlist *list); ++bool closure_wait(struct closure_waitlist *list, struct closure *cl); ++void __closure_sync(struct closure *cl); ++ ++/** ++ * closure_sync - sleep until a closure a closure has nothing left to wait on ++ * ++ * Sleeps until the refcount hits 1 - the thread that's running the closure owns ++ * the last refcount. 
++ */ ++static inline void closure_sync(struct closure *cl) ++{ ++ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) ++ __closure_sync(cl); ++} ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++ ++void closure_debug_create(struct closure *cl); ++void closure_debug_destroy(struct closure *cl); ++ ++#else ++ ++static inline void closure_debug_create(struct closure *cl) {} ++static inline void closure_debug_destroy(struct closure *cl) {} ++ ++#endif ++ ++static inline void closure_set_ip(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->ip = _THIS_IP_; ++#endif ++} ++ ++static inline void closure_set_ret_ip(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->ip = _RET_IP_; ++#endif ++} ++ ++static inline void closure_set_waiting(struct closure *cl, unsigned long f) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->waiting_on = f; ++#endif ++} ++ ++static inline void closure_set_stopped(struct closure *cl) ++{ ++ atomic_sub(CLOSURE_RUNNING, &cl->remaining); ++} ++ ++static inline void set_closure_fn(struct closure *cl, closure_fn *fn, ++ struct workqueue_struct *wq) ++{ ++ closure_set_ip(cl); ++ cl->fn = fn; ++ cl->wq = wq; ++ /* between atomic_dec() in closure_put() */ ++ smp_mb__before_atomic(); ++} ++ ++static inline void closure_queue(struct closure *cl) ++{ ++ struct workqueue_struct *wq = cl->wq; ++ /** ++ * Changes made to closure, work_struct, or a couple of other structs ++ * may cause work.func not pointing to the right location. ++ */ ++ BUILD_BUG_ON(offsetof(struct closure, fn) ++ != offsetof(struct work_struct, func)); ++ ++ if (wq) { ++ INIT_WORK(&cl->work, cl->work.func); ++ BUG_ON(!queue_work(wq, &cl->work)); ++ } else ++ cl->fn(cl); ++} ++ ++/** ++ * closure_get - increment a closure's refcount ++ */ ++static inline void closure_get(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ BUG_ON((atomic_inc_return(&cl->remaining) & ++ CLOSURE_REMAINING_MASK) <= 1); ++#else ++ atomic_inc(&cl->remaining); ++#endif ++} ++ ++/** ++ * closure_init - Initialize a closure, setting the refcount to 1 ++ * @cl: closure to initialize ++ * @parent: parent of the new closure. cl will take a refcount on it for its ++ * lifetime; may be NULL. ++ */ ++static inline void closure_init(struct closure *cl, struct closure *parent) ++{ ++ cl->fn = NULL; ++ cl->parent = parent; ++ if (parent) ++ closure_get(parent); ++ ++ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); ++ ++ closure_debug_create(cl); ++ closure_set_ip(cl); ++} ++ ++static inline void closure_init_stack(struct closure *cl) ++{ ++ memset(cl, 0, sizeof(struct closure)); ++ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); ++} ++ ++/** ++ * closure_wake_up - wake up all closures on a wait list, ++ * with memory barrier ++ */ ++static inline void closure_wake_up(struct closure_waitlist *list) ++{ ++ /* Memory barrier for the wait list */ ++ smp_mb(); ++ __closure_wake_up(list); ++} ++ ++/** ++ * continue_at - jump to another function with barrier ++ * ++ * After @cl is no longer waiting on anything (i.e. all outstanding refs have ++ * been dropped with closure_put()), it will resume execution at @fn running out ++ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). ++ * ++ * This is because after calling continue_at() you no longer have a ref on @cl, ++ * and whatever @cl owns may be freed out from under you - a running closure fn ++ * has a ref on its own closure which continue_at() drops. ++ * ++ * Note you are expected to immediately return after using this macro. 
++ */ ++#define continue_at(_cl, _fn, _wq) \ ++do { \ ++ set_closure_fn(_cl, _fn, _wq); \ ++ closure_sub(_cl, CLOSURE_RUNNING + 1); \ ++} while (0) ++ ++/** ++ * closure_return - finish execution of a closure ++ * ++ * This is used to indicate that @cl is finished: when all outstanding refs on ++ * @cl have been dropped @cl's ref on its parent closure (as passed to ++ * closure_init()) will be dropped, if one was specified - thus this can be ++ * thought of as returning to the parent closure. ++ */ ++#define closure_return(_cl) continue_at((_cl), NULL, NULL) ++ ++/** ++ * continue_at_nobarrier - jump to another function without barrier ++ * ++ * Causes @fn to be executed out of @cl, in @wq context (or called directly if ++ * @wq is NULL). ++ * ++ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, ++ * thus it's not safe to touch anything protected by @cl after a ++ * continue_at_nobarrier(). ++ */ ++#define continue_at_nobarrier(_cl, _fn, _wq) \ ++do { \ ++ set_closure_fn(_cl, _fn, _wq); \ ++ closure_queue(_cl); \ ++} while (0) ++ ++/** ++ * closure_return_with_destructor - finish execution of a closure, ++ * with destructor ++ * ++ * Works like closure_return(), except @destructor will be called when all ++ * outstanding refs on @cl have been dropped; @destructor may be used to safely ++ * free the memory occupied by @cl, and it is called with the ref on the parent ++ * closure still held - so @destructor could safely return an item to a ++ * freelist protected by @cl's parent. ++ */ ++#define closure_return_with_destructor(_cl, _destructor) \ ++do { \ ++ set_closure_fn(_cl, _destructor, NULL); \ ++ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ ++} while (0) ++ ++/** ++ * closure_call - execute @fn out of a new, uninitialized closure ++ * ++ * Typically used when running out of one closure, and we want to run @fn ++ * asynchronously out of a new closure - @parent will then wait for @cl to ++ * finish. 
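++ *
++ * Hedged sketch (child, child_fn and parent_done are hypothetical; only the
++ * closure_call()/continue_at() calls are real API), run from a closure fn
++ * whose closure is @parent:
++ *
++ *	closure_call(&child->cl, child_fn, system_wq, parent);
++ *	continue_at(parent, parent_done, system_wq);
++ *
++ * parent_done() will not run until child_fn()'s closure has dropped its ref
++ * on @parent, e.g. via closure_return().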
++ */ ++static inline void closure_call(struct closure *cl, closure_fn fn, ++ struct workqueue_struct *wq, ++ struct closure *parent) ++{ ++ closure_init(cl, parent); ++ continue_at_nobarrier(cl, fn, wq); ++} ++ ++#define __closure_wait_event(waitlist, _cond) \ ++do { \ ++ struct closure cl; \ ++ \ ++ closure_init_stack(&cl); \ ++ \ ++ while (1) { \ ++ closure_wait(waitlist, &cl); \ ++ if (_cond) \ ++ break; \ ++ closure_sync(&cl); \ ++ } \ ++ closure_wake_up(waitlist); \ ++ closure_sync(&cl); \ ++} while (0) ++ ++#define closure_wait_event(waitlist, _cond) \ ++do { \ ++ if (!(_cond)) \ ++ __closure_wait_event(waitlist, _cond); \ ++} while (0) ++ ++#endif /* _LINUX_CLOSURE_H */ +diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h +index c8f03d2969df..6165f4f769b6 100644 +--- a/include/linux/compiler_attributes.h ++++ b/include/linux/compiler_attributes.h +@@ -271,4 +271,9 @@ + */ + #define __weak __attribute__((__weak__)) + ++/* ++ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute ++ */ ++#define __flatten __attribute__((flatten)) ++ + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index a81f0c3cf352..053e33f5afd9 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -256,6 +256,7 @@ extern struct dentry * d_make_root(struct inode *); + /* - the ramfs-type tree */ + extern void d_genocide(struct dentry *); + ++extern void d_mark_tmpfile(struct dentry *, struct inode *); + extern void d_tmpfile(struct dentry *, struct inode *); + + extern struct dentry *d_find_alias(struct inode *); +diff --git a/include/linux/fs.h b/include/linux/fs.h +index f5abba86107d..a0793e83b266 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3088,6 +3088,7 @@ extern struct inode *find_inode_rcu(struct super_block *, unsigned long, + extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); + extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); + extern int insert_inode_locked(struct inode *); ++extern struct inode *insert_inode_locked2(struct inode *); + #ifdef CONFIG_DEBUG_LOCK_ALLOC + extern void lockdep_annotate_inode_mutex_key(struct inode *inode); + #else +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index cf2468da68e9..25cadac5e90d 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -645,10 +645,15 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size) + return 0; + } + +-int add_to_page_cache_locked(struct page *page, struct address_space *mapping, +- pgoff_t index, gfp_t gfp_mask); ++int add_to_page_cache(struct page *page, struct address_space *mapping, ++ pgoff_t index, gfp_t gfp_mask); + int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t index, gfp_t gfp_mask); ++int add_to_page_cache_lru_vec(struct address_space *mapping, ++ struct page **pages, ++ unsigned nr_pages, ++ pgoff_t offset, gfp_t gfp_mask); ++ + extern void delete_from_page_cache(struct page *page); + extern void __delete_from_page_cache(struct page *page, void *shadow); + int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); +@@ -666,22 +671,6 @@ void page_cache_readahead_unbounded(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_count); + +-/* +- * Like add_to_page_cache_locked, but used to add 
newly allocated pages: +- * the page is new, so we can just run __SetPageLocked() against it. +- */ +-static inline int add_to_page_cache(struct page *page, +- struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) +-{ +- int error; +- +- __SetPageLocked(page); +- error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); +- if (unlikely(error)) +- __ClearPageLocked(page); +- return error; +-} +- + /** + * struct readahead_control - Describes a readahead request. + * +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 683372943093..6340de2990ff 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -739,6 +739,7 @@ struct task_struct { + + struct mm_struct *mm; + struct mm_struct *active_mm; ++ struct address_space *faults_disabled_mapping; + + /* Per-thread vma caching: */ + struct vmacache vmacache; +diff --git a/include/linux/six.h b/include/linux/six.h +new file mode 100644 +index 000000000000..a16e94f482e9 +--- /dev/null ++++ b/include/linux/six.h +@@ -0,0 +1,197 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _LINUX_SIX_H ++#define _LINUX_SIX_H ++ ++/* ++ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw ++ * semaphores, except with a third intermediate state, intent. Basic operations ++ * are: ++ * ++ * six_lock_read(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * ++ * six_lock_intent(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * ++ * Intent locks block other intent locks, but do not block read locks, and you ++ * must have an intent lock held before taking a write lock, like so: ++ * ++ * six_lock_intent(&foo->lock); ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * Other operations: ++ * ++ * six_trylock_read() ++ * six_trylock_intent() ++ * six_trylock_write() ++ * ++ * six_lock_downgrade(): convert from intent to read ++ * six_lock_tryupgrade(): attempt to convert from read to intent ++ * ++ * Locks also embed a sequence number, which is incremented when the lock is ++ * locked or unlocked for write. The current sequence number can be grabbed ++ * while a lock is held from lock->state.seq; then, if you drop the lock you can ++ * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock ++ * iff it hasn't been locked for write in the meantime. ++ * ++ * There are also operations that take the lock type as a parameter, where the ++ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: ++ * ++ * six_lock_type(lock, type) ++ * six_unlock_type(lock, type) ++ * six_relock(lock, type, seq) ++ * six_trylock_type(lock, type) ++ * six_trylock_convert(lock, from, to) ++ * ++ * A lock may be held multiple types by the same thread (for read or intent, ++ * not write). However, the six locks code does _not_ implement the actual ++ * recursive checks itself though - rather, if your code (e.g. btree iterator ++ * code) knows that the current thread already has a lock held, and for the ++ * correct type, six_lock_increment() may be used to bump up the counter for ++ * that type - the only effect is that one more call to unlock will be required ++ * before the lock is unlocked. 
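++ *
++ * Illustrative sketch of the sequence-number retry pattern described above
++ * (struct foo and the retry label are hypothetical):
++ *
++ *	u32 seq;
++ *
++ *	six_lock_read(&foo->lock, NULL, NULL);
++ *	seq = foo->lock.state.seq;
++ *	six_unlock_read(&foo->lock);
++ *
++ *	... do work that does not need the lock held ...
++ *
++ *	if (!six_relock_read(&foo->lock, seq))
++ *		goto retry;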
++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define SIX_LOCK_SEPARATE_LOCKFNS ++ ++union six_lock_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ /* for waitlist_bitnr() */ ++ unsigned long l; ++ }; ++ ++ struct { ++ unsigned read_lock:28; ++ unsigned intent_lock:1; ++ unsigned waiters:3; ++ /* ++ * seq works much like in seqlocks: it's incremented every time ++ * we lock and unlock for write. ++ * ++ * If it's odd write lock is held, even unlocked. ++ * ++ * Thus readers can unlock, and then lock again later iff it ++ * hasn't been modified in the meantime. ++ */ ++ u32 seq; ++ }; ++}; ++ ++enum six_lock_type { ++ SIX_LOCK_read, ++ SIX_LOCK_intent, ++ SIX_LOCK_write, ++}; ++ ++struct six_lock { ++ union six_lock_state state; ++ unsigned intent_lock_recurse; ++ struct task_struct *owner; ++ struct optimistic_spin_queue osq; ++ ++ raw_spinlock_t wait_lock; ++ struct list_head wait_list[2]; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); ++ ++static __always_inline void __six_lock_init(struct six_lock *lock, ++ const char *name, ++ struct lock_class_key *key) ++{ ++ atomic64_set(&lock->state.counter, 0); ++ raw_spin_lock_init(&lock->wait_lock); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++} ++ ++#define six_lock_init(lock) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __six_lock_init((lock), #lock, &__key); \ ++} while (0) ++ ++#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *); \ ++bool six_relock_##type(struct six_lock *, u32); \ ++int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ ++void six_unlock_##type(struct six_lock *); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++#undef __SIX_LOCK ++ ++#define SIX_LOCK_DISPATCH(type, fn, ...) 
\ ++ switch (type) { \ ++ case SIX_LOCK_read: \ ++ return fn##_read(__VA_ARGS__); \ ++ case SIX_LOCK_intent: \ ++ return fn##_intent(__VA_ARGS__); \ ++ case SIX_LOCK_write: \ ++ return fn##_write(__VA_ARGS__); \ ++ default: \ ++ BUG(); \ ++ } ++ ++static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_trylock, lock); ++} ++ ++static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ SIX_LOCK_DISPATCH(type, six_relock, lock, seq); ++} ++ ++static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); ++} ++ ++static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_unlock, lock); ++} ++ ++void six_lock_downgrade(struct six_lock *); ++bool six_lock_tryupgrade(struct six_lock *); ++bool six_trylock_convert(struct six_lock *, enum six_lock_type, ++ enum six_lock_type); ++ ++void six_lock_increment(struct six_lock *, enum six_lock_type); ++ ++void six_lock_wakeup_all(struct six_lock *); ++ ++#endif /* _LINUX_SIX_H */ +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 0221f852a7e1..f81f60d891ac 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -106,6 +106,7 @@ extern void *vzalloc(unsigned long size); + extern void *vmalloc_user(unsigned long size); + extern void *vmalloc_node(unsigned long size, int node); + extern void *vzalloc_node(unsigned long size, int node); ++extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask); + extern void *vmalloc_32(unsigned long size); + extern void *vmalloc_32_user(unsigned long size); + extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +new file mode 100644 +index 000000000000..9b4e8295ed75 +--- /dev/null ++++ b/include/trace/events/bcachefs.h +@@ -0,0 +1,664 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM bcachefs ++ ++#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_BCACHE_H ++ ++#include ++ ++DECLARE_EVENT_CLASS(bpos, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = p->inode; ++ __entry->offset = p->offset; ++ ), ++ ++ TP_printk("%llu:%llu", __entry->inode, __entry->offset) ++); ++ ++DECLARE_EVENT_CLASS(bkey, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, size ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = k->p.inode; ++ __entry->offset = k->p.offset; ++ __entry->size = k->size; ++ ), ++ ++ TP_printk("%llu:%llu len %u", __entry->inode, ++ __entry->offset, __entry->size) ++); ++ ++DECLARE_EVENT_CLASS(bch_fs, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU", __entry->uuid) ++); ++ ++DECLARE_EVENT_CLASS(bio, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(sector_t, sector ) ++ __field(unsigned int, nr_sector ) ++ __array(char, rwbs, 6 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = bio->bi_disk ? 
bio_dev(bio) : 0; ++ __entry->sector = bio->bi_iter.bi_sector; ++ __entry->nr_sector = bio->bi_iter.bi_size >> 9; ++ blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ++ ), ++ ++ TP_printk("%d,%d %s %llu + %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, ++ (unsigned long long)__entry->sector, __entry->nr_sector) ++); ++ ++/* io.c: */ ++ ++DEFINE_EVENT(bio, read_split, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_bounce, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_retry, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, promote, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++/* Journal */ ++ ++DEFINE_EVENT(bch_fs, journal_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, journal_entry_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bio, journal_write, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++/* bset.c: */ ++ ++DEFINE_EVENT(bpos, bkey_pack_pos_fail, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p) ++); ++ ++/* Btree */ ++ ++DECLARE_EVENT_CLASS(btree_node, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u8, level ) ++ __field(u8, id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->level = b->c.level; ++ __entry->id = b->c.btree_id; ++ __entry->inode = b->key.k.p.inode; ++ __entry->offset = b->key.k.p.offset; ++ ), ++ ++ TP_printk("%pU %u id %u %llu:%llu", ++ __entry->uuid, __entry->level, __entry->id, ++ __entry->inode, __entry->offset) ++); ++ ++DEFINE_EVENT(btree_node, btree_read, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_write, ++ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), ++ TP_ARGS(b, bytes, sectors), ++ ++ TP_STRUCT__entry( ++ __field(enum btree_node_type, type) ++ __field(unsigned, bytes ) ++ __field(unsigned, sectors ) ++ ), ++ ++ TP_fast_assign( ++ __entry->type = btree_node_type(b); ++ __entry->bytes = bytes; ++ __entry->sectors = sectors; ++ ), ++ ++ TP_printk("bkey type %u bytes %u sectors %u", ++ __entry->type , __entry->bytes, __entry->sectors) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_alloc, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_free, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_reap, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU", __entry->uuid) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++TRACE_EVENT(btree_reserve_get_fail, ++ TP_PROTO(struct bch_fs *c, size_t required, struct closure 
*cl), ++ TP_ARGS(c, required, cl), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(size_t, required ) ++ __field(struct closure *, cl ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->required = required; ++ __entry->cl = cl; ++ ), ++ ++ TP_printk("%pU required %zu by %p", __entry->uuid, ++ __entry->required, __entry->cl) ++); ++ ++TRACE_EVENT(btree_insert_key, ++ TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), ++ TP_ARGS(c, b, k), ++ ++ TP_STRUCT__entry( ++ __field(u8, id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, size ) ++ ), ++ ++ TP_fast_assign( ++ __entry->id = b->c.btree_id; ++ __entry->inode = k->k.p.inode; ++ __entry->offset = k->k.p.offset; ++ __entry->size = k->k.size; ++ ), ++ ++ TP_printk("btree %u: %llu:%llu len %u", __entry->id, ++ __entry->inode, __entry->offset, __entry->size) ++); ++ ++DEFINE_EVENT(btree_node, btree_split, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_compact, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_merge, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_set_root, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++/* Garbage collection */ ++ ++DEFINE_EVENT(btree_node, btree_gc_coalesce, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_gc_coalesce_fail, ++ TP_PROTO(struct bch_fs *c, int reason), ++ TP_ARGS(c, reason), ++ ++ TP_STRUCT__entry( ++ __field(u8, reason ) ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->reason = reason; ++ memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU: %u", __entry->uuid, __entry->reason) ++); ++ ++DEFINE_EVENT(btree_node, btree_gc_rewrite_node, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(bch_fs, gc_start, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_end, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_coalesce_start, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_coalesce_end, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++/* Allocator */ ++ ++TRACE_EVENT(alloc_batch, ++ TP_PROTO(struct bch_dev *ca, size_t free, size_t total), ++ TP_ARGS(ca, free, total), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(size_t, free ) ++ __field(size_t, total ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, ca->uuid.b, 16); ++ __entry->free = free; ++ __entry->total = total; ++ ), ++ ++ TP_printk("%pU free %zu total %zu", ++ __entry->uuid, __entry->free, __entry->total) ++); ++ ++TRACE_EVENT(invalidate, ++ TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), ++ TP_ARGS(ca, offset, sectors), ++ ++ TP_STRUCT__entry( ++ __field(unsigned, sectors ) ++ __field(dev_t, dev ) ++ __field(__u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->offset = offset, ++ __entry->sectors = sectors; ++ ), ++ ++ TP_printk("invalidated %u sectors at %d,%d sector=%llu", ++ __entry->sectors, MAJOR(__entry->dev), ++ MINOR(__entry->dev), 
__entry->offset) ++); ++ ++DEFINE_EVENT(bch_fs, rescale_prios, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DECLARE_EVENT_CLASS(bucket_alloc, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16) ++ __field(enum alloc_reserve, reserve ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, ca->uuid.b, 16); ++ __entry->reserve = reserve; ++ ), ++ ++ TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, bucket_alloc, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++/* Moving IO */ ++ ++DEFINE_EVENT(bkey, move_extent, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_alloc_fail, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_race, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++TRACE_EVENT(move_data, ++ TP_PROTO(struct bch_fs *c, u64 sectors_moved, ++ u64 keys_moved), ++ TP_ARGS(c, sectors_moved, keys_moved), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, sectors_moved ) ++ __field(u64, keys_moved ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->sectors_moved = sectors_moved; ++ __entry->keys_moved = keys_moved; ++ ), ++ ++ TP_printk("%pU sectors_moved %llu keys_moved %llu", ++ __entry->uuid, __entry->sectors_moved, __entry->keys_moved) ++); ++ ++TRACE_EVENT(copygc, ++ TP_PROTO(struct bch_fs *c, ++ u64 sectors_moved, u64 sectors_not_moved, ++ u64 buckets_moved, u64 buckets_not_moved), ++ TP_ARGS(c, ++ sectors_moved, sectors_not_moved, ++ buckets_moved, buckets_not_moved), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, sectors_moved ) ++ __field(u64, sectors_not_moved ) ++ __field(u64, buckets_moved ) ++ __field(u64, buckets_not_moved ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->sectors_moved = sectors_moved; ++ __entry->sectors_not_moved = sectors_not_moved; ++ __entry->buckets_moved = buckets_moved; ++ __entry->buckets_not_moved = buckets_moved; ++ ), ++ ++ TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", ++ __entry->uuid, ++ __entry->sectors_moved, __entry->sectors_not_moved, ++ __entry->buckets_moved, __entry->buckets_not_moved) ++); ++ ++TRACE_EVENT(transaction_restart_ip, ++ TP_PROTO(unsigned long caller, unsigned long ip), ++ TP_ARGS(caller, ip), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, caller ) ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->caller = caller; ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) ++); ++ ++DECLARE_EVENT_CLASS(transaction_restart, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%pf", (void *) __entry->ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ 
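++/*
++ * TRACE_EVENT()/DEFINE_EVENT() generate trace_<name>() helpers, so the
++ * event above can be emitted with a call along the lines of
++ *
++ *	trace_trans_restart_would_deadlock(_RET_IP_);
++ *
++ * (illustrative call site only - the actual callers live in fs/bcachefs/).
++ */
++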
++TRACE_EVENT(trans_restart_iters_realloced, ++ TP_PROTO(unsigned long ip, unsigned nr), ++ TP_ARGS(ip, nr), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ __field(unsigned, nr ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ __entry->nr = nr; ++ ), ++ ++ TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) ++); ++ ++TRACE_EVENT(trans_restart_mem_realloced, ++ TP_PROTO(unsigned long ip, unsigned long bytes), ++ TP_ARGS(ip, bytes), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ __field(unsigned long, bytes ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ __entry->bytes = bytes; ++ ), ++ ++ TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_mark, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_upgrade, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_traverse, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_atomic, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DECLARE_EVENT_CLASS(node_lock_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq), ++ ++ TP_STRUCT__entry( ++ __field(u32, level) ++ __field(u32, iter_seq) ++ __field(u32, node) ++ __field(u32, node_seq) ++ ), ++ ++ TP_fast_assign( ++ __entry->level = level; ++ __entry->iter_seq = iter_seq; ++ __entry->node = node; ++ __entry->node_seq = node_seq; ++ ), ++ ++ TP_printk("level %u iter seq %u node %u node seq %u", ++ __entry->level, __entry->iter_seq, ++ __entry->node, __entry->node_seq) ++); ++ ++DEFINE_EVENT(node_lock_fail, node_upgrade_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq) ++); ++ ++DEFINE_EVENT(node_lock_fail, node_relock_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq) ++); ++ ++#endif /* _TRACE_BCACHE_H */ ++ ++/* This part must be outside protection */ ++#include +diff --git a/init/init_task.c b/init/init_task.c +index 15089d15010a..61d969e94569 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -83,6 +83,7 @@ struct task_struct init_task + .nr_cpus_allowed= NR_CPUS, + .mm = NULL, + .active_mm = &init_mm, ++ .faults_disabled_mapping = NULL, + .restart_block = { + .fn = do_no_restart_syscall, + }, +diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks +index 3de8fd11873b..ab8aa082ce56 100644 +--- a/kernel/Kconfig.locks ++++ b/kernel/Kconfig.locks +@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB + config MMIOWB + def_bool y if ARCH_HAS_MMIOWB + depends on SMP ++ ++config SIXLOCKS ++ bool +diff --git 
a/kernel/locking/Makefile b/kernel/locking/Makefile +index 6d11cfb9b41f..4c13937e8f37 100644 +--- a/kernel/locking/Makefile ++++ b/kernel/locking/Makefile +@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o + obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o + obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o + obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o ++obj-$(CONFIG_SIXLOCKS) += six.o +diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h +index baca699b94e9..4abb462d914d 100644 +--- a/kernel/locking/lockdep_internals.h ++++ b/kernel/locking/lockdep_internals.h +@@ -96,7 +96,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = + #else + #define MAX_LOCKDEP_ENTRIES 32768UL + +-#define MAX_LOCKDEP_CHAINS_BITS 16 ++#define MAX_LOCKDEP_CHAINS_BITS 18 + + /* + * Stack-trace: tightly packed array of stack backtrace +@@ -114,7 +114,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = + + #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +-#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) ++#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*10) + + extern struct list_head all_lock_classes; + extern struct lock_chain lock_chains[]; +diff --git a/kernel/locking/six.c b/kernel/locking/six.c +new file mode 100644 +index 000000000000..49d46ed2e18e +--- /dev/null ++++ b/kernel/locking/six.c +@@ -0,0 +1,553 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef DEBUG ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) do {} while (0) ++#endif ++ ++#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) ++#define six_release(l) lock_release(l, _RET_IP_) ++ ++struct six_lock_vals { ++ /* Value we add to the lock in order to take the lock: */ ++ u64 lock_val; ++ ++ /* If the lock has this value (used as a mask), taking the lock fails: */ ++ u64 lock_fail; ++ ++ /* Value we add to the lock in order to release the lock: */ ++ u64 unlock_val; ++ ++ /* Mask that indicates lock is held for this type: */ ++ u64 held_mask; ++ ++ /* Waitlist we wakeup when releasing the lock: */ ++ enum six_lock_type unlock_wakeup; ++}; ++ ++#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) ++#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) ++#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) ++ ++#define LOCK_VALS { \ ++ [SIX_LOCK_read] = { \ ++ .lock_val = __SIX_VAL(read_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_write, \ ++ .unlock_val = -__SIX_VAL(read_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_read, \ ++ .unlock_wakeup = SIX_LOCK_write, \ ++ }, \ ++ [SIX_LOCK_intent] = { \ ++ .lock_val = __SIX_VAL(intent_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_intent, \ ++ .unlock_val = -__SIX_VAL(intent_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_intent, \ ++ .unlock_wakeup = SIX_LOCK_intent, \ ++ }, \ ++ [SIX_LOCK_write] = { \ ++ .lock_val = __SIX_VAL(seq, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_read, \ ++ .unlock_val = __SIX_VAL(seq, 1), \ ++ .held_mask = __SIX_LOCK_HELD_write, \ ++ .unlock_wakeup = SIX_LOCK_read, \ ++ }, \ ++} ++ ++static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, ++ union six_lock_state old) ++{ ++ if (type != SIX_LOCK_intent) ++ return; ++ ++ if (!old.intent_lock) { ++ EBUG_ON(lock->owner); ++ lock->owner = current; ++ } else { ++ EBUG_ON(lock->owner != current); ++ } ++} ++ ++static __always_inline bool do_six_trylock_type(struct six_lock *lock, ++ enum six_lock_type type) ++{ ++ const struct 
six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ EBUG_ON(type == SIX_LOCK_write && lock->owner != current); ++ ++ do { ++ old.v = v; ++ ++ EBUG_ON(type == SIX_LOCK_write && ++ ((old.v & __SIX_LOCK_HELD_write) || ++ !(old.v & __SIX_LOCK_HELD_intent))); ++ ++ if (old.v & l[type].lock_fail) ++ return false; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, ++ old.v + l[type].lock_val)) != old.v); ++ ++ six_set_owner(lock, type, old); ++ return true; ++} ++ ++__always_inline __flatten ++static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ if (!do_six_trylock_type(lock, type)) ++ return false; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++__always_inline __flatten ++static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ do { ++ old.v = v; ++ ++ if (old.seq != seq || old.v & l[type].lock_fail) ++ return false; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, ++ old.v + l[type].lock_val)) != old.v); ++ ++ six_set_owner(lock, type, old); ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++struct six_lock_waiter { ++ struct list_head list; ++ struct task_struct *task; ++}; ++ ++/* This is probably up there with the more evil things I've done */ ++#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) ++ ++#ifdef CONFIG_LOCK_SPIN_ON_OWNER ++ ++static inline int six_can_spin_on_owner(struct six_lock *lock) ++{ ++ struct task_struct *owner; ++ int retval = 1; ++ ++ if (need_resched()) ++ return 0; ++ ++ rcu_read_lock(); ++ owner = READ_ONCE(lock->owner); ++ if (owner) ++ retval = owner->on_cpu; ++ rcu_read_unlock(); ++ /* ++ * if lock->owner is not set, the mutex owner may have just acquired ++ * it and not set the owner yet or the mutex has been released. ++ */ ++ return retval; ++} ++ ++static inline bool six_spin_on_owner(struct six_lock *lock, ++ struct task_struct *owner) ++{ ++ bool ret = true; ++ ++ rcu_read_lock(); ++ while (lock->owner == owner) { ++ /* ++ * Ensure we emit the owner->on_cpu, dereference _after_ ++ * checking lock->owner still matches owner. If that fails, ++ * owner might point to freed memory. If it still matches, ++ * the rcu_read_lock() ensures the memory stays valid. ++ */ ++ barrier(); ++ ++ if (!owner->on_cpu || need_resched()) { ++ ret = false; ++ break; ++ } ++ ++ cpu_relax(); ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ struct task_struct *task = current; ++ ++ if (type == SIX_LOCK_write) ++ return false; ++ ++ preempt_disable(); ++ if (!six_can_spin_on_owner(lock)) ++ goto fail; ++ ++ if (!osq_lock(&lock->osq)) ++ goto fail; ++ ++ while (1) { ++ struct task_struct *owner; ++ ++ /* ++ * If there's an owner, wait for it to either ++ * release the lock or go to sleep. ++ */ ++ owner = READ_ONCE(lock->owner); ++ if (owner && !six_spin_on_owner(lock, owner)) ++ break; ++ ++ if (do_six_trylock_type(lock, type)) { ++ osq_unlock(&lock->osq); ++ preempt_enable(); ++ return true; ++ } ++ ++ /* ++ * When there's no owner, we might have preempted between the ++ * owner acquiring the lock and setting the owner field. 
If ++ * we're an RT task that will live-lock because we won't let ++ * the owner complete. ++ */ ++ if (!owner && (need_resched() || rt_task(task))) ++ break; ++ ++ /* ++ * The cpu_relax() call is a compiler barrier which forces ++ * everything in this loop to be re-loaded. We don't need ++ * memory barriers as we'll eventually observe the right ++ * values at the cost of a few extra spins. ++ */ ++ cpu_relax(); ++ } ++ ++ osq_unlock(&lock->osq); ++fail: ++ preempt_enable(); ++ ++ /* ++ * If we fell out of the spin path because of need_resched(), ++ * reschedule now, before we try-lock again. This avoids getting ++ * scheduled out right after we obtained the lock. ++ */ ++ if (need_resched()) ++ schedule(); ++ ++ return false; ++} ++ ++#else /* CONFIG_LOCK_SPIN_ON_OWNER */ ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ return false; ++} ++ ++#endif ++ ++noinline ++static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old, new; ++ struct six_lock_waiter wait; ++ int ret = 0; ++ u64 v; ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ return ret; ++ ++ if (six_optimistic_spin(lock, type)) ++ return 0; ++ ++ lock_contended(&lock->dep_map, _RET_IP_); ++ ++ INIT_LIST_HEAD(&wait.list); ++ wait.task = current; ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (type == SIX_LOCK_write) ++ EBUG_ON(lock->owner != current); ++ else if (list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_add_tail(&wait.list, &lock->wait_list[type]); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ break; ++ ++ v = READ_ONCE(lock->state.v); ++ do { ++ new.v = old.v = v; ++ ++ if (!(old.v & l[type].lock_fail)) ++ new.v += l[type].lock_val; ++ else if (!(new.waiters & (1 << type))) ++ new.waiters |= 1 << type; ++ else ++ break; /* waiting bit already set */ ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ if (!(old.v & l[type].lock_fail)) ++ break; ++ ++ schedule(); ++ } ++ ++ if (!ret) ++ six_set_owner(lock, type, old); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (!list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_del_init(&wait.list); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++ ++ return ret; ++} ++ ++__always_inline ++static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ int ret; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 0); ++ ++ ret = do_six_trylock_type(lock, type) ? 
0 ++ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); ++ ++ if (ret && type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ if (!ret) ++ lock_acquired(&lock->dep_map, _RET_IP_); ++ ++ return ret; ++} ++ ++static inline void six_lock_wakeup(struct six_lock *lock, ++ union six_lock_state state, ++ unsigned waitlist_id) ++{ ++ struct list_head *wait_list = &lock->wait_list[waitlist_id]; ++ struct six_lock_waiter *w, *next; ++ ++ if (waitlist_id == SIX_LOCK_write && state.read_lock) ++ return; ++ ++ if (!(state.waiters & (1 << waitlist_id))) ++ return; ++ ++ clear_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ ++ if (waitlist_id == SIX_LOCK_write) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ ++ if (p) ++ wake_up_process(p); ++ return; ++ } ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry_safe(w, next, wait_list, list) { ++ list_del_init(&w->list); ++ ++ if (wake_up_process(w->task) && ++ waitlist_id != SIX_LOCK_read) { ++ if (!list_empty(wait_list)) ++ set_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ break; ++ } ++ } ++ ++ raw_spin_unlock(&lock->wait_lock); ++} ++ ++__always_inline __flatten ++static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state state; ++ ++ EBUG_ON(!(lock->state.v & l[type].held_mask)); ++ EBUG_ON(type == SIX_LOCK_write && ++ !(lock->state.v & __SIX_LOCK_HELD_intent)); ++ ++ if (type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ ++ if (type == SIX_LOCK_intent) { ++ EBUG_ON(lock->owner != current); ++ ++ if (lock->intent_lock_recurse) { ++ --lock->intent_lock_recurse; ++ return; ++ } ++ ++ lock->owner = NULL; ++ } ++ ++ state.v = atomic64_add_return_release(l[type].unlock_val, ++ &lock->state.counter); ++ six_lock_wakeup(lock, state, l[type].unlock_wakeup); ++} ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *lock) \ ++{ \ ++ return __six_trylock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_trylock_##type); \ ++ \ ++bool six_relock_##type(struct six_lock *lock, u32 seq) \ ++{ \ ++ return __six_relock_type(lock, SIX_LOCK_##type, seq); \ ++} \ ++EXPORT_SYMBOL_GPL(six_relock_##type); \ ++ \ ++int six_lock_##type(struct six_lock *lock, \ ++ six_lock_should_sleep_fn should_sleep_fn, void *p) \ ++{ \ ++ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ ++} \ ++EXPORT_SYMBOL_GPL(six_lock_##type); \ ++ \ ++void six_unlock_##type(struct six_lock *lock) \ ++{ \ ++ __six_unlock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_unlock_##type); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++ ++#undef __SIX_LOCK ++ ++/* Convert from intent to read: */ ++void six_lock_downgrade(struct six_lock *lock) ++{ ++ six_lock_increment(lock, SIX_LOCK_read); ++ six_unlock_intent(lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_downgrade); ++ ++bool six_lock_tryupgrade(struct six_lock *lock) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old, new; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ do { ++ new.v = old.v = v; ++ ++ EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask)); ++ ++ new.v += l[SIX_LOCK_read].unlock_val; ++ ++ if (new.v & l[SIX_LOCK_intent].lock_fail) ++ return false; ++ ++ new.v += l[SIX_LOCK_intent].lock_val; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ six_set_owner(lock, SIX_LOCK_intent, old); ++ six_lock_wakeup(lock, new, 
l[SIX_LOCK_read].unlock_wakeup); ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_lock_tryupgrade); ++ ++bool six_trylock_convert(struct six_lock *lock, ++ enum six_lock_type from, ++ enum six_lock_type to) ++{ ++ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); ++ ++ if (to == from) ++ return true; ++ ++ if (to == SIX_LOCK_read) { ++ six_lock_downgrade(lock); ++ return true; ++ } else { ++ return six_lock_tryupgrade(lock); ++ } ++} ++EXPORT_SYMBOL_GPL(six_trylock_convert); ++ ++/* ++ * Increment read/intent lock count, assuming we already have it read or intent ++ * locked: ++ */ ++void six_lock_increment(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ ++ EBUG_ON(type == SIX_LOCK_write); ++ six_acquire(&lock->dep_map, 0); ++ ++ /* XXX: assert already locked, and that we don't overflow: */ ++ ++ switch (type) { ++ case SIX_LOCK_read: ++ atomic64_add(l[type].lock_val, &lock->state.counter); ++ break; ++ case SIX_LOCK_intent: ++ lock->intent_lock_recurse++; ++ break; ++ case SIX_LOCK_write: ++ BUG(); ++ break; ++ } ++} ++EXPORT_SYMBOL_GPL(six_lock_increment); ++ ++void six_lock_wakeup_all(struct six_lock *lock) ++{ ++ struct six_lock_waiter *w; ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry(w, &lock->wait_list[0], list) ++ wake_up_process(w->task); ++ list_for_each_entry(w, &lock->wait_list[1], list) ++ wake_up_process(w->task); ++ ++ raw_spin_unlock(&lock->wait_lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_wakeup_all); +diff --git a/kernel/module.c b/kernel/module.c +index aa183c9ac0a2..fdfe519a0393 100644 +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -2786,9 +2786,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) + + void * __weak module_alloc(unsigned long size) + { +- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, +- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, +- NUMA_NO_NODE, __builtin_return_address(0)); ++ return vmalloc_exec(size, GFP_KERNEL); + } + + bool __weak module_init_section(const char *name) +diff --git a/lib/Kconfig b/lib/Kconfig +index df3f3da95990..086d332ab5c8 100644 +--- a/lib/Kconfig ++++ b/lib/Kconfig +@@ -457,6 +457,9 @@ config ASSOCIATIVE_ARRAY + + for more information. + ++config CLOSURES ++ bool ++ + config HAS_IOMEM + bool + depends on !NO_IOMEM +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 9ad9210d70a1..51558639ee91 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -1466,6 +1466,15 @@ config DEBUG_CREDENTIALS + + source "kernel/rcu/Kconfig.debug" + ++config DEBUG_CLOSURES ++ bool "Debug closures (bcache async widgits)" ++ depends on CLOSURES ++ select DEBUG_FS ++ help ++ Keeps all active closures in a linked list and provides a debugfs ++ interface to list them, which makes it possible to see asynchronous ++ operations that get stuck. 
++ + config DEBUG_WQ_FORCE_RR_CPU + bool "Force round-robin CPU selection for unbound work items" + depends on DEBUG_KERNEL +diff --git a/lib/Makefile b/lib/Makefile +index b1c42c10073b..7d6921a5c823 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -208,6 +208,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o + + obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o + ++obj-$(CONFIG_CLOSURES) += closure.o ++ + obj-$(CONFIG_DQL) += dynamic_queue_limits.o + + obj-$(CONFIG_GLOB) += glob.o +diff --git a/lib/closure.c b/lib/closure.c +new file mode 100644 +index 000000000000..3e6366c26209 +--- /dev/null ++++ b/lib/closure.c +@@ -0,0 +1,214 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Asynchronous refcounty things ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++static inline void closure_put_after_sub(struct closure *cl, int flags) ++{ ++ int r = flags & CLOSURE_REMAINING_MASK; ++ ++ BUG_ON(flags & CLOSURE_GUARD_MASK); ++ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); ++ ++ if (!r) { ++ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { ++ atomic_set(&cl->remaining, ++ CLOSURE_REMAINING_INITIALIZER); ++ closure_queue(cl); ++ } else { ++ struct closure *parent = cl->parent; ++ closure_fn *destructor = cl->fn; ++ ++ closure_debug_destroy(cl); ++ ++ if (destructor) ++ destructor(cl); ++ ++ if (parent) ++ closure_put(parent); ++ } ++ } ++} ++ ++/* For clearing flags with the same atomic op as a put */ ++void closure_sub(struct closure *cl, int v) ++{ ++ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); ++} ++EXPORT_SYMBOL(closure_sub); ++ ++/* ++ * closure_put - decrement a closure's refcount ++ */ ++void closure_put(struct closure *cl) ++{ ++ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); ++} ++EXPORT_SYMBOL(closure_put); ++ ++/* ++ * closure_wake_up - wake up all closures on a wait list, without memory barrier ++ */ ++void __closure_wake_up(struct closure_waitlist *wait_list) ++{ ++ struct llist_node *list; ++ struct closure *cl, *t; ++ struct llist_node *reverse = NULL; ++ ++ list = llist_del_all(&wait_list->list); ++ ++ /* We first reverse the list to preserve FIFO ordering and fairness */ ++ reverse = llist_reverse_order(list); ++ ++ /* Then do the wakeups */ ++ llist_for_each_entry_safe(cl, t, reverse, list) { ++ closure_set_waiting(cl, 0); ++ closure_sub(cl, CLOSURE_WAITING + 1); ++ } ++} ++EXPORT_SYMBOL(__closure_wake_up); ++ ++/** ++ * closure_wait - add a closure to a waitlist ++ * @waitlist: will own a ref on @cl, which will be released when ++ * closure_wake_up() is called on @waitlist. ++ * @cl: closure pointer. 
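++ *
++ * Returns false (and takes no ref) if @cl is already on a waitlist.
++ *
++ * Sketch of a typical caller, following the __closure_wait_event() helper
++ * in include/linux/closure.h (the condition is hypothetical):
++ *
++ *	while (1) {
++ *		closure_wait(&waitlist, &cl);
++ *		if (condition)
++ *			break;
++ *		closure_sync(&cl);
++ *	}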
++ * ++ */ ++bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) ++{ ++ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) ++ return false; ++ ++ closure_set_waiting(cl, _RET_IP_); ++ atomic_add(CLOSURE_WAITING + 1, &cl->remaining); ++ llist_add(&cl->list, &waitlist->list); ++ ++ return true; ++} ++EXPORT_SYMBOL(closure_wait); ++ ++struct closure_syncer { ++ struct task_struct *task; ++ int done; ++}; ++ ++static void closure_sync_fn(struct closure *cl) ++{ ++ struct closure_syncer *s = cl->s; ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = READ_ONCE(s->task); ++ s->done = 1; ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++void __sched __closure_sync(struct closure *cl) ++{ ++ struct closure_syncer s = { .task = current }; ++ ++ cl->s = &s; ++ continue_at(cl, closure_sync_fn, NULL); ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (s.done) ++ break; ++ schedule(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++} ++EXPORT_SYMBOL(__closure_sync); ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++ ++static LIST_HEAD(closure_list); ++static DEFINE_SPINLOCK(closure_list_lock); ++ ++void closure_debug_create(struct closure *cl) ++{ ++ unsigned long flags; ++ ++ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); ++ cl->magic = CLOSURE_MAGIC_ALIVE; ++ ++ spin_lock_irqsave(&closure_list_lock, flags); ++ list_add(&cl->all, &closure_list); ++ spin_unlock_irqrestore(&closure_list_lock, flags); ++} ++EXPORT_SYMBOL(closure_debug_create); ++ ++void closure_debug_destroy(struct closure *cl) ++{ ++ unsigned long flags; ++ ++ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); ++ cl->magic = CLOSURE_MAGIC_DEAD; ++ ++ spin_lock_irqsave(&closure_list_lock, flags); ++ list_del(&cl->all); ++ spin_unlock_irqrestore(&closure_list_lock, flags); ++} ++EXPORT_SYMBOL(closure_debug_destroy); ++ ++static int debug_seq_show(struct seq_file *f, void *data) ++{ ++ struct closure *cl; ++ ++ spin_lock_irq(&closure_list_lock); ++ ++ list_for_each_entry(cl, &closure_list, all) { ++ int r = atomic_read(&cl->remaining); ++ ++ seq_printf(f, "%p: %pS -> %pS p %p r %i ", ++ cl, (void *) cl->ip, cl->fn, cl->parent, ++ r & CLOSURE_REMAINING_MASK); ++ ++ seq_printf(f, "%s%s\n", ++ test_bit(WORK_STRUCT_PENDING_BIT, ++ work_data_bits(&cl->work)) ? "Q" : "", ++ r & CLOSURE_RUNNING ? 
"R" : ""); ++ ++ if (r & CLOSURE_WAITING) ++ seq_printf(f, " W %pS\n", ++ (void *) cl->waiting_on); ++ ++ seq_puts(f, "\n"); ++ } ++ ++ spin_unlock_irq(&closure_list_lock); ++ return 0; ++} ++ ++static int debug_seq_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, debug_seq_show, NULL); ++} ++ ++static const struct file_operations debug_ops = { ++ .owner = THIS_MODULE, ++ .open = debug_seq_open, ++ .read = seq_read, ++ .release = single_release ++}; ++ ++static int __init closure_debug_init(void) ++{ ++ debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); ++ return 0; ++} ++late_initcall(closure_debug_init) ++ ++#endif +diff --git a/mm/filemap.c b/mm/filemap.c +index 385759c4ce4b..5ca0ff7b9357 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -116,6 +116,69 @@ + * ->tasklist_lock (memory_failure, collect_procs_ao) + */ + ++static int page_cache_tree_insert_vec(struct page *pages[], ++ unsigned nr_pages, ++ struct address_space *mapping, ++ pgoff_t index, ++ gfp_t gfp_mask, ++ void *shadow[]) ++{ ++ XA_STATE(xas, &mapping->i_pages, index); ++ void *old; ++ int i = 0, error = 0; ++ ++ mapping_set_update(&xas, mapping); ++ ++ if (!nr_pages) ++ return 0; ++ ++ xa_lock_irq(&mapping->i_pages); ++ ++ while (1) { ++ old = xas_load(&xas); ++ if (old && !xa_is_value(old)) { ++ error = -EEXIST; ++ break; ++ } ++ ++ xas_store(&xas, pages[i]); ++ error = xas_error(&xas); ++ ++ if (error == -ENOMEM) { ++ xa_unlock_irq(&mapping->i_pages); ++ if (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)) ++ error = 0; ++ xa_lock_irq(&mapping->i_pages); ++ ++ if (!error) ++ continue; ++ break; ++ } ++ ++ if (error) ++ break; ++ ++ if (shadow) ++ shadow[i] = old; ++ if (xa_is_value(old)) ++ mapping->nrexceptional--; ++ mapping->nrpages++; ++ ++ /* hugetlb pages do not participate in page cache accounting. 
*/ ++ if (!PageHuge(pages[i])) ++ __inc_lruvec_page_state(pages[i], NR_FILE_PAGES); ++ ++ if (++i == nr_pages) ++ break; ++ ++ xas_next(&xas); ++ } ++ ++ xa_unlock_irq(&mapping->i_pages); ++ ++ return i ?: error; ++} ++ + static void page_cache_delete(struct address_space *mapping, + struct page *page, void *shadow) + { +@@ -826,114 +889,147 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) + } + EXPORT_SYMBOL_GPL(replace_page_cache_page); + +-static int __add_to_page_cache_locked(struct page *page, +- struct address_space *mapping, +- pgoff_t offset, gfp_t gfp_mask, +- void **shadowp) ++static int add_to_page_cache_vec(struct page **pages, unsigned nr_pages, ++ struct address_space *mapping, ++ pgoff_t index, gfp_t gfp_mask, ++ void *shadow[]) + { +- XA_STATE(xas, &mapping->i_pages, offset); +- int huge = PageHuge(page); +- int error; +- void *old; ++ int i, nr_added = 0, error = 0; + +- VM_BUG_ON_PAGE(!PageLocked(page), page); +- VM_BUG_ON_PAGE(PageSwapBacked(page), page); +- mapping_set_update(&xas, mapping); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pages[i]; + +- get_page(page); +- page->mapping = mapping; +- page->index = offset; ++ VM_BUG_ON_PAGE(PageSwapBacked(page), page); ++ VM_BUG_ON_PAGE(PageSwapCache(page), page); + +- if (!huge) { +- error = mem_cgroup_charge(page, current->mm, gfp_mask); +- if (error) +- goto error; ++ __SetPageLocked(page); ++ get_page(page); ++ page->mapping = mapping; ++ page->index = index + i; ++ ++ if (!PageHuge(page)) { ++ error = mem_cgroup_charge(page, current->mm, gfp_mask); ++ if (error) { ++ page->mapping = NULL; ++ /* Leave page->index set: truncation relies upon it */ ++ put_page(page); ++ __ClearPageLocked(page); ++ if (!i) ++ return error; ++ nr_pages = i; ++ break; ++ } ++ } + } + +- do { +- xas_lock_irq(&xas); +- old = xas_load(&xas); +- if (old && !xa_is_value(old)) +- xas_set_err(&xas, -EEXIST); +- xas_store(&xas, page); +- if (xas_error(&xas)) +- goto unlock; ++ error = page_cache_tree_insert_vec(pages, nr_pages, mapping, ++ index, gfp_mask, shadow); ++ if (error > 0) { ++ nr_added = error; ++ error = 0; ++ } + +- if (xa_is_value(old)) { +- mapping->nrexceptional--; +- if (shadowp) +- *shadowp = old; +- } +- mapping->nrpages++; ++ for (i = 0; i < nr_added; i++) ++ trace_mm_filemap_add_to_page_cache(pages[i]); + +- /* hugetlb pages do not participate in page cache accounting */ +- if (!huge) +- __inc_lruvec_page_state(page, NR_FILE_PAGES); +-unlock: +- xas_unlock_irq(&xas); +- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); ++ for (i = nr_added; i < nr_pages; i++) { ++ struct page *page = pages[i]; + +- if (xas_error(&xas)) { +- error = xas_error(&xas); +- goto error; ++ /* Leave page->index set: truncation relies upon it */ ++ page->mapping = NULL; ++ put_page(page); ++ __ClearPageLocked(page); + } + +- trace_mm_filemap_add_to_page_cache(page); +- return 0; +-error: +- page->mapping = NULL; +- /* Leave page->index set: truncation relies upon it */ +- put_page(page); +- return error; ++ return nr_added ?: error; + } +-ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); + + /** +- * add_to_page_cache_locked - add a locked page to the pagecache ++ * add_to_page_cache - add a newly allocated page to the pagecache + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * +- * This function is used to add a page to the pagecache. It must be locked. +- * This function does not add the page to the LRU. 
The caller must do that. ++ * This function is used to add a page to the pagecache. It must be newly ++ * allocated. This function does not add the page to the LRU. The caller must ++ * do that. + * + * Return: %0 on success, negative error code otherwise. + */ +-int add_to_page_cache_locked(struct page *page, struct address_space *mapping, +- pgoff_t offset, gfp_t gfp_mask) ++int add_to_page_cache(struct page *page, struct address_space *mapping, ++ pgoff_t offset, gfp_t gfp_mask) + { +- return __add_to_page_cache_locked(page, mapping, offset, +- gfp_mask, NULL); ++ int ret = add_to_page_cache_vec(&page, 1, mapping, offset, ++ gfp_mask, NULL); ++ if (ret < 0) ++ return ret; ++ return 0; + } +-EXPORT_SYMBOL(add_to_page_cache_locked); ++EXPORT_SYMBOL(add_to_page_cache); ++ALLOW_ERROR_INJECTION(add_to_page_cache, ERRNO); + +-int add_to_page_cache_lru(struct page *page, struct address_space *mapping, +- pgoff_t offset, gfp_t gfp_mask) ++int add_to_page_cache_lru_vec(struct address_space *mapping, ++ struct page **pages, ++ unsigned nr_pages, ++ pgoff_t offset, gfp_t gfp_mask) + { +- void *shadow = NULL; +- int ret; ++ void *shadow_stack[8], **shadow = shadow_stack; ++ int i, ret = 0, err = 0, nr_added; ++ ++ if (nr_pages > ARRAY_SIZE(shadow_stack)) { ++ shadow = kmalloc_array(nr_pages, sizeof(void *), gfp_mask); ++ if (!shadow) ++ goto slowpath; ++ } ++ ++ for (i = 0; i < nr_pages; i++) ++ VM_BUG_ON_PAGE(PageActive(pages[i]), pages[i]); ++ ++ ret = add_to_page_cache_vec(pages, nr_pages, mapping, ++ offset, gfp_mask, shadow); ++ nr_added = ret > 0 ? ret : 0; ++ ++ /* ++ * The page might have been evicted from cache only recently, in which ++ * case it should be activated like any other repeatedly accessed page. ++ * The exception is pages getting rewritten; evicting other data from ++ * the working set, only to cache data that will get overwritten with ++ * something else, is a waste of memory. ++ */ ++ for (i = 0; i < nr_added; i++) { ++ struct page *page = pages[i]; ++ void *s = shadow[i]; + +- __SetPageLocked(page); +- ret = __add_to_page_cache_locked(page, mapping, offset, +- gfp_mask, &shadow); +- if (unlikely(ret)) +- __ClearPageLocked(page); +- else { +- /* +- * The page might have been evicted from cache only +- * recently, in which case it should be activated like +- * any other repeatedly accessed page. +- * The exception is pages getting rewritten; evicting other +- * data from the working set, only to cache data that will +- * get overwritten with something else, is a waste of memory. 
+- */ + WARN_ON_ONCE(PageActive(page)); +- if (!(gfp_mask & __GFP_WRITE) && shadow) +- workingset_refault(page, shadow); ++ if (!(gfp_mask & __GFP_WRITE) && s) ++ workingset_refault(page, s); + lru_cache_add(page); + } ++ ++ if (shadow != shadow_stack) ++ kfree(shadow); ++ + return ret; ++slowpath: ++ for (i = 0; i < nr_pages; i++) { ++ err = add_to_page_cache_lru(pages[i], mapping, ++ offset + i, gfp_mask); ++ if (err) ++ break; ++ } ++ ++ return i ?: err; ++} ++EXPORT_SYMBOL_GPL(add_to_page_cache_lru_vec); ++ ++int add_to_page_cache_lru(struct page *page, struct address_space *mapping, ++ pgoff_t offset, gfp_t gfp_mask) ++{ ++ int ret = add_to_page_cache_lru_vec(mapping, &page, 1, offset, gfp_mask); ++ if (ret < 0) ++ return ret; ++ return 0; + } + EXPORT_SYMBOL_GPL(add_to_page_cache_lru); + +@@ -1824,6 +1920,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + + return ret; + } ++EXPORT_SYMBOL(find_get_pages_range); + + /** + * find_get_pages_contig - gang contiguous pagecache lookup +@@ -1972,6 +2069,244 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) + ra->ra_pages /= 4; + } + ++static struct page * ++generic_file_buffered_read_readpage(struct file *filp, ++ struct address_space *mapping, ++ struct page *page) ++{ ++ struct file_ra_state *ra = &filp->f_ra; ++ int error; ++ ++ /* ++ * A previous I/O error may have been due to temporary ++ * failures, eg. multipath errors. ++ * PG_error will be set again if readpage fails. ++ */ ++ ClearPageError(page); ++ /* Start the actual read. The read will unlock the page. */ ++ error = mapping->a_ops->readpage(filp, page); ++ ++ if (unlikely(error)) { ++ put_page(page); ++ return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL; ++ } ++ ++ if (!PageUptodate(page)) { ++ error = lock_page_killable(page); ++ if (unlikely(error)) { ++ put_page(page); ++ return ERR_PTR(error); ++ } ++ if (!PageUptodate(page)) { ++ if (page->mapping == NULL) { ++ /* ++ * invalidate_mapping_pages got it ++ */ ++ unlock_page(page); ++ put_page(page); ++ return NULL; ++ } ++ unlock_page(page); ++ shrink_readahead_size_eio(ra); ++ put_page(page); ++ return ERR_PTR(-EIO); ++ } ++ unlock_page(page); ++ } ++ ++ return page; ++} ++ ++static struct page * ++generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, ++ struct file *filp, ++ struct iov_iter *iter, ++ struct page *page, ++ loff_t pos, loff_t count) ++{ ++ struct address_space *mapping = filp->f_mapping; ++ struct inode *inode = mapping->host; ++ int error; ++ ++ /* ++ * See comment in do_read_cache_page on why ++ * wait_on_page_locked is used to avoid unnecessarily ++ * serialisations and why it's safe. ++ */ ++ error = wait_on_page_locked_killable(page); ++ if (unlikely(error)) { ++ put_page(page); ++ return ERR_PTR(error); ++ } ++ ++ if (PageUptodate(page)) ++ return page; ++ ++ if (inode->i_blkbits == PAGE_SHIFT || ++ !mapping->a_ops->is_partially_uptodate) ++ goto page_not_up_to_date; ++ /* pipes can't handle partially uptodate pages */ ++ if (unlikely(iov_iter_is_pipe(iter))) ++ goto page_not_up_to_date; ++ if (!trylock_page(page)) ++ goto page_not_up_to_date; ++ /* Did it get truncated before we got the lock? */ ++ if (!page->mapping) ++ goto page_not_up_to_date_locked; ++ ++ if (!mapping->a_ops->is_partially_uptodate(page, ++ pos & ~PAGE_MASK, count)) ++ goto page_not_up_to_date_locked; ++ unlock_page(page); ++ return page; ++ ++page_not_up_to_date: ++ /* Get exclusive access to the page ... 
*/ ++ error = lock_page_killable(page); ++ if (unlikely(error)) { ++ put_page(page); ++ return ERR_PTR(error); ++ } ++ ++page_not_up_to_date_locked: ++ /* Did it get truncated before we got the lock? */ ++ if (!page->mapping) { ++ unlock_page(page); ++ put_page(page); ++ return NULL; ++ } ++ ++ /* Did somebody else fill it already? */ ++ if (PageUptodate(page)) { ++ unlock_page(page); ++ return page; ++ } ++ ++ if (iocb->ki_flags & IOCB_NOIO) { ++ unlock_page(page); ++ put_page(page); ++ return ERR_PTR(-EAGAIN); ++ } ++ ++ return generic_file_buffered_read_readpage(filp, mapping, page); ++} ++ ++static struct page * ++generic_file_buffered_read_no_cached_page(struct kiocb *iocb, ++ struct iov_iter *iter) ++{ ++ struct file *filp = iocb->ki_filp; ++ struct address_space *mapping = filp->f_mapping; ++ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; ++ struct page *page; ++ int error; ++ ++ if (iocb->ki_flags & IOCB_NOIO) ++ return ERR_PTR(-EAGAIN); ++ ++ /* ++ * Ok, it wasn't cached, so we need to create a new ++ * page.. ++ */ ++ page = page_cache_alloc(mapping); ++ if (!page) ++ return ERR_PTR(-ENOMEM); ++ ++ error = add_to_page_cache_lru(page, mapping, index, ++ mapping_gfp_constraint(mapping, GFP_KERNEL)); ++ if (error) { ++ put_page(page); ++ return error != -EEXIST ? ERR_PTR(error) : NULL; ++ } ++ ++ return generic_file_buffered_read_readpage(filp, mapping, page); ++} ++ ++static int generic_file_buffered_read_get_pages(struct kiocb *iocb, ++ struct iov_iter *iter, ++ struct page **pages, ++ unsigned int nr) ++{ ++ struct file *filp = iocb->ki_filp; ++ struct address_space *mapping = filp->f_mapping; ++ struct file_ra_state *ra = &filp->f_ra; ++ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; ++ pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; ++ int i, j, nr_got, err = 0; ++ ++ nr = min_t(unsigned long, last_index - index, nr); ++find_page: ++ if (fatal_signal_pending(current)) ++ return -EINTR; ++ ++ nr_got = find_get_pages_contig(mapping, index, nr, pages); ++ if (nr_got) ++ goto got_pages; ++ ++ if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) ++ return -EAGAIN; ++ ++ page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); ++ ++ nr_got = find_get_pages_contig(mapping, index, nr, pages); ++ if (nr_got) ++ goto got_pages; ++ ++ pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); ++ err = PTR_ERR_OR_ZERO(pages[0]); ++ if (!IS_ERR_OR_NULL(pages[0])) ++ nr_got = 1; ++got_pages: ++ for (i = 0; i < nr_got; i++) { ++ struct page *page = pages[i]; ++ pgoff_t pg_index = index + i; ++ loff_t pg_pos = max(iocb->ki_pos, ++ (loff_t) pg_index << PAGE_SHIFT); ++ loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; ++ ++ if (PageReadahead(page)) { ++ if (iocb->ki_flags & IOCB_NOIO) { ++ for (j = i; j < nr_got; j++) ++ put_page(pages[j]); ++ nr_got = i; ++ err = -EAGAIN; ++ break; ++ } ++ page_cache_async_readahead(mapping, ra, filp, page, ++ pg_index, last_index - pg_index); ++ } ++ ++ if (!PageUptodate(page)) { ++ if (iocb->ki_flags & IOCB_NOWAIT) { ++ for (j = i; j < nr_got; j++) ++ put_page(pages[j]); ++ nr_got = i; ++ err = -EAGAIN; ++ break; ++ } ++ ++ page = generic_file_buffered_read_pagenotuptodate(iocb, ++ filp, iter, page, pg_pos, pg_count); ++ if (IS_ERR_OR_NULL(page)) { ++ for (j = i + 1; j < nr_got; j++) ++ put_page(pages[j]); ++ nr_got = i; ++ err = PTR_ERR_OR_ZERO(page); ++ break; ++ } ++ } ++ } ++ ++ if (likely(nr_got)) ++ return nr_got; ++ if (err) ++ return err; ++ /* ++ * No pages and no error means we raced and should retry: 
++ */ ++ goto find_page; ++} ++ + /** + * generic_file_buffered_read - generic file read routine + * @iocb: the iocb to read +@@ -1992,261 +2327,110 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, + struct iov_iter *iter, ssize_t written) + { + struct file *filp = iocb->ki_filp; ++ struct file_ra_state *ra = &filp->f_ra; + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; +- struct file_ra_state *ra = &filp->f_ra; +- loff_t *ppos = &iocb->ki_pos; +- pgoff_t index; +- pgoff_t last_index; +- pgoff_t prev_index; +- unsigned long offset; /* offset into pagecache page */ +- unsigned int prev_offset; +- int error = 0; +- +- if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) ++ size_t orig_count = iov_iter_count(iter); ++ struct page *pages_onstack[8], **pages = NULL; ++ unsigned int nr_pages = min_t(unsigned int, 512, ++ ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - ++ (iocb->ki_pos >> PAGE_SHIFT)); ++ int i, pg_nr, error = 0; ++ bool writably_mapped; ++ loff_t isize, end_offset; ++ ++ if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) + return 0; + iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + +- index = *ppos >> PAGE_SHIFT; +- prev_index = ra->prev_pos >> PAGE_SHIFT; +- prev_offset = ra->prev_pos & (PAGE_SIZE-1); +- last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; +- offset = *ppos & ~PAGE_MASK; ++ if (nr_pages > ARRAY_SIZE(pages_onstack)) ++ pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); + +- for (;;) { +- struct page *page; +- pgoff_t end_index; +- loff_t isize; +- unsigned long nr, ret; ++ if (!pages) { ++ pages = pages_onstack; ++ nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); ++ } + ++ do { + cond_resched(); +-find_page: +- if (fatal_signal_pending(current)) { +- error = -EINTR; +- goto out; +- } + +- page = find_get_page(mapping, index); +- if (!page) { +- if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO)) +- goto would_block; +- page_cache_sync_readahead(mapping, +- ra, filp, +- index, last_index - index); +- page = find_get_page(mapping, index); +- if (unlikely(page == NULL)) +- goto no_cached_page; +- } +- if (PageReadahead(page)) { +- if (iocb->ki_flags & IOCB_NOIO) { +- put_page(page); +- goto out; +- } +- page_cache_async_readahead(mapping, +- ra, filp, page, +- index, last_index - index); ++ i = 0; ++ pg_nr = generic_file_buffered_read_get_pages(iocb, iter, ++ pages, nr_pages); ++ if (pg_nr < 0) { ++ error = pg_nr; ++ break; + } +- if (!PageUptodate(page)) { +- if (iocb->ki_flags & IOCB_NOWAIT) { +- put_page(page); +- goto would_block; +- } + +- /* +- * See comment in do_read_cache_page on why +- * wait_on_page_locked is used to avoid unnecessarily +- * serialisations and why it's safe. +- */ +- error = wait_on_page_locked_killable(page); +- if (unlikely(error)) +- goto readpage_error; +- if (PageUptodate(page)) +- goto page_ok; +- +- if (inode->i_blkbits == PAGE_SHIFT || +- !mapping->a_ops->is_partially_uptodate) +- goto page_not_up_to_date; +- /* pipes can't handle partially uptodate pages */ +- if (unlikely(iov_iter_is_pipe(iter))) +- goto page_not_up_to_date; +- if (!trylock_page(page)) +- goto page_not_up_to_date; +- /* Did it get truncated before we got the lock? */ +- if (!page->mapping) +- goto page_not_up_to_date_locked; +- if (!mapping->a_ops->is_partially_uptodate(page, +- offset, iter->count)) +- goto page_not_up_to_date_locked; +- unlock_page(page); +- } +-page_ok: + /* +- * i_size must be checked after we know the page is Uptodate. 
++ * i_size must be checked after we know the pages are Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). + */ +- + isize = i_size_read(inode); +- end_index = (isize - 1) >> PAGE_SHIFT; +- if (unlikely(!isize || index > end_index)) { +- put_page(page); +- goto out; +- } ++ if (unlikely(iocb->ki_pos >= isize)) ++ goto put_pages; + +- /* nr is the maximum number of bytes to copy from this page */ +- nr = PAGE_SIZE; +- if (index == end_index) { +- nr = ((isize - 1) & ~PAGE_MASK) + 1; +- if (nr <= offset) { +- put_page(page); +- goto out; +- } +- } +- nr = nr - offset; ++ end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); + +- /* If users can be writing to this page using arbitrary +- * virtual addresses, take care about potential aliasing +- * before reading the page on the kernel side. +- */ +- if (mapping_writably_mapped(mapping)) +- flush_dcache_page(page); ++ while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > ++ (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) ++ put_page(pages[--pg_nr]); + + /* +- * When a sequential read accesses a page several times, +- * only mark it as accessed the first time. ++ * Once we start copying data, we don't want to be touching any ++ * cachelines that might be contended: + */ +- if (prev_index != index || offset != prev_offset) +- mark_page_accessed(page); +- prev_index = index; ++ writably_mapped = mapping_writably_mapped(mapping); + + /* +- * Ok, we have the page, and it's up-to-date, so +- * now we can copy it to user space... ++ * When a sequential read accesses a page several times, only ++ * mark it as accessed the first time. + */ ++ if (iocb->ki_pos >> PAGE_SHIFT != ++ ra->prev_pos >> PAGE_SHIFT) ++ mark_page_accessed(pages[0]); ++ for (i = 1; i < pg_nr; i++) ++ mark_page_accessed(pages[i]); ++ ++ for (i = 0; i < pg_nr; i++) { ++ unsigned int offset = iocb->ki_pos & ~PAGE_MASK; ++ unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, ++ PAGE_SIZE - offset); ++ unsigned int copied; + +- ret = copy_page_to_iter(page, offset, nr, iter); +- offset += ret; +- index += offset >> PAGE_SHIFT; +- offset &= ~PAGE_MASK; +- prev_offset = offset; +- +- put_page(page); +- written += ret; +- if (!iov_iter_count(iter)) +- goto out; +- if (ret < nr) { +- error = -EFAULT; +- goto out; +- } +- continue; +- +-page_not_up_to_date: +- /* Get exclusive access to the page ... */ +- error = lock_page_killable(page); +- if (unlikely(error)) +- goto readpage_error; +- +-page_not_up_to_date_locked: +- /* Did it get truncated before we got the lock? */ +- if (!page->mapping) { +- unlock_page(page); +- put_page(page); +- continue; +- } +- +- /* Did somebody else fill it already? */ +- if (PageUptodate(page)) { +- unlock_page(page); +- goto page_ok; +- } ++ /* ++ * If users can be writing to this page using arbitrary ++ * virtual addresses, take care about potential aliasing ++ * before reading the page on the kernel side. ++ */ ++ if (writably_mapped) ++ flush_dcache_page(pages[i]); + +-readpage: +- if (iocb->ki_flags & IOCB_NOIO) { +- unlock_page(page); +- put_page(page); +- goto would_block; +- } +- /* +- * A previous I/O error may have been due to temporary +- * failures, eg. multipath errors. +- * PG_error will be set again if readpage fails. +- */ +- ClearPageError(page); +- /* Start the actual read. The read will unlock the page. 
*/ +- error = mapping->a_ops->readpage(filp, page); ++ copied = copy_page_to_iter(pages[i], offset, bytes, iter); + +- if (unlikely(error)) { +- if (error == AOP_TRUNCATED_PAGE) { +- put_page(page); +- error = 0; +- goto find_page; +- } +- goto readpage_error; +- } ++ iocb->ki_pos += copied; ++ ra->prev_pos = iocb->ki_pos; + +- if (!PageUptodate(page)) { +- error = lock_page_killable(page); +- if (unlikely(error)) +- goto readpage_error; +- if (!PageUptodate(page)) { +- if (page->mapping == NULL) { +- /* +- * invalidate_mapping_pages got it +- */ +- unlock_page(page); +- put_page(page); +- goto find_page; +- } +- unlock_page(page); +- shrink_readahead_size_eio(ra); +- error = -EIO; +- goto readpage_error; ++ if (copied < bytes) { ++ error = -EFAULT; ++ break; + } +- unlock_page(page); + } ++put_pages: ++ for (i = 0; i < pg_nr; i++) ++ put_page(pages[i]); ++ } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); + +- goto page_ok; +- +-readpage_error: +- /* UHHUH! A synchronous read error occurred. Report it */ +- put_page(page); +- goto out; +- +-no_cached_page: +- /* +- * Ok, it wasn't cached, so we need to create a new +- * page.. +- */ +- page = page_cache_alloc(mapping); +- if (!page) { +- error = -ENOMEM; +- goto out; +- } +- error = add_to_page_cache_lru(page, mapping, index, +- mapping_gfp_constraint(mapping, GFP_KERNEL)); +- if (error) { +- put_page(page); +- if (error == -EEXIST) { +- error = 0; +- goto find_page; +- } +- goto out; +- } +- goto readpage; +- } ++ file_accessed(filp); ++ written += orig_count - iov_iter_count(iter); + +-would_block: +- error = -EAGAIN; +-out: +- ra->prev_pos = prev_index; +- ra->prev_pos <<= PAGE_SHIFT; +- ra->prev_pos |= prev_offset; ++ if (pages != pages_onstack) ++ kfree(pages); + +- *ppos = ((loff_t)index << PAGE_SHIFT) + offset; +- file_accessed(filp); + return written ? written : error; + } + EXPORT_SYMBOL_GPL(generic_file_buffered_read); +diff --git a/mm/gup.c b/mm/gup.c +index 6f47697f8fb0..ccceb6d3e367 100644 +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1108,6 +1108,13 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + } + cond_resched(); + ++ if (current->faults_disabled_mapping && ++ vma->vm_file && ++ vma->vm_file->f_mapping == current->faults_disabled_mapping) { ++ ret = -EFAULT; ++ goto out; ++ } ++ + page = follow_page_mask(vma, start, foll_flags, &ctx); + if (!page) { + ret = faultin_page(tsk, vma, start, &foll_flags, +diff --git a/mm/nommu.c b/mm/nommu.c +index f32a69095d50..f714f339e19b 100644 +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -290,6 +290,24 @@ void *vzalloc_node(unsigned long size, int node) + } + EXPORT_SYMBOL(vzalloc_node); + ++/** ++ * vmalloc_exec - allocate virtually contiguous, executable memory ++ * @size: allocation size ++ * ++ * Kernel-internal function to allocate enough pages to cover @size ++ * the page level allocator and map them into contiguous and ++ * executable kernel virtual space. ++ * ++ * For tight control over page level allocator and protection flags ++ * use __vmalloc() instead. 
++ */ ++ ++void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) ++{ ++ return __vmalloc(size, gfp_mask); ++} ++EXPORT_SYMBOL_GPL(vmalloc_exec); ++ + /** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 28b3e7a67565..2aa1e1e4c20b 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -2477,20 +2477,19 @@ int __set_page_dirty_nobuffers(struct page *page) + lock_page_memcg(page); + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); +- unsigned long flags; + + if (!mapping) { + unlock_page_memcg(page); + return 1; + } + +- xa_lock_irqsave(&mapping->i_pages, flags); ++ xa_lock_irq(&mapping->i_pages); + BUG_ON(page_mapping(page) != mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); + account_page_dirtied(page, mapping); + __xa_set_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); +- xa_unlock_irqrestore(&mapping->i_pages, flags); ++ xa_unlock_irq(&mapping->i_pages); + unlock_page_memcg(page); + + if (mapping->host) { +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index 5a2b55c8dd9a..f296b41e67f0 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -2695,6 +2695,27 @@ void *vzalloc_node(unsigned long size, int node) + } + EXPORT_SYMBOL(vzalloc_node); + ++/** ++ * vmalloc_exec - allocate virtually contiguous, executable memory ++ * @size: allocation size ++ * ++ * Kernel-internal function to allocate enough pages to cover @size ++ * the page level allocator and map them into contiguous and ++ * executable kernel virtual space. ++ * ++ * For tight control over page level allocator and protection flags ++ * use __vmalloc() instead. ++ * ++ * Return: pointer to the allocated memory or %NULL on error ++ */ ++void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) ++{ ++ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, ++ gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, ++ NUMA_NO_NODE, __builtin_return_address(0)); ++} ++EXPORT_SYMBOL_GPL(vmalloc_exec); ++ + #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) + #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) + #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) diff --git a/linux-tkg/linux-tkg-patches/5.8/0009-glitched-bmq.patch b/linux-tkg/linux-tkg-patches/5.8/0009-glitched-bmq.patch new file mode 100644 index 0000000..38666e4 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0009-glitched-bmq.patch @@ -0,0 +1,90 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - BMQ + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + /* + * The total number of pages which are beyond the high watermark within all + * zones. diff --git a/linux-tkg/linux-tkg-patches/5.8/0009-glitched-ondemand-bmq.patch b/linux-tkg/linux-tkg-patches/5.8/0009-glitched-ondemand-bmq.patch new file mode 100644 index 0000000..a926040 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0009-glitched-ondemand-bmq.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux-tkg/linux-tkg-patches/5.8/0009-prjc_v5.8-r3.patch b/linux-tkg/linux-tkg-patches/5.8/0009-prjc_v5.8-r3.patch new file mode 100644 index 0000000..01bffcd --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0009-prjc_v5.8-r3.patch @@ -0,0 +1,8582 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index fb95fad81c79..6e3f8233600e 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4525,6 +4525,12 @@ + + sbni= [NET] Granch SBNI12 leased line adapter + ++ sched_timeslice= ++ [KNL] Time slice in us for BMQ/PDS scheduler. ++ Format: (must be >= 1000) ++ Default: 4000 ++ See Documentation/scheduler/sched-BMQ.txt ++ + sched_debug [KNL] Enables verbose scheduler debug messages. + + schedstats= [KNL,X86] Enable or disable scheduled statistics. 
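As a usage note (not part of the patch itself): booting with sched_timeslice=2000 on the kernel command line selects a 2 ms time slice, while values below 1000 are ignored by the sched_timeslice() early_param handler added in kernel/sched/alt_core.c further down, so the 4 ms default stays in effect.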
+diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index 83acf5025488..313d2124e709 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -1428,3 +1428,13 @@ is 10 seconds. + + The softlockup threshold is (``2 * watchdog_thresh``). Setting this + tunable to zero will disable lockup detection altogether. ++ ++yield_type: ++=========== ++ ++BMQ/PDS CPU scheduler only. This determines what type of yield calls ++to sched_yield will perform. ++ ++ 0 - No yield. ++ 1 - Deboost and requeue task. (default) ++ 2 - Set run queue skip task. +diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt +new file mode 100644 +index 000000000000..05c84eec0f31 +--- /dev/null ++++ b/Documentation/scheduler/sched-BMQ.txt +@@ -0,0 +1,110 @@ ++ BitMap queue CPU Scheduler ++ -------------------------- ++ ++CONTENT ++======== ++ ++ Background ++ Design ++ Overview ++ Task policy ++ Priority management ++ BitMap Queue ++ CPU Assignment and Migration ++ ++ ++Background ++========== ++ ++BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution ++of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), ++and inspired by Zircon scheduler. The goal of it is to keep the scheduler code ++simple, while efficiency and scalable for interactive tasks, such as desktop, ++movie playback and gaming etc. ++ ++Design ++====== ++ ++Overview ++-------- ++ ++BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, ++each CPU is responsible for scheduling the tasks that are putting into it's ++run queue. ++ ++The run queue is a set of priority queues. Note that these queues are fifo ++queue for non-rt tasks or priority queue for rt tasks in data structure. See ++BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact ++that most applications are non-rt tasks. No matter the queue is fifo or ++priority, In each queue is an ordered list of runnable tasks awaiting execution ++and the data structures are the same. When it is time for a new task to run, ++the scheduler simply looks the lowest numbered queueue that contains a task, ++and runs the first task from the head of that queue. And per CPU idle task is ++also in the run queue, so the scheduler can always find a task to run on from ++its run queue. ++ ++Each task will assigned the same timeslice(default 4ms) when it is picked to ++start running. Task will be reinserted at the end of the appropriate priority ++queue when it uses its whole timeslice. When the scheduler selects a new task ++from the priority queue it sets the CPU's preemption timer for the remainder of ++the previous timeslice. When that timer fires the scheduler will stop execution ++on that task, select another task and start over again. ++ ++If a task blocks waiting for a shared resource then it's taken out of its ++priority queue and is placed in a wait queue for the shared resource. When it ++is unblocked it will be reinserted in the appropriate priority queue of an ++eligible CPU. ++ ++Task policy ++----------- ++ ++BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the ++mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's ++NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each ++policy. ++ ++DEADLINE ++ It is squashed as priority 0 FIFO task. 
++ ++FIFO/RR ++ All RT tasks share one single priority queue in BMQ run queue designed. The ++complexity of insert operation is O(n). BMQ is not designed for system runs ++with major rt policy tasks. ++ ++NORMAL/BATCH/IDLE ++ BATCH and IDLE tasks are treated as the same policy. They compete CPU with ++NORMAL policy tasks, but they just don't boost. To control the priority of ++NORMAL/BATCH/IDLE tasks, simply use nice level. ++ ++ISO ++ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy ++task instead. ++ ++Priority management ++------------------- ++ ++RT tasks have priority from 0-99. For non-rt tasks, there are three different ++factors used to determine the effective priority of a task. The effective ++priority being what is used to determine which queue it will be in. ++ ++The first factor is simply the task’s static priority. Which is assigned from ++task's nice level, within [-20, 19] in userland's point of view and [0, 39] ++internally. ++ ++The second factor is the priority boost. This is a value bounded between ++[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is ++modified by the following cases: ++ ++*When a thread has used up its entire timeslice, always deboost its boost by ++increasing by one. ++*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, ++and its switch-in time(time after last switch and run) below the thredhold ++based on its priority boost, will boost its boost by decreasing by one buti is ++capped at 0 (won’t go negative). ++ ++The intent in this system is to ensure that interactive threads are serviced ++quickly. These are usually the threads that interact directly with the user ++and cause user-perceivable latency. These threads usually do little work and ++spend most of their time blocked awaiting another user event. So they get the ++priority boost from unblocking while background threads that do most of the ++processing receive the priority penalty for using their entire timeslice. 
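To make the boost rules above concrete, the following is a minimal sketch of the bookkeeping they describe; it is illustrative only and is not the code added by this patch. The demo_* names are hypothetical, and DEMO_MAX_PRIORITY_ADJ simply mirrors the BMQ value of MAX_PRIORITY_ADJ (7) that the patch defines later in include/linux/sched/prio.h.

	#define DEMO_MAX_PRIORITY_ADJ	7	/* mirrors MAX_PRIORITY_ADJ for SCHED_BMQ */

	/* Apply a +1 (deboost, whole timeslice used) or -1 (boost, quick
	 * voluntary reschedule) step to the per-task boost, bounded to the
	 * documented [-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] range. */
	static int demo_adjust_boost(int boost_prio, int delta)
	{
		int b = boost_prio + delta;

		if (b > DEMO_MAX_PRIORITY_ADJ)
			b = DEMO_MAX_PRIORITY_ADJ;
		if (b < -DEMO_MAX_PRIORITY_ADJ)
			b = -DEMO_MAX_PRIORITY_ADJ;
		return b;
	}

	/* Effective (non-RT) priority: the static priority derived from the
	 * nice level, offset by the current boost. */
	static int demo_effective_prio(int static_prio, int boost_prio)
	{
		return static_prio + boost_prio;
	}

Interactive tasks thus drift toward the top of their nice level's range, while tasks that keep consuming full timeslices drift toward the bottom of it.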
+diff --git a/fs/proc/base.c b/fs/proc/base.c +index d86c0afc8a85..7f394a6fb9b6 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h +index 8874f681b056..59eb72bf7d5f 100644 +--- a/include/asm-generic/resource.h ++++ b/include/asm-generic/resource.h +@@ -23,7 +23,7 @@ + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ + [RLIMIT_SIGPENDING] = { 0, 0 }, \ + [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ +- [RLIMIT_NICE] = { 0, 0 }, \ ++ [RLIMIT_NICE] = { 30, 30 }, \ + [RLIMIT_RTPRIO] = { 0, 0 }, \ + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ + } +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 683372943093..d25f2501daf3 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -650,12 +651,18 @@ struct task_struct { + unsigned int ptrace; + + #ifdef CONFIG_SMP +- int on_cpu; + struct __call_single_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT) ++ int on_cpu; ++#endif ++ ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; + #endif ++#ifndef CONFIG_SCHED_ALT + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; +@@ -669,6 +676,7 @@ struct task_struct { + */ + int recent_used_cpu; + int wake_cpu; ++#endif /* !CONFIG_SCHED_ALT */ + #endif + int on_rq; + +@@ -677,13 +685,33 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_ALT ++ u64 last_ran; ++ s64 time_slice; ++#ifdef CONFIG_SCHED_BMQ ++ int boost_prio; ++ int bmq_idx; ++ struct list_head bmq_node; ++#endif /* CONFIG_SCHED_BMQ */ ++#ifdef CONFIG_SCHED_PDS ++ u64 deadline; ++ u64 priodl; ++ /* skip list level */ ++ int sl_level; ++ /* skip list node */ ++ struct skiplist_node sl_node; ++#endif /* CONFIG_SCHED_PDS */ ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* !CONFIG_SCHED_ALT */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* Clamp values requested for a scheduling entity */ +@@ -1326,6 +1354,15 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_ALT ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++#endif /* !CONFIG_SCHED_ALT */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..179d77c8360e 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,24 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_ALT ++ ++static 
inline int dl_task(struct task_struct *p) ++{ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHED_BMQ ++#define __tsk_deadline(p) (0UL) ++#endif ++ ++#ifdef CONFIG_SCHED_PDS ++#define __tsk_deadline(p) ((p)->priodl) ++#endif ++ ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +38,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_ALT */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..42730d27ceb5 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,11 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + ++/* +/- priority levels from the base priority */ ++#ifdef CONFIG_SCHED_BMQ ++#define MAX_PRIORITY_ADJ 7 ++#endif ++#ifdef CONFIG_SCHED_PDS ++#define MAX_PRIORITY_ADJ 0 ++#endif ++ + /* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..0a7565d0d3cf 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_ALT + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..47ca955a451d +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,177 @@ ++/* ++ * Copyright (C) 2016 Alfred Chen. ++ * ++ * Code based on Con Kolivas's skip list implementation for BFS, and ++ * which is based on example originally by William Pugh. ++ * ++ * Skip Lists are a probabilistic alternative to balanced trees, as ++ * described in the June 1990 issue of CACM and were invented by ++ * William Pugh in 1987. ++ * ++ * A couple of comments about this implementation: ++ * ++ * This file only provides a infrastructure of skip list. ++ * ++ * skiplist_node is embedded into container data structure, to get rid ++ * the dependency of kmalloc/kfree operation in scheduler code. ++ * ++ * A customized search function should be defined using DEFINE_SKIPLIST_INSERT ++ * macro and be used for skip list insert operation. ++ * ++ * Random Level is also not defined in this file, instead, it should be ++ * customized implemented and set to node->level then pass to the customized ++ * skiplist_insert function. ++ * ++ * Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) ++ * ++ * NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, ++ * considering that there will be 256 entries to enable the top level when using ++ * random level p=0.5, and that number is more than enough for a run queue usage ++ * in a scheduler usage. And it also help to reduce the memory usage of the ++ * embedded skip list node in task_struct to about 50%. ++ * ++ * The insertion routine has been implemented so as to use the ++ * dirty hack described in the CACM paper: if a random level is ++ * generated that is more than the current maximum level, the ++ * current maximum level plus one is used instead. 
++ * ++ * BFS Notes: In this implementation of skiplists, there are bidirectional ++ * next/prev pointers and the insert function returns a pointer to the actual ++ * node the value is stored. The key here is chosen by the scheduler so as to ++ * sort tasks according to the priority list requirements and is no longer used ++ * by the scheduler after insertion. The scheduler lookup, however, occurs in ++ * O(1) time because it is always the first item in the level 0 linked list. ++ * Since the task struct stores a copy of the node pointer upon skiplist_insert, ++ * it can also remove it much faster than the original implementation with the ++ * aid of prev<->next pointer manipulation and no searching. ++ */ ++#ifndef _LINUX_SKIP_LIST_H ++#define _LINUX_SKIP_LIST_H ++ ++#include ++ ++#define NUM_SKIPLIST_LEVEL (8) ++ ++struct skiplist_node { ++ int level; /* Levels in this node */ ++ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; ++ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; ++}; ++ ++#define SKIPLIST_NODE_INIT(name) { 0,\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ } ++ ++static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ /* only level 0 ->next matters in skiplist_empty() */ ++ WRITE_ONCE(node->next[0], node); ++} ++ ++/** ++ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header ++ * @node: the skip list node to be inited. ++ */ ++static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ int i; ++ ++ node->level = 0; ++ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { ++ WRITE_ONCE(node->next[i], node); ++ node->prev[i] = node; ++ } ++} ++ ++/** ++ * skiplist_empty - test whether a skip list is empty ++ * @head: the skip list to test. ++ */ ++static inline int skiplist_empty(const struct skiplist_node *head) ++{ ++ return READ_ONCE(head->next[0]) == head; ++} ++ ++/** ++ * skiplist_entry - get the struct for this entry ++ * @ptr: the &struct skiplist_node pointer. ++ * @type: the type of the struct this is embedded in. ++ * @member: the name of the skiplist_node within the struct. ++ */ ++#define skiplist_entry(ptr, type, member) \ ++ container_of(ptr, type, member) ++ ++/** ++ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert ++ * function, which takes two parameters, first one is the header node of the ++ * skip list, second one is the skip list node to be inserted ++ * @func_name: the customized skip list insert function name ++ * @search_func: the search function to be used, which takes two parameters, ++ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list ++ * node to be inserted, the function should return true if search should be ++ * continued, otherwise return false. 
++ * Returns 1 if @node is inserted as the first item of skip list at level zero, ++ * otherwise 0 ++ */ ++#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ ++static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ ++{\ ++ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ ++ struct skiplist_node *p, *q;\ ++ int k = head->level;\ ++\ ++ p = head;\ ++ do {\ ++ while (q = p->next[k], q != head && search_func(q, node))\ ++ p = q;\ ++ update[k] = p;\ ++ } while (--k >= 0);\ ++\ ++ k = node->level;\ ++ if (unlikely(k > head->level)) {\ ++ node->level = k = ++head->level;\ ++ update[k] = head;\ ++ }\ ++\ ++ do {\ ++ p = update[k];\ ++ q = p->next[k];\ ++ node->next[k] = q;\ ++ p->next[k] = node;\ ++ node->prev[k] = p;\ ++ q->prev[k] = node;\ ++ } while (--k >= 0);\ ++\ ++ return (p == head);\ ++} ++ ++/** ++ * skiplist_del_init -- delete skip list node from a skip list and reset it's ++ * init state ++ * @head: the header node of the skip list to be deleted from. ++ * @node: the skip list node to be deleted, the caller need to ensure @node is ++ * in skip list which @head represent. ++ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 ++ */ ++static inline int ++skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) ++{ ++ int l, m = node->level; ++ ++ for (l = 0; l <= m; l++) { ++ node->prev[l]->next[l] = node->next[l]; ++ node->next[l]->prev[l] = node->prev[l]; ++ } ++ if (m == head->level && m > 0) { ++ while (head->next[m] == head && m > 0) ++ m--; ++ head->level = m; ++ } ++ INIT_SKIPLIST_NODE(node); ++ ++ return (node->prev[0] == head); ++} ++#endif /* _LINUX_SKIP_LIST_H */ +diff --git a/init/Kconfig b/init/Kconfig +index 0498af567f70..aaa7c434eedf 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -742,9 +742,39 @@ config GENERIC_SCHED_CLOCK + + menu "Scheduler features" + ++menuconfig SCHED_ALT ++ bool "Alternative CPU Schedulers" ++ default y ++ help ++ This feature enable alternative CPU scheduler" ++ ++if SCHED_ALT ++ ++choice ++ prompt "Alternative CPU Scheduler" ++ default SCHED_BMQ ++ ++config SCHED_BMQ ++ bool "BMQ CPU scheduler" ++ help ++ The BitMap Queue CPU scheduler for excellent interactivity and ++ responsiveness on the desktop and solid scalability on normal ++ hardware and commodity servers. ++ ++config SCHED_PDS ++ bool "PDS CPU scheduler" ++ help ++ The Priority and Deadline based Skip list multiple queue CPU ++ Scheduler. ++ ++endchoice ++ ++endif ++ + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_ALT + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -830,6 +860,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_ALT + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -916,7 +947,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. 
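As a usage illustration of the DEFINE_SKIPLIST_INSERT_FUNC() macro added above in include/linux/skip_list.h: the caller embeds a skiplist_node in its own structure and supplies the ordering predicate. The demo_* names and the u64 key below are assumptions made for this sketch and are not taken from the scheduler code in this patch.

	#include <linux/skip_list.h>

	struct demo_item {
		u64			key;
		struct skiplist_node	sl_node;
	};

	/* Keep walking while the iterated entry's key is <= the key of the
	 * node being inserted, so equal keys queue up behind existing ones. */
	static inline bool demo_search(struct skiplist_node *it,
				       struct skiplist_node *node)
	{
		return skiplist_entry(it, struct demo_item, sl_node)->key <=
		       skiplist_entry(node, struct demo_item, sl_node)->key;
	}

	DEFINE_SKIPLIST_INSERT_FUNC(demo_skiplist_insert, demo_search)

	/* The caller sets item->sl_node.level to a randomly chosen level
	 * before inserting:
	 *	demo_skiplist_insert(&head, &item->sl_node);
	 * which returns 1 when the item became the first entry at level 0. */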
+ +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_ALT + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1172,6 +1203,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_ALT + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index 15089d15010a..6bc94553d79a 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -74,9 +74,15 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_ALT ++ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++ .static_prio = DEFAULT_PRIO, ++ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -86,6 +92,19 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifdef CONFIG_SCHED_ALT ++#ifdef CONFIG_SCHED_BMQ ++ .boost_prio = 0, ++ .bmq_idx = 15, ++ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), ++#endif ++#ifdef CONFIG_SCHED_PDS ++ .deadline = 0, ++ .sl_level = 0, ++ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), ++#endif ++ .time_slice = HZ, ++#else + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -93,6 +112,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 642415b8c3c9..7e0e1fe18035 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? 
+@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index 727150f28103..23ddd91a3d29 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -121,7 +121,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -142,7 +142,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..4176ad070bc9 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_ALT ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index cfdd5b93264d..84c284eb544a 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -227,15 +227,19 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) + { ++#ifdef CONFIG_SCHED_PDS ++ return (left->deadline < right->deadline); ++#else + if (left->prio < right->prio) + return 1; + ++#ifndef CONFIG_SCHED_BMQ + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. +@@ -244,17 +248,23 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, + */ + if (dl_prio(left->prio)) + return dl_time_before(left->deadline, right->deadline); ++#endif + + return 0; ++#endif + } + + static inline int + rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) + { ++#ifdef CONFIG_SCHED_PDS ++ return (left->deadline == right->deadline); ++#else + if (left->prio != right->prio) + return 0; + ++#ifndef CONFIG_SCHED_BMQ + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. 
+@@ -263,8 +273,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + */ + if (dl_prio(left->prio)) + return left->deadline == right->deadline; ++#endif + + return 1; ++#endif + } + + static void +@@ -678,7 +690,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. + */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -951,7 +963,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 5fc9c9b70862..eb6d7d87779f 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -22,14 +22,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_ALT ++obj-y += alt_core.o alt_debug.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c +new file mode 100644 +index 000000000000..b469c9488d18 +--- /dev/null ++++ b/kernel/sched/alt_core.c +@@ -0,0 +1,6184 @@ ++/* ++ * kernel/sched/alt_core.c ++ * ++ * Core alternative kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. 
++ */ ++#include "sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++#include "smp.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#define ALT_SCHED_VERSION "v5.8-r3" ++ ++/* rt_prio(prio) defined in include/linux/sched/rt.h */ ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ ++u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000); ++ ++static int __init sched_timeslice(char *str) ++{ ++ int timeslice_us; ++ ++ get_option(&str, ×lice_us); ++ if (timeslice_us >= 1000) ++ sched_timeslice_ns = timeslice_us * 1000; ++ ++ return 0; ++} ++early_param("sched_timeslice", sched_timeslice); ++ ++/* Reschedule if less than this many μs left */ ++#define RESCHED_NS (100 * 1000) ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Deboost and requeue task. (default) ++ * 2: Set rq skip task. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++#ifdef CONFIG_SMP ++static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++#define IDLE_WM (IDLE_TASK_SCHED_PRIO) ++ ++#ifdef CONFIG_SCHED_SMT ++static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; ++#endif ++static cpumask_t sched_rq_watermark[SCHED_BITS] ____cacheline_aligned_in_smp; ++ ++#ifdef CONFIG_SCHED_BMQ ++#include "bmq_imp.h" ++#endif ++#ifdef CONFIG_SCHED_PDS ++#include "pds_imp.h" ++#endif ++ ++static inline void update_sched_rq_watermark(struct rq *rq) ++{ ++ unsigned long watermark = sched_queue_watermark(rq); ++ unsigned long last_wm = rq->watermark; ++ unsigned long i; ++ int cpu; ++ ++ /*printk(KERN_INFO "sched: watermark(%d) %d, last %d\n", ++ cpu_of(rq), watermark, last_wm);*/ ++ if (watermark == last_wm) ++ return; ++ ++ rq->watermark = watermark; ++ cpu = cpu_of(rq); ++ if (watermark < last_wm) { ++ for (i = watermark + 1; i <= last_wm; i++) ++ cpumask_andnot(&sched_rq_watermark[i], ++ &sched_rq_watermark[i], cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == last_wm) ++ cpumask_andnot(&sched_sg_idle_mask, ++ &sched_sg_idle_mask, cpu_smt_mask(cpu)); ++#endif ++ return; ++ } ++ /* last_wm < watermark */ ++ for (i = last_wm + 1; i <= watermark; i++) ++ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == watermark) { ++ cpumask_t tmp; ++ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_sg_idle_mask); ++ } ++#endif ++} ++ ++static inline struct task_struct *rq_runnable_task(struct rq *rq) ++{ ++ struct task_struct *next = sched_rq_first_task(rq); ++ ++ if (unlikely(next == rq->skip)) ++ next = sched_rq_next_task(next, rq); ++ ++ return next; ++} ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); 
++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. ++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. ++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++static inline void ++rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(&rq->lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. 
++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Add/Remove/Requeue task to/from the runqueue routines ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ ++ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ __SCHED_DEQUEUE_TASK(p, rq, flags, update_sched_rq_watermark(rq)); ++ --rq->nr_running; ++#ifdef CONFIG_SMP ++ if (1 == rq->nr_running) ++ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++} ++ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ ++ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ __SCHED_ENQUEUE_TASK(p, rq, flags); ++ update_sched_rq_watermark(rq); ++ ++rq->nr_running; ++#ifdef CONFIG_SMP ++ if (2 == rq->nr_running) ++ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. 
++ */ ++ if (p->in_iowait) ++ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ lockdep_assert_held(&rq->lock); ++ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ ++ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ __SCHED_REQUEUE_TASK(p, rq, update_sched_rq_watermark(rq)); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. 
++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) {} ++ ++void select_nohz_load_balancer(int stop_tick) {} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(), default_cpu = -1; ++ struct cpumask *mask; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { ++ if (!idle_cpu(cpu)) ++ return cpu; ++ default_cpu = cpu; ++ } ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) ++ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) ++ if (!idle_cpu(i)) ++ return i; ++ ++ if (default_cpu == -1) ++ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ cpu = default_cpu; ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++static inline void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ set_tsk_need_resched(cpu_rq(cpu)->idle); ++ smp_send_reschedule(cpu); ++} ++ ++static inline bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (cpu_online(cpu) && !wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++ ++static void nohz_csd_func(void *info) ++{ ++ struct rq *rq = info; ++ int cpu = cpu_of(rq); ++ unsigned int flags; ++ ++ /* ++ * Release the rq::nohz_csd. ++ */ ++ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); ++ WARN_ON(!(flags & NOHZ_KICK_MASK)); ++ ++ rq->idle_balance = idle_cpu(cpu); ++ if (rq->idle_balance && !need_resched()) { ++ rq->nohz_idle_balance = flags; ++ raise_softirq_irqoff(SCHED_SOFTIRQ); ++ } ++} ++ ++#endif /* CONFIG_NO_HZ_COMMON */ ++#endif /* CONFIG_SMP */ ++ ++static inline void check_preempt_curr(struct rq *rq) ++{ ++ if (sched_rq_first_task(rq) != rq->curr) ++ resched_curr(rq); ++} ++ ++static inline void ++rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) ++{ ++ csd->flags = 0; ++ csd->func = func; ++ csd->info = rq; ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. 
++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * Alt schedule FW doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. ++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) ++ __hrtick_restart(rq); ++ else ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. ++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ ++ return p->static_prio + MAX_PRIORITY_ADJ; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. 
++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ cpufreq_update_util(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_util(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. ++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. 
++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). */ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ /* ++ * We need to explicitly wake pending tasks before running ++ * __migrate_task() such that we will not miss enforcing cpus_ptr ++ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. ++ */ ++ flush_smp_call_function_from_idle(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. ++ */ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. 
++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. 
++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_ptr is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. */ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, p->cpus_ptr) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. */ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ /* Fall-through */ ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. 
++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int select_task_rq(struct task_struct *p, struct rq *rq) ++{ ++ cpumask_t chk_mask, tmp; ++ ++ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) ++ return select_fallback_rq(task_cpu(p), p); ++ ++ if ( ++#ifdef CONFIG_SCHED_SMT ++ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || ++#endif ++ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || ++ cpumask_and(&tmp, &chk_mask, ++ &sched_rq_watermark[task_sched_prio(p, rq) + 1])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ return best_mask_cpu(task_cpu(p), &chk_mask); ++} ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? 
If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else /* CONFIG_SMP */ ++ ++static inline int select_task_rq(struct task_struct *p, struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++ ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** Alt schedule FW ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ check_preempt_curr(rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ /* check_preempt_curr() may use rq clock */ ++ update_rq_clock(rq); ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void *arg) ++{ ++ struct llist_node *llist = arg; ++ struct rq *rq = this_rq(); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ /* ++ * rq::ttwu_pending racy indication of out-standing wakeups. ++ * Races such that false-negatives are possible, since they ++ * are shorter lived that false-positives would be. ++ */ ++ WRITE_ONCE(rq->ttwu_pending, 0); ++ ++ rq_lock_irqsave(rq, &rf); ++ update_rq_clock(rq); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { ++ if (WARN_ON_ONCE(p->on_cpu)) ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) ++ set_task_cpu(p, cpu_of(rq)); ++ ++ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? 
WF_MIGRATED : 0); ++ } ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void send_call_function_single_ipi(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (!set_nr_if_polling(rq->idle)) ++ arch_send_call_function_single_ipi(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if ++ * necessary. The wakee CPU on receipt of the IPI will queue the task ++ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost ++ * of the wakeup instead of the waker. ++ */ ++static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); ++ ++ WRITE_ONCE(rq->ttwu_pending, 1); ++ __smp_call_single_queue(cpu, &p->wake_entry.llist); ++} ++ ++static inline bool ttwu_queue_cond(int cpu, int wake_flags) ++{ ++ /* ++ * If the CPU does not share cache, then queue the task on the ++ * remote rqs wakelist to avoid accessing remote data. ++ */ ++ if (!cpus_share_cache(smp_processor_id(), cpu)) ++ return true; ++ ++ /* ++ * If the task is descheduling and the only running task on the ++ * CPU then use the wakelist to offload the task activation to ++ * the soon-to-be-idle CPU as the current CPU is likely busy. ++ * nr_running is checked to avoid unnecessary task stacking. ++ */ ++ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) ++ return true; ++ ++ return false; ++} ++ ++static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ if (ttwu_queue_cond(cpu, wake_flags)) { ++ if (WARN_ON_ONCE(cpu == smp_processor_id())) ++ return false; ++ ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ __ttwu_queue_wakelist(p, cpu, wake_flags); ++ return true; ++ } ++ ++ return false; ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#endif /* CONFIG_SMP */ ++ ++static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++#if defined(CONFIG_SMP) ++ if (ttwu_queue_wakelist(p, cpu, wake_flags)) ++ return; ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). 
++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) ++ * 2) smp_cond_load_acquire(!X->on_cpu) ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. ++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_remote()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). 
++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ * ++ * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). ++ */ ++ smp_rmb(); ++ if (READ_ONCE(p->on_rq) && ttwu_remote(p, wake_flags)) ++ goto unlock; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ * ++ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure ++ * schedule()'s deactivate_task() has 'happened' and p will no longer ++ * care about it's own p->state. See the comment in __schedule(). ++ */ ++ smp_acquire__after_ctrl_dep(); ++ ++ /* ++ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq ++ * == 0), which means we need to do an enqueue, change p->state to ++ * TASK_WAKING such that we can unlock p->pi_lock before doing the ++ * enqueue, such as ttwu_queue_wakelist(). ++ */ ++ p->state = TASK_WAKING; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, considering queueing p on the remote CPUs wake_list ++ * which potentially sends an IPI instead of spinning on p->on_cpu to ++ * let the waker make forward progress. This is safe because IRQs are ++ * disabled and the IPI will deliver after on_cpu is cleared. ++ * ++ * Ensure we load task_cpu(p) after p->on_cpu: ++ * ++ * set_task_cpu(p, cpu); ++ * STORE p->cpu = @cpu ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock ++ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) ++ * STORE p->on_cpu = 1 LOAD p->cpu ++ * ++ * to ensure we observe the correct CPU on which the task is currently ++ * scheduling. 
++ */ ++ if (smp_load_acquire(&p->on_cpu) && ++ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) ++ goto unlock; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ sched_task_ttwu(p); ++ ++ cpu = select_task_rq(p, this_rq()); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else ++ cpu = task_cpu(p); ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, task_cpu(p), wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state ++ * @p: Process for which the function is to be invoked. ++ * @func: Function to invoke. ++ * @arg: Argument to function. ++ * ++ * If the specified task can be quickly locked into a definite state ++ * (either sleeping or on a given runqueue), arrange to keep it in that ++ * state while invoking @func(@arg). This function can use ->on_rq and ++ * task_curr() to work out what the state is, if required. Given that ++ * @func can be invoked with a runqueue lock held, it had better be quite ++ * lightweight. ++ * ++ * Returns: ++ * @false if the task slipped out from under the locks. ++ * @true if the task was locked onto a runqueue or is sleeping. ++ * However, @func can override this by returning @false. ++ */ ++bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) ++{ ++ bool ret = false; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ lockdep_assert_irqs_enabled(); ++ raw_spin_lock_irq(&p->pi_lock); ++ if (p->on_rq) { ++ rq = __task_rq_lock(p, &rf); ++ if (task_rq(p) == rq) ++ ret = func(p, arg); ++ __task_rq_unlock(rq, &rf); ++ } else { ++ switch (p->state) { ++ case TASK_RUNNING: ++ case TASK_WAKING: ++ break; ++ default: ++ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). ++ if (!p->on_rq) ++ ret = func(p, arg); ++ } ++ } ++ raw_spin_unlock_irq(&p->pi_lock); ++ return ret; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. 
++ * ++ * __sched_fork() is basic setup used by init_idle() too: ++ */ ++static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ p->on_rq = 0; ++ p->on_cpu = 0; ++ p->utime = 0; ++ p->stime = 0; ++ p->sched_time = 0; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++#ifdef CONFIG_SMP ++ p->wake_entry.u_flags = CSD_TYPE_TTWU; ++#endif ++} ++ ++/* ++ * fork()/clone()-time setup: ++ */ ++int sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ __sched_fork(clone_flags, p); ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, rq->curr->time_slice); ++#endif ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = sched_timeslice_ns; ++ resched_curr(rq); ++ } ++ sched_task_fork(p, rq); ++ raw_spin_unlock(&rq->lock); ++ ++ rseq_migrate(p); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. 
Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p, this_rq())); ++#ifdef CONFIG_SMP ++ rseq_migrate(p); ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_ptr can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. 
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. 
++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. 
++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. 
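/*
 * The rq->curr ordering described in the two membarrier comments above is
 * what lets the expedited sys_membarrier() commands avoid extra barriers in
 * the scheduler fast path. A minimal userspace sketch, assuming reasonably
 * recent kernel headers that provide __NR_membarrier and linux/membarrier.h:
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

static long membarrier(int cmd, unsigned int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* Register once; afterwards each PRIVATE_EXPEDITED call only has to
	 * reach CPUs currently running threads of this process. */
	if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0) ||
	    membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
		perror("membarrier");
		return 1;
	}
	puts("expedited membarrier issued");
	return 0;
}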
The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ return raw_rq()->nr_running == 1; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpuidle menu ++ * governor, are using nonsensical data. Preferring shallow idle state selection ++ * for a CPU that has IO-wait which might not even end up running the task when ++ * it does become runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += nr_iowait_cpu(i); ++ ++ return sum; ++} ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * sched_exec - execve() is a valuable balancing opportunity, because at ++ * this point the task has the smallest effective memory and cache ++ * footprint. 
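/*
 * nr_running(), nr_iowait() and nr_context_switches() feed the aggregate
 * numbers userspace already reads from procfs; given the caveats spelled out
 * above, per-CPU iowait should be treated as a rough lower bound at best.
 * A small reader, assuming the usual /proc/stat layout:
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/stat", "r");

	if (!f) {
		perror("/proc/stat");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		unsigned long long v[5];
		int cpu;

		/* per-CPU lines: "cpuN user nice system idle iowait ..." */
		if (sscanf(line, "cpu%d %llu %llu %llu %llu %llu", &cpu,
			   &v[0], &v[1], &v[2], &v[3], &v[4]) == 6)
			printf("cpu%d iowait=%llu jiffies\n", cpu, v[4]);
		else if (!strncmp(line, "ctxt ", 5) ||
			 !strncmp(line, "procs_running ", 14))
			fputs(line, stdout);	/* context switches / runnable */
	}
	fclose(f);
	return 0;
}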
++ */ ++void sched_exec(void) ++{ ++ struct task_struct *p = current; ++ unsigned long flags; ++ int dest_cpu; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = this_rq(); ++ ++ if (rq != task_rq(p) || rq->nr_running < 2) ++ goto unlock; ++ ++ dest_cpu = select_task_rq(p, task_rq(p)); ++ if (dest_cpu == smp_processor_id()) ++ goto unlock; ++ ++ if (likely(cpu_active(dest_cpu))) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); ++ return; ++ } ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#endif ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++static inline void update_curr(struct rq *rq, struct task_struct *p) ++{ ++ s64 ns = rq->clock_task - p->last_ran; ++ ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ p->time_slice -= ns; ++ p->last_ran = rq->clock_task; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimization chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_access_lock_irqsave(p, &lock, &flags); ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ update_curr(rq, p); ++ } ++ ns = tsk_seruntime(p); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ return ns; ++} ++ ++DEFINE_PER_CPU(unsigned long, thermal_pressure); ++ ++void arch_set_thermal_pressure(struct cpumask *cpus, ++ unsigned long th_pressure) ++{ ++ int cpu; ++ ++ for_each_cpu(cpu, cpus) ++ WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static inline void scheduler_task_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ if (is_idle_task(p)) ++ return; ++ ++ update_curr(rq, p); ++ cpufreq_update_util(rq, 0); ++ ++ /* ++ * Tasks have less than RESCHED_NS of time slice left they will be ++ * rescheduled. ++ */ ++ if (p->time_slice >= RESCHED_NS) ++ return; ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. 
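/*
 * task_sched_runtime() above is what ultimately backs the per-thread CPU
 * clock, which is why the comment insists on not projecting unaccounted
 * cycles into clock_gettime(). A minimal userspace sketch using the POSIX
 * thread CPU-time clock:
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;
	volatile unsigned long spin;

	for (spin = 0; spin < 100000000UL; spin++)
		;	/* burn some CPU so the accounted runtime is non-zero */

	if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts)) {
		perror("clock_gettime");
		return 1;
	}
	printf("this thread has run for %ld.%09ld s\n",
	       (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}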
++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ arch_scale_freq_tick(); ++ sched_clock_tick(); ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ scheduler_task_tick(rq); ++ calc_global_load_tick(rq); ++ psi_task_tick(rq); ++ ++ rq->last_tick = rq->clock; ++ raw_spin_unlock(&rq->lock); ++ ++ perf_event_task_tick(); ++} ++ ++#ifdef CONFIG_SCHED_SMT ++static inline int active_load_balance_cpu_stop(void *data) ++{ ++ struct rq *rq = this_rq(); ++ struct task_struct *p = data; ++ cpumask_t tmp; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ rq->active_balance = 0; ++ /* _something_ may have changed the task, double check again */ ++ if (task_on_rq_queued(p) && task_rq(p) == rq && ++ cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) { ++ int cpu = cpu_of(rq); ++ int dcpu = __best_mask_cpu(cpu, &tmp, ++ per_cpu(sched_cpu_llc_mask, cpu)); ++ rq = move_queued_task(rq, p, dcpu); ++ } ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_restore(flags); ++ ++ return 0; ++} ++ ++/* sg_balance_trigger - trigger slibing group balance for @cpu */ ++static inline int sg_balance_trigger(const int cpu) ++{ ++ struct rq *rq= cpu_rq(cpu); ++ unsigned long flags; ++ struct task_struct *curr; ++ int res; ++ ++ if (!raw_spin_trylock_irqsave(&rq->lock, flags)) ++ return 0; ++ curr = rq->curr; ++ res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\ ++ cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\ ++ (!rq->active_balance); ++ ++ if (res) ++ rq->active_balance = 1; ++ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ if (res) ++ stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop, ++ curr, &rq->active_balance_work); ++ return res; ++} ++ ++/* ++ * sg_balance_check - slibing group balance check for run queue @rq ++ */ ++static inline void sg_balance_check(struct rq *rq) ++{ ++ cpumask_t chk; ++ int cpu; ++ ++ /* exit when no sg in idle */ ++ if (cpumask_empty(&sched_sg_idle_mask)) ++ return; ++ ++ cpu = cpu_of(rq); ++ /* ++ * Only cpu in slibing idle group will do the checking and then ++ * find potential cpus which can migrate the current running task ++ */ ++ if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) && ++ cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) && ++ cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) { ++ int i, tried = 0; ++ ++ for_each_cpu_wrap(i, &chk, cpu) { ++ if (cpumask_subset(cpu_smt_mask(i), &chk)) { ++ if (sg_balance_trigger(i)) ++ return; ++ if (tried) ++ return; ++ tried++; ++ } ++ } ++ } ++} ++#endif /* CONFIG_SCHED_SMT */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. 
++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (!tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ if (cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ if (!is_idle_task(curr)) { ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ delta = rq_clock_task(rq) - curr->last_ran; ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ } ++ scheduler_task_tick(rq); ++ ++ calc_load_nohz_remote(rq); ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? 
++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++ ++ if (task_scs_end_corrupted(prev)) ++ panic("corrupted shadow stack detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * Compile time debug macro ++ * #define ALT_SCHED_DEBUG ++ */ ++ ++#ifdef ALT_SCHED_DEBUG ++void alt_sched_debug(void) ++{ ++ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n", ++ sched_rq_pending_mask.bits[0], ++ sched_rq_watermark[IDLE_WM].bits[0], ++ sched_sg_idle_mask.bits[0]); ++} 
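/*
 * The preempt_latency_start()/preempt_latency_stop() hooks above are the
 * probe points the preempt-off latency tracer consumes. A rough userspace
 * sketch for arming it and reading the recorded worst case, assuming
 * CONFIG_PREEMPT_TRACER=y and tracefs mounted under /sys/kernel/debug:
 */
#include <stdio.h>

#define TRACE_DIR "/sys/kernel/debug/tracing/"

int main(void)
{
	char buf[64];
	FILE *f = fopen(TRACE_DIR "current_tracer", "w");

	if (!f) {
		perror("current_tracer");
		return 1;
	}
	fputs("preemptoff\n", f);	/* arm the preempt-off latency tracer */
	fclose(f);

	f = fopen(TRACE_DIR "tracing_max_latency", "r");
	if (f && fgets(buf, sizeof(buf), f))
		printf("tracing_max_latency (us): %s", buf);
	if (f)
		fclose(f);
	return 0;
}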
++#else ++inline void alt_sched_debug(void) {} ++#endif ++ ++#ifdef CONFIG_SMP ++ ++#define SCHED_RQ_NR_MIGRATION (32UL) ++/* ++ * Migrate pending tasks in @rq to @dest_cpu ++ * Will try to migrate mininal of half of @rq nr_running tasks and ++ * SCHED_RQ_NR_MIGRATION to @dest_cpu ++ */ ++static inline int ++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu) ++{ ++ struct task_struct *p, *skip = rq->curr; ++ int nr_migrated = 0; ++ int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION); ++ ++ while (skip != rq->idle && nr_tries && ++ (p = sched_rq_next_task(skip, rq)) != rq->idle) { ++ skip = sched_rq_next_task(p, rq); ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) { ++ __SCHED_DEQUEUE_TASK(p, rq, 0, ); ++ set_task_cpu(p, dest_cpu); ++ __SCHED_ENQUEUE_TASK(p, dest_rq, 0); ++ nr_migrated++; ++ } ++ nr_tries--; ++ } ++ ++ return nr_migrated; ++} ++ ++static inline int take_other_rq_tasks(struct rq *rq, int cpu) ++{ ++ struct cpumask *affinity_mask, *end_mask; ++ ++ if (unlikely(!rq->online)) ++ return 0; ++ ++ if (cpumask_empty(&sched_rq_pending_mask)) ++ return 0; ++ ++ affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu); ++ do { ++ int i; ++ for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) { ++ int nr_migrated; ++ struct rq *src_rq; ++ ++ src_rq = cpu_rq(i); ++ if (!do_raw_spin_trylock(&src_rq->lock)) ++ continue; ++ spin_acquire(&src_rq->lock.dep_map, ++ SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ ++ if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) { ++ src_rq->nr_running -= nr_migrated; ++#ifdef CONFIG_SMP ++ if (src_rq->nr_running < 2) ++ cpumask_clear_cpu(i, &sched_rq_pending_mask); ++#endif ++ rq->nr_running += nr_migrated; ++#ifdef CONFIG_SMP ++ if (rq->nr_running > 1) ++ cpumask_set_cpu(cpu, &sched_rq_pending_mask); ++#endif ++ update_sched_rq_watermark(rq); ++ cpufreq_update_util(rq, 0); ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ ++ return 1; ++ } ++ ++ spin_release(&src_rq->lock.dep_map, _RET_IP_); ++ do_raw_spin_unlock(&src_rq->lock); ++ } ++ } while (++affinity_mask < end_mask); ++ ++ return 0; ++} ++#endif ++ ++/* ++ * Timeslices below RESCHED_NS are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. 
++ */ ++static inline void check_curr(struct task_struct *p, struct rq *rq) ++{ ++ if (unlikely(rq->idle == p)) ++ return; ++ ++ update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_NS) ++ time_slice_expired(p, rq); ++} ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next; ++ ++ if (unlikely(rq->skip)) { ++ next = rq_runnable_task(rq); ++ if (next == rq->idle) { ++#ifdef CONFIG_SMP ++ if (!take_other_rq_tasks(rq, cpu)) { ++#endif ++ rq->skip = NULL; ++ schedstat_inc(rq->sched_goidle); ++ return next; ++#ifdef CONFIG_SMP ++ } ++ next = rq_runnable_task(rq); ++#endif ++ } ++ rq->skip = NULL; ++#ifdef CONFIG_HIGH_RES_TIMERS ++ hrtick_start(rq, next->time_slice); ++#endif ++ return next; ++ } ++ ++ next = sched_rq_first_task(rq); ++ if (next == rq->idle) { ++#ifdef CONFIG_SMP ++ if (!take_other_rq_tasks(rq, cpu)) { ++#endif ++ schedstat_inc(rq->sched_goidle); ++ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/ ++ return next; ++#ifdef CONFIG_SMP ++ } ++ next = sched_rq_first_task(rq); ++#endif ++ } ++#ifdef CONFIG_HIGH_RES_TIMERS ++ hrtick_start(rq, next->time_slice); ++#endif ++ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu, ++ * next);*/ ++ return next; ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! 
++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ unsigned long prev_state; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(): ++ * ++ * __set_current_state(@state) signal_wake_up() ++ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) ++ * wake_up_state(p, state) ++ * LOCK rq->lock LOCK p->pi_state ++ * smp_mb__after_spinlock() smp_mb__after_spinlock() ++ * if (signal_pending_state()) if (p->state & @state) ++ * ++ * Also, the membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ /* ++ * We must load prev->state once (task_struct::state is volatile), such ++ * that: ++ * ++ * - we form a control dependency vs deactivate_task() below. ++ * - ptrace_{,un}freeze_traced() can change ->state underneath us. ++ */ ++ prev_state = prev->state; ++ if (!preempt && prev_state && prev_state == prev->state) { ++ if (signal_pending_state(prev_state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ prev->sched_contributes_to_load = ++ (prev_state & TASK_UNINTERRUPTIBLE) && ++ !(prev_state & TASK_NOLOAD) && ++ !(prev->flags & PF_FROZEN); ++ ++ if (prev->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ /* ++ * __schedule() ttwu() ++ * prev_state = prev->state; if (p->on_rq && ...) ++ * if (prev_state) goto out; ++ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); ++ * p->state = TASK_WAKING ++ * ++ * Where __schedule() and ttwu() have matching control dependencies. ++ * ++ * After this, schedule() must not care about p->state any more. ++ */ ++ sched_task_deactivate(prev, rq); ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ check_curr(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ ++ if (likely(prev != next)) { ++ next->last_ran = rq->clock_task; ++ rq->last_ts_switch = rq->clock; ++ ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++ ++#ifdef CONFIG_SCHED_SMT ++ sg_balance_check(rq); ++#endif ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker and because wq_worker_sleeping() ++ * requires it. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. 
++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. 
++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* Trigger resched if task sched_prio has been modified. */ ++ if (task_on_rq_queued(p) && sched_task_need_requeue(p, rq)) { ++ requeue_task(p, rq); ++ check_preempt_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. 
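/*
 * rt_mutex_setprio() is the kernel half of priority inheritance; from
 * userspace it is exercised through PI futexes, for example a pthread mutex
 * using the PRIO_INHERIT protocol. A minimal sketch (build with -pthread);
 * the lock name is illustrative only:
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pi_lock;

int main(void)
{
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	/* Boost the lock holder to the priority of its highest waiter. */
	if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT)) {
		fprintf(stderr, "PRIO_INHERIT not supported\n");
		return 1;
	}
	pthread_mutex_init(&pi_lock, &attr);

	pthread_mutex_lock(&pi_lock);
	/* ... critical section: a blocked RT waiter would now boost us ... */
	pthread_mutex_unlock(&pi_lock);

	pthread_mutex_destroy(&pi_lock);
	pthread_mutexattr_destroy(&attr);
	return 0;
}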
++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ p->static_prio = NICE_TO_PRIO(nice); ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->prio = effective_prio(p); ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. 
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (rq->curr != rq->idle) ++ return 0; ++ ++ if (rq->nr_running) ++ return 0; ++ ++#ifdef CONFIG_SMP ++ if (rq->ttwu_pending) ++ return 0; ++#endif ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
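/*
 * can_nice() above mirrors what userspace sees: lowering priority is always
 * allowed, raising it needs CAP_SYS_NICE or headroom in RLIMIT_NICE
 * (rlimit 1..40 corresponds to nice 19..-20). A small sketch:
 */
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_NICE, &rl) == 0)
		printf("RLIMIT_NICE=%lu => may raise priority up to nice %ld\n",
		       (unsigned long)rl.rlim_cur, 20 - (long)rl.rlim_cur);

	errno = 0;
	if (nice(5) == -1 && errno)	/* dropping priority: always permitted */
		perror("nice");
	printf("now running at nice %d\n", getpriority(PRIO_PROCESS, 0));
	return 0;
}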
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ update_task_priodl(p); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. ++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. ++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. 
++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. ++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. 
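/*
 * The permission rules enforced by __sched_setscheduler() above (priority
 * 1..99 only for FIFO/RR, CAP_SYS_NICE or RLIMIT_RTPRIO headroom for
 * unprivileged callers) are visible through the classic glibc wrapper. A
 * minimal sketch making the caller SCHED_FIFO:
 */
#include <stdio.h>
#include <sched.h>

int main(void)
{
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp)) {
		perror("sched_setscheduler");	/* EPERM without privileges */
		return 1;
	}
	printf("policy=%d (SCHED_FIFO=%d), priority=%d\n",
	       sched_getscheduler(0), SCHED_FIFO, sp.sched_priority);
	return 0;
}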
++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. 
++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. 
++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (p->sched_reset_on_fork) ++ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ if (task_has_rt_policy(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ schedstat_inc(rq->yld_count); ++ ++ if (1 == sched_yield_type) { ++ if (!rt_task(current)) ++ do_sched_yield_type_1(current, rq); ++ } else if (2 == sched_yield_type) { ++ if (rq->nr_running > 1) ++ rq->skip = current; ++ } ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In Alt schedule FW, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. 
++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ alt_sched_debug(); ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(sched_timeslice_ns); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL, KERN_INFO); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). 
++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* TODO: Alt schedule FW should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: CPU the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ __sched_fork(0, idle); ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ sched_queue_init_idle(rq, idle); ++ ++ scs_task_reset(idle); ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. 
++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(current != this_rq()->idle); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ finish_arch_post_lock_switch(); ++ } ++ ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. ++ */ ++ rq->stop = NULL; ++ ++ p = sched_rq_first_task(rq); ++ while (p != rq->idle) { ++ int dest_cpu; ++ ++ /* skip the running task */ ++ if (task_running(p) || 1 == p->nr_cpus_allowed) { ++ p = sched_rq_next_task(p, rq); ++ continue; ++ } ++ ++ /* ++ * Rules for changing task_struct::cpus_allowed are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. ++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ p = sched_rq_next_task(p, rq); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ p = sched_rq_first_task(rq); ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. 
++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { ++ static_branch_dec_cpuslocked(&sched_smt_present); ++ if (!static_branch_likely(&sched_smt_present)) ++ cpumask_clear(&sched_sg_idle_mask); ++ } ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_tick_stop(cpu); ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); ++ /*per_cpu(sd_llc_id, cpu) = cpu;*/ ++ } ++} ++ ++#define TOPOLOGY_CPUMASK(name, mask, last) \ ++ if (cpumask_and(chk, chk, mask)) \ ++ printk(KERN_INFO "sched: cpu#%02d affinity mask: 0x%08lx - "#name,\ ++ cpu, (chk++)->bits[0]); \ ++ if (!last) \ ++ cpumask_complement(chk, mask) ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ /* take chance to reset time slice for idle tasks */ ++ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; ++ ++ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ ++ cpumask_complement(chk, cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); ++#endif ++ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); ++ per_cpu(sched_cpu_llc_mask, cpu) = chk; ++ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); ++ ++ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); ++ ++ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); ++ ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; ++ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", ++ cpu, per_cpu(sd_llc_id, cpu), ++ (int) (per_cpu(sched_cpu_llc_mask, cpu) - ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]))); ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ 
struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ printk(KERN_INFO ALT_SCHED_VERSION_MSG); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < SCHED_BITS; i++) ++ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ ++ sched_queue_init(rq); ++ rq->watermark = IDLE_WM; ++ rq->skip = NULL; ++ ++ raw_spin_lock_init(&rq->lock); ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++#ifdef CONFIG_SCHED_SMT ++ rq->active_balance = 0; ++#endif ++ ++#ifdef CONFIG_NO_HZ_COMMON ++ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); ++#endif ++#endif /* CONFIG_SMP */ ++ rq->nr_switches = 0; ++ ++ hrtick_rq_init(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. 
++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. 
Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. 
++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c +new file mode 100644 +index 000000000000..1212a031700e +--- /dev/null ++++ b/kernel/sched/alt_debug.c +@@ -0,0 +1,31 @@ ++/* ++ * kernel/sched/alt_debug.c ++ * ++ * Print the alt scheduler debugging details ++ * ++ * Author: Alfred Chen ++ * Date : 2020 ++ */ ++#include "sched.h" ++ ++/* ++ * This allows printing both to /proc/sched_debug and ++ * to the console ++ */ ++#define SEQ_printf(m, x...) \ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} +diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h +new file mode 100644 +index 000000000000..99be2c51c88d +--- /dev/null ++++ b/kernel/sched/alt_sched.h +@@ -0,0 +1,555 @@ ++#ifndef ALT_SCHED_H ++#define ALT_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++#ifdef CONFIG_SCHED_BMQ ++#include "bmq.h" ++#endif ++#ifdef CONFIG_SCHED_PDS ++#include "pds.h" ++#endif ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++#define WF_ON_CPU 0x08 /* Wakee is on_rq */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. 
++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct __rcu *curr; ++ struct task_struct *idle, *stop, *skip; ++ struct mm_struct *prev_mm; ++ ++#ifdef CONFIG_SCHED_BMQ ++ struct bmq queue; ++#endif ++#ifdef CONFIG_SCHED_PDS ++ struct skiplist_node sl_header; ++#endif ++ unsigned long watermark; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ unsigned int ttwu_pending; ++ unsigned char nohz_idle_balance; ++ unsigned char idle_balance; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 last_ts_switch; ++ u64 clock_task; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++ ++#ifdef CONFIG_NO_HZ_COMMON ++#ifdef CONFIG_SMP ++ call_single_data_t nohz_csd; ++#endif ++ atomic_t nohz_flags; ++#endif /* CONFIG_NO_HZ_COMMON */ ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++ ++#ifdef CONFIG_SMP ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern bool sched_smp_initialized; ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++ ++static inline int __best_mask_cpu(int cpu, const 
cpumask_t *cpumask, ++ const cpumask_t *mask) ++{ ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ return cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ return cpumask_test_cpu(cpu, cpumask)? cpu : ++ __best_mask_cpu(cpu, cpumask, &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); ++} ++ ++extern void flush_smp_call_function_from_idle(void); ++ ++#else /* !CONFIG_SMP */ ++static inline void flush_smp_call_function_from_idle(void) { } ++#endif ++ ++#ifndef arch_scale_freq_tick ++static __always_inline ++void arch_scale_freq_tick(void) ++{ ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : BMQ need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline 
int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_NO_HZ_COMMON ++#define NOHZ_BALANCE_KICK_BIT 0 ++#define NOHZ_STATS_KICK_BIT 1 ++ ++#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) ++#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) ++ ++#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) ++ ++#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) ++ ++/* TODO: needed? ++extern void nohz_balance_exit_idle(struct rq *rq); ++#else ++static inline void nohz_balance_exit_idle(struct rq *rq) { } ++*/ ++#endif ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. 
:( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. ++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++void swake_up_all_locked(struct swait_queue_head *q); ++void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++#endif /* ALT_SCHED_H */ +diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h +new file mode 100644 +index 000000000000..aff0bb30a884 +--- /dev/null ++++ b/kernel/sched/bmq.h +@@ -0,0 +1,20 @@ ++#ifndef BMQ_H ++#define BMQ_H ++ ++/* bits: ++ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ ++#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1) ++#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) ++ ++struct bmq { ++ DECLARE_BITMAP(bitmap, SCHED_BITS); ++ struct list_head heads[SCHED_BITS]; ++}; ++ ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); ++} ++ ++#endif +diff --git a/kernel/sched/bmq_imp.h b/kernel/sched/bmq_imp.h +new file mode 100644 +index 000000000000..ad9a7c448da7 +--- /dev/null ++++ b/kernel/sched/bmq_imp.h +@@ -0,0 +1,185 @@ ++#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" ++ ++/* ++ * BMQ only routines ++ */ ++#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) ++#define boost_threshold(p) (sched_timeslice_ns >>\ ++ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio)) ++ ++static inline void boost_task(struct task_struct *p) ++{ ++ int limit; ++ ++ switch (p->policy) { ++ case SCHED_NORMAL: ++ limit = -MAX_PRIORITY_ADJ; ++ break; ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ limit = 0; ++ break; ++ default: ++ return; ++ } ++ ++ if (p->boost_prio > limit) ++ p->boost_prio--; ++} ++ ++static inline void deboost_task(struct task_struct *p) ++{ ++ if (p->boost_prio < MAX_PRIORITY_ADJ) ++ p->boost_prio++; ++} ++ ++/* ++ * Common interfaces ++ */ ++static inline int task_sched_prio(struct task_struct *p, struct rq *rq) ++{ ++ return (p->prio < MAX_RT_PRIO)? 
p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2; ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq); ++ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = sched_timeslice_ns; ++ ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { ++ if (SCHED_RR != p->policy) ++ deboost_task(p); ++ requeue_task(p, rq); ++ } ++} ++ ++static inline void update_task_priodl(struct task_struct *p) {} ++ ++static inline unsigned long sched_queue_watermark(struct rq *rq) ++{ ++ return find_first_bit(rq->queue.bitmap, SCHED_BITS); ++} ++ ++static inline void sched_queue_init(struct rq *rq) ++{ ++ struct bmq *q = &rq->queue; ++ int i; ++ ++ bitmap_zero(q->bitmap, SCHED_BITS); ++ for(i = 0; i < SCHED_BITS; i++) ++ INIT_LIST_HEAD(&q->heads[i]); ++} ++ ++static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) ++{ ++ struct bmq *q = &rq->queue; ++ ++ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; ++ INIT_LIST_HEAD(&q->heads[idle->bmq_idx]); ++ list_add(&idle->bmq_node, &q->heads[idle->bmq_idx]); ++ set_bit(idle->bmq_idx, q->bitmap); ++} ++ ++/* ++ * This routine used in bmq scheduler only which assume the idle task in the bmq ++ */ ++static inline struct task_struct *sched_rq_first_task(struct rq *rq) ++{ ++ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_BITS); ++ const struct list_head *head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++} ++ ++static inline struct task_struct * ++sched_rq_next_task(struct task_struct *p, struct rq *rq) ++{ ++ unsigned long idx = p->bmq_idx; ++ struct list_head *head = &rq->queue.heads[idx]; ++ ++ if (list_is_last(&p->bmq_node, head)) { ++ idx = find_next_bit(rq->queue.bitmap, SCHED_BITS, idx + 1); ++ head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++ } ++ ++ return list_next_entry(p, bmq_node); ++} ++ ++#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ ++ sched_info_dequeued(rq, p); \ ++ \ ++ list_del(&p->bmq_node); \ ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) { \ ++ clear_bit(p->bmq_idx, rq->queue.bitmap);\ ++ func; \ ++ } ++ ++#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ ++ sched_info_queued(rq, p); \ ++ psi_enqueue(p, flags); \ ++ \ ++ p->bmq_idx = task_sched_prio(p, rq); \ ++ list_add_tail(&p->bmq_node, &rq->queue.heads[p->bmq_idx]); \ ++ set_bit(p->bmq_idx, rq->queue.bitmap) ++ ++#define __SCHED_REQUEUE_TASK(p, rq, func) \ ++{ \ ++ int idx = task_sched_prio(p, rq); \ ++\ ++ list_del(&p->bmq_node); \ ++ list_add_tail(&p->bmq_node, &rq->queue.heads[idx]); \ ++ if (idx != p->bmq_idx) { \ ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) \ ++ clear_bit(p->bmq_idx, rq->queue.bitmap); \ ++ p->bmq_idx = idx; \ ++ set_bit(p->bmq_idx, rq->queue.bitmap); \ ++ func; \ ++ } \ ++} ++ ++static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) ++{ ++ return (task_sched_prio(p, rq) != p->bmq_idx); ++} ++ ++static void sched_task_fork(struct task_struct *p, struct rq *rq) ++{ ++ p->boost_prio = (p->boost_prio < 0) ? ++ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; ++} ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
++ */ ++int task_prio(const struct task_struct *p) ++{ ++ if (p->prio < MAX_RT_PRIO) ++ return (p->prio - MAX_RT_PRIO); ++ return (p->prio - MAX_RT_PRIO + p->boost_prio); ++} ++ ++static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) ++{ ++ p->boost_prio = MAX_PRIORITY_ADJ; ++} ++ ++static void sched_task_ttwu(struct task_struct *p) ++{ ++ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) ++ boost_task(p); ++} ++ ++static void sched_task_deactivate(struct task_struct *p, struct rq *rq) ++{ ++ if (rq_switch_time(rq) < boost_threshold(p)) ++ boost_task(p); ++} +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index 7fbaee24c824..0d7ad05b84fe 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_ALT */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. +@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_ALT + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -916,6 +927,7 @@ static int __init sugov_register(void) + core_initcall(sugov_register); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_ALT + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -946,4 +958,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_ALT */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index ff9435dee1df..0ee9967d2d74 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + +- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. 
*/ +- if (task_nice(p) > 0) { ++ if (task_running_nice(p)) { + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -658,7 +658,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index 1ae95b9150d3..f5c3aa20d172 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -372,6 +372,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * idle-task scheduling class. + */ +@@ -492,3 +493,4 @@ const struct sched_class idle_sched_class = { + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h +new file mode 100644 +index 000000000000..7fdeace7e8a5 +--- /dev/null ++++ b/kernel/sched/pds.h +@@ -0,0 +1,14 @@ ++#ifndef PDS_H ++#define PDS_H ++ ++/* bits: ++ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ ++#define SCHED_BITS (MAX_RT_PRIO + 20 + 1) ++#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio > DEFAULT_PRIO); ++} ++ ++#endif +diff --git a/kernel/sched/pds_imp.h b/kernel/sched/pds_imp.h +new file mode 100644 +index 000000000000..6baee5e961b9 +--- /dev/null ++++ b/kernel/sched/pds_imp.h +@@ -0,0 +1,257 @@ ++#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" ++ ++static const u64 user_prio2deadline[NICE_WIDTH] = { ++/* -20 */ 4194304, 4613734, 5075107, 5582617, 6140878, ++/* -15 */ 6754965, 7430461, 8173507, 8990857, 9889942, ++/* -10 */ 10878936, 11966829, 13163511, 14479862, 15927848, ++/* -5 */ 17520632, 19272695, 21199964, 23319960, 25651956, ++/* 0 */ 28217151, 31038866, 34142752, 37557027, 41312729, ++/* 5 */ 45444001, 49988401, 54987241, 60485965, 66534561, ++/* 10 */ 73188017, 80506818, 88557499, 97413248, 107154572, ++/* 15 */ 117870029, 129657031, 142622734, 156885007, 172573507 ++}; ++ ++static const unsigned char dl_level_map[] = { ++/* 0 4 8 12 */ ++ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, ++/* 16 20 24 28 */ ++ 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, ++/* 32 36 40 44 */ ++ 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, ++/* 48 52 56 60 */ ++ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, ++/* 64 68 72 76 */ ++ 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 7, 6, 5, 4, 3, 2, ++/* 80 84 88 92 */ ++ 1, 0 ++}; ++ ++static inline int ++task_sched_prio(const struct task_struct *p, const struct rq *rq) ++{ ++ size_t delta; ++ ++ if (p == rq->idle) ++ return IDLE_TASK_SCHED_PRIO; ++ ++ if (p->prio < MAX_RT_PRIO) ++ return p->prio; ++ 
++ delta = (rq->clock + user_prio2deadline[39] - p->deadline) >> 21; ++ delta = min((size_t)delta, ARRAY_SIZE(dl_level_map) - 1); ++ ++ return MAX_RT_PRIO + dl_level_map[delta]; ++} ++ ++static inline void update_task_priodl(struct task_struct *p) ++{ ++ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq); ++ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ /*printk(KERN_INFO "sched: time_slice_expired(%d) - %px\n", cpu_of(rq), p);*/ ++ p->time_slice = sched_timeslice_ns; ++ ++ if (p->prio >= MAX_RT_PRIO) ++ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; ++ update_task_priodl(p); ++ ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) ++ requeue_task(p, rq); ++} ++ ++/* ++ * pds_skiplist_task_search -- search function used in PDS run queue skip list ++ * node insert operation. ++ * @it: iterator pointer to the node in the skip list ++ * @node: pointer to the skiplist_node to be inserted ++ * ++ * Returns true if key of @it is less or equal to key value of @node, otherwise ++ * false. ++ */ ++static inline bool ++pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) ++{ ++ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= ++ skiplist_entry(node, struct task_struct, sl_node)->priodl); ++} ++ ++/* ++ * Define the skip list insert function for PDS ++ */ ++DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); ++ ++/* ++ * Init the queue structure in rq ++ */ ++static inline void sched_queue_init(struct rq *rq) ++{ ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); ++} ++ ++/* ++ * Init idle task and put into queue structure of rq ++ * IMPORTANT: may be called multiple times for a single cpu ++ */ ++static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) ++{ ++ /*printk(KERN_INFO "sched: init(%d) - %px\n", cpu_of(rq), idle);*/ ++ int default_prio = idle->prio; ++ ++ idle->prio = MAX_PRIO; ++ idle->deadline = 0ULL; ++ update_task_priodl(idle); ++ ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); ++ ++ idle->sl_node.level = idle->sl_level; ++ pds_skiplist_insert(&rq->sl_header, &idle->sl_node); ++ ++ idle->prio = default_prio; ++} ++ ++/* ++ * This routine assume that the idle task always in queue ++ */ ++static inline struct task_struct *sched_rq_first_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ BUG_ON(node == &rq->sl_header); ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline struct task_struct * ++sched_rq_next_task(struct task_struct *p, struct rq *rq) ++{ ++ struct skiplist_node *next = p->sl_node.next[0]; ++ ++ BUG_ON(next == &rq->sl_header); ++ return skiplist_entry(next, struct task_struct, sl_node); ++} ++ ++static inline unsigned long sched_queue_watermark(struct rq *rq) ++{ ++ return task_sched_prio(sched_rq_first_task(rq), rq); ++} ++ ++#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ ++ sched_info_dequeued(rq, p); \ ++ \ ++ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { \ ++ func; \ ++ } ++ ++#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ ++ sched_info_queued(rq, p); \ ++ psi_enqueue(p, flags); \ ++ \ ++ p->sl_node.level = p->sl_level; \ ++ pds_skiplist_insert(&rq->sl_header, &p->sl_node) ++ ++/* ++ * Requeue a task @p to @rq ++ */ ++#define __SCHED_REQUEUE_TASK(p, rq, func) \ ++{\ ++ bool b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); 
\ ++\ ++ p->sl_node.level = p->sl_level; \ ++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { \ ++ func; \ ++ } \ ++} ++ ++static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) ++{ ++ struct skiplist_node *node = p->sl_node.prev[0]; ++ ++ if (node != &rq->sl_header) { ++ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node); ++ ++ if (t->priodl > p->priodl) ++ return true; ++ } ++ ++ node = p->sl_node.next[0]; ++ if (node != &rq->sl_header) { ++ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node); ++ ++ if (t->priodl < p->priodl) ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip ++ * list node which is used in PDS run queue. ++ * ++ * In current implementation, based on testing, the first 8 bits in microseconds ++ * of niffies are suitable for random level population. ++ * find_first_bit() is used to satisfy p = 0.5 between each levels, and there ++ * should be platform hardware supported instruction(known as ctz/clz) to speed ++ * up this function. ++ * The skiplist level for a task is populated when task is created and doesn't ++ * change in task's life time. When task is being inserted into run queue, this ++ * skiplist level is set to task's sl_node->level, the skiplist insert function ++ * may change it based on current level of the skip lsit. ++ */ ++static inline int pds_skiplist_random_level(const struct task_struct *p) ++{ ++ long unsigned int randseed; ++ ++ /* ++ * 1. Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as a factor of the random seed for skiplist ++ * insertion. ++ * 2. Use address of task structure pointer as another factor of the ++ * random seed for task burst forking scenario. ++ */ ++ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10; ++ ++ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1); ++} ++ ++static void sched_task_fork(struct task_struct *p, struct rq *rq) ++{ ++ p->sl_level = pds_skiplist_random_level(p); ++ if (p->prio >= MAX_RT_PRIO) ++ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; ++ update_task_priodl(p); ++} ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int ret; ++ ++ if (p->prio < MAX_RT_PRIO) ++ return (p->prio - MAX_RT_PRIO); ++ ++ preempt_disable(); ++ ret = task_sched_prio(p, this_rq()) - MAX_RT_PRIO; ++ preempt_enable(); ++ ++ return ret; ++} ++ ++static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) ++{ ++ time_slice_expired(p, rq); ++} ++ ++static void sched_task_ttwu(struct task_struct *p) {} ++static void sched_task_deactivate(struct task_struct *p, struct rq *rq) {} +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index b4b1ff96642f..0ead9625081f 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -274,6 +274,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * sched_entity: + * +@@ -391,8 +392,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + +-#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) + /* + * thermal: + * +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index eb034d9f024d..7fefc89b377a 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,13 +1,15 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_ALT + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + +-#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) + int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); + + static inline u64 thermal_load_avg(struct rq *rq) +@@ -37,6 +39,7 @@ update_irq_load_avg(struct rq *rq, u64 running) + } + #endif + ++#ifndef CONFIG_SCHED_ALT + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
+@@ -157,9 +160,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_ALT */ + + #else + ++#ifndef CONFIG_SCHED_ALT + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -177,6 +182,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 877fb08eb1b0..da6a01b591a0 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_ALT ++#include "alt_sched.h" ++#else ++ + #include + + #include +@@ -2542,3 +2546,9 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) + + void swake_up_all_locked(struct swait_queue_head *q); + void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (task_nice(p) > 0); ++} ++#endif /* !CONFIG_SCHED_ALT */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..108422ebc7bf 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_ALT + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_ALT + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index ba81187bb7af..996b5786b058 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -4,6 +4,7 @@ + */ + #include "sched.h" + ++#ifndef CONFIG_SCHED_ALT + DEFINE_MUTEX(sched_domains_mutex); + + /* Protected by sched_domains_mutex: */ +@@ -1180,8 +1181,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + */ + + static int default_relax_domain_level = -1; ++#endif /* CONFIG_SCHED_ALT */ + int sched_domain_level_max; + ++#ifndef CONFIG_SCHED_ALT + static int __init setup_relax_domain_level(char *str) + { + if (kstrtoint(str, 0, &default_relax_domain_level)) +@@ -1413,6 +1416,7 @@ sd_init(struct sched_domain_topology_level *tl, + + return sd; + } ++#endif /* CONFIG_SCHED_ALT */ + + /* + * Topology list, bottom-up. 
+@@ -1442,6 +1446,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) + sched_domain_topology = tl; + } + ++#ifndef CONFIG_SCHED_ALT + #ifdef CONFIG_NUMA + + static const struct cpumask *sd_numa_mask(int cpu) +@@ -2316,3 +2321,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + mutex_unlock(&sched_domains_mutex); + } ++#else /* CONFIG_SCHED_ALT */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{} ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++#endif +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index db1ce7af2563..4437a207d061 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -120,6 +120,10 @@ static unsigned long long_max = LONG_MAX; + static int one_hundred = 100; + static int two_hundred = 200; + static int one_thousand = 1000; ++#ifdef CONFIG_SCHED_ALT ++static int __maybe_unused zero = 0; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -184,7 +188,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; + int sysctl_legacy_va_layout; + #endif + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_ALT) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -1653,6 +1657,7 @@ int proc_do_static_key(struct ctl_table *table, int write, + } + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_ALT + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -1834,6 +1839,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_ALT */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -2410,6 +2416,17 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_ALT ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index d89da1c7e005..a73adff9f309 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1923,8 +1923,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + int ret = 0; + u64 slack; + ++#ifndef CONFIG_SCHED_ALT + slack = current->timer_slack_ns; + if (dl_task(current) || rt_task(current)) ++#endif + slack = 0; + + hrtimer_init_sleeper_on_stack(&t, clockid, mode); +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index 165117996ea0..bd8718a51499 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct 
task_cputime_atomic *at, +@@ -789,6 +789,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_ALT + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -796,6 +797,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -823,8 +825,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_ALT + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -838,7 +842,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +@@ -1074,8 +1078,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_ALT + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index b5e3496cf803..65f60c77bc50 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_ALT ++ /* No deadline on BMQ/PDS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/linux-tkg/linux-tkg-patches/5.8/0011-ZFS-fix.patch b/linux-tkg/linux-tkg-patches/5.8/0011-ZFS-fix.patch new file mode 100644 index 0000000..af71d04 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0011-ZFS-fix.patch @@ -0,0 +1,43 @@ +From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= +Date: Thu, 2 May 2019 05:28:08 +0100 +Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with + EXPORT_SYMBOL_GPL +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +We need these symbols in zfs as the fpu implementation breaks userspace: + +https://github.com/zfsonlinux/zfs/issues/9346 +Signed-off-by: Jörg Thalheim +--- + arch/x86/kernel/fpu/core.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c +index 12c70840980e..352538b3bb5d 100644 +--- a/arch/x86/kernel/fpu/core.c ++++ b/arch/x86/kernel/fpu/core.c +@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) + } + __cpu_invalidate_fpregs_state(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_begin); ++EXPORT_SYMBOL(kernel_fpu_begin); + + void kernel_fpu_end(void) + { +@@ -111,7 +111,7 @@ void kernel_fpu_end(void) + this_cpu_write(in_kernel_fpu, false); + preempt_enable(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_end); ++EXPORT_SYMBOL(kernel_fpu_end); + + /* + * Save the FPU state (mark it for reload if necessary): +-- +2.23.0 + + diff 
--git a/linux-tkg/linux-tkg-patches/5.8/0012-misc-additions.patch b/linux-tkg/linux-tkg-patches/5.8/0012-misc-additions.patch new file mode 100644 index 0000000..ae06419 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.8/0012-misc-additions.patch @@ -0,0 +1,54 @@ +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index 0840d27381ea..73aba9a31064 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP + def_bool y + depends on VT_CONSOLE && PM_SLEEP + ++config NR_TTY_DEVICES ++ int "Maximum tty device number" ++ depends on VT ++ range 12 63 ++ default 63 ++ ---help--- ++ This option is used to change the number of tty devices in /dev. ++ The default value is 63. The lowest number you can set is 12, ++ 63 is also the upper limit so we don't overrun the serial ++ consoles. ++ ++ If unsure, say 63. ++ + config HW_CONSOLE + bool + depends on VT && !UML +diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h +index e9d39c48520a..3bceead8da40 100644 +--- a/include/uapi/linux/vt.h ++++ b/include/uapi/linux/vt.h +@@ -3,12 +3,25 @@ + #define _UAPI_LINUX_VT_H + + ++/* ++ * We will make this definition solely for the purpose of making packages ++ * such as splashutils build, because they can not understand that ++ * NR_TTY_DEVICES is defined in the kernel configuration. ++ */ ++#ifndef CONFIG_NR_TTY_DEVICES ++#define CONFIG_NR_TTY_DEVICES 63 ++#endif ++ + /* + * These constants are also useful for user-level apps (e.g., VC + * resizing). + */ + #define MIN_NR_CONSOLES 1 /* must be at least 1 */ +-#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */ ++/* ++ * NR_TTY_DEVICES: ++ * Value MUST be at least 12 and must never be higher then 63 ++ */ ++#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */ + /* Note: the ioctl VT_GETSTATE does not work for + consoles 16 and higher (since it returns a short) */ \ No newline at end of file diff --git a/linux-tkg/linux-tkg-patches/5.9/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch b/linux-tkg/linux-tkg-patches/5.9/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch new file mode 100644 index 0000000..83240cb --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch @@ -0,0 +1,156 @@ +From 5ec2dd3a095442ec1a21d86042a4994f2ba24e63 Mon Sep 17 00:00:00 2001 +Message-Id: <5ec2dd3a095442ec1a21d86042a4994f2ba24e63.1512651251.git.jan.steffens@gmail.com> +From: Serge Hallyn +Date: Fri, 31 May 2013 19:12:12 +0100 +Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default + +Signed-off-by: Serge Hallyn +[bwh: Remove unneeded binary sysctl bits] +Signed-off-by: Daniel Micay +--- + kernel/fork.c | 15 +++++++++++++++ + kernel/sysctl.c | 12 ++++++++++++ + kernel/user_namespace.c | 3 +++ + 3 files changed, 30 insertions(+) + +diff --git a/kernel/fork.c b/kernel/fork.c +index 07cc743698d3668e..4011d68a8ff9305c 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -102,6 +102,11 @@ + + #define CREATE_TRACE_POINTS + #include ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ 
return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. +@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index b86520ed3fb60fbf..f7dab3760839f1a1 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -105,6 +105,9 @@ extern int core_uses_pid; + + #if defined(CONFIG_SYSCTL) + ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + /* Constants used for minimum and maximum */ + #ifdef CONFIG_LOCKUP_DETECTOR + static int sixty = 60; +@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index c490f1e4313b998a..dd03bd39d7bf194d 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -24,6 +24,9 @@ + #include + #include + ++/* sysctl */ ++int unprivileged_userns_clone; ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +-- +2.15.1 + +From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Thu, 7 Dec 2017 13:50:48 +0100 +Subject: ZEN: Add CONFIG for unprivileged_userns_clone + +This way our default behavior continues to match the vanilla kernel. +--- + init/Kconfig | 16 ++++++++++++++++ + kernel/user_namespace.c | 4 ++++ + 2 files changed, 20 insertions(+) + +diff --git a/init/Kconfig b/init/Kconfig +index 4592bf7997c0..f3df02990aff 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1004,6 +1004,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. 
++ + config PID_NS + bool "PID Namespaces" + default y +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 6b9dbc257e34..107b17f0d528 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -27,7 +27,11 @@ + #include + + /* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else + int unprivileged_userns_clone; ++#endif + + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); diff --git a/linux-tkg/linux-tkg-patches/5.9/0002-clear-patches.patch b/linux-tkg/linux-tkg-patches/5.9/0002-clear-patches.patch new file mode 100644 index 0000000..22a32f5 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0002-clear-patches.patch @@ -0,0 +1,360 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Mon, 14 Mar 2016 11:10:58 -0600 +Subject: [PATCH] pci pme wakeups + +Reduce wakeups for PME checks, which are a workaround for miswired +boards (sadly, too many of them) in laptops. +--- + drivers/pci/pci.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c +index c9338f9..6974fbf 100644 +--- a/drivers/pci/pci.c ++++ b/drivers/pci/pci.c +@@ -62,7 +62,7 @@ struct pci_pme_device { + struct pci_dev *dev; + }; + +-#define PME_TIMEOUT 1000 /* How long between PME checks */ ++#define PME_TIMEOUT 4000 /* How long between PME checks */ + + static void pci_dev_d3_sleep(struct pci_dev *dev) + { +-- +https://clearlinux.org + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sat, 19 Mar 2016 21:32:19 -0400 +Subject: [PATCH] intel_idle: tweak cpuidle cstates + +Increase target_residency in cpuidle cstate + +Tune intel_idle to be a bit less agressive; +Clear linux is cleaner in hygiene (wakupes) than the average linux, +so we can afford changing these in a way that increases +performance while keeping power efficiency +--- + drivers/idle/intel_idle.c | 44 +++++++++++++++++++-------------------- + 1 file changed, 22 insertions(+), 22 deletions(-) + +diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c +index f449584..c994d24 100644 +--- a/drivers/idle/intel_idle.c ++++ b/drivers/idle/intel_idle.c +@@ -531,7 +531,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -539,7 +539,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 33, +- .target_residency = 100, ++ .target_residency = 900, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -547,7 +547,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -555,7 +555,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 1500, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -563,7 +563,7 @@ static struct 
cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -571,7 +571,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 5000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -579,7 +579,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -599,7 +599,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -607,7 +607,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 40, +- .target_residency = 100, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -615,7 +615,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 133, +- .target_residency = 400, ++ .target_residency = 1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -623,7 +623,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x32", + .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 166, +- .target_residency = 500, ++ .target_residency = 2000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -631,7 +631,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 300, +- .target_residency = 900, ++ .target_residency = 4000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -639,7 +639,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 600, +- .target_residency = 1800, ++ .target_residency = 7000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -647,7 +647,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 2600, +- .target_residency = 7700, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -668,7 +668,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 120, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -676,7 +676,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 70, +- .target_residency = 100, ++ .target_residency = 
1000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -684,7 +684,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x20", + .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 85, +- .target_residency = 200, ++ .target_residency = 600, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -692,7 +692,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x33", + .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 124, +- .target_residency = 800, ++ .target_residency = 3000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -700,7 +700,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x40", + .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 200, +- .target_residency = 800, ++ .target_residency = 3200, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -708,7 +708,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x50", + .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 480, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -716,7 +716,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { + .desc = "MWAIT 0x60", + .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 890, +- .target_residency = 5000, ++ .target_residency = 9000, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +@@ -737,7 +737,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { + .desc = "MWAIT 0x01", + .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, + .exit_latency = 10, +- .target_residency = 20, ++ .target_residency = 300, + .enter = &intel_idle, + .enter_s2idle = intel_idle_s2idle, }, + { +-- +https://clearlinux.org + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Fri, 6 Jan 2017 15:34:09 +0000 +Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little + bigger than default + +--- + net/ipv4/tcp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 30c1142..4345075 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -4201,8 +4201,8 @@ void __init tcp_init(void) + tcp_init_mem(); + /* Set per-socket limits to no more than 1/128 the pressure threshold */ + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7); +- max_wshare = min(4UL*1024*1024, limit); +- max_rshare = min(6UL*1024*1024, limit); ++ max_wshare = min(16UL*1024*1024, limit); ++ max_rshare = min(16UL*1024*1024, limit); + + init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; + init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +-- +https://clearlinux.org + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Sun, 18 Feb 2018 23:35:41 +0000 +Subject: [PATCH] locking: rwsem: spin faster + +tweak rwsem owner spinning a bit +--- + kernel/locking/rwsem.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index f11b9bd..1bbfcc1 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -717,6 +717,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + owner = 
rwsem_owner_flags(sem, &flags); + state = rwsem_owner_state(owner, flags, nonspinnable); +@@ -750,7 +751,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + rcu_read_unlock(); + +-- +https://clearlinux.org + +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: Arjan van de Ven +Date: Thu, 2 Jun 2016 23:36:32 -0500 +Subject: [PATCH] initialize ata before graphics + +ATA init is the long pole in the boot process, and its asynchronous. +move the graphics init after it so that ata and graphics initialize +in parallel +--- + drivers/Makefile | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/drivers/Makefile b/drivers/Makefile +index c0cd1b9..af1e2fb 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -59,15 +59,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb and intelfb depend on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-$(CONFIG_NVM) += lightnvm/ + obj-y += base/ block/ misc/ mfd/ nfc/ +@@ -80,6 +73,14 @@ obj-$(CONFIG_IDE) += ide/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb and intelfb depend on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ +-- +https://clearlinux.org + diff --git a/linux-tkg/linux-tkg-patches/5.9/0003-glitched-base.patch b/linux-tkg/linux-tkg-patches/5.9/0003-glitched-base.patch new file mode 100644 index 0000000..fb09b35 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0003-glitched-base.patch @@ -0,0 +1,708 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: [PATCH 01/17] glitched + +--- + scripts/mkcompile_h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h +index baf3ab8d9d49..854e32e6aec7 100755 +--- a/scripts/mkcompile_h ++++ b/scripts/mkcompile_h +@@ -41,8 +41,8 @@ else + fi + + UTS_VERSION="#$VERSION" +-CONFIG_FLAGS="" +-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi ++CONFIG_FLAGS="TKG" ++if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi + if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi + if [ -n "$PREEMPT_RT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT_RT"; fi + +-- +2.28.0 + + +From c304f43d14e98d4bf1215fc10bc5012f554bdd8a Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 29 Jan 2018 16:59:22 +0000 +Subject: [PATCH 02/17] dcache: cache_pressure = 50 decreases the rate at which + VFS caches are reclaimed + +Signed-off-by: Alexandre Frade +--- + fs/dcache.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/dcache.c b/fs/dcache.c +index 361ea7ab30ea..0c5cf69b241a 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -71,7 +71,7 @@ + * If no ancestor relationship: + * arbitrary, since it's serialized on rename_lock + */ +-int sysctl_vfs_cache_pressure __read_mostly = 100; ++int sysctl_vfs_cache_pressure __read_mostly = 50; + 
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); + + __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); +-- +2.28.0 + + +From 28f32f59d9d55ac7ec3a20b79bdd02d2a0a5f7e1 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 29 Jan 2018 18:29:13 +0000 +Subject: [PATCH 03/17] sched/core: nr_migrate = 128 increases number of tasks + to iterate in a single balance run. + +Signed-off-by: Alexandre Frade +--- + kernel/sched/core.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index f788cd61df21..2bfbb4213707 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -59,7 +59,7 @@ const_debug unsigned int sysctl_sched_features = + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +-const_debug unsigned int sysctl_sched_nr_migrate = 32; ++const_debug unsigned int sysctl_sched_nr_migrate = 128; + + /* + * period over which we measure -rt task CPU usage in us. +@@ -71,9 +71,9 @@ __read_mostly int scheduler_running; + + /* + * part of the period that we allow rt tasks to run in us. +- * default: 0.95s ++ * XanMod default: 0.98s + */ +-int sysctl_sched_rt_runtime = 950000; ++int sysctl_sched_rt_runtime = 980000; + + /* + * __task_rq_lock - lock the rq @p resides on. +-- +2.28.0 + + +From acc49f33a10f61dc66c423888cbb883ba46710e4 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 29 Jan 2018 17:41:29 +0000 +Subject: [PATCH 04/17] scripts: disable the localversion "+" tag of a git repo + +Signed-off-by: Alexandre Frade +--- + scripts/setlocalversion | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/scripts/setlocalversion b/scripts/setlocalversion +index 20f2efd57b11..0552d8b9f582 100755 +--- a/scripts/setlocalversion ++++ b/scripts/setlocalversion +@@ -54,7 +54,7 @@ scm_version() + # If only the short version is requested, don't bother + # running further git commands + if $short; then +- echo "+" ++ # echo "+" + return + fi + # If we are past a tagged commit (like +-- +2.28.0 + + +From 61fcb33fb0de8bc0f060e0a1ada38ed149217f4d Mon Sep 17 00:00:00 2001 +From: Oleksandr Natalenko +Date: Wed, 11 Dec 2019 11:46:19 +0100 +Subject: [PATCH 05/17] init/Kconfig: enable -O3 for all arches + +Building a kernel with -O3 may help in hunting bugs like [1] and thus +using this switch should not be restricted to one specific arch only. + +With that, lets expose it for everyone. + +[1] https://lore.kernel.org/lkml/673b885183fb64f1cbb3ed2387524077@natalenko.name/ + +Signed-off-by: Oleksandr Natalenko +--- + init/Kconfig | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/init/Kconfig b/init/Kconfig +index 0498af567f70..3ae8678e1145 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1278,7 +1278,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + + config CC_OPTIMIZE_FOR_PERFORMANCE_O3 + bool "Optimize more for performance (-O3)" +- depends on ARC + help + Choosing this option will pass "-O3" to your compiler to optimize + the kernel yet more for performance. 
+-- +2.28.0 + + +From 360c6833e07cc9fdef5746f6bc45bdbc7212288d Mon Sep 17 00:00:00 2001 +From: "Jan Alexander Steffens (heftig)" +Date: Fri, 26 Oct 2018 11:22:33 +0100 +Subject: [PATCH 06/17] infiniband: Fix __read_overflow2 error with -O3 + inlining + +--- + drivers/infiniband/core/addr.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c +index 3a98439bba83..6efc4f907f58 100644 +--- a/drivers/infiniband/core/addr.c ++++ b/drivers/infiniband/core/addr.c +@@ -820,6 +820,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + union { + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; ++ struct sockaddr_ib _sockaddr_ib; + } sgid_addr, dgid_addr; + int ret; + +-- +2.28.0 + + +From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001 +From: Etienne Juvigny +Date: Mon, 3 Sep 2018 17:36:25 +0200 +Subject: [PATCH 07/17] Zenify & stuff + +--- + init/Kconfig | 32 ++++++++++++++++++++++++++++++++ + kernel/sched/fair.c | 25 +++++++++++++++++++++++++ + mm/page-writeback.c | 8 ++++++++ + 3 files changed, 65 insertions(+) + +diff --git a/init/Kconfig b/init/Kconfig +index 3ae8678e1145..da708eed0f1e 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -92,6 +92,38 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config ZENIFY ++ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience" ++ default y ++ help ++ Tunes the kernel for responsiveness at the cost of throughput and power usage. ++ ++ --- Virtual Memory Subsystem --------------------------- ++ ++ Mem dirty before bg writeback..: 10 % -> 20 % ++ Mem dirty before sync writeback: 20 % -> 50 % ++ ++ --- Block Layer ---------------------------------------- ++ ++ Queue depth...............: 128 -> 512 ++ Default MQ scheduler......: mq-deadline -> bfq ++ ++ --- CFS CPU Scheduler ---------------------------------- ++ ++ Scheduling latency.............: 6 -> 3 ms ++ Minimal granularity............: 0.75 -> 0.3 ms ++ Wakeup granularity.............: 1 -> 0.5 ms ++ CPU migration cost.............: 0.5 -> 0.25 ms ++ Bandwidth slice size...........: 5 -> 3 ms ++ Ondemand fine upscaling limit..: 95 % -> 85 % ++ ++ --- MuQSS CPU Scheduler -------------------------------- ++ ++ Scheduling interval............: 6 -> 3 ms ++ ISO task max realtime use......: 70 % -> 25 % ++ Ondemand coarse upscaling limit: 80 % -> 45 % ++ Ondemand fine upscaling limit..: 95 % -> 45 % ++ + config BROKEN + bool + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6b3b59cc51d6..2a0072192c3d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -37,8 +37,13 @@ + * + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_latency = 3000000ULL; ++static unsigned int normalized_sysctl_sched_latency = 3000000ULL; ++#else + unsigned int sysctl_sched_latency = 6000000ULL; + static unsigned int normalized_sysctl_sched_latency = 6000000ULL; ++#endif + + /* + * The initial- and re-scaling of tunables is configurable +@@ -58,13 +63,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L + * + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_min_granularity = 300000ULL; ++static unsigned int normalized_sysctl_sched_min_granularity = 300000ULL; ++#else + unsigned int sysctl_sched_min_granularity = 750000ULL; + static unsigned int 
normalized_sysctl_sched_min_granularity = 750000ULL; ++#endif + + /* + * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity + */ ++#ifdef CONFIG_ZENIFY ++static unsigned int sched_nr_latency = 10; ++#else + static unsigned int sched_nr_latency = 8; ++#endif + + /* + * After fork, child runs first. If set to 0 (default) then +@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + * + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_wakeup_granularity = 500000UL; ++static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL; ++ ++const_debug unsigned int sysctl_sched_migration_cost = 50000UL; ++#else + unsigned int sysctl_sched_wakeup_granularity = 1000000UL; + static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + int sched_thermal_decay_shift; + static int __init setup_sched_thermal_decay_shift(char *str) +@@ -128,8 +149,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_ZENIFY ++unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + static inline void update_load_add(struct load_weight *lw, unsigned long inc) + { +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 28b3e7a67565..01a1aef2b9b1 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -71,7 +71,11 @@ static long ratelimit_pages = 32; + /* + * Start background writeback (via writeback threads) at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int dirty_background_ratio = 20; ++#else + int dirty_background_ratio = 10; ++#endif + + /* + * dirty_background_bytes starts at 0 (disabled) so that it is a function of +@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable; + /* + * The generator of dirty data starts writeback at this percentage + */ ++#ifdef CONFIG_ZENIFY ++int vm_dirty_ratio = 50; ++#else + int vm_dirty_ratio = 20; ++#endif + + /* + * vm_dirty_bytes starts at 0 (disabled) so that it is a function of +-- +2.28.0 + + +From e92e67143385cf285851e12aa8b7f083dd38dd24 Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Sun, 16 Jan 2011 18:57:32 -0600 +Subject: [PATCH 08/17] ZEN: Allow TCP YeAH as default congestion control + +4.4: In my tests YeAH dramatically slowed down transfers over a WLAN, + reducing throughput from ~65Mbps (CUBIC) to ~7MBps (YeAH) over 10 + seconds (netperf TCP_STREAM) including long stalls. + + Be careful when choosing this. 
~heftig +--- + net/ipv4/Kconfig | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index e64e59b536d3..bfb55ef7ebbe 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -691,6 +691,9 @@ choice + config DEFAULT_VEGAS + bool "Vegas" if TCP_CONG_VEGAS=y + ++ config DEFAULT_YEAH ++ bool "YeAH" if TCP_CONG_YEAH=y ++ + config DEFAULT_VENO + bool "Veno" if TCP_CONG_VENO=y + +@@ -724,6 +727,7 @@ config DEFAULT_TCP_CONG + default "htcp" if DEFAULT_HTCP + default "hybla" if DEFAULT_HYBLA + default "vegas" if DEFAULT_VEGAS ++ default "yeah" if DEFAULT_YEAH + default "westwood" if DEFAULT_WESTWOOD + default "veno" if DEFAULT_VENO + default "reno" if DEFAULT_RENO +-- +2.28.0 + + +From 76dbe7477bfde1b5e8bf29a71b5af7ab2be9b98e Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Wed, 28 Nov 2018 19:01:27 -0600 +Subject: [PATCH 09/17] zen: Use [defer+madvise] as default khugepaged defrag + strategy + +For some reason, the default strategy to respond to THP fault fallbacks +is still just madvise, meaning stall if the program wants transparent +hugepages, but don't trigger a background reclaim / compaction if THP +begins to fail allocations. This creates a snowball affect where we +still use the THP code paths, but we almost always fail once a system +has been active and busy for a while. + +The option "defer" was created for interactive systems where THP can +still improve performance. If we have to fallback to a regular page due +to an allocation failure or anything else, we will trigger a background +reclaim and compaction so future THP attempts succeed and previous +attempts eventually have their smaller pages combined without stalling +running applications. + +We still want madvise to stall applications that explicitely want THP, +so defer+madvise _does_ make a ton of sense. Make it the default for +interactive systems, especially if the kernel maintainer left +transparent hugepages on "always". + +Reasoning and details in the original patch: https://lwn.net/Articles/711248/ +--- + mm/huge_memory.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 74300e337c3c..9277f22c10a7 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly = + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1< +Date: Wed, 24 Oct 2018 16:58:52 -0300 +Subject: [PATCH 10/17] net/sched: allow configuring cake qdisc as default + +Signed-off-by: Alexandre Frade +--- + net/sched/Kconfig | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/net/sched/Kconfig b/net/sched/Kconfig +index 84badf00647e..6a922bca9f39 100644 +--- a/net/sched/Kconfig ++++ b/net/sched/Kconfig +@@ -471,6 +471,9 @@ choice + config DEFAULT_SFQ + bool "Stochastic Fair Queue" if NET_SCH_SFQ + ++ config DEFAULT_CAKE ++ bool "Common Applications Kept Enhanced" if NET_SCH_CAKE ++ + config DEFAULT_PFIFO_FAST + bool "Priority FIFO Fast" + endchoice +@@ -481,6 +484,7 @@ config DEFAULT_NET_SCH + default "fq" if DEFAULT_FQ + default "fq_codel" if DEFAULT_FQ_CODEL + default "sfq" if DEFAULT_SFQ ++ default "cake" if DEFAULT_CAKE + default "pfifo_fast" + endif + +-- +2.28.0 + + +From 816ee502759e954304693813bd03d94986b28dba Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 18 Feb 2019 17:40:57 +0100 +Subject: [PATCH 11/17] mm: Set watermark_scale_factor to 200 (from 10) + +Multiple users have reported it's helping reducing/eliminating stuttering +with DXVK. 
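As an illustration, separate from the patches themselves, the sketch below reads the sysctls that the Zenify writeback change earlier in this series and the watermark_scale_factor change above actually tune. The procfs paths are standard Linux sysctl files; the values in the comments are simply the defaults each patch installs.

#include <stdio.h>

/* print one sysctl value read through procfs */
static void show(const char *path)
{
    char buf[64];
    FILE *f = fopen(path, "r");

    if (!f) {
        perror(path);
        return;
    }
    if (fgets(buf, sizeof(buf), f))
        printf("%-40s %s", path, buf);
    fclose(f);
}

int main(void)
{
    show("/proc/sys/vm/dirty_background_ratio"); /* 20 with CONFIG_ZENIFY, 10 in mainline */
    show("/proc/sys/vm/dirty_ratio");            /* 50 with CONFIG_ZENIFY, 20 in mainline */
    show("/proc/sys/vm/watermark_scale_factor"); /* 200 with this patch, 10 in mainline */
    return 0;
}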
+--- + mm/page_alloc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 898ff44f2c7b..e72074034793 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -330,7 +330,7 @@ int watermark_boost_factor __read_mostly; + #else + int watermark_boost_factor __read_mostly = 15000; + #endif +-int watermark_scale_factor = 10; ++int watermark_scale_factor = 200; + + static unsigned long nr_kernel_pages __initdata; + static unsigned long nr_all_pages __initdata; +-- +2.28.0 + + +From 90240bcd90a568878738e66c0d45bed3e38e347b Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Fri, 19 Apr 2019 12:33:38 +0200 +Subject: [PATCH 12/17] Set vm.max_map_count to 262144 by default + +The value is still pretty low, and AMD64-ABI and ELF extended numbering +supports that, so we should be fine on modern x86 systems. + +This fixes crashes in some applications using more than 65535 vmas (also +affects some windows games running in wine, such as Star Citizen). +--- + include/linux/mm.h | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index bc05c3588aa3..b0cefe94920d 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -190,8 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) + * not a hard limit any more. Although some userspace tools can be surprised by + * that. + */ +-#define MAPCOUNT_ELF_CORE_MARGIN (5) +-#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) ++#define DEFAULT_MAX_MAP_COUNT (262144) + + extern int sysctl_max_map_count; + +-- +2.28.0 + + +From 3a34034dba5efe91bcec491efe8c66e8087f509b Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Mon, 27 Jul 2020 00:19:18 +0200 +Subject: [PATCH 13/17] mm: bump DEFAULT_MAX_MAP_COUNT + +Some games such as Detroit: Become Human tend to be very crash prone with +lower values. +--- + include/linux/mm.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/mm.h b/include/linux/mm.h +index b0cefe94920d..890165099b07 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -190,7 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page) + * not a hard limit any more. Although some userspace tools can be surprised by + * that. + */ +-#define DEFAULT_MAX_MAP_COUNT (262144) ++#define DEFAULT_MAX_MAP_COUNT (524288) + + extern int sysctl_max_map_count; + +-- +2.28.0 + + +From 977812938da7c7226415778c340832141d9278b7 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 25 Nov 2019 15:13:06 -0300 +Subject: [PATCH 14/17] elevator: set default scheduler to bfq for blk-mq + +Signed-off-by: Alexandre Frade +--- + block/elevator.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/block/elevator.c b/block/elevator.c +index 4eab3d70e880..79669aa39d79 100644 +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -623,15 +623,15 @@ static inline bool elv_support_iosched(struct request_queue *q) + } + + /* +- * For single queue devices, default to using mq-deadline. If we have multiple +- * queues or mq-deadline is not available, default to "none". ++ * For single queue devices, default to using bfq. If we have multiple ++ * queues or bfq is not available, default to "none". 
+ */ + static struct elevator_type *elevator_get_default(struct request_queue *q) + { + if (q->nr_hw_queues != 1) + return NULL; + +- return elevator_get(q, "mq-deadline", false); ++ return elevator_get(q, "bfq", false); + } + + /* +-- +2.28.0 + + +From e2111bc5989131c675659d40e0cc4f214df2f990 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Fri, 10 May 2019 16:45:59 -0300 +Subject: [PATCH 15/17] block: set rq_affinity = 2 for full multithreading I/O + requests + +Signed-off-by: Alexandre Frade +--- + include/linux/blkdev.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 28efe374a2e1..d4e5d35d2ece 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -624,7 +624,8 @@ struct request_queue { + #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ + + #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ +- (1 << QUEUE_FLAG_SAME_COMP)) ++ (1 << QUEUE_FLAG_SAME_COMP) | \ ++ (1 << QUEUE_FLAG_SAME_FORCE)) + + void blk_queue_flag_set(unsigned int flag, struct request_queue *q); + void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); +-- +2.28.0 + + +From 3c229f434aca65c4ca61772bc03c3e0370817b92 Mon Sep 17 00:00:00 2001 +From: Alexandre Frade +Date: Mon, 3 Aug 2020 17:05:04 +0000 +Subject: [PATCH 16/17] mm: set 2 megabytes for address_space-level file + read-ahead pages size + +Signed-off-by: Alexandre Frade +--- + include/linux/pagemap.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index cf2468da68e9..007dea784451 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -655,7 +655,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); + void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec); + +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE) + + void page_cache_sync_readahead(struct address_space *, struct file_ra_state *, + struct file *, pgoff_t index, unsigned long req_count); +-- +2.28.0 + + +From 716f41cf6631f3a85834dcb67b4ce99185b6387f Mon Sep 17 00:00:00 2001 +From: Steven Barrett +Date: Wed, 15 Jan 2020 20:43:56 -0600 +Subject: [PATCH 17/17] ZEN: intel-pstate: Implement "enable" parameter + +If intel-pstate is compiled into the kernel, it will preempt the loading +of acpi-cpufreq so you can take advantage of hardware p-states without +any friction. + +However, intel-pstate is not completely superior to cpufreq's ondemand +for one reason. There's no concept of an up_threshold property. + +In ondemand, up_threshold essentially reduces the maximum utilization to +compare against, allowing you to hit max frequencies and turbo boost +from a much lower core utilization. + +With intel-pstate, you have the concept of minimum and maximum +performance, but no tunable that lets you define, maximum frequency +means 50% core utilization. For just this oversight, there's reasons +you may want ondemand. + +Lets support setting "enable" in kernel boot parameters. This lets +kernel maintainers include "intel_pstate=disable" statically in the +static boot parameters, but let users of the kernel override this +selection. 
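As a small companion check, separate from the patch, the sketch below scans /proc/cmdline (a standard procfs file) for an intel_pstate= option, so you can confirm whether "disable", or the "enable" override this patch adds, was passed at boot.

#include <stdio.h>
#include <string.h>

int main(void)
{
    char cmdline[4096] = "";
    FILE *f = fopen("/proc/cmdline", "r");

    if (!f) {
        perror("/proc/cmdline");
        return 1;
    }
    if (!fgets(cmdline, sizeof(cmdline), f)) {
        fclose(f);
        return 1;
    }
    fclose(f);

    if (strstr(cmdline, "intel_pstate=enable"))
        printf("intel_pstate=enable passed (overrides a built-in disable)\n");
    else if (strstr(cmdline, "intel_pstate=disable"))
        printf("intel_pstate=disable passed\n");
    else
        printf("no intel_pstate= boot parameter found\n");
    return 0;
}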
+--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + drivers/cpufreq/intel_pstate.c | 2 ++ + 2 files changed, 5 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index fb95fad81c79..3e92fee81e33 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -1857,6 +1857,9 @@ + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors ++ enable ++ Enable intel_pstate in-case "disable" was passed ++ previously in the kernel boot parameters + passive + Use intel_pstate as a scaling driver, but configure it + to work with generic cpufreq governors (instead of +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index 36a469150ff9..aee891c9b78a 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -2845,6 +2845,8 @@ static int __init intel_pstate_setup(char *str) + pr_info("HWP disabled\n"); + no_hwp = 1; + } ++ if (!strcmp(str, "enable")) ++ no_load = 0; + if (!strcmp(str, "force")) + force_load = 1; + if (!strcmp(str, "hwp_only")) +-- +2.28.0 + diff --git a/linux-tkg/linux-tkg-patches/5.9/0003-glitched-cfs.patch b/linux-tkg/linux-tkg-patches/5.9/0003-glitched-cfs.patch new file mode 100644 index 0000000..06b7f02 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0003-glitched-cfs.patch @@ -0,0 +1,72 @@ +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + diff --git a/linux-tkg/linux-tkg-patches/5.9/0004-5.9-ck1.patch b/linux-tkg/linux-tkg-patches/5.9/0004-5.9-ck1.patch new file mode 100644 index 0000000..33e9da3 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0004-5.9-ck1.patch @@ -0,0 +1,13384 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index a1068742a6df..d2a8f1c637d2 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4595,6 +4595,14 @@ + Memory area to be used by remote processor image, + managed by CMA. + ++ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. ++ Format: ++ smt -- Share SMT (hyperthread) sibling runqueues ++ mc -- Share MC (multicore) sibling runqueues ++ smp -- Share SMP runqueues ++ none -- So not share any runqueues ++ Default value is mc ++ + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode +diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index d4b32cc32bb7..9e1e71fc66d0 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -436,6 +436,16 @@ this allows system administrators to override the + ``IA64_THREAD_UAC_NOPRINT`` ``prctl`` and avoid logs being flooded. + + ++iso_cpu: (MuQSS CPU scheduler only) ++=================================== ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++ + kexec_load_disabled + =================== + +@@ -1077,6 +1087,20 @@ ROM/Flash boot loader. Maybe to tell it what to do after + rebooting. ??? + + ++rr_interval: (MuQSS CPU scheduler only) ++======================================= ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++ + sched_energy_aware + ================== + +@@ -1515,3 +1539,13 @@ is 10 seconds. + + The softlockup threshold is (``2 * watchdog_thresh``). Setting this + tunable to zero will disable lockup detection altogether. ++ ++ ++yield_type: (MuQSS CPU scheduler only) ++====================================== ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. ++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. +diff --git a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +new file mode 100644 +index 000000000000..c0282002a079 +--- /dev/null ++++ b/Documentation/scheduler/sched-BFS.txt +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. 
++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. ++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. 
Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. 
Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. 
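A worked example of the arithmetic described above, separate from the patch: the real scheduler keeps an integer prio_ratios[] table and works in jiffies, but the numbers follow the same rule, a ratio growing 10% per nice level from the nice -20 baseline and a deadline offset of prio_ratio * rr_interval (assumed here at its default of 6ms).

#include <math.h>
#include <stdio.h>

/* ratio is 1.0 at nice -20 and grows 10% per nice level, per the text above */
static double prio_ratio(int nice)
{
    return pow(1.10, nice + 20);
}

int main(void)
{
    const double rr_interval_ms = 6.0;
    int nice;

    for (nice = -20; nice <= 19; nice += 13)
        printf("nice %3d: prio_ratio %6.2f, deadline offset %7.1f ms\n",
               nice, prio_ratio(nice), prio_ratio(nice) * rr_interval_ms);

    /* relative CPU share works out to roughly the squared ratio difference */
    printf("approx CPU share, nice 0 vs nice 19: %.1f : 1\n",
           pow(prio_ratio(19) / prio_ratio(0), 2.0));
    return 0;
}

(Build with -lm; the output is only meant to show how quickly the deadline offset, and hence the CPU share, falls away with nice level.)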
++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. ++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependant on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that. Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. ++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. ++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. 
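The "relative cache distance" rule described above (deadline unchanged for cache-local CPUs, doubled for cache-distant ones, quadrupled across NUMA nodes) reduces to a very small helper. The sketch below is illustrative only; the enum and function names are invented for the example and are not the scheduler's own.

#include <stdio.h>

enum cache_distance { CACHE_LOCAL, CACHE_DISTANT, NUMA_DISTANT };

/* scale a task's virtual deadline by how far away the candidate CPU is */
static unsigned long long scale_deadline(unsigned long long deadline,
                                         enum cache_distance dist)
{
    switch (dist) {
    case CACHE_LOCAL:
        return deadline;        /* shared cache: compare as-is */
    case CACHE_DISTANT:
        return deadline * 2;    /* same node, different cache */
    case NUMA_DISTANT:
    default:
        return deadline * 4;    /* other NUMA node */
    }
}

int main(void)
{
    unsigned long long d = 6000000; /* e.g. a 6ms offset expressed in ns */

    printf("local %llu, distant %llu, numa %llu\n",
           scale_deadline(d, CACHE_LOCAL),
           scale_deadline(d, CACHE_DISTANT),
           scale_deadline(d, NUMA_DISTANT));
    return 0;
}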
++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. Valid values are from 1 to 1000 ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. ++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). 
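For completeness, the schedtool invocation above can also be done directly with sched_setscheduler(). The policy number below is an assumption: glibc headers do not define SCHED_ISO, and 4 is the slot mainline reserves for it; on a kernel without BFS/MuQSS the call simply fails with EINVAL.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4     /* assumed policy number on BFS/MuQSS kernels */
#endif

int main(void)
{
    struct sched_param sp = { .sched_priority = 0 };

    if (sched_setscheduler(0, SCHED_ISO, &sp) == -1) {
        perror("sched_setscheduler(SCHED_ISO)");
        return 1;       /* e.g. running on a non-BFS/MuQSS kernel */
    }
    printf("now running as SCHED_ISO (policy %d)\n", sched_getscheduler(0));
    return 0;
}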
However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks. To avoid this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. 
Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas Fri Aug 27 2010 +diff --git a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt +new file mode 100644 +index 000000000000..ae28b85c9995 +--- /dev/null ++++ b/Documentation/scheduler/sched-MuQSS.txt +@@ -0,0 +1,373 @@ ++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. ++ ++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with ++one 8 level skiplist per runqueue, and fine grained locking for much more ++scalability. ++ ++ ++Goals. ++ ++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from ++here on (pronounced mux) is to completely do away with the complex designs of ++the past for the cpu process scheduler and instead implement one that is very ++simple in basic design. The main focus of MuQSS is to achieve excellent desktop ++interactivity and responsiveness without heuristics and tuning knobs that are ++difficult to understand, impossible to model and predict the effect of, and when ++tuned to one workload cause massive detriment to another, while still being ++scalable to many CPUs and processes. ++ ++ ++Design summary. ++ ++MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) ++lookup, earliest effective virtual deadline first tickless design, loosely based ++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase ++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. ++Each component shall be described in order to understand the significance of, ++and reasoning for it. ++ ++ ++Design reasoning. ++ ++In BFS, the use of a single runqueue across all CPUs meant that each CPU would ++need to scan the entire runqueue looking for the process with the earliest ++deadline and schedule that next, regardless of which CPU it originally came ++from. This made BFS deterministic with respect to latency and provided ++guaranteed latencies dependent on number of processes and CPUs. The single ++runqueue, however, meant that all CPUs would compete for the single lock ++protecting it, which would lead to increasing lock contention as the number of ++CPUs rose and appeared to limit scalability of common workloads beyond 16 ++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously ++increased overhead proportionate to the number of queued proecesses and led to ++cache thrashing while iterating over the linked list. ++ ++MuQSS is an evolution of BFS, designed to maintain the same scheduling ++decision mechanism and be virtually deterministic without relying on the ++constrained design of the single runqueue by splitting out the single runqueue ++to be per-CPU and use skiplists instead of linked lists. 
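To make the data-structure choice above concrete, here is a toy, userspace-only skip list keyed by virtual deadline: insertion costs O(log n) on average and "what runs next" is simply the first node, an O(1) peek. It is deliberately simplified (random levels, no locking, no removal, nothing freed) and is not the kernel's implementation, which embeds fixed 8-level arrays in the task structure.

#include <stdio.h>
#include <stdlib.h>

#define LEVELS 8

struct node {
    unsigned long long deadline;
    int task_id;
    struct node *next[LEVELS];
};

static struct node head;        /* sentinel; next[] starts out all NULL */

static int random_level(void)
{
    int level = 1;

    while (level < LEVELS && (rand() & 1))
        level++;                /* each extra level with probability 1/2 */
    return level;
}

static void insert(unsigned long long deadline, int task_id)
{
    struct node *n = calloc(1, sizeof(*n));
    struct node *cur = &head;
    int lvl = random_level(), i;

    if (!n)
        return;
    n->deadline = deadline;
    n->task_id = task_id;
    for (i = LEVELS - 1; i >= 0; i--) {
        while (cur->next[i] && cur->next[i]->deadline < deadline)
            cur = cur->next[i];
        if (i < lvl) {          /* splice into the levels this node uses */
            n->next[i] = cur->next[i];
            cur->next[i] = n;
        }
    }
}

int main(void)
{
    insert(9000, 1);
    insert(3000, 2);
    insert(6000, 3);
    /* the earliest deadline is always the first node on the bottom level */
    printf("next task: id %d, deadline %llu\n",
           head.next[0]->task_id, head.next[0]->deadline);
    return 0;
}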
++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks. ++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons. It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++As a further evolution of the design, MuQSS normally configures sharing of ++runqueues in a logical fashion for when CPU resources are shared for improved ++latency and throughput. By default it shares runqueues and locks between ++multicore siblings. Optionally it can be configured to run with sharing of ++SMT siblings only, all SMP packages or no sharing at all. Additionally it can ++be selected at boot time. ++ ++ ++Design details. ++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle. By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. 
Niffies are calculated per-runqueue from the high ++resolution TSC timers, and in order to maintain fairness are synchronised ++between CPUs whenever both runqueues are locked concurrently. ++ ++Virtual deadline: ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in MuQSS is entirely in the virtual deadline mechanism. The one ++tunable in MuQSS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in niffies by this equation: ++ ++ niffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (niffies) is ++constantly moving. ++ ++Task lookup: ++ ++As tasks are already pre-ordered according to anticipated scheduling order in ++the skip lists, lookup for the next suitable task per-runqueue is always a ++matter of simply selecting the first task in the 0th level skip list entry. ++In order to maintain optimal latency and fairness across CPUs, MuQSS does a ++novel examination of every other runqueue in cache locality order, choosing the ++best task across all runqueues. This provides near-determinism of how long any ++task across the entire system may wait before receiving CPU time. The other ++runqueues are first examine lockless and then trylocked to minimise the ++potential lock contention if they are likely to have a suitable better task. ++Each other runqueue lock is only held for as long as it takes to examine the ++entry for suitability. In "interactive" mode, the default setting, MuQSS will ++look for the best deadline task across all CPUs, while in !interactive mode, ++it will only select a better deadline task from another CPU if it is more ++heavily laden than the current one. 
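The cross-runqueue selection just described can be pictured as a scan over the head of each per-CPU queue. The sketch below is only the skeleton of that idea, with each runqueue reduced to the deadline of its best queued task; the lockless pre-check, the trylocks and the interactive-mode rules are all left out.

#include <stdio.h>

#define NR_CPUS 4

/* deadline (in niffies) of the head task of each runqueue; 0 means empty */
static unsigned long long rq_head_deadline[NR_CPUS] = { 9000, 3000, 0, 6000 };

static int pick_next_cpu(void)
{
    int cpu, best = -1;

    for (cpu = 0; cpu < NR_CPUS; cpu++) {
        if (!rq_head_deadline[cpu])
            continue;           /* nothing queued on this runqueue */
        if (best < 0 || rq_head_deadline[cpu] < rq_head_deadline[best])
            best = cpu;
    }
    return best;                /* -1 if every runqueue is empty */
}

int main(void)
{
    int cpu = pick_next_cpu();

    if (cpu >= 0)
        printf("earliest deadline %llu found on cpu %d\n",
               rq_head_deadline[cpu], cpu);
    return 0;
}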
++
++Lookup is therefore O(k) where k is the number of CPUs.
++
++
++Latency.
++
++Through the use of virtual deadlines to govern the scheduling order of normal
++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by
++the rr_interval tunable which is set to 6ms by default. This means that the
++longest a CPU bound task will wait for more CPU is proportional to the number
++of running tasks and in the common case of 0-2 running tasks per CPU, will be
++under the 7ms threshold for human perception of jitter. Additionally, as newly
++woken tasks will have an early deadline from their previous runtime, the very
++tasks that are usually latency sensitive will have the shortest interval for
++activation, usually preempting any existing CPU bound tasks.
++
++Tickless expiry:
++
++A feature of MuQSS is that it is not tied to the resolution of the chosen tick
++rate in Hz, instead depending entirely on the high resolution timers where
++possible for sub-millisecond accuracy on timeouts regardless of the underlying
++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates
++such as 100 by default, benefiting from the improved throughput and lower
++power usage it provides. Another advantage of this approach is that in
++combination with the Full No HZ option, which disables ticks on running task
++CPUs instead of just idle CPUs, the tick can be disabled at all times
++regardless of how many tasks are running instead of being limited to just one
++running task. Note that this option is NOT recommended for regular desktop
++users.
++
++
++Scalability and balancing.
++
++Unlike traditional approaches where balancing is a combination of CPU selection
++at task wakeup and intermittent balancing based on a vast array of rules set
++according to architecture, busyness calculations and special case management,
++MuQSS indirectly balances on the fly at task wakeup and next task selection.
++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for
++each logical CPU and uses this to aid task/CPU selection when CPUs are busy.
++Additionally it selects any idle CPUs, if they are available, at any time over
++busy CPUs according to the following preference:
++
++ * Same thread, idle or busy cache, idle or busy threads
++ * Other core, same cache, idle or busy cache, idle threads.
++ * Same node, other CPU, idle cache, idle threads.
++ * Same node, other CPU, busy cache, idle threads.
++ * Other core, same cache, busy threads.
++ * Same node, other CPU, busy threads.
++ * Other node, other CPU, idle cache, idle threads.
++ * Other node, other CPU, busy cache, idle threads.
++ * Other node, other CPU, busy threads.
++
++Mux is therefore SMT, MC and NUMA aware without the need for extra
++intermittent balancing to maintain CPUs busy and make the most of cache
++coherency.
++
++
++Features
++
++As the initial prime target audience for MuQSS was the average desktop user, it
++was designed to not need tweaking, tuning or have features set to obtain benefit
++from it. Thus the number of knobs and features has been kept to an absolute
++minimum and should not require extra user input for the vast majority of cases.
++There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval,
++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
++does _not_ now feature is support for CGROUPS.
The average user should neither
++need to know what these are, nor should they need to be using them to have good
++desktop behaviour. However since some applications refuse to work without
++cgroups, one can enable them with MuQSS as a stub and the filesystem will be
++created which will allow the applications to work.
++
++rr_interval:
++
++ /proc/sys/kernel/rr_interval
++
++The value is in milliseconds, and the default value is set to 6. Valid values
++are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
++decreasing throughput, while increasing it will improve throughput, but at the
++cost of worsening latencies. It is based on the fact that humans can detect
++jitter at approximately 7ms, so aiming for much lower latencies is pointless
++under most circumstances. It is worth noting this fact when comparing the
++latency performance of MuQSS to other schedulers. Worst case latencies being
++higher than 7ms are far worse than average latencies not being in the
++microsecond range.
++
++interactive:
++
++ /proc/sys/kernel/interactive
++
++The value is a simple boolean of 1 for on and 0 for off and is set to on by
++default. Disabling this will disable the near-determinism of MuQSS when
++selecting the next task by not examining all CPUs for the earliest deadline
++task, or which CPU to wake to, instead prioritising CPU balancing for improved
++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
++instead of across the whole system.
++
++Runqueue sharing.
++
++By default MuQSS chooses to share runqueue resources (specifically the skip
++list and locking) between multicore siblings. It is configurable at build time
++to select between None, SMT, MC and SMP, corresponding to no sharing, sharing
++only between simultaneous multithreading siblings, multicore siblings, or
++symmetric multiprocessing physical packages. Additionally it can be set at
++boot time with the use of the rqshare parameter. The reason for configurability
++is that some architectures have CPUs with many multicore siblings (>= 16)
++where it may be detrimental to throughput to share runqueues and another
++sharing option may be desirable. Additionally, more sharing than usual can
++improve latency on a system-wide level at the expense of throughput if desired.
++
++The options are:
++none, smt, mc, smp
++
++eg:
++ rqshare=mc
++
++Isochronous scheduling:
++
++Isochronous scheduling is a unique scheduling policy designed to provide
++near-real-time performance to unprivileged (ie non-root) users without the
++ability to starve the machine indefinitely. Isochronous tasks (which means
++"same time") are set using, for example, the schedtool application like so:
++
++ schedtool -I -e amarok
++
++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
++is that it has a priority level between true realtime tasks and SCHED_NORMAL
++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
++rate). However if ISO tasks run for more than a tunable finite amount of time,
++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
++time is the percentage of CPU available per CPU, configurable as a percentage in
++the following "resource handling" tunable (as opposed to a scheduler tunable):
++
++iso_cpu:
++
++ /proc/sys/kernel/iso_cpu
++
++and is set to 70% by default.
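The tunables listed in this document can be inspected from userspace like any other sysctl. The reader below uses the procfs paths given above; they exist only on a kernel built with MuQSS, so elsewhere the opens simply fail and the program says so.

#include <stdio.h>

static void show(const char *path)
{
    char buf[32];
    FILE *f = fopen(path, "r");

    if (!f) {
        printf("%-30s (not available; not a MuQSS kernel?)\n", path);
        return;
    }
    if (fgets(buf, sizeof(buf), f))
        printf("%-30s %s", path, buf);
    fclose(f);
}

int main(void)
{
    show("/proc/sys/kernel/rr_interval");   /* milliseconds, default 6 */
    show("/proc/sys/kernel/interactive");   /* boolean, default 1 */
    show("/proc/sys/kernel/iso_cpu");       /* percent, default 70 */
    return 0;
}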
It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of MuQSS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++ ++ ++Idleprio scheduling: ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start a ++video encode or so on without any slowdown of other tasks. To avoid this policy ++from grabbing shared resources and holding them indefinitely, if it detects a ++state where the task is waiting on I/O, the machine is about to suspend to ram ++and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has ++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without ++superuser privileges since it is effectively a lower scheduling policy. Tasks ++can be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++schedtool -D -e ./mprime ++ ++Subtick accounting: ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the timer ++tick frequency (HZ) is lowered. It is possible to create an application which ++uses almost 100% CPU, yet by being descheduled at the right time, records zero ++CPU usage. While the main problem with this is that there are possible security ++implications, it is also difficult to determine how much CPU a task really does ++use. Mux uses sub-tick accounting from the TSC clock to determine real CPU ++usage. Thus, the amount of CPU reported as being used by MuQSS will more ++accurately represent how much CPU the task itself is using (as is shown for ++example by the 'time' application), so the reported values may be quite ++different to other schedulers. When comparing throughput of MuQSS to other ++designs, it is important to compare the actual completed work in terms of total ++wall clock time taken and total work done, rather than the reported "cpu usage". ++ ++Symmetric MultiThreading (SMT) aware nice: ++ ++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs. The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. 
++Subtick accounting: ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the timer ++tick frequency (HZ) is lowered. It is possible to create an application which ++uses almost 100% CPU, yet by being descheduled at the right time, records zero ++CPU usage. While the main problem with this is that there are possible security ++implications, it is also difficult to determine how much CPU a task really does ++use. MuQSS uses sub-tick accounting from the TSC clock to determine real CPU ++usage. Thus, the amount of CPU reported as being used by MuQSS will more ++accurately represent how much CPU the task itself is using (as is shown for ++example by the 'time' application), so the reported values may be quite ++different to other schedulers. When comparing throughput of MuQSS to other ++designs, it is important to compare the actual completed work in terms of total ++wall clock time taken and total work done, rather than the reported "cpu usage". ++ ++Symmetric MultiThreading (SMT) aware nice: ++ ++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs. The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the nice task gets ++precisely the same amount of CPU power as the unniced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. ++ ++ ++Con Kolivas Sat, 29th October 2016 +diff --git a/Makefile b/Makefile +index 51540b291738..ab8c480660a6 100644 +--- a/Makefile ++++ b/Makefile +@@ -18,6 +18,10 @@ $(if $(filter __%, $(MAKECMDGOALS)), \ + PHONY := __all + __all: + ++CKVERSION = -ck1 ++CKNAME = MuQSS Powered ++EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) ++ + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. + # +diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig +index 9c5f06e8eb9b..0d1069eee09c 100644 +--- a/arch/alpha/Kconfig ++++ b/arch/alpha/Kconfig +@@ -666,6 +666,8 @@ config HZ + default 1200 if HZ_1200 + default 1024 + ++source "kernel/Kconfig.MuQSS" ++ + config SRM_ENV + tristate "SRM environment through procfs" + depends on PROC_FS +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index a12656ec0072..b46b6ddc7636 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -29,7 +29,7 @@ CONFIG_ARC_PLAT_TB10X=y + CONFIG_ARC_CACHE_LINE_SHIFT=5 + CONFIG_HZ=250 + CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_COMPACTION is not set + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index e00d94b16658..efabbd09475a 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -1236,6 +1236,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here.
+ ++source "kernel/Kconfig.MuQSS" ++ + config HAVE_ARM_SCU + bool + help +diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig +index 44ff9cd88d81..9c639c998015 100644 +--- a/arch/arm/configs/bcm2835_defconfig ++++ b/arch/arm/configs/bcm2835_defconfig +@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_ARCH_MULTI_V6=y + CONFIG_ARCH_BCM=y + CONFIG_ARCH_BCM2835=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_KSM=y + CONFIG_CLEANCACHE=y +diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig +index 82d3ffb18e70..bb05667427a6 100644 +--- a/arch/arm/configs/imx_v6_v7_defconfig ++++ b/arch/arm/configs/imx_v6_v7_defconfig +@@ -45,6 +45,7 @@ CONFIG_PCI_MSI=y + CONFIG_PCI_IMX6=y + CONFIG_SMP=y + CONFIG_ARM_PSCI=y ++CONFIG_PREEMPT=y + CONFIG_HIGHMEM=y + CONFIG_FORCE_MAX_ZONEORDER=14 + CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" +diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig +index 1d923dbb9928..9c1931f1fafd 100644 +--- a/arch/arm/configs/mps2_defconfig ++++ b/arch/arm/configs/mps2_defconfig +@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y + CONFIG_SET_MEM_PARAM=y + CONFIG_DRAM_BASE=0x21000000 + CONFIG_DRAM_SIZE=0x1000000 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_ATAGS is not set + CONFIG_ZBOOT_ROM_TEXT=0x0 + CONFIG_ZBOOT_ROM_BSS=0x0 +diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig +index a9c6f32a9b1c..870866aaa39d 100644 +--- a/arch/arm/configs/mxs_defconfig ++++ b/arch/arm/configs/mxs_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT_VOLUNTARY=n + CONFIG_TASKSTATS=y + CONFIG_TASK_DELAY_ACCT=y + CONFIG_TASK_XACCT=y +@@ -25,6 +25,13 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_MODVERSIONS=y + CONFIG_BLK_DEV_INTEGRITY=y ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++# CONFIG_ARCH_MULTI_V7 is not set ++CONFIG_ARCH_MXS=y ++# CONFIG_ARM_THUMB is not set ++CONFIG_PREEMPT=y ++CONFIG_AEABI=y + CONFIG_NET=y + CONFIG_PACKET=y + CONFIG_UNIX=y +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 6d232837cbee..052cae73d674 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -945,6 +945,8 @@ config SCHED_SMT + MultiThreading at a cost of slightly increased overhead in some + places. If unsure say N here. 
+ ++source "kernel/Kconfig.MuQSS" ++ + config NR_CPUS + int "Maximum number of CPUs (2-4096)" + range 2 4096 +diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig +index 023b4e644b1c..013e630b96a6 100644 +--- a/arch/mips/configs/fuloong2e_defconfig ++++ b/arch/mips/configs/fuloong2e_defconfig +@@ -4,7 +4,7 @@ CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig +index 9085f4d6c698..fb23111d45f6 100644 +--- a/arch/mips/configs/gpr_defconfig ++++ b/arch/mips/configs/gpr_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig +index 21a1168ae301..529a1b1007cf 100644 +--- a/arch/mips/configs/ip22_defconfig ++++ b/arch/mips/configs/ip22_defconfig +@@ -1,7 +1,7 @@ + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig +index 0921ef38e9fb..6da05cef46f8 100644 +--- a/arch/mips/configs/ip28_defconfig ++++ b/arch/mips/configs/ip28_defconfig +@@ -1,5 +1,5 @@ + CONFIG_SYSVIPC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig +index 8c223035921f..a3bf87450343 100644 +--- a/arch/mips/configs/jazz_defconfig ++++ b/arch/mips/configs/jazz_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_LOG_BUF_SHIFT=14 + CONFIG_RELAY=y +diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig +index 914af125a7fa..76a64290373f 100644 +--- a/arch/mips/configs/mtx1_defconfig ++++ b/arch/mips/configs/mtx1_defconfig +@@ -1,8 +1,8 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_RELAY=y +diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig +index 4ecb157e56d4..ea7309283b01 100644 +--- a/arch/mips/configs/nlm_xlr_defconfig ++++ b/arch/mips/configs/nlm_xlr_defconfig +@@ -1,10 +1,10 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_BSD_PROCESS_ACCT_V3=y + CONFIG_TASKSTATS=y +diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig +index 63fe2da1b37f..7f08ee237345 100644 +--- a/arch/mips/configs/pic32mzda_defconfig ++++ b/arch/mips/configs/pic32mzda_defconfig +@@ -1,7 +1,7 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y + 
CONFIG_LOG_BUF_SHIFT=14 +diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig +index b9adf15ebbec..0025b56dc300 100644 +--- a/arch/mips/configs/pistachio_defconfig ++++ b/arch/mips/configs/pistachio_defconfig +@@ -1,9 +1,9 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_DEFAULT_HOSTNAME="localhost" + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_IKCONFIG=m + CONFIG_IKCONFIG_PROC=y + CONFIG_LOG_BUF_SHIFT=18 +diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig +index d06db6b87959..fb2cd3234d95 100644 +--- a/arch/mips/configs/pnx8335_stb225_defconfig ++++ b/arch/mips/configs/pnx8335_stb225_defconfig +@@ -1,9 +1,9 @@ ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + # CONFIG_SWAP is not set + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_LOG_BUF_SHIFT=14 + CONFIG_EXPERT=y + CONFIG_SLAB=y +diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig +index 30d7c3db884e..9e68acfa0d0e 100644 +--- a/arch/mips/configs/rm200_defconfig ++++ b/arch/mips/configs/rm200_defconfig +@@ -1,6 +1,6 @@ ++CONFIG_PREEMPT=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +-CONFIG_PREEMPT_VOLUNTARY=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig +new file mode 100644 +index 000000000000..578524f80cc4 +--- /dev/null ++++ b/arch/parisc/configs/712_defconfig +@@ -0,0 +1,181 @@ ++# CONFIG_LOCALVERSION_AUTO is not set ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=16 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_SLAB=y ++CONFIG_PROFILING=y ++CONFIG_OPROFILE=m ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++CONFIG_PA7100LC=y ++CONFIG_PREEMPT=y ++CONFIG_GSC_LASI=y ++# CONFIG_PDC_CHASSIS is not set ++CONFIG_BINFMT_MISC=m ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=m ++CONFIG_NET_KEY=m ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_IP_PNP_BOOTP=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_DIAG=m ++# CONFIG_IPV6 is not set ++CONFIG_NETFILTER=y ++CONFIG_LLC2=m ++CONFIG_NET_PKTGEN=m ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++# CONFIG_STANDALONE is not set ++# CONFIG_PREVENT_FIRMWARE_BUILD is not set ++CONFIG_PARPORT=y ++CONFIG_PARPORT_PC=m ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=6144 ++CONFIG_ATA_OVER_ETH=m ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++CONFIG_CHR_DEV_ST=y ++CONFIG_BLK_DEV_SR=y ++CONFIG_CHR_DEV_SG=y ++CONFIG_SCSI_ISCSI_ATTRS=m ++CONFIG_SCSI_LASI700=y ++CONFIG_SCSI_DEBUG=m ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=m ++CONFIG_MD_LINEAR=m ++CONFIG_MD_RAID0=m ++CONFIG_MD_RAID1=m ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_TUN=m ++CONFIG_LASI_82596=y ++CONFIG_PPP=m ++CONFIG_PPP_BSDCOMP=m ++CONFIG_PPP_DEFLATE=m ++CONFIG_PPP_MPPE=m ++CONFIG_PPPOE=m ++CONFIG_PPP_ASYNC=m ++CONFIG_PPP_SYNC_TTY=m ++# CONFIG_KEYBOARD_HIL_OLD is not set ++CONFIG_MOUSE_SERIAL=m ++CONFIG_LEGACY_PTY_COUNT=64 ++CONFIG_SERIAL_8250=y ++CONFIG_SERIAL_8250_CONSOLE=y ++CONFIG_SERIAL_8250_NR_UARTS=17 ++CONFIG_SERIAL_8250_EXTENDED=y ++CONFIG_SERIAL_8250_MANY_PORTS=y ++CONFIG_SERIAL_8250_SHARE_IRQ=y ++# CONFIG_SERIAL_MUX is not set 
++CONFIG_PDC_CONSOLE=y ++CONFIG_PRINTER=m ++CONFIG_PPDEV=m ++# CONFIG_HW_RANDOM is not set ++CONFIG_RAW_DRIVER=y ++# CONFIG_HWMON is not set ++CONFIG_FB=y ++CONFIG_FB_MODE_HELPERS=y ++CONFIG_FB_TILEBLITTING=y ++CONFIG_DUMMY_CONSOLE_COLUMNS=128 ++CONFIG_DUMMY_CONSOLE_ROWS=48 ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SEQUENCER=y ++CONFIG_SND_HARMONY=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT3_FS=y ++CONFIG_JFS_FS=m ++CONFIG_XFS_FS=m ++CONFIG_AUTOFS4_FS=y ++CONFIG_ISO9660_FS=y ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_UFS_FS=m ++CONFIG_NFS_FS=y ++CONFIG_NFS_V4=y ++CONFIG_ROOT_NFS=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_FS=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_MUTEXES=y ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++CONFIG_CRYPTO_DEFLATE=m ++# CONFIG_CRYPTO_HW is not set ++CONFIG_FONTS=y ++CONFIG_FONT_8x8=y ++CONFIG_FONT_8x16=y +diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig +new file mode 100644 +index 000000000000..d1bdfad94048 +--- /dev/null ++++ b/arch/parisc/configs/c3000_defconfig +@@ -0,0 +1,151 @@ ++# CONFIG_LOCALVERSION_AUTO is not set ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=16 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_SLAB=y ++CONFIG_PROFILING=y ++CONFIG_OPROFILE=m ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++CONFIG_PA8X00=y ++CONFIG_PREEMPT=y ++# CONFIG_GSC is not set ++CONFIG_PCI=y ++CONFIG_PCI_LBA=y ++# CONFIG_PDC_CHASSIS is not set ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=m ++CONFIG_NET_KEY=m ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_BOOTP=y ++# CONFIG_INET_DIAG is not set ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_NETFILTER=y ++CONFIG_NET_PKTGEN=m ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++# CONFIG_STANDALONE is not set ++# CONFIG_PREVENT_FIRMWARE_BUILD is not set ++CONFIG_BLK_DEV_UMEM=m 
++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_IDE=y ++CONFIG_BLK_DEV_IDECD=y ++CONFIG_BLK_DEV_NS87415=y ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++CONFIG_CHR_DEV_ST=y ++CONFIG_BLK_DEV_SR=y ++CONFIG_CHR_DEV_SG=y ++CONFIG_SCSI_ISCSI_ATTRS=m ++CONFIG_SCSI_SYM53C8XX_2=y ++CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0 ++CONFIG_SCSI_DEBUG=m ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=y ++CONFIG_MD_RAID0=y ++CONFIG_MD_RAID1=y ++CONFIG_BLK_DEV_DM=m ++CONFIG_DM_CRYPT=m ++CONFIG_DM_SNAPSHOT=m ++CONFIG_DM_MIRROR=m ++CONFIG_DM_ZERO=m ++CONFIG_DM_MULTIPATH=m ++CONFIG_FUSION=y ++CONFIG_FUSION_SPI=m ++CONFIG_FUSION_CTL=m ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_TUN=m ++CONFIG_ACENIC=m ++CONFIG_TIGON3=m ++CONFIG_NET_TULIP=y ++CONFIG_DE2104X=m ++CONFIG_TULIP=y ++CONFIG_TULIP_MMIO=y ++CONFIG_E100=m ++CONFIG_E1000=m ++CONFIG_PPP=m ++CONFIG_PPP_BSDCOMP=m ++CONFIG_PPP_DEFLATE=m ++CONFIG_PPPOE=m ++CONFIG_PPP_ASYNC=m ++CONFIG_PPP_SYNC_TTY=m ++# CONFIG_KEYBOARD_ATKBD is not set ++# CONFIG_MOUSE_PS2 is not set ++CONFIG_SERIO=m ++CONFIG_SERIO_LIBPS2=m ++CONFIG_SERIAL_8250=y ++CONFIG_SERIAL_8250_CONSOLE=y ++CONFIG_SERIAL_8250_NR_UARTS=13 ++CONFIG_SERIAL_8250_EXTENDED=y ++CONFIG_SERIAL_8250_MANY_PORTS=y ++CONFIG_SERIAL_8250_SHARE_IRQ=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_RAW_DRIVER=y ++# CONFIG_HWMON is not set ++CONFIG_FB=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SEQUENCER=y ++CONFIG_SND_AD1889=y ++CONFIG_USB_HIDDEV=y ++CONFIG_USB=y ++CONFIG_USB_OHCI_HCD=y ++CONFIG_USB_PRINTER=m ++CONFIG_USB_STORAGE=m ++CONFIG_USB_STORAGE_USBAT=m ++CONFIG_USB_STORAGE_SDDR09=m ++CONFIG_USB_STORAGE_SDDR55=m ++CONFIG_USB_STORAGE_JUMPSHOT=m ++CONFIG_USB_MDC800=m ++CONFIG_USB_MICROTEK=m ++CONFIG_USB_LEGOTOWER=m ++CONFIG_EXT2_FS=y ++CONFIG_EXT3_FS=y ++CONFIG_XFS_FS=m ++CONFIG_AUTOFS4_FS=y ++CONFIG_ISO9660_FS=y ++CONFIG_JOLIET=y ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_NFS_FS=y ++CONFIG_ROOT_NFS=y ++CONFIG_NFSD=y ++CONFIG_NFSD_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_ASCII=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_INSTALL=y ++CONFIG_HEADERS_CHECK=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_MUTEXES=y ++# CONFIG_DEBUG_BUGVERBOSE is not set ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_MD5=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_DES=m ++# CONFIG_CRYPTO_HW is not set +diff --git a/arch/parisc/configs/defconfig b/arch/parisc/configs/defconfig +new file mode 100644 +index 000000000000..0d976614934c +--- /dev/null ++++ b/arch/parisc/configs/defconfig +@@ -0,0 +1,206 @@ ++# CONFIG_LOCALVERSION_AUTO is not set ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=16 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_KALLSYMS_ALL=y ++CONFIG_SLAB=y ++CONFIG_PROFILING=y ++CONFIG_OPROFILE=m ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++CONFIG_PA7100LC=y ++CONFIG_PREEMPT=y ++CONFIG_IOMMU_CCIO=y ++CONFIG_GSC_LASI=y ++CONFIG_GSC_WAX=y ++CONFIG_EISA=y ++CONFIG_PCI=y ++CONFIG_GSC_DINO=y ++CONFIG_PCI_LBA=y ++CONFIG_PCCARD=y ++CONFIG_YENTA=y ++CONFIG_PD6729=y ++CONFIG_I82092=y ++CONFIG_BINFMT_MISC=m ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=m ++CONFIG_NET_KEY=m 
++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_IP_PNP_BOOTP=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_DIAG=m ++CONFIG_INET6_AH=y ++CONFIG_INET6_ESP=y ++CONFIG_INET6_IPCOMP=y ++CONFIG_LLC2=m ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++# CONFIG_STANDALONE is not set ++# CONFIG_PREVENT_FIRMWARE_BUILD is not set ++CONFIG_PARPORT=y ++CONFIG_PARPORT_PC=m ++CONFIG_PARPORT_PC_PCMCIA=m ++CONFIG_PARPORT_1284=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=6144 ++CONFIG_IDE=y ++CONFIG_BLK_DEV_IDECS=y ++CONFIG_BLK_DEV_IDECD=y ++CONFIG_BLK_DEV_GENERIC=y ++CONFIG_BLK_DEV_NS87415=y ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++CONFIG_CHR_DEV_ST=y ++CONFIG_BLK_DEV_SR=y ++CONFIG_CHR_DEV_SG=y ++CONFIG_SCSI_LASI700=y ++CONFIG_SCSI_SYM53C8XX_2=y ++CONFIG_SCSI_ZALON=y ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=y ++CONFIG_MD_RAID0=y ++CONFIG_MD_RAID1=y ++CONFIG_MD_RAID10=y ++CONFIG_BLK_DEV_DM=y ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_TUN=m ++CONFIG_ACENIC=y ++CONFIG_TIGON3=y ++CONFIG_NET_TULIP=y ++CONFIG_TULIP=y ++CONFIG_LASI_82596=y ++CONFIG_PPP=m ++CONFIG_PPP_BSDCOMP=m ++CONFIG_PPP_DEFLATE=m ++CONFIG_PPPOE=m ++CONFIG_PPP_ASYNC=m ++CONFIG_PPP_SYNC_TTY=m ++# CONFIG_KEYBOARD_HIL_OLD is not set ++CONFIG_MOUSE_SERIAL=y ++CONFIG_LEGACY_PTY_COUNT=64 ++CONFIG_SERIAL_8250=y ++CONFIG_SERIAL_8250_CONSOLE=y ++CONFIG_SERIAL_8250_CS=y ++CONFIG_SERIAL_8250_NR_UARTS=17 ++CONFIG_SERIAL_8250_EXTENDED=y ++CONFIG_SERIAL_8250_MANY_PORTS=y ++CONFIG_SERIAL_8250_SHARE_IRQ=y ++CONFIG_PRINTER=m ++CONFIG_PPDEV=m ++# CONFIG_HW_RANDOM is not set ++# CONFIG_HWMON is not set ++CONFIG_FB=y ++CONFIG_FB_MODE_HELPERS=y ++CONFIG_FB_TILEBLITTING=y ++CONFIG_DUMMY_CONSOLE_COLUMNS=128 ++CONFIG_DUMMY_CONSOLE_ROWS=48 ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_DYNAMIC_MINORS=y ++CONFIG_SND_SEQUENCER=y ++CONFIG_SND_AD1889=y ++CONFIG_SND_HARMONY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_NTRIG=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_HID_TOPSEED=y ++CONFIG_USB=y ++CONFIG_USB_MON=y ++CONFIG_USB_OHCI_HCD=y ++CONFIG_USB_UHCI_HCD=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT3_FS=y ++CONFIG_ISO9660_FS=y ++CONFIG_JOLIET=y ++CONFIG_VFAT_FS=y ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_NFS_FS=y ++CONFIG_ROOT_NFS=y ++CONFIG_NFSD=y ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=y ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=m ++CONFIG_NLS_ISO8859_1=y ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m 
++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=y ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_INSTALL=y ++CONFIG_HEADERS_CHECK=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_MUTEXES=y ++CONFIG_KEYS=y ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++# CONFIG_CRYPTO_HW is not set ++CONFIG_LIBCRC32C=m ++CONFIG_FONTS=y +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 787e829b6f25..22914bbb4caa 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -882,6 +882,8 @@ config SCHED_SMT + when dealing with POWER5 cpus at a cost of slightly increased + overhead in some places. If unsure say N here. + ++source "kernel/Kconfig.MuQSS" ++ + config PPC_DENORMALISATION + bool "PowerPC denormalisation exception handling" + depends on PPC_BOOK3S_64 +diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig +index 66e9a0fd64ff..c8531232efb7 100644 +--- a/arch/powerpc/configs/ppc6xx_defconfig ++++ b/arch/powerpc/configs/ppc6xx_defconfig +@@ -73,7 +73,7 @@ CONFIG_QE_GPIO=y + CONFIG_MCU_MPC8349EMITX=y + CONFIG_HIGHMEM=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BINFMT_MISC=y + CONFIG_HIBERNATION=y + CONFIG_PM_DEBUG=y +diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +index f18d5067cd0f..fe489fc01c73 100644 +--- a/arch/powerpc/platforms/cell/spufs/sched.c ++++ b/arch/powerpc/platforms/cell/spufs/sched.c +@@ -51,11 +51,6 @@ static struct task_struct *spusched_task; + static struct timer_list spusched_timer; + static struct timer_list spuloadavg_timer; + +-/* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- + /* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. 
+diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig +index ee6d28ae08de..827e4693c5b2 100644 +--- a/arch/sh/configs/se7712_defconfig ++++ b/arch/sh/configs/se7712_defconfig +@@ -21,7 +21,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=66666666 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" + CONFIG_NET=y +diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig +index bad921bc10f8..e8f42bc0d370 100644 +--- a/arch/sh/configs/se7721_defconfig ++++ b/arch/sh/configs/se7721_defconfig +@@ -21,7 +21,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_7721_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=33333333 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" + CONFIG_NET=y +diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig +index ba887f1351be..4434e93b70bc 100644 +--- a/arch/sh/configs/titan_defconfig ++++ b/arch/sh/configs/titan_defconfig +@@ -19,7 +19,7 @@ CONFIG_SH_TITAN=y + CONFIG_SH_PCLK_FREQ=30000000 + CONFIG_SH_DMA=y + CONFIG_SH_DMA_API=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" + CONFIG_PCI=y +diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig +index bde4d21a8ac8..c054ec82d91b 100644 +--- a/arch/sparc/configs/sparc64_defconfig ++++ b/arch/sparc/configs/sparc64_defconfig +@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NUMA=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_SUN_LDOMS=y + CONFIG_PCI=y + CONFIG_PCI_MSI=y +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 7101ac64bb20..6f56ad1894d1 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1005,6 +1005,22 @@ config NR_CPUS + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ help ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ ++ If unsure say Y here. ++ ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +@@ -1035,6 +1051,8 @@ config SCHED_MC_PRIO + + If unsure say Y here. + ++source "kernel/Kconfig.MuQSS" ++ + config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC +@@ -1419,7 +1437,7 @@ config HIGHMEM64G + endchoice + + choice +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + help +@@ -1439,17 +1457,17 @@ choice + option alone! 
+ + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig +index 78210793d357..0c4415b23002 100644 +--- a/arch/x86/configs/i386_defconfig ++++ b/arch/x86/configs/i386_defconfig +@@ -23,6 +23,8 @@ CONFIG_PROFILING=y + CONFIG_SMP=y + CONFIG_X86_GENERIC=y + CONFIG_HPET_TIMER=y ++CONFIG_SCHED_SMT=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_REBOOTFIXUPS=y + CONFIG_MICROCODE_AMD=y +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index 9936528e1939..328c7d0a38a1 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -20,6 +20,9 @@ CONFIG_BLK_DEV_INITRD=y + # CONFIG_COMPAT_BRK is not set + CONFIG_PROFILING=y + CONFIG_SMP=y ++CONFIG_NR_CPUS=64 ++CONFIG_SCHED_SMT=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_MICROCODE_AMD=y + CONFIG_X86_MSR=y +diff --git a/drivers/accessibility/speakup/speakup_acntpc.c b/drivers/accessibility/speakup/speakup_acntpc.c +index c94328a5bd4a..6e7d4671aa69 100644 +--- a/drivers/accessibility/speakup/speakup_acntpc.c ++++ b/drivers/accessibility/speakup/speakup_acntpc.c +@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/accessibility/speakup/speakup_apollo.c b/drivers/accessibility/speakup/speakup_apollo.c +index 0877b4044c28..627102d048c1 100644 +--- a/drivers/accessibility/speakup/speakup_apollo.c ++++ b/drivers/accessibility/speakup/speakup_apollo.c +@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) + if (!synth->io_ops->synth_out(synth, ch)) { + synth->io_ops->tiocmset(0, UART_MCR_RTS); + synth->io_ops->tiocmset(UART_MCR_RTS, 0); +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +diff --git a/drivers/accessibility/speakup/speakup_decext.c b/drivers/accessibility/speakup/speakup_decext.c +index 7408eb29cf38..938a0c35968f 100644 +--- a/drivers/accessibility/speakup/speakup_decext.c ++++ b/drivers/accessibility/speakup/speakup_decext.c +@@ -180,7 +180,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full() || 
!synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/accessibility/speakup/speakup_decpc.c b/drivers/accessibility/speakup/speakup_decpc.c +index 96f24c848cc5..1130dfe4da6c 100644 +--- a/drivers/accessibility/speakup/speakup_decpc.c ++++ b/drivers/accessibility/speakup/speakup_decpc.c +@@ -398,7 +398,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (dt_sendchar(ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/accessibility/speakup/speakup_dectlk.c b/drivers/accessibility/speakup/speakup_dectlk.c +index 780214b5ca16..7b91594c57aa 100644 +--- a/drivers/accessibility/speakup/speakup_dectlk.c ++++ b/drivers/accessibility/speakup/speakup_dectlk.c +@@ -247,7 +247,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/accessibility/speakup/speakup_dtlk.c b/drivers/accessibility/speakup/speakup_dtlk.c +index dbebed0eeeec..6d83c13ca4a6 100644 +--- a/drivers/accessibility/speakup/speakup_dtlk.c ++++ b/drivers/accessibility/speakup/speakup_dtlk.c +@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + jiffy_delta_val = jiffy_delta->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/accessibility/speakup/speakup_keypc.c b/drivers/accessibility/speakup/speakup_keypc.c +index 414827e888fc..cb31c9176daa 100644 +--- a/drivers/accessibility/speakup/speakup_keypc.c ++++ b/drivers/accessibility/speakup/speakup_keypc.c +@@ -199,7 +199,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -232,7 +232,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/accessibility/speakup/synth.c b/drivers/accessibility/speakup/synth.c +index ac47dbac7207..09f6ba829dfd 100644 +--- a/drivers/accessibility/speakup/synth.c ++++ b/drivers/accessibility/speakup/synth.c +@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + 
spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (ch == '\n') + ch = synth->procspeech; +- if (unicode) +- ret = synth->io_ops->synth_out_unicode(synth, ch); +- else +- ret = synth->io_ops->synth_out(synth, ch); +- if (!ret) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ if (!synth->io_ops->synth_out(synth, ch)) { ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth->io_ops->synth_out(synth, synth->procspeech)) +- schedule_timeout( +- msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + else +- schedule_timeout( +- msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/block/swim.c b/drivers/block/swim.c +index dd34504382e5..0caa1c7e9223 100644 +--- a/drivers/block/swim.c ++++ b/drivers/block/swim.c +@@ -328,7 +328,7 @@ static inline void swim_motor(struct swim __iomem *base, + if (swim_readbit(base, MOTOR_ON)) + break; + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + } else if (action == OFF) { + swim_action(base, MOTOR_OFF); +@@ -347,7 +347,7 @@ static inline void swim_eject(struct swim __iomem *base) + if (!swim_readbit(base, DISK_IN)) + break; + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + swim_select(base, RELAX); + } +@@ -372,6 +372,7 @@ static inline int swim_step(struct swim __iomem *base) + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); ++ schedule_min_hrtimeout(); + + swim_select(base, RELAX); + if (!swim_readbit(base, STEP)) +diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c +index 737c0b6b24ea..a3db1f42bb3b 100644 +--- a/drivers/char/ipmi/ipmi_msghandler.c ++++ b/drivers/char/ipmi/ipmi_msghandler.c +@@ -3542,7 +3542,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) + /* Current message first, to preserve order */ + while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { + /* Wait for the message to clear out. */ +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + /* No need for locks, the interface is down. */ +diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c +index 0416b9c9d410..9ce5fae0f1cf 100644 +--- a/drivers/char/ipmi/ipmi_ssif.c ++++ b/drivers/char/ipmi/ipmi_ssif.c +@@ -1288,7 +1288,7 @@ static void shutdown_ssif(void *send_info) + + /* make sure the driver is not looking for flags any more. 
*/ + while (ssif_info->ssif_state != SSIF_NORMAL) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + ssif_info->stopping = true; + del_timer_sync(&ssif_info->watch_timer); +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +index a95156fc5db7..8f07c8900184 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +@@ -235,7 +235,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, + DRM_ERROR("SVGA device lockup.\n"); + break; + } +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + if (interruptible && signal_pending(current)) { + ret = -ERESTARTSYS; + break; +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +index 75f3efee21a4..09b1932ce85b 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +@@ -203,7 +203,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, + break; + } + if (lazy) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + else if ((++count & 0x0F) == 0) { + /** + * FIXME: Use schedule_hr_timeout here for +diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c +index 29f5fed28c2a..974cb08c7aa7 100644 +--- a/drivers/hwmon/fam15h_power.c ++++ b/drivers/hwmon/fam15h_power.c +@@ -221,7 +221,7 @@ static ssize_t power1_average_show(struct device *dev, + prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; + } + +- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); ++ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); + if (leftover) + return 0; + +diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c +index abc8d7db8dc1..baa9d6338a52 100644 +--- a/drivers/iio/light/tsl2563.c ++++ b/drivers/iio/light/tsl2563.c +@@ -269,11 +269,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) + default: + delay = 402; + } +- /* +- * TODO: Make sure that we wait at least required delay but why we +- * have to extend it one tick more? 
+- */ +- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); ++ schedule_msec_hrtimeout_interruptible(delay + 1); + } + + static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) +diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c +index 39530d43590e..a7caf2eb5771 100644 +--- a/drivers/media/i2c/msp3400-driver.c ++++ b/drivers/media/i2c/msp3400-driver.c +@@ -170,7 +170,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) + break; + dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +@@ -211,7 +211,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) + break; + dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c +index cf7cfda94107..f63e17489547 100644 +--- a/drivers/media/pci/cx18/cx18-gpio.c ++++ b/drivers/media/pci/cx18/cx18-gpio.c +@@ -81,11 +81,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, + + /* Assert */ + gpio_update(cx, mask, ~active_lo); +- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); + + /* Deassert */ + gpio_update(cx, mask, ~active_hi); +- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); + } + + /* +diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c +index 856e7ab7f33e..766a26251337 100644 +--- a/drivers/media/pci/ivtv/ivtv-gpio.c ++++ b/drivers/media/pci/ivtv/ivtv-gpio.c +@@ -105,7 +105,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) + curout = (curout & ~0xF) | 1; + write_reg(curout, IVTV_REG_GPIO_OUT); + /* We could use something else for smaller time */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + curout |= 2; + write_reg(curout, IVTV_REG_GPIO_OUT); + curdir &= ~0x80; +@@ -125,11 +125,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) + curout = read_reg(IVTV_REG_GPIO_OUT); + curout &= ~(1 << itv->card->xceive_pin); + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + + curout |= 1 << itv->card->xceive_pin; + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + return 0; + } + +diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c +index 35dccb31174c..8181cd65e876 100644 +--- a/drivers/media/pci/ivtv/ivtv-ioctl.c ++++ b/drivers/media/pci/ivtv/ivtv-ioctl.c +@@ -1139,7 +1139,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) + TASK_UNINTERRUPTIBLE); + if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) + break; +- schedule_timeout(msecs_to_jiffies(25)); ++ schedule_msec_hrtimeout((25)); + } + finish_wait(&itv->vsync_waitq, &wait); + mutex_lock(&itv->serialize_lock); +diff --git a/drivers/media/pci/ivtv/ivtv-streams.c 
b/drivers/media/pci/ivtv/ivtv-streams.c +index f04ee84bab5f..c4469b4b8f99 100644 +--- a/drivers/media/pci/ivtv/ivtv-streams.c ++++ b/drivers/media/pci/ivtv/ivtv-streams.c +@@ -849,7 +849,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) + while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && + time_before(jiffies, + then + msecs_to_jiffies(2000))) { +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + } + + /* To convert jiffies to ms, we must multiply by 1000 +diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c +index cb0437b4c331..163fffc0e1d4 100644 +--- a/drivers/media/radio/radio-mr800.c ++++ b/drivers/media/radio/radio-mr800.c +@@ -366,7 +366,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, + retval = -ENODATA; + break; + } +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + retval = -ERESTARTSYS; + break; + } +diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c +index fb9de7bbcd19..e53cf45e7f3f 100644 +--- a/drivers/media/radio/radio-tea5777.c ++++ b/drivers/media/radio/radio-tea5777.c +@@ -235,7 +235,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) + } + + if (wait) { +- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) ++ if (schedule_msec_hrtimeout_interruptible((wait))) + return -ERESTARTSYS; + } + +diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c +index c37315226c42..e73e6393403c 100644 +--- a/drivers/media/radio/tea575x.c ++++ b/drivers/media/radio/tea575x.c +@@ -401,7 +401,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, + for (;;) { + if (time_after(jiffies, timeout)) + break; +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + /* some signal arrived, stop search */ + tea->val &= ~TEA575X_BIT_SEARCH; + snd_tea575x_set_freq(tea); +diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c +index b690796d24d4..448b13da62b4 100644 +--- a/drivers/mfd/ucb1x00-core.c ++++ b/drivers/mfd/ucb1x00-core.c +@@ -250,7 +250,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) + break; + /* yield to other processes */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + return UCB_ADC_DAT(val); +diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c +index 8e6607fc8a67..b9ab770bbdb5 100644 +--- a/drivers/misc/sgi-xp/xpc_channel.c ++++ b/drivers/misc/sgi-xp/xpc_channel.c +@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) + + atomic_inc(&ch->n_on_msg_allocate_wq); + prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); +- ret = schedule_timeout(1); ++ ret = schedule_min_hrtimeout(); + finish_wait(&ch->msg_allocate_wq, &wait); + atomic_dec(&ch->n_on_msg_allocate_wq); + +diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c +index 4a33ec4fc089..da85f847ebb4 100644 +--- a/drivers/net/caif/caif_hsi.c ++++ b/drivers/net/caif/caif_hsi.c +@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) + break; + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + retry--; + } + +diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c +index 66d0198e7834..ce1c7bf9be87 100644 +--- 
a/drivers/net/can/usb/peak_usb/pcan_usb.c ++++ b/drivers/net/can/usb/peak_usb/pcan_usb.c +@@ -242,7 +242,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) + } else { + /* the PCAN-USB needs time to init */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); ++ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); + } + + return err; +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index 65b315bc60ab..2b3f71086f5f 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -2666,7 +2666,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) + while (!skb_queue_empty(&dev->rxq) && + !skb_queue_empty(&dev->txq) && + !skb_queue_empty(&dev->done)) { +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + netif_dbg(dev, ifdown, dev->net, + "waited for %d urb completions\n", temp); +diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c +index 2b2a841cd938..1a4d27179db1 100644 +--- a/drivers/net/usb/usbnet.c ++++ b/drivers/net/usb/usbnet.c +@@ -767,7 +767,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) + spin_lock_irqsave(&q->lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(&q->lock, flags); +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irqsave(&q->lock, flags); + } +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +index 461e955aa259..5ab8e7396ea4 100644 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +@@ -816,7 +816,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, + * doesn't seem to have as many firmware restart cycles... + * + * As a test, we're sticking in a 1/100s delay here */ +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + return 0; + +@@ -1267,7 +1267,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) + IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); + i = 5000; + do { +- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_uninterruptible((40)); + /* Todo... wait for sync command ... */ + + read_register(priv->net_dev, IPW_REG_INTA, &inta); +diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c +index 4547ac44c8d4..8fa1a7fdf12c 100644 +--- a/drivers/parport/ieee1284.c ++++ b/drivers/parport/ieee1284.c +@@ -202,7 +202,7 @@ int parport_wait_peripheral(struct parport *port, + /* parport_wait_event didn't time out, but the + * peripheral wasn't actually ready either. + * Wait for another 10ms. */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + } + +diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c +index 2c11bd3fe1fd..8cb6b61c0880 100644 +--- a/drivers/parport/ieee1284_ops.c ++++ b/drivers/parport/ieee1284_ops.c +@@ -520,7 +520,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, + /* Yield the port for a while. 
*/ + if (count && dev->port->irq != PARPORT_IRQ_NONE) { + parport_release (dev); +- schedule_timeout_interruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_interruptible((40)); + parport_claim_or_block (dev); + } + else +diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c +index bffe548187ee..c2918ee3e100 100644 +--- a/drivers/platform/x86/intel_ips.c ++++ b/drivers/platform/x86/intel_ips.c +@@ -798,7 +798,7 @@ static int ips_adjust(void *data) + ips_gpu_lower(ips); + + sleep: +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); + } while (!kthread_should_stop()); + + dev_dbg(ips->dev, "ips-adjust thread stopped\n"); +@@ -974,7 +974,7 @@ static int ips_monitor(void *data) + seqno_timestamp = get_jiffies_64(); + + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + + /* Collect an initial average */ + for (i = 0; i < IPS_SAMPLE_COUNT; i++) { +@@ -1001,7 +1001,7 @@ static int ips_monitor(void *data) + mchp_samples[i] = mchp; + } + +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + if (kthread_should_stop()) + break; + } +@@ -1028,7 +1028,7 @@ static int ips_monitor(void *data) + * us to reduce the sample frequency if the CPU and GPU are idle. + */ + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + last_sample_period = IPS_SAMPLE_PERIOD; + + timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); +diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c +index 2018614f258f..fc19b312c345 100644 +--- a/drivers/rtc/rtc-wm8350.c ++++ b/drivers/rtc/rtc-wm8350.c +@@ -114,7 +114,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); + + if (!retries) { +@@ -197,7 +197,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); + + if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) +@@ -220,7 +220,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) + /* Wait until confirmation */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); + + if (rtc_ctrl & WM8350_RTC_ALMSTS) +diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c +index 03b1805b106c..41ee54ff304a 100644 +--- a/drivers/scsi/fnic/fnic_scsi.c ++++ b/drivers/scsi/fnic/fnic_scsi.c +@@ -217,7 +217,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) + + /* wait for io cmpl */ + while (atomic_read(&fnic->in_flight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + spin_lock_irqsave(&fnic->wq_copy_lock[0], 
flags); + +@@ -2278,7 +2278,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, + } + } + +- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); ++ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); + + /* walk again to check, if IOs are still pending in fw */ + if (fnic_is_abts_pending(fnic, lr_sc)) +diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c +index 983eeb0e3d07..007966930f94 100644 +--- a/drivers/scsi/lpfc/lpfc_scsi.c ++++ b/drivers/scsi/lpfc/lpfc_scsi.c +@@ -5194,7 +5194,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, + tgt_id, lun_id, context); + later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; + while (time_after(later, jiffies) && cnt) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); ++ schedule_msec_hrtimeout_uninterruptible((20)); + cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); + } + if (cnt) { +diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c +index b3650c989ed4..7ed1fb285754 100644 +--- a/drivers/scsi/snic/snic_scsi.c ++++ b/drivers/scsi/snic/snic_scsi.c +@@ -2353,7 +2353,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) + + /* Wait for all the IOs that are entered in Qcmd */ + while (atomic_read(&snic->ios_inflight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + ret = snic_issue_hba_reset(snic, sc); + if (ret) { +diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c +index 9266e13f6271..df5c53216d78 100644 +--- a/drivers/staging/comedi/drivers/ni_mio_common.c ++++ b/drivers/staging/comedi/drivers/ni_mio_common.c +@@ -4748,7 +4748,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) + if ((status & NI67XX_CAL_STATUS_BUSY) == 0) + break; + set_current_state(TASK_INTERRUPTIBLE); +- if (schedule_timeout(1)) ++ if (schedule_min_hrtimeout()) + return -EIO; + } + if (i == timeout) { +diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c +index 898add4d1fc8..0aa9dd467349 100644 +--- a/drivers/staging/rts5208/rtsx.c ++++ b/drivers/staging/rts5208/rtsx.c +@@ -477,7 +477,7 @@ static int rtsx_polling_thread(void *__dev) + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); ++ schedule_msec_hrtimeout((POLLING_INTERVAL)); + + /* lock the device pointers */ + mutex_lock(&dev->dev_mutex); +diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c +index 0433536930a9..d8726f28843f 100644 +--- a/drivers/staging/unisys/visornic/visornic_main.c ++++ b/drivers/staging/unisys/visornic/visornic_main.c +@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + if (atomic_read(&devdata->usage)) + break; +@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, + } + 
set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c +index cfe63932f825..71c00ef772a3 100644 +--- a/drivers/video/fbdev/omap/hwa742.c ++++ b/drivers/video/fbdev/omap/hwa742.c +@@ -913,7 +913,7 @@ static void hwa742_resume(void) + if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(5)); ++ schedule_msec_hrtimeout((5)); + } + hwa742_set_update_mode(hwa742.update_mode_before_suspend); + } +diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c +index f1551e00eb12..f0f651e92504 100644 +--- a/drivers/video/fbdev/pxafb.c ++++ b/drivers/video/fbdev/pxafb.c +@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) + mutex_unlock(&fbi->ctrlr_lock); + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(30)); ++ schedule_msec_hrtimeout((30)); + } + + pr_debug("%s(): task ending\n", __func__); +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index 76d2e43817ea..6ba0604e2162 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -91,7 +91,7 @@ static int caching_kthread(void *data) + btrfs_release_path(path); + root->ino_cache_progress = last; + up_read(&fs_info->commit_root_sem); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + goto again; + } else + continue; +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 617db4e0faa0..f85926764f9a 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/linux/freezer.h b/include/linux/freezer.h +index 27828145ca09..504cc97bf475 100644 +--- a/include/linux/freezer.h ++++ b/include/linux/freezer.h +@@ -311,6 +311,7 @@ static inline void set_freezable(void) {} + #define wait_event_freezekillable_unsafe(wq, condition) \ + wait_event_killable(wq, condition) + ++#define pm_freezing (false) + #endif /* !CONFIG_FREEZER */ + + #endif /* FREEZER_H_INCLUDED */ +diff --git a/include/linux/init_task.h b/include/linux/init_task.h +index 2c620d7ac432..73417df5daa2 100644 +--- a/include/linux/init_task.h ++++ b/include/linux/init_task.h +@@ -36,7 +36,11 @@ extern struct cred init_cred; + #define INIT_PREV_CPUTIME(x) + #endif + ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index e9bfe6972aed..16ba1c7e5bde 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -53,6 +53,8 @@ enum { + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index afe01e232935..139e4535fcc6 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -35,6 +35,10 @@ + #include + #include + ++#ifdef 
CONFIG_SCHED_MUQSS ++#include ++#endif ++ + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; + struct backing_dev_info; +@@ -213,13 +217,40 @@ struct task_group; + + extern void scheduler_tick(void); + +-#define MAX_SCHEDULE_TIMEOUT LONG_MAX +- ++#define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern long schedule_timeout(long timeout); + extern long schedule_timeout_interruptible(long timeout); + extern long schedule_timeout_killable(long timeout); + extern long schedule_timeout_uninterruptible(long timeout); + extern long schedule_timeout_idle(long timeout); ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++extern long schedule_msec_hrtimeout(long timeout); ++extern long schedule_min_hrtimeout(void); ++extern long schedule_msec_hrtimeout_interruptible(long timeout); ++extern long schedule_msec_hrtimeout_uninterruptible(long timeout); ++#else ++static inline long schedule_msec_hrtimeout(long timeout) ++{ ++ return schedule_timeout(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_min_hrtimeout(void) ++{ ++ return schedule_timeout(1); ++} ++ ++static inline long schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); ++} ++#endif ++ + asmlinkage void schedule(void); + extern void schedule_preempt_disabled(void); + asmlinkage void preempt_schedule_irq(void); +@@ -651,8 +682,10 @@ struct task_struct { + unsigned int flags; + unsigned int ptrace; + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) + int on_cpu; ++#endif ++#ifdef CONFIG_SMP + struct __call_single_node wake_entry; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ +@@ -678,10 +711,25 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -863,6 +911,10 @@ struct task_struct { + #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + u64 utimescaled; + u64 stimescaled; ++#endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; + #endif + u64 gtime; + struct prev_cputime prev_cputime; +@@ -1332,6 +1384,40 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ 
printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..73d6319a856a 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -28,7 +28,16 @@ static inline bool dl_time_before(u64 a, u64 b) + #ifdef CONFIG_SMP + + struct root_domain; ++#ifdef CONFIG_SCHED_MUQSS ++static inline void dl_clear_root_domain(struct root_domain *rd) ++{ ++} ++static inline void dl_add_task_root_domain(struct task_struct *p) ++{ ++} ++#else /* CONFIG_SCHED_MUQSS */ + extern void dl_add_task_root_domain(struct task_struct *p); + extern void dl_clear_root_domain(struct root_domain *rd); ++#endif /* CONFIG_SCHED_MUQSS */ + + #endif /* CONFIG_SMP */ +diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +index 6d67e9a5af6b..101fe470aa8f 100644 +--- a/include/linux/sched/nohz.h ++++ b/include/linux/sched/nohz.h +@@ -13,7 +13,7 @@ extern int get_nohz_timer_target(void); + static inline void nohz_balance_enter_idle(int cpu) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_remote(struct rq *rq); + void calc_load_nohz_stop(void); +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..43c9d9e50c09 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..010b2244e0b6 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_MUQSS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h +index a98965007eef..743f67fd012e 100644 +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -93,7 +93,7 @@ int kernel_wait(pid_t pid, int *stat); + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..d4be84ba273b +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ 
valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header */ ++} skiplist; ++ ++void skiplist_init(skiplist_node *slnode); ++skiplist *new_skiplist(skiplist_node *slnode); ++void free_skiplist(skiplist *l); ++void skiplist_node_init(skiplist_node *node); ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); ++void skiplist_delete(skiplist *l, skiplist_node *node); ++ ++static inline bool skiplist_node_empty(skiplist_node *node) { ++ return (!node->next[0]); ++} ++#endif /* _LINUX_SKIP_LISTS_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..f48c5c5da651 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -115,9 +115,16 @@ struct clone_args { + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on MuQSS only */ + #define SCHED_IDLE 5 ++#ifdef CONFIG_SCHED_MUQSS ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#else /* CONFIG_SCHED_MUQSS */ + #define SCHED_DEADLINE 6 ++#endif /* CONFIG_SCHED_MUQSS */ + + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff --git a/init/Kconfig b/init/Kconfig +index d6a0b31b13dc..7e0eb99bd607 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -92,6 +92,18 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config SCHED_MUQSS ++ bool "MuQSS cpu scheduler" ++ select HIGH_RES_TIMERS ++ help ++ The Multiple Queue Skiplist Scheduler for excellent interactivity and ++ responsiveness on the desktop and highly scalable deterministic ++ low latency on any hardware. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -510,6 +522,7 @@ config SCHED_THERMAL_PRESSURE + default y if ARM64 + depends on SMP + depends on CPU_FREQ_THERMAL ++ depends on !SCHED_MUQSS + help + Select this option to enable thermal pressure accounting in the + scheduler. Thermal pressure is the value conveyed to the scheduler +@@ -858,6 +871,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_MUQSS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -942,9 +956,13 @@ menuconfig CGROUP_SCHED + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group +- tasks. ++ tasks. In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. 
+ +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1073,6 +1091,7 @@ config CGROUP_DEVICE + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +@@ -1200,6 +1219,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index f6889fce64af..2557beb609c0 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -75,9 +75,17 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_MUQSS ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO - 20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, ++ .time_slice = 1000000, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -87,6 +95,7 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifndef CONFIG_SCHED_MUQSS + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -94,6 +103,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/init/main.c b/init/main.c +index e880b4ecb314..fe0a705e83f2 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -1421,6 +1421,8 @@ static int __ref kernel_init(void *unused) + + do_sysctl_args(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff --git a/kernel/Kconfig.MuQSS b/kernel/Kconfig.MuQSS +new file mode 100644 +index 000000000000..a6a58781ef91 +--- /dev/null ++++ b/kernel/Kconfig.MuQSS +@@ -0,0 +1,105 @@ ++choice ++ prompt "CPU scheduler runqueue sharing" ++ default RQ_MC if SCHED_MUQSS ++ default RQ_NONE ++ ++config RQ_NONE ++ bool "No sharing" ++ help ++ This is the default behaviour where the CPU scheduler has one runqueue ++ per CPU, whether it is a physical or logical CPU (hyperthread). ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=none ++ ++ If unsure, say N. ++ ++config RQ_SMT ++ bool "SMT (hyperthread) siblings" ++ depends on SCHED_SMT && SCHED_MUQSS ++ ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by SMT (hyperthread) siblings. As these logical cores share ++ one physical core, sharing the runqueue resource can lead to decreased ++ overhead, lower latency and higher throughput. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smt ++ ++ If unsure, say N. ++ ++config RQ_MC ++ bool "Multicore siblings" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by multicore siblings in addition to any SMT siblings. ++ As these physical cores share caches, sharing the runqueue resource ++ will lead to lower latency, but its effects on overhead and throughput ++ are less predictable. 
As a general rule, 6 or fewer cores will likely ++ benefit from this, while larger CPUs will only derive a latency ++ benefit. If your workloads are primarily single threaded, this will ++ possibly worsen throughput. If you are only concerned about latency ++ then enable this regardless of how many cores you have. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=mc ++ ++ If unsure, say Y. ++ ++config RQ_MC_LLC ++ bool "Multicore siblings (LLC)" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will behave similarly as ++ with "Multicore siblings". ++ This option takes LLC cache into account when scheduling tasks. ++ Option may benefit CPUs with multiple LLC caches, such as Ryzen ++ and Xeon CPUs. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=llc ++ ++ If unsure, say N. ++ ++config RQ_SMP ++ bool "Symmetric Multi-Processing" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by all physical CPUs unless they are on separate NUMA nodes. ++ As physical CPUs usually do not share resources, sharing the runqueue ++ will normally worsen throughput but improve latency. If you only ++ care about latency enable this. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smp ++ ++ If unsure, say N. ++ ++config RQ_ALL ++ bool "NUMA" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ regardless of the architecture configuration, including across NUMA ++ nodes. This can substantially decrease throughput in NUMA ++ configurations, but light NUMA designs will not be dramatically ++ affected. This option should only be chosen if latency is the prime ++ concern. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=all ++ ++ If unsure, say N. ++endchoice ++ ++config SHARERQ ++ int ++ default 0 if RQ_NONE ++ default 1 if RQ_SMT ++ default 2 if RQ_MC ++ default 3 if RQ_MC_LLC ++ default 4 if RQ_SMP ++ default 5 if RQ_ALL +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..89ed751ac4e4 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,8 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_100 if SCHED_MUQSS ++ default HZ_250_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -20,11 +21,18 @@ choice + config HZ_100 + bool "100 HZ" + help ++ 100 Hz is a suitable choice in combination with MuQSS which does ++ not rely on ticks for rescheduling interrupts, and is not Hz limited ++ for timeouts and sleeps from both the kernel and userspace. ++ This allows us to benefit from the lower overhead and higher ++ throughput of fewer timer ticks. ++ ++ Non-MuQSS kernels: + 100 Hz is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEF + bool "250 HZ" + help + 250 Hz is a good compromise choice allowing server performance +@@ -32,7 +40,10 @@ choice + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + +- config HZ_300 ++ 250 Hz is the default choice for the mainline scheduler but not ++ advantageous in combination with MuQSS. 
++ ++ config HZ_300_NODEF + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance +@@ -40,7 +51,7 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + +- config HZ_1000 ++ config HZ_1000_NODEF + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +@@ -51,9 +62,9 @@ endchoice + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 +- default 300 if HZ_300 +- default 1000 if HZ_1000 ++ default 250 if HZ_250_NODEF ++ default 300 if HZ_300_NODEF ++ default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index bf82259cff96..d9438eb6f91c 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -2,7 +2,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -18,7 +18,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + depends on !ARCH_NO_PREEMPT + help + This option reduces the latency of the kernel by adding more +@@ -33,7 +33,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). + + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +diff --git a/kernel/Makefile b/kernel/Makefile +index 9a20016d4900..a2640d78eadb 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -10,7 +10,8 @@ obj-y = fork.o exec_domain.o panic.o \ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ + notifier.o ksysfs.o cred.o reboot.o \ +- async.o range.o smpboot.o ucount.o regset.o ++ async.o range.o smpboot.o ucount.o regset.o \ ++ skip_list.o + + obj-$(CONFIG_BPFILTER) += usermode_driver.o + obj-$(CONFIG_MODULES) += kmod.o +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index 733e80f334e7..3f3506c851fd 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -121,7 +121,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -142,7 +142,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig +index 10a5aff4eecc..ce3bcc66b48d 100644 +--- a/kernel/irq/Kconfig ++++ b/kernel/irq/Kconfig +@@ -112,6 +112,23 @@ config 
GENERIC_IRQ_RESERVATION_MODE + config IRQ_FORCED_THREADING + bool + ++config FORCE_IRQ_THREADING ++ bool "Make IRQ threading compulsory" ++ depends on IRQ_FORCED_THREADING ++ default n ++ help ++ ++ Make IRQ threading mandatory for any IRQ handlers that support it ++ instead of being optional and requiring the threadirqs kernel ++ parameter. Instead they can be optionally disabled with the ++ nothreadirqs kernel parameter. ++ ++ Enabling this may make some architectures not boot with runqueue ++ sharing and MuQSS. ++ ++ Enable if you are building for a desktop or low latency system, ++ otherwise say N. ++ + config SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ + help +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 5df903fccb60..17a0dd194582 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -25,9 +25,20 @@ + #include "internals.h" + + #if defined(CONFIG_IRQ_FORCED_THREADING) && !defined(CONFIG_PREEMPT_RT) ++#ifdef CONFIG_FORCE_IRQ_THREADING ++__read_mostly bool force_irqthreads = true; ++#else + __read_mostly bool force_irqthreads; ++#endif + EXPORT_SYMBOL_GPL(force_irqthreads); + ++static int __init setup_noforced_irqthreads(char *arg) ++{ ++ force_irqthreads = false; ++ return 0; ++} ++early_param("nothreadirqs", setup_noforced_irqthreads); ++ + static int __init setup_forced_irqthreads(char *arg) + { + force_irqthreads = true; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 3edaa380dc7b..a1712699726b 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -471,6 +471,34 @@ void kthread_bind(struct task_struct *p, unsigned int cpu) + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ __do_set_cpus_allowed(p, cpumask_of(cpu)); ++ p->flags |= PF_NO_SETAFFINITY; ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++#else ++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) ++#endif ++ + /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). +@@ -491,7 +519,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), + cpu); + if (IS_ERR(p)) + return p; +- kthread_bind(p, cpu); ++ new_kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. 
*/ + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..825f9b8e228f 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -282,7 +282,7 @@ static bool klp_try_switch_task(struct task_struct *task) + { + static char err_buf[STACK_ERR_BUF_SIZE]; + struct rq *rq; +- struct rq_flags flags; ++ struct rq_flags rf; + int ret; + bool success = false; + +@@ -304,7 +304,7 @@ static bool klp_try_switch_task(struct task_struct *task) + * functions. If all goes well, switch the task to the target patch + * state. + */ +- rq = task_rq_lock(task, &flags); ++ rq = task_rq_lock(task, &rf); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, +@@ -323,7 +323,7 @@ static bool klp_try_switch_task(struct task_struct *task) + task->patch_state = klp_target_state; + + done: +- task_rq_unlock(rq, task, &flags); ++ task_rq_unlock(rq, task, &rf); + + /* + * Due to console deadlock issues, pr_debug() can't be used while +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 5fc9c9b70862..1ff14a21193d 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -22,15 +22,23 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++ ++obj-$(CONFIG_SMP) += topology.o ++else + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle.o fair.o rt.o deadline.o + obj-y += wait.o wait_bit.o swait.o completion.o + + obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o ++endif ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +new file mode 100644 +index 000000000000..8da537d5226c +--- /dev/null ++++ b/kernel/sched/MuQSS.c +@@ -0,0 +1,7855 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. 
++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. ++ * 2019-08-31 LLC bits by Eduards Bezverhijs ++ */ ++#define CREATE_TRACE_POINTS ++#include ++#undef CREATE_TRACE_POINTS ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "MuQSS.h" ++#include "smp.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. 
++ */ ++#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) ++#define JIFFY_NS (APPROX_NS_PS / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.204 by Con Kolivas.\n"); ++} ++ ++/* Define RQ share levels */ ++#define RQSHARE_NONE 0 ++#define RQSHARE_SMT 1 ++#define RQSHARE_MC 2 ++#define RQSHARE_MC_LLC 3 ++#define RQSHARE_SMP 4 ++#define RQSHARE_ALL 5 ++ ++/* Define locality levels */ ++#define LOCALITY_SAME 0 ++#define LOCALITY_SMT 1 ++#define LOCALITY_MC_LLC 2 ++#define LOCALITY_MC 3 ++#define LOCALITY_SMP 4 ++#define LOCALITY_DISTANT 5 ++ ++/* ++ * This determines what level of runqueue sharing will be done and is ++ * configurable at boot time with the bootparam rqshare = ++ */ ++static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ ++ ++static int __init set_rqshare(char *str) ++{ ++ if (!strncmp(str, "none", 4)) { ++ rqshare = RQSHARE_NONE; ++ return 0; ++ } ++ if (!strncmp(str, "smt", 3)) { ++ rqshare = RQSHARE_SMT; ++ return 0; ++ } ++ if (!strncmp(str, "mc", 2)) { ++ rqshare = RQSHARE_MC; ++ return 0; ++ } ++ if (!strncmp(str, "llc", 3)) { ++ rqshare = RQSHARE_MC_LLC; ++ return 0; ++ } ++ if (!strncmp(str, "smp", 3)) { ++ rqshare = RQSHARE_SMP; ++ return 0; ++ } ++ if (!strncmp(str, "all", 3)) { ++ rqshare = RQSHARE_ALL; ++ return 0; ++ } ++ return 1; ++} ++__setup("rqshare=", set_rqshare); ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifdef CONFIG_SMP ++/* ++ * Total number of runqueues. Equals number of CPUs when there is no runqueue ++ * sharing but is usually less with SMT/MC sharing of runqueues. 
++ */ ++static int total_runqueues __read_mostly = 1; ++ ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++ ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. ++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static void update_irq_load_avg(struct rq *rq, long delta); ++#else ++static inline void update_irq_load_avg(struct rq *rq, long delta) {} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. ++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if (irq_delta + steal) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta < 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. They're only used by ++ * update_load_avg and time_slice_expired, however deadlines are based on them ++ * across CPUs. Update them whenever we will call one of those functions, and ++ * synchronise them across CPUs whenever we hold both runqueue locks. 
++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff, minndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - rq->last_jiffy; ++ ++ /* Subtract any niffies added by balancing with other rqs */ ++ ndiff -= rq->niffies - rq->last_niffy; ++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; ++ if (minndiff < 0) ++ minndiff = 0; ++ ndiff = max(ndiff, minndiff); ++ rq->niffies += ndiff; ++ rq->last_niffy = rq->niffies; ++ if (jdiff) { ++ rq->last_jiffy += jdiff; ++ rq->last_jiffy_niffies = rq->niffies; ++ } ++} ++ ++/* ++ * Any time we have two runqueues locked we use that as an opportunity to ++ * synchronise niffies to the highest value as idle ticks may have artificially ++ * kept niffies low on one CPU and the truth can only be later. ++ */ ++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) ++{ ++ if (rq1->niffies > rq2->niffies) ++ rq2->niffies = rq1->niffies; ++ else ++ rq1->niffies = rq2->niffies; ++} ++ ++/* ++ * double_rq_lock - safely lock two runqueues ++ * ++ * Note this does not disable interrupts like task_rq_lock, ++ * you need to do so manually before calling. ++ */ ++ ++/* For when we know rq1 != rq2 */ ++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ if (rq1 < rq2) { ++ raw_spin_lock(rq1->lock); ++ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); ++ } else { ++ raw_spin_lock(rq2->lock); ++ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); ++ } ++} ++ ++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ BUG_ON(!irqs_disabled()); ++ if (rq1->lock == rq2->lock) { ++ raw_spin_lock(rq1->lock); ++ __acquire(rq2->lock); /* Fake it out ;) */ ++ } else ++ __double_rq_lock(rq1, rq2); ++ synchronise_niffies(rq1, rq2); ++} ++ ++/* ++ * double_rq_unlock - safely unlock two runqueues ++ * ++ * Note this does not restore interrupts like task_rq_unlock, ++ * you need to do so manually after calling. 
++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(rq1->lock); ++ if (rq1->lock != rq2->lock) ++ raw_spin_unlock(rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(rq->lock))) ++ return false; ++ spin_acquire(&rq->lock->dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(&rq->lock->dep_map, _RET_IP_); ++ do_raw_spin_unlock(rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. 
++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. 
++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies. ++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ long us_interval, load; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += rq_load(rq) * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ update_irq_load_avg(rq, 0); ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++/* ++ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds ++ * here so we scale curload to how long it's been since the last update. ++ */ ++static void update_irq_load_avg(struct rq *rq, long delta) ++{ ++ long us_interval, load; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += NS_TO_US(delta) * SCHED_CAPACITY_SCALE * 5 / 262144; ++ rq->irq_load_avg = load; ++ ++ rq->irq_load_update = rq->niffies; ++} ++#endif ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8. 
++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node->next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) { ++ sched_info_dequeued(rq, p); ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ } ++ rq->nr_running--; ++ if (rt_task(p)) ++ rq->rt_nr_running--; ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(p->sched_contributes_to_load) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++static inline void inc_nr_running(struct rq *rq) ++{ ++ rq->nr_running++; ++ if (trace_sched_update_nr_running_tp_enabled()) { ++ call_trace_sched_update_nr_running(rq, 1); ++ } ++} ++ ++static inline void dec_nr_running(struct rq *rq) ++{ ++ rq->nr_running--; ++ if (trace_sched_update_nr_running_tp_enabled()) { ++ call_trace_sched_update_nr_running(rq, -1); ++ } ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. ++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } else ++ rq->rt_nr_running++; ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. ++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion. 
++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) { ++ sched_info_queued(rq, p); ++ psi_enqueue(p, flags & ENQUEUE_WAKEUP); ++ } ++ ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node->next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ inc_nr_running(rq); ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless. */ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. 
*/ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(p->cpus_ptr, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. ++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE_LLC (2) ++#define CPUIDLE_DIFF_CORE (4) ++#define CPUIDLE_CACHE_BUSY (8) ++#define CPUIDLE_DIFF_CPU (16) ++#define CPUIDLE_THREAD_BUSY (32) ++#define CPUIDLE_DIFF_NODE (64) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. 
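++ * For example, using the flag values above, an SMT sibling with a busy
++ * cache ranks CPUIDLE_DIFF_THREAD | CPUIDLE_CACHE_BUSY = 9, which is
++ * still lower (better) than another physical CPU on the same node with
++ * an idle cache, CPUIDLE_DIFF_CPU = 16, matching the ordering above.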
++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_CORE_LLC | CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > LOCALITY_SMP) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == LOCALITY_MC_LLC) ++ ranking |= CPUIDLE_DIFF_CORE_LLC; ++ else if (locality == LOCALITY_MC) ++ ranking |= CPUIDLE_DIFF_CORE; ++ if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == LOCALITY_SMT) ++ ranking |= CPUIDLE_DIFF_THREAD; ++#endif ++ if (ranking < best_ranking ++#ifdef CONFIG_SCHED_SMT ++ || (ranking == best_ranking && (tmp_rq->siblings_idle(tmp_rq))) ++#endif ++ ) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < LOCALITY_SMP); ++} ++ ++/* As per resched_curr but only will resched idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++DEFINE_PER_CPU(cpumask_t, idlemask); ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t *tmpmask = &(per_cpu(idlemask, cpu)); ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(tmpmask, p->cpus_ptr, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. 
the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ enqueue_task(rq, p, flags); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - If it's running, it's not on the runqueue and we can just ++ * decrement the nr_running. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++ /* deactivate_task is always DEQUEUE_SLEEP in muqss */ ++ psi_dequeue(p, DEQUEUE_SLEEP); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU. */ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, new_cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, new_cpu); ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. */ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off the runqueue and take it to a cpu for it will ++ * become the running task. 
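++ * When p is taken from a different runqueue (stolen by this CPU), its
++ * sched_info accounting is transferred below: dequeued from the old rq
++ * and queued on this one before set_task_cpu() retargets it.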
++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved with as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ struct rq_flags rf; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_rq_lock(p, &rf); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &rf); ++ ++ /* ++ * If it changed from the expected state, bail out now. 
++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. This only happens with the hotplug threads that ++ * bring up the CPUs. 
++ */ ++static inline bool sched_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (likely(cpumask_test_cpu(cpu, p->cpus_ptr))) ++ return false; ++ if (p->nr_cpus_allowed == 1) { ++ cpumask_t valid_mask; ++ ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_online_mask); ++ if (unlikely(cpumask_empty(&valid_mask))) ++ return false; ++ } ++ return true; ++} ++ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ return true; ++} ++ ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, p->cpus_ptr); ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *rq = this_rq->cpu_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) { ++ __schedstat_inc(rq->ttwu_local); ++ } else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(rq->cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ __schedstat_inc(sd->ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. 
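++ * For example, a WF_SYNC wakeup such as a pipe writer waking its reader
++ * just before blocking only nudges a suitable idle CPU via
++ * resched_suitable_idle() below instead of preempting a busy one with
++ * try_preempt().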
++ */ ++ if (wake_flags & WF_SYNC) ++ resched_suitable_idle(p); ++ else ++ try_preempt(p, rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ int en_flags = ENQUEUE_WAKEUP; ++ ++ lockdep_assert_held(rq->lock); ++ ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++#ifdef CONFIG_SMP ++ if (wake_flags & WF_MIGRATED) ++ en_flags |= ENQUEUE_MIGRATED; ++#endif ++ ++ activate_task(rq, p, en_flags); ++ ttwu_do_wakeup(rq, p, wake_flags); ++} ++ ++/* ++ * Consider @p being inside a wait loop: ++ * ++ * for (;;) { ++ * set_current_state(TASK_UNINTERRUPTIBLE); ++ * ++ * if (CONDITION) ++ * break; ++ * ++ * schedule(); ++ * } ++ * __set_current_state(TASK_RUNNING); ++ * ++ * between set_current_state() and schedule(). In this case @p is still ++ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in ++ * an atomic manner. ++ * ++ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq ++ * then schedule() must still happen and p->state can be changed to ++ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we ++ * need to do a full wakeup with enqueue. ++ * ++ * Returns: %true when the wakeup is done, ++ * %false otherwise. ++ */ ++static int ttwu_runnable(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = __task_rq_lock(p, NULL); ++ if (likely(task_on_rq_queued(p))) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_rq_unlock(rq, NULL); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void *arg) ++{ ++ struct llist_node *llist = arg; ++ struct rq *rq = this_rq(); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ /* ++ * rq::ttwu_pending racy indication of out-standing wakeups. ++ * Races such that false-negatives are possible, since they ++ * are shorter lived that false-positives would be. ++ */ ++ WRITE_ONCE(rq->ttwu_pending, 0); ++ ++ rq_lock_irqsave(rq, &rf); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { ++ if (WARN_ON_ONCE(p->on_cpu)) ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) ++ set_task_cpu(p, cpu_of(rq)); ++ ++ ttwu_do_activate(rq, p, 0); ++ } ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void send_call_function_single_ipi(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (!set_nr_if_polling(rq->idle)) ++ arch_send_call_function_single_ipi(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if ++ * necessary. The wakee CPU on receipt of the IPI will queue the task ++ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost ++ * of the wakeup instead of the waker. 
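++ * This path is only taken when the TTWU_QUEUE feature is enabled and
++ * ttwu_queue_cond() decides it is worthwhile: either the waking and
++ * target CPUs do not share a cache, or the task is still descheduling
++ * (WF_ON_CPU) on a CPU that is about to go idle.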
++ */ ++static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ WRITE_ONCE(rq->ttwu_pending, 1); ++ __smp_call_single_queue(cpu, &p->wake_entry.llist); ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ rq_lock_irqsave(rq, &rf); ++ if (likely(is_idle_task(rq->curr))) ++ smp_sched_reschedule(cpu); ++ /* Else cpu is not in idle, do nothing here */ ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static inline bool ttwu_queue_cond(int cpu, int wake_flags) ++{ ++ /* ++ * If the CPU does not share cache, then queue the task on the ++ * remote rqs wakelist to avoid accessing remote data. ++ */ ++ if (!cpus_share_cache(smp_processor_id(), cpu)) ++ return true; ++ ++ /* ++ * If the task is descheduling and the only running task on the ++ * CPU then use the wakelist to offload the task activation to ++ * the soon-to-be-idle CPU as the current CPU is likely busy. ++ * nr_running is checked to avoid unnecessary task stacking. ++ */ ++ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) ++ return true; ++ ++ return false; ++} ++ ++static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { ++ if (WARN_ON_ONCE(cpu == smp_processor_id())) ++ return false; ++ ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ __ttwu_queue_wakelist(p, cpu, wake_flags); ++ return true; ++ } ++ ++ return false; ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ cpumask_t valid_mask; ++ ++ if (p->flags & PF_KTHREAD) ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_all_mask); ++ else ++ cpumask_and(&valid_mask, p->cpus_ptr, cpu_active_mask); ++ ++ if (unlikely(!cpumask_weight(&valid_mask))) { ++ /* We shouldn't be hitting this any more */ ++ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, ++ p->pid, cpumask_weight(p->cpus_ptr)); ++ return cpumask_any(p->cpus_ptr); ++ } ++ return cpumask_any(&valid_mask); ++} ++ ++/* ++ * For a task that's just being woken up we have a valuable balancing ++ * opportunity so choose the nearest cache most lightly loaded runqueue. ++ * Entered with rq locked and returns with the chosen runqueue locked. 
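++ * The search below first tries resched_best_idle() for a suitable idle
++ * CPU; failing that it walks task_rq(p)->cpu_order[] (nearest cache
++ * first) and picks the online runqueue with the lowest rq_load() that
++ * the task is allowed on, ties going to the nearer CPU.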
++ */ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ unsigned int idlest = ~0U; ++ struct rq *rq = NULL; ++ int i; ++ ++ if (suitable_idle_cpus(p)) { ++ int cpu = task_cpu(p); ++ ++ if (unlikely(needs_other_cpu(p, cpu))) ++ cpu = valid_task_cpu(p); ++ rq = resched_best_idle(p, cpu); ++ if (likely(rq)) ++ return rq->cpu; ++ } ++ ++ for (i = 0; i < num_online_cpus(); i++) { ++ struct rq *other_rq = task_rq(p)->cpu_order[i]; ++ int entries; ++ ++ if (!other_rq->online) ++ continue; ++ if (needs_other_cpu(p, other_rq->cpu)) ++ continue; ++ entries = rq_load(other_rq); ++ if (entries >= idlest) ++ continue; ++ idlest = entries; ++ rq = other_rq; ++ } ++ if (unlikely(!rq)) ++ return task_cpu(p); ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++ ++static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ return false; ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ return NULL; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (ttwu_queue_wakelist(p, cpu, wake_flags)) ++ return; ++ ++ rq_lock(rq); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ rq_unlock(rq); ++} ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_runnable()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). ++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with smp_store_mb() ++ * in set_current_state() that the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. 
++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) ++ goto unlock; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ * ++ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure ++ * schedule()'s deactivate_task() has 'happened' and p will no longer ++ * care about it's own p->state. See the comment in __schedule(). ++ */ ++ smp_acquire__after_ctrl_dep(); ++ ++ /* ++ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq ++ * == 0), which means we need to do an enqueue, change p->state to ++ * TASK_WAKING such that we can unlock p->pi_lock before doing the ++ * enqueue, such as ttwu_queue_wakelist(). ++ */ ++ p->state = TASK_WAKING; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, considering queueing p on the remote CPUs wake_list ++ * which potentially sends an IPI instead of spinning on p->on_cpu to ++ * let the waker make forward progress. This is safe because IRQs are ++ * disabled and the IPI will deliver after on_cpu is cleared. ++ * ++ * Ensure we load task_cpu(p) after p->on_cpu: ++ * ++ * set_task_cpu(p, cpu); ++ * STORE p->cpu = @cpu ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock ++ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) ++ * STORE p->on_cpu = 1 LOAD p->cpu ++ * ++ * to ensure we observe the correct CPU on which the task is currently ++ * scheduling. ++ */ ++ if (smp_load_acquire(&p->on_cpu) && ++ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) ++ goto unlock; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. 
++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ cpu = select_best_cpu(p); ++ if (task_cpu(p) != cpu) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++ ++#else ++ cpu = task_cpu(p); ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, task_cpu(p), wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state ++ * @p: Process for which the function is to be invoked. ++ * @func: Function to invoke. ++ * @arg: Argument to function. ++ * ++ * If the specified task can be quickly locked into a definite state ++ * (either sleeping or on a given runqueue), arrange to keep it in that ++ * state while invoking @func(@arg). This function can use ->on_rq and ++ * task_curr() to work out what the state is, if required. Given that ++ * @func can be invoked with a runqueue lock held, it had better be quite ++ * lightweight. ++ * ++ * Returns: ++ * @false if the task slipped out from under the locks. ++ * @true if the task was locked onto a runqueue or is sleeping. ++ * However, @func can override this by returning @false. ++ */ ++bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) ++{ ++ bool ret = false; ++ struct rq *rq; ++ ++ lockdep_assert_irqs_enabled(); ++ raw_spin_lock_irq(&p->pi_lock); ++ if (p->on_rq) { ++ rq = __task_rq_lock(p, NULL); ++ if (task_rq(p) == rq) ++ ret = func(p, arg); ++ rq_unlock(rq); ++ } else { ++ switch (p->state) { ++ case TASK_RUNNING: ++ case TASK_WAKING: ++ break; ++ default: ++ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). ++ if (!p->on_rq) ++ ret = func(p, arg); ++ } ++ } ++ raw_spin_unlock_irq(&p->pi_lock); ++ return ret; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p, struct rq *rq); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++ ++#ifdef CONFIG_SMP ++ p->wake_entry.u_flags = CSD_TYPE_TTWU; ++#endif ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * The process state is set to the same value of the process executing ++ * do_fork() code. That is running. This guarantees that nobody will ++ * actually run it, and a signal or other external event cannot wake ++ * it up and insert it on the runqueue either. 
++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || p-> policy == SCHED_ISO) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rseq_migrate(p); ++ set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++void sched_post_fork(struct task_struct *p) ++{ ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. 
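++ * For example, booting with "schedstats=enable" sets __sched_schedstats
++ * here and init_schedstats() flips the sched_schedstats static key once
++ * jump labels are ready; "schedstats=disable" matches the built-in
++ * default of false.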
++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, ++ size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. 
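++ * For example, a forking parent (not SCHED_FIFO) with 4000us of
++ * timeslice left keeps 2000us and the child starts with the other
++ * 2000us; only if the halved slice falls below RESCHED_US is the parent
++ * rescheduled and the child given a fresh slice and deadline via
++ * time_slice_expired().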
++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. ++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(new_rq, p, 0); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. 
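++ * (presumably because the notifier list is walked with
++ * hlist_for_each_entry() in the fire_sched_*_preempt_notifiers()
++ * helpers, which does not tolerate the entry being removed mid-walk)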
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * This must be the very last reference to @prev from this CPU. After ++ * p->on_cpu is cleared, the task can be moved to a different CPU. We ++ * must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock->dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock->owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock->dep_map, 0, 0, _THIS_IP_); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If prev was marked as migrating to another CPU in return_task, drop ++ * the local runqueue lock but leave interrupts disabled and grab the ++ * remote lock we're migrating it to before enabling them. ++ */ ++ if (unlikely(task_on_rq_migrating(prev))) { ++ sched_info_dequeued(rq, prev); ++ /* ++ * We move the ownership of prev to the new cpu now. ttwu can't ++ * activate prev to the wrong cpu since it has to grab this ++ * runqueue in ttwu_remote. 
++ */ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ prev->cpu = prev->wake_cpu; ++#else ++ task_thread_info(prev)->cpu = prev->wake_cpu; ++#endif ++ raw_spin_unlock(rq->lock); ++ ++ raw_spin_lock(&prev->pi_lock); ++ rq = __task_rq_lock(prev, NULL); ++ /* Check that someone else hasn't already queued prev */ ++ if (likely(!task_queued(prev))) { ++ enqueue_task(rq, prev, 0); ++ prev->on_rq = TASK_ON_RQ_QUEUED; ++ /* Wake up the CPU if it's not already running */ ++ resched_if_idle(rq); ++ } ++ raw_spin_unlock(&prev->pi_lock); ++ } ++#endif ++ rq_unlock(rq); ++ local_irq_enable(); ++} ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static void finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. 
++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq, prev); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++{ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). 
*/ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++static unsigned long nr_uninterruptible(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_uninterruptible; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptible section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ if (rq_load(raw_rq()) == 1) ++ return true; ++ else ++ return false; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int cpu; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += cpu_rq(cpu)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpufreq menu ++ * governor are using nonsensical data. Boosting frequency for a CPU that has ++ * IO-wait which might not even end up running the task when it does become ++ * runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ return atomic_read(&cpu_rq(cpu)->nr_iowait); ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. 
++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long cpu, sum = 0; ++ ++ for_each_possible_cpu(cpu) ++ sum += nr_iowait_cpu(cpu); ++ ++ return sum; ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(void) ++{ ++ long active; ++ ++ if (time_before(jiffies, READ_ONCE(calc_load_update))) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++/** ++ * fixed_power_int - compute: x^n, in O(log n) time ++ * ++ * @x: base of the power ++ * @frac_bits: fractional bits of @x ++ * @n: power to raise @x to. ++ * ++ * By exploiting the relation between the definition of the natural power ++ * function: x^n := x*x*...*x (x multiplied by itself for n times), and ++ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, ++ * (where: n_i \elem {0, 1}, the binary vector representing n), ++ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is ++ * of course trivially computable in O(log_2 n), the length of our binary ++ * vector. ++ */ ++static unsigned long ++fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) ++{ ++ unsigned long result = 1UL << frac_bits; ++ ++ if (n) { ++ for (;;) { ++ if (n & 1) { ++ result *= x; ++ result += 1UL << (frac_bits - 1); ++ result >>= frac_bits; ++ } ++ n >>= 1; ++ if (!n) ++ break; ++ x *= x; ++ x += 1UL << (frac_bits - 1); ++ x >>= frac_bits; ++ } ++ } ++ ++ return result; ++} ++ ++/* ++ * a1 = a0 * e + a * (1 - e) ++ * ++ * a2 = a1 * e + a * (1 - e) ++ * = (a0 * e + a * (1 - e)) * e + a * (1 - e) ++ * = a0 * e^2 + a * (1 - e) * (1 + e) ++ * ++ * a3 = a2 * e + a * (1 - e) ++ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) ++ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) ++ * ++ * ... ++ * ++ * an = a0 * e^n + a * (1 - e) * (1 + e + ... 
+ e^n-1) [1] ++ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) ++ * = a0 * e^n + a * (1 - e^n) ++ * ++ * [1] application of the geometric series: ++ * ++ * n 1 - x^(n+1) ++ * S_n := \Sum x^i = ------------- ++ * i=0 1 - x ++ */ ++unsigned long ++calc_load_n(unsigned long load, unsigned long exp, ++ unsigned long active, unsigned int n) ++{ ++ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++/* ++ * On each tick, add the number of nanoseconds to the unbanked variables and ++ * once one tick's worth has accumulated, account it allowing for accurate ++ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we ++ * deduct nanoseconds. ++ */ ++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_ns += ns; ++ if (rq->iowait_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->iowait_ns); ++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->iowait_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->idle_ns += ns; ++ if (rq->idle_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->idle_ns); ++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->idle_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(idle); ++} ++ ++static void pc_system_time(struct rq *rq, struct task_struct *p, ++ int hardirq_offset, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->stime_ns += ns; ++ if (p->stime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->stime_ns); ++ p->stime_ns %= JIFFY_NS; ++ p->stime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_system_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_ns += ns; ++ if (rq->irq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->irq_ns); ++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->irq_ns %= JIFFY_NS; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->system_ns += ns; ++ if (rq->system_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->system_ns); ++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->system_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->utime_ns += ns; ++ if (p->utime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->utime_ns); ++ p->utime_ns %= JIFFY_NS; ++ p->utime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_user_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if 
(this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. ++ */ ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } ++ ++ if (task_nice(p) > 0 || idleprio_task(p)) { ++ rq->nice_ns += ns; ++ if (rq->nice_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->nice_ns); ++ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->nice_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->user_ns += ns; ++ if (rq->user_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->user_ns); ++ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->user_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++/* ++ * This is called on clock ticks. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate tick timekeeping */ ++ if (user_mode(get_irq_regs())) ++ pc_user_time(rq, p, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { ++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); ++ } else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++ ++ p->last_ran = rq->niffies; ++} ++ ++/* ++ * This is called on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate subtick timekeeping */ ++ if (p != idle) ++ pc_user_time(rq, p, account_ns); ++ else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_rq_lock(p) held. ++ */ ++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_clocks(rq); ++ ns = rq->niffies - p->last_ran; ++ } ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ struct rq_flags rf; ++ struct rq *rq; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimisation chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. 
++ * If we race with it entering CPU, unaccounted time is 0. This is
++ * indistinguishable from the read occurring a few cycles earlier.
++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has
++ * been accounted, so we're correct here as well.
++ */
++ if (!p->on_cpu || !task_on_rq_queued(p))
++ return tsk_seruntime(p);
++#endif
++
++ rq = task_rq_lock(p, &rf);
++ ns = p->sched_time + do_task_delta_exec(p, rq);
++ task_rq_unlock(rq, p, &rf);
++
++ return ns;
++}
++
++/*
++ * Functions to test for when SCHED_ISO tasks have used their allocated
++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All
++ * data is modified only by the local runqueue during scheduler_tick with
++ * interrupts disabled.
++ */
++
++/*
++ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
++ * tasks and set the refractory flag if necessary. There is 10% hysteresis
++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
++ * slow division.
++ */
++static inline void iso_tick(struct rq *rq)
++{
++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
++ rq->iso_ticks += 100;
++ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) {
++ rq->iso_refractory = true;
++ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100))
++ rq->iso_ticks = ISO_PERIOD * 100;
++ }
++}
++
++/* No SCHED_ISO task was running so decrease rq->iso_ticks */
++static inline void no_iso_tick(struct rq *rq, int ticks)
++{
++ if (rq->iso_ticks > 0 || rq->iso_refractory) {
++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
++ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) {
++ rq->iso_refractory = false;
++ if (unlikely(rq->iso_ticks < 0))
++ rq->iso_ticks = 0;
++ }
++ }
++}
++
++/* This manages tasks that have run out of timeslice during a scheduler_tick */
++static void task_running_tick(struct rq *rq)
++{
++ struct task_struct *p = rq->curr;
++
++ /*
++ * If a SCHED_ISO task is running we increment the iso_ticks. In
++ * order to prevent SCHED_ISO tasks from causing starvation in the
++ * presence of true RT tasks we account those as iso_ticks as well.
++ */
++ if (rt_task(p) || task_running_iso(p))
++ iso_tick(rq);
++ else
++ no_iso_tick(rq, 1);
++
++ /* SCHED_FIFO tasks never run out of timeslice. */
++ if (p->policy == SCHED_FIFO)
++ return;
++
++ if (iso_task(p)) {
++ if (task_running_iso(p)) {
++ if (rq->iso_refractory) {
++ /*
++ * SCHED_ISO task is running as RT and limit
++ * has been hit. Force it to reschedule as
++ * SCHED_NORMAL by zeroing its time_slice
++ */
++ p->time_slice = 0;
++ }
++ } else if (!rq->iso_refractory) {
++ /* Can now run again ISO. Reschedule to pick up prio */
++ goto out_resched;
++ }
++ }
++
++ /*
++ * Tasks that were scheduled in the first half of a tick are not
++ * allowed to run into the 2nd half of the next tick if they will
++ * run out of time slice in the interim. Otherwise, if they have
++ * less than RESCHED_US μs of time slice left they will be rescheduled.
++ * Dither is used as a backup for when hrexpiry is disabled or high res
++ * timers not configured in.
++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++out_resched: ++ rq_lock(rq); ++ __set_tsk_resched(p); ++ rq_unlock(rq); ++} ++ ++static inline void task_tick(struct rq *rq) ++{ ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else if (rq->last_jiffy > rq->last_scheduler_tick) ++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * We can stop the timer tick any time highres timers are active since ++ * we rely entirely on highres timeouts for task expiry rescheduling. ++ */ ++static void sched_stop_tick(struct rq *rq, int cpu) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (!tick_nohz_full_enabled()) ++ return; ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++struct tick_work { ++ int cpu; ++ atomic_t state; ++ struct delayed_work work; ++}; ++/* Values for ->state, see diagram below. */ ++#define TICK_SCHED_REMOTE_OFFLINE 0 ++#define TICK_SCHED_REMOTE_OFFLINING 1 ++#define TICK_SCHED_REMOTE_RUNNING 2 ++ ++/* ++ * State diagram for ->state: ++ * ++ * ++ * TICK_SCHED_REMOTE_OFFLINE ++ * | ^ ++ * | | ++ * | | sched_tick_remote() ++ * | | ++ * | | ++ * +--TICK_SCHED_REMOTE_OFFLINING ++ * | ^ ++ * | | ++ * sched_tick_start() | | sched_tick_stop() ++ * | | ++ * V | ++ * TICK_SCHED_REMOTE_RUNNING ++ * ++ * ++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote() ++ * and sched_tick_start() are happy to leave the state in RUNNING. ++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (!tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ rq_lock_irq(rq); ++ if (cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ curr = rq->curr; ++ update_rq_clock(rq); ++ ++ if (!is_idle_task(curr)) { ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ delta = rq_clock_task(rq) - curr->last_ran; ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ } ++ task_tick(rq); ++ ++out_unlock: ++ rq_unlock_irq(rq, NULL); ++ ++out_requeue: ++ ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. 
++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ int os; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ /* There cannot be competing actions, but don't rely on stop-machine. */ ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING); ++ WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING); ++ /* Don't cancel, as this would mess up the state machine. */ ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_stop_tick(struct rq *rq, int cpu) {} ++static inline void sched_start_tick(struct rq *rq, int cpu) {} ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ arch_scale_freq_tick(); ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ task_tick(rq); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ psi_task_tick(rq); ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. 
++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. Make sure update_clocks has been called recently to update ++ * rq->niffies. ++ */ ++static void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = rq->niffies + task_deadline_diff(p); ++#ifdef CONFIG_SMT_NICE ++ if (!p->mm) ++ p->smt_bias = 0; ++ else if (rt_task(p)) ++ p->smt_bias = 1 << 30; ++ else if (task_running_iso(p)) ++ p->smt_bias = 1 << 29; ++ else if (idleprio_task(p)) { ++ if (task_running_idle(p)) ++ p->smt_bias = 0; ++ else ++ p->smt_bias = 1; ++ } else if (--p->smt_bias < 1) ++ p->smt_bias = MAX_PRIO - p->static_prio; ++#endif ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p, rq); ++} ++ ++/* ++ * Task selection with skiplists is a simple matter of picking off the first ++ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) ++ * being bound to the number of processors. ++ * ++ * Runqueues are selectively locked based on their unlocked data and then ++ * unlocked if not needed. At most 3 locks will be held at any time and are ++ * released as soon as they're no longer needed. All balancing between CPUs ++ * is thus done here in an extremely simple first come best fit manner. ++ * ++ * This iterates over runqueues in cache locality order. In interactive mode ++ * it iterates over all CPUs and finds the task with the best key/deadline. ++ * In non-interactive mode it will only take a task if it's from the current ++ * runqueue or a runqueue with more tasks than the current one with a better ++ * key/deadline. 
++ */
++#ifdef CONFIG_SMP
++static inline struct task_struct
++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
++{
++ struct rq *locked = NULL, *chosen = NULL;
++ struct task_struct *edt = idle;
++ int i, best_entries = 0;
++ u64 best_key = ~0ULL;
++
++ for (i = 0; i < total_runqueues; i++) {
++ struct rq *other_rq = rq_order(rq, i);
++ skiplist_node *next;
++ int entries;
++
++ entries = other_rq->sl->entries;
++ /*
++ * Check for queued entries lockless first. The local runqueue
++ * is locked so entries will always be accurate.
++ */
++ if (!sched_interactive) {
++ /*
++ * Don't reschedule balance across nodes unless the CPU
++ * is idle.
++ */
++ if (edt != idle && rq->cpu_locality[other_rq->cpu] > LOCALITY_SMP)
++ break;
++ if (entries <= best_entries)
++ continue;
++ } else if (!entries)
++ continue;
++
++ /* if (i) implies other_rq != rq */
++ if (i) {
++ /* Check for best id queued lockless first */
++ if (other_rq->best_key >= best_key)
++ continue;
++
++ if (unlikely(!trylock_rq(rq, other_rq)))
++ continue;
++
++ /* Need to reevaluate entries after locking */
++ entries = other_rq->sl->entries;
++ if (unlikely(!entries)) {
++ unlock_rq(other_rq);
++ continue;
++ }
++ }
++
++ next = other_rq->node;
++ /*
++ * In interactive mode we check beyond the best entry on other
++ * runqueues if we can't get the best for smt or affinity
++ * reasons.
++ */
++ while ((next = next->next[0]) != other_rq->node) {
++ struct task_struct *p;
++ u64 key = next->key;
++
++ /* Reevaluate key after locking */
++ if (key >= best_key)
++ break;
++
++ p = next->value;
++ if (!smt_schedule(p, rq)) {
++ if (i && !sched_interactive)
++ break;
++ continue;
++ }
++
++ if (sched_other_cpu(p, cpu)) {
++ if (sched_interactive || !i)
++ continue;
++ break;
++ }
++ /* Make sure affinity is ok */
++ if (i) {
++ /* From this point on p is the best so far */
++ if (locked)
++ unlock_rq(locked);
++ chosen = locked = other_rq;
++ }
++ best_entries = entries;
++ best_key = key;
++ edt = p;
++ break;
++ }
++ /* rq->preempting is a hint only as the state may have changed
++ * since it was set with the resched call but if we have met
++ * the condition we can break out here.
*/ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node->next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++ ++ if (task_scs_end_corrupted(prev)) ++ panic("corrupted shadow stack detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. ++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. 
*/ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ unsigned long prev_state; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev, preempt); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(): ++ * ++ * __set_current_state(@state) signal_wake_up() ++ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) ++ * wake_up_state(p, state) ++ * LOCK rq->lock LOCK p->pi_state ++ * smp_mb__after_spinlock() smp_mb__after_spinlock() ++ * if (signal_pending_state()) if (p->state & @state) ++ * ++ * Also, the membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. 
++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq, NULL); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ ++ /* ++ * We must load prev->state once (task_struct::state is volatile), such ++ * that: ++ * ++ * - we form a control dependency vs deactivate_task() below. ++ * - ptrace_{,un}freeze_traced() can change ->state underneath us. ++ */ ++ prev_state = prev->state; ++ if (!preempt && prev_state) { ++ if (signal_pending_state(prev_state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ prev->sched_contributes_to_load = ++ (prev_state & TASK_UNINTERRUPTIBLE) && ++ !(prev_state & TASK_NOLOAD) && ++ !(prev->flags & PF_FROZEN); ++ ++ if (prev->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ /* ++ * __schedule() ttwu() ++ * prev_state = prev->state; if (p->on_rq && ...) ++ * if (prev_state) goto out; ++ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); ++ * p->state = TASK_WAKING ++ * ++ * Where __schedule() and ttwu() have matching control dependencies. ++ * ++ * After this, schedule() must not care about p->state any more. ++ */ ++ deactivate = true; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. ++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev == idle) { ++ inc_nr_running(rq); ++ if (rt_task(next)) ++ rq->rt_nr_running++; ++ } else if (!deactivate) ++ resched_suitable_idle(prev); ++ if (unlikely(next == idle)) { ++ dec_nr_running(rq); ++ if (rt_task(prev)) ++ rq->rt_nr_running--; ++ wake_siblings(rq); ++ } else ++ check_siblings(rq); ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ ++ trace_sched_switch(preempt, prev, next); ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ } else { ++ check_siblings(rq); ++ rq_unlock(rq); ++ local_irq_enable(); ++ } ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(). */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker and because wq_worker_sleeping() ++ * requires it. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static inline void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++ ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. 
Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != IN_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. 
++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio, oldprio; ++ struct rq *rq; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_rq_lock(p, NULL); ++ update_rq_clock(rq); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. 
There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ oldprio = p->prio; ++ p->prio = prio; ++ if (task_running(rq, p)){ ++ if (prio > oldprio) ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (prio < oldprio) ++ try_preempt(p, rq); ++ } ++out_unlock: ++ __task_rq_unlock(rq, NULL); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static, old_static; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_rq_unlock(rq, p, &rf); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. 
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (rq->curr != rq->idle) ++ return 0; ++ ++ if (rq->nr_running) ++ return 0; ++ ++#ifdef CONFIG_SMP ++ if (rq->ttwu_pending) ++ return 0; ++#endif ++ ++ return 1; ++} ++ ++/** ++ * available_idle_cpu - is a given CPU idle for enqueuing work. ++ * @cpu: the CPU in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int available_idle_cpu(int cpu) ++{ ++ if (!idle_cpu(cpu)) ++ return 0; ++ ++ if (vcpu_is_preempted(cpu)) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, const struct sched_attr *attr, ++ bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ /* ++ * If params can't change scheduling class changes aren't allowed ++ * either. ++ */ ++ if (attr->sched_flags & SCHED_FLAG_KEEP_PARAMS) ++ return; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; ++ unsigned long rlim_rtprio = 0; ++ struct rq_flags rf; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ priority = 0; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. 
++ */ ++ if (priority < 0 || ++ (p->mm && priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (priority > p->rt_priority && ++ priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy != SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag: */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ * ++ * To be able to change p->policy safely, the runqueue lock must be ++ * held. ++ */ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea: ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ priority == p->rt_priority))) { ++ retval = 0; ++ goto unlock; ++ } ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ __setscheduler(p, rq, policy, priority, attr, pi); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ task_rq_unlock(rq, p, &rf); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ preempt_enable(); ++out: ++ return 0; ++ ++unlock: ++ task_rq_unlock(rq, p, &rf); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Use sched_set_fifo(), read its comment. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. 
++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++ ++/* ++ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally ++ * incapable of resource management, which is the one thing an OS really should ++ * be doing. ++ * ++ * This is of course the reason it is limited to privileged users only. ++ * ++ * Worse still; it is fundamentally impossible to compose static priority ++ * workloads. You cannot take two correctly working static prio workloads ++ * and smash them together and still expect them to work. ++ * ++ * For this reason 'all' FIFO tasks the kernel creates are basically at: ++ * ++ * MAX_RT_PRIO / 2 ++ * ++ * The administrator _MUST_ configure the system, the kernel simply doesn't ++ * know enough information to make a sensible choice. ++ */ ++void sched_set_fifo(struct task_struct *p) ++{ ++ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_fifo); ++ ++/* ++ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. ++ */ ++void sched_set_fifo_low(struct task_struct *p) ++{ ++ struct sched_param sp = { .sched_priority = 1 }; ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_fifo_low); ++ ++void sched_set_normal(struct task_struct *p, int nice) ++{ ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ .sched_nice = nice, ++ }; ++ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_normal); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). 
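/*
 * Editor's illustrative sketch, not part of the patch: in-kernel callers
 * are meant to use the sched_set_fifo()/sched_set_normal() helpers above
 * rather than inventing their own FIFO priorities. A hypothetical driver
 * thread (all names are made up for the example):
 */
#include <linux/err.h>
#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static struct task_struct *demo_task;

static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

static int demo_start(void)
{
	demo_task = kthread_run(demo_thread_fn, NULL, "demo_rt");
	if (IS_ERR(demo_task))
		return PTR_ERR(demo_task);

	/* Mid-range FIFO priority picked by the scheduler, not by us. */
	sched_set_fifo(demo_task);
	return 0;
}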
++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) ++ attr.sched_policy = SETPARAM_POLICY; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setattr(p, &attr); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. 
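/*
 * Editor's illustrative sketch, not part of the patch: there is no glibc
 * wrapper for sched_setattr(), so the extended interface that
 * sched_copy_attr() parses above is reached with syscall(2). The struct
 * below mirrors the VER0 layout (48 bytes); the demo values are arbitrary.
 */
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

struct sched_attr_v0 {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

int main(void)
{
	struct sched_attr_v0 attr = {
		.size         = sizeof(attr),	/* used for fwd/bwd ABI compat */
		.sched_policy = 0,		/* SCHED_NORMAL */
		.sched_nice   = 5,		/* clamped to [-20, 19] by sched_copy_attr() */
	};

	/* pid 0 targets the calling task; flags must be 0 or the kernel
	 * returns -EINVAL (see sys_sched_setattr() above). */
	if (syscall(SYS_sched_setattr, 0, &attr, 0) == -1)
		perror("sched_setattr");
	else
		printf("sched_setattr() succeeded\n");
	return 0;
}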
++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. ++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. 
++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (rt_task(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. 
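/*
 * Editor's illustrative sketch, not part of the patch: glibc's
 * CPU_SET()/sched_setaffinity() interface ends up in the code above;
 * get_user_cpu_mask() copies the bitmask in and sched_setaffinity()
 * applies it via __set_cpus_allowed_ptr(). Pinning the calling thread
 * to CPU 0:
 */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);

	/* pid 0 means "the calling thread" */
	if (sched_setaffinity(0, sizeof(set), &set) == -1) {
		perror("sched_setaffinity");
		return 1;
	}
	printf("now restricted to CPU 0\n");
	return 0;
}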
++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min(len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ return; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ rq_unlock(rq); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). 
++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ struct task_struct *rq_p; ++ struct rq *rq, *p_rq; ++ unsigned long flags; ++ int yielded = 0; ++ ++ local_irq_save(flags); ++ rq = this_rq(); ++ ++again: ++ p_rq = task_rq(p); ++ /* ++ * If we're the only runnable task on the rq and target rq also ++ * has only one task, there's absolutely no point in yielding. ++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. 
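/*
 * Editor's illustrative sketch, not part of the patch: the wait_event()
 * pattern that the yield() comment above recommends in place of a
 * "while (!event) yield();" polling loop. All names are hypothetical.
 */
#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static bool demo_event;

static void demo_consumer(void)
{
	/* Sleeps until demo_event is true; no busy yielding involved. */
	wait_event(demo_wq, demo_event);
}

static void demo_producer(void)
{
	demo_event = true;
	wake_up(&demo_wq);	/* waiter re-checks the condition and runs */
}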
++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ struct rq_flags rf; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &rf); ++ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &rf); ++ ++ rcu_read_unlock(); ++ *t = ns_to_timespec64(time_slice); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. 
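/*
 * Editor's illustrative sketch, not part of the patch: querying the
 * ranges returned by the two syscalls above. With this scheduler,
 * SCHED_FIFO/SCHED_RR report 1..MAX_USER_RT_PRIO-1 and the normal
 * policies (including SCHED_ISO and SCHED_IDLEPRIO) report 0.
 */
#include <sched.h>
#include <stdio.h>

int main(void)
{
	printf("SCHED_FIFO  priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_FIFO),
	       sched_get_priority_max(SCHED_FIFO));
	printf("SCHED_OTHER priority range: %d..%d\n",
	       sched_get_priority_min(SCHED_OTHER),
	       sched_get_priority_max(SCHED_OTHER));
	return 0;
}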
++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", ++ free, task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL, KERN_INFO); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. 
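/*
 * Editor's illustrative sketch, not part of the patch: under this
 * scheduler sched_rr_get_interval() reports 0 (documented above as
 * infinity) for SCHED_FIFO tasks and the MuQSS timeslice for everything
 * else, as computed by the helper above.
 */
#include <sched.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (sched_rr_get_interval(0, &ts) == 0)
		printf("timeslice: %ld.%09ld s\n",
		       (long)ts.tv_sec, ts.tv_nsec);
	else
		perror("sched_rr_get_interval");
	return 0;
}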
++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p, NULL); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq, NULL); ++ } ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ idle->flags |= PF_IDLE; ++ ++ scs_task_reset(idle); ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++ rq_lock_irqsave(rq, &rf); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(rq); ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++void nohz_balance_enter_idle(int cpu) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). 
++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(), default_cpu = -1; ++ struct sched_domain *sd; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { ++ if (!idle_cpu(cpu)) ++ return cpu; ++ default_cpu = cpu; ++ } ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu_and(i, sched_domain_span(sd), ++ housekeeping_cpumask(HK_FLAG_TIMER)) { ++ if (cpu == i) ++ continue; ++ ++ if (!idle_cpu(i)) { ++ cpu = i; ++ goto unlock; ++ } ++ } ++ } ++ ++ if (default_cpu == -1) ++ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ cpu = default_cpu; ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. */ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Wake up the specified CPU. If the CPU is going offline, it is the ++ * caller's responsibility to deal with the lost wakeup, for example, ++ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. ++ */ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ bool queued = false, running_wrong = false, kthread; ++ unsigned int dest_cpu; ++ struct rq_flags rf; ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = task_rq_lock(p, &rf); ++ update_rq_clock(rq); ++ ++ kthread = !!(p->flags & PF_KTHREAD); ++ if (kthread) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. 
++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ /* ++ * Picking a ~random cpu helps in cases where we are changing affinity ++ * for groups of tasks (ie. cpuset), so that load balancing is not ++ * immediately required to distribute the tasks within their new mask. ++ */ ++ dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (kthread) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(rq, p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ set_task_cpu(p, dest_cpu); ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else { ++ if (queued) { ++ /* ++ * Switch runqueue locks after dequeueing the task ++ * here while still holding the pi_lock to be holding ++ * the correct lock for enqueueing. ++ */ ++ dequeue_task(rq, p, 0); ++ rq_unlock(rq); ++ ++ rq = cpu_rq(dest_cpu); ++ rq_lock(rq); ++ } ++ set_task_cpu(p, dest_cpu); ++ if (queued) ++ enqueue_task(rq, p, 0); ++ } ++ if (queued) ++ try_preempt(p, rq); ++ if (running_wrong) ++ preempt_disable(); ++out: ++ task_rq_unlock(rq, p, &rf); ++ ++ if (running_wrong) { ++ __schedule(true); ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Run through task list and find tasks affined to the dead cpu, then remove ++ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold ++ * cpu 0 and src_cpu's runqueue locks. We should be holding both rq lock and ++ * pi_lock to change cpus_mask but it's not going to matter here. ++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, p->cpus_ptr)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_mask); ++ atomic_set_cpu(0, &p->cpus_mask); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "MuQSS removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. 
*/ ++static void unbind_zero(int src_cpu) ++{ ++ int unbound = 0, zerobound = 0; ++ struct task_struct *p, *t; ++ ++ if (src_cpu == 0) ++ return; ++ ++ do_each_thread(t, p) { ++ if (!p->mm) ++ p->zerobound = false; ++ if (p->zerobound) { ++ unbound++; ++ cpumask_set_cpu(src_cpu, &p->cpus_mask); ++ /* Once every CPU affinity has been re-enabled, remove ++ * the zerobound flag */ ++ if (cpumask_subset(cpu_possible_mask, p->cpus_ptr)) { ++ p->zerobound = false; ++ zerobound++; ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (unbound) { ++ printk(KERN_INFO "MuQSS added affinity for %d processes to cpu %d\n", ++ unbound, src_cpu); ++ } ++ if (zerobound) { ++ printk(KERN_INFO "MuQSS released forced binding to cpu0 for %d processes\n", ++ zerobound); ++ } ++} ++ ++/* ++ * Ensure that the idle task is using init_mm right before its cpu goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ BUG_ON(current != this_rq()->idle); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ finish_arch_post_lock_switch(); ++ } ++ ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++static void unbind_zero(int src_cpu) {} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. 
++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(9); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0644, proc_dointvec_minmax); ++ set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax); ++ set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring); ++ /* &table[8] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header *sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ ++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = 
register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ struct rq_flags rf; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ rq_lock_irqsave(rq, &rf); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &rf); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. 
++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_dec_cpuslocked(&sched_smt_present); ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_tick_stop(cpu); ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. ++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++/* MC siblings CPU mask which share the same LLC */ ++static const cpumask_t *llc_core_cpumask(int cpu) ++{ ++#ifdef CONFIG_X86 ++ return per_cpu(cpu_llc_shared_map, cpu); ++#else ++ return topology_core_cpumask(cpu); ++#endif ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++/* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. 
++ */ ++static void __init select_leaders(void) ++{ ++ struct rq *rq, *other_rq, *leader; ++ struct sched_domain *sd; ++ int cpu, other_cpu; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ ++ for (cpu = 0; cpu < num_online_cpus(); cpu++) { ++ rq = cpu_rq(cpu); ++ leader = NULL; ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ if (rqshare != RQSHARE_ALL) ++ leader = NULL; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rqshare >= RQSHARE_SMP) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smp_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->smp_leader) ++ other_rq->smp_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMP) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMP; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. ++ */ ++#ifdef CONFIG_SCHED_MC ++ leader = NULL; ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, llc_core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rqshare == RQSHARE_MC || ++ (rqshare == RQSHARE_MC_LLC && cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu)))) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the mc_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->mc_leader) ++ other_rq->mc_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_MC) { ++ /* this is to get LLC into play even in case LLC sharing is not used */ ++ if (cpumask_test_cpu(other_cpu, llc_core_cpumask(cpu))) ++ rq->cpu_locality[other_cpu] = LOCALITY_MC_LLC; ++ else ++ rq->cpu_locality[other_cpu] = LOCALITY_MC; ++ } ++ } ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ leader = NULL; ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) { ++ if (rqshare == RQSHARE_SMT) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smt_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ if (!other_rq->smt_leader) ++ other_rq->smt_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > LOCALITY_SMT) ++ rq->cpu_locality[other_cpu] = LOCALITY_SMT; ++ } ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for_each_online_cpu(other_cpu) { ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++} ++ ++/* FIXME freeing locked spinlock */ ++static void __init share_and_free_rq(struct rq *leader, struct rq *rq) ++{ ++ WARN_ON(rq->nr_running > 0); ++ ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ rq->is_leader = false; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++} ++ ++static void __init share_rqs(void) ++{ ++ struct rq *rq, *leader; ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smp_leader; ++ ++ 
rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMP runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++ ++#ifdef CONFIG_SCHED_MC ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->mc_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing MC runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_MC */ ++ ++#ifdef CONFIG_SCHED_SMT ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smt_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "MuQSS sharing SMT runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ share_and_free_rq(leader, rq); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_SMT */ ++} ++ ++static void __init setup_rq_orders(void) ++{ ++ int *selected_cpus, *ordered_cpus; ++ struct rq *rq, *other_rq; ++ int cpu, other_cpu, i; ++ ++ selected_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); ++ ordered_cpus = kmalloc(sizeof(int) * NR_CPUS, GFP_ATOMIC); ++ ++ total_runqueues = 0; ++ for_each_online_cpu(cpu) { ++ int locality, total_rqs = 0, total_cpus = 0; ++ ++ rq = cpu_rq(cpu); ++ if (rq->is_leader) ++ total_runqueues++; ++ ++ for (locality = LOCALITY_SAME; locality <= LOCALITY_DISTANT; locality++) { ++ int selected_cpu_cnt, selected_cpu_idx, test_cpu_idx, cpu_idx, best_locality, test_cpu; ++ int ordered_cpus_idx; ++ ++ ordered_cpus_idx = -1; ++ selected_cpu_cnt = 0; ++ ++ for_each_online_cpu(test_cpu) { ++ if (cpu < num_online_cpus() / 2) ++ other_cpu = cpu + test_cpu; ++ else ++ other_cpu = cpu - test_cpu; ++ if (other_cpu < 0) ++ other_cpu += num_online_cpus(); ++ else ++ other_cpu %= num_online_cpus(); ++ /* gather CPUs of the same locality */ ++ if (rq->cpu_locality[other_cpu] == locality) { ++ selected_cpus[selected_cpu_cnt] = other_cpu; ++ selected_cpu_cnt++; ++ } ++ } ++ ++ /* reserve first CPU as starting point */ ++ if (selected_cpu_cnt > 0) { ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[ordered_cpus_idx]; ++ selected_cpus[ordered_cpus_idx] = -1; ++ } ++ ++ /* take each CPU and sort it within the same locality based on each inter-CPU localities */ ++ for (test_cpu_idx = 1; test_cpu_idx < selected_cpu_cnt; test_cpu_idx++) { ++ /* starting point with worst locality and current CPU */ ++ best_locality = LOCALITY_DISTANT; ++ selected_cpu_idx = test_cpu_idx; ++ ++ /* try to find the best locality within group */ ++ for (cpu_idx = 1; cpu_idx < selected_cpu_cnt; cpu_idx++) { ++ /* if CPU has not been used and locality is better */ ++ if (selected_cpus[cpu_idx] > -1) { ++ other_rq = cpu_rq(ordered_cpus[ordered_cpus_idx]); ++ if (best_locality > other_rq->cpu_locality[selected_cpus[cpu_idx]]) { ++ /* assign best locality and best CPU idx in array */ ++ best_locality = other_rq->cpu_locality[selected_cpus[cpu_idx]]; ++ selected_cpu_idx = cpu_idx; ++ } ++ } ++ } ++ ++ /* add our next best CPU to ordered list */ ++ ordered_cpus_idx++; ++ ordered_cpus[ordered_cpus_idx] = selected_cpus[selected_cpu_idx]; ++ /* mark this CPU as used */ ++ selected_cpus[selected_cpu_idx] = -1; ++ } ++ ++ /* set up RQ and CPU orders */ ++ for (test_cpu = 0; test_cpu <= ordered_cpus_idx; test_cpu++) { ++ other_rq = cpu_rq(ordered_cpus[test_cpu]); ++ /* set up cpu orders */ ++ rq->cpu_order[total_cpus++] = other_rq; ++ if (other_rq->is_leader) { ++ /* set 
up RQ orders */ ++ rq->rq_order[total_rqs++] = other_rq; ++ } ++ } ++ } ++ } ++ ++ kfree(selected_cpus); ++ kfree(ordered_cpus); ++ ++#ifdef CONFIG_X86 ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < total_runqueues; i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d RQ order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->rq_order[i]->cpu, per_cpu(cpu_llc_id, rq->rq_order[i]->cpu)); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < num_online_cpus(); i++) { ++ printk(KERN_DEBUG "MuQSS CPU %d llc %d CPU order %d RQ %d llc %d\n", cpu, per_cpu(cpu_llc_id, cpu), i, ++ rq->cpu_order[i]->cpu, per_cpu(cpu_llc_id, rq->cpu_order[i]->cpu)); ++ } ++ } ++#endif ++} ++ ++void __init sched_init_smp(void) ++{ ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. ++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ local_irq_disable(); ++ mutex_lock(&sched_domains_mutex); ++ lock_all_rqs(); ++ ++ printk(KERN_INFO "MuQSS possible/present/online CPUs: %d/%d/%d\n", ++ num_possible_cpus(), num_present_cpus(), num_online_cpus()); ++ ++ select_leaders(); ++ ++ unlock_all_rqs(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ share_rqs(); ++ ++ local_irq_enable(); ++ ++ setup_rq_orders(); ++ ++ switch (rqshare) { ++ case RQSHARE_ALL: ++ /* This should only ever read 1 */ ++ printk(KERN_INFO "MuQSS runqueue share type ALL total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMP: ++ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC: ++ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC_LLC: ++ printk(KERN_INFO "MuQSS runqueue share type LLC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMT: ++ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_NONE: ++ printk(KERN_INFO "MuQSS runqueue share type NONE total runqueues: %d\n", ++ total_runqueues); ++ break; ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. 
++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); ++ skiplist_init(rq->node); ++ rq->sl = new_skiplist(rq->node); ++ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); ++ raw_spin_lock_init(rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->is_leader = true; ++ rq->smp_leader = NULL; ++#ifdef CONFIG_SCHED_MC ++ rq->mc_leader = NULL; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ rq->smt_leader = NULL; ++#endif ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. ++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = LOCALITY_SAME; ++ else ++ rq->cpu_locality[j] = LOCALITY_DISTANT; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq->cpu_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". 
++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct sched_attr attr = {}; ++ struct task_struct *g, *p; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & 
PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &rf); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, &attr, false); ++ task_rq_unlock(rq, p, &rf); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ seq_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void call_trace_sched_update_nr_running(struct rq *rq, int count) ++{ ++ trace_sched_update_nr_running_tp(rq, count); ++} ++ ++/* CFS Compat */ ++#ifdef CONFIG_RCU_TORTURE_TEST ++int sysctl_sched_rt_runtime; ++#endif +diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +new file mode 100644 +index 000000000000..09a1f2fe64ba +--- /dev/null ++++ b/kernel/sched/MuQSS.h +@@ -0,0 +1,1070 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef CONFIG_PARAVIRT ++#include ++#endif ++ ++#include "cpupri.h" ++ ++#include ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ 
++#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++extern void call_trace_sched_update_nr_running(struct rq *rq, int count); ++ ++struct rq; ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++struct perf_domain { ++ struct em_perf_domain *em_pd; ++ struct perf_domain *next; ++ struct rcu_head rcu; ++}; ++ ++/* Scheduling group status flags */ ++#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ ++#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. ++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* ++ * Indicate pullable load on at least one CPU, e.g: ++ * - More than one runnable task ++ * - Running task is misfit ++ */ ++ int overload; ++ ++ /* Indicate one or more cpus over-utilized (tipping point) */ ++ int overutilized; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). ++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++ ++ /* ++ * NULL-terminated list of performance domains intersecting with the ++ * CPUs of the rd. Protected by RCU. ++ */ ++ struct perf_domain *pd; ++}; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ raw_spinlock_t *lock; ++ raw_spinlock_t *orig_lock; ++ ++ struct task_struct __rcu *curr; ++ struct task_struct *idle; ++ struct task_struct *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. 
Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++#ifdef CONFIG_SMP ++ unsigned int ttwu_pending; ++#endif ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ u64 irq_load_update; /* When we last updated IRQ load */ ++ unsigned long irq_load_avg; /* Rolling IRQ load average */ ++#endif ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++ skiplist_node *node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ ++ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ ++ ++ bool is_leader; ++ struct rq *smp_leader; /* First physical CPU per node */ ++#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++ struct sched_avg avg_thermal; ++#endif /* CONFIG_SCHED_THERMAL_PRESSURE */ ++#ifdef CONFIG_SCHED_SMT ++ struct rq *smt_leader; /* First logical CPU in SMT siblings */ ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ struct rq *mc_leader; /* First logical CPU in MC siblings */ ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ /* Ensure that all clocks are in the same cache line */ ++ u64 clock_task ____cacheline_aligned; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++ int rt_nr_running; /* Number real time tasks running */ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock_task; ++} ++ ++/** ++ * By default the decay is the default pelt decay period. ++ * The decay shift can change the decay period in ++ * multiples of 32. ++ * Decay shift Decay period(ms) ++ * 0 32 ++ * 1 64 ++ * 2 128 ++ * 3 256 ++ * 4 512 ++ */ ++extern int sched_thermal_decay_shift; ++ ++static inline u64 rq_clock_thermal(struct rq *rq) ++{ ++ return rq_clock_task(rq) >> sched_thermal_decay_shift; ++} ++ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq, struct rq_flags __always_unused *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(rq->lock, rf->flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(rq->lock, rf->flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ } ++ return rq; ++} ++ ++static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, 
struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags __always_unused *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags __always_unused *rf) ++{ ++ rq_unlock(rq); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ return rq; ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. ++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. ++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_WAKEUP 0x01 ++#define ENQUEUE_RESTORE 0x02 ++ ++#ifdef CONFIG_SMP ++#define ENQUEUE_MIGRATED 0x40 ++#else ++#define ENQUEUE_MIGRATED 0x00 ++#endif ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See destroy_sched_domains: call_rcu for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. ++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. 
++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_packing); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym_cpucapacity); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. ++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long max_capacity; /* Max per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. (Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. 
++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void flush_smp_call_function_from_idle(void); ++ ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_rq_online (struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void flush_smp_call_function_from_idle(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. 
++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++static inline bool sched_stop_runnable(struct rq *rq) ++{ ++ return rq->stop && task_on_rq_queued(rq->stop); ++} ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++static __always_inline ++unsigned int uclamp_rq_util_with(struct rq __maybe_unused *rq, unsigned int util, ++ struct task_struct __maybe_unused *p) ++{ ++ return util; ++} ++ ++static inline bool uclamp_is_used(void) ++{ ++ return false; ++} ++ ++#ifndef arch_scale_freq_tick ++static __always_inline ++void arch_scale_freq_tick(void) ++{ ++} ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ struct rq_flags rf; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++/** ++ * arch_scale_freq_capacity - get the frequency scale factor of a given CPU. ++ * @cpu: the CPU in question. ++ * ++ * Return: the frequency scale factor normalized against SCHED_CAPACITY_SCALE, i.e. ++ * ++ * f_curr ++ * ------ * SCHED_CAPACITY_SCALE ++ * f_max ++ */ ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern bool sched_can_stop_tick(struct rq *rq); ++extern int __init sched_tick_offload_init(void); ++ ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out of ++ * nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (sched_can_stop_tick(rq)) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++static inline bool rt_rq_is_runnable(struct rq *rt_rq) ++{ ++ return rt_rq->rt_nr_running; ++} ++ ++/** ++ * enum schedutil_type - CPU utilization type ++ * @FREQUENCY_UTIL: Utilization used to select frequency ++ * @ENERGY_UTIL: Utilization used during energy calculation ++ * ++ * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time ++ * need to be aggregated differently depending on the usage made of them. This ++ * enum is used within schedutil_freq_util() to differentiate the types of ++ * utilization expected by the callers, and adjust the aggregation accordingly. ++ */ ++enum schedutil_type { ++ FREQUENCY_UTIL, ++ ENERGY_UTIL, ++}; ++ ++#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL ++ ++unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long max, enum schedutil_type type, ++ struct task_struct *p); ++ ++static inline unsigned long cpu_bw_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_cfs(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline unsigned long cpu_util_rt(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->rt_nr_running); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->irq_load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ util *= (max - irq); ++ util /= max; ++ ++ return util; ++ ++} ++#else ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ return util; ++} ++#endif ++#endif ++ ++#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) ++#define perf_domain_span(pd) (to_cpumask(((pd)->em_pd->cpus))) ++ ++DECLARE_STATIC_KEY_FALSE(sched_energy_present); ++ ++static inline bool sched_energy_enabled(void) ++{ ++ return static_branch_unlikely(&sched_energy_present); ++} ++ ++#else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ ++ ++#define perf_domain_span(pd) NULL ++static inline bool sched_energy_enabled(void) { return false; } ++ ++#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. 
++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ if (!(p->flags & PF_KTHREAD)) ++ return false; ++ ++ if (p->nr_cpus_allowed != 1) ++ return false; ++ ++ return true; ++} ++#endif ++ ++void swake_up_all_locked(struct swait_queue_head *q); ++void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++/* pelt.h compat CONFIG_SCHED_THERMAL_PRESSURE impossible with MUQSS */ ++static inline int ++update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) ++{ ++ return 0; ++} ++ ++static inline u64 thermal_load_avg(struct rq *rq) ++{ ++ return 0; ++} ++ ++#ifdef CONFIG_RCU_TORTURE_TEST ++extern int sysctl_sched_rt_runtime; ++#endif ++ ++#endif /* MUQSS_SCHED_H */ +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index e39008242cf4..146a3dfe626f 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,12 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifdef CONFIG_SCHED_MUQSS ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) ++#else ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) ++#endif ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. 
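The hunk above introduces a small compatibility shim and the hunk below switches the call site over to it: schedutil only needs to know whether any realtime tasks are runnable, but MuQSS keeps that count directly on the runqueue (rq->rt_nr_running) while mainline CFS keeps it in the embedded rt runqueue (rq->rt.rt_nr_running). rt_rq_runnable() hides that layout difference at compile time; note that both expansions ignore the rq_rt parameter and simply rely on a local variable named rq existing at the call site. The stand-alone sketch below is plain user-space C with hypothetical names (it is not kernel code); it only illustrates the same compile-time layout-shim pattern:

/* Illustrative only: a compile-time layout shim in the style of the
 * rt_rq_runnable() macro above. All names here are hypothetical. */
#include <stdio.h>

/* Build with -DUSE_FLAT_LAYOUT to mimic the MuQSS-style flat counter. */
#ifdef USE_FLAT_LAYOUT
struct runqueue {
        int rt_nr_running;              /* counter lives directly on the rq */
};
#define rt_runnable(rq) ((rq)->rt_nr_running > 0)
#else
struct rt_group {
        int rt_nr_running;
};
struct runqueue {
        struct rt_group rt;             /* counter lives in an embedded struct */
};
#define rt_runnable(rq) ((rq)->rt.rt_nr_running > 0)
#endif

/* Shared code compiles unchanged against either layout. */
static int pick_max_freq(const struct runqueue *rq)
{
        return rt_runnable(rq) ? 1 : 0; /* 1: request maximum frequency */
}

int main(void)
{
        struct runqueue rq = { 0 };
        printf("rt runnable -> max freq? %d\n", pick_max_freq(&rq));
        return 0;
}

Building the sketch with and without -DUSE_FLAT_LAYOUT compiles the same pick_max_freq() against either layout, which is the property the patch relies on so that cpufreq_schedutil.c can be shared between MuQSS and CFS builds.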
+@@ -211,7 +217,7 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + struct rq *rq = cpu_rq(cpu); + + if (!uclamp_is_used() && +- type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { ++ type == FREQUENCY_UTIL && rt_rq_runnable(rq)) { + return max; + } + +@@ -656,7 +662,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + struct task_struct *thread; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), ++#ifdef CONFIG_SCHED_MUQSS ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, ++#endif + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, +diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h +index efbb492bb94c..f0288c32ab17 100644 +--- a/kernel/sched/cpupri.h ++++ b/kernel/sched/cpupri.h +@@ -17,6 +17,7 @@ struct cpupri { + int *cpu_to_pri; + }; + ++#ifndef CONFIG_SCHED_MUQSS + #ifdef CONFIG_SMP + int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask); +@@ -27,3 +28,4 @@ void cpupri_set(struct cpupri *cp, int cpu, int pri); + int cpupri_init(struct cpupri *cp); + void cpupri_cleanup(struct cpupri *cp); + #endif ++#endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 5a55d2300452..283a580754a7 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -266,26 +266,6 @@ static inline u64 account_other_time(u64 max) + return accounted; + } + +-#ifdef CONFIG_64BIT +-static inline u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- return t->se.sum_exec_runtime; +-} +-#else +-static u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- u64 ns; +- struct rq_flags rf; +- struct rq *rq; +- +- rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; +- task_rq_unlock(rq, t, &rf); +- +- return ns; +-} +-#endif +- + /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. +@@ -614,7 +594,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f324dc36fc43..43ca13ed9ab0 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -369,6 +369,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_MUQSS + /* + * idle-task scheduling class. 
+ */ +@@ -482,3 +483,4 @@ const struct sched_class idle_sched_class + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 28709f6b0975..4478c11cb51a 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,19 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_MUQSS ++#include "MuQSS.h" ++ ++/* Begin compatibility wrappers for MuQSS/CFS differences */ ++#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->nr_running) ++ ++#else /* CONFIG_SCHED_MUQSS */ ++ ++#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) ++ ++ + #include + + #include +@@ -2626,3 +2639,25 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) + + void swake_up_all_locked(struct swait_queue_head *q); + void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++/* MuQSS compatibility functions */ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return t->se.sum_exec_runtime; ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ u64 ns; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = t->se.sum_exec_runtime; ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++#endif /* CONFIG_SCHED_MUQSS */ +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 1bd7e3af904f..a1dc490c15e4 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -440,7 +440,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + struct root_domain *old_rd = NULL; + unsigned long flags; + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_lock_irqsave(rq->lock, flags); ++#else + raw_spin_lock_irqsave(&rq->lock, flags); ++#endif + + if (rq->rd) { + old_rd = rq->rd; +@@ -466,7 +470,11 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_unlock_irqrestore(rq->lock, flags); ++#else + raw_spin_unlock_irqrestore(&rq->lock, flags); ++#endif + + if (old_rd) + call_rcu(&old_rd->rcu, free_rootdomain); +diff --git a/kernel/skip_list.c b/kernel/skip_list.c +new file mode 100644 +index 000000000000..bf5c6e97e139 +--- /dev/null ++++ b/kernel/skip_list.c +@@ -0,0 +1,148 @@ ++/* ++ Copyright (C) 2011,2016 Con Kolivas. ++ ++ Code based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++The routine randomLevel has been hard-coded to generate random ++levels using p=0.25. It can be easily changed. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++Levels start at zero and go up to MaxLevel (which is equal to ++MaxNumberOfLevels-1). ++ ++The routines defined in this file are: ++ ++init: defines slnode ++ ++new_skiplist: returns a new, empty list ++ ++randomLevel: Returns a random level based on a u64 random seed passed to it. ++In MuQSS, the "niffy" time is used for this purpose. 
++ ++insert(l,key, value): inserts the binding (key, value) into l. This operation ++occurs in O(log n) time. ++ ++delnode(slnode, l, node): deletes any binding of key from the l based on the ++actual node value. This operation occurs in O(k) time where k is the ++number of levels of the node in question (max 8). The original delete ++function occurred in O(log n) time and involved a search. ++ ++MuQSS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. ++ ++*/ ++ ++#include ++#include ++ ++#define MaxNumberOfLevels 8 ++#define MaxLevel (MaxNumberOfLevels - 1) ++ ++void skiplist_init(skiplist_node *slnode) ++{ ++ int i; ++ ++ slnode->key = 0xFFFFFFFFFFFFFFFF; ++ slnode->level = 0; ++ slnode->value = NULL; ++ for (i = 0; i < MaxNumberOfLevels; i++) ++ slnode->next[i] = slnode->prev[i] = slnode; ++} ++ ++skiplist *new_skiplist(skiplist_node *slnode) ++{ ++ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); ++ ++ BUG_ON(!l); ++ l->header = slnode; ++ return l; ++} ++ ++void free_skiplist(skiplist *l) ++{ ++ skiplist_node *p, *q; ++ ++ p = l->header; ++ do { ++ q = p->next[0]; ++ p->next[0]->prev[0] = q->prev[0]; ++ skiplist_node_init(p); ++ p = q; ++ } while (p != l->header); ++ kfree(l); ++} ++ ++void skiplist_node_init(skiplist_node *node) ++{ ++ memset(node, 0, sizeof(skiplist_node)); ++} ++ ++static inline unsigned int randomLevel(const long unsigned int randseed) ++{ ++ return find_first_bit(&randseed, MaxLevel) / 2; ++} ++ ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) ++{ ++ skiplist_node *update[MaxNumberOfLevels]; ++ skiplist_node *p, *q; ++ int k = l->level; ++ ++ p = l->header; ++ do { ++ while (q = p->next[k], q->key <= key) ++ p = q; ++ update[k] = p; ++ } while (--k >= 0); ++ ++ ++l->entries; ++ k = randomLevel(randseed); ++ if (k > l->level) { ++ k = ++l->level; ++ update[k] = l->header; ++ } ++ ++ node->level = k; ++ node->key = key; ++ node->value = value; ++ do { ++ p = update[k]; ++ node->next[k] = p->next[k]; ++ p->next[k] = node; ++ node->prev[k] = p; ++ node->next[k]->prev[k] = node; ++ } while (--k >= 0); ++} ++ ++void skiplist_delete(skiplist *l, skiplist_node *node) ++{ ++ int k, m = node->level; ++ ++ for (k = 0; k <= m; k++) { ++ node->prev[k]->next[k] = node->next[k]; ++ node->next[k]->prev[k] = node->prev[k]; ++ } ++ skiplist_node_init(node); ++ if (m == l->level) { ++ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) ++ m--; ++ l->level = m; ++ } ++ l->entries--; ++} +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index afad085960b8..d2e35cd54f94 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -120,7 +120,17 @@ static unsigned long long_max = LONG_MAX; + static int one_hundred = 100; + static int two_hundred = 200; + static int one_thousand = 1000; +-#ifdef CONFIG_PRINTK ++static int zero = 0; ++static int one = 1; ++#ifdef CONFIG_SCHED_MUQSS 
++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_yield_type; ++#endif ++extern int hrtimer_granularity_us; ++extern int hrtimeout_min_us; ++#if defined(CONFIG_PRINTK) || defined(CONFIG_SCHED_MUQSS) + static int ten_thousand = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +@@ -184,7 +194,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; + int sysctl_legacy_va_layout; + #endif + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -193,7 +203,7 @@ static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; + static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; + #endif /* CONFIG_SMP */ +-#endif /* CONFIG_SCHED_DEBUG */ ++#endif /* CONFIG_SCHED_DEBUG && !CONFIG_SCHED_MUQSS */ + + #ifdef CONFIG_COMPACTION + static int min_extfrag_threshold; +@@ -1652,6 +1662,7 @@ int proc_do_static_key(struct ctl_table *table, int write, + } + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_MUQSS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -1843,6 +1854,73 @@ static struct ctl_table kern_table[] = { + .extra1 = SYSCTL_ONE, + }, + #endif ++#elif defined(CONFIG_SCHED_MUQSS) ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#if defined(CONFIG_SMP) && defined(CONFIG_SCHEDSTATS) ++ { ++ .procname = "sched_schedstats", ++ .data = NULL, ++ .maxlen = sizeof(unsigned int), ++ .mode = 0644, ++ .proc_handler = sysctl_schedstats, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++#endif /* CONFIG_SMP && CONFIG_SCHEDSTATS */ ++#endif /* CONFIG_SCHED_MUQSS */ ++ { ++ .procname = "hrtimer_granularity_us", ++ .data = &hrtimer_granularity_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, ++ { ++ .procname = "hrtimeout_min_us", ++ .data = &hrtimeout_min_us, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &ten_thousand, ++ }, + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) + { + .procname = "sched_energy_aware", +diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig +index a09b1d61df6a..e7662101fcc3 100644 +--- a/kernel/time/Kconfig ++++ b/kernel/time/Kconfig +@@ -75,6 +75,9 @@ config NO_HZ_COMMON + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + ++config NO_HZ_FULL ++ bool ++ + choice + prompt "Timer tick 
handling" + default NO_HZ_IDLE if NO_HZ +@@ -96,8 +99,9 @@ config NO_HZ_IDLE + + Most of the time you want to say Y here. + +-config NO_HZ_FULL ++config NO_HZ_FULL_NODEF + bool "Full dynticks system (tickless)" ++ select NO_HZ_FULL + # NO_HZ_COMMON dependency + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + # We need at least one periodic CPU for timekeeping +@@ -123,6 +127,8 @@ config NO_HZ_FULL + transitions: syscalls, exceptions and interrupts. Even when it's + dynamically off. + ++ Not recommended for desktops,laptops, or mobile devices. ++ + Say N. + + endchoice +@@ -132,7 +138,7 @@ config CONTEXT_TRACKING + + config CONTEXT_TRACKING_FORCE + bool "Force context tracking" +- depends on CONTEXT_TRACKING ++ depends on CONTEXT_TRACKING && !SCHED_MUQSS + default y if !NO_HZ_FULL + help + The major pre-requirement for full dynticks to work is to +diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c +index f5490222e134..544c58c29267 100644 +--- a/kernel/time/clockevents.c ++++ b/kernel/time/clockevents.c +@@ -190,8 +190,9 @@ int clockevents_tick_resume(struct clock_event_device *dev) + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + +-/* Limit min_delta to a jiffie */ +-#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++int __read_mostly hrtimer_granularity_us = 100; ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (hrtimer_granularity_us * NSEC_PER_USEC) + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 95b6a708b040..19918cf649b0 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2223,3 +2223,113 @@ int __sched schedule_hrtimeout(ktime_t *expires, + return schedule_hrtimeout_range(expires, 0, mode); + } + EXPORT_SYMBOL_GPL(schedule_hrtimeout); ++ ++/* ++ * As per schedule_hrtimeout but taskes a millisecond value and returns how ++ * many milliseconds are left. ++ */ ++long __sched schedule_msec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ int delta, jiffs; ++ ktime_t expires; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ jiffs = msecs_to_jiffies(timeout); ++ /* ++ * If regular timer resolution is adequate or hrtimer resolution is not ++ * (yet) better than Hz, as would occur during startup, use regular ++ * timers. ++ */ ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) ++ return schedule_timeout(jiffs); ++ ++ delta = (timeout % 1000) * NSEC_PER_MSEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_ms(expires); ++ return timeout < 0 ? 
0 : timeout; ++} ++ ++EXPORT_SYMBOL(schedule_msec_hrtimeout); ++ ++#define USECS_PER_SEC 1000000 ++extern int hrtimer_granularity_us; ++ ++static inline long schedule_usec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ ktime_t expires; ++ int delta; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ return schedule_timeout(usecs_to_jiffies(timeout)); ++ ++ if (timeout < hrtimer_granularity_us) ++ timeout = hrtimer_granularity_us; ++ delta = (timeout % USECS_PER_SEC) * NSEC_PER_USEC; ++ expires = ktime_set(0, delta); ++ ++ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_us(expires); ++ return timeout < 0 ? 0 : timeout; ++} ++ ++int __read_mostly hrtimeout_min_us = 500; ++ ++long __sched schedule_min_hrtimeout(void) ++{ ++ return usecs_to_jiffies(schedule_usec_hrtimeout(hrtimeout_min_us)); ++} ++ ++EXPORT_SYMBOL(schedule_min_hrtimeout); ++ ++long __sched schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); ++ ++long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index a71758e34e45..ebb84a65d928 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct task_cputime_atomic *at, +@@ -850,7 +850,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. 
*/ +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index a50364df1054..a86e4530e530 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -44,6 +44,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1587,7 +1588,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base) + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1605,6 +1606,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1674,7 +1678,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +@@ -1873,6 +1877,18 @@ signed long __sched schedule_timeout(signed long timeout) + + expire = timeout + jiffies; + ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ /* ++ * Special case 1 as being a request for the minimum timeout ++ * and use highres timers to timeout after 1ms to workaround ++ * the granularity of low Hz tick timers. ++ */ ++ if (!schedule_min_hrtimeout()) ++ return 0; ++ goto out_timeout; ++ } ++#endif + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); +@@ -1881,10 +1897,10 @@ signed long __sched schedule_timeout(signed long timeout) + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); +- ++out_timeout: + timeout = expire - jiffies; + +- out: ++out: + return timeout < 0 ? 0 : timeout; + } + EXPORT_SYMBOL(schedule_timeout); +@@ -2027,7 +2043,19 @@ void __init init_timers(void) + */ + void msleep(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ /* ++ * Use high resolution timers where the resolution of tick based ++ * timers is inadequate. 
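(A rough illustration of that comment, not part of the patch: with HZ=100 one jiffy is 10 ms, so msleep(2) previously became msecs_to_jiffies(2) + 1 = 2 jiffies and slept 10-20 ms; with this change, requests shorter than 5 jiffies are handed to schedule_msec_hrtimeout_uninterruptible() and complete close to the requested 2 ms, provided high-resolution timers are available and the system is not freezing.)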
++ */ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs) ++ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); ++ return; ++ } ++ timeout = jiffs + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -2041,7 +2069,15 @@ EXPORT_SYMBOL(msleep); + */ + unsigned long msleep_interruptible(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { ++ while (msecs && !signal_pending(current)) ++ msecs = schedule_msec_hrtimeout_interruptible(msecs); ++ return msecs; ++ } ++ timeout = jiffs + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index b5e3496cf803..68930e7f4d28 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 466fc3144fff..27224c2d7674 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -169,7 +169,7 @@ struct scan_control { + /* + * From 0 .. 200. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 33; + + static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) +diff --git a/net/core/pktgen.c b/net/core/pktgen.c +index 44fdbb9c6e53..ae0adfc677c2 100644 +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -1894,7 +1894,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); +- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); ++ schedule_msec_hrtimeout_interruptible((msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { +diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c +index 40232a278b1a..d87fae1113aa 100644 +--- a/sound/pci/maestro3.c ++++ b/sound/pci/maestro3.c +@@ -1995,7 +1995,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(0, io + GPIO_DATA); + outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); ++ schedule_msec_hrtimeout_uninterruptible((delay1)); + + outw(GPO_PRIMARY_AC97, io + GPIO_DATA); + udelay(5); +@@ -2003,7 +2003,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); + outw(~0, io + GPIO_MASK); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); ++ schedule_msec_hrtimeout_uninterruptible((delay2)); + + if (! 
snd_m3_try_read_vendor(chip)) + break; +diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c +index 653da3eaf355..d77d12902594 100644 +--- a/sound/soc/codecs/rt5631.c ++++ b/sound/soc/codecs/rt5631.c +@@ -417,7 +417,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena + hp_zc = snd_soc_component_read(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + /* config one-bit depop parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); + snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, +@@ -529,7 +529,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable + hp_zc = snd_soc_component_read(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + /* config depop sequence parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); +diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c +index a6aa212fa0c8..8bfa549b38db 100644 +--- a/sound/soc/codecs/wm8350.c ++++ b/sound/soc/codecs/wm8350.c +@@ -233,10 +233,10 @@ static void wm8350_pga_work(struct work_struct *work) + out2->ramp == WM8350_RAMP_UP) { + /* delay is longer over 0dB as increases are larger */ + if (i >= WM8350_OUTn_0dB) +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (2)); + else +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (1)); + } else + udelay(50); /* doesn't matter if we delay longer */ +@@ -1120,7 +1120,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + (platform->dis_out4 << 6)); + + /* wait for discharge */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + cap_discharge_msecs)); + +@@ -1136,7 +1136,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + WM8350_VBUFEN); + + /* wait for vmid */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_charge_msecs)); + +@@ -1187,7 +1187,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_discharge_msecs)); + +@@ -1205,7 +1205,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + pm1 | WM8350_OUTPUT_DRAIN_EN); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform->drain_msecs)); + + pm1 &= ~WM8350_BIASEN; +diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c +index a9a6d766a176..45bf31de6282 100644 +--- a/sound/soc/codecs/wm8900.c ++++ b/sound/soc/codecs/wm8900.c +@@ -1104,7 +1104,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, + /* Need to let things settle before stopping the clock + * to ensure that restart works, see "Stopping the + * master clock" in the datasheet. 
*/ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_write(component, WM8900_REG_POWER2, + WM8900_REG_POWER2_SYSCLK_ENA); + break; +diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c +index 7072ffacbdfd..e8414ec4759c 100644 +--- a/sound/soc/codecs/wm9713.c ++++ b/sound/soc/codecs/wm9713.c +@@ -199,7 +199,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, + + /* Gracefully shut down the voice interface. */ + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); + snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); + +@@ -868,7 +868,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, + wm9713->pll_in = freq_in; + + /* wait 10ms AC97 link frames for the link to stabilise */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + return 0; + } + +diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c +index 3273161e2787..7fb9b4c6dd7b 100644 +--- a/sound/soc/soc-dapm.c ++++ b/sound/soc/soc-dapm.c +@@ -154,7 +154,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) + static void pop_wait(u32 pop_time) + { + if (pop_time) +- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); ++ schedule_msec_hrtimeout_uninterruptible((pop_time)); + } + + __printf(3, 4) +diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c +index fdbdfb7bce92..fa8e8faf3eb3 100644 +--- a/sound/usb/line6/pcm.c ++++ b/sound/usb/line6/pcm.c +@@ -127,7 +127,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, + if (!alive) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } while (--timeout > 0); + if (alive) + dev_err(line6pcm->line6->ifcdev, diff --git a/linux-tkg/linux-tkg-patches/5.9/0004-glitched-muqss.patch b/linux-tkg/linux-tkg-patches/5.9/0004-glitched-muqss.patch new file mode 100644 index 0000000..2c4837e --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0004-glitched-muqss.patch @@ -0,0 +1,78 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - MuQSS + +diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +index 84a1d08d68551..57c3036a68952 100644 +--- a/kernel/sched/MuQSS.c ++++ b/kernel/sched/MuQSS.c +@@ -163,7 +167,11 @@ int sched_interactive __read_mostly = 1; + * are allowed to run five seconds as real time tasks. This is the total over + * all online cpus. + */ ++#ifdef CONFIG_ZENIFY ++int sched_iso_cpu __read_mostly = 25; ++#else + int sched_iso_cpu __read_mostly = 70; ++#endif + + /* + * sched_yield_type - Choose what sort of yield sched_yield will perform. + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,7 @@ + choice + prompt "Timer frequency" + default HZ_100 if SCHED_MUQSS +- default HZ_250_NODEF if !SCHED_MUQSS ++ default HZ_500_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. 
It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -50,6 +50,20 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500_NODEF ++ bool "500 HZ" ++ help ++ 500 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ ++ config HZ_750_NODEF ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000_NODEF + bool "1000 HZ" + help +@@ -63,6 +70,8 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250_NODEF + default 300 if HZ_300_NODEF ++ default 500 if HZ_500_NODEF ++ default 750 if HZ_750_NODEF + default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + +diff --git a/Makefile b/Makefile +index d4d36c61940b..4a9dfe471f1f 100644 +--- a/Makefile ++++ b/Makefile +@@ -15,7 +15,6 @@ NAME = Kleptomaniac Octopus + + CKVERSION = -ck1 + CKNAME = MuQSS Powered +-EXTRAVERSION := $(EXTRAVERSION)$(CKVERSION) + + # We are using a recursive build, so we need to do a little thinking + # to get the ordering right. diff --git a/linux-tkg/linux-tkg-patches/5.9/0004-glitched-ondemand-muqss.patch b/linux-tkg/linux-tkg-patches/5.9/0004-glitched-ondemand-muqss.patch new file mode 100644 index 0000000..02933e4 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0004-glitched-ondemand-muqss.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (45) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (45) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux-tkg/linux-tkg-patches/5.9/0005-glitched-pds.patch b/linux-tkg/linux-tkg-patches/5.9/0005-glitched-pds.patch new file mode 100644 index 0000000..08c9ef3 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0005-glitched-pds.patch @@ -0,0 +1,90 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - PDS + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. 
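(For context on the numbers, not part of the patch: the tick period is 1/HZ, so jiffy-granular timeouts quantize to 4 ms at 250 Hz, 2 ms at 500 Hz, about 1.33 ms at 750 Hz and 1 ms at 1000 Hz; the 500 Hz and 750 Hz choices added by these patches sit between the stock 250 Hz default and the per-tick overhead of 1000 Hz.)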
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -169,7 +169,7 @@ + /* + * From 0 .. 200. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + + static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) diff --git a/linux-tkg/linux-tkg-patches/5.9/0006-add-acs-overrides_iommu.patch b/linux-tkg/linux-tkg-patches/5.9/0006-add-acs-overrides_iommu.patch new file mode 100644 index 0000000..d1303a5 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0006-add-acs-overrides_iommu.patch @@ -0,0 +1,193 @@ +From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001 +From: Mark Weiman +Date: Sun, 12 Aug 2018 11:36:21 -0400 +Subject: [PATCH] pci: Enable overrides for missing ACS capabilities + +This an updated version of Alex Williamson's patch from: +https://lkml.org/lkml/2013/5/30/513 + +Original commit message follows: + +PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that +allows us to control whether transactions are allowed to be redirected +in various subnodes of a PCIe topology. For instance, if two +endpoints are below a root port or downsteam switch port, the +downstream port may optionally redirect transactions between the +devices, bypassing upstream devices. The same can happen internally +on multifunction devices. The transaction may never be visible to the +upstream devices. + +One upstream device that we particularly care about is the IOMMU. If +a redirection occurs in the topology below the IOMMU, then the IOMMU +cannot provide isolation between devices. This is why the PCIe spec +encourages topologies to include ACS support. Without it, we have to +assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation. + +Unfortunately, far too many topologies do not support ACS to make this +a steadfast requirement. Even the latest chipsets from Intel are only +sporadically supporting ACS. We have trouble getting interconnect +vendors to include the PCIe spec required PCIe capability, let alone +suggested features. + +Therefore, we need to add some flexibility. The pcie_acs_override= +boot option lets users opt-in specific devices or sets of devices to +assume ACS support. The "downstream" option assumes full ACS support +on root ports and downstream switch ports. 
The "multifunction" +option assumes the subset of ACS features available on multifunction +endpoints and upstream switch ports are supported. The "id:nnnn:nnnn" +option enables ACS support on devices matching the provided vendor +and device IDs, allowing more strategic ACS overrides. These options +may be combined in any order. A maximum of 16 id specific overrides +are available. It's suggested to use the most limited set of options +necessary to avoid completely disabling ACS across the topology. +Note to hardware vendors, we have facilities to permanently quirk +specific devices which enforce isolation but not provide an ACS +capability. Please contact me to have your devices added and save +your customers the hassle of this boot option. + +Signed-off-by: Mark Weiman +--- + .../admin-guide/kernel-parameters.txt | 9 ++ + drivers/pci/quirks.c | 101 ++++++++++++++++++ + 2 files changed, 110 insertions(+) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index aefd358a5ca3..173b3596fd9e 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -3190,6 +3190,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. ++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multifunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specific device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. 
+diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index 4700d24e5d55..8f7a3d7fd9c1 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset. 
+ * The device will throw a Link Down error on AER-capable systems and +@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs }, + /* Zhaoxin Root/Downstream Ports */ + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + + diff --git a/linux-tkg/linux-tkg-patches/5.9/0007-v5.9-fsync.patch b/linux-tkg/linux-tkg-patches/5.9/0007-v5.9-fsync.patch new file mode 100644 index 0000000..47badbb --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0007-v5.9-fsync.patch @@ -0,0 +1,597 @@ +From 7b5df0248ce255ef5b7204d65a7b3783ebb76a3d Mon Sep 17 00:00:00 2001 +From: Gabriel Krisman Bertazi +Date: Fri, 13 Dec 2019 11:08:02 -0300 +Subject: [PATCH 1/2] futex: Implement mechanism to wait on any of several + futexes +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +This is a new futex operation, called FUTEX_WAIT_MULTIPLE, which allows +a thread to wait on several futexes at the same time, and be awoken by +any of them. In a sense, it implements one of the features that was +supported by pooling on the old FUTEX_FD interface. + +The use case lies in the Wine implementation of the Windows NT interface +WaitMultipleObjects. This Windows API function allows a thread to sleep +waiting on the first of a set of event sources (mutexes, timers, signal, +console input, etc) to signal. Considering this is a primitive +synchronization operation for Windows applications, being able to quickly +signal events on the producer side, and quickly go to sleep on the +consumer side is essential for good performance of those running over Wine. + +Wine developers have an implementation that uses eventfd, but it suffers +from FD exhaustion (there is applications that go to the order of +multi-milion FDs), and higher CPU utilization than this new operation. + +The futex list is passed as an array of `struct futex_wait_block` +(pointer, value, bitset) to the kernel, which will enqueue all of them +and sleep if none was already triggered. It returns a hint of which +futex caused the wake up event to userspace, but the hint doesn't +guarantee that is the only futex triggered. Before calling the syscall +again, userspace should traverse the list, trying to re-acquire any of +the other futexes, to prevent an immediate -EWOULDBLOCK return code from +the kernel. + +This was tested using three mechanisms: + +1) By reimplementing FUTEX_WAIT in terms of FUTEX_WAIT_MULTIPLE and +running the unmodified tools/testing/selftests/futex and a full linux +distro on top of this kernel. + +2) By an example code that exercises the FUTEX_WAIT_MULTIPLE path on a +multi-threaded, event-handling setup. + +3) By running the Wine fsync implementation and executing multi-threaded +applications, in particular modern games, on top of this implementation. + +Changes were tested for the following ABIs: x86_64, i386 and x32. +Support for x32 applications is not implemented since it would +take a major rework adding a new entry point and splitting the current +futex 64 entry point in two and we can't change the current x32 syscall +number without breaking user space compatibility. + +CC: Steven Rostedt +Cc: Richard Yao +Cc: Thomas Gleixner +Cc: Peter Zijlstra +Co-developed-by: Zebediah Figura +Signed-off-by: Zebediah Figura +Co-developed-by: Steven Noonan +Signed-off-by: Steven Noonan +Co-developed-by: Pierre-Loup A. Griffais +Signed-off-by: Pierre-Loup A. 
Griffais +Signed-off-by: Gabriel Krisman Bertazi +[Added compatibility code] +Co-developed-by: André Almeida +Signed-off-by: André Almeida + +Adjusted for v5.9: Removed `put_futex_key` calls. +--- + include/uapi/linux/futex.h | 20 +++ + kernel/futex.c | 352 ++++++++++++++++++++++++++++++++++++- + 2 files changed, 370 insertions(+), 2 deletions(-) + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index a89eb0accd5e2..580001e89c6ca 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,6 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 ++#define FUTEX_WAIT_MULTIPLE 13 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +@@ -40,6 +41,8 @@ + FUTEX_PRIVATE_FLAG) + #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) ++#define FUTEX_WAIT_MULTIPLE_PRIVATE (FUTEX_WAIT_MULTIPLE | \ ++ FUTEX_PRIVATE_FLAG) + + /* + * Support for robust futexes: the kernel cleans up held futexes at +@@ -150,4 +153,21 @@ struct robust_list_head { + (((op & 0xf) << 28) | ((cmp & 0xf) << 24) \ + | ((oparg & 0xfff) << 12) | (cmparg & 0xfff)) + ++/* ++ * Maximum number of multiple futexes to wait for ++ */ ++#define FUTEX_MULTIPLE_MAX_COUNT 128 ++ ++/** ++ * struct futex_wait_block - Block of futexes to be waited for ++ * @uaddr: User address of the futex ++ * @val: Futex value expected by userspace ++ * @bitset: Bitset for the optional bitmasked wakeup ++ */ ++struct futex_wait_block { ++ __u32 __user *uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ + #endif /* _UAPI_LINUX_FUTEX_H */ +diff --git a/kernel/futex.c b/kernel/futex.c +index a5876694a60eb..6f4bea76df460 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -197,6 +197,8 @@ struct futex_pi_state { + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup ++ * @uaddr: userspace address of futex ++ * @uval: expected futex's value + * + * We use this hashed waitqueue, instead of a normal wait_queue_entry_t, so + * we can wake only the relevant ones (hashed queues may be shared). +@@ -219,6 +221,8 @@ struct futex_q { + struct rt_mutex_waiter *rt_waiter; + union futex_key *requeue_pi_key; + u32 bitset; ++ u32 __user *uaddr; ++ u32 uval; + } __randomize_layout; + + static const struct futex_q futex_q_init = { +@@ -2304,6 +2308,29 @@ static int unqueue_me(struct futex_q *q) + return ret; + } + ++/** ++ * unqueue_multiple() - Remove several futexes from their futex_hash_bucket ++ * @q: The list of futexes to unqueue ++ * @count: Number of futexes in the list ++ * ++ * Helper to unqueue a list of futexes. This can't fail. ++ * ++ * Return: ++ * - >=0 - Index of the last futex that was awoken; ++ * - -1 - If no futex was awoken ++ */ ++static int unqueue_multiple(struct futex_q *q, int count) ++{ ++ int ret = -1; ++ int i; ++ ++ for (i = 0; i < count; i++) { ++ if (!unqueue_me(&q[i])) ++ ret = i; ++ } ++ return ret; ++} ++ + /* + * PI futexes can not be requeued and must remove themself from the + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry +@@ -2662,6 +2689,205 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + return ret; + } + ++/** ++ * futex_wait_multiple_setup() - Prepare to wait and enqueue multiple futexes ++ * @qs: The corresponding futex list ++ * @count: The size of the lists ++ * @flags: Futex flags (FLAGS_SHARED, etc.) 
++ * @awaken: Index of the last awoken futex ++ * ++ * Prepare multiple futexes in a single step and enqueue them. This may fail if ++ * the futex list is invalid or if any futex was already awoken. On success the ++ * task is ready to interruptible sleep. ++ * ++ * Return: ++ * - 1 - One of the futexes was awaken by another thread ++ * - 0 - Success ++ * - <0 - -EFAULT, -EWOULDBLOCK or -EINVAL ++ */ ++static int futex_wait_multiple_setup(struct futex_q *qs, int count, ++ unsigned int flags, int *awaken) ++{ ++ struct futex_hash_bucket *hb; ++ int ret, i; ++ u32 uval; ++ ++ /* ++ * Enqueuing multiple futexes is tricky, because we need to ++ * enqueue each futex in the list before dealing with the next ++ * one to avoid deadlocking on the hash bucket. But, before ++ * enqueuing, we need to make sure that current->state is ++ * TASK_INTERRUPTIBLE, so we don't absorb any awake events, which ++ * cannot be done before the get_futex_key of the next key, ++ * because it calls get_user_pages, which can sleep. Thus, we ++ * fetch the list of futexes keys in two steps, by first pinning ++ * all the memory keys in the futex key, and only then we read ++ * each key and queue the corresponding futex. ++ */ ++retry: ++ for (i = 0; i < count; i++) { ++ qs[i].key = FUTEX_KEY_INIT; ++ ret = get_futex_key(qs[i].uaddr, flags & FLAGS_SHARED, ++ &qs[i].key, FUTEX_READ); ++ if (unlikely(ret)) { ++ return ret; ++ } ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ for (i = 0; i < count; i++) { ++ struct futex_q *q = &qs[i]; ++ ++ hb = queue_lock(q); ++ ++ ret = get_futex_value_locked(&uval, q->uaddr); ++ if (ret) { ++ /* ++ * We need to try to handle the fault, which ++ * cannot be done without sleep, so we need to ++ * undo all the work already done, to make sure ++ * we don't miss any wake ups. Therefore, clean ++ * up, handle the fault and retry from the ++ * beginning. ++ */ ++ queue_unlock(hb); ++ ++ /* ++ * Keys 0..(i-1) are implicitly put ++ * on unqueue_multiple. ++ */ ++ *awaken = unqueue_multiple(qs, i); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ /* ++ * On a real fault, prioritize the error even if ++ * some other futex was awoken. Userspace gave ++ * us a bad address, -EFAULT them. ++ */ ++ ret = get_user(uval, q->uaddr); ++ if (ret) ++ return ret; ++ ++ /* ++ * Even if the page fault was handled, If ++ * something was already awaken, we can safely ++ * give up and succeed to give a hint for userspace to ++ * acquire the right futex faster. ++ */ ++ if (*awaken >= 0) ++ return 1; ++ ++ goto retry; ++ } ++ ++ if (uval != q->uval) { ++ queue_unlock(hb); ++ ++ /* ++ * If something was already awaken, we can ++ * safely ignore the error and succeed. ++ */ ++ *awaken = unqueue_multiple(qs, i); ++ __set_current_state(TASK_RUNNING); ++ if (*awaken >= 0) ++ return 1; ++ ++ return -EWOULDBLOCK; ++ } ++ ++ /* ++ * The bucket lock can't be held while dealing with the ++ * next futex. Queue each futex at this moment so hb can ++ * be unlocked. ++ */ ++ queue_me(&qs[i], hb); ++ } ++ return 0; ++} ++ ++/** ++ * futex_wait_multiple() - Prepare to wait on and enqueue several futexes ++ * @qs: The list of futexes to wait on ++ * @op: Operation code from futex's syscall ++ * @count: The number of objects ++ * @abs_time: Timeout before giving up and returning to userspace ++ * ++ * Entry point for the FUTEX_WAIT_MULTIPLE futex operation, this function ++ * sleeps on a group of futexes and returns on the first futex that ++ * triggered, or after the timeout has elapsed. 
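For readers unfamiliar with the new interface, here is a minimal userspace sketch (an editorial illustration, not part of the patch). It assumes the Proton-compatible opcode value 31 introduced by the second patch in this file, a 64-bit process (so no padding in the wait block), and a libc that only exposes the raw futex syscall; the variable names are invented for the example.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

/* Userspace mirror of the uapi struct futex_wait_block added above. */
struct futex_wait_block {
	uint32_t *uaddr;	/* futex word to wait on */
	uint32_t val;		/* expected value: wait only while *uaddr == val */
	uint32_t bitset;	/* wake bitset: ~0 matches any FUTEX_WAKE */
};

#define FUTEX_WAIT_MULTIPLE 31	/* opcode after the Proton compatibility patch */

int main(void)
{
	uint32_t a = 0, b = 0;
	struct futex_wait_block blocks[2] = {
		{ &a, 0, ~0u },
		{ &b, 0, ~0u },
	};
	/* Relative timeout, handled like FUTEX_WAIT (see the compat patch). */
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	/* uaddr points at the array, val carries the block count (max 128). */
	long ret = syscall(SYS_futex, blocks, FUTEX_WAIT_MULTIPLE, 2, &ts, NULL, 0);
	if (ret >= 0)
		printf("woken, hint: futex #%ld\n", ret);
	else if (errno == ETIMEDOUT)
		printf("timed out\n");	/* expected here: nothing wakes a or b */
	else
		perror("futex");
	return 0;
}

A non-negative return is only a hint as to which futex fired, so a real caller (such as Wine's fsync code) re-checks the whole array before sleeping again; FUTEX_PRIVATE_FLAG may be OR'd into the opcode for process-private futexes, matching the FUTEX_WAIT_MULTIPLE_PRIVATE definition above.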
++ * ++ * Return: ++ * - >=0 - Hint to the futex that was awoken ++ * - <0 - On error ++ */ ++static int futex_wait_multiple(struct futex_q *qs, int op, ++ u32 count, ktime_t *abs_time) ++{ ++ struct hrtimer_sleeper timeout, *to; ++ int ret, flags = 0, hint = 0; ++ unsigned int i; ++ ++ if (!(op & FUTEX_PRIVATE_FLAG)) ++ flags |= FLAGS_SHARED; ++ ++ if (op & FUTEX_CLOCK_REALTIME) ++ flags |= FLAGS_CLOCKRT; ++ ++ to = futex_setup_timer(abs_time, &timeout, flags, 0); ++ while (1) { ++ ret = futex_wait_multiple_setup(qs, count, flags, &hint); ++ if (ret) { ++ if (ret > 0) { ++ /* A futex was awaken during setup */ ++ ret = hint; ++ } ++ break; ++ } ++ ++ if (to) ++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ++ ++ /* ++ * Avoid sleeping if another thread already tried to ++ * wake us. ++ */ ++ for (i = 0; i < count; i++) { ++ if (plist_node_empty(&qs[i].list)) ++ break; ++ } ++ ++ if (i == count && (!to || to->task)) ++ freezable_schedule(); ++ ++ ret = unqueue_multiple(qs, count); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (ret >= 0) ++ break; ++ if (to && !to->task) { ++ ret = -ETIMEDOUT; ++ break; ++ } else if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ break; ++ } ++ /* ++ * The final case is a spurious wakeup, for ++ * which just retry. ++ */ ++ } ++ ++ if (to) { ++ hrtimer_cancel(&to->timer); ++ destroy_hrtimer_on_stack(&to->timer); ++ } ++ ++ return ret; ++} ++ + static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) + { +@@ -3774,6 +4000,43 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + return -ENOSYS; + } + ++/** ++ * futex_read_wait_block - Read an array of futex_wait_block from userspace ++ * @uaddr: Userspace address of the block ++ * @count: Number of blocks to be read ++ * ++ * This function creates and allocate an array of futex_q (we zero it to ++ * initialize the fields) and then, for each futex_wait_block element from ++ * userspace, fill a futex_q element with proper values. 
++ */ ++inline struct futex_q *futex_read_wait_block(u32 __user *uaddr, u32 count) ++{ ++ unsigned int i; ++ struct futex_q *qs; ++ struct futex_wait_block fwb; ++ struct futex_wait_block __user *entry = ++ (struct futex_wait_block __user *)uaddr; ++ ++ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) ++ return ERR_PTR(-EINVAL); ++ ++ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); ++ if (!qs) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { ++ kfree(qs); ++ return ERR_PTR(-EFAULT); ++ } ++ ++ qs[i].uaddr = fwb.uaddr; ++ qs[i].uval = fwb.val; ++ qs[i].bitset = fwb.bitset; ++ } ++ ++ return qs; ++} + + SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, +@@ -3786,7 +4049,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG)))) + return -EFAULT; + if (get_timespec64(&ts, utime)) +@@ -3807,6 +4071,25 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (u32) (unsigned long) utime; + ++ if (cmd == FUTEX_WAIT_MULTIPLE) { ++ int ret; ++ struct futex_q *qs; ++ ++#ifdef CONFIG_X86_X32 ++ if (unlikely(in_x32_syscall())) ++ return -ENOSYS; ++#endif ++ qs = futex_read_wait_block(uaddr, val); ++ ++ if (IS_ERR(qs)) ++ return PTR_ERR(qs); ++ ++ ret = futex_wait_multiple(qs, op, val, tp); ++ kfree(qs); ++ ++ return ret; ++ } ++ + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + } + +@@ -3969,6 +4252,57 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + #endif /* CONFIG_COMPAT */ + + #ifdef CONFIG_COMPAT_32BIT_TIME ++/** ++ * struct compat_futex_wait_block - Block of futexes to be waited for ++ * @uaddr: User address of the futex (compatible pointer) ++ * @val: Futex value expected by userspace ++ * @bitset: Bitset for the optional bitmasked wakeup ++ */ ++struct compat_futex_wait_block { ++ compat_uptr_t uaddr; ++ __u32 val; ++ __u32 bitset; ++}; ++ ++/** ++ * compat_futex_read_wait_block - Read an array of futex_wait_block from ++ * userspace ++ * @uaddr: Userspace address of the block ++ * @count: Number of blocks to be read ++ * ++ * This function does the same as futex_read_wait_block(), except that it ++ * converts the pointer to the futex from the compat version to the regular one. 
++ */ ++inline struct futex_q *compat_futex_read_wait_block(u32 __user *uaddr, ++ u32 count) ++{ ++ unsigned int i; ++ struct futex_q *qs; ++ struct compat_futex_wait_block fwb; ++ struct compat_futex_wait_block __user *entry = ++ (struct compat_futex_wait_block __user *)uaddr; ++ ++ if (!count || count > FUTEX_MULTIPLE_MAX_COUNT) ++ return ERR_PTR(-EINVAL); ++ ++ qs = kcalloc(count, sizeof(*qs), GFP_KERNEL); ++ if (!qs) ++ return ERR_PTR(-ENOMEM); ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) { ++ kfree(qs); ++ return ERR_PTR(-EFAULT); ++ } ++ ++ qs[i].uaddr = compat_ptr(fwb.uaddr); ++ qs[i].uval = fwb.val; ++ qs[i].bitset = fwb.bitset; ++ } ++ ++ return qs; ++} ++ + SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + struct old_timespec32 __user *, utime, u32 __user *, uaddr2, + u32, val3) +@@ -3980,7 +4314,8 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || +- cmd == FUTEX_WAIT_REQUEUE_PI)) { ++ cmd == FUTEX_WAIT_REQUEUE_PI || ++ cmd == FUTEX_WAIT_MULTIPLE)) { + if (get_old_timespec32(&ts, utime)) + return -EFAULT; + if (!timespec64_valid(&ts)) +@@ -3995,6 +4330,19 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (int) (unsigned long) utime; + ++ if (cmd == FUTEX_WAIT_MULTIPLE) { ++ int ret; ++ struct futex_q *qs = compat_futex_read_wait_block(uaddr, val); ++ ++ if (IS_ERR(qs)) ++ return PTR_ERR(qs); ++ ++ ret = futex_wait_multiple(qs, op, val, tp); ++ kfree(qs); ++ ++ return ret; ++ } ++ + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); + } + #endif /* CONFIG_COMPAT_32BIT_TIME */ + +From ccdddb50d330d2ee1a4d2cbfdd27bdd7fb10eec3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Andr=C3=A9=20Almeida?= +Date: Fri, 7 Feb 2020 23:28:02 -0300 +Subject: [PATCH 2/2] futex: Add Proton compatibility code + +--- + include/uapi/linux/futex.h | 2 +- + kernel/futex.c | 5 +++-- + 2 files changed, 4 insertions(+), 3 deletions(-) + +diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h +index 580001e89c6ca..a3e760886b8e7 100644 +--- a/include/uapi/linux/futex.h ++++ b/include/uapi/linux/futex.h +@@ -21,7 +21,7 @@ + #define FUTEX_WAKE_BITSET 10 + #define FUTEX_WAIT_REQUEUE_PI 11 + #define FUTEX_CMP_REQUEUE_PI 12 +-#define FUTEX_WAIT_MULTIPLE 13 ++#define FUTEX_WAIT_MULTIPLE 31 + + #define FUTEX_PRIVATE_FLAG 128 + #define FUTEX_CLOCK_REALTIME 256 +diff --git a/kernel/futex.c b/kernel/futex.c +index 6f4bea76df460..03d89fe7b8392 100644 +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -4059,7 +4059,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } +@@ -4260,6 +4260,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, + */ + struct compat_futex_wait_block { + compat_uptr_t uaddr; ++ __u32 pad; + __u32 val; + __u32 bitset; + }; +@@ -4322,7 +4323,7 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, + return -EINVAL; + + t = timespec64_to_ktime(ts); +- if (cmd == FUTEX_WAIT) ++ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } diff --git a/linux-tkg/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch b/linux-tkg/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch new 
file mode 100644 index 0000000..5e81fb6 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0008-5.9-bcachefs.patch @@ -0,0 +1,70821 @@ +diff --git a/block/bio.c b/block/bio.c +index e865ea55b9f9..72a65c4113be 100644 +--- a/block/bio.c ++++ b/block/bio.c +@@ -1320,6 +1320,7 @@ void bio_set_pages_dirty(struct bio *bio) + set_page_dirty_lock(bvec->bv_page); + } + } ++EXPORT_SYMBOL_GPL(bio_set_pages_dirty); + + /* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. +@@ -1379,6 +1380,7 @@ void bio_check_pages_dirty(struct bio *bio) + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); + } ++EXPORT_SYMBOL_GPL(bio_check_pages_dirty); + + static inline bool bio_remaining_done(struct bio *bio) + { +diff --git a/block/blk-core.c b/block/blk-core.c +index 10c08ac50697..d68f24a7ee48 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -213,18 +213,23 @@ int blk_status_to_errno(blk_status_t status) + } + EXPORT_SYMBOL_GPL(blk_status_to_errno); + +-static void print_req_error(struct request *req, blk_status_t status, +- const char *caller) ++const char *blk_status_to_str(blk_status_t status) + { + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) +- return; ++ return "(invalid error)"; ++ return blk_errors[idx].name; ++} ++EXPORT_SYMBOL_GPL(blk_status_to_str); + ++static void print_req_error(struct request *req, blk_status_t status, ++ const char *caller) ++{ + printk_ratelimited(KERN_ERR + "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "phys_seg %u prio class %u\n", +- caller, blk_errors[idx].name, ++ caller, blk_status_to_str(status), + req->rq_disk ? req->rq_disk->disk_name : "?", + blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), + req->cmd_flags & ~REQ_OP_MASK, +diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig +index d1ca4d059c20..e63646b103c4 100644 +--- a/drivers/md/bcache/Kconfig ++++ b/drivers/md/bcache/Kconfig +@@ -3,6 +3,7 @@ + config BCACHE + tristate "Block device as cache" + select CRC64 ++ select CLOSURES + help + Allows a block device to be used as cache for other devices; uses + a btree for indexing and the layout is optimized for SSDs. +@@ -18,15 +19,6 @@ config BCACHE_DEBUG + Enables extra debugging tools, allows expensive runtime checks to be + turned on. + +-config BCACHE_CLOSURES_DEBUG +- bool "Debug closures" +- depends on BCACHE +- select DEBUG_FS +- help +- Keeps all active closures in a linked list and provides a debugfs +- interface to list them, which makes it possible to see asynchronous +- operations that get stuck. 
+- + config BCACHE_ASYNC_REGISTRATION + bool "Asynchronous device registration (EXPERIMENTAL)" + depends on BCACHE +diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile +index 5b87e59676b8..054e8a33a7ab 100644 +--- a/drivers/md/bcache/Makefile ++++ b/drivers/md/bcache/Makefile +@@ -2,6 +2,6 @@ + + obj-$(CONFIG_BCACHE) += bcache.o + +-bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ +- io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ ++bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ ++ journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ + util.o writeback.o features.o +diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h +index 4fd03d2496d8..498625095807 100644 +--- a/drivers/md/bcache/bcache.h ++++ b/drivers/md/bcache/bcache.h +@@ -180,6 +180,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -192,7 +193,6 @@ + + #include "bset.h" + #include "util.h" +-#include "closure.h" + + struct bucket { + atomic_t pin; +diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c +deleted file mode 100644 +index 0164a1fe94a9..000000000000 +--- a/drivers/md/bcache/closure.c ++++ /dev/null +@@ -1,217 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-/* +- * Asynchronous refcounty things +- * +- * Copyright 2010, 2011 Kent Overstreet +- * Copyright 2012 Google, Inc. +- */ +- +-#include +-#include +-#include +-#include +- +-#include "closure.h" +- +-static inline void closure_put_after_sub(struct closure *cl, int flags) +-{ +- int r = flags & CLOSURE_REMAINING_MASK; +- +- BUG_ON(flags & CLOSURE_GUARD_MASK); +- BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); +- +- if (!r) { +- if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { +- atomic_set(&cl->remaining, +- CLOSURE_REMAINING_INITIALIZER); +- closure_queue(cl); +- } else { +- struct closure *parent = cl->parent; +- closure_fn *destructor = cl->fn; +- +- closure_debug_destroy(cl); +- +- if (destructor) +- destructor(cl); +- +- if (parent) +- closure_put(parent); +- } +- } +-} +- +-/* For clearing flags with the same atomic op as a put */ +-void closure_sub(struct closure *cl, int v) +-{ +- closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); +-} +- +-/* +- * closure_put - decrement a closure's refcount +- */ +-void closure_put(struct closure *cl) +-{ +- closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); +-} +- +-/* +- * closure_wake_up - wake up all closures on a wait list, without memory barrier +- */ +-void __closure_wake_up(struct closure_waitlist *wait_list) +-{ +- struct llist_node *list; +- struct closure *cl, *t; +- struct llist_node *reverse = NULL; +- +- list = llist_del_all(&wait_list->list); +- +- /* We first reverse the list to preserve FIFO ordering and fairness */ +- reverse = llist_reverse_order(list); +- +- /* Then do the wakeups */ +- llist_for_each_entry_safe(cl, t, reverse, list) { +- closure_set_waiting(cl, 0); +- closure_sub(cl, CLOSURE_WAITING + 1); +- } +-} +- +-/** +- * closure_wait - add a closure to a waitlist +- * @waitlist: will own a ref on @cl, which will be released when +- * closure_wake_up() is called on @waitlist. +- * @cl: closure pointer. 
+- * +- */ +-bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) +-{ +- if (atomic_read(&cl->remaining) & CLOSURE_WAITING) +- return false; +- +- closure_set_waiting(cl, _RET_IP_); +- atomic_add(CLOSURE_WAITING + 1, &cl->remaining); +- llist_add(&cl->list, &waitlist->list); +- +- return true; +-} +- +-struct closure_syncer { +- struct task_struct *task; +- int done; +-}; +- +-static void closure_sync_fn(struct closure *cl) +-{ +- struct closure_syncer *s = cl->s; +- struct task_struct *p; +- +- rcu_read_lock(); +- p = READ_ONCE(s->task); +- s->done = 1; +- wake_up_process(p); +- rcu_read_unlock(); +-} +- +-void __sched __closure_sync(struct closure *cl) +-{ +- struct closure_syncer s = { .task = current }; +- +- cl->s = &s; +- continue_at(cl, closure_sync_fn, NULL); +- +- while (1) { +- set_current_state(TASK_UNINTERRUPTIBLE); +- if (s.done) +- break; +- schedule(); +- } +- +- __set_current_state(TASK_RUNNING); +-} +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- +-static LIST_HEAD(closure_list); +-static DEFINE_SPINLOCK(closure_list_lock); +- +-void closure_debug_create(struct closure *cl) +-{ +- unsigned long flags; +- +- BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); +- cl->magic = CLOSURE_MAGIC_ALIVE; +- +- spin_lock_irqsave(&closure_list_lock, flags); +- list_add(&cl->all, &closure_list); +- spin_unlock_irqrestore(&closure_list_lock, flags); +-} +- +-void closure_debug_destroy(struct closure *cl) +-{ +- unsigned long flags; +- +- BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); +- cl->magic = CLOSURE_MAGIC_DEAD; +- +- spin_lock_irqsave(&closure_list_lock, flags); +- list_del(&cl->all); +- spin_unlock_irqrestore(&closure_list_lock, flags); +-} +- +-static struct dentry *closure_debug; +- +-static int debug_seq_show(struct seq_file *f, void *data) +-{ +- struct closure *cl; +- +- spin_lock_irq(&closure_list_lock); +- +- list_for_each_entry(cl, &closure_list, all) { +- int r = atomic_read(&cl->remaining); +- +- seq_printf(f, "%p: %pS -> %pS p %p r %i ", +- cl, (void *) cl->ip, cl->fn, cl->parent, +- r & CLOSURE_REMAINING_MASK); +- +- seq_printf(f, "%s%s\n", +- test_bit(WORK_STRUCT_PENDING_BIT, +- work_data_bits(&cl->work)) ? "Q" : "", +- r & CLOSURE_RUNNING ? "R" : ""); +- +- if (r & CLOSURE_WAITING) +- seq_printf(f, " W %pS\n", +- (void *) cl->waiting_on); +- +- seq_printf(f, "\n"); +- } +- +- spin_unlock_irq(&closure_list_lock); +- return 0; +-} +- +-static int debug_seq_open(struct inode *inode, struct file *file) +-{ +- return single_open(file, debug_seq_show, NULL); +-} +- +-static const struct file_operations debug_ops = { +- .owner = THIS_MODULE, +- .open = debug_seq_open, +- .read = seq_read, +- .release = single_release +-}; +- +-void __init closure_debug_init(void) +-{ +- if (!IS_ERR_OR_NULL(bcache_debug)) +- /* +- * it is unnecessary to check return value of +- * debugfs_create_file(), we should not care +- * about this. 
+- */ +- closure_debug = debugfs_create_file( +- "closures", 0400, bcache_debug, NULL, &debug_ops); +-} +-#endif +- +-MODULE_AUTHOR("Kent Overstreet "); +-MODULE_LICENSE("GPL"); +diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h +deleted file mode 100644 +index c88cdc4ae4ec..000000000000 +--- a/drivers/md/bcache/closure.h ++++ /dev/null +@@ -1,378 +0,0 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ +-#ifndef _LINUX_CLOSURE_H +-#define _LINUX_CLOSURE_H +- +-#include +-#include +-#include +-#include +- +-/* +- * Closure is perhaps the most overused and abused term in computer science, but +- * since I've been unable to come up with anything better you're stuck with it +- * again. +- * +- * What are closures? +- * +- * They embed a refcount. The basic idea is they count "things that are in +- * progress" - in flight bios, some other thread that's doing something else - +- * anything you might want to wait on. +- * +- * The refcount may be manipulated with closure_get() and closure_put(). +- * closure_put() is where many of the interesting things happen, when it causes +- * the refcount to go to 0. +- * +- * Closures can be used to wait on things both synchronously and asynchronously, +- * and synchronous and asynchronous use can be mixed without restriction. To +- * wait synchronously, use closure_sync() - you will sleep until your closure's +- * refcount hits 1. +- * +- * To wait asynchronously, use +- * continue_at(cl, next_function, workqueue); +- * +- * passing it, as you might expect, the function to run when nothing is pending +- * and the workqueue to run that function out of. +- * +- * continue_at() also, critically, requires a 'return' immediately following the +- * location where this macro is referenced, to return to the calling function. +- * There's good reason for this. +- * +- * To use safely closures asynchronously, they must always have a refcount while +- * they are running owned by the thread that is running them. Otherwise, suppose +- * you submit some bios and wish to have a function run when they all complete: +- * +- * foo_endio(struct bio *bio) +- * { +- * closure_put(cl); +- * } +- * +- * closure_init(cl); +- * +- * do_stuff(); +- * closure_get(cl); +- * bio1->bi_endio = foo_endio; +- * bio_submit(bio1); +- * +- * do_more_stuff(); +- * closure_get(cl); +- * bio2->bi_endio = foo_endio; +- * bio_submit(bio2); +- * +- * continue_at(cl, complete_some_read, system_wq); +- * +- * If closure's refcount started at 0, complete_some_read() could run before the +- * second bio was submitted - which is almost always not what you want! More +- * importantly, it wouldn't be possible to say whether the original thread or +- * complete_some_read()'s thread owned the closure - and whatever state it was +- * associated with! +- * +- * So, closure_init() initializes a closure's refcount to 1 - and when a +- * closure_fn is run, the refcount will be reset to 1 first. +- * +- * Then, the rule is - if you got the refcount with closure_get(), release it +- * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount +- * on a closure because you called closure_init() or you were run out of a +- * closure - _always_ use continue_at(). Doing so consistently will help +- * eliminate an entire class of particularly pernicious races. +- * +- * Lastly, you might have a wait list dedicated to a specific event, and have no +- * need for specifying the condition - you just want to wait until someone runs +- * closure_wake_up() on the appropriate wait list. 
In that case, just use +- * closure_wait(). It will return either true or false, depending on whether the +- * closure was already on a wait list or not - a closure can only be on one wait +- * list at a time. +- * +- * Parents: +- * +- * closure_init() takes two arguments - it takes the closure to initialize, and +- * a (possibly null) parent. +- * +- * If parent is non null, the new closure will have a refcount for its lifetime; +- * a closure is considered to be "finished" when its refcount hits 0 and the +- * function to run is null. Hence +- * +- * continue_at(cl, NULL, NULL); +- * +- * returns up the (spaghetti) stack of closures, precisely like normal return +- * returns up the C stack. continue_at() with non null fn is better thought of +- * as doing a tail call. +- * +- * All this implies that a closure should typically be embedded in a particular +- * struct (which its refcount will normally control the lifetime of), and that +- * struct can very much be thought of as a stack frame. +- */ +- +-struct closure; +-struct closure_syncer; +-typedef void (closure_fn) (struct closure *); +-extern struct dentry *bcache_debug; +- +-struct closure_waitlist { +- struct llist_head list; +-}; +- +-enum closure_state { +- /* +- * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by +- * the thread that owns the closure, and cleared by the thread that's +- * waking up the closure. +- * +- * The rest are for debugging and don't affect behaviour: +- * +- * CLOSURE_RUNNING: Set when a closure is running (i.e. by +- * closure_init() and when closure_put() runs then next function), and +- * must be cleared before remaining hits 0. Primarily to help guard +- * against incorrect usage and accidentally transferring references. +- * continue_at() and closure_return() clear it for you, if you're doing +- * something unusual you can use closure_set_dead() which also helps +- * annotate where references are being transferred. +- */ +- +- CLOSURE_BITS_START = (1U << 26), +- CLOSURE_DESTRUCTOR = (1U << 26), +- CLOSURE_WAITING = (1U << 28), +- CLOSURE_RUNNING = (1U << 30), +-}; +- +-#define CLOSURE_GUARD_MASK \ +- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) +- +-#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) +-#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) +- +-struct closure { +- union { +- struct { +- struct workqueue_struct *wq; +- struct closure_syncer *s; +- struct llist_node list; +- closure_fn *fn; +- }; +- struct work_struct work; +- }; +- +- struct closure *parent; +- +- atomic_t remaining; +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +-#define CLOSURE_MAGIC_DEAD 0xc054dead +-#define CLOSURE_MAGIC_ALIVE 0xc054a11e +- +- unsigned int magic; +- struct list_head all; +- unsigned long ip; +- unsigned long waiting_on; +-#endif +-}; +- +-void closure_sub(struct closure *cl, int v); +-void closure_put(struct closure *cl); +-void __closure_wake_up(struct closure_waitlist *list); +-bool closure_wait(struct closure_waitlist *list, struct closure *cl); +-void __closure_sync(struct closure *cl); +- +-/** +- * closure_sync - sleep until a closure a closure has nothing left to wait on +- * +- * Sleeps until the refcount hits 1 - the thread that's running the closure owns +- * the last refcount. 
+- */ +-static inline void closure_sync(struct closure *cl) +-{ +- if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) +- __closure_sync(cl); +-} +- +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- +-void closure_debug_init(void); +-void closure_debug_create(struct closure *cl); +-void closure_debug_destroy(struct closure *cl); +- +-#else +- +-static inline void closure_debug_init(void) {} +-static inline void closure_debug_create(struct closure *cl) {} +-static inline void closure_debug_destroy(struct closure *cl) {} +- +-#endif +- +-static inline void closure_set_ip(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->ip = _THIS_IP_; +-#endif +-} +- +-static inline void closure_set_ret_ip(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->ip = _RET_IP_; +-#endif +-} +- +-static inline void closure_set_waiting(struct closure *cl, unsigned long f) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- cl->waiting_on = f; +-#endif +-} +- +-static inline void closure_set_stopped(struct closure *cl) +-{ +- atomic_sub(CLOSURE_RUNNING, &cl->remaining); +-} +- +-static inline void set_closure_fn(struct closure *cl, closure_fn *fn, +- struct workqueue_struct *wq) +-{ +- closure_set_ip(cl); +- cl->fn = fn; +- cl->wq = wq; +- /* between atomic_dec() in closure_put() */ +- smp_mb__before_atomic(); +-} +- +-static inline void closure_queue(struct closure *cl) +-{ +- struct workqueue_struct *wq = cl->wq; +- /** +- * Changes made to closure, work_struct, or a couple of other structs +- * may cause work.func not pointing to the right location. +- */ +- BUILD_BUG_ON(offsetof(struct closure, fn) +- != offsetof(struct work_struct, func)); +- if (wq) { +- INIT_WORK(&cl->work, cl->work.func); +- BUG_ON(!queue_work(wq, &cl->work)); +- } else +- cl->fn(cl); +-} +- +-/** +- * closure_get - increment a closure's refcount +- */ +-static inline void closure_get(struct closure *cl) +-{ +-#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +- BUG_ON((atomic_inc_return(&cl->remaining) & +- CLOSURE_REMAINING_MASK) <= 1); +-#else +- atomic_inc(&cl->remaining); +-#endif +-} +- +-/** +- * closure_init - Initialize a closure, setting the refcount to 1 +- * @cl: closure to initialize +- * @parent: parent of the new closure. cl will take a refcount on it for its +- * lifetime; may be NULL. +- */ +-static inline void closure_init(struct closure *cl, struct closure *parent) +-{ +- memset(cl, 0, sizeof(struct closure)); +- cl->parent = parent; +- if (parent) +- closure_get(parent); +- +- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +- +- closure_debug_create(cl); +- closure_set_ip(cl); +-} +- +-static inline void closure_init_stack(struct closure *cl) +-{ +- memset(cl, 0, sizeof(struct closure)); +- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); +-} +- +-/** +- * closure_wake_up - wake up all closures on a wait list, +- * with memory barrier +- */ +-static inline void closure_wake_up(struct closure_waitlist *list) +-{ +- /* Memory barrier for the wait list */ +- smp_mb(); +- __closure_wake_up(list); +-} +- +-/** +- * continue_at - jump to another function with barrier +- * +- * After @cl is no longer waiting on anything (i.e. all outstanding refs have +- * been dropped with closure_put()), it will resume execution at @fn running out +- * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). 
+- * +- * This is because after calling continue_at() you no longer have a ref on @cl, +- * and whatever @cl owns may be freed out from under you - a running closure fn +- * has a ref on its own closure which continue_at() drops. +- * +- * Note you are expected to immediately return after using this macro. +- */ +-#define continue_at(_cl, _fn, _wq) \ +-do { \ +- set_closure_fn(_cl, _fn, _wq); \ +- closure_sub(_cl, CLOSURE_RUNNING + 1); \ +-} while (0) +- +-/** +- * closure_return - finish execution of a closure +- * +- * This is used to indicate that @cl is finished: when all outstanding refs on +- * @cl have been dropped @cl's ref on its parent closure (as passed to +- * closure_init()) will be dropped, if one was specified - thus this can be +- * thought of as returning to the parent closure. +- */ +-#define closure_return(_cl) continue_at((_cl), NULL, NULL) +- +-/** +- * continue_at_nobarrier - jump to another function without barrier +- * +- * Causes @fn to be executed out of @cl, in @wq context (or called directly if +- * @wq is NULL). +- * +- * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, +- * thus it's not safe to touch anything protected by @cl after a +- * continue_at_nobarrier(). +- */ +-#define continue_at_nobarrier(_cl, _fn, _wq) \ +-do { \ +- set_closure_fn(_cl, _fn, _wq); \ +- closure_queue(_cl); \ +-} while (0) +- +-/** +- * closure_return_with_destructor - finish execution of a closure, +- * with destructor +- * +- * Works like closure_return(), except @destructor will be called when all +- * outstanding refs on @cl have been dropped; @destructor may be used to safely +- * free the memory occupied by @cl, and it is called with the ref on the parent +- * closure still held - so @destructor could safely return an item to a +- * freelist protected by @cl's parent. +- */ +-#define closure_return_with_destructor(_cl, _destructor) \ +-do { \ +- set_closure_fn(_cl, _destructor, NULL); \ +- closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ +-} while (0) +- +-/** +- * closure_call - execute @fn out of a new, uninitialized closure +- * +- * Typically used when running out of one closure, and we want to run @fn +- * asynchronously out of a new closure - @parent will then wait for @cl to +- * finish. 
+- */ +-static inline void closure_call(struct closure *cl, closure_fn fn, +- struct workqueue_struct *wq, +- struct closure *parent) +-{ +- closure_init(cl, parent); +- continue_at_nobarrier(cl, fn, wq); +-} +- +-#endif /* _LINUX_CLOSURE_H */ +diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c +index 1bbdc410ee3c..3b9e991ea475 100644 +--- a/drivers/md/bcache/super.c ++++ b/drivers/md/bcache/super.c +@@ -2925,7 +2925,6 @@ static int __init bcache_init(void) + goto err; + + bch_debug_init(); +- closure_debug_init(); + + bcache_is_reboot = false; + +diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h +index c029f7443190..59093f9f1793 100644 +--- a/drivers/md/bcache/util.h ++++ b/drivers/md/bcache/util.h +@@ -4,6 +4,7 @@ + #define _BCACHE_UTIL_H + + #include ++#include + #include + #include + #include +@@ -13,8 +14,6 @@ + #include + #include + +-#include "closure.h" +- + #define PAGE_SECTORS (PAGE_SIZE / 512) + + struct closure; +diff --git a/fs/Kconfig b/fs/Kconfig +index aa4c12282301..88082e3663cb 100644 +--- a/fs/Kconfig ++++ b/fs/Kconfig +@@ -40,6 +40,7 @@ source "fs/ocfs2/Kconfig" + source "fs/btrfs/Kconfig" + source "fs/nilfs2/Kconfig" + source "fs/f2fs/Kconfig" ++source "fs/bcachefs/Kconfig" + source "fs/zonefs/Kconfig" + + config FS_DAX +diff --git a/fs/Makefile b/fs/Makefile +index 1c7b0e3f6daa..8afa8e3bc14f 100644 +--- a/fs/Makefile ++++ b/fs/Makefile +@@ -130,6 +130,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ + obj-$(CONFIG_BTRFS_FS) += btrfs/ + obj-$(CONFIG_GFS2_FS) += gfs2/ + obj-$(CONFIG_F2FS_FS) += f2fs/ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ + obj-$(CONFIG_CEPH_FS) += ceph/ + obj-$(CONFIG_PSTORE) += pstore/ + obj-$(CONFIG_EFIVAR_FS) += efivarfs/ +diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig +new file mode 100644 +index 000000000000..5594af719b2a +--- /dev/null ++++ b/fs/bcachefs/Kconfig +@@ -0,0 +1,50 @@ ++ ++config BCACHEFS_FS ++ tristate "bcachefs filesystem support" ++ depends on BLOCK ++ select EXPORTFS ++ select CLOSURES ++ select LIBCRC32C ++ select CRC64 ++ select FS_POSIX_ACL ++ select LZ4_COMPRESS ++ select LZ4_DECOMPRESS ++ select ZLIB_DEFLATE ++ select ZLIB_INFLATE ++ select ZSTD_COMPRESS ++ select ZSTD_DECOMPRESS ++ select CRYPTO_SHA256 ++ select CRYPTO_CHACHA20 ++ select CRYPTO_POLY1305 ++ select KEYS ++ select SIXLOCKS ++ select RAID6_PQ ++ select XOR_BLOCKS ++ help ++ The bcachefs filesystem - a modern, copy on write filesystem, with ++ support for multiple devices, compression, checksumming, etc. ++ ++config BCACHEFS_QUOTA ++ bool "bcachefs quota support" ++ depends on BCACHEFS_FS ++ select QUOTACTL ++ ++config BCACHEFS_POSIX_ACL ++ bool "bcachefs POSIX ACL support" ++ depends on BCACHEFS_FS ++ select FS_POSIX_ACL ++ ++config BCACHEFS_DEBUG ++ bool "bcachefs debugging" ++ depends on BCACHEFS_FS ++ help ++ Enables many extra debugging checks and assertions. ++ ++ The resulting code will be significantly slower than normal; you ++ probably shouldn't select this option unless you're a developer. 
++ ++config BCACHEFS_TESTS ++ bool "bcachefs unit and performance tests" ++ depends on BCACHEFS_FS ++ help ++ Include some unit and performance tests for the core btree code +diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile +new file mode 100644 +index 000000000000..d85ced62c0dd +--- /dev/null ++++ b/fs/bcachefs/Makefile +@@ -0,0 +1,59 @@ ++ ++obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o ++ ++bcachefs-y := \ ++ acl.o \ ++ alloc_background.o \ ++ alloc_foreground.o \ ++ bkey.o \ ++ bkey_methods.o \ ++ bkey_sort.o \ ++ bset.o \ ++ btree_cache.o \ ++ btree_gc.o \ ++ btree_io.o \ ++ btree_iter.o \ ++ btree_key_cache.o \ ++ btree_update_interior.o \ ++ btree_update_leaf.o \ ++ buckets.o \ ++ chardev.o \ ++ checksum.o \ ++ clock.o \ ++ compress.o \ ++ debug.o \ ++ dirent.o \ ++ disk_groups.o \ ++ ec.o \ ++ error.o \ ++ extents.o \ ++ extent_update.o \ ++ fs.o \ ++ fs-common.o \ ++ fs-ioctl.o \ ++ fs-io.o \ ++ fsck.o \ ++ inode.o \ ++ io.o \ ++ journal.o \ ++ journal_io.o \ ++ journal_reclaim.o \ ++ journal_seq_blacklist.o \ ++ keylist.o \ ++ migrate.o \ ++ move.o \ ++ movinggc.o \ ++ opts.o \ ++ quota.o \ ++ rebalance.o \ ++ recovery.o \ ++ reflink.o \ ++ replicas.o \ ++ siphash.o \ ++ super.o \ ++ super-io.o \ ++ sysfs.o \ ++ tests.o \ ++ trace.o \ ++ util.o \ ++ xattr.o +diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c +new file mode 100644 +index 000000000000..76c98ddbf628 +--- /dev/null ++++ b/fs/bcachefs/acl.c +@@ -0,0 +1,388 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#include "bcachefs.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "acl.h" ++#include "fs.h" ++#include "xattr.h" ++ ++static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) ++{ ++ return sizeof(bch_acl_header) + ++ sizeof(bch_acl_entry_short) * nr_short + ++ sizeof(bch_acl_entry) * nr_long; ++} ++ ++static inline int acl_to_xattr_type(int type) ++{ ++ switch (type) { ++ case ACL_TYPE_ACCESS: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; ++ case ACL_TYPE_DEFAULT: ++ return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Convert from filesystem to in-memory representation. 
++ */ ++static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size) ++{ ++ const void *p, *end = value + size; ++ struct posix_acl *acl; ++ struct posix_acl_entry *out; ++ unsigned count = 0; ++ ++ if (!value) ++ return NULL; ++ if (size < sizeof(bch_acl_header)) ++ goto invalid; ++ if (((bch_acl_header *)value)->a_version != ++ cpu_to_le32(BCH_ACL_VERSION)) ++ goto invalid; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *entry = p; ++ ++ if (p + sizeof(bch_acl_entry_short) > end) ++ goto invalid; ++ ++ switch (le16_to_cpu(entry->e_tag)) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ case ACL_GROUP: ++ p += sizeof(bch_acl_entry); ++ break; ++ default: ++ goto invalid; ++ } ++ ++ count++; ++ } ++ ++ if (p > end) ++ goto invalid; ++ ++ if (!count) ++ return NULL; ++ ++ acl = posix_acl_alloc(count, GFP_KERNEL); ++ if (!acl) ++ return ERR_PTR(-ENOMEM); ++ ++ out = acl->a_entries; ++ ++ p = value + sizeof(bch_acl_header); ++ while (p < end) { ++ const bch_acl_entry *in = p; ++ ++ out->e_tag = le16_to_cpu(in->e_tag); ++ out->e_perm = le16_to_cpu(in->e_perm); ++ ++ switch (out->e_tag) { ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ p += sizeof(bch_acl_entry_short); ++ break; ++ case ACL_USER: ++ out->e_uid = make_kuid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ out->e_gid = make_kgid(&init_user_ns, ++ le32_to_cpu(in->e_id)); ++ p += sizeof(bch_acl_entry); ++ break; ++ } ++ ++ out++; ++ } ++ ++ BUG_ON(out != acl->a_entries + acl->a_count); ++ ++ return acl; ++invalid: ++ pr_err("invalid acl entry"); ++ return ERR_PTR(-EINVAL); ++} ++ ++#define acl_for_each_entry(acl, acl_e) \ ++ for (acl_e = acl->a_entries; \ ++ acl_e < acl->a_entries + acl->a_count; \ ++ acl_e++) ++ ++/* ++ * Convert from in-memory to filesystem representation. 
++ */ ++static struct bkey_i_xattr * ++bch2_acl_to_xattr(struct btree_trans *trans, ++ const struct posix_acl *acl, ++ int type) ++{ ++ struct bkey_i_xattr *xattr; ++ bch_acl_header *acl_header; ++ const struct posix_acl_entry *acl_e; ++ void *outptr; ++ unsigned nr_short = 0, nr_long = 0, acl_len, u64s; ++ ++ acl_for_each_entry(acl, acl_e) { ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ case ACL_GROUP: ++ nr_long++; ++ break; ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ nr_short++; ++ break; ++ default: ++ return ERR_PTR(-EINVAL); ++ } ++ } ++ ++ acl_len = bch2_acl_size(nr_short, nr_long); ++ u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); ++ ++ if (u64s > U8_MAX) ++ return ERR_PTR(-E2BIG); ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return xattr; ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = acl_to_xattr_type(type); ++ xattr->v.x_name_len = 0, ++ xattr->v.x_val_len = cpu_to_le16(acl_len); ++ ++ acl_header = xattr_val(&xattr->v); ++ acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); ++ ++ outptr = (void *) acl_header + sizeof(*acl_header); ++ ++ acl_for_each_entry(acl, acl_e) { ++ bch_acl_entry *entry = outptr; ++ ++ entry->e_tag = cpu_to_le16(acl_e->e_tag); ++ entry->e_perm = cpu_to_le16(acl_e->e_perm); ++ switch (acl_e->e_tag) { ++ case ACL_USER: ++ entry->e_id = cpu_to_le32( ++ from_kuid(&init_user_ns, acl_e->e_uid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ case ACL_GROUP: ++ entry->e_id = cpu_to_le32( ++ from_kgid(&init_user_ns, acl_e->e_gid)); ++ outptr += sizeof(bch_acl_entry); ++ break; ++ ++ case ACL_USER_OBJ: ++ case ACL_GROUP_OBJ: ++ case ACL_MASK: ++ case ACL_OTHER: ++ outptr += sizeof(bch_acl_entry_short); ++ break; ++ } ++ } ++ ++ BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); ++ ++ return xattr; ++} ++ ++struct posix_acl *bch2_get_acl(struct inode *vinode, int type) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ struct posix_acl *acl = NULL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(acl_to_xattr_type(type), "", 0), ++ 0); ++ if (IS_ERR(iter)) { ++ if (PTR_ERR(iter) == -EINTR) ++ goto retry; ++ ++ if (PTR_ERR(iter) != -ENOENT) ++ acl = ERR_CAST(iter); ++ goto out; ++ } ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ ++ if (!IS_ERR(acl)) ++ set_cached_acl(&inode->v, type, acl); ++out: ++ bch2_trans_exit(&trans); ++ return acl; ++} ++ ++int bch2_set_acl_trans(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ const struct bch_hash_info *hash_info, ++ struct posix_acl *acl, int type) ++{ ++ int ret; ++ ++ if (type == ACL_TYPE_DEFAULT && ++ !S_ISDIR(inode_u->bi_mode)) ++ return acl ? 
-EACCES : 0; ++ ++ if (acl) { ++ struct bkey_i_xattr *xattr = ++ bch2_acl_to_xattr(trans, acl, type); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, ++ inode_u->bi_inum, &xattr->k_i, 0); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(acl_to_xattr_type(type), "", 0); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info, ++ inode_u->bi_inum, &search); ++ } ++ ++ return ret == -ENOENT ? 0 : ret; ++} ++ ++int bch2_set_acl(struct inode *vinode, struct posix_acl *_acl, int type) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl; ++ umode_t mode; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ acl = _acl; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto btree_err; ++ ++ mode = inode_u.bi_mode; ++ ++ if (type == ACL_TYPE_ACCESS) { ++ ret = posix_acl_update_mode(&inode->v, &mode, &acl); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_set_acl_trans(&trans, &inode_u, ++ &inode->ei_str_hash, ++ acl, type); ++ if (ret) ++ goto btree_err; ++ ++ inode_u.bi_ctime = bch2_current_time(c); ++ inode_u.bi_mode = mode; ++ ++ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++btree_err: ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err; ++ ++ bch2_inode_update_after_write(c, inode, &inode_u, ++ ATTR_CTIME|ATTR_MODE); ++ ++ set_cached_acl(&inode->v, type, acl); ++err: ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++int bch2_acl_chmod(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ struct bkey_i_xattr *new; ++ struct posix_acl *acl; ++ int ret = 0; ++ ++ iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0), ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter) != -ENOENT ? 
PTR_ERR(iter) : 0; ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ++ acl = bch2_acl_from_disk(xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++ if (IS_ERR_OR_NULL(acl)) ++ return PTR_ERR(acl); ++ ++ ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode); ++ if (ret) ++ goto err; ++ ++ new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); ++ if (IS_ERR(new)) { ++ ret = PTR_ERR(new); ++ goto err; ++ } ++ ++ new->k.p = iter->pos; ++ bch2_trans_update(trans, iter, &new->k_i, 0); ++ *new_acl = acl; ++ acl = NULL; ++err: ++ kfree(acl); ++ return ret; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ +diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h +new file mode 100644 +index 000000000000..cb62d502a7ff +--- /dev/null ++++ b/fs/bcachefs/acl.h +@@ -0,0 +1,59 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ACL_H ++#define _BCACHEFS_ACL_H ++ ++struct bch_inode_unpacked; ++struct bch_hash_info; ++struct bch_inode_info; ++struct posix_acl; ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ++#define BCH_ACL_VERSION 0x0001 ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++ __le32 e_id; ++} bch_acl_entry; ++ ++typedef struct { ++ __le16 e_tag; ++ __le16 e_perm; ++} bch_acl_entry_short; ++ ++typedef struct { ++ __le32 a_version; ++} bch_acl_header; ++ ++struct posix_acl *bch2_get_acl(struct inode *, int); ++ ++int bch2_set_acl_trans(struct btree_trans *, ++ struct bch_inode_unpacked *, ++ const struct bch_hash_info *, ++ struct posix_acl *, int); ++int bch2_set_acl(struct inode *, struct posix_acl *, int); ++int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *, ++ umode_t, struct posix_acl **); ++ ++#else ++ ++static inline int bch2_set_acl_trans(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ const struct bch_hash_info *hash_info, ++ struct posix_acl *acl, int type) ++{ ++ return 0; ++} ++ ++static inline int bch2_acl_chmod(struct btree_trans *trans, ++ struct bch_inode_info *inode, ++ umode_t mode, ++ struct posix_acl **new_acl) ++{ ++ return 0; ++} ++ ++#endif /* CONFIG_BCACHEFS_POSIX_ACL */ ++ ++#endif /* _BCACHEFS_ACL_H */ +diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c +new file mode 100644 +index 000000000000..97508de9f721 +--- /dev/null ++++ b/fs/bcachefs/alloc_background.c +@@ -0,0 +1,1477 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "recovery.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static const char * const bch2_alloc_field_names[] = { ++#define x(name, bytes) #name, ++ BCH_ALLOC_FIELDS() ++#undef x ++ NULL ++}; ++ ++static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); ++ ++/* Ratelimiting/PD controllers */ ++ ++static void pd_controllers_update(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(to_delayed_work(work), ++ struct bch_fs, ++ pd_controllers_update); ++ struct bch_dev *ca; ++ s64 free = 0, fragmented = 0; ++ unsigned i; ++ ++ for_each_member_device(ca, c, i) { ++ struct bch_dev_usage stats = bch2_dev_usage_read(ca); ++ ++ free += bucket_to_sector(ca, ++ __dev_buckets_free(ca, stats)) << 9; ++ /* ++ * Bytes of internal 
fragmentation, which can be ++ * reclaimed by copy GC ++ */ ++ fragmented += max_t(s64, 0, (bucket_to_sector(ca, ++ stats.buckets[BCH_DATA_user] + ++ stats.buckets[BCH_DATA_cached]) - ++ (stats.sectors[BCH_DATA_user] + ++ stats.sectors[BCH_DATA_cached])) << 9); ++ } ++ ++ bch2_pd_controller_update(&c->copygc_pd, free, fragmented, -1); ++ schedule_delayed_work(&c->pd_controllers_update, ++ c->pd_controllers_update_seconds * HZ); ++} ++ ++/* Persistent alloc info: */ ++ ++static inline u64 get_alloc_field(const struct bch_alloc *a, ++ const void **p, unsigned field) ++{ ++ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; ++ u64 v; ++ ++ if (!(a->fields & (1 << field))) ++ return 0; ++ ++ switch (bytes) { ++ case 1: ++ v = *((const u8 *) *p); ++ break; ++ case 2: ++ v = le16_to_cpup(*p); ++ break; ++ case 4: ++ v = le32_to_cpup(*p); ++ break; ++ case 8: ++ v = le64_to_cpup(*p); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++ return v; ++} ++ ++static inline void put_alloc_field(struct bkey_i_alloc *a, void **p, ++ unsigned field, u64 v) ++{ ++ unsigned bytes = BCH_ALLOC_FIELD_BYTES[field]; ++ ++ if (!v) ++ return; ++ ++ a->v.fields |= 1 << field; ++ ++ switch (bytes) { ++ case 1: ++ *((u8 *) *p) = v; ++ break; ++ case 2: ++ *((__le16 *) *p) = cpu_to_le16(v); ++ break; ++ case 4: ++ *((__le32 *) *p) = cpu_to_le32(v); ++ break; ++ case 8: ++ *((__le64 *) *p) = cpu_to_le64(v); ++ break; ++ default: ++ BUG(); ++ } ++ ++ *p += bytes; ++} ++ ++struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) ++{ ++ struct bkey_alloc_unpacked ret = { .gen = 0 }; ++ ++ if (k.k->type == KEY_TYPE_alloc) { ++ const struct bch_alloc *a = bkey_s_c_to_alloc(k).v; ++ const void *d = a->data; ++ unsigned idx = 0; ++ ++ ret.gen = a->gen; ++ ++#define x(_name, _bits) ret._name = get_alloc_field(a, &d, idx++); ++ BCH_ALLOC_FIELDS() ++#undef x ++ } ++ return ret; ++} ++ ++void bch2_alloc_pack(struct bkey_i_alloc *dst, ++ const struct bkey_alloc_unpacked src) ++{ ++ unsigned idx = 0; ++ void *d = dst->v.data; ++ unsigned bytes; ++ ++ dst->v.fields = 0; ++ dst->v.gen = src.gen; ++ ++#define x(_name, _bits) put_alloc_field(dst, &d, idx++, src._name); ++ BCH_ALLOC_FIELDS() ++#undef x ++ ++ bytes = (void *) d - (void *) &dst->v; ++ set_bkey_val_bytes(&dst->k, bytes); ++ memset_u64s_tail(&dst->v, 0, bytes); ++} ++ ++static unsigned bch_alloc_val_u64s(const struct bch_alloc *a) ++{ ++ unsigned i, bytes = offsetof(struct bch_alloc, data); ++ ++ for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_FIELD_BYTES); i++) ++ if (a->fields & (1 << i)) ++ bytes += BCH_ALLOC_FIELD_BYTES[i]; ++ ++ return DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++const char *bch2_alloc_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ ++ if (k.k->p.inode >= c->sb.nr_devices || ++ !c->devs[k.k->p.inode]) ++ return "invalid device"; ++ ++ /* allow for unknown fields */ ++ if (bkey_val_u64s(a.k) < bch_alloc_val_u64s(a.v)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); ++ const void *d = a.v->data; ++ unsigned i; ++ ++ pr_buf(out, "gen %u", a.v->gen); ++ ++ for (i = 0; i < BCH_ALLOC_FIELD_NR; i++) ++ if (a.v->fields & (1 << i)) ++ pr_buf(out, " %s %llu", ++ bch2_alloc_field_names[i], ++ get_alloc_field(a.v, &d, i)); ++} ++ ++static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k) ++{ ++ struct bch_dev 
*ca; ++ struct bucket *g; ++ struct bkey_alloc_unpacked u; ++ ++ if (level || k.k->type != KEY_TYPE_alloc) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, k.k->p.inode); ++ g = __bucket(ca, k.k->p.offset, 0); ++ u = bch2_alloc_unpack(k); ++ ++ g->_mark.gen = u.gen; ++ g->_mark.data_type = u.data_type; ++ g->_mark.dirty_sectors = u.dirty_sectors; ++ g->_mark.cached_sectors = u.cached_sectors; ++ g->io_time[READ] = u.read_time; ++ g->io_time[WRITE] = u.write_time; ++ g->oldest_gen = u.oldest_gen; ++ g->gen_valid = 1; ++ ++ return 0; ++} ++ ++int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ down_read(&c->gc_lock); ++ ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, ++ NULL, bch2_alloc_read_fn); ++ up_read(&c->gc_lock); ++ ++ if (ret) { ++ bch_err(c, "error reading alloc info: %i", ret); ++ return ret; ++ } ++ ++ percpu_down_write(&c->mark_lock); ++ bch2_dev_usage_from_buckets(c); ++ percpu_up_write(&c->mark_lock); ++ ++ mutex_lock(&c->bucket_clock[READ].lock); ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ bch2_recalc_oldest_io(c, ca, READ); ++ up_read(&ca->bucket_lock); ++ } ++ mutex_unlock(&c->bucket_clock[READ].lock); ++ ++ mutex_lock(&c->bucket_clock[WRITE].lock); ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ bch2_recalc_oldest_io(c, ca, WRITE); ++ up_read(&ca->bucket_lock); ++ } ++ mutex_unlock(&c->bucket_clock[WRITE].lock); ++ ++ return 0; ++} ++ ++static int bch2_alloc_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ struct bch_dev *ca; ++ struct bucket_array *ba; ++ struct bucket *g; ++ struct bucket_mark m; ++ struct bkey_alloc_unpacked old_u, new_u; ++ __BKEY_PADDED(k, 8) alloc_key; /* hack: */ ++ struct bkey_i_alloc *a; ++ int ret; ++retry: ++ bch2_trans_begin(trans); ++ ++ ret = bch2_btree_key_cache_flush(trans, ++ BTREE_ID_ALLOC, iter->pos); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ old_u = bch2_alloc_unpack(k); ++ ++ percpu_down_read(&c->mark_lock); ++ ca = bch_dev_bkey_exists(c, iter->pos.inode); ++ ba = bucket_array(ca); ++ ++ g = &ba->b[iter->pos.offset]; ++ m = READ_ONCE(g->mark); ++ new_u = alloc_mem_to_key(g, m); ++ percpu_up_read(&c->mark_lock); ++ ++ if (!bkey_alloc_unpacked_cmp(old_u, new_u)) ++ return 0; ++ ++ a = bkey_alloc_init(&alloc_key.k); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, new_u); ++ ++ bch2_trans_update(trans, iter, &a->k_i, ++ BTREE_TRIGGER_NORUN); ++ ret = bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ flags); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ return ret; ++} ++ ++int bch2_dev_alloc_write(struct bch_fs *c, struct bch_dev *ca, unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ u64 first_bucket, nbuckets; ++ int ret = 0; ++ ++ percpu_down_read(&c->mark_lock); ++ first_bucket = bucket_array(ca)->first_bucket; ++ nbuckets = bucket_array(ca)->nbuckets; ++ percpu_up_read(&c->mark_lock); ++ ++ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, first_bucket), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ while (iter->pos.offset < nbuckets) { ++ bch2_trans_cond_resched(&trans); ++ ++ ret = bch2_alloc_write_key(&trans, iter, flags); 
++ if (ret) ++ break; ++ bch2_btree_iter_next_slot(iter); ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++int bch2_alloc_write(struct bch_fs *c, unsigned flags) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret = 0; ++ ++ for_each_rw_member(ca, c, i) { ++ bch2_dev_alloc_write(c, ca, flags); ++ if (ret) { ++ percpu_ref_put(&ca->io_ref); ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++/* Bucket IO clocks: */ ++ ++static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket *g; ++ u16 max_last_io = 0; ++ unsigned i; ++ ++ lockdep_assert_held(&c->bucket_clock[rw].lock); ++ ++ /* Recalculate max_last_io for this device: */ ++ for_each_bucket(g, buckets) ++ max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); ++ ++ ca->max_last_bucket_io[rw] = max_last_io; ++ ++ /* Recalculate global max_last_io: */ ++ max_last_io = 0; ++ ++ for_each_member_device(ca, c, i) ++ max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); ++ ++ clock->max_last_io = max_last_io; ++} ++ ++static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ struct bucket_array *buckets; ++ struct bch_dev *ca; ++ struct bucket *g; ++ unsigned i; ++ ++ trace_rescale_prios(c); ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->io_time[rw] = clock->hand - ++ bucket_last_io(c, g, rw) / 2; ++ ++ bch2_recalc_oldest_io(c, ca, rw); ++ ++ up_read(&ca->bucket_lock); ++ } ++} ++ ++static inline u64 bucket_clock_freq(u64 capacity) ++{ ++ return max(capacity >> 10, 2028ULL); ++} ++ ++static void bch2_inc_clock_hand(struct io_timer *timer) ++{ ++ struct bucket_clock *clock = container_of(timer, ++ struct bucket_clock, rescale); ++ struct bch_fs *c = container_of(clock, ++ struct bch_fs, bucket_clock[clock->rw]); ++ struct bch_dev *ca; ++ u64 capacity; ++ unsigned i; ++ ++ mutex_lock(&clock->lock); ++ ++ /* if clock cannot be advanced more, rescale prio */ ++ if (clock->max_last_io >= U16_MAX - 2) ++ bch2_rescale_bucket_io_times(c, clock->rw); ++ ++ BUG_ON(clock->max_last_io >= U16_MAX - 2); ++ ++ for_each_member_device(ca, c, i) ++ ca->max_last_bucket_io[clock->rw]++; ++ clock->max_last_io++; ++ clock->hand++; ++ ++ mutex_unlock(&clock->lock); ++ ++ capacity = READ_ONCE(c->capacity); ++ ++ if (!capacity) ++ return; ++ ++ /* ++ * we only increment when 0.1% of the filesystem capacity has been read ++ * or written too, this determines if it's time ++ * ++ * XXX: we shouldn't really be going off of the capacity of devices in ++ * RW mode (that will be 0 when we're RO, yet we can still service ++ * reads) ++ */ ++ timer->expire += bucket_clock_freq(capacity); ++ ++ bch2_io_timer_add(&c->io_clock[clock->rw], timer); ++} ++ ++static void bch2_bucket_clock_init(struct bch_fs *c, int rw) ++{ ++ struct bucket_clock *clock = &c->bucket_clock[rw]; ++ ++ clock->hand = 1; ++ clock->rw = rw; ++ clock->rescale.fn = bch2_inc_clock_hand; ++ clock->rescale.expire = bucket_clock_freq(c->capacity); ++ mutex_init(&clock->lock); ++} ++ ++int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, ++ size_t bucket_nr, int rw) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, dev); ++ struct btree_iter *iter; ++ struct bucket *g; ++ struct bkey_i_alloc *a; ++ struct bkey_alloc_unpacked u; ++ u16 
*time; ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, bucket_nr); ++ u = alloc_mem_to_key(g, READ_ONCE(g->mark)); ++ percpu_up_read(&c->mark_lock); ++ ++ bkey_alloc_init(&a->k_i); ++ a->k.p = iter->pos; ++ ++ time = rw == READ ? &u.read_time : &u.write_time; ++ if (*time == c->bucket_clock[rw].hand) ++ goto out; ++ ++ *time = c->bucket_clock[rw].hand; ++ ++ bch2_alloc_pack(a, u); ++ ++ ret = bch2_trans_update(trans, iter, &a->k_i, 0) ?: ++ bch2_trans_commit(trans, NULL, NULL, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++/* Background allocator thread: */ ++ ++/* ++ * Scans for buckets to be invalidated, invalidates them, rewrites prios/gens ++ * (marking them as invalidated on disk), then optionally issues discard ++ * commands to the newly free buckets, then puts them on the various freelists. ++ */ ++ ++/** ++ * wait_buckets_available - wait on reclaimable buckets ++ * ++ * If there aren't enough available buckets to fill up free_inc, wait until ++ * there are. ++ */ ++static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned long gc_count = c->gc_count; ++ u64 available; ++ int ret = 0; ++ ++ ca->allocator_state = ALLOCATOR_BLOCKED; ++ closure_wake_up(&c->freelist_wait); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread_should_stop()) { ++ ret = 1; ++ break; ++ } ++ ++ if (gc_count != c->gc_count) ++ ca->inc_gen_really_needs_gc = 0; ++ ++ available = max_t(s64, 0, dev_buckets_available(ca) - ++ ca->inc_gen_really_needs_gc); ++ ++ if (available > fifo_free(&ca->free_inc) || ++ (available && ++ (!fifo_full(&ca->free[RESERVE_BTREE]) || ++ !fifo_full(&ca->free[RESERVE_MOVINGGC])))) ++ break; ++ ++ up_read(&c->gc_lock); ++ schedule(); ++ try_to_freeze(); ++ down_read(&c->gc_lock); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ closure_wake_up(&c->freelist_wait); ++ ++ return ret; ++} ++ ++static bool bch2_can_invalidate_bucket(struct bch_dev *ca, ++ size_t bucket, ++ struct bucket_mark mark) ++{ ++ u8 gc_gen; ++ ++ if (!is_available_bucket(mark)) ++ return false; ++ ++ if (ca->buckets_nouse && ++ test_bit(bucket, ca->buckets_nouse)) ++ return false; ++ ++ gc_gen = bucket_gc_gen(ca, bucket); ++ ++ if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ++ ca->inc_gen_needs_gc++; ++ ++ if (gc_gen >= BUCKET_GC_GEN_MAX) ++ ca->inc_gen_really_needs_gc++; ++ ++ return gc_gen < BUCKET_GC_GEN_MAX; ++} ++ ++/* ++ * Determines what order we're going to reuse buckets, smallest bucket_key() ++ * first. ++ * ++ * ++ * - We take into account the read prio of the bucket, which gives us an ++ * indication of how hot the data is -- we scale the prio so that the prio ++ * farthest from the clock is worth 1/8th of the closest. ++ * ++ * - The number of sectors of cached data in the bucket, which gives us an ++ * indication of the cost in cache misses this eviction will cause. ++ * ++ * - If hotness * sectors used compares equal, we pick the bucket with the ++ * smallest bucket_gc_gen() - since incrementing the same bucket's generation ++ * number repeatedly forces us to run mark and sweep gc to avoid generation ++ * number wraparound. 
++ */ ++ ++static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark m) ++{ ++ unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); ++ unsigned max_last_io = ca->max_last_bucket_io[READ]; ++ ++ /* ++ * Time since last read, scaled to [0, 8) where larger value indicates ++ * more recently read data: ++ */ ++ unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; ++ ++ /* How much we want to keep the data in this bucket: */ ++ unsigned long data_wantness = ++ (hotness + 1) * bucket_sectors_used(m); ++ ++ unsigned long needs_journal_commit = ++ bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); ++ ++ return (data_wantness << 9) | ++ (needs_journal_commit << 8) | ++ (bucket_gc_gen(ca, b) / 16); ++} ++ ++static inline int bucket_alloc_cmp(alloc_heap *h, ++ struct alloc_heap_entry l, ++ struct alloc_heap_entry r) ++{ ++ return cmp_int(l.key, r.key) ?: ++ cmp_int(r.nr, l.nr) ?: ++ cmp_int(l.bucket, r.bucket); ++} ++ ++static inline int bucket_idx_cmp(const void *_l, const void *_r) ++{ ++ const struct alloc_heap_entry *l = _l, *r = _r; ++ ++ return cmp_int(l->bucket, r->bucket); ++} ++ ++static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ struct alloc_heap_entry e = { 0 }; ++ size_t b, i, nr = 0; ++ ++ ca->alloc_heap.used = 0; ++ ++ mutex_lock(&c->bucket_clock[READ].lock); ++ down_read(&ca->bucket_lock); ++ ++ buckets = bucket_array(ca); ++ ++ bch2_recalc_oldest_io(c, ca, READ); ++ ++ /* ++ * Find buckets with lowest read priority, by building a maxheap sorted ++ * by read priority and repeatedly replacing the maximum element until ++ * all buckets have been visited. ++ */ ++ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ unsigned long key = bucket_sort_key(c, ca, b, m); ++ ++ if (!bch2_can_invalidate_bucket(ca, b, m)) ++ continue; ++ ++ if (e.nr && e.bucket + e.nr == b && e.key == key) { ++ e.nr++; ++ } else { ++ if (e.nr) ++ heap_add_or_replace(&ca->alloc_heap, e, ++ -bucket_alloc_cmp, NULL); ++ ++ e = (struct alloc_heap_entry) { ++ .bucket = b, ++ .nr = 1, ++ .key = key, ++ }; ++ } ++ ++ cond_resched(); ++ } ++ ++ if (e.nr) ++ heap_add_or_replace(&ca->alloc_heap, e, ++ -bucket_alloc_cmp, NULL); ++ ++ for (i = 0; i < ca->alloc_heap.used; i++) ++ nr += ca->alloc_heap.data[i].nr; ++ ++ while (nr - ca->alloc_heap.data[0].nr >= ALLOC_SCAN_BATCH(ca)) { ++ nr -= ca->alloc_heap.data[0].nr; ++ heap_pop(&ca->alloc_heap, e, -bucket_alloc_cmp, NULL); ++ } ++ ++ up_read(&ca->bucket_lock); ++ mutex_unlock(&c->bucket_clock[READ].lock); ++} ++ ++static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket_mark m; ++ size_t b, start; ++ ++ if (ca->fifo_last_bucket < ca->mi.first_bucket || ++ ca->fifo_last_bucket >= ca->mi.nbuckets) ++ ca->fifo_last_bucket = ca->mi.first_bucket; ++ ++ start = ca->fifo_last_bucket; ++ ++ do { ++ ca->fifo_last_bucket++; ++ if (ca->fifo_last_bucket == ca->mi.nbuckets) ++ ca->fifo_last_bucket = ca->mi.first_bucket; ++ ++ b = ca->fifo_last_bucket; ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (bch2_can_invalidate_bucket(ca, b, m)) { ++ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; ++ ++ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ if (heap_full(&ca->alloc_heap)) ++ break; ++ } ++ ++ cond_resched(); ++ } while (ca->fifo_last_bucket != start); ++} ++ ++static 
void find_reclaimable_buckets_random(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bucket_array *buckets = bucket_array(ca); ++ struct bucket_mark m; ++ size_t checked, i; ++ ++ for (checked = 0; ++ checked < ca->mi.nbuckets / 2; ++ checked++) { ++ size_t b = bch2_rand_range(ca->mi.nbuckets - ++ ca->mi.first_bucket) + ++ ca->mi.first_bucket; ++ ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (bch2_can_invalidate_bucket(ca, b, m)) { ++ struct alloc_heap_entry e = { .bucket = b, .nr = 1, }; ++ ++ heap_add(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ if (heap_full(&ca->alloc_heap)) ++ break; ++ } ++ ++ cond_resched(); ++ } ++ ++ sort(ca->alloc_heap.data, ++ ca->alloc_heap.used, ++ sizeof(ca->alloc_heap.data[0]), ++ bucket_idx_cmp, NULL); ++ ++ /* remove duplicates: */ ++ for (i = 0; i + 1 < ca->alloc_heap.used; i++) ++ if (ca->alloc_heap.data[i].bucket == ++ ca->alloc_heap.data[i + 1].bucket) ++ ca->alloc_heap.data[i].nr = 0; ++} ++ ++static size_t find_reclaimable_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ size_t i, nr = 0; ++ ++ ca->inc_gen_needs_gc = 0; ++ ++ switch (ca->mi.replacement) { ++ case CACHE_REPLACEMENT_LRU: ++ find_reclaimable_buckets_lru(c, ca); ++ break; ++ case CACHE_REPLACEMENT_FIFO: ++ find_reclaimable_buckets_fifo(c, ca); ++ break; ++ case CACHE_REPLACEMENT_RANDOM: ++ find_reclaimable_buckets_random(c, ca); ++ break; ++ } ++ ++ heap_resort(&ca->alloc_heap, bucket_alloc_cmp, NULL); ++ ++ for (i = 0; i < ca->alloc_heap.used; i++) ++ nr += ca->alloc_heap.data[i].nr; ++ ++ return nr; ++} ++ ++static inline long next_alloc_bucket(struct bch_dev *ca) ++{ ++ struct alloc_heap_entry e, *top = ca->alloc_heap.data; ++ ++ while (ca->alloc_heap.used) { ++ if (top->nr) { ++ size_t b = top->bucket; ++ ++ top->bucket++; ++ top->nr--; ++ return b; ++ } ++ ++ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ } ++ ++ return -1; ++} ++ ++/* ++ * returns sequence number of most recent journal entry that updated this ++ * bucket: ++ */ ++static u64 bucket_journal_seq(struct bch_fs *c, struct bucket_mark m) ++{ ++ if (m.journal_seq_valid) { ++ u64 journal_seq = atomic64_read(&c->journal.seq); ++ u64 bucket_seq = journal_seq; ++ ++ bucket_seq &= ~((u64) U16_MAX); ++ bucket_seq |= m.journal_seq; ++ ++ if (bucket_seq > journal_seq) ++ bucket_seq -= 1 << 16; ++ ++ return bucket_seq; ++ } else { ++ return 0; ++ } ++} ++ ++static int bch2_invalidate_one_bucket2(struct btree_trans *trans, ++ struct bch_dev *ca, ++ struct btree_iter *iter, ++ u64 *journal_seq, unsigned flags) ++{ ++#if 0 ++ __BKEY_PADDED(k, BKEY_ALLOC_VAL_U64s_MAX) alloc_key; ++#else ++ /* hack: */ ++ __BKEY_PADDED(k, 8) alloc_key; ++#endif ++ struct bch_fs *c = trans->c; ++ struct bkey_i_alloc *a; ++ struct bkey_alloc_unpacked u; ++ struct bucket *g; ++ struct bucket_mark m; ++ bool invalidating_cached_data; ++ size_t b; ++ int ret = 0; ++ ++ BUG_ON(!ca->alloc_heap.used || ++ !ca->alloc_heap.data[0].nr); ++ b = ca->alloc_heap.data[0].bucket; ++ ++ /* first, put on free_inc and mark as owned by allocator: */ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->freelist_lock); ++ ++ verify_not_on_freelist(c, ca, b); ++ ++ BUG_ON(!fifo_push(&ca->free_inc, b)); ++ ++ g = bucket(ca, b); ++ m = READ_ONCE(g->mark); ++ ++ invalidating_cached_data = m.cached_sectors != 0; ++ ++ /* ++ * If we're not invalidating cached data, we only increment the bucket ++ * gen in memory here, the incremented gen will be updated in the btree ++ * by bch2_trans_mark_pointer(): ++ */ ++ ++ if (!invalidating_cached_data) ++ 
bch2_invalidate_bucket(c, ca, b, &m); ++ else ++ bch2_mark_alloc_bucket(c, ca, b, true, gc_pos_alloc(c, NULL), 0); ++ ++ spin_unlock(&c->freelist_lock); ++ percpu_up_read(&c->mark_lock); ++ ++ if (!invalidating_cached_data) ++ goto out; ++ ++ /* ++ * If the read-only path is trying to shut down, we can't be generating ++ * new btree updates: ++ */ ++ if (test_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags)) { ++ ret = 1; ++ goto out; ++ } ++ ++ BUG_ON(BKEY_ALLOC_VAL_U64s_MAX > 8); ++ ++ bch2_btree_iter_set_pos(iter, POS(ca->dev_idx, b)); ++retry: ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, iter->pos.offset); ++ m = READ_ONCE(g->mark); ++ u = alloc_mem_to_key(g, m); ++ ++ percpu_up_read(&c->mark_lock); ++ ++ invalidating_cached_data = u.cached_sectors != 0; ++ ++ u.gen++; ++ u.data_type = 0; ++ u.dirty_sectors = 0; ++ u.cached_sectors = 0; ++ u.read_time = c->bucket_clock[READ].hand; ++ u.write_time = c->bucket_clock[WRITE].hand; ++ ++ a = bkey_alloc_init(&alloc_key.k); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, u); ++ ++ bch2_trans_update(trans, iter, &a->k_i, ++ BTREE_TRIGGER_BUCKET_INVALIDATE); ++ ++ /* ++ * XXX: ++ * when using deferred btree updates, we have journal reclaim doing ++ * btree updates and thus requiring the allocator to make forward ++ * progress, and here the allocator is requiring space in the journal - ++ * so we need a journal pre-reservation: ++ */ ++ ret = bch2_trans_commit(trans, NULL, ++ invalidating_cached_data ? journal_seq : NULL, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ flags); ++ if (ret == -EINTR) ++ goto retry; ++out: ++ if (!ret) { ++ /* remove from alloc_heap: */ ++ struct alloc_heap_entry e, *top = ca->alloc_heap.data; ++ ++ top->bucket++; ++ top->nr--; ++ ++ if (!top->nr) ++ heap_pop(&ca->alloc_heap, e, bucket_alloc_cmp, NULL); ++ ++ /* ++ * Make sure we flush the last journal entry that updated this ++ * bucket (i.e. deleting the last reference) before writing to ++ * this bucket again: ++ */ ++ *journal_seq = max(*journal_seq, bucket_journal_seq(c, m)); ++ } else { ++ size_t b2; ++ ++ /* remove from free_inc: */ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->freelist_lock); ++ ++ bch2_mark_alloc_bucket(c, ca, b, false, ++ gc_pos_alloc(c, NULL), 0); ++ ++ BUG_ON(!fifo_pop_back(&ca->free_inc, b2)); ++ BUG_ON(b != b2); ++ ++ spin_unlock(&c->freelist_lock); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ return ret < 0 ? ret : 0; ++} ++ ++/* ++ * Pull buckets off ca->alloc_heap, invalidate them, move them to ca->free_inc: ++ */ ++static int bch2_invalidate_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ u64 journal_seq = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, 0), ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ++ /* Only use nowait if we've already invalidated at least one bucket: */ ++ while (!ret && ++ !fifo_full(&ca->free_inc) && ++ ca->alloc_heap.used) ++ ret = bch2_invalidate_one_bucket2(&trans, ca, iter, &journal_seq, ++ BTREE_INSERT_GC_LOCK_HELD| ++ (!fifo_empty(&ca->free_inc) ++ ? 
BTREE_INSERT_NOWAIT : 0)); ++ ++ bch2_trans_exit(&trans); ++ ++ /* If we used NOWAIT, don't return the error: */ ++ if (!fifo_empty(&ca->free_inc)) ++ ret = 0; ++ if (ret) { ++ bch_err(ca, "error invalidating buckets: %i", ret); ++ return ret; ++ } ++ ++ if (journal_seq) ++ ret = bch2_journal_flush_seq(&c->journal, journal_seq); ++ if (ret) { ++ bch_err(ca, "journal error: %i", ret); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int push_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, size_t bucket) ++{ ++ unsigned i; ++ int ret = 0; ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) { ++ ++ /* ++ * Don't strand buckets on the copygc freelist until ++ * after recovery is finished: ++ */ ++ if (!test_bit(BCH_FS_STARTED, &c->flags) && ++ i == RESERVE_MOVINGGC) ++ continue; ++ ++ if (fifo_push(&ca->free[i], bucket)) { ++ fifo_pop(&ca->free_inc, bucket); ++ ++ closure_wake_up(&c->freelist_wait); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ ++ spin_unlock(&c->freelist_lock); ++ goto out; ++ } ++ } ++ ++ if (ca->allocator_state != ALLOCATOR_BLOCKED_FULL) { ++ ca->allocator_state = ALLOCATOR_BLOCKED_FULL; ++ closure_wake_up(&c->freelist_wait); ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ if ((current->flags & PF_KTHREAD) && ++ kthread_should_stop()) { ++ ret = 1; ++ break; ++ } ++ ++ schedule(); ++ try_to_freeze(); ++ } ++out: ++ __set_current_state(TASK_RUNNING); ++ return ret; ++} ++ ++/* ++ * Pulls buckets off free_inc, discards them (if enabled), then adds them to ++ * freelists, waiting until there's room if necessary: ++ */ ++static int discard_invalidated_buckets(struct bch_fs *c, struct bch_dev *ca) ++{ ++ while (!fifo_empty(&ca->free_inc)) { ++ size_t bucket = fifo_peek(&ca->free_inc); ++ ++ if (ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, bucket), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++ ++ if (push_invalidated_bucket(c, ca, bucket)) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/** ++ * bch_allocator_thread - move buckets from free_inc to reserves ++ * ++ * The free_inc FIFO is populated by find_reclaimable_buckets(), and ++ * the reserves are depleted by bucket allocation. When we run out ++ * of free_inc, try to invalidate some buckets and write out ++ * prios and gens. 
++ */ ++static int bch2_allocator_thread(void *arg) ++{ ++ struct bch_dev *ca = arg; ++ struct bch_fs *c = ca->fs; ++ size_t nr; ++ int ret; ++ ++ set_freezable(); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ ++ while (1) { ++ cond_resched(); ++ if (kthread_should_stop()) ++ break; ++ ++ pr_debug("discarding %zu invalidated buckets", ++ fifo_used(&ca->free_inc)); ++ ++ ret = discard_invalidated_buckets(c, ca); ++ if (ret) ++ goto stop; ++ ++ down_read(&c->gc_lock); ++ ++ ret = bch2_invalidate_buckets(c, ca); ++ if (ret) { ++ up_read(&c->gc_lock); ++ goto stop; ++ } ++ ++ if (!fifo_empty(&ca->free_inc)) { ++ up_read(&c->gc_lock); ++ continue; ++ } ++ ++ pr_debug("free_inc now empty"); ++ ++ do { ++ /* ++ * Find some buckets that we can invalidate, either ++ * they're completely unused, or only contain clean data ++ * that's been written back to the backing device or ++ * another cache tier ++ */ ++ ++ pr_debug("scanning for reclaimable buckets"); ++ ++ nr = find_reclaimable_buckets(c, ca); ++ ++ pr_debug("found %zu buckets", nr); ++ ++ trace_alloc_batch(ca, nr, ca->alloc_heap.size); ++ ++ if ((ca->inc_gen_needs_gc >= ALLOC_SCAN_BATCH(ca) || ++ ca->inc_gen_really_needs_gc) && ++ c->gc_thread) { ++ atomic_inc(&c->kick_gc); ++ wake_up_process(c->gc_thread); ++ } ++ ++ /* ++ * If we found any buckets, we have to invalidate them ++ * before we scan for more - but if we didn't find very ++ * many we may want to wait on more buckets being ++ * available so we don't spin: ++ */ ++ if (!nr || ++ (nr < ALLOC_SCAN_BATCH(ca) && ++ !fifo_empty(&ca->free[RESERVE_NONE]))) { ++ ret = wait_buckets_available(c, ca); ++ if (ret) { ++ up_read(&c->gc_lock); ++ goto stop; ++ } ++ } ++ } while (!nr); ++ ++ up_read(&c->gc_lock); ++ ++ pr_debug("%zu buckets to invalidate", nr); ++ ++ /* ++ * alloc_heap is now full of newly-invalidated buckets: next, ++ * write out the new bucket gens: ++ */ ++ } ++ ++stop: ++ pr_debug("alloc thread stopping (ret %i)", ret); ++ ca->allocator_state = ALLOCATOR_STOPPED; ++ closure_wake_up(&c->freelist_wait); ++ return 0; ++} ++ ++/* Startup/shutdown (ro/rw): */ ++ ++void bch2_recalc_capacity(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ u64 capacity = 0, reserved_sectors = 0, gc_reserve, copygc_threshold = 0; ++ unsigned bucket_size_max = 0; ++ unsigned long ra_pages = 0; ++ unsigned i, j; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ for_each_online_member(ca, c, i) { ++ struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_bdi; ++ ++ ra_pages += bdi->ra_pages; ++ } ++ ++ bch2_set_ra_pages(c, ra_pages); ++ ++ for_each_rw_member(ca, c, i) { ++ u64 dev_reserve = 0; ++ ++ /* ++ * We need to reserve buckets (from the number ++ * of currently available buckets) against ++ * foreground writes so that mainly copygc can ++ * make forward progress. ++ * ++ * We need enough to refill the various reserves ++ * from scratch - copygc will use its entire ++ * reserve all at once, then run against when ++ * its reserve is refilled (from the formerly ++ * available buckets). ++ * ++ * This reserve is just used when considering if ++ * allocations for foreground writes must wait - ++ * not -ENOSPC calculations. 
++ */ ++ for (j = 0; j < RESERVE_NONE; j++) ++ dev_reserve += ca->free[j].size; ++ ++ dev_reserve += 1; /* btree write point */ ++ dev_reserve += 1; /* copygc write point */ ++ dev_reserve += 1; /* rebalance write point */ ++ ++ dev_reserve *= ca->mi.bucket_size; ++ ++ copygc_threshold += dev_reserve; ++ ++ capacity += bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket); ++ ++ reserved_sectors += dev_reserve * 2; ++ ++ bucket_size_max = max_t(unsigned, bucket_size_max, ++ ca->mi.bucket_size); ++ } ++ ++ gc_reserve = c->opts.gc_reserve_bytes ++ ? c->opts.gc_reserve_bytes >> 9 ++ : div64_u64(capacity * c->opts.gc_reserve_percent, 100); ++ ++ reserved_sectors = max(gc_reserve, reserved_sectors); ++ ++ reserved_sectors = min(reserved_sectors, capacity); ++ ++ c->copygc_threshold = copygc_threshold; ++ c->capacity = capacity - reserved_sectors; ++ ++ c->bucket_size_max = bucket_size_max; ++ ++ /* Wake up case someone was waiting for buckets */ ++ closure_wake_up(&c->freelist_wait); ++} ++ ++static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct open_bucket *ob; ++ bool ret = false; ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid && !ob->on_partial_list && ++ ob->ptr.dev == ca->dev_idx) ++ ret = true; ++ spin_unlock(&ob->lock); ++ } ++ ++ return ret; ++} ++ ++/* device goes ro: */ ++void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ BUG_ON(ca->alloc_thread); ++ ++ /* First, remove device from allocation groups: */ ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ clear_bit(ca->dev_idx, c->rw_devs[i].d); ++ ++ /* ++ * Capacity is calculated based off of devices in allocation groups: ++ */ ++ bch2_recalc_capacity(c); ++ ++ /* Next, close write points that point to this device... 
*/ ++ for (i = 0; i < ARRAY_SIZE(c->write_points); i++) ++ bch2_writepoint_stop(c, ca, &c->write_points[i]); ++ ++ bch2_writepoint_stop(c, ca, &c->copygc_write_point); ++ bch2_writepoint_stop(c, ca, &c->rebalance_write_point); ++ bch2_writepoint_stop(c, ca, &c->btree_write_point); ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ while (c->btree_reserve_cache_nr) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ bch2_open_buckets_put(c, &a->ob); ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++ while (1) { ++ struct open_bucket *ob; ++ ++ spin_lock(&c->freelist_lock); ++ if (!ca->open_buckets_partial_nr) { ++ spin_unlock(&c->freelist_lock); ++ break; ++ } ++ ob = c->open_buckets + ++ ca->open_buckets_partial[--ca->open_buckets_partial_nr]; ++ ob->on_partial_list = false; ++ spin_unlock(&c->freelist_lock); ++ ++ bch2_open_bucket_put(c, ob); ++ } ++ ++ bch2_ec_stop_dev(c, ca); ++ ++ /* ++ * Wake up threads that were blocked on allocation, so they can notice ++ * the device can no longer be removed and the capacity has changed: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ ++ /* ++ * journal_res_get() can block waiting for free space in the journal - ++ * it needs to notice there may not be devices to allocate from anymore: ++ */ ++ wake_up(&c->journal.wait); ++ ++ /* Now wait for any in flight writes: */ ++ ++ closure_wait_event(&c->open_buckets_wait, ++ !bch2_dev_has_open_write_point(c, ca)); ++} ++ ++/* device goes rw: */ ++void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) ++ if (ca->mi.data_allowed & (1 << i)) ++ set_bit(ca->dev_idx, c->rw_devs[i].d); ++} ++ ++void bch2_dev_allocator_quiesce(struct bch_fs *c, struct bch_dev *ca) ++{ ++ if (ca->alloc_thread) ++ closure_wait_event(&c->freelist_wait, ++ ca->allocator_state != ALLOCATOR_RUNNING); ++} ++ ++/* stop allocator thread: */ ++void bch2_dev_allocator_stop(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ p = rcu_dereference_protected(ca->alloc_thread, 1); ++ ca->alloc_thread = NULL; ++ ++ /* ++ * We need an rcu barrier between setting ca->alloc_thread = NULL and ++ * the thread shutting down to avoid bch2_wake_allocator() racing: ++ * ++ * XXX: it would be better to have the rcu barrier be asynchronous ++ * instead of blocking us here ++ */ ++ synchronize_rcu(); ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++/* start allocator thread: */ ++int bch2_dev_allocator_start(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ /* ++ * allocator thread already started? 
++ */ ++ if (ca->alloc_thread) ++ return 0; ++ ++ p = kthread_create(bch2_allocator_thread, ca, ++ "bch_alloc[%s]", ca->name); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ rcu_assign_pointer(ca->alloc_thread, p); ++ wake_up_process(p); ++ return 0; ++} ++ ++void bch2_fs_allocator_background_init(struct bch_fs *c) ++{ ++ spin_lock_init(&c->freelist_lock); ++ bch2_bucket_clock_init(c, READ); ++ bch2_bucket_clock_init(c, WRITE); ++ ++ c->pd_controllers_update_seconds = 5; ++ INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); ++} +diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h +new file mode 100644 +index 000000000000..cbaff56f7473 +--- /dev/null ++++ b/fs/bcachefs/alloc_background.h +@@ -0,0 +1,105 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_BACKGROUND_H ++#define _BCACHEFS_ALLOC_BACKGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++#include "debug.h" ++ ++struct bkey_alloc_unpacked { ++ u8 gen; ++#define x(_name, _bits) u##_bits _name; ++ BCH_ALLOC_FIELDS() ++#undef x ++}; ++ ++/* How out of date a pointer gen is allowed to be: */ ++#define BUCKET_GC_GEN_MAX 96U ++ ++/* returns true if not equal */ ++static inline bool bkey_alloc_unpacked_cmp(struct bkey_alloc_unpacked l, ++ struct bkey_alloc_unpacked r) ++{ ++ return l.gen != r.gen ++#define x(_name, _bits) || l._name != r._name ++ BCH_ALLOC_FIELDS() ++#undef x ++ ; ++} ++ ++struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c); ++void bch2_alloc_pack(struct bkey_i_alloc *, ++ const struct bkey_alloc_unpacked); ++ ++int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); ++ ++static inline struct bkey_alloc_unpacked ++alloc_mem_to_key(struct bucket *g, struct bucket_mark m) ++{ ++ return (struct bkey_alloc_unpacked) { ++ .gen = m.gen, ++ .oldest_gen = g->oldest_gen, ++ .data_type = m.data_type, ++ .dirty_sectors = m.dirty_sectors, ++ .cached_sectors = m.cached_sectors, ++ .read_time = g->io_time[READ], ++ .write_time = g->io_time[WRITE], ++ }; ++} ++ ++#define ALLOC_SCAN_BATCH(ca) max_t(size_t, 1, (ca)->mi.nbuckets >> 9) ++ ++const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_alloc (struct bkey_ops) { \ ++ .key_invalid = bch2_alloc_invalid, \ ++ .val_to_text = bch2_alloc_to_text, \ ++} ++ ++struct journal_keys; ++int bch2_alloc_read(struct bch_fs *, struct journal_keys *); ++ ++static inline void bch2_wake_allocator(struct bch_dev *ca) ++{ ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = rcu_dereference(ca->alloc_thread); ++ if (p) { ++ wake_up_process(p); ++ ca->allocator_state = ALLOCATOR_RUNNING; ++ } ++ rcu_read_unlock(); ++} ++ ++static inline void verify_not_on_freelist(struct bch_fs *c, struct bch_dev *ca, ++ size_t bucket) ++{ ++ if (expensive_debug_checks(c)) { ++ size_t iter; ++ long i; ++ unsigned j; ++ ++ for (j = 0; j < RESERVE_NR; j++) ++ fifo_for_each_entry(i, &ca->free[j], iter) ++ BUG_ON(i == bucket); ++ fifo_for_each_entry(i, &ca->free_inc, iter) ++ BUG_ON(i == bucket); ++ } ++} ++ ++void bch2_recalc_capacity(struct bch_fs *); ++ ++void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); ++void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); ++ ++void bch2_dev_allocator_quiesce(struct bch_fs *, struct bch_dev *); ++void bch2_dev_allocator_stop(struct bch_dev *); ++int bch2_dev_allocator_start(struct bch_dev *); ++ ++int 
bch2_dev_alloc_write(struct bch_fs *, struct bch_dev *, unsigned); ++int bch2_alloc_write(struct bch_fs *, unsigned); ++void bch2_fs_allocator_background_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ +diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c +new file mode 100644 +index 000000000000..7a92e3d53254 +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.c +@@ -0,0 +1,990 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Primary bucket allocation code ++ * ++ * Copyright 2012 Google, Inc. ++ * ++ * Allocation in bcache is done in terms of buckets: ++ * ++ * Each bucket has associated an 8 bit gen; this gen corresponds to the gen in ++ * btree pointers - they must match for the pointer to be considered valid. ++ * ++ * Thus (assuming a bucket has no dirty data or metadata in it) we can reuse a ++ * bucket simply by incrementing its gen. ++ * ++ * The gens (along with the priorities; it's really the gens are important but ++ * the code is named as if it's the priorities) are written in an arbitrary list ++ * of buckets on disk, with a pointer to them in the journal header. ++ * ++ * When we invalidate a bucket, we have to write its new gen to disk and wait ++ * for that write to complete before we use it - otherwise after a crash we ++ * could have pointers that appeared to be good but pointed to data that had ++ * been overwritten. ++ * ++ * Since the gens and priorities are all stored contiguously on disk, we can ++ * batch this up: We fill up the free_inc list with freshly invalidated buckets, ++ * call prio_write(), and when prio_write() finishes we pull buckets off the ++ * free_inc list and optionally discard them. ++ * ++ * free_inc isn't the only freelist - if it was, we'd often have to sleep while ++ * priorities and gens were being written before we could allocate. c->free is a ++ * smaller freelist, and buckets on that list are always ready to be used. ++ * ++ * If we've got discards enabled, that happens when a bucket moves from the ++ * free_inc list to the free list. ++ * ++ * It's important to ensure that gens don't wrap around - with respect to ++ * either the oldest gen in the btree or the gen on disk. This is quite ++ * difficult to do in practice, but we explicitly guard against it anyways - if ++ * a bucket is in danger of wrapping around we simply skip invalidating it that ++ * time around, and we garbage collect or rewrite the priorities sooner than we ++ * would have otherwise. ++ * ++ * bch2_bucket_alloc() allocates a single bucket from a specific device. ++ * ++ * bch2_bucket_alloc_set() allocates one or more buckets from different devices ++ * in a given filesystem. ++ * ++ * invalidate_buckets() drives all the processes described above. It's called ++ * from bch2_bucket_alloc() and a few other places that need to make sure free ++ * buckets are ready. ++ * ++ * invalidate_buckets_(lru|fifo)() find buckets that are available to be ++ * invalidated, and then invalidate them and stick them on the free_inc list - ++ * in either lru or fifo order. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "io.h" ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Open buckets represent a bucket that's currently being allocated from. 
They ++ * serve two purposes: ++ * ++ * - They track buckets that have been partially allocated, allowing for ++ * sub-bucket sized allocations - they're used by the sector allocator below ++ * ++ * - They provide a reference to the buckets they own that mark and sweep GC ++ * can find, until the new allocation has a pointer to it inserted into the ++ * btree ++ * ++ * When allocating some space with the sector allocator, the allocation comes ++ * with a reference to an open bucket - the caller is required to put that ++ * reference _after_ doing the index update that makes its allocation reachable. ++ */ ++ ++void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ if (ob->ec) { ++ bch2_ec_bucket_written(c, ob); ++ return; ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&ob->lock); ++ ++ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), ++ false, gc_pos_alloc(c, ob), 0); ++ ob->valid = false; ++ ob->type = 0; ++ ++ spin_unlock(&ob->lock); ++ percpu_up_read(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ c->open_buckets_nr_free++; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *c, ++ struct open_buckets *obs, ++ unsigned dev) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->ptr.dev == dev && ++ ob->ec) ++ bch2_ec_bucket_cancel(c, ob); ++} ++ ++static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ ++ BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); ++ ++ ob = c->open_buckets + c->open_buckets_freelist; ++ c->open_buckets_freelist = ob->freelist; ++ atomic_set(&ob->pin, 1); ++ ob->type = 0; ++ ++ c->open_buckets_nr_free--; ++ return ob; ++} ++ ++static void open_bucket_free_unused(struct bch_fs *c, ++ struct write_point *wp, ++ struct open_bucket *ob) ++{ ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ bool may_realloc = wp->type == BCH_DATA_user; ++ ++ BUG_ON(ca->open_buckets_partial_nr > ++ ARRAY_SIZE(ca->open_buckets_partial)); ++ ++ if (ca->open_buckets_partial_nr < ++ ARRAY_SIZE(ca->open_buckets_partial) && ++ may_realloc) { ++ spin_lock(&c->freelist_lock); ++ ob->on_partial_list = true; ++ ca->open_buckets_partial[ca->open_buckets_partial_nr++] = ++ ob - c->open_buckets; ++ spin_unlock(&c->freelist_lock); ++ ++ closure_wake_up(&c->open_buckets_wait); ++ closure_wake_up(&c->freelist_wait); ++ } else { ++ bch2_open_bucket_put(c, ob); ++ } ++} ++ ++static void verify_not_stale(struct bch_fs *c, const struct open_buckets *obs) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ BUG_ON(ptr_stale(ca, &ob->ptr)); ++ } ++#endif ++} ++ ++/* _only_ for allocating the journal on a new device: */ ++long bch2_bucket_alloc_new_fs(struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ ssize_t b; ++ ++ rcu_read_lock(); ++ buckets = bucket_array(ca); ++ ++ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) ++ if (is_available_bucket(buckets->b[b].mark)) ++ goto success; ++ b = -1; ++success: ++ rcu_read_unlock(); ++ return b; ++} ++ ++static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) ++{ ++ switch (reserve) { ++ case 
RESERVE_ALLOC: ++ return 0; ++ case RESERVE_BTREE: ++ return OPEN_BUCKETS_COUNT / 4; ++ default: ++ return OPEN_BUCKETS_COUNT / 2; ++ } ++} ++ ++/** ++ * bch_bucket_alloc - allocate a single bucket from a specific device ++ * ++ * Returns index of bucket on success, 0 on failure ++ * */ ++struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, ++ enum alloc_reserve reserve, ++ bool may_alloc_partial, ++ struct closure *cl) ++{ ++ struct bucket_array *buckets; ++ struct open_bucket *ob; ++ long bucket = 0; ++ ++ spin_lock(&c->freelist_lock); ++ ++ if (may_alloc_partial) { ++ int i; ++ ++ for (i = ca->open_buckets_partial_nr - 1; i >= 0; --i) { ++ ob = c->open_buckets + ca->open_buckets_partial[i]; ++ ++ if (reserve <= ob->alloc_reserve) { ++ array_remove_item(ca->open_buckets_partial, ++ ca->open_buckets_partial_nr, ++ i); ++ ob->on_partial_list = false; ++ ob->alloc_reserve = reserve; ++ spin_unlock(&c->freelist_lock); ++ return ob; ++ } ++ } ++ } ++ ++ if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { ++ if (cl) ++ closure_wait(&c->open_buckets_wait, cl); ++ ++ if (!c->blocked_allocate_open_bucket) ++ c->blocked_allocate_open_bucket = local_clock(); ++ ++ spin_unlock(&c->freelist_lock); ++ trace_open_bucket_alloc_fail(ca, reserve); ++ return ERR_PTR(-OPEN_BUCKETS_EMPTY); ++ } ++ ++ if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) ++ goto out; ++ ++ switch (reserve) { ++ case RESERVE_ALLOC: ++ if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) ++ goto out; ++ break; ++ case RESERVE_BTREE: ++ if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= ++ ca->free[RESERVE_BTREE].size && ++ fifo_pop(&ca->free[RESERVE_BTREE], bucket)) ++ goto out; ++ break; ++ case RESERVE_MOVINGGC: ++ if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) ++ goto out; ++ break; ++ default: ++ break; ++ } ++ ++ if (cl) ++ closure_wait(&c->freelist_wait, cl); ++ ++ if (!c->blocked_allocate) ++ c->blocked_allocate = local_clock(); ++ ++ spin_unlock(&c->freelist_lock); ++ ++ trace_bucket_alloc_fail(ca, reserve); ++ return ERR_PTR(-FREELIST_EMPTY); ++out: ++ verify_not_on_freelist(c, ca, bucket); ++ ++ ob = bch2_open_bucket_alloc(c); ++ ++ spin_lock(&ob->lock); ++ buckets = bucket_array(ca); ++ ++ ob->valid = true; ++ ob->sectors_free = ca->mi.bucket_size; ++ ob->alloc_reserve = reserve; ++ ob->ptr = (struct bch_extent_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_ptr, ++ .gen = buckets->b[bucket].mark.gen, ++ .offset = bucket_to_sector(ca, bucket), ++ .dev = ca->dev_idx, ++ }; ++ ++ spin_unlock(&ob->lock); ++ ++ if (c->blocked_allocate_open_bucket) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate_open_bucket], ++ c->blocked_allocate_open_bucket); ++ c->blocked_allocate_open_bucket = 0; ++ } ++ ++ if (c->blocked_allocate) { ++ bch2_time_stats_update( ++ &c->times[BCH_TIME_blocked_allocate], ++ c->blocked_allocate); ++ c->blocked_allocate = 0; ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ bch2_wake_allocator(ca); ++ ++ trace_bucket_alloc(ca, reserve); ++ return ob; ++} ++ ++static int __dev_stripe_cmp(struct dev_stripe_state *stripe, ++ unsigned l, unsigned r) ++{ ++ return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - ++ (stripe->next_alloc[l] < stripe->next_alloc[r])); ++} ++ ++#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs) ++{ ++ struct dev_alloc_list ret = { .nr = 0 }; ++ unsigned i; ++ ++ for_each_set_bit(i, devs->d, 
BCH_SB_MEMBERS_MAX) ++ ret.devs[ret.nr++] = i; ++ ++ bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); ++ return ret; ++} ++ ++void bch2_dev_stripe_increment(struct bch_dev *ca, ++ struct dev_stripe_state *stripe) ++{ ++ u64 *v = stripe->next_alloc + ca->dev_idx; ++ u64 free_space = dev_buckets_free(ca); ++ u64 free_space_inv = free_space ++ ? div64_u64(1ULL << 48, free_space) ++ : 1ULL << 48; ++ u64 scale = *v / 4; ++ ++ if (*v + free_space_inv >= *v) ++ *v += free_space_inv; ++ else ++ *v = U64_MAX; ++ ++ for (v = stripe->next_alloc; ++ v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) ++ *v = *v < scale ? 0 : *v - scale; ++} ++ ++#define BUCKET_MAY_ALLOC_PARTIAL (1 << 0) ++#define BUCKET_ALLOC_USE_DURABILITY (1 << 1) ++ ++static void add_new_bucket(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ struct open_bucket *ob) ++{ ++ unsigned durability = ++ bch_dev_bkey_exists(c, ob->ptr.dev)->mi.durability; ++ ++ __clear_bit(ob->ptr.dev, devs_may_alloc->d); ++ *nr_effective += (flags & BUCKET_ALLOC_USE_DURABILITY) ++ ? durability : 1; ++ *have_cache |= !durability; ++ ++ ob_push(c, ptrs, ob); ++} ++ ++enum bucket_alloc_ret ++bch2_bucket_alloc_set(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct dev_stripe_state *stripe, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct dev_alloc_list devs_sorted = ++ bch2_dev_alloc_list(c, stripe, devs_may_alloc); ++ struct bch_dev *ca; ++ enum bucket_alloc_ret ret = INSUFFICIENT_DEVICES; ++ unsigned i; ++ ++ BUG_ON(*nr_effective >= nr_replicas); ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ struct open_bucket *ob; ++ ++ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ if (!ca) ++ continue; ++ ++ if (!ca->mi.durability && *have_cache) ++ continue; ++ ++ ob = bch2_bucket_alloc(c, ca, reserve, ++ flags & BUCKET_MAY_ALLOC_PARTIAL, cl); ++ if (IS_ERR(ob)) { ++ ret = -PTR_ERR(ob); ++ ++ if (cl) ++ return ret; ++ continue; ++ } ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, flags, ob); ++ ++ bch2_dev_stripe_increment(ca, stripe); ++ ++ if (*nr_effective >= nr_replicas) ++ return ALLOC_SUCCESS; ++ } ++ ++ return ret; ++} ++ ++/* Allocate from stripes: */ ++ ++/* ++ * if we can't allocate a new stripe because there are already too many ++ * partially filled stripes, force allocating from an existing stripe even when ++ * it's to a device we don't want: ++ */ ++ ++static void bucket_alloc_from_stripe(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags) ++{ ++ struct dev_alloc_list devs_sorted; ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ struct bch_dev *ca; ++ unsigned i, ec_idx; ++ ++ if (!erasure_code) ++ return; ++ ++ if (nr_replicas < 2) ++ return; ++ ++ if (ec_open_bucket(c, ptrs)) ++ return; ++ ++ h = bch2_ec_stripe_head_get(c, target, 0, nr_replicas - 1); ++ if (!h) ++ return; ++ ++ devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); ++ ++ for (i = 0; i < devs_sorted.nr; i++) ++ open_bucket_for_each(c, &h->s->blocks, ob, ec_idx) ++ if (ob->ptr.dev == devs_sorted.devs[i] && ++ !test_and_set_bit(h->s->data_block_idx[ec_idx], ++ 
h->s->blocks_allocated)) ++ goto got_bucket; ++ goto out_put_head; ++got_bucket: ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ ob->ec_idx = h->s->data_block_idx[ec_idx]; ++ ob->ec = h->s; ++ ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, flags, ob); ++ atomic_inc(&h->s->pin); ++out_put_head: ++ bch2_ec_stripe_head_put(c, h); ++} ++ ++/* Sector allocator */ ++ ++static void get_buckets_from_writepoint(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_mask *devs_may_alloc, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ unsigned flags, ++ bool need_ec) ++{ ++ struct open_buckets ptrs_skip = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ ++ if (*nr_effective < nr_replicas && ++ test_bit(ob->ptr.dev, devs_may_alloc->d) && ++ (ca->mi.durability || ++ (wp->type == BCH_DATA_user && !*have_cache)) && ++ (ob->ec || !need_ec)) { ++ add_new_bucket(c, ptrs, devs_may_alloc, ++ nr_effective, have_cache, ++ flags, ob); ++ } else { ++ ob_push(c, &ptrs_skip, ob); ++ } ++ } ++ wp->ptrs = ptrs_skip; ++} ++ ++static enum bucket_alloc_ret ++open_bucket_add_buckets(struct bch_fs *c, ++ struct open_buckets *ptrs, ++ struct write_point *wp, ++ struct bch_devs_list *devs_have, ++ u16 target, ++ unsigned erasure_code, ++ unsigned nr_replicas, ++ unsigned *nr_effective, ++ bool *have_cache, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *_cl) ++{ ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ struct closure *cl = NULL; ++ enum bucket_alloc_ret ret; ++ unsigned i; ++ ++ rcu_read_lock(); ++ devs = target_rw_devs(c, wp->type, target); ++ rcu_read_unlock(); ++ ++ /* Don't allocate from devices we already have pointers to: */ ++ for (i = 0; i < devs_have->nr; i++) ++ __clear_bit(devs_have->devs[i], devs.d); ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ ++ if (erasure_code) { ++ if (!ec_open_bucket(c, ptrs)) { ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, true); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ ++ if (!ec_open_bucket(c, ptrs)) { ++ bucket_alloc_from_stripe(c, ptrs, wp, &devs, ++ target, erasure_code, ++ nr_replicas, nr_effective, ++ have_cache, flags); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ } ++ } ++ ++ get_buckets_from_writepoint(c, ptrs, wp, &devs, ++ nr_replicas, nr_effective, ++ have_cache, flags, false); ++ if (*nr_effective >= nr_replicas) ++ return 0; ++ ++ percpu_down_read(&c->mark_lock); ++ rcu_read_lock(); ++ ++retry_blocking: ++ /* ++ * Try nonblocking first, so that if one device is full we'll try from ++ * other devices: ++ */ ++ ret = bch2_bucket_alloc_set(c, ptrs, &wp->stripe, &devs, ++ nr_replicas, nr_effective, have_cache, ++ reserve, flags, cl); ++ if (ret && ret != INSUFFICIENT_DEVICES && !cl && _cl) { ++ cl = _cl; ++ goto retry_blocking; ++ } ++ ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *c, struct bch_dev *ca, ++ struct open_buckets *obs) ++{ ++ struct open_buckets ptrs = { .nr = 0 }; ++ struct open_bucket *ob, *ob2; ++ unsigned i, j; ++ ++ open_bucket_for_each(c, obs, ob, i) { ++ bool drop = !ca || ob->ptr.dev == ca->dev_idx; ++ ++ if (!drop && ob->ec) { ++ mutex_lock(&ob->ec->lock); ++ open_bucket_for_each(c, &ob->ec->blocks, ob2, 
j) ++ drop |= ob2->ptr.dev == ca->dev_idx; ++ open_bucket_for_each(c, &ob->ec->parity, ob2, j) ++ drop |= ob2->ptr.dev == ca->dev_idx; ++ mutex_unlock(&ob->ec->lock); ++ } ++ ++ if (drop) ++ bch2_open_bucket_put(c, ob); ++ else ++ ob_push(c, &ptrs, ob); ++ } ++ ++ *obs = ptrs; ++} ++ ++void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, ++ struct write_point *wp) ++{ ++ mutex_lock(&wp->lock); ++ bch2_open_buckets_stop_dev(c, ca, &wp->ptrs); ++ mutex_unlock(&wp->lock); ++} ++ ++static inline struct hlist_head *writepoint_hash(struct bch_fs *c, ++ unsigned long write_point) ++{ ++ unsigned hash = ++ hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); ++ ++ return &c->write_points_hash[hash]; ++} ++ ++static struct write_point *__writepoint_find(struct hlist_head *head, ++ unsigned long write_point) ++{ ++ struct write_point *wp; ++ ++ hlist_for_each_entry_rcu(wp, head, node) ++ if (wp->write_point == write_point) ++ return wp; ++ ++ return NULL; ++} ++ ++static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) ++{ ++ u64 stranded = c->write_points_nr * c->bucket_size_max; ++ u64 free = bch2_fs_usage_read_short(c).free; ++ ++ return stranded * factor > free; ++} ++ ++static bool try_increase_writepoints(struct bch_fs *c) ++{ ++ struct write_point *wp; ++ ++ if (c->write_points_nr == ARRAY_SIZE(c->write_points) || ++ too_many_writepoints(c, 32)) ++ return false; ++ ++ wp = c->write_points + c->write_points_nr++; ++ hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); ++ return true; ++} ++ ++static bool try_decrease_writepoints(struct bch_fs *c, ++ unsigned old_nr) ++{ ++ struct write_point *wp; ++ ++ mutex_lock(&c->write_points_hash_lock); ++ if (c->write_points_nr < old_nr) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return true; ++ } ++ ++ if (c->write_points_nr == 1 || ++ !too_many_writepoints(c, 8)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ return false; ++ } ++ ++ wp = c->write_points + --c->write_points_nr; ++ ++ hlist_del_rcu(&wp->node); ++ mutex_unlock(&c->write_points_hash_lock); ++ ++ bch2_writepoint_stop(c, NULL, wp); ++ return true; ++} ++ ++static struct write_point *writepoint_find(struct bch_fs *c, ++ unsigned long write_point) ++{ ++ struct write_point *wp, *oldest; ++ struct hlist_head *head; ++ ++ if (!(write_point & 1UL)) { ++ wp = (struct write_point *) write_point; ++ mutex_lock(&wp->lock); ++ return wp; ++ } ++ ++ head = writepoint_hash(c, write_point); ++restart_find: ++ wp = __writepoint_find(head, write_point); ++ if (wp) { ++lock_wp: ++ mutex_lock(&wp->lock); ++ if (wp->write_point == write_point) ++ goto out; ++ mutex_unlock(&wp->lock); ++ goto restart_find; ++ } ++restart_find_oldest: ++ oldest = NULL; ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) ++ if (!oldest || time_before64(wp->last_used, oldest->last_used)) ++ oldest = wp; ++ ++ mutex_lock(&oldest->lock); ++ mutex_lock(&c->write_points_hash_lock); ++ if (oldest >= c->write_points + c->write_points_nr || ++ try_increase_writepoints(c)) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto restart_find_oldest; ++ } ++ ++ wp = __writepoint_find(head, write_point); ++ if (wp && wp != oldest) { ++ mutex_unlock(&c->write_points_hash_lock); ++ mutex_unlock(&oldest->lock); ++ goto lock_wp; ++ } ++ ++ wp = oldest; ++ hlist_del_rcu(&wp->node); ++ wp->write_point = write_point; ++ hlist_add_head_rcu(&wp->node, head); ++ mutex_unlock(&c->write_points_hash_lock); ++out: ++ 
wp->last_used = sched_clock(); ++ return wp; ++} ++ ++/* ++ * Get us an open_bucket we can allocate from, return with it locked: ++ */ ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, ++ unsigned target, ++ unsigned erasure_code, ++ struct write_point_specifier write_point, ++ struct bch_devs_list *devs_have, ++ unsigned nr_replicas, ++ unsigned nr_replicas_required, ++ enum alloc_reserve reserve, ++ unsigned flags, ++ struct closure *cl) ++{ ++ struct write_point *wp; ++ struct open_bucket *ob; ++ struct open_buckets ptrs; ++ unsigned nr_effective, write_points_nr; ++ unsigned ob_flags = 0; ++ bool have_cache; ++ enum bucket_alloc_ret ret; ++ int i; ++ ++ if (!(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) ++ ob_flags |= BUCKET_ALLOC_USE_DURABILITY; ++ ++ BUG_ON(!nr_replicas || !nr_replicas_required); ++retry: ++ ptrs.nr = 0; ++ nr_effective = 0; ++ write_points_nr = c->write_points_nr; ++ have_cache = false; ++ ++ wp = writepoint_find(c, write_point.v); ++ ++ if (wp->type == BCH_DATA_user) ++ ob_flags |= BUCKET_MAY_ALLOC_PARTIAL; ++ ++ /* metadata may not allocate on cache devices: */ ++ if (wp->type != BCH_DATA_user) ++ have_cache = true; ++ ++ if (!target || (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } else { ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ target, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, NULL); ++ if (!ret) ++ goto alloc_done; ++ ++ ret = open_bucket_add_buckets(c, &ptrs, wp, devs_have, ++ 0, erasure_code, ++ nr_replicas, &nr_effective, ++ &have_cache, reserve, ++ ob_flags, cl); ++ } ++alloc_done: ++ BUG_ON(!ret && nr_effective < nr_replicas); ++ ++ if (erasure_code && !ec_open_bucket(c, &ptrs)) ++ pr_debug("failed to get ec bucket: ret %u", ret); ++ ++ if (ret == INSUFFICIENT_DEVICES && ++ nr_effective >= nr_replicas_required) ++ ret = 0; ++ ++ if (ret) ++ goto err; ++ ++ /* Free buckets we didn't use: */ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ open_bucket_free_unused(c, wp, ob); ++ ++ wp->ptrs = ptrs; ++ ++ wp->sectors_free = UINT_MAX; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ wp->sectors_free = min(wp->sectors_free, ob->sectors_free); ++ ++ BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); ++ ++ verify_not_stale(c, &wp->ptrs); ++ ++ return wp; ++err: ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ptrs.nr < ARRAY_SIZE(ptrs.v)) ++ ob_push(c, &ptrs, ob); ++ else ++ open_bucket_free_unused(c, wp, ob); ++ wp->ptrs = ptrs; ++ ++ mutex_unlock(&wp->lock); ++ ++ if (ret == FREELIST_EMPTY && ++ try_decrease_writepoints(c, write_points_nr)) ++ goto retry; ++ ++ switch (ret) { ++ case OPEN_BUCKETS_EMPTY: ++ case FREELIST_EMPTY: ++ return cl ? 
ERR_PTR(-EAGAIN) : ERR_PTR(-ENOSPC); ++ case INSUFFICIENT_DEVICES: ++ return ERR_PTR(-EROFS); ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, ++ struct bkey_i *k, unsigned sectors) ++ ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ BUG_ON(sectors > wp->sectors_free); ++ wp->sectors_free -= sectors; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ struct bch_extent_ptr tmp = ob->ptr; ++ ++ tmp.cached = !ca->mi.durability && ++ wp->type == BCH_DATA_user; ++ ++ tmp.offset += ca->mi.bucket_size - ob->sectors_free; ++ bch2_bkey_append_ptr(k, tmp); ++ ++ BUG_ON(sectors > ob->sectors_free); ++ ob->sectors_free -= sectors; ++ } ++} ++ ++/* ++ * Append pointers to the space we just allocated to @k, and mark @sectors space ++ * as allocated out of @ob ++ */ ++void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); ++ wp->ptrs = keep; ++ ++ mutex_unlock(&wp->lock); ++ ++ bch2_open_buckets_put(c, &ptrs); ++} ++ ++static inline void writepoint_init(struct write_point *wp, ++ enum bch_data_type type) ++{ ++ mutex_init(&wp->lock); ++ wp->type = type; ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *c) ++{ ++ struct open_bucket *ob; ++ struct write_point *wp; ++ ++ mutex_init(&c->write_points_hash_lock); ++ c->write_points_nr = ARRAY_SIZE(c->write_points); ++ ++ /* open bucket 0 is a sentinal NULL: */ ++ spin_lock_init(&c->open_buckets[0].lock); ++ ++ for (ob = c->open_buckets + 1; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { ++ spin_lock_init(&ob->lock); ++ c->open_buckets_nr_free++; ++ ++ ob->freelist = c->open_buckets_freelist; ++ c->open_buckets_freelist = ob - c->open_buckets; ++ } ++ ++ writepoint_init(&c->btree_write_point, BCH_DATA_btree); ++ writepoint_init(&c->rebalance_write_point, BCH_DATA_user); ++ writepoint_init(&c->copygc_write_point, BCH_DATA_user); ++ ++ for (wp = c->write_points; ++ wp < c->write_points + c->write_points_nr; wp++) { ++ writepoint_init(wp, BCH_DATA_user); ++ ++ wp->last_used = sched_clock(); ++ wp->write_point = (unsigned long) wp; ++ hlist_add_head_rcu(&wp->node, ++ writepoint_hash(c, wp->write_point)); ++ } ++} +diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h +new file mode 100644 +index 000000000000..c658295cb8e0 +--- /dev/null ++++ b/fs/bcachefs/alloc_foreground.h +@@ -0,0 +1,138 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_FOREGROUND_H ++#define _BCACHEFS_ALLOC_FOREGROUND_H ++ ++#include "bcachefs.h" ++#include "alloc_types.h" ++ ++#include ++ ++struct bkey; ++struct bch_dev; ++struct bch_fs; ++struct bch_devs_List; ++ ++enum bucket_alloc_ret { ++ ALLOC_SUCCESS, ++ OPEN_BUCKETS_EMPTY, ++ FREELIST_EMPTY, /* Allocator thread not keeping up */ ++ INSUFFICIENT_DEVICES, ++}; ++ ++struct dev_alloc_list { ++ unsigned nr; ++ u8 devs[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, ++ struct dev_stripe_state *, ++ struct bch_devs_mask *); ++void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); ++ ++long bch2_bucket_alloc_new_fs(struct bch_dev *); ++ 
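The open-bucket freelist set up above (see bch2_open_bucket_alloc(), __bch2_open_bucket_put() and bch2_fs_allocator_foreground_init()) chains free entries by array index rather than by pointer, reserving index 0 as a NULL sentinel so that "index 0" can mean "no bucket". A minimal standalone sketch of that idea follows; the names (struct pool, pool_alloc(), pool_free()) are invented for illustration and are not part of the patch.

#include <stddef.h>

#define SLOT_COUNT 1024

struct slot {
	unsigned freelist;	/* index of next free slot, 0 == end of list */
	int      in_use;
};

struct pool {
	struct slot slots[SLOT_COUNT];
	unsigned    freelist;	/* index of first free slot, 0 == list empty */
	unsigned    nr_free;
};

static void pool_init(struct pool *p)
{
	unsigned i;

	p->freelist = 0;	/* empty list */
	p->nr_free  = 0;

	/* slot 0 stays unused as the NULL sentinel; chain up the rest: */
	for (i = 1; i < SLOT_COUNT; i++) {
		p->slots[i].in_use   = 0;
		p->slots[i].freelist = p->freelist;
		p->freelist = i;
		p->nr_free++;
	}
}

static struct slot *pool_alloc(struct pool *p)
{
	struct slot *s;

	if (!p->freelist)
		return NULL;		/* freelist empty */

	s = &p->slots[p->freelist];
	p->freelist = s->freelist;
	p->nr_free--;
	s->in_use = 1;
	return s;
}

static void pool_free(struct pool *p, struct slot *s)
{
	s->in_use   = 0;
	s->freelist = p->freelist;
	p->freelist = (unsigned) (s - p->slots);
	p->nr_free++;
}

Storing small indices instead of pointers is what lets the real code keep the freelist link (open_bucket_idx_t, a u16) inside the open_bucket itself.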
++struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, ++ enum alloc_reserve, bool, ++ struct closure *); ++ ++static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, ++ struct open_bucket *ob) ++{ ++ BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); ++ ++ obs->v[obs->nr++] = ob - c->open_buckets; ++} ++ ++#define open_bucket_for_each(_c, _obs, _ob, _i) \ ++ for ((_i) = 0; \ ++ (_i) < (_obs)->nr && \ ++ ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ ++ (_i)++) ++ ++static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, ++ struct open_buckets *obs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, obs, ob, i) ++ if (ob->ec) ++ return ob; ++ ++ return NULL; ++} ++ ++void bch2_open_bucket_write_error(struct bch_fs *, ++ struct open_buckets *, unsigned); ++ ++void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); ++ ++static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) ++{ ++ if (atomic_dec_and_test(&ob->pin)) ++ __bch2_open_bucket_put(c, ob); ++} ++ ++static inline void bch2_open_buckets_put(struct bch_fs *c, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, ptrs, ob, i) ++ bch2_open_bucket_put(c, ob); ++ ptrs->nr = 0; ++} ++ ++static inline void bch2_open_bucket_get(struct bch_fs *c, ++ struct write_point *wp, ++ struct open_buckets *ptrs) ++{ ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ ob->type = wp->type; ++ atomic_inc(&ob->pin); ++ ob_push(c, ptrs, ob); ++ } ++} ++ ++enum bucket_alloc_ret ++bch2_bucket_alloc_set(struct bch_fs *, struct open_buckets *, ++ struct dev_stripe_state *, struct bch_devs_mask *, ++ unsigned, unsigned *, bool *, enum alloc_reserve, ++ unsigned, struct closure *); ++ ++struct write_point *bch2_alloc_sectors_start(struct bch_fs *, ++ unsigned, unsigned, ++ struct write_point_specifier, ++ struct bch_devs_list *, ++ unsigned, unsigned, ++ enum alloc_reserve, ++ unsigned, ++ struct closure *); ++ ++void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, ++ struct bkey_i *, unsigned); ++void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); ++ ++void bch2_open_buckets_stop_dev(struct bch_fs *, struct bch_dev *, ++ struct open_buckets *); ++ ++void bch2_writepoint_stop(struct bch_fs *, struct bch_dev *, ++ struct write_point *); ++ ++static inline struct write_point_specifier writepoint_hashed(unsigned long v) ++{ ++ return (struct write_point_specifier) { .v = v | 1 }; ++} ++ ++static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) ++{ ++ return (struct write_point_specifier) { .v = (unsigned long) wp }; ++} ++ ++void bch2_fs_allocator_foreground_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ +diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h +new file mode 100644 +index 000000000000..20705460bb0a +--- /dev/null ++++ b/fs/bcachefs/alloc_types.h +@@ -0,0 +1,113 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ALLOC_TYPES_H ++#define _BCACHEFS_ALLOC_TYPES_H ++ ++#include ++#include ++ ++#include "clock_types.h" ++#include "fifo.h" ++ ++struct ec_bucket_buf; ++ ++/* There's two of these clocks, one for reads and one for writes: */ ++struct bucket_clock { ++ /* ++ * "now" in (read/write) IO time - incremented whenever we do X amount ++ * of reads or writes. 
++ * ++ * Goes with the bucket read/write prios: when we read or write to a ++ * bucket we reset the bucket's prio to the current hand; thus hand - ++ * prio = time since bucket was last read/written. ++ * ++ * The units are some amount (bytes/sectors) of data read/written, and ++ * the units can change on the fly if we need to rescale to fit ++ * everything in a u16 - your only guarantee is that the units are ++ * consistent. ++ */ ++ u16 hand; ++ u16 max_last_io; ++ ++ int rw; ++ ++ struct io_timer rescale; ++ struct mutex lock; ++}; ++ ++/* There is one reserve for each type of btree, one for prios and gens ++ * and one for moving GC */ ++enum alloc_reserve { ++ RESERVE_ALLOC = -1, ++ RESERVE_BTREE = 0, ++ RESERVE_MOVINGGC = 1, ++ RESERVE_NONE = 2, ++ RESERVE_NR = 3, ++}; ++ ++typedef FIFO(long) alloc_fifo; ++ ++#define OPEN_BUCKETS_COUNT 1024 ++ ++#define WRITE_POINT_HASH_NR 32 ++#define WRITE_POINT_MAX 32 ++ ++typedef u16 open_bucket_idx_t; ++ ++struct open_bucket { ++ spinlock_t lock; ++ atomic_t pin; ++ open_bucket_idx_t freelist; ++ ++ /* ++ * When an open bucket has an ec_stripe attached, this is the index of ++ * the block in the stripe this open_bucket corresponds to: ++ */ ++ u8 ec_idx; ++ u8 type; ++ unsigned valid:1; ++ unsigned on_partial_list:1; ++ int alloc_reserve:3; ++ unsigned sectors_free; ++ struct bch_extent_ptr ptr; ++ struct ec_stripe_new *ec; ++}; ++ ++#define OPEN_BUCKET_LIST_MAX 15 ++ ++struct open_buckets { ++ open_bucket_idx_t nr; ++ open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; ++}; ++ ++struct dev_stripe_state { ++ u64 next_alloc[BCH_SB_MEMBERS_MAX]; ++}; ++ ++struct write_point { ++ struct hlist_node node; ++ struct mutex lock; ++ u64 last_used; ++ unsigned long write_point; ++ enum bch_data_type type; ++ bool is_ec; ++ ++ /* calculated based on how many pointers we're actually going to use: */ ++ unsigned sectors_free; ++ ++ struct open_buckets ptrs; ++ struct dev_stripe_state stripe; ++}; ++ ++struct write_point_specifier { ++ unsigned long v; ++}; ++ ++struct alloc_heap_entry { ++ size_t bucket; ++ size_t nr; ++ unsigned long key; ++}; ++ ++typedef HEAP(struct alloc_heap_entry) alloc_heap; ++ ++#endif /* _BCACHEFS_ALLOC_TYPES_H */ +diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h +new file mode 100644 +index 000000000000..29f411635f29 +--- /dev/null ++++ b/fs/bcachefs/bcachefs.h +@@ -0,0 +1,882 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_H ++#define _BCACHEFS_H ++ ++/* ++ * SOME HIGH LEVEL CODE DOCUMENTATION: ++ * ++ * Bcache mostly works with cache sets, cache devices, and backing devices. ++ * ++ * Support for multiple cache devices hasn't quite been finished off yet, but ++ * it's about 95% plumbed through. A cache set and its cache devices is sort of ++ * like a md raid array and its component devices. Most of the code doesn't care ++ * about individual cache devices, the main abstraction is the cache set. ++ * ++ * Multiple cache devices is intended to give us the ability to mirror dirty ++ * cached data and metadata, without mirroring clean cached data. ++ * ++ * Backing devices are different, in that they have a lifetime independent of a ++ * cache set. When you register a newly formatted backing device it'll come up ++ * in passthrough mode, and then you can attach and detach a backing device from ++ * a cache set at runtime - while it's mounted and in use. Detaching implicitly ++ * invalidates any cached data for that backing device. ++ * ++ * A cache set can have multiple (many) backing devices attached to it. 
++ * ++ * There's also flash only volumes - this is the reason for the distinction ++ * between struct cached_dev and struct bcache_device. A flash only volume ++ * works much like a bcache device that has a backing device, except the ++ * "cached" data is always dirty. The end result is that we get thin ++ * provisioning with very little additional code. ++ * ++ * Flash only volumes work but they're not production ready because the moving ++ * garbage collector needs more work. More on that later. ++ * ++ * BUCKETS/ALLOCATION: ++ * ++ * Bcache is primarily designed for caching, which means that in normal ++ * operation all of our available space will be allocated. Thus, we need an ++ * efficient way of deleting things from the cache so we can write new things to ++ * it. ++ * ++ * To do this, we first divide the cache device up into buckets. A bucket is the ++ * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ ++ * works efficiently. ++ * ++ * Each bucket has a 16 bit priority, and an 8 bit generation associated with ++ * it. The gens and priorities for all the buckets are stored contiguously and ++ * packed on disk (in a linked list of buckets - aside from the superblock, all ++ * of bcache's metadata is stored in buckets). ++ * ++ * The priority is used to implement an LRU. We reset a bucket's priority when ++ * we allocate it or on cache it, and every so often we decrement the priority ++ * of each bucket. It could be used to implement something more sophisticated, ++ * if anyone ever gets around to it. ++ * ++ * The generation is used for invalidating buckets. Each pointer also has an 8 ++ * bit generation embedded in it; for a pointer to be considered valid, its gen ++ * must match the gen of the bucket it points into. Thus, to reuse a bucket all ++ * we have to do is increment its gen (and write its new gen to disk; we batch ++ * this up). ++ * ++ * Bcache is entirely COW - we never write twice to a bucket, even buckets that ++ * contain metadata (including btree nodes). ++ * ++ * THE BTREE: ++ * ++ * Bcache is in large part design around the btree. ++ * ++ * At a high level, the btree is just an index of key -> ptr tuples. ++ * ++ * Keys represent extents, and thus have a size field. Keys also have a variable ++ * number of pointers attached to them (potentially zero, which is handy for ++ * invalidating the cache). ++ * ++ * The key itself is an inode:offset pair. The inode number corresponds to a ++ * backing device or a flash only volume. The offset is the ending offset of the ++ * extent within the inode - not the starting offset; this makes lookups ++ * slightly more convenient. ++ * ++ * Pointers contain the cache device id, the offset on that device, and an 8 bit ++ * generation number. More on the gen later. ++ * ++ * Index lookups are not fully abstracted - cache lookups in particular are ++ * still somewhat mixed in with the btree code, but things are headed in that ++ * direction. ++ * ++ * Updates are fairly well abstracted, though. There are two different ways of ++ * updating the btree; insert and replace. ++ * ++ * BTREE_INSERT will just take a list of keys and insert them into the btree - ++ * overwriting (possibly only partially) any extents they overlap with. This is ++ * used to update the index after a write. ++ * ++ * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is ++ * overwriting a key that matches another given key. 
This is used for inserting ++ * data into the cache after a cache miss, and for background writeback, and for ++ * the moving garbage collector. ++ * ++ * There is no "delete" operation; deleting things from the index is ++ * accomplished by either by invalidating pointers (by incrementing a bucket's ++ * gen) or by inserting a key with 0 pointers - which will overwrite anything ++ * previously present at that location in the index. ++ * ++ * This means that there are always stale/invalid keys in the btree. They're ++ * filtered out by the code that iterates through a btree node, and removed when ++ * a btree node is rewritten. ++ * ++ * BTREE NODES: ++ * ++ * Our unit of allocation is a bucket, and we we can't arbitrarily allocate and ++ * free smaller than a bucket - so, that's how big our btree nodes are. ++ * ++ * (If buckets are really big we'll only use part of the bucket for a btree node ++ * - no less than 1/4th - but a bucket still contains no more than a single ++ * btree node. I'd actually like to change this, but for now we rely on the ++ * bucket's gen for deleting btree nodes when we rewrite/split a node.) ++ * ++ * Anyways, btree nodes are big - big enough to be inefficient with a textbook ++ * btree implementation. ++ * ++ * The way this is solved is that btree nodes are internally log structured; we ++ * can append new keys to an existing btree node without rewriting it. This ++ * means each set of keys we write is sorted, but the node is not. ++ * ++ * We maintain this log structure in memory - keeping 1Mb of keys sorted would ++ * be expensive, and we have to distinguish between the keys we have written and ++ * the keys we haven't. So to do a lookup in a btree node, we have to search ++ * each sorted set. But we do merge written sets together lazily, so the cost of ++ * these extra searches is quite low (normally most of the keys in a btree node ++ * will be in one big set, and then there'll be one or two sets that are much ++ * smaller). ++ * ++ * This log structure makes bcache's btree more of a hybrid between a ++ * conventional btree and a compacting data structure, with some of the ++ * advantages of both. ++ * ++ * GARBAGE COLLECTION: ++ * ++ * We can't just invalidate any bucket - it might contain dirty data or ++ * metadata. If it once contained dirty data, other writes might overwrite it ++ * later, leaving no valid pointers into that bucket in the index. ++ * ++ * Thus, the primary purpose of garbage collection is to find buckets to reuse. ++ * It also counts how much valid data it each bucket currently contains, so that ++ * allocation can reuse buckets sooner when they've been mostly overwritten. ++ * ++ * It also does some things that are really internal to the btree ++ * implementation. If a btree node contains pointers that are stale by more than ++ * some threshold, it rewrites the btree node to avoid the bucket's generation ++ * wrapping around. It also merges adjacent btree nodes if they're empty enough. ++ * ++ * THE JOURNAL: ++ * ++ * Bcache's journal is not necessary for consistency; we always strictly ++ * order metadata writes so that the btree and everything else is consistent on ++ * disk in the event of an unclean shutdown, and in fact bcache had writeback ++ * caching (with recovery from unclean shutdown) before journalling was ++ * implemented. 
++ * ++ * Rather, the journal is purely a performance optimization; we can't complete a ++ * write until we've updated the index on disk, otherwise the cache would be ++ * inconsistent in the event of an unclean shutdown. This means that without the ++ * journal, on random write workloads we constantly have to update all the leaf ++ * nodes in the btree, and those writes will be mostly empty (appending at most ++ * a few keys each) - highly inefficient in terms of amount of metadata writes, ++ * and it puts more strain on the various btree resorting/compacting code. ++ * ++ * The journal is just a log of keys we've inserted; on startup we just reinsert ++ * all the keys in the open journal entries. That means that when we're updating ++ * a node in the btree, we can wait until a 4k block of keys fills up before ++ * writing them out. ++ * ++ * For simplicity, we only journal updates to leaf nodes; updates to parent ++ * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth ++ * the complexity to deal with journalling them (in particular, journal replay) ++ * - updates to non leaf nodes just happen synchronously (see btree_split()). ++ */ ++ ++#undef pr_fmt ++#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "bcachefs_format.h" ++#include "fifo.h" ++#include "opts.h" ++#include "util.h" ++ ++#define dynamic_fault(...) 0 ++#define race_fault(...) 0 ++ ++#define bch2_fs_init_fault(name) \ ++ dynamic_fault("bcachefs:bch_fs_init:" name) ++#define bch2_meta_read_fault(name) \ ++ dynamic_fault("bcachefs:meta:read:" name) ++#define bch2_meta_write_fault(name) \ ++ dynamic_fault("bcachefs:meta:write:" name) ++ ++#ifdef __KERNEL__ ++#define bch2_fmt(_c, fmt) "bcachefs (%s): " fmt "\n", ((_c)->name) ++#else ++#define bch2_fmt(_c, fmt) fmt "\n" ++#endif ++ ++#define bch_info(c, fmt, ...) \ ++ printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_notice(c, fmt, ...) \ ++ printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn(c, fmt, ...) \ ++ printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_warn_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err(c, fmt, ...) \ ++ printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++#define bch_err_ratelimited(c, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) ++ ++#define bch_verbose(c, fmt, ...) \ ++do { \ ++ if ((c)->opts.verbose) \ ++ bch_info(c, fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++#define pr_verbose_init(opts, fmt, ...) 
\ ++do { \ ++ if (opt_get(opts, verbose)) \ ++ pr_info(fmt, ##__VA_ARGS__); \ ++} while (0) ++ ++/* Parameters that are useful for debugging, but should always be compiled in: */ ++#define BCH_DEBUG_PARAMS_ALWAYS() \ ++ BCH_DEBUG_PARAM(key_merging_disabled, \ ++ "Disables merging of extents") \ ++ BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ ++ "Causes mark and sweep to compact and rewrite every " \ ++ "btree node it traverses") \ ++ BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ ++ "Disables rewriting of btree nodes during mark and sweep")\ ++ BCH_DEBUG_PARAM(btree_shrinker_disabled, \ ++ "Disables the shrinker callback for the btree node cache") ++ ++/* Parameters that should only be compiled in in debug mode: */ ++#define BCH_DEBUG_PARAMS_DEBUG() \ ++ BCH_DEBUG_PARAM(expensive_debug_checks, \ ++ "Enables various runtime debugging checks that " \ ++ "significantly affect performance") \ ++ BCH_DEBUG_PARAM(debug_check_iterators, \ ++ "Enables extra verification for btree iterators") \ ++ BCH_DEBUG_PARAM(debug_check_bkeys, \ ++ "Run bkey_debugcheck (primarily checking GC/allocation "\ ++ "information) when iterating over keys") \ ++ BCH_DEBUG_PARAM(verify_btree_ondisk, \ ++ "Reread btree nodes at various points to verify the " \ ++ "mergesort in the read path against modifications " \ ++ "done in memory") \ ++ BCH_DEBUG_PARAM(journal_seq_verify, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(inject_invalid_keys, \ ++ "Store the journal sequence number in the version " \ ++ "number of every btree key, and verify that btree " \ ++ "update ordering is preserved during recovery") \ ++ BCH_DEBUG_PARAM(test_alloc_startup, \ ++ "Force allocator startup to use the slowpath where it" \ ++ "can't find enough free buckets without invalidating" \ ++ "cached data") \ ++ BCH_DEBUG_PARAM(force_reconstruct_read, \ ++ "Force reads to use the reconstruct path, when reading" \ ++ "from erasure coded extents") \ ++ BCH_DEBUG_PARAM(test_restart_gc, \ ++ "Test restarting mark and sweep gc when bucket gens change") ++ ++#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() ++#else ++#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() ++#endif ++ ++#define BCH_TIME_STATS() \ ++ x(btree_node_mem_alloc) \ ++ x(btree_node_split) \ ++ x(btree_node_sort) \ ++ x(btree_node_read) \ ++ x(btree_gc) \ ++ x(btree_lock_contended_read) \ ++ x(btree_lock_contended_intent) \ ++ x(btree_lock_contended_write) \ ++ x(data_write) \ ++ x(data_read) \ ++ x(data_promote) \ ++ x(journal_write) \ ++ x(journal_delay) \ ++ x(journal_flush_seq) \ ++ x(blocked_journal) \ ++ x(blocked_allocate) \ ++ x(blocked_allocate_open_bucket) ++ ++enum bch_time_stats { ++#define x(name) BCH_TIME_##name, ++ BCH_TIME_STATS() ++#undef x ++ BCH_TIME_STAT_NR ++}; ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "clock_types.h" ++#include "ec_types.h" ++#include "journal_types.h" ++#include "keylist_types.h" ++#include "quota_types.h" ++#include "rebalance_types.h" ++#include "replicas_types.h" ++#include "super_types.h" ++ ++/* Number of nodes btree coalesce will try to coalesce at once */ ++#define GC_MERGE_NODES 4U ++ ++/* Maximum number of nodes we might need to allocate atomically: */ ++#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) ++ 
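The BCH_TIME_STATS()/x() and BCH_DEBUG_PARAM() lists above are the classic "X macro" idiom: one list, expanded several times, so an enum and the tables derived from it can never drift out of sync. A minimal standalone sketch, with invented names (DEMO_STATS, demo_stat_names) rather than the real bcachefs macros:

#include <stdio.h>

#define DEMO_STATS()	\
	x(reads)	\
	x(writes)	\
	x(flushes)

enum demo_stat {
#define x(name)	DEMO_STAT_##name,
	DEMO_STATS()
#undef x
	DEMO_STAT_NR
};

static const char * const demo_stat_names[] = {
#define x(name)	#name,
	DEMO_STATS()
#undef x
};

int main(void)
{
	int i;

	for (i = 0; i < DEMO_STAT_NR; i++)
		printf("%d: %s\n", i, demo_stat_names[i]);
	return 0;
}

Adding a new stat then only means touching the DEMO_STATS() list; the enum value and the name table pick it up automatically.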
++/* Size of the freelist we allocate btree nodes from: */ ++#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) ++ ++#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) ++ ++struct btree; ++ ++enum gc_phase { ++ GC_PHASE_NOT_RUNNING, ++ GC_PHASE_START, ++ GC_PHASE_SB, ++ ++ GC_PHASE_BTREE_EC, ++ GC_PHASE_BTREE_EXTENTS, ++ GC_PHASE_BTREE_INODES, ++ GC_PHASE_BTREE_DIRENTS, ++ GC_PHASE_BTREE_XATTRS, ++ GC_PHASE_BTREE_ALLOC, ++ GC_PHASE_BTREE_QUOTAS, ++ GC_PHASE_BTREE_REFLINK, ++ ++ GC_PHASE_PENDING_DELETE, ++ GC_PHASE_ALLOC, ++}; ++ ++struct gc_pos { ++ enum gc_phase phase; ++ struct bpos pos; ++ unsigned level; ++}; ++ ++struct io_count { ++ u64 sectors[2][BCH_DATA_NR]; ++}; ++ ++struct bch_dev { ++ struct kobject kobj; ++ struct percpu_ref ref; ++ struct completion ref_completion; ++ struct percpu_ref io_ref; ++ struct completion io_ref_completion; ++ ++ struct bch_fs *fs; ++ ++ u8 dev_idx; ++ /* ++ * Cached version of this device's member info from superblock ++ * Committed by bch2_write_super() -> bch_fs_mi_update() ++ */ ++ struct bch_member_cpu mi; ++ uuid_le uuid; ++ char name[BDEVNAME_SIZE]; ++ ++ struct bch_sb_handle disk_sb; ++ struct bch_sb *sb_read_scratch; ++ int sb_write_error; ++ ++ struct bch_devs_mask self; ++ ++ /* biosets used in cloned bios for writing multiple replicas */ ++ struct bio_set replica_set; ++ ++ /* ++ * Buckets: ++ * Per-bucket arrays are protected by c->mark_lock, bucket_lock and ++ * gc_lock, for device resize - holding any is sufficient for access: ++ * Or rcu_read_lock(), but only for ptr_stale(): ++ */ ++ struct bucket_array __rcu *buckets[2]; ++ unsigned long *buckets_nouse; ++ struct rw_semaphore bucket_lock; ++ ++ struct bch_dev_usage __percpu *usage[2]; ++ ++ /* Allocator: */ ++ struct task_struct __rcu *alloc_thread; ++ ++ /* ++ * free: Buckets that are ready to be used ++ * ++ * free_inc: Incoming buckets - these are buckets that currently have ++ * cached data in them, and we can't reuse them until after we write ++ * their new gen to disk. 
After prio_write() finishes writing the new ++ * gens/prios, they'll be moved to the free list (and possibly discarded ++ * in the process) ++ */ ++ alloc_fifo free[RESERVE_NR]; ++ alloc_fifo free_inc; ++ ++ open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; ++ open_bucket_idx_t open_buckets_partial_nr; ++ ++ size_t fifo_last_bucket; ++ ++ /* last calculated minimum prio */ ++ u16 max_last_bucket_io[2]; ++ ++ size_t inc_gen_needs_gc; ++ size_t inc_gen_really_needs_gc; ++ ++ /* ++ * XXX: this should be an enum for allocator state, so as to include ++ * error state ++ */ ++ enum { ++ ALLOCATOR_STOPPED, ++ ALLOCATOR_RUNNING, ++ ALLOCATOR_BLOCKED, ++ ALLOCATOR_BLOCKED_FULL, ++ } allocator_state; ++ ++ alloc_heap alloc_heap; ++ ++ atomic64_t rebalance_work; ++ ++ struct journal_device journal; ++ ++ struct work_struct io_error_work; ++ ++ /* The rest of this all shows up in sysfs */ ++ atomic64_t cur_latency[2]; ++ struct time_stats io_latency[2]; ++ ++#define CONGESTED_MAX 1024 ++ atomic_t congested; ++ u64 congested_last; ++ ++ struct io_count __percpu *io_done; ++}; ++ ++enum { ++ /* startup: */ ++ BCH_FS_ALLOC_READ_DONE, ++ BCH_FS_ALLOC_CLEAN, ++ BCH_FS_ALLOCATOR_RUNNING, ++ BCH_FS_ALLOCATOR_STOPPING, ++ BCH_FS_INITIAL_GC_DONE, ++ BCH_FS_BTREE_INTERIOR_REPLAY_DONE, ++ BCH_FS_FSCK_DONE, ++ BCH_FS_STARTED, ++ BCH_FS_RW, ++ ++ /* shutdown: */ ++ BCH_FS_STOPPING, ++ BCH_FS_EMERGENCY_RO, ++ BCH_FS_WRITE_DISABLE_COMPLETE, ++ ++ /* errors: */ ++ BCH_FS_ERROR, ++ BCH_FS_ERRORS_FIXED, ++ ++ /* misc: */ ++ BCH_FS_FIXED_GENS, ++ BCH_FS_ALLOC_WRITTEN, ++ BCH_FS_REBUILD_REPLICAS, ++ BCH_FS_HOLD_BTREE_WRITES, ++}; ++ ++struct btree_debug { ++ unsigned id; ++ struct dentry *btree; ++ struct dentry *btree_format; ++ struct dentry *failed; ++}; ++ ++struct bch_fs_pcpu { ++ u64 sectors_available; ++}; ++ ++struct journal_seq_blacklist_table { ++ size_t nr; ++ struct journal_seq_blacklist_table_entry { ++ u64 start; ++ u64 end; ++ bool dirty; ++ } entries[0]; ++}; ++ ++struct journal_keys { ++ struct journal_key { ++ enum btree_id btree_id:8; ++ unsigned level:8; ++ struct bkey_i *k; ++ u32 journal_seq; ++ u32 journal_offset; ++ } *d; ++ size_t nr; ++ u64 journal_seq_base; ++}; ++ ++struct bch_fs { ++ struct closure cl; ++ ++ struct list_head list; ++ struct kobject kobj; ++ struct kobject internal; ++ struct kobject opts_dir; ++ struct kobject time_stats; ++ unsigned long flags; ++ ++ int minor; ++ struct device *chardev; ++ struct super_block *vfs_sb; ++ char name[40]; ++ ++ /* ro/rw, add/remove/resize devices: */ ++ struct rw_semaphore state_lock; ++ ++ /* Counts outstanding writes, for clean transition to read-only */ ++ struct percpu_ref writes; ++ struct work_struct read_only_work; ++ ++ struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; ++ ++ struct bch_replicas_cpu replicas; ++ struct bch_replicas_cpu replicas_gc; ++ struct mutex replicas_gc_lock; ++ ++ struct journal_entry_res replicas_journal_res; ++ ++ struct bch_disk_groups_cpu __rcu *disk_groups; ++ ++ struct bch_opts opts; ++ ++ /* Updated by bch2_sb_update():*/ ++ struct { ++ uuid_le uuid; ++ uuid_le user_uuid; ++ ++ u16 version; ++ u16 encoded_extent_max; ++ ++ u8 nr_devices; ++ u8 clean; ++ ++ u8 encryption_type; ++ ++ u64 time_base_lo; ++ u32 time_base_hi; ++ u32 time_precision; ++ u64 features; ++ u64 compat; ++ } sb; ++ ++ struct bch_sb_handle disk_sb; ++ ++ unsigned short block_bits; /* ilog2(block_size) */ ++ ++ u16 btree_foreground_merge_threshold; ++ ++ struct closure sb_write; ++ struct mutex sb_lock; ++ ++ /* BTREE CACHE */ ++ 
struct bio_set btree_bio; ++ ++ struct btree_root btree_roots[BTREE_ID_NR]; ++ struct mutex btree_root_lock; ++ ++ struct btree_cache btree_cache; ++ ++ /* ++ * Cache of allocated btree nodes - if we allocate a btree node and ++ * don't use it, if we free it that space can't be reused until going ++ * _all_ the way through the allocator (which exposes us to a livelock ++ * when allocating btree reserves fail halfway through) - instead, we ++ * can stick them here: ++ */ ++ struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; ++ unsigned btree_reserve_cache_nr; ++ struct mutex btree_reserve_cache_lock; ++ ++ mempool_t btree_interior_update_pool; ++ struct list_head btree_interior_update_list; ++ struct list_head btree_interior_updates_unwritten; ++ struct mutex btree_interior_update_lock; ++ struct closure_waitlist btree_interior_update_wait; ++ ++ struct workqueue_struct *btree_interior_update_worker; ++ struct work_struct btree_interior_update_work; ++ ++ /* btree_iter.c: */ ++ struct mutex btree_trans_lock; ++ struct list_head btree_trans_list; ++ mempool_t btree_iters_pool; ++ ++ struct btree_key_cache btree_key_cache; ++ ++ struct workqueue_struct *wq; ++ /* copygc needs its own workqueue for index updates.. */ ++ struct workqueue_struct *copygc_wq; ++ struct workqueue_struct *journal_reclaim_wq; ++ ++ /* ALLOCATION */ ++ struct delayed_work pd_controllers_update; ++ unsigned pd_controllers_update_seconds; ++ ++ struct bch_devs_mask rw_devs[BCH_DATA_NR]; ++ ++ u64 capacity; /* sectors */ ++ ++ /* ++ * When capacity _decreases_ (due to a disk being removed), we ++ * increment capacity_gen - this invalidates outstanding reservations ++ * and forces them to be revalidated ++ */ ++ u32 capacity_gen; ++ unsigned bucket_size_max; ++ ++ atomic64_t sectors_available; ++ ++ struct bch_fs_pcpu __percpu *pcpu; ++ ++ struct percpu_rw_semaphore mark_lock; ++ ++ seqcount_t usage_lock; ++ struct bch_fs_usage *usage_base; ++ struct bch_fs_usage __percpu *usage[2]; ++ struct bch_fs_usage __percpu *usage_gc; ++ ++ /* single element mempool: */ ++ struct mutex usage_scratch_lock; ++ struct bch_fs_usage *usage_scratch; ++ ++ /* ++ * When we invalidate buckets, we use both the priority and the amount ++ * of good data to determine which buckets to reuse first - to weight ++ * those together consistently we keep track of the smallest nonzero ++ * priority of any bucket. ++ */ ++ struct bucket_clock bucket_clock[2]; ++ ++ struct io_clock io_clock[2]; ++ ++ /* JOURNAL SEQ BLACKLIST */ ++ struct journal_seq_blacklist_table * ++ journal_seq_blacklist_table; ++ struct work_struct journal_seq_blacklist_gc_work; ++ ++ /* ALLOCATOR */ ++ spinlock_t freelist_lock; ++ struct closure_waitlist freelist_wait; ++ u64 blocked_allocate; ++ u64 blocked_allocate_open_bucket; ++ open_bucket_idx_t open_buckets_freelist; ++ open_bucket_idx_t open_buckets_nr_free; ++ struct closure_waitlist open_buckets_wait; ++ struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; ++ ++ struct write_point btree_write_point; ++ struct write_point rebalance_write_point; ++ ++ struct write_point write_points[WRITE_POINT_MAX]; ++ struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; ++ struct mutex write_points_hash_lock; ++ unsigned write_points_nr; ++ ++ /* GARBAGE COLLECTION */ ++ struct task_struct *gc_thread; ++ atomic_t kick_gc; ++ unsigned long gc_count; ++ ++ /* ++ * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] ++ * has been marked by GC. 
++ * ++ * gc_cur_phase is a superset of btree_ids (BTREE_ID_EXTENTS etc.) ++ * ++ * Protected by gc_pos_lock. Only written to by GC thread, so GC thread ++ * can read without a lock. ++ */ ++ seqcount_t gc_pos_lock; ++ struct gc_pos gc_pos; ++ ++ /* ++ * The allocation code needs gc_mark in struct bucket to be correct, but ++ * it's not while a gc is in progress. ++ */ ++ struct rw_semaphore gc_lock; ++ ++ /* IO PATH */ ++ struct semaphore io_in_flight; ++ struct bio_set bio_read; ++ struct bio_set bio_read_split; ++ struct bio_set bio_write; ++ struct mutex bio_bounce_pages_lock; ++ mempool_t bio_bounce_pages; ++ struct rhashtable promote_table; ++ ++ mempool_t compression_bounce[2]; ++ mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; ++ mempool_t decompress_workspace; ++ ZSTD_parameters zstd_params; ++ ++ struct crypto_shash *sha256; ++ struct crypto_sync_skcipher *chacha20; ++ struct crypto_shash *poly1305; ++ ++ atomic64_t key_version; ++ ++ mempool_t large_bkey_pool; ++ ++ /* REBALANCE */ ++ struct bch_fs_rebalance rebalance; ++ ++ /* COPYGC */ ++ struct task_struct *copygc_thread; ++ copygc_heap copygc_heap; ++ struct bch_pd_controller copygc_pd; ++ struct write_point copygc_write_point; ++ u64 copygc_threshold; ++ ++ /* STRIPES: */ ++ GENRADIX(struct stripe) stripes[2]; ++ ++ ec_stripes_heap ec_stripes_heap; ++ spinlock_t ec_stripes_heap_lock; ++ ++ /* ERASURE CODING */ ++ struct list_head ec_stripe_head_list; ++ struct mutex ec_stripe_head_lock; ++ ++ struct list_head ec_stripe_new_list; ++ struct mutex ec_stripe_new_lock; ++ ++ struct work_struct ec_stripe_create_work; ++ u64 ec_stripe_hint; ++ ++ struct bio_set ec_bioset; ++ ++ struct work_struct ec_stripe_delete_work; ++ struct llist_head ec_stripe_delete_list; ++ ++ /* REFLINK */ ++ u64 reflink_hint; ++ ++ /* VFS IO PATH - fs-io.c */ ++ struct bio_set writepage_bioset; ++ struct bio_set dio_write_bioset; ++ struct bio_set dio_read_bioset; ++ ++ struct bio_list btree_write_error_list; ++ struct work_struct btree_write_error_work; ++ spinlock_t btree_write_error_lock; ++ ++ /* ERRORS */ ++ struct list_head fsck_errors; ++ struct mutex fsck_error_lock; ++ bool fsck_alloc_err; ++ ++ /* QUOTAS */ ++ struct bch_memquota_type quotas[QTYP_NR]; ++ ++ /* DEBUG JUNK */ ++ struct dentry *debug; ++ struct btree_debug btree_debug[BTREE_ID_NR]; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree *verify_data; ++ struct btree_node *verify_ondisk; ++ struct mutex verify_lock; ++#endif ++ ++ u64 unused_inode_hint; ++ ++ /* ++ * A btree node on disk could have too many bsets for an iterator to fit ++ * on the stack - have to dynamically allocate them ++ */ ++ mempool_t fill_iter; ++ ++ mempool_t btree_bounce_pool; ++ ++ struct journal journal; ++ struct list_head journal_entries; ++ struct journal_keys journal_keys; ++ ++ u64 last_bucket_seq_cleanup; ++ ++ /* The rest of this all shows up in sysfs */ ++ atomic_long_t read_realloc_races; ++ atomic_long_t extent_migrate_done; ++ atomic_long_t extent_migrate_raced; ++ ++ unsigned btree_gc_periodic:1; ++ unsigned copy_gc_enabled:1; ++ bool promote_whole_extents; ++ ++#define BCH_DEBUG_PARAM(name, description) bool name; ++ BCH_DEBUG_PARAMS_ALL() ++#undef BCH_DEBUG_PARAM ++ ++ struct time_stats times[BCH_TIME_STAT_NR]; ++}; ++ ++static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) ++{ ++#ifndef NO_BCACHEFS_FS ++ if (c->vfs_sb) ++ c->vfs_sb->s_bdi->ra_pages = ra_pages; ++#endif ++} ++ ++static inline unsigned bucket_bytes(const struct bch_dev *ca) ++{ ++ return ca->mi.bucket_size 
<< 9; ++} ++ ++static inline unsigned block_bytes(const struct bch_fs *c) ++{ ++ return c->opts.block_size << 9; ++} ++ ++static inline struct timespec64 bch2_time_to_timespec(struct bch_fs *c, u64 time) ++{ ++ return ns_to_timespec64(time * c->sb.time_precision + c->sb.time_base_lo); ++} ++ ++static inline s64 timespec_to_bch2_time(struct bch_fs *c, struct timespec64 ts) ++{ ++ s64 ns = timespec64_to_ns(&ts) - c->sb.time_base_lo; ++ ++ if (c->sb.time_precision == 1) ++ return ns; ++ ++ return div_s64(ns, c->sb.time_precision); ++} ++ ++static inline s64 bch2_current_time(struct bch_fs *c) ++{ ++ struct timespec64 now; ++ ++ ktime_get_coarse_real_ts64(&now); ++ return timespec_to_bch2_time(c, now); ++} ++ ++static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) ++{ ++ return dev < c->sb.nr_devices && c->devs[dev]; ++} ++ ++#endif /* _BCACHEFS_H */ +diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h +new file mode 100644 +index 000000000000..d5a2230e403c +--- /dev/null ++++ b/fs/bcachefs/bcachefs_format.h +@@ -0,0 +1,1671 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FORMAT_H ++#define _BCACHEFS_FORMAT_H ++ ++/* ++ * bcachefs on disk data structures ++ * ++ * OVERVIEW: ++ * ++ * There are three main types of on disk data structures in bcachefs (this is ++ * reduced from 5 in bcache) ++ * ++ * - superblock ++ * - journal ++ * - btree ++ * ++ * The btree is the primary structure; most metadata exists as keys in the ++ * various btrees. There are only a small number of btrees, they're not ++ * sharded - we have one btree for extents, another for inodes, et cetera. ++ * ++ * SUPERBLOCK: ++ * ++ * The superblock contains the location of the journal, the list of devices in ++ * the filesystem, and in general any metadata we need in order to decide ++ * whether we can start a filesystem or prior to reading the journal/btree ++ * roots. ++ * ++ * The superblock is extensible, and most of the contents of the superblock are ++ * in variable length, type tagged fields; see struct bch_sb_field. ++ * ++ * Backup superblocks do not reside in a fixed location; also, superblocks do ++ * not have a fixed size. To locate backup superblocks we have struct ++ * bch_sb_layout; we store a copy of this inside every superblock, and also ++ * before the first superblock. ++ * ++ * JOURNAL: ++ * ++ * The journal primarily records btree updates in the order they occurred; ++ * journal replay consists of just iterating over all the keys in the open ++ * journal entries and re-inserting them into the btrees. ++ * ++ * The journal also contains entry types for the btree roots, and blacklisted ++ * journal sequence numbers (see journal_seq_blacklist.c). ++ * ++ * BTREE: ++ * ++ * bcachefs btrees are copy on write b+ trees, where nodes are big (typically ++ * 128k-256k) and log structured. We use struct btree_node for writing the first ++ * entry in a given node (offset 0), and struct btree_node_entry for all ++ * subsequent writes. ++ * ++ * After the header, btree node entries contain a list of keys in sorted order. ++ * Values are stored inline with the keys; since values are variable length (and ++ * keys effectively are variable length too, due to packing) we can't do random ++ * access without building up additional in memory tables in the btree node read ++ * path. 
++ * ++ * BTREE KEYS (struct bkey): ++ * ++ * The various btrees share a common format for the key - so as to avoid ++ * switching in fastpath lookup/comparison code - but define their own ++ * structures for the key values. ++ * ++ * The size of a key/value pair is stored as a u8 in units of u64s, so the max ++ * size is just under 2k. The common part also contains a type tag for the ++ * value, and a format field indicating whether the key is packed or not (and ++ * also meant to allow adding new key fields in the future, if desired). ++ * ++ * bkeys, when stored within a btree node, may also be packed. In that case, the ++ * bkey_format in that node is used to unpack it. Packed bkeys mean that we can ++ * be generous with field sizes in the common part of the key format (64 bit ++ * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define LE_BITMASK(_bits, name, type, field, offset, end) \ ++static const unsigned name##_OFFSET = offset; \ ++static const unsigned name##_BITS = (end - offset); \ ++static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ ++ \ ++static inline __u64 name(const type *k) \ ++{ \ ++ return (__le##_bits##_to_cpu(k->field) >> offset) & \ ++ ~(~0ULL << (end - offset)); \ ++} \ ++ \ ++static inline void SET_##name(type *k, __u64 v) \ ++{ \ ++ __u##_bits new = __le##_bits##_to_cpu(k->field); \ ++ \ ++ new &= ~(~(~0ULL << (end - offset)) << offset); \ ++ new |= (v & ~(~0ULL << (end - offset))) << offset; \ ++ k->field = __cpu_to_le##_bits(new); \ ++} ++ ++#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) ++#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) ++#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) ++ ++struct bkey_format { ++ __u8 key_u64s; ++ __u8 nr_fields; ++ /* One unused slot for now: */ ++ __u8 bits_per_field[6]; ++ __le64 field_offset[6]; ++}; ++ ++/* Btree keys - all units are in sectors */ ++ ++struct bpos { ++ /* ++ * Word order matches machine byte order - btree code treats a bpos as a ++ * single large integer, for search/comparison purposes ++ * ++ * Note that wherever a bpos is embedded in another on disk data ++ * structure, it has to be byte swabbed when reading in metadata that ++ * wasn't written in native endian order: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u32 snapshot; ++ __u64 offset; ++ __u64 inode; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u64 inode; ++ __u64 offset; /* Points to end of extent - sectors */ ++ __u32 snapshot; ++#else ++#error edit for your odd byteorder. 
++#endif ++} __attribute__((packed, aligned(4))); ++ ++#define KEY_INODE_MAX ((__u64)~0ULL) ++#define KEY_OFFSET_MAX ((__u64)~0ULL) ++#define KEY_SNAPSHOT_MAX ((__u32)~0U) ++#define KEY_SIZE_MAX ((__u32)~0U) ++ ++static inline struct bpos POS(__u64 inode, __u64 offset) ++{ ++ struct bpos ret; ++ ++ ret.inode = inode; ++ ret.offset = offset; ++ ret.snapshot = 0; ++ ++ return ret; ++} ++ ++#define POS_MIN POS(0, 0) ++#define POS_MAX POS(KEY_INODE_MAX, KEY_OFFSET_MAX) ++ ++/* Empty placeholder struct, for container_of() */ ++struct bch_val { ++ __u64 __nothing[0]; ++}; ++ ++struct bversion { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u64 lo; ++ __u32 hi; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ __u32 hi; ++ __u64 lo; ++#endif ++} __attribute__((packed, aligned(4))); ++ ++struct bkey { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#else ++#error edit for your odd byteorder. ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ __u8 pad[1]; ++ ++ struct bversion version; ++ __u32 size; /* extent size, in sectors */ ++ struct bpos p; ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ struct bpos p; ++ __u32 size; /* extent size, in sectors */ ++ struct bversion version; ++ ++ __u8 pad[1]; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bkey_packed { ++ __u64 _data[0]; ++ ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ ++ /* Format of key (0 for format local to btree node) */ ++ ++ /* ++ * XXX: next incompat on disk format change, switch format and ++ * needs_whiteout - bkey_packed() will be cheaper if format is the high ++ * bits of the bitfield ++ */ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u8 format:7, ++ needs_whiteout:1; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u8 needs_whiteout:1, ++ format:7; ++#endif ++ ++ /* Type of the value */ ++ __u8 type; ++ __u8 key_start[0]; ++ ++ /* ++ * We copy bkeys with struct assignment in various places, and while ++ * that shouldn't be done with packed bkeys we can't disallow it in C, ++ * and it's legal to cast a bkey to a bkey_packed - so padding it out ++ * to the same size as struct bkey should hopefully be safest. 
++ */ ++ __u8 pad[sizeof(struct bkey) - 3]; ++} __attribute__((packed, aligned(8))); ++ ++#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) ++#define BKEY_U64s_MAX U8_MAX ++#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) ++ ++#define KEY_PACKED_BITS_START 24 ++ ++#define KEY_FORMAT_LOCAL_BTREE 0 ++#define KEY_FORMAT_CURRENT 1 ++ ++enum bch_bkey_fields { ++ BKEY_FIELD_INODE, ++ BKEY_FIELD_OFFSET, ++ BKEY_FIELD_SNAPSHOT, ++ BKEY_FIELD_SIZE, ++ BKEY_FIELD_VERSION_HI, ++ BKEY_FIELD_VERSION_LO, ++ BKEY_NR_FIELDS, ++}; ++ ++#define bkey_format_field(name, field) \ ++ [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) ++ ++#define BKEY_FORMAT_CURRENT \ ++((struct bkey_format) { \ ++ .key_u64s = BKEY_U64s, \ ++ .nr_fields = BKEY_NR_FIELDS, \ ++ .bits_per_field = { \ ++ bkey_format_field(INODE, p.inode), \ ++ bkey_format_field(OFFSET, p.offset), \ ++ bkey_format_field(SNAPSHOT, p.snapshot), \ ++ bkey_format_field(SIZE, size), \ ++ bkey_format_field(VERSION_HI, version.hi), \ ++ bkey_format_field(VERSION_LO, version.lo), \ ++ }, \ ++}) ++ ++/* bkey with inline value */ ++struct bkey_i { ++ __u64 _data[0]; ++ ++ union { ++ struct { ++ /* Size of combined key and value, in u64s */ ++ __u8 u64s; ++ }; ++ struct { ++ struct bkey k; ++ struct bch_val v; ++ }; ++ }; ++}; ++ ++#define KEY(_inode, _offset, _size) \ ++((struct bkey) { \ ++ .u64s = BKEY_U64s, \ ++ .format = KEY_FORMAT_CURRENT, \ ++ .p = POS(_inode, _offset), \ ++ .size = _size, \ ++}) ++ ++static inline void bkey_init(struct bkey *k) ++{ ++ *k = KEY(0, 0, 0); ++} ++ ++#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) ++ ++#define __BKEY_PADDED(key, pad) \ ++ struct { struct bkey_i key; __u64 key ## _pad[pad]; } ++ ++/* ++ * - DELETED keys are used internally to mark keys that should be ignored but ++ * override keys in composition order. Their version number is ignored. ++ * ++ * - DISCARDED keys indicate that the data is all 0s because it has been ++ * discarded. DISCARDs may have a version; if the version is nonzero the key ++ * will be persistent, otherwise the key will be dropped whenever the btree ++ * node is rewritten (like DELETED keys). ++ * ++ * - ERROR: any read of the data returns a read error, as the data was lost due ++ * to a failing device. Like DISCARDED keys, they can be removed (overridden) ++ * by new writes or cluster-wide GC. Node repair can also overwrite them with ++ * the same or a more recent version number, but not with an older version ++ * number. ++ * ++ * - WHITEOUT: for hash table btrees ++*/ ++#define BCH_BKEY_TYPES() \ ++ x(deleted, 0) \ ++ x(discard, 1) \ ++ x(error, 2) \ ++ x(cookie, 3) \ ++ x(whiteout, 4) \ ++ x(btree_ptr, 5) \ ++ x(extent, 6) \ ++ x(reservation, 7) \ ++ x(inode, 8) \ ++ x(inode_generation, 9) \ ++ x(dirent, 10) \ ++ x(xattr, 11) \ ++ x(alloc, 12) \ ++ x(quota, 13) \ ++ x(stripe, 14) \ ++ x(reflink_p, 15) \ ++ x(reflink_v, 16) \ ++ x(inline_data, 17) \ ++ x(btree_ptr_v2, 18) ++ ++enum bch_bkey_type { ++#define x(name, nr) KEY_TYPE_##name = nr, ++ BCH_BKEY_TYPES() ++#undef x ++ KEY_TYPE_MAX, ++}; ++ ++struct bch_cookie { ++ struct bch_val v; ++ __le64 cookie; ++}; ++ ++/* Extents */ ++ ++/* ++ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally ++ * preceded by checksum/compression information (bch_extent_crc32 or ++ * bch_extent_crc64). 
++ * ++ * One major determining factor in the format of extents is how we handle and ++ * represent extents that have been partially overwritten and thus trimmed: ++ * ++ * If an extent is not checksummed or compressed, when the extent is trimmed we ++ * don't have to remember the extent we originally allocated and wrote: we can ++ * merely adjust ptr->offset to point to the start of the data that is currently ++ * live. The size field in struct bkey records the current (live) size of the ++ * extent, and is also used to mean "size of region on disk that we point to" in ++ * this case. ++ * ++ * Thus an extent that is not checksummed or compressed will consist only of a ++ * list of bch_extent_ptrs, with none of the fields in ++ * bch_extent_crc32/bch_extent_crc64. ++ * ++ * When an extent is checksummed or compressed, it's not possible to read only ++ * the data that is currently live: we have to read the entire extent that was ++ * originally written, and then return only the part of the extent that is ++ * currently live. ++ * ++ * Thus, in addition to the current size of the extent in struct bkey, we need ++ * to store the size of the originally allocated space - this is the ++ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, ++ * when the extent is trimmed, instead of modifying the offset field of the ++ * pointer, we keep a second smaller offset field - "offset into the original ++ * extent of the currently live region". ++ * ++ * The other major determining factor is replication and data migration: ++ * ++ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated ++ * write, we will initially write all the replicas in the same format, with the ++ * same checksum type and compression format - however, when copygc runs later (or ++ * tiering/cache promotion, anything that moves data), it is not in general ++ * going to rewrite all the pointers at once - one of the replicas may be in a ++ * bucket on one device that has very little fragmentation while another lives ++ * in a bucket that has become heavily fragmented, and thus is being rewritten ++ * sooner than the rest. ++ * ++ * Thus it will only move a subset of the pointers (or in the case of ++ * tiering/cache promotion perhaps add a single pointer without dropping any ++ * current pointers), and if the extent has been partially overwritten it must ++ * write only the currently live portion (or copygc would not be able to reduce ++ * fragmentation!) - which necessitates a different bch_extent_crc format for ++ * the new pointer. ++ * ++ * But in the interests of space efficiency, we don't want to store one ++ * bch_extent_crc for each pointer if we don't have to. ++ * ++ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and ++ * bch_extent_ptrs appended arbitrarily one after the other. We determine the ++ * type of a given entry with a scheme similar to utf8 (except we're encoding a ++ * type, not a size), encoding the type in the position of the first set bit: ++ * ++ * bch_extent_crc32 - 0b1 ++ * bch_extent_ptr - 0b10 ++ * bch_extent_crc64 - 0b100 ++ * ++ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and ++ * bch_extent_crc64 is the least constrained). ++ * ++ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, ++ * until the next bch_extent_crc32/64. ++ * ++ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer ++ * is neither checksummed nor compressed. 
++ */ ++ ++/* 128 bits, sufficient for cryptographic MACs: */ ++struct bch_csum { ++ __le64 lo; ++ __le64 hi; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_EXTENT_ENTRY_TYPES() \ ++ x(ptr, 0) \ ++ x(crc32, 1) \ ++ x(crc64, 2) \ ++ x(crc128, 3) \ ++ x(stripe_ptr, 4) ++#define BCH_EXTENT_ENTRY_MAX 5 ++ ++enum bch_extent_entry_type { ++#define x(f, n) BCH_EXTENT_ENTRY_##f = n, ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++/* Compressed/uncompressed size are stored biased by 1: */ ++struct bch_extent_crc32 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u32 type:2, ++ _compressed_size:7, ++ _uncompressed_size:7, ++ offset:7, ++ _unused:1, ++ csum_type:4, ++ compression_type:4; ++ __u32 csum; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u32 csum; ++ __u32 compression_type:4, ++ csum_type:4, ++ _unused:1, ++ offset:7, ++ _uncompressed_size:7, ++ _compressed_size:7, ++ type:2; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++#define CRC32_SIZE_MAX (1U << 7) ++#define CRC32_NONCE_MAX 0 ++ ++struct bch_extent_crc64 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:3, ++ _compressed_size:9, ++ _uncompressed_size:9, ++ offset:9, ++ nonce:10, ++ csum_type:4, ++ compression_type:4, ++ csum_hi:16; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 csum_hi:16, ++ compression_type:4, ++ csum_type:4, ++ nonce:10, ++ offset:9, ++ _uncompressed_size:9, ++ _compressed_size:9, ++ type:3; ++#endif ++ __u64 csum_lo; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC64_SIZE_MAX (1U << 9) ++#define CRC64_NONCE_MAX ((1U << 10) - 1) ++ ++struct bch_extent_crc128 { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:4, ++ _compressed_size:13, ++ _uncompressed_size:13, ++ offset:13, ++ nonce:13, ++ csum_type:4, ++ compression_type:4; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 compression_type:4, ++ csum_type:4, ++ nonce:13, ++ offset:13, ++ _uncompressed_size:13, ++ _compressed_size:13, ++ type:4; ++#endif ++ struct bch_csum csum; ++} __attribute__((packed, aligned(8))); ++ ++#define CRC128_SIZE_MAX (1U << 13) ++#define CRC128_NONCE_MAX ((1U << 13) - 1) ++ ++/* ++ * @reservation - pointer hasn't been written to, just reserved ++ */ ++struct bch_extent_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:1, ++ cached:1, ++ unused:1, ++ reservation:1, ++ offset:44, /* 8 petabytes */ ++ dev:8, ++ gen:8; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 gen:8, ++ dev:8, ++ offset:44, ++ reservation:1, ++ unused:1, ++ cached:1, ++ type:1; ++#endif ++} __attribute__((packed, aligned(8))); ++ ++struct bch_extent_stripe_ptr { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:5, ++ block:8, ++ idx:51; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 idx:51, ++ block:8, ++ type:5; ++#endif ++}; ++ ++struct bch_extent_reservation { ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++ __u64 type:6, ++ unused:22, ++ replicas:4, ++ generation:32; ++#elif defined (__BIG_ENDIAN_BITFIELD) ++ __u64 generation:32, ++ replicas:4, ++ unused:22, ++ type:6; ++#endif ++}; ++ ++union bch_extent_entry { ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 ++ unsigned long type; ++#elif __BITS_PER_LONG == 32 ++ struct { ++ unsigned long pad; ++ unsigned long type; ++ }; ++#else ++#error edit for your odd byteorder. 
++#endif ++ ++#define x(f, n) struct bch_extent_##f f; ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++}; ++ ++struct bch_btree_ptr { ++ struct bch_val v; ++ ++ struct bch_extent_ptr start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_btree_ptr_v2 { ++ struct bch_val v; ++ ++ __u64 mem_ptr; ++ __le64 seq; ++ __le16 sectors_written; ++ /* In case we ever decide to do variable size btree nodes: */ ++ __le16 sectors; ++ struct bpos min_key; ++ struct bch_extent_ptr start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_extent { ++ struct bch_val v; ++ ++ union bch_extent_entry start[0]; ++ __u64 _data[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_reservation { ++ struct bch_val v; ++ ++ __le32 generation; ++ __u8 nr_replicas; ++ __u8 pad[3]; ++} __attribute__((packed, aligned(8))); ++ ++/* Maximum size (in u64s) a single pointer could be: */ ++#define BKEY_EXTENT_PTR_U64s_MAX\ ++ ((sizeof(struct bch_extent_crc128) + \ ++ sizeof(struct bch_extent_ptr)) / sizeof(u64)) ++ ++/* Maximum possible size of an entire extent value: */ ++#define BKEY_EXTENT_VAL_U64s_MAX \ ++ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) ++ ++#define BKEY_PADDED(key) __BKEY_PADDED(key, BKEY_EXTENT_VAL_U64s_MAX) ++ ++/* * Maximum possible size of an entire extent, key + value: */ ++#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) ++ ++/* Btree pointers don't carry around checksums: */ ++#define BKEY_BTREE_PTR_VAL_U64s_MAX \ ++ ((sizeof(struct bch_btree_ptr_v2) + \ ++ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(u64)) ++#define BKEY_BTREE_PTR_U64s_MAX \ ++ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) ++ ++/* Inodes */ ++ ++#define BLOCKDEV_INODE_MAX 4096 ++ ++#define BCACHEFS_ROOT_INO 4096 ++ ++struct bch_inode { ++ struct bch_val v; ++ ++ __le64 bi_hash_seed; ++ __le32 bi_flags; ++ __le16 bi_mode; ++ __u8 fields[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_inode_generation { ++ struct bch_val v; ++ ++ __le32 bi_generation; ++ __le32 pad; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_INODE_FIELDS() \ ++ x(bi_atime, 64) \ ++ x(bi_ctime, 64) \ ++ x(bi_mtime, 64) \ ++ x(bi_otime, 64) \ ++ x(bi_size, 64) \ ++ x(bi_sectors, 64) \ ++ x(bi_uid, 32) \ ++ x(bi_gid, 32) \ ++ x(bi_nlink, 32) \ ++ x(bi_generation, 32) \ ++ x(bi_dev, 32) \ ++ x(bi_data_checksum, 8) \ ++ x(bi_compression, 8) \ ++ x(bi_project, 32) \ ++ x(bi_background_compression, 8) \ ++ x(bi_data_replicas, 8) \ ++ x(bi_promote_target, 16) \ ++ x(bi_foreground_target, 16) \ ++ x(bi_background_target, 16) \ ++ x(bi_erasure_code, 16) \ ++ x(bi_fields_set, 16) ++ ++/* subset of BCH_INODE_FIELDS */ ++#define BCH_INODE_OPTS() \ ++ x(data_checksum, 8) \ ++ x(compression, 8) \ ++ x(project, 32) \ ++ x(background_compression, 8) \ ++ x(data_replicas, 8) \ ++ x(promote_target, 16) \ ++ x(foreground_target, 16) \ ++ x(background_target, 16) \ ++ x(erasure_code, 16) ++ ++enum inode_opt_id { ++#define x(name, ...) 
\ ++ Inode_opt_##name, ++ BCH_INODE_OPTS() ++#undef x ++ Inode_opt_nr, ++}; ++ ++enum { ++ /* ++ * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL ++ * flags) ++ */ ++ __BCH_INODE_SYNC = 0, ++ __BCH_INODE_IMMUTABLE = 1, ++ __BCH_INODE_APPEND = 2, ++ __BCH_INODE_NODUMP = 3, ++ __BCH_INODE_NOATIME = 4, ++ ++ __BCH_INODE_I_SIZE_DIRTY= 5, ++ __BCH_INODE_I_SECTORS_DIRTY= 6, ++ __BCH_INODE_UNLINKED = 7, ++ ++ /* bits 20+ reserved for packed fields below: */ ++}; ++ ++#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) ++#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) ++#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) ++#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) ++#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) ++#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) ++#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) ++#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) ++ ++LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); ++LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 32); ++ ++/* Dirents */ ++ ++/* ++ * Dirents (and xattrs) have to implement string lookups; since our b-tree ++ * doesn't support arbitrary length strings for the key, we instead index by a ++ * 64 bit hash (currently truncated sha1) of the string, stored in the offset ++ * field of the key - using linear probing to resolve hash collisions. This also ++ * provides us with the readdir cookie posix requires. ++ * ++ * Linear probing requires us to use whiteouts for deletions, in the event of a ++ * collision: ++ */ ++ ++struct bch_dirent { ++ struct bch_val v; ++ ++ /* Target inode number: */ ++ __le64 d_inum; ++ ++ /* ++ * Copy of mode bits 12-15 from the target inode - so userspace can get ++ * the filetype without having to do a stat() ++ */ ++ __u8 d_type; ++ ++ __u8 d_name[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_NAME_MAX (U8_MAX * sizeof(u64) - \ ++ sizeof(struct bkey) - \ ++ offsetof(struct bch_dirent, d_name)) ++ ++ ++/* Xattrs */ ++ ++#define KEY_TYPE_XATTR_INDEX_USER 0 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 ++#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 ++#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 ++#define KEY_TYPE_XATTR_INDEX_SECURITY 4 ++ ++struct bch_xattr { ++ struct bch_val v; ++ __u8 x_type; ++ __u8 x_name_len; ++ __le16 x_val_len; ++ __u8 x_name[]; ++} __attribute__((packed, aligned(8))); ++ ++/* Bucket/allocation information: */ ++ ++struct bch_alloc { ++ struct bch_val v; ++ __u8 fields; ++ __u8 gen; ++ __u8 data[]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_ALLOC_FIELDS() \ ++ x(read_time, 16) \ ++ x(write_time, 16) \ ++ x(data_type, 8) \ ++ x(dirty_sectors, 16) \ ++ x(cached_sectors, 16) \ ++ x(oldest_gen, 8) ++ ++enum { ++#define x(name, bytes) BCH_ALLOC_FIELD_##name, ++ BCH_ALLOC_FIELDS() ++#undef x ++ BCH_ALLOC_FIELD_NR ++}; ++ ++static const unsigned BCH_ALLOC_FIELD_BYTES[] = { ++#define x(name, bits) [BCH_ALLOC_FIELD_##name] = bits / 8, ++ BCH_ALLOC_FIELDS() ++#undef x ++}; ++ ++#define x(name, bits) + (bits / 8) ++static const unsigned BKEY_ALLOC_VAL_U64s_MAX = ++ DIV_ROUND_UP(offsetof(struct bch_alloc, data) ++ BCH_ALLOC_FIELDS(), sizeof(u64)); ++#undef x ++ ++#define BKEY_ALLOC_U64s_MAX (BKEY_U64s + BKEY_ALLOC_VAL_U64s_MAX) ++ ++/* Quotas: */ ++ ++enum quota_types { ++ QTYP_USR = 0, ++ QTYP_GRP = 1, ++ QTYP_PRJ = 2, ++ QTYP_NR = 3, ++}; ++ ++enum quota_counters { ++ Q_SPC = 0, ++ Q_INO = 1, ++ Q_COUNTERS = 2, ++}; ++ ++struct bch_quota_counter { ++ __le64 
hardlimit; ++ __le64 softlimit; ++}; ++ ++struct bch_quota { ++ struct bch_val v; ++ struct bch_quota_counter c[Q_COUNTERS]; ++} __attribute__((packed, aligned(8))); ++ ++/* Erasure coding */ ++ ++struct bch_stripe { ++ struct bch_val v; ++ __le16 sectors; ++ __u8 algorithm; ++ __u8 nr_blocks; ++ __u8 nr_redundant; ++ ++ __u8 csum_granularity_bits; ++ __u8 csum_type; ++ __u8 pad; ++ ++ struct bch_extent_ptr ptrs[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* Reflink: */ ++ ++struct bch_reflink_p { ++ struct bch_val v; ++ __le64 idx; ++ ++ __le32 reservation_generation; ++ __u8 nr_replicas; ++ __u8 pad[3]; ++}; ++ ++struct bch_reflink_v { ++ struct bch_val v; ++ __le64 refcount; ++ union bch_extent_entry start[0]; ++ __u64 _data[0]; ++}; ++ ++/* Inline data */ ++ ++struct bch_inline_data { ++ struct bch_val v; ++ u8 data[0]; ++}; ++ ++/* Optional/variable size superblock sections: */ ++ ++struct bch_sb_field { ++ __u64 _data[0]; ++ __le32 u64s; ++ __le32 type; ++}; ++ ++#define BCH_SB_FIELDS() \ ++ x(journal, 0) \ ++ x(members, 1) \ ++ x(crypt, 2) \ ++ x(replicas_v0, 3) \ ++ x(quota, 4) \ ++ x(disk_groups, 5) \ ++ x(clean, 6) \ ++ x(replicas, 7) \ ++ x(journal_seq_blacklist, 8) ++ ++enum bch_sb_field_type { ++#define x(f, nr) BCH_SB_FIELD_##f = nr, ++ BCH_SB_FIELDS() ++#undef x ++ BCH_SB_FIELD_NR ++}; ++ ++/* BCH_SB_FIELD_journal: */ ++ ++struct bch_sb_field_journal { ++ struct bch_sb_field field; ++ __le64 buckets[0]; ++}; ++ ++/* BCH_SB_FIELD_members: */ ++ ++#define BCH_MIN_NR_NBUCKETS (1 << 6) ++ ++struct bch_member { ++ uuid_le uuid; ++ __le64 nbuckets; /* device size */ ++ __le16 first_bucket; /* index of first bucket used */ ++ __le16 bucket_size; /* sectors */ ++ __le32 pad; ++ __le64 last_mount; /* time_t */ ++ ++ __le64 flags[2]; ++}; ++ ++LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) ++/* 4-10 unused, was TIER, HAS_(META)DATA */ ++LE64_BITMASK(BCH_MEMBER_REPLACEMENT, struct bch_member, flags[0], 10, 14) ++LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) ++LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) ++LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) ++LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) ++ ++#define BCH_TIER_MAX 4U ++ ++#if 0 ++LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); ++LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); ++#endif ++ ++enum bch_member_state { ++ BCH_MEMBER_STATE_RW = 0, ++ BCH_MEMBER_STATE_RO = 1, ++ BCH_MEMBER_STATE_FAILED = 2, ++ BCH_MEMBER_STATE_SPARE = 3, ++ BCH_MEMBER_STATE_NR = 4, ++}; ++ ++enum cache_replacement { ++ CACHE_REPLACEMENT_LRU = 0, ++ CACHE_REPLACEMENT_FIFO = 1, ++ CACHE_REPLACEMENT_RANDOM = 2, ++ CACHE_REPLACEMENT_NR = 3, ++}; ++ ++struct bch_sb_field_members { ++ struct bch_sb_field field; ++ struct bch_member members[0]; ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++struct nonce { ++ __le32 d[4]; ++}; ++ ++struct bch_key { ++ __le64 key[4]; ++}; ++ ++#define BCH_KEY_MAGIC \ ++ (((u64) 'b' << 0)|((u64) 'c' << 8)| \ ++ ((u64) 'h' << 16)|((u64) '*' << 24)| \ ++ ((u64) '*' << 32)|((u64) 'k' << 40)| \ ++ ((u64) 'e' << 48)|((u64) 'y' << 56)) ++ ++struct bch_encrypted_key { ++ __le64 magic; ++ struct bch_key key; ++}; ++ ++/* ++ * If this field is present in the superblock, it stores an encryption key which ++ * is used encrypt all other data/metadata. 
The key will normally be encrypted ++ * with the key userspace provides, but if encryption has been turned off we'll ++ * just store the master key unencrypted in the superblock so we can access the ++ * previously encrypted data. ++ */ ++struct bch_sb_field_crypt { ++ struct bch_sb_field field; ++ ++ __le64 flags; ++ __le64 kdf_flags; ++ struct bch_encrypted_key key; ++}; ++ ++LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); ++ ++enum bch_kdf_types { ++ BCH_KDF_SCRYPT = 0, ++ BCH_KDF_NR = 1, ++}; ++ ++/* stored as base 2 log of scrypt params: */ ++LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); ++LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); ++LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); ++ ++/* BCH_SB_FIELD_replicas: */ ++ ++#define BCH_DATA_TYPES() \ ++ x(none, 0) \ ++ x(sb, 1) \ ++ x(journal, 2) \ ++ x(btree, 3) \ ++ x(user, 4) \ ++ x(cached, 5) ++ ++enum bch_data_type { ++#define x(t, n) BCH_DATA_##t, ++ BCH_DATA_TYPES() ++#undef x ++ BCH_DATA_NR ++}; ++ ++struct bch_replicas_entry_v0 { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 devs[0]; ++} __attribute__((packed)); ++ ++struct bch_sb_field_replicas_v0 { ++ struct bch_sb_field field; ++ struct bch_replicas_entry_v0 entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_entry { ++ __u8 data_type; ++ __u8 nr_devs; ++ __u8 nr_required; ++ __u8 devs[0]; ++} __attribute__((packed)); ++ ++#define replicas_entry_bytes(_i) \ ++ (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) ++ ++struct bch_sb_field_replicas { ++ struct bch_sb_field field; ++ struct bch_replicas_entry entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_quota: */ ++ ++struct bch_sb_quota_counter { ++ __le32 timelimit; ++ __le32 warnlimit; ++}; ++ ++struct bch_sb_quota_type { ++ __le64 flags; ++ struct bch_sb_quota_counter c[Q_COUNTERS]; ++}; ++ ++struct bch_sb_field_quota { ++ struct bch_sb_field field; ++ struct bch_sb_quota_type q[QTYP_NR]; ++} __attribute__((packed, aligned(8))); ++ ++/* BCH_SB_FIELD_disk_groups: */ ++ ++#define BCH_SB_LABEL_SIZE 32 ++ ++struct bch_disk_group { ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 flags[2]; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) ++LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) ++LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) ++ ++struct bch_sb_field_disk_groups { ++ struct bch_sb_field field; ++ struct bch_disk_group entries[0]; ++} __attribute__((packed, aligned(8))); ++ ++/* ++ * On clean shutdown, store btree roots and current journal sequence number in ++ * the superblock: ++ */ ++struct jset_entry { ++ __le16 u64s; ++ __u8 btree_id; ++ __u8 level; ++ __u8 type; /* designates what this jset holds */ ++ __u8 pad[3]; ++ ++ union { ++ struct bkey_i start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct bch_sb_field_clean { ++ struct bch_sb_field field; ++ ++ __le32 flags; ++ __le16 read_clock; ++ __le16 write_clock; ++ __le64 journal_seq; ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++struct journal_seq_blacklist_entry { ++ __le64 start; ++ __le64 end; ++}; ++ ++struct bch_sb_field_journal_seq_blacklist { ++ struct bch_sb_field field; ++ ++ union { ++ struct journal_seq_blacklist_entry start[0]; ++ __u64 _data[0]; ++ }; ++}; ++ ++/* Superblock: */ ++ ++/* ++ * New versioning scheme: ++ * One common version number 
for all on disk data structures - superblock, btree ++ * nodes, journal entries ++ */ ++#define BCH_JSET_VERSION_OLD 2 ++#define BCH_BSET_VERSION_OLD 3 ++ ++enum bcachefs_metadata_version { ++ bcachefs_metadata_version_min = 9, ++ bcachefs_metadata_version_new_versioning = 10, ++ bcachefs_metadata_version_bkey_renumber = 10, ++ bcachefs_metadata_version_inode_btree_change = 11, ++ bcachefs_metadata_version_max = 12, ++}; ++ ++#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) ++ ++#define BCH_SB_SECTOR 8 ++#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ ++ ++struct bch_sb_layout { ++ uuid_le magic; /* bcachefs superblock UUID */ ++ __u8 layout_type; ++ __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ ++ __u8 nr_superblocks; ++ __u8 pad[5]; ++ __le64 sb_offset[61]; ++} __attribute__((packed, aligned(8))); ++ ++#define BCH_SB_LAYOUT_SECTOR 7 ++ ++/* ++ * @offset - sector where this sb was written ++ * @version - on disk format version ++ * @version_min - Oldest metadata version this filesystem contains; so we can ++ * safely drop compatibility code and refuse to mount filesystems ++ * we'd need it for ++ * @magic - identifies as a bcachefs superblock (BCACHE_MAGIC) ++ * @seq - incremented each time superblock is written ++ * @uuid - used for generating various magic numbers and identifying ++ * member devices, never changes ++ * @user_uuid - user visible UUID, may be changed ++ * @label - filesystem label ++ * @seq - identifies most recent superblock, incremented each time ++ * superblock is written ++ * @features - enabled incompatible features ++ */ ++struct bch_sb { ++ struct bch_csum csum; ++ __le16 version; ++ __le16 version_min; ++ __le16 pad[2]; ++ uuid_le magic; ++ uuid_le uuid; ++ uuid_le user_uuid; ++ __u8 label[BCH_SB_LABEL_SIZE]; ++ __le64 offset; ++ __le64 seq; ++ ++ __le16 block_size; ++ __u8 dev_idx; ++ __u8 nr_devices; ++ __le32 u64s; ++ ++ __le64 time_base_lo; ++ __le32 time_base_hi; ++ __le32 time_precision; ++ ++ __le64 flags[8]; ++ __le64 features[2]; ++ __le64 compat[2]; ++ ++ struct bch_sb_layout layout; ++ ++ union { ++ struct bch_sb_field start[0]; ++ __le64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++/* ++ * Flags: ++ * BCH_SB_INITALIZED - set on first mount ++ * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect ++ * behaviour of mount/recovery path: ++ * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits ++ * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 ++ * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides ++ * DATA/META_CSUM_TYPE. 
Also indicates encryption ++ * algorithm in use, if/when we get more than one ++ */ ++ ++LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); ++ ++LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); ++LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); ++LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); ++LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); ++ ++LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); ++ ++LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); ++LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); ++ ++LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); ++LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); ++ ++LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); ++LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); ++LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); ++LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); ++ ++LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); ++ ++LE64_BITMASK(BCH_SB_REFLINK, struct bch_sb, flags[0], 61, 62); ++ ++/* 61-64 unused */ ++ ++LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); ++LE64_BITMASK(BCH_SB_COMPRESSION_TYPE, struct bch_sb, flags[1], 4, 8); ++LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); ++ ++LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); ++LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); ++ ++/* ++ * Max size of an extent that may require bouncing to read or write ++ * (checksummed, compressed): 64k ++ */ ++LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, ++ struct bch_sb, flags[1], 14, 20); ++ ++LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); ++LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); ++ ++LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); ++LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); ++LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); ++ ++LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE, ++ struct bch_sb, flags[2], 0, 4); ++LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); ++ ++LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); ++ ++/* ++ * Features: ++ * ++ * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist ++ * reflink: gates KEY_TYPE_reflink ++ * inline_data: gates KEY_TYPE_inline_data ++ * new_siphash: gates BCH_STR_HASH_SIPHASH ++ * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE ++ */ ++#define BCH_SB_FEATURES() \ ++ x(lz4, 0) \ ++ x(gzip, 1) \ ++ x(zstd, 2) \ ++ x(atomic_nlink, 3) \ ++ x(ec, 4) \ ++ x(journal_seq_blacklist_v3, 5) \ ++ x(reflink, 6) \ ++ x(new_siphash, 7) \ ++ x(inline_data, 8) \ ++ x(new_extent_overwrite, 9) \ ++ x(incompressible, 10) \ ++ x(btree_ptr_v2, 11) \ ++ x(extents_above_btree_updates, 12) \ ++ x(btree_updates_journalled, 13) ++ ++#define BCH_SB_FEATURES_ALL \ ++ ((1ULL << BCH_FEATURE_new_siphash)| \ ++ (1ULL << BCH_FEATURE_new_extent_overwrite)| \ ++ (1ULL << BCH_FEATURE_btree_ptr_v2)| \ ++ (1ULL << BCH_FEATURE_extents_above_btree_updates)) ++ ++enum bch_sb_feature { ++#define x(f, n) BCH_FEATURE_##f, ++ BCH_SB_FEATURES() ++#undef x ++ BCH_FEATURE_NR, ++}; 
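For illustration, a minimal sketch (not part of the patch) of how the LE64_BITMASK()-generated accessors and the BCH_SB_FEATURES() bit indices above are meant to be combined. example_sb_has_feature() and example_sb_flags_usage() are hypothetical names made up for this sketch, assuming a kernel build context where le64_to_cpu() is available:

static inline bool example_sb_has_feature(const struct bch_sb *sb,
					  enum bch_sb_feature f)
{
	/* features[] is little-endian on disk; the enum value is the bit index */
	return le64_to_cpu(sb->features[f / 64]) & (1ULL << (f % 64));
}

static void example_sb_flags_usage(struct bch_sb *sb)
{
	/* read/write accessors generated by LE64_BITMASK(BCH_SB_POSIX_ACL, ...) */
	if (!BCH_SB_POSIX_ACL(sb))
		SET_BCH_SB_POSIX_ACL(sb, 1);

	if (example_sb_has_feature(sb, BCH_FEATURE_zstd)) {
		/* e.g. the filesystem may contain zstd-compressed extents */
	}
}

Storing the flags as __le64 words and generating every accessor pair from a single macro keeps the on-disk layout explicit while avoiding hand-written shift/mask code for each field.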
++ ++enum bch_sb_compat { ++ BCH_COMPAT_FEAT_ALLOC_INFO = 0, ++ BCH_COMPAT_FEAT_ALLOC_METADATA = 1, ++}; ++ ++/* options: */ ++ ++#define BCH_REPLICAS_MAX 4U ++ ++enum bch_error_actions { ++ BCH_ON_ERROR_CONTINUE = 0, ++ BCH_ON_ERROR_RO = 1, ++ BCH_ON_ERROR_PANIC = 2, ++ BCH_NR_ERROR_ACTIONS = 3, ++}; ++ ++enum bch_str_hash_type { ++ BCH_STR_HASH_CRC32C = 0, ++ BCH_STR_HASH_CRC64 = 1, ++ BCH_STR_HASH_SIPHASH_OLD = 2, ++ BCH_STR_HASH_SIPHASH = 3, ++ BCH_STR_HASH_NR = 4, ++}; ++ ++enum bch_str_hash_opts { ++ BCH_STR_HASH_OPT_CRC32C = 0, ++ BCH_STR_HASH_OPT_CRC64 = 1, ++ BCH_STR_HASH_OPT_SIPHASH = 2, ++ BCH_STR_HASH_OPT_NR = 3, ++}; ++ ++enum bch_csum_type { ++ BCH_CSUM_NONE = 0, ++ BCH_CSUM_CRC32C_NONZERO = 1, ++ BCH_CSUM_CRC64_NONZERO = 2, ++ BCH_CSUM_CHACHA20_POLY1305_80 = 3, ++ BCH_CSUM_CHACHA20_POLY1305_128 = 4, ++ BCH_CSUM_CRC32C = 5, ++ BCH_CSUM_CRC64 = 6, ++ BCH_CSUM_NR = 7, ++}; ++ ++static const unsigned bch_crc_bytes[] = { ++ [BCH_CSUM_NONE] = 0, ++ [BCH_CSUM_CRC32C_NONZERO] = 4, ++ [BCH_CSUM_CRC32C] = 4, ++ [BCH_CSUM_CRC64_NONZERO] = 8, ++ [BCH_CSUM_CRC64] = 8, ++ [BCH_CSUM_CHACHA20_POLY1305_80] = 10, ++ [BCH_CSUM_CHACHA20_POLY1305_128] = 16, ++}; ++ ++static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) ++{ ++ switch (type) { ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++enum bch_csum_opts { ++ BCH_CSUM_OPT_NONE = 0, ++ BCH_CSUM_OPT_CRC32C = 1, ++ BCH_CSUM_OPT_CRC64 = 2, ++ BCH_CSUM_OPT_NR = 3, ++}; ++ ++#define BCH_COMPRESSION_TYPES() \ ++ x(none, 0) \ ++ x(lz4_old, 1) \ ++ x(gzip, 2) \ ++ x(lz4, 3) \ ++ x(zstd, 4) \ ++ x(incompressible, 5) ++ ++enum bch_compression_type { ++#define x(t, n) BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_TYPES() ++#undef x ++ BCH_COMPRESSION_TYPE_NR ++}; ++ ++#define BCH_COMPRESSION_OPTS() \ ++ x(none, 0) \ ++ x(lz4, 1) \ ++ x(gzip, 2) \ ++ x(zstd, 3) ++ ++enum bch_compression_opts { ++#define x(t, n) BCH_COMPRESSION_OPT_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++ BCH_COMPRESSION_OPT_NR ++}; ++ ++/* ++ * Magic numbers ++ * ++ * The various other data structures have their own magic numbers, which are ++ * xored with the first part of the cache set's UUID ++ */ ++ ++#define BCACHE_MAGIC \ ++ UUID_LE(0xf67385c6, 0x1a4e, 0xca45, \ ++ 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) ++ ++#define BCACHEFS_STATFS_MAGIC 0xca451a4e ++ ++#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) ++#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) ++ ++static inline __le64 __bch2_sb_magic(struct bch_sb *sb) ++{ ++ __le64 ret; ++ memcpy(&ret, &sb->uuid, sizeof(ret)); ++ return ret; ++} ++ ++static inline __u64 __jset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); ++} ++ ++static inline __u64 __bset_magic(struct bch_sb *sb) ++{ ++ return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); ++} ++ ++/* Journal */ ++ ++#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) ++ ++#define BCH_JSET_ENTRY_TYPES() \ ++ x(btree_keys, 0) \ ++ x(btree_root, 1) \ ++ x(prio_ptrs, 2) \ ++ x(blacklist, 3) \ ++ x(blacklist_v2, 4) \ ++ x(usage, 5) \ ++ x(data_usage, 6) ++ ++enum { ++#define x(f, nr) BCH_JSET_ENTRY_##f = nr, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++ BCH_JSET_ENTRY_NR ++}; ++ ++/* ++ * Journal sequence numbers can be blacklisted: bsets record the max sequence ++ * number of all the journal entries they contain updates for, so that on ++ * recovery we can ignore those bsets that contain index updates 
newer that what ++ * made it into the journal. ++ * ++ * This means that we can't reuse that journal_seq - we have to skip it, and ++ * then record that we skipped it so that the next time we crash and recover we ++ * don't think there was a missing journal entry. ++ */ ++struct jset_entry_blacklist { ++ struct jset_entry entry; ++ __le64 seq; ++}; ++ ++struct jset_entry_blacklist_v2 { ++ struct jset_entry entry; ++ __le64 start; ++ __le64 end; ++}; ++ ++enum { ++ FS_USAGE_RESERVED = 0, ++ FS_USAGE_INODES = 1, ++ FS_USAGE_KEY_VERSION = 2, ++ FS_USAGE_NR = 3 ++}; ++ ++struct jset_entry_usage { ++ struct jset_entry entry; ++ __le64 v; ++} __attribute__((packed)); ++ ++struct jset_entry_data_usage { ++ struct jset_entry entry; ++ __le64 v; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++/* ++ * On disk format for a journal entry: ++ * seq is monotonically increasing; every journal entry has its own unique ++ * sequence number. ++ * ++ * last_seq is the oldest journal entry that still has keys the btree hasn't ++ * flushed to disk yet. ++ * ++ * version is for on disk format changes. ++ */ ++struct jset { ++ struct bch_csum csum; ++ ++ __le64 magic; ++ __le64 seq; ++ __le32 version; ++ __le32 flags; ++ ++ __le32 u64s; /* size of d[] in u64s */ ++ ++ __u8 encrypted_start[0]; ++ ++ __le16 read_clock; ++ __le16 write_clock; ++ ++ /* Sequence number of oldest dirty journal entry */ ++ __le64 last_seq; ++ ++ ++ union { ++ struct jset_entry start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); ++LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); ++ ++#define BCH_JOURNAL_BUCKETS_MIN 8 ++ ++/* Btree: */ ++ ++#define BCH_BTREE_IDS() \ ++ x(EXTENTS, 0, "extents") \ ++ x(INODES, 1, "inodes") \ ++ x(DIRENTS, 2, "dirents") \ ++ x(XATTRS, 3, "xattrs") \ ++ x(ALLOC, 4, "alloc") \ ++ x(QUOTAS, 5, "quotas") \ ++ x(EC, 6, "stripes") \ ++ x(REFLINK, 7, "reflink") ++ ++enum btree_id { ++#define x(kwd, val, name) BTREE_ID_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BTREE_ID_NR ++}; ++ ++#define BTREE_MAX_DEPTH 4U ++ ++/* Btree nodes */ ++ ++/* ++ * Btree nodes ++ * ++ * On disk a btree node is a list/log of these; within each set the keys are ++ * sorted ++ */ ++struct bset { ++ __le64 seq; ++ ++ /* ++ * Highest journal entry this bset contains keys for. ++ * If on recovery we don't see that journal entry, this bset is ignored: ++ * this allows us to preserve the order of all index updates after a ++ * crash, since the journal records a total order of all index updates ++ * and anything that didn't make it to the journal doesn't get used. 
++ */ ++ __le64 journal_seq; ++ ++ __le32 flags; ++ __le16 version; ++ __le16 u64s; /* count of d[] in u64s */ ++ ++ union { ++ struct bkey_packed start[0]; ++ __u64 _data[0]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); ++ ++LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); ++LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, ++ struct bset, flags, 5, 6); ++ ++struct btree_node { ++ struct bch_csum csum; ++ __le64 magic; ++ ++ /* this flags field is encrypted, unlike bset->flags: */ ++ __le64 flags; ++ ++ /* Closed interval: */ ++ struct bpos min_key; ++ struct bpos max_key; ++ struct bch_extent_ptr ptr; ++ struct bkey_format format; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++LE64_BITMASK(BTREE_NODE_ID, struct btree_node, flags, 0, 4); ++LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); ++LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, ++ struct btree_node, flags, 8, 9); ++/* 9-32 unused */ ++LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); ++ ++struct btree_node_entry { ++ struct bch_csum csum; ++ ++ union { ++ struct bset keys; ++ struct { ++ __u8 pad[22]; ++ __le16 u64s; ++ __u64 _data[0]; ++ ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++#endif /* _BCACHEFS_FORMAT_H */ +diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h +new file mode 100644 +index 000000000000..d71157a3e073 +--- /dev/null ++++ b/fs/bcachefs/bcachefs_ioctl.h +@@ -0,0 +1,332 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IOCTL_H ++#define _BCACHEFS_IOCTL_H ++ ++#include ++#include ++#include "bcachefs_format.h" ++ ++/* ++ * Flags common to multiple ioctls: ++ */ ++#define BCH_FORCE_IF_DATA_LOST (1 << 0) ++#define BCH_FORCE_IF_METADATA_LOST (1 << 1) ++#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) ++#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) ++ ++#define BCH_FORCE_IF_DEGRADED \ ++ (BCH_FORCE_IF_DATA_DEGRADED| \ ++ BCH_FORCE_IF_METADATA_DEGRADED) ++ ++/* ++ * If cleared, ioctl that refer to a device pass it as a pointer to a pathname ++ * (e.g. 
/dev/sda1); if set, the dev field is the device's index within the ++ * filesystem: ++ */ ++#define BCH_BY_INDEX (1 << 4) ++ ++/* ++ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem ++ * wide superblock: ++ */ ++#define BCH_READ_DEV (1 << 5) ++ ++/* global control dev: */ ++ ++/* These are currently broken, and probably unnecessary: */ ++#if 0 ++#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) ++#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) ++ ++struct bch_ioctl_assemble { ++ __u32 flags; ++ __u32 nr_devs; ++ __u64 pad; ++ __u64 devs[]; ++}; ++ ++struct bch_ioctl_incremental { ++ __u32 flags; ++ __u64 pad; ++ __u64 dev; ++}; ++#endif ++ ++/* filesystem ioctls: */ ++ ++#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) ++ ++/* These only make sense when we also have incremental assembly */ ++#if 0 ++#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) ++#define BCH_IOCTL_STOP _IO(0xbc, 3) ++#endif ++ ++#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) ++#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) ++#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) ++#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) ++#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) ++#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) ++#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) ++#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) ++ ++/* ioctl below act on a particular file, not the filesystem as a whole: */ ++ ++#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) ++ ++/* ++ * BCH_IOCTL_QUERY_UUID: get filesystem UUID ++ * ++ * Returns user visible UUID, not internal UUID (which may not ever be changed); ++ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with ++ * this UUID. ++ */ ++struct bch_ioctl_query_uuid { ++ uuid_le uuid; ++}; ++ ++#if 0 ++struct bch_ioctl_start { ++ __u32 flags; ++ __u32 pad; ++}; ++#endif ++ ++/* ++ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem ++ * ++ * The specified device must not be open or in use. On success, the new device ++ * will be an online member of the filesystem just like any other member. ++ * ++ * The device must first be prepared by userspace by formatting with a bcachefs ++ * superblock, which is only used for passing in superblock options/parameters ++ * for that device (in struct bch_member). The new device's superblock should ++ * not claim to be a member of any existing filesystem - UUIDs on it will be ++ * ignored. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem ++ * ++ * Any data present on @dev will be permanently deleted, and @dev will be ++ * removed from its slot in the filesystem's list of member devices. The device ++ * may be either offline or offline. ++ * ++ * Will fail removing @dev would leave us with insufficient read write devices ++ * or degraded/unavailable data, unless the approprate BCH_FORCE_IF_* flags are ++ * set. ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem ++ * but is not open (e.g. 
because we started in degraded mode), bring it online ++ * ++ * all existing data on @dev will be available once the device is online, ++ * exactly as if @dev was present when the filesystem was first mounted ++ */ ++ ++/* ++ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that ++ * block device, without removing it from the filesystem (so it can be brought ++ * back online later) ++ * ++ * Data present on @dev will be unavailable while @dev is offline (unless ++ * replicated), but will still be intact and untouched if @dev is brought back ++ * online ++ * ++ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would ++ * leave us with insufficient read write devices or degraded/unavailable data, ++ * unless the approprate BCH_FORCE_IF_* flags are set. ++ */ ++ ++struct bch_ioctl_disk { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem ++ * ++ * @new_state - one of the bch_member_state states (rw, ro, failed, ++ * spare) ++ * ++ * Will refuse to change member state if we would then have insufficient devices ++ * to write to, or if it would result in degraded data (when @new_state is ++ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set. ++ */ ++struct bch_ioctl_disk_set_state { ++ __u32 flags; ++ __u8 new_state; ++ __u8 pad[3]; ++ __u64 dev; ++}; ++ ++enum bch_data_ops { ++ BCH_DATA_OP_SCRUB = 0, ++ BCH_DATA_OP_REREPLICATE = 1, ++ BCH_DATA_OP_MIGRATE = 2, ++ BCH_DATA_OP_NR = 3, ++}; ++ ++/* ++ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g. ++ * scrub, rereplicate, migrate). ++ * ++ * This ioctl kicks off a job in the background, and returns a file descriptor. ++ * Reading from the file descriptor returns a struct bch_ioctl_data_event, ++ * indicating current progress, and closing the file descriptor will stop the ++ * job. The file descriptor is O_CLOEXEC. ++ */ ++struct bch_ioctl_data { ++ __u32 op; ++ __u32 flags; ++ ++ struct bpos start; ++ struct bpos end; ++ ++ union { ++ struct { ++ __u32 dev; ++ __u32 pad; ++ } migrate; ++ struct { ++ __u64 pad[8]; ++ }; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++enum bch_data_event { ++ BCH_DATA_EVENT_PROGRESS = 0, ++ /* XXX: add an event for reporting errors */ ++ BCH_DATA_EVENT_NR = 1, ++}; ++ ++struct bch_ioctl_data_progress { ++ __u8 data_type; ++ __u8 btree_id; ++ __u8 pad[2]; ++ struct bpos pos; ++ ++ __u64 sectors_done; ++ __u64 sectors_total; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_ioctl_data_event { ++ __u8 type; ++ __u8 pad[7]; ++ union { ++ struct bch_ioctl_data_progress p; ++ __u64 pad2[15]; ++ }; ++} __attribute__((packed, aligned(8))); ++ ++struct bch_replicas_usage { ++ __u64 sectors; ++ struct bch_replicas_entry r; ++} __attribute__((packed)); ++ ++static inline struct bch_replicas_usage * ++replicas_usage_next(struct bch_replicas_usage *u) ++{ ++ return (void *) u + replicas_entry_bytes(&u->r) + 8; ++} ++ ++/* ++ * BCH_IOCTL_FS_USAGE: query filesystem disk space usage ++ * ++ * Returns disk space usage broken out by data type, number of replicas, and ++ * by component device ++ * ++ * @replica_entries_bytes - size, in bytes, allocated for replica usage entries ++ * ++ * On success, @replica_entries_bytes will be changed to indicate the number of ++ * bytes actually used. 
++ * ++ * Returns -ERANGE if @replica_entries_bytes was too small ++ */ ++struct bch_ioctl_fs_usage { ++ __u64 capacity; ++ __u64 used; ++ __u64 online_reserved; ++ __u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ ++ __u32 replica_entries_bytes; ++ __u32 pad; ++ ++ struct bch_replicas_usage replicas[0]; ++}; ++ ++/* ++ * BCH_IOCTL_DEV_USAGE: query device disk space usage ++ * ++ * Returns disk space usage broken out by data type - both by buckets and ++ * sectors. ++ */ ++struct bch_ioctl_dev_usage { ++ __u64 dev; ++ __u32 flags; ++ __u8 state; ++ __u8 pad[7]; ++ ++ __u32 bucket_size; ++ __u64 nr_buckets; ++ __u64 available_buckets; ++ ++ __u64 buckets[BCH_DATA_NR]; ++ __u64 sectors[BCH_DATA_NR]; ++ ++ __u64 ec_buckets; ++ __u64 ec_sectors; ++}; ++ ++/* ++ * BCH_IOCTL_READ_SUPER: read filesystem superblock ++ * ++ * Equivalent to reading the superblock directly from the block device, except ++ * avoids racing with the kernel writing the superblock or having to figure out ++ * which block device to read ++ * ++ * @sb - buffer to read into ++ * @size - size of userspace allocated buffer ++ * @dev - device to read superblock for, if BCH_READ_DEV flag is ++ * specified ++ * ++ * Returns -ERANGE if buffer provided is too small ++ */ ++struct bch_ioctl_read_super { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 size; ++ __u64 sb; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to ++ * determine if disk is a (online) member - if so, returns device's index ++ * ++ * Returns -ENOENT if not found ++ */ ++struct bch_ioctl_disk_get_idx { ++ __u64 dev; ++}; ++ ++/* ++ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device ++ * ++ * @dev - member to resize ++ * @nbuckets - new number of buckets ++ */ ++struct bch_ioctl_disk_resize { ++ __u32 flags; ++ __u32 pad; ++ __u64 dev; ++ __u64 nbuckets; ++}; ++ ++#endif /* _BCACHEFS_IOCTL_H */ +diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c +new file mode 100644 +index 000000000000..4d0c9129cd4a +--- /dev/null ++++ b/fs/bcachefs/bkey.c +@@ -0,0 +1,1154 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "bset.h" ++#include "util.h" ++ ++#undef EBUG_ON ++ ++#ifdef DEBUG_BKEYS ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) ++#endif ++ ++const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++void bch2_to_binary(char *out, const u64 *p, unsigned nr_bits) ++{ ++ unsigned bit = high_bit_offset, done = 0; ++ ++ while (1) { ++ while (bit < 64) { ++ if (done && !(done % 8)) ++ *out++ = ' '; ++ *out++ = *p & (1ULL << (63 - bit)) ? 
'1' : '0'; ++ bit++; ++ done++; ++ if (done == nr_bits) { ++ *out++ = '\0'; ++ return; ++ } ++ } ++ ++ p = next_word(p); ++ bit = 0; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) ++{ ++ struct bkey tmp; ++ ++ BUG_ON(bkeyp_val_u64s(format, packed) != ++ bkey_val_u64s(unpacked)); ++ ++ BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); ++ ++ tmp = __bch2_bkey_unpack_key(format, packed); ++ ++ if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { ++ char buf1[160], buf2[160]; ++ char buf3[160], buf4[160]; ++ ++ bch2_bkey_to_text(&PBUF(buf1), unpacked); ++ bch2_bkey_to_text(&PBUF(buf2), &tmp); ++ bch2_to_binary(buf3, (void *) unpacked, 80); ++ bch2_to_binary(buf4, high_word(format, packed), 80); ++ ++ panic("keys differ: format u64s %u fields %u %u %u %u %u\n%s\n%s\n%s\n%s\n", ++ format->key_u64s, ++ format->bits_per_field[0], ++ format->bits_per_field[1], ++ format->bits_per_field[2], ++ format->bits_per_field[3], ++ format->bits_per_field[4], ++ buf1, buf2, buf3, buf4); ++ } ++} ++ ++#else ++static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, ++ const struct bkey *unpacked, ++ const struct bkey_format *format) {} ++#endif ++ ++struct pack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct pack_state pack_state_init(const struct bkey_format *format, ++ struct bkey_packed *k) ++{ ++ u64 *p = high_word(format, k); ++ ++ return (struct pack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = 0, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static void pack_state_finish(struct pack_state *state, ++ struct bkey_packed *k) ++{ ++ EBUG_ON(state->p < k->_data); ++ EBUG_ON(state->p >= k->_data + state->format->key_u64s); ++ ++ *state->p = state->w; ++} ++ ++struct unpack_state { ++ const struct bkey_format *format; ++ unsigned bits; /* bits remaining in current word */ ++ u64 w; /* current word */ ++ const u64 *p; /* pointer to next word */ ++}; ++ ++__always_inline ++static struct unpack_state unpack_state_init(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(format, k); ++ ++ return (struct unpack_state) { ++ .format = format, ++ .bits = 64 - high_bit_offset, ++ .w = *p << high_bit_offset, ++ .p = p, ++ }; ++} ++ ++__always_inline ++static u64 get_inc_field(struct unpack_state *state, unsigned field) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (bits >= state->bits) { ++ v = state->w >> (64 - bits); ++ bits -= state->bits; ++ ++ state->p = next_word(state->p); ++ state->w = *state->p; ++ state->bits = 64; ++ } ++ ++ /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ ++ v |= (state->w >> 1) >> (63 - bits); ++ state->w <<= bits; ++ state->bits -= bits; ++ ++ return v + offset; ++} ++ ++__always_inline ++static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ ++ if (v < offset) ++ return false; ++ ++ v -= offset; ++ ++ if (fls64(v) > bits) ++ return false; ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ /* avoid shift by 64 if bits is 0 - bits is never 64 here: 
*/ ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return true; ++} ++ ++/* ++ * Note: does NOT set out->format (we don't know what it should be here!) ++ * ++ * Also: doesn't work on extents - it doesn't preserve the invariant that ++ * if k is packed bkey_start_pos(k) will successfully pack ++ */ ++static bool bch2_bkey_transform_key(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ struct pack_state out_s = pack_state_init(out_f, out); ++ struct unpack_state in_s = unpack_state_init(in_f, in); ++ unsigned i; ++ ++ out->_data[0] = 0; ++ ++ for (i = 0; i < BKEY_NR_FIELDS; i++) ++ if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) ++ return false; ++ ++ /* Can't happen because the val would be too big to unpack: */ ++ EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); ++ ++ pack_state_finish(&out_s, out); ++ out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ return true; ++} ++ ++bool bch2_bkey_transform(const struct bkey_format *out_f, ++ struct bkey_packed *out, ++ const struct bkey_format *in_f, ++ const struct bkey_packed *in) ++{ ++ if (!bch2_bkey_transform_key(out_f, out, in_f, in)) ++ return false; ++ ++ memcpy_u64s((u64 *) out + out_f->key_u64s, ++ (u64 *) in + in_f->key_u64s, ++ (in->u64s - in_f->key_u64s)); ++ return true; ++} ++ ++#define bkey_fields() \ ++ x(BKEY_FIELD_INODE, p.inode) \ ++ x(BKEY_FIELD_OFFSET, p.offset) \ ++ x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ ++ x(BKEY_FIELD_SIZE, size) \ ++ x(BKEY_FIELD_VERSION_HI, version.hi) \ ++ x(BKEY_FIELD_VERSION_LO, version.lo) ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bkey out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); ++ ++ out.u64s = BKEY_U64s + in->u64s - format->key_u64s; ++ out.format = KEY_FORMAT_CURRENT; ++ out.needs_whiteout = in->needs_whiteout; ++ out.type = in->type; ++ out.pad[0] = 0; ++ ++#define x(id, field) out.field = get_inc_field(&state, id); ++ bkey_fields() ++#undef x ++ ++ return out; ++} ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *format, ++ const struct bkey_packed *in) ++{ ++ struct unpack_state state = unpack_state_init(format, in); ++ struct bpos out; ++ ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->u64s < format->key_u64s); ++ EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); ++ ++ out.inode = get_inc_field(&state, BKEY_FIELD_INODE); ++ out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); ++ out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); ++ ++ return out; ++} ++#endif ++ ++/** ++ * bch2_bkey_pack_key -- pack just the key, not the value ++ */ ++bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, ++ const struct bkey_format *format) ++{ ++ struct pack_state state = pack_state_init(format, out); ++ ++ EBUG_ON((void *) in == (void *) out); ++ EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); ++ EBUG_ON(in->format != KEY_FORMAT_CURRENT); ++ ++ out->_data[0] = 0; ++ 
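++	/* pack each field of *in in turn via the bkey_fields() x-macro below; bail out if any field can't be represented in this format (value below field_offset or wider than bits_per_field) */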
++#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; ++ bkey_fields() ++#undef x ++ ++ /* ++ * Extents - we have to guarantee that if an extent is packed, a trimmed ++ * version will also pack: ++ */ ++ if (bkey_start_offset(in) < ++ le64_to_cpu(format->field_offset[BKEY_FIELD_OFFSET])) ++ return false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = format->key_u64s + in->u64s - BKEY_U64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->needs_whiteout = in->needs_whiteout; ++ out->type = in->type; ++ ++ bch2_bkey_pack_verify(out, in, format); ++ return true; ++} ++ ++/** ++ * bch2_bkey_unpack -- unpack the key and the value ++ */ ++void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, ++ const struct bkey_packed *src) ++{ ++ __bkey_unpack_key(b, &dst->k, src); ++ ++ memcpy_u64s(&dst->v, ++ bkeyp_val(&b->format, src), ++ bkeyp_val_u64s(&b->format, src)); ++} ++ ++/** ++ * bch2_bkey_pack -- pack the key and the value ++ */ ++bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, ++ const struct bkey_format *format) ++{ ++ struct bkey_packed tmp; ++ ++ if (!bch2_bkey_pack_key(&tmp, &in->k, format)) ++ return false; ++ ++ memmove_u64s((u64 *) out + format->key_u64s, ++ &in->v, ++ bkey_val_u64s(&in->k)); ++ memcpy_u64s(out, &tmp, format->key_u64s); ++ ++ return true; ++} ++ ++__always_inline ++static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) ++{ ++ unsigned bits = state->format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(state->format->field_offset[field]); ++ bool ret = true; ++ ++ EBUG_ON(v < offset); ++ v -= offset; ++ ++ if (fls64(v) > bits) { ++ v = ~(~0ULL << bits); ++ ret = false; ++ } ++ ++ if (bits > state->bits) { ++ bits -= state->bits; ++ state->w |= (v >> 1) >> (bits - 1); ++ ++ *state->p = state->w; ++ state->p = next_word(state->p); ++ state->w = 0; ++ state->bits = 64; ++ } ++ ++ state->bits -= bits; ++ state->w |= v << state->bits; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static bool bkey_packed_successor(struct bkey_packed *out, ++ const struct btree *b, ++ struct bkey_packed k) ++{ ++ const struct bkey_format *f = &b->format; ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned first_bit, offset; ++ u64 *p; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ if (!nr_key_bits) ++ return false; ++ ++ *out = k; ++ ++ first_bit = high_bit_offset + nr_key_bits - 1; ++ p = nth_word(high_word(f, out), first_bit >> 6); ++ offset = 63 - (first_bit & 63); ++ ++ while (nr_key_bits) { ++ unsigned bits = min(64 - offset, nr_key_bits); ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if ((*p & mask) != mask) { ++ *p += 1ULL << offset; ++ EBUG_ON(bkey_cmp_packed(b, out, &k) <= 0); ++ return true; ++ } ++ ++ *p &= ~mask; ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ offset = 0; ++ } ++ ++ return false; ++} ++#endif ++ ++/* ++ * Returns a packed key that compares <= in ++ * ++ * This is used in bset_search_tree(), where we need a packed pos in order to be ++ * able to compare against the keys in the auxiliary search tree - and it's ++ * legal to use a packed pos that isn't equivalent to the original pos, ++ * _provided_ it compares <= to the original pos. 
++ */ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, ++ struct bpos in, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct pack_state state = pack_state_init(f, out); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos orig = in; ++#endif ++ bool exact = true; ++ ++ out->_data[0] = 0; ++ ++ if (unlikely(in.snapshot < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { ++ if (!in.offset-- && ++ !in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.offset < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { ++ if (!in.inode--) ++ return BKEY_PACK_POS_FAIL; ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (unlikely(in.inode < ++ le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) ++ return BKEY_PACK_POS_FAIL; ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode)) { ++ in.offset = KEY_OFFSET_MAX; ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset)) { ++ in.snapshot = KEY_SNAPSHOT_MAX; ++ exact = false; ++ } ++ ++ if (!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot)) ++ exact = false; ++ ++ pack_state_finish(&state, out); ++ out->u64s = f->key_u64s; ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ out->type = KEY_TYPE_deleted; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ if (exact) { ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig)); ++ } else { ++ struct bkey_packed successor; ++ ++ BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); ++ BUG_ON(bkey_packed_successor(&successor, b, *out) && ++ bkey_cmp_left_packed(b, &successor, &orig) < 0); ++ } ++#endif ++ ++ return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; ++} ++ ++void bch2_bkey_format_init(struct bkey_format_state *s) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) ++ s->field_min[i] = U64_MAX; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_max); i++) ++ s->field_max[i] = 0; ++ ++ /* Make sure we can store a size of 0: */ ++ s->field_min[BKEY_FIELD_SIZE] = 0; ++} ++ ++static void __bkey_format_add(struct bkey_format_state *s, ++ unsigned field, u64 v) ++{ ++ s->field_min[field] = min(s->field_min[field], v); ++ s->field_max[field] = max(s->field_max[field], v); ++} ++ ++/* ++ * Changes @format so that @k can be successfully packed with @format ++ */ ++void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) ++{ ++#define x(id, field) __bkey_format_add(s, id, k->field); ++ bkey_fields() ++#undef x ++ __bkey_format_add(s, BKEY_FIELD_OFFSET, bkey_start_offset(k)); ++} ++ ++void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) ++{ ++ unsigned field = 0; ++ ++ __bkey_format_add(s, field++, p.inode); ++ __bkey_format_add(s, field++, p.offset); ++ __bkey_format_add(s, field++, p.snapshot); ++} ++ ++/* ++ * We don't want it to be possible for the packed format to represent fields ++ * bigger than a u64... that will cause confusion and issues (like with ++ * bkey_packed_successor()) ++ */ ++static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, ++ unsigned bits, u64 offset) ++{ ++ offset = bits == 64 ? 
0 : min(offset, U64_MAX - ((1ULL << bits) - 1)); ++ ++ f->bits_per_field[i] = bits; ++ f->field_offset[i] = cpu_to_le64(offset); ++} ++ ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ struct bkey_format ret = { ++ .nr_fields = BKEY_NR_FIELDS, ++ }; ++ ++ for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { ++ s->field_min[i] = min(s->field_min[i], s->field_max[i]); ++ ++ set_format_field(&ret, i, ++ fls64(s->field_max[i] - s->field_min[i]), ++ s->field_min[i]); ++ ++ bits += ret.bits_per_field[i]; ++ } ++ ++ /* allow for extent merging: */ ++ if (ret.bits_per_field[BKEY_FIELD_SIZE]) { ++ ret.bits_per_field[BKEY_FIELD_SIZE] += 4; ++ bits += 4; ++ } ++ ++ ret.key_u64s = DIV_ROUND_UP(bits, 64); ++ ++ /* if we have enough spare bits, round fields up to nearest byte */ ++ bits = ret.key_u64s * 64 - bits; ++ ++ for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { ++ unsigned r = round_up(ret.bits_per_field[i], 8) - ++ ret.bits_per_field[i]; ++ ++ if (r <= bits) { ++ set_format_field(&ret, i, ++ ret.bits_per_field[i] + r, ++ le64_to_cpu(ret.field_offset[i])); ++ bits -= r; ++ } ++ } ++ ++ EBUG_ON(bch2_bkey_format_validate(&ret)); ++ return ret; ++} ++ ++const char *bch2_bkey_format_validate(struct bkey_format *f) ++{ ++ unsigned i, bits = KEY_PACKED_BITS_START; ++ ++ if (f->nr_fields != BKEY_NR_FIELDS) ++ return "incorrect number of fields"; ++ ++ for (i = 0; i < f->nr_fields; i++) { ++ u64 field_offset = le64_to_cpu(f->field_offset[i]); ++ ++ if (f->bits_per_field[i] > 64) ++ return "field too large"; ++ ++ if (field_offset && ++ (f->bits_per_field[i] == 64 || ++ (field_offset + ((1ULL << f->bits_per_field[i]) - 1) < ++ field_offset))) ++ return "offset + bits overflow"; ++ ++ bits += f->bits_per_field[i]; ++ } ++ ++ if (f->key_u64s != DIV_ROUND_UP(bits, 64)) ++ return "incorrect key_u64s"; ++ ++ return NULL; ++} ++ ++/* ++ * Most significant differing bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, ++ const struct bkey_packed *l_k, ++ const struct bkey_packed *r_k) ++{ ++ const u64 *l = high_word(&b->format, l_k); ++ const u64 *r = high_word(&b->format, r_k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned word_bits = 64 - high_bit_offset; ++ u64 l_v, r_v; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ /* for big endian, skip past header */ ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (nr_key_bits) { ++ if (nr_key_bits < word_bits) { ++ l_v >>= word_bits - nr_key_bits; ++ r_v >>= word_bits - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= word_bits; ++ } ++ ++ if (l_v != r_v) ++ return fls64(l_v ^ r_v) - 1 + nr_key_bits; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ word_bits = 64; ++ } ++ ++ return 0; ++} ++ ++/* ++ * First set bit ++ * Bits are indexed from 0 - return is [0, nr_key_bits) ++ */ ++__pure ++unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) ++{ ++ const u64 *p = high_word(&b->format, k); ++ unsigned nr_key_bits = b->nr_key_bits; ++ unsigned ret = 0, offset; ++ ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); ++ ++ offset = nr_key_bits; ++ while (offset > 64) { ++ p = next_word(p); ++ offset -= 64; ++ } ++ ++ offset = 64 - offset; ++ ++ while (nr_key_bits) { ++ unsigned bits = nr_key_bits + offset < 64 ++ ? 
nr_key_bits ++ : 64 - offset; ++ ++ u64 mask = (~0ULL >> (64 - bits)) << offset; ++ ++ if (*p & mask) ++ return ret + __ffs64(*p & mask) - offset; ++ ++ p = prev_word(p); ++ nr_key_bits -= bits; ++ ret += bits; ++ offset = 0; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_X86_64 ++ ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ long d0, d1, d2, d3; ++ int cmp; ++ ++ /* we shouldn't need asm for this, but gcc is being retarded: */ ++ ++ asm(".intel_syntax noprefix;" ++ "xor eax, eax;" ++ "xor edx, edx;" ++ "1:;" ++ "mov r8, [rdi];" ++ "mov r9, [rsi];" ++ "sub ecx, 64;" ++ "jl 2f;" ++ ++ "cmp r8, r9;" ++ "jnz 3f;" ++ ++ "lea rdi, [rdi - 8];" ++ "lea rsi, [rsi - 8];" ++ "jmp 1b;" ++ ++ "2:;" ++ "not ecx;" ++ "shr r8, 1;" ++ "shr r9, 1;" ++ "shr r8, cl;" ++ "shr r9, cl;" ++ "cmp r8, r9;" ++ ++ "3:\n" ++ "seta al;" ++ "setb dl;" ++ "sub eax, edx;" ++ ".att_syntax prefix;" ++ : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) ++ : "0" (l), "1" (r), "3" (nr_key_bits) ++ : "r8", "r9", "cc", "memory"); ++ ++ return cmp; ++} ++ ++#define I(_x) (*(out)++ = (_x)) ++#define I1(i0) I(i0) ++#define I2(i0, i1) (I1(i0), I(i1)) ++#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) ++#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) ++#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) ++ ++static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, ++ enum bch_bkey_fields field, ++ unsigned dst_offset, unsigned dst_size, ++ bool *eax_zeroed) ++{ ++ unsigned bits = format->bits_per_field[field]; ++ u64 offset = le64_to_cpu(format->field_offset[field]); ++ unsigned i, byte, bit_offset, align, shl, shr; ++ ++ if (!bits && !offset) { ++ if (!*eax_zeroed) { ++ /* xor eax, eax */ ++ I2(0x31, 0xc0); ++ } ++ ++ *eax_zeroed = true; ++ goto set_field; ++ } ++ ++ if (!bits) { ++ /* just return offset: */ ++ ++ switch (dst_size) { ++ case 8: ++ if (offset > S32_MAX) { ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ ++ I3(0xc7, 0x47, dst_offset + 4); ++ memcpy(out, (void *) &offset + 4, 4); ++ out += 4; ++ } else { ++ /* mov [rdi + dst_offset], offset */ ++ /* sign extended */ ++ I4(0x48, 0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], offset */ ++ I3(0xc7, 0x47, dst_offset); ++ memcpy(out, &offset, 4); ++ out += 4; ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++ } ++ ++ bit_offset = format->key_u64s * 64; ++ for (i = 0; i <= field; i++) ++ bit_offset -= format->bits_per_field[i]; ++ ++ byte = bit_offset / 8; ++ bit_offset -= byte * 8; ++ ++ *eax_zeroed = false; ++ ++ if (bit_offset == 0 && bits == 8) { ++ /* movzx eax, BYTE PTR [rsi + imm8] */ ++ I4(0x0f, 0xb6, 0x46, byte); ++ } else if (bit_offset == 0 && bits == 16) { ++ /* movzx eax, WORD PTR [rsi + imm8] */ ++ I4(0x0f, 0xb7, 0x46, byte); ++ } else if (bit_offset + bits <= 32) { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 32); ++ ++ /* mov eax, [rsi + imm8] */ ++ I3(0x8b, 0x46, byte); ++ ++ if (bit_offset) { ++ /* shr eax, imm8 */ ++ I3(0xc1, 0xe8, bit_offset); ++ } ++ ++ if (bit_offset + bits < 32) { ++ unsigned mask = ~0U >> (32 - bits); ++ ++ /* and eax, imm32 */ ++ I1(0x25); ++ memcpy(out, &mask, 4); ++ out += 4; ++ } ++ } else if (bit_offset + bits <= 64) { ++ align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); ++ byte -= align; ++ 
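++	/* the load address was just moved back by 'align' bytes, so the field's bit offset within the loaded word grows by align * 8 */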
bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 64); ++ ++ /* mov rax, [rsi + imm8] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ shl = 64 - bit_offset - bits; ++ shr = bit_offset + shl; ++ ++ if (shl) { ++ /* shl rax, imm8 */ ++ I4(0x48, 0xc1, 0xe0, shl); ++ } ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } else { ++ align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); ++ byte -= align; ++ bit_offset += align * 8; ++ ++ BUG_ON(bit_offset + bits > 96); ++ ++ /* mov rax, [rsi + byte] */ ++ I4(0x48, 0x8b, 0x46, byte); ++ ++ /* mov edx, [rsi + byte + 8] */ ++ I3(0x8b, 0x56, byte + 8); ++ ++ /* bits from next word: */ ++ shr = bit_offset + bits - 64; ++ BUG_ON(shr > bit_offset); ++ ++ /* shr rax, bit_offset */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ ++ /* shl rdx, imm8 */ ++ I4(0x48, 0xc1, 0xe2, 64 - shr); ++ ++ /* or rax, rdx */ ++ I3(0x48, 0x09, 0xd0); ++ ++ shr = bit_offset - shr; ++ ++ if (shr) { ++ /* shr rax, imm8 */ ++ I4(0x48, 0xc1, 0xe8, shr); ++ } ++ } ++ ++ /* rax += offset: */ ++ if (offset > S32_MAX) { ++ /* mov rdx, imm64 */ ++ I2(0x48, 0xba); ++ memcpy(out, &offset, 8); ++ out += 8; ++ /* add %rdx, %rax */ ++ I3(0x48, 0x01, 0xd0); ++ } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { ++ /* add rax, imm32 */ ++ I2(0x48, 0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } else if (offset) { ++ /* add eax, imm32 */ ++ I1(0x05); ++ memcpy(out, &offset, 4); ++ out += 4; ++ } ++set_field: ++ switch (dst_size) { ++ case 8: ++ /* mov [rdi + dst_offset], rax */ ++ I4(0x48, 0x89, 0x47, dst_offset); ++ break; ++ case 4: ++ /* mov [rdi + dst_offset], eax */ ++ I3(0x89, 0x47, dst_offset); ++ break; ++ default: ++ BUG(); ++ } ++ ++ return out; ++} ++ ++int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) ++{ ++ bool eax_zeroed = false; ++ u8 *out = _out; ++ ++ /* ++ * rdi: dst - unpacked key ++ * rsi: src - packed key ++ */ ++ ++ /* k->u64s, k->format, k->type */ ++ ++ /* mov eax, [rsi] */ ++ I2(0x8b, 0x06); ++ ++ /* add eax, BKEY_U64s - format->key_u64s */ ++ I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); ++ ++ /* and eax, imm32: mask out k->pad: */ ++ I5(0x25, 0xff, 0xff, 0xff, 0); ++ ++ /* mov [rdi], eax */ ++ I2(0x89, 0x07); ++ ++#define x(id, field) \ ++ out = compile_bkey_field(format, out, id, \ ++ offsetof(struct bkey, field), \ ++ sizeof(((struct bkey *) NULL)->field), \ ++ &eax_zeroed); ++ bkey_fields() ++#undef x ++ ++ /* retq */ ++ I1(0xc3); ++ ++ return (void *) out - _out; ++} ++ ++#else ++static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, ++ unsigned nr_key_bits) ++{ ++ u64 l_v, r_v; ++ ++ if (!nr_key_bits) ++ return 0; ++ ++ /* for big endian, skip past header */ ++ nr_key_bits += high_bit_offset; ++ l_v = *l & (~0ULL >> high_bit_offset); ++ r_v = *r & (~0ULL >> high_bit_offset); ++ ++ while (1) { ++ if (nr_key_bits < 64) { ++ l_v >>= 64 - nr_key_bits; ++ r_v >>= 64 - nr_key_bits; ++ nr_key_bits = 0; ++ } else { ++ nr_key_bits -= 64; ++ } ++ ++ if (!nr_key_bits || l_v != r_v) ++ break; ++ ++ l = next_word(l); ++ r = next_word(r); ++ ++ l_v = *l; ++ r_v = *r; ++ } ++ ++ return cmp_int(l_v, r_v); ++} ++#endif ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ int ret; ++ ++ EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); ++ EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); ++ ++ ret = __bkey_cmp_bits(high_word(f, l), ++ high_word(f, r), ++ 
b->nr_key_bits); ++ ++ EBUG_ON(ret != bkey_cmp(bkey_unpack_pos(b, l), ++ bkey_unpack_pos(b, r))); ++ return ret; ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bkey_cmp(bkey_unpack_pos_format_checked(b, l), *r); ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_packed(const struct bkey_packed *l, ++ const struct bkey_packed *r, ++ const struct btree *b) ++{ ++ struct bkey unpacked; ++ ++ if (likely(bkey_packed(l) && bkey_packed(r))) ++ return __bch2_bkey_cmp_packed_format_checked(l, r, b); ++ ++ if (bkey_packed(l)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, l); ++ l = (void*) &unpacked; ++ } else if (bkey_packed(r)) { ++ __bkey_unpack_key_format_checked(b, &unpacked, r); ++ r = (void*) &unpacked; ++ } ++ ++ return bkey_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); ++} ++ ++__pure __flatten ++int __bch2_bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ const struct bkey *l_unpacked; ++ ++ return unlikely(l_unpacked = packed_to_bkey_c(l)) ++ ? bkey_cmp(l_unpacked->p, *r) ++ : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++void bch2_bpos_swab(struct bpos *p) ++{ ++ u8 *l = (u8 *) p; ++ u8 *h = ((u8 *) &p[1]) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) ++{ ++ const struct bkey_format *f = bkey_packed(k) ? _f : &bch2_bkey_format_current; ++ u8 *l = k->key_start; ++ u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; ++ ++ while (l < h) { ++ swap(*l, *h); ++ l++; ++ --h; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void) ++{ ++ struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); ++ struct bkey_packed p; ++ ++ struct bkey_format test_format = { ++ .key_u64s = 2, ++ .nr_fields = BKEY_NR_FIELDS, ++ .bits_per_field = { ++ 13, ++ 64, ++ }, ++ }; ++ ++ struct unpack_state in_s = ++ unpack_state_init(&bch2_bkey_format_current, (void *) &t); ++ struct pack_state out_s = pack_state_init(&test_format, &p); ++ unsigned i; ++ ++ for (i = 0; i < out_s.format->nr_fields; i++) { ++ u64 a, v = get_inc_field(&in_s, i); ++ ++ switch (i) { ++#define x(id, field) case id: a = t.field; break; ++ bkey_fields() ++#undef x ++ default: ++ BUG(); ++ } ++ ++ if (a != v) ++ panic("got %llu actual %llu i %u\n", v, a, i); ++ ++ if (!set_inc_field(&out_s, i, v)) ++ panic("failed at %u\n", i); ++ } ++ ++ BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); ++} ++#endif +diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h +new file mode 100644 +index 000000000000..cbcfbd26bc58 +--- /dev/null ++++ b/fs/bcachefs/bkey.h +@@ -0,0 +1,605 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_H ++#define _BCACHEFS_BKEY_H ++ ++#include ++#include "bcachefs_format.h" ++ ++#include "util.h" ++#include "vstructs.h" ++ ++#ifdef CONFIG_X86_64 ++#define HAVE_BCACHEFS_COMPILED_UNPACK 1 ++#endif ++ ++void bch2_to_binary(char *, const u64 *, unsigned); ++ ++/* bkey with split value, const */ ++struct bkey_s_c { ++ const struct bkey *k; ++ const struct bch_val *v; ++}; ++ ++/* bkey with split value */ ++struct bkey_s { ++ union { ++ struct { ++ struct bkey *k; ++ struct bch_val *v; ++ }; ++ struct bkey_s_c s_c; ++ }; ++}; ++ ++#define bkey_next(_k) vstruct_next(_k) ++ ++static inline struct bkey_packed *bkey_next_skip_noops(struct bkey_packed *k, ++ struct bkey_packed *end) ++{ ++ k = bkey_next(k); 
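++	/* skip any zero-u64s pad entries ("noops") between k and end, advancing one u64 at a time */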
++ ++ while (k != end && !k->u64s) ++ k = (void *) ((u64 *) k + 1); ++ return k; ++} ++ ++#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) ++ ++static inline size_t bkey_val_bytes(const struct bkey *k) ++{ ++ return bkey_val_u64s(k) * sizeof(u64); ++} ++ ++static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) ++{ ++ k->u64s = BKEY_U64s + val_u64s; ++} ++ ++static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) ++{ ++ k->u64s = BKEY_U64s + DIV_ROUND_UP(bytes, sizeof(u64)); ++} ++ ++#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) ++ ++#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) ++ ++#define bkey_whiteout(_k) \ ++ ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_discard) ++ ++#define bkey_packed_typecheck(_k) \ ++({ \ ++ BUILD_BUG_ON(!type_is(_k, struct bkey *) && \ ++ !type_is(_k, struct bkey_packed *)); \ ++ type_is(_k, struct bkey_packed *); \ ++}) ++ ++enum bkey_lr_packed { ++ BKEY_PACKED_BOTH, ++ BKEY_PACKED_RIGHT, ++ BKEY_PACKED_LEFT, ++ BKEY_PACKED_NONE, ++}; ++ ++#define bkey_lr_packed_typecheck(_l, _r) \ ++ (!bkey_packed_typecheck(_l) + ((!bkey_packed_typecheck(_r)) << 1)) ++ ++#define bkey_lr_packed(_l, _r) \ ++ ((_l)->format + ((_r)->format << 1)) ++ ++#define bkey_copy(_dst, _src) \ ++do { \ ++ BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ ++ !type_is(_dst, struct bkey_packed *)); \ ++ BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ ++ !type_is(_src, struct bkey_packed *)); \ ++ EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ ++ (u64 *) (_dst) < (u64 *) (_src) + \ ++ ((struct bkey *) (_src))->u64s); \ ++ \ ++ memcpy_u64s_small((_dst), (_src), \ ++ ((struct bkey *) (_src))->u64s); \ ++} while (0) ++ ++struct btree; ++ ++struct bkey_format_state { ++ u64 field_min[BKEY_NR_FIELDS]; ++ u64 field_max[BKEY_NR_FIELDS]; ++}; ++ ++void bch2_bkey_format_init(struct bkey_format_state *); ++void bch2_bkey_format_add_key(struct bkey_format_state *, const struct bkey *); ++void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); ++struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); ++const char *bch2_bkey_format_validate(struct bkey_format *); ++ ++__pure ++unsigned bch2_bkey_greatest_differing_bit(const struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++__pure ++unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); ++ ++__pure ++int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, ++ const struct bkey_packed *, ++ const struct btree *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++__pure ++int __bch2_bkey_cmp_packed(const struct bkey_packed *, ++ const struct bkey_packed *, ++ const struct btree *); ++ ++__pure ++int __bch2_bkey_cmp_left_packed(const struct btree *, ++ const struct bkey_packed *, ++ const struct bpos *); ++ ++static inline __pure ++int bkey_cmp_left_packed(const struct btree *b, ++ const struct bkey_packed *l, const struct bpos *r) ++{ ++ return __bch2_bkey_cmp_left_packed(b, l, r); ++} ++ ++/* ++ * we prefer to pass bpos by ref, but it's often enough terribly convenient to ++ * pass it by by val... 
as much as I hate c++, const ref would be nice here: ++ */ ++__pure __flatten ++static inline int bkey_cmp_left_packed_byval(const struct btree *b, ++ const struct bkey_packed *l, ++ struct bpos r) ++{ ++ return bkey_cmp_left_packed(b, l, &r); ++} ++ ++/* ++ * If @_l or @_r are struct bkey * (not bkey_packed *), uses type information to ++ * skip dispatching on k->format: ++ */ ++#define bkey_cmp_packed(_b, _l, _r) \ ++({ \ ++ int _cmp; \ ++ \ ++ switch (bkey_lr_packed_typecheck(_l, _r)) { \ ++ case BKEY_PACKED_NONE: \ ++ _cmp = bkey_cmp(((struct bkey *) (_l))->p, \ ++ ((struct bkey *) (_r))->p); \ ++ break; \ ++ case BKEY_PACKED_LEFT: \ ++ _cmp = bkey_cmp_left_packed((_b), \ ++ (struct bkey_packed *) (_l), \ ++ &((struct bkey *) (_r))->p); \ ++ break; \ ++ case BKEY_PACKED_RIGHT: \ ++ _cmp = -bkey_cmp_left_packed((_b), \ ++ (struct bkey_packed *) (_r), \ ++ &((struct bkey *) (_l))->p); \ ++ break; \ ++ case BKEY_PACKED_BOTH: \ ++ _cmp = __bch2_bkey_cmp_packed((void *) (_l), \ ++ (void *) (_r), (_b)); \ ++ break; \ ++ } \ ++ _cmp; \ ++}) ++ ++#if 1 ++static __always_inline int bkey_cmp(struct bpos l, struct bpos r) ++{ ++ if (l.inode != r.inode) ++ return l.inode < r.inode ? -1 : 1; ++ if (l.offset != r.offset) ++ return l.offset < r.offset ? -1 : 1; ++ if (l.snapshot != r.snapshot) ++ return l.snapshot < r.snapshot ? -1 : 1; ++ return 0; ++} ++#else ++int bkey_cmp(struct bpos l, struct bpos r); ++#endif ++ ++static inline struct bpos bpos_min(struct bpos l, struct bpos r) ++{ ++ return bkey_cmp(l, r) < 0 ? l : r; ++} ++ ++void bch2_bpos_swab(struct bpos *); ++void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); ++ ++static __always_inline int bversion_cmp(struct bversion l, struct bversion r) ++{ ++ return cmp_int(l.hi, r.hi) ?: ++ cmp_int(l.lo, r.lo); ++} ++ ++#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) ++#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) ++ ++static __always_inline int bversion_zero(struct bversion v) ++{ ++ return !bversion_cmp(v, ZERO_VERSION); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++/* statement expressions confusing unlikely()? */ ++#define bkey_packed(_k) \ ++ ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ ++ (_k)->format != KEY_FORMAT_CURRENT; }) ++#else ++#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) ++#endif ++ ++/* ++ * It's safe to treat an unpacked bkey as a packed one, but not the reverse ++ */ ++static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) ++{ ++ return (struct bkey_packed *) k; ++} ++ ++static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) ++{ ++ return (const struct bkey_packed *) k; ++} ++ ++static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? NULL : (struct bkey_i *) k; ++} ++ ++static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) ++{ ++ return bkey_packed(k) ? 
NULL : (const struct bkey *) k; ++} ++ ++static inline unsigned bkey_format_key_bits(const struct bkey_format *format) ++{ ++ return format->bits_per_field[BKEY_FIELD_INODE] + ++ format->bits_per_field[BKEY_FIELD_OFFSET] + ++ format->bits_per_field[BKEY_FIELD_SNAPSHOT]; ++} ++ ++static inline struct bpos bkey_successor(struct bpos p) ++{ ++ struct bpos ret = p; ++ ++ if (!++ret.offset) ++ BUG_ON(!++ret.inode); ++ ++ return ret; ++} ++ ++static inline struct bpos bkey_predecessor(struct bpos p) ++{ ++ struct bpos ret = p; ++ ++ if (!ret.offset--) ++ BUG_ON(!ret.inode--); ++ ++ return ret; ++} ++ ++static inline u64 bkey_start_offset(const struct bkey *k) ++{ ++ return k->p.offset - k->size; ++} ++ ++static inline struct bpos bkey_start_pos(const struct bkey *k) ++{ ++ return (struct bpos) { ++ .inode = k->p.inode, ++ .offset = bkey_start_offset(k), ++ .snapshot = k->p.snapshot, ++ }; ++} ++ ++/* Packed helpers */ ++ ++static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ unsigned ret = bkey_packed(k) ? format->key_u64s : BKEY_U64s; ++ ++ EBUG_ON(k->u64s < ret); ++ return ret; ++} ++ ++static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_key_u64s(format, k) * sizeof(u64); ++} ++ ++static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return k->u64s - bkeyp_key_u64s(format, k); ++} ++ ++static inline size_t bkeyp_val_bytes(const struct bkey_format *format, ++ const struct bkey_packed *k) ++{ ++ return bkeyp_val_u64s(format, k) * sizeof(u64); ++} ++ ++static inline void set_bkeyp_val_u64s(const struct bkey_format *format, ++ struct bkey_packed *k, unsigned val_u64s) ++{ ++ k->u64s = bkeyp_key_u64s(format, k) + val_u64s; ++} ++ ++#define bkeyp_val(_format, _k) \ ++ ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) ++ ++extern const struct bkey_format bch2_bkey_format_current; ++ ++bool bch2_bkey_transform(const struct bkey_format *, ++ struct bkey_packed *, ++ const struct bkey_format *, ++ const struct bkey_packed *); ++ ++struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, ++ const struct bkey_packed *); ++ ++#ifndef HAVE_BCACHEFS_COMPILED_UNPACK ++struct bpos __bkey_unpack_pos(const struct bkey_format *, ++ const struct bkey_packed *); ++#endif ++ ++bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, ++ const struct bkey_format *); ++ ++enum bkey_pack_pos_ret { ++ BKEY_PACK_POS_EXACT, ++ BKEY_PACK_POS_SMALLER, ++ BKEY_PACK_POS_FAIL, ++}; ++ ++enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, ++ const struct btree *); ++ ++static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, ++ const struct btree *b) ++{ ++ return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; ++} ++ ++void bch2_bkey_unpack(const struct btree *, struct bkey_i *, ++ const struct bkey_packed *); ++bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, ++ const struct bkey_format *); ++ ++static inline u64 bkey_field_max(const struct bkey_format *f, ++ enum bch_bkey_fields nr) ++{ ++ return f->bits_per_field[nr] < 64 ++ ? 
(le64_to_cpu(f->field_offset[nr]) + ++ ~(~0ULL << f->bits_per_field[nr])) ++ : U64_MAX; ++} ++ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ ++int bch2_compile_bkey_format(const struct bkey_format *, void *); ++ ++#else ++ ++static inline int bch2_compile_bkey_format(const struct bkey_format *format, ++ void *out) { return 0; } ++ ++#endif ++ ++static inline void bkey_reassemble(struct bkey_i *dst, ++ struct bkey_s_c src) ++{ ++ dst->k = *src.k; ++ memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); ++} ++ ++#define bkey_s_null ((struct bkey_s) { .k = NULL }) ++#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) ++ ++#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) ++#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) ++ ++static inline struct bkey_s bkey_to_s(struct bkey *k) ++{ ++ return (struct bkey_s) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) ++{ ++ return (struct bkey_s_c) { .k = k, .v = NULL }; ++} ++ ++static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) ++{ ++ return (struct bkey_s) { .k = &k->k, .v = &k->v }; ++} ++ ++static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) ++{ ++ return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; ++} ++ ++/* ++ * For a given type of value (e.g. struct bch_extent), generates the types for ++ * bkey + bch_extent - inline, split, split const - and also all the conversion ++ * functions, which also check that the value is of the correct type. ++ * ++ * We use anonymous unions for upcasting - e.g. converting from e.g. a ++ * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion ++ * functions. ++ */ ++#define BKEY_VAL_ACCESSORS(name) \ ++struct bkey_i_##name { \ ++ union { \ ++ struct bkey k; \ ++ struct bkey_i k_i; \ ++ }; \ ++ struct bch_##name v; \ ++}; \ ++ \ ++struct bkey_s_c_##name { \ ++ union { \ ++ struct { \ ++ const struct bkey *k; \ ++ const struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++struct bkey_s_##name { \ ++ union { \ ++ struct { \ ++ struct bkey *k; \ ++ struct bch_##name *v; \ ++ }; \ ++ struct bkey_s_c_##name c; \ ++ struct bkey_s s; \ ++ struct bkey_s_c s_c; \ ++ }; \ ++}; \ ++ \ ++static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline const struct bkey_i_##name * \ ++bkey_i_to_##name##_c(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return container_of(&k->k, struct bkey_i_##name, k); \ ++} \ ++ \ ++static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ ++{ \ ++ EBUG_ON(k.k->type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = k.k, \ ++ .v = container_of(k.v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ ++{ \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++name##_i_to_s_c(const struct bkey_i_##name *k) \ ++{ \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = &k->v, \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_##name 
bkey_i_to_s_##name(struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_s_c_##name \ ++bkey_i_to_s_c_##name(const struct bkey_i *k) \ ++{ \ ++ EBUG_ON(k->k.type != KEY_TYPE_##name); \ ++ return (struct bkey_s_c_##name) { \ ++ .k = &k->k, \ ++ .v = container_of(&k->v, struct bch_##name, v), \ ++ }; \ ++} \ ++ \ ++static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ ++{ \ ++ struct bkey_i_##name *k = \ ++ container_of(&_k->k, struct bkey_i_##name, k); \ ++ \ ++ bkey_init(&k->k); \ ++ memset(&k->v, 0, sizeof(k->v)); \ ++ k->k.type = KEY_TYPE_##name; \ ++ set_bkey_val_bytes(&k->k, sizeof(k->v)); \ ++ \ ++ return k; \ ++} ++ ++BKEY_VAL_ACCESSORS(cookie); ++BKEY_VAL_ACCESSORS(btree_ptr); ++BKEY_VAL_ACCESSORS(extent); ++BKEY_VAL_ACCESSORS(reservation); ++BKEY_VAL_ACCESSORS(inode); ++BKEY_VAL_ACCESSORS(inode_generation); ++BKEY_VAL_ACCESSORS(dirent); ++BKEY_VAL_ACCESSORS(xattr); ++BKEY_VAL_ACCESSORS(alloc); ++BKEY_VAL_ACCESSORS(quota); ++BKEY_VAL_ACCESSORS(stripe); ++BKEY_VAL_ACCESSORS(reflink_p); ++BKEY_VAL_ACCESSORS(reflink_v); ++BKEY_VAL_ACCESSORS(inline_data); ++BKEY_VAL_ACCESSORS(btree_ptr_v2); ++ ++/* byte order helpers */ ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return f->key_u64s - 1; ++} ++ ++#define high_bit_offset 0 ++#define nth_word(p, n) ((p) - (n)) ++ ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++ ++static inline unsigned high_word_offset(const struct bkey_format *f) ++{ ++ return 0; ++} ++ ++#define high_bit_offset KEY_PACKED_BITS_START ++#define nth_word(p, n) ((p) + (n)) ++ ++#else ++#error edit for your odd byteorder. 
++#endif ++ ++#define high_word(f, k) ((k)->_data + high_word_offset(f)) ++#define next_word(p) nth_word(p, 1) ++#define prev_word(p) nth_word(p, -1) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_bkey_pack_test(void); ++#else ++static inline void bch2_bkey_pack_test(void) {} ++#endif ++ ++#endif /* _BCACHEFS_BKEY_H */ +diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c +new file mode 100644 +index 000000000000..36e0c5152b47 +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.c +@@ -0,0 +1,353 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "alloc_background.h" ++#include "dirent.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "quota.h" ++#include "reflink.h" ++#include "xattr.h" ++ ++const char * const bch2_bkey_types[] = { ++#define x(name, nr) #name, ++ BCH_BKEY_TYPES() ++#undef x ++ NULL ++}; ++ ++static const char *deleted_key_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_deleted (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++#define bch2_bkey_ops_discard (struct bkey_ops) { \ ++ .key_invalid = deleted_key_invalid, \ ++} ++ ++static const char *empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k)) ++ return "value size should be zero"; ++ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_error (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static const char *key_type_cookie_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++#define bch2_bkey_ops_cookie (struct bkey_ops) { \ ++ .key_invalid = key_type_cookie_invalid, \ ++} ++ ++#define bch2_bkey_ops_whiteout (struct bkey_ops) { \ ++ .key_invalid = empty_val_key_invalid, \ ++} ++ ++static const char *key_type_inline_data_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ return NULL; ++} ++ ++static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ pr_buf(out, "(%zu bytes)", bkey_val_bytes(k.k)); ++} ++ ++#define bch2_bkey_ops_inline_data (struct bkey_ops) { \ ++ .key_invalid = key_type_inline_data_invalid, \ ++ .val_to_text = key_type_inline_data_to_text, \ ++} ++ ++static const struct bkey_ops bch2_bkey_ops[] = { ++#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, ++ BCH_BKEY_TYPES() ++#undef x ++}; ++ ++const char *bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (k.k->type >= KEY_TYPE_MAX) ++ return "invalid type"; ++ ++ return bch2_bkey_ops[k.k->type].key_invalid(c, k); ++} ++ ++const char *__bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type) ++{ ++ if (k.k->u64s < BKEY_U64s) ++ return "u64s too small"; ++ ++ if (type == BKEY_TYPE_BTREE && ++ bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) ++ return "value too big"; ++ ++ if (btree_node_type_is_extents(type)) { ++ if ((k.k->size == 0) != bkey_deleted(k.k)) ++ return "bad size field"; ++ ++ if (k.k->size > k.k->p.offset) ++ return "size greater than offset"; ++ } else { ++ if (k.k->size) ++ return "nonzero size field"; ++ } ++ ++ if (k.k->p.snapshot) ++ return "nonzero snapshot"; ++ ++ if (type != BKEY_TYPE_BTREE && ++ !bkey_cmp(k.k->p, POS_MAX)) ++ return "POS_MAX key"; ++ ++ return NULL; ++} ++ ++const char *bch2_bkey_invalid(struct 
bch_fs *c, struct bkey_s_c k, ++ enum btree_node_type type) ++{ ++ return __bch2_bkey_invalid(c, k, type) ?: ++ bch2_bkey_val_invalid(c, k); ++} ++ ++const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k) ++{ ++ if (bkey_cmp(k.k->p, b->data->min_key) < 0) ++ return "key before start of btree node"; ++ ++ if (bkey_cmp(k.k->p, b->data->max_key) > 0) ++ return "key past end of btree node"; ++ ++ return NULL; ++} ++ ++void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ const char *invalid; ++ ++ BUG_ON(!k.k->u64s); ++ ++ invalid = bch2_bkey_invalid(c, k, btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, k); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ bch2_fs_inconsistent(c, "invalid bkey %s: %s", buf, invalid); ++ return; ++ } ++ ++ if (ops->key_debugcheck) ++ ops->key_debugcheck(c, k); ++} ++ ++void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) ++{ ++ if (!bkey_cmp(pos, POS_MIN)) ++ pr_buf(out, "POS_MIN"); ++ else if (!bkey_cmp(pos, POS_MAX)) ++ pr_buf(out, "POS_MAX"); ++ else ++ pr_buf(out, "%llu:%llu", pos.inode, pos.offset); ++} ++ ++void bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) ++{ ++ if (k) { ++ pr_buf(out, "u64s %u type %s ", k->u64s, ++ bch2_bkey_types[k->type]); ++ ++ bch2_bpos_to_text(out, k->p); ++ ++ pr_buf(out, " snap %u len %u ver %llu", ++ k->p.snapshot, k->size, k->version.lo); ++ } else { ++ pr_buf(out, "(null)"); ++ } ++} ++ ++void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (likely(ops->val_to_text)) ++ ops->val_to_text(out, c, k); ++} ++ ++void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_to_text(out, k.k); ++ ++ if (k.k) { ++ pr_buf(out, ": "); ++ bch2_val_to_text(out, c, k); ++ } ++} ++ ++void bch2_bkey_swab_val(struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ if (ops->swab) ++ ops->swab(k); ++} ++ ++bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[k.k->type]; ++ ++ return ops->key_normalize ++ ? 
ops->key_normalize(c, k) ++ : false; ++} ++ ++enum merge_result bch2_bkey_merge(struct bch_fs *c, ++ struct bkey_s l, struct bkey_s r) ++{ ++ const struct bkey_ops *ops = &bch2_bkey_ops[l.k->type]; ++ enum merge_result ret; ++ ++ if (key_merging_disabled(c) || ++ !ops->key_merge || ++ l.k->type != r.k->type || ++ bversion_cmp(l.k->version, r.k->version) || ++ bkey_cmp(l.k->p, bkey_start_pos(r.k))) ++ return BCH_MERGE_NOMERGE; ++ ++ ret = ops->key_merge(c, l, r); ++ ++ if (ret != BCH_MERGE_NOMERGE) ++ l.k->needs_whiteout |= r.k->needs_whiteout; ++ return ret; ++} ++ ++static const struct old_bkey_type { ++ u8 btree_node_type; ++ u8 old; ++ u8 new; ++} bkey_renumber_table[] = { ++ {BKEY_TYPE_BTREE, 128, KEY_TYPE_btree_ptr }, ++ {BKEY_TYPE_EXTENTS, 128, KEY_TYPE_extent }, ++ {BKEY_TYPE_EXTENTS, 129, KEY_TYPE_extent }, ++ {BKEY_TYPE_EXTENTS, 130, KEY_TYPE_reservation }, ++ {BKEY_TYPE_INODES, 128, KEY_TYPE_inode }, ++ {BKEY_TYPE_INODES, 130, KEY_TYPE_inode_generation }, ++ {BKEY_TYPE_DIRENTS, 128, KEY_TYPE_dirent }, ++ {BKEY_TYPE_DIRENTS, 129, KEY_TYPE_whiteout }, ++ {BKEY_TYPE_XATTRS, 128, KEY_TYPE_xattr }, ++ {BKEY_TYPE_XATTRS, 129, KEY_TYPE_whiteout }, ++ {BKEY_TYPE_ALLOC, 128, KEY_TYPE_alloc }, ++ {BKEY_TYPE_QUOTAS, 128, KEY_TYPE_quota }, ++}; ++ ++void bch2_bkey_renumber(enum btree_node_type btree_node_type, ++ struct bkey_packed *k, ++ int write) ++{ ++ const struct old_bkey_type *i; ++ ++ for (i = bkey_renumber_table; ++ i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); ++ i++) ++ if (btree_node_type == i->btree_node_type && ++ k->type == (write ? i->new : i->old)) { ++ k->type = write ? i->old : i->new; ++ break; ++ } ++} ++ ++void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ const struct bkey_ops *ops; ++ struct bkey uk; ++ struct bkey_s u; ++ int i; ++ ++ /* ++ * Do these operations in reverse order in the write path: ++ */ ++ ++ for (i = 0; i < 4; i++) ++ switch (!write ? 
i : 3 - i) { ++ case 0: ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_key(f, k); ++ break; ++ case 1: ++ if (version < bcachefs_metadata_version_bkey_renumber) ++ bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); ++ break; ++ case 2: ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) { ++ if (!bkey_packed(k)) { ++ struct bkey_i *u = packed_to_bkey(k); ++ swap(u->k.p.inode, u->k.p.offset); ++ } else if (f->bits_per_field[BKEY_FIELD_INODE] && ++ f->bits_per_field[BKEY_FIELD_OFFSET]) { ++ struct bkey_format tmp = *f, *in = f, *out = &tmp; ++ ++ swap(tmp.bits_per_field[BKEY_FIELD_INODE], ++ tmp.bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(tmp.field_offset[BKEY_FIELD_INODE], ++ tmp.field_offset[BKEY_FIELD_OFFSET]); ++ ++ if (!write) ++ swap(in, out); ++ ++ uk = __bch2_bkey_unpack_key(in, k); ++ swap(uk.p.inode, uk.p.offset); ++ BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); ++ } ++ } ++ break; ++ case 3: ++ if (!bkey_packed(k)) { ++ u = bkey_i_to_s(packed_to_bkey(k)); ++ } else { ++ uk = __bch2_bkey_unpack_key(f, k); ++ u.k = &uk; ++ u.v = bkeyp_val(f, k); ++ } ++ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bkey_swab_val(u); ++ ++ ops = &bch2_bkey_ops[k->type]; ++ ++ if (ops->compat) ++ ops->compat(btree_id, version, big_endian, write, u); ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h +new file mode 100644 +index 000000000000..0bca725ae3b8 +--- /dev/null ++++ b/fs/bcachefs/bkey_methods.h +@@ -0,0 +1,82 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_METHODS_H ++#define _BCACHEFS_BKEY_METHODS_H ++ ++#include "bkey.h" ++ ++struct bch_fs; ++struct btree; ++struct bkey; ++enum btree_node_type; ++ ++extern const char * const bch2_bkey_types[]; ++ ++enum merge_result { ++ BCH_MERGE_NOMERGE, ++ ++ /* ++ * The keys were mergeable, but would have overflowed size - so instead ++ * l was changed to the maximum size, and both keys were modified: ++ */ ++ BCH_MERGE_PARTIAL, ++ BCH_MERGE_MERGE, ++}; ++ ++struct bkey_ops { ++ /* Returns reason for being invalid if invalid, else NULL: */ ++ const char * (*key_invalid)(const struct bch_fs *, ++ struct bkey_s_c); ++ void (*key_debugcheck)(struct bch_fs *, struct bkey_s_c); ++ void (*val_to_text)(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ void (*swab)(struct bkey_s); ++ bool (*key_normalize)(struct bch_fs *, struct bkey_s); ++ enum merge_result (*key_merge)(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ void (*compat)(enum btree_id id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s); ++}; ++ ++const char *bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c); ++const char *__bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type); ++const char *bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, ++ enum btree_node_type); ++const char *bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c); ++ ++void bch2_bkey_debugcheck(struct bch_fs *, struct btree *, struct bkey_s_c); ++ ++void bch2_bpos_to_text(struct printbuf *, struct bpos); ++void bch2_bkey_to_text(struct printbuf *, const struct bkey *); ++void bch2_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++void bch2_bkey_swab_val(struct bkey_s); ++ ++bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); ++ ++enum merge_result bch2_bkey_merge(struct bch_fs *, ++ struct bkey_s, struct 
bkey_s); ++ ++void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); ++ ++void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, ++ int, struct bkey_format *, struct bkey_packed *); ++ ++static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct bkey_format *f, ++ struct bkey_packed *k) ++{ ++ if (version < bcachefs_metadata_version_current || ++ big_endian != CPU_BIG_ENDIAN) ++ __bch2_bkey_compat(level, btree_id, version, ++ big_endian, write, f, k); ++ ++} ++ ++#endif /* _BCACHEFS_BKEY_METHODS_H */ +diff --git a/fs/bcachefs/bkey_on_stack.h b/fs/bcachefs/bkey_on_stack.h +new file mode 100644 +index 000000000000..f607a0cb37ed +--- /dev/null ++++ b/fs/bcachefs/bkey_on_stack.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_ON_STACK_H ++#define _BCACHEFS_BKEY_ON_STACK_H ++ ++#include "bcachefs.h" ++ ++struct bkey_on_stack { ++ struct bkey_i *k; ++ u64 onstack[12]; ++}; ++ ++static inline void bkey_on_stack_realloc(struct bkey_on_stack *s, ++ struct bch_fs *c, unsigned u64s) ++{ ++ if (s->k == (void *) s->onstack && ++ u64s > ARRAY_SIZE(s->onstack)) { ++ s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); ++ memcpy(s->k, s->onstack, sizeof(s->onstack)); ++ } ++} ++ ++static inline void bkey_on_stack_reassemble(struct bkey_on_stack *s, ++ struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bkey_on_stack_realloc(s, c, k.k->u64s); ++ bkey_reassemble(s->k, k); ++} ++ ++static inline void bkey_on_stack_init(struct bkey_on_stack *s) ++{ ++ s->k = (void *) s->onstack; ++} ++ ++static inline void bkey_on_stack_exit(struct bkey_on_stack *s, ++ struct bch_fs *c) ++{ ++ if (s->k != (void *) s->onstack) ++ mempool_free(s->k, &c->large_bkey_pool); ++ s->k = NULL; ++} ++ ++#endif /* _BCACHEFS_BKEY_ON_STACK_H */ +diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c +new file mode 100644 +index 000000000000..839e78d1dc35 +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.c +@@ -0,0 +1,515 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "bkey_sort.h" ++#include "bset.h" ++#include "extents.h" ++ ++typedef int (*sort_cmp_fn)(struct btree *, ++ struct bkey_packed *, ++ struct bkey_packed *); ++ ++static inline bool sort_iter_end(struct sort_iter *iter) ++{ ++ return !iter->used; ++} ++ ++static inline void __sort_iter_sift(struct sort_iter *iter, ++ unsigned from, ++ sort_cmp_fn cmp) ++{ ++ unsigned i; ++ ++ for (i = from; ++ i + 1 < iter->used && ++ cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; ++ i++) ++ swap(iter->data[i], iter->data[i + 1]); ++} ++ ++static inline void sort_iter_sift(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ ++ __sort_iter_sift(iter, 0, cmp); ++} ++ ++static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ unsigned i = iter->used; ++ ++ while (i--) ++ __sort_iter_sift(iter, i, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) ++{ ++ return !sort_iter_end(iter) ? 
iter->data->k : NULL; ++} ++ ++static inline void __sort_iter_advance(struct sort_iter *iter, ++ unsigned idx, sort_cmp_fn cmp) ++{ ++ struct sort_iter_set *i = iter->data + idx; ++ ++ BUG_ON(idx >= iter->used); ++ ++ i->k = bkey_next_skip_noops(i->k, i->end); ++ ++ BUG_ON(i->k > i->end); ++ ++ if (i->k == i->end) ++ array_remove_item(iter->data, iter->used, idx); ++ else ++ __sort_iter_sift(iter, idx, cmp); ++} ++ ++static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) ++{ ++ __sort_iter_advance(iter, 0, cmp); ++} ++ ++static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, ++ sort_cmp_fn cmp) ++{ ++ struct bkey_packed *ret = sort_iter_peek(iter); ++ ++ if (ret) ++ sort_iter_advance(iter, cmp); ++ ++ return ret; ++} ++ ++/* ++ * If keys compare equal, compare by pointer order: ++ */ ++static inline int key_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ cmp_int((unsigned long) l, (unsigned long) r); ++} ++ ++static inline bool should_drop_next_key(struct sort_iter *iter) ++{ ++ /* ++ * key_sort_cmp() ensures that when keys compare equal the older key ++ * comes first; so if l->k compares equal to r->k then l->k is older ++ * and should be dropped. ++ */ ++ return iter->used >= 2 && ++ !bkey_cmp_packed(iter->b, ++ iter->data[0].k, ++ iter->data[1].k); ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) ++{ ++ struct bkey_packed *out = dst->start; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ sort_iter_sort(iter, key_sort_fix_overlapping_cmp); ++ ++ while ((k = sort_iter_peek(iter))) { ++ if (!bkey_whiteout(k) && ++ !should_drop_next_key(iter)) { ++ bkey_copy(out, k); ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ sort_iter_advance(iter, key_sort_fix_overlapping_cmp); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++static void extent_sort_append(struct bch_fs *c, ++ struct bkey_format *f, ++ struct btree_nr_keys *nr, ++ struct bkey_packed **out, ++ struct bkey_s k) ++{ ++ if (!bkey_whiteout(k.k)) { ++ if (!bch2_bkey_pack_key(*out, k.k, f)) ++ memcpy_u64s_small(*out, k.k, BKEY_U64s); ++ ++ memcpy_u64s_small(bkeyp_val(f, *out), k.v, bkey_val_u64s(k.k)); ++ ++ btree_keys_account_key_add(nr, 0, *out); ++ *out = bkey_next(*out); ++ } ++} ++ ++/* Sort + repack in a new format: */ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *dst, struct btree *src, ++ struct btree_node_iter *src_iter, ++ struct bkey_format *out_f, ++ bool filter_whiteouts) ++{ ++ struct bkey_format *in_f = &src->format; ++ struct bkey_packed *in, *out = vstruct_last(dst); ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ ++ while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { ++ if (filter_whiteouts && bkey_whiteout(in)) ++ continue; ++ ++ if (bch2_bkey_transform(out_f, out, bkey_packed(in) ++ ? 
in_f : &bch2_bkey_format_current, in)) ++ out->format = KEY_FORMAT_LOCAL_BTREE; ++ else ++ bch2_bkey_unpack(src, (void *) out, in); ++ ++ btree_keys_account_key_add(&nr, 0, out); ++ out = bkey_next(out); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ return nr; ++} ++ ++/* Sort, repack, and call bch2_bkey_normalize() to drop stale pointers: */ ++struct btree_nr_keys ++bch2_sort_repack_merge(struct bch_fs *c, ++ struct bset *dst, struct btree *src, ++ struct btree_node_iter *iter, ++ struct bkey_format *out_f, ++ bool filter_whiteouts) ++{ ++ struct bkey_packed *out = vstruct_last(dst), *k_packed; ++ struct bkey_on_stack k; ++ struct btree_nr_keys nr; ++ ++ memset(&nr, 0, sizeof(nr)); ++ bkey_on_stack_init(&k); ++ ++ while ((k_packed = bch2_btree_node_iter_next_all(iter, src))) { ++ if (filter_whiteouts && bkey_whiteout(k_packed)) ++ continue; ++ ++ /* ++ * NOTE: ++ * bch2_bkey_normalize may modify the key we pass it (dropping ++ * stale pointers) and we don't have a write lock on the src ++ * node; we have to make a copy of the entire key before calling ++ * normalize ++ */ ++ bkey_on_stack_realloc(&k, c, k_packed->u64s + BKEY_U64s); ++ bch2_bkey_unpack(src, k.k, k_packed); ++ ++ if (filter_whiteouts && ++ bch2_bkey_normalize(c, bkey_i_to_s(k.k))) ++ continue; ++ ++ extent_sort_append(c, out_f, &nr, &out, bkey_i_to_s(k.k)); ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ bkey_on_stack_exit(&k, c); ++ return nr; ++} ++ ++static inline int sort_keys_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: ++ (int) l->needs_whiteout - (int) r->needs_whiteout; ++} ++ ++unsigned bch2_sort_keys(struct bkey_packed *dst, ++ struct sort_iter *iter, ++ bool filter_whiteouts) ++{ ++ const struct bkey_format *f = &iter->b->format; ++ struct bkey_packed *in, *next, *out = dst; ++ ++ sort_iter_sort(iter, sort_keys_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_keys_cmp))) { ++ bool needs_whiteout = false; ++ ++ if (bkey_whiteout(in) && ++ (filter_whiteouts || !in->needs_whiteout)) ++ continue; ++ ++ while ((next = sort_iter_peek(iter)) && ++ !bkey_cmp_packed(iter->b, in, next)) { ++ BUG_ON(in->needs_whiteout && ++ next->needs_whiteout); ++ needs_whiteout |= in->needs_whiteout; ++ in = sort_iter_next(iter, sort_keys_cmp); ++ } ++ ++ if (bkey_whiteout(in)) { ++ memcpy_u64s(out, in, bkeyp_key_u64s(f, in)); ++ set_bkeyp_val_u64s(f, out, 0); ++ } else { ++ bkey_copy(out, in); ++ } ++ out->needs_whiteout |= needs_whiteout; ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} ++ ++/* Compat code for btree_node_old_extent_overwrite: */ ++ ++/* ++ * If keys compare equal, compare by pointer order: ++ * ++ * Necessary for sort_fix_overlapping() - if there are multiple keys that ++ * compare equal in different sets, we have to process them newest to oldest. 
++ */ ++static inline int extent_sort_fix_overlapping_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ struct bkey ul = bkey_unpack_key(b, l); ++ struct bkey ur = bkey_unpack_key(b, r); ++ ++ return bkey_cmp(bkey_start_pos(&ul), ++ bkey_start_pos(&ur)) ?: ++ cmp_int((unsigned long) r, (unsigned long) l); ++} ++ ++/* ++ * The algorithm in extent_sort_fix_overlapping() relies on keys in the same ++ * bset being ordered by start offset - but 0 size whiteouts (which are always ++ * KEY_TYPE_deleted) break this ordering, so we need to skip over them: ++ */ ++static void extent_iter_advance(struct sort_iter *iter, unsigned idx) ++{ ++ struct sort_iter_set *i = iter->data + idx; ++ ++ do { ++ i->k = bkey_next_skip_noops(i->k, i->end); ++ } while (i->k != i->end && bkey_deleted(i->k)); ++ ++ if (i->k == i->end) ++ array_remove_item(iter->data, iter->used, idx); ++ else ++ __sort_iter_sift(iter, idx, extent_sort_fix_overlapping_cmp); ++} ++ ++struct btree_nr_keys ++bch2_extent_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, ++ struct sort_iter *iter) ++{ ++ struct btree *b = iter->b; ++ struct bkey_format *f = &b->format; ++ struct sort_iter_set *_l = iter->data, *_r = iter->data + 1; ++ struct bkey_packed *out = dst->start; ++ struct bkey l_unpacked, r_unpacked; ++ struct bkey_s l, r; ++ struct btree_nr_keys nr; ++ struct bkey_on_stack split; ++ unsigned i; ++ ++ memset(&nr, 0, sizeof(nr)); ++ bkey_on_stack_init(&split); ++ ++ sort_iter_sort(iter, extent_sort_fix_overlapping_cmp); ++ for (i = 0; i < iter->used;) { ++ if (bkey_deleted(iter->data[i].k)) ++ __sort_iter_advance(iter, i, ++ extent_sort_fix_overlapping_cmp); ++ else ++ i++; ++ } ++ ++ while (!sort_iter_end(iter)) { ++ l = __bkey_disassemble(b, _l->k, &l_unpacked); ++ ++ if (iter->used == 1) { ++ extent_sort_append(c, f, &nr, &out, l); ++ extent_iter_advance(iter, 0); ++ continue; ++ } ++ ++ r = __bkey_disassemble(b, _r->k, &r_unpacked); ++ ++ /* If current key and next key don't overlap, just append */ ++ if (bkey_cmp(l.k->p, bkey_start_pos(r.k)) <= 0) { ++ extent_sort_append(c, f, &nr, &out, l); ++ extent_iter_advance(iter, 0); ++ continue; ++ } ++ ++ /* Skip 0 size keys */ ++ if (!r.k->size) { ++ extent_iter_advance(iter, 1); ++ continue; ++ } ++ ++ /* ++ * overlap: keep the newer key and trim the older key so they ++ * don't overlap. comparing pointers tells us which one is ++ * newer, since the bsets are appended one after the other. 
++ */ ++ ++ /* can't happen because of comparison func */ ++ BUG_ON(_l->k < _r->k && ++ !bkey_cmp(bkey_start_pos(l.k), bkey_start_pos(r.k))); ++ ++ if (_l->k > _r->k) { ++ /* l wins, trim r */ ++ if (bkey_cmp(l.k->p, r.k->p) >= 0) { ++ extent_iter_advance(iter, 1); ++ } else { ++ bch2_cut_front_s(l.k->p, r); ++ extent_save(b, _r->k, r.k); ++ __sort_iter_sift(iter, 1, ++ extent_sort_fix_overlapping_cmp); ++ } ++ } else if (bkey_cmp(l.k->p, r.k->p) > 0) { ++ ++ /* ++ * r wins, but it overlaps in the middle of l - split l: ++ */ ++ bkey_on_stack_reassemble(&split, c, l.s_c); ++ bch2_cut_back(bkey_start_pos(r.k), split.k); ++ ++ bch2_cut_front_s(r.k->p, l); ++ extent_save(b, _l->k, l.k); ++ ++ __sort_iter_sift(iter, 0, ++ extent_sort_fix_overlapping_cmp); ++ ++ extent_sort_append(c, f, &nr, &out, ++ bkey_i_to_s(split.k)); ++ } else { ++ bch2_cut_back_s(bkey_start_pos(r.k), l); ++ extent_save(b, _l->k, l.k); ++ } ++ } ++ ++ dst->u64s = cpu_to_le16((u64 *) out - dst->_data); ++ ++ bkey_on_stack_exit(&split, c); ++ return nr; ++} ++ ++static inline int sort_extents_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ?: ++ (int) bkey_deleted(l) - (int) bkey_deleted(r); ++} ++ ++unsigned bch2_sort_extents(struct bkey_packed *dst, ++ struct sort_iter *iter, ++ bool filter_whiteouts) ++{ ++ struct bkey_packed *in, *out = dst; ++ ++ sort_iter_sort(iter, sort_extents_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_extents_cmp))) { ++ if (bkey_deleted(in)) ++ continue; ++ ++ if (bkey_whiteout(in) && ++ (filter_whiteouts || !in->needs_whiteout)) ++ continue; ++ ++ bkey_copy(out, in); ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} ++ ++static inline int sort_extent_whiteouts_cmp(struct btree *b, ++ struct bkey_packed *l, ++ struct bkey_packed *r) ++{ ++ struct bkey ul = bkey_unpack_key(b, l); ++ struct bkey ur = bkey_unpack_key(b, r); ++ ++ return bkey_cmp(bkey_start_pos(&ul), bkey_start_pos(&ur)); ++} ++ ++unsigned bch2_sort_extent_whiteouts(struct bkey_packed *dst, ++ struct sort_iter *iter) ++{ ++ const struct bkey_format *f = &iter->b->format; ++ struct bkey_packed *in, *out = dst; ++ struct bkey_i l, r; ++ bool prev = false, l_packed = false; ++ u64 max_packed_size = bkey_field_max(f, BKEY_FIELD_SIZE); ++ u64 max_packed_offset = bkey_field_max(f, BKEY_FIELD_OFFSET); ++ u64 new_size; ++ ++ max_packed_size = min_t(u64, max_packed_size, KEY_SIZE_MAX); ++ ++ sort_iter_sort(iter, sort_extent_whiteouts_cmp); ++ ++ while ((in = sort_iter_next(iter, sort_extent_whiteouts_cmp))) { ++ if (bkey_deleted(in)) ++ continue; ++ ++ EBUG_ON(bkeyp_val_u64s(f, in)); ++ EBUG_ON(in->type != KEY_TYPE_discard); ++ ++ r.k = bkey_unpack_key(iter->b, in); ++ ++ if (prev && ++ bkey_cmp(l.k.p, bkey_start_pos(&r.k)) >= 0) { ++ if (bkey_cmp(l.k.p, r.k.p) >= 0) ++ continue; ++ ++ new_size = l_packed ++ ? 
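/*
 * Editor's aside -- an illustrative sketch, not part of the bcachefs patch
 * above.  bch2_extent_sort_fix_overlapping() keeps the newer of two
 * overlapping extents intact and trims or splits the older one.  The toy
 * below shows the same three outcomes on half-open integer ranges
 * [start, end); the names are invented for the example.
 */
#include <stdio.h>

struct toy_range { int start, end; };		/* valid iff start < end */

/*
 * Remove the part of @older covered by @newer; at most two pieces of @older
 * survive (a front piece and a back piece).  Returns how many were written.
 */
static int toy_trim_older(struct toy_range older, struct toy_range newer,
			  struct toy_range out[2])
{
	int n = 0;

	if (newer.end <= older.start || older.end <= newer.start) {
		out[n++] = older;	/* no overlap: older survives whole */
		return n;
	}

	if (older.start < newer.start)	/* newer starts inside older: keep front */
		out[n++] = (struct toy_range) { older.start, newer.start };
	if (newer.end < older.end)	/* newer ends inside older: keep back */
		out[n++] = (struct toy_range) { newer.end, older.end };

	return n;			/* 0 pieces: older was fully covered */
}

int main(void)
{
	struct toy_range out[2];
	int i, n = toy_trim_older((struct toy_range) {  0, 100 },
				  (struct toy_range) { 30,  60 }, out);

	for (i = 0; i < n; i++)		/* prints [0,30) and [60,100): a split */
		printf("[%d,%d)\n", out[i].start, out[i].end);
	return 0;
}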
min(max_packed_size, max_packed_offset - ++ bkey_start_offset(&l.k)) ++ : KEY_SIZE_MAX; ++ ++ new_size = min(new_size, r.k.p.offset - ++ bkey_start_offset(&l.k)); ++ ++ BUG_ON(new_size < l.k.size); ++ ++ bch2_key_resize(&l.k, new_size); ++ ++ if (bkey_cmp(l.k.p, r.k.p) >= 0) ++ continue; ++ ++ bch2_cut_front(l.k.p, &r); ++ } ++ ++ if (prev) { ++ if (!bch2_bkey_pack(out, &l, f)) { ++ BUG_ON(l_packed); ++ bkey_copy(out, &l); ++ } ++ out = bkey_next(out); ++ } ++ ++ l = r; ++ prev = true; ++ l_packed = bkey_packed(in); ++ } ++ ++ if (prev) { ++ if (!bch2_bkey_pack(out, &l, f)) { ++ BUG_ON(l_packed); ++ bkey_copy(out, &l); ++ } ++ out = bkey_next(out); ++ } ++ ++ return (u64 *) out - (u64 *) dst; ++} +diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h +new file mode 100644 +index 000000000000..458a051fdac5 +--- /dev/null ++++ b/fs/bcachefs/bkey_sort.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BKEY_SORT_H ++#define _BCACHEFS_BKEY_SORT_H ++ ++struct sort_iter { ++ struct btree *b; ++ unsigned used; ++ unsigned size; ++ ++ struct sort_iter_set { ++ struct bkey_packed *k, *end; ++ } data[MAX_BSETS + 1]; ++}; ++ ++static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) ++{ ++ iter->b = b; ++ iter->used = 0; ++ iter->size = ARRAY_SIZE(iter->data); ++} ++ ++static inline void sort_iter_add(struct sort_iter *iter, ++ struct bkey_packed *k, ++ struct bkey_packed *end) ++{ ++ BUG_ON(iter->used >= iter->size); ++ ++ if (k != end) ++ iter->data[iter->used++] = (struct sort_iter_set) { k, end }; ++} ++ ++struct btree_nr_keys ++bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, ++ struct sort_iter *); ++struct btree_nr_keys ++bch2_extent_sort_fix_overlapping(struct bch_fs *, struct bset *, ++ struct sort_iter *); ++ ++struct btree_nr_keys ++bch2_sort_repack(struct bset *, struct btree *, ++ struct btree_node_iter *, ++ struct bkey_format *, bool); ++struct btree_nr_keys ++bch2_sort_repack_merge(struct bch_fs *, ++ struct bset *, struct btree *, ++ struct btree_node_iter *, ++ struct bkey_format *, bool); ++ ++unsigned bch2_sort_keys(struct bkey_packed *, ++ struct sort_iter *, bool); ++unsigned bch2_sort_extents(struct bkey_packed *, ++ struct sort_iter *, bool); ++ ++unsigned bch2_sort_extent_whiteouts(struct bkey_packed *, ++ struct sort_iter *); ++ ++#endif /* _BCACHEFS_BKEY_SORT_H */ +diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c +new file mode 100644 +index 000000000000..f7c2841ed8a7 +--- /dev/null ++++ b/fs/bcachefs/bset.c +@@ -0,0 +1,1742 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for working with individual keys, and sorted sets of keys with in a ++ * btree node ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "bset.h" ++#include "eytzinger.h" ++#include "util.h" ++ ++#include ++#include ++#include ++#include ++ ++/* hack.. 
*/ ++#include "alloc_types.h" ++#include ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, ++ struct btree *); ++ ++static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) ++{ ++ unsigned n = ARRAY_SIZE(iter->data); ++ ++ while (n && __btree_node_iter_set_end(iter, n - 1)) ++ --n; ++ ++ return n; ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) ++{ ++ unsigned offset = __btree_node_key_to_offset(b, k); ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (offset <= t->end_offset) { ++ EBUG_ON(offset < btree_bkey_first_offset(t)); ++ return t; ++ } ++ ++ BUG(); ++} ++ ++/* ++ * There are never duplicate live keys in the btree - but including keys that ++ * have been flagged as deleted (and will be cleaned up later) we _will_ see ++ * duplicates. ++ * ++ * Thus the sort order is: usual key comparison first, but for keys that compare ++ * equal the deleted key(s) come first, and the (at most one) live version comes ++ * last. ++ * ++ * The main reason for this is insertion: to handle overwrites, we first iterate ++ * over keys that compare equal to our insert key, and then insert immediately ++ * prior to the first key greater than the key we're inserting - our insert ++ * position will be after all keys that compare equal to our insert key, which ++ * by the time we actually do the insert will all be deleted. ++ */ ++ ++void bch2_dump_bset(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned set) ++{ ++ struct bkey_packed *_k, *_n; ++ struct bkey uk, n; ++ struct bkey_s_c k; ++ char buf[200]; ++ ++ if (!i->u64s) ++ return; ++ ++ for (_k = i->start; ++ _k < vstruct_last(i); ++ _k = _n) { ++ _n = bkey_next_skip_noops(_k, vstruct_last(i)); ++ ++ k = bkey_disassemble(b, _k, &uk); ++ if (c) ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ else ++ bch2_bkey_to_text(&PBUF(buf), k.k); ++ printk(KERN_ERR "block %u key %5zu: %s\n", set, ++ _k->_data - i->_data, buf); ++ ++ if (_n == vstruct_last(i)) ++ continue; ++ ++ n = bkey_unpack_key(b, _n); ++ ++ if (bkey_cmp(bkey_start_pos(&n), k.k->p) < 0) { ++ printk(KERN_ERR "Key skipped backwards\n"); ++ continue; ++ } ++ ++ if (!bkey_deleted(k.k) && ++ !bkey_cmp(n.p, k.k->p)) ++ printk(KERN_ERR "Duplicate keys\n"); ++ } ++} ++ ++void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ console_lock(); ++ for_each_bset(b, t) ++ bch2_dump_bset(c, b, bset(b, t), t - b->set); ++ console_unlock(); ++} ++ ++void bch2_dump_btree_node_iter(struct btree *b, ++ struct btree_node_iter *iter) ++{ ++ struct btree_node_iter_set *set; ++ ++ printk(KERN_ERR "btree node iter with %u/%u sets:\n", ++ __btree_node_iter_used(iter), b->nsets); ++ ++ btree_node_iter_for_each(iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk = bkey_unpack_key(b, k); ++ char buf[100]; ++ ++ bch2_bkey_to_text(&PBUF(buf), &uk); ++ printk(KERN_ERR "set %zu key %u: %s\n", ++ t - b->set, set->k, buf); ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ struct bset_tree *t; ++ struct bkey_packed *k; ++ struct btree_nr_keys nr = { 0 }; ++ ++ for_each_bset(b, t) ++ bset_tree_for_each_key(b, t, k) ++ if (!bkey_whiteout(k)) ++ btree_keys_account_key_add(&nr, t - b->set, k); ++ ++ BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); ++} ++ ++static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, ++ struct btree *b) ++{ ++ 
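/*
 * Editor's aside -- an illustrative sketch, not part of the bcachefs patch
 * above.  bch2_dump_bset() walks a bset the only way a bset can be walked:
 * key by key, advancing by each key's own length (bkey_next_skip_noops()),
 * because keys are variable length and packed back to back -- which is also
 * why a plain binary search over a bset is impossible without the auxiliary
 * search trees built later in this file.  The toy below shows that style of
 * walk over a flat u64 buffer where each record starts with its own length;
 * all names are invented for the example.
 */
#include <stdio.h>
#include <stdint.h>

/* a record is its length in u64s (including this header word) plus payload */
static const uint64_t *toy_rec_next(const uint64_t *rec)
{
	return rec + rec[0];
}

int main(void)
{
	/* three records of 2, 3 and 2 u64s laid out back to back */
	uint64_t buf[] = { 2, 10,   3, 20, 21,   2, 30 };
	const uint64_t *end = buf + sizeof(buf) / sizeof(buf[0]);
	const uint64_t *r;

	for (r = buf; r < end; r = toy_rec_next(r))
		printf("record of %llu u64s, first payload word %llu\n",
		       (unsigned long long) r[0], (unsigned long long) r[1]);
	return 0;
}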
struct btree_node_iter iter = *_iter; ++ const struct bkey_packed *k, *n; ++ ++ k = bch2_btree_node_iter_peek_all(&iter, b); ++ __bch2_btree_node_iter_advance(&iter, b); ++ n = bch2_btree_node_iter_peek_all(&iter, b); ++ ++ bkey_unpack_key(b, k); ++ ++ if (n && ++ bkey_iter_cmp(b, k, n) > 0) { ++ struct btree_node_iter_set *set; ++ struct bkey ku = bkey_unpack_key(b, k); ++ struct bkey nu = bkey_unpack_key(b, n); ++ char buf1[80], buf2[80]; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&PBUF(buf1), &ku); ++ bch2_bkey_to_text(&PBUF(buf2), &nu); ++ printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", ++ buf1, buf2); ++ printk(KERN_ERR "iter was:"); ++ ++ btree_node_iter_for_each(_iter, set) { ++ struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ printk(" [%zi %zi]", t - b->set, ++ k->_data - bset(b, t)->_data); ++ } ++ panic("\n"); ++ } ++} ++ ++void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct btree_node_iter_set *set, *s2; ++ struct bkey_packed *k, *p; ++ struct bset_tree *t; ++ ++ if (bch2_btree_node_iter_end(iter)) ++ return; ++ ++ /* Verify no duplicates: */ ++ btree_node_iter_for_each(iter, set) ++ btree_node_iter_for_each(iter, s2) ++ BUG_ON(set != s2 && set->end == s2->end); ++ ++ /* Verify that set->end is correct: */ ++ btree_node_iter_for_each(iter, set) { ++ for_each_bset(b, t) ++ if (set->end == t->end_offset) ++ goto found; ++ BUG(); ++found: ++ BUG_ON(set->k < btree_bkey_first_offset(t) || ++ set->k >= t->end_offset); ++ } ++ ++ /* Verify iterator is sorted: */ ++ btree_node_iter_for_each(iter, set) ++ BUG_ON(set != iter->data && ++ btree_node_iter_cmp(b, set[-1], set[0]) > 0); ++ ++ k = bch2_btree_node_iter_peek_all(iter, b); ++ ++ for_each_bset(b, t) { ++ if (iter->data[0].end == t->end_offset) ++ continue; ++ ++ p = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ ++ BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); ++ } ++} ++ ++void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, ++ struct bkey_packed *insert, unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); ++ struct bkey_packed *next = (void *) (where->_data + clobber_u64s); ++#if 0 ++ BUG_ON(prev && ++ bkey_iter_cmp(b, prev, insert) > 0); ++#else ++ if (prev && ++ bkey_iter_cmp(b, prev, insert) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, prev); ++ struct bkey k2 = bkey_unpack_key(b, insert); ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&PBUF(buf1), &k1); ++ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ ++ panic("prev > insert:\n" ++ "prev key %s\n" ++ "insert key %s\n", ++ buf1, buf2); ++ } ++#endif ++#if 0 ++ BUG_ON(next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0); ++#else ++ if (next != btree_bkey_last(b, t) && ++ bkey_iter_cmp(b, insert, next) > 0) { ++ struct bkey k1 = bkey_unpack_key(b, insert); ++ struct bkey k2 = bkey_unpack_key(b, next); ++ char buf1[100]; ++ char buf2[100]; ++ ++ bch2_dump_btree_node(NULL, b); ++ bch2_bkey_to_text(&PBUF(buf1), &k1); ++ bch2_bkey_to_text(&PBUF(buf2), &k2); ++ ++ panic("insert > next:\n" ++ "insert key %s\n" ++ "next key %s\n", ++ buf1, buf2); ++ } ++#endif ++} ++ ++#else ++ ++static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, ++ struct btree *b) {} ++ ++#endif ++ ++/* Auxiliary search trees */ ++ ++#define 
BFLOAT_FAILED_UNPACKED U8_MAX ++#define BFLOAT_FAILED U8_MAX ++ ++struct bkey_float { ++ u8 exponent; ++ u8 key_offset; ++ u16 mantissa; ++}; ++#define BKEY_MANTISSA_BITS 16 ++ ++static unsigned bkey_float_byte_offset(unsigned idx) ++{ ++ return idx * sizeof(struct bkey_float); ++} ++ ++struct ro_aux_tree { ++ struct bkey_float f[0]; ++}; ++ ++struct rw_aux_tree { ++ u16 offset; ++ struct bpos k; ++}; ++ ++static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) ++{ ++ BUG_ON(t->aux_data_offset == U16_MAX); ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return t->aux_data_offset; ++ case BSET_RO_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + ++ t->size * sizeof(u8), 8); ++ case BSET_RW_AUX_TREE: ++ return t->aux_data_offset + ++ DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned bset_aux_tree_buf_start(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return t == b->set ++ ? DIV_ROUND_UP(b->unpack_fn_len, 8) ++ : bset_aux_tree_buf_end(t - 1); ++} ++ ++static void *__aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return b->aux_data + t->aux_data_offset * 8; ++} ++ ++static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++static u8 *ro_aux_tree_prev(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); ++} ++ ++static struct bkey_float *bkey_float(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned idx) ++{ ++ return ro_aux_tree_base(b, t)->f + idx; ++} ++ ++static void bset_aux_tree_verify(struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ if (t->aux_data_offset == U16_MAX) ++ continue; ++ ++ BUG_ON(t != b->set && ++ t[-1].aux_data_offset == U16_MAX); ++ ++ BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); ++ BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); ++ BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); ++ } ++#endif ++} ++ ++void bch2_btree_keys_init(struct btree *b, bool *expensive_debug_checks) ++{ ++ unsigned i; ++ ++ b->nsets = 0; ++ memset(&b->nr, 0, sizeof(b->nr)); ++#ifdef CONFIG_BCACHEFS_DEBUG ++ b->expensive_debug_checks = expensive_debug_checks; ++#endif ++ for (i = 0; i < MAX_BSETS; i++) ++ b->set[i].data_offset = U16_MAX; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++/* Binary tree stuff for auxiliary search trees */ ++ ++/* ++ * Cacheline/offset <-> bkey pointer arithmetic: ++ * ++ * t->tree is a binary search tree in an array; each node corresponds to a key ++ * in one cacheline in t->set (BSET_CACHELINE bytes). ++ * ++ * This means we don't have to store the full index of the key that a node in ++ * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and ++ * then bkey_float->m gives us the offset within that cacheline, in units of 8 ++ * bytes. ++ * ++ * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to ++ * make this work. ++ * ++ * To construct the bfloat for an arbitrary key we need to know what the key ++ * immediately preceding it is: we have to check if the two keys differ in the ++ * bits we're going to store in bkey_float->mantissa. 
t->prev[j] stores the size ++ * of the previous key so we can walk backwards to it from t->tree[j]'s key. ++ */ ++ ++static inline void *bset_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline) ++{ ++ return (void *) round_down((unsigned long) btree_bkey_first(b, t), ++ L1_CACHE_BYTES) + ++ cacheline * BSET_CACHELINE; ++} ++ ++static struct bkey_packed *cacheline_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ unsigned offset) ++{ ++ return bset_cacheline(b, t, cacheline) + offset * 8; ++} ++ ++static unsigned bkey_to_cacheline(const struct btree *b, ++ const struct bset_tree *t, ++ const struct bkey_packed *k) ++{ ++ return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; ++} ++ ++static ssize_t __bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); ++} ++ ++static unsigned bkey_to_cacheline_offset(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned cacheline, ++ const struct bkey_packed *k) ++{ ++ size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); ++ ++ EBUG_ON(m > U8_MAX); ++ return m; ++} ++ ++static inline struct bkey_packed *tree_to_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ return cacheline_to_bkey(b, t, ++ __eytzinger1_to_inorder(j, t->size, t->extra), ++ bkey_float(b, t, j)->key_offset); ++} ++ ++static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, ++ const struct bset_tree *t, ++ unsigned j) ++{ ++ unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; ++ ++ return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); ++} ++ ++static struct rw_aux_tree *rw_aux_tree(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ ++ return __aux_tree_base(b, t); ++} ++ ++/* ++ * For the write set - the one we're currently inserting keys into - we don't ++ * maintain a full search tree, we just keep a simple lookup table in t->prev. 
++ */ ++static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, ++ struct bset_tree *t, ++ unsigned j) ++{ ++ return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); ++} ++ ++static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, ++ unsigned j, struct bkey_packed *k) ++{ ++ EBUG_ON(k >= btree_bkey_last(b, t)); ++ ++ rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { ++ .offset = __btree_node_key_to_offset(b, k), ++ .k = bkey_unpack_pos(b, k), ++ }; ++} ++ ++static void bch2_bset_verify_rw_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ struct bkey_packed *k = btree_bkey_first(b, t); ++ unsigned j = 0; ++ ++ if (!btree_keys_expensive_checks(b)) ++ return; ++ ++ BUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ BUG_ON(t->size < 1); ++ BUG_ON(rw_aux_to_bkey(b, t, j) != k); ++ ++ goto start; ++ while (1) { ++ if (rw_aux_to_bkey(b, t, j) == k) { ++ BUG_ON(bkey_cmp(rw_aux_tree(b, t)[j].k, ++ bkey_unpack_pos(b, k))); ++start: ++ if (++j == t->size) ++ break; ++ ++ BUG_ON(rw_aux_tree(b, t)[j].offset <= ++ rw_aux_tree(b, t)[j - 1].offset); ++ } ++ ++ k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ BUG_ON(k >= btree_bkey_last(b, t)); ++ } ++} ++ ++/* returns idx of first entry >= offset: */ ++static unsigned rw_aux_tree_bsearch(struct btree *b, ++ struct bset_tree *t, ++ unsigned offset) ++{ ++ unsigned bset_offs = offset - btree_bkey_first_offset(t); ++ unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); ++ unsigned idx = bset_u64s ? bset_offs * t->size / bset_u64s : 0; ++ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); ++ EBUG_ON(!t->size); ++ EBUG_ON(idx > t->size); ++ ++ while (idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset) ++ idx++; ++ ++ while (idx && ++ rw_aux_tree(b, t)[idx - 1].offset >= offset) ++ idx--; ++ ++ EBUG_ON(idx < t->size && ++ rw_aux_tree(b, t)[idx].offset < offset); ++ EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); ++ EBUG_ON(idx + 1 < t->size && ++ rw_aux_tree(b, t)[idx].offset == ++ rw_aux_tree(b, t)[idx + 1].offset); ++ ++ return idx; ++} ++ ++static inline unsigned bkey_mantissa(const struct bkey_packed *k, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++ u64 v; ++ ++ EBUG_ON(!bkey_packed(k)); ++ ++ v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); ++ ++ /* ++ * In little endian, we're shifting off low bits (and then the bits we ++ * want are at the low end), in big endian we're shifting off high bits ++ * (and then the bits we want are at the high end, so we shift them ++ * back down): ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ v >>= f->exponent & 7; ++#else ++ v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; ++#endif ++ return (u16) v; ++} ++ ++static void make_bfloat(struct btree *b, struct bset_tree *t, ++ unsigned j, ++ struct bkey_packed *min_key, ++ struct bkey_packed *max_key) ++{ ++ struct bkey_float *f = bkey_float(b, t, j); ++ struct bkey_packed *m = tree_to_bkey(b, t, j); ++ struct bkey_packed *l, *r; ++ unsigned mantissa; ++ int shift, exponent, high_bit; ++ ++ if (is_power_of_2(j)) { ++ l = min_key; ++ ++ if (!l->u64s) { ++ if (!bkey_pack_pos(l, b->data->min_key, b)) { ++ struct bkey_i tmp; ++ ++ bkey_init(&tmp.k); ++ tmp.k.p = b->data->min_key; ++ bkey_copy(l, &tmp); ++ } ++ } ++ } else { ++ l = tree_to_prev_bkey(b, t, j >> ffs(j)); ++ ++ EBUG_ON(m < l); ++ } ++ ++ if (is_power_of_2(j + 1)) { ++ r = max_key; ++ ++ if (!r->u64s) { ++ if (!bkey_pack_pos(r, t->max_key, b)) { ++ struct bkey_i 
tmp; ++ ++ bkey_init(&tmp.k); ++ tmp.k.p = t->max_key; ++ bkey_copy(r, &tmp); ++ } ++ } ++ } else { ++ r = tree_to_bkey(b, t, j >> (ffz(j) + 1)); ++ ++ EBUG_ON(m > r); ++ } ++ ++ /* ++ * for failed bfloats, the lookup code falls back to comparing against ++ * the original key. ++ */ ++ ++ if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || ++ !b->nr_key_bits) { ++ f->exponent = BFLOAT_FAILED_UNPACKED; ++ return; ++ } ++ ++ /* ++ * The greatest differing bit of l and r is the first bit we must ++ * include in the bfloat mantissa we're creating in order to do ++ * comparisons - that bit always becomes the high bit of ++ * bfloat->mantissa, and thus the exponent we're calculating here is ++ * the position of what will become the low bit in bfloat->mantissa: ++ * ++ * Note that this may be negative - we may be running off the low end ++ * of the key: we handle this later: ++ */ ++ high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), ++ min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); ++ exponent = high_bit - (BKEY_MANTISSA_BITS - 1); ++ ++ /* ++ * Then we calculate the actual shift value, from the start of the key ++ * (k->_data), to get the key bits starting at exponent: ++ */ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; ++ ++ EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); ++#else ++ shift = high_bit_offset + ++ b->nr_key_bits - ++ exponent - ++ BKEY_MANTISSA_BITS; ++ ++ EBUG_ON(shift < KEY_PACKED_BITS_START); ++#endif ++ EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); ++ ++ f->exponent = shift; ++ mantissa = bkey_mantissa(m, f, j); ++ ++ /* ++ * If we've got garbage bits, set them to all 1s - it's legal for the ++ * bfloat to compare larger than the original key, but not smaller: ++ */ ++ if (exponent < 0) ++ mantissa |= ~(~0U << -exponent); ++ ++ f->mantissa = mantissa; ++} ++ ++/* bytes remaining - only valid for last bset: */ ++static unsigned __bset_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ bset_aux_tree_verify(b); ++ ++ return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); ++} ++ ++static unsigned bset_ro_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / ++ (sizeof(struct bkey_float) + sizeof(u8)); ++} ++ ++static unsigned bset_rw_tree_capacity(struct btree *b, struct bset_tree *t) ++{ ++ return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); ++} ++ ++static void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *k; ++ ++ t->size = 1; ++ t->extra = BSET_RW_AUX_TREE_VAL; ++ rw_aux_tree(b, t)[0].offset = ++ __btree_node_key_to_offset(b, btree_bkey_first(b, t)); ++ ++ bset_tree_for_each_key(b, t, k) { ++ if (t->size == bset_rw_tree_capacity(b, t)) ++ break; ++ ++ if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > ++ L1_CACHE_BYTES) ++ rw_aux_tree_set(b, t, t->size++, k); ++ } ++} ++ ++static void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); ++ struct bkey_packed min_key, max_key; ++ unsigned j, cacheline = 1; ++ ++ /* signal to make_bfloat() that they're uninitialized: */ ++ min_key.u64s = max_key.u64s = 0; ++ ++ t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), ++ bset_ro_tree_capacity(b, t)); ++retry: ++ if (t->size < 2) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ return; ++ } ++ ++ t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; 
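/*
 * Editor's aside -- an illustrative sketch, not part of the bcachefs patch
 * above, and a deliberate simplification of make_bfloat()/bkey_mantissa():
 * it works on plain 64-bit keys rather than packed bkeys.  The idea is the
 * same: take a BKEY_MANTISSA_BITS-wide window starting at the highest bit on
 * which a tree node's key differs from its predecessor, store only that
 * window, and fall back to a full comparison when the windows compare equal
 * (differing low bits may have been shifted out).  All names below are
 * invented for the example.
 */
#include <stdio.h>
#include <stdint.h>

#define TOY_MANTISSA_BITS 16

/* lowest bit of the 16-bit window; assumes prev != key */
static unsigned toy_exponent(uint64_t prev, uint64_t key)
{
	unsigned high_bit = 63 - __builtin_clzll(prev ^ key);

	return high_bit >= TOY_MANTISSA_BITS - 1
		? high_bit - (TOY_MANTISSA_BITS - 1)
		: 0;
}

static uint16_t toy_mantissa(uint64_t key, unsigned exponent)
{
	return (uint16_t) (key >> exponent);
}

int main(void)
{
	uint64_t prev = 0x123400ull, key = 0x123480ull, search = 0x123440ull;
	unsigned e = toy_exponent(prev, key);
	uint16_t mk = toy_mantissa(key, e), ms = toy_mantissa(search, e);

	if (ms != mk)
		printf("mantissas decide: go %s\n", ms < mk ? "left" : "right");
	else
		printf("mantissas equal: fall back to full key compare\n");
	return 0;
}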
++ ++ /* First we figure out where the first key in each cacheline is */ ++ eytzinger1_for_each(j, t->size) { ++ while (bkey_to_cacheline(b, t, k) < cacheline) ++ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ ++ if (k >= btree_bkey_last(b, t)) { ++ /* XXX: this path sucks */ ++ t->size--; ++ goto retry; ++ } ++ ++ ro_aux_tree_prev(b, t)[j] = prev->u64s; ++ bkey_float(b, t, j)->key_offset = ++ bkey_to_cacheline_offset(b, t, cacheline++, k); ++ ++ EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); ++ EBUG_ON(tree_to_bkey(b, t, j) != k); ++ } ++ ++ while (k != btree_bkey_last(b, t)) ++ prev = k, k = bkey_next_skip_noops(k, btree_bkey_last(b, t)); ++ ++ t->max_key = bkey_unpack_pos(b, prev); ++ ++ /* Then we build the tree */ ++ eytzinger1_for_each(j, t->size) ++ make_bfloat(b, t, j, &min_key, &max_key); ++} ++ ++static void bset_alloc_tree(struct btree *b, struct bset_tree *t) ++{ ++ struct bset_tree *i; ++ ++ for (i = b->set; i != t; i++) ++ BUG_ON(bset_has_rw_aux_tree(i)); ++ ++ bch2_bset_set_no_aux_tree(b, t); ++ ++ /* round up to next cacheline: */ ++ t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), ++ SMP_CACHE_BYTES / sizeof(u64)); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, ++ bool writeable) ++{ ++ if (writeable ++ ? bset_has_rw_aux_tree(t) ++ : bset_has_ro_aux_tree(t)) ++ return; ++ ++ bset_alloc_tree(b, t); ++ ++ if (!__bset_tree_capacity(b, t)) ++ return; ++ ++ if (writeable) ++ __build_rw_aux_tree(b, t); ++ else ++ __build_ro_aux_tree(b, t); ++ ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_init_first(struct btree *b, struct bset *i) ++{ ++ struct bset_tree *t; ++ ++ BUG_ON(b->nsets); ++ ++ memset(i, 0, sizeof(*i)); ++ get_random_bytes(&i->seq, sizeof(i->seq)); ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++void bch2_bset_init_next(struct bch_fs *c, struct btree *b, ++ struct btree_node_entry *bne) ++{ ++ struct bset *i = &bne->keys; ++ struct bset_tree *t; ++ ++ BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); ++ BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); ++ BUG_ON(b->nsets >= MAX_BSETS); ++ ++ memset(i, 0, sizeof(*i)); ++ i->seq = btree_bset_first(b)->seq; ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ t = &b->set[b->nsets++]; ++ set_btree_bset(b, t, i); ++} ++ ++/* ++ * find _some_ key in the same bset as @k that precedes @k - not necessarily the ++ * immediate predecessor: ++ */ ++static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct bkey_packed *p; ++ unsigned offset; ++ int j; ++ ++ EBUG_ON(k < btree_bkey_first(b, t) || ++ k > btree_bkey_last(b, t)); ++ ++ if (k == btree_bkey_first(b, t)) ++ return NULL; ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ p = btree_bkey_first(b, t); ++ break; ++ case BSET_RO_AUX_TREE: ++ j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); ++ ++ do { ++ p = j ? tree_to_bkey(b, t, ++ __inorder_to_eytzinger1(j--, ++ t->size, t->extra)) ++ : btree_bkey_first(b, t); ++ } while (p >= k); ++ break; ++ case BSET_RW_AUX_TREE: ++ offset = __btree_node_key_to_offset(b, k); ++ j = rw_aux_tree_bsearch(b, t, offset); ++ p = j ? 
rw_aux_to_bkey(b, t, j - 1) ++ : btree_bkey_first(b, t); ++ break; ++ } ++ ++ return p; ++} ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k, ++ unsigned min_key_type) ++{ ++ struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; ++ ++ while ((p = __bkey_prev(b, t, k)) && !ret) { ++ for (i = p; i != k; i = bkey_next_skip_noops(i, k)) ++ if (i->type >= min_key_type) ++ ret = i; ++ ++ k = p; ++ } ++ ++ if (btree_keys_expensive_checks(b)) { ++ BUG_ON(ret >= orig_k); ++ ++ for (i = ret ++ ? bkey_next_skip_noops(ret, orig_k) ++ : btree_bkey_first(b, t); ++ i != orig_k; ++ i = bkey_next_skip_noops(i, orig_k)) ++ BUG_ON(i->type >= min_key_type); ++ } ++ ++ return ret; ++} ++ ++/* Insert */ ++ ++static void rw_aux_tree_fix_invalidated_key(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ unsigned offset = __btree_node_key_to_offset(b, k); ++ unsigned j = rw_aux_tree_bsearch(b, t, offset); ++ ++ if (j < t->size && ++ rw_aux_tree(b, t)[j].offset == offset) ++ rw_aux_tree_set(b, t, j, k); ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++} ++ ++static void ro_aux_tree_fix_invalidated_key(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct bkey_packed min_key, max_key; ++ unsigned inorder, j; ++ ++ EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); ++ ++ /* signal to make_bfloat() that they're uninitialized: */ ++ min_key.u64s = max_key.u64s = 0; ++ ++ if (bkey_next_skip_noops(k, btree_bkey_last(b, t)) == btree_bkey_last(b, t)) { ++ t->max_key = bkey_unpack_pos(b, k); ++ ++ for (j = 1; j < t->size; j = j * 2 + 1) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ ++ inorder = bkey_to_cacheline(b, t, k); ++ ++ if (inorder && ++ inorder < t->size) { ++ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ ++ if (k == tree_to_bkey(b, t, j)) { ++ /* Fix the node this key corresponds to */ ++ make_bfloat(b, t, j, &min_key, &max_key); ++ ++ /* Children for which this key is the right boundary */ ++ for (j = eytzinger1_left_child(j); ++ j < t->size; ++ j = eytzinger1_right_child(j)) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ } ++ ++ if (inorder + 1 < t->size) { ++ j = __inorder_to_eytzinger1(inorder + 1, t->size, t->extra); ++ ++ if (k == tree_to_prev_bkey(b, t, j)) { ++ make_bfloat(b, t, j, &min_key, &max_key); ++ ++ /* Children for which this key is the left boundary */ ++ for (j = eytzinger1_right_child(j); ++ j < t->size; ++ j = eytzinger1_left_child(j)) ++ make_bfloat(b, t, j, &min_key, &max_key); ++ } ++ } ++} ++ ++/** ++ * bch2_bset_fix_invalidated_key() - given an existing key @k that has been ++ * modified, fix any auxiliary search tree by remaking all the nodes in the ++ * auxiliary search tree that @k corresponds to ++ */ ++void bch2_bset_fix_invalidated_key(struct btree *b, struct bkey_packed *k) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ break; ++ case BSET_RO_AUX_TREE: ++ ro_aux_tree_fix_invalidated_key(b, t, k); ++ break; ++ case BSET_RW_AUX_TREE: ++ rw_aux_tree_fix_invalidated_key(b, t, k); ++ break; ++ } ++} ++ ++static void bch2_bset_fix_lookup_table(struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *_where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ int shift = new_u64s - clobber_u64s; ++ unsigned l, j, where = __btree_node_key_to_offset(b, _where); ++ ++ EBUG_ON(bset_has_ro_aux_tree(t)); ++ ++ if (!bset_has_rw_aux_tree(t)) ++ return; ++ ++ /* returns first 
entry >= where */ ++ l = rw_aux_tree_bsearch(b, t, where); ++ ++ if (!l) /* never delete first entry */ ++ l++; ++ else if (l < t->size && ++ where < t->end_offset && ++ rw_aux_tree(b, t)[l].offset == where) ++ rw_aux_tree_set(b, t, l++, _where); ++ ++ /* l now > where */ ++ ++ for (j = l; ++ j < t->size && ++ rw_aux_tree(b, t)[j].offset < where + clobber_u64s; ++ j++) ++ ; ++ ++ if (j < t->size && ++ rw_aux_tree(b, t)[j].offset + shift == ++ rw_aux_tree(b, t)[l - 1].offset) ++ j++; ++ ++ memmove(&rw_aux_tree(b, t)[l], ++ &rw_aux_tree(b, t)[j], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[j]); ++ t->size -= j - l; ++ ++ for (j = l; j < t->size; j++) ++ rw_aux_tree(b, t)[j].offset += shift; ++ ++ EBUG_ON(l < t->size && ++ rw_aux_tree(b, t)[l].offset == ++ rw_aux_tree(b, t)[l - 1].offset); ++ ++ if (t->size < bset_rw_tree_capacity(b, t) && ++ (l < t->size ++ ? rw_aux_tree(b, t)[l].offset ++ : t->end_offset) - ++ rw_aux_tree(b, t)[l - 1].offset > ++ L1_CACHE_BYTES / sizeof(u64)) { ++ struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); ++ struct bkey_packed *end = l < t->size ++ ? rw_aux_to_bkey(b, t, l) ++ : btree_bkey_last(b, t); ++ struct bkey_packed *k = start; ++ ++ while (1) { ++ k = bkey_next_skip_noops(k, end); ++ if (k == end) ++ break; ++ ++ if ((void *) k - (void *) start >= L1_CACHE_BYTES) { ++ memmove(&rw_aux_tree(b, t)[l + 1], ++ &rw_aux_tree(b, t)[l], ++ (void *) &rw_aux_tree(b, t)[t->size] - ++ (void *) &rw_aux_tree(b, t)[l]); ++ t->size++; ++ rw_aux_tree_set(b, t, l, k); ++ break; ++ } ++ } ++ } ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bset_aux_tree_verify(b); ++} ++ ++void bch2_bset_insert(struct btree *b, ++ struct btree_node_iter *iter, ++ struct bkey_packed *where, ++ struct bkey_i *insert, ++ unsigned clobber_u64s) ++{ ++ struct bkey_format *f = &b->format; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bkey_packed packed, *src = bkey_to_packed(insert); ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); ++ ++ if (bch2_bkey_pack_key(&packed, &insert->k, f)) ++ src = &packed; ++ ++ if (!bkey_whiteout(&insert->k)) ++ btree_keys_account_key_add(&b->nr, t - b->set, src); ++ ++ if (src->u64s != clobber_u64s) { ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data + src->u64s; ++ ++ EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < ++ (int) clobber_u64s - src->u64s); ++ ++ memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); ++ set_btree_bset_end(b, t); ++ } ++ ++ memcpy_u64s(where, src, ++ bkeyp_key_u64s(f, src)); ++ memcpy_u64s(bkeyp_val(f, where), &insert->v, ++ bkeyp_val_u64s(f, src)); ++ ++ if (src->u64s != clobber_u64s) ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_bset_delete(struct btree *b, ++ struct bkey_packed *where, ++ unsigned clobber_u64s) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ u64 *src_p = where->_data + clobber_u64s; ++ u64 *dst_p = where->_data; ++ ++ bch2_bset_verify_rw_aux_tree(b, t); ++ ++ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); ++ ++ memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); ++ le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); ++ set_btree_bset_end(b, t); ++ ++ bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); ++} ++ ++/* Lookup */ ++ ++__flatten ++static struct bkey_packed *bset_search_write_set(const struct btree *b, ++ 
struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *packed_search) ++{ ++ unsigned l = 0, r = t->size; ++ ++ while (l + 1 != r) { ++ unsigned m = (l + r) >> 1; ++ ++ if (bkey_cmp(rw_aux_tree(b, t)[m].k, *search) < 0) ++ l = m; ++ else ++ r = m; ++ } ++ ++ return rw_aux_to_bkey(b, t, l); ++} ++ ++static inline void prefetch_four_cachelines(void *p) ++{ ++#ifdef CONFIG_X86_64 ++ asm(".intel_syntax noprefix;" ++ "prefetcht0 [%0 - 127 + 64 * 0];" ++ "prefetcht0 [%0 - 127 + 64 * 1];" ++ "prefetcht0 [%0 - 127 + 64 * 2];" ++ "prefetcht0 [%0 - 127 + 64 * 3];" ++ ".att_syntax prefix;" ++ : ++ : "r" (p + 127)); ++#else ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ prefetch(p + L1_CACHE_BYTES * 3); ++#endif ++} ++ ++static inline bool bkey_mantissa_bits_dropped(const struct btree *b, ++ const struct bkey_float *f, ++ unsigned idx) ++{ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++ unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; ++ ++ return f->exponent > key_bits_start; ++#else ++ unsigned key_bits_end = high_bit_offset + b->nr_key_bits; ++ ++ return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; ++#endif ++} ++ ++__flatten ++static struct bkey_packed *bset_search_tree(const struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *packed_search) ++{ ++ struct ro_aux_tree *base = ro_aux_tree_base(b, t); ++ struct bkey_float *f; ++ struct bkey_packed *k; ++ unsigned inorder, n = 1, l, r; ++ int cmp; ++ ++ do { ++ if (likely(n << 4 < t->size)) ++ prefetch(&base->f[n << 4]); ++ ++ f = &base->f[n]; ++ ++ if (!unlikely(packed_search)) ++ goto slowpath; ++ if (unlikely(f->exponent >= BFLOAT_FAILED)) ++ goto slowpath; ++ ++ l = f->mantissa; ++ r = bkey_mantissa(packed_search, f, n); ++ ++ if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) ++ goto slowpath; ++ ++ n = n * 2 + (l < r); ++ continue; ++slowpath: ++ k = tree_to_bkey(b, t, n); ++ cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); ++ if (!cmp) ++ return k; ++ ++ n = n * 2 + (cmp < 0); ++ } while (n < t->size); ++ ++ inorder = __eytzinger1_to_inorder(n >> 1, t->size, t->extra); ++ ++ /* ++ * n would have been the node we recursed to - the low bit tells us if ++ * we recursed left or recursed right. ++ */ ++ if (likely(!(n & 1))) { ++ --inorder; ++ if (unlikely(!inorder)) ++ return btree_bkey_first(b, t); ++ ++ f = &base->f[eytzinger1_prev(n >> 1, t->size)]; ++ } ++ ++ return cacheline_to_bkey(b, t, inorder, f->key_offset); ++} ++ ++static __always_inline __flatten ++struct bkey_packed *__bch2_bset_search(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ const struct bkey_packed *lossy_packed_search) ++{ ++ ++ /* ++ * First, we search for a cacheline, then lastly we do a linear search ++ * within that cacheline. ++ * ++ * To search for the cacheline, there's three different possibilities: ++ * * The set is too small to have a search tree, so we just do a linear ++ * search over the whole set. ++ * * The set is the one we're currently inserting into; keeping a full ++ * auxiliary search tree up to date would be too expensive, so we ++ * use a much simpler lookup table to do a binary search - ++ * bset_search_write_set(). 
++ * * Or we use the auxiliary search tree we constructed earlier - ++ * bset_search_tree() ++ */ ++ ++ switch (bset_aux_tree_type(t)) { ++ case BSET_NO_AUX_TREE: ++ return btree_bkey_first(b, t); ++ case BSET_RW_AUX_TREE: ++ return bset_search_write_set(b, t, search, lossy_packed_search); ++ case BSET_RO_AUX_TREE: ++ /* ++ * Each node in the auxiliary search tree covers a certain range ++ * of bits, and keys above and below the set it covers might ++ * differ outside those bits - so we have to special case the ++ * start and end - handle that here: ++ */ ++ ++ if (bkey_cmp(*search, t->max_key) > 0) ++ return btree_bkey_last(b, t); ++ ++ return bset_search_tree(b, t, search, lossy_packed_search); ++ default: ++ unreachable(); ++ } ++} ++ ++static __always_inline __flatten ++struct bkey_packed *bch2_bset_search_linear(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search, ++ struct bkey_packed *m) ++{ ++ if (lossy_packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_cmp_p_or_unp(b, m, ++ lossy_packed_search, search) < 0) ++ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); ++ ++ if (!packed_search) ++ while (m != btree_bkey_last(b, t) && ++ bkey_iter_pos_cmp(b, m, search) < 0) ++ m = bkey_next_skip_noops(m, btree_bkey_last(b, t)); ++ ++ if (btree_keys_expensive_checks(b)) { ++ struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); ++ ++ BUG_ON(prev && ++ bkey_iter_cmp_p_or_unp(b, prev, ++ packed_search, search) >= 0); ++ } ++ ++ return m; ++} ++ ++/* ++ * Returns the first key greater than or equal to @search ++ */ ++static __always_inline __flatten ++struct bkey_packed *bch2_bset_search(struct btree *b, ++ struct bset_tree *t, ++ struct bpos *search, ++ struct bkey_packed *packed_search, ++ const struct bkey_packed *lossy_packed_search) ++{ ++ struct bkey_packed *m = __bch2_bset_search(b, t, search, ++ lossy_packed_search); ++ ++ return bch2_bset_search_linear(b, t, search, ++ packed_search, lossy_packed_search, m); ++} ++ ++/* Btree node iterator */ ++ ++static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ if (k != end) { ++ struct btree_node_iter_set *pos; ++ ++ btree_node_iter_for_each(iter, pos) ++ ; ++ ++ BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); ++ *pos = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++} ++ ++void bch2_btree_node_iter_push(struct btree_node_iter *iter, ++ struct btree *b, ++ const struct bkey_packed *k, ++ const struct bkey_packed *end) ++{ ++ __bch2_btree_node_iter_push(iter, b, k, end); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++noinline __flatten __attribute__((cold)) ++static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bset_tree *t; ++ ++ trace_bkey_pack_pos_fail(search); ++ ++ for_each_bset(b, t) ++ __bch2_btree_node_iter_push(iter, b, ++ bch2_bset_search(b, t, search, NULL, NULL), ++ btree_bkey_last(b, t)); ++ ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++/** ++ * bch_btree_node_iter_init - initialize a btree node iterator, starting from a ++ * given position ++ * ++ * Main entry point to the lookup code for individual btree nodes: ++ * ++ * NOTE: ++ * ++ * When you don't filter out deleted keys, btree nodes _do_ contain duplicate ++ * keys. 
This doesn't matter for most code, but it does matter for lookups. ++ * ++ * Some adjacent keys with a string of equal keys: ++ * i j k k k k l m ++ * ++ * If you search for k, the lookup code isn't guaranteed to return you any ++ * specific k. The lookup code is conceptually doing a binary search and ++ * iterating backwards is very expensive so if the pivot happens to land at the ++ * last k that's what you'll get. ++ * ++ * This works out ok, but it's something to be aware of: ++ * ++ * - For non extents, we guarantee that the live key comes last - see ++ * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't ++ * see will only be deleted keys you don't care about. ++ * ++ * - For extents, deleted keys sort last (see the comment at the top of this ++ * file). But when you're searching for extents, you actually want the first ++ * key strictly greater than your search key - an extent that compares equal ++ * to the search key is going to have 0 sectors after the search key. ++ * ++ * But this does mean that we can't just search for ++ * bkey_successor(start_of_range) to get the first extent that overlaps with ++ * the range we want - if we're unlucky and there's an extent that ends ++ * exactly where we searched, then there could be a deleted key at the same ++ * position and we'd get that when we search instead of the preceding extent ++ * we needed. ++ * ++ * So we've got to search for start_of_range, then after the lookup iterate ++ * past any extents that compare equal to the position we searched for. ++ */ ++__flatten ++void bch2_btree_node_iter_init(struct btree_node_iter *iter, ++ struct btree *b, struct bpos *search) ++{ ++ struct bkey_packed p, *packed_search = NULL; ++ struct btree_node_iter_set *pos = iter->data; ++ struct bkey_packed *k[MAX_BSETS]; ++ unsigned i; ++ ++ EBUG_ON(bkey_cmp(*search, b->data->min_key) < 0); ++ bset_aux_tree_verify(b); ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { ++ case BKEY_PACK_POS_EXACT: ++ packed_search = &p; ++ break; ++ case BKEY_PACK_POS_SMALLER: ++ packed_search = NULL; ++ break; ++ case BKEY_PACK_POS_FAIL: ++ btree_node_iter_init_pack_failed(iter, b, search); ++ return; ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ k[i] = __bch2_bset_search(b, b->set + i, search, &p); ++ prefetch_four_cachelines(k[i]); ++ } ++ ++ for (i = 0; i < b->nsets; i++) { ++ struct bset_tree *t = b->set + i; ++ struct bkey_packed *end = btree_bkey_last(b, t); ++ ++ k[i] = bch2_bset_search_linear(b, t, search, ++ packed_search, &p, k[i]); ++ if (k[i] != end) ++ *pos++ = (struct btree_node_iter_set) { ++ __btree_node_key_to_offset(b, k[i]), ++ __btree_node_key_to_offset(b, end) ++ }; ++ } ++ ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ memset(iter, 0, sizeof(*iter)); ++ ++ for_each_bset(b, t) ++ __bch2_btree_node_iter_push(iter, b, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ bch2_btree_node_iter_sort(iter, b); ++} ++ ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) ++ return __btree_node_offset_to_key(b, set->k); ++ ++ return btree_bkey_last(b, t); ++} ++ ++static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned first) 
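/*
 * Editor's aside -- an illustrative sketch, not part of the bcachefs patch
 * above.  bset_search_tree() walks its auxiliary tree with the classic
 * eytzinger-layout descent "n = n * 2 + (went_right)", touching one small
 * array node per level so prefetching works well.  The toy below shows the
 * same layout and descent for a plain sorted int array; bcachefs maps the
 * final node back with __eytzinger1_to_inorder() and compares 16-bit
 * mantissas instead of whole elements, while the toy recovers the answer
 * with the ffs trick.  All names here are invented for the example.
 */
#include <stdio.h>

#define TOY_N 7

static int toy_sorted[TOY_N] = { 1, 3, 5, 9, 11, 20, 25 };
static int toy_eytz[TOY_N + 1];		/* 1-based; children of k are 2k, 2k+1 */
static int toy_build_idx;

static void toy_eytz_build(int k)
{
	if (k > TOY_N)
		return;
	toy_eytz_build(2 * k);				/* left subtree first... */
	toy_eytz[k] = toy_sorted[toy_build_idx++];	/* ...so in-order fill */
	toy_eytz_build(2 * k + 1);
}

/* index in toy_eytz[] of the first element >= x, or 0 if there is none */
static int toy_eytz_lower_bound(int x)
{
	int k = 1;

	while (k <= TOY_N)
		k = 2 * k + (toy_eytz[k] < x);	/* branchless descent */

	return k >> __builtin_ffs(~k);	/* undo the trailing "went right" steps */
}

int main(void)
{
	int k;

	toy_eytz_build(1);
	k = toy_eytz_lower_bound(10);
	printf("lower_bound(10) = %d\n", k ? toy_eytz[k] : -1);	/* prints 11 */
	return 0;
}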
++{ ++ bool ret; ++ ++ if ((ret = (btree_node_iter_cmp(b, ++ iter->data[first], ++ iter->data[first + 1]) > 0))) ++ swap(iter->data[first], iter->data[first + 1]); ++ return ret; ++} ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ /* unrolled bubble sort: */ ++ ++ if (!__btree_node_iter_set_end(iter, 2)) { ++ btree_node_iter_sort_two(iter, b, 0); ++ btree_node_iter_sort_two(iter, b, 1); ++ } ++ ++ if (!__btree_node_iter_set_end(iter, 1)) ++ btree_node_iter_sort_two(iter, b, 0); ++} ++ ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, ++ struct btree_node_iter_set *set) ++{ ++ struct btree_node_iter_set *last = ++ iter->data + ARRAY_SIZE(iter->data) - 1; ++ ++ memmove(&set[0], &set[1], (void *) last - (void *) set); ++ *last = (struct btree_node_iter_set) { 0, 0 }; ++} ++ ++static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; ++ ++ EBUG_ON(iter->data->k > iter->data->end); ++ ++ while (!__btree_node_iter_set_end(iter, 0) && ++ !__bch2_btree_node_iter_peek_all(iter, b)->u64s) ++ iter->data->k++; ++ ++ if (unlikely(__btree_node_iter_set_end(iter, 0))) { ++ bch2_btree_node_iter_set_drop(iter, iter->data); ++ return; ++ } ++ ++ if (__btree_node_iter_set_end(iter, 1)) ++ return; ++ ++ if (!btree_node_iter_sort_two(iter, b, 0)) ++ return; ++ ++ if (__btree_node_iter_set_end(iter, 2)) ++ return; ++ ++ btree_node_iter_sort_two(iter, b, 1); ++} ++ ++void bch2_btree_node_iter_advance(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ if (btree_keys_expensive_checks(b)) { ++ bch2_btree_node_iter_verify(iter, b); ++ bch2_btree_node_iter_next_check(iter, b); ++ } ++ ++ __bch2_btree_node_iter_advance(iter, b); ++} ++ ++/* ++ * Expensive: ++ */ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ struct bkey_packed *k, *prev = NULL; ++ struct btree_node_iter_set *set; ++ struct bset_tree *t; ++ unsigned end = 0; ++ ++ if (btree_keys_expensive_checks(b)) ++ bch2_btree_node_iter_verify(iter, b); ++ ++ for_each_bset(b, t) { ++ k = bch2_bkey_prev_all(b, t, ++ bch2_btree_node_iter_bset_pos(iter, b, t)); ++ if (k && ++ (!prev || bkey_iter_cmp(b, k, prev) > 0)) { ++ prev = k; ++ end = t->end_offset; ++ } ++ } ++ ++ if (!prev) ++ return NULL; ++ ++ /* ++ * We're manually memmoving instead of just calling sort() to ensure the ++ * prev we picked ends up in slot 0 - sort won't necessarily put it ++ * there because of duplicate deleted keys: ++ */ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == end) ++ goto found; ++ ++ BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); ++found: ++ BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); ++ ++ memmove(&iter->data[1], ++ &iter->data[0], ++ (void *) set - (void *) &iter->data[0]); ++ ++ iter->data[0].k = __btree_node_key_to_offset(b, prev); ++ iter->data[0].end = end; ++ ++ if (btree_keys_expensive_checks(b)) ++ bch2_btree_node_iter_verify(iter, b); ++ return prev; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned min_key_type) ++{ ++ struct bkey_packed *prev; ++ ++ do { ++ prev = bch2_btree_node_iter_prev_all(iter, b); ++ } while (prev && prev->type < min_key_type); ++ ++ return prev; ++} ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bkey *u) ++{ ++ struct bkey_packed *k = 
bch2_btree_node_iter_peek(iter, b); ++ ++ return k ? bkey_disassemble(b, k, u) : bkey_s_c_null; ++} ++ ++/* Mergesort */ ++ ++void bch2_btree_keys_stats(struct btree *b, struct bset_stats *stats) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) { ++ enum bset_aux_tree_type type = bset_aux_tree_type(t); ++ size_t j; ++ ++ stats->sets[type].nr++; ++ stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * ++ sizeof(u64); ++ ++ if (bset_has_ro_aux_tree(t)) { ++ stats->floats += t->size - 1; ++ ++ for (j = 1; j < t->size; j++) ++ stats->failed += ++ bkey_float(b, t, j)->exponent == ++ BFLOAT_FAILED; ++ } ++ } ++} ++ ++void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, ++ struct bkey_packed *k) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ struct bkey uk; ++ unsigned j, inorder; ++ ++ if (out->pos != out->end) ++ *out->pos = '\0'; ++ ++ if (!bset_has_ro_aux_tree(t)) ++ return; ++ ++ inorder = bkey_to_cacheline(b, t, k); ++ if (!inorder || inorder >= t->size) ++ return; ++ ++ j = __inorder_to_eytzinger1(inorder, t->size, t->extra); ++ if (k != tree_to_bkey(b, t, j)) ++ return; ++ ++ switch (bkey_float(b, t, j)->exponent) { ++ case BFLOAT_FAILED: ++ uk = bkey_unpack_key(b, k); ++ pr_buf(out, ++ " failed unpacked at depth %u\n" ++ "\t%llu:%llu\n", ++ ilog2(j), ++ uk.p.inode, uk.p.offset); ++ break; ++ } ++} +diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h +new file mode 100644 +index 000000000000..5921cf689105 +--- /dev/null ++++ b/fs/bcachefs/bset.h +@@ -0,0 +1,661 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BSET_H ++#define _BCACHEFS_BSET_H ++ ++#include ++#include ++ ++#include "bcachefs_format.h" ++#include "bkey.h" ++#include "bkey_methods.h" ++#include "btree_types.h" ++#include "util.h" /* for time_stats */ ++#include "vstructs.h" ++ ++/* ++ * BKEYS: ++ * ++ * A bkey contains a key, a size field, a variable number of pointers, and some ++ * ancillary flag bits. ++ * ++ * We use two different functions for validating bkeys, bkey_invalid and ++ * bkey_deleted(). ++ * ++ * The one exception to the rule that ptr_invalid() filters out invalid keys is ++ * that it also filters out keys of size 0 - these are keys that have been ++ * completely overwritten. It'd be safe to delete these in memory while leaving ++ * them on disk, just unnecessary work - so we filter them out when resorting ++ * instead. ++ * ++ * We can't filter out stale keys when we're resorting, because garbage ++ * collection needs to find them to ensure bucket gens don't wrap around - ++ * unless we're rewriting the btree node those stale keys still exist on disk. ++ * ++ * We also implement functions here for removing some number of sectors from the ++ * front or the back of a bkey - this is mainly used for fixing overlapping ++ * extents, by removing the overlapping sectors from the older key. ++ * ++ * BSETS: ++ * ++ * A bset is an array of bkeys laid out contiguously in memory in sorted order, ++ * along with a header. A btree node is made up of a number of these, written at ++ * different times. ++ * ++ * There could be many of them on disk, but we never allow there to be more than ++ * 4 in memory - we lazily resort as needed. ++ * ++ * We implement code here for creating and maintaining auxiliary search trees ++ * (described below) for searching an individial bset, and on top of that we ++ * implement a btree iterator. 
++ * ++ * BTREE ITERATOR: ++ * ++ * Most of the code in bcache doesn't care about an individual bset - it needs ++ * to search entire btree nodes and iterate over them in sorted order. ++ * ++ * The btree iterator code serves both functions; it iterates through the keys ++ * in a btree node in sorted order, starting from either keys after a specific ++ * point (if you pass it a search key) or the start of the btree node. ++ * ++ * AUXILIARY SEARCH TREES: ++ * ++ * Since keys are variable length, we can't use a binary search on a bset - we ++ * wouldn't be able to find the start of the next key. But binary searches are ++ * slow anyways, due to terrible cache behaviour; bcache originally used binary ++ * searches and that code topped out at under 50k lookups/second. ++ * ++ * So we need to construct some sort of lookup table. Since we only insert keys ++ * into the last (unwritten) set, most of the keys within a given btree node are ++ * usually in sets that are mostly constant. We use two different types of ++ * lookup tables to take advantage of this. ++ * ++ * Both lookup tables share in common that they don't index every key in the ++ * set; they index one key every BSET_CACHELINE bytes, and then a linear search ++ * is used for the rest. ++ * ++ * For sets that have been written to disk and are no longer being inserted ++ * into, we construct a binary search tree in an array - traversing a binary ++ * search tree in an array gives excellent locality of reference and is very ++ * fast, since both children of any node are adjacent to each other in memory ++ * (and their grandchildren, and great grandchildren...) - this means ++ * prefetching can be used to great effect. ++ * ++ * It's quite useful performance wise to keep these nodes small - not just ++ * because they're more likely to be in L2, but also because we can prefetch ++ * more nodes on a single cacheline and thus prefetch more iterations in advance ++ * when traversing this tree. ++ * ++ * Nodes in the auxiliary search tree must contain both a key to compare against ++ * (we don't want to fetch the key from the set, that would defeat the purpose), ++ * and a pointer to the key. We use a few tricks to compress both of these. ++ * ++ * To compress the pointer, we take advantage of the fact that one node in the ++ * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have ++ * a function (to_inorder()) that takes the index of a node in a binary tree and ++ * returns what its index would be in an inorder traversal, so we only have to ++ * store the low bits of the offset. ++ * ++ * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To ++ * compress that, we take advantage of the fact that when we're traversing the ++ * search tree at every iteration we know that both our search key and the key ++ * we're looking for lie within some range - bounded by our previous ++ * comparisons. (We special case the start of a search so that this is true even ++ * at the root of the tree). ++ * ++ * So we know the key we're looking for is between a and b, and a and b don't ++ * differ higher than bit 50, we don't need to check anything higher than bit ++ * 50. ++ * ++ * We don't usually need the rest of the bits, either; we only need enough bits ++ * to partition the key range we're currently checking. Consider key n - the ++ * key our auxiliary search tree node corresponds to, and key p, the key ++ * immediately preceding n. 
The lowest bit we need to store in the auxiliary ++ * search tree is the highest bit that differs between n and p. ++ * ++ * Note that this could be bit 0 - we might sometimes need all 80 bits to do the ++ * comparison. But we'd really like our nodes in the auxiliary search tree to be ++ * of fixed size. ++ * ++ * The solution is to make them fixed size, and when we're constructing a node ++ * check if p and n differed in the bits we needed them to. If they don't we ++ * flag that node, and when doing lookups we fallback to comparing against the ++ * real key. As long as this doesn't happen to often (and it seems to reliably ++ * happen a bit less than 1% of the time), we win - even on failures, that key ++ * is then more likely to be in cache than if we were doing binary searches all ++ * the way, since we're touching so much less memory. ++ * ++ * The keys in the auxiliary search tree are stored in (software) floating ++ * point, with an exponent and a mantissa. The exponent needs to be big enough ++ * to address all the bits in the original key, but the number of bits in the ++ * mantissa is somewhat arbitrary; more bits just gets us fewer failures. ++ * ++ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys ++ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes. ++ * We need one node per 128 bytes in the btree node, which means the auxiliary ++ * search trees take up 3% as much memory as the btree itself. ++ * ++ * Constructing these auxiliary search trees is moderately expensive, and we ++ * don't want to be constantly rebuilding the search tree for the last set ++ * whenever we insert another key into it. For the unwritten set, we use a much ++ * simpler lookup table - it's just a flat array, so index i in the lookup table ++ * corresponds to the i range of BSET_CACHELINE bytes in the set. Indexing ++ * within each byte range works the same as with the auxiliary search trees. ++ * ++ * These are much easier to keep up to date when we insert a key - we do it ++ * somewhat lazily; when we shift a key up we usually just increment the pointer ++ * to it, only when it would overflow do we go to the trouble of finding the ++ * first key in that range of bytes again. ++ */ ++ ++extern bool bch2_expensive_debug_checks; ++ ++static inline bool btree_keys_expensive_checks(const struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ return bch2_expensive_debug_checks || *b->expensive_debug_checks; ++#else ++ return false; ++#endif ++} ++ ++enum bset_aux_tree_type { ++ BSET_NO_AUX_TREE, ++ BSET_RO_AUX_TREE, ++ BSET_RW_AUX_TREE, ++}; ++ ++#define BSET_TREE_NR_TYPES 3 ++ ++#define BSET_NO_AUX_TREE_VAL (U16_MAX) ++#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1) ++ ++static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t) ++{ ++ switch (t->extra) { ++ case BSET_NO_AUX_TREE_VAL: ++ EBUG_ON(t->size); ++ return BSET_NO_AUX_TREE; ++ case BSET_RW_AUX_TREE_VAL: ++ EBUG_ON(!t->size); ++ return BSET_RW_AUX_TREE; ++ default: ++ EBUG_ON(!t->size); ++ return BSET_RO_AUX_TREE; ++ } ++} ++ ++/* ++ * BSET_CACHELINE was originally intended to match the hardware cacheline size - ++ * it used to be 64, but I realized the lookup code would touch slightly less ++ * memory if it was 128. ++ * ++ * It definites the number of bytes (in struct bset) per struct bkey_float in ++ * the auxiliar search tree - when we're done searching the bset_float tree we ++ * have this many bytes left that we do a linear search over. 
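A sketch of the node layout the preceding paragraphs describe: 7 exponent bits are enough to address any bit of the ~80-bit keys, 3 bits locate an 8-byte-aligned key within its cacheline, and 22 mantissa bits fill out a 4-byte node. The real struct bkey_float in bset.c may pack its fields differently; this only mirrors the arithmetic given above:

#include <stdint.h>
#include <stddef.h>

/* One auxiliary search tree node, 7 + 3 + 22 = 32 bits total: */
struct aux_node {
	uint32_t	exponent:7;	/* which bit of the key the mantissa starts at */
	uint32_t	key_offset:3;	/* low bits of the key's offset in its cacheline */
	uint32_t	mantissa:22;	/* the compared bits themselves */
};

/* One node indexes BSET_CACHELINE (128) bytes of keys, so the overhead is
 * sizeof(struct aux_node) / 128 = 4 / 128, i.e. the ~3% quoted above. */
#define MODEL_BSET_CACHELINE	128

static inline size_t aux_tree_bytes(size_t btree_node_bytes)
{
	return (btree_node_bytes / MODEL_BSET_CACHELINE) * sizeof(struct aux_node);
}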
++ * ++ * Since (after level 5) every level of the bset_tree is on a new cacheline, ++ * we're touching one fewer cacheline in the bset tree in exchange for one more ++ * cacheline in the linear search - but the linear search might stop before it ++ * gets to the second cacheline. ++ */ ++ ++#define BSET_CACHELINE 128 ++ ++static inline size_t btree_keys_cachelines(struct btree *b) ++{ ++ return (1U << b->byte_order) / BSET_CACHELINE; ++} ++ ++static inline size_t btree_aux_data_bytes(struct btree *b) ++{ ++ return btree_keys_cachelines(b) * 8; ++} ++ ++static inline size_t btree_aux_data_u64s(struct btree *b) ++{ ++ return btree_aux_data_bytes(b) / sizeof(u64); ++} ++ ++typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); ++ ++static inline void ++__bkey_unpack_key_format_checked(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ { ++ compiled_unpack_fn unpack_fn = b->aux_data; ++ unpack_fn(dst, src); ++ ++ if (btree_keys_expensive_checks(b)) { ++ struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); ++ ++ BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); ++ } ++ } ++#else ++ *dst = __bch2_bkey_unpack_key(&b->format, src); ++#endif ++} ++ ++static inline struct bkey ++bkey_unpack_key_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ struct bkey dst; ++ ++ __bkey_unpack_key_format_checked(b, &dst, src); ++ return dst; ++} ++ ++static inline void __bkey_unpack_key(const struct btree *b, ++ struct bkey *dst, ++ const struct bkey_packed *src) ++{ ++ if (likely(bkey_packed(src))) ++ __bkey_unpack_key_format_checked(b, dst, src); ++ else ++ *dst = *packed_to_bkey_c(src); ++} ++ ++/** ++ * bkey_unpack_key -- unpack just the key, not the value ++ */ ++static inline struct bkey bkey_unpack_key(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? bkey_unpack_key_format_checked(b, src) ++ : *packed_to_bkey_c(src); ++} ++ ++static inline struct bpos ++bkey_unpack_pos_format_checked(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++#ifdef HAVE_BCACHEFS_COMPILED_UNPACK ++ return bkey_unpack_key_format_checked(b, src).p; ++#else ++ return __bkey_unpack_pos(&b->format, src); ++#endif ++} ++ ++static inline struct bpos bkey_unpack_pos(const struct btree *b, ++ const struct bkey_packed *src) ++{ ++ return likely(bkey_packed(src)) ++ ? 
bkey_unpack_pos_format_checked(b, src) ++ : packed_to_bkey_c(src)->p; ++} ++ ++/* Disassembled bkeys */ ++ ++static inline struct bkey_s_c bkey_disassemble(struct btree *b, ++ const struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; ++} ++ ++/* non const version: */ ++static inline struct bkey_s __bkey_disassemble(struct btree *b, ++ struct bkey_packed *k, ++ struct bkey *u) ++{ ++ __bkey_unpack_key(b, u, k); ++ ++ return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; ++} ++ ++#define for_each_bset(_b, _t) \ ++ for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) ++ ++#define bset_tree_for_each_key(_b, _t, _k) \ ++ for (_k = btree_bkey_first(_b, _t); \ ++ _k != btree_bkey_last(_b, _t); \ ++ _k = bkey_next_skip_noops(_k, btree_bkey_last(_b, _t))) ++ ++static inline bool bset_has_ro_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; ++} ++ ++static inline bool bset_has_rw_aux_tree(struct bset_tree *t) ++{ ++ return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; ++} ++ ++static inline void bch2_bset_set_no_aux_tree(struct btree *b, ++ struct bset_tree *t) ++{ ++ BUG_ON(t < b->set); ++ ++ for (; t < b->set + ARRAY_SIZE(b->set); t++) { ++ t->size = 0; ++ t->extra = BSET_NO_AUX_TREE_VAL; ++ t->aux_data_offset = U16_MAX; ++ } ++} ++ ++static inline void btree_node_set_format(struct btree *b, ++ struct bkey_format f) ++{ ++ int len; ++ ++ b->format = f; ++ b->nr_key_bits = bkey_format_key_bits(&f); ++ ++ len = bch2_compile_bkey_format(&b->format, b->aux_data); ++ BUG_ON(len < 0 || len > U8_MAX); ++ ++ b->unpack_fn_len = len; ++ ++ bch2_bset_set_no_aux_tree(b, b->set); ++} ++ ++static inline struct bset *bset_next_set(struct btree *b, ++ unsigned block_bytes) ++{ ++ struct bset *i = btree_bset_last(b); ++ ++ EBUG_ON(!is_power_of_2(block_bytes)); ++ ++ return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); ++} ++ ++void bch2_btree_keys_init(struct btree *, bool *); ++ ++void bch2_bset_init_first(struct btree *, struct bset *); ++void bch2_bset_init_next(struct bch_fs *, struct btree *, ++ struct btree_node_entry *); ++void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); ++void bch2_bset_fix_invalidated_key(struct btree *, struct bkey_packed *); ++ ++void bch2_bset_insert(struct btree *, struct btree_node_iter *, ++ struct bkey_packed *, struct bkey_i *, unsigned); ++void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); ++ ++/* Bkey utility code */ ++ ++/* packed or unpacked */ ++static inline int bkey_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) ++{ ++ EBUG_ON(r_packed && !bkey_packed(r_packed)); ++ ++ if (unlikely(!bkey_packed(l))) ++ return bkey_cmp(packed_to_bkey_c(l)->p, *r); ++ ++ if (likely(r_packed)) ++ return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); ++ ++ return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); ++} ++ ++struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); ++ ++struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, ++ struct bkey_packed *, unsigned); ++ ++static inline struct bkey_packed * ++bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ return bch2_bkey_prev_filter(b, t, k, 0); ++} ++ ++static inline struct bkey_packed * ++bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) ++{ ++ 
return bch2_bkey_prev_filter(b, t, k, KEY_TYPE_discard + 1); ++} ++ ++enum bch_extent_overlap { ++ BCH_EXTENT_OVERLAP_ALL = 0, ++ BCH_EXTENT_OVERLAP_BACK = 1, ++ BCH_EXTENT_OVERLAP_FRONT = 2, ++ BCH_EXTENT_OVERLAP_MIDDLE = 3, ++}; ++ ++/* Returns how k overlaps with m */ ++static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, ++ const struct bkey *m) ++{ ++ int cmp1 = bkey_cmp(k->p, m->p) < 0; ++ int cmp2 = bkey_cmp(bkey_start_pos(k), ++ bkey_start_pos(m)) > 0; ++ ++ return (cmp1 << 1) + cmp2; ++} ++ ++/* Btree key iteration */ ++ ++void bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, ++ const struct bkey_packed *, ++ const struct bkey_packed *); ++void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, ++ struct bpos *); ++void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, ++ struct btree *, ++ struct bset_tree *); ++ ++void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); ++void bch2_btree_node_iter_set_drop(struct btree_node_iter *, ++ struct btree_node_iter_set *); ++void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); ++ ++#define btree_node_iter_for_each(_iter, _set) \ ++ for (_set = (_iter)->data; \ ++ _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ ++ (_set)->k != (_set)->end; \ ++ _set++) ++ ++static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, ++ unsigned i) ++{ ++ return iter->data[i].k == iter->data[i].end; ++} ++ ++static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) ++{ ++ return __btree_node_iter_set_end(iter, 0); ++} ++ ++/* ++ * When keys compare equal, deleted keys compare first: ++ * ++ * XXX: only need to compare pointers for keys that are both within a ++ * btree_node_iterator - we need to break ties for prev() to work correctly ++ */ ++static inline int bkey_iter_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r) ++{ ++ return bkey_cmp_packed(b, l, r) ++ ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) ++ ?: cmp_int(l, r); ++} ++ ++static inline int btree_node_iter_cmp(const struct btree *b, ++ struct btree_node_iter_set l, ++ struct btree_node_iter_set r) ++{ ++ return bkey_iter_cmp(b, ++ __btree_node_offset_to_key(b, l.k), ++ __btree_node_offset_to_key(b, r.k)); ++} ++ ++/* These assume r (the search key) is not a deleted key: */ ++static inline int bkey_iter_pos_cmp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bpos *r) ++{ ++ return bkey_cmp_left_packed(b, l, r) ++ ?: -((int) bkey_deleted(l)); ++} ++ ++static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, ++ const struct bkey_packed *l, ++ const struct bkey_packed *r_packed, ++ const struct bpos *r) ++{ ++ return bkey_cmp_p_or_unp(b, l, r_packed, r) ++ ?: -((int) bkey_deleted(l)); ++} ++ ++static inline struct bkey_packed * ++__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ return __btree_node_offset_to_key(b, iter->data->k); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek_filter(struct btree_node_iter *iter, ++ struct btree *b, ++ unsigned min_key_type) ++{ ++ while (!bch2_btree_node_iter_end(iter)) { ++ struct bkey_packed *k = __bch2_btree_node_iter_peek_all(iter, b); ++ ++ if (k->type >= min_key_type) ++ return k; ++ ++ bch2_btree_node_iter_advance(iter, b); ++ } ++ ++ return NULL; ++} ++ ++static inline 
struct bkey_packed * ++bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, ++ struct btree *b) ++{ ++ return bch2_btree_node_iter_peek_filter(iter, b, 0); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) ++{ ++ return bch2_btree_node_iter_peek_filter(iter, b, KEY_TYPE_discard + 1); ++} ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) ++{ ++ struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); ++ ++ if (ret) ++ bch2_btree_node_iter_advance(iter, b); ++ ++ return ret; ++} ++ ++struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, ++ struct btree *); ++struct bkey_packed *bch2_btree_node_iter_prev_filter(struct btree_node_iter *, ++ struct btree *, unsigned); ++ ++static inline struct bkey_packed * ++bch2_btree_node_iter_prev(struct btree_node_iter *iter, struct btree *b) ++{ ++ return bch2_btree_node_iter_prev_filter(iter, b, KEY_TYPE_discard + 1); ++} ++ ++struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, ++ struct btree *, ++ struct bkey *); ++ ++#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ ++ for (bch2_btree_node_iter_init_from_start((iter), (b)); \ ++ (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ ++ bch2_btree_node_iter_advance(iter, b)) ++ ++/* Accounting: */ ++ ++static inline void btree_keys_account_key(struct btree_nr_keys *n, ++ unsigned bset, ++ struct bkey_packed *k, ++ int sign) ++{ ++ n->live_u64s += k->u64s * sign; ++ n->bset_u64s[bset] += k->u64s * sign; ++ ++ if (bkey_packed(k)) ++ n->packed_keys += sign; ++ else ++ n->unpacked_keys += sign; ++} ++ ++static inline void btree_keys_account_val_delta(struct btree *b, ++ struct bkey_packed *k, ++ int delta) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, k); ++ ++ b->nr.live_u64s += delta; ++ b->nr.bset_u64s[t - b->set] += delta; ++} ++ ++#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, 1) ++#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ ++ btree_keys_account_key(_nr, _bset_idx, _k, -1) ++ ++#define btree_account_key_add(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) ++#define btree_account_key_drop(_b, _k) \ ++ btree_keys_account_key(&(_b)->nr, \ ++ bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) ++ ++struct bset_stats { ++ struct { ++ size_t nr, bytes; ++ } sets[BSET_TREE_NR_TYPES]; ++ ++ size_t floats; ++ size_t failed; ++}; ++ ++void bch2_btree_keys_stats(struct btree *, struct bset_stats *); ++void bch2_bfloat_to_text(struct printbuf *, struct btree *, ++ struct bkey_packed *); ++ ++/* Debug stuff */ ++ ++void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); ++void bch2_dump_btree_node(struct bch_fs *, struct btree *); ++void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_verify_btree_nr_keys(struct btree *); ++void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); ++void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, ++ struct bkey_packed *, unsigned); ++ ++#else ++ ++static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} ++static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, ++ struct btree *b) {} ++static inline void bch2_verify_insert_pos(struct btree *b, ++ struct bkey_packed *where, ++ struct 
bkey_packed *insert, ++ unsigned clobber_u64s) {} ++#endif ++ ++static inline void bch2_verify_btree_nr_keys(struct btree *b) ++{ ++ if (btree_keys_expensive_checks(b)) ++ __bch2_verify_btree_nr_keys(b); ++} ++ ++#endif /* _BCACHEFS_BSET_H */ +diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c +new file mode 100644 +index 000000000000..bb94fa2341ee +--- /dev/null ++++ b/fs/bcachefs/btree_cache.c +@@ -0,0 +1,1063 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "debug.h" ++ ++#include ++#include ++#include ++ ++const char * const bch2_btree_ids[] = { ++#define x(kwd, val, name) name, ++ BCH_BTREE_IDS() ++#undef x ++ NULL ++}; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *c) ++{ ++ unsigned i, reserve = 16; ++ ++ if (!c->btree_roots[0].b) ++ reserve += 8; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ reserve += min_t(unsigned, 1, ++ c->btree_roots[i].b->c.level) * 8; ++ ++ c->btree_cache.reserve = reserve; ++} ++ ++static inline unsigned btree_cache_can_free(struct btree_cache *bc) ++{ ++ return max_t(int, 0, bc->used - bc->reserve); ++} ++ ++static void __btree_node_data_free(struct bch_fs *c, struct btree *b) ++{ ++ EBUG_ON(btree_node_write_in_flight(b)); ++ ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ vfree(b->aux_data); ++ b->aux_data = NULL; ++} ++ ++static void btree_node_data_free(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ __btree_node_data_free(c, b); ++ bc->used--; ++ list_move(&b->list, &bc->freed); ++} ++ ++static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct btree *b = obj; ++ const u64 *v = arg->key; ++ ++ return b->hash_val == *v ? 
0 : 1; ++} ++ ++static const struct rhashtable_params bch_btree_cache_params = { ++ .head_offset = offsetof(struct btree, hash), ++ .key_offset = offsetof(struct btree, hash_val), ++ .key_len = sizeof(u64), ++ .obj_cmpfn = bch2_btree_cache_cmp_fn, ++}; ++ ++static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) ++{ ++ BUG_ON(b->data || b->aux_data); ++ ++ b->data = kvpmalloc(btree_bytes(c), gfp); ++ if (!b->data) ++ return -ENOMEM; ++ ++ b->aux_data = vmalloc_exec(btree_aux_data_bytes(b), gfp); ++ if (!b->aux_data) { ++ kvpfree(b->data, btree_bytes(c)); ++ b->data = NULL; ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++static struct btree *__btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree *b = kzalloc(sizeof(struct btree), GFP_KERNEL); ++ if (!b) ++ return NULL; ++ ++ bkey_btree_ptr_init(&b->key); ++ six_lock_init(&b->c.lock); ++ INIT_LIST_HEAD(&b->list); ++ INIT_LIST_HEAD(&b->write_blocked); ++ b->byte_order = ilog2(btree_bytes(c)); ++ return b; ++} ++ ++static struct btree *btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b = __btree_node_mem_alloc(c); ++ if (!b) ++ return NULL; ++ ++ if (btree_node_data_alloc(c, b, GFP_KERNEL)) { ++ kfree(b); ++ return NULL; ++ } ++ ++ bc->used++; ++ list_add(&b->list, &bc->freeable); ++ return b; ++} ++ ++/* Btree in memory cache - hash table */ ++ ++void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) ++{ ++ rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); ++ ++ /* Cause future lookups for this node to fail: */ ++ b->hash_val = 0; ++ ++ six_lock_wakeup_all(&b->c.lock); ++} ++ ++int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) ++{ ++ BUG_ON(b->hash_val); ++ b->hash_val = btree_ptr_hash_val(&b->key); ++ ++ return rhashtable_lookup_insert_fast(&bc->table, &b->hash, ++ bch_btree_cache_params); ++} ++ ++int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, ++ unsigned level, enum btree_id id) ++{ ++ int ret; ++ ++ b->c.level = level; ++ b->c.btree_id = id; ++ ++ mutex_lock(&bc->lock); ++ ret = __bch2_btree_node_hash_insert(bc, b); ++ if (!ret) ++ list_add(&b->list, &bc->live); ++ mutex_unlock(&bc->lock); ++ ++ return ret; ++} ++ ++__flatten ++static inline struct btree *btree_cache_find(struct btree_cache *bc, ++ const struct bkey_i *k) ++{ ++ u64 v = btree_ptr_hash_val(k); ++ ++ return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); ++} ++ ++/* ++ * this version is for btree nodes that have already been freed (we're not ++ * reaping a real btree node) ++ */ ++static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ int ret = 0; ++ ++ lockdep_assert_held(&bc->lock); ++ ++ if (!six_trylock_intent(&b->c.lock)) ++ return -ENOMEM; ++ ++ if (!six_trylock_write(&b->c.lock)) ++ goto out_unlock_intent; ++ ++ if (btree_node_noevict(b)) ++ goto out_unlock; ++ ++ if (!btree_node_may_write(b)) ++ goto out_unlock; ++ ++ if (btree_node_dirty(b) && ++ test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ goto out_unlock; ++ ++ if (btree_node_dirty(b) || ++ btree_node_write_in_flight(b) || ++ btree_node_read_in_flight(b)) { ++ if (!flush) ++ goto out_unlock; ++ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ /* ++ * Using the underscore version because we don't want to compact ++ * bsets after the write, since this node is about to be evicted ++ * - unless btree verify 
mode is enabled, since it runs out of ++ * the post write cleanup: ++ */ ++ if (verify_btree_ondisk(c)) ++ bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ else ++ __bch2_btree_node_write(c, b, SIX_LOCK_read); ++ ++ /* wait for any in flight btree write */ ++ btree_node_wait_on_io(b); ++ } ++out: ++ if (b->hash_val && !ret) ++ trace_btree_node_reap(c, b); ++ return ret; ++out_unlock: ++ six_unlock_write(&b->c.lock); ++out_unlock_intent: ++ six_unlock_intent(&b->c.lock); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int btree_node_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, false); ++} ++ ++static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) ++{ ++ return __btree_node_reclaim(c, b, true); ++} ++ ++static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b, *t; ++ unsigned long nr = sc->nr_to_scan; ++ unsigned long can_free; ++ unsigned long touched = 0; ++ unsigned long freed = 0; ++ unsigned i, flags; ++ ++ if (btree_shrinker_disabled(c)) ++ return SHRINK_STOP; ++ ++ /* Return -1 if we can't do anything right now */ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ return -1; ++ ++ flags = memalloc_nofs_save(); ++ ++ /* ++ * It's _really_ critical that we don't free too many btree nodes - we ++ * have to always leave ourselves a reserve. The reserve is how we ++ * guarantee that allocating memory for a new btree node can always ++ * succeed, so that inserting keys into the btree can always succeed and ++ * IO can always make forward progress: ++ */ ++ nr /= btree_pages(c); ++ can_free = btree_cache_can_free(bc); ++ nr = min_t(unsigned long, nr, can_free); ++ ++ i = 0; ++ list_for_each_entry_safe(b, t, &bc->freeable, list) { ++ touched++; ++ ++ if (freed >= nr) ++ break; ++ ++ if (++i > 3 && ++ !btree_node_reclaim(c, b)) { ++ btree_node_data_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ freed++; ++ } ++ } ++restart: ++ list_for_each_entry_safe(b, t, &bc->live, list) { ++ touched++; ++ ++ if (freed >= nr) { ++ /* Save position */ ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ break; ++ } ++ ++ if (!btree_node_accessed(b) && ++ !btree_node_reclaim(c, b)) { ++ /* can't call bch2_btree_node_hash_remove under lock */ ++ freed++; ++ if (&t->list != &bc->live) ++ list_move_tail(&bc->live, &t->list); ++ ++ btree_node_data_free(c, b); ++ mutex_unlock(&bc->lock); ++ ++ bch2_btree_node_hash_remove(bc, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ if (freed >= nr) ++ goto out; ++ ++ if (sc->gfp_mask & __GFP_FS) ++ mutex_lock(&bc->lock); ++ else if (!mutex_trylock(&bc->lock)) ++ goto out; ++ goto restart; ++ } else ++ clear_btree_node_accessed(b); ++ } ++ ++ memalloc_nofs_restore(flags); ++ mutex_unlock(&bc->lock); ++out: ++ return (unsigned long) freed * btree_pages(c); ++} ++ ++static unsigned long bch2_btree_cache_count(struct shrinker *shrink, ++ struct shrink_control *sc) ++{ ++ struct bch_fs *c = container_of(shrink, struct bch_fs, ++ btree_cache.shrink); ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (btree_shrinker_disabled(c)) ++ return 0; ++ ++ return btree_cache_can_free(bc) * btree_pages(c); ++} ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; 
++ struct btree *b; ++ unsigned i, flags; ++ ++ if (bc->shrink.list.next) ++ unregister_shrinker(&bc->shrink); ++ ++ /* vfree() can allocate memory: */ ++ flags = memalloc_nofs_save(); ++ mutex_lock(&bc->lock); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ if (c->verify_data) ++ list_move(&c->verify_data->list, &bc->live); ++ ++ kvpfree(c->verify_ondisk, btree_bytes(c)); ++#endif ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].b) ++ list_add(&c->btree_roots[i].b->list, &bc->live); ++ ++ list_splice(&bc->freeable, &bc->live); ++ ++ while (!list_empty(&bc->live)) { ++ b = list_first_entry(&bc->live, struct btree, list); ++ ++ BUG_ON(btree_node_read_in_flight(b) || ++ btree_node_write_in_flight(b)); ++ ++ if (btree_node_dirty(b)) ++ bch2_btree_complete_write(c, b, btree_current_write(b)); ++ clear_btree_node_dirty(b); ++ ++ btree_node_data_free(c, b); ++ } ++ ++ while (!list_empty(&bc->freed)) { ++ b = list_first_entry(&bc->freed, struct btree, list); ++ list_del(&b->list); ++ kfree(b); ++ } ++ ++ mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); ++ ++ if (bc->table_init_done) ++ rhashtable_destroy(&bc->table); ++} ++ ++int bch2_fs_btree_cache_init(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ unsigned i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ ret = rhashtable_init(&bc->table, &bch_btree_cache_params); ++ if (ret) ++ goto out; ++ ++ bc->table_init_done = true; ++ ++ bch2_recalc_btree_reserve(c); ++ ++ for (i = 0; i < bc->reserve; i++) ++ if (!btree_node_mem_alloc(c)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_splice_init(&bc->live, &bc->freeable); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ mutex_init(&c->verify_lock); ++ ++ c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); ++ if (!c->verify_ondisk) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ c->verify_data = btree_node_mem_alloc(c); ++ if (!c->verify_data) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_del_init(&c->verify_data->list); ++#endif ++ ++ bc->shrink.count_objects = bch2_btree_cache_count; ++ bc->shrink.scan_objects = bch2_btree_cache_scan; ++ bc->shrink.seeks = 4; ++ bc->shrink.batch = btree_pages(c) * 2; ++ register_shrinker(&bc->shrink); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++void bch2_fs_btree_cache_init_early(struct btree_cache *bc) ++{ ++ mutex_init(&bc->lock); ++ INIT_LIST_HEAD(&bc->live); ++ INIT_LIST_HEAD(&bc->freeable); ++ INIT_LIST_HEAD(&bc->freed); ++} ++ ++/* ++ * We can only have one thread cannibalizing other cached btree nodes at a time, ++ * or we'll deadlock. We use an open coded mutex to ensure that, which a ++ * cannibalize_bucket() will take. This means every time we unlock the root of ++ * the btree, we need to release this lock if we have it held. 
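A hedged sketch of how a caller might use this open coded mutex - a hypothetical wrapper, not part of the patch; the real callers live elsewhere in bcachefs and only rely on the two functions defined just below plus bch2_btree_node_mem_alloc():

static struct btree *alloc_node_persistent(struct bch_fs *c, struct closure *cl)
{
	struct btree *b;
	int ret;

	/* First try the normal reserve / freeable lists: */
	b = bch2_btree_node_mem_alloc(c);
	if (!IS_ERR(b))
		return b;

	/* Out of memory: become the (single) cannibalizing thread. */
	ret = bch2_btree_cache_cannibalize_lock(c, cl);
	if (ret)
		return ERR_PTR(ret);	/* -EAGAIN: wait on @cl, then retry */

	/* With alloc_lock == current, mem_alloc may now evict other cached nodes: */
	b = bch2_btree_node_mem_alloc(c);

	/* The open coded mutex must be dropped before going back to the btree: */
	bch2_btree_cache_cannibalize_unlock(c);
	return b;
}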
++ */ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ ++ if (bc->alloc_lock == current) { ++ trace_btree_node_cannibalize_unlock(c); ++ bc->alloc_lock = NULL; ++ closure_wake_up(&bc->alloc_wait); ++ } ++} ++ ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct task_struct *old; ++ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) ++ goto success; ++ ++ if (!cl) { ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -ENOMEM; ++ } ++ ++ closure_wait(&bc->alloc_wait, cl); ++ ++ /* Try again, after adding ourselves to waitlist */ ++ old = cmpxchg(&bc->alloc_lock, NULL, current); ++ if (old == NULL || old == current) { ++ /* We raced */ ++ closure_wake_up(&bc->alloc_wait); ++ goto success; ++ } ++ ++ trace_btree_node_cannibalize_lock_fail(c); ++ return -EAGAIN; ++ ++success: ++ trace_btree_node_cannibalize_lock(c); ++ return 0; ++} ++ ++static struct btree *btree_node_cannibalize(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_reclaim(c, b)) ++ return b; ++ ++ while (1) { ++ list_for_each_entry_reverse(b, &bc->live, list) ++ if (!btree_node_write_and_reclaim(c, b)) ++ return b; ++ ++ /* ++ * Rare case: all nodes were intent-locked. ++ * Just busy-wait. ++ */ ++ WARN_ONCE(1, "btree cache cannibalize failed\n"); ++ cond_resched(); ++ } ++} ++ ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *c) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ u64 start_time = local_clock(); ++ unsigned flags; ++ ++ flags = memalloc_nofs_save(); ++ mutex_lock(&bc->lock); ++ ++ /* ++ * btree_free() doesn't free memory; it sticks the node on the end of ++ * the list. Check if there's any freed nodes there: ++ */ ++ list_for_each_entry(b, &bc->freeable, list) ++ if (!btree_node_reclaim(c, b)) ++ goto got_node; ++ ++ /* ++ * We never free struct btree itself, just the memory that holds the on ++ * disk node. 
Check the freed list before allocating a new one: ++ */ ++ list_for_each_entry(b, &bc->freed, list) ++ if (!btree_node_reclaim(c, b)) ++ goto got_node; ++ ++ b = NULL; ++got_node: ++ if (b) ++ list_del_init(&b->list); ++ mutex_unlock(&bc->lock); ++ ++ if (!b) { ++ b = __btree_node_mem_alloc(c); ++ if (!b) ++ goto err; ++ ++ BUG_ON(!six_trylock_intent(&b->c.lock)); ++ BUG_ON(!six_trylock_write(&b->c.lock)); ++ } ++ ++ if (!b->data) { ++ if (btree_node_data_alloc(c, b, __GFP_NOWARN|GFP_KERNEL)) ++ goto err; ++ ++ mutex_lock(&bc->lock); ++ bc->used++; ++ mutex_unlock(&bc->lock); ++ } ++ ++ BUG_ON(btree_node_hashed(b)); ++ BUG_ON(btree_node_write_in_flight(b)); ++out: ++ b->flags = 0; ++ b->written = 0; ++ b->nsets = 0; ++ b->sib_u64s[0] = 0; ++ b->sib_u64s[1] = 0; ++ b->whiteout_u64s = 0; ++ bch2_btree_keys_init(b, &c->expensive_debug_checks); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], ++ start_time); ++ ++ memalloc_nofs_restore(flags); ++ return b; ++err: ++ mutex_lock(&bc->lock); ++ ++ if (b) { ++ list_add(&b->list, &bc->freed); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ /* Try to cannibalize another cached btree node: */ ++ if (bc->alloc_lock == current) { ++ b = btree_node_cannibalize(c); ++ list_del_init(&b->list); ++ mutex_unlock(&bc->lock); ++ ++ bch2_btree_node_hash_remove(bc, b); ++ ++ trace_btree_node_cannibalize(c); ++ goto out; ++ } ++ ++ mutex_unlock(&bc->lock); ++ memalloc_nofs_restore(flags); ++ return ERR_PTR(-ENOMEM); ++} ++ ++/* Slowpath, don't want it inlined into btree_iter_traverse() */ ++static noinline struct btree *bch2_btree_node_fill(struct bch_fs *c, ++ struct btree_iter *iter, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level, ++ enum six_lock_type lock_type, ++ bool sync) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ BUG_ON(level + 1 >= BTREE_MAX_DEPTH); ++ /* ++ * Parent node must be locked, else we could read in a btree node that's ++ * been freed: ++ */ ++ if (iter && !bch2_btree_node_relock(iter, level + 1)) ++ return ERR_PTR(-EINTR); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ if (IS_ERR(b)) ++ return b; ++ ++ bkey_copy(&b->key, k); ++ if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { ++ /* raced with another fill: */ ++ ++ /* mark as unhashed... */ ++ b->hash_val = 0; ++ ++ mutex_lock(&bc->lock); ++ list_add(&b->list, &bc->freeable); ++ mutex_unlock(&bc->lock); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ return NULL; ++ } ++ ++ /* ++ * Unlock before doing IO: ++ * ++ * XXX: ideally should be dropping all btree node locks here ++ */ ++ if (iter && btree_node_read_locked(iter, level + 1)) ++ btree_node_unlock(iter, level + 1); ++ ++ bch2_btree_node_read(c, b, sync); ++ ++ six_unlock_write(&b->c.lock); ++ ++ if (!sync) { ++ six_unlock_intent(&b->c.lock); ++ return NULL; ++ } ++ ++ if (lock_type == SIX_LOCK_read) ++ six_lock_downgrade(&b->c.lock); ++ ++ return b; ++} ++ ++static int lock_node_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, c.lock); ++ const struct bkey_i *k = p; ++ ++ return b->hash_val == btree_ptr_hash_val(k) ? 0 : -1; ++} ++ ++/** ++ * bch_btree_node_get - find a btree node in the cache and lock it, reading it ++ * in from disk if necessary. ++ * ++ * If IO is necessary and running under generic_make_request, returns -EAGAIN. ++ * ++ * The btree node will have either a read or a write lock held, depending on ++ * the @write parameter. 
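A hedged example of a caller honouring the locking and error-pointer conventions described in the comment above (a hypothetical helper; the real callers are in the btree iterator code):

static int walk_child(struct bch_fs *c, struct btree_iter *iter,
		      const struct bkey_i *k, unsigned level)
{
	struct btree *b = bch2_btree_node_get(c, iter, k, level, SIX_LOCK_read);

	if (IS_ERR(b))
		return PTR_ERR(b);	/* e.g. -EINTR (restart) or -EIO (read error) */

	/* ... inspect b->data under the read lock ... */

	six_unlock_read(&b->c.lock);
	return 0;
}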
++ */ ++struct btree *bch2_btree_node_get(struct bch_fs *c, struct btree_iter *iter, ++ const struct bkey_i *k, unsigned level, ++ enum six_lock_type lock_type) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ /* ++ * We must have the parent locked to call bch2_btree_node_fill(), ++ * else we could read in a btree node from disk that's been ++ * freed: ++ */ ++ b = bch2_btree_node_fill(c, iter, k, iter->btree_id, ++ level, lock_type, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ return b; ++ } else { ++lock_node: ++ /* ++ * There's a potential deadlock with splits and insertions into ++ * interior nodes we have to avoid: ++ * ++ * The other thread might be holding an intent lock on the node ++ * we want, and they want to update its parent node so they're ++ * going to upgrade their intent lock on the parent node to a ++ * write lock. ++ * ++ * But if we're holding a read lock on the parent, and we're ++ * trying to get the intent lock they're holding, we deadlock. ++ * ++ * So to avoid this we drop the read locks on parent nodes when ++ * we're starting to take intent locks - and handle the race. ++ * ++ * The race is that they might be about to free the node we ++ * want, and dropping our read lock on the parent node lets them ++ * update the parent marking the node we want as freed, and then ++ * free it: ++ * ++ * To guard against this, btree nodes are evicted from the cache ++ * when they're freed - and b->hash_val is zeroed out, which we ++ * check for after we lock the node. 
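The guard described above is the usual "lookup without a lock, then revalidate after locking" idiom. A stripped-down model with hypothetical names - hash_lookup() and struct cached_obj are inventions for illustration, not bcachefs API:

struct cached_obj {
	u64		key;	/* zeroed when the object is freed/recycled */
	struct six_lock	lock;
};

static struct cached_obj *get_validated(u64 want)
{
	struct cached_obj *o;
retry:
	o = hash_lookup(want);		/* may race with eviction */
	if (!o)
		return NULL;

	six_lock_read(&o->lock, NULL, NULL);

	/* The object may have been freed and reused while we waited for the lock: */
	if (o->key != want) {
		six_unlock_read(&o->lock);
		goto retry;
	}
	return o;
}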
++ * ++ * Then, bch2_btree_node_relock() on the parent will fail - because ++ * the parent was modified, when the pointer to the node we want ++ * was removed - and we'll bail out: ++ */ ++ if (btree_node_read_locked(iter, level + 1)) ++ btree_node_unlock(iter, level + 1); ++ ++ if (!btree_node_lock(b, k->k.p, level, iter, lock_type, ++ lock_node_check_fn, (void *) k)) { ++ if (b->hash_val != btree_ptr_hash_val(k)) ++ goto retry; ++ return ERR_PTR(-EINTR); ++ } ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->c.level != level || ++ race_fault())) { ++ six_unlock_type(&b->c.lock, lock_type); ++ if (bch2_btree_node_relock(iter, level + 1)) ++ goto retry; ++ ++ trace_trans_restart_btree_node_reused(iter->trans->ip); ++ return ERR_PTR(-EINTR); ++ } ++ } ++ ++ /* XXX: waiting on IO with btree locks held: */ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_type(&b->c.lock, lock_type); ++ return ERR_PTR(-EIO); ++ } ++ ++ EBUG_ON(b->c.btree_id != iter->btree_id || ++ BTREE_NODE_LEVEL(b->data) != level || ++ bkey_cmp(b->data->max_key, k->k.p)); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *c, ++ const struct bkey_i *k, ++ enum btree_id btree_id, ++ unsigned level) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ struct bset_tree *t; ++ int ret; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_node_mem_ptr(k); ++ if (b) ++ goto lock_node; ++retry: ++ b = btree_cache_find(bc, k); ++ if (unlikely(!b)) { ++ b = bch2_btree_node_fill(c, NULL, k, btree_id, ++ level, SIX_LOCK_read, true); ++ ++ /* We raced and found the btree node in the cache */ ++ if (!b) ++ goto retry; ++ ++ if (IS_ERR(b)) ++ return b; ++ } else { ++lock_node: ++ ret = six_lock_read(&b->c.lock, lock_node_check_fn, (void *) k); ++ if (ret) ++ goto retry; ++ ++ if (unlikely(b->hash_val != btree_ptr_hash_val(k) || ++ b->c.btree_id != btree_id || ++ b->c.level != level)) { ++ six_unlock_read(&b->c.lock); ++ goto retry; ++ } ++ } ++ ++ /* XXX: waiting on IO with btree locks held: */ ++ wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, ++ TASK_UNINTERRUPTIBLE); ++ ++ prefetch(b->aux_data); ++ ++ for_each_bset(b, t) { ++ void *p = (u64 *) b->aux_data + t->aux_data_offset; ++ ++ prefetch(p + L1_CACHE_BYTES * 0); ++ prefetch(p + L1_CACHE_BYTES * 1); ++ prefetch(p + L1_CACHE_BYTES * 2); ++ } ++ ++ /* avoid atomic set bit if it's not needed: */ ++ if (!btree_node_accessed(b)) ++ set_btree_node_accessed(b); ++ ++ if (unlikely(btree_node_read_error(b))) { ++ six_unlock_read(&b->c.lock); ++ return ERR_PTR(-EIO); ++ } ++ ++ EBUG_ON(b->c.btree_id != btree_id || ++ BTREE_NODE_LEVEL(b->data) != level || ++ bkey_cmp(b->data->max_key, k->k.p)); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_node_get_sibling(struct bch_fs *c, ++ struct btree_iter *iter, ++ struct btree *b, ++ enum btree_node_sibling sib) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *parent; ++ struct btree_node_iter node_iter; ++ struct bkey_packed *k; ++ BKEY_PADDED(k) tmp; ++ struct btree *ret = NULL; ++ unsigned level = b->c.level; ++ ++ parent = 
btree_iter_node(iter, level + 1); ++ if (!parent) ++ return NULL; ++ ++ /* ++ * There's a corner case where a btree_iter might have a node locked ++ * that is just outside its current pos - when ++ * bch2_btree_iter_set_pos_same_leaf() gets to the end of the node. ++ * ++ * But the lock ordering checks in __bch2_btree_node_lock() go off of ++ * iter->pos, not the node's key: so if the iterator is marked as ++ * needing to be traversed, we risk deadlock if we don't bail out here: ++ */ ++ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) ++ return ERR_PTR(-EINTR); ++ ++ if (!bch2_btree_node_relock(iter, level + 1)) { ++ ret = ERR_PTR(-EINTR); ++ goto out; ++ } ++ ++ node_iter = iter->l[parent->c.level].iter; ++ ++ k = bch2_btree_node_iter_peek_all(&node_iter, parent); ++ BUG_ON(bkey_cmp_left_packed(parent, k, &b->key.k.p)); ++ ++ k = sib == btree_prev_sib ++ ? bch2_btree_node_iter_prev(&node_iter, parent) ++ : (bch2_btree_node_iter_advance(&node_iter, parent), ++ bch2_btree_node_iter_peek(&node_iter, parent)); ++ if (!k) ++ goto out; ++ ++ bch2_bkey_unpack(parent, &tmp.k, k); ++ ++ ret = bch2_btree_node_get(c, iter, &tmp.k, level, ++ SIX_LOCK_intent); ++ ++ if (PTR_ERR_OR_ZERO(ret) == -EINTR && !trans->nounlock) { ++ struct btree_iter *linked; ++ ++ if (!bch2_btree_node_relock(iter, level + 1)) ++ goto out; ++ ++ /* ++ * We might have got -EINTR because trylock failed, and we're ++ * holding other locks that would cause us to deadlock: ++ */ ++ trans_for_each_iter(trans, linked) ++ if (btree_iter_cmp(iter, linked) < 0) ++ __bch2_btree_iter_unlock(linked); ++ ++ if (sib == btree_prev_sib) ++ btree_node_unlock(iter, level); ++ ++ ret = bch2_btree_node_get(c, iter, &tmp.k, level, ++ SIX_LOCK_intent); ++ ++ /* ++ * before btree_iter_relock() calls btree_iter_verify_locks(): ++ */ ++ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, level + 1); ++ ++ if (!bch2_btree_node_relock(iter, level)) { ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ if (!IS_ERR(ret)) { ++ six_unlock_intent(&ret->c.lock); ++ ret = ERR_PTR(-EINTR); ++ } ++ } ++ ++ bch2_trans_relock(trans); ++ } ++out: ++ if (btree_lock_want(iter, level + 1) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, level + 1); ++ ++ if (PTR_ERR_OR_ZERO(ret) == -EINTR) ++ bch2_btree_iter_upgrade(iter, level + 2); ++ ++ BUG_ON(!IS_ERR(ret) && !btree_node_locked(iter, level)); ++ ++ if (!IS_ERR_OR_NULL(ret)) { ++ struct btree *n1 = ret, *n2 = b; ++ ++ if (sib != btree_prev_sib) ++ swap(n1, n2); ++ ++ BUG_ON(bkey_cmp(bkey_successor(n1->key.k.p), ++ n2->data->min_key)); ++ } ++ ++ bch2_btree_trans_verify_locks(trans); ++ ++ return ret; ++} ++ ++void bch2_btree_node_prefetch(struct bch_fs *c, struct btree_iter *iter, ++ const struct bkey_i *k, unsigned level) ++{ ++ struct btree_cache *bc = &c->btree_cache; ++ struct btree *b; ++ ++ BUG_ON(!btree_node_locked(iter, level + 1)); ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ b = btree_cache_find(bc, k); ++ if (b) ++ return; ++ ++ bch2_btree_node_fill(c, iter, k, iter->btree_id, ++ level, SIX_LOCK_read, false); ++} ++ ++void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, ++ struct btree *b) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bset_stats stats; ++ ++ memset(&stats, 0, sizeof(stats)); ++ ++ bch2_btree_keys_stats(b, &stats); ++ ++ pr_buf(out, ++ "l %u %llu:%llu - %llu:%llu:\n" ++ " ptrs: ", ++ b->c.level, ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ b->data->max_key.inode, ++ b->data->max_key.offset); ++ 
bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ pr_buf(out, "\n" ++ " format: u64s %u fields %u %u %u %u %u\n" ++ " unpack fn len: %u\n" ++ " bytes used %zu/%zu (%zu%% full)\n" ++ " sib u64s: %u, %u (merge threshold %zu)\n" ++ " nr packed keys %u\n" ++ " nr unpacked keys %u\n" ++ " floats %zu\n" ++ " failed unpacked %zu\n", ++ f->key_u64s, ++ f->bits_per_field[0], ++ f->bits_per_field[1], ++ f->bits_per_field[2], ++ f->bits_per_field[3], ++ f->bits_per_field[4], ++ b->unpack_fn_len, ++ b->nr.live_u64s * sizeof(u64), ++ btree_bytes(c) - sizeof(struct btree_node), ++ b->nr.live_u64s * 100 / btree_max_u64s(c), ++ b->sib_u64s[0], ++ b->sib_u64s[1], ++ BTREE_FOREGROUND_MERGE_THRESHOLD(c), ++ b->nr.packed_keys, ++ b->nr.unpacked_keys, ++ stats.floats, ++ stats.failed); ++} +diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h +new file mode 100644 +index 000000000000..d0d3a85bb8be +--- /dev/null ++++ b/fs/bcachefs/btree_cache.h +@@ -0,0 +1,104 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_CACHE_H ++#define _BCACHEFS_BTREE_CACHE_H ++ ++#include "bcachefs.h" ++#include "btree_types.h" ++ ++struct btree_iter; ++ ++extern const char * const bch2_btree_ids[]; ++ ++void bch2_recalc_btree_reserve(struct bch_fs *); ++ ++void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); ++int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); ++int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, ++ unsigned, enum btree_id); ++ ++void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); ++int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); ++ ++struct btree *bch2_btree_node_mem_alloc(struct bch_fs *); ++ ++struct btree *bch2_btree_node_get(struct bch_fs *, struct btree_iter *, ++ const struct bkey_i *, unsigned, ++ enum six_lock_type); ++ ++struct btree *bch2_btree_node_get_noiter(struct bch_fs *, const struct bkey_i *, ++ enum btree_id, unsigned); ++ ++struct btree *bch2_btree_node_get_sibling(struct bch_fs *, struct btree_iter *, ++ struct btree *, enum btree_node_sibling); ++ ++void bch2_btree_node_prefetch(struct bch_fs *, struct btree_iter *, ++ const struct bkey_i *, unsigned); ++ ++void bch2_fs_btree_cache_exit(struct bch_fs *); ++int bch2_fs_btree_cache_init(struct bch_fs *); ++void bch2_fs_btree_cache_init_early(struct btree_cache *); ++ ++static inline u64 btree_ptr_hash_val(const struct bkey_i *k) ++{ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); ++ case KEY_TYPE_btree_ptr_v2: ++ return bkey_i_to_btree_ptr_v2_c(k)->v.seq; ++ default: ++ return 0; ++ } ++} ++ ++static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) ++{ ++ return k->k.type == KEY_TYPE_btree_ptr_v2 ++ ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr ++ : NULL; ++} ++ ++/* is btree node in hash table? 
*/ ++static inline bool btree_node_hashed(struct btree *b) ++{ ++ return b->hash_val != 0; ++} ++ ++#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ ++ for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ ++ &(_c)->btree_cache.table), \ ++ _iter = 0; _iter < (_tbl)->size; _iter++) \ ++ rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) ++ ++static inline size_t btree_bytes(struct bch_fs *c) ++{ ++ return c->opts.btree_node_size << 9; ++} ++ ++static inline size_t btree_max_u64s(struct bch_fs *c) ++{ ++ return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); ++} ++ ++static inline size_t btree_pages(struct bch_fs *c) ++{ ++ return btree_bytes(c) / PAGE_SIZE; ++} ++ ++static inline unsigned btree_blocks(struct bch_fs *c) ++{ ++ return c->opts.btree_node_size >> c->block_bits; ++} ++ ++#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) ++ ++#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) ++#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ ++ (BTREE_FOREGROUND_MERGE_THRESHOLD(c) << 2)) ++ ++#define btree_node_root(_c, _b) ((_c)->btree_roots[(_b)->c.btree_id].b) ++ ++void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, ++ struct btree *); ++ ++#endif /* _BCACHEFS_BTREE_CACHE_H */ +diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c +new file mode 100644 +index 000000000000..e8c1e752a25d +--- /dev/null ++++ b/fs/bcachefs/btree_gc.c +@@ -0,0 +1,1438 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * Copyright (C) 2014 Datera Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "bkey_on_stack.h" ++#include "btree_locking.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "debug.h" ++#include "ec.h" ++#include "error.h" ++#include "extents.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ preempt_disable(); ++ write_seqcount_begin(&c->gc_pos_lock); ++ c->gc_pos = new_pos; ++ write_seqcount_end(&c->gc_pos_lock); ++ preempt_enable(); ++} ++ ++static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) ++{ ++ BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); ++ __gc_pos_set(c, new_pos); ++} ++ ++static int bch2_gc_check_topology(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bpos *expected_start, ++ struct bpos expected_end, ++ bool is_last) ++{ ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ if (fsck_err_on(bkey_cmp(*expected_start, bp.v->min_key), c, ++ "btree node with incorrect min_key: got %llu:%llu, should be %llu:%llu", ++ bp.v->min_key.inode, ++ bp.v->min_key.offset, ++ expected_start->inode, ++ expected_start->offset)) { ++ BUG(); ++ } ++ } ++ ++ *expected_start = bkey_cmp(k.k->p, POS_MAX) ++ ? 
bkey_successor(k.k->p) ++ : k.k->p; ++ ++ if (fsck_err_on(is_last && ++ bkey_cmp(k.k->p, expected_end), c, ++ "btree node with incorrect max_key: got %llu:%llu, should be %llu:%llu", ++ k.k->p.inode, ++ k.k->p.offset, ++ expected_end.inode, ++ expected_end.offset)) { ++ BUG(); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* marking of btree keys/nodes: */ ++ ++static int bch2_gc_mark_key(struct bch_fs *c, struct bkey_s_c k, ++ u8 *max_stale, bool initial) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ unsigned flags = ++ BTREE_TRIGGER_GC| ++ (initial ? BTREE_TRIGGER_NOATOMIC : 0); ++ int ret = 0; ++ ++ if (initial) { ++ BUG_ON(journal_seq_verify(c) && ++ k.k->version.lo > journal_cur_seq(&c->journal)); ++ ++ /* XXX change to fsck check */ ++ if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c, ++ "key version number higher than recorded: %llu > %llu", ++ k.k->version.lo, ++ atomic64_read(&c->key_version))) ++ atomic64_set(&c->key_version, k.k->version.lo); ++ ++ if (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err_on(!bch2_bkey_replicas_marked(c, k), c, ++ "superblock not marked as containing replicas (type %u)", ++ k.k->type)) { ++ ret = bch2_mark_bkey_replicas(c, k); ++ if (ret) ++ return ret; ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ struct bucket *g2 = PTR_BUCKET(ca, ptr, false); ++ ++ if (mustfix_fsck_err_on(!g->gen_valid, c, ++ "bucket %u:%zu data type %s ptr gen %u missing in alloc btree", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k.k, ptr)], ++ ptr->gen)) { ++ g2->_mark.gen = g->_mark.gen = ptr->gen; ++ g2->gen_valid = g->gen_valid = true; ++ } ++ ++ if (mustfix_fsck_err_on(gen_cmp(ptr->gen, g->mark.gen) > 0, c, ++ "bucket %u:%zu data type %s ptr gen in the future: %u > %u", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), ++ bch2_data_types[ptr_data_type(k.k, ptr)], ++ ptr->gen, g->mark.gen)) { ++ g2->_mark.gen = g->_mark.gen = ptr->gen; ++ g2->gen_valid = g->gen_valid = true; ++ g2->_mark.data_type = 0; ++ g2->_mark.dirty_sectors = 0; ++ g2->_mark.cached_sectors = 0; ++ set_bit(BCH_FS_FIXED_GENS, &c->flags); ++ } ++ } ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, true); ++ ++ if (gen_after(g->oldest_gen, ptr->gen)) ++ g->oldest_gen = ptr->gen; ++ ++ *max_stale = max(*max_stale, ptr_stale(ca, ptr)); ++ } ++ ++ bch2_mark_key(c, k, 0, k.k->size, NULL, 0, flags); ++fsck_err: ++ return ret; ++} ++ ++static int btree_gc_mark_node(struct bch_fs *c, struct btree *b, u8 *max_stale, ++ bool initial) ++{ ++ struct bpos next_node_start = b->data->min_key; ++ struct btree_node_iter iter; ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ *max_stale = 0; ++ ++ if (!btree_node_type_needs_gc(btree_node_type(b))) ++ return 0; ++ ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ ++ while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { ++ bch2_bkey_debugcheck(c, b, k); ++ ++ ret = bch2_gc_mark_key(c, k, max_stale, initial); ++ if (ret) ++ break; ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (b->c.level) { ++ ret = bch2_gc_check_topology(c, k, ++ &next_node_start, ++ b->data->max_key, ++ bch2_btree_node_iter_end(&iter)); ++ if (ret) ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static int bch2_gc_btree(struct bch_fs *c, enum btree_id btree_id, ++ bool initial, 
bool metadata_only) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned depth = metadata_only ? 1 ++ : expensive_debug_checks(c) ? 0 ++ : !btree_node_type_needs_gc(btree_id) ? 1 ++ : 0; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); ++ ++ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, ++ 0, depth, BTREE_ITER_PREFETCH, b) { ++ bch2_verify_btree_nr_keys(b); ++ ++ gc_pos_set(c, gc_pos_btree_node(b)); ++ ++ ret = btree_gc_mark_node(c, b, &max_stale, initial); ++ if (ret) ++ break; ++ ++ if (!initial) { ++ if (max_stale > 64) ++ bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_GC_LOCK_HELD); ++ else if (!btree_gc_rewrite_disabled(c) && ++ (btree_gc_always_rewrite(c) || max_stale > 16)) ++ bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, ++ BTREE_INSERT_NOWAIT| ++ BTREE_INSERT_GC_LOCK_HELD); ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->btree_root_lock); ++ b = c->btree_roots[btree_id].b; ++ if (!btree_node_fake(b)) ++ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), ++ &max_stale, initial); ++ gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); ++ mutex_unlock(&c->btree_root_lock); ++ ++ return ret; ++} ++ ++static int bch2_gc_btree_init_recurse(struct bch_fs *c, struct btree *b, ++ struct journal_keys *journal_keys, ++ unsigned target_depth) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ struct bpos next_node_start = b->data->min_key; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ bch2_bkey_debugcheck(c, b, k); ++ ++ BUG_ON(bkey_cmp(k.k->p, b->data->min_key) < 0); ++ BUG_ON(bkey_cmp(k.k->p, b->data->max_key) > 0); ++ ++ ret = bch2_gc_mark_key(c, k, &max_stale, true); ++ if (ret) ++ break; ++ ++ if (b->c.level) { ++ struct btree *child; ++ BKEY_PADDED(k) tmp; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ ret = bch2_gc_check_topology(c, k, ++ &next_node_start, ++ b->data->max_key, ++ !bch2_btree_and_journal_iter_peek(&iter).k); ++ if (ret) ++ break; ++ ++ if (b->c.level > target_depth) { ++ child = bch2_btree_node_get_noiter(c, &tmp.k, ++ b->c.btree_id, b->c.level - 1); ++ ret = PTR_ERR_OR_ZERO(child); ++ if (ret) ++ break; ++ ++ ret = bch2_gc_btree_init_recurse(c, child, ++ journal_keys, target_depth); ++ six_unlock_read(&child->c.lock); ++ ++ if (ret) ++ break; ++ } ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ } ++ ++ return ret; ++} ++ ++static int bch2_gc_btree_init(struct bch_fs *c, ++ struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ bool metadata_only) ++{ ++ struct btree *b; ++ unsigned target_depth = metadata_only ? 1 ++ : expensive_debug_checks(c) ? 0 ++ : !btree_node_type_needs_gc(btree_id) ? 
1 ++ : 0; ++ u8 max_stale = 0; ++ int ret = 0; ++ ++ b = c->btree_roots[btree_id].b; ++ ++ if (btree_node_fake(b)) ++ return 0; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ if (fsck_err_on(bkey_cmp(b->data->min_key, POS_MIN), c, ++ "btree root with incorrect min_key: %llu:%llu", ++ b->data->min_key.inode, ++ b->data->min_key.offset)) { ++ BUG(); ++ } ++ ++ if (fsck_err_on(bkey_cmp(b->data->max_key, POS_MAX), c, ++ "btree root with incorrect min_key: %llu:%llu", ++ b->data->max_key.inode, ++ b->data->max_key.offset)) { ++ BUG(); ++ } ++ ++ if (b->c.level >= target_depth) ++ ret = bch2_gc_btree_init_recurse(c, b, ++ journal_keys, target_depth); ++ ++ if (!ret) ++ ret = bch2_gc_mark_key(c, bkey_i_to_s_c(&b->key), ++ &max_stale, true); ++fsck_err: ++ six_unlock_read(&b->c.lock); ++ ++ return ret; ++} ++ ++static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) ++{ ++ return (int) btree_id_to_gc_phase(l) - ++ (int) btree_id_to_gc_phase(r); ++} ++ ++static int bch2_gc_btrees(struct bch_fs *c, struct journal_keys *journal_keys, ++ bool initial, bool metadata_only) ++{ ++ enum btree_id ids[BTREE_ID_NR]; ++ unsigned i; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ ids[i] = i; ++ bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ enum btree_id id = ids[i]; ++ int ret = initial ++ ? bch2_gc_btree_init(c, journal_keys, ++ id, metadata_only) ++ : bch2_gc_btree(c, id, initial, metadata_only); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, ++ u64 start, u64 end, ++ enum bch_data_type type, ++ unsigned flags) ++{ ++ u64 b = sector_to_bucket(ca, start); ++ ++ do { ++ unsigned sectors = ++ min_t(u64, bucket_to_sector(ca, b + 1), end) - start; ++ ++ bch2_mark_metadata_bucket(c, ca, b, type, sectors, ++ gc_phase(GC_PHASE_SB), flags); ++ b++; ++ start += sectors; ++ } while (start < end); ++} ++ ++void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, ++ unsigned flags) ++{ ++ struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; ++ unsigned i; ++ u64 b; ++ ++ /* ++ * This conditional is kind of gross, but we may be called from the ++ * device add path, before the new device has actually been added to the ++ * running filesystem: ++ */ ++ if (c) { ++ lockdep_assert_held(&c->sb_lock); ++ percpu_down_read(&c->mark_lock); ++ } ++ ++ for (i = 0; i < layout->nr_superblocks; i++) { ++ u64 offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset == BCH_SB_SECTOR) ++ mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, ++ BCH_DATA_sb, flags); ++ ++ mark_metadata_sectors(c, ca, offset, ++ offset + (1 << layout->sb_max_size_bits), ++ BCH_DATA_sb, flags); ++ } ++ ++ for (i = 0; i < ca->journal.nr; i++) { ++ b = ca->journal.buckets[i]; ++ bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), flags); ++ } ++ ++ if (c) ++ percpu_up_read(&c->mark_lock); ++} ++ ++static void bch2_mark_superblocks(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&c->sb_lock); ++ gc_pos_set(c, gc_phase(GC_PHASE_SB)); ++ ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); ++ mutex_unlock(&c->sb_lock); ++} ++ ++#if 0 ++/* Also see bch2_pending_btree_node_free_insert_done() */ ++static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) ++{ ++ struct btree_update *as; ++ struct pending_btree_node_free *d; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ gc_pos_set(c, 
gc_phase(GC_PHASE_PENDING_DELETE)); ++ ++ for_each_pending_btree_node_free(c, as, d) ++ if (d->index_update_done) ++ bch2_mark_key(c, bkey_i_to_s_c(&d->key), ++ 0, 0, NULL, 0, ++ BTREE_TRIGGER_GC); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++#endif ++ ++static void bch2_mark_allocator_buckets(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct open_bucket *ob; ++ size_t i, j, iter; ++ unsigned ci; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ spin_lock(&c->freelist_lock); ++ gc_pos_set(c, gc_pos_alloc(c, NULL)); ++ ++ for_each_member_device(ca, c, ci) { ++ fifo_for_each_entry(i, &ca->free_inc, iter) ++ bch2_mark_alloc_bucket(c, ca, i, true, ++ gc_pos_alloc(c, NULL), ++ BTREE_TRIGGER_GC); ++ ++ ++ ++ for (j = 0; j < RESERVE_NR; j++) ++ fifo_for_each_entry(i, &ca->free[j], iter) ++ bch2_mark_alloc_bucket(c, ca, i, true, ++ gc_pos_alloc(c, NULL), ++ BTREE_TRIGGER_GC); ++ } ++ ++ spin_unlock(&c->freelist_lock); ++ ++ for (ob = c->open_buckets; ++ ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ++ ob++) { ++ spin_lock(&ob->lock); ++ if (ob->valid) { ++ gc_pos_set(c, gc_pos_alloc(c, ob)); ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ bch2_mark_alloc_bucket(c, ca, PTR_BUCKET_NR(ca, &ob->ptr), true, ++ gc_pos_alloc(c, ob), ++ BTREE_TRIGGER_GC); ++ } ++ spin_unlock(&ob->lock); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++} ++ ++static void bch2_gc_free(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ genradix_free(&c->stripes[1]); ++ ++ for_each_member_device(ca, c, i) { ++ kvpfree(rcu_dereference_protected(ca->buckets[1], 1), ++ sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket)); ++ ca->buckets[1] = NULL; ++ ++ free_percpu(ca->usage[1]); ++ ca->usage[1] = NULL; ++ } ++ ++ free_percpu(c->usage_gc); ++ c->usage_gc = NULL; ++} ++ ++static int bch2_gc_done(struct bch_fs *c, ++ bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ bool verify = !metadata_only && ++ (!initial || ++ (c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO))); ++ unsigned i; ++ int ret = 0; ++ ++#define copy_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f) { \ ++ if (verify) \ ++ fsck_err(c, _msg ": got %llu, should be %llu" \ ++ , ##__VA_ARGS__, dst->_f, src->_f); \ ++ dst->_f = src->_f; \ ++ ret = 1; \ ++ } ++#define copy_stripe_field(_f, _msg, ...) \ ++ if (dst->_f != src->_f) { \ ++ if (verify) \ ++ fsck_err(c, "stripe %zu has wrong "_msg \ ++ ": got %u, should be %u", \ ++ dst_iter.pos, ##__VA_ARGS__, \ ++ dst->_f, src->_f); \ ++ dst->_f = src->_f; \ ++ dst->dirty = true; \ ++ ret = 1; \ ++ } ++#define copy_bucket_field(_f) \ ++ if (dst->b[b].mark._f != src->b[b].mark._f) { \ ++ if (verify) \ ++ fsck_err(c, "bucket %u:%zu gen %u data type %s has wrong " #_f \ ++ ": got %u, should be %u", i, b, \ ++ dst->b[b].mark.gen, \ ++ bch2_data_types[dst->b[b].mark.data_type],\ ++ dst->b[b].mark._f, src->b[b].mark._f); \ ++ dst->b[b]._mark._f = src->b[b].mark._f; \ ++ ret = 1; \ ++ } ++#define copy_dev_field(_f, _msg, ...) \ ++ copy_field(_f, "dev %u has wrong " _msg, i, ##__VA_ARGS__) ++#define copy_fs_field(_f, _msg, ...) 
\ ++ copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) ++ ++ if (!metadata_only) { ++ struct genradix_iter dst_iter = genradix_iter_init(&c->stripes[0], 0); ++ struct genradix_iter src_iter = genradix_iter_init(&c->stripes[1], 0); ++ struct stripe *dst, *src; ++ unsigned i; ++ ++ c->ec_stripes_heap.used = 0; ++ ++ while ((dst = genradix_iter_peek(&dst_iter, &c->stripes[0])) && ++ (src = genradix_iter_peek(&src_iter, &c->stripes[1]))) { ++ BUG_ON(src_iter.pos != dst_iter.pos); ++ ++ copy_stripe_field(alive, "alive"); ++ copy_stripe_field(sectors, "sectors"); ++ copy_stripe_field(algorithm, "algorithm"); ++ copy_stripe_field(nr_blocks, "nr_blocks"); ++ copy_stripe_field(nr_redundant, "nr_redundant"); ++ copy_stripe_field(blocks_nonempty, ++ "blocks_nonempty"); ++ ++ for (i = 0; i < ARRAY_SIZE(dst->block_sectors); i++) ++ copy_stripe_field(block_sectors[i], ++ "block_sectors[%u]", i); ++ ++ if (dst->alive) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_insert(c, dst, dst_iter.pos); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ ++ genradix_iter_advance(&dst_iter, &c->stripes[0]); ++ genradix_iter_advance(&src_iter, &c->stripes[1]); ++ } ++ } ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *dst = __bucket_array(ca, 0); ++ struct bucket_array *src = __bucket_array(ca, 1); ++ size_t b; ++ ++ for (b = 0; b < src->nbuckets; b++) { ++ copy_bucket_field(gen); ++ copy_bucket_field(data_type); ++ copy_bucket_field(owned_by_allocator); ++ copy_bucket_field(stripe); ++ copy_bucket_field(dirty_sectors); ++ copy_bucket_field(cached_sectors); ++ ++ dst->b[b].oldest_gen = src->b[b].oldest_gen; ++ } ++ }; ++ ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ ++ bch2_dev_usage_from_buckets(c); ++ ++ { ++ unsigned nr = fs_usage_u64s(c); ++ struct bch_fs_usage *dst = c->usage_base; ++ struct bch_fs_usage *src = (void *) ++ bch2_acc_percpu_u64s((void *) c->usage_gc, nr); ++ ++ copy_fs_field(hidden, "hidden"); ++ copy_fs_field(btree, "btree"); ++ ++ if (!metadata_only) { ++ copy_fs_field(data, "data"); ++ copy_fs_field(cached, "cached"); ++ copy_fs_field(reserved, "reserved"); ++ copy_fs_field(nr_inodes,"nr_inodes"); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ copy_fs_field(persistent_reserved[i], ++ "persistent_reserved[%i]", i); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ char buf[80]; ++ ++ if (metadata_only && ++ (e->data_type == BCH_DATA_user || ++ e->data_type == BCH_DATA_cached)) ++ continue; ++ ++ bch2_replicas_entry_to_text(&PBUF(buf), e); ++ ++ copy_fs_field(replicas[i], "%s", buf); ++ } ++ } ++ ++#undef copy_fs_field ++#undef copy_dev_field ++#undef copy_bucket_field ++#undef copy_stripe_field ++#undef copy_field ++fsck_err: ++ return ret; ++} ++ ++static int bch2_gc_start(struct bch_fs *c, ++ bool metadata_only) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ BUG_ON(c->usage_gc); ++ ++ c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), ++ sizeof(u64), GFP_KERNEL); ++ if (!c->usage_gc) { ++ bch_err(c, "error allocating c->usage_gc"); ++ return -ENOMEM; ++ } ++ ++ for_each_member_device(ca, c, i) { ++ BUG_ON(ca->buckets[1]); ++ BUG_ON(ca->usage[1]); ++ ++ ca->buckets[1] = kvpmalloc(sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO); ++ if (!ca->buckets[1]) { ++ percpu_ref_put(&ca->ref); ++ bch_err(c, "error allocating ca->buckets[gc]"); ++ return -ENOMEM; ++ } ++ ++ ca->usage[1] = 
alloc_percpu(struct bch_dev_usage); ++ if (!ca->usage[1]) { ++ bch_err(c, "error allocating ca->usage[gc]"); ++ percpu_ref_put(&ca->ref); ++ return -ENOMEM; ++ } ++ } ++ ++ ret = bch2_ec_mem_alloc(c, true); ++ if (ret) { ++ bch_err(c, "error allocating ec gc mem"); ++ return ret; ++ } ++ ++ percpu_down_write(&c->mark_lock); ++ ++ /* ++ * indicate to stripe code that we need to allocate for the gc stripes ++ * radix tree, too ++ */ ++ gc_pos_set(c, gc_phase(GC_PHASE_START)); ++ ++ for_each_member_device(ca, c, i) { ++ struct bucket_array *dst = __bucket_array(ca, 1); ++ struct bucket_array *src = __bucket_array(ca, 0); ++ size_t b; ++ ++ dst->first_bucket = src->first_bucket; ++ dst->nbuckets = src->nbuckets; ++ ++ for (b = 0; b < src->nbuckets; b++) { ++ struct bucket *d = &dst->b[b]; ++ struct bucket *s = &src->b[b]; ++ ++ d->_mark.gen = dst->b[b].oldest_gen = s->mark.gen; ++ d->gen_valid = s->gen_valid; ++ ++ if (metadata_only && ++ (s->mark.data_type == BCH_DATA_user || ++ s->mark.data_type == BCH_DATA_cached)) { ++ d->_mark = s->mark; ++ d->_mark.owned_by_allocator = 0; ++ } ++ } ++ }; ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return 0; ++} ++ ++/** ++ * bch2_gc - walk _all_ references to buckets, and recompute them: ++ * ++ * Order matters here: ++ * - Concurrent GC relies on the fact that we have a total ordering for ++ * everything that GC walks - see gc_will_visit_node(), ++ * gc_will_visit_root() ++ * ++ * - also, references move around in the course of index updates and ++ * various other crap: everything needs to agree on the ordering ++ * references are allowed to move around in - e.g., we're allowed to ++ * start with a reference owned by an open_bucket (the allocator) and ++ * move it to the btree, but not the reverse. ++ * ++ * This is necessary to ensure that gc doesn't miss references that ++ * move around - if references move backwards in the ordering GC ++ * uses, GC could skip past them ++ */ ++int bch2_gc(struct bch_fs *c, struct journal_keys *journal_keys, ++ bool initial, bool metadata_only) ++{ ++ struct bch_dev *ca; ++ u64 start_time = local_clock(); ++ unsigned i, iter = 0; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ trace_gc_start(c); ++ ++ down_write(&c->gc_lock); ++ ++ /* flush interior btree updates: */ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++again: ++ ret = bch2_gc_start(c, metadata_only); ++ if (ret) ++ goto out; ++ ++ bch2_mark_superblocks(c); ++ ++ ret = bch2_gc_btrees(c, journal_keys, initial, metadata_only); ++ if (ret) ++ goto out; ++ ++#if 0 ++ bch2_mark_pending_btree_node_frees(c); ++#endif ++ bch2_mark_allocator_buckets(c); ++ ++ c->gc_count++; ++out: ++ if (!ret && ++ (test_bit(BCH_FS_FIXED_GENS, &c->flags) || ++ (!iter && test_restart_gc(c)))) { ++ /* ++ * XXX: make sure gens we fixed got saved ++ */ ++ if (iter++ <= 2) { ++ bch_info(c, "Fixed gens, restarting mark and sweep:"); ++ clear_bit(BCH_FS_FIXED_GENS, &c->flags); ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ percpu_down_write(&c->mark_lock); ++ bch2_gc_free(c); ++ percpu_up_write(&c->mark_lock); ++ /* flush fsck errors, reset counters */ ++ bch2_flush_fsck_errs(c); ++ ++ goto again; ++ } ++ ++ bch_info(c, "Unable to fix bucket gens, looping"); ++ ret = -EINVAL; ++ } ++ ++ if (!ret) { ++ bch2_journal_block(&c->journal); ++ ++ percpu_down_write(&c->mark_lock); ++ ret = bch2_gc_done(c, initial, metadata_only); ++ ++ bch2_journal_unblock(&c->journal); ++ } else { ++ percpu_down_write(&c->mark_lock); ++ 
} ++ ++ /* Indicates that gc is no longer in progress: */ ++ __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); ++ ++ bch2_gc_free(c); ++ percpu_up_write(&c->mark_lock); ++ ++ up_write(&c->gc_lock); ++ ++ trace_gc_end(c); ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); ++ ++ /* ++ * Wake up allocator in case it was waiting for buckets ++ * because of not being able to inc gens ++ */ ++ for_each_member_device(ca, c, i) ++ bch2_wake_allocator(ca); ++ ++ /* ++ * At startup, allocations can happen directly instead of via the ++ * allocator thread - issue wakeup in case they blocked on gc_lock: ++ */ ++ closure_wake_up(&c->freelist_wait); ++ return ret; ++} ++ ++static bool gc_btree_gens_key(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ percpu_down_read(&c->mark_lock); ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ ++ if (gen_after(g->mark.gen, ptr->gen) > 16) { ++ percpu_up_read(&c->mark_lock); ++ return true; ++ } ++ } ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, false); ++ ++ if (gen_after(g->gc_gen, ptr->gen)) ++ g->gc_gen = ptr->gen; ++ } ++ percpu_up_read(&c->mark_lock); ++ ++ return false; ++} ++ ++/* ++ * For recalculating oldest gen, we only need to walk keys in leaf nodes; btree ++ * node pointers currently never have cached pointers that can become stale: ++ */ ++static int bch2_gc_btree_gens(struct bch_fs *c, enum btree_id btree_id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_on_stack sk; ++ int ret = 0; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k))) { ++ if (gc_btree_gens_key(c, k)) { ++ bkey_on_stack_reassemble(&sk, c, k); ++ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); ++ ++ bch2_trans_update(&trans, iter, sk.k, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++ if (ret == -EINTR) ++ continue; ++ if (ret) { ++ break; ++ } ++ } ++ ++ bch2_btree_iter_next(iter); ++ } ++ ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ ++ return ret; ++} ++ ++int bch2_gc_gens(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ unsigned i; ++ int ret; ++ ++ /* ++ * Ideally we would be using state_lock and not gc_lock here, but that ++ * introduces a deadlock in the RO path - we currently take the state ++ * lock at the start of going RO, thus the gc thread may get stuck: ++ */ ++ down_read(&c->gc_lock); ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->gc_gen = g->mark.gen; ++ up_read(&ca->bucket_lock); ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (btree_node_type_needs_gc(i)) { ++ ret = bch2_gc_btree_gens(c, i); ++ if (ret) { ++ bch_err(c, "error recalculating oldest_gen: %i", ret); ++ goto err; ++ } ++ } ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ g->oldest_gen = g->gc_gen; ++ up_read(&ca->bucket_lock); ++ } ++ ++ c->gc_count++; 
++err: ++ up_read(&c->gc_lock); ++ return ret; ++} ++ ++/* Btree coalescing */ ++ ++static void recalc_packed_keys(struct btree *b) ++{ ++ struct bset *i = btree_bset_first(b); ++ struct bkey_packed *k; ++ ++ memset(&b->nr, 0, sizeof(b->nr)); ++ ++ BUG_ON(b->nsets != 1); ++ ++ vstruct_for_each(i, k) ++ btree_keys_account_key_add(&b->nr, 0, k); ++} ++ ++static void bch2_coalesce_nodes(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *old_nodes[GC_MERGE_NODES]) ++{ ++ struct btree *parent = btree_node_parent(iter, old_nodes[0]); ++ unsigned i, nr_old_nodes, nr_new_nodes, u64s = 0; ++ unsigned blocks = btree_blocks(c) * 2 / 3; ++ struct btree *new_nodes[GC_MERGE_NODES]; ++ struct btree_update *as; ++ struct keylist keylist; ++ struct bkey_format_state format_state; ++ struct bkey_format new_format; ++ ++ memset(new_nodes, 0, sizeof(new_nodes)); ++ bch2_keylist_init(&keylist, NULL); ++ ++ /* Count keys that are not deleted */ ++ for (i = 0; i < GC_MERGE_NODES && old_nodes[i]; i++) ++ u64s += old_nodes[i]->nr.live_u64s; ++ ++ nr_old_nodes = nr_new_nodes = i; ++ ++ /* Check if all keys in @old_nodes could fit in one fewer node */ ++ if (nr_old_nodes <= 1 || ++ __vstruct_blocks(struct btree_node, c->block_bits, ++ DIV_ROUND_UP(u64s, nr_old_nodes - 1)) > blocks) ++ return; ++ ++ /* Find a format that all keys in @old_nodes can pack into */ ++ bch2_bkey_format_init(&format_state); ++ ++ for (i = 0; i < nr_old_nodes; i++) ++ __bch2_btree_calc_format(&format_state, old_nodes[i]); ++ ++ new_format = bch2_bkey_format_done(&format_state); ++ ++ /* Check if repacking would make any nodes too big to fit */ ++ for (i = 0; i < nr_old_nodes; i++) ++ if (!bch2_btree_node_format_fits(c, old_nodes[i], &new_format)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_FORMAT_FITS); ++ return; ++ } ++ ++ if (bch2_keylist_realloc(&keylist, NULL, 0, ++ (BKEY_U64s + BKEY_EXTENT_U64s_MAX) * nr_old_nodes)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC); ++ return; ++ } ++ ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ btree_update_reserve_required(c, parent) + nr_old_nodes, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE, ++ NULL); ++ if (IS_ERR(as)) { ++ trace_btree_gc_coalesce_fail(c, ++ BTREE_GC_COALESCE_FAIL_RESERVE_GET); ++ bch2_keylist_free(&keylist, NULL); ++ return; ++ } ++ ++ trace_btree_gc_coalesce(c, old_nodes[0]); ++ ++ for (i = 0; i < nr_old_nodes; i++) ++ bch2_btree_interior_update_will_free_node(as, old_nodes[i]); ++ ++ /* Repack everything with @new_format and sort down to one bset */ ++ for (i = 0; i < nr_old_nodes; i++) ++ new_nodes[i] = ++ __bch2_btree_node_alloc_replacement(as, old_nodes[i], ++ new_format); ++ ++ /* ++ * Conceptually we concatenate the nodes together and slice them ++ * up at different boundaries. 
++ */ ++ for (i = nr_new_nodes - 1; i > 0; --i) { ++ struct btree *n1 = new_nodes[i]; ++ struct btree *n2 = new_nodes[i - 1]; ++ ++ struct bset *s1 = btree_bset_first(n1); ++ struct bset *s2 = btree_bset_first(n2); ++ struct bkey_packed *k, *last = NULL; ++ ++ /* Calculate how many keys from @n2 we could fit inside @n1 */ ++ u64s = 0; ++ ++ for (k = s2->start; ++ k < vstruct_last(s2) && ++ vstruct_blocks_plus(n1->data, c->block_bits, ++ u64s + k->u64s) <= blocks; ++ k = bkey_next_skip_noops(k, vstruct_last(s2))) { ++ last = k; ++ u64s += k->u64s; ++ } ++ ++ if (u64s == le16_to_cpu(s2->u64s)) { ++ /* n2 fits entirely in n1 */ ++ n1->key.k.p = n1->data->max_key = n2->data->max_key; ++ ++ memcpy_u64s(vstruct_last(s1), ++ s2->start, ++ le16_to_cpu(s2->u64s)); ++ le16_add_cpu(&s1->u64s, le16_to_cpu(s2->u64s)); ++ ++ set_btree_bset_end(n1, n1->set); ++ ++ six_unlock_write(&n2->c.lock); ++ bch2_btree_node_free_never_inserted(c, n2); ++ six_unlock_intent(&n2->c.lock); ++ ++ memmove(new_nodes + i - 1, ++ new_nodes + i, ++ sizeof(new_nodes[0]) * (nr_new_nodes - i)); ++ new_nodes[--nr_new_nodes] = NULL; ++ } else if (u64s) { ++ /* move part of n2 into n1 */ ++ n1->key.k.p = n1->data->max_key = ++ bkey_unpack_pos(n1, last); ++ ++ n2->data->min_key = bkey_successor(n1->data->max_key); ++ ++ memcpy_u64s(vstruct_last(s1), ++ s2->start, u64s); ++ le16_add_cpu(&s1->u64s, u64s); ++ ++ memmove(s2->start, ++ vstruct_idx(s2, u64s), ++ (le16_to_cpu(s2->u64s) - u64s) * sizeof(u64)); ++ s2->u64s = cpu_to_le16(le16_to_cpu(s2->u64s) - u64s); ++ ++ set_btree_bset_end(n1, n1->set); ++ set_btree_bset_end(n2, n2->set); ++ } ++ } ++ ++ for (i = 0; i < nr_new_nodes; i++) { ++ struct btree *n = new_nodes[i]; ++ ++ recalc_packed_keys(n); ++ btree_node_reset_sib_u64s(n); ++ ++ bch2_btree_build_aux_trees(n); ++ ++ bch2_btree_update_add_new_node(as, n); ++ six_unlock_write(&n->c.lock); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ } ++ ++ /* ++ * The keys for the old nodes get deleted. We don't want to insert keys ++ * that compare equal to the keys for the new nodes we'll also be ++ * inserting - we can't because keys on a keylist must be strictly ++ * greater than the previous keys, and we also don't need to since the ++ * key for the new node will serve the same purpose (overwriting the key ++ * for the old node). 
++ */ ++ for (i = 0; i < nr_old_nodes; i++) { ++ struct bkey_i delete; ++ unsigned j; ++ ++ for (j = 0; j < nr_new_nodes; j++) ++ if (!bkey_cmp(old_nodes[i]->key.k.p, ++ new_nodes[j]->key.k.p)) ++ goto next; ++ ++ bkey_init(&delete.k); ++ delete.k.p = old_nodes[i]->key.k.p; ++ bch2_keylist_add_in_order(&keylist, &delete); ++next: ++ i = i; ++ } ++ ++ /* ++ * Keys for the new nodes get inserted: bch2_btree_insert_keys() only ++ * does the lookup once and thus expects the keys to be in sorted order ++ * so we have to make sure the new keys are correctly ordered with ++ * respect to the deleted keys added in the previous loop ++ */ ++ for (i = 0; i < nr_new_nodes; i++) ++ bch2_keylist_add_in_order(&keylist, &new_nodes[i]->key); ++ ++ /* Insert the newly coalesced nodes */ ++ bch2_btree_insert_node(as, parent, iter, &keylist, 0); ++ ++ BUG_ON(!bch2_keylist_empty(&keylist)); ++ ++ BUG_ON(iter->l[old_nodes[0]->c.level].b != old_nodes[0]); ++ ++ bch2_btree_iter_node_replace(iter, new_nodes[0]); ++ ++ for (i = 0; i < nr_new_nodes; i++) ++ bch2_btree_update_get_open_buckets(as, new_nodes[i]); ++ ++ /* Free the old nodes and update our sliding window */ ++ for (i = 0; i < nr_old_nodes; i++) { ++ bch2_btree_node_free_inmem(c, old_nodes[i], iter); ++ ++ /* ++ * the index update might have triggered a split, in which case ++ * the nodes we coalesced - the new nodes we just created - ++ * might not be sibling nodes anymore - don't add them to the ++ * sliding window (except the first): ++ */ ++ if (!i) { ++ old_nodes[i] = new_nodes[i]; ++ } else { ++ old_nodes[i] = NULL; ++ } ++ } ++ ++ for (i = 0; i < nr_new_nodes; i++) ++ six_unlock_intent(&new_nodes[i]->c.lock); ++ ++ bch2_btree_update_done(as); ++ bch2_keylist_free(&keylist, NULL); ++} ++ ++static int bch2_coalesce_btree(struct bch_fs *c, enum btree_id btree_id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ unsigned i; ++ ++ /* Sliding window of adjacent btree nodes */ ++ struct btree *merge[GC_MERGE_NODES]; ++ u32 lock_seq[GC_MERGE_NODES]; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ /* ++ * XXX: We don't have a good way of positively matching on sibling nodes ++ * that have the same parent - this code works by handling the cases ++ * where they might not have the same parent, and is thus fragile. Ugh. ++ * ++ * Perhaps redo this to use multiple linked iterators? 
++ */ ++ memset(merge, 0, sizeof(merge)); ++ ++ __for_each_btree_node(&trans, iter, btree_id, POS_MIN, ++ BTREE_MAX_DEPTH, 0, ++ BTREE_ITER_PREFETCH, b) { ++ memmove(merge + 1, merge, ++ sizeof(merge) - sizeof(merge[0])); ++ memmove(lock_seq + 1, lock_seq, ++ sizeof(lock_seq) - sizeof(lock_seq[0])); ++ ++ merge[0] = b; ++ ++ for (i = 1; i < GC_MERGE_NODES; i++) { ++ if (!merge[i] || ++ !six_relock_intent(&merge[i]->c.lock, lock_seq[i])) ++ break; ++ ++ if (merge[i]->c.level != merge[0]->c.level) { ++ six_unlock_intent(&merge[i]->c.lock); ++ break; ++ } ++ } ++ memset(merge + i, 0, (GC_MERGE_NODES - i) * sizeof(merge[0])); ++ ++ bch2_coalesce_nodes(c, iter, merge); ++ ++ for (i = 1; i < GC_MERGE_NODES && merge[i]; i++) { ++ lock_seq[i] = merge[i]->c.lock.state.seq; ++ six_unlock_intent(&merge[i]->c.lock); ++ } ++ ++ lock_seq[0] = merge[0]->c.lock.state.seq; ++ ++ if (kthread && kthread_should_stop()) { ++ bch2_trans_exit(&trans); ++ return -ESHUTDOWN; ++ } ++ ++ bch2_trans_cond_resched(&trans); ++ ++ /* ++ * If the parent node wasn't relocked, it might have been split ++ * and the nodes in our sliding window might not have the same ++ * parent anymore - blow away the sliding window: ++ */ ++ if (btree_iter_node(iter, iter->level + 1) && ++ !btree_node_intent_locked(iter, iter->level + 1)) ++ memset(merge + 1, 0, ++ (GC_MERGE_NODES - 1) * sizeof(merge[0])); ++ } ++ return bch2_trans_exit(&trans); ++} ++ ++/** ++ * bch_coalesce - coalesce adjacent nodes with low occupancy ++ */ ++void bch2_coalesce(struct bch_fs *c) ++{ ++ enum btree_id id; ++ ++ down_read(&c->gc_lock); ++ trace_gc_coalesce_start(c); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ int ret = c->btree_roots[id].b ++ ? bch2_coalesce_btree(c, id) ++ : 0; ++ ++ if (ret) { ++ if (ret != -ESHUTDOWN) ++ bch_err(c, "btree coalescing failed: %d", ret); ++ return; ++ } ++ } ++ ++ trace_gc_coalesce_end(c); ++ up_read(&c->gc_lock); ++} ++ ++static int bch2_gc_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ unsigned long last = atomic_long_read(&clock->now); ++ unsigned last_kick = atomic_read(&c->kick_gc); ++ int ret; ++ ++ set_freezable(); ++ ++ while (1) { ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ if (kthread_should_stop()) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ if (atomic_read(&c->kick_gc) != last_kick) ++ break; ++ ++ if (c->btree_gc_periodic) { ++ unsigned long next = last + c->capacity / 16; ++ ++ if (atomic_long_read(&clock->now) >= next) ++ break; ++ ++ bch2_io_clock_schedule_timeout(clock, next); ++ } else { ++ schedule(); ++ } ++ ++ try_to_freeze(); ++ } ++ __set_current_state(TASK_RUNNING); ++ ++ last = atomic_long_read(&clock->now); ++ last_kick = atomic_read(&c->kick_gc); ++ ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 ++ ret = bch2_gc(c, NULL, false, false); ++#else ++ ret = bch2_gc_gens(c); ++#endif ++ if (ret < 0) ++ bch_err(c, "btree gc failed: %i", ret); ++ ++ debug_check_no_locks_held(); ++ } ++ ++ return 0; ++} ++ ++void bch2_gc_thread_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ p = c->gc_thread; ++ c->gc_thread = NULL; ++ ++ if (p) { ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_gc_thread_start(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ BUG_ON(c->gc_thread); ++ ++ p = kthread_create(bch2_gc_thread, c, "bch_gc"); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ c->gc_thread = p; ++ wake_up_process(p); ++ return 0; ++} 
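The coalescing pass above reduces to a simple occupancy test: a sliding window of up to GC_MERGE_NODES adjacent siblings is merged only when their combined live u64s, repacked into one fewer node, would leave each resulting node no more than roughly two thirds full (btree_blocks(c) * 2 / 3). A minimal standalone sketch of that arithmetic, using made-up node geometry and helper names rather than the real btree_node/vstruct layout:

/* Standalone sketch of the "can we coalesce?" occupancy check made by
 * bch2_coalesce_nodes(); sizes below are hypothetical, no bcachefs headers. */
#include <stdio.h>
#include <stdbool.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define GC_MERGE_NODES		4U	/* size of the sliding window above */

/* Assumed geometry: a 256k node with 512-byte blocks, plus a fixed
 * per-node header overhead measured in u64s. */
#define NODE_BLOCKS		512U
#define BLOCK_U64S		(512U / 8U)
#define HEADER_U64S		8U

static unsigned node_blocks_for(unsigned u64s)
{
	return DIV_ROUND_UP(HEADER_U64S + u64s, BLOCK_U64S);
}

/* Merge only if all live keys, split across one fewer node, keep each
 * node under the ~2/3 threshold (btree_blocks(c) * 2 / 3 in the patch). */
static bool should_coalesce(const unsigned live_u64s[], unsigned nr)
{
	unsigned i, u64s = 0, blocks = NODE_BLOCKS * 2 / 3;

	if (nr <= 1)
		return false;

	for (i = 0; i < nr; i++)
		u64s += live_u64s[i];

	return node_blocks_for(DIV_ROUND_UP(u64s, nr - 1)) <= blocks;
}

int main(void)
{
	unsigned sparse[GC_MERGE_NODES] = {    90,   120,    60,   100 };
	unsigned dense[GC_MERGE_NODES]  = { 20000, 21000, 19500, 20500 };

	printf("sparse siblings coalesce: %d\n", should_coalesce(sparse, GC_MERGE_NODES));
	printf("dense siblings coalesce:  %d\n", should_coalesce(dense, GC_MERGE_NODES));
	return 0;
}

With these toy numbers, four nearly empty siblings collapse into three while four well-filled ones are left alone, which mirrors the early-return check at the top of bch2_coalesce_nodes() before any btree_update is started.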
+diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h +new file mode 100644 +index 000000000000..3694a3df62a8 +--- /dev/null ++++ b/fs/bcachefs/btree_gc.h +@@ -0,0 +1,121 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_GC_H ++#define _BCACHEFS_BTREE_GC_H ++ ++#include "btree_types.h" ++ ++void bch2_coalesce(struct bch_fs *); ++ ++struct journal_keys; ++int bch2_gc(struct bch_fs *, struct journal_keys *, bool, bool); ++int bch2_gc_gens(struct bch_fs *); ++void bch2_gc_thread_stop(struct bch_fs *); ++int bch2_gc_thread_start(struct bch_fs *); ++void bch2_mark_dev_superblock(struct bch_fs *, struct bch_dev *, unsigned); ++ ++/* ++ * For concurrent mark and sweep (with other index updates), we define a total ++ * ordering of _all_ references GC walks: ++ * ++ * Note that some references will have the same GC position as others - e.g. ++ * everything within the same btree node; in those cases we're relying on ++ * whatever locking exists for where those references live, i.e. the write lock ++ * on a btree node. ++ * ++ * That locking is also required to ensure GC doesn't pass the updater in ++ * between the updater adding/removing the reference and updating the GC marks; ++ * without that, we would at best double count sometimes. ++ * ++ * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ ++ * be held that prevents GC from passing the position the updater is at. ++ * ++ * (What about the start of gc, when we're clearing all the marks? GC clears the ++ * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc ++ * position inside its cmpxchg loop, so crap magically works). ++ */ ++ ++/* Position of (the start of) a gc phase: */ ++static inline struct gc_pos gc_phase(enum gc_phase phase) ++{ ++ return (struct gc_pos) { ++ .phase = phase, ++ .pos = POS_MIN, ++ .level = 0, ++ }; ++} ++ ++static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) ++{ ++ if (l.phase != r.phase) ++ return l.phase < r.phase ? -1 : 1; ++ if (bkey_cmp(l.pos, r.pos)) ++ return bkey_cmp(l.pos, r.pos); ++ if (l.level != r.level) ++ return l.level < r.level ? -1 : 1; ++ return 0; ++} ++ ++static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) ++{ ++ switch (id) { ++#define x(n, v, s) case BTREE_ID_##n: return GC_PHASE_BTREE_##n; ++ BCH_BTREE_IDS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct gc_pos gc_pos_btree(enum btree_id id, ++ struct bpos pos, unsigned level) ++{ ++ return (struct gc_pos) { ++ .phase = btree_id_to_gc_phase(id), ++ .pos = pos, ++ .level = level, ++ }; ++} ++ ++/* ++ * GC position of the pointers within a btree node: note, _not_ for &b->key ++ * itself, that lives in the parent node: ++ */ ++static inline struct gc_pos gc_pos_btree_node(struct btree *b) ++{ ++ return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); ++} ++ ++/* ++ * GC position of the pointer to a btree root: we don't use ++ * gc_pos_pointer_to_btree_node() here to avoid a potential race with ++ * btree_split() increasing the tree depth - the new root will have level > the ++ * old root and thus have a greater gc position than the old root, but that ++ * would be incorrect since once gc has marked the root it's not coming back. 
++ */ ++static inline struct gc_pos gc_pos_btree_root(enum btree_id id) ++{ ++ return gc_pos_btree(id, POS_MAX, BTREE_MAX_DEPTH); ++} ++ ++static inline struct gc_pos gc_pos_alloc(struct bch_fs *c, struct open_bucket *ob) ++{ ++ return (struct gc_pos) { ++ .phase = GC_PHASE_ALLOC, ++ .pos = POS(ob ? ob - c->open_buckets : 0, 0), ++ }; ++} ++ ++static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) ++{ ++ unsigned seq; ++ bool ret; ++ ++ do { ++ seq = read_seqcount_begin(&c->gc_pos_lock); ++ ret = gc_pos_cmp(pos, c->gc_pos) <= 0; ++ } while (read_seqcount_retry(&c->gc_pos_lock, seq)); ++ ++ return ret; ++} ++ ++#endif /* _BCACHEFS_BTREE_GC_H */ +diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c +new file mode 100644 +index 000000000000..2f5097218f9c +--- /dev/null ++++ b/fs/bcachefs/btree_io.c +@@ -0,0 +1,1834 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static void verify_no_dups(struct btree *b, ++ struct bkey_packed *start, ++ struct bkey_packed *end, ++ bool extents) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bkey_packed *k, *p; ++ ++ if (start == end) ++ return; ++ ++ for (p = start, k = bkey_next_skip_noops(start, end); ++ k != end; ++ p = k, k = bkey_next_skip_noops(k, end)) { ++ struct bkey l = bkey_unpack_key(b, p); ++ struct bkey r = bkey_unpack_key(b, k); ++ ++ BUG_ON(extents ++ ? bkey_cmp(l.p, bkey_start_pos(&r)) > 0 ++ : bkey_cmp(l.p, bkey_start_pos(&r)) >= 0); ++ //BUG_ON(bkey_cmp_packed(&b->format, p, k) >= 0); ++ } ++#endif ++} ++ ++static void set_needs_whiteout(struct bset *i, int v) ++{ ++ struct bkey_packed *k; ++ ++ for (k = i->start; ++ k != vstruct_last(i); ++ k = bkey_next_skip_noops(k, vstruct_last(i))) ++ k->needs_whiteout = v; ++} ++ ++static void btree_bounce_free(struct bch_fs *c, size_t size, ++ bool used_mempool, void *p) ++{ ++ if (used_mempool) ++ mempool_free(p, &c->btree_bounce_pool); ++ else ++ vpfree(p, size); ++} ++ ++static void *btree_bounce_alloc(struct bch_fs *c, size_t size, ++ bool *used_mempool) ++{ ++ unsigned flags = memalloc_nofs_save(); ++ void *p; ++ ++ BUG_ON(size > btree_bytes(c)); ++ ++ *used_mempool = false; ++ p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT); ++ if (!p) { ++ *used_mempool = true; ++ p = mempool_alloc(&c->btree_bounce_pool, GFP_NOIO); ++ } ++ memalloc_nofs_restore(flags); ++ return p; ++} ++ ++static void sort_bkey_ptrs(const struct btree *bt, ++ struct bkey_packed **ptrs, unsigned nr) ++{ ++ unsigned n = nr, a = nr / 2, b, c, d; ++ ++ if (!a) ++ return; ++ ++ /* Heap sort: see lib/sort.c: */ ++ while (1) { ++ if (a) ++ a--; ++ else if (--n) ++ swap(ptrs[0], ptrs[n]); ++ else ++ break; ++ ++ for (b = a; c = 2 * b + 1, (d = c + 1) < n;) ++ b = bkey_cmp_packed(bt, ++ ptrs[c], ++ ptrs[d]) >= 0 ? 
c : d; ++ if (d == n) ++ b = c; ++ ++ while (b != a && ++ bkey_cmp_packed(bt, ++ ptrs[a], ++ ptrs[b]) >= 0) ++ b = (b - 1) / 2; ++ c = b; ++ while (b != a) { ++ b = (b - 1) / 2; ++ swap(ptrs[b], ptrs[c]); ++ } ++ } ++} ++ ++static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; ++ bool used_mempool = false; ++ size_t bytes = b->whiteout_u64s * sizeof(u64); ++ ++ if (!b->whiteout_u64s) ++ return; ++ ++ new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ ptrs = ptrs_end = ((void *) new_whiteouts + bytes); ++ ++ for (k = unwritten_whiteouts_start(c, b); ++ k != unwritten_whiteouts_end(c, b); ++ k = bkey_next(k)) ++ *--ptrs = k; ++ ++ sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); ++ ++ k = new_whiteouts; ++ ++ while (ptrs != ptrs_end) { ++ bkey_copy(k, *ptrs); ++ k = bkey_next(k); ++ ptrs++; ++ } ++ ++ verify_no_dups(b, new_whiteouts, ++ (void *) ((u64 *) new_whiteouts + b->whiteout_u64s), ++ btree_node_old_extent_overwrite(b)); ++ ++ memcpy_u64s(unwritten_whiteouts_start(c, b), ++ new_whiteouts, b->whiteout_u64s); ++ ++ btree_bounce_free(c, bytes, used_mempool, new_whiteouts); ++} ++ ++static bool should_compact_bset(struct btree *b, struct bset_tree *t, ++ bool compacting, enum compact_mode mode) ++{ ++ if (!bset_dead_u64s(b, t)) ++ return false; ++ ++ switch (mode) { ++ case COMPACT_LAZY: ++ return should_compact_bset_lazy(b, t) || ++ (compacting && !bset_written(b, bset(b, t))); ++ case COMPACT_ALL: ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_compact_extent_whiteouts(struct bch_fs *c, ++ struct btree *b, ++ enum compact_mode mode) ++{ ++ const struct bkey_format *f = &b->format; ++ struct bset_tree *t; ++ struct bkey_packed *whiteouts = NULL; ++ struct bkey_packed *u_start, *u_pos; ++ struct sort_iter sort_iter; ++ unsigned bytes, whiteout_u64s = 0, u64s; ++ bool used_mempool, compacting = false; ++ ++ BUG_ON(!btree_node_is_extents(b)); ++ ++ for_each_bset(b, t) ++ if (should_compact_bset(b, t, whiteout_u64s != 0, mode)) ++ whiteout_u64s += bset_dead_u64s(b, t); ++ ++ if (!whiteout_u64s) ++ return false; ++ ++ bch2_sort_whiteouts(c, b); ++ ++ sort_iter_init(&sort_iter, b); ++ ++ whiteout_u64s += b->whiteout_u64s; ++ bytes = whiteout_u64s * sizeof(u64); ++ ++ whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); ++ u_start = u_pos = whiteouts; ++ ++ memcpy_u64s(u_pos, unwritten_whiteouts_start(c, b), ++ b->whiteout_u64s); ++ u_pos = (void *) u_pos + b->whiteout_u64s * sizeof(u64); ++ ++ sort_iter_add(&sort_iter, u_start, u_pos); ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k, *n, *out, *start, *end; ++ struct btree_node_entry *src = NULL, *dst = NULL; ++ ++ if (t != b->set && !bset_written(b, i)) { ++ src = container_of(i, struct btree_node_entry, keys); ++ dst = max(write_block(b), ++ (void *) btree_bkey_last(b, t - 1)); ++ } ++ ++ if (src != dst) ++ compacting = true; ++ ++ if (!should_compact_bset(b, t, compacting, mode)) { ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src) + ++ le16_to_cpu(src->keys.u64s) * ++ sizeof(u64)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ continue; ++ } ++ ++ compacting = true; ++ u_start = u_pos; ++ start = i->start; ++ end = vstruct_last(i); ++ ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ ++ out = i->start; ++ ++ for (k = start; k != end; k = n) { ++ n = bkey_next_skip_noops(k, end); ++ ++ if (bkey_deleted(k)) ++ continue; 
++ ++ BUG_ON(bkey_whiteout(k) && ++ k->needs_whiteout && ++ bkey_written(b, k)); ++ ++ if (bkey_whiteout(k) && !k->needs_whiteout) ++ continue; ++ ++ if (bkey_whiteout(k)) { ++ memcpy_u64s(u_pos, k, bkeyp_key_u64s(f, k)); ++ set_bkeyp_val_u64s(f, u_pos, 0); ++ u_pos = bkey_next(u_pos); ++ } else { ++ bkey_copy(out, k); ++ out = bkey_next(out); ++ } ++ } ++ ++ sort_iter_add(&sort_iter, u_start, u_pos); ++ ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ } ++ ++ b->whiteout_u64s = (u64 *) u_pos - (u64 *) whiteouts; ++ ++ BUG_ON((void *) unwritten_whiteouts_start(c, b) < ++ (void *) btree_bkey_last(b, bset_tree_last(b))); ++ ++ u64s = bch2_sort_extent_whiteouts(unwritten_whiteouts_start(c, b), ++ &sort_iter); ++ ++ BUG_ON(u64s > b->whiteout_u64s); ++ BUG_ON(u_pos != whiteouts && !u64s); ++ ++ if (u64s != b->whiteout_u64s) { ++ void *src = unwritten_whiteouts_start(c, b); ++ ++ b->whiteout_u64s = u64s; ++ memmove_u64s_up(unwritten_whiteouts_start(c, b), src, u64s); ++ } ++ ++ verify_no_dups(b, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b), ++ true); ++ ++ btree_bounce_free(c, bytes, used_mempool, whiteouts); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ bch_btree_keys_u64s_remaining(c, b); ++ bch2_verify_btree_nr_keys(b); ++ ++ return true; ++} ++ ++static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) ++{ ++ struct bset_tree *t; ++ bool ret = false; ++ ++ for_each_bset(b, t) { ++ struct bset *i = bset(b, t); ++ struct bkey_packed *k, *n, *out, *start, *end; ++ struct btree_node_entry *src = NULL, *dst = NULL; ++ ++ if (t != b->set && !bset_written(b, i)) { ++ src = container_of(i, struct btree_node_entry, keys); ++ dst = max(write_block(b), ++ (void *) btree_bkey_last(b, t - 1)); ++ } ++ ++ if (src != dst) ++ ret = true; ++ ++ if (!should_compact_bset(b, t, ret, mode)) { ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src) + ++ le16_to_cpu(src->keys.u64s) * ++ sizeof(u64)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ continue; ++ } ++ ++ start = btree_bkey_first(b, t); ++ end = btree_bkey_last(b, t); ++ ++ if (src != dst) { ++ memmove(dst, src, sizeof(*src)); ++ i = &dst->keys; ++ set_btree_bset(b, t, i); ++ } ++ ++ out = i->start; ++ ++ for (k = start; k != end; k = n) { ++ n = bkey_next_skip_noops(k, end); ++ ++ if (!bkey_whiteout(k)) { ++ bkey_copy(out, k); ++ out = bkey_next(out); ++ } else { ++ BUG_ON(k->needs_whiteout); ++ } ++ } ++ ++ i->u64s = cpu_to_le16((u64 *) out - i->_data); ++ set_btree_bset_end(b, t); ++ bch2_bset_set_no_aux_tree(b, t); ++ ret = true; ++ } ++ ++ bch2_verify_btree_nr_keys(b); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return ret; ++} ++ ++bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, ++ enum compact_mode mode) ++{ ++ return !btree_node_old_extent_overwrite(b) ++ ? 
bch2_drop_whiteouts(b, mode) ++ : bch2_compact_extent_whiteouts(c, b, mode); ++} ++ ++static void btree_node_sort(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter, ++ unsigned start_idx, ++ unsigned end_idx, ++ bool filter_whiteouts) ++{ ++ struct btree_node *out; ++ struct sort_iter sort_iter; ++ struct bset_tree *t; ++ struct bset *start_bset = bset(b, &b->set[start_idx]); ++ bool used_mempool = false; ++ u64 start_time, seq = 0; ++ unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; ++ bool sorting_entire_node = start_idx == 0 && ++ end_idx == b->nsets; ++ ++ sort_iter_init(&sort_iter, b); ++ ++ for (t = b->set + start_idx; ++ t < b->set + end_idx; ++ t++) { ++ u64s += le16_to_cpu(bset(b, t)->u64s); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ } ++ ++ bytes = sorting_entire_node ++ ? btree_bytes(c) ++ : __vstruct_bytes(struct btree_node, u64s); ++ ++ out = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ start_time = local_clock(); ++ ++ if (btree_node_old_extent_overwrite(b)) ++ filter_whiteouts = bset_written(b, start_bset); ++ ++ u64s = (btree_node_old_extent_overwrite(b) ++ ? bch2_sort_extents ++ : bch2_sort_keys)(out->keys.start, ++ &sort_iter, ++ filter_whiteouts); ++ ++ out->keys.u64s = cpu_to_le16(u64s); ++ ++ BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); ++ ++ if (sorting_entire_node) ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ /* Make sure we preserve bset journal_seq: */ ++ for (t = b->set + start_idx; t < b->set + end_idx; t++) ++ seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); ++ start_bset->journal_seq = cpu_to_le64(seq); ++ ++ if (sorting_entire_node) { ++ unsigned u64s = le16_to_cpu(out->keys.u64s); ++ ++ BUG_ON(bytes != btree_bytes(c)); ++ ++ /* ++ * Our temporary buffer is the same size as the btree node's ++ * buffer, we can just swap buffers instead of doing a big ++ * memcpy() ++ */ ++ *out = *b->data; ++ out->keys.u64s = cpu_to_le16(u64s); ++ swap(out, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ } else { ++ start_bset->u64s = out->keys.u64s; ++ memcpy_u64s(start_bset->start, ++ out->keys.start, ++ le16_to_cpu(out->keys.u64s)); ++ } ++ ++ for (i = start_idx + 1; i < end_idx; i++) ++ b->nr.bset_u64s[start_idx] += ++ b->nr.bset_u64s[i]; ++ ++ b->nsets -= shift; ++ ++ for (i = start_idx + 1; i < b->nsets; i++) { ++ b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; ++ b->set[i] = b->set[i + shift]; ++ } ++ ++ for (i = b->nsets; i < MAX_BSETS; i++) ++ b->nr.bset_u64s[i] = 0; ++ ++ set_btree_bset_end(b, &b->set[start_idx]); ++ bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); ++ ++ btree_bounce_free(c, bytes, used_mempool, out); ++ ++ bch2_verify_btree_nr_keys(b); ++} ++ ++void bch2_btree_sort_into(struct bch_fs *c, ++ struct btree *dst, ++ struct btree *src) ++{ ++ struct btree_nr_keys nr; ++ struct btree_node_iter src_iter; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(dst->nsets != 1); ++ ++ bch2_bset_set_no_aux_tree(dst, dst->set); ++ ++ bch2_btree_node_iter_init_from_start(&src_iter, src); ++ ++ if (btree_node_is_extents(src)) ++ nr = bch2_sort_repack_merge(c, btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); ++ else ++ nr = bch2_sort_repack(btree_bset_first(dst), ++ src, &src_iter, ++ &dst->format, ++ true); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], ++ start_time); ++ ++ set_btree_bset_end(dst, dst->set); ++ ++ dst->nr.live_u64s += nr.live_u64s; ++ dst->nr.bset_u64s[0] += 
nr.bset_u64s[0]; ++ dst->nr.packed_keys += nr.packed_keys; ++ dst->nr.unpacked_keys += nr.unpacked_keys; ++ ++ bch2_verify_btree_nr_keys(dst); ++} ++ ++#define SORT_CRIT (4096 / sizeof(u64)) ++ ++/* ++ * We're about to add another bset to the btree node, so if there's currently ++ * too many bsets - sort some of them together: ++ */ ++static bool btree_node_compact(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ unsigned unwritten_idx; ++ bool ret = false; ++ ++ for (unwritten_idx = 0; ++ unwritten_idx < b->nsets; ++ unwritten_idx++) ++ if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) ++ break; ++ ++ if (b->nsets - unwritten_idx > 1) { ++ btree_node_sort(c, b, iter, unwritten_idx, ++ b->nsets, false); ++ ret = true; ++ } ++ ++ if (unwritten_idx > 1) { ++ btree_node_sort(c, b, iter, 0, unwritten_idx, false); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++void bch2_btree_build_aux_trees(struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ bch2_bset_build_aux_tree(b, t, ++ !bset_written(b, bset(b, t)) && ++ t == bset_tree_last(b)); ++} ++ ++/* ++ * @bch_btree_init_next - initialize a new (unwritten) bset that can then be ++ * inserted into ++ * ++ * Safe to call if there already is an unwritten bset - will only add a new bset ++ * if @b doesn't already have one. ++ * ++ * Returns true if we sorted (i.e. invalidated iterators ++ */ ++void bch2_btree_init_next(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct btree_node_entry *bne; ++ bool did_sort; ++ ++ EBUG_ON(!(b->c.lock.state.seq & 1)); ++ EBUG_ON(iter && iter->l[b->c.level].b != b); ++ ++ did_sort = btree_node_compact(c, b, iter); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ if (iter && did_sort) ++ bch2_btree_iter_reinit_node(iter, b); ++} ++ ++static void btree_err_msg(struct printbuf *out, struct bch_fs *c, ++ struct btree *b, struct bset *i, ++ unsigned offset, int write) ++{ ++ pr_buf(out, "error validating btree node %sat btree %u level %u/%u\n" ++ "pos ", ++ write ? "before write " : "", ++ b->c.btree_id, b->c.level, ++ c->btree_roots[b->c.btree_id].level); ++ bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); ++ ++ pr_buf(out, " node offset %u", b->written); ++ if (i) ++ pr_buf(out, " bset u64s %u", le16_to_cpu(i->u64s)); ++} ++ ++enum btree_err_type { ++ BTREE_ERR_FIXABLE, ++ BTREE_ERR_WANT_RETRY, ++ BTREE_ERR_MUST_RETRY, ++ BTREE_ERR_FATAL, ++}; ++ ++enum btree_validate_ret { ++ BTREE_RETRY_READ = 64, ++}; ++ ++#define btree_err(type, c, b, i, msg, ...) 
\ ++({ \ ++ __label__ out; \ ++ char _buf[300]; \ ++ struct printbuf out = PBUF(_buf); \ ++ \ ++ btree_err_msg(&out, c, b, i, b->written, write); \ ++ pr_buf(&out, ": " msg, ##__VA_ARGS__); \ ++ \ ++ if (type == BTREE_ERR_FIXABLE && \ ++ write == READ && \ ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) { \ ++ mustfix_fsck_err(c, "%s", _buf); \ ++ goto out; \ ++ } \ ++ \ ++ switch (write) { \ ++ case READ: \ ++ bch_err(c, "%s", _buf); \ ++ \ ++ switch (type) { \ ++ case BTREE_ERR_FIXABLE: \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ case BTREE_ERR_WANT_RETRY: \ ++ if (have_retry) { \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case BTREE_ERR_MUST_RETRY: \ ++ ret = BTREE_RETRY_READ; \ ++ goto fsck_err; \ ++ case BTREE_ERR_FATAL: \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write: %s", _buf); \ ++ \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++out: \ ++ true; \ ++}) ++ ++#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) ++ ++static int validate_bset(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned sectors, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ const char *err; ++ int ret = 0; ++ ++ btree_err_on((version != BCH_BSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, ++ BTREE_ERR_FATAL, c, b, i, ++ "unsupported bset version"); ++ ++ if (btree_err_on(b->written + sectors > c->opts.btree_node_size, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "bset past end of btree node")) { ++ i->u64s = 0; ++ return 0; ++ } ++ ++ btree_err_on(b->written && !i->u64s, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "empty bset"); ++ ++ if (!b->written) { ++ struct btree_node *bn = ++ container_of(i, struct btree_node, keys); ++ /* These indicate that we read the wrong btree node: */ ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ /* XXX endianness */ ++ btree_err_on(bp->seq != bn->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "incorrect sequence number (wrong btree node)"); ++ } ++ ++ btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect btree id"); ++ ++ btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect level"); ++ ++ if (BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN) { ++ u64 *p = (u64 *) &bn->ptr; ++ ++ *p = swab64(*p); ++ } ++ ++ if (!write) ++ compat_btree_node(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ btree_err_on(bkey_cmp(b->data->min_key, bp->min_key), ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "incorrect min_key: got %llu:%llu should be %llu:%llu", ++ b->data->min_key.inode, ++ b->data->min_key.offset, ++ bp->min_key.inode, ++ bp->min_key.offset); ++ } ++ ++ btree_err_on(bkey_cmp(bn->max_key, b->key.k.p), ++ BTREE_ERR_MUST_RETRY, c, b, i, ++ "incorrect max key"); ++ ++ if (write) ++ compat_btree_node(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, bn); ++ ++ /* XXX: ideally we would be validating min_key too */ ++#if 0 ++ /* ++ * not correct anymore, due to btree node write error ++ * handling ++ * ++ * need to add bn->seq to btree keys and 
verify ++ * against that ++ */ ++ btree_err_on(!extent_contains_ptr(bkey_i_to_s_c_extent(&b->key), ++ bn->ptr), ++ BTREE_ERR_FATAL, c, b, i, ++ "incorrect backpointer"); ++#endif ++ err = bch2_bkey_format_validate(&bn->format); ++ btree_err_on(err, ++ BTREE_ERR_FATAL, c, b, i, ++ "invalid bkey format: %s", err); ++ ++ compat_bformat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &bn->format); ++ } ++fsck_err: ++ return ret; ++} ++ ++static int validate_bset_keys(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned *whiteout_u64s, ++ int write, bool have_retry) ++{ ++ unsigned version = le16_to_cpu(i->version); ++ struct bkey_packed *k, *prev = NULL; ++ bool seen_non_whiteout = false; ++ int ret = 0; ++ ++ if (!BSET_SEPARATE_WHITEOUTS(i)) { ++ seen_non_whiteout = true; ++ *whiteout_u64s = 0; ++ } ++ ++ for (k = i->start; ++ k != vstruct_last(i);) { ++ struct bkey_s u; ++ struct bkey tmp; ++ const char *invalid; ++ ++ if (btree_err_on(bkey_next(k) > vstruct_last(i), ++ BTREE_ERR_FIXABLE, c, b, i, ++ "key extends past end of bset")) { ++ i->u64s = cpu_to_le16((u64 *) k - i->_data); ++ break; ++ } ++ ++ if (btree_err_on(k->format > KEY_FORMAT_CURRENT, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey format %u", k->format)) { ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ /* XXX: validate k->u64s */ ++ if (!write) ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); ++ ++ u = __bkey_disassemble(b, k, &tmp); ++ ++ invalid = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b)) ?: ++ bch2_bkey_in_btree_node(b, u.s_c) ?: ++ (write ? bch2_bkey_val_invalid(c, u.s_c) : NULL); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ btree_err(BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey:\n%s\n%s", invalid, buf); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ continue; ++ } ++ ++ if (write) ++ bch2_bkey_compat(b->c.level, b->c.btree_id, version, ++ BSET_BIG_ENDIAN(i), write, ++ &b->format, k); ++ ++ /* ++ * with the separate whiteouts thing (used for extents), the ++ * second set of keys actually can have whiteouts too, so we ++ * can't solely go off bkey_whiteout()... 
++ */ ++ ++ if (!seen_non_whiteout && ++ (!bkey_whiteout(k) || ++ (prev && bkey_iter_cmp(b, prev, k) > 0))) { ++ *whiteout_u64s = k->_data - i->_data; ++ seen_non_whiteout = true; ++ } else if (prev && bkey_iter_cmp(b, prev, k) > 0) { ++ char buf1[80]; ++ char buf2[80]; ++ struct bkey up = bkey_unpack_key(b, prev); ++ ++ bch2_bkey_to_text(&PBUF(buf1), &up); ++ bch2_bkey_to_text(&PBUF(buf2), u.k); ++ ++ bch2_dump_bset(c, b, i, 0); ++ btree_err(BTREE_ERR_FATAL, c, b, i, ++ "keys out of order: %s > %s", ++ buf1, buf2); ++ /* XXX: repair this */ ++ } ++ ++ prev = k; ++ k = bkey_next_skip_noops(k, vstruct_last(i)); ++ } ++fsck_err: ++ return ret; ++} ++ ++int bch2_btree_node_read_done(struct bch_fs *c, struct btree *b, bool have_retry) ++{ ++ struct btree_node_entry *bne; ++ struct sort_iter *iter; ++ struct btree_node *sorted; ++ struct bkey_packed *k; ++ struct bch_extent_ptr *ptr; ++ struct bset *i; ++ bool used_mempool, blacklisted; ++ unsigned u64s; ++ int ret, retry_read = 0, write = READ; ++ ++ iter = mempool_alloc(&c->fill_iter, GFP_NOIO); ++ sort_iter_init(iter, b); ++ iter->size = (btree_blocks(c) + 1) * 2; ++ ++ if (bch2_meta_read_fault("btree")) ++ btree_err(BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "dynamic fault"); ++ ++ btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "bad magic"); ++ ++ btree_err_on(!b->data->keys.seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "bad btree header"); ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bch_btree_ptr_v2 *bp = ++ &bkey_i_to_btree_ptr_v2(&b->key)->v; ++ ++ btree_err_on(b->data->keys.seq != bp->seq, ++ BTREE_ERR_MUST_RETRY, c, b, NULL, ++ "got wrong btree node (seq %llx want %llx)", ++ b->data->keys.seq, bp->seq); ++ } ++ ++ while (b->written < c->opts.btree_node_size) { ++ unsigned sectors, whiteout_u64s = 0; ++ struct nonce nonce; ++ struct bch_csum csum; ++ bool first = !b->written; ++ ++ if (!b->written) { ++ i = &b->data->keys; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "unknown checksum type"); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); ++ ++ btree_err_on(bch2_crc_cmp(csum, b->data->csum), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "invalid checksum"); ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ if (btree_node_is_extents(b) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { ++ set_btree_node_old_extent_overwrite(b); ++ set_btree_node_need_rewrite(b); ++ } ++ ++ sectors = vstruct_sectors(b->data, c->block_bits); ++ } else { ++ bne = write_block(b); ++ i = &bne->keys; ++ ++ if (i->seq != b->data->keys.seq) ++ break; ++ ++ btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "unknown checksum type"); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ btree_err_on(bch2_crc_cmp(csum, bne->csum), ++ BTREE_ERR_WANT_RETRY, c, b, i, ++ "invalid checksum"); ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ sectors = vstruct_sectors(bne, c->block_bits); ++ } ++ ++ ret = validate_bset(c, b, i, sectors, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ if (!b->written) ++ btree_node_set_format(b, b->data->format); ++ ++ ret = validate_bset_keys(c, b, i, &whiteout_u64s, ++ READ, have_retry); ++ if (ret) ++ goto fsck_err; ++ ++ SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); ++ ++ b->written += sectors; ++ ++ blacklisted = bch2_journal_seq_is_blacklisted(c, ++ 
le64_to_cpu(i->journal_seq), ++ true); ++ ++ btree_err_on(blacklisted && first, ++ BTREE_ERR_FIXABLE, c, b, i, ++ "first btree node bset has blacklisted journal seq"); ++ if (blacklisted && !first) ++ continue; ++ ++ sort_iter_add(iter, i->start, ++ vstruct_idx(i, whiteout_u64s)); ++ ++ sort_iter_add(iter, ++ vstruct_idx(i, whiteout_u64s), ++ vstruct_last(i)); ++ } ++ ++ for (bne = write_block(b); ++ bset_byte_offset(b, bne) < btree_bytes(c); ++ bne = (void *) bne + block_bytes(c)) ++ btree_err_on(bne->keys.seq == b->data->keys.seq, ++ BTREE_ERR_WANT_RETRY, c, b, NULL, ++ "found bset signature after last bset"); ++ ++ sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); ++ sorted->keys.u64s = 0; ++ ++ set_btree_bset(b, b->set, &b->data->keys); ++ ++ b->nr = (btree_node_old_extent_overwrite(b) ++ ? bch2_extent_sort_fix_overlapping ++ : bch2_key_sort_fix_overlapping)(c, &sorted->keys, iter); ++ ++ u64s = le16_to_cpu(sorted->keys.u64s); ++ *sorted = *b->data; ++ sorted->keys.u64s = cpu_to_le16(u64s); ++ swap(sorted, b->data); ++ set_btree_bset(b, b->set, &b->data->keys); ++ b->nsets = 1; ++ ++ BUG_ON(b->nr.live_u64s != u64s); ++ ++ btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); ++ ++ i = &b->data->keys; ++ for (k = i->start; k != vstruct_last(i);) { ++ struct bkey tmp; ++ struct bkey_s u = __bkey_disassemble(b, k, &tmp); ++ const char *invalid = bch2_bkey_val_invalid(c, u.s_c); ++ ++ if (invalid || ++ (inject_invalid_keys(c) && ++ !bversion_cmp(u.k->version, MAX_VERSION))) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, u.s_c); ++ btree_err(BTREE_ERR_FIXABLE, c, b, i, ++ "invalid bkey %s: %s", buf, invalid); ++ ++ btree_keys_account_key_drop(&b->nr, 0, k); ++ ++ i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); ++ memmove_u64s_down(k, bkey_next(k), ++ (u64 *) vstruct_end(i) - (u64 *) k); ++ set_btree_bset_end(b, b->set); ++ continue; ++ } ++ ++ if (u.k->type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); ++ ++ bp.v->mem_ptr = 0; ++ } ++ ++ k = bkey_next_skip_noops(k, vstruct_last(i)); ++ } ++ ++ bch2_bset_build_aux_tree(b, b->set, false); ++ ++ set_needs_whiteout(btree_bset_first(b), true); ++ ++ btree_node_reset_sib_u64s(b); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_RW) ++ set_btree_node_need_rewrite(b); ++ } ++out: ++ mempool_free(iter, &c->fill_iter); ++ return retry_read; ++fsck_err: ++ if (ret == BTREE_RETRY_READ) { ++ retry_read = 1; ++ } else { ++ bch2_inconsistent_error(c); ++ set_btree_node_read_error(b); ++ } ++ goto out; ++} ++ ++static void btree_node_read_work(struct work_struct *work) ++{ ++ struct btree_read_bio *rb = ++ container_of(work, struct btree_read_bio, work); ++ struct bch_fs *c = rb->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ struct btree *b = rb->bio.bi_private; ++ struct bio *bio = &rb->bio; ++ struct bch_io_failures failed = { .nr = 0 }; ++ bool can_retry; ++ ++ goto start; ++ while (1) { ++ bch_info(c, "retrying read"); ++ ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ bio_reset(bio); ++ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; ++ bio->bi_iter.bi_sector = rb->pick.ptr.offset; ++ bio->bi_iter.bi_size = btree_bytes(c); ++ ++ if (rb->have_ioref) { ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ submit_bio_wait(bio); ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ } ++start: 
++ bch2_dev_io_err_on(bio->bi_status, ca, "btree read: %s", ++ bch2_blk_status_to_str(bio->bi_status)); ++ if (rb->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ rb->have_ioref = false; ++ ++ bch2_mark_io_failure(&failed, &rb->pick); ++ ++ can_retry = bch2_bkey_pick_read_device(c, ++ bkey_i_to_s_c(&b->key), ++ &failed, &rb->pick) > 0; ++ ++ if (!bio->bi_status && ++ !bch2_btree_node_read_done(c, b, can_retry)) ++ break; ++ ++ if (!can_retry) { ++ set_btree_node_read_error(b); ++ break; ++ } ++ } ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], ++ rb->start_time); ++ bio_put(&rb->bio); ++ clear_btree_node_read_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); ++} ++ ++static void btree_node_read_endio(struct bio *bio) ++{ ++ struct btree_read_bio *rb = ++ container_of(bio, struct btree_read_bio, bio); ++ struct bch_fs *c = rb->c; ++ ++ if (rb->have_ioref) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); ++ bch2_latency_acct(ca, rb->start_time, READ); ++ } ++ ++ queue_work(system_unbound_wq, &rb->work); ++} ++ ++void bch2_btree_node_read(struct bch_fs *c, struct btree *b, ++ bool sync) ++{ ++ struct extent_ptr_decoded pick; ++ struct btree_read_bio *rb; ++ struct bch_dev *ca; ++ struct bio *bio; ++ int ret; ++ ++ trace_btree_read(c, b); ++ ++ ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ++ NULL, &pick); ++ if (bch2_fs_fatal_err_on(ret <= 0, c, ++ "btree node read error: no device to read from")) { ++ set_btree_node_read_error(b); ++ return; ++ } ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ bio = bio_alloc_bioset(GFP_NOIO, buf_pages(b->data, ++ btree_bytes(c)), ++ &c->btree_bio); ++ rb = container_of(bio, struct btree_read_bio, bio); ++ rb->c = c; ++ rb->start_time = local_clock(); ++ rb->have_ioref = bch2_dev_get_ioref(ca, READ); ++ rb->pick = pick; ++ INIT_WORK(&rb->work, btree_node_read_work); ++ bio->bi_opf = REQ_OP_READ|REQ_SYNC|REQ_META; ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bio->bi_end_io = btree_node_read_endio; ++ bio->bi_private = b; ++ bch2_bio_map(bio, b->data, btree_bytes(c)); ++ ++ set_btree_node_read_in_flight(b); ++ ++ if (rb->have_ioref) { ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], ++ bio_sectors(bio)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ ++ if (sync) { ++ submit_bio_wait(bio); ++ ++ bio->bi_private = b; ++ btree_node_read_work(&rb->work); ++ } else { ++ submit_bio(bio); ++ } ++ } else { ++ bio->bi_status = BLK_STS_REMOVED; ++ ++ if (sync) ++ btree_node_read_work(&rb->work); ++ else ++ queue_work(system_unbound_wq, &rb->work); ++ ++ } ++} ++ ++int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, ++ const struct bkey_i *k, unsigned level) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ BUG_ON(IS_ERR(b)); ++ ++ bkey_copy(&b->key, k); ++ BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); ++ ++ bch2_btree_node_read(c, b, true); ++ ++ if (btree_node_read_error(b)) { ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ ret = -EIO; ++ goto err; ++ } ++ ++ bch2_btree_set_root_for_read(c, b); ++err: ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++ ++ return ret; ++} ++ 
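btree_node_read_work() above retries a failed btree-node read against the remaining replicas: each failure is recorded with bch2_mark_io_failure() so bch2_bkey_pick_read_device() will not hand the same device back, and the loop stops once a read succeeds or no replica is left. A minimal standalone sketch of that retry shape follows; it is illustrative only, not part of the patch, and all names (replica, pick_replica, read_from) are hypothetical stand-ins for the kernel APIs.

/* Sketch of the retry-across-replicas loop; hypothetical names, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define NR_REPLICAS 3

struct replica { int dev; bool failed; };

/* Pretend read: only device 2 holds a good copy in this demo. */
static bool read_from(const struct replica *r)
{
	return r->dev == 2;
}

/* Pick the first replica not already marked failed; NULL when none are left. */
static struct replica *pick_replica(struct replica *replicas)
{
	for (int i = 0; i < NR_REPLICAS; i++)
		if (!replicas[i].failed)
			return &replicas[i];
	return NULL;
}

int main(void)
{
	struct replica replicas[NR_REPLICAS] = { { 0 }, { 1 }, { 2 } };
	struct replica *pick;

	/* Retry until a read succeeds or every replica has failed, recording
	 * each failure so it is not picked again (analogous to
	 * bch2_mark_io_failure() in the code above). */
	while ((pick = pick_replica(replicas))) {
		if (read_from(pick)) {
			printf("read succeeded from dev %d\n", pick->dev);
			return 0;
		}
		pick->failed = true;
		printf("read from dev %d failed, retrying\n", pick->dev);
	}

	printf("no replicas left, giving up\n");
	return 1;
}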
++void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, ++ struct btree_write *w) ++{ ++ unsigned long old, new, v = READ_ONCE(b->will_make_reachable); ++ ++ do { ++ old = new = v; ++ if (!(old & 1)) ++ break; ++ ++ new &= ~1UL; ++ } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); ++ ++ if (old & 1) ++ closure_put(&((struct btree_update *) new)->cl); ++ ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++} ++ ++static void btree_node_write_done(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_write *w = btree_prev_write(b); ++ ++ bch2_btree_complete_write(c, b, w); ++ btree_node_io_unlock(b); ++} ++ ++static void bch2_btree_node_write_error(struct bch_fs *c, ++ struct btree_write_bio *wbio) ++{ ++ struct btree *b = wbio->wbio.bio.bi_private; ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++ struct bch_extent_ptr *ptr; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_node_iter(&trans, b->c.btree_id, b->key.k.p, ++ BTREE_MAX_DEPTH, b->c.level, 0); ++retry: ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto err; ++ ++ /* has node been freed? */ ++ if (iter->l[b->c.level].b != b) { ++ /* node has been freed: */ ++ BUG_ON(!btree_node_dying(b)); ++ goto out; ++ } ++ ++ BUG_ON(!btree_node_hashed(b)); ++ ++ bkey_copy(&tmp.k, &b->key); ++ ++ bch2_bkey_drop_ptrs(bkey_i_to_s(&tmp.k), ptr, ++ bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); ++ ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&tmp.k))) ++ goto err; ++ ++ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); ++ if (ret == -EINTR) ++ goto retry; ++ if (ret) ++ goto err; ++out: ++ bch2_trans_exit(&trans); ++ bio_put(&wbio->wbio.bio); ++ btree_node_write_done(c, b); ++ return; ++err: ++ set_btree_node_noevict(b); ++ bch2_fs_fatal_error(c, "fatal error writing btree node"); ++ goto out; ++} ++ ++void bch2_btree_write_error_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ btree_write_error_work); ++ struct bio *bio; ++ ++ while (1) { ++ spin_lock_irq(&c->btree_write_error_lock); ++ bio = bio_list_pop(&c->btree_write_error_list); ++ spin_unlock_irq(&c->btree_write_error_lock); ++ ++ if (!bio) ++ break; ++ ++ bch2_btree_node_write_error(c, ++ container_of(bio, struct btree_write_bio, wbio.bio)); ++ } ++} ++ ++static void btree_node_write_work(struct work_struct *work) ++{ ++ struct btree_write_bio *wbio = ++ container_of(work, struct btree_write_bio, work); ++ struct bch_fs *c = wbio->wbio.c; ++ struct btree *b = wbio->wbio.bio.bi_private; ++ ++ btree_bounce_free(c, ++ wbio->bytes, ++ wbio->wbio.used_mempool, ++ wbio->data); ++ ++ if (wbio->wbio.failed.nr) { ++ unsigned long flags; ++ ++ spin_lock_irqsave(&c->btree_write_error_lock, flags); ++ bio_list_add(&c->btree_write_error_list, &wbio->wbio.bio); ++ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); ++ ++ queue_work(c->wq, &c->btree_write_error_work); ++ return; ++ } ++ ++ bio_put(&wbio->wbio.bio); ++ btree_node_write_done(c, b); ++} ++ ++static void btree_node_write_endio(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; ++ struct bch_write_bio *orig = parent ?: wbio; ++ struct bch_fs *c = wbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ unsigned long flags; ++ ++ if (wbio->have_ioref) ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write: %s", ++ bch2_blk_status_to_str(bio->bi_status)) || ++ bch2_meta_write_fault("btree")) { ++ spin_lock_irqsave(&c->btree_write_error_lock, flags); ++ bch2_dev_list_add_dev(&orig->failed, wbio->dev); ++ spin_unlock_irqrestore(&c->btree_write_error_lock, flags); ++ } ++ ++ if (wbio->have_ioref) ++ percpu_ref_put(&ca->io_ref); ++ ++ if (parent) { ++ bio_put(bio); ++ bio_endio(&parent->bio); ++ } else { ++ struct btree_write_bio *wb = ++ container_of(orig, struct btree_write_bio, wbio); ++ ++ INIT_WORK(&wb->work, btree_node_write_work); ++ queue_work(system_unbound_wq, &wb->work); ++ } ++} ++ ++static int validate_bset_for_write(struct bch_fs *c, struct btree *b, ++ struct bset *i, unsigned sectors) ++{ ++ unsigned whiteout_u64s = 0; ++ int ret; ++ ++ if (bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), BKEY_TYPE_BTREE)) ++ return -1; ++ ++ ret = validate_bset(c, b, i, sectors, WRITE, false) ?: ++ validate_bset_keys(c, b, i, &whiteout_u64s, WRITE, false); ++ if (ret) ++ bch2_inconsistent_error(c); ++ ++ return ret; ++} ++ ++void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_type_held) ++{ ++ struct btree_write_bio *wbio; ++ struct bset_tree *t; ++ struct bset *i; ++ struct btree_node *bn = NULL; ++ struct btree_node_entry *bne = NULL; ++ BKEY_PADDED(key) k; ++ struct bch_extent_ptr *ptr; ++ struct sort_iter sort_iter; ++ struct nonce nonce; ++ unsigned bytes_to_write, sectors_to_write, bytes, u64s; ++ u64 seq = 0; ++ bool used_mempool; ++ unsigned long old, new; ++ bool validate_before_checksum = false; ++ void *data; ++ ++ if (test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)) ++ return; ++ ++ /* ++ * We may only have a read lock on the btree node - the dirty bit is our ++ * "lock" against racing with other threads that may be trying to start ++ * a write, we do a write iff we clear the dirty bit. Since setting the ++ * dirty bit requires a write lock, we can't race with other threads ++ * redirtying it: ++ */ ++ do { ++ old = new = READ_ONCE(b->flags); ++ ++ if (!(old & (1 << BTREE_NODE_dirty))) ++ return; ++ ++ if (!btree_node_may_write(b)) ++ return; ++ ++ if (old & (1 << BTREE_NODE_write_in_flight)) { ++ btree_node_wait_on_io(b); ++ continue; ++ } ++ ++ new &= ~(1 << BTREE_NODE_dirty); ++ new &= ~(1 << BTREE_NODE_need_write); ++ new |= (1 << BTREE_NODE_write_in_flight); ++ new |= (1 << BTREE_NODE_just_written); ++ new ^= (1 << BTREE_NODE_write_idx); ++ } while (cmpxchg_acquire(&b->flags, old, new) != old); ++ ++ BUG_ON(btree_node_fake(b)); ++ BUG_ON((b->will_make_reachable != 0) != !b->written); ++ ++ BUG_ON(b->written >= c->opts.btree_node_size); ++ BUG_ON(b->written & (c->opts.block_size - 1)); ++ BUG_ON(bset_written(b, btree_bset_last(b))); ++ BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); ++ BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); ++ ++ bch2_sort_whiteouts(c, b); ++ ++ sort_iter_init(&sort_iter, b); ++ ++ bytes = !b->written ++ ? 
sizeof(struct btree_node) ++ : sizeof(struct btree_node_entry); ++ ++ bytes += b->whiteout_u64s * sizeof(u64); ++ ++ for_each_bset(b, t) { ++ i = bset(b, t); ++ ++ if (bset_written(b, i)) ++ continue; ++ ++ bytes += le16_to_cpu(i->u64s) * sizeof(u64); ++ sort_iter_add(&sort_iter, ++ btree_bkey_first(b, t), ++ btree_bkey_last(b, t)); ++ seq = max(seq, le64_to_cpu(i->journal_seq)); ++ } ++ ++ data = btree_bounce_alloc(c, bytes, &used_mempool); ++ ++ if (!b->written) { ++ bn = data; ++ *bn = *b->data; ++ i = &bn->keys; ++ } else { ++ bne = data; ++ bne->keys = b->data->keys; ++ i = &bne->keys; ++ } ++ ++ i->journal_seq = cpu_to_le64(seq); ++ i->u64s = 0; ++ ++ if (!btree_node_old_extent_overwrite(b)) { ++ sort_iter_add(&sort_iter, ++ unwritten_whiteouts_start(c, b), ++ unwritten_whiteouts_end(c, b)); ++ SET_BSET_SEPARATE_WHITEOUTS(i, false); ++ } else { ++ memcpy_u64s(i->start, ++ unwritten_whiteouts_start(c, b), ++ b->whiteout_u64s); ++ i->u64s = cpu_to_le16(b->whiteout_u64s); ++ SET_BSET_SEPARATE_WHITEOUTS(i, true); ++ } ++ ++ b->whiteout_u64s = 0; ++ ++ u64s = btree_node_old_extent_overwrite(b) ++ ? bch2_sort_extents(vstruct_last(i), &sort_iter, false) ++ : bch2_sort_keys(i->start, &sort_iter, false); ++ le16_add_cpu(&i->u64s, u64s); ++ ++ set_needs_whiteout(i, false); ++ ++ /* do we have data to write? */ ++ if (b->written && !i->u64s) ++ goto nowrite; ++ ++ bytes_to_write = vstruct_end(i) - data; ++ sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; ++ ++ memset(data + bytes_to_write, 0, ++ (sectors_to_write << 9) - bytes_to_write); ++ ++ BUG_ON(b->written + sectors_to_write > c->opts.btree_node_size); ++ BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); ++ BUG_ON(i->seq != b->data->keys.seq); ++ ++ i->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ ? cpu_to_le16(BCH_BSET_VERSION_OLD) ++ : cpu_to_le16(c->sb.version); ++ SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); ++ ++ if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) ++ validate_before_checksum = true; ++ ++ /* validate_bset will be modifying: */ ++ if (le16_to_cpu(i->version) < bcachefs_metadata_version_max) ++ validate_before_checksum = true; ++ ++ /* if we're going to be encrypting, check metadata validity first: */ ++ if (validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ bset_encrypt(c, i, b->written << 9); ++ ++ nonce = btree_nonce(i, b->written << 9); ++ ++ if (bn) ++ bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); ++ else ++ bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); ++ ++ /* if we're not encrypting, check metadata after checksumming: */ ++ if (!validate_before_checksum && ++ validate_bset_for_write(c, b, i, sectors_to_write)) ++ goto err; ++ ++ /* ++ * We handle btree write errors by immediately halting the journal - ++ * after we've done that, we can't issue any subsequent btree writes ++ * because they might have pointers to new nodes that failed to write. 
++ * ++ * Furthermore, there's no point in doing any more btree writes because ++ * with the journal stopped, we're never going to update the journal to ++ * reflect that those writes were done and the data flushed from the ++ * journal: ++ * ++ * Also on journal error, the pending write may have updates that were ++ * never journalled (interior nodes, see btree_update_nodes_written()) - ++ * it's critical that we don't do the write in that case otherwise we ++ * will have updates visible that weren't in the journal: ++ * ++ * Make sure to update b->written so bch2_btree_init_next() doesn't ++ * break: ++ */ ++ if (bch2_journal_error(&c->journal) || ++ c->opts.nochanges) ++ goto err; ++ ++ trace_btree_write(b, bytes_to_write, sectors_to_write); ++ ++ wbio = container_of(bio_alloc_bioset(GFP_NOIO, ++ buf_pages(data, sectors_to_write << 9), ++ &c->btree_bio), ++ struct btree_write_bio, wbio.bio); ++ wbio_init(&wbio->wbio.bio); ++ wbio->data = data; ++ wbio->bytes = bytes; ++ wbio->wbio.used_mempool = used_mempool; ++ wbio->wbio.bio.bi_opf = REQ_OP_WRITE|REQ_META; ++ wbio->wbio.bio.bi_end_io = btree_node_write_endio; ++ wbio->wbio.bio.bi_private = b; ++ ++ bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); ++ ++ /* ++ * If we're appending to a leaf node, we don't technically need FUA - ++ * this write just needs to be persisted before the next journal write, ++ * which will be marked FLUSH|FUA. ++ * ++ * Similarly if we're writing a new btree root - the pointer is going to ++ * be in the next journal entry. ++ * ++ * But if we're writing a new btree node (that isn't a root) or ++ * appending to a non leaf btree node, we need either FUA or a flush ++ * when we write the parent with the new pointer. FUA is cheaper than a ++ * flush, and writes appending to leaf nodes aren't blocking anything so ++ * just make all btree node writes FUA to keep things sane. ++ */ ++ ++ bkey_copy(&k.key, &b->key); ++ ++ bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&k.key)), ptr) ++ ptr->offset += b->written; ++ ++ b->written += sectors_to_write; ++ ++ /* XXX: submitting IO with btree locks held: */ ++ bch2_submit_wbio_replicas(&wbio->wbio, c, BCH_DATA_btree, &k.key); ++ return; ++err: ++ set_btree_node_noevict(b); ++ b->written += sectors_to_write; ++nowrite: ++ btree_bounce_free(c, bytes, used_mempool, data); ++ btree_node_write_done(c, b); ++} ++ ++/* ++ * Work that must be done with write lock held: ++ */ ++bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) ++{ ++ bool invalidated_iter = false; ++ struct btree_node_entry *bne; ++ struct bset_tree *t; ++ ++ if (!btree_node_just_written(b)) ++ return false; ++ ++ BUG_ON(b->whiteout_u64s); ++ ++ clear_btree_node_just_written(b); ++ ++ /* ++ * Note: immediately after write, bset_written() doesn't work - the ++ * amount of data we had to write after compaction might have been ++ * smaller than the offset of the last bset. 
++ * ++ * However, we know that all bsets have been written here, as long as ++ * we're still holding the write lock: ++ */ ++ ++ /* ++ * XXX: decide if we really want to unconditionally sort down to a ++ * single bset: ++ */ ++ if (b->nsets > 1) { ++ btree_node_sort(c, b, NULL, 0, b->nsets, true); ++ invalidated_iter = true; ++ } else { ++ invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); ++ } ++ ++ for_each_bset(b, t) ++ set_needs_whiteout(bset(b, t), true); ++ ++ bch2_btree_verify(c, b); ++ ++ /* ++ * If later we don't unconditionally sort down to a single bset, we have ++ * to ensure this is still true: ++ */ ++ BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); ++ ++ bne = want_new_bset(c, b); ++ if (bne) ++ bch2_bset_init_next(c, b, bne); ++ ++ bch2_btree_build_aux_trees(b); ++ ++ return invalidated_iter; ++} ++ ++/* ++ * Use this one if the node is intent locked: ++ */ ++void bch2_btree_node_write(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_type_held) ++{ ++ BUG_ON(lock_type_held == SIX_LOCK_write); ++ ++ if (lock_type_held == SIX_LOCK_intent || ++ six_lock_tryupgrade(&b->c.lock)) { ++ __bch2_btree_node_write(c, b, SIX_LOCK_intent); ++ ++ /* don't cycle lock unnecessarily: */ ++ if (btree_node_just_written(b) && ++ six_trylock_write(&b->c.lock)) { ++ bch2_btree_post_write_cleanup(c, b); ++ six_unlock_write(&b->c.lock); ++ } ++ ++ if (lock_type_held == SIX_LOCK_read) ++ six_lock_downgrade(&b->c.lock); ++ } else { ++ __bch2_btree_node_write(c, b, SIX_LOCK_read); ++ } ++} ++ ++static void __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++restart: ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) ++ if (test_bit(flag, &b->flags)) { ++ rcu_read_unlock(); ++ wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); ++ goto restart; ++ ++ } ++ rcu_read_unlock(); ++} ++ ++void bch2_btree_flush_all_reads(struct bch_fs *c) ++{ ++ __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); ++} ++ ++void bch2_btree_flush_all_writes(struct bch_fs *c) ++{ ++ __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); ++} ++ ++void bch2_btree_verify_flushed(struct bch_fs *c) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++ ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) { ++ unsigned long flags = READ_ONCE(b->flags); ++ ++ BUG_ON((flags & (1 << BTREE_NODE_dirty)) || ++ (flags & (1 << BTREE_NODE_write_in_flight))); ++ } ++ rcu_read_unlock(); ++} ++ ++void bch2_dirty_btree_nodes_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct bucket_table *tbl; ++ struct rhash_head *pos; ++ struct btree *b; ++ unsigned i; ++ ++ rcu_read_lock(); ++ for_each_cached_btree(b, c, tbl, i, pos) { ++ unsigned long flags = READ_ONCE(b->flags); ++ ++ if (!(flags & (1 << BTREE_NODE_dirty))) ++ continue; ++ ++ pr_buf(out, "%p d %u n %u l %u w %u b %u r %u:%lu\n", ++ b, ++ (flags & (1 << BTREE_NODE_dirty)) != 0, ++ (flags & (1 << BTREE_NODE_need_write)) != 0, ++ b->c.level, ++ b->written, ++ !list_empty_careful(&b->write_blocked), ++ b->will_make_reachable != 0, ++ b->will_make_reachable & 1); ++ } ++ rcu_read_unlock(); ++} +diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h +new file mode 100644 +index 000000000000..626d0f071b70 +--- /dev/null ++++ b/fs/bcachefs/btree_io.h +@@ -0,0 +1,220 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_IO_H ++#define _BCACHEFS_BTREE_IO_H 
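__bch2_btree_flush_all() above scans the cached-node table under RCU and, on finding a node with the given in-flight bit set, drops the RCU read lock, waits for the bit to clear, and restarts the scan from the top, since the table may have changed while it slept. A simplified sketch of that scan/wait/restart shape follows; it is not part of the patch, the names are hypothetical, a plain array stands in for the RCU hash table, and a polling loop stands in for wait_on_bit_io().

/* Sketch of the "scan, wait for in-flight bit, restart" pattern; hypothetical
 * names, polling in place of wait_on_bit_io(), array in place of the RCU table. */
#include <stdatomic.h>
#include <stdio.h>

#define NR_NODES 4
enum { NODE_write_in_flight };

static _Atomic unsigned long node_flags[NR_NODES];

static void wait_for_bit_clear(_Atomic unsigned long *flags, int bit)
{
	while (atomic_load(flags) & (1UL << bit))
		;	/* the kernel sleeps here; this sketch just polls */
}

static void flush_all(int bit)
{
restart:
	for (int i = 0; i < NR_NODES; i++) {
		if (atomic_load(&node_flags[i]) & (1UL << bit)) {
			/* in the kernel: rcu_read_unlock() before sleeping */
			wait_for_bit_clear(&node_flags[i], bit);
			goto restart;	/* table may have changed while waiting */
		}
	}
}

int main(void)
{
	/* Nothing is in flight in this single-threaded demo, so the scan
	 * finishes immediately; with a writer thread, flush_all() would
	 * block until every write_in_flight bit had cleared. */
	flush_all(NODE_write_in_flight);
	printf("all flushed\n");
	return 0;
}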
++ ++#include "bkey_methods.h" ++#include "bset.h" ++#include "btree_locking.h" ++#include "checksum.h" ++#include "extents.h" ++#include "io_types.h" ++ ++struct bch_fs; ++struct btree_write; ++struct btree; ++struct btree_iter; ++ ++struct btree_read_bio { ++ struct bch_fs *c; ++ u64 start_time; ++ unsigned have_ioref:1; ++ struct extent_ptr_decoded pick; ++ struct work_struct work; ++ struct bio bio; ++}; ++ ++struct btree_write_bio { ++ struct work_struct work; ++ void *data; ++ unsigned bytes; ++ struct bch_write_bio wbio; ++}; ++ ++static inline void btree_node_io_unlock(struct btree *b) ++{ ++ EBUG_ON(!btree_node_write_in_flight(b)); ++ clear_btree_node_write_in_flight(b); ++ wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); ++} ++ ++static inline void btree_node_io_lock(struct btree *b) ++{ ++ wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++static inline void btree_node_wait_on_io(struct btree *b) ++{ ++ wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, ++ TASK_UNINTERRUPTIBLE); ++} ++ ++static inline bool btree_node_may_write(struct btree *b) ++{ ++ return list_empty_careful(&b->write_blocked) && ++ (!b->written || !b->will_make_reachable); ++} ++ ++enum compact_mode { ++ COMPACT_LAZY, ++ COMPACT_ALL, ++}; ++ ++bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, ++ enum compact_mode); ++ ++static inline bool should_compact_bset_lazy(struct btree *b, ++ struct bset_tree *t) ++{ ++ unsigned total_u64s = bset_u64s(t); ++ unsigned dead_u64s = bset_dead_u64s(b, t); ++ ++ return dead_u64s > 64 && dead_u64s * 3 > total_u64s; ++} ++ ++static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) ++{ ++ struct bset_tree *t; ++ ++ for_each_bset(b, t) ++ if (should_compact_bset_lazy(b, t)) ++ return bch2_compact_whiteouts(c, b, COMPACT_LAZY); ++ ++ return false; ++} ++ ++static inline struct nonce btree_nonce(struct bset *i, unsigned offset) ++{ ++ return (struct nonce) {{ ++ [0] = cpu_to_le32(offset), ++ [1] = ((__le32 *) &i->seq)[0], ++ [2] = ((__le32 *) &i->seq)[1], ++ [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, ++ }}; ++} ++ ++static inline void bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) ++{ ++ struct nonce nonce = btree_nonce(i, offset); ++ ++ if (!offset) { ++ struct btree_node *bn = container_of(i, struct btree_node, keys); ++ unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; ++ ++ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, &bn->flags, ++ bytes); ++ ++ nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); ++ } ++ ++ bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, ++ vstruct_end(i) - (void *) i->_data); ++} ++ ++void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); ++ ++void bch2_btree_build_aux_trees(struct btree *); ++void bch2_btree_init_next(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++ ++int bch2_btree_node_read_done(struct bch_fs *, struct btree *, bool); ++void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); ++int bch2_btree_root_read(struct bch_fs *, enum btree_id, ++ const struct bkey_i *, unsigned); ++ ++void bch2_btree_complete_write(struct bch_fs *, struct btree *, ++ struct btree_write *); ++void bch2_btree_write_error_work(struct work_struct *); ++ ++void __bch2_btree_node_write(struct bch_fs *, struct btree *, ++ enum six_lock_type); ++bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); ++ ++void bch2_btree_node_write(struct bch_fs *, struct btree *, ++ enum six_lock_type); 
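__bch2_btree_node_write() in btree_io.c and the bch2_btree_node_write_cond() macro just below both update the node's flags word with a compare-and-swap loop, so that exactly one thread claims a pending write by clearing the dirty bit while marking the write in flight. A minimal sketch of that claim pattern follows, using C11 atomics in place of the kernel's cmpxchg_acquire(); it is illustrative only and every name in it is hypothetical.

/* Sketch of the "claim the write by clearing the dirty bit" CAS pattern on a
 * flags word; hypothetical names, C11 atomics instead of cmpxchg_acquire(). */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

enum { NODE_dirty, NODE_need_write, NODE_write_in_flight };

static _Atomic unsigned long node_flags = 1UL << NODE_dirty;

/* Returns true if this caller won the right to issue the write. */
static bool claim_write(void)
{
	unsigned long old = atomic_load(&node_flags), new;

	do {
		if (!(old & (1UL << NODE_dirty)))
			return false;	/* nothing dirty, or another thread won */

		new = old;
		new &= ~(1UL << NODE_dirty);
		new &= ~(1UL << NODE_need_write);
		new |= 1UL << NODE_write_in_flight;
	} while (!atomic_compare_exchange_weak(&node_flags, &old, new));

	return true;
}

int main(void)
{
	printf("first claim:  %s\n", claim_write() ? "won" : "lost");
	printf("second claim: %s\n", claim_write() ? "won" : "lost");
	return 0;
}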
++ ++static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, ++ enum six_lock_type lock_held) ++{ ++ while (b->written && ++ btree_node_need_write(b) && ++ btree_node_may_write(b)) { ++ if (!btree_node_write_in_flight(b)) { ++ bch2_btree_node_write(c, b, lock_held); ++ break; ++ } ++ ++ six_unlock_type(&b->c.lock, lock_held); ++ btree_node_wait_on_io(b); ++ btree_node_lock_type(c, b, lock_held); ++ } ++} ++ ++#define bch2_btree_node_write_cond(_c, _b, cond) \ ++do { \ ++ unsigned long old, new, v = READ_ONCE((_b)->flags); \ ++ \ ++ do { \ ++ old = new = v; \ ++ \ ++ if (!(old & (1 << BTREE_NODE_dirty)) || !(cond)) \ ++ break; \ ++ \ ++ new |= (1 << BTREE_NODE_need_write); \ ++ } while ((v = cmpxchg(&(_b)->flags, old, new)) != old); \ ++ \ ++ btree_node_write_if_need(_c, _b, SIX_LOCK_read); \ ++} while (0) ++ ++void bch2_btree_flush_all_reads(struct bch_fs *); ++void bch2_btree_flush_all_writes(struct bch_fs *); ++void bch2_btree_verify_flushed(struct bch_fs *); ++void bch2_dirty_btree_nodes_to_text(struct printbuf *, struct bch_fs *); ++ ++static inline void compat_bformat(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bkey_format *f) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) { ++ swap(f->bits_per_field[BKEY_FIELD_INODE], ++ f->bits_per_field[BKEY_FIELD_OFFSET]); ++ swap(f->field_offset[BKEY_FIELD_INODE], ++ f->field_offset[BKEY_FIELD_OFFSET]); ++ } ++} ++ ++static inline void compat_bpos(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, struct bpos *p) ++{ ++ if (big_endian != CPU_BIG_ENDIAN) ++ bch2_bpos_swab(p); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_id == BTREE_ID_INODES) ++ swap(p->inode, p->offset); ++} ++ ++static inline void compat_btree_node(unsigned level, enum btree_id btree_id, ++ unsigned version, unsigned big_endian, ++ int write, ++ struct btree_node *bn) ++{ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bn->min_key, POS_MIN) && ++ write) ++ bn->min_key = bkey_predecessor(bn->min_key); ++ ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); ++ compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bn->min_key, POS_MIN) && ++ !write) ++ bn->min_key = bkey_successor(bn->min_key); ++} ++ ++#endif /* _BCACHEFS_BTREE_IO_H */ +diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c +new file mode 100644 +index 000000000000..6fab76c3220c +--- /dev/null ++++ b/fs/bcachefs/btree_iter.c +@@ -0,0 +1,2445 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "debug.h" ++#include "extents.h" ++#include "journal.h" ++ ++#include ++#include ++ ++static inline bool is_btree_node(struct btree_iter *iter, unsigned l) ++{ ++ return l < BTREE_MAX_DEPTH && ++ (unsigned long) iter->l[l].b >= 128; ++} ++ ++static inline struct bpos btree_iter_search_key(struct btree_iter *iter) ++{ ++ struct bpos pos = iter->pos; ++ ++ if ((iter->flags & BTREE_ITER_IS_EXTENTS) && ++ bkey_cmp(pos, POS_MAX)) ++ pos = bkey_successor(pos); ++ return pos; ++} ++ ++static 
inline bool btree_iter_pos_before_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0; ++} ++ ++static inline bool btree_iter_pos_after_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0; ++} ++ ++static inline bool btree_iter_pos_in_node(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return iter->btree_id == b->c.btree_id && ++ !btree_iter_pos_before_node(iter, b) && ++ !btree_iter_pos_after_node(iter, b); ++} ++ ++/* Btree node locking: */ ++ ++void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter) ++{ ++ bch2_btree_node_unlock_write_inlined(b, iter); ++} ++ ++void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ unsigned readers = 0; ++ ++ EBUG_ON(!btree_node_intent_locked(iter, b->c.level)); ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked->l[b->c.level].b == b && ++ btree_node_read_locked(linked, b->c.level)) ++ readers++; ++ ++ /* ++ * Must drop our read locks before calling six_lock_write() - ++ * six_unlock() won't do wakeups until the reader count ++ * goes to 0, and it's safe because we have the node intent ++ * locked: ++ */ ++ atomic64_sub(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++ btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write); ++ atomic64_add(__SIX_VAL(read_lock, readers), ++ &b->c.lock.state.counter); ++} ++ ++bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level) ++{ ++ struct btree *b = btree_iter_node(iter, level); ++ int want = __btree_lock_want(iter, level); ++ ++ if (!is_btree_node(iter, level)) ++ return false; ++ ++ if (race_fault()) ++ return false; ++ ++ if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) || ++ (btree_node_lock_seq_matches(iter, b, level) && ++ btree_node_lock_increment(iter->trans, b, level, want))) { ++ mark_btree_node_locked(iter, level, want); ++ return true; ++ } else { ++ return false; ++ } ++} ++ ++static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level) ++{ ++ struct btree *b = iter->l[level].b; ++ ++ EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED); ++ ++ if (!is_btree_node(iter, level)) ++ return false; ++ ++ if (btree_node_intent_locked(iter, level)) ++ return true; ++ ++ if (race_fault()) ++ return false; ++ ++ if (btree_node_locked(iter, level) ++ ? six_lock_tryupgrade(&b->c.lock) ++ : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq)) ++ goto success; ++ ++ if (btree_node_lock_seq_matches(iter, b, level) && ++ btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) { ++ btree_node_unlock(iter, level); ++ goto success; ++ } ++ ++ return false; ++success: ++ mark_btree_node_intent_locked(iter, level); ++ return true; ++} ++ ++static inline bool btree_iter_get_locks(struct btree_iter *iter, ++ bool upgrade, bool trace) ++{ ++ unsigned l = iter->level; ++ int fail_idx = -1; ++ ++ do { ++ if (!btree_iter_node(iter, l)) ++ break; ++ ++ if (!(upgrade ++ ? bch2_btree_node_upgrade(iter, l) ++ : bch2_btree_node_relock(iter, l))) { ++ if (trace) ++ (upgrade ++ ? trace_node_upgrade_fail ++ : trace_node_relock_fail)(l, iter->l[l].lock_seq, ++ is_btree_node(iter, l) ++ ? 0 ++ : (unsigned long) iter->l[l].b, ++ is_btree_node(iter, l) ++ ? 
iter->l[l].b->c.lock.state.seq ++ : 0); ++ ++ fail_idx = l; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ } ++ ++ l++; ++ } while (l < iter->locks_want); ++ ++ /* ++ * When we fail to get a lock, we have to ensure that any child nodes ++ * can't be relocked so bch2_btree_iter_traverse has to walk back up to ++ * the node that we failed to relock: ++ */ ++ while (fail_idx >= 0) { ++ btree_node_unlock(iter, fail_idx); ++ iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS; ++ --fail_idx; ++ } ++ ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ return iter->uptodate < BTREE_ITER_NEED_RELOCK; ++} ++ ++static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b, ++ enum btree_iter_type type) ++{ ++ return type != BTREE_ITER_CACHED ++ ? container_of(_b, struct btree, c)->key.k.p ++ : container_of(_b, struct bkey_cached, c)->key.pos; ++} ++ ++/* Slowpath: */ ++bool __bch2_btree_node_lock(struct btree *b, struct bpos pos, ++ unsigned level, struct btree_iter *iter, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, ++ void *p) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_iter *linked; ++ u64 start_time = local_clock(); ++ bool ret = true; ++ ++ /* Check if it's safe to block: */ ++ trans_for_each_iter(trans, linked) { ++ if (!linked->nodes_locked) ++ continue; ++ ++ /* ++ * Can't block taking an intent lock if we have _any_ nodes read ++ * locked: ++ * ++ * - Our read lock blocks another thread with an intent lock on ++ * the same node from getting a write lock, and thus from ++ * dropping its intent lock ++ * ++ * - And the other thread may have multiple nodes intent locked: ++ * both the node we want to intent lock, and the node we ++ * already have read locked - deadlock: ++ */ ++ if (type == SIX_LOCK_intent && ++ linked->nodes_locked != linked->nodes_intent_locked) { ++ if (!(trans->nounlock)) { ++ linked->locks_want = max_t(unsigned, ++ linked->locks_want, ++ __fls(linked->nodes_locked) + 1); ++ if (!btree_iter_get_locks(linked, true, false)) ++ ret = false; ++ } else { ++ ret = false; ++ } ++ } ++ ++ /* ++ * Interior nodes must be locked before their descendants: if ++ * another iterator has possible descendants locked of the node ++ * we're about to lock, it must have the ancestors locked too: ++ */ ++ if (linked->btree_id == iter->btree_id && ++ level > __fls(linked->nodes_locked)) { ++ if (!(trans->nounlock)) { ++ linked->locks_want = ++ max(level + 1, max_t(unsigned, ++ linked->locks_want, ++ iter->locks_want)); ++ if (!btree_iter_get_locks(linked, true, false)) ++ ret = false; ++ } else { ++ ret = false; ++ } ++ } ++ ++ /* Must lock btree nodes in key order: */ ++ if ((cmp_int(iter->btree_id, linked->btree_id) ?: ++ -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) ++ ret = false; ++ ++ if (iter->btree_id == linked->btree_id && ++ btree_node_locked(linked, level) && ++ bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b, ++ btree_iter_type(linked))) <= 0) ++ ret = false; ++ ++ /* ++ * Recheck if this is a node we already have locked - since one ++ * of the get_locks() calls might've successfully ++ * upgraded/relocked it: ++ */ ++ if (linked->l[level].b == b && ++ btree_node_locked_type(linked, level) >= type) { ++ six_lock_increment(&b->c.lock, type); ++ return true; ++ } ++ } ++ ++ if (unlikely(!ret)) { ++ trace_trans_restart_would_deadlock(iter->trans->ip); ++ return false; ++ } ++ ++ if 
(six_trylock_type(&b->c.lock, type)) ++ return true; ++ ++ if (six_lock_type(&b->c.lock, type, should_sleep_fn, p)) ++ return false; ++ ++ bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)], ++ start_time); ++ return true; ++} ++ ++/* Btree iterator locking: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++static void bch2_btree_iter_verify_locks(struct btree_iter *iter) ++{ ++ unsigned l; ++ ++ if (!(iter->trans->iters_linked & (1ULL << iter->idx))) { ++ BUG_ON(iter->nodes_locked); ++ return; ++ } ++ ++ for (l = 0; is_btree_node(iter, l); l++) { ++ if (iter->uptodate >= BTREE_ITER_NEED_RELOCK && ++ !btree_node_locked(iter, l)) ++ continue; ++ ++ BUG_ON(btree_lock_want(iter, l) != ++ btree_node_locked_type(iter, l)); ++ } ++} ++ ++void bch2_btree_trans_verify_locks(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter_all(trans, iter) ++ bch2_btree_iter_verify_locks(iter); ++} ++#else ++static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {} ++#endif ++ ++__flatten ++bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace) ++{ ++ return btree_iter_get_locks(iter, false, trace); ++} ++ ++bool __bch2_btree_iter_upgrade(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ struct btree_iter *linked; ++ ++ EBUG_ON(iter->locks_want >= new_locks_want); ++ ++ iter->locks_want = new_locks_want; ++ ++ if (btree_iter_get_locks(iter, true, true)) ++ return true; ++ ++ /* ++ * Ancestor nodes must be locked before child nodes, so set locks_want ++ * on iterators that might lock ancestors before us to avoid getting ++ * -EINTR later: ++ */ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked != iter && ++ linked->btree_id == iter->btree_id && ++ linked->locks_want < new_locks_want) { ++ linked->locks_want = new_locks_want; ++ btree_iter_get_locks(linked, true, false); ++ } ++ ++ return false; ++} ++ ++bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ unsigned l = iter->level; ++ ++ EBUG_ON(iter->locks_want >= new_locks_want); ++ ++ iter->locks_want = new_locks_want; ++ ++ do { ++ if (!btree_iter_node(iter, l)) ++ break; ++ ++ if (!bch2_btree_node_upgrade(iter, l)) { ++ iter->locks_want = l; ++ return false; ++ } ++ ++ l++; ++ } while (l < iter->locks_want); ++ ++ return true; ++} ++ ++void __bch2_btree_iter_downgrade(struct btree_iter *iter, ++ unsigned downgrade_to) ++{ ++ unsigned l, new_locks_want = downgrade_to ?: ++ (iter->flags & BTREE_ITER_INTENT ? 
1 : 0); ++ ++ if (iter->locks_want < downgrade_to) { ++ iter->locks_want = new_locks_want; ++ ++ while (iter->nodes_locked && ++ (l = __fls(iter->nodes_locked)) >= iter->locks_want) { ++ if (l > iter->level) { ++ btree_node_unlock(iter, l); ++ } else { ++ if (btree_node_intent_locked(iter, l)) { ++ six_lock_downgrade(&iter->l[l].b->c.lock); ++ iter->nodes_intent_locked ^= 1 << l; ++ } ++ break; ++ } ++ } ++ } ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++} ++ ++void bch2_trans_downgrade(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ bch2_btree_iter_downgrade(iter); ++} ++ ++/* Btree transaction locking: */ ++ ++bool bch2_trans_relock(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ bool ret = true; ++ ++ trans_for_each_iter(trans, iter) ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK) ++ ret &= bch2_btree_iter_relock(iter, true); ++ ++ return ret; ++} ++ ++void bch2_trans_unlock(struct btree_trans *trans) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ __bch2_btree_iter_unlock(iter); ++} ++ ++/* Btree iterator: */ ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++static void bch2_btree_iter_verify_cached(struct btree_iter *iter) ++{ ++ struct bkey_cached *ck; ++ bool locked = btree_node_locked(iter, 0); ++ ++ if (!bch2_btree_node_relock(iter, 0)) ++ return; ++ ++ ck = (void *) iter->l[0].b; ++ BUG_ON(ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)); ++ ++ if (!locked) ++ btree_node_unlock(iter, 0); ++} ++ ++static void bch2_btree_iter_verify_level(struct btree_iter *iter, ++ unsigned level) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct btree_iter_level *l = &iter->l[level]; ++ struct btree_node_iter tmp = l->iter; ++ bool locked = btree_node_locked(iter, level); ++ struct bkey_packed *p, *k; ++ char buf1[100], buf2[100]; ++ const char *msg; ++ ++ if (!debug_check_iterators(iter->trans->c)) ++ return; ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ if (!level) ++ bch2_btree_iter_verify_cached(iter); ++ return; ++ } ++ ++ BUG_ON(iter->level < iter->min_depth); ++ ++ if (!btree_iter_node(iter, level)) ++ return; ++ ++ if (!bch2_btree_node_relock(iter, level)) ++ return; ++ ++ /* ++ * Ideally this invariant would always be true, and hopefully in the ++ * future it will be, but for now set_pos_same_leaf() breaks it: ++ */ ++ BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE && ++ !btree_iter_pos_in_node(iter, l->b)); ++ ++ /* ++ * node iterators don't use leaf node iterator: ++ */ ++ if (btree_iter_type(iter) == BTREE_ITER_NODES && ++ level <= iter->min_depth) ++ goto unlock; ++ ++ bch2_btree_node_iter_verify(&l->iter, l->b); ++ ++ /* ++ * For interior nodes, the iterator will have skipped past ++ * deleted keys: ++ * ++ * For extents, the iterator may have skipped past deleted keys (but not ++ * whiteouts) ++ */ ++ p = level || btree_node_type_is_extents(iter->btree_id) ++ ? 
bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard) ++ : bch2_btree_node_iter_prev_all(&tmp, l->b); ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ ++ if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) { ++ msg = "before"; ++ goto err; ++ } ++ ++ if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) { ++ msg = "after"; ++ goto err; ++ } ++unlock: ++ if (!locked) ++ btree_node_unlock(iter, level); ++ return; ++err: ++ strcpy(buf1, "(none)"); ++ strcpy(buf2, "(none)"); ++ ++ if (p) { ++ struct bkey uk = bkey_unpack_key(l->b, p); ++ bch2_bkey_to_text(&PBUF(buf1), &uk); ++ } ++ ++ if (k) { ++ struct bkey uk = bkey_unpack_key(l->b, k); ++ bch2_bkey_to_text(&PBUF(buf2), &uk); ++ } ++ ++ panic("iterator should be %s key at level %u:\n" ++ "iter pos %s %llu:%llu\n" ++ "prev key %s\n" ++ "cur key %s\n", ++ msg, level, ++ iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>", ++ iter->pos.inode, iter->pos.offset, ++ buf1, buf2); ++} ++ ++static void bch2_btree_iter_verify(struct btree_iter *iter) ++{ ++ unsigned i; ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ bch2_btree_iter_verify_level(iter, i); ++} ++ ++void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b) ++{ ++ struct btree_iter *iter; ++ ++ if (!debug_check_iterators(trans->c)) ++ return; ++ ++ trans_for_each_iter_with_node(trans, b, iter) ++ bch2_btree_iter_verify_level(iter, b->c.level); ++} ++ ++#else ++ ++static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {} ++static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} ++ ++#endif ++ ++static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, ++ struct btree *b, ++ struct bset_tree *t, ++ struct bkey_packed *k) ++{ ++ struct btree_node_iter_set *set; ++ ++ btree_node_iter_for_each(iter, set) ++ if (set->end == t->end_offset) { ++ set->k = __btree_node_key_to_offset(b, k); ++ bch2_btree_node_iter_sort(iter, b); ++ return; ++ } ++ ++ bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); ++} ++ ++static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_iter_level *l = &iter->l[b->c.level]; ++ struct bpos pos = btree_iter_search_key(iter); ++ ++ if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) ++ return; ++ ++ if (bkey_iter_pos_cmp(l->b, where, &pos) < 0) ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++void bch2_btree_iter_fix_key_modified(struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_packed *where) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) { ++ __bch2_btree_iter_fix_key_modified(linked, b, where); ++ bch2_btree_iter_verify_level(linked, b->c.level); ++ } ++} ++ ++static void __bch2_btree_node_iter_fix(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bset_tree *t, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ const struct bkey_packed *end = btree_bkey_last(b, t); ++ struct btree_node_iter_set *set; ++ unsigned offset = __btree_node_key_to_offset(b, where); ++ int shift = new_u64s - clobber_u64s; ++ unsigned old_end = t->end_offset - shift; ++ unsigned orig_iter_pos = node_iter->data[0].k; ++ bool iter_current_key_modified = ++ orig_iter_pos >= offset && ++ orig_iter_pos <= offset + clobber_u64s; ++ struct bpos iter_pos = 
btree_iter_search_key(iter); ++ ++ btree_node_iter_for_each(node_iter, set) ++ if (set->end == old_end) ++ goto found; ++ ++ /* didn't find the bset in the iterator - might have to readd it: */ ++ if (new_u64s && ++ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { ++ bch2_btree_node_iter_push(node_iter, b, where, end); ++ goto fixup_done; ++ } else { ++ /* Iterator is after key that changed */ ++ return; ++ } ++found: ++ set->end = t->end_offset; ++ ++ /* Iterator hasn't gotten to the key that changed yet: */ ++ if (set->k < offset) ++ return; ++ ++ if (new_u64s && ++ bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) { ++ set->k = offset; ++ } else if (set->k < offset + clobber_u64s) { ++ set->k = offset + new_u64s; ++ if (set->k == set->end) ++ bch2_btree_node_iter_set_drop(node_iter, set); ++ } else { ++ /* Iterator is after key that changed */ ++ set->k = (int) set->k + shift; ++ return; ++ } ++ ++ bch2_btree_node_iter_sort(node_iter, b); ++fixup_done: ++ if (node_iter->data[0].k != orig_iter_pos) ++ iter_current_key_modified = true; ++ ++ /* ++ * When a new key is added, and the node iterator now points to that ++ * key, the iterator might have skipped past deleted keys that should ++ * come after the key the iterator now points to. We have to rewind to ++ * before those deleted keys - otherwise ++ * bch2_btree_node_iter_prev_all() breaks: ++ */ ++ if (!bch2_btree_node_iter_end(node_iter) && ++ iter_current_key_modified && ++ (b->c.level || ++ btree_node_type_is_extents(iter->btree_id))) { ++ struct bset_tree *t; ++ struct bkey_packed *k, *k2, *p; ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ ++ for_each_bset(b, t) { ++ bool set_pos = false; ++ ++ if (node_iter->data[0].end == t->end_offset) ++ continue; ++ ++ k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); ++ ++ while ((p = bch2_bkey_prev_all(b, t, k2)) && ++ bkey_iter_cmp(b, k, p) < 0) { ++ k2 = p; ++ set_pos = true; ++ } ++ ++ if (set_pos) ++ btree_node_iter_set_set_pos(node_iter, ++ b, t, k2); ++ } ++ } ++ ++ if (!b->c.level && ++ node_iter == &iter->l[0].iter && ++ iter_current_key_modified) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++void bch2_btree_node_iter_fix(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_packed *where, ++ unsigned clobber_u64s, ++ unsigned new_u64s) ++{ ++ struct bset_tree *t = bch2_bkey_to_bset(b, where); ++ struct btree_iter *linked; ++ ++ if (node_iter != &iter->l[b->c.level].iter) { ++ __bch2_btree_node_iter_fix(iter, b, node_iter, t, ++ where, clobber_u64s, new_u64s); ++ ++ if (debug_check_iterators(iter->trans->c)) ++ bch2_btree_node_iter_verify(node_iter, b); ++ } ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) { ++ __bch2_btree_node_iter_fix(linked, b, ++ &linked->l[b->c.level].iter, t, ++ where, clobber_u64s, new_u64s); ++ bch2_btree_iter_verify_level(linked, b->c.level); ++ } ++} ++ ++static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ struct bkey *u, ++ struct bkey_packed *k) ++{ ++ struct bkey_s_c ret; ++ ++ if (unlikely(!k)) { ++ /* ++ * signal to bch2_btree_iter_peek_slot() that we're currently at ++ * a hole ++ */ ++ u->type = KEY_TYPE_deleted; ++ return bkey_s_c_null; ++ } ++ ++ ret = bkey_disassemble(l->b, k, u); ++ ++ if (debug_check_bkeys(iter->trans->c)) ++ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); ++ ++ return ret; ++} ++ ++/* peek_all() doesn't skip deleted keys */ ++static inline struct bkey_s_c __btree_iter_peek_all(struct 
btree_iter *iter, ++ struct btree_iter_level *l, ++ struct bkey *u) ++{ ++ return __btree_iter_unpack(iter, l, u, ++ bch2_btree_node_iter_peek_all(&l->iter, l->b)); ++} ++ ++static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter, ++ struct btree_iter_level *l) ++{ ++ return __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++} ++ ++static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter, ++ struct btree_iter_level *l) ++{ ++ return __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_prev(&l->iter, l->b)); ++} ++ ++static inline bool btree_iter_advance_to_pos(struct btree_iter *iter, ++ struct btree_iter_level *l, ++ int max_advance) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct bkey_packed *k; ++ int nr_advanced = 0; ++ ++ while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && ++ bkey_iter_pos_cmp(l->b, k, &pos) < 0) { ++ if (max_advance > 0 && nr_advanced >= max_advance) ++ return false; ++ ++ bch2_btree_node_iter_advance(&l->iter, l->b); ++ nr_advanced++; ++ } ++ ++ return true; ++} ++ ++/* ++ * Verify that iterator for parent node points to child node: ++ */ ++static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter_level *l; ++ unsigned plevel; ++ bool parent_locked; ++ struct bkey_packed *k; ++ ++ if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ return; ++ ++ plevel = b->c.level + 1; ++ if (!btree_iter_node(iter, plevel)) ++ return; ++ ++ parent_locked = btree_node_locked(iter, plevel); ++ ++ if (!bch2_btree_node_relock(iter, plevel)) ++ return; ++ ++ l = &iter->l[plevel]; ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ if (!k || ++ bkey_deleted(k) || ++ bkey_cmp_left_packed(l->b, k, &b->key.k.p)) { ++ char buf[100]; ++ struct bkey uk = bkey_unpack_key(b, k); ++ ++ bch2_bkey_to_text(&PBUF(buf), &uk); ++ panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n", ++ buf, b->key.k.p.inode, b->key.k.p.offset); ++ } ++ ++ if (!parent_locked) ++ btree_node_unlock(iter, b->c.level + 1); ++} ++ ++static inline void __btree_iter_init(struct btree_iter *iter, ++ unsigned level) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct btree_iter_level *l = &iter->l[level]; ++ ++ bch2_btree_node_iter_init(&l->iter, l->b, &pos); ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++static inline void btree_iter_node_set(struct btree_iter *iter, ++ struct btree *b) ++{ ++ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); ++ ++ btree_iter_verify_new_node(iter, b); ++ ++ EBUG_ON(!btree_iter_pos_in_node(iter, b)); ++ EBUG_ON(b->c.lock.state.seq & 1); ++ ++ iter->l[b->c.level].lock_seq = b->c.lock.state.seq; ++ iter->l[b->c.level].b = b; ++ __btree_iter_init(iter, b->c.level); ++} ++ ++/* ++ * A btree node is being replaced - update the iterator to point to the new ++ * node: ++ */ ++void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b) ++{ ++ enum btree_node_locked_type t; ++ struct btree_iter *linked; ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (btree_iter_type(linked) != BTREE_ITER_CACHED && ++ btree_iter_pos_in_node(linked, b)) { ++ /* ++ * bch2_btree_iter_node_drop() has already been called - ++ * the old node we're replacing has already been ++ * unlocked and the pointer invalidated ++ */ ++ BUG_ON(btree_node_locked(linked, b->c.level)); ++ ++ t = btree_lock_want(linked, b->c.level); ++ if (t != BTREE_NODE_UNLOCKED) { ++ six_lock_increment(&b->c.lock, t); ++ 
mark_btree_node_locked(linked, b->c.level, t); ++ } ++ ++ btree_iter_node_set(linked, b); ++ } ++} ++ ++void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter *linked; ++ unsigned level = b->c.level; ++ ++ trans_for_each_iter(iter->trans, linked) ++ if (linked->l[level].b == b) { ++ __btree_node_unlock(linked, level); ++ linked->l[level].b = BTREE_ITER_NO_NODE_DROP; ++ } ++} ++ ++/* ++ * A btree node has been modified in such a way as to invalidate iterators - fix ++ * them: ++ */ ++void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ __btree_iter_init(linked, b->c.level); ++} ++ ++static int lock_root_check_fn(struct six_lock *lock, void *p) ++{ ++ struct btree *b = container_of(lock, struct btree, c.lock); ++ struct btree **rootp = p; ++ ++ return b == *rootp ? 0 : -1; ++} ++ ++static inline int btree_iter_lock_root(struct btree_iter *iter, ++ unsigned depth_want) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b; ++ enum six_lock_type lock_type; ++ unsigned i; ++ ++ EBUG_ON(iter->nodes_locked); ++ ++ while (1) { ++ b = READ_ONCE(*rootp); ++ iter->level = READ_ONCE(b->c.level); ++ ++ if (unlikely(iter->level < depth_want)) { ++ /* ++ * the root is at a lower depth than the depth we want: ++ * got to the end of the btree, or we're walking nodes ++ * greater than some depth and there are no nodes >= ++ * that depth ++ */ ++ iter->level = depth_want; ++ for (i = iter->level; i < BTREE_MAX_DEPTH; i++) ++ iter->l[i].b = NULL; ++ return 1; ++ } ++ ++ lock_type = __btree_lock_want(iter, iter->level); ++ if (unlikely(!btree_node_lock(b, POS_MAX, iter->level, ++ iter, lock_type, ++ lock_root_check_fn, rootp))) ++ return -EINTR; ++ ++ if (likely(b == READ_ONCE(*rootp) && ++ b->c.level == iter->level && ++ !race_fault())) { ++ for (i = 0; i < iter->level; i++) ++ iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT; ++ iter->l[iter->level].b = b; ++ for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++) ++ iter->l[i].b = NULL; ++ ++ mark_btree_node_locked(iter, iter->level, lock_type); ++ btree_iter_node_set(iter, b); ++ return 0; ++ } ++ ++ six_unlock_type(&b->c.lock, lock_type); ++ } ++} ++ ++noinline ++static void btree_iter_prefetch(struct btree_iter *iter) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree_iter_level *l = &iter->l[iter->level]; ++ struct btree_node_iter node_iter = l->iter; ++ struct bkey_packed *k; ++ BKEY_PADDED(k) tmp; ++ unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) ++ ? (iter->level > 1 ? 0 : 2) ++ : (iter->level > 1 ? 
1 : 16); ++ bool was_locked = btree_node_locked(iter, iter->level); ++ ++ while (nr) { ++ if (!bch2_btree_node_relock(iter, iter->level)) ++ return; ++ ++ bch2_btree_node_iter_advance(&node_iter, l->b); ++ k = bch2_btree_node_iter_peek(&node_iter, l->b); ++ if (!k) ++ break; ++ ++ bch2_bkey_unpack(l->b, &tmp.k, k); ++ bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1); ++ } ++ ++ if (!was_locked) ++ btree_node_unlock(iter, iter->level); ++} ++ ++static noinline void btree_node_mem_ptr_set(struct btree_iter *iter, ++ unsigned plevel, struct btree *b) ++{ ++ struct btree_iter_level *l = &iter->l[plevel]; ++ bool locked = btree_node_locked(iter, plevel); ++ struct bkey_packed *k; ++ struct bch_btree_ptr_v2 *bp; ++ ++ if (!bch2_btree_node_relock(iter, plevel)) ++ return; ++ ++ k = bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); ++ ++ bp = (void *) bkeyp_val(&l->b->format, k); ++ bp->mem_ptr = (unsigned long)b; ++ ++ if (!locked) ++ btree_node_unlock(iter, plevel); ++} ++ ++static __always_inline int btree_iter_down(struct btree_iter *iter) ++{ ++ struct bch_fs *c = iter->trans->c; ++ struct btree_iter_level *l = &iter->l[iter->level]; ++ struct btree *b; ++ unsigned level = iter->level - 1; ++ enum six_lock_type lock_type = __btree_lock_want(iter, level); ++ BKEY_PADDED(k) tmp; ++ ++ EBUG_ON(!btree_node_locked(iter, iter->level)); ++ ++ bch2_bkey_unpack(l->b, &tmp.k, ++ bch2_btree_node_iter_peek(&l->iter, l->b)); ++ ++ b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type); ++ if (unlikely(IS_ERR(b))) ++ return PTR_ERR(b); ++ ++ mark_btree_node_locked(iter, level, lock_type); ++ btree_iter_node_set(iter, b); ++ ++ if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 && ++ unlikely(b != btree_node_mem_ptr(&tmp.k))) ++ btree_node_mem_ptr_set(iter, level + 1, b); ++ ++ if (iter->flags & BTREE_ITER_PREFETCH) ++ btree_iter_prefetch(iter); ++ ++ iter->level = level; ++ ++ return 0; ++} ++ ++static void btree_iter_up(struct btree_iter *iter) ++{ ++ btree_node_unlock(iter, iter->level++); ++} ++ ++static int btree_iter_traverse_one(struct btree_iter *); ++ ++static int __btree_iter_traverse_all(struct btree_trans *trans, int ret) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ u8 sorted[BTREE_ITER_MAX]; ++ unsigned i, nr_sorted = 0; ++ ++ if (trans->in_traverse_all) ++ return -EINTR; ++ ++ trans->in_traverse_all = true; ++retry_all: ++ nr_sorted = 0; ++ ++ trans_for_each_iter(trans, iter) ++ sorted[nr_sorted++] = iter->idx; ++ ++#define btree_iter_cmp_by_idx(_l, _r) \ ++ btree_iter_cmp(&trans->iters[_l], &trans->iters[_r]) ++ ++ bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx); ++#undef btree_iter_cmp_by_idx ++ bch2_trans_unlock(trans); ++ ++ if (unlikely(ret == -ENOMEM)) { ++ struct closure cl; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ } ++ ++ if (unlikely(ret == -EIO)) { ++ trans->error = true; ++ goto out; ++ } ++ ++ BUG_ON(ret && ret != -EINTR); ++ ++ /* Now, redo traversals in correct order: */ ++ for (i = 0; i < nr_sorted; i++) { ++ unsigned idx = sorted[i]; ++ ++ /* ++ * sucessfully traversing one iterator can cause another to be ++ * unlinked, in btree_key_cache_fill() ++ */ ++ if (!(trans->iters_linked & (1ULL << idx))) ++ continue; ++ ++ ret = btree_iter_traverse_one(&trans->iters[idx]); ++ if (ret) ++ goto retry_all; ++ } ++ ++ if (hweight64(trans->iters_live) > 1) ++ ret = -EINTR; ++ else ++ trans_for_each_iter(trans, iter) ++ 
if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) { ++ ret = -EINTR; ++ break; ++ } ++out: ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ trans->in_traverse_all = false; ++ return ret; ++} ++ ++int bch2_btree_iter_traverse_all(struct btree_trans *trans) ++{ ++ return __btree_iter_traverse_all(trans, 0); ++} ++ ++static inline bool btree_iter_good_node(struct btree_iter *iter, ++ unsigned l, int check_pos) ++{ ++ if (!is_btree_node(iter, l) || ++ !bch2_btree_node_relock(iter, l)) ++ return false; ++ ++ if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b)) ++ return false; ++ if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b)) ++ return false; ++ return true; ++} ++ ++static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter, ++ int check_pos) ++{ ++ unsigned l = iter->level; ++ ++ while (btree_iter_node(iter, l) && ++ !btree_iter_good_node(iter, l, check_pos)) { ++ btree_node_unlock(iter, l); ++ iter->l[l].b = BTREE_ITER_NO_NODE_UP; ++ l++; ++ } ++ ++ return l; ++} ++ ++/* ++ * This is the main state machine for walking down the btree - walks down to a ++ * specified depth ++ * ++ * Returns 0 on success, -EIO on error (error reading in a btree node). ++ * ++ * On error, caller (peek_node()/peek_key()) must return NULL; the error is ++ * stashed in the iterator and returned from bch2_trans_exit(). ++ */ ++static int btree_iter_traverse_one(struct btree_iter *iter) ++{ ++ unsigned depth_want = iter->level; ++ ++ /* ++ * if we need interior nodes locked, call btree_iter_relock() to make ++ * sure we walk back up enough that we lock them: ++ */ ++ if (iter->uptodate == BTREE_ITER_NEED_RELOCK || ++ iter->locks_want > 1) ++ bch2_btree_iter_relock(iter, false); ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) ++ return bch2_btree_iter_traverse_cached(iter); ++ ++ if (iter->uptodate < BTREE_ITER_NEED_RELOCK) ++ return 0; ++ ++ if (unlikely(iter->level >= BTREE_MAX_DEPTH)) ++ return 0; ++ ++ /* ++ * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos ++ * here unnecessary ++ */ ++ iter->level = btree_iter_up_until_good_node(iter, 0); ++ ++ /* ++ * If we've got a btree node locked (i.e. we aren't about to relock the ++ * root) - advance its node iterator if necessary: ++ * ++ * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary ++ */ ++ if (is_btree_node(iter, iter->level)) { ++ BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b)); ++ ++ btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1); ++ } ++ ++ /* ++ * Note: iter->nodes[iter->level] may be temporarily NULL here - that ++ * would indicate to other code that we got to the end of the btree, ++ * here it indicates that relocking the root failed - it's critical that ++ * btree_iter_lock_root() comes next and that it can't fail ++ */ ++ while (iter->level > depth_want) { ++ int ret = btree_iter_node(iter, iter->level) ++ ? 
btree_iter_down(iter) ++ : btree_iter_lock_root(iter, depth_want); ++ if (unlikely(ret)) { ++ if (ret == 1) ++ return 0; ++ ++ iter->level = depth_want; ++ ++ if (ret == -EIO) { ++ iter->flags |= BTREE_ITER_ERROR; ++ iter->l[iter->level].b = ++ BTREE_ITER_NO_NODE_ERROR; ++ } else { ++ iter->l[iter->level].b = ++ BTREE_ITER_NO_NODE_DOWN; ++ } ++ return ret; ++ } ++ } ++ ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ ++ bch2_btree_iter_verify(iter); ++ return 0; ++} ++ ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ int ret; ++ ++ ret = bch2_trans_cond_resched(trans) ?: ++ btree_iter_traverse_one(iter); ++ if (unlikely(ret)) ++ ret = __btree_iter_traverse_all(trans, ret); ++ ++ return ret; ++} ++ ++static inline void bch2_btree_iter_checks(struct btree_iter *iter) ++{ ++ enum btree_iter_type type = btree_iter_type(iter); ++ ++ EBUG_ON(iter->btree_id >= BTREE_ID_NR); ++ ++ BUG_ON((type == BTREE_ITER_KEYS || ++ type == BTREE_ITER_CACHED) && ++ (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 || ++ bkey_cmp(iter->pos, iter->k.p) > 0)); ++ ++ bch2_btree_iter_verify_locks(iter); ++ bch2_btree_iter_verify_level(iter, iter->level); ++} ++ ++/* Iterate across nodes (leaf and interior nodes) */ ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) ++{ ++ struct btree *b; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return iter->l[iter->level].b; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ b = btree_iter_node(iter, iter->level); ++ if (!b) ++ return NULL; ++ ++ BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0); ++ ++ iter->pos = b->key.k.p; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify(iter); ++ ++ return b; ++} ++ ++struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) ++{ ++ struct btree *b; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES); ++ bch2_btree_iter_checks(iter); ++ ++ /* already got to end? */ ++ if (!btree_iter_node(iter, iter->level)) ++ return NULL; ++ ++ bch2_trans_cond_resched(iter->trans); ++ ++ btree_iter_up(iter); ++ ++ if (!bch2_btree_node_relock(iter, iter->level)) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ /* got to end? 
*/ ++ b = btree_iter_node(iter, iter->level); ++ if (!b) ++ return NULL; ++ ++ if (bkey_cmp(iter->pos, b->key.k.p) < 0) { ++ /* ++ * Haven't gotten to the end of the parent node: go back down to ++ * the next child node ++ */ ++ ++ /* ++ * We don't really want to be unlocking here except we can't ++ * directly tell btree_iter_traverse() "traverse to this level" ++ * except by setting iter->level, so we have to unlock so we ++ * don't screw up our lock invariants: ++ */ ++ if (btree_node_read_locked(iter, iter->level)) ++ btree_node_unlock(iter, iter->level); ++ ++ iter->pos = bkey_successor(iter->pos); ++ iter->level = iter->min_depth; ++ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return NULL; ++ ++ b = iter->l[iter->level].b; ++ } ++ ++ iter->pos = b->key.k.p; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify(iter); ++ ++ return b; ++} ++ ++/* Iterate across keys (in leaf nodes only) */ ++ ++void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ ++ EBUG_ON(iter->level != 0); ++ EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0); ++ EBUG_ON(!btree_node_locked(iter, 0)); ++ EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0); ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++ ++ btree_iter_advance_to_pos(iter, l, -1); ++ ++ /* ++ * XXX: ++ * keeping a node locked that's outside (even just outside) iter->pos ++ * breaks __bch2_btree_node_lock(). This seems to only affect ++ * bch2_btree_node_get_sibling so for now it's fixed there, but we ++ * should try to get rid of this corner case. ++ * ++ * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK) ++ */ ++ ++ if (bch2_btree_node_iter_end(&l->iter) && ++ btree_iter_pos_after_node(iter, l->b)) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++} ++ ++static void btree_iter_pos_changed(struct btree_iter *iter, int cmp) ++{ ++ unsigned l = iter->level; ++ ++ if (!cmp) ++ goto out; ++ ++ if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) { ++ btree_node_unlock(iter, 0); ++ iter->l[0].b = BTREE_ITER_NO_NODE_UP; ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ return; ++ } ++ ++ l = btree_iter_up_until_good_node(iter, cmp); ++ ++ if (btree_iter_node(iter, l)) { ++ /* ++ * We might have to skip over many keys, or just a few: try ++ * advancing the node iterator, and if we have to skip over too ++ * many keys just reinit it (or if we're rewinding, since that ++ * is expensive). ++ */ ++ if (cmp < 0 || ++ !btree_iter_advance_to_pos(iter, &iter->l[l], 8)) ++ __btree_iter_init(iter, l); ++ ++ /* Don't leave it locked if we're not supposed to: */ ++ if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED) ++ btree_node_unlock(iter, l); ++ } ++out: ++ if (l != iter->level) ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE); ++ else ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK); ++} ++ ++void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos, ++ bool strictly_greater) ++{ ++ struct bpos old = btree_iter_search_key(iter); ++ int cmp; ++ ++ iter->flags &= ~BTREE_ITER_IS_EXTENTS; ++ iter->flags |= strictly_greater ? 
BTREE_ITER_IS_EXTENTS : 0; ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; ++ ++ cmp = bkey_cmp(btree_iter_search_key(iter), old); ++ ++ btree_iter_pos_changed(iter, cmp); ++} ++ ++void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) ++{ ++ int cmp = bkey_cmp(new_pos, iter->pos); ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = new_pos; ++ ++ btree_iter_pos_changed(iter, cmp); ++} ++ ++static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ bool ret; ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = l->b->key.k.p; ++ ++ ret = bkey_cmp(iter->pos, POS_MAX) != 0; ++ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) ++ iter->k.p = iter->pos = bkey_successor(iter->pos); ++ ++ btree_iter_pos_changed(iter, 1); ++ return ret; ++} ++ ++static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ bool ret; ++ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos = l->b->data->min_key; ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ ++ ret = bkey_cmp(iter->pos, POS_MIN) != 0; ++ if (ret) { ++ iter->k.p = iter->pos = bkey_predecessor(iter->pos); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ iter->k.p = iter->pos = bkey_predecessor(iter->pos); ++ } ++ ++ btree_iter_pos_changed(iter, -1); ++ return ret; ++} ++ ++/** ++ * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key ++ * it currently points to ++ */ ++static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c ret = { .k = &iter->k }; ++ ++ if (!bkey_deleted(&iter->k)) { ++ struct bkey_packed *_k = ++ __bch2_btree_node_iter_peek_all(&l->iter, l->b); ++ ++ ret.v = bkeyp_val(&l->b->format, _k); ++ ++ if (debug_check_iterators(iter->trans->c)) { ++ struct bkey k = bkey_unpack_key(l->b, _k); ++ ++ BUG_ON(memcmp(&k, &iter->k, sizeof(k))); ++ } ++ ++ if (debug_check_bkeys(iter->trans->c)) ++ bch2_bkey_debugcheck(iter->trans->c, l->b, ret); ++ } ++ ++ return ret; ++} ++ ++/** ++ * bch2_btree_iter_peek: returns first key greater than or equal to iterator's ++ * current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE && ++ !bkey_deleted(&iter->k)) ++ return btree_iter_peek_uptodate(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __btree_iter_peek(iter, l); ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_next_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ /* ++ * iter->pos should always be equal to the key we just ++ * returned - except extents can straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || ++ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter->pos = bkey_start_pos(k.k); ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify_level(iter, 0); ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_next: returns first key greater than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & 
BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); ++ ++ return bch2_btree_iter_peek(iter); ++} ++ ++static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter) ++{ ++ struct bpos pos = btree_iter_search_key(iter); ++ struct btree_trans *trans = iter->trans; ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update2(trans, i) ++ if ((cmp_int(iter->btree_id, i->iter->btree_id) ?: ++ bkey_cmp(pos, i->k->k.p)) <= 0) ++ break; ++ ++ return i < trans->updates2 + trans->nr_updates2 && ++ iter->btree_id == i->iter->btree_id ++ ? bkey_i_to_s_c(i->k) ++ : bkey_s_c_null; ++} ++ ++static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k = __btree_iter_peek(iter, l); ++ struct bkey_s_c u = __btree_trans_updates_peek(iter); ++ ++ if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0)) ++ return k; ++ if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) { ++ iter->k = *u.k; ++ return u; ++ } ++ return bkey_s_c_null; ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter) ++{ ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __bch2_btree_iter_peek_with_updates(iter); ++ ++ if (k.k && bkey_deleted(k.k)) { ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); ++ continue; ++ } ++ ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_next_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ /* ++ * iter->pos should always be equal to the key we just ++ * returned - except extents can straddle iter->pos: ++ */ ++ if (!(iter->flags & BTREE_ITER_IS_EXTENTS) || ++ bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0) ++ iter->pos = bkey_start_pos(k.k); ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ return k; ++} ++ ++struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? 
iter->k.p ++ : bkey_successor(iter->k.p)); ++ ++ return bch2_btree_iter_peek_with_updates(iter); ++} ++ ++/** ++ * bch2_btree_iter_peek_prev: returns first key less than or equal to ++ * iterator's current position ++ */ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) ++{ ++ struct bpos pos = iter->pos; ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE && ++ !bkey_deleted(&iter->k)) ++ return btree_iter_peek_uptodate(iter); ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ k = __btree_iter_peek(iter, l); ++ if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0) ++ k = __btree_iter_prev(iter, l); ++ ++ if (likely(k.k)) ++ break; ++ ++ if (!btree_iter_set_pos_to_prev_leaf(iter)) ++ return bkey_s_c_null; ++ } ++ ++ EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0); ++ iter->pos = bkey_start_pos(k.k); ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ return k; ++} ++ ++/** ++ * bch2_btree_iter_prev: returns first key less than iterator's current ++ * position ++ */ ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) ++{ ++ struct bpos pos = bkey_start_pos(&iter->k); ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (unlikely(!bkey_cmp(pos, POS_MIN))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, bkey_predecessor(pos)); ++ ++ return bch2_btree_iter_peek_prev(iter); ++} ++ ++static inline struct bkey_s_c ++__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct btree_node_iter node_iter; ++ struct bkey_s_c k; ++ struct bkey n; ++ int ret; ++ ++ /* keys & holes can't span inode numbers: */ ++ if (iter->pos.offset == KEY_OFFSET_MAX) { ++ if (iter->pos.inode == KEY_INODE_MAX) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos)); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ } ++ ++ /* ++ * iterator is now at the correct position for inserting at iter->pos, ++ * but we need to keep iterating until we find the first non whiteout so ++ * we know how big a hole we have, if any: ++ */ ++ ++ node_iter = l->iter; ++ k = __btree_iter_unpack(iter, l, &iter->k, ++ bch2_btree_node_iter_peek(&node_iter, l->b)); ++ ++ if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) { ++ /* ++ * We're not setting iter->uptodate because the node iterator ++ * doesn't necessarily point at the key we're returning: ++ */ ++ ++ EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0); ++ bch2_btree_iter_verify_level(iter, 0); ++ return k; ++ } ++ ++ /* hole */ ++ ++ if (!k.k) ++ k.k = &l->b->key.k; ++ ++ bkey_init(&n); ++ n.p = iter->pos; ++ bch2_key_resize(&n, ++ min_t(u64, KEY_SIZE_MAX, ++ (k.k->p.inode == n.p.inode ++ ? 
bkey_start_offset(k.k) ++ : KEY_OFFSET_MAX) - ++ n.p.offset)); ++ ++ EBUG_ON(!n.size); ++ ++ iter->k = n; ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ ++ bch2_btree_iter_verify_level(iter, 0); ++ return (struct bkey_s_c) { &iter->k, NULL }; ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_s_c k; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS); ++ bch2_btree_iter_checks(iter); ++ ++ if (iter->uptodate == BTREE_ITER_UPTODATE) ++ return btree_iter_peek_uptodate(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ if (iter->flags & BTREE_ITER_IS_EXTENTS) ++ return __bch2_btree_iter_peek_slot_extents(iter); ++ ++ k = __btree_iter_peek_all(iter, l, &iter->k); ++ ++ EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0); ++ ++ if (!k.k || bkey_cmp(iter->pos, k.k->p)) { ++ /* hole */ ++ bkey_init(&iter->k); ++ iter->k.p = iter->pos; ++ k = (struct bkey_s_c) { &iter->k, NULL }; ++ } ++ ++ iter->uptodate = BTREE_ITER_UPTODATE; ++ bch2_btree_iter_verify_level(iter, 0); ++ return k; ++} ++ ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) ++{ ++ if (unlikely(!bkey_cmp(iter->k.p, POS_MAX))) ++ return bkey_s_c_null; ++ ++ bch2_btree_iter_set_pos(iter, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? iter->k.p ++ : bkey_successor(iter->k.p)); ++ ++ return bch2_btree_iter_peek_slot(iter); ++} ++ ++struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter) ++{ ++ struct bkey_cached *ck; ++ int ret; ++ ++ EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED); ++ bch2_btree_iter_checks(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (unlikely(ret)) ++ return bkey_s_c_err(ret); ++ ++ ck = (void *) iter->l[0].b; ++ ++ EBUG_ON(iter->btree_id != ck->key.btree_id || ++ bkey_cmp(iter->pos, ck->key.pos)); ++ BUG_ON(!ck->valid); ++ ++ return bkey_i_to_s_c(ck->k); ++} ++ ++static inline void bch2_btree_iter_init(struct btree_trans *trans, ++ struct btree_iter *iter, enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned i; ++ ++ if (btree_node_type_is_extents(btree_id) && ++ !(flags & BTREE_ITER_NODES)) ++ flags |= BTREE_ITER_IS_EXTENTS; ++ ++ iter->trans = trans; ++ iter->pos = pos; ++ bkey_init(&iter->k); ++ iter->k.p = pos; ++ iter->flags = flags; ++ iter->uptodate = BTREE_ITER_NEED_TRAVERSE; ++ iter->btree_id = btree_id; ++ iter->level = 0; ++ iter->min_depth = 0; ++ iter->locks_want = flags & BTREE_ITER_INTENT ? 
1 : 0; ++ iter->nodes_locked = 0; ++ iter->nodes_intent_locked = 0; ++ for (i = 0; i < ARRAY_SIZE(iter->l); i++) ++ iter->l[i].b = BTREE_ITER_NO_NODE_INIT; ++ ++ prefetch(c->btree_roots[btree_id].b); ++} ++ ++/* new transactional stuff: */ ++ ++static inline void __bch2_trans_iter_free(struct btree_trans *trans, ++ unsigned idx) ++{ ++ __bch2_btree_iter_unlock(&trans->iters[idx]); ++ trans->iters_linked &= ~(1ULL << idx); ++ trans->iters_live &= ~(1ULL << idx); ++ trans->iters_touched &= ~(1ULL << idx); ++} ++ ++int bch2_trans_iter_put(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ int ret; ++ ++ if (IS_ERR_OR_NULL(iter)) ++ return 0; ++ ++ BUG_ON(trans->iters + iter->idx != iter); ++ ++ ret = btree_iter_err(iter); ++ ++ if (!(trans->iters_touched & (1ULL << iter->idx)) && ++ !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) ++ __bch2_trans_iter_free(trans, iter->idx); ++ ++ trans->iters_live &= ~(1ULL << iter->idx); ++ return ret; ++} ++ ++int bch2_trans_iter_free(struct btree_trans *trans, ++ struct btree_iter *iter) ++{ ++ if (IS_ERR_OR_NULL(iter)) ++ return 0; ++ ++ trans->iters_touched &= ~(1ULL << iter->idx); ++ ++ return bch2_trans_iter_put(trans, iter); ++} ++ ++static int bch2_trans_realloc_iters(struct btree_trans *trans, ++ unsigned new_size) ++{ ++ void *p, *new_iters, *new_updates, *new_updates2; ++ size_t iters_bytes; ++ size_t updates_bytes; ++ ++ new_size = roundup_pow_of_two(new_size); ++ ++ BUG_ON(new_size > BTREE_ITER_MAX); ++ ++ if (new_size <= trans->size) ++ return 0; ++ ++ BUG_ON(trans->used_mempool); ++ ++ bch2_trans_unlock(trans); ++ ++ iters_bytes = sizeof(struct btree_iter) * new_size; ++ updates_bytes = sizeof(struct btree_insert_entry) * new_size; ++ ++ p = kmalloc(iters_bytes + ++ updates_bytes + ++ updates_bytes, GFP_NOFS); ++ if (p) ++ goto success; ++ ++ p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS); ++ new_size = BTREE_ITER_MAX; ++ ++ trans->used_mempool = true; ++success: ++ new_iters = p; p += iters_bytes; ++ new_updates = p; p += updates_bytes; ++ new_updates2 = p; p += updates_bytes; ++ ++ memcpy(new_iters, trans->iters, ++ sizeof(struct btree_iter) * trans->nr_iters); ++ memcpy(new_updates, trans->updates, ++ sizeof(struct btree_insert_entry) * trans->nr_updates); ++ memcpy(new_updates2, trans->updates2, ++ sizeof(struct btree_insert_entry) * trans->nr_updates2); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ memset(trans->iters, POISON_FREE, ++ sizeof(struct btree_iter) * trans->nr_iters + ++ sizeof(struct btree_insert_entry) * trans->nr_iters); ++ ++ if (trans->iters != trans->iters_onstack) ++ kfree(trans->iters); ++ ++ trans->iters = new_iters; ++ trans->updates = new_updates; ++ trans->updates2 = new_updates2; ++ trans->size = new_size; ++ ++ if (trans->iters_live) { ++ trace_trans_restart_iters_realloced(trans->ip, trans->size); ++ return -EINTR; ++ } ++ ++ return 0; ++} ++ ++static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans) ++{ ++ unsigned idx = __ffs64(~trans->iters_linked); ++ ++ if (idx < trans->nr_iters) ++ goto got_slot; ++ ++ if (trans->nr_iters == trans->size) { ++ int ret; ++ ++ if (trans->nr_iters >= BTREE_ITER_MAX) { ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) { ++ pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps", ++ bch2_btree_ids[iter->btree_id], ++ iter->pos.inode, ++ iter->pos.offset, ++ (trans->iters_live & (1ULL << iter->idx)) ? " live" : "", ++ (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "", ++ iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? 
" keep" : "", ++ (void *) iter->ip_allocated); ++ } ++ ++ panic("trans iter oveflow\n"); ++ } ++ ++ ret = bch2_trans_realloc_iters(trans, trans->size * 2); ++ if (ret) ++ return ERR_PTR(ret); ++ } ++ ++ idx = trans->nr_iters++; ++ BUG_ON(trans->nr_iters > trans->size); ++ ++ trans->iters[idx].idx = idx; ++got_slot: ++ BUG_ON(trans->iters_linked & (1ULL << idx)); ++ trans->iters_linked |= 1ULL << idx; ++ trans->iters[idx].flags = 0; ++ return &trans->iters[idx]; ++} ++ ++static inline void btree_iter_copy(struct btree_iter *dst, ++ struct btree_iter *src) ++{ ++ unsigned i, idx = dst->idx; ++ ++ *dst = *src; ++ dst->idx = idx; ++ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ for (i = 0; i < BTREE_MAX_DEPTH; i++) ++ if (btree_node_locked(dst, i)) ++ six_lock_increment(&dst->l[i].b->c.lock, ++ __btree_lock_want(dst, i)); ++ ++ dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT; ++} ++ ++static inline struct bpos bpos_diff(struct bpos l, struct bpos r) ++{ ++ if (bkey_cmp(l, r) > 0) ++ swap(l, r); ++ ++ return POS(r.inode - l.inode, r.offset - l.offset); ++} ++ ++static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans, ++ unsigned btree_id, struct bpos pos, ++ unsigned flags) ++{ ++ struct btree_iter *iter, *best = NULL; ++ ++ BUG_ON(trans->nr_iters > BTREE_ITER_MAX); ++ ++ trans_for_each_iter(trans, iter) { ++ if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE)) ++ continue; ++ ++ if (iter->btree_id != btree_id) ++ continue; ++ ++ if (best && ++ bkey_cmp(bpos_diff(best->pos, pos), ++ bpos_diff(iter->pos, pos)) < 0) ++ continue; ++ ++ best = iter; ++ } ++ ++ if (!best) { ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ bch2_btree_iter_init(trans, iter, btree_id, pos, flags); ++ } else if ((trans->iters_live & (1ULL << best->idx)) || ++ (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) { ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ btree_iter_copy(iter, best); ++ } else { ++ iter = best; ++ } ++ ++ iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT; ++ iter->flags &= ~BTREE_ITER_USER_FLAGS; ++ iter->flags |= flags & BTREE_ITER_USER_FLAGS; ++ ++ if (iter->flags & BTREE_ITER_INTENT) ++ bch2_btree_iter_upgrade(iter, 1); ++ else ++ bch2_btree_iter_downgrade(iter); ++ ++ BUG_ON(iter->btree_id != btree_id); ++ BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE); ++ BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); ++ BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT); ++ BUG_ON(trans->iters_live & (1ULL << iter->idx)); ++ ++ trans->iters_live |= 1ULL << iter->idx; ++ trans->iters_touched |= 1ULL << iter->idx; ++ ++ return iter; ++} ++ ++struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct btree_iter *iter = ++ __btree_trans_get_iter(trans, btree_id, pos, flags); ++ ++ if (!IS_ERR(iter)) ++ __bch2_btree_iter_set_pos(iter, pos, ++ btree_node_type_is_extents(btree_id)); ++ return iter; ++} ++ ++struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos pos, ++ unsigned locks_want, ++ unsigned depth, ++ unsigned flags) ++{ ++ struct btree_iter *iter = ++ __btree_trans_get_iter(trans, btree_id, pos, ++ flags|BTREE_ITER_NODES); ++ unsigned i; ++ ++ BUG_ON(IS_ERR(iter)); ++ BUG_ON(bkey_cmp(iter->pos, pos)); ++ ++ iter->locks_want = locks_want; ++ iter->level = depth; ++ iter->min_depth = depth; ++ ++ for (i = 0; i < 
ARRAY_SIZE(iter->l); i++) ++ iter->l[i].b = NULL; ++ iter->l[iter->level].b = BTREE_ITER_NO_NODE_INIT; ++ ++ return iter; ++} ++ ++struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans, ++ struct btree_iter *src) ++{ ++ struct btree_iter *iter; ++ ++ iter = btree_trans_iter_alloc(trans); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ btree_iter_copy(iter, src); ++ ++ trans->iters_live |= 1ULL << iter->idx; ++ /* ++ * We don't need to preserve this iter since it's cheap to copy it ++ * again - this will cause trans_iter_put() to free it right away: ++ */ ++ trans->iters_touched &= ~(1ULL << iter->idx); ++ ++ return iter; ++} ++ ++static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size) ++{ ++ if (size > trans->mem_bytes) { ++ size_t old_bytes = trans->mem_bytes; ++ size_t new_bytes = roundup_pow_of_two(size); ++ void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS); ++ ++ if (!new_mem) ++ return -ENOMEM; ++ ++ trans->mem = new_mem; ++ trans->mem_bytes = new_bytes; ++ ++ if (old_bytes) { ++ trace_trans_restart_mem_realloced(trans->ip, new_bytes); ++ return -EINTR; ++ } ++ } ++ ++ return 0; ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) ++{ ++ void *p; ++ int ret; ++ ++ ret = bch2_trans_preload_mem(trans, trans->mem_top + size); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ p = trans->mem + trans->mem_top; ++ trans->mem_top += size; ++ return p; ++} ++ ++inline void bch2_trans_unlink_iters(struct btree_trans *trans) ++{ ++ u64 iters = trans->iters_linked & ++ ~trans->iters_touched & ++ ~trans->iters_live; ++ ++ while (iters) { ++ unsigned idx = __ffs64(iters); ++ ++ iters &= ~(1ULL << idx); ++ __bch2_trans_iter_free(trans, idx); ++ } ++} ++ ++void bch2_trans_reset(struct btree_trans *trans, unsigned flags) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT| ++ BTREE_ITER_SET_POS_AFTER_COMMIT); ++ ++ bch2_trans_unlink_iters(trans); ++ ++ trans->iters_touched &= trans->iters_live; ++ ++ trans->need_reset = 0; ++ trans->nr_updates = 0; ++ trans->nr_updates2 = 0; ++ trans->mem_top = 0; ++ ++ trans->extra_journal_entries = NULL; ++ trans->extra_journal_entry_u64s = 0; ++ ++ if (trans->fs_usage_deltas) { ++ trans->fs_usage_deltas->used = 0; ++ memset(&trans->fs_usage_deltas->memset_start, 0, ++ (void *) &trans->fs_usage_deltas->memset_end - ++ (void *) &trans->fs_usage_deltas->memset_start); ++ } ++ ++ if (!(flags & TRANS_RESET_NOTRAVERSE)) ++ bch2_btree_iter_traverse_all(trans); ++} ++ ++void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, ++ unsigned expected_nr_iters, ++ size_t expected_mem_bytes) ++{ ++ memset(trans, 0, offsetof(struct btree_trans, iters_onstack)); ++ ++ /* ++ * reallocating iterators currently completely breaks ++ * bch2_trans_iter_put(): ++ */ ++ expected_nr_iters = BTREE_ITER_MAX; ++ ++ trans->c = c; ++ trans->ip = _RET_IP_; ++ trans->size = ARRAY_SIZE(trans->iters_onstack); ++ trans->iters = trans->iters_onstack; ++ trans->updates = trans->updates_onstack; ++ trans->updates2 = trans->updates2_onstack; ++ trans->fs_usage_deltas = NULL; ++ ++ if (expected_nr_iters > trans->size) ++ bch2_trans_realloc_iters(trans, expected_nr_iters); ++ ++ if (expected_mem_bytes) ++ bch2_trans_preload_mem(trans, expected_mem_bytes); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->pid = current->pid; ++ mutex_lock(&c->btree_trans_lock); ++ list_add(&trans->list, &c->btree_trans_list); ++ mutex_unlock(&c->btree_trans_lock); ++#endif ++} ++ ++int 
bch2_trans_exit(struct btree_trans *trans) ++{ ++ bch2_trans_unlock(trans); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ mutex_lock(&trans->c->btree_trans_lock); ++ list_del(&trans->list); ++ mutex_unlock(&trans->c->btree_trans_lock); ++#endif ++ ++ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ ++ kfree(trans->fs_usage_deltas); ++ kfree(trans->mem); ++ if (trans->used_mempool) ++ mempool_free(trans->iters, &trans->c->btree_iters_pool); ++ else if (trans->iters != trans->iters_onstack) ++ kfree(trans->iters); ++ trans->mem = (void *) 0x1; ++ trans->iters = (void *) 0x1; ++ ++ return trans->error ? -EIO : 0; ++} ++ ++static void bch2_btree_iter_node_to_text(struct printbuf *out, ++ struct btree_bkey_cached_common *_b, ++ enum btree_iter_type type) ++{ ++ pr_buf(out, " %px l=%u %s:", ++ _b, _b->level, bch2_btree_ids[_b->btree_id]); ++ bch2_bpos_to_text(out, btree_node_pos(_b, type)); ++} ++ ++void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct btree_trans *trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned l; ++ ++ mutex_lock(&c->btree_trans_lock); ++ list_for_each_entry(trans, &c->btree_trans_list, list) { ++ pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip); ++ ++ trans_for_each_iter(trans, iter) { ++ if (!iter->nodes_locked) ++ continue; ++ ++ pr_buf(out, " iter %u %s:", ++ iter->idx, ++ bch2_btree_ids[iter->btree_id]); ++ bch2_bpos_to_text(out, iter->pos); ++ pr_buf(out, "\n"); ++ ++ for (l = 0; l < BTREE_MAX_DEPTH; l++) { ++ if (btree_node_locked(iter, l)) { ++ pr_buf(out, " %s l=%u ", ++ btree_node_intent_locked(iter, l) ? "i" : "r", l); ++ bch2_btree_iter_node_to_text(out, ++ (void *) iter->l[l].b, ++ btree_iter_type(iter)); ++ pr_buf(out, "\n"); ++ } ++ } ++ } ++ ++ b = READ_ONCE(trans->locking); ++ if (b) { ++ pr_buf(out, " locking iter %u l=%u %s:", ++ trans->locking_iter_idx, ++ trans->locking_level, ++ bch2_btree_ids[trans->locking_btree_id]); ++ bch2_bpos_to_text(out, trans->locking_pos); ++ ++ ++ pr_buf(out, " node "); ++ bch2_btree_iter_node_to_text(out, ++ (void *) b, ++ btree_iter_type(&trans->iters[trans->locking_iter_idx])); ++ pr_buf(out, "\n"); ++ } ++ } ++ mutex_unlock(&c->btree_trans_lock); ++#endif ++} ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *c) ++{ ++ mempool_exit(&c->btree_iters_pool); ++} ++ ++int bch2_fs_btree_iter_init(struct bch_fs *c) ++{ ++ unsigned nr = BTREE_ITER_MAX; ++ ++ INIT_LIST_HEAD(&c->btree_trans_list); ++ mutex_init(&c->btree_trans_lock); ++ ++ return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1, ++ sizeof(struct btree_iter) * nr + ++ sizeof(struct btree_insert_entry) * nr + ++ sizeof(struct btree_insert_entry) * nr); ++} +diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h +new file mode 100644 +index 000000000000..bd9ec3ec9a92 +--- /dev/null ++++ b/fs/bcachefs/btree_iter.h +@@ -0,0 +1,314 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_ITER_H ++#define _BCACHEFS_BTREE_ITER_H ++ ++#include "bset.h" ++#include "btree_types.h" ++ ++static inline void btree_iter_set_dirty(struct btree_iter *iter, ++ enum btree_iter_uptodate u) ++{ ++ iter->uptodate = max_t(unsigned, iter->uptodate, u); ++} ++ ++static inline struct btree *btree_iter_node(struct btree_iter *iter, ++ unsigned level) ++{ ++ return level < BTREE_MAX_DEPTH ? 
iter->l[level].b : NULL; ++} ++ ++static inline bool btree_node_lock_seq_matches(const struct btree_iter *iter, ++ const struct btree *b, unsigned level) ++{ ++ /* ++ * We don't compare the low bits of the lock sequence numbers because ++ * @iter might have taken a write lock on @b, and we don't want to skip ++ * the linked iterator if the sequence numbers were equal before taking ++ * that write lock. The lock sequence number is incremented by taking ++ * and releasing write locks and is even when unlocked: ++ */ ++ return iter->l[level].lock_seq >> 1 == b->c.lock.state.seq >> 1; ++} ++ ++static inline struct btree *btree_node_parent(struct btree_iter *iter, ++ struct btree *b) ++{ ++ return btree_iter_node(iter, b->c.level + 1); ++} ++ ++static inline bool btree_trans_has_multiple_iters(const struct btree_trans *trans) ++{ ++ return hweight64(trans->iters_linked) > 1; ++} ++ ++static inline int btree_iter_err(const struct btree_iter *iter) ++{ ++ return iter->flags & BTREE_ITER_ERROR ? -EIO : 0; ++} ++ ++/* Iterate over iters within a transaction: */ ++ ++#define trans_for_each_iter_all(_trans, _iter) \ ++ for (_iter = (_trans)->iters; \ ++ _iter < (_trans)->iters + (_trans)->nr_iters; \ ++ _iter++) ++ ++static inline struct btree_iter * ++__trans_next_iter(struct btree_trans *trans, unsigned idx) ++{ ++ EBUG_ON(idx < trans->nr_iters && trans->iters[idx].idx != idx); ++ ++ for (; idx < trans->nr_iters; idx++) ++ if (trans->iters_linked & (1ULL << idx)) ++ return &trans->iters[idx]; ++ ++ return NULL; ++} ++ ++#define trans_for_each_iter(_trans, _iter) \ ++ for (_iter = __trans_next_iter((_trans), 0); \ ++ (_iter); \ ++ _iter = __trans_next_iter((_trans), (_iter)->idx + 1)) ++ ++static inline bool __iter_has_node(const struct btree_iter *iter, ++ const struct btree *b) ++{ ++ return iter->l[b->c.level].b == b && ++ btree_node_lock_seq_matches(iter, b, b->c.level); ++} ++ ++static inline struct btree_iter * ++__trans_next_iter_with_node(struct btree_trans *trans, struct btree *b, ++ unsigned idx) ++{ ++ struct btree_iter *iter = __trans_next_iter(trans, idx); ++ ++ while (iter && !__iter_has_node(iter, b)) ++ iter = __trans_next_iter(trans, iter->idx + 1); ++ ++ return iter; ++} ++ ++#define trans_for_each_iter_with_node(_trans, _b, _iter) \ ++ for (_iter = __trans_next_iter_with_node((_trans), (_b), 0); \ ++ (_iter); \ ++ _iter = __trans_next_iter_with_node((_trans), (_b), \ ++ (_iter)->idx + 1)) ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_trans_verify_iters(struct btree_trans *, struct btree *); ++void bch2_btree_trans_verify_locks(struct btree_trans *); ++#else ++static inline void bch2_btree_trans_verify_iters(struct btree_trans *trans, ++ struct btree *b) {} ++static inline void bch2_btree_trans_verify_locks(struct btree_trans *iter) {} ++#endif ++ ++void bch2_btree_iter_fix_key_modified(struct btree_iter *, struct btree *, ++ struct bkey_packed *); ++void bch2_btree_node_iter_fix(struct btree_iter *, struct btree *, ++ struct btree_node_iter *, struct bkey_packed *, ++ unsigned, unsigned); ++ ++bool bch2_btree_iter_relock(struct btree_iter *, bool); ++bool bch2_trans_relock(struct btree_trans *); ++void bch2_trans_unlock(struct btree_trans *); ++ ++bool __bch2_btree_iter_upgrade(struct btree_iter *, unsigned); ++bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *, unsigned); ++ ++static inline bool bch2_btree_iter_upgrade(struct btree_iter *iter, ++ unsigned new_locks_want) ++{ ++ new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); ++ ++ return iter->locks_want < 
new_locks_want ++ ? (!iter->trans->nounlock ++ ? __bch2_btree_iter_upgrade(iter, new_locks_want) ++ : __bch2_btree_iter_upgrade_nounlock(iter, new_locks_want)) ++ : iter->uptodate <= BTREE_ITER_NEED_PEEK; ++} ++ ++void __bch2_btree_iter_downgrade(struct btree_iter *, unsigned); ++ ++static inline void bch2_btree_iter_downgrade(struct btree_iter *iter) ++{ ++ if (iter->locks_want > (iter->flags & BTREE_ITER_INTENT) ? 1 : 0) ++ __bch2_btree_iter_downgrade(iter, 0); ++} ++ ++void bch2_trans_downgrade(struct btree_trans *); ++ ++void bch2_btree_iter_node_replace(struct btree_iter *, struct btree *); ++void bch2_btree_iter_node_drop(struct btree_iter *, struct btree *); ++ ++void bch2_btree_iter_reinit_node(struct btree_iter *, struct btree *); ++ ++int __must_check __bch2_btree_iter_traverse(struct btree_iter *); ++ ++static inline int __must_check ++bch2_btree_iter_traverse(struct btree_iter *iter) ++{ ++ return iter->uptodate >= BTREE_ITER_NEED_RELOCK ++ ? __bch2_btree_iter_traverse(iter) ++ : 0; ++} ++ ++int bch2_btree_iter_traverse_all(struct btree_trans *); ++ ++struct btree *bch2_btree_iter_peek_node(struct btree_iter *); ++struct btree *bch2_btree_iter_next_node(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); ++struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); ++ ++struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *); ++ ++void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *, struct bpos); ++void __bch2_btree_iter_set_pos(struct btree_iter *, struct bpos, bool); ++void bch2_btree_iter_set_pos(struct btree_iter *, struct bpos); ++ ++static inline int btree_iter_cmp(const struct btree_iter *l, ++ const struct btree_iter *r) ++{ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ -cmp_int(btree_iter_type(l), btree_iter_type(r)) ?: ++ bkey_cmp(l->pos, r->pos); ++} ++ ++/* ++ * Unlocks before scheduling ++ * Note: does not revalidate iterator ++ */ ++static inline int bch2_trans_cond_resched(struct btree_trans *trans) ++{ ++ if (need_resched() || race_fault()) { ++ bch2_trans_unlock(trans); ++ schedule(); ++ return bch2_trans_relock(trans) ? 0 : -EINTR; ++ } else { ++ return 0; ++ } ++} ++ ++#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _locks_want, _depth, _flags, _b) \ ++ for (iter = bch2_trans_get_node_iter((_trans), (_btree_id), \ ++ _start, _locks_want, _depth, _flags), \ ++ _b = bch2_btree_iter_peek_node(_iter); \ ++ (_b); \ ++ (_b) = bch2_btree_iter_next_node(_iter)) ++ ++#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ _flags, _b) \ ++ __for_each_btree_node(_trans, _iter, _btree_id, _start, \ ++ 0, 0, _flags, _b) ++ ++static inline struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, ++ unsigned flags) ++{ ++ if ((flags & BTREE_ITER_TYPE) == BTREE_ITER_CACHED) ++ return bch2_btree_iter_peek_cached(iter); ++ else ++ return flags & BTREE_ITER_SLOTS ++ ? bch2_btree_iter_peek_slot(iter) ++ : bch2_btree_iter_peek(iter); ++} ++ ++static inline struct bkey_s_c __bch2_btree_iter_next(struct btree_iter *iter, ++ unsigned flags) ++{ ++ return flags & BTREE_ITER_SLOTS ++ ? 
bch2_btree_iter_next_slot(iter) ++ : bch2_btree_iter_next(iter); ++} ++ ++static inline int bkey_err(struct bkey_s_c k) ++{ ++ return PTR_ERR_OR_ZERO(k.k); ++} ++ ++#define for_each_btree_key(_trans, _iter, _btree_id, \ ++ _start, _flags, _k, _ret) \ ++ for ((_ret) = PTR_ERR_OR_ZERO((_iter) = \ ++ bch2_trans_get_iter((_trans), (_btree_id), \ ++ (_start), (_flags))) ?: \ ++ PTR_ERR_OR_ZERO(((_k) = \ ++ __bch2_btree_iter_peek(_iter, _flags)).k); \ ++ !_ret && (_k).k; \ ++ (_ret) = PTR_ERR_OR_ZERO(((_k) = \ ++ __bch2_btree_iter_next(_iter, _flags)).k)) ++ ++#define for_each_btree_key_continue(_iter, _flags, _k, _ret) \ ++ for ((_k) = __bch2_btree_iter_peek(_iter, _flags); \ ++ !((_ret) = bkey_err(_k)) && (_k).k; \ ++ (_k) = __bch2_btree_iter_next(_iter, _flags)) ++ ++/* new multiple iterator interface: */ ++ ++int bch2_trans_iter_put(struct btree_trans *, struct btree_iter *); ++int bch2_trans_iter_free(struct btree_trans *, struct btree_iter *); ++ ++void bch2_trans_unlink_iters(struct btree_trans *); ++ ++struct btree_iter *__bch2_trans_get_iter(struct btree_trans *, enum btree_id, ++ struct bpos, unsigned); ++ ++static inline struct btree_iter * ++bch2_trans_get_iter(struct btree_trans *trans, enum btree_id btree_id, ++ struct bpos pos, unsigned flags) ++{ ++ struct btree_iter *iter = ++ __bch2_trans_get_iter(trans, btree_id, pos, flags); ++ ++ if (!IS_ERR(iter)) ++ iter->ip_allocated = _THIS_IP_; ++ return iter; ++} ++ ++struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *, ++ struct btree_iter *); ++static inline struct btree_iter * ++bch2_trans_copy_iter(struct btree_trans *trans, struct btree_iter *src) ++{ ++ struct btree_iter *iter = ++ __bch2_trans_copy_iter(trans, src); ++ ++ if (!IS_ERR(iter)) ++ iter->ip_allocated = _THIS_IP_; ++ return iter; ++ ++} ++ ++struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *, ++ enum btree_id, struct bpos, ++ unsigned, unsigned, unsigned); ++ ++#define TRANS_RESET_NOTRAVERSE (1 << 0) ++ ++void bch2_trans_reset(struct btree_trans *, unsigned); ++ ++static inline void bch2_trans_begin(struct btree_trans *trans) ++{ ++ return bch2_trans_reset(trans, 0); ++} ++ ++void *bch2_trans_kmalloc(struct btree_trans *, size_t); ++void bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned, size_t); ++int bch2_trans_exit(struct btree_trans *); ++ ++void bch2_btree_trans_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_fs_btree_iter_exit(struct bch_fs *); ++int bch2_fs_btree_iter_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_ITER_H */ +diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c +new file mode 100644 +index 000000000000..61662750dfc0 +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.c +@@ -0,0 +1,519 @@ ++ ++#include "bcachefs.h" ++#include "btree_cache.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++#include "error.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++ ++#include ++ ++static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, ++ const void *obj) ++{ ++ const struct bkey_cached *ck = obj; ++ const struct bkey_cached_key *key = arg->key; ++ ++ return cmp_int(ck->key.btree_id, key->btree_id) ?: ++ bkey_cmp(ck->key.pos, key->pos); ++} ++ ++static const struct rhashtable_params bch2_btree_key_cache_params = { ++ .head_offset = offsetof(struct bkey_cached, hash), ++ .key_offset = offsetof(struct bkey_cached, key), ++ .key_len = sizeof(struct bkey_cached_key), ++ .obj_cmpfn = 
bch2_btree_key_cache_cmp_fn, ++}; ++ ++__flatten ++static inline struct bkey_cached * ++btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) ++{ ++ struct bkey_cached_key key = { ++ .btree_id = btree_id, ++ .pos = pos, ++ }; ++ ++ return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, ++ bch2_btree_key_cache_params); ++} ++ ++static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) ++{ ++ if (!six_trylock_intent(&ck->c.lock)) ++ return false; ++ ++ if (!six_trylock_write(&ck->c.lock)) { ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void bkey_cached_evict(struct btree_key_cache *c, ++ struct bkey_cached *ck) ++{ ++ BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, ++ bch2_btree_key_cache_params)); ++ memset(&ck->key, ~0, sizeof(ck->key)); ++} ++ ++static void bkey_cached_free(struct btree_key_cache *c, ++ struct bkey_cached *ck) ++{ ++ list_move(&ck->list, &c->freed); ++ ++ kfree(ck->k); ++ ck->k = NULL; ++ ck->u64s = 0; ++ ++ six_unlock_write(&ck->c.lock); ++ six_unlock_intent(&ck->c.lock); ++} ++ ++static struct bkey_cached * ++bkey_cached_alloc(struct btree_key_cache *c) ++{ ++ struct bkey_cached *ck; ++ ++ list_for_each_entry(ck, &c->freed, list) ++ if (bkey_cached_lock_for_evict(ck)) ++ return ck; ++ ++ list_for_each_entry(ck, &c->clean, list) ++ if (bkey_cached_lock_for_evict(ck)) { ++ bkey_cached_evict(c, ck); ++ return ck; ++ } ++ ++ ck = kzalloc(sizeof(*ck), GFP_NOFS); ++ if (!ck) ++ return NULL; ++ ++ INIT_LIST_HEAD(&ck->list); ++ six_lock_init(&ck->c.lock); ++ BUG_ON(!six_trylock_intent(&ck->c.lock)); ++ BUG_ON(!six_trylock_write(&ck->c.lock)); ++ ++ return ck; ++} ++ ++static struct bkey_cached * ++btree_key_cache_create(struct btree_key_cache *c, ++ enum btree_id btree_id, ++ struct bpos pos) ++{ ++ struct bkey_cached *ck; ++ ++ ck = bkey_cached_alloc(c); ++ if (!ck) ++ return ERR_PTR(-ENOMEM); ++ ++ ck->c.level = 0; ++ ck->c.btree_id = btree_id; ++ ck->key.btree_id = btree_id; ++ ck->key.pos = pos; ++ ck->valid = false; ++ ++ BUG_ON(ck->flags); ++ ++ if (rhashtable_lookup_insert_fast(&c->table, ++ &ck->hash, ++ bch2_btree_key_cache_params)) { ++ /* We raced with another fill: */ ++ bkey_cached_free(c, ck); ++ return NULL; ++ } ++ ++ list_move(&ck->list, &c->clean); ++ six_unlock_write(&ck->c.lock); ++ ++ return ck; ++} ++ ++static int btree_key_cache_fill(struct btree_trans *trans, ++ struct btree_iter *ck_iter, ++ struct bkey_cached *ck) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ unsigned new_u64s = 0; ++ struct bkey_i *new_k = NULL; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, ck->key.btree_id, ++ ck->key.pos, BTREE_ITER_SLOTS); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++ } ++ ++ if (!bch2_btree_node_relock(ck_iter, 0)) { ++ bch2_trans_iter_put(trans, iter); ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ return -EINTR; ++ } ++ ++ if (k.k->u64s > ck->u64s) { ++ new_u64s = roundup_pow_of_two(k.k->u64s); ++ new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) { ++ bch2_trans_iter_put(trans, iter); ++ return -ENOMEM; ++ } ++ } ++ ++ bch2_btree_node_lock_write(ck_iter->l[0].b, ck_iter); ++ if (new_k) { ++ kfree(ck->k); ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ } ++ ++ 
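++	/* with the node write lock held, copy the key we read into the cached entry: */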
bkey_reassemble(ck->k, k); ++ ck->valid = true; ++ bch2_btree_node_unlock_write(ck_iter->l[0].b, ck_iter); ++ ++ /* We're not likely to need this iterator again: */ ++ bch2_trans_iter_free(trans, iter); ++ ++ return 0; ++} ++ ++static int bkey_cached_check_fn(struct six_lock *lock, void *p) ++{ ++ struct bkey_cached *ck = container_of(lock, struct bkey_cached, c.lock); ++ const struct btree_iter *iter = p; ++ ++ return ck->key.btree_id == iter->btree_id && ++ !bkey_cmp(ck->key.pos, iter->pos) ? 0 : -1; ++} ++ ++int bch2_btree_iter_traverse_cached(struct btree_iter *iter) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck; ++ int ret = 0; ++ ++ BUG_ON(iter->level); ++ ++ if (btree_node_locked(iter, 0)) { ++ ck = (void *) iter->l[0].b; ++ goto fill; ++ } ++retry: ++ ck = btree_key_cache_find(c, iter->btree_id, iter->pos); ++ if (!ck) { ++ if (iter->flags & BTREE_ITER_CACHED_NOCREATE) { ++ iter->l[0].b = NULL; ++ return 0; ++ } ++ ++ mutex_lock(&c->btree_key_cache.lock); ++ ck = btree_key_cache_create(&c->btree_key_cache, ++ iter->btree_id, iter->pos); ++ mutex_unlock(&c->btree_key_cache.lock); ++ ++ ret = PTR_ERR_OR_ZERO(ck); ++ if (ret) ++ goto err; ++ if (!ck) ++ goto retry; ++ ++ mark_btree_node_locked(iter, 0, SIX_LOCK_intent); ++ iter->locks_want = 1; ++ } else { ++ enum six_lock_type lock_want = __btree_lock_want(iter, 0); ++ ++ if (!btree_node_lock((void *) ck, iter->pos, 0, iter, lock_want, ++ bkey_cached_check_fn, iter)) { ++ if (ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)) { ++ goto retry; ++ } ++ ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ ret = -EINTR; ++ goto err; ++ } ++ ++ if (ck->key.btree_id != iter->btree_id || ++ bkey_cmp(ck->key.pos, iter->pos)) { ++ six_unlock_type(&ck->c.lock, lock_want); ++ goto retry; ++ } ++ ++ mark_btree_node_locked(iter, 0, lock_want); ++ } ++ ++ iter->l[0].lock_seq = ck->c.lock.state.seq; ++ iter->l[0].b = (void *) ck; ++fill: ++ if (!ck->valid && !(iter->flags & BTREE_ITER_CACHED_NOFILL)) { ++ if (!btree_node_intent_locked(iter, 0)) ++ bch2_btree_iter_upgrade(iter, 1); ++ if (!btree_node_intent_locked(iter, 0)) { ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ ret = -EINTR; ++ goto err; ++ } ++ ++ ret = btree_key_cache_fill(trans, iter, ck); ++ if (ret) ++ goto err; ++ } ++ ++ iter->uptodate = BTREE_ITER_NEED_PEEK; ++ bch2_btree_iter_downgrade(iter); ++ return ret; ++err: ++ if (ret != -EINTR) { ++ btree_node_unlock(iter, 0); ++ iter->flags |= BTREE_ITER_ERROR; ++ iter->l[0].b = BTREE_ITER_NO_NODE_ERROR; ++ } ++ return ret; ++} ++ ++static int btree_key_cache_flush_pos(struct btree_trans *trans, ++ struct bkey_cached_key key, ++ u64 journal_seq, ++ bool evict) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ struct btree_iter *c_iter = NULL, *b_iter = NULL; ++ struct bkey_cached *ck; ++ int ret; ++ ++ b_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, ++ BTREE_ITER_SLOTS| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(b_iter); ++ if (ret) ++ goto out; ++ ++ c_iter = bch2_trans_get_iter(trans, key.btree_id, key.pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_CACHED_NOCREATE| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(c_iter); ++ if (ret) ++ goto out; ++retry: ++ ret = bch2_btree_iter_traverse(c_iter); ++ if (ret) ++ goto err; ++ ++ ck = (void *) c_iter->l[0].b; ++ if (!ck || ++ (journal_seq && ck->journal.seq != journal_seq)) ++ goto out; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, 
&ck->flags)) { ++ if (!evict) ++ goto out; ++ goto evict; ++ } ++ ++ ret = bch2_btree_iter_traverse(b_iter) ?: ++ bch2_trans_update(trans, b_iter, ck->k, BTREE_TRIGGER_NORUN) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_JOURNAL_RESERVED| ++ BTREE_INSERT_JOURNAL_RECLAIM); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ BUG_ON(ret && !bch2_journal_error(j)); ++ ++ if (ret) ++ goto out; ++ ++ bch2_journal_pin_drop(j, &ck->journal); ++ bch2_journal_preres_put(j, &ck->res); ++ clear_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ ++ if (!evict) { ++ mutex_lock(&c->btree_key_cache.lock); ++ list_move_tail(&ck->list, &c->btree_key_cache.clean); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } else { ++evict: ++ BUG_ON(!btree_node_intent_locked(c_iter, 0)); ++ ++ mark_btree_node_unlocked(c_iter, 0); ++ c_iter->l[0].b = NULL; ++ ++ six_lock_write(&ck->c.lock, NULL, NULL); ++ ++ mutex_lock(&c->btree_key_cache.lock); ++ bkey_cached_evict(&c->btree_key_cache, ck); ++ bkey_cached_free(&c->btree_key_cache, ck); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } ++out: ++ bch2_trans_iter_put(trans, b_iter); ++ bch2_trans_iter_put(trans, c_iter); ++ return ret; ++} ++ ++static void btree_key_cache_journal_flush(struct journal *j, ++ struct journal_entry_pin *pin, ++ u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bkey_cached *ck = ++ container_of(pin, struct bkey_cached, journal); ++ struct bkey_cached_key key; ++ struct btree_trans trans; ++ ++ six_lock_read(&ck->c.lock, NULL, NULL); ++ key = ck->key; ++ ++ if (ck->journal.seq != seq || ++ !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ six_unlock_read(&ck->c.lock); ++ return; ++ } ++ six_unlock_read(&ck->c.lock); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ btree_key_cache_flush_pos(&trans, key, seq, false); ++ bch2_trans_exit(&trans); ++} ++ ++/* ++ * Flush and evict a key from the key cache: ++ */ ++int bch2_btree_key_cache_flush(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached_key key = { id, pos }; ++ ++ /* Fastpath - assume it won't be found: */ ++ if (!btree_key_cache_find(c, id, pos)) ++ return 0; ++ ++ return btree_key_cache_flush_pos(trans, key, 0, true); ++} ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ ++ BUG_ON(insert->u64s > ck->u64s); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ int difference; ++ ++ BUG_ON(jset_u64s(insert->u64s) > trans->journal_preres.u64s); ++ ++ difference = jset_u64s(insert->u64s) - ck->res.u64s; ++ if (difference > 0) { ++ trans->journal_preres.u64s -= difference; ++ ck->res.u64s += difference; ++ } ++ } ++ ++ bkey_copy(ck->k, insert); ++ ck->valid = true; ++ ++ if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { ++ mutex_lock(&c->btree_key_cache.lock); ++ list_del_init(&ck->list); ++ ++ set_bit(BKEY_CACHED_DIRTY, &ck->flags); ++ mutex_unlock(&c->btree_key_cache.lock); ++ } ++ ++ bch2_journal_pin_update(&c->journal, trans->journal_res.seq, ++ &ck->journal, btree_key_cache_journal_flush); ++ return true; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_key_cache_verify_clean(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) ++{ ++ 
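++	/* debug-only assertion: no key for this position may still be present in the key cache */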
BUG_ON(btree_key_cache_find(trans->c, id, pos)); ++} ++#endif ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *c) ++{ ++ struct bkey_cached *ck, *n; ++ ++ mutex_lock(&c->lock); ++ list_for_each_entry_safe(ck, n, &c->clean, list) { ++ kfree(ck->k); ++ kfree(ck); ++ } ++ list_for_each_entry_safe(ck, n, &c->freed, list) ++ kfree(ck); ++ mutex_unlock(&c->lock); ++ ++ rhashtable_destroy(&c->table); ++} ++ ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) ++{ ++ mutex_init(&c->lock); ++ INIT_LIST_HEAD(&c->freed); ++ INIT_LIST_HEAD(&c->clean); ++} ++ ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *c) ++{ ++ return rhashtable_init(&c->table, &bch2_btree_key_cache_params); ++} ++ ++void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) ++{ ++ struct bucket_table *tbl; ++ struct bkey_cached *ck; ++ struct rhash_head *pos; ++ size_t i; ++ ++ mutex_lock(&c->lock); ++ tbl = rht_dereference_rcu(c->table.tbl, &c->table); ++ ++ for (i = 0; i < tbl->size; i++) { ++ rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { ++ pr_buf(out, "%s:", ++ bch2_btree_ids[ck->key.btree_id]); ++ bch2_bpos_to_text(out, ck->key.pos); ++ ++ if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) ++ pr_buf(out, " journal seq %llu", ck->journal.seq); ++ pr_buf(out, "\n"); ++ } ++ } ++ mutex_unlock(&c->lock); ++} +diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h +new file mode 100644 +index 000000000000..b1756c6c622c +--- /dev/null ++++ b/fs/bcachefs/btree_key_cache.h +@@ -0,0 +1,25 @@ ++#ifndef _BCACHEFS_BTREE_KEY_CACHE_H ++#define _BCACHEFS_BTREE_KEY_CACHE_H ++ ++int bch2_btree_iter_traverse_cached(struct btree_iter *); ++ ++bool bch2_btree_insert_key_cached(struct btree_trans *, ++ struct btree_iter *, struct bkey_i *); ++int bch2_btree_key_cache_flush(struct btree_trans *, ++ enum btree_id, struct bpos); ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_btree_key_cache_verify_clean(struct btree_trans *, ++ enum btree_id, struct bpos); ++#else ++static inline void ++bch2_btree_key_cache_verify_clean(struct btree_trans *trans, ++ enum btree_id id, struct bpos pos) {} ++#endif ++ ++void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); ++void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); ++int bch2_fs_btree_key_cache_init(struct btree_key_cache *); ++ ++void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); ++ ++#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ +diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h +new file mode 100644 +index 000000000000..81fbf3e18647 +--- /dev/null ++++ b/fs/bcachefs/btree_locking.h +@@ -0,0 +1,257 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_LOCKING_H ++#define _BCACHEFS_BTREE_LOCKING_H ++ ++/* ++ * Only for internal btree use: ++ * ++ * The btree iterator tracks what locks it wants to take, and what locks it ++ * currently has - here we have wrappers for locking/unlocking btree nodes and ++ * updating the iterator state ++ */ ++ ++#include ++ ++#include "btree_iter.h" ++ ++/* matches six lock types */ ++enum btree_node_locked_type { ++ BTREE_NODE_UNLOCKED = -1, ++ BTREE_NODE_READ_LOCKED = SIX_LOCK_read, ++ BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, ++}; ++ ++static inline int btree_node_locked_type(struct btree_iter *iter, ++ unsigned level) ++{ ++ /* ++ * We're relying on the fact that if nodes_intent_locked is set ++ * nodes_locked must be set as well, so that we can compute without ++ * branches: ++ */ ++ return BTREE_NODE_UNLOCKED + 
++ ((iter->nodes_locked >> level) & 1) + ++ ((iter->nodes_intent_locked >> level) & 1); ++} ++ ++static inline bool btree_node_intent_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ return btree_node_locked_type(iter, level) == BTREE_NODE_INTENT_LOCKED; ++} ++ ++static inline bool btree_node_read_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ return btree_node_locked_type(iter, level) == BTREE_NODE_READ_LOCKED; ++} ++ ++static inline bool btree_node_locked(struct btree_iter *iter, unsigned level) ++{ ++ return iter->nodes_locked & (1 << level); ++} ++ ++static inline void mark_btree_node_unlocked(struct btree_iter *iter, ++ unsigned level) ++{ ++ iter->nodes_locked &= ~(1 << level); ++ iter->nodes_intent_locked &= ~(1 << level); ++} ++ ++static inline void mark_btree_node_locked(struct btree_iter *iter, ++ unsigned level, ++ enum six_lock_type type) ++{ ++ /* relying on this to avoid a branch */ ++ BUILD_BUG_ON(SIX_LOCK_read != 0); ++ BUILD_BUG_ON(SIX_LOCK_intent != 1); ++ ++ iter->nodes_locked |= 1 << level; ++ iter->nodes_intent_locked |= type << level; ++} ++ ++static inline void mark_btree_node_intent_locked(struct btree_iter *iter, ++ unsigned level) ++{ ++ mark_btree_node_locked(iter, level, SIX_LOCK_intent); ++} ++ ++static inline enum six_lock_type __btree_lock_want(struct btree_iter *iter, int level) ++{ ++ return level < iter->locks_want ++ ? SIX_LOCK_intent ++ : SIX_LOCK_read; ++} ++ ++static inline enum btree_node_locked_type ++btree_lock_want(struct btree_iter *iter, int level) ++{ ++ if (level < iter->level) ++ return BTREE_NODE_UNLOCKED; ++ if (level < iter->locks_want) ++ return BTREE_NODE_INTENT_LOCKED; ++ if (level == iter->level) ++ return BTREE_NODE_READ_LOCKED; ++ return BTREE_NODE_UNLOCKED; ++} ++ ++static inline void __btree_node_unlock(struct btree_iter *iter, unsigned level) ++{ ++ int lock_type = btree_node_locked_type(iter, level); ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ ++ if (lock_type != BTREE_NODE_UNLOCKED) ++ six_unlock_type(&iter->l[level].b->c.lock, lock_type); ++ mark_btree_node_unlocked(iter, level); ++} ++ ++static inline void btree_node_unlock(struct btree_iter *iter, unsigned level) ++{ ++ EBUG_ON(!level && iter->trans->nounlock); ++ ++ __btree_node_unlock(iter, level); ++} ++ ++static inline void __bch2_btree_iter_unlock(struct btree_iter *iter) ++{ ++ btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK); ++ ++ while (iter->nodes_locked) ++ btree_node_unlock(iter, __ffs(iter->nodes_locked)); ++} ++ ++static inline enum bch_time_stats lock_to_time_stat(enum six_lock_type type) ++{ ++ switch (type) { ++ case SIX_LOCK_read: ++ return BCH_TIME_btree_lock_contended_read; ++ case SIX_LOCK_intent: ++ return BCH_TIME_btree_lock_contended_intent; ++ case SIX_LOCK_write: ++ return BCH_TIME_btree_lock_contended_write; ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * wrapper around six locks that just traces lock contended time ++ */ ++static inline void __btree_node_lock_type(struct bch_fs *c, struct btree *b, ++ enum six_lock_type type) ++{ ++ u64 start_time = local_clock(); ++ ++ six_lock_type(&b->c.lock, type, NULL, NULL); ++ bch2_time_stats_update(&c->times[lock_to_time_stat(type)], start_time); ++} ++ ++static inline void btree_node_lock_type(struct bch_fs *c, struct btree *b, ++ enum six_lock_type type) ++{ ++ if (!six_trylock_type(&b->c.lock, type)) ++ __btree_node_lock_type(c, b, type); ++} ++ ++/* ++ * Lock a btree node if we already have it locked on one of our linked ++ * iterators: ++ */ ++static inline bool 
btree_node_lock_increment(struct btree_trans *trans, ++ struct btree *b, unsigned level, ++ enum btree_node_locked_type want) ++{ ++ struct btree_iter *iter; ++ ++ trans_for_each_iter(trans, iter) ++ if (iter->l[level].b == b && ++ btree_node_locked_type(iter, level) >= want) { ++ six_lock_increment(&b->c.lock, want); ++ return true; ++ } ++ ++ return false; ++} ++ ++bool __bch2_btree_node_lock(struct btree *, struct bpos, unsigned, ++ struct btree_iter *, enum six_lock_type, ++ six_lock_should_sleep_fn, void *); ++ ++static inline bool btree_node_lock(struct btree *b, ++ struct bpos pos, unsigned level, ++ struct btree_iter *iter, ++ enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ struct btree_trans *trans = iter->trans; ++ bool ret; ++ ++ EBUG_ON(level >= BTREE_MAX_DEPTH); ++ EBUG_ON(!(trans->iters_linked & (1ULL << iter->idx))); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->locking = b; ++ trans->locking_iter_idx = iter->idx; ++ trans->locking_pos = pos; ++ trans->locking_btree_id = iter->btree_id; ++ trans->locking_level = level; ++#endif ++ ret = likely(six_trylock_type(&b->c.lock, type)) || ++ btree_node_lock_increment(trans, b, level, type) || ++ __bch2_btree_node_lock(b, pos, level, iter, type, ++ should_sleep_fn, p); ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans->locking = NULL; ++#endif ++ return ret; ++} ++ ++bool __bch2_btree_node_relock(struct btree_iter *, unsigned); ++ ++static inline bool bch2_btree_node_relock(struct btree_iter *iter, ++ unsigned level) ++{ ++ EBUG_ON(btree_node_locked(iter, level) && ++ btree_node_locked_type(iter, level) != ++ __btree_lock_want(iter, level)); ++ ++ return likely(btree_node_locked(iter, level)) || ++ __bch2_btree_node_relock(iter, level); ++} ++ ++/* ++ * Updates the saved lock sequence number, so that bch2_btree_node_relock() will ++ * succeed: ++ */ ++static inline void ++bch2_btree_node_unlock_write_inlined(struct btree *b, struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ ++ EBUG_ON(iter->l[b->c.level].b != b); ++ EBUG_ON(iter->l[b->c.level].lock_seq + 1 != b->c.lock.state.seq); ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ linked->l[b->c.level].lock_seq += 2; ++ ++ six_unlock_write(&b->c.lock); ++} ++ ++void bch2_btree_node_unlock_write(struct btree *, struct btree_iter *); ++ ++void __bch2_btree_node_lock_write(struct btree *, struct btree_iter *); ++ ++static inline void bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter) ++{ ++ EBUG_ON(iter->l[b->c.level].b != b); ++ EBUG_ON(iter->l[b->c.level].lock_seq != b->c.lock.state.seq); ++ ++ if (unlikely(!six_trylock_write(&b->c.lock))) ++ __bch2_btree_node_lock_write(b, iter); ++} ++ ++#endif /* _BCACHEFS_BTREE_LOCKING_H */ ++ ++ +diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h +new file mode 100644 +index 000000000000..cc01baeec138 +--- /dev/null ++++ b/fs/bcachefs/btree_types.h +@@ -0,0 +1,663 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_TYPES_H ++#define _BCACHEFS_BTREE_TYPES_H ++ ++#include ++#include ++#include ++ ++#include "bkey_methods.h" ++#include "buckets_types.h" ++#include "journal_types.h" ++ ++struct open_bucket; ++struct btree_update; ++struct btree_trans; ++ ++#define MAX_BSETS 3U ++ ++struct btree_nr_keys { ++ ++ /* ++ * Amount of live metadata (i.e. 
size of node after a compaction) in ++ * units of u64s ++ */ ++ u16 live_u64s; ++ u16 bset_u64s[MAX_BSETS]; ++ ++ /* live keys only: */ ++ u16 packed_keys; ++ u16 unpacked_keys; ++}; ++ ++struct bset_tree { ++ /* ++ * We construct a binary tree in an array as if the array ++ * started at 1, so that things line up on the same cachelines ++ * better: see comments in bset.c at cacheline_to_bkey() for ++ * details ++ */ ++ ++ /* size of the binary tree and prev array */ ++ u16 size; ++ ++ /* function of size - precalculated for to_inorder() */ ++ u16 extra; ++ ++ u16 data_offset; ++ u16 aux_data_offset; ++ u16 end_offset; ++ ++ struct bpos max_key; ++}; ++ ++struct btree_write { ++ struct journal_entry_pin journal; ++}; ++ ++struct btree_alloc { ++ struct open_buckets ob; ++ BKEY_PADDED(k); ++}; ++ ++struct btree_bkey_cached_common { ++ struct six_lock lock; ++ u8 level; ++ u8 btree_id; ++}; ++ ++struct btree { ++ struct btree_bkey_cached_common c; ++ ++ struct rhash_head hash; ++ u64 hash_val; ++ ++ unsigned long flags; ++ u16 written; ++ u8 nsets; ++ u8 nr_key_bits; ++ ++ struct bkey_format format; ++ ++ struct btree_node *data; ++ void *aux_data; ++ ++ /* ++ * Sets of sorted keys - the real btree node - plus a binary search tree ++ * ++ * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point ++ * to the memory we have allocated for this btree node. Additionally, ++ * set[0]->data points to the entire btree node as it exists on disk. ++ */ ++ struct bset_tree set[MAX_BSETS]; ++ ++ struct btree_nr_keys nr; ++ u16 sib_u64s[2]; ++ u16 whiteout_u64s; ++ u8 byte_order; ++ u8 unpack_fn_len; ++ ++ /* ++ * XXX: add a delete sequence number, so when bch2_btree_node_relock() ++ * fails because the lock sequence number has changed - i.e. the ++ * contents were modified - we can still relock the node if it's still ++ * the one we want, without redoing the traversal ++ */ ++ ++ /* ++ * For asynchronous splits/interior node updates: ++ * When we do a split, we allocate new child nodes and update the parent ++ * node to point to them: we update the parent in memory immediately, ++ * but then we must wait until the children have been written out before ++ * the update to the parent can be written - this is a list of the ++ * btree_updates that are blocking this node from being ++ * written: ++ */ ++ struct list_head write_blocked; ++ ++ /* ++ * Also for asynchronous splits/interior node updates: ++ * If a btree node isn't reachable yet, we don't want to kick off ++ * another write - because that write also won't yet be reachable and ++ * marking it as completed before it's reachable would be incorrect: ++ */ ++ unsigned long will_make_reachable; ++ ++ struct open_buckets ob; ++ ++ /* lru list */ ++ struct list_head list; ++ ++ struct btree_write writes[2]; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ bool *expensive_debug_checks; ++#endif ++ ++ /* Key/pointer for this btree node */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++}; ++ ++struct btree_cache { ++ struct rhashtable table; ++ bool table_init_done; ++ /* ++ * We never free a struct btree, except on shutdown - we just put it on ++ * the btree_cache_freed list and reuse it later. This simplifies the ++ * code, and it doesn't cost us much memory as the memory usage is ++ * dominated by buffers that hold the actual btree node data and those ++ * can be freed - and the number of struct btrees allocated is ++ * effectively bounded. 
++ * ++ * btree_cache_freeable effectively is a small cache - we use it because ++ * high order page allocations can be rather expensive, and it's quite ++ * common to delete and allocate btree nodes in quick succession. It ++ * should never grow past ~2-3 nodes in practice. ++ */ ++ struct mutex lock; ++ struct list_head live; ++ struct list_head freeable; ++ struct list_head freed; ++ ++ /* Number of elements in live + freeable lists */ ++ unsigned used; ++ unsigned reserve; ++ struct shrinker shrink; ++ ++ /* ++ * If we need to allocate memory for a new btree node and that ++ * allocation fails, we can cannibalize another node in the btree cache ++ * to satisfy the allocation - lock to guarantee only one thread does ++ * this at a time: ++ */ ++ struct task_struct *alloc_lock; ++ struct closure_waitlist alloc_wait; ++}; ++ ++struct btree_node_iter { ++ struct btree_node_iter_set { ++ u16 k, end; ++ } data[MAX_BSETS]; ++}; ++ ++enum btree_iter_type { ++ BTREE_ITER_KEYS, ++ BTREE_ITER_NODES, ++ BTREE_ITER_CACHED, ++}; ++ ++#define BTREE_ITER_TYPE ((1 << 2) - 1) ++ ++/* ++ * Iterate over all possible positions, synthesizing deleted keys for holes: ++ */ ++#define BTREE_ITER_SLOTS (1 << 2) ++/* ++ * Indicates that intent locks should be taken on leaf nodes, because we expect ++ * to be doing updates: ++ */ ++#define BTREE_ITER_INTENT (1 << 3) ++/* ++ * Causes the btree iterator code to prefetch additional btree nodes from disk: ++ */ ++#define BTREE_ITER_PREFETCH (1 << 4) ++/* ++ * Indicates that this iterator should not be reused until transaction commit, ++ * either because a pending update references it or because the update depends ++ * on that particular key being locked (e.g. by the str_hash code, for hash ++ * table consistency) ++ */ ++#define BTREE_ITER_KEEP_UNTIL_COMMIT (1 << 5) ++/* ++ * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for ++ * @pos or the first key strictly greater than @pos ++ */ ++#define BTREE_ITER_IS_EXTENTS (1 << 6) ++#define BTREE_ITER_ERROR (1 << 7) ++#define BTREE_ITER_SET_POS_AFTER_COMMIT (1 << 8) ++#define BTREE_ITER_CACHED_NOFILL (1 << 9) ++#define BTREE_ITER_CACHED_NOCREATE (1 << 10) ++ ++#define BTREE_ITER_USER_FLAGS \ ++ (BTREE_ITER_SLOTS \ ++ |BTREE_ITER_INTENT \ ++ |BTREE_ITER_PREFETCH \ ++ |BTREE_ITER_CACHED_NOFILL \ ++ |BTREE_ITER_CACHED_NOCREATE) ++ ++enum btree_iter_uptodate { ++ BTREE_ITER_UPTODATE = 0, ++ BTREE_ITER_NEED_PEEK = 1, ++ BTREE_ITER_NEED_RELOCK = 2, ++ BTREE_ITER_NEED_TRAVERSE = 3, ++}; ++ ++#define BTREE_ITER_NO_NODE_GET_LOCKS ((struct btree *) 1) ++#define BTREE_ITER_NO_NODE_DROP ((struct btree *) 2) ++#define BTREE_ITER_NO_NODE_LOCK_ROOT ((struct btree *) 3) ++#define BTREE_ITER_NO_NODE_UP ((struct btree *) 4) ++#define BTREE_ITER_NO_NODE_DOWN ((struct btree *) 5) ++#define BTREE_ITER_NO_NODE_INIT ((struct btree *) 6) ++#define BTREE_ITER_NO_NODE_ERROR ((struct btree *) 7) ++ ++/* ++ * @pos - iterator's current position ++ * @level - current btree depth ++ * @locks_want - btree level below which we start taking intent locks ++ * @nodes_locked - bitmask indicating which nodes in @nodes are locked ++ * @nodes_intent_locked - bitmask indicating which locks are intent locks ++ */ ++struct btree_iter { ++ struct btree_trans *trans; ++ struct bpos pos; ++ struct bpos pos_after_commit; ++ ++ u16 flags; ++ u8 idx; ++ ++ enum btree_id btree_id:4; ++ enum btree_iter_uptodate uptodate:4; ++ unsigned level:4, ++ min_depth:4, ++ locks_want:4, ++ nodes_locked:4, ++ nodes_intent_locked:4; ++ ++ struct btree_iter_level 
{ ++ struct btree *b; ++ struct btree_node_iter iter; ++ u32 lock_seq; ++ } l[BTREE_MAX_DEPTH]; ++ ++ /* ++ * Current unpacked key - so that bch2_btree_iter_next()/ ++ * bch2_btree_iter_next_slot() can correctly advance pos. ++ */ ++ struct bkey k; ++ unsigned long ip_allocated; ++}; ++ ++static inline enum btree_iter_type ++btree_iter_type(const struct btree_iter *iter) ++{ ++ return iter->flags & BTREE_ITER_TYPE; ++} ++ ++static inline struct btree_iter_level *iter_l(struct btree_iter *iter) ++{ ++ return iter->l + iter->level; ++} ++ ++struct btree_key_cache { ++ struct mutex lock; ++ struct rhashtable table; ++ struct list_head freed; ++ struct list_head clean; ++}; ++ ++struct bkey_cached_key { ++ u32 btree_id; ++ struct bpos pos; ++} __attribute__((packed, aligned(4))); ++ ++#define BKEY_CACHED_DIRTY 0 ++ ++struct bkey_cached { ++ struct btree_bkey_cached_common c; ++ ++ unsigned long flags; ++ u8 u64s; ++ bool valid; ++ struct bkey_cached_key key; ++ ++ struct rhash_head hash; ++ struct list_head list; ++ ++ struct journal_preres res; ++ struct journal_entry_pin journal; ++ ++ struct bkey_i *k; ++}; ++ ++struct btree_insert_entry { ++ unsigned trigger_flags; ++ unsigned trans_triggers_run:1; ++ struct bkey_i *k; ++ struct btree_iter *iter; ++}; ++ ++#ifndef CONFIG_LOCKDEP ++#define BTREE_ITER_MAX 64 ++#else ++#define BTREE_ITER_MAX 32 ++#endif ++ ++struct btree_trans { ++ struct bch_fs *c; ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct list_head list; ++ struct btree *locking; ++ unsigned locking_iter_idx; ++ struct bpos locking_pos; ++ u8 locking_btree_id; ++ u8 locking_level; ++ pid_t pid; ++#endif ++ unsigned long ip; ++ ++ u64 iters_linked; ++ u64 iters_live; ++ u64 iters_touched; ++ ++ u8 nr_iters; ++ u8 nr_updates; ++ u8 nr_updates2; ++ u8 size; ++ unsigned used_mempool:1; ++ unsigned error:1; ++ unsigned nounlock:1; ++ unsigned need_reset:1; ++ unsigned in_traverse_all:1; ++ ++ unsigned mem_top; ++ unsigned mem_bytes; ++ void *mem; ++ ++ struct btree_iter *iters; ++ struct btree_insert_entry *updates; ++ struct btree_insert_entry *updates2; ++ ++ /* update path: */ ++ struct jset_entry *extra_journal_entries; ++ unsigned extra_journal_entry_u64s; ++ struct journal_entry_pin *journal_pin; ++ ++ struct journal_res journal_res; ++ struct journal_preres journal_preres; ++ u64 *journal_seq; ++ struct disk_reservation *disk_res; ++ unsigned flags; ++ unsigned journal_u64s; ++ unsigned journal_preres_u64s; ++ struct replicas_delta_list *fs_usage_deltas; ++ ++ struct btree_iter iters_onstack[2]; ++ struct btree_insert_entry updates_onstack[2]; ++ struct btree_insert_entry updates2_onstack[2]; ++}; ++ ++#define BTREE_FLAG(flag) \ ++static inline bool btree_node_ ## flag(struct btree *b) \ ++{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void set_btree_node_ ## flag(struct btree *b) \ ++{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ ++ \ ++static inline void clear_btree_node_ ## flag(struct btree *b) \ ++{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } ++ ++enum btree_flags { ++ BTREE_NODE_read_in_flight, ++ BTREE_NODE_read_error, ++ BTREE_NODE_dirty, ++ BTREE_NODE_need_write, ++ BTREE_NODE_noevict, ++ BTREE_NODE_write_idx, ++ BTREE_NODE_accessed, ++ BTREE_NODE_write_in_flight, ++ BTREE_NODE_just_written, ++ BTREE_NODE_dying, ++ BTREE_NODE_fake, ++ BTREE_NODE_old_extent_overwrite, ++ BTREE_NODE_need_rewrite, ++}; ++ ++BTREE_FLAG(read_in_flight); ++BTREE_FLAG(read_error); ++BTREE_FLAG(dirty); ++BTREE_FLAG(need_write); ++BTREE_FLAG(noevict); ++BTREE_FLAG(write_idx); 
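/*
 * Editor's note (illustrative, not part of the upstream patch): the
 * BTREE_FLAG() macro defined just above generates the usual
 * test/set/clear accessors over b->flags. Expanding it mechanically per
 * that definition, BTREE_FLAG(dirty) produces:
 *
 *   static inline bool btree_node_dirty(struct btree *b)
 *   { return test_bit(BTREE_NODE_dirty, &b->flags); }
 *
 *   static inline void set_btree_node_dirty(struct btree *b)
 *   { set_bit(BTREE_NODE_dirty, &b->flags); }
 *
 *   static inline void clear_btree_node_dirty(struct btree *b)
 *   { clear_bit(BTREE_NODE_dirty, &b->flags); }
 *
 * which is the form relied on by callers later in this patch, e.g. the
 * BUG_ON(btree_node_dirty(b)) check in __btree_node_free().
 */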
++BTREE_FLAG(accessed); ++BTREE_FLAG(write_in_flight); ++BTREE_FLAG(just_written); ++BTREE_FLAG(dying); ++BTREE_FLAG(fake); ++BTREE_FLAG(old_extent_overwrite); ++BTREE_FLAG(need_rewrite); ++ ++static inline struct btree_write *btree_current_write(struct btree *b) ++{ ++ return b->writes + btree_node_write_idx(b); ++} ++ ++static inline struct btree_write *btree_prev_write(struct btree *b) ++{ ++ return b->writes + (btree_node_write_idx(b) ^ 1); ++} ++ ++static inline struct bset_tree *bset_tree_last(struct btree *b) ++{ ++ EBUG_ON(!b->nsets); ++ return b->set + b->nsets - 1; ++} ++ ++static inline void * ++__btree_node_offset_to_ptr(const struct btree *b, u16 offset) ++{ ++ return (void *) ((u64 *) b->data + 1 + offset); ++} ++ ++static inline u16 ++__btree_node_ptr_to_offset(const struct btree *b, const void *p) ++{ ++ u16 ret = (u64 *) p - 1 - (u64 *) b->data; ++ ++ EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); ++ return ret; ++} ++ ++static inline struct bset *bset(const struct btree *b, ++ const struct bset_tree *t) ++{ ++ return __btree_node_offset_to_ptr(b, t->data_offset); ++} ++ ++static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) ++{ ++ t->end_offset = ++ __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); ++} ++ ++static inline void set_btree_bset(struct btree *b, struct bset_tree *t, ++ const struct bset *i) ++{ ++ t->data_offset = __btree_node_ptr_to_offset(b, i); ++ set_btree_bset_end(b, t); ++} ++ ++static inline struct bset *btree_bset_first(struct btree *b) ++{ ++ return bset(b, b->set); ++} ++ ++static inline struct bset *btree_bset_last(struct btree *b) ++{ ++ return bset(b, bset_tree_last(b)); ++} ++ ++static inline u16 ++__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) ++{ ++ return __btree_node_ptr_to_offset(b, k); ++} ++ ++static inline struct bkey_packed * ++__btree_node_offset_to_key(const struct btree *b, u16 k) ++{ ++ return __btree_node_offset_to_ptr(b, k); ++} ++ ++static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) ++{ ++ return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); ++} ++ ++#define btree_bkey_first(_b, _t) \ ++({ \ ++ EBUG_ON(bset(_b, _t)->start != \ ++ __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ ++ \ ++ bset(_b, _t)->start; \ ++}) ++ ++#define btree_bkey_last(_b, _t) \ ++({ \ ++ EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ ++ vstruct_last(bset(_b, _t))); \ ++ \ ++ __btree_node_offset_to_key(_b, (_t)->end_offset); \ ++}) ++ ++static inline unsigned bset_u64s(struct bset_tree *t) ++{ ++ return t->end_offset - t->data_offset - ++ sizeof(struct bset) / sizeof(u64); ++} ++ ++static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) ++{ ++ return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; ++} ++ ++static inline unsigned bset_byte_offset(struct btree *b, void *i) ++{ ++ return i - (void *) b->data; ++} ++ ++enum btree_node_type { ++#define x(kwd, val, name) BKEY_TYPE_##kwd = val, ++ BCH_BTREE_IDS() ++#undef x ++ BKEY_TYPE_BTREE, ++}; ++ ++/* Type of a key in btree @id at level @level: */ ++static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) ++{ ++ return level ? 
BKEY_TYPE_BTREE : (enum btree_node_type) id; ++} ++ ++/* Type of keys @b contains: */ ++static inline enum btree_node_type btree_node_type(struct btree *b) ++{ ++ return __btree_node_type(b->c.level, b->c.btree_id); ++} ++ ++static inline bool btree_node_type_is_extents(enum btree_node_type type) ++{ ++ switch (type) { ++ case BKEY_TYPE_EXTENTS: ++ case BKEY_TYPE_REFLINK: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool btree_node_is_extents(struct btree *b) ++{ ++ return btree_node_type_is_extents(btree_node_type(b)); ++} ++ ++static inline enum btree_node_type btree_iter_key_type(struct btree_iter *iter) ++{ ++ return __btree_node_type(iter->level, iter->btree_id); ++} ++ ++static inline bool btree_iter_is_extents(struct btree_iter *iter) ++{ ++ return btree_node_type_is_extents(btree_iter_key_type(iter)); ++} ++ ++#define BTREE_NODE_TYPE_HAS_TRIGGERS \ ++ ((1U << BKEY_TYPE_EXTENTS)| \ ++ (1U << BKEY_TYPE_ALLOC)| \ ++ (1U << BKEY_TYPE_INODES)| \ ++ (1U << BKEY_TYPE_REFLINK)| \ ++ (1U << BKEY_TYPE_EC)| \ ++ (1U << BKEY_TYPE_BTREE)) ++ ++#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ ++ ((1U << BKEY_TYPE_EXTENTS)| \ ++ (1U << BKEY_TYPE_INODES)| \ ++ (1U << BKEY_TYPE_EC)| \ ++ (1U << BKEY_TYPE_REFLINK)) ++ ++enum btree_trigger_flags { ++ __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ ++ ++ __BTREE_TRIGGER_INSERT, ++ __BTREE_TRIGGER_OVERWRITE, ++ __BTREE_TRIGGER_OVERWRITE_SPLIT, ++ ++ __BTREE_TRIGGER_GC, ++ __BTREE_TRIGGER_BUCKET_INVALIDATE, ++ __BTREE_TRIGGER_NOATOMIC, ++}; ++ ++#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) ++ ++#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) ++#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) ++#define BTREE_TRIGGER_OVERWRITE_SPLIT (1U << __BTREE_TRIGGER_OVERWRITE_SPLIT) ++ ++#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) ++#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) ++#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) ++ ++static inline bool btree_node_type_needs_gc(enum btree_node_type type) ++{ ++ return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); ++} ++ ++struct btree_root { ++ struct btree *b; ++ ++ /* On disk root - see async splits: */ ++ __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); ++ u8 level; ++ u8 alive; ++ s8 error; ++}; ++ ++/* ++ * Optional hook that will be called just prior to a btree node update, when ++ * we're holding the write lock and we know what key is about to be overwritten: ++ */ ++ ++enum btree_insert_ret { ++ BTREE_INSERT_OK, ++ /* leaf node needs to be split */ ++ BTREE_INSERT_BTREE_NODE_FULL, ++ BTREE_INSERT_ENOSPC, ++ BTREE_INSERT_NEED_MARK_REPLICAS, ++ BTREE_INSERT_NEED_JOURNAL_RES, ++}; ++ ++enum btree_gc_coalesce_fail_reason { ++ BTREE_GC_COALESCE_FAIL_RESERVE_GET, ++ BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, ++ BTREE_GC_COALESCE_FAIL_FORMAT_FITS, ++}; ++ ++enum btree_node_sibling { ++ btree_prev_sib, ++ btree_next_sib, ++}; ++ ++typedef struct btree_nr_keys (*sort_fix_overlapping_fn)(struct bset *, ++ struct btree *, ++ struct btree_node_iter *); ++ ++#endif /* _BCACHEFS_BTREE_TYPES_H */ +diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h +new file mode 100644 +index 000000000000..e0b1bde37484 +--- /dev/null ++++ b/fs/bcachefs/btree_update.h +@@ -0,0 +1,144 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_H ++#define _BCACHEFS_BTREE_UPDATE_H ++ ++#include "btree_iter.h" ++#include "journal.h" ++ ++struct bch_fs; ++struct btree; ++ ++void 
bch2_btree_node_lock_for_insert(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *, ++ struct btree_node_iter *, struct bkey_i *); ++void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); ++ ++enum btree_insert_flags { ++ __BTREE_INSERT_NOUNLOCK, ++ __BTREE_INSERT_NOFAIL, ++ __BTREE_INSERT_NOCHECK_RW, ++ __BTREE_INSERT_LAZY_RW, ++ __BTREE_INSERT_USE_RESERVE, ++ __BTREE_INSERT_USE_ALLOC_RESERVE, ++ __BTREE_INSERT_JOURNAL_REPLAY, ++ __BTREE_INSERT_JOURNAL_RESERVED, ++ __BTREE_INSERT_JOURNAL_RECLAIM, ++ __BTREE_INSERT_NOWAIT, ++ __BTREE_INSERT_GC_LOCK_HELD, ++ __BCH_HASH_SET_MUST_CREATE, ++ __BCH_HASH_SET_MUST_REPLACE, ++}; ++ ++/* ++ * Don't drop locks _after_ successfully updating btree: ++ */ ++#define BTREE_INSERT_NOUNLOCK (1 << __BTREE_INSERT_NOUNLOCK) ++ ++/* Don't check for -ENOSPC: */ ++#define BTREE_INSERT_NOFAIL (1 << __BTREE_INSERT_NOFAIL) ++ ++#define BTREE_INSERT_NOCHECK_RW (1 << __BTREE_INSERT_NOCHECK_RW) ++#define BTREE_INSERT_LAZY_RW (1 << __BTREE_INSERT_LAZY_RW) ++ ++/* for copygc, or when merging btree nodes */ ++#define BTREE_INSERT_USE_RESERVE (1 << __BTREE_INSERT_USE_RESERVE) ++#define BTREE_INSERT_USE_ALLOC_RESERVE (1 << __BTREE_INSERT_USE_ALLOC_RESERVE) ++ ++/* Insert is for journal replay - don't get journal reservations: */ ++#define BTREE_INSERT_JOURNAL_REPLAY (1 << __BTREE_INSERT_JOURNAL_REPLAY) ++ ++/* Indicates that we have pre-reserved space in the journal: */ ++#define BTREE_INSERT_JOURNAL_RESERVED (1 << __BTREE_INSERT_JOURNAL_RESERVED) ++ ++/* Insert is being called from journal reclaim path: */ ++#define BTREE_INSERT_JOURNAL_RECLAIM (1 << __BTREE_INSERT_JOURNAL_RECLAIM) ++ ++/* Don't block on allocation failure (for new btree nodes: */ ++#define BTREE_INSERT_NOWAIT (1 << __BTREE_INSERT_NOWAIT) ++#define BTREE_INSERT_GC_LOCK_HELD (1 << __BTREE_INSERT_GC_LOCK_HELD) ++ ++#define BCH_HASH_SET_MUST_CREATE (1 << __BCH_HASH_SET_MUST_CREATE) ++#define BCH_HASH_SET_MUST_REPLACE (1 << __BCH_HASH_SET_MUST_REPLACE) ++ ++int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); ++ ++int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *); ++int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, ++ struct disk_reservation *, u64 *, int flags); ++ ++int bch2_btree_delete_at_range(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64 *); ++int bch2_btree_delete_range(struct bch_fs *, enum btree_id, ++ struct bpos, struct bpos, u64 *); ++ ++int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *, ++ __le64, unsigned); ++int bch2_btree_node_update_key(struct bch_fs *, struct btree_iter *, ++ struct btree *, struct bkey_i *); ++ ++int bch2_trans_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, enum btree_trigger_flags); ++int __bch2_trans_commit(struct btree_trans *); ++ ++/** ++ * bch2_trans_commit - insert keys at given iterator positions ++ * ++ * This is main entry point for btree updates. ++ * ++ * Return values: ++ * -EINTR: locking changed, this function should be called again. 
++ * -EROFS: filesystem read only ++ * -EIO: journal or btree node IO error ++ */ ++static inline int bch2_trans_commit(struct btree_trans *trans, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ unsigned flags) ++{ ++ trans->disk_res = disk_res; ++ trans->journal_seq = journal_seq; ++ trans->flags = flags; ++ ++ return __bch2_trans_commit(trans); ++} ++ ++#define __bch2_trans_do(_trans, _disk_res, _journal_seq, _flags, _do) \ ++({ \ ++ int _ret; \ ++ \ ++ while (1) { \ ++ _ret = (_do) ?: bch2_trans_commit(_trans, (_disk_res), \ ++ (_journal_seq), (_flags)); \ ++ if (_ret != -EINTR) \ ++ break; \ ++ bch2_trans_reset(_trans, 0); \ ++ } \ ++ \ ++ _ret; \ ++}) ++ ++#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ ++({ \ ++ struct btree_trans trans; \ ++ int _ret, _ret2; \ ++ \ ++ bch2_trans_init(&trans, (_c), 0, 0); \ ++ _ret = __bch2_trans_do(&trans, _disk_res, _journal_seq, _flags, \ ++ _do); \ ++ _ret2 = bch2_trans_exit(&trans); \ ++ \ ++ _ret ?: _ret2; \ ++}) ++ ++#define trans_for_each_update(_trans, _i) \ ++ for ((_i) = (_trans)->updates; \ ++ (_i) < (_trans)->updates + (_trans)->nr_updates; \ ++ (_i)++) ++ ++#define trans_for_each_update2(_trans, _i) \ ++ for ((_i) = (_trans)->updates2; \ ++ (_i) < (_trans)->updates2 + (_trans)->nr_updates2; \ ++ (_i)++) ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_H */ +diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c +new file mode 100644 +index 000000000000..a2604b0ce2d8 +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.c +@@ -0,0 +1,2075 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "extents.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++/* Debug code: */ ++ ++/* ++ * Verify that child nodes correctly span parent node's range: ++ */ ++static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ struct bpos next_node = b->data->min_key; ++ struct btree_node_iter iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_btree_ptr_v2 bp; ++ struct bkey unpacked; ++ ++ BUG_ON(!b->c.level); ++ ++ if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)) ++ return; ++ ++ bch2_btree_node_iter_init_from_start(&iter, b); ++ ++ while (1) { ++ k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); ++ if (k.k->type != KEY_TYPE_btree_ptr_v2) ++ break; ++ bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ BUG_ON(bkey_cmp(next_node, bp.v->min_key)); ++ ++ bch2_btree_node_iter_advance(&iter, b); ++ ++ if (bch2_btree_node_iter_end(&iter)) { ++ BUG_ON(bkey_cmp(k.k->p, b->key.k.p)); ++ break; ++ } ++ ++ next_node = bkey_successor(k.k->p); ++ } ++#endif ++} ++ ++/* Calculate ideal packed bkey format for new btree nodes: */ ++ ++void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) ++{ ++ struct bkey_packed *k; ++ struct bset_tree *t; ++ struct bkey uk; ++ ++ bch2_bkey_format_add_pos(s, b->data->min_key); ++ ++ for_each_bset(b, t) ++ bset_tree_for_each_key(b, t, k) ++ if (!bkey_whiteout(k)) { ++ uk = bkey_unpack_key(b, k); ++ bch2_bkey_format_add_key(s, &uk); ++ } ++} ++ ++static struct bkey_format bch2_btree_calc_format(struct 
btree *b) ++{ ++ struct bkey_format_state s; ++ ++ bch2_bkey_format_init(&s); ++ __bch2_btree_calc_format(&s, b); ++ ++ return bch2_bkey_format_done(&s); ++} ++ ++static size_t btree_node_u64s_with_format(struct btree *b, ++ struct bkey_format *new_f) ++{ ++ struct bkey_format *old_f = &b->format; ++ ++ /* stupid integer promotion rules */ ++ ssize_t delta = ++ (((int) new_f->key_u64s - old_f->key_u64s) * ++ (int) b->nr.packed_keys) + ++ (((int) new_f->key_u64s - BKEY_U64s) * ++ (int) b->nr.unpacked_keys); ++ ++ BUG_ON(delta + b->nr.live_u64s < 0); ++ ++ return b->nr.live_u64s + delta; ++} ++ ++/** ++ * btree_node_format_fits - check if we could rewrite node with a new format ++ * ++ * This assumes all keys can pack with the new format -- it just checks if ++ * the re-packed keys would fit inside the node itself. ++ */ ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, ++ struct bkey_format *new_f) ++{ ++ size_t u64s = btree_node_u64s_with_format(b, new_f); ++ ++ return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); ++} ++ ++/* Btree node freeing/allocation: */ ++ ++static void __btree_node_free(struct bch_fs *c, struct btree *b) ++{ ++ trace_btree_node_free(c, b); ++ ++ BUG_ON(btree_node_dirty(b)); ++ BUG_ON(btree_node_need_write(b)); ++ BUG_ON(b == btree_node_root(c, b)); ++ BUG_ON(b->ob.nr); ++ BUG_ON(!list_empty(&b->write_blocked)); ++ BUG_ON(b->will_make_reachable); ++ ++ clear_btree_node_noevict(b); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&b->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++} ++ ++void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) ++{ ++ struct open_buckets ob = b->ob; ++ ++ b->ob.nr = 0; ++ ++ clear_btree_node_dirty(b); ++ ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ ++ bch2_open_buckets_put(c, &ob); ++} ++ ++void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct btree_iter *linked; ++ ++ trans_for_each_iter(iter->trans, linked) ++ BUG_ON(linked->l[b->c.level].b == b); ++ ++ six_lock_write(&b->c.lock, NULL, NULL); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, ++ struct disk_reservation *res, ++ struct closure *cl, ++ unsigned flags) ++{ ++ struct write_point *wp; ++ struct btree *b; ++ BKEY_PADDED(k) tmp; ++ struct open_buckets ob = { .nr = 0 }; ++ struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; ++ unsigned nr_reserve; ++ enum alloc_reserve alloc_reserve; ++ ++ if (flags & BTREE_INSERT_USE_ALLOC_RESERVE) { ++ nr_reserve = 0; ++ alloc_reserve = RESERVE_ALLOC; ++ } else if (flags & BTREE_INSERT_USE_RESERVE) { ++ nr_reserve = BTREE_NODE_RESERVE / 2; ++ alloc_reserve = RESERVE_BTREE; ++ } else { ++ nr_reserve = BTREE_NODE_RESERVE; ++ alloc_reserve = RESERVE_NONE; ++ } ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ if (c->btree_reserve_cache_nr > nr_reserve) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; ++ ++ ob = a->ob; ++ bkey_copy(&tmp.k, &a->k); ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ goto mem_alloc; ++ } ++ mutex_unlock(&c->btree_reserve_cache_lock); ++ ++retry: ++ wp = bch2_alloc_sectors_start(c, c->opts.foreground_target, 0, ++ writepoint_ptr(&c->btree_write_point), ++ &devs_have, ++ res->nr_replicas, ++ 
c->opts.metadata_replicas_required, ++ alloc_reserve, 0, cl); ++ if (IS_ERR(wp)) ++ return ERR_CAST(wp); ++ ++ if (wp->sectors_free < c->opts.btree_node_size) { ++ struct open_bucket *ob; ++ unsigned i; ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) ++ if (ob->sectors_free < c->opts.btree_node_size) ++ ob->sectors_free = 0; ++ ++ bch2_alloc_sectors_done(c, wp); ++ goto retry; ++ } ++ ++ if (c->sb.features & (1ULL << BCH_FEATURE_btree_ptr_v2)) ++ bkey_btree_ptr_v2_init(&tmp.k); ++ else ++ bkey_btree_ptr_init(&tmp.k); ++ ++ bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size); ++ ++ bch2_open_bucket_get(c, wp, &ob); ++ bch2_alloc_sectors_done(c, wp); ++mem_alloc: ++ b = bch2_btree_node_mem_alloc(c); ++ ++ /* we hold cannibalize_lock: */ ++ BUG_ON(IS_ERR(b)); ++ BUG_ON(b->ob.nr); ++ ++ bkey_copy(&b->key, &tmp.k); ++ b->ob = ob; ++ ++ return b; ++} ++ ++static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ int ret; ++ ++ BUG_ON(level >= BTREE_MAX_DEPTH); ++ BUG_ON(!as->nr_prealloc_nodes); ++ ++ b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ ++ set_btree_node_accessed(b); ++ set_btree_node_dirty(b); ++ set_btree_node_need_write(b); ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ b->c.level = level; ++ b->c.btree_id = as->btree_id; ++ ++ memset(&b->nr, 0, sizeof(b->nr)); ++ b->data->magic = cpu_to_le64(bset_magic(c)); ++ b->data->flags = 0; ++ SET_BTREE_NODE_ID(b->data, as->btree_id); ++ SET_BTREE_NODE_LEVEL(b->data, level); ++ b->data->ptr = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)).start->ptr; ++ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { ++ struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); ++ ++ bp->v.mem_ptr = 0; ++ bp->v.seq = b->data->keys.seq; ++ bp->v.sectors_written = 0; ++ bp->v.sectors = cpu_to_le16(c->opts.btree_node_size); ++ } ++ ++ if (c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite)) ++ SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); ++ ++ if (btree_node_is_extents(b) && ++ !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data)) { ++ set_btree_node_old_extent_overwrite(b); ++ set_btree_node_need_rewrite(b); ++ } ++ ++ bch2_btree_build_aux_trees(b); ++ ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); ++ BUG_ON(ret); ++ ++ trace_btree_node_alloc(c, b); ++ return b; ++} ++ ++static void btree_set_min(struct btree *b, struct bpos pos) ++{ ++ if (b->key.k.type == KEY_TYPE_btree_ptr_v2) ++ bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; ++ b->data->min_key = pos; ++} ++ ++static void btree_set_max(struct btree *b, struct bpos pos) ++{ ++ b->key.k.p = pos; ++ b->data->max_key = pos; ++} ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b, ++ struct bkey_format format) ++{ ++ struct btree *n; ++ ++ n = bch2_btree_node_alloc(as, b->c.level); ++ ++ SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); ++ ++ btree_set_min(n, b->data->min_key); ++ btree_set_max(n, b->data->max_key); ++ ++ n->data->format = format; ++ btree_node_set_format(n, format); ++ ++ bch2_btree_sort_into(as->c, n, b); ++ ++ btree_node_reset_sib_u64s(n); ++ ++ n->key.k.p = b->key.k.p; ++ return n; ++} ++ ++static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bkey_format new_f = bch2_btree_calc_format(b); ++ ++ /* ++ * The keys might expand with the new format - if they wouldn't fit in ++ * the btree node anymore, use the old format for now: ++ */ 
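/*
 * Editor's note - an illustrative walk-through of the check below, using
 * made-up numbers (not taken from the patch): suppose b->nr.live_u64s is
 * 5000, b->nr.packed_keys is 1000, b->nr.unpacked_keys is 0, the old
 * format has key_u64s == 3 and the recalculated format has key_u64s == 4.
 * btree_node_u64s_with_format() above then computes
 *
 *   delta = (4 - 3) * 1000 + (4 - BKEY_U64s) * 0 = +1000
 *
 * so the node would need 5000 + 1000 = 6000 u64s after repacking, and
 * bch2_btree_node_format_fits() only returns true if
 * __vstruct_bytes(struct btree_node, 6000) is still below btree_bytes(c).
 * When it is not, the branch below falls back to the old format.
 */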
++ if (!bch2_btree_node_format_fits(as->c, b, &new_f)) ++ new_f = b->format; ++ ++ return __bch2_btree_node_alloc_replacement(as, b, new_f); ++} ++ ++static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level) ++{ ++ struct btree *b = bch2_btree_node_alloc(as, level); ++ ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, POS_MAX); ++ b->data->format = bch2_btree_calc_format(b); ++ ++ btree_node_set_format(b, b->data->format); ++ bch2_btree_build_aux_trees(b); ++ ++ bch2_btree_update_add_new_node(as, b); ++ six_unlock_write(&b->c.lock); ++ ++ return b; ++} ++ ++static void bch2_btree_reserve_put(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_reserve_cache_lock); ++ ++ while (as->nr_prealloc_nodes) { ++ struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes]; ++ ++ six_unlock_write(&b->c.lock); ++ ++ if (c->btree_reserve_cache_nr < ++ ARRAY_SIZE(c->btree_reserve_cache)) { ++ struct btree_alloc *a = ++ &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; ++ ++ a->ob = b->ob; ++ b->ob.nr = 0; ++ bkey_copy(&a->k, &b->key); ++ } else { ++ bch2_open_buckets_put(c, &b->ob); ++ } ++ ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ __btree_node_free(c, b); ++ six_unlock_write(&b->c.lock); ++ ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ mutex_unlock(&c->btree_reserve_cache_lock); ++} ++ ++static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes, ++ unsigned flags, struct closure *cl) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b; ++ int ret; ++ ++ BUG_ON(nr_nodes > BTREE_RESERVE_MAX); ++ ++ /* ++ * Protects reaping from the btree node cache and using the btree node ++ * open bucket reserve: ++ */ ++ ret = bch2_btree_cache_cannibalize_lock(c, cl); ++ if (ret) ++ return ret; ++ ++ while (as->nr_prealloc_nodes < nr_nodes) { ++ b = __bch2_btree_node_alloc(c, &as->disk_res, ++ flags & BTREE_INSERT_NOWAIT ++ ? 
NULL : cl, flags); ++ if (IS_ERR(b)) { ++ ret = PTR_ERR(b); ++ goto err_free; ++ } ++ ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(&b->key)); ++ if (ret) ++ goto err_free; ++ ++ as->prealloc_nodes[as->nr_prealloc_nodes++] = b; ++ } ++ ++ bch2_btree_cache_cannibalize_unlock(c); ++ return 0; ++err_free: ++ bch2_btree_cache_cannibalize_unlock(c); ++ trace_btree_reserve_get_fail(c, nr_nodes, cl); ++ return ret; ++} ++ ++/* Asynchronous interior node update machinery */ ++ ++static void bch2_btree_update_free(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ bch2_journal_pin_flush(&c->journal, &as->journal); ++ bch2_disk_reservation_put(c, &as->disk_res); ++ bch2_btree_reserve_put(as); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_del(&as->unwritten_list); ++ list_del(&as->list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ closure_debug_destroy(&as->cl); ++ mempool_free(as, &c->btree_interior_update_pool); ++ ++ closure_wake_up(&c->btree_interior_update_wait); ++} ++ ++static void btree_update_will_delete_key(struct btree_update *as, ++ struct bkey_i *k) ++{ ++ BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s > ++ ARRAY_SIZE(as->_old_keys)); ++ bch2_keylist_add(&as->old_keys, k); ++} ++ ++static void btree_update_will_add_key(struct btree_update *as, ++ struct bkey_i *k) ++{ ++ BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s > ++ ARRAY_SIZE(as->_new_keys)); ++ bch2_keylist_add(&as->new_keys, k); ++} ++ ++/* ++ * The transactional part of an interior btree node update, where we journal the ++ * update we did to the interior node and update alloc info: ++ */ ++static int btree_update_nodes_written_trans(struct btree_trans *trans, ++ struct btree_update *as) ++{ ++ struct bkey_i *k; ++ int ret; ++ ++ trans->extra_journal_entries = (void *) &as->journal_entries[0]; ++ trans->extra_journal_entry_u64s = as->journal_u64s; ++ trans->journal_pin = &as->journal; ++ ++ for_each_keylist_key(&as->new_keys, k) { ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), ++ 0, 0, BTREE_TRIGGER_INSERT); ++ if (ret) ++ return ret; ++ } ++ ++ for_each_keylist_key(&as->old_keys, k) { ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(k), ++ 0, 0, BTREE_TRIGGER_OVERWRITE); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void btree_update_nodes_written(struct btree_update *as) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *b = as->b; ++ u64 journal_seq = 0; ++ unsigned i; ++ int ret; ++ ++ /* ++ * We did an update to a parent node where the pointers we added pointed ++ * to child nodes that weren't written yet: now, the child nodes have ++ * been written so we can write out the update to the interior node. ++ */ ++ ++ /* ++ * We can't call into journal reclaim here: we'd block on the journal ++ * reclaim lock, but we may need to release the open buckets we have ++ * pinned in order for other btree updates to make forward progress, and ++ * journal reclaim does btree updates when flushing bkey_cached entries, ++ * which may require allocations as well. 
++ */ ++ ret = bch2_trans_do(c, &as->disk_res, &journal_seq, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE| ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_JOURNAL_RECLAIM| ++ BTREE_INSERT_JOURNAL_RESERVED, ++ btree_update_nodes_written_trans(&trans, as)); ++ BUG_ON(ret && !bch2_journal_error(&c->journal)); ++ ++ if (b) { ++ /* ++ * @b is the node we did the final insert into: ++ * ++ * On failure to get a journal reservation, we still have to ++ * unblock the write and allow most of the write path to happen ++ * so that shutdown works, but the i->journal_seq mechanism ++ * won't work to prevent the btree write from being visible (we ++ * didn't get a journal sequence number) - instead ++ * __bch2_btree_node_write() doesn't do the actual write if ++ * we're in journal error state: ++ */ ++ ++ btree_node_lock_type(c, b, SIX_LOCK_intent); ++ btree_node_lock_type(c, b, SIX_LOCK_write); ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ list_del(&as->write_blocked_list); ++ ++ if (!ret && as->b == b) { ++ struct bset *i = btree_bset_last(b); ++ ++ BUG_ON(!b->c.level); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ i->journal_seq = cpu_to_le64( ++ max(journal_seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, journal_seq); ++ } ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ six_unlock_write(&b->c.lock); ++ ++ btree_node_write_if_need(c, b, SIX_LOCK_intent); ++ six_unlock_intent(&b->c.lock); ++ } ++ ++ bch2_journal_pin_drop(&c->journal, &as->journal); ++ ++ bch2_journal_preres_put(&c->journal, &as->journal_preres); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; ++ ++ BUG_ON(b->will_make_reachable != (unsigned long) as); ++ b->will_make_reachable = 0; ++ } ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ for (i = 0; i < as->nr_new_nodes; i++) { ++ b = as->new_nodes[i]; ++ ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ btree_node_write_if_need(c, b, SIX_LOCK_read); ++ six_unlock_read(&b->c.lock); ++ } ++ ++ for (i = 0; i < as->nr_open_buckets; i++) ++ bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); ++ ++ bch2_btree_update_free(as); ++} ++ ++static void btree_interior_update_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, btree_interior_update_work); ++ struct btree_update *as; ++ ++ while (1) { ++ mutex_lock(&c->btree_interior_update_lock); ++ as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, ++ struct btree_update, unwritten_list); ++ if (as && !as->nodes_written) ++ as = NULL; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (!as) ++ break; ++ ++ btree_update_nodes_written(as); ++ } ++} ++ ++static void btree_update_set_nodes_written(struct closure *cl) ++{ ++ struct btree_update *as = container_of(cl, struct btree_update, cl); ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ as->nodes_written = true; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); ++} ++ ++/* ++ * We're updating @b with pointers to nodes that haven't finished writing yet: ++ * block @b from being written until @as completes ++ */ ++static void btree_update_updated_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ ++ 
BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ BUG_ON(!btree_node_dirty(b)); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_NODE; ++ as->b = b; ++ list_add(&as->write_blocked_list, &b->write_blocked); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++static void btree_update_reparent(struct btree_update *as, ++ struct btree_update *child) ++{ ++ struct bch_fs *c = as->c; ++ ++ lockdep_assert_held(&c->btree_interior_update_lock); ++ ++ child->b = NULL; ++ child->mode = BTREE_INTERIOR_UPDATING_AS; ++ ++ /* ++ * When we write a new btree root, we have to drop our journal pin ++ * _before_ the new nodes are technically reachable; see ++ * btree_update_nodes_written(). ++ * ++ * This goes for journal pins that are recursively blocked on us - so, ++ * just transfer the journal pin to the new interior update so ++ * btree_update_nodes_written() can drop it. ++ */ ++ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &child->journal); ++} ++ ++static void btree_update_updated_root(struct btree_update *as, struct btree *b) ++{ ++ struct bkey_i *insert = &b->key; ++ struct bch_fs *c = as->c; ++ ++ BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_root, ++ b->c.btree_id, b->c.level, ++ insert, insert->k.u64s); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); ++ ++ as->mode = BTREE_INTERIOR_UPDATING_ROOT; ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++/* ++ * bch2_btree_update_add_new_node: ++ * ++ * This causes @as to wait on @b to be written, before it gets to ++ * bch2_btree_update_nodes_written ++ * ++ * Additionally, it sets b->will_make_reachable to prevent any additional writes ++ * to @b from happening besides the first until @b is reachable on disk ++ * ++ * And it adds @b to the list of @as's new nodes, so that we can update sector ++ * counts in bch2_btree_update_nodes_written: ++ */ ++void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ ++ closure_get(&as->cl); ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); ++ BUG_ON(b->will_make_reachable); ++ ++ as->new_nodes[as->nr_new_nodes++] = b; ++ b->will_make_reachable = 1UL|(unsigned long) as; ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ btree_update_will_add_key(as, &b->key); ++} ++ ++/* ++ * returns true if @b was a new node ++ */ ++static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) ++{ ++ struct btree_update *as; ++ unsigned long v; ++ unsigned i; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ /* ++ * When b->will_make_reachable != 0, it owns a ref on as->cl that's ++ * dropped when it gets written by bch2_btree_complete_write - the ++ * xchg() is for synchronization with bch2_btree_complete_write: ++ */ ++ v = xchg(&b->will_make_reachable, 0); ++ as = (struct btree_update *) (v & ~1UL); ++ ++ if (!as) { ++ mutex_unlock(&c->btree_interior_update_lock); ++ return; ++ } ++ ++ for (i = 0; i < as->nr_new_nodes; i++) ++ if (as->new_nodes[i] == b) ++ goto found; ++ ++ BUG(); ++found: ++ array_remove_item(as->new_nodes, as->nr_new_nodes, i); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ if (v & 1) ++ 
closure_put(&as->cl); ++} ++ ++void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) ++{ ++ while (b->ob.nr) ++ as->open_buckets[as->nr_open_buckets++] = ++ b->ob.v[--b->ob.nr]; ++} ++ ++/* ++ * @b is being split/rewritten: it may have pointers to not-yet-written btree ++ * nodes and thus outstanding btree_updates - redirect @b's ++ * btree_updates to point to this btree_update: ++ */ ++void bch2_btree_interior_update_will_free_node(struct btree_update *as, ++ struct btree *b) ++{ ++ struct bch_fs *c = as->c; ++ struct btree_update *p, *n; ++ struct btree_write *w; ++ ++ set_btree_node_dying(b); ++ ++ if (btree_node_fake(b)) ++ return; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ ++ /* ++ * Does this node have any btree_update operations preventing ++ * it from being written? ++ * ++ * If so, redirect them to point to this btree_update: we can ++ * write out our new nodes, but we won't make them visible until those ++ * operations complete ++ */ ++ list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { ++ list_del_init(&p->write_blocked_list); ++ btree_update_reparent(as, p); ++ ++ /* ++ * for flush_held_btree_writes() waiting on updates to flush or ++ * nodes to be writeable: ++ */ ++ closure_wake_up(&c->btree_interior_update_wait); ++ } ++ ++ clear_btree_node_dirty(b); ++ clear_btree_node_need_write(b); ++ ++ /* ++ * Does this node have unwritten data that has a pin on the journal? ++ * ++ * If so, transfer that pin to the btree_update operation - ++ * note that if we're freeing multiple nodes, we only need to keep the ++ * oldest pin of any of the nodes we're freeing. We'll release the pin ++ * when the new nodes are persistent and reachable on disk: ++ */ ++ w = btree_current_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ w = btree_prev_write(b); ++ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); ++ bch2_journal_pin_drop(&c->journal, &w->journal); ++ ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ /* ++ * Is this a node that isn't reachable on disk yet? ++ * ++ * Nodes that aren't reachable yet have writes blocked until they're ++ * reachable - now that we've cancelled any pending writes and moved ++ * things waiting on that write to wait on this update, we can drop this ++ * node from the list of nodes that the other update is making ++ * reachable, prior to freeing it: ++ */ ++ btree_update_drop_new_node(c, b); ++ ++ btree_update_will_delete_key(as, &b->key); ++} ++ ++void bch2_btree_update_done(struct btree_update *as) ++{ ++ BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); ++ ++ bch2_btree_reserve_put(as); ++ ++ continue_at(&as->cl, btree_update_set_nodes_written, system_freezable_wq); ++} ++ ++struct btree_update * ++bch2_btree_update_start(struct btree_trans *trans, enum btree_id id, ++ unsigned nr_nodes, unsigned flags, ++ struct closure *cl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_update *as; ++ int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) ++ ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED) ++ ? 
JOURNAL_RES_GET_RECLAIM : 0; ++ int ret = 0; ++ ++ /* ++ * This check isn't necessary for correctness - it's just to potentially ++ * prevent us from doing a lot of work that'll end up being wasted: ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO); ++ memset(as, 0, sizeof(*as)); ++ closure_init(&as->cl, NULL); ++ as->c = c; ++ as->mode = BTREE_INTERIOR_NO_UPDATE; ++ as->btree_id = id; ++ INIT_LIST_HEAD(&as->list); ++ INIT_LIST_HEAD(&as->unwritten_list); ++ INIT_LIST_HEAD(&as->write_blocked_list); ++ bch2_keylist_init(&as->old_keys, as->_old_keys); ++ bch2_keylist_init(&as->new_keys, as->_new_keys); ++ bch2_keylist_init(&as->parent_keys, as->inline_keys); ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags|JOURNAL_RES_GET_NONBLOCK); ++ if (ret == -EAGAIN) { ++ if (flags & BTREE_INSERT_NOUNLOCK) ++ return ERR_PTR(-EINTR); ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, ++ BTREE_UPDATE_JOURNAL_RES, ++ journal_flags); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ if (!bch2_trans_relock(trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ ret = bch2_disk_reservation_get(c, &as->disk_res, ++ nr_nodes * c->opts.btree_node_size, ++ c->opts.metadata_replicas, ++ disk_res_flags); ++ if (ret) ++ goto err; ++ ++ ret = bch2_btree_reserve_get(as, nr_nodes, flags, cl); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_add_tail(&as->list, &c->btree_interior_update_list); ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return as; ++err: ++ bch2_btree_update_free(as); ++ return ERR_PTR(ret); ++} ++ ++/* Btree root updates: */ ++ ++static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) ++{ ++ /* Root nodes cannot be reaped */ ++ mutex_lock(&c->btree_cache.lock); ++ list_del_init(&b->list); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ mutex_lock(&c->btree_root_lock); ++ BUG_ON(btree_node_root(c, b) && ++ (b->c.level < btree_node_root(c, b)->c.level || ++ !btree_node_dying(btree_node_root(c, b)))); ++ ++ btree_node_root(c, b) = b; ++ mutex_unlock(&c->btree_root_lock); ++ ++ bch2_recalc_btree_reserve(c); ++} ++ ++/** ++ * bch_btree_set_root - update the root in memory and on disk ++ * ++ * To ensure forward progress, the current task must not be holding any ++ * btree node write locks. However, you must hold an intent lock on the ++ * old root. ++ * ++ * Note: This allocates a journal entry but doesn't add any keys to ++ * it. All the btree roots are part of every journal write, so there ++ * is nothing new to be done. This just guarantees that there is a ++ * journal write. ++ */ ++static void bch2_btree_set_root(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *old; ++ ++ trace_btree_set_root(c, b); ++ BUG_ON(!b->written && ++ !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags)); ++ ++ old = btree_node_root(c, b); ++ ++ /* ++ * Ensure no one is using the old root while we switch to the ++ * new root: ++ */ ++ bch2_btree_node_lock_write(old, iter); ++ ++ bch2_btree_set_root_inmem(c, b); ++ ++ btree_update_updated_root(as, b); ++ ++ /* ++ * Unlock old root after new root is visible: ++ * ++ * The new root isn't persistent, but that's ok: we still have ++ * an intent lock on the new root, and any updates that would ++ * depend on the new root would have to update the new root. 
++ */ ++ bch2_btree_node_unlock_write(old, iter); ++} ++ ++/* Interior node updates: */ ++ ++static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct btree_node_iter *node_iter) ++{ ++ struct bkey_packed *k; ++ ++ BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > ++ ARRAY_SIZE(as->journal_entries)); ++ ++ as->journal_u64s += ++ journal_entry_set((void *) &as->journal_entries[as->journal_u64s], ++ BCH_JSET_ENTRY_btree_keys, ++ b->c.btree_id, b->c.level, ++ insert, insert->k.u64s); ++ ++ while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && ++ bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) ++ bch2_btree_node_iter_advance(node_iter, b); ++ ++ bch2_btree_bset_insert_key(iter, b, node_iter, insert); ++ set_btree_node_dirty(b); ++ set_btree_node_need_write(b); ++} ++ ++/* ++ * Move keys from n1 (original replacement node, now lower node) to n2 (higher ++ * node) ++ */ ++static struct btree *__btree_split_node(struct btree_update *as, ++ struct btree *n1, ++ struct btree_iter *iter) ++{ ++ size_t nr_packed = 0, nr_unpacked = 0; ++ struct btree *n2; ++ struct bset *set1, *set2; ++ struct bkey_packed *k, *prev = NULL; ++ ++ n2 = bch2_btree_node_alloc(as, n1->c.level); ++ bch2_btree_update_add_new_node(as, n2); ++ ++ n2->data->max_key = n1->data->max_key; ++ n2->data->format = n1->format; ++ SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data)); ++ n2->key.k.p = n1->key.k.p; ++ ++ btree_node_set_format(n2, n2->data->format); ++ ++ set1 = btree_bset_first(n1); ++ set2 = btree_bset_first(n2); ++ ++ /* ++ * Has to be a linear search because we don't have an auxiliary ++ * search tree yet ++ */ ++ k = set1->start; ++ while (1) { ++ struct bkey_packed *n = bkey_next_skip_noops(k, vstruct_last(set1)); ++ ++ if (n == vstruct_last(set1)) ++ break; ++ if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5) ++ break; ++ ++ if (bkey_packed(k)) ++ nr_packed++; ++ else ++ nr_unpacked++; ++ ++ prev = k; ++ k = n; ++ } ++ ++ BUG_ON(!prev); ++ ++ btree_set_max(n1, bkey_unpack_pos(n1, prev)); ++ btree_set_min(n2, bkey_successor(n1->key.k.p)); ++ ++ set2->u64s = cpu_to_le16((u64 *) vstruct_end(set1) - (u64 *) k); ++ set1->u64s = cpu_to_le16(le16_to_cpu(set1->u64s) - le16_to_cpu(set2->u64s)); ++ ++ set_btree_bset_end(n1, n1->set); ++ set_btree_bset_end(n2, n2->set); ++ ++ n2->nr.live_u64s = le16_to_cpu(set2->u64s); ++ n2->nr.bset_u64s[0] = le16_to_cpu(set2->u64s); ++ n2->nr.packed_keys = n1->nr.packed_keys - nr_packed; ++ n2->nr.unpacked_keys = n1->nr.unpacked_keys - nr_unpacked; ++ ++ n1->nr.live_u64s = le16_to_cpu(set1->u64s); ++ n1->nr.bset_u64s[0] = le16_to_cpu(set1->u64s); ++ n1->nr.packed_keys = nr_packed; ++ n1->nr.unpacked_keys = nr_unpacked; ++ ++ BUG_ON(!set1->u64s); ++ BUG_ON(!set2->u64s); ++ ++ memcpy_u64s(set2->start, ++ vstruct_end(set1), ++ le16_to_cpu(set2->u64s)); ++ ++ btree_node_reset_sib_u64s(n1); ++ btree_node_reset_sib_u64s(n2); ++ ++ bch2_verify_btree_nr_keys(n1); ++ bch2_verify_btree_nr_keys(n2); ++ ++ if (n1->c.level) { ++ btree_node_interior_verify(as->c, n1); ++ btree_node_interior_verify(as->c, n2); ++ } ++ ++ return n2; ++} ++ ++/* ++ * For updates to interior nodes, we've got to do the insert before we split ++ * because the stuff we're inserting has to be inserted atomically. Post split, ++ * the keys might have to go in different nodes and the split would no longer be ++ * atomic. 
++ * ++ * Worse, if the insert is from btree node coalescing, if we do the insert after ++ * we do the split (and pick the pivot) - the pivot we pick might be between ++ * nodes that were coalesced, and thus in the middle of a child node post ++ * coalescing: ++ */ ++static void btree_split_insert_keys(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, ++ struct keylist *keys) ++{ ++ struct btree_node_iter node_iter; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct bkey_packed *src, *dst, *n; ++ struct bset *i; ++ ++ BUG_ON(btree_node_type(b) != BKEY_TYPE_BTREE); ++ ++ bch2_btree_node_iter_init(&node_iter, b, &k->k.p); ++ ++ while (!bch2_keylist_empty(keys)) { ++ k = bch2_keylist_front(keys); ++ ++ bch2_insert_fixup_btree_ptr(as, b, iter, k, &node_iter); ++ bch2_keylist_pop_front(keys); ++ } ++ ++ /* ++ * We can't tolerate whiteouts here - with whiteouts there can be ++ * duplicate keys, and it would be rather bad if we picked a duplicate ++ * for the pivot: ++ */ ++ i = btree_bset_first(b); ++ src = dst = i->start; ++ while (src != vstruct_last(i)) { ++ n = bkey_next_skip_noops(src, vstruct_last(i)); ++ if (!bkey_deleted(src)) { ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ src = n; ++ } ++ ++ i->u64s = cpu_to_le16((u64 *) dst - i->_data); ++ set_btree_bset_end(b, b->set); ++ ++ BUG_ON(b->nsets != 1 || ++ b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s)); ++ ++ btree_node_interior_verify(as->c, b); ++} ++ ++static void btree_split(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys, ++ unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ struct btree *parent = btree_node_parent(iter, b); ++ struct btree *n1, *n2 = NULL, *n3 = NULL; ++ u64 start_time = local_clock(); ++ ++ BUG_ON(!parent && (b != btree_node_root(c, b))); ++ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n1 = bch2_btree_node_alloc_replacement(as, b); ++ bch2_btree_update_add_new_node(as, n1); ++ ++ if (keys) ++ btree_split_insert_keys(as, n1, iter, keys); ++ ++ if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) { ++ trace_btree_split(c, b); ++ ++ n2 = __btree_split_node(as, n1, iter); ++ ++ bch2_btree_build_aux_trees(n2); ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n2->c.lock); ++ six_unlock_write(&n1->c.lock); ++ ++ bch2_btree_node_write(c, n2, SIX_LOCK_intent); ++ ++ /* ++ * Note that on recursive parent_keys == keys, so we ++ * can't start adding new keys to parent_keys before emptying it ++ * out (which we did with btree_split_insert_keys() above) ++ */ ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ bch2_keylist_add(&as->parent_keys, &n2->key); ++ ++ if (!parent) { ++ /* Depth increases, make a new root */ ++ n3 = __btree_root_alloc(as, b->c.level + 1); ++ ++ n3->sib_u64s[0] = U16_MAX; ++ n3->sib_u64s[1] = U16_MAX; ++ ++ btree_split_insert_keys(as, n3, iter, &as->parent_keys); ++ ++ bch2_btree_node_write(c, n3, SIX_LOCK_intent); ++ } ++ } else { ++ trace_btree_compact(c, b); ++ ++ bch2_btree_build_aux_trees(n1); ++ six_unlock_write(&n1->c.lock); ++ ++ if (parent) ++ bch2_keylist_add(&as->parent_keys, &n1->key); ++ } ++ ++ bch2_btree_node_write(c, n1, SIX_LOCK_intent); ++ ++ /* New nodes all written, now make them visible: */ ++ ++ if (parent) { ++ /* Split a non root node */ ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ } else if (n3) { ++ bch2_btree_set_root(as, n3, iter); ++ } else 
{ ++ /* Root filled up but didn't need to be split */ ++ bch2_btree_set_root(as, n1, iter); ++ } ++ ++ bch2_btree_update_get_open_buckets(as, n1); ++ if (n2) ++ bch2_btree_update_get_open_buckets(as, n2); ++ if (n3) ++ bch2_btree_update_get_open_buckets(as, n3); ++ ++ /* Successful split, update the iterator to point to the new nodes: */ ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ if (n3) ++ bch2_btree_iter_node_replace(iter, n3); ++ if (n2) ++ bch2_btree_iter_node_replace(iter, n2); ++ bch2_btree_iter_node_replace(iter, n1); ++ ++ /* ++ * The old node must be freed (in memory) _before_ unlocking the new ++ * nodes - else another thread could re-acquire a read lock on the old ++ * node after another thread has locked and updated the new node, thus ++ * seeing stale data: ++ */ ++ bch2_btree_node_free_inmem(c, b, iter); ++ ++ if (n3) ++ six_unlock_intent(&n3->c.lock); ++ if (n2) ++ six_unlock_intent(&n2->c.lock); ++ six_unlock_intent(&n1->c.lock); ++ ++ bch2_btree_trans_verify_locks(iter->trans); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split], ++ start_time); ++} ++ ++static void ++bch2_btree_insert_keys_interior(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys) ++{ ++ struct btree_iter *linked; ++ struct btree_node_iter node_iter; ++ struct bkey_i *insert = bch2_keylist_front(keys); ++ struct bkey_packed *k; ++ ++ /* Don't screw up @iter's position: */ ++ node_iter = iter->l[b->c.level].iter; ++ ++ /* ++ * btree_split(), btree_gc_coalesce() will insert keys before ++ * the iterator's current position - they know the keys go in ++ * the node the iterator points to: ++ */ ++ while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && ++ (bkey_cmp_packed(b, k, &insert->k) >= 0)) ++ ; ++ ++ for_each_keylist_key(keys, insert) ++ bch2_insert_fixup_btree_ptr(as, b, iter, insert, &node_iter); ++ ++ btree_update_updated_node(as, b); ++ ++ trans_for_each_iter_with_node(iter->trans, b, linked) ++ bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); ++ ++ bch2_btree_trans_verify_iters(iter->trans, b); ++} ++ ++/** ++ * bch_btree_insert_node - insert bkeys into a given btree node ++ * ++ * @iter: btree iterator ++ * @keys: list of keys to insert ++ * @hook: insert callback ++ * @persistent: if not null, @persistent will wait on journal write ++ * ++ * Inserts as many keys as it can into a given btree node, splitting it if full. ++ * If a split occurred, this function will return early. This can only happen ++ * for leaf nodes -- inserts into interior nodes have to be atomic. 
++ */ ++void bch2_btree_insert_node(struct btree_update *as, struct btree *b, ++ struct btree_iter *iter, struct keylist *keys, ++ unsigned flags) ++{ ++ struct bch_fs *c = as->c; ++ int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ BUG_ON(!btree_node_intent_locked(iter, btree_node_root(c, b)->c.level)); ++ BUG_ON(!b->c.level); ++ BUG_ON(!as || as->b); ++ bch2_verify_keylist_sorted(keys); ++ ++ if (as->must_rewrite) ++ goto split; ++ ++ bch2_btree_node_lock_for_insert(c, b, iter); ++ ++ if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { ++ bch2_btree_node_unlock_write(b, iter); ++ goto split; ++ } ++ ++ bch2_btree_insert_keys_interior(as, b, iter, keys); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ bch2_btree_node_unlock_write(b, iter); ++ ++ btree_node_interior_verify(c, b); ++ ++ /* ++ * when called from the btree_split path the new nodes aren't added to ++ * the btree iterator yet, so the merge path's unlock/wait/relock dance ++ * won't work: ++ */ ++ bch2_foreground_maybe_merge(c, iter, b->c.level, ++ flags|BTREE_INSERT_NOUNLOCK); ++ return; ++split: ++ btree_split(as, b, iter, keys, flags); ++} ++ ++int bch2_btree_split_leaf(struct bch_fs *c, struct btree_iter *iter, ++ unsigned flags) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b = iter_l(iter)->b; ++ struct btree_update *as; ++ struct closure cl; ++ int ret = 0; ++ struct btree_insert_entry *i; ++ ++ /* ++ * We already have a disk reservation and open buckets pinned; this ++ * allocation must not block: ++ */ ++ trans_for_each_update(trans, i) ++ if (btree_node_type_needs_gc(i->iter->btree_id)) ++ flags |= BTREE_INSERT_USE_RESERVE; ++ ++ closure_init_stack(&cl); ++ ++ /* Hack, because gc and splitting nodes doesn't mix yet: */ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && ++ !down_read_trylock(&c->gc_lock)) { ++ if (flags & BTREE_INSERT_NOUNLOCK) { ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ return -EINTR; ++ } ++ ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(trans)) ++ ret = -EINTR; ++ } ++ ++ /* ++ * XXX: figure out how far we might need to split, ++ * instead of locking/reserving all the way to the root: ++ */ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ++ trace_trans_restart_iter_upgrade(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ ++ as = bch2_btree_update_start(trans, iter->btree_id, ++ btree_update_reserve_required(c, b), flags, ++ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ if (ret == -EAGAIN) { ++ BUG_ON(flags & BTREE_INSERT_NOUNLOCK); ++ bch2_trans_unlock(trans); ++ ret = -EINTR; ++ ++ trace_transaction_restart_ip(trans->ip, _THIS_IP_); ++ } ++ goto out; ++ } ++ ++ btree_split(as, b, iter, NULL, flags); ++ bch2_btree_update_done(as); ++ ++ /* ++ * We haven't successfully inserted yet, so don't downgrade all the way ++ * back to read locks; ++ */ ++ __bch2_btree_iter_downgrade(iter, 1); ++out: ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ return ret; ++} ++ ++void __bch2_foreground_maybe_merge(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, ++ unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree_update *as; ++ struct bkey_format_state new_s; ++ struct bkey_format new_f; ++ struct bkey_i delete; ++ struct btree *b, *m, *n, *prev, *next, *parent; ++ struct closure cl; ++ size_t sib_u64s; ++ int ret = 0; ++ ++ BUG_ON(!btree_node_locked(iter, level)); ++ ++ closure_init_stack(&cl); ++retry: ++ BUG_ON(!btree_node_locked(iter, level)); ++ ++ b = iter->l[level].b; ++ ++ parent = btree_node_parent(iter, b); ++ if (!parent) ++ goto out; ++ ++ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) ++ goto out; ++ ++ /* XXX: can't be holding read locks */ ++ m = bch2_btree_node_get_sibling(c, iter, b, sib); ++ if (IS_ERR(m)) { ++ ret = PTR_ERR(m); ++ goto err; ++ } ++ ++ /* NULL means no sibling: */ ++ if (!m) { ++ b->sib_u64s[sib] = U16_MAX; ++ goto out; ++ } ++ ++ if (sib == btree_prev_sib) { ++ prev = m; ++ next = b; ++ } else { ++ prev = b; ++ next = m; ++ } ++ ++ bch2_bkey_format_init(&new_s); ++ __bch2_btree_calc_format(&new_s, b); ++ __bch2_btree_calc_format(&new_s, m); ++ new_f = bch2_bkey_format_done(&new_s); ++ ++ sib_u64s = btree_node_u64s_with_format(b, &new_f) + ++ btree_node_u64s_with_format(m, &new_f); ++ ++ if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { ++ sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ sib_u64s /= 2; ++ sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); ++ } ++ ++ sib_u64s = min(sib_u64s, btree_max_u64s(c)); ++ b->sib_u64s[sib] = sib_u64s; ++ ++ if (b->sib_u64s[sib] > BTREE_FOREGROUND_MERGE_THRESHOLD(c)) { ++ six_unlock_intent(&m->c.lock); ++ goto out; ++ } ++ ++ /* We're changing btree topology, doesn't mix with gc: */ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD) && ++ !down_read_trylock(&c->gc_lock)) ++ goto err_cycle_gc_lock; ++ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) { ++ ret = -EINTR; ++ goto err_unlock; ++ } ++ ++ as = bch2_btree_update_start(trans, iter->btree_id, ++ btree_update_reserve_required(c, parent) + 1, ++ flags| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE, ++ !(flags & BTREE_INSERT_NOUNLOCK) ? 
&cl : NULL); ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ goto err_unlock; ++ } ++ ++ trace_btree_merge(c, b); ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ bch2_btree_interior_update_will_free_node(as, m); ++ ++ n = bch2_btree_node_alloc(as, b->c.level); ++ bch2_btree_update_add_new_node(as, n); ++ ++ btree_set_min(n, prev->data->min_key); ++ btree_set_max(n, next->data->max_key); ++ n->data->format = new_f; ++ ++ btree_node_set_format(n, new_f); ++ ++ bch2_btree_sort_into(c, n, prev); ++ bch2_btree_sort_into(c, n, next); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->c.lock); ++ ++ bkey_init(&delete.k); ++ delete.k.p = prev->key.k.p; ++ bch2_keylist_add(&as->parent_keys, &delete); ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ ++ bch2_btree_update_get_open_buckets(as, n); ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ bch2_btree_iter_node_drop(iter, m); ++ ++ bch2_btree_iter_node_replace(iter, n); ++ ++ bch2_btree_trans_verify_iters(trans, n); ++ ++ bch2_btree_node_free_inmem(c, b, iter); ++ bch2_btree_node_free_inmem(c, m, iter); ++ ++ six_unlock_intent(&n->c.lock); ++ ++ bch2_btree_update_done(as); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++out: ++ bch2_btree_trans_verify_locks(trans); ++ ++ /* ++ * Don't downgrade locks here: we're called after successful insert, ++ * and the caller will downgrade locks after a successful insert ++ * anyways (in case e.g. a split was required first) ++ * ++ * And we're also called when inserting into interior nodes in the ++ * split path, and downgrading to read locks in there is potentially ++ * confusing: ++ */ ++ closure_sync(&cl); ++ return; ++ ++err_cycle_gc_lock: ++ six_unlock_intent(&m->c.lock); ++ ++ if (flags & BTREE_INSERT_NOUNLOCK) ++ goto out; ++ ++ bch2_trans_unlock(trans); ++ ++ down_read(&c->gc_lock); ++ up_read(&c->gc_lock); ++ ret = -EINTR; ++ goto err; ++ ++err_unlock: ++ six_unlock_intent(&m->c.lock); ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++err: ++ BUG_ON(ret == -EAGAIN && (flags & BTREE_INSERT_NOUNLOCK)); ++ ++ if ((ret == -EAGAIN || ret == -EINTR) && ++ !(flags & BTREE_INSERT_NOUNLOCK)) { ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto out; ++ ++ goto retry; ++ } ++ ++ goto out; ++} ++ ++static int __btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *b, unsigned flags, ++ struct closure *cl) ++{ ++ struct btree *n, *parent = btree_node_parent(iter, b); ++ struct btree_update *as; ++ ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ (parent ++ ? 
btree_update_reserve_required(c, parent) ++ : 0) + 1, ++ flags, cl); ++ if (IS_ERR(as)) { ++ trace_btree_gc_rewrite_node_fail(c, b); ++ return PTR_ERR(as); ++ } ++ ++ bch2_btree_interior_update_will_free_node(as, b); ++ ++ n = bch2_btree_node_alloc_replacement(as, b); ++ bch2_btree_update_add_new_node(as, n); ++ ++ bch2_btree_build_aux_trees(n); ++ six_unlock_write(&n->c.lock); ++ ++ trace_btree_gc_rewrite_node(c, b); ++ ++ bch2_btree_node_write(c, n, SIX_LOCK_intent); ++ ++ if (parent) { ++ bch2_keylist_add(&as->parent_keys, &n->key); ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, flags); ++ } else { ++ bch2_btree_set_root(as, n, iter); ++ } ++ ++ bch2_btree_update_get_open_buckets(as, n); ++ ++ six_lock_increment(&b->c.lock, SIX_LOCK_intent); ++ bch2_btree_iter_node_drop(iter, b); ++ bch2_btree_iter_node_replace(iter, n); ++ bch2_btree_node_free_inmem(c, b, iter); ++ six_unlock_intent(&n->c.lock); ++ ++ bch2_btree_update_done(as); ++ return 0; ++} ++ ++/** ++ * bch_btree_node_rewrite - Rewrite/move a btree node ++ * ++ * Returns 0 on success, -EINTR or -EAGAIN on failure (i.e. ++ * btree_check_reserve() has to wait) ++ */ ++int bch2_btree_node_rewrite(struct bch_fs *c, struct btree_iter *iter, ++ __le64 seq, unsigned flags) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ flags |= BTREE_INSERT_NOFAIL; ++ ++ closure_init_stack(&cl); ++ ++ bch2_btree_iter_upgrade(iter, U8_MAX); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) { ++ if (!down_read_trylock(&c->gc_lock)) { ++ bch2_trans_unlock(trans); ++ down_read(&c->gc_lock); ++ } ++ } ++ ++ while (1) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ break; ++ ++ b = bch2_btree_iter_peek_node(iter); ++ if (!b || b->data->keys.seq != seq) ++ break; ++ ++ ret = __btree_node_rewrite(c, iter, b, flags, &cl); ++ if (ret != -EAGAIN && ++ ret != -EINTR) ++ break; ++ ++ bch2_trans_unlock(trans); ++ closure_sync(&cl); ++ } ++ ++ bch2_btree_iter_downgrade(iter); ++ ++ if (!(flags & BTREE_INSERT_GC_LOCK_HELD)) ++ up_read(&c->gc_lock); ++ ++ closure_sync(&cl); ++ return ret; ++} ++ ++static void __bch2_btree_node_update_key(struct bch_fs *c, ++ struct btree_update *as, ++ struct btree_iter *iter, ++ struct btree *b, struct btree *new_hash, ++ struct bkey_i *new_key) ++{ ++ struct btree *parent; ++ int ret; ++ ++ btree_update_will_delete_key(as, &b->key); ++ btree_update_will_add_key(as, new_key); ++ ++ parent = btree_node_parent(iter, b); ++ if (parent) { ++ if (new_hash) { ++ bkey_copy(&new_hash->key, new_key); ++ ret = bch2_btree_node_hash_insert(&c->btree_cache, ++ new_hash, b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ } ++ ++ bch2_keylist_add(&as->parent_keys, new_key); ++ bch2_btree_insert_node(as, parent, iter, &as->parent_keys, 0); ++ ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, new_hash); ++ ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ bkey_copy(&b->key, new_key); ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ mutex_unlock(&c->btree_cache.lock); ++ } else { ++ bkey_copy(&b->key, new_key); ++ } ++ } else { ++ BUG_ON(btree_node_root(c, b) != b); ++ ++ bch2_btree_node_lock_write(b, iter); ++ bkey_copy(&b->key, new_key); ++ ++ if (btree_ptr_hash_val(&b->key) != b->hash_val) { ++ mutex_lock(&c->btree_cache.lock); ++ bch2_btree_node_hash_remove(&c->btree_cache, b); ++ ++ ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); ++ BUG_ON(ret); ++ 
mutex_unlock(&c->btree_cache.lock); ++ } ++ ++ btree_update_updated_root(as, b); ++ bch2_btree_node_unlock_write(b, iter); ++ } ++ ++ bch2_btree_update_done(as); ++} ++ ++int bch2_btree_node_update_key(struct bch_fs *c, struct btree_iter *iter, ++ struct btree *b, ++ struct bkey_i *new_key) ++{ ++ struct btree *parent = btree_node_parent(iter, b); ++ struct btree_update *as = NULL; ++ struct btree *new_hash = NULL; ++ struct closure cl; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ if (!bch2_btree_iter_upgrade(iter, U8_MAX)) ++ return -EINTR; ++ ++ if (!down_read_trylock(&c->gc_lock)) { ++ bch2_trans_unlock(iter->trans); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(iter->trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ /* ++ * check btree_ptr_hash_val() after @b is locked by ++ * btree_iter_traverse(): ++ */ ++ if (btree_ptr_hash_val(new_key) != b->hash_val) { ++ /* bch2_btree_reserve_get will unlock */ ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ if (ret) { ++ bch2_trans_unlock(iter->trans); ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ down_read(&c->gc_lock); ++ ++ if (!bch2_trans_relock(iter->trans)) { ++ ret = -EINTR; ++ goto err; ++ } ++ } ++ ++ new_hash = bch2_btree_node_mem_alloc(c); ++ } ++retry: ++ as = bch2_btree_update_start(iter->trans, iter->btree_id, ++ parent ? btree_update_reserve_required(c, parent) : 0, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_USE_ALLOC_RESERVE, ++ &cl); ++ ++ if (IS_ERR(as)) { ++ ret = PTR_ERR(as); ++ if (ret == -EAGAIN) ++ ret = -EINTR; ++ ++ if (ret == -EINTR) { ++ bch2_trans_unlock(iter->trans); ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ down_read(&c->gc_lock); ++ ++ if (bch2_trans_relock(iter->trans)) ++ goto retry; ++ } ++ ++ goto err; ++ } ++ ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(new_key)); ++ if (ret) ++ goto err_free_update; ++ ++ __bch2_btree_node_update_key(c, as, iter, b, new_hash, new_key); ++ ++ bch2_btree_iter_downgrade(iter); ++err: ++ if (new_hash) { ++ mutex_lock(&c->btree_cache.lock); ++ list_move(&new_hash->list, &c->btree_cache.freeable); ++ mutex_unlock(&c->btree_cache.lock); ++ ++ six_unlock_write(&new_hash->c.lock); ++ six_unlock_intent(&new_hash->c.lock); ++ } ++ up_read(&c->gc_lock); ++ closure_sync(&cl); ++ return ret; ++err_free_update: ++ bch2_btree_update_free(as); ++ goto err; ++} ++ ++/* Init code: */ ++ ++/* ++ * Only for filesystem bringup, when first reading the btree roots or allocating ++ * btree roots when initializing a new filesystem: ++ */ ++void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) ++{ ++ BUG_ON(btree_node_root(c, b)); ++ ++ bch2_btree_set_root_inmem(c, b); ++} ++ ++void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) ++{ ++ struct closure cl; ++ struct btree *b; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ ret = bch2_btree_cache_cannibalize_lock(c, &cl); ++ closure_sync(&cl); ++ } while (ret); ++ ++ b = bch2_btree_node_mem_alloc(c); ++ bch2_btree_cache_cannibalize_unlock(c); ++ ++ set_btree_node_fake(b); ++ set_btree_node_need_rewrite(b); ++ b->c.level = 0; ++ b->c.btree_id = id; ++ ++ bkey_btree_ptr_init(&b->key); ++ b->key.k.p = POS_MAX; ++ *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; ++ ++ bch2_bset_init_first(b, &b->data->keys); ++ bch2_btree_build_aux_trees(b); ++ ++ b->data->flags = 0; ++ btree_set_min(b, POS_MIN); ++ btree_set_max(b, POS_MAX); ++ b->data->format = bch2_btree_calc_format(b); ++ btree_node_set_format(b, b->data->format); ++ ++ ret = 
bch2_btree_node_hash_insert(&c->btree_cache, b, ++ b->c.level, b->c.btree_id); ++ BUG_ON(ret); ++ ++ bch2_btree_set_root_inmem(c, b); ++ ++ six_unlock_write(&b->c.lock); ++ six_unlock_intent(&b->c.lock); ++} ++ ++void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct btree_update *as; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_for_each_entry(as, &c->btree_interior_update_list, list) ++ pr_buf(out, "%p m %u w %u r %u j %llu\n", ++ as, ++ as->mode, ++ as->nodes_written, ++ atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, ++ as->journal.seq); ++ mutex_unlock(&c->btree_interior_update_lock); ++} ++ ++size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c) ++{ ++ size_t ret = 0; ++ struct list_head *i; ++ ++ mutex_lock(&c->btree_interior_update_lock); ++ list_for_each(i, &c->btree_interior_update_list) ++ ret++; ++ mutex_unlock(&c->btree_interior_update_lock); ++ ++ return ret; ++} ++ ++void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset) ++{ ++ struct btree_root *r; ++ struct jset_entry *entry; ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ vstruct_for_each(jset, entry) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) { ++ r = &c->btree_roots[entry->btree_id]; ++ r->level = entry->level; ++ r->alive = true; ++ bkey_copy(&r->key, &entry->start[0]); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++} ++ ++struct jset_entry * ++bch2_btree_roots_to_journal_entries(struct bch_fs *c, ++ struct jset_entry *start, ++ struct jset_entry *end) ++{ ++ struct jset_entry *entry; ++ unsigned long have = 0; ++ unsigned i; ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root) ++ __set_bit(entry->btree_id, &have); ++ ++ mutex_lock(&c->btree_root_lock); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (c->btree_roots[i].alive && !test_bit(i, &have)) { ++ journal_entry_set(end, ++ BCH_JSET_ENTRY_btree_root, ++ i, c->btree_roots[i].level, ++ &c->btree_roots[i].key, ++ c->btree_roots[i].key.u64s); ++ end = vstruct_next(end); ++ } ++ ++ mutex_unlock(&c->btree_root_lock); ++ ++ return end; ++} ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *c) ++{ ++ if (c->btree_interior_update_worker) ++ destroy_workqueue(c->btree_interior_update_worker); ++ mempool_exit(&c->btree_interior_update_pool); ++} ++ ++int bch2_fs_btree_interior_update_init(struct bch_fs *c) ++{ ++ mutex_init(&c->btree_reserve_cache_lock); ++ INIT_LIST_HEAD(&c->btree_interior_update_list); ++ INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); ++ mutex_init(&c->btree_interior_update_lock); ++ INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); ++ ++ c->btree_interior_update_worker = ++ alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); ++ if (!c->btree_interior_update_worker) ++ return -ENOMEM; ++ ++ return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, ++ sizeof(struct btree_update)); ++} +diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h +new file mode 100644 +index 000000000000..7668225e72c6 +--- /dev/null ++++ b/fs/bcachefs/btree_update_interior.h +@@ -0,0 +1,331 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H ++ ++#include "btree_cache.h" ++#include "btree_locking.h" ++#include "btree_update.h" ++ ++void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); ++bool bch2_btree_node_format_fits(struct bch_fs *c, struct 
btree *, ++ struct bkey_format *); ++ ++#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) ++ ++#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) ++ ++/* ++ * Tracks an in progress split/rewrite of a btree node and the update to the ++ * parent node: ++ * ++ * When we split/rewrite a node, we do all the updates in memory without ++ * waiting for any writes to complete - we allocate the new node(s) and update ++ * the parent node, possibly recursively up to the root. ++ * ++ * The end result is that we have one or more new nodes being written - ++ * possibly several, if there were multiple splits - and then a write (updating ++ * an interior node) which will make all these new nodes visible. ++ * ++ * Additionally, as we split/rewrite nodes we free the old nodes - but the old ++ * nodes can't be freed (their space on disk can't be reclaimed) until the ++ * update to the interior node that makes the new node visible completes - ++ * until then, the old nodes are still reachable on disk. ++ * ++ */ ++struct btree_update { ++ struct closure cl; ++ struct bch_fs *c; ++ ++ struct list_head list; ++ struct list_head unwritten_list; ++ ++ /* What kind of update are we doing? */ ++ enum { ++ BTREE_INTERIOR_NO_UPDATE, ++ BTREE_INTERIOR_UPDATING_NODE, ++ BTREE_INTERIOR_UPDATING_ROOT, ++ BTREE_INTERIOR_UPDATING_AS, ++ } mode; ++ ++ unsigned must_rewrite:1; ++ unsigned nodes_written:1; ++ ++ enum btree_id btree_id; ++ ++ struct disk_reservation disk_res; ++ struct journal_preres journal_preres; ++ ++ /* ++ * BTREE_INTERIOR_UPDATING_NODE: ++ * The update that made the new nodes visible was a regular update to an ++ * existing interior node - @b. We can't write out the update to @b ++ * until the new nodes we created are finished writing, so we block @b ++ * from writing by putting this btree_interior update on the ++ * @b->write_blocked list with @write_blocked_list: ++ */ ++ struct btree *b; ++ struct list_head write_blocked_list; ++ ++ /* ++ * We may be freeing nodes that were dirty, and thus had journal entries ++ * pinned: we need to transfer the oldest of those pins to the ++ * btree_update operation, and release it when the new node(s) ++ * are all persistent and reachable: ++ */ ++ struct journal_entry_pin journal; ++ ++ /* Preallocated nodes we reserve when we start the update: */ ++ struct btree *prealloc_nodes[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_prealloc_nodes; ++ ++ /* Nodes being freed: */ ++ struct keylist old_keys; ++ u64 _old_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ ++ /* Nodes being added: */ ++ struct keylist new_keys; ++ u64 _new_keys[BTREE_UPDATE_NODES_MAX * ++ BKEY_BTREE_PTR_VAL_U64s_MAX]; ++ ++ /* New nodes, that will be made reachable by this update: */ ++ struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; ++ unsigned nr_new_nodes; ++ ++ open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * ++ BCH_REPLICAS_MAX]; ++ open_bucket_idx_t nr_open_buckets; ++ ++ unsigned journal_u64s; ++ u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; ++ ++ /* Only here to reduce stack usage on recursive splits: */ ++ struct keylist parent_keys; ++ /* ++ * Enough room for btree_split's keys without realloc - btree node ++ * pointers never have crc/compression info, so we only need to acount ++ * for the pointers for three keys ++ */ ++ u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; ++}; ++ ++void bch2_btree_node_free_inmem(struct bch_fs *, struct btree *, ++ struct btree_iter *); ++void 
bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *); ++ ++void bch2_btree_update_get_open_buckets(struct btree_update *, struct btree *); ++ ++struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, ++ struct btree *, ++ struct bkey_format); ++ ++void bch2_btree_update_done(struct btree_update *); ++struct btree_update * ++bch2_btree_update_start(struct btree_trans *, enum btree_id, unsigned, ++ unsigned, struct closure *); ++ ++void bch2_btree_interior_update_will_free_node(struct btree_update *, ++ struct btree *); ++void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); ++ ++void bch2_btree_insert_node(struct btree_update *, struct btree *, ++ struct btree_iter *, struct keylist *, ++ unsigned); ++int bch2_btree_split_leaf(struct bch_fs *, struct btree_iter *, unsigned); ++ ++void __bch2_foreground_maybe_merge(struct bch_fs *, struct btree_iter *, ++ unsigned, unsigned, enum btree_node_sibling); ++ ++static inline void bch2_foreground_maybe_merge_sibling(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, unsigned flags, ++ enum btree_node_sibling sib) ++{ ++ struct btree *b; ++ ++ if (iter->uptodate >= BTREE_ITER_NEED_TRAVERSE) ++ return; ++ ++ if (!bch2_btree_node_relock(iter, level)) ++ return; ++ ++ b = iter->l[level].b; ++ if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) ++ return; ++ ++ __bch2_foreground_maybe_merge(c, iter, level, flags, sib); ++} ++ ++static inline void bch2_foreground_maybe_merge(struct bch_fs *c, ++ struct btree_iter *iter, ++ unsigned level, ++ unsigned flags) ++{ ++ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ btree_prev_sib); ++ bch2_foreground_maybe_merge_sibling(c, iter, level, flags, ++ btree_next_sib); ++} ++ ++void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); ++void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); ++ ++static inline unsigned btree_update_reserve_required(struct bch_fs *c, ++ struct btree *b) ++{ ++ unsigned depth = btree_node_root(c, b)->c.level + 1; ++ ++ /* ++ * Number of nodes we might have to allocate in a worst case btree ++ * split operation - we split all the way up to the root, then allocate ++ * a new root, unless we're already at max depth: ++ */ ++ if (depth < BTREE_MAX_DEPTH) ++ return (depth - b->c.level) * 2 + 1; ++ else ++ return (depth - b->c.level) * 2 - 1; ++} ++ ++static inline void btree_node_reset_sib_u64s(struct btree *b) ++{ ++ b->sib_u64s[0] = b->nr.live_u64s; ++ b->sib_u64s[1] = b->nr.live_u64s; ++} ++ ++static inline void *btree_data_end(struct bch_fs *c, struct btree *b) ++{ ++ return (void *) b->data + btree_bytes(c); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, ++ struct btree *b) ++{ ++ return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); ++} ++ ++static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, ++ struct btree *b) ++{ ++ return btree_data_end(c, b); ++} ++ ++static inline void *write_block(struct btree *b) ++{ ++ return (void *) b->data + (b->written << 9); ++} ++ ++static inline bool __btree_addr_written(struct btree *b, void *p) ++{ ++ return p < write_block(b); ++} ++ ++static inline bool bset_written(struct btree *b, struct bset *i) ++{ ++ return __btree_addr_written(b, i); ++} ++ ++static inline bool bkey_written(struct btree *b, struct bkey_packed *k) ++{ ++ return __btree_addr_written(b, k); ++} ++ ++static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, ++ struct btree *b, ++ void *end) 
++{ ++ ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + ++ b->whiteout_u64s; ++ ssize_t total = c->opts.btree_node_size << 6; ++ ++ return total - used; ++} ++ ++static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, ++ struct btree *b) ++{ ++ ssize_t remaining = __bch_btree_u64s_remaining(c, b, ++ btree_bkey_last(b, bset_tree_last(b))); ++ ++ BUG_ON(remaining < 0); ++ ++ if (bset_written(b, btree_bset_last(b))) ++ return 0; ++ ++ return remaining; ++} ++ ++static inline unsigned btree_write_set_buffer(struct btree *b) ++{ ++ /* ++ * Could buffer up larger amounts of keys for btrees with larger keys, ++ * pending benchmarking: ++ */ ++ return 4 << 10; ++} ++ ++static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, ++ struct btree *b) ++{ ++ struct bset_tree *t = bset_tree_last(b); ++ struct btree_node_entry *bne = max(write_block(b), ++ (void *) btree_bkey_last(b, bset_tree_last(b))); ++ ssize_t remaining_space = ++ __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); ++ ++ if (unlikely(bset_written(b, bset(b, t)))) { ++ if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) ++ return bne; ++ } else { ++ if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && ++ remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) ++ return bne; ++ } ++ ++ return NULL; ++} ++ ++static inline void push_whiteout(struct bch_fs *c, struct btree *b, ++ struct bpos pos) ++{ ++ struct bkey_packed k; ++ ++ BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); ++ ++ if (!bkey_pack_pos(&k, pos, b)) { ++ struct bkey *u = (void *) &k; ++ ++ bkey_init(u); ++ u->p = pos; ++ } ++ ++ k.needs_whiteout = true; ++ ++ b->whiteout_u64s += k.u64s; ++ bkey_copy(unwritten_whiteouts_start(c, b), &k); ++} ++ ++/* ++ * write lock must be held on @b (else the dirty bset that we were going to ++ * insert into could be written out from under us) ++ */ ++static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, ++ struct btree *b, unsigned u64s) ++{ ++ if (unlikely(btree_node_need_rewrite(b))) ++ return false; ++ ++ return u64s <= bch_btree_keys_u64s_remaining(c, b); ++} ++ ++void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); ++ ++size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *); ++ ++void bch2_journal_entries_to_btree_roots(struct bch_fs *, struct jset *); ++struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, ++ struct jset_entry *, struct jset_entry *); ++ ++void bch2_fs_btree_interior_update_exit(struct bch_fs *); ++int bch2_fs_btree_interior_update_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ +diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c +new file mode 100644 +index 000000000000..49995cd00c16 +--- /dev/null ++++ b/fs/bcachefs/btree_update_leaf.c +@@ -0,0 +1,1172 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_locking.h" ++#include "buckets.h" ++#include "debug.h" ++#include "error.h" ++#include "extent_update.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "keylist.h" ++#include "replicas.h" ++ ++#include ++#include ++#include ++ ++static inline bool same_leaf_as_prev(struct btree_trans *trans, ++ struct btree_insert_entry *i) ++{ ++ return i != trans->updates2 && ++ iter_l(i[0].iter)->b == iter_l(i[-1].iter)->b; ++} ++ 
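/*
 * Editorial aside, not part of the patch itself: a minimal standalone C
 * sketch of the node-space accounting that __bch_btree_u64s_remaining(),
 * bch_btree_keys_u64s_remaining() and bch2_btree_node_insert_fits() in the
 * header above appear to implement -- total u64s in the node, minus what the
 * keys already present and the unwritten whiteouts consume. All names here
 * (toy_node, toy_insert_fits, ...) are hypothetical, and the 512-byte-sector
 * node size is an assumption suggested by the "btree_node_size << 6" shift.
 */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_node {
	size_t node_sectors;	/* node size in 512-byte sectors (assumed unit) */
	size_t used_u64s;	/* u64s consumed by keys already in the node */
	size_t whiteout_u64s;	/* u64s reserved at the end for whiteouts */
};

/* total u64s: sectors * 512 bytes / 8 bytes per u64 == sectors << 6 */
static size_t toy_total_u64s(const struct toy_node *b)
{
	return b->node_sectors << 6;
}

static ptrdiff_t toy_u64s_remaining(const struct toy_node *b)
{
	return (ptrdiff_t) toy_total_u64s(b) -
	       (ptrdiff_t) (b->used_u64s + b->whiteout_u64s);
}

/* same shape as bch2_btree_node_insert_fits(): the insert fits iff u64s <= remaining */
static bool toy_insert_fits(const struct toy_node *b, size_t insert_u64s)
{
	ptrdiff_t remaining = toy_u64s_remaining(b);

	return remaining >= 0 && insert_u64s <= (size_t) remaining;
}

int main(void)
{
	struct toy_node b = {
		.node_sectors	= 512,	/* 256KiB node -> 32768 u64s */
		.used_u64s	= 32000,
		.whiteout_u64s	= 700,
	};

	printf("remaining u64s: %td\n", toy_u64s_remaining(&b));	/* 68 */
	assert(toy_insert_fits(&b, 64));
	assert(!toy_insert_fits(&b, 128));	/* too big: caller would split the node */
	return 0;
}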
++inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b, ++ struct btree_iter *iter) ++{ ++ bch2_btree_node_lock_write(b, iter); ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) ++ return; ++ ++ if (unlikely(btree_node_just_written(b)) && ++ bch2_btree_post_write_cleanup(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ /* ++ * If the last bset has been written, or if it's gotten too big - start ++ * a new bset to insert into: ++ */ ++ if (want_new_bset(c, b)) ++ bch2_btree_init_next(c, b, iter); ++} ++ ++/* Inserting into a given leaf node (last stage of insert): */ ++ ++/* Handle overwrites and do insert, for non extents: */ ++bool bch2_btree_bset_insert_key(struct btree_iter *iter, ++ struct btree *b, ++ struct btree_node_iter *node_iter, ++ struct bkey_i *insert) ++{ ++ struct bkey_packed *k; ++ unsigned clobber_u64s = 0, new_u64s = 0; ++ ++ EBUG_ON(btree_node_just_written(b)); ++ EBUG_ON(bset_written(b, btree_bset_last(b))); ++ EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); ++ EBUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && ++ bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_predecessor(b->data->min_key)) < 0); ++ EBUG_ON(bkey_cmp(insert->k.p, b->data->min_key) < 0); ++ EBUG_ON(bkey_cmp(insert->k.p, b->data->max_key) > 0); ++ EBUG_ON(insert->k.u64s > ++ bch_btree_keys_u64s_remaining(iter->trans->c, b)); ++ EBUG_ON(iter->flags & BTREE_ITER_IS_EXTENTS); ++ ++ k = bch2_btree_node_iter_peek_all(node_iter, b); ++ if (k && bkey_cmp_packed(b, k, &insert->k)) ++ k = NULL; ++ ++ /* @k is the key being overwritten/deleted, if any: */ ++ EBUG_ON(k && bkey_whiteout(k)); ++ ++ /* Deleting, but not found? nothing to do: */ ++ if (bkey_whiteout(&insert->k) && !k) ++ return false; ++ ++ if (bkey_whiteout(&insert->k)) { ++ /* Deleting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ if (k->needs_whiteout) ++ push_whiteout(iter->trans->c, b, insert->k.p); ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ bch2_bset_delete(b, k, clobber_u64s); ++ goto fix_iter; ++ } else { ++ bch2_btree_iter_fix_key_modified(iter, b, k); ++ } ++ ++ return true; ++ } ++ ++ if (k) { ++ /* Overwriting: */ ++ btree_account_key_drop(b, k); ++ k->type = KEY_TYPE_deleted; ++ ++ insert->k.needs_whiteout = k->needs_whiteout; ++ k->needs_whiteout = false; ++ ++ if (k >= btree_bset_last(b)->start) { ++ clobber_u64s = k->u64s; ++ goto overwrite; ++ } else { ++ bch2_btree_iter_fix_key_modified(iter, b, k); ++ } ++ } ++ ++ k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); ++overwrite: ++ bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); ++ new_u64s = k->u64s; ++fix_iter: ++ if (clobber_u64s != new_u64s) ++ bch2_btree_node_iter_fix(iter, b, node_iter, k, ++ clobber_u64s, new_u64s); ++ return true; ++} ++ ++static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, ++ unsigned i, u64 seq) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct btree_write *w = container_of(pin, struct btree_write, journal); ++ struct btree *b = container_of(w, struct btree, writes[i]); ++ ++ btree_node_lock_type(c, b, SIX_LOCK_read); ++ bch2_btree_node_write_cond(c, b, ++ (btree_current_write(b) == w && w->journal.seq == seq)); ++ six_unlock_read(&b->c.lock); ++} ++ ++static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 0, seq); ++} ++ ++static void btree_node_flush1(struct journal *j, 
struct journal_entry_pin *pin, u64 seq) ++{ ++ return __btree_node_flush(j, pin, 1, seq); ++} ++ ++inline void bch2_btree_add_journal_pin(struct bch_fs *c, ++ struct btree *b, u64 seq) ++{ ++ struct btree_write *w = btree_current_write(b); ++ ++ bch2_journal_pin_add(&c->journal, seq, &w->journal, ++ btree_node_write_idx(b) == 0 ++ ? btree_node_flush0 ++ : btree_node_flush1); ++} ++ ++/** ++ * btree_insert_key - insert a key one key into a leaf node ++ */ ++static bool btree_insert_key_leaf(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = iter_l(iter)->b; ++ struct bset_tree *t = bset_tree_last(b); ++ struct bset *i = bset(b, t); ++ int old_u64s = bset_u64s(t); ++ int old_live_u64s = b->nr.live_u64s; ++ int live_u64s_added, u64s_added; ++ ++ EBUG_ON(!iter->level && ++ !test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags)); ++ ++ if (unlikely(!bch2_btree_bset_insert_key(iter, b, ++ &iter_l(iter)->iter, insert))) ++ return false; ++ ++ i->journal_seq = cpu_to_le64(max(trans->journal_res.seq, ++ le64_to_cpu(i->journal_seq))); ++ ++ bch2_btree_add_journal_pin(c, b, trans->journal_res.seq); ++ ++ if (unlikely(!btree_node_dirty(b))) ++ set_btree_node_dirty(b); ++ ++ live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; ++ u64s_added = (int) bset_u64s(t) - old_u64s; ++ ++ if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); ++ if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) ++ b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); ++ ++ if (u64s_added > live_u64s_added && ++ bch2_maybe_compact_whiteouts(c, b)) ++ bch2_btree_iter_reinit_node(iter, b); ++ ++ trace_btree_insert_key(c, b, insert); ++ return true; ++} ++ ++/* Cached btree updates: */ ++ ++/* Normal update interface: */ ++ ++static inline void btree_insert_entry_checks(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ ++ BUG_ON(bkey_cmp(insert->k.p, iter->pos)); ++ BUG_ON(debug_check_bkeys(c) && ++ bch2_bkey_invalid(c, bkey_i_to_s_c(insert), ++ __btree_node_type(iter->level, iter->btree_id))); ++} ++ ++static noinline int ++bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_journal_preres_get(&c->journal, ++ &trans->journal_preres, u64s, 0); ++ if (ret) ++ return ret; ++ ++ if (!bch2_trans_relock(trans)) { ++ trace_trans_restart_journal_preres_get(trans->ip); ++ return -EINTR; ++ } ++ ++ return 0; ++} ++ ++static inline int bch2_trans_journal_res_get(struct btree_trans *trans, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED) ++ flags |= JOURNAL_RES_GET_RESERVED; ++ ++ ret = bch2_journal_res_get(&c->journal, &trans->journal_res, ++ trans->journal_u64s, flags); ++ ++ return ret == -EAGAIN ? 
BTREE_INSERT_NEED_JOURNAL_RES : ret; ++} ++ ++static enum btree_insert_ret ++btree_key_can_insert(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned u64s) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = iter_l(iter)->b; ++ ++ if (!bch2_btree_node_insert_fits(c, b, u64s)) ++ return BTREE_INSERT_BTREE_NODE_FULL; ++ ++ return BTREE_INSERT_OK; ++} ++ ++static enum btree_insert_ret ++btree_key_can_insert_cached(struct btree_trans *trans, ++ struct btree_iter *iter, ++ unsigned u64s) ++{ ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ unsigned new_u64s; ++ struct bkey_i *new_k; ++ ++ BUG_ON(iter->level); ++ ++ if (u64s <= ck->u64s) ++ return BTREE_INSERT_OK; ++ ++ new_u64s = roundup_pow_of_two(u64s); ++ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); ++ if (!new_k) ++ return -ENOMEM; ++ ++ ck->u64s = new_u64s; ++ ck->k = new_k; ++ return BTREE_INSERT_OK; ++} ++ ++static inline void do_btree_insert_one(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct bch_fs *c = trans->c; ++ struct journal *j = &c->journal; ++ bool did_work; ++ ++ EBUG_ON(trans->journal_res.ref != ++ !(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)); ++ ++ insert->k.needs_whiteout = false; ++ ++ did_work = (btree_iter_type(iter) != BTREE_ITER_CACHED) ++ ? btree_insert_key_leaf(trans, iter, insert) ++ : bch2_btree_insert_key_cached(trans, iter, insert); ++ if (!did_work) ++ return; ++ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ bch2_journal_add_keys(j, &trans->journal_res, ++ iter->btree_id, insert); ++ ++ bch2_journal_set_has_inode(j, &trans->journal_res, ++ insert->k.p.inode); ++ ++ if (trans->journal_seq) ++ *trans->journal_seq = trans->journal_res.seq; ++ } ++} ++ ++static inline bool iter_has_trans_triggers(struct btree_iter *iter) ++{ ++ return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id); ++} ++ ++static inline bool iter_has_nontrans_triggers(struct btree_iter *iter) ++{ ++ return (((BTREE_NODE_TYPE_HAS_TRIGGERS & ++ ~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS)) | ++ (1U << BTREE_ID_EC)) & ++ (1U << iter->btree_id); ++} ++ ++static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter) ++{ ++ __bch2_btree_iter_unlock(iter); ++} ++ ++static noinline void bch2_trans_mark_gc(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) { ++ /* ++ * XXX: synchronization of cached update triggers with gc ++ */ ++ BUG_ON(btree_iter_type(i->iter) == BTREE_ITER_CACHED); ++ ++ if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b))) ++ bch2_mark_update(trans, i->iter, i->k, NULL, ++ i->trigger_flags|BTREE_TRIGGER_GC); ++ } ++} ++ ++static inline int ++bch2_trans_commit_write_locked(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_fs_usage *fs_usage = NULL; ++ struct btree_insert_entry *i; ++ unsigned u64s = 0; ++ bool marking = false; ++ int ret; ++ ++ if (race_fault()) { ++ trace_trans_restart_fault_inject(trans->ip); ++ return -EINTR; ++ } ++ ++ /* ++ * Check if the insert will fit in the leaf node with the write lock ++ * held, otherwise another thread could write the node changing the ++ * amount of space available: ++ */ ++ ++ prefetch(&trans->c->journal.flags); ++ ++ trans_for_each_update2(trans, i) { ++ /* Multiple inserts might go to same leaf: */ ++ if (!same_leaf_as_prev(trans, i)) ++ u64s = 0; ++ ++ u64s += i->k->k.u64s; ++ ret = btree_iter_type(i->iter) 
!= BTREE_ITER_CACHED ++ ? btree_key_can_insert(trans, i->iter, u64s) ++ : btree_key_can_insert_cached(trans, i->iter, u64s); ++ if (ret) { ++ *stopped_at = i; ++ return ret; ++ } ++ ++ if (btree_node_type_needs_gc(i->iter->btree_id)) ++ marking = true; ++ } ++ ++ if (marking) { ++ percpu_down_read(&c->mark_lock); ++ fs_usage = bch2_fs_usage_scratch_get(c); ++ } ++ ++ /* ++ * Don't get journal reservation until after we know insert will ++ * succeed: ++ */ ++ if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) { ++ ret = bch2_trans_journal_res_get(trans, ++ JOURNAL_RES_GET_NONBLOCK); ++ if (ret) ++ goto err; ++ } else { ++ trans->journal_res.seq = c->journal.replay_journal_seq; ++ } ++ ++ if (unlikely(trans->extra_journal_entry_u64s)) { ++ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), ++ trans->extra_journal_entries, ++ trans->extra_journal_entry_u64s); ++ ++ trans->journal_res.offset += trans->extra_journal_entry_u64s; ++ trans->journal_res.u64s -= trans->extra_journal_entry_u64s; ++ } ++ ++ /* ++ * Not allowed to fail after we've gotten our journal reservation - we ++ * have to use it: ++ */ ++ ++ if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) { ++ if (journal_seq_verify(c)) ++ trans_for_each_update2(trans, i) ++ i->k->k.version.lo = trans->journal_res.seq; ++ else if (inject_invalid_keys(c)) ++ trans_for_each_update2(trans, i) ++ i->k->k.version = MAX_VERSION; ++ } ++ ++ /* Must be called under mark_lock: */ ++ if (marking && trans->fs_usage_deltas && ++ bch2_replicas_delta_list_apply(c, fs_usage, ++ trans->fs_usage_deltas)) { ++ ret = BTREE_INSERT_NEED_MARK_REPLICAS; ++ goto err; ++ } ++ ++ trans_for_each_update(trans, i) ++ if (iter_has_nontrans_triggers(i->iter)) ++ bch2_mark_update(trans, i->iter, i->k, ++ fs_usage, i->trigger_flags); ++ ++ if (marking) ++ bch2_trans_fs_usage_apply(trans, fs_usage); ++ ++ if (unlikely(c->gc_pos.phase)) ++ bch2_trans_mark_gc(trans); ++ ++ trans_for_each_update2(trans, i) ++ do_btree_insert_one(trans, i->iter, i->k); ++err: ++ if (marking) { ++ bch2_fs_usage_scratch_put(c, fs_usage); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ return ret; ++} ++ ++/* ++ * Get journal reservation, take write locks, and attempt to do btree update(s): ++ */ ++static inline int do_bch2_trans_commit(struct btree_trans *trans, ++ struct btree_insert_entry **stopped_at) ++{ ++ struct btree_insert_entry *i; ++ struct btree_iter *iter; ++ int ret; ++ ++ trans_for_each_update2(trans, i) ++ BUG_ON(!btree_node_intent_locked(i->iter, i->iter->level)); ++ ++ ret = bch2_journal_preres_get(&trans->c->journal, ++ &trans->journal_preres, trans->journal_preres_u64s, ++ JOURNAL_RES_GET_NONBLOCK| ++ ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) ++ ? 
JOURNAL_RES_GET_RECLAIM : 0)); ++ if (unlikely(ret == -EAGAIN)) ++ ret = bch2_trans_journal_preres_get_cold(trans, ++ trans->journal_preres_u64s); ++ if (unlikely(ret)) ++ return ret; ++ ++ /* ++ * Can't be holding any read locks when we go to take write locks: ++ * ++ * note - this must be done after bch2_trans_journal_preres_get_cold() ++ * or anything else that might call bch2_trans_relock(), since that ++ * would just retake the read locks: ++ */ ++ trans_for_each_iter(trans, iter) { ++ if (iter->nodes_locked != iter->nodes_intent_locked) { ++ EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT); ++ EBUG_ON(trans->iters_live & (1ULL << iter->idx)); ++ bch2_btree_iter_unlock_noinline(iter); ++ } ++ } ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) ++ trans_for_each_update2(trans, i) ++ btree_insert_entry_checks(trans, i->iter, i->k); ++ bch2_btree_trans_verify_locks(trans); ++ ++ trans_for_each_update2(trans, i) ++ if (!same_leaf_as_prev(trans, i)) ++ bch2_btree_node_lock_for_insert(trans->c, ++ iter_l(i->iter)->b, i->iter); ++ ++ ret = bch2_trans_commit_write_locked(trans, stopped_at); ++ ++ trans_for_each_update2(trans, i) ++ if (!same_leaf_as_prev(trans, i)) ++ bch2_btree_node_unlock_write_inlined(iter_l(i->iter)->b, ++ i->iter); ++ ++ if (!ret && trans->journal_pin) ++ bch2_journal_pin_add(&trans->c->journal, trans->journal_res.seq, ++ trans->journal_pin, NULL); ++ ++ /* ++ * Drop journal reservation after dropping write locks, since dropping ++ * the journal reservation may kick off a journal write: ++ */ ++ bch2_journal_res_put(&trans->c->journal, &trans->journal_res); ++ ++ if (unlikely(ret)) ++ return ret; ++ ++ if (trans->flags & BTREE_INSERT_NOUNLOCK) ++ trans->nounlock = true; ++ ++ trans_for_each_update2(trans, i) ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && ++ !same_leaf_as_prev(trans, i)) ++ bch2_foreground_maybe_merge(trans->c, i->iter, ++ 0, trans->flags); ++ ++ trans->nounlock = false; ++ ++ bch2_trans_downgrade(trans); ++ ++ return 0; ++} ++ ++static noinline ++int bch2_trans_commit_error(struct btree_trans *trans, ++ struct btree_insert_entry *i, ++ int ret) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned flags = trans->flags; ++ ++ /* ++ * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree ++ * update; if we haven't done anything yet it doesn't apply ++ */ ++ flags &= ~BTREE_INSERT_NOUNLOCK; ++ ++ switch (ret) { ++ case BTREE_INSERT_BTREE_NODE_FULL: ++ ret = bch2_btree_split_leaf(c, i->iter, flags); ++ ++ /* ++ * if the split succeeded without dropping locks the insert will ++ * still be atomic (what the caller peeked() and is overwriting ++ * won't have changed) ++ */ ++#if 0 ++ /* ++ * XXX: ++ * split -> btree node merging (of parent node) might still drop ++ * locks when we're not passing it BTREE_INSERT_NOUNLOCK ++ * ++ * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that ++ * will inhibit merging - but we don't have a reliable way yet ++ * (do we?) 
of checking if we dropped locks in this path ++ */ ++ if (!ret) ++ goto retry; ++#endif ++ ++ /* ++ * don't care if we got ENOSPC because we told split it ++ * couldn't block: ++ */ ++ if (!ret || ++ ret == -EINTR || ++ (flags & BTREE_INSERT_NOUNLOCK)) { ++ trace_trans_restart_btree_node_split(trans->ip); ++ ret = -EINTR; ++ } ++ break; ++ case BTREE_INSERT_ENOSPC: ++ ret = -ENOSPC; ++ break; ++ case BTREE_INSERT_NEED_MARK_REPLICAS: ++ bch2_trans_unlock(trans); ++ ++ trans_for_each_update(trans, i) { ++ ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k)); ++ if (ret) ++ return ret; ++ } ++ ++ if (bch2_trans_relock(trans)) ++ return 0; ++ ++ trace_trans_restart_mark_replicas(trans->ip); ++ ret = -EINTR; ++ break; ++ case BTREE_INSERT_NEED_JOURNAL_RES: ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK); ++ if (ret) ++ return ret; ++ ++ if (bch2_trans_relock(trans)) ++ return 0; ++ ++ trace_trans_restart_journal_res_get(trans->ip); ++ ret = -EINTR; ++ break; ++ default: ++ BUG_ON(ret >= 0); ++ break; ++ } ++ ++ if (ret == -EINTR) { ++ int ret2 = bch2_btree_iter_traverse_all(trans); ++ ++ if (ret2) { ++ trace_trans_restart_traverse(trans->ip); ++ return ret2; ++ } ++ ++ trace_trans_restart_atomic(trans->ip); ++ } ++ ++ return ret; ++} ++ ++static noinline int ++bch2_trans_commit_get_rw_cold(struct btree_trans *trans) ++{ ++ struct bch_fs *c = trans->c; ++ int ret; ++ ++ if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW))) ++ return -EROFS; ++ ++ bch2_trans_unlock(trans); ++ ++ ret = bch2_fs_read_write_early(c); ++ if (ret) ++ return ret; ++ ++ percpu_ref_get(&c->writes); ++ return 0; ++} ++ ++static void bch2_trans_update2(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct btree_insert_entry *i, n = (struct btree_insert_entry) { ++ .iter = iter, .k = insert ++ }; ++ ++ btree_insert_entry_checks(trans, n.iter, n.k); ++ ++ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ ++ EBUG_ON(trans->nr_updates2 >= trans->nr_iters); ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ trans_for_each_update2(trans, i) { ++ if (btree_iter_cmp(n.iter, i->iter) == 0) { ++ *i = n; ++ return; ++ } ++ ++ if (btree_iter_cmp(n.iter, i->iter) <= 0) ++ break; ++ } ++ ++ array_insert_item(trans->updates2, trans->nr_updates2, ++ i - trans->updates2, n); ++} ++ ++static int extent_update_to_keys(struct btree_trans *trans, ++ struct btree_iter *orig_iter, ++ struct bkey_i *insert) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ ret = bch2_extent_can_insert(trans, orig_iter, insert); ++ if (ret) ++ return ret; ++ ++ if (bkey_deleted(&insert->k)) ++ return 0; ++ ++ iter = bch2_trans_copy_iter(trans, orig_iter); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ iter->flags |= BTREE_ITER_INTENT; ++ __bch2_btree_iter_set_pos(iter, insert->k.p, false); ++ bch2_trans_update2(trans, iter, insert); ++ bch2_trans_iter_put(trans, iter); ++ return 0; ++} ++ ++static int extent_handle_overwrites(struct btree_trans *trans, ++ enum btree_id btree_id, ++ struct bpos start, struct bpos end) ++{ ++ struct btree_iter *iter = NULL, *update_iter; ++ struct bkey_i *update; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, btree_id, start, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_with_updates(iter); ++ ++ while (k.k && !(ret = bkey_err(k))) { ++ if (bkey_cmp(end, bkey_start_pos(k.k)) <= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), start) < 0) 
{ ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ bch2_cut_back(start, update); ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } ++ ++ if (bkey_cmp(k.k->p, end) > 0) { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ bkey_reassemble(update, k); ++ bch2_cut_front(end, update); ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } else { ++ update_iter = bch2_trans_copy_iter(trans, iter); ++ if ((ret = PTR_ERR_OR_ZERO(update_iter))) ++ goto err; ++ ++ update = bch2_trans_kmalloc(trans, sizeof(struct bkey)); ++ if ((ret = PTR_ERR_OR_ZERO(update))) ++ goto err; ++ ++ update->k = *k.k; ++ set_bkey_val_u64s(&update->k, 0); ++ update->k.type = KEY_TYPE_deleted; ++ update->k.size = 0; ++ ++ __bch2_btree_iter_set_pos(update_iter, update->k.p, false); ++ bch2_trans_update2(trans, update_iter, update); ++ bch2_trans_iter_put(trans, update_iter); ++ } ++ ++ k = bch2_btree_iter_next_with_updates(iter); ++ } ++err: ++ if (!IS_ERR_OR_NULL(iter)) ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int __bch2_trans_commit(struct btree_trans *trans) ++{ ++ struct btree_insert_entry *i = NULL; ++ struct btree_iter *iter; ++ bool trans_trigger_run; ++ unsigned u64s; ++ int ret = 0; ++ ++ BUG_ON(trans->need_reset); ++ ++ if (!trans->nr_updates) ++ goto out_noupdates; ++ ++ if (trans->flags & BTREE_INSERT_GC_LOCK_HELD) ++ lockdep_assert_held(&trans->c->gc_lock); ++ ++ memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); ++ ++ trans->journal_u64s = trans->extra_journal_entry_u64s; ++ trans->journal_preres_u64s = 0; ++ ++ if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) && ++ unlikely(!percpu_ref_tryget(&trans->c->writes))) { ++ ret = bch2_trans_commit_get_rw_cold(trans); ++ if (ret) ++ return ret; ++ } ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ trans_for_each_update(trans, i) ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED && ++ !(i->trigger_flags & BTREE_TRIGGER_NORUN)) ++ bch2_btree_key_cache_verify_clean(trans, ++ i->iter->btree_id, i->iter->pos); ++#endif ++ ++ /* ++ * Running triggers will append more updates to the list of updates as ++ * we're walking it: ++ */ ++ do { ++ trans_trigger_run = false; ++ ++ trans_for_each_update(trans, i) { ++ if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK && ++ (ret = bch2_btree_iter_traverse(i->iter)))) { ++ trace_trans_restart_traverse(trans->ip); ++ goto out; ++ } ++ ++ /* ++ * We're not using bch2_btree_iter_upgrade here because ++ * we know trans->nounlock can't be set: ++ */ ++ if (unlikely(i->iter->locks_want < 1 && ++ !__bch2_btree_iter_upgrade(i->iter, 1))) { ++ trace_trans_restart_upgrade(trans->ip); ++ ret = -EINTR; ++ goto out; ++ } ++ ++ if (iter_has_trans_triggers(i->iter) && ++ !i->trans_triggers_run) { ++ i->trans_triggers_run = true; ++ trans_trigger_run = true; ++ ++ ret = bch2_trans_mark_update(trans, i->iter, i->k, ++ i->trigger_flags); ++ if (unlikely(ret)) { ++ if (ret == -EINTR) ++ trace_trans_restart_mark(trans->ip); 
++ goto out; ++ } ++ } ++ } ++ } while (trans_trigger_run); ++ ++ /* Turn extents updates into keys: */ ++ trans_for_each_update(trans, i) ++ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ++ struct bpos start = bkey_start_pos(&i->k->k); ++ ++ while (i + 1 < trans->updates + trans->nr_updates && ++ i[0].iter->btree_id == i[1].iter->btree_id && ++ !bkey_cmp(i[0].k->k.p, bkey_start_pos(&i[1].k->k))) ++ i++; ++ ++ ret = extent_handle_overwrites(trans, i->iter->btree_id, ++ start, i->k->k.p); ++ if (ret) ++ goto out; ++ } ++ ++ trans_for_each_update(trans, i) { ++ if (i->iter->flags & BTREE_ITER_IS_EXTENTS) { ++ ret = extent_update_to_keys(trans, i->iter, i->k); ++ if (ret) ++ goto out; ++ } else { ++ bch2_trans_update2(trans, i->iter, i->k); ++ } ++ } ++ ++ trans_for_each_update2(trans, i) { ++ BUG_ON(i->iter->uptodate > BTREE_ITER_NEED_PEEK); ++ BUG_ON(i->iter->locks_want < 1); ++ ++ u64s = jset_u64s(i->k->k.u64s); ++ if (btree_iter_type(i->iter) == BTREE_ITER_CACHED && ++ likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) ++ trans->journal_preres_u64s += u64s; ++ trans->journal_u64s += u64s; ++ } ++retry: ++ memset(&trans->journal_res, 0, sizeof(trans->journal_res)); ++ ++ ret = do_bch2_trans_commit(trans, &i); ++ ++ /* make sure we didn't drop or screw up locks: */ ++ bch2_btree_trans_verify_locks(trans); ++ ++ if (ret) ++ goto err; ++ ++ trans_for_each_iter(trans, iter) ++ if ((trans->iters_live & (1ULL << iter->idx)) && ++ (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) { ++ if (trans->flags & BTREE_INSERT_NOUNLOCK) ++ bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit); ++ else ++ bch2_btree_iter_set_pos(iter, iter->pos_after_commit); ++ } ++out: ++ bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres); ++ ++ if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW))) ++ percpu_ref_put(&trans->c->writes); ++out_noupdates: ++ bch2_trans_reset(trans, !ret ? TRANS_RESET_NOTRAVERSE : 0); ++ ++ return ret; ++err: ++ ret = bch2_trans_commit_error(trans, i, ret); ++ if (ret) ++ goto out; ++ ++ goto retry; ++} ++ ++int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, ++ struct bkey_i *k, enum btree_trigger_flags flags) ++{ ++ struct btree_insert_entry *i, n = (struct btree_insert_entry) { ++ .trigger_flags = flags, .iter = iter, .k = k ++ }; ++ ++ EBUG_ON(bkey_cmp(iter->pos, ++ (iter->flags & BTREE_ITER_IS_EXTENTS) ++ ? 
bkey_start_pos(&k->k) ++ : k->k.p)); ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ++ if (btree_node_type_is_extents(iter->btree_id)) { ++ iter->pos_after_commit = k->k.p; ++ iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT; ++ } ++ ++ /* ++ * Pending updates are kept sorted: first, find position of new update: ++ */ ++ trans_for_each_update(trans, i) ++ if (btree_iter_cmp(iter, i->iter) <= 0) ++ break; ++ ++ /* ++ * Now delete/trim any updates the new update overwrites: ++ */ ++ if (i > trans->updates && ++ i[-1].iter->btree_id == iter->btree_id && ++ bkey_cmp(iter->pos, i[-1].k->k.p) < 0) ++ bch2_cut_back(n.iter->pos, i[-1].k); ++ ++ while (i < trans->updates + trans->nr_updates && ++ iter->btree_id == i->iter->btree_id && ++ bkey_cmp(n.k->k.p, i->k->k.p) >= 0) ++ array_remove_item(trans->updates, trans->nr_updates, ++ i - trans->updates); ++ ++ if (i < trans->updates + trans->nr_updates && ++ iter->btree_id == i->iter->btree_id && ++ bkey_cmp(n.k->k.p, i->iter->pos) > 0) { ++ /* ++ * When we have an extent that overwrites the start of another ++ * update, trimming that extent will mean the iterator's ++ * position has to change since the iterator position has to ++ * match the extent's start pos - but we don't want to change ++ * the iterator pos if some other code is using it, so we may ++ * need to clone it: ++ */ ++ if (trans->iters_live & (1ULL << i->iter->idx)) { ++ i->iter = bch2_trans_copy_iter(trans, i->iter); ++ if (IS_ERR(i->iter)) { ++ trans->need_reset = true; ++ return PTR_ERR(i->iter); ++ } ++ ++ i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ bch2_trans_iter_put(trans, i->iter); ++ } ++ ++ bch2_cut_front(n.k->k.p, i->k); ++ bch2_btree_iter_set_pos(i->iter, n.k->k.p); ++ } ++ ++ EBUG_ON(trans->nr_updates >= trans->nr_iters); ++ ++ array_insert_item(trans->updates, trans->nr_updates, ++ i - trans->updates, n); ++ return 0; ++} ++ ++int __bch2_btree_insert(struct btree_trans *trans, ++ enum btree_id id, struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, k, 0); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++/** ++ * bch2_btree_insert - insert keys into the extent btree ++ * @c: pointer to struct bch_fs ++ * @id: btree to insert into ++ * @insert_keys: list of keys to insert ++ * @hook: insert callback ++ */ ++int bch2_btree_insert(struct bch_fs *c, enum btree_id id, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, int flags) ++{ ++ return bch2_trans_do(c, disk_res, journal_seq, flags, ++ __bch2_btree_insert(&trans, id, k)); ++} ++ ++int bch2_btree_delete_at_range(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bpos end, ++ u64 *journal_seq) ++{ ++ struct bkey_s_c k; ++ int ret = 0; ++retry: ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(iter->pos, end) < 0) { ++ struct bkey_i delete; ++ ++ bch2_trans_begin(trans); ++ ++ bkey_init(&delete.k); ++ ++ /* ++ * For extents, iter.pos won't necessarily be the same as ++ * bkey_start_pos(k.k) (for non extents they always will be the ++ * same). It's important that we delete starting from iter.pos ++ * because the range we want to delete could start in the middle ++ * of k. ++ * ++ * (bch2_btree_iter_peek() does guarantee that iter.pos >= ++ * bkey_start_pos(k.k)). 
++ */ ++ delete.k.p = iter->pos; ++ ++ if (btree_node_type_is_extents(iter->btree_id)) { ++ unsigned max_sectors = ++ KEY_SIZE_MAX & (~0 << trans->c->block_bits); ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete); ++ ++ ret = bch2_extent_trim_atomic(&delete, iter); ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_update(trans, iter, &delete, 0); ++ ret = bch2_trans_commit(trans, NULL, journal_seq, ++ BTREE_INSERT_NOFAIL); ++ if (ret) ++ break; ++ ++ bch2_trans_cond_resched(trans); ++ } ++ ++ if (ret == -EINTR) { ++ ret = 0; ++ goto retry; ++ } ++ ++ return ret; ++ ++} ++ ++int bch2_btree_delete_at(struct btree_trans *trans, ++ struct btree_iter *iter, unsigned flags) ++{ ++ struct bkey_i k; ++ ++ bkey_init(&k.k); ++ k.k.p = iter->pos; ++ ++ bch2_trans_update(trans, iter, &k, 0); ++ return bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE|flags); ++} ++ ++/* ++ * bch_btree_delete_range - delete everything within a given range ++ * ++ * Range is a half open interval - [start, end) ++ */ ++int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, ++ struct bpos start, struct bpos end, ++ u64 *journal_seq) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret = 0; ++ ++ /* ++ * XXX: whether we need mem/more iters depends on whether this btree id ++ * has triggers ++ */ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); ++ ++ iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq); ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ BUG_ON(ret == -EINTR); ++ return ret; ++} +diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c +new file mode 100644 +index 000000000000..2a3b95968a86 +--- /dev/null ++++ b/fs/bcachefs/buckets.c +@@ -0,0 +1,2230 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. ++ * ++ * Bucket states: ++ * - free bucket: mark == 0 ++ * The bucket contains no data and will not be read ++ * ++ * - allocator bucket: owned_by_allocator == 1 ++ * The bucket is on a free list, or it is an open bucket ++ * ++ * - cached bucket: owned_by_allocator == 0 && ++ * dirty_sectors == 0 && ++ * cached_sectors > 0 ++ * The bucket contains data but may be safely discarded as there are ++ * enough replicas of the data on other cache devices, or it has been ++ * written back to the backing device ++ * ++ * - dirty bucket: owned_by_allocator == 0 && ++ * dirty_sectors > 0 ++ * The bucket contains data that we must not discard (either only copy, ++ * or one of the 'main copies' for data requiring multiple replicas) ++ * ++ * - metadata bucket: owned_by_allocator == 0 && is_metadata == 1 ++ * This is a btree node, journal or gen/prio bucket ++ * ++ * Lifecycle: ++ * ++ * bucket invalidated => bucket on freelist => open bucket => ++ * [dirty bucket =>] cached bucket => bucket invalidated => ... ++ * ++ * Note that cache promotion can skip the dirty bucket step, as data ++ * is copied from a deeper tier to a shallower tier, onto a cached ++ * bucket. ++ * Note also that a cached bucket can spontaneously become dirty -- ++ * see below. ++ * ++ * Only a traversal of the key space can determine whether a bucket is ++ * truly dirty or cached. 
++ * ++ * Transitions: ++ * ++ * - free => allocator: bucket was invalidated ++ * - cached => allocator: bucket was invalidated ++ * ++ * - allocator => dirty: open bucket was filled up ++ * - allocator => cached: open bucket was filled up ++ * - allocator => metadata: metadata was allocated ++ * ++ * - dirty => cached: dirty sectors were copied to a deeper tier ++ * - dirty => free: dirty sectors were overwritten or moved (copy gc) ++ * - cached => free: cached sectors were overwritten ++ * ++ * - metadata => free: metadata was freed ++ * ++ * Oddities: ++ * - cached => dirty: a device was removed so formerly replicated data ++ * is no longer sufficiently replicated ++ * - free => cached: cannot happen ++ * - free => dirty: cannot happen ++ * - free => metadata: cannot happen ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "ec.h" ++#include "error.h" ++#include "movinggc.h" ++#include "replicas.h" ++ ++#include <linux/preempt.h> ++#include <trace/events/bcachefs.h> ++ ++/* ++ * Clear journal_seq_valid for buckets for which it's not needed, to prevent ++ * wraparound: ++ */ ++void bch2_bucket_seq_cleanup(struct bch_fs *c) ++{ ++ u64 journal_seq = atomic64_read(&c->journal.seq); ++ u16 last_seq_ondisk = c->journal.last_seq_ondisk; ++ struct bch_dev *ca; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ struct bucket_mark m; ++ unsigned i; ++ ++ if (journal_seq - c->last_bucket_seq_cleanup < ++ (1U << (BUCKET_JOURNAL_SEQ_BITS - 2))) ++ return; ++ ++ c->last_bucket_seq_cleanup = journal_seq; ++ ++ for_each_member_device(ca, c, i) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) { ++ bucket_cmpxchg(g, m, ({ ++ if (!m.journal_seq_valid || ++ bucket_needs_journal_commit(m, last_seq_ondisk)) ++ break; ++ ++ m.journal_seq_valid = 0; ++ })); ++ } ++ up_read(&ca->bucket_lock); ++ } ++} ++ ++void bch2_fs_usage_initialize(struct bch_fs *c) ++{ ++ struct bch_fs_usage *usage; ++ unsigned i; ++ ++ percpu_down_write(&c->mark_lock); ++ usage = c->usage_base; ++ ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ usage->reserved += usage->persistent_reserved[i]; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ switch (e->data_type) { ++ case BCH_DATA_btree: ++ usage->btree += usage->replicas[i]; ++ break; ++ case BCH_DATA_user: ++ usage->data += usage->replicas[i]; ++ break; ++ case BCH_DATA_cached: ++ usage->cached += usage->replicas[i]; ++ break; ++ } ++ } ++ ++ percpu_up_write(&c->mark_lock); ++} ++ ++void bch2_fs_usage_scratch_put(struct bch_fs *c, struct bch_fs_usage *fs_usage) ++{ ++ if (fs_usage == c->usage_scratch) ++ mutex_unlock(&c->usage_scratch_lock); ++ else ++ kfree(fs_usage); ++} ++ ++struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *c) ++{ ++ struct bch_fs_usage *ret; ++ unsigned bytes = fs_usage_u64s(c) * sizeof(u64); ++ ++ ret = kzalloc(bytes, GFP_NOWAIT|__GFP_NOWARN); ++ if (ret) ++ return ret; ++ ++ if (mutex_trylock(&c->usage_scratch_lock)) ++ goto out_pool; ++ ++ ret = kzalloc(bytes, GFP_NOFS); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->usage_scratch_lock); ++out_pool: ++ ret = c->usage_scratch; ++ memset(ret, 0, bytes); ++ return ret; ++} ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) ++{ ++ struct bch_dev_usage ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ 
acc_u64s_percpu((u64 *) &ret, ++ (u64 __percpu *) ca->usage[0], ++ sizeof(ret) / sizeof(u64)); ++ ++ return ret; ++} ++ ++static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, ++ unsigned journal_seq, ++ bool gc) ++{ ++ return this_cpu_ptr(gc ++ ? c->usage_gc ++ : c->usage[journal_seq & 1]); ++} ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) ++{ ++ ssize_t offset = v - (u64 *) c->usage_base; ++ unsigned seq; ++ u64 ret; ++ ++ BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ ret = *v + ++ percpu_u64_get((u64 __percpu *) c->usage[0] + offset) + ++ percpu_u64_get((u64 __percpu *) c->usage[1] + offset); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c) ++{ ++ struct bch_fs_usage *ret; ++ unsigned seq, v, u64s = fs_usage_u64s(c); ++retry: ++ ret = kmalloc(u64s * sizeof(u64), GFP_NOFS); ++ if (unlikely(!ret)) ++ return NULL; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ v = fs_usage_u64s(c); ++ if (unlikely(u64s != v)) { ++ u64s = v; ++ percpu_up_read(&c->mark_lock); ++ kfree(ret); ++ goto retry; ++ } ++ ++ do { ++ seq = read_seqcount_begin(&c->usage_lock); ++ memcpy(ret, c->usage_base, u64s * sizeof(u64)); ++ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[0], u64s); ++ acc_u64s_percpu((u64 *) ret, (u64 __percpu *) c->usage[1], u64s); ++ } while (read_seqcount_retry(&c->usage_lock, seq)); ++ ++ return ret; ++} ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) ++{ ++ unsigned u64s = fs_usage_u64s(c); ++ ++ BUG_ON(idx >= 2); ++ ++ preempt_disable(); ++ write_seqcount_begin(&c->usage_lock); ++ ++ acc_u64s_percpu((u64 *) c->usage_base, ++ (u64 __percpu *) c->usage[idx], u64s); ++ percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); ++ ++ write_seqcount_end(&c->usage_lock); ++ preempt_enable(); ++} ++ ++void bch2_fs_usage_to_text(struct printbuf *out, ++ struct bch_fs *c, ++ struct bch_fs_usage *fs_usage) ++{ ++ unsigned i; ++ ++ pr_buf(out, "capacity:\t\t\t%llu\n", c->capacity); ++ ++ pr_buf(out, "hidden:\t\t\t\t%llu\n", ++ fs_usage->hidden); ++ pr_buf(out, "data:\t\t\t\t%llu\n", ++ fs_usage->data); ++ pr_buf(out, "cached:\t\t\t\t%llu\n", ++ fs_usage->cached); ++ pr_buf(out, "reserved:\t\t\t%llu\n", ++ fs_usage->reserved); ++ pr_buf(out, "nr_inodes:\t\t\t%llu\n", ++ fs_usage->nr_inodes); ++ pr_buf(out, "online reserved:\t\t%llu\n", ++ fs_usage->online_reserved); ++ ++ for (i = 0; ++ i < ARRAY_SIZE(fs_usage->persistent_reserved); ++ i++) { ++ pr_buf(out, "%u replicas:\n", i + 1); ++ pr_buf(out, "\treserved:\t\t%llu\n", ++ fs_usage->persistent_reserved[i]); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ pr_buf(out, "\t"); ++ bch2_replicas_entry_to_text(out, e); ++ pr_buf(out, ":\t%llu\n", fs_usage->replicas[i]); ++ } ++} ++ ++#define RESERVE_FACTOR 6 ++ ++static u64 reserve_factor(u64 r) ++{ ++ return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); ++} ++ ++static u64 avail_factor(u64 r) ++{ ++ return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1); ++} ++ ++u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage *fs_usage) ++{ ++ return min(fs_usage->hidden + ++ fs_usage->btree + ++ fs_usage->data + ++ reserve_factor(fs_usage->reserved + ++ fs_usage->online_reserved), ++ c->capacity); ++} ++ ++static struct bch_fs_usage_short 
++__bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct bch_fs_usage_short ret; ++ u64 data, reserved; ++ ++ ret.capacity = c->capacity - ++ bch2_fs_usage_read_one(c, &c->usage_base->hidden); ++ ++ data = bch2_fs_usage_read_one(c, &c->usage_base->data) + ++ bch2_fs_usage_read_one(c, &c->usage_base->btree); ++ reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + ++ bch2_fs_usage_read_one(c, &c->usage_base->online_reserved); ++ ++ ret.used = min(ret.capacity, data + reserve_factor(reserved)); ++ ret.free = ret.capacity - ret.used; ++ ++ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); ++ ++ return ret; ++} ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *c) ++{ ++ struct bch_fs_usage_short ret; ++ ++ percpu_down_read(&c->mark_lock); ++ ret = __bch2_fs_usage_read_short(c); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++static inline int is_unavailable_bucket(struct bucket_mark m) ++{ ++ return !is_available_bucket(m); ++} ++ ++static inline int is_fragmented_bucket(struct bucket_mark m, ++ struct bch_dev *ca) ++{ ++ if (!m.owned_by_allocator && ++ m.data_type == BCH_DATA_user && ++ bucket_sectors_used(m)) ++ return max_t(int, 0, (int) ca->mi.bucket_size - ++ bucket_sectors_used(m)); ++ return 0; ++} ++ ++static inline int bucket_stripe_sectors(struct bucket_mark m) ++{ ++ return m.stripe ? m.dirty_sectors : 0; ++} ++ ++static inline enum bch_data_type bucket_type(struct bucket_mark m) ++{ ++ return m.cached_sectors && !m.dirty_sectors ++ ? BCH_DATA_cached ++ : m.data_type; ++} ++ ++static bool bucket_became_unavailable(struct bucket_mark old, ++ struct bucket_mark new) ++{ ++ return is_available_bucket(old) && ++ !is_available_bucket(new); ++} ++ ++int bch2_fs_usage_apply(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct disk_reservation *disk_res, ++ unsigned journal_seq) ++{ ++ s64 added = fs_usage->data + fs_usage->reserved; ++ s64 should_not_have_added; ++ int ret = 0; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ /* ++ * Not allowed to reduce sectors_available except by getting a ++ * reservation: ++ */ ++ should_not_have_added = added - (s64) (disk_res ? 
disk_res->sectors : 0); ++ if (WARN_ONCE(should_not_have_added > 0, ++ "disk usage increased by %lli without a reservation", ++ should_not_have_added)) { ++ atomic64_sub(should_not_have_added, &c->sectors_available); ++ added -= should_not_have_added; ++ ret = -1; ++ } ++ ++ if (added > 0) { ++ disk_res->sectors -= added; ++ fs_usage->online_reserved -= added; ++ } ++ ++ preempt_disable(); ++ acc_u64s((u64 *) fs_usage_ptr(c, journal_seq, false), ++ (u64 *) fs_usage, fs_usage_u64s(c)); ++ preempt_enable(); ++ ++ return ret; ++} ++ ++static inline void account_bucket(struct bch_fs_usage *fs_usage, ++ struct bch_dev_usage *dev_usage, ++ enum bch_data_type type, ++ int nr, s64 size) ++{ ++ if (type == BCH_DATA_sb || type == BCH_DATA_journal) ++ fs_usage->hidden += size; ++ ++ dev_usage->buckets[type] += nr; ++} ++ ++static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, ++ struct bch_fs_usage *fs_usage, ++ struct bucket_mark old, struct bucket_mark new, ++ bool gc) ++{ ++ struct bch_dev_usage *u; ++ ++ percpu_rwsem_assert_held(&c->mark_lock); ++ ++ preempt_disable(); ++ u = this_cpu_ptr(ca->usage[gc]); ++ ++ if (bucket_type(old)) ++ account_bucket(fs_usage, u, bucket_type(old), ++ -1, -ca->mi.bucket_size); ++ ++ if (bucket_type(new)) ++ account_bucket(fs_usage, u, bucket_type(new), ++ 1, ca->mi.bucket_size); ++ ++ u->buckets_alloc += ++ (int) new.owned_by_allocator - (int) old.owned_by_allocator; ++ u->buckets_unavailable += ++ is_unavailable_bucket(new) - is_unavailable_bucket(old); ++ ++ u->buckets_ec += (int) new.stripe - (int) old.stripe; ++ u->sectors_ec += bucket_stripe_sectors(new) - ++ bucket_stripe_sectors(old); ++ ++ u->sectors[old.data_type] -= old.dirty_sectors; ++ u->sectors[new.data_type] += new.dirty_sectors; ++ u->sectors[BCH_DATA_cached] += ++ (int) new.cached_sectors - (int) old.cached_sectors; ++ u->sectors_fragmented += ++ is_fragmented_bucket(new, ca) - is_fragmented_bucket(old, ca); ++ preempt_enable(); ++ ++ if (!is_available_bucket(old) && is_available_bucket(new)) ++ bch2_wake_allocator(ca); ++} ++ ++__flatten ++void bch2_dev_usage_from_buckets(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct bucket_mark old = { .v.counter = 0 }; ++ struct bucket_array *buckets; ++ struct bucket *g; ++ unsigned i; ++ int cpu; ++ ++ c->usage_base->hidden = 0; ++ ++ for_each_member_device(ca, c, i) { ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(ca->usage[0], cpu), 0, ++ sizeof(*ca->usage[0])); ++ ++ buckets = bucket_array(ca); ++ ++ for_each_bucket(g, buckets) ++ bch2_dev_usage_update(c, ca, c->usage_base, ++ old, g->mark, false); ++ } ++} ++ ++static inline int update_replicas(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ int idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) ++ return -1; ++ ++ if (!fs_usage) ++ return 0; ++ ++ switch (r->data_type) { ++ case BCH_DATA_btree: ++ fs_usage->btree += sectors; ++ break; ++ case BCH_DATA_user: ++ fs_usage->data += sectors; ++ break; ++ case BCH_DATA_cached: ++ fs_usage->cached += sectors; ++ break; ++ } ++ fs_usage->replicas[idx] += sectors; ++ return 0; ++} ++ ++static inline void update_cached_sectors(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ unsigned dev, s64 sectors) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ update_replicas(c, fs_usage, &r.e, sectors); ++} ++ ++static struct replicas_delta_list * ++replicas_deltas_realloc(struct btree_trans *trans, unsigned more) ++{ ++ struct 
replicas_delta_list *d = trans->fs_usage_deltas; ++ unsigned new_size = d ? (d->size + more) * 2 : 128; ++ ++ if (!d || d->used + more > d->size) { ++ d = krealloc(d, sizeof(*d) + new_size, GFP_NOIO|__GFP_ZERO); ++ BUG_ON(!d); ++ ++ d->size = new_size; ++ trans->fs_usage_deltas = d; ++ } ++ return d; ++} ++ ++static inline void update_replicas_list(struct btree_trans *trans, ++ struct bch_replicas_entry *r, ++ s64 sectors) ++{ ++ struct replicas_delta_list *d; ++ struct replicas_delta *n; ++ unsigned b; ++ ++ if (!sectors) ++ return; ++ ++ b = replicas_entry_bytes(r) + 8; ++ d = replicas_deltas_realloc(trans, b); ++ ++ n = (void *) d->d + d->used; ++ n->delta = sectors; ++ memcpy(&n->r, r, replicas_entry_bytes(r)); ++ d->used += b; ++} ++ ++static inline void update_cached_sectors_list(struct btree_trans *trans, ++ unsigned dev, s64 sectors) ++{ ++ struct bch_replicas_padded r; ++ ++ bch2_replicas_entry_cached(&r.e, dev); ++ ++ update_replicas_list(trans, &r.e, sectors); ++} ++ ++static inline struct replicas_delta * ++replicas_delta_next(struct replicas_delta *d) ++{ ++ return (void *) d + replicas_entry_bytes(&d->r) + 8; ++} ++ ++int bch2_replicas_delta_list_apply(struct bch_fs *c, ++ struct bch_fs_usage *fs_usage, ++ struct replicas_delta_list *r) ++{ ++ struct replicas_delta *d = r->d; ++ struct replicas_delta *top = (void *) r->d + r->used; ++ unsigned i; ++ ++ for (d = r->d; d != top; d = replicas_delta_next(d)) ++ if (update_replicas(c, fs_usage, &d->r, d->delta)) { ++ top = d; ++ goto unwind; ++ } ++ ++ if (!fs_usage) ++ return 0; ++ ++ fs_usage->nr_inodes += r->nr_inodes; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ fs_usage->reserved += r->persistent_reserved[i]; ++ fs_usage->persistent_reserved[i] += r->persistent_reserved[i]; ++ } ++ ++ return 0; ++unwind: ++ for (d = r->d; d != top; d = replicas_delta_next(d)) ++ update_replicas(c, fs_usage, &d->r, -d->delta); ++ return -1; ++} ++ ++#define do_mark_fn(fn, c, pos, flags, ...) 
\ ++({ \ ++ int gc, ret = 0; \ ++ \ ++ percpu_rwsem_assert_held(&c->mark_lock); \ ++ \ ++ for (gc = 0; gc < 2 && !ret; gc++) \ ++ if (!gc == !(flags & BTREE_TRIGGER_GC) || \ ++ (gc && gc_visited(c, pos))) \ ++ ret = fn(c, __VA_ARGS__, gc); \ ++ ret; \ ++}) ++ ++static int __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark *ret, ++ bool gc) ++{ ++ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ BUG_ON(!is_available_bucket(new)); ++ ++ new.owned_by_allocator = true; ++ new.data_type = 0; ++ new.cached_sectors = 0; ++ new.dirty_sectors = 0; ++ new.gen++; ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ if (old.cached_sectors) ++ update_cached_sectors(c, fs_usage, ca->dev_idx, ++ -((s64) old.cached_sectors)); ++ ++ if (!gc) ++ *ret = old; ++ return 0; ++} ++ ++void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, struct bucket_mark *old) ++{ ++ do_mark_fn(__bch2_invalidate_bucket, c, gc_phase(GC_PHASE_START), 0, ++ ca, b, old); ++ ++ if (!old->owned_by_allocator && old->cached_sectors) ++ trace_invalidate(ca, bucket_to_sector(ca, b), ++ old->cached_sectors); ++} ++ ++static int __bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, bool owned_by_allocator, ++ bool gc) ++{ ++ struct bch_fs_usage *fs_usage = fs_usage_ptr(c, 0, gc); ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.owned_by_allocator = owned_by_allocator; ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ BUG_ON(!gc && ++ !owned_by_allocator && !old.owned_by_allocator); ++ ++ return 0; ++} ++ ++void bch2_mark_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, bool owned_by_allocator, ++ struct gc_pos pos, unsigned flags) ++{ ++ preempt_disable(); ++ ++ do_mark_fn(__bch2_mark_alloc_bucket, c, pos, flags, ++ ca, b, owned_by_allocator); ++ ++ preempt_enable(); ++} ++ ++static int bch2_mark_alloc(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct bkey_alloc_unpacked u; ++ struct bch_dev *ca; ++ struct bucket *g; ++ struct bucket_mark old_m, m; ++ ++ /* We don't do anything for deletions - do we?: */ ++ if (new.k->type != KEY_TYPE_alloc) ++ return 0; ++ ++ /* ++ * alloc btree is read in by bch2_alloc_read, not gc: ++ */ ++ if ((flags & BTREE_TRIGGER_GC) && ++ !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, new.k->p.inode); ++ ++ if (new.k->p.offset >= ca->mi.nbuckets) ++ return 0; ++ ++ g = __bucket(ca, new.k->p.offset, gc); ++ u = bch2_alloc_unpack(new); ++ ++ old_m = bucket_cmpxchg(g, m, ({ ++ m.gen = u.gen; ++ m.data_type = u.data_type; ++ m.dirty_sectors = u.dirty_sectors; ++ m.cached_sectors = u.cached_sectors; ++ ++ if (journal_seq) { ++ m.journal_seq_valid = 1; ++ m.journal_seq = journal_seq; ++ } ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old_m, m, gc); ++ ++ g->io_time[READ] = u.read_time; ++ g->io_time[WRITE] = u.write_time; ++ g->oldest_gen = u.oldest_gen; ++ g->gen_valid = 1; ++ ++ /* ++ * need to know if we're getting called from the invalidate path or ++ * not: ++ */ ++ ++ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && ++ old_m.cached_sectors) { ++ update_cached_sectors(c, fs_usage, ca->dev_idx, 
++ -old_m.cached_sectors); ++ trace_invalidate(ca, bucket_to_sector(ca, new.k->p.offset), ++ old_m.cached_sectors); ++ } ++ ++ return 0; ++} ++ ++#define checked_add(a, b) \ ++({ \ ++ unsigned _res = (unsigned) (a) + (b); \ ++ bool overflow = _res > U16_MAX; \ ++ if (overflow) \ ++ _res = U16_MAX; \ ++ (a) = _res; \ ++ overflow; \ ++}) ++ ++static int __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type data_type, ++ unsigned sectors, bool gc) ++{ ++ struct bucket *g = __bucket(ca, b, gc); ++ struct bucket_mark old, new; ++ bool overflow; ++ ++ BUG_ON(data_type != BCH_DATA_sb && ++ data_type != BCH_DATA_journal); ++ ++ old = bucket_cmpxchg(g, new, ({ ++ new.data_type = data_type; ++ overflow = checked_add(new.dirty_sectors, sectors); ++ })); ++ ++ bch2_fs_inconsistent_on(old.data_type && ++ old.data_type != data_type, c, ++ "different types of data in same bucket: %s, %s", ++ bch2_data_types[old.data_type], ++ bch2_data_types[data_type]); ++ ++ bch2_fs_inconsistent_on(overflow, c, ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > U16_MAX", ++ ca->dev_idx, b, new.gen, ++ bch2_data_types[old.data_type ?: data_type], ++ old.dirty_sectors, sectors); ++ ++ if (c) ++ bch2_dev_usage_update(c, ca, fs_usage_ptr(c, 0, gc), ++ old, new, gc); ++ ++ return 0; ++} ++ ++void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, enum bch_data_type type, ++ unsigned sectors, struct gc_pos pos, ++ unsigned flags) ++{ ++ BUG_ON(type != BCH_DATA_sb && ++ type != BCH_DATA_journal); ++ ++ preempt_disable(); ++ ++ if (likely(c)) { ++ do_mark_fn(__bch2_mark_metadata_bucket, c, pos, flags, ++ ca, b, type, sectors); ++ } else { ++ __bch2_mark_metadata_bucket(c, ca, b, type, sectors, 0); ++ } ++ ++ preempt_enable(); ++} ++ ++static s64 disk_sectors_scaled(unsigned n, unsigned d, unsigned sectors) ++{ ++ return DIV_ROUND_UP(sectors * n, d); ++} ++ ++static s64 __ptr_disk_sectors_delta(unsigned old_size, ++ unsigned offset, s64 delta, ++ unsigned flags, ++ unsigned n, unsigned d) ++{ ++ BUG_ON(!n || !d); ++ ++ if (flags & BTREE_TRIGGER_OVERWRITE_SPLIT) { ++ BUG_ON(offset + -delta > old_size); ++ ++ return -disk_sectors_scaled(n, d, old_size) + ++ disk_sectors_scaled(n, d, offset) + ++ disk_sectors_scaled(n, d, old_size - offset + delta); ++ } else if (flags & BTREE_TRIGGER_OVERWRITE) { ++ BUG_ON(offset + -delta > old_size); ++ ++ return -disk_sectors_scaled(n, d, old_size) + ++ disk_sectors_scaled(n, d, old_size + delta); ++ } else { ++ return disk_sectors_scaled(n, d, delta); ++ } ++} ++ ++static s64 ptr_disk_sectors_delta(struct extent_ptr_decoded p, ++ unsigned offset, s64 delta, ++ unsigned flags) ++{ ++ return __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, delta, flags, ++ p.crc.compressed_size, ++ p.crc.uncompressed_size); ++} ++ ++static int check_bucket_ref(struct bch_fs *c, struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 bucket_gen, u8 bucket_data_type, ++ u16 dirty_sectors, u16 cached_sectors) ++{ ++ size_t bucket_nr = PTR_BUCKET_NR(bch_dev_bkey_exists(c, ptr->dev), ptr); ++ u16 bucket_sectors = !ptr->cached ++ ? 
dirty_sectors ++ : cached_sectors; ++ char buf[200]; ++ ++ if (gen_after(ptr->gen, bucket_gen)) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, bucket_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (gen_cmp(bucket_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, bucket_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (bucket_gen != ptr->gen && !ptr->cached) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s: stale dirty ptr (gen %u)\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, bucket_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ ptr->gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if (bucket_gen != ptr->gen) ++ return 1; ++ ++ if (bucket_data_type && ptr_data_type && ++ bucket_data_type != ptr_data_type) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, bucket_gen, ++ bch2_data_types[bucket_data_type], ++ bch2_data_types[ptr_data_type], ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ if ((unsigned) (bucket_sectors + sectors) > U16_MAX) { ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" ++ "while marking %s", ++ ptr->dev, bucket_nr, bucket_gen, ++ bch2_data_types[bucket_data_type ?: ptr_data_type], ++ bucket_sectors, sectors, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ return -EIO; ++ } ++ ++ return 0; ++} ++ ++static int bucket_set_stripe(struct bch_fs *c, struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, ++ unsigned flags, ++ bool enabled) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bucket *g = PTR_BUCKET(ca, ptr, gc); ++ struct bucket_mark new, old; ++ char buf[200]; ++ int ret; ++ ++ old = bucket_cmpxchg(g, new, ({ ++ ret = check_bucket_ref(c, k, ptr, 0, 0, new.gen, new.data_type, ++ new.dirty_sectors, new.cached_sectors); ++ if (ret) ++ return ret; ++ ++ if (new.stripe && enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ ++ if (!new.stripe && !enabled) ++ bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ++ "bucket %u:%zu gen %u: deleting stripe but not marked\n%s", ++ ptr->dev, PTR_BUCKET_NR(ca, ptr), new.gen, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf)); ++ ++ new.stripe = enabled; ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; ++ } ++ })); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ return 0; ++} ++ ++static int __mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ s64 sectors, enum bch_data_type ptr_data_type, ++ u8 bucket_gen, u8 *bucket_data_type, ++ 
u16 *dirty_sectors, u16 *cached_sectors) ++{ ++ u16 *dst_sectors = !ptr->cached ++ ? dirty_sectors ++ : cached_sectors; ++ int ret = check_bucket_ref(c, k, ptr, sectors, ptr_data_type, ++ bucket_gen, *bucket_data_type, ++ *dirty_sectors, *cached_sectors); ++ ++ if (ret) ++ return ret; ++ ++ *dst_sectors += sectors; ++ *bucket_data_type = *dirty_sectors || *cached_sectors ++ ? ptr_data_type : 0; ++ return 0; ++} ++ ++static int bch2_mark_pointer(struct bch_fs *c, struct bkey_s_c k, ++ struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct bucket_mark old, new; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket *g = PTR_BUCKET(ca, &p.ptr, gc); ++ u8 bucket_data_type; ++ u64 v; ++ int ret; ++ ++ v = atomic64_read(&g->_mark.v); ++ do { ++ new.v.counter = old.v.counter = v; ++ bucket_data_type = new.data_type; ++ ++ ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, new.gen, ++ &bucket_data_type, ++ &new.dirty_sectors, ++ &new.cached_sectors); ++ if (ret) ++ return ret; ++ ++ new.data_type = bucket_data_type; ++ ++ if (journal_seq) { ++ new.journal_seq_valid = 1; ++ new.journal_seq = journal_seq; ++ } ++ ++ if (flags & BTREE_TRIGGER_NOATOMIC) { ++ g->_mark = new; ++ break; ++ } ++ } while ((v = atomic64_cmpxchg(&g->_mark.v, ++ old.v.counter, ++ new.v.counter)) != old.v.counter); ++ ++ bch2_dev_usage_update(c, ca, fs_usage, old, new, gc); ++ ++ BUG_ON(!gc && bucket_became_unavailable(old, new)); ++ ++ return 0; ++} ++ ++static int bch2_mark_stripe_ptr(struct bch_fs *c, ++ struct bch_extent_stripe_ptr p, ++ enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ s64 sectors, unsigned flags, ++ struct bch_replicas_padded *r, ++ unsigned *nr_data, ++ unsigned *nr_parity) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ struct stripe *m; ++ unsigned i, blocks_nonempty = 0; ++ ++ m = genradix_ptr(&c->stripes[gc], p.idx); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ if (!m || !m->alive) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ return -EIO; ++ } ++ ++ BUG_ON(m->r.e.data_type != data_type); ++ ++ *nr_data = m->nr_blocks - m->nr_redundant; ++ *nr_parity = m->nr_redundant; ++ *r = m->r; ++ ++ m->block_sectors[p.block] += sectors; ++ ++ for (i = 0; i < m->nr_blocks; i++) ++ blocks_nonempty += m->block_sectors[i] != 0; ++ ++ if (m->blocks_nonempty != blocks_nonempty) { ++ m->blocks_nonempty = blocks_nonempty; ++ if (!gc) ++ bch2_stripes_heap_update(c, m, p.idx); ++ } ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ return 0; ++} ++ ++static int bch2_mark_extent(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, ++ unsigned offset, s64 sectors, ++ enum bch_data_type data_type, ++ struct bch_fs_usage *fs_usage, ++ unsigned journal_seq, unsigned flags) ++{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ BUG_ON(!sectors); ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = data_type == BCH_DATA_btree ++ ? 
sectors ++ : ptr_disk_sectors_delta(p, offset, sectors, flags); ++ ++ ret = bch2_mark_pointer(c, k, p, disk_sectors, data_type, ++ fs_usage, journal_seq, flags); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) ++ update_cached_sectors(c, fs_usage, p.ptr.dev, ++ disk_sectors); ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ struct bch_replicas_padded ec_r; ++ unsigned nr_data, nr_parity; ++ s64 parity_sectors; ++ ++ ret = bch2_mark_stripe_ptr(c, p.ec, data_type, ++ fs_usage, disk_sectors, flags, ++ &ec_r, &nr_data, &nr_parity); ++ if (ret) ++ return ret; ++ ++ parity_sectors = ++ __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, sectors, flags, ++ p.crc.compressed_size * nr_parity, ++ p.crc.uncompressed_size * nr_data); ++ ++ update_replicas(c, fs_usage, &ec_r.e, ++ disk_sectors + parity_sectors); ++ ++ /* ++ * There may be other dirty pointers in this extent, but ++ * if so they're not required for mounting if we have an ++ * erasure coded pointer in this extent: ++ */ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) ++ update_replicas(c, fs_usage, &r.e, dirty_sectors); ++ ++ return 0; ++} ++ ++static int bch2_mark_stripe(struct bch_fs *c, ++ struct bkey_s_c old, struct bkey_s_c new, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ bool gc = flags & BTREE_TRIGGER_GC; ++ size_t idx = new.k->p.offset; ++ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe ++ ? bkey_s_c_to_stripe(old).v : NULL; ++ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe ++ ? bkey_s_c_to_stripe(new).v : NULL; ++ struct stripe *m = genradix_ptr(&c->stripes[gc], idx); ++ unsigned i; ++ int ret; ++ ++ if (!m || (old_s && !m->alive)) { ++ bch_err_ratelimited(c, "error marking nonexistent stripe %zu", ++ idx); ++ return -1; ++ } ++ ++ if (!new_s) { ++ /* Deleting: */ ++ for (i = 0; i < old_s->nr_blocks; i++) { ++ ret = bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, ++ journal_seq, flags, false); ++ if (ret) ++ return ret; ++ } ++ ++ if (!gc && m->on_heap) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_del(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ ++ memset(m, 0, sizeof(*m)); ++ } else { ++ BUG_ON(old_s && new_s->nr_blocks != old_s->nr_blocks); ++ BUG_ON(old_s && new_s->nr_redundant != old_s->nr_redundant); ++ ++ for (i = 0; i < new_s->nr_blocks; i++) { ++ if (!old_s || ++ memcmp(new_s->ptrs + i, ++ old_s->ptrs + i, ++ sizeof(struct bch_extent_ptr))) { ++ ++ if (old_s) { ++ bucket_set_stripe(c, old, old_s->ptrs + i, fs_usage, ++ journal_seq, flags, false); ++ if (ret) ++ return ret; ++ } ++ ret = bucket_set_stripe(c, new, new_s->ptrs + i, fs_usage, ++ journal_seq, flags, true); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ m->alive = true; ++ m->sectors = le16_to_cpu(new_s->sectors); ++ m->algorithm = new_s->algorithm; ++ m->nr_blocks = new_s->nr_blocks; ++ m->nr_redundant = new_s->nr_redundant; ++ ++ bch2_bkey_to_replicas(&m->r.e, new); ++ ++ /* gc recalculates these fields: */ ++ if (!(flags & BTREE_TRIGGER_GC)) { ++ m->blocks_nonempty = 0; ++ ++ for (i = 0; i < new_s->nr_blocks; i++) { ++ m->block_sectors[i] = ++ stripe_blockcount_get(new_s, i); ++ m->blocks_nonempty += !!m->block_sectors[i]; ++ } ++ } ++ ++ if (!gc) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ bch2_stripes_heap_update(c, m, idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ } ++ ++ return 0; ++} ++ ++static int 
bch2_mark_key_locked(struct bch_fs *c, ++ struct bkey_s_c old, ++ struct bkey_s_c new, ++ unsigned offset, s64 sectors, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ struct bkey_s_c k = flags & BTREE_TRIGGER_INSERT ? new : old; ++ int ret = 0; ++ ++ BUG_ON(!(flags & (BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE))); ++ ++ preempt_disable(); ++ ++ if (!fs_usage || (flags & BTREE_TRIGGER_GC)) ++ fs_usage = fs_usage_ptr(c, journal_seq, ++ flags & BTREE_TRIGGER_GC); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_alloc: ++ ret = bch2_mark_alloc(c, old, new, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ++ ? c->opts.btree_node_size ++ : -c->opts.btree_node_size; ++ ++ ret = bch2_mark_extent(c, old, new, offset, sectors, ++ BCH_DATA_btree, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ ret = bch2_mark_extent(c, old, new, offset, sectors, ++ BCH_DATA_user, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_stripe: ++ ret = bch2_mark_stripe(c, old, new, fs_usage, journal_seq, flags); ++ break; ++ case KEY_TYPE_inode: ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) ++ fs_usage->nr_inodes++; ++ else ++ fs_usage->nr_inodes--; ++ break; ++ case KEY_TYPE_reservation: { ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ ++ sectors *= replicas; ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(fs_usage->persistent_reserved)); ++ ++ fs_usage->reserved += sectors; ++ fs_usage->persistent_reserved[replicas - 1] += sectors; ++ break; ++ } ++ } ++ ++ preempt_enable(); ++ ++ return ret; ++} ++ ++int bch2_mark_key(struct bch_fs *c, struct bkey_s_c new, ++ unsigned offset, s64 sectors, ++ struct bch_fs_usage *fs_usage, ++ u64 journal_seq, unsigned flags) ++{ ++ struct bkey deleted; ++ struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; ++ int ret; ++ ++ bkey_init(&deleted); ++ ++ percpu_down_read(&c->mark_lock); ++ ret = bch2_mark_key_locked(c, old, new, offset, sectors, ++ fs_usage, journal_seq, ++ BTREE_TRIGGER_INSERT|flags); ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++int bch2_mark_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *new, ++ struct bch_fs_usage *fs_usage, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree *b = iter_l(iter)->b; ++ struct btree_node_iter node_iter = iter_l(iter)->iter; ++ struct bkey_packed *_old; ++ struct bkey_s_c old; ++ struct bkey unpacked; ++ int ret = 0; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(iter->btree_id)) ++ return 0; ++ ++ bkey_init(&unpacked); ++ old = (struct bkey_s_c) { &unpacked, NULL }; ++ ++ if (!btree_node_type_is_extents(iter->btree_id)) { ++ if (btree_iter_type(iter) != BTREE_ITER_CACHED) { ++ _old = bch2_btree_node_iter_peek(&node_iter, b); ++ if (_old) ++ old = bkey_disassemble(b, _old, &unpacked); ++ } else { ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ ++ if (ck->valid) ++ old = bkey_i_to_s_c(ck->k); ++ } ++ ++ if (old.k->type == new->k.type) { ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); ++ ++ } else { ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|flags); ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), 0, 0, ++ fs_usage, 
trans->journal_res.seq, ++ BTREE_TRIGGER_OVERWRITE|flags); ++ } ++ } else { ++ BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED); ++ bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), ++ 0, new->k.size, ++ fs_usage, trans->journal_res.seq, ++ BTREE_TRIGGER_INSERT|flags); ++ ++ while ((_old = bch2_btree_node_iter_peek(&node_iter, b))) { ++ unsigned offset = 0; ++ s64 sectors; ++ ++ old = bkey_disassemble(b, _old, &unpacked); ++ sectors = -((s64) old.k->size); ++ ++ flags |= BTREE_TRIGGER_OVERWRITE; ++ ++ if (bkey_cmp(new->k.p, bkey_start_pos(old.k)) <= 0) ++ return 0; ++ ++ switch (bch2_extent_overlap(&new->k, old.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ offset = 0; ++ sectors = -((s64) old.k->size); ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = bkey_start_offset(&new->k) - ++ old.k->p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_FRONT: ++ offset = 0; ++ sectors = bkey_start_offset(old.k) - ++ new->k.p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ offset = bkey_start_offset(&new->k) - ++ bkey_start_offset(old.k); ++ sectors = -((s64) new->k.size); ++ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; ++ break; ++ } ++ ++ BUG_ON(sectors >= 0); ++ ++ ret = bch2_mark_key_locked(c, old, bkey_i_to_s_c(new), ++ offset, sectors, fs_usage, ++ trans->journal_res.seq, flags) ?: 1; ++ if (ret <= 0) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ } ++ ++ return ret; ++} ++ ++void bch2_trans_fs_usage_apply(struct btree_trans *trans, ++ struct bch_fs_usage *fs_usage) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_insert_entry *i; ++ static int warned_disk_usage = 0; ++ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0; ++ char buf[200]; ++ ++ if (!bch2_fs_usage_apply(c, fs_usage, trans->disk_res, ++ trans->journal_res.seq) || ++ warned_disk_usage || ++ xchg(&warned_disk_usage, 1)) ++ return; ++ ++ bch_err(c, "disk usage increased more than %llu sectors reserved", ++ disk_res_sectors); ++ ++ trans_for_each_update(trans, i) { ++ pr_err("while inserting"); ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(i->k)); ++ pr_err("%s", buf); ++ pr_err("overlapping with"); ++ ++ if (btree_iter_type(i->iter) != BTREE_ITER_CACHED) { ++ struct btree *b = iter_l(i->iter)->b; ++ struct btree_node_iter node_iter = iter_l(i->iter)->iter; ++ struct bkey_packed *_k; ++ ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ ++ pr_info("_k %px format %u", _k, _k->format); ++ k = bkey_disassemble(b, _k, &unpacked); ++ ++ if (btree_node_is_extents(b) ++ ? bkey_cmp(i->k->k.p, bkey_start_pos(k.k)) <= 0 ++ : bkey_cmp(i->k->k.p, k.k->p)) ++ break; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, k); ++ pr_err("%s", buf); ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ } else { ++ struct bkey_cached *ck = (void *) i->iter->l[0].b; ++ ++ if (ck->valid) { ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(ck->k)); ++ pr_err("%s", buf); ++ } ++ } ++ } ++} ++ ++/* trans_mark: */ ++ ++static struct btree_iter *trans_get_update(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ struct bkey_s_c *k) ++{ ++ struct btree_insert_entry *i; ++ ++ trans_for_each_update(trans, i) ++ if (i->iter->btree_id == btree_id && ++ (btree_node_type_is_extents(btree_id) ++ ? 
bkey_cmp(pos, bkey_start_pos(&i->k->k)) >= 0 && ++ bkey_cmp(pos, i->k->k.p) < 0 ++ : !bkey_cmp(pos, i->iter->pos))) { ++ *k = bkey_i_to_s_c(i->k); ++ return i->iter; ++ } ++ ++ return NULL; ++} ++ ++static int trans_get_key(struct btree_trans *trans, ++ enum btree_id btree_id, struct bpos pos, ++ struct btree_iter **iter, ++ struct bkey_s_c *k) ++{ ++ unsigned flags = btree_id != BTREE_ID_ALLOC ++ ? BTREE_ITER_SLOTS ++ : BTREE_ITER_CACHED; ++ int ret; ++ ++ *iter = trans_get_update(trans, btree_id, pos, k); ++ if (*iter) ++ return 1; ++ ++ *iter = bch2_trans_get_iter(trans, btree_id, pos, ++ flags|BTREE_ITER_INTENT); ++ if (IS_ERR(*iter)) ++ return PTR_ERR(*iter); ++ ++ *k = __bch2_btree_iter_peek(*iter, flags); ++ ret = bkey_err(*k); ++ if (ret) ++ bch2_trans_iter_put(trans, *iter); ++ return ret; ++} ++ ++static int bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter **_iter, ++ const struct bch_extent_ptr *ptr, ++ struct bkey_alloc_unpacked *u) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct bpos pos = POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); ++ struct bucket *g; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = trans_get_update(trans, BTREE_ID_ALLOC, pos, &k); ++ if (iter) { ++ *u = bch2_alloc_unpack(k); ++ } else { ++ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, pos, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) { ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++ } ++ ++ percpu_down_read(&c->mark_lock); ++ g = bucket(ca, pos.offset); ++ *u = alloc_mem_to_key(g, READ_ONCE(g->mark)); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ *_iter = iter; ++ return 0; ++} ++ ++static int bch2_trans_mark_pointer(struct btree_trans *trans, ++ struct bkey_s_c k, struct extent_ptr_decoded p, ++ s64 sectors, enum bch_data_type data_type) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_alloc_unpacked u; ++ struct bkey_i_alloc *a; ++ int ret; ++ ++ ret = bch2_trans_start_alloc_update(trans, &iter, &p.ptr, &u); ++ if (ret) ++ return ret; ++ ++ ret = __mark_pointer(c, k, &p.ptr, sectors, data_type, u.gen, &u.data_type, ++ &u.dirty_sectors, &u.cached_sectors); ++ if (ret) ++ goto out; ++ ++ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto out; ++ ++ bkey_alloc_init(&a->k_i); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, u); ++ bch2_trans_update(trans, iter, &a->k_i, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, ++ struct bch_extent_stripe_ptr p, ++ s64 sectors, enum bch_data_type data_type, ++ struct bch_replicas_padded *r, ++ unsigned *nr_data, ++ unsigned *nr_parity) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_i_stripe *s; ++ int ret = 0; ++ ++ ret = trans_get_key(trans, BTREE_ID_EC, POS(0, p.idx), &iter, &k); ++ if (ret < 0) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_stripe) { ++ bch2_fs_inconsistent(c, ++ "pointer to nonexistent stripe %llu", ++ (u64) p.idx); ++ ret = -EIO; ++ goto out; ++ } ++ ++ s = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(s); ++ if (ret) ++ goto out; ++ ++ bkey_reassemble(&s->k_i, k); ++ ++ stripe_blockcount_set(&s->v, p.block, ++ stripe_blockcount_get(&s->v, p.block) + ++ sectors); ++ 
++ *nr_data = s->v.nr_blocks - s->v.nr_redundant; ++ *nr_parity = s->v.nr_redundant; ++ bch2_bkey_to_replicas(&r->e, bkey_i_to_s_c(&s->k_i)); ++ bch2_trans_update(trans, iter, &s->k_i, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_extent(struct btree_trans *trans, ++ struct bkey_s_c k, unsigned offset, ++ s64 sectors, unsigned flags, ++ enum bch_data_type data_type) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_replicas_padded r; ++ s64 dirty_sectors = 0; ++ bool stale; ++ int ret; ++ ++ r.e.data_type = data_type; ++ r.e.nr_devs = 0; ++ r.e.nr_required = 1; ++ ++ BUG_ON(!sectors); ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ s64 disk_sectors = data_type == BCH_DATA_btree ++ ? sectors ++ : ptr_disk_sectors_delta(p, offset, sectors, flags); ++ ++ ret = bch2_trans_mark_pointer(trans, k, p, disk_sectors, ++ data_type); ++ if (ret < 0) ++ return ret; ++ ++ stale = ret > 0; ++ ++ if (p.ptr.cached) { ++ if (!stale) ++ update_cached_sectors_list(trans, p.ptr.dev, ++ disk_sectors); ++ } else if (!p.has_ec) { ++ dirty_sectors += disk_sectors; ++ r.e.devs[r.e.nr_devs++] = p.ptr.dev; ++ } else { ++ struct bch_replicas_padded ec_r; ++ unsigned nr_data, nr_parity; ++ s64 parity_sectors; ++ ++ ret = bch2_trans_mark_stripe_ptr(trans, p.ec, ++ disk_sectors, data_type, ++ &ec_r, &nr_data, &nr_parity); ++ if (ret) ++ return ret; ++ ++ parity_sectors = ++ __ptr_disk_sectors_delta(p.crc.live_size, ++ offset, sectors, flags, ++ p.crc.compressed_size * nr_parity, ++ p.crc.uncompressed_size * nr_data); ++ ++ update_replicas_list(trans, &ec_r.e, ++ disk_sectors + parity_sectors); ++ ++ r.e.nr_required = 0; ++ } ++ } ++ ++ if (r.e.nr_devs) ++ update_replicas_list(trans, &r.e, dirty_sectors); ++ ++ return 0; ++} ++ ++static int bch2_trans_mark_stripe(struct btree_trans *trans, ++ struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ struct bkey_alloc_unpacked u; ++ struct bkey_i_alloc *a; ++ struct btree_iter *iter; ++ unsigned i; ++ int ret = 0; ++ ++ /* ++ * The allocator code doesn't necessarily update bucket gens in the ++ * btree when incrementing them, right before handing out new buckets - ++ * we just need to persist those updates here along with the new stripe: ++ */ ++ ++ for (i = 0; i < s->nr_blocks && !ret; i++) { ++ ret = bch2_trans_start_alloc_update(trans, &iter, ++ &s->ptrs[i], &u); ++ if (ret) ++ break; ++ ++ a = bch2_trans_kmalloc(trans, BKEY_ALLOC_U64s_MAX * 8); ++ ret = PTR_ERR_OR_ZERO(a); ++ if (ret) ++ goto put_iter; ++ ++ bkey_alloc_init(&a->k_i); ++ a->k.p = iter->pos; ++ bch2_alloc_pack(a, u); ++ bch2_trans_update(trans, iter, &a->k_i, 0); ++put_iter: ++ bch2_trans_iter_put(trans, iter); ++ } ++ ++ return ret; ++} ++ ++static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, ++ u64 idx, unsigned sectors, ++ unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_i_reflink_v *r_v; ++ s64 ret; ++ ++ ret = trans_get_key(trans, BTREE_ID_REFLINK, ++ POS(0, idx), &iter, &k); ++ if (ret < 0) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_reflink_v) { ++ bch2_fs_inconsistent(c, ++ "%llu:%llu len %u points to nonexistent indirect extent %llu", ++ p.k->p.inode, p.k->p.offset, p.k->size, idx); ++ ret = -EIO; ++ goto err; ++ } ++ ++ if ((flags & BTREE_TRIGGER_OVERWRITE) && ++ (bkey_start_offset(k.k) < idx || ++ 
k.k->p.offset > idx + sectors)) ++ goto out; ++ ++ sectors = k.k->p.offset - idx; ++ ++ r_v = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(r_v); ++ if (ret) ++ goto err; ++ ++ bkey_reassemble(&r_v->k_i, k); ++ ++ le64_add_cpu(&r_v->v.refcount, ++ !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1); ++ ++ if (!r_v->v.refcount) { ++ r_v->k.type = KEY_TYPE_deleted; ++ set_bkey_val_u64s(&r_v->k, 0); ++ } ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(k.k)); ++ BUG_ON(iter->uptodate > BTREE_ITER_NEED_PEEK); ++ ++ bch2_trans_update(trans, iter, &r_v->k_i, 0); ++out: ++ ret = sectors; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_trans_mark_reflink_p(struct btree_trans *trans, ++ struct bkey_s_c_reflink_p p, unsigned offset, ++ s64 sectors, unsigned flags) ++{ ++ u64 idx = le64_to_cpu(p.v->idx) + offset; ++ s64 ret = 0; ++ ++ sectors = abs(sectors); ++ BUG_ON(offset + sectors > p.k->size); ++ ++ while (sectors) { ++ ret = __bch2_trans_mark_reflink_p(trans, p, idx, sectors, flags); ++ if (ret < 0) ++ break; ++ ++ idx += ret; ++ sectors = max_t(s64, 0LL, sectors - ret); ++ ret = 0; ++ } ++ ++ return ret; ++} ++ ++int bch2_trans_mark_key(struct btree_trans *trans, struct bkey_s_c k, ++ unsigned offset, s64 sectors, unsigned flags) ++{ ++ struct replicas_delta_list *d; ++ struct bch_fs *c = trans->c; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ sectors = !(flags & BTREE_TRIGGER_OVERWRITE) ++ ? c->opts.btree_node_size ++ : -c->opts.btree_node_size; ++ ++ return bch2_trans_mark_extent(trans, k, offset, sectors, ++ flags, BCH_DATA_btree); ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return bch2_trans_mark_extent(trans, k, offset, sectors, ++ flags, BCH_DATA_user); ++ case KEY_TYPE_stripe: ++ return bch2_trans_mark_stripe(trans, k); ++ case KEY_TYPE_inode: ++ d = replicas_deltas_realloc(trans, 0); ++ ++ if (!(flags & BTREE_TRIGGER_OVERWRITE)) ++ d->nr_inodes++; ++ else ++ d->nr_inodes--; ++ return 0; ++ case KEY_TYPE_reservation: { ++ unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; ++ ++ d = replicas_deltas_realloc(trans, 0); ++ ++ sectors *= replicas; ++ replicas = clamp_t(unsigned, replicas, 1, ++ ARRAY_SIZE(d->persistent_reserved)); ++ ++ d->persistent_reserved[replicas - 1] += sectors; ++ return 0; ++ } ++ case KEY_TYPE_reflink_p: ++ return bch2_trans_mark_reflink_p(trans, ++ bkey_s_c_to_reflink_p(k), ++ offset, sectors, flags); ++ default: ++ return 0; ++ } ++} ++ ++int bch2_trans_mark_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert, ++ unsigned flags) ++{ ++ struct btree *b = iter_l(iter)->b; ++ struct btree_node_iter node_iter = iter_l(iter)->iter; ++ struct bkey_packed *_k; ++ int ret; ++ ++ if (unlikely(flags & BTREE_TRIGGER_NORUN)) ++ return 0; ++ ++ if (!btree_node_type_needs_gc(iter->btree_id)) ++ return 0; ++ ++ ret = bch2_trans_mark_key(trans, bkey_i_to_s_c(insert), ++ 0, insert->k.size, BTREE_TRIGGER_INSERT); ++ if (ret) ++ return ret; ++ ++ if (btree_iter_type(iter) == BTREE_ITER_CACHED) { ++ struct bkey_cached *ck = (void *) iter->l[0].b; ++ ++ return bch2_trans_mark_key(trans, bkey_i_to_s_c(ck->k), ++ 0, 0, BTREE_TRIGGER_OVERWRITE); ++ } ++ ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k; ++ unsigned offset = 0; ++ s64 sectors = 0; ++ unsigned flags = BTREE_TRIGGER_OVERWRITE; ++ ++ k = bkey_disassemble(b, _k, &unpacked); ++ ++ if (btree_node_is_extents(b) ++ ? 
bkey_cmp(insert->k.p, bkey_start_pos(k.k)) <= 0 ++ : bkey_cmp(insert->k.p, k.k->p)) ++ break; ++ ++ if (btree_node_is_extents(b)) { ++ switch (bch2_extent_overlap(&insert->k, k.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ offset = 0; ++ sectors = -((s64) k.k->size); ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ sectors = bkey_start_offset(&insert->k) - ++ k.k->p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_FRONT: ++ offset = 0; ++ sectors = bkey_start_offset(k.k) - ++ insert->k.p.offset; ++ break; ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ sectors = -((s64) insert->k.size); ++ flags |= BTREE_TRIGGER_OVERWRITE_SPLIT; ++ break; ++ } ++ ++ BUG_ON(sectors >= 0); ++ } ++ ++ ret = bch2_trans_mark_key(trans, k, offset, sectors, flags); ++ if (ret) ++ return ret; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return 0; ++} ++ ++/* Disk reservations: */ ++ ++static u64 bch2_recalc_sectors_available(struct bch_fs *c) ++{ ++ percpu_u64_set(&c->pcpu->sectors_available, 0); ++ ++ return avail_factor(__bch2_fs_usage_read_short(c).free); ++} ++ ++void __bch2_disk_reservation_put(struct bch_fs *c, struct disk_reservation *res) ++{ ++ percpu_down_read(&c->mark_lock); ++ this_cpu_sub(c->usage[0]->online_reserved, ++ res->sectors); ++ percpu_up_read(&c->mark_lock); ++ ++ res->sectors = 0; ++} ++ ++#define SECTORS_CACHE 1024 ++ ++int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, ++ unsigned sectors, int flags) ++{ ++ struct bch_fs_pcpu *pcpu; ++ u64 old, v, get; ++ s64 sectors_available; ++ int ret; ++ ++ percpu_down_read(&c->mark_lock); ++ preempt_disable(); ++ pcpu = this_cpu_ptr(c->pcpu); ++ ++ if (sectors <= pcpu->sectors_available) ++ goto out; ++ ++ v = atomic64_read(&c->sectors_available); ++ do { ++ old = v; ++ get = min((u64) sectors + SECTORS_CACHE, old); ++ ++ if (get < sectors) { ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ goto recalculate; ++ } ++ } while ((v = atomic64_cmpxchg(&c->sectors_available, ++ old, old - get)) != old); ++ ++ pcpu->sectors_available += get; ++ ++out: ++ pcpu->sectors_available -= sectors; ++ this_cpu_add(c->usage[0]->online_reserved, sectors); ++ res->sectors += sectors; ++ ++ preempt_enable(); ++ percpu_up_read(&c->mark_lock); ++ return 0; ++ ++recalculate: ++ percpu_down_write(&c->mark_lock); ++ ++ sectors_available = bch2_recalc_sectors_available(c); ++ ++ if (sectors <= sectors_available || ++ (flags & BCH_DISK_RESERVATION_NOFAIL)) { ++ atomic64_set(&c->sectors_available, ++ max_t(s64, 0, sectors_available - sectors)); ++ this_cpu_add(c->usage[0]->online_reserved, sectors); ++ res->sectors += sectors; ++ ret = 0; ++ } else { ++ atomic64_set(&c->sectors_available, sectors_available); ++ ret = -ENOSPC; ++ } ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return ret; ++} ++ ++/* Startup/shutdown: */ ++ ++static void buckets_free_rcu(struct rcu_head *rcu) ++{ ++ struct bucket_array *buckets = ++ container_of(rcu, struct bucket_array, rcu); ++ ++ kvpfree(buckets, ++ sizeof(struct bucket_array) + ++ buckets->nbuckets * sizeof(struct bucket)); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bucket_array *buckets = NULL, *old_buckets = NULL; ++ unsigned long *buckets_nouse = NULL; ++ alloc_fifo free[RESERVE_NR]; ++ alloc_fifo free_inc; ++ alloc_heap alloc_heap; ++ ++ size_t btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, ++ 
ca->mi.bucket_size / c->opts.btree_node_size); ++ /* XXX: these should be tunable */ ++ size_t reserve_none = max_t(size_t, 1, nbuckets >> 9); ++ size_t copygc_reserve = max_t(size_t, 2, nbuckets >> 7); ++ size_t free_inc_nr = max(max_t(size_t, 1, nbuckets >> 12), ++ btree_reserve * 2); ++ bool resize = ca->buckets[0] != NULL; ++ int ret = -ENOMEM; ++ unsigned i; ++ ++ memset(&free, 0, sizeof(free)); ++ memset(&free_inc, 0, sizeof(free_inc)); ++ memset(&alloc_heap, 0, sizeof(alloc_heap)); ++ ++ if (!(buckets = kvpmalloc(sizeof(struct bucket_array) + ++ nbuckets * sizeof(struct bucket), ++ GFP_KERNEL|__GFP_ZERO)) || ++ !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * ++ sizeof(unsigned long), ++ GFP_KERNEL|__GFP_ZERO)) || ++ !init_fifo(&free[RESERVE_BTREE], btree_reserve, GFP_KERNEL) || ++ !init_fifo(&free[RESERVE_MOVINGGC], ++ copygc_reserve, GFP_KERNEL) || ++ !init_fifo(&free[RESERVE_NONE], reserve_none, GFP_KERNEL) || ++ !init_fifo(&free_inc, free_inc_nr, GFP_KERNEL) || ++ !init_heap(&alloc_heap, ALLOC_SCAN_BATCH(ca) << 1, GFP_KERNEL)) ++ goto err; ++ ++ buckets->first_bucket = ca->mi.first_bucket; ++ buckets->nbuckets = nbuckets; ++ ++ bch2_copygc_stop(c); ++ ++ if (resize) { ++ down_write(&c->gc_lock); ++ down_write(&ca->bucket_lock); ++ percpu_down_write(&c->mark_lock); ++ } ++ ++ old_buckets = bucket_array(ca); ++ ++ if (resize) { ++ size_t n = min(buckets->nbuckets, old_buckets->nbuckets); ++ ++ memcpy(buckets->b, ++ old_buckets->b, ++ n * sizeof(struct bucket)); ++ memcpy(buckets_nouse, ++ ca->buckets_nouse, ++ BITS_TO_LONGS(n) * sizeof(unsigned long)); ++ } ++ ++ rcu_assign_pointer(ca->buckets[0], buckets); ++ buckets = old_buckets; ++ ++ swap(ca->buckets_nouse, buckets_nouse); ++ ++ if (resize) { ++ percpu_up_write(&c->mark_lock); ++ up_write(&c->gc_lock); ++ } ++ ++ spin_lock(&c->freelist_lock); ++ for (i = 0; i < RESERVE_NR; i++) { ++ fifo_move(&free[i], &ca->free[i]); ++ swap(ca->free[i], free[i]); ++ } ++ fifo_move(&free_inc, &ca->free_inc); ++ swap(ca->free_inc, free_inc); ++ spin_unlock(&c->freelist_lock); ++ ++ /* with gc lock held, alloc_heap can't be in use: */ ++ swap(ca->alloc_heap, alloc_heap); ++ ++ nbuckets = ca->mi.nbuckets; ++ ++ if (resize) ++ up_write(&ca->bucket_lock); ++ ++ ret = 0; ++err: ++ free_heap(&alloc_heap); ++ free_fifo(&free_inc); ++ for (i = 0; i < RESERVE_NR; i++) ++ free_fifo(&free[i]); ++ kvpfree(buckets_nouse, ++ BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); ++ if (buckets) ++ call_rcu(&old_buckets->rcu, buckets_free_rcu); ++ ++ return ret; ++} ++ ++void bch2_dev_buckets_free(struct bch_dev *ca) ++{ ++ unsigned i; ++ ++ free_heap(&ca->alloc_heap); ++ free_fifo(&ca->free_inc); ++ for (i = 0; i < RESERVE_NR; i++) ++ free_fifo(&ca->free[i]); ++ kvpfree(ca->buckets_nouse, ++ BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); ++ kvpfree(rcu_dereference_protected(ca->buckets[0], 1), ++ sizeof(struct bucket_array) + ++ ca->mi.nbuckets * sizeof(struct bucket)); ++ ++ free_percpu(ca->usage[0]); ++} ++ ++int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ if (!(ca->usage[0] = alloc_percpu(struct bch_dev_usage))) ++ return -ENOMEM; ++ ++ return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets);; ++} +diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h +new file mode 100644 +index 000000000000..a3873becbb70 +--- /dev/null ++++ b/fs/bcachefs/buckets.h +@@ -0,0 +1,318 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* ++ * Code for manipulating bucket marks for garbage collection. ++ * ++ * Copyright 2014 Datera, Inc. 
++ */ ++ ++#ifndef _BUCKETS_H ++#define _BUCKETS_H ++ ++#include "buckets_types.h" ++#include "super.h" ++ ++#define for_each_bucket(_b, _buckets) \ ++ for (_b = (_buckets)->b + (_buckets)->first_bucket; \ ++ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) ++ ++#define bucket_cmpxchg(g, new, expr) \ ++({ \ ++ struct bucket *_g = g; \ ++ u64 _v = atomic64_read(&(g)->_mark.v); \ ++ struct bucket_mark _old; \ ++ \ ++ do { \ ++ (new).v.counter = _old.v.counter = _v; \ ++ expr; \ ++ } while ((_v = atomic64_cmpxchg(&(_g)->_mark.v, \ ++ _old.v.counter, \ ++ (new).v.counter)) != _old.v.counter);\ ++ _old; \ ++}) ++ ++static inline struct bucket_array *__bucket_array(struct bch_dev *ca, ++ bool gc) ++{ ++ return rcu_dereference_check(ca->buckets[gc], ++ !ca->fs || ++ percpu_rwsem_is_held(&ca->fs->mark_lock) || ++ lockdep_is_held(&ca->fs->gc_lock) || ++ lockdep_is_held(&ca->bucket_lock)); ++} ++ ++static inline struct bucket_array *bucket_array(struct bch_dev *ca) ++{ ++ return __bucket_array(ca, false); ++} ++ ++static inline struct bucket *__bucket(struct bch_dev *ca, size_t b, bool gc) ++{ ++ struct bucket_array *buckets = __bucket_array(ca, gc); ++ ++ BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); ++ return buckets->b + b; ++} ++ ++static inline struct bucket *bucket(struct bch_dev *ca, size_t b) ++{ ++ return __bucket(ca, b, false); ++} ++ ++static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) ++{ ++ return c->bucket_clock[rw].hand - g->io_time[rw]; ++} ++ ++/* ++ * bucket_gc_gen() returns the difference between the bucket's current gen and ++ * the oldest gen of any pointer into that bucket in the btree. ++ */ ++ ++static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) ++{ ++ struct bucket *g = bucket(ca, b); ++ ++ return g->mark.gen - g->oldest_gen; ++} ++ ++static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return sector_to_bucket(ca, ptr->offset); ++} ++ ++static inline struct bucket *PTR_BUCKET(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr, ++ bool gc) ++{ ++ return __bucket(ca, PTR_BUCKET_NR(ca, ptr), gc); ++} ++ ++static inline enum bch_data_type ptr_data_type(const struct bkey *k, ++ const struct bch_extent_ptr *ptr) ++{ ++ if (k->type == KEY_TYPE_btree_ptr || ++ k->type == KEY_TYPE_btree_ptr_v2) ++ return BCH_DATA_btree; ++ ++ return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; ++} ++ ++static inline struct bucket_mark ptr_bucket_mark(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ struct bucket_mark m; ++ ++ rcu_read_lock(); ++ m = READ_ONCE(PTR_BUCKET(ca, ptr, 0)->mark); ++ rcu_read_unlock(); ++ ++ return m; ++} ++ ++static inline int gen_cmp(u8 a, u8 b) ++{ ++ return (s8) (a - b); ++} ++ ++static inline int gen_after(u8 a, u8 b) ++{ ++ int r = gen_cmp(a, b); ++ ++ return r > 0 ? r : 0; ++} ++ ++/** ++ * ptr_stale() - check if a pointer points into a bucket that has been ++ * invalidated. ++ */ ++static inline u8 ptr_stale(struct bch_dev *ca, ++ const struct bch_extent_ptr *ptr) ++{ ++ return gen_after(ptr_bucket_mark(ca, ptr).gen, ptr->gen); ++} ++ ++static inline s64 __ptr_disk_sectors(struct extent_ptr_decoded p, ++ unsigned live_size) ++{ ++ return live_size && p.crc.compression_type ++ ? 
max(1U, DIV_ROUND_UP(live_size * p.crc.compressed_size, ++ p.crc.uncompressed_size)) ++ : live_size; ++} ++ ++static inline s64 ptr_disk_sectors(struct extent_ptr_decoded p) ++{ ++ return __ptr_disk_sectors(p, p.crc.live_size); ++} ++ ++/* bucket gc marks */ ++ ++static inline unsigned bucket_sectors_used(struct bucket_mark mark) ++{ ++ return mark.dirty_sectors + mark.cached_sectors; ++} ++ ++static inline bool bucket_unused(struct bucket_mark mark) ++{ ++ return !mark.owned_by_allocator && ++ !mark.data_type && ++ !bucket_sectors_used(mark); ++} ++ ++static inline bool is_available_bucket(struct bucket_mark mark) ++{ ++ return (!mark.owned_by_allocator && ++ !mark.dirty_sectors && ++ !mark.stripe); ++} ++ ++static inline bool bucket_needs_journal_commit(struct bucket_mark m, ++ u16 last_seq_ondisk) ++{ ++ return m.journal_seq_valid && ++ ((s16) m.journal_seq - (s16) last_seq_ondisk > 0); ++} ++ ++/* Device usage: */ ++ ++struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *); ++ ++void bch2_dev_usage_from_buckets(struct bch_fs *); ++ ++static inline u64 __dev_buckets_available(struct bch_dev *ca, ++ struct bch_dev_usage stats) ++{ ++ u64 total = ca->mi.nbuckets - ca->mi.first_bucket; ++ ++ if (WARN_ONCE(stats.buckets_unavailable > total, ++ "buckets_unavailable overflow (%llu > %llu)\n", ++ stats.buckets_unavailable, total)) ++ return 0; ++ ++ return total - stats.buckets_unavailable; ++} ++ ++/* ++ * Number of reclaimable buckets - only for use by the allocator thread: ++ */ ++static inline u64 dev_buckets_available(struct bch_dev *ca) ++{ ++ return __dev_buckets_available(ca, bch2_dev_usage_read(ca)); ++} ++ ++static inline u64 __dev_buckets_free(struct bch_dev *ca, ++ struct bch_dev_usage stats) ++{ ++ return __dev_buckets_available(ca, stats) + ++ fifo_used(&ca->free[RESERVE_NONE]) + ++ fifo_used(&ca->free_inc); ++} ++ ++static inline u64 dev_buckets_free(struct bch_dev *ca) ++{ ++ return __dev_buckets_free(ca, bch2_dev_usage_read(ca)); ++} ++ ++/* Filesystem usage: */ ++ ++static inline unsigned fs_usage_u64s(struct bch_fs *c) ++{ ++ ++ return sizeof(struct bch_fs_usage) / sizeof(u64) + ++ READ_ONCE(c->replicas.nr); ++} ++ ++void bch2_fs_usage_scratch_put(struct bch_fs *, struct bch_fs_usage *); ++struct bch_fs_usage *bch2_fs_usage_scratch_get(struct bch_fs *); ++ ++u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); ++ ++struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *); ++ ++void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); ++ ++void bch2_fs_usage_to_text(struct printbuf *, ++ struct bch_fs *, struct bch_fs_usage *); ++ ++u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage *); ++ ++struct bch_fs_usage_short ++bch2_fs_usage_read_short(struct bch_fs *); ++ ++/* key/bucket marking: */ ++ ++void bch2_bucket_seq_cleanup(struct bch_fs *); ++void bch2_fs_usage_initialize(struct bch_fs *); ++ ++void bch2_invalidate_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, struct bucket_mark *); ++void bch2_mark_alloc_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, bool, struct gc_pos, unsigned); ++void bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, ++ size_t, enum bch_data_type, unsigned, ++ struct gc_pos, unsigned); ++ ++int bch2_mark_key(struct bch_fs *, struct bkey_s_c, unsigned, ++ s64, struct bch_fs_usage *, u64, unsigned); ++int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *, ++ struct disk_reservation *, unsigned); ++ ++int bch2_mark_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct bch_fs_usage *, 
unsigned); ++ ++int bch2_replicas_delta_list_apply(struct bch_fs *, ++ struct bch_fs_usage *, ++ struct replicas_delta_list *); ++int bch2_trans_mark_key(struct btree_trans *, struct bkey_s_c, ++ unsigned, s64, unsigned); ++int bch2_trans_mark_update(struct btree_trans *, struct btree_iter *iter, ++ struct bkey_i *insert, unsigned); ++void bch2_trans_fs_usage_apply(struct btree_trans *, struct bch_fs_usage *); ++ ++/* disk reservations: */ ++ ++void __bch2_disk_reservation_put(struct bch_fs *, struct disk_reservation *); ++ ++static inline void bch2_disk_reservation_put(struct bch_fs *c, ++ struct disk_reservation *res) ++{ ++ if (res->sectors) ++ __bch2_disk_reservation_put(c, res); ++} ++ ++#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) ++ ++int bch2_disk_reservation_add(struct bch_fs *, ++ struct disk_reservation *, ++ unsigned, int); ++ ++static inline struct disk_reservation ++bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) ++{ ++ return (struct disk_reservation) { ++ .sectors = 0, ++#if 0 ++ /* not used yet: */ ++ .gen = c->capacity_gen, ++#endif ++ .nr_replicas = nr_replicas, ++ }; ++} ++ ++static inline int bch2_disk_reservation_get(struct bch_fs *c, ++ struct disk_reservation *res, ++ unsigned sectors, ++ unsigned nr_replicas, ++ int flags) ++{ ++ *res = bch2_disk_reservation_init(c, nr_replicas); ++ ++ return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); ++} ++ ++int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); ++void bch2_dev_buckets_free(struct bch_dev *); ++int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); ++ ++#endif /* _BUCKETS_H */ +diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h +new file mode 100644 +index 000000000000..d5215b14d7d9 +--- /dev/null ++++ b/fs/bcachefs/buckets_types.h +@@ -0,0 +1,135 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BUCKETS_TYPES_H ++#define _BUCKETS_TYPES_H ++ ++#include "bcachefs_format.h" ++#include "util.h" ++ ++#define BUCKET_JOURNAL_SEQ_BITS 16 ++ ++struct bucket_mark { ++ union { ++ atomic64_t v; ++ ++ struct { ++ u8 gen; ++ u8 data_type:3, ++ owned_by_allocator:1, ++ journal_seq_valid:1, ++ stripe:1; ++ u16 dirty_sectors; ++ u16 cached_sectors; ++ ++ /* ++ * low bits of journal sequence number when this bucket was most ++ * recently modified: if journal_seq_valid is set, this bucket can't be ++ * reused until the journal sequence number written to disk is >= the ++ * bucket's journal sequence number: ++ */ ++ u16 journal_seq; ++ }; ++ }; ++}; ++ ++struct bucket { ++ union { ++ struct bucket_mark _mark; ++ const struct bucket_mark mark; ++ }; ++ ++ u16 io_time[2]; ++ u8 oldest_gen; ++ u8 gc_gen; ++ unsigned gen_valid:1; ++}; ++ ++struct bucket_array { ++ struct rcu_head rcu; ++ u16 first_bucket; ++ size_t nbuckets; ++ struct bucket b[]; ++}; ++ ++struct bch_dev_usage { ++ u64 buckets[BCH_DATA_NR]; ++ u64 buckets_alloc; ++ u64 buckets_unavailable; ++ ++ /* _compressed_ sectors: */ ++ u64 sectors[BCH_DATA_NR]; ++ u64 sectors_fragmented; ++ ++ u64 buckets_ec; ++ u64 sectors_ec; ++}; ++ ++struct bch_fs_usage { ++ /* all fields are in units of 512 byte sectors: */ ++ ++ u64 online_reserved; ++ ++ /* fields after online_reserved are cleared/recalculated by gc: */ ++ u64 gc_start[0]; ++ ++ u64 hidden; ++ u64 btree; ++ u64 data; ++ u64 cached; ++ u64 reserved; ++ u64 nr_inodes; ++ ++ /* XXX: add stats for compression ratio */ ++#if 0 ++ u64 uncompressed; ++ u64 compressed; ++#endif ++ ++ /* broken out: */ ++ ++ u64 
persistent_reserved[BCH_REPLICAS_MAX]; ++ u64 replicas[]; ++}; ++ ++struct bch_fs_usage_short { ++ u64 capacity; ++ u64 used; ++ u64 free; ++ u64 nr_inodes; ++}; ++ ++struct replicas_delta { ++ s64 delta; ++ struct bch_replicas_entry r; ++} __packed; ++ ++struct replicas_delta_list { ++ unsigned size; ++ unsigned used; ++ ++ struct {} memset_start; ++ u64 nr_inodes; ++ u64 persistent_reserved[BCH_REPLICAS_MAX]; ++ struct {} memset_end; ++ struct replicas_delta d[0]; ++}; ++ ++/* ++ * A reservation for space on disk: ++ */ ++struct disk_reservation { ++ u64 sectors; ++ u32 gen; ++ unsigned nr_replicas; ++}; ++ ++struct copygc_heap_entry { ++ u8 dev; ++ u8 gen; ++ u16 fragmentation; ++ u32 sectors; ++ u64 offset; ++}; ++ ++typedef HEAP(struct copygc_heap_entry) copygc_heap; ++ ++#endif /* _BUCKETS_TYPES_H */ +diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c +new file mode 100644 +index 000000000000..0377f9018d27 +--- /dev/null ++++ b/fs/bcachefs/chardev.c +@@ -0,0 +1,704 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_CHARDEV ++ ++#include "bcachefs.h" ++#include "bcachefs_ioctl.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "move.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* returns with ref on ca->ref */ ++static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, ++ unsigned flags) ++{ ++ struct bch_dev *ca; ++ ++ if (flags & BCH_BY_INDEX) { ++ if (dev >= c->sb.nr_devices) ++ return ERR_PTR(-EINVAL); ++ ++ rcu_read_lock(); ++ ca = rcu_dereference(c->devs[dev]); ++ if (ca) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ if (!ca) ++ return ERR_PTR(-EINVAL); ++ } else { ++ char *path; ++ ++ path = strndup_user((const char __user *) ++ (unsigned long) dev, PATH_MAX); ++ if (IS_ERR(path)) ++ return ERR_CAST(path); ++ ++ ca = bch2_dev_lookup(c, path); ++ kfree(path); ++ } ++ ++ return ca; ++} ++ ++#if 0 ++static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) ++{ ++ struct bch_ioctl_assemble arg; ++ struct bch_fs *c; ++ u64 *user_devs = NULL; ++ char **devs = NULL; ++ unsigned i; ++ int ret = -EFAULT; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); ++ if (!user_devs) ++ return -ENOMEM; ++ ++ devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); ++ ++ if (copy_from_user(user_devs, user_arg->devs, ++ sizeof(u64) * arg.nr_devs)) ++ goto err; ++ ++ for (i = 0; i < arg.nr_devs; i++) { ++ devs[i] = strndup_user((const char __user *)(unsigned long) ++ user_devs[i], ++ PATH_MAX); ++ if (!devs[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); ++ ret = PTR_ERR_OR_ZERO(c); ++ if (!ret) ++ closure_put(&c->cl); ++err: ++ if (devs) ++ for (i = 0; i < arg.nr_devs; i++) ++ kfree(devs[i]); ++ kfree(devs); ++ return ret; ++} ++ ++static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) ++{ ++ struct bch_ioctl_incremental arg; ++ const char *err; ++ char *path; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ err = bch2_fs_open_incremental(path); ++ 
kfree(path); ++ ++ if (err) { ++ pr_err("Could not register bcachefs devices: %s", err); ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++#endif ++ ++static long bch2_global_ioctl(unsigned cmd, void __user *arg) ++{ ++ switch (cmd) { ++#if 0 ++ case BCH_IOCTL_ASSEMBLE: ++ return bch2_ioctl_assemble(arg); ++ case BCH_IOCTL_INCREMENTAL: ++ return bch2_ioctl_incremental(arg); ++#endif ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static long bch2_ioctl_query_uuid(struct bch_fs *c, ++ struct bch_ioctl_query_uuid __user *user_arg) ++{ ++ return copy_to_user(&user_arg->uuid, ++ &c->sb.user_uuid, ++ sizeof(c->sb.user_uuid)); ++} ++ ++#if 0 ++static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) ++{ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ return bch2_fs_start(c); ++} ++ ++static long bch2_ioctl_stop(struct bch_fs *c) ++{ ++ bch2_fs_stop(c); ++ return 0; ++} ++#endif ++ ++static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ char *path; ++ int ret; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_add(c, path); ++ kfree(path); ++ ++ return ret; ++} ++ ++static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ return bch2_dev_remove(c, ca, arg.flags); ++} ++ ++static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ char *path; ++ int ret; ++ ++ if (arg.flags || arg.pad) ++ return -EINVAL; ++ ++ path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); ++ if (!path) ++ return -ENOMEM; ++ ++ ret = bch2_dev_online(c, path); ++ kfree(path); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_offline(c, ca, arg.flags); ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_set_state(struct bch_fs *c, ++ struct bch_ioctl_disk_set_state arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| ++ BCH_FORCE_IF_METADATA_LOST| ++ BCH_FORCE_IF_DEGRADED| ++ BCH_BY_INDEX)) || ++ arg.pad[0] || arg.pad[1] || arg.pad[2]) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++struct bch_data_ctx { ++ struct bch_fs *c; ++ struct bch_ioctl_data arg; ++ struct bch_move_stats stats; ++ ++ int ret; ++ ++ struct task_struct *thread; ++}; ++ ++static int bch2_data_thread(void *arg) ++{ ++ struct bch_data_ctx *ctx = arg; ++ ++ ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); ++ ++ ctx->stats.data_type = U8_MAX; ++ return 0; ++} ++ ++static int bch2_data_job_release(struct inode *inode, struct file *file) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ ++ 
kthread_stop(ctx->thread); ++ put_task_struct(ctx->thread); ++ kfree(ctx); ++ return 0; ++} ++ ++static ssize_t bch2_data_job_read(struct file *file, char __user *buf, ++ size_t len, loff_t *ppos) ++{ ++ struct bch_data_ctx *ctx = file->private_data; ++ struct bch_fs *c = ctx->c; ++ struct bch_ioctl_data_event e = { ++ .type = BCH_DATA_EVENT_PROGRESS, ++ .p.data_type = ctx->stats.data_type, ++ .p.btree_id = ctx->stats.btree_id, ++ .p.pos = ctx->stats.pos, ++ .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), ++ .p.sectors_total = bch2_fs_usage_read_short(c).used, ++ }; ++ ++ if (len < sizeof(e)) ++ return -EINVAL; ++ ++ return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); ++} ++ ++static const struct file_operations bcachefs_data_ops = { ++ .release = bch2_data_job_release, ++ .read = bch2_data_job_read, ++ .llseek = no_llseek, ++}; ++ ++static long bch2_ioctl_data(struct bch_fs *c, ++ struct bch_ioctl_data arg) ++{ ++ struct bch_data_ctx *ctx = NULL; ++ struct file *file = NULL; ++ unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; ++ int ret, fd = -1; ++ ++ if (arg.op >= BCH_DATA_OP_NR || arg.flags) ++ return -EINVAL; ++ ++ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); ++ if (!ctx) ++ return -ENOMEM; ++ ++ ctx->c = c; ++ ctx->arg = arg; ++ ++ ctx->thread = kthread_create(bch2_data_thread, ctx, "[bcachefs]"); ++ if (IS_ERR(ctx->thread)) { ++ ret = PTR_ERR(ctx->thread); ++ goto err; ++ } ++ ++ ret = get_unused_fd_flags(flags); ++ if (ret < 0) ++ goto err; ++ fd = ret; ++ ++ file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); ++ if (IS_ERR(file)) { ++ ret = PTR_ERR(file); ++ goto err; ++ } ++ ++ fd_install(fd, file); ++ ++ get_task_struct(ctx->thread); ++ wake_up_process(ctx->thread); ++ ++ return fd; ++err: ++ if (fd >= 0) ++ put_unused_fd(fd); ++ if (!IS_ERR_OR_NULL(ctx->thread)) ++ kthread_stop(ctx->thread); ++ kfree(ctx); ++ return ret; ++} ++ ++static long bch2_ioctl_fs_usage(struct bch_fs *c, ++ struct bch_ioctl_fs_usage __user *user_arg) ++{ ++ struct bch_ioctl_fs_usage *arg = NULL; ++ struct bch_replicas_usage *dst_e, *dst_end; ++ struct bch_fs_usage *src; ++ u32 replica_entries_bytes; ++ unsigned i; ++ int ret = 0; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) ++ return -EFAULT; ++ ++ arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); ++ if (!arg) ++ return -ENOMEM; ++ ++ src = bch2_fs_usage_read(c); ++ if (!src) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ arg->capacity = c->capacity; ++ arg->used = bch2_fs_sectors_used(c, src); ++ arg->online_reserved = src->online_reserved; ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) ++ arg->persistent_reserved[i] = src->persistent_reserved[i]; ++ ++ dst_e = arg->replicas; ++ dst_end = (void *) arg->replicas + replica_entries_bytes; ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *src_e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ if (replicas_usage_next(dst_e) > dst_end) { ++ ret = -ERANGE; ++ break; ++ } ++ ++ dst_e->sectors = src->replicas[i]; ++ dst_e->r = *src_e; ++ ++ /* recheck after setting nr_devs: */ ++ if (replicas_usage_next(dst_e) > dst_end) { ++ ret = -ERANGE; ++ break; ++ } ++ ++ memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); ++ ++ dst_e = replicas_usage_next(dst_e); ++ } ++ ++ arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; ++ ++ percpu_up_read(&c->mark_lock); ++ kfree(src); ++ ++ if (!ret) ++ ret = copy_to_user(user_arg, arg, ++ sizeof(*arg) 
+ arg->replica_entries_bytes); ++err: ++ kfree(arg); ++ return ret; ++} ++ ++static long bch2_ioctl_dev_usage(struct bch_fs *c, ++ struct bch_ioctl_dev_usage __user *user_arg) ++{ ++ struct bch_ioctl_dev_usage arg; ++ struct bch_dev_usage src; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ if (copy_from_user(&arg, user_arg, sizeof(arg))) ++ return -EFAULT; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad[0] || ++ arg.pad[1] || ++ arg.pad[2]) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ src = bch2_dev_usage_read(ca); ++ ++ arg.state = ca->mi.state; ++ arg.bucket_size = ca->mi.bucket_size; ++ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; ++ arg.available_buckets = arg.nr_buckets - src.buckets_unavailable; ++ arg.ec_buckets = src.buckets_ec; ++ arg.ec_sectors = src.sectors_ec; ++ ++ for (i = 0; i < BCH_DATA_NR; i++) { ++ arg.buckets[i] = src.buckets[i]; ++ arg.sectors[i] = src.sectors[i]; ++ } ++ ++ percpu_ref_put(&ca->ref); ++ ++ return copy_to_user(user_arg, &arg, sizeof(arg)); ++} ++ ++static long bch2_ioctl_read_super(struct bch_fs *c, ++ struct bch_ioctl_read_super arg) ++{ ++ struct bch_dev *ca = NULL; ++ struct bch_sb *sb; ++ int ret = 0; ++ ++ if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || ++ arg.pad) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (arg.flags & BCH_READ_DEV) { ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ ++ if (IS_ERR(ca)) { ++ ret = PTR_ERR(ca); ++ goto err; ++ } ++ ++ sb = ca->disk_sb.sb; ++ } else { ++ sb = c->disk_sb.sb; ++ } ++ ++ if (vstruct_bytes(sb) > arg.size) { ++ ret = -ERANGE; ++ goto err; ++ } ++ ++ ret = copy_to_user((void __user *)(unsigned long)arg.sb, ++ sb, vstruct_bytes(sb)); ++err: ++ if (ca) ++ percpu_ref_put(&ca->ref); ++ mutex_unlock(&c->sb_lock); ++ return ret; ++} ++ ++static long bch2_ioctl_disk_get_idx(struct bch_fs *c, ++ struct bch_ioctl_disk_get_idx arg) ++{ ++ dev_t dev = huge_decode_dev(arg.dev); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ if (ca->disk_sb.bdev->bd_dev == dev) { ++ percpu_ref_put(&ca->io_ref); ++ return i; ++ } ++ ++ return -ENOENT; ++} ++ ++static long bch2_ioctl_disk_resize(struct bch_fs *c, ++ struct bch_ioctl_disk_resize arg) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ if ((arg.flags & ~BCH_BY_INDEX) || ++ arg.pad) ++ return -EINVAL; ++ ++ ca = bch2_device_lookup(c, arg.dev, arg.flags); ++ if (IS_ERR(ca)) ++ return PTR_ERR(ca); ++ ++ ret = bch2_dev_resize(c, ca, arg.nbuckets); ++ ++ percpu_ref_put(&ca->ref); ++ return ret; ++} ++ ++#define BCH_IOCTL(_name, _argtype) \ ++do { \ ++ _argtype i; \ ++ \ ++ if (copy_from_user(&i, arg, sizeof(i))) \ ++ return -EFAULT; \ ++ return bch2_ioctl_##_name(c, i); \ ++} while (0) ++ ++long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) ++{ ++ /* ioctls that don't require admin cap: */ ++ switch (cmd) { ++ case BCH_IOCTL_QUERY_UUID: ++ return bch2_ioctl_query_uuid(c, arg); ++ case BCH_IOCTL_FS_USAGE: ++ return bch2_ioctl_fs_usage(c, arg); ++ case BCH_IOCTL_DEV_USAGE: ++ return bch2_ioctl_dev_usage(c, arg); ++ } ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ switch (cmd) { ++#if 0 ++ case BCH_IOCTL_START: ++ BCH_IOCTL(start, struct bch_ioctl_start); ++ case BCH_IOCTL_STOP: ++ return bch2_ioctl_stop(c); ++#endif ++ case BCH_IOCTL_READ_SUPER: ++ BCH_IOCTL(read_super, struct bch_ioctl_read_super); ++ case BCH_IOCTL_DISK_GET_IDX: ++ 
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); ++ } ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EINVAL; ++ ++ /* ioctls that do require admin cap: */ ++ switch (cmd) { ++ case BCH_IOCTL_DISK_ADD: ++ BCH_IOCTL(disk_add, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_REMOVE: ++ BCH_IOCTL(disk_remove, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_ONLINE: ++ BCH_IOCTL(disk_online, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_OFFLINE: ++ BCH_IOCTL(disk_offline, struct bch_ioctl_disk); ++ case BCH_IOCTL_DISK_SET_STATE: ++ BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); ++ case BCH_IOCTL_DATA: ++ BCH_IOCTL(data, struct bch_ioctl_data); ++ case BCH_IOCTL_DISK_RESIZE: ++ BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); ++ ++ default: ++ return -ENOTTY; ++ } ++} ++ ++static DEFINE_IDR(bch_chardev_minor); ++ ++static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) ++{ ++ unsigned minor = iminor(file_inode(filp)); ++ struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; ++ void __user *arg = (void __user *) v; ++ ++ return c ++ ? bch2_fs_ioctl(c, cmd, arg) ++ : bch2_global_ioctl(cmd, arg); ++} ++ ++static const struct file_operations bch_chardev_fops = { ++ .owner = THIS_MODULE, ++ .unlocked_ioctl = bch2_chardev_ioctl, ++ .open = nonseekable_open, ++}; ++ ++static int bch_chardev_major; ++static struct class *bch_chardev_class; ++static struct device *bch_chardev; ++ ++void bch2_fs_chardev_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->chardev)) ++ device_unregister(c->chardev); ++ if (c->minor >= 0) ++ idr_remove(&bch_chardev_minor, c->minor); ++} ++ ++int bch2_fs_chardev_init(struct bch_fs *c) ++{ ++ c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); ++ if (c->minor < 0) ++ return c->minor; ++ ++ c->chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, c->minor), c, ++ "bcachefs%u-ctl", c->minor); ++ if (IS_ERR(c->chardev)) ++ return PTR_ERR(c->chardev); ++ ++ return 0; ++} ++ ++void bch2_chardev_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ device_destroy(bch_chardev_class, ++ MKDEV(bch_chardev_major, U8_MAX)); ++ if (!IS_ERR_OR_NULL(bch_chardev_class)) ++ class_destroy(bch_chardev_class); ++ if (bch_chardev_major > 0) ++ unregister_chrdev(bch_chardev_major, "bcachefs"); ++} ++ ++int __init bch2_chardev_init(void) ++{ ++ bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); ++ if (bch_chardev_major < 0) ++ return bch_chardev_major; ++ ++ bch_chardev_class = class_create(THIS_MODULE, "bcachefs"); ++ if (IS_ERR(bch_chardev_class)) ++ return PTR_ERR(bch_chardev_class); ++ ++ bch_chardev = device_create(bch_chardev_class, NULL, ++ MKDEV(bch_chardev_major, U8_MAX), ++ NULL, "bcachefs-ctl"); ++ if (IS_ERR(bch_chardev)) ++ return PTR_ERR(bch_chardev); ++ ++ return 0; ++} ++ ++#endif /* NO_BCACHEFS_CHARDEV */ +diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h +new file mode 100644 +index 000000000000..3a4890d39ff9 +--- /dev/null ++++ b/fs/bcachefs/chardev.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHARDEV_H ++#define _BCACHEFS_CHARDEV_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); ++ ++void bch2_fs_chardev_exit(struct bch_fs *); ++int bch2_fs_chardev_init(struct bch_fs *); ++ ++void bch2_chardev_exit(void); ++int __init bch2_chardev_init(void); ++ ++#else ++ ++static inline long bch2_fs_ioctl(struct bch_fs *c, ++ unsigned cmd, void __user * 
arg) ++{ ++ return -ENOSYS; ++} ++ ++static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} ++static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } ++ ++static inline void bch2_chardev_exit(void) {} ++static inline int __init bch2_chardev_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_CHARDEV_H */ +diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c +new file mode 100644 +index 000000000000..3d88719ba86c +--- /dev/null ++++ b/fs/bcachefs/checksum.c +@@ -0,0 +1,618 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static u64 bch2_checksum_init(unsigned type) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ return U32_MAX; ++ case BCH_CSUM_CRC64_NONZERO: ++ return U64_MAX; ++ case BCH_CSUM_CRC32C: ++ return 0; ++ case BCH_CSUM_CRC64: ++ return 0; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 bch2_checksum_final(unsigned type, u64 crc) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ return crc ^ U32_MAX; ++ case BCH_CSUM_CRC64_NONZERO: ++ return crc ^ U64_MAX; ++ case BCH_CSUM_CRC32C: ++ return crc; ++ case BCH_CSUM_CRC64: ++ return crc; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 bch2_checksum_update(unsigned type, u64 crc, const void *data, size_t len) ++{ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return 0; ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC32C: ++ return crc32c(crc, data, len); ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC64: ++ return crc64_be(crc, data, len); ++ default: ++ BUG(); ++ } ++} ++ ++static inline void do_encrypt_sg(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ struct scatterlist *sg, size_t len) ++{ ++ SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); ++ int ret; ++ ++ skcipher_request_set_sync_tfm(req, tfm); ++ skcipher_request_set_crypt(req, sg, sg, len, nonce.d); ++ ++ ret = crypto_skcipher_encrypt(req); ++ BUG_ON(ret); ++} ++ ++static inline void do_encrypt(struct crypto_sync_skcipher *tfm, ++ struct nonce nonce, ++ void *buf, size_t len) ++{ ++ struct scatterlist sg; ++ ++ sg_init_one(&sg, buf, len); ++ do_encrypt_sg(tfm, nonce, &sg, len); ++} ++ ++int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, ++ void *buf, size_t len) ++{ ++ struct crypto_sync_skcipher *chacha20 = ++ crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ int ret; ++ ++ if (!chacha20) { ++ pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); ++ return PTR_ERR(chacha20); ++ } ++ ++ ret = crypto_skcipher_setkey(&chacha20->base, ++ (void *) key, sizeof(*key)); ++ if (ret) { ++ pr_err("crypto_skcipher_setkey() error: %i", ret); ++ goto err; ++ } ++ ++ do_encrypt(chacha20, nonce, buf, len); ++err: ++ crypto_free_sync_skcipher(chacha20); ++ return ret; ++} ++ ++static void gen_poly_key(struct bch_fs *c, struct shash_desc *desc, ++ struct nonce nonce) ++{ ++ u8 key[POLY1305_KEY_SIZE]; ++ ++ nonce.d[3] ^= BCH_NONCE_POLY; ++ ++ memset(key, 0, sizeof(key)); ++ do_encrypt(c->chacha20, nonce, key, sizeof(key)); ++ ++ desc->tfm = c->poly1305; ++ crypto_shash_init(desc); ++ crypto_shash_update(desc, key, sizeof(key)); ++} ++ ++struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, ++ struct nonce nonce, const void *data, size_t len) ++{ ++ switch (type) { ++ case 
BCH_CSUM_NONE: ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: { ++ u64 crc = bch2_checksum_init(type); ++ ++ crc = bch2_checksum_update(type, crc, data, len); ++ crc = bch2_checksum_final(type, crc); ++ ++ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; ++ } ++ ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++ crypto_shash_update(desc, data, len); ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_encrypt(struct bch_fs *c, unsigned type, ++ struct nonce nonce, void *data, size_t len) ++{ ++ if (!bch2_csum_type_is_encryption(type)) ++ return; ++ ++ do_encrypt(c->chacha20, nonce, data, len); ++} ++ ++static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio, ++ struct bvec_iter *iter) ++{ ++ struct bio_vec bv; ++ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ return (struct bch_csum) { 0 }; ++ case BCH_CSUM_CRC32C_NONZERO: ++ case BCH_CSUM_CRC64_NONZERO: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: { ++ u64 crc = bch2_checksum_init(type); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ crc = bch2_checksum_update(type, ++ crc, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ crc = bch2_checksum_update(type, crc, ++ page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ crc = bch2_checksum_final(type, crc); ++ return (struct bch_csum) { .lo = cpu_to_le64(crc) }; ++ } ++ ++ case BCH_CSUM_CHACHA20_POLY1305_80: ++ case BCH_CSUM_CHACHA20_POLY1305_128: { ++ SHASH_DESC_ON_STACK(desc, c->poly1305); ++ u8 digest[POLY1305_DIGEST_SIZE]; ++ struct bch_csum ret = { 0 }; ++ ++ gen_poly_key(c, desc, nonce); ++ ++#ifdef CONFIG_HIGHMEM ++ __bio_for_each_segment(bv, bio, *iter, *iter) { ++ void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; ++ ++ crypto_shash_update(desc, p, bv.bv_len); ++ kunmap_atomic(p); ++ } ++#else ++ __bio_for_each_bvec(bv, bio, *iter, *iter) ++ crypto_shash_update(desc, ++ page_address(bv.bv_page) + bv.bv_offset, ++ bv.bv_len); ++#endif ++ crypto_shash_final(desc, digest); ++ ++ memcpy(&ret, digest, bch_crc_bytes[type]); ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ ++ return __bch2_checksum_bio(c, type, nonce, bio, &iter); ++} ++ ++void bch2_encrypt_bio(struct bch_fs *c, unsigned type, ++ struct nonce nonce, struct bio *bio) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ struct scatterlist sgl[16], *sg = sgl; ++ size_t bytes = 0; ++ ++ if (!bch2_csum_type_is_encryption(type)) ++ return; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ ++ bio_for_each_segment(bv, bio, iter) { ++ if (sg == sgl + ARRAY_SIZE(sgl)) { ++ sg_mark_end(sg - 1); ++ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++ ++ nonce = nonce_add(nonce, bytes); ++ bytes = 0; ++ ++ sg_init_table(sgl, ARRAY_SIZE(sgl)); ++ sg = sgl; ++ } ++ ++ sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); ++ bytes += bv.bv_len; ++ } ++ ++ sg_mark_end(sg - 1); ++ do_encrypt_sg(c->chacha20, nonce, sgl, bytes); ++} ++ 
++struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, ++ struct bch_csum b, size_t b_len) ++{ ++ BUG_ON(!bch2_checksum_mergeable(type)); ++ ++ while (b_len) { ++ unsigned b = min_t(unsigned, b_len, PAGE_SIZE); ++ ++ a.lo = bch2_checksum_update(type, a.lo, ++ page_address(ZERO_PAGE(0)), b); ++ b_len -= b; ++ } ++ ++ a.lo ^= b.lo; ++ a.hi ^= b.hi; ++ return a; ++} ++ ++int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc_old, ++ struct bch_extent_crc_unpacked *crc_a, ++ struct bch_extent_crc_unpacked *crc_b, ++ unsigned len_a, unsigned len_b, ++ unsigned new_csum_type) ++{ ++ struct bvec_iter iter = bio->bi_iter; ++ struct nonce nonce = extent_nonce(version, crc_old); ++ struct bch_csum merged = { 0 }; ++ struct crc_split { ++ struct bch_extent_crc_unpacked *crc; ++ unsigned len; ++ unsigned csum_type; ++ struct bch_csum csum; ++ } splits[3] = { ++ { crc_a, len_a, new_csum_type }, ++ { crc_b, len_b, new_csum_type }, ++ { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, ++ }, *i; ++ bool mergeable = crc_old.csum_type == new_csum_type && ++ bch2_checksum_mergeable(new_csum_type); ++ unsigned crc_nonce = crc_old.nonce; ++ ++ BUG_ON(len_a + len_b > bio_sectors(bio)); ++ BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); ++ BUG_ON(crc_is_compressed(crc_old)); ++ BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)); ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ iter.bi_size = i->len << 9; ++ if (mergeable || i->crc) ++ i->csum = __bch2_checksum_bio(c, i->csum_type, ++ nonce, bio, &iter); ++ else ++ bio_advance_iter(bio, &iter, i->len << 9); ++ nonce = nonce_add(nonce, i->len << 9); ++ } ++ ++ if (mergeable) ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) ++ merged = bch2_checksum_merge(new_csum_type, merged, ++ i->csum, i->len << 9); ++ else ++ merged = bch2_checksum_bio(c, crc_old.csum_type, ++ extent_nonce(version, crc_old), bio); ++ ++ if (bch2_crc_cmp(merged, crc_old.csum)) ++ return -EIO; ++ ++ for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { ++ if (i->crc) ++ *i->crc = (struct bch_extent_crc_unpacked) { ++ .csum_type = i->csum_type, ++ .compression_type = crc_old.compression_type, ++ .compressed_size = i->len, ++ .uncompressed_size = i->len, ++ .offset = 0, ++ .live_size = i->len, ++ .nonce = crc_nonce, ++ .csum = i->csum, ++ }; ++ ++ if (bch2_csum_type_is_encryption(new_csum_type)) ++ crc_nonce += i->len; ++ } ++ ++ return 0; ++} ++ ++#ifdef __KERNEL__ ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ char key_description[60]; ++ struct key *keyring_key; ++ const struct user_key_payload *ukp; ++ int ret; ++ ++ snprintf(key_description, sizeof(key_description), ++ "bcachefs:%pUb", &sb->user_uuid); ++ ++ keyring_key = request_key(&key_type_logon, key_description, NULL); ++ if (IS_ERR(keyring_key)) ++ return PTR_ERR(keyring_key); ++ ++ down_read(&keyring_key->sem); ++ ukp = dereference_key_locked(keyring_key); ++ if (ukp->datalen == sizeof(*key)) { ++ memcpy(key, ukp->data, ukp->datalen); ++ ret = 0; ++ } else { ++ ret = -EINVAL; ++ } ++ up_read(&keyring_key->sem); ++ key_put(keyring_key); ++ ++ return ret; ++} ++#else ++#include ++#include ++ ++int bch2_request_key(struct bch_sb *sb, struct bch_key *key) ++{ ++ key_serial_t key_id; ++ char key_description[60]; ++ char uuid[40]; ++ ++ uuid_unparse_lower(sb->user_uuid.b, uuid); ++ sprintf(key_description, "bcachefs:%s", uuid); ++ ++ 
key_id = request_key("user", key_description, NULL, ++ KEY_SPEC_USER_KEYRING); ++ if (key_id < 0) ++ return -errno; ++ ++ if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) ++ return -1; ++ ++ return 0; ++} ++#endif ++ ++int bch2_decrypt_sb_key(struct bch_fs *c, ++ struct bch_sb_field_crypt *crypt, ++ struct bch_key *key) ++{ ++ struct bch_encrypted_key sb_key = crypt->key; ++ struct bch_key user_key; ++ int ret = 0; ++ ++ /* is key encrypted? */ ++ if (!bch2_key_is_encrypted(&sb_key)) ++ goto out; ++ ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %i", ret); ++ goto err; ++ } ++ ++ /* decrypt real key: */ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &sb_key, sizeof(sb_key)); ++ if (ret) ++ goto err; ++ ++ if (bch2_key_is_encrypted(&sb_key)) { ++ bch_err(c, "incorrect encryption key"); ++ ret = -EINVAL; ++ goto err; ++ } ++out: ++ *key = sb_key.key; ++err: ++ memzero_explicit(&sb_key, sizeof(sb_key)); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ return ret; ++} ++ ++static int bch2_alloc_ciphers(struct bch_fs *c) ++{ ++ if (!c->chacha20) ++ c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); ++ if (IS_ERR(c->chacha20)) { ++ bch_err(c, "error requesting chacha20 module: %li", ++ PTR_ERR(c->chacha20)); ++ return PTR_ERR(c->chacha20); ++ } ++ ++ if (!c->poly1305) ++ c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); ++ if (IS_ERR(c->poly1305)) { ++ bch_err(c, "error requesting poly1305 module: %li", ++ PTR_ERR(c->poly1305)); ++ return PTR_ERR(c->poly1305); ++ } ++ ++ return 0; ++} ++ ++int bch2_disable_encryption(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ /* is key encrypted? */ ++ ret = 0; ++ if (bch2_key_is_encrypted(&crypt->key)) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ crypt->key.magic = BCH_KEY_MAGIC; ++ crypt->key.key = key; ++ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_enable_encryption(struct bch_fs *c, bool keyed) ++{ ++ struct bch_encrypted_key key; ++ struct bch_key user_key; ++ struct bch_sb_field_crypt *crypt; ++ int ret = -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ ++ /* Do we already have an encryption key? 
*/ ++ if (bch2_sb_get_crypt(c->disk_sb.sb)) ++ goto err; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto err; ++ ++ key.magic = BCH_KEY_MAGIC; ++ get_random_bytes(&key.key, sizeof(key.key)); ++ ++ if (keyed) { ++ ret = bch2_request_key(c->disk_sb.sb, &user_key); ++ if (ret) { ++ bch_err(c, "error requesting encryption key: %i", ret); ++ goto err; ++ } ++ ++ ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), ++ &key, sizeof(key)); ++ if (ret) ++ goto err; ++ } ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto err; ++ ++ crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); ++ if (!crypt) { ++ ret = -ENOMEM; /* XXX this technically could be -ENOSPC */ ++ goto err; ++ } ++ ++ crypt->key = key; ++ ++ /* write superblock */ ++ SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); ++ bch2_write_super(c); ++err: ++ mutex_unlock(&c->sb_lock); ++ memzero_explicit(&user_key, sizeof(user_key)); ++ memzero_explicit(&key, sizeof(key)); ++ return ret; ++} ++ ++void bch2_fs_encryption_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->poly1305)) ++ crypto_free_shash(c->poly1305); ++ if (!IS_ERR_OR_NULL(c->chacha20)) ++ crypto_free_sync_skcipher(c->chacha20); ++ if (!IS_ERR_OR_NULL(c->sha256)) ++ crypto_free_shash(c->sha256); ++} ++ ++int bch2_fs_encryption_init(struct bch_fs *c) ++{ ++ struct bch_sb_field_crypt *crypt; ++ struct bch_key key; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->sha256 = crypto_alloc_shash("sha256", 0, 0); ++ if (IS_ERR(c->sha256)) { ++ bch_err(c, "error requesting sha256 module"); ++ ret = PTR_ERR(c->sha256); ++ goto out; ++ } ++ ++ crypt = bch2_sb_get_crypt(c->disk_sb.sb); ++ if (!crypt) ++ goto out; ++ ++ ret = bch2_alloc_ciphers(c); ++ if (ret) ++ goto out; ++ ++ ret = bch2_decrypt_sb_key(c, crypt, &key); ++ if (ret) ++ goto out; ++ ++ ret = crypto_skcipher_setkey(&c->chacha20->base, ++ (void *) &key.key, sizeof(key.key)); ++ if (ret) ++ goto out; ++out: ++ memzero_explicit(&key, sizeof(key)); ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} +diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h +new file mode 100644 +index 000000000000..24dee8039d57 +--- /dev/null ++++ b/fs/bcachefs/checksum.h +@@ -0,0 +1,202 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CHECKSUM_H ++#define _BCACHEFS_CHECKSUM_H ++ ++#include "bcachefs.h" ++#include "extents_types.h" ++#include "super-io.h" ++ ++#include ++#include ++ ++static inline bool bch2_checksum_mergeable(unsigned type) ++{ ++ ++ switch (type) { ++ case BCH_CSUM_NONE: ++ case BCH_CSUM_CRC32C: ++ case BCH_CSUM_CRC64: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, ++ struct bch_csum, size_t); ++ ++#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) ++#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) ++#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) ++#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) ++#define BCH_NONCE_POLY cpu_to_le32(1 << 31) ++ ++struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, ++ const void *, size_t); ++ ++/* ++ * This is used for various on disk data structures - bch_sb, prio_set, bset, ++ * jset: The checksum is _always_ the first field of these structs ++ */ ++#define csum_vstruct(_c, _type, _nonce, _i) \ ++({ \ ++ const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ ++ const void *end = vstruct_end(_i); \ ++ \ ++ bch2_checksum(_c, _type, _nonce, start, end - start); \ ++}) 
++ ++int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); ++int bch2_request_key(struct bch_sb *, struct bch_key *); ++ ++void bch2_encrypt(struct bch_fs *, unsigned, struct nonce, ++ void *data, size_t); ++ ++struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, ++ struct bch_extent_crc_unpacked, ++ struct bch_extent_crc_unpacked *, ++ struct bch_extent_crc_unpacked *, ++ unsigned, unsigned, unsigned); ++ ++void bch2_encrypt_bio(struct bch_fs *, unsigned, ++ struct nonce, struct bio *); ++ ++int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, ++ struct bch_key *); ++ ++int bch2_disable_encryption(struct bch_fs *); ++int bch2_enable_encryption(struct bch_fs *, bool); ++ ++void bch2_fs_encryption_exit(struct bch_fs *); ++int bch2_fs_encryption_init(struct bch_fs *); ++ ++static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, ++ bool data) ++{ ++ switch (type) { ++ case BCH_CSUM_OPT_NONE: ++ return BCH_CSUM_NONE; ++ case BCH_CSUM_OPT_CRC32C: ++ return data ? BCH_CSUM_CRC32C : BCH_CSUM_CRC32C_NONZERO; ++ case BCH_CSUM_OPT_CRC64: ++ return data ? BCH_CSUM_CRC64 : BCH_CSUM_CRC64_NONZERO; ++ default: ++ BUG(); ++ } ++} ++ ++static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, ++ unsigned opt) ++{ ++ if (c->sb.encryption_type) ++ return c->opts.wide_macs ++ ? BCH_CSUM_CHACHA20_POLY1305_128 ++ : BCH_CSUM_CHACHA20_POLY1305_80; ++ ++ return bch2_csum_opt_to_type(opt, true); ++} ++ ++static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) ++{ ++ if (c->sb.encryption_type) ++ return BCH_CSUM_CHACHA20_POLY1305_128; ++ ++ return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); ++} ++ ++static const unsigned bch2_compression_opt_to_type[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++static inline bool bch2_checksum_type_valid(const struct bch_fs *c, ++ unsigned type) ++{ ++ if (type >= BCH_CSUM_NR) ++ return false; ++ ++ if (bch2_csum_type_is_encryption(type) && !c->chacha20) ++ return false; ++ ++ return true; ++} ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) ++{ ++ /* ++ * XXX: need some way of preventing the compiler from optimizing this ++ * into a form that isn't constant time.. ++ */ ++ return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; ++} ++ ++/* for skipping ahead and encrypting/decrypting at an offset: */ ++static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) ++{ ++ EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); ++ ++ le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); ++ return nonce; ++} ++ ++static inline struct nonce null_nonce(void) ++{ ++ struct nonce ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ return ret; ++} ++ ++static inline struct nonce extent_nonce(struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ unsigned compression_type = crc_is_compressed(crc) ++ ? crc.compression_type ++ : 0; ++ unsigned size = compression_type ? 
crc.uncompressed_size : 0; ++ struct nonce nonce = (struct nonce) {{ ++ [0] = cpu_to_le32(size << 22), ++ [1] = cpu_to_le32(version.lo), ++ [2] = cpu_to_le32(version.lo >> 32), ++ [3] = cpu_to_le32(version.hi| ++ (compression_type << 24))^BCH_NONCE_EXTENT, ++ }}; ++ ++ return nonce_add(nonce, crc.nonce << 9); ++} ++ ++static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) ++{ ++ return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; ++} ++ ++static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) ++{ ++ __le64 magic = __bch2_sb_magic(sb); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) ++{ ++ __le64 magic = bch2_sb_magic(c); ++ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = 0, ++ [2] = ((__le32 *) &magic)[0], ++ [3] = ((__le32 *) &magic)[1], ++ }}; ++} ++ ++#endif /* _BCACHEFS_CHECKSUM_H */ +diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c +new file mode 100644 +index 000000000000..1d1590de55e8 +--- /dev/null ++++ b/fs/bcachefs/clock.c +@@ -0,0 +1,191 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "clock.h" ++ ++#include ++#include ++#include ++ ++static inline long io_timer_cmp(io_timer_heap *h, ++ struct io_timer *l, ++ struct io_timer *r) ++{ ++ return l->expire - r->expire; ++} ++ ++void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (time_after_eq((unsigned long) atomic_long_read(&clock->now), ++ timer->expire)) { ++ spin_unlock(&clock->timer_lock); ++ timer->fn(timer); ++ return; ++ } ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) ++ goto out; ++ ++ BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); ++out: ++ spin_unlock(&clock->timer_lock); ++} ++ ++void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) ++{ ++ size_t i; ++ ++ spin_lock(&clock->timer_lock); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ if (clock->timers.data[i] == timer) { ++ heap_del(&clock->timers, i, io_timer_cmp, NULL); ++ break; ++ } ++ ++ spin_unlock(&clock->timer_lock); ++} ++ ++struct io_clock_wait { ++ struct io_timer io_timer; ++ struct timer_list cpu_timer; ++ struct task_struct *task; ++ int expired; ++}; ++ ++static void io_clock_wait_fn(struct io_timer *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, io_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++static void io_clock_cpu_timeout(struct timer_list *timer) ++{ ++ struct io_clock_wait *wait = container_of(timer, ++ struct io_clock_wait, cpu_timer); ++ ++ wait->expired = 1; ++ wake_up_process(wait->task); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) ++{ ++ struct io_clock_wait wait; ++ ++ /* XXX: calculate sleep time rigorously */ ++ wait.io_timer.expire = until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ bch2_io_timer_add(clock, &wait.io_timer); ++ ++ schedule(); ++ ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++void bch2_kthread_io_clock_wait(struct io_clock *clock, ++ unsigned long io_until, ++ unsigned long cpu_timeout) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct io_clock_wait wait; ++ ++ wait.io_timer.expire = io_until; ++ wait.io_timer.fn = io_clock_wait_fn; ++ wait.task = current; ++ wait.expired = 0; ++ 
bch2_io_timer_add(clock, &wait.io_timer); ++ ++ timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); ++ ++ if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) ++ mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); ++ ++ while (1) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (kthread && kthread_should_stop()) ++ break; ++ ++ if (wait.expired) ++ break; ++ ++ schedule(); ++ try_to_freeze(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++ del_singleshot_timer_sync(&wait.cpu_timer); ++ destroy_timer_on_stack(&wait.cpu_timer); ++ bch2_io_timer_del(clock, &wait.io_timer); ++} ++ ++static struct io_timer *get_expired_timer(struct io_clock *clock, ++ unsigned long now) ++{ ++ struct io_timer *ret = NULL; ++ ++ spin_lock(&clock->timer_lock); ++ ++ if (clock->timers.used && ++ time_after_eq(now, clock->timers.data[0]->expire)) ++ heap_pop(&clock->timers, ret, io_timer_cmp, NULL); ++ ++ spin_unlock(&clock->timer_lock); ++ ++ return ret; ++} ++ ++void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) ++{ ++ struct io_timer *timer; ++ unsigned long now = atomic_long_add_return(sectors, &clock->now); ++ ++ while ((timer = get_expired_timer(clock, now))) ++ timer->fn(timer); ++} ++ ++void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) ++{ ++ unsigned long now; ++ unsigned i; ++ ++ spin_lock(&clock->timer_lock); ++ now = atomic_long_read(&clock->now); ++ ++ for (i = 0; i < clock->timers.used; i++) ++ pr_buf(out, "%ps:\t%li\n", ++ clock->timers.data[i]->fn, ++ clock->timers.data[i]->expire - now); ++ spin_unlock(&clock->timer_lock); ++} ++ ++void bch2_io_clock_exit(struct io_clock *clock) ++{ ++ free_heap(&clock->timers); ++ free_percpu(clock->pcpu_buf); ++} ++ ++int bch2_io_clock_init(struct io_clock *clock) ++{ ++ atomic_long_set(&clock->now, 0); ++ spin_lock_init(&clock->timer_lock); ++ ++ clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); ++ ++ clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); ++ if (!clock->pcpu_buf) ++ return -ENOMEM; ++ ++ if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h +new file mode 100644 +index 000000000000..70a0f7436c84 +--- /dev/null ++++ b/fs/bcachefs/clock.h +@@ -0,0 +1,38 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_H ++#define _BCACHEFS_CLOCK_H ++ ++void bch2_io_timer_add(struct io_clock *, struct io_timer *); ++void bch2_io_timer_del(struct io_clock *, struct io_timer *); ++void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, ++ unsigned long); ++ ++void __bch2_increment_clock(struct io_clock *, unsigned); ++ ++static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, ++ int rw) ++{ ++ struct io_clock *clock = &c->io_clock[rw]; ++ ++ if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= ++ IO_CLOCK_PCPU_SECTORS)) ++ __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); ++} ++ ++void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); ++ ++#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ ++({ \ ++ long __ret = timeout; \ ++ might_sleep(); \ ++ if (!___wait_cond_timeout(condition)) \ ++ __ret = __wait_event_timeout(wq, condition, timeout); \ ++ __ret; \ ++}) ++ ++void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); ++ ++void bch2_io_clock_exit(struct io_clock *); ++int bch2_io_clock_init(struct io_clock *); ++ ++#endif /* _BCACHEFS_CLOCK_H */ +diff --git a/fs/bcachefs/clock_types.h 
b/fs/bcachefs/clock_types.h +new file mode 100644 +index 000000000000..92c740a47565 +--- /dev/null ++++ b/fs/bcachefs/clock_types.h +@@ -0,0 +1,37 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_CLOCK_TYPES_H ++#define _BCACHEFS_CLOCK_TYPES_H ++ ++#include "util.h" ++ ++#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) ++ ++/* ++ * Clocks/timers in units of sectors of IO: ++ * ++ * Note - they use percpu batching, so they're only approximate. ++ */ ++ ++struct io_timer; ++typedef void (*io_timer_fn)(struct io_timer *); ++ ++struct io_timer { ++ io_timer_fn fn; ++ unsigned long expire; ++}; ++ ++/* Amount to buffer up on a percpu counter */ ++#define IO_CLOCK_PCPU_SECTORS 128 ++ ++typedef HEAP(struct io_timer *) io_timer_heap; ++ ++struct io_clock { ++ atomic_long_t now; ++ u16 __percpu *pcpu_buf; ++ unsigned max_slop; ++ ++ spinlock_t timer_lock; ++ io_timer_heap timers; ++}; ++ ++#endif /* _BCACHEFS_CLOCK_TYPES_H */ +diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c +new file mode 100644 +index 000000000000..b50d2b0d5fd3 +--- /dev/null ++++ b/fs/bcachefs/compress.c +@@ -0,0 +1,629 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "checksum.h" ++#include "compress.h" ++#include "extents.h" ++#include "io.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++ ++/* Bounce buffer: */ ++struct bbuf { ++ void *b; ++ enum { ++ BB_NONE, ++ BB_VMAP, ++ BB_KMALLOC, ++ BB_MEMPOOL, ++ } type; ++ int rw; ++}; ++ ++static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) ++{ ++ void *b; ++ ++ BUG_ON(size > c->sb.encoded_extent_max << 9); ++ ++ b = kmalloc(size, GFP_NOIO|__GFP_NOWARN); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; ++ ++ b = mempool_alloc(&c->compression_bounce[rw], GFP_NOIO); ++ if (b) ++ return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; ++ ++ BUG(); ++} ++ ++static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ void *expected_start = NULL; ++ ++ __bio_for_each_bvec(bv, bio, iter, start) { ++ if (expected_start && ++ expected_start != page_address(bv.bv_page) + bv.bv_offset) ++ return false; ++ ++ expected_start = page_address(bv.bv_page) + ++ bv.bv_offset + bv.bv_len; ++ } ++ ++ return true; ++} ++ ++static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, ++ struct bvec_iter start, int rw) ++{ ++ struct bbuf ret; ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ unsigned nr_pages = 0; ++ struct page *stack_pages[16]; ++ struct page **pages = NULL; ++ void *data; ++ ++ BUG_ON(bvec_iter_sectors(start) > c->sb.encoded_extent_max); ++ ++ if (!IS_ENABLED(CONFIG_HIGHMEM) && ++ bio_phys_contig(bio, start)) ++ return (struct bbuf) { ++ .b = page_address(bio_iter_page(bio, start)) + ++ bio_iter_offset(bio, start), ++ .type = BB_NONE, .rw = rw ++ }; ++ ++ /* check if we can map the pages contiguously: */ ++ __bio_for_each_segment(bv, bio, iter, start) { ++ if (iter.bi_size != start.bi_size && ++ bv.bv_offset) ++ goto bounce; ++ ++ if (bv.bv_len < iter.bi_size && ++ bv.bv_offset + bv.bv_len < PAGE_SIZE) ++ goto bounce; ++ ++ nr_pages++; ++ } ++ ++ BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); ++ ++ pages = nr_pages > ARRAY_SIZE(stack_pages) ++ ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOIO) ++ : stack_pages; ++ if (!pages) ++ goto bounce; ++ ++ nr_pages = 0; ++ __bio_for_each_segment(bv, bio, iter, start) ++ pages[nr_pages++] = bv.bv_page; ++ ++ data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); ++ if (pages != stack_pages) ++ kfree(pages); ++ ++ if (data) ++ return (struct bbuf) { ++ .b = data + bio_iter_offset(bio, start), ++ .type = BB_VMAP, .rw = rw ++ }; ++bounce: ++ ret = __bounce_alloc(c, start.bi_size, rw); ++ ++ if (rw == READ) ++ memcpy_from_bio(ret.b, bio, start); ++ ++ return ret; ++} ++ ++static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) ++{ ++ return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); ++} ++ ++static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) ++{ ++ switch (buf.type) { ++ case BB_NONE: ++ break; ++ case BB_VMAP: ++ vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); ++ break; ++ case BB_KMALLOC: ++ kfree(buf.b); ++ break; ++ case BB_MEMPOOL: ++ mempool_free(buf.b, &c->compression_bounce[buf.rw]); ++ break; ++ } ++} ++ ++static inline void zlib_set_workspace(z_stream *strm, void *workspace) ++{ ++#ifdef __KERNEL__ ++ strm->workspace = workspace; ++#endif ++} ++ ++static int __bio_uncompress(struct bch_fs *c, struct bio *src, ++ void *dst_data, struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf src_data = { NULL }; ++ size_t src_len = src->bi_iter.bi_size; ++ size_t dst_len = crc.uncompressed_size << 9; ++ void *workspace; ++ int ret; ++ ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ switch (crc.compression_type) { ++ case BCH_COMPRESSION_TYPE_lz4_old: ++ case BCH_COMPRESSION_TYPE_lz4: ++ ret = LZ4_decompress_safe_partial(src_data.b, dst_data, ++ src_len, dst_len, dst_len); ++ if (ret != dst_len) ++ goto err; ++ break; ++ case BCH_COMPRESSION_TYPE_gzip: { ++ z_stream strm = { ++ .next_in = src_data.b, ++ .avail_in = src_len, ++ .next_out = dst_data, ++ .avail_out = dst_len, ++ }; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_inflateInit2(&strm, -MAX_WBITS); ++ ret = zlib_inflate(&strm, Z_FINISH); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != Z_STREAM_END) ++ goto err; ++ break; ++ } ++ case BCH_COMPRESSION_TYPE_zstd: { ++ ZSTD_DCtx *ctx; ++ size_t real_src_len = le32_to_cpup(src_data.b); ++ ++ if (real_src_len > src_len - 4) ++ goto err; ++ ++ workspace = mempool_alloc(&c->decompress_workspace, GFP_NOIO); ++ ctx = ZSTD_initDCtx(workspace, ZSTD_DCtxWorkspaceBound()); ++ ++ ret = ZSTD_decompressDCtx(ctx, ++ dst_data, dst_len, ++ src_data.b + 4, real_src_len); ++ ++ mempool_free(workspace, &c->decompress_workspace); ++ ++ if (ret != dst_len) ++ goto err; ++ break; ++ } ++ default: ++ BUG(); ++ } ++ ret = 0; ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ return ret; ++err: ++ ret = -EIO; ++ goto out; ++} ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, ++ struct bch_extent_crc_unpacked *crc) ++{ ++ struct bbuf data = { NULL }; ++ size_t dst_len = crc->uncompressed_size << 9; ++ ++ /* bio must own its pages: */ ++ BUG_ON(!bio->bi_vcnt); ++ BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); ++ ++ if (crc->uncompressed_size > c->sb.encoded_extent_max || ++ crc->compressed_size > c->sb.encoded_extent_max) { ++ bch_err(c, "error rewriting existing data: extent too big"); ++ return -EIO; ++ } ++ ++ data = __bounce_alloc(c, dst_len, WRITE); ++ ++ if (__bio_uncompress(c, bio, data.b, *crc)) { ++ 
bch_err(c, "error rewriting existing data: decompression error"); ++ bio_unmap_or_unbounce(c, data); ++ return -EIO; ++ } ++ ++ /* ++ * XXX: don't have a good way to assert that the bio was allocated with ++ * enough space, we depend on bch2_move_extent doing the right thing ++ */ ++ bio->bi_iter.bi_size = crc->live_size << 9; ++ ++ memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); ++ ++ crc->csum_type = 0; ++ crc->compression_type = 0; ++ crc->compressed_size = crc->live_size; ++ crc->uncompressed_size = crc->live_size; ++ crc->offset = 0; ++ crc->csum = (struct bch_csum) { 0, 0 }; ++ ++ bio_unmap_or_unbounce(c, data); ++ return 0; ++} ++ ++int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, ++ struct bio *dst, struct bvec_iter dst_iter, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bbuf dst_data = { NULL }; ++ size_t dst_len = crc.uncompressed_size << 9; ++ int ret = -ENOMEM; ++ ++ if (crc.uncompressed_size > c->sb.encoded_extent_max || ++ crc.compressed_size > c->sb.encoded_extent_max) ++ return -EIO; ++ ++ dst_data = dst_len == dst_iter.bi_size ++ ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) ++ : __bounce_alloc(c, dst_len, WRITE); ++ ++ ret = __bio_uncompress(c, src, dst_data.b, crc); ++ if (ret) ++ goto err; ++ ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) ++ memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); ++err: ++ bio_unmap_or_unbounce(c, dst_data); ++ return ret; ++} ++ ++static int attempt_compress(struct bch_fs *c, ++ void *workspace, ++ void *dst, size_t dst_len, ++ void *src, size_t src_len, ++ enum bch_compression_type compression_type) ++{ ++ switch (compression_type) { ++ case BCH_COMPRESSION_TYPE_lz4: { ++ int len = src_len; ++ int ret = LZ4_compress_destSize( ++ src, dst, ++ &len, dst_len, ++ workspace); ++ ++ if (len < src_len) ++ return -len; ++ ++ return ret; ++ } ++ case BCH_COMPRESSION_TYPE_gzip: { ++ z_stream strm = { ++ .next_in = src, ++ .avail_in = src_len, ++ .next_out = dst, ++ .avail_out = dst_len, ++ }; ++ ++ zlib_set_workspace(&strm, workspace); ++ zlib_deflateInit2(&strm, Z_DEFAULT_COMPRESSION, ++ Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, ++ Z_DEFAULT_STRATEGY); ++ ++ if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) ++ return 0; ++ ++ if (zlib_deflateEnd(&strm) != Z_OK) ++ return 0; ++ ++ return strm.total_out; ++ } ++ case BCH_COMPRESSION_TYPE_zstd: { ++ ZSTD_CCtx *ctx = ZSTD_initCCtx(workspace, ++ ZSTD_CCtxWorkspaceBound(c->zstd_params.cParams)); ++ ++ size_t len = ZSTD_compressCCtx(ctx, ++ dst + 4, dst_len - 4, ++ src, src_len, ++ c->zstd_params); ++ if (ZSTD_isError(len)) ++ return 0; ++ ++ *((__le32 *) dst) = cpu_to_le32(len); ++ return len + 4; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static unsigned __bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ enum bch_compression_type compression_type) ++{ ++ struct bbuf src_data = { NULL }, dst_data = { NULL }; ++ void *workspace; ++ unsigned pad; ++ int ret = 0; ++ ++ BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); ++ BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); ++ ++ /* If it's only one block, don't bother trying to compress: */ ++ if (bio_sectors(src) <= c->opts.block_size) ++ return 0; ++ ++ dst_data = bio_map_or_bounce(c, dst, WRITE); ++ src_data = bio_map_or_bounce(c, src, READ); ++ ++ workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOIO); ++ ++ *src_len = src->bi_iter.bi_size; ++ *dst_len = dst->bi_iter.bi_size; ++ ++ /* ++ * XXX: 
this algorithm sucks when the compression code doesn't tell us ++ * how much would fit, like LZ4 does: ++ */ ++ while (1) { ++ if (*src_len <= block_bytes(c)) { ++ ret = -1; ++ break; ++ } ++ ++ ret = attempt_compress(c, workspace, ++ dst_data.b, *dst_len, ++ src_data.b, *src_len, ++ compression_type); ++ if (ret > 0) { ++ *dst_len = ret; ++ ret = 0; ++ break; ++ } ++ ++ /* Didn't fit: should we retry with a smaller amount? */ ++ if (*src_len <= *dst_len) { ++ ret = -1; ++ break; ++ } ++ ++ /* ++ * If ret is negative, it's a hint as to how much data would fit ++ */ ++ BUG_ON(-ret >= *src_len); ++ ++ if (ret < 0) ++ *src_len = -ret; ++ else ++ *src_len -= (*src_len - *dst_len) / 2; ++ *src_len = round_down(*src_len, block_bytes(c)); ++ } ++ ++ mempool_free(workspace, &c->compress_workspace[compression_type]); ++ ++ if (ret) ++ goto err; ++ ++ /* Didn't get smaller: */ ++ if (round_up(*dst_len, block_bytes(c)) >= *src_len) ++ goto err; ++ ++ pad = round_up(*dst_len, block_bytes(c)) - *dst_len; ++ ++ memset(dst_data.b + *dst_len, 0, pad); ++ *dst_len += pad; ++ ++ if (dst_data.type != BB_NONE && ++ dst_data.type != BB_VMAP) ++ memcpy_to_bio(dst, dst->bi_iter, dst_data.b); ++ ++ BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); ++ BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); ++ BUG_ON(*dst_len & (block_bytes(c) - 1)); ++ BUG_ON(*src_len & (block_bytes(c) - 1)); ++out: ++ bio_unmap_or_unbounce(c, src_data); ++ bio_unmap_or_unbounce(c, dst_data); ++ return compression_type; ++err: ++ compression_type = BCH_COMPRESSION_TYPE_incompressible; ++ goto out; ++} ++ ++unsigned bch2_bio_compress(struct bch_fs *c, ++ struct bio *dst, size_t *dst_len, ++ struct bio *src, size_t *src_len, ++ unsigned compression_type) ++{ ++ unsigned orig_dst = dst->bi_iter.bi_size; ++ unsigned orig_src = src->bi_iter.bi_size; ++ ++ /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ ++ src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, ++ c->sb.encoded_extent_max << 9); ++ /* Don't generate a bigger output than input: */ ++ dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ ++ if (compression_type == BCH_COMPRESSION_TYPE_lz4_old) ++ compression_type = BCH_COMPRESSION_TYPE_lz4; ++ ++ compression_type = ++ __bio_compress(c, dst, dst_len, src, src_len, compression_type); ++ ++ dst->bi_iter.bi_size = orig_dst; ++ src->bi_iter.bi_size = orig_src; ++ return compression_type; ++} ++ ++static int __bch2_fs_compress_init(struct bch_fs *, u64); ++ ++#define BCH_FEATURE_none 0 ++ ++static const unsigned bch2_compression_opt_to_feature[] = { ++#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, ++ BCH_COMPRESSION_OPTS() ++#undef x ++}; ++ ++#undef BCH_FEATURE_none ++ ++static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) ++{ ++ int ret = 0; ++ ++ if ((c->sb.features & f) == f) ++ return 0; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if ((c->sb.features & f) == f) { ++ mutex_unlock(&c->sb_lock); ++ return 0; ++ } ++ ++ ret = __bch2_fs_compress_init(c, c->sb.features|f); ++ if (ret) { ++ mutex_unlock(&c->sb_lock); ++ return ret; ++ } ++ ++ c->disk_sb.sb->features[0] |= cpu_to_le64(f); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *c, ++ unsigned compression_type) ++{ ++ BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); ++ ++ return compression_type ++ ? 
__bch2_check_set_has_compressed_data(c, ++ 1ULL << bch2_compression_opt_to_feature[compression_type]) ++ : 0; ++} ++ ++void bch2_fs_compress_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ mempool_exit(&c->decompress_workspace); ++ for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) ++ mempool_exit(&c->compress_workspace[i]); ++ mempool_exit(&c->compression_bounce[WRITE]); ++ mempool_exit(&c->compression_bounce[READ]); ++} ++ ++static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) ++{ ++ size_t max_extent = c->sb.encoded_extent_max << 9; ++ size_t decompress_workspace_size = 0; ++ bool decompress_workspace_needed; ++ ZSTD_parameters params = ZSTD_getParams(0, max_extent, 0); ++ struct { ++ unsigned feature; ++ unsigned type; ++ size_t compress_workspace; ++ size_t decompress_workspace; ++ } compression_types[] = { ++ { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, LZ4_MEM_COMPRESS, 0 }, ++ { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, ++ zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), ++ zlib_inflate_workspacesize(), }, ++ { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, ++ ZSTD_CCtxWorkspaceBound(params.cParams), ++ ZSTD_DCtxWorkspaceBound() }, ++ }, *i; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ c->zstd_params = params; ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) ++ if (features & (1 << i->feature)) ++ goto have_compressed; ++ ++ goto out; ++have_compressed: ++ ++ if (!mempool_initialized(&c->compression_bounce[READ])) { ++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], ++ 1, max_extent); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->compression_bounce[WRITE])) { ++ ret = mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], ++ 1, max_extent); ++ if (ret) ++ goto out; ++ } ++ ++ for (i = compression_types; ++ i < compression_types + ARRAY_SIZE(compression_types); ++ i++) { ++ decompress_workspace_size = ++ max(decompress_workspace_size, i->decompress_workspace); ++ ++ if (!(features & (1 << i->feature))) ++ continue; ++ ++ if (i->decompress_workspace) ++ decompress_workspace_needed = true; ++ ++ if (mempool_initialized(&c->compress_workspace[i->type])) ++ continue; ++ ++ ret = mempool_init_kvpmalloc_pool( ++ &c->compress_workspace[i->type], ++ 1, i->compress_workspace); ++ if (ret) ++ goto out; ++ } ++ ++ if (!mempool_initialized(&c->decompress_workspace)) { ++ ret = mempool_init_kvpmalloc_pool( ++ &c->decompress_workspace, ++ 1, decompress_workspace_size); ++ if (ret) ++ goto out; ++ } ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++int bch2_fs_compress_init(struct bch_fs *c) ++{ ++ u64 f = c->sb.features; ++ ++ if (c->opts.compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.compression]; ++ ++ if (c->opts.background_compression) ++ f |= 1ULL << bch2_compression_opt_to_feature[c->opts.background_compression]; ++ ++ return __bch2_fs_compress_init(c, f); ++ ++} +diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h +new file mode 100644 +index 000000000000..4bab1f61b3b5 +--- /dev/null ++++ b/fs/bcachefs/compress.h +@@ -0,0 +1,18 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_COMPRESS_H ++#define _BCACHEFS_COMPRESS_H ++ ++#include "extents_types.h" ++ ++int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, ++ struct bch_extent_crc_unpacked *); ++int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, ++ struct bvec_iter, struct bch_extent_crc_unpacked); ++unsigned 
bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, ++ struct bio *, size_t *, unsigned); ++ ++int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); ++void bch2_fs_compress_exit(struct bch_fs *); ++int bch2_fs_compress_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_COMPRESS_H */ +diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c +new file mode 100644 +index 000000000000..aa10591a3b1a +--- /dev/null ++++ b/fs/bcachefs/debug.c +@@ -0,0 +1,432 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Assorted bcachefs debug code ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "debug.h" ++#include "error.h" ++#include "extents.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "super.h" ++ ++#include ++#include ++#include ++#include ++#include ++ ++static struct dentry *bch_debug; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++void __bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ struct btree *v = c->verify_data; ++ struct btree_node *n_ondisk, *n_sorted, *n_inmemory; ++ struct bset *sorted, *inmemory; ++ struct extent_ptr_decoded pick; ++ struct bch_dev *ca; ++ struct bio *bio; ++ ++ if (c->opts.nochanges) ++ return; ++ ++ btree_node_io_lock(b); ++ mutex_lock(&c->verify_lock); ++ ++ n_ondisk = c->verify_ondisk; ++ n_sorted = c->verify_data->data; ++ n_inmemory = b->data; ++ ++ bkey_copy(&v->key, &b->key); ++ v->written = 0; ++ v->c.level = b->c.level; ++ v->c.btree_id = b->c.btree_id; ++ bch2_btree_keys_init(v, &c->expensive_debug_checks); ++ ++ if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), ++ NULL, &pick) <= 0) ++ return; ++ ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ if (!bch2_dev_get_ioref(ca, READ)) ++ return; ++ ++ bio = bio_alloc_bioset(GFP_NOIO, ++ buf_pages(n_sorted, btree_bytes(c)), ++ &c->btree_bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_opf = REQ_OP_READ|REQ_META; ++ bio->bi_iter.bi_sector = pick.ptr.offset; ++ bch2_bio_map(bio, n_sorted, btree_bytes(c)); ++ ++ submit_bio_wait(bio); ++ ++ bio_put(bio); ++ percpu_ref_put(&ca->io_ref); ++ ++ memcpy(n_ondisk, n_sorted, btree_bytes(c)); ++ ++ if (bch2_btree_node_read_done(c, v, false)) ++ goto out; ++ ++ n_sorted = c->verify_data->data; ++ sorted = &n_sorted->keys; ++ inmemory = &n_inmemory->keys; ++ ++ if (inmemory->u64s != sorted->u64s || ++ memcmp(inmemory->start, ++ sorted->start, ++ vstruct_end(inmemory) - (void *) inmemory->start)) { ++ unsigned offset = 0, sectors; ++ struct bset *i; ++ unsigned j; ++ ++ console_lock(); ++ ++ printk(KERN_ERR "*** in memory:\n"); ++ bch2_dump_bset(c, b, inmemory, 0); ++ ++ printk(KERN_ERR "*** read back in:\n"); ++ bch2_dump_bset(c, v, sorted, 0); ++ ++ while (offset < b->written) { ++ if (!offset ) { ++ i = &n_ondisk->keys; ++ sectors = vstruct_blocks(n_ondisk, c->block_bits) << ++ c->block_bits; ++ } else { ++ struct btree_node_entry *bne = ++ (void *) n_ondisk + (offset << 9); ++ i = &bne->keys; ++ ++ sectors = vstruct_blocks(bne, c->block_bits) << ++ c->block_bits; ++ } ++ ++ printk(KERN_ERR "*** on disk block %u:\n", offset); ++ bch2_dump_bset(c, b, i, offset); ++ ++ offset += sectors; ++ } ++ ++ printk(KERN_ERR "*** block %u/%u not written\n", ++ offset >> c->block_bits, btree_blocks(c)); ++ ++ for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) ++ if (inmemory->_data[j] != sorted->_data[j]) 
++ break; ++ ++ printk(KERN_ERR "b->written %u\n", b->written); ++ ++ console_unlock(); ++ panic("verify failed at %u\n", j); ++ } ++out: ++ mutex_unlock(&c->verify_lock); ++ btree_node_io_unlock(b); ++} ++ ++#endif ++ ++#ifdef CONFIG_DEBUG_FS ++ ++/* XXX: bch_fs refcounting */ ++ ++struct dump_iter { ++ struct bpos from; ++ struct bch_fs *c; ++ enum btree_id id; ++ ++ char buf[PAGE_SIZE]; ++ size_t bytes; /* what's currently in buf */ ++ ++ char __user *ubuf; /* destination user buffer */ ++ size_t size; /* size of requested read */ ++ ssize_t ret; /* bytes read so far */ ++}; ++ ++static int flush_buf(struct dump_iter *i) ++{ ++ if (i->bytes) { ++ size_t bytes = min(i->bytes, i->size); ++ int err = copy_to_user(i->ubuf, i->buf, bytes); ++ ++ if (err) ++ return err; ++ ++ i->ret += bytes; ++ i->ubuf += bytes; ++ i->size -= bytes; ++ i->bytes -= bytes; ++ memmove(i->buf, i->buf + bytes, i->bytes); ++ } ++ ++ return 0; ++} ++ ++static int bch2_dump_open(struct inode *inode, struct file *file) ++{ ++ struct btree_debug *bd = inode->i_private; ++ struct dump_iter *i; ++ ++ i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); ++ if (!i) ++ return -ENOMEM; ++ ++ file->private_data = i; ++ i->from = POS_MIN; ++ i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); ++ i->id = bd->id; ++ ++ return 0; ++} ++ ++static int bch2_dump_release(struct inode *inode, struct file *file) ++{ ++ kfree(file->private_data); ++ return 0; ++} ++ ++static ssize_t bch2_read_btree(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); ++ k = bch2_btree_iter_peek(iter); ++ ++ while (k.k && !(err = bkey_err(k))) { ++ bch2_bkey_val_to_text(&PBUF(i->buf), i->c, k); ++ i->bytes = strlen(i->buf); ++ BUG_ON(i->bytes >= PAGE_SIZE); ++ i->buf[i->bytes] = '\n'; ++ i->bytes++; ++ ++ k = bch2_btree_iter_next(iter); ++ i->from = iter->pos; ++ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? err : i->ret; ++} ++ ++static const struct file_operations btree_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree, ++}; ++ ++static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size || !bkey_cmp(POS_MAX, i->from)) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ for_each_btree_node(&trans, iter, i->id, i->from, 0, b) { ++ bch2_btree_node_to_text(&PBUF(i->buf), i->c, b); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ /* ++ * can't easily correctly restart a btree node traversal across ++ * all nodes, meh ++ */ ++ i->from = bkey_cmp(POS_MAX, b->key.k.p) ++ ? bkey_successor(b->key.k.p) ++ : b->key.k.p; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? 
err : i->ret; ++} ++ ++static const struct file_operations btree_format_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_btree_formats, ++}; ++ ++static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, ++ size_t size, loff_t *ppos) ++{ ++ struct dump_iter *i = file->private_data; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct btree *prev_node = NULL; ++ int err; ++ ++ i->ubuf = buf; ++ i->size = size; ++ i->ret = 0; ++ ++ err = flush_buf(i); ++ if (err) ++ return err; ++ ++ if (!i->size) ++ return i->ret; ++ ++ bch2_trans_init(&trans, i->c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, i->id, i->from, BTREE_ITER_PREFETCH); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(err = bkey_err(k))) { ++ struct btree_iter_level *l = &iter->l[0]; ++ struct bkey_packed *_k = ++ bch2_btree_node_iter_peek(&l->iter, l->b); ++ ++ if (l->b != prev_node) { ++ bch2_btree_node_to_text(&PBUF(i->buf), i->c, l->b); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ } ++ prev_node = l->b; ++ ++ bch2_bfloat_to_text(&PBUF(i->buf), l->b, _k); ++ i->bytes = strlen(i->buf); ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ bch2_btree_iter_next(iter); ++ i->from = iter->pos; ++ ++ err = flush_buf(i); ++ if (err) ++ break; ++ ++ if (!i->size) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ return err < 0 ? err : i->ret; ++} ++ ++static const struct file_operations bfloat_failed_debug_ops = { ++ .owner = THIS_MODULE, ++ .open = bch2_dump_open, ++ .release = bch2_dump_release, ++ .read = bch2_read_bfloat_failed, ++}; ++ ++void bch2_fs_debug_exit(struct bch_fs *c) ++{ ++ if (!IS_ERR_OR_NULL(c->debug)) ++ debugfs_remove_recursive(c->debug); ++} ++ ++void bch2_fs_debug_init(struct bch_fs *c) ++{ ++ struct btree_debug *bd; ++ char name[100]; ++ ++ if (IS_ERR_OR_NULL(bch_debug)) ++ return; ++ ++ snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); ++ c->debug = debugfs_create_dir(name, bch_debug); ++ if (IS_ERR_OR_NULL(c->debug)) ++ return; ++ ++ for (bd = c->btree_debug; ++ bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); ++ bd++) { ++ bd->id = bd - c->btree_debug; ++ bd->btree = debugfs_create_file(bch2_btree_ids[bd->id], ++ 0400, c->debug, bd, ++ &btree_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-formats", ++ bch2_btree_ids[bd->id]); ++ ++ bd->btree_format = debugfs_create_file(name, 0400, c->debug, bd, ++ &btree_format_debug_ops); ++ ++ snprintf(name, sizeof(name), "%s-bfloat-failed", ++ bch2_btree_ids[bd->id]); ++ ++ bd->failed = debugfs_create_file(name, 0400, c->debug, bd, ++ &bfloat_failed_debug_ops); ++ } ++} ++ ++#endif ++ ++void bch2_debug_exit(void) ++{ ++ if (!IS_ERR_OR_NULL(bch_debug)) ++ debugfs_remove_recursive(bch_debug); ++} ++ ++int __init bch2_debug_init(void) ++{ ++ int ret = 0; ++ ++ bch_debug = debugfs_create_dir("bcachefs", NULL); ++ return ret; ++} +diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h +new file mode 100644 +index 000000000000..56c2d1ab5f63 +--- /dev/null ++++ b/fs/bcachefs/debug.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DEBUG_H ++#define _BCACHEFS_DEBUG_H ++ ++#include "bcachefs.h" ++ ++struct bio; ++struct btree; ++struct bch_fs; ++ ++#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) \ ++ { return bch2_##name 
|| c->name; } ++BCH_DEBUG_PARAMS_ALWAYS() ++#undef BCH_DEBUG_PARAM ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) \ ++ { return bch2_##name || c->name; } ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++ ++void __bch2_btree_verify(struct bch_fs *, struct btree *); ++ ++#define bypass_torture_test(d) ((d)->bypass_torture_test) ++ ++#else /* DEBUG */ ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ static inline bool name(struct bch_fs *c) { return false; } ++BCH_DEBUG_PARAMS_DEBUG() ++#undef BCH_DEBUG_PARAM ++ ++static inline void __bch2_btree_verify(struct bch_fs *c, struct btree *b) {} ++ ++#define bypass_torture_test(d) 0 ++ ++#endif ++ ++static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) ++{ ++ if (verify_btree_ondisk(c)) ++ __bch2_btree_verify(c, b); ++} ++ ++#ifdef CONFIG_DEBUG_FS ++void bch2_fs_debug_exit(struct bch_fs *); ++void bch2_fs_debug_init(struct bch_fs *); ++#else ++static inline void bch2_fs_debug_exit(struct bch_fs *c) {} ++static inline void bch2_fs_debug_init(struct bch_fs *c) {} ++#endif ++ ++void bch2_debug_exit(void); ++int bch2_debug_init(void); ++ ++#endif /* _BCACHEFS_DEBUG_H */ +diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c +new file mode 100644 +index 000000000000..f34bfda8ab0d +--- /dev/null ++++ b/fs/bcachefs/dirent.c +@@ -0,0 +1,385 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "dirent.h" ++#include "fs.h" ++#include "keylist.h" ++#include "str_hash.h" ++ ++#include ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) ++{ ++ unsigned len = bkey_val_bytes(d.k) - ++ offsetof(struct bch_dirent, d_name); ++ ++ return strnlen(d.v->d_name, len); ++} ++ ++static u64 bch2_dirent_hash(const struct bch_hash_info *info, ++ const struct qstr *name) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, name->name, name->len); ++ ++ /* [0,2) reserved for dots */ ++ return max_t(u64, bch2_str_hash_end(&ctx, info), 2); ++} ++ ++static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_dirent_hash(info, key); ++} ++ ++static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); ++ ++ return bch2_dirent_hash(info, &name); ++} ++ ++static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ int len = bch2_dirent_name_bytes(l); ++ const struct qstr *r = _r; ++ ++ return len - r->len ?: memcmp(l.v->d_name, r->name, len); ++} ++ ++static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); ++ struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); ++ int l_len = bch2_dirent_name_bytes(l); ++ int r_len = bch2_dirent_name_bytes(r); ++ ++ return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); ++} ++ ++const struct bch_hash_desc bch2_dirent_hash_desc = { ++ .btree_id = BTREE_ID_DIRENTS, ++ .key_type = KEY_TYPE_dirent, ++ .hash_key = dirent_hash_key, ++ .hash_bkey = dirent_hash_bkey, ++ .cmp_key = dirent_cmp_key, ++ .cmp_bkey = dirent_cmp_bkey, ++}; ++ ++const char *bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = 
bkey_s_c_to_dirent(k); ++ unsigned len; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_dirent)) ++ return "value too small"; ++ ++ len = bch2_dirent_name_bytes(d); ++ if (!len) ++ return "empty name"; ++ ++ /* ++ * older versions of bcachefs were buggy and creating dirent ++ * keys that were bigger than necessary: ++ */ ++ if (bkey_val_u64s(k.k) > dirent_val_u64s(len + 7)) ++ return "value too big"; ++ ++ if (len > BCH_NAME_MAX) ++ return "dirent name too big"; ++ ++ return NULL; ++} ++ ++void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); ++ ++ bch_scnmemcpy(out, d.v->d_name, ++ bch2_dirent_name_bytes(d)); ++ pr_buf(out, " -> %llu type %u", d.v->d_inum, d.v->d_type); ++} ++ ++static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, ++ u8 type, const struct qstr *name, u64 dst) ++{ ++ struct bkey_i_dirent *dirent; ++ unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); ++ ++ if (name->len > BCH_NAME_MAX) ++ return ERR_PTR(-ENAMETOOLONG); ++ ++ BUG_ON(u64s > U8_MAX); ++ ++ dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(dirent)) ++ return dirent; ++ ++ bkey_dirent_init(&dirent->k_i); ++ dirent->k.u64s = u64s; ++ dirent->v.d_inum = cpu_to_le64(dst); ++ dirent->v.d_type = type; ++ ++ memcpy(dirent->v.d_name, name->name, name->len); ++ memset(dirent->v.d_name + name->len, 0, ++ bkey_val_bytes(&dirent->k) - ++ offsetof(struct bch_dirent, d_name) - ++ name->len); ++ ++ EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); ++ ++ return dirent; ++} ++ ++int bch2_dirent_create(struct btree_trans *trans, ++ u64 dir_inum, const struct bch_hash_info *hash_info, ++ u8 type, const struct qstr *name, u64 dst_inum, ++ int flags) ++{ ++ struct bkey_i_dirent *dirent; ++ int ret; ++ ++ dirent = dirent_create_key(trans, type, name, dst_inum); ++ ret = PTR_ERR_OR_ZERO(dirent); ++ if (ret) ++ return ret; ++ ++ return bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, ++ dir_inum, &dirent->k_i, flags); ++} ++ ++static void dirent_copy_target(struct bkey_i_dirent *dst, ++ struct bkey_s_c_dirent src) ++{ ++ dst->v.d_inum = src.v->d_inum; ++ dst->v.d_type = src.v->d_type; ++} ++ ++int bch2_dirent_rename(struct btree_trans *trans, ++ u64 src_dir, struct bch_hash_info *src_hash, ++ u64 dst_dir, struct bch_hash_info *dst_hash, ++ const struct qstr *src_name, u64 *src_inum, ++ const struct qstr *dst_name, u64 *dst_inum, ++ enum bch_rename_mode mode) ++{ ++ struct btree_iter *src_iter = NULL, *dst_iter = NULL; ++ struct bkey_s_c old_src, old_dst; ++ struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; ++ struct bpos dst_pos = ++ POS(dst_dir, bch2_dirent_hash(dst_hash, dst_name)); ++ int ret = 0; ++ ++ *src_inum = *dst_inum = 0; ++ ++ /* ++ * Lookup dst: ++ * ++ * Note that in BCH_RENAME mode, we're _not_ checking if ++ * the target already exists - we're relying on the VFS ++ * to do that check for us for correctness: ++ */ ++ dst_iter = mode == BCH_RENAME ++ ? 
bch2_hash_hole(trans, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name) ++ : bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ dst_hash, dst_dir, dst_name, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dst_iter); ++ if (ret) ++ goto out; ++ ++ old_dst = bch2_btree_iter_peek_slot(dst_iter); ++ ++ if (mode != BCH_RENAME) ++ *dst_inum = le64_to_cpu(bkey_s_c_to_dirent(old_dst).v->d_inum); ++ ++ /* Lookup src: */ ++ src_iter = bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ src_hash, src_dir, src_name, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(src_iter); ++ if (ret) ++ goto out; ++ ++ old_src = bch2_btree_iter_peek_slot(src_iter); ++ *src_inum = le64_to_cpu(bkey_s_c_to_dirent(old_src).v->d_inum); ++ ++ /* Create new dst key: */ ++ new_dst = dirent_create_key(trans, 0, dst_name, 0); ++ ret = PTR_ERR_OR_ZERO(new_dst); ++ if (ret) ++ goto out; ++ ++ dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); ++ new_dst->k.p = dst_iter->pos; ++ ++ /* Create new src key: */ ++ if (mode == BCH_RENAME_EXCHANGE) { ++ new_src = dirent_create_key(trans, 0, src_name, 0); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ ++ dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); ++ new_src->k.p = src_iter->pos; ++ } else { ++ new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ++ ret = PTR_ERR_OR_ZERO(new_src); ++ if (ret) ++ goto out; ++ ++ bkey_init(&new_src->k); ++ new_src->k.p = src_iter->pos; ++ ++ if (bkey_cmp(dst_pos, src_iter->pos) <= 0 && ++ bkey_cmp(src_iter->pos, dst_iter->pos) < 0) { ++ /* ++ * We have a hash collision for the new dst key, ++ * and new_src - the key we're deleting - is between ++ * new_dst's hashed slot and the slot we're going to be ++ * inserting it into - oops. This will break the hash ++ * table if we don't deal with it: ++ */ ++ if (mode == BCH_RENAME) { ++ /* ++ * If we're not overwriting, we can just insert ++ * new_dst at the src position: ++ */ ++ new_dst->k.p = src_iter->pos; ++ bch2_trans_update(trans, src_iter, ++ &new_dst->k_i, 0); ++ goto out; ++ } else { ++ /* If we're overwriting, we can't insert new_dst ++ * at a different slot because it has to ++ * overwrite old_dst - just make sure to use a ++ * whiteout when deleting src: ++ */ ++ new_src->k.type = KEY_TYPE_whiteout; ++ } ++ } else { ++ /* Check if we need a whiteout to delete src: */ ++ ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, ++ src_hash, src_iter); ++ if (ret < 0) ++ goto out; ++ ++ if (ret) ++ new_src->k.type = KEY_TYPE_whiteout; ++ } ++ } ++ ++ bch2_trans_update(trans, src_iter, &new_src->k_i, 0); ++ bch2_trans_update(trans, dst_iter, &new_dst->k_i, 0); ++out: ++ bch2_trans_iter_put(trans, src_iter); ++ bch2_trans_iter_put(trans, dst_iter); ++ return ret; ++} ++ ++int bch2_dirent_delete_at(struct btree_trans *trans, ++ const struct bch_hash_info *hash_info, ++ struct btree_iter *iter) ++{ ++ return bch2_hash_delete_at(trans, bch2_dirent_hash_desc, ++ hash_info, iter); ++} ++ ++struct btree_iter * ++__bch2_dirent_lookup_trans(struct btree_trans *trans, u64 dir_inum, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name, unsigned flags) ++{ ++ return bch2_hash_lookup(trans, bch2_dirent_hash_desc, ++ hash_info, dir_inum, name, flags); ++} ++ ++u64 bch2_dirent_lookup(struct bch_fs *c, u64 dir_inum, ++ const struct bch_hash_info *hash_info, ++ const struct qstr *name) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 inum = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = 
__bch2_dirent_lookup_trans(&trans, dir_inum, ++ hash_info, name, 0); ++ if (IS_ERR(iter)) { ++ BUG_ON(PTR_ERR(iter) == -EINTR); ++ goto out; ++ } ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); ++out: ++ bch2_trans_exit(&trans); ++ return inum; ++} ++ ++int bch2_empty_dir_trans(struct btree_trans *trans, u64 dir_inum) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_DIRENTS, ++ POS(dir_inum, 0), 0, k, ret) { ++ if (k.k->p.inode > dir_inum) ++ break; ++ ++ if (k.k->type == KEY_TYPE_dirent) { ++ ret = -ENOTEMPTY; ++ break; ++ } ++ } ++ bch2_trans_iter_put(trans, iter); ++ ++ return ret; ++} ++ ++int bch2_readdir(struct bch_fs *c, u64 inum, struct dir_context *ctx) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent dirent; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, ++ POS(inum, ctx->pos), 0, k, ret) { ++ if (k.k->p.inode > inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ dirent = bkey_s_c_to_dirent(k); ++ ++ /* ++ * XXX: dir_emit() can fault and block, while we're holding ++ * locks ++ */ ++ ctx->pos = dirent.k->p.offset; ++ if (!dir_emit(ctx, dirent.v->d_name, ++ bch2_dirent_name_bytes(dirent), ++ le64_to_cpu(dirent.v->d_inum), ++ dirent.v->d_type)) ++ break; ++ ctx->pos = dirent.k->p.offset + 1; ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ return ret; ++} +diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h +new file mode 100644 +index 000000000000..34769371dd13 +--- /dev/null ++++ b/fs/bcachefs/dirent.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DIRENT_H ++#define _BCACHEFS_DIRENT_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_dirent_hash_desc; ++ ++const char *bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_dirent (struct bkey_ops) { \ ++ .key_invalid = bch2_dirent_invalid, \ ++ .val_to_text = bch2_dirent_to_text, \ ++} ++ ++struct qstr; ++struct file; ++struct dir_context; ++struct bch_fs; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); ++ ++static inline unsigned dirent_val_u64s(unsigned len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, ++ sizeof(u64)); ++} ++ ++int bch2_dirent_create(struct btree_trans *, u64, ++ const struct bch_hash_info *, u8, ++ const struct qstr *, u64, int); ++ ++int bch2_dirent_delete_at(struct btree_trans *, ++ const struct bch_hash_info *, ++ struct btree_iter *); ++ ++enum bch_rename_mode { ++ BCH_RENAME, ++ BCH_RENAME_OVERWRITE, ++ BCH_RENAME_EXCHANGE, ++}; ++ ++int bch2_dirent_rename(struct btree_trans *, ++ u64, struct bch_hash_info *, ++ u64, struct bch_hash_info *, ++ const struct qstr *, u64 *, ++ const struct qstr *, u64 *, ++ enum bch_rename_mode); ++ ++struct btree_iter * ++__bch2_dirent_lookup_trans(struct btree_trans *, u64, ++ const struct bch_hash_info *, ++ const struct qstr *, unsigned); ++u64 bch2_dirent_lookup(struct bch_fs *, u64, const struct bch_hash_info *, ++ const struct qstr *); ++ ++int bch2_empty_dir_trans(struct btree_trans *, u64); ++int bch2_readdir(struct bch_fs *, u64, struct dir_context *); ++ ++#endif /* _BCACHEFS_DIRENT_H */ +diff --git a/fs/bcachefs/disk_groups.c 
b/fs/bcachefs/disk_groups.c +new file mode 100644 +index 000000000000..c52b6faac9b4 +--- /dev/null ++++ b/fs/bcachefs/disk_groups.c +@@ -0,0 +1,486 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "disk_groups.h" ++#include "super-io.h" ++ ++#include ++ ++static int group_cmp(const void *_l, const void *_r) ++{ ++ const struct bch_disk_group *l = _l; ++ const struct bch_disk_group *r = _r; ++ ++ return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - ++ (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: ++ ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - ++ (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: ++ strncmp(l->label, r->label, sizeof(l->label)); ++} ++ ++static const char *bch2_sb_disk_groups_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g, *sorted = NULL; ++ struct bch_sb_field_members *mi; ++ struct bch_member *m; ++ unsigned i, nr_groups, len; ++ const char *err = NULL; ++ ++ mi = bch2_sb_get_members(sb); ++ groups = bch2_sb_get_disk_groups(sb); ++ nr_groups = disk_groups_nr(groups); ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) { ++ unsigned g; ++ ++ if (!BCH_MEMBER_GROUP(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m) - 1; ++ ++ if (g >= nr_groups || ++ BCH_GROUP_DELETED(&groups->entries[g])) ++ return "disk has invalid group"; ++ } ++ ++ if (!nr_groups) ++ return NULL; ++ ++ for (g = groups->entries; ++ g < groups->entries + nr_groups; ++ g++) { ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ len = strnlen(g->label, sizeof(g->label)); ++ if (!len) { ++ err = "group with empty label"; ++ goto err; ++ } ++ } ++ ++ sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); ++ if (!sorted) ++ return "cannot allocate memory"; ++ ++ memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); ++ sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); ++ ++ for (i = 0; i + 1 < nr_groups; i++) ++ if (!BCH_GROUP_DELETED(sorted + i) && ++ !group_cmp(sorted + i, sorted + i + 1)) { ++ err = "duplicate groups"; ++ goto err; ++ } ++ ++ err = NULL; ++err: ++ kfree(sorted); ++ return err; ++} ++ ++static void bch2_sb_disk_groups_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ field_to_type(f, disk_groups); ++ struct bch_disk_group *g; ++ unsigned nr_groups = disk_groups_nr(groups); ++ ++ for (g = groups->entries; ++ g < groups->entries + nr_groups; ++ g++) { ++ if (g != groups->entries) ++ pr_buf(out, " "); ++ ++ if (BCH_GROUP_DELETED(g)) ++ pr_buf(out, "[deleted]"); ++ else ++ pr_buf(out, "[parent %llu name %s]", ++ BCH_GROUP_PARENT(g), g->label); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { ++ .validate = bch2_sb_disk_groups_validate, ++ .to_text = bch2_sb_disk_groups_to_text ++}; ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_sb_field_disk_groups *groups; ++ struct bch_disk_groups_cpu *cpu_g, *old_g; ++ unsigned i, g, nr_groups; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ groups = bch2_sb_get_disk_groups(c->disk_sb.sb); ++ nr_groups = disk_groups_nr(groups); ++ ++ if (!groups) ++ return 0; ++ ++ cpu_g = kzalloc(sizeof(*cpu_g) + ++ sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); ++ if (!cpu_g) ++ return -ENOMEM; ++ ++ cpu_g->nr = nr_groups; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct 
bch_disk_group *src = &groups->entries[i]; ++ struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; ++ ++ dst->deleted = BCH_GROUP_DELETED(src); ++ dst->parent = BCH_GROUP_PARENT(src); ++ } ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ struct bch_member *m = mi->members + i; ++ struct bch_disk_group_cpu *dst = ++ &cpu_g->entries[BCH_MEMBER_GROUP(m)]; ++ ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ g = BCH_MEMBER_GROUP(m); ++ while (g) { ++ dst = &cpu_g->entries[g - 1]; ++ __set_bit(i, dst->devs.d); ++ g = dst->parent; ++ } ++ } ++ ++ old_g = rcu_dereference_protected(c->disk_groups, ++ lockdep_is_held(&c->sb_lock)); ++ rcu_assign_pointer(c->disk_groups, cpu_g); ++ if (old_g) ++ kfree_rcu(old_g, rcu); ++ ++ return 0; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return NULL; ++ case TARGET_DEV: { ++ struct bch_dev *ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ return ca ? &ca->self : NULL; ++ } ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); ++ ++ return g && t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) ++{ ++ struct target t = target_decode(target); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ return false; ++ case TARGET_DEV: ++ return dev == t.dev; ++ case TARGET_GROUP: { ++ struct bch_disk_groups_cpu *g; ++ const struct bch_devs_mask *m; ++ bool ret; ++ ++ rcu_read_lock(); ++ g = rcu_dereference(c->disk_groups); ++ m = g && t.group < g->nr && !g->entries[t.group].deleted ++ ? &g->entries[t.group].devs ++ : NULL; ++ ++ ret = m ? 
test_bit(dev, m->d) : false; ++ rcu_read_unlock(); ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++} ++ ++static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, ++ unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; i < nr_groups; i++) { ++ struct bch_disk_group *g = groups->entries + i; ++ ++ if (BCH_GROUP_DELETED(g)) ++ continue; ++ ++ if (!BCH_GROUP_DELETED(g) && ++ BCH_GROUP_PARENT(g) == parent && ++ strnlen(g->label, sizeof(g->label)) == namelen && ++ !memcmp(name, g->label, namelen)) ++ return i; ++ } ++ ++ return -1; ++} ++ ++static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, ++ const char *name, unsigned namelen) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ unsigned i, nr_groups = disk_groups_nr(groups); ++ struct bch_disk_group *g; ++ ++ if (!namelen || namelen > BCH_SB_LABEL_SIZE) ++ return -EINVAL; ++ ++ for (i = 0; ++ i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); ++ i++) ++ ; ++ ++ if (i == nr_groups) { ++ unsigned u64s = ++ (sizeof(struct bch_sb_field_disk_groups) + ++ sizeof(struct bch_disk_group) * (nr_groups + 1)) / ++ sizeof(u64); ++ ++ groups = bch2_sb_resize_disk_groups(sb, u64s); ++ if (!groups) ++ return -ENOSPC; ++ ++ nr_groups = disk_groups_nr(groups); ++ } ++ ++ BUG_ON(i >= nr_groups); ++ ++ g = &groups->entries[i]; ++ ++ memcpy(g->label, name, namelen); ++ if (namelen < sizeof(g->label)) ++ g->label[namelen] = '\0'; ++ SET_BCH_GROUP_DELETED(g, 0); ++ SET_BCH_GROUP_PARENT(g, parent); ++ SET_BCH_GROUP_DATA_ALLOWED(g, ~0); ++ ++ return i; ++} ++ ++int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ v = __bch2_disk_group_find(groups, v + 1, name, len); ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) ++{ ++ struct bch_sb_field_disk_groups *groups; ++ unsigned parent = 0; ++ int v = -1; ++ ++ do { ++ const char *next = strchrnul(name, '.'); ++ unsigned len = next - name; ++ ++ if (*next == '.') ++ next++; ++ ++ groups = bch2_sb_get_disk_groups(sb->sb); ++ ++ v = __bch2_disk_group_find(groups, parent, name, len); ++ if (v < 0) ++ v = __bch2_disk_group_add(sb, parent, name, len); ++ if (v < 0) ++ return v; ++ ++ parent = v + 1; ++ name = next; ++ } while (*name && v >= 0); ++ ++ return v; ++} ++ ++void bch2_disk_path_to_text(struct printbuf *out, ++ struct bch_sb_handle *sb, ++ unsigned v) ++{ ++ struct bch_sb_field_disk_groups *groups = ++ bch2_sb_get_disk_groups(sb->sb); ++ struct bch_disk_group *g; ++ unsigned nr = 0; ++ u16 path[32]; ++ ++ while (1) { ++ if (nr == ARRAY_SIZE(path)) ++ goto inval; ++ ++ if (v >= disk_groups_nr(groups)) ++ goto inval; ++ ++ g = groups->entries + v; ++ ++ if (BCH_GROUP_DELETED(g)) ++ goto inval; ++ ++ path[nr++] = v; ++ ++ if (!BCH_GROUP_PARENT(g)) ++ break; ++ ++ v = BCH_GROUP_PARENT(g) - 1; ++ } ++ ++ while (nr) { ++ v = path[--nr]; ++ g = groups->entries + v; ++ ++ bch_scnmemcpy(out, g->label, ++ strnlen(g->label, sizeof(g->label))); ++ ++ if (nr) ++ pr_buf(out, "."); ++ } ++ return; ++inval: ++ pr_buf(out, "invalid group %u", v); ++} ++ ++int 
bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) ++{ ++ struct bch_member *mi; ++ int v = -1; ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (!strlen(name) || !strcmp(name, "none")) ++ goto write_sb; ++ ++ v = bch2_disk_path_find_or_create(&c->disk_sb, name); ++ if (v < 0) { ++ mutex_unlock(&c->sb_lock); ++ return v; ++ } ++ ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ goto unlock; ++write_sb: ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ SET_BCH_MEMBER_GROUP(mi, v + 1); ++ ++ bch2_write_super(c); ++unlock: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_opt_target_parse(struct bch_fs *c, const char *buf, u64 *v) ++{ ++ struct bch_dev *ca; ++ int g; ++ ++ if (!strlen(buf) || !strcmp(buf, "none")) { ++ *v = 0; ++ return 0; ++ } ++ ++ /* Is it a device? */ ++ ca = bch2_dev_lookup(c, buf); ++ if (!IS_ERR(ca)) { ++ *v = dev_to_target(ca->dev_idx); ++ percpu_ref_put(&ca->ref); ++ return 0; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ g = bch2_disk_path_find(&c->disk_sb, buf); ++ mutex_unlock(&c->sb_lock); ++ ++ if (g >= 0) { ++ *v = group_to_target(g); ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_opt_target_to_text(struct printbuf *out, struct bch_fs *c, u64 v) ++{ ++ struct target t = target_decode(v); ++ ++ switch (t.type) { ++ case TARGET_NULL: ++ pr_buf(out, "none"); ++ break; ++ case TARGET_DEV: { ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ ca = t.dev < c->sb.nr_devices ++ ? rcu_dereference(c->devs[t.dev]) ++ : NULL; ++ ++ if (ca && percpu_ref_tryget(&ca->io_ref)) { ++ char b[BDEVNAME_SIZE]; ++ ++ pr_buf(out, "/dev/%s", ++ bdevname(ca->disk_sb.bdev, b)); ++ percpu_ref_put(&ca->io_ref); ++ } else if (ca) { ++ pr_buf(out, "offline device %u", t.dev); ++ } else { ++ pr_buf(out, "invalid device %u", t.dev); ++ } ++ ++ rcu_read_unlock(); ++ break; ++ } ++ case TARGET_GROUP: ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(out, &c->disk_sb, t.group); ++ mutex_unlock(&c->sb_lock); ++ break; ++ default: ++ BUG(); ++ } ++} +diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h +new file mode 100644 +index 000000000000..3d84f23c34ed +--- /dev/null ++++ b/fs/bcachefs/disk_groups.h +@@ -0,0 +1,91 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_DISK_GROUPS_H ++#define _BCACHEFS_DISK_GROUPS_H ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; ++ ++static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) ++{ ++ return groups ++ ? 
(vstruct_end(&groups->field) - ++ (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) ++ : 0; ++} ++ ++struct target { ++ enum { ++ TARGET_NULL, ++ TARGET_DEV, ++ TARGET_GROUP, ++ } type; ++ union { ++ unsigned dev; ++ unsigned group; ++ }; ++}; ++ ++#define TARGET_DEV_START 1 ++#define TARGET_GROUP_START (256 + TARGET_DEV_START) ++ ++static inline u16 dev_to_target(unsigned dev) ++{ ++ return TARGET_DEV_START + dev; ++} ++ ++static inline u16 group_to_target(unsigned group) ++{ ++ return TARGET_GROUP_START + group; ++} ++ ++static inline struct target target_decode(unsigned target) ++{ ++ if (target >= TARGET_GROUP_START) ++ return (struct target) { ++ .type = TARGET_GROUP, ++ .group = target - TARGET_GROUP_START ++ }; ++ ++ if (target >= TARGET_DEV_START) ++ return (struct target) { ++ .type = TARGET_DEV, ++ .group = target - TARGET_DEV_START ++ }; ++ ++ return (struct target) { .type = TARGET_NULL }; ++} ++ ++const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); ++ ++static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, ++ enum bch_data_type data_type, ++ u16 target) ++{ ++ struct bch_devs_mask devs = c->rw_devs[data_type]; ++ const struct bch_devs_mask *t = bch2_target_to_mask(c, target); ++ ++ if (t) ++ bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); ++ return devs; ++} ++ ++bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); ++ ++int bch2_disk_path_find(struct bch_sb_handle *, const char *); ++ ++/* Exported for userspace bcachefs-tools: */ ++int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); ++ ++void bch2_disk_path_to_text(struct printbuf *, struct bch_sb_handle *, ++ unsigned); ++ ++int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *); ++void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, u64); ++ ++int bch2_sb_disk_groups_to_cpu(struct bch_fs *); ++ ++int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); ++ ++const char *bch2_sb_validate_disk_groups(struct bch_sb *, ++ struct bch_sb_field *); ++ ++#endif /* _BCACHEFS_DISK_GROUPS_H */ +diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c +new file mode 100644 +index 000000000000..eac750ad2240 +--- /dev/null ++++ b/fs/bcachefs/ec.c +@@ -0,0 +1,1636 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++/* erasure coding */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "bset.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "keylist.h" ++#include "recovery.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++#ifdef __KERNEL__ ++ ++#include ++#include ++ ++static void raid5_recov(unsigned disks, unsigned failed_idx, ++ size_t size, void **data) ++{ ++ unsigned i = 2, nr; ++ ++ BUG_ON(failed_idx >= disks); ++ ++ swap(data[0], data[failed_idx]); ++ memcpy(data[0], data[1], size); ++ ++ while (i < disks) { ++ nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); ++ xor_blocks(nr, size, data[0], data + i); ++ i += nr; ++ } ++ ++ swap(data[0], data[failed_idx]); ++} ++ ++static void raid_gen(int nd, int np, size_t size, void **v) ++{ ++ if (np >= 1) ++ raid5_recov(nd + np, nd, size, v); ++ if (np >= 2) ++ raid6_call.gen_syndrome(nd + np, size, v); ++ BUG_ON(np > 2); ++} ++ ++static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) ++{ ++ switch (nr) { ++ case 0: ++ break; ++ case 1: ++ if (ir[0] < nd + 1) ++ raid5_recov(nd + 
1, ir[0], size, v); ++ else ++ raid6_call.gen_syndrome(nd + np, size, v); ++ break; ++ case 2: ++ if (ir[1] < nd) { ++ /* data+data failure. */ ++ raid6_2data_recov(nd + np, size, ir[0], ir[1], v); ++ } else if (ir[0] < nd) { ++ /* data + p/q failure */ ++ ++ if (ir[1] == nd) /* data + p failure */ ++ raid6_datap_recov(nd + np, size, ir[0], v); ++ else { /* data + q failure */ ++ raid5_recov(nd + 1, ir[0], size, v); ++ raid6_call.gen_syndrome(nd + np, size, v); ++ } ++ } else { ++ raid_gen(nd, np, size, v); ++ } ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++#else ++ ++#include ++ ++#endif ++ ++struct ec_bio { ++ struct bch_dev *ca; ++ struct ec_stripe_buf *buf; ++ size_t idx; ++ struct bio bio; ++}; ++ ++/* Stripes btree keys: */ ++ ++const char *bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ ++ if (k.k->p.inode) ++ return "invalid stripe key"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*s)) ++ return "incorrect value size"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(*s) || ++ bkey_val_u64s(k.k) < stripe_val_u64s(s)) ++ return "incorrect value size"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; ++ unsigned i; ++ ++ pr_buf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", ++ s->algorithm, ++ le16_to_cpu(s->sectors), ++ s->nr_blocks - s->nr_redundant, ++ s->nr_redundant, ++ s->csum_type, ++ 1U << s->csum_granularity_bits); ++ ++ for (i = 0; i < s->nr_blocks; i++) ++ pr_buf(out, " %u:%llu:%u", s->ptrs[i].dev, ++ (u64) s->ptrs[i].offset, ++ stripe_blockcount_get(s, i)); ++} ++ ++static int ptr_matches_stripe(struct bch_fs *c, ++ struct bch_stripe *v, ++ const struct bch_extent_ptr *ptr) ++{ ++ unsigned i; ++ ++ for (i = 0; i < v->nr_blocks - v->nr_redundant; i++) { ++ const struct bch_extent_ptr *ptr2 = v->ptrs + i; ++ ++ if (ptr->dev == ptr2->dev && ++ ptr->gen == ptr2->gen && ++ ptr->offset >= ptr2->offset && ++ ptr->offset < ptr2->offset + le16_to_cpu(v->sectors)) ++ return i; ++ } ++ ++ return -1; ++} ++ ++static int extent_matches_stripe(struct bch_fs *c, ++ struct bch_stripe *v, ++ struct bkey_s_c k) ++{ ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const struct bch_extent_ptr *ptr; ++ int idx; ++ ++ extent_for_each_ptr(e, ptr) { ++ idx = ptr_matches_stripe(c, v, ptr); ++ if (idx >= 0) ++ return idx; ++ } ++ break; ++ } ++ } ++ ++ return -1; ++} ++ ++static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ ++ extent_for_each_entry(e, entry) ++ if (extent_entry_type(entry) == ++ BCH_EXTENT_ENTRY_stripe_ptr && ++ entry->stripe_ptr.idx == idx) ++ return true; ++ ++ break; ++ } ++ } ++ ++ return false; ++} ++ ++/* Checksumming: */ ++ ++static void ec_generate_checksums(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned csums_per_device = stripe_csums_per_device(v); ++ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; ++ unsigned i, j; ++ ++ if (!csum_bytes) ++ return; ++ ++ BUG_ON(buf->offset); ++ BUG_ON(buf->size != le16_to_cpu(v->sectors)); ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ for (j = 0; j < csums_per_device; j++) { ++ unsigned offset = j << 
v->csum_granularity_bits; ++ unsigned len = min(csum_granularity, buf->size - offset); ++ ++ struct bch_csum csum = ++ bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[i] + (offset << 9), ++ len << 9); ++ ++ memcpy(stripe_csum(v, i, j), &csum, csum_bytes); ++ } ++ } ++} ++ ++static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned csum_granularity = 1 << v->csum_granularity_bits; ++ unsigned csum_bytes = bch_crc_bytes[v->csum_type]; ++ unsigned i; ++ ++ if (!csum_bytes) ++ return; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ unsigned offset = buf->offset; ++ unsigned end = buf->offset + buf->size; ++ ++ if (!test_bit(i, buf->valid)) ++ continue; ++ ++ while (offset < end) { ++ unsigned j = offset >> v->csum_granularity_bits; ++ unsigned len = min(csum_granularity, end - offset); ++ struct bch_csum csum; ++ ++ BUG_ON(offset & (csum_granularity - 1)); ++ BUG_ON(offset + len != le16_to_cpu(v->sectors) && ++ ((offset + len) & (csum_granularity - 1))); ++ ++ csum = bch2_checksum(NULL, v->csum_type, ++ null_nonce(), ++ buf->data[i] + ((offset - buf->offset) << 9), ++ len << 9); ++ ++ if (memcmp(stripe_csum(v, i, j), &csum, csum_bytes)) { ++ __bcache_io_error(c, ++ "checksum error while doing reconstruct read (%u:%u)", ++ i, j); ++ clear_bit(i, buf->valid); ++ break; ++ } ++ ++ offset += len; ++ } ++ } ++} ++ ++/* Erasure coding: */ ++ ++static void ec_generate_ec(struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = le16_to_cpu(v->sectors) << 9; ++ ++ raid_gen(nr_data, v->nr_redundant, bytes, buf->data); ++} ++ ++static unsigned __ec_nr_failed(struct ec_stripe_buf *buf, unsigned nr) ++{ ++ return nr - bitmap_weight(buf->valid, nr); ++} ++ ++static unsigned ec_nr_failed(struct ec_stripe_buf *buf) ++{ ++ return __ec_nr_failed(buf, buf->key.v.nr_blocks); ++} ++ ++static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned i, failed[EC_STRIPE_MAX], nr_failed = 0; ++ unsigned nr_data = v->nr_blocks - v->nr_redundant; ++ unsigned bytes = buf->size << 9; ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ return -1; ++ } ++ ++ for (i = 0; i < nr_data; i++) ++ if (!test_bit(i, buf->valid)) ++ failed[nr_failed++] = i; ++ ++ raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); ++ return 0; ++} ++ ++/* IO: */ ++ ++static void ec_block_endio(struct bio *bio) ++{ ++ struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); ++ struct bch_dev *ca = ec_bio->ca; ++ struct closure *cl = bio->bi_private; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s: %s", ++ bio_data_dir(bio) ? 
"write" : "read", ++ bch2_blk_status_to_str(bio->bi_status))) ++ clear_bit(ec_bio->idx, ec_bio->buf->valid); ++ ++ bio_put(&ec_bio->bio); ++ percpu_ref_put(&ca->io_ref); ++ closure_put(cl); ++} ++ ++static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ++ unsigned rw, unsigned idx, struct closure *cl) ++{ ++ struct bch_stripe *v = &buf->key.v; ++ unsigned offset = 0, bytes = buf->size << 9; ++ struct bch_extent_ptr *ptr = &v->ptrs[idx]; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (!bch2_dev_get_ioref(ca, rw)) { ++ clear_bit(idx, buf->valid); ++ return; ++ } ++ ++ while (offset < bytes) { ++ unsigned nr_iovecs = min_t(size_t, BIO_MAX_PAGES, ++ DIV_ROUND_UP(bytes, PAGE_SIZE)); ++ unsigned b = min_t(size_t, bytes - offset, ++ nr_iovecs << PAGE_SHIFT); ++ struct ec_bio *ec_bio; ++ ++ ec_bio = container_of(bio_alloc_bioset(GFP_KERNEL, nr_iovecs, ++ &c->ec_bioset), ++ struct ec_bio, bio); ++ ++ ec_bio->ca = ca; ++ ec_bio->buf = buf; ++ ec_bio->idx = idx; ++ ++ bio_set_dev(&ec_bio->bio, ca->disk_sb.bdev); ++ bio_set_op_attrs(&ec_bio->bio, rw, 0); ++ ++ ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ++ ec_bio->bio.bi_end_io = ec_block_endio; ++ ec_bio->bio.bi_private = cl; ++ ++ bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); ++ ++ closure_get(cl); ++ percpu_ref_get(&ca->io_ref); ++ ++ submit_bio(&ec_bio->bio); ++ ++ offset += b; ++ } ++ ++ percpu_ref_put(&ca->io_ref); ++} ++ ++/* recovery read path: */ ++int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct ec_stripe_buf *buf; ++ struct closure cl; ++ struct bkey_s_c k; ++ struct bch_stripe *v; ++ unsigned stripe_idx; ++ unsigned offset, end; ++ unsigned i, nr_data, csum_granularity; ++ int ret = 0, idx; ++ ++ closure_init_stack(&cl); ++ ++ BUG_ON(!rbio->pick.has_ec); ++ ++ stripe_idx = rbio->pick.ec.idx; ++ ++ buf = kzalloc(sizeof(*buf), GFP_NOIO); ++ if (!buf) ++ return -ENOMEM; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, ++ POS(0, stripe_idx), ++ BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k) || k.k->type != KEY_TYPE_stripe) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: stripe not found"); ++ kfree(buf); ++ return bch2_trans_exit(&trans) ?: -EIO; ++ } ++ ++ bkey_reassemble(&buf->key.k_i, k); ++ bch2_trans_exit(&trans); ++ ++ v = &buf->key.v; ++ ++ nr_data = v->nr_blocks - v->nr_redundant; ++ ++ idx = ptr_matches_stripe(c, v, &rbio->pick.ptr); ++ BUG_ON(idx < 0); ++ ++ csum_granularity = 1U << v->csum_granularity_bits; ++ ++ offset = rbio->bio.bi_iter.bi_sector - v->ptrs[idx].offset; ++ end = offset + bio_sectors(&rbio->bio); ++ ++ BUG_ON(end > le16_to_cpu(v->sectors)); ++ ++ buf->offset = round_down(offset, csum_granularity); ++ buf->size = min_t(unsigned, le16_to_cpu(v->sectors), ++ round_up(end, csum_granularity)) - buf->offset; ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ buf->data[i] = kmalloc(buf->size << 9, GFP_NOIO); ++ if (!buf->data[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ } ++ ++ memset(buf->valid, 0xFF, sizeof(buf->valid)); ++ ++ for (i = 0; i < v->nr_blocks; i++) { ++ struct bch_extent_ptr *ptr = v->ptrs + i; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (ptr_stale(ca, ptr)) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: stale pointer"); ++ clear_bit(i, buf->valid); ++ continue; ++ } ++ ++ ec_block_io(c, buf, REQ_OP_READ, i, &cl); ++ } ++ ++ 
closure_sync(&cl); ++ ++ if (ec_nr_failed(buf) > v->nr_redundant) { ++ __bcache_io_error(c, ++ "error doing reconstruct read: unable to read enough blocks"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ ec_validate_checksums(c, buf); ++ ++ ret = ec_do_recov(c, buf); ++ if (ret) ++ goto err; ++ ++ memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, ++ buf->data[idx] + ((offset - buf->offset) << 9)); ++err: ++ for (i = 0; i < v->nr_blocks; i++) ++ kfree(buf->data[i]); ++ kfree(buf); ++ return ret; ++} ++ ++/* stripe bucket accounting: */ ++ ++static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) ++{ ++ ec_stripes_heap n, *h = &c->ec_stripes_heap; ++ ++ if (idx >= h->size) { ++ if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) ++ return -ENOMEM; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ if (n.size > h->size) { ++ memcpy(n.data, h->data, h->used * sizeof(h->data[0])); ++ n.used = h->used; ++ swap(*h, n); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ free_heap(&n); ++ } ++ ++ if (!genradix_ptr_alloc(&c->stripes[0], idx, gfp)) ++ return -ENOMEM; ++ ++ if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && ++ !genradix_ptr_alloc(&c->stripes[1], idx, gfp)) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static int ec_stripe_mem_alloc(struct bch_fs *c, ++ struct btree_iter *iter) ++{ ++ size_t idx = iter->pos.offset; ++ int ret = 0; ++ ++ if (!__ec_stripe_mem_alloc(c, idx, GFP_NOWAIT|__GFP_NOWARN)) ++ return ret; ++ ++ bch2_trans_unlock(iter->trans); ++ ret = -EINTR; ++ ++ if (!__ec_stripe_mem_alloc(c, idx, GFP_KERNEL)) ++ return ret; ++ ++ return -ENOMEM; ++} ++ ++static ssize_t stripe_idx_to_delete(struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ ++ return h->used && h->data[0].blocks_nonempty == 0 ++ ? h->data[0].idx : -1; ++} ++ ++static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, ++ struct ec_stripe_heap_entry l, ++ struct ec_stripe_heap_entry r) ++{ ++ return ((l.blocks_nonempty > r.blocks_nonempty) - ++ (l.blocks_nonempty < r.blocks_nonempty)); ++} ++ ++static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, ++ size_t i) ++{ ++ struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); ++ ++ genradix_ptr(&c->stripes[0], h->data[i].idx)->heap_idx = i; ++} ++ ++static void heap_verify_backpointer(struct bch_fs *c, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m = genradix_ptr(&c->stripes[0], idx); ++ ++ BUG_ON(!m->alive); ++ BUG_ON(m->heap_idx >= h->used); ++ BUG_ON(h->data[m->heap_idx].idx != idx); ++} ++ ++void bch2_stripes_heap_del(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ if (!m->on_heap) ++ return; ++ ++ m->on_heap = false; ++ ++ heap_verify_backpointer(c, idx); ++ ++ heap_del(&c->ec_stripes_heap, m->heap_idx, ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++} ++ ++void bch2_stripes_heap_insert(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ if (m->on_heap) ++ return; ++ ++ BUG_ON(heap_full(&c->ec_stripes_heap)); ++ ++ m->on_heap = true; ++ ++ heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { ++ .idx = idx, ++ .blocks_nonempty = m->blocks_nonempty, ++ }), ++ ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++} ++ ++void bch2_stripes_heap_update(struct bch_fs *c, ++ struct stripe *m, size_t idx) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ size_t i; ++ ++ if (!m->on_heap) ++ return; ++ ++ heap_verify_backpointer(c, idx); ++ ++ h->data[m->heap_idx].blocks_nonempty = 
m->blocks_nonempty; ++ ++ i = m->heap_idx; ++ heap_sift_up(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ heap_sift_down(h, i, ec_stripes_heap_cmp, ++ ec_stripes_heap_set_backpointer); ++ ++ heap_verify_backpointer(c, idx); ++ ++ if (stripe_idx_to_delete(c) >= 0 && ++ !percpu_ref_is_dying(&c->writes)) ++ schedule_work(&c->ec_stripe_delete_work); ++} ++ ++/* stripe deletion */ ++ ++static int ec_stripe_delete(struct bch_fs *c, size_t idx) ++{ ++ //pr_info("deleting stripe %zu", idx); ++ return bch2_btree_delete_range(c, BTREE_ID_EC, ++ POS(0, idx), ++ POS(0, idx + 1), ++ NULL); ++} ++ ++static void ec_stripe_delete_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, ec_stripe_delete_work); ++ ssize_t idx; ++ ++ while (1) { ++ spin_lock(&c->ec_stripes_heap_lock); ++ idx = stripe_idx_to_delete(c); ++ if (idx < 0) { ++ spin_unlock(&c->ec_stripes_heap_lock); ++ break; ++ } ++ ++ bch2_stripes_heap_del(c, genradix_ptr(&c->stripes[0], idx), idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ if (ec_stripe_delete(c, idx)) ++ break; ++ } ++} ++ ++/* stripe creation: */ ++ ++static int ec_stripe_bkey_insert(struct bch_fs *c, ++ struct bkey_i_stripe *stripe) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bpos start_pos = POS(0, c->ec_stripe_hint); ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EC, start_pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(k.k->p, POS(0, U32_MAX)) > 0) { ++ if (start_pos.offset) { ++ start_pos = POS_MIN; ++ bch2_btree_iter_set_pos(iter, start_pos); ++ continue; ++ } ++ ++ ret = -ENOSPC; ++ break; ++ } ++ ++ if (bkey_deleted(k.k)) ++ goto found_slot; ++ } ++ ++ goto err; ++found_slot: ++ start_pos = iter->pos; ++ ++ ret = ec_stripe_mem_alloc(c, iter); ++ if (ret) ++ goto err; ++ ++ stripe->k.p = iter->pos; ++ ++ bch2_trans_update(&trans, iter, &stripe->k_i, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++err: ++ bch2_trans_iter_put(&trans, iter); ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ c->ec_stripe_hint = ret ? 
start_pos.offset : start_pos.offset + 1; ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static void extent_stripe_ptr_add(struct bkey_s_extent e, ++ struct ec_stripe_buf *s, ++ struct bch_extent_ptr *ptr, ++ unsigned block) ++{ ++ struct bch_extent_stripe_ptr *dst = (void *) ptr; ++ union bch_extent_entry *end = extent_entry_last(e); ++ ++ memmove_u64s_up(dst + 1, dst, (u64 *) end - (u64 *) dst); ++ e.k->u64s += sizeof(*dst) / sizeof(u64); ++ ++ *dst = (struct bch_extent_stripe_ptr) { ++ .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, ++ .block = block, ++ .idx = s->key.k.p.offset, ++ }; ++} ++ ++static int ec_stripe_update_ptrs(struct bch_fs *c, ++ struct ec_stripe_buf *s, ++ struct bkey *pos) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_extent e; ++ struct bkey_on_stack sk; ++ int ret = 0, dev, idx; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ /* XXX this doesn't support the reflink btree */ ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ bkey_start_pos(pos), ++ BTREE_ITER_INTENT); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(bkey_start_pos(k.k), pos->p) < 0) { ++ struct bch_extent_ptr *ptr, *ec_ptr = NULL; ++ ++ if (extent_has_stripe_ptr(k, s->key.k.p.offset)) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ idx = extent_matches_stripe(c, &s->key.v, k); ++ if (idx < 0) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ dev = s->key.v.ptrs[idx].dev; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ e = bkey_i_to_s_extent(sk.k); ++ ++ bch2_bkey_drop_ptrs(e.s, ptr, ptr->dev != dev); ++ ec_ptr = (void *) bch2_bkey_has_device(e.s_c, dev); ++ BUG_ON(!ec_ptr); ++ ++ extent_stripe_ptr_add(e, s, ec_ptr, idx); ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); ++ bch2_trans_update(&trans, iter, sk.k, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ ++ return ret; ++} ++ ++/* ++ * data buckets of new stripe all written: create the stripe ++ */ ++static void ec_stripe_create(struct ec_stripe_new *s) ++{ ++ struct bch_fs *c = s->c; ++ struct open_bucket *ob; ++ struct bkey_i *k; ++ struct stripe *m; ++ struct bch_stripe *v = &s->stripe.key.v; ++ unsigned i, nr_data = v->nr_blocks - v->nr_redundant; ++ struct closure cl; ++ int ret; ++ ++ BUG_ON(s->h->s == s); ++ ++ closure_init_stack(&cl); ++ ++ if (s->err) { ++ if (s->err != -EROFS) ++ bch_err(c, "error creating stripe: error writing data buckets"); ++ goto err; ++ } ++ ++ BUG_ON(!s->allocated); ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ goto err; ++ ++ BUG_ON(bitmap_weight(s->blocks_allocated, ++ s->blocks.nr) != s->blocks.nr); ++ ++ ec_generate_ec(&s->stripe); ++ ++ ec_generate_checksums(&s->stripe); ++ ++ /* write p/q: */ ++ for (i = nr_data; i < v->nr_blocks; i++) ++ ec_block_io(c, &s->stripe, REQ_OP_WRITE, i, &cl); ++ ++ closure_sync(&cl); ++ ++ for (i = nr_data; i < v->nr_blocks; i++) ++ if (!test_bit(i, s->stripe.valid)) { ++ bch_err(c, "error creating stripe: error writing redundancy buckets"); ++ goto err_put_writes; ++ } ++ ++ ret = s->existing_stripe ++ ? 
bch2_btree_insert(c, BTREE_ID_EC, &s->stripe.key.k_i, ++ NULL, NULL, BTREE_INSERT_NOFAIL) ++ : ec_stripe_bkey_insert(c, &s->stripe.key); ++ if (ret) { ++ bch_err(c, "error creating stripe: error creating stripe key"); ++ goto err_put_writes; ++ } ++ ++ for_each_keylist_key(&s->keys, k) { ++ ret = ec_stripe_update_ptrs(c, &s->stripe, &k->k); ++ if (ret) { ++ bch_err(c, "error creating stripe: error updating pointers"); ++ break; ++ } ++ } ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ m = genradix_ptr(&c->stripes[0], s->stripe.key.k.p.offset); ++#if 0 ++ pr_info("created a %s stripe %llu", ++ s->existing_stripe ? "existing" : "new", ++ s->stripe.key.k.p.offset); ++#endif ++ BUG_ON(m->on_heap); ++ bch2_stripes_heap_insert(c, m, s->stripe.key.k.p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++err_put_writes: ++ percpu_ref_put(&c->writes); ++err: ++ open_bucket_for_each(c, &s->blocks, ob, i) { ++ ob->ec = NULL; ++ __bch2_open_bucket_put(c, ob); ++ } ++ ++ bch2_open_buckets_put(c, &s->parity); ++ ++ bch2_keylist_free(&s->keys, s->inline_keys); ++ ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) ++ kvpfree(s->stripe.data[i], s->stripe.size << 9); ++ kfree(s); ++} ++ ++static void ec_stripe_create_work(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, ++ struct bch_fs, ec_stripe_create_work); ++ struct ec_stripe_new *s, *n; ++restart: ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry_safe(s, n, &c->ec_stripe_new_list, list) ++ if (!atomic_read(&s->pin)) { ++ list_del(&s->list); ++ mutex_unlock(&c->ec_stripe_new_lock); ++ ec_stripe_create(s); ++ goto restart; ++ } ++ mutex_unlock(&c->ec_stripe_new_lock); ++} ++ ++static void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s) ++{ ++ BUG_ON(atomic_read(&s->pin) <= 0); ++ ++ if (atomic_dec_and_test(&s->pin)) { ++ BUG_ON(!s->pending); ++ queue_work(system_long_wq, &c->ec_stripe_create_work); ++ } ++} ++ ++static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s = h->s; ++ ++ BUG_ON(!s->allocated && !s->err); ++ ++ h->s = NULL; ++ s->pending = true; ++ ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_add(&s->list, &c->ec_stripe_new_list); ++ mutex_unlock(&c->ec_stripe_new_lock); ++ ++ ec_stripe_new_put(c, s); ++} ++ ++/* have a full bucket - hand it off to be erasure coded: */ ++void bch2_ec_bucket_written(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ if (ob->sectors_free) ++ s->err = -1; ++ ++ ec_stripe_new_put(c, s); ++} ++ ++void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) ++{ ++ struct ec_stripe_new *s = ob->ec; ++ ++ s->err = -EIO; ++} ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) ++{ ++ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); ++ struct bch_dev *ca; ++ unsigned offset; ++ ++ if (!ob) ++ return NULL; ++ ++ ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ offset = ca->mi.bucket_size - ob->sectors_free; ++ ++ return ob->ec->stripe.data[ob->ec_idx] + (offset << 9); ++} ++ ++void bch2_ec_add_backpointer(struct bch_fs *c, struct write_point *wp, ++ struct bpos pos, unsigned sectors) ++{ ++ struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); ++ struct ec_stripe_new *ec; ++ ++ if (!ob) ++ return; ++ ++ //pr_info("adding backpointer at %llu:%llu", pos.inode, pos.offset); ++ ++ ec = ob->ec; ++ mutex_lock(&ec->lock); ++ ++ if (bch2_keylist_realloc(&ec->keys, ec->inline_keys, ++ ARRAY_SIZE(ec->inline_keys), ++ BKEY_U64s)) { ++ BUG(); ++ } ++ ++ 
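++	/* record this extent so ec_stripe_create() can add stripe pointers to it later: */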
bkey_init(&ec->keys.top->k); ++ ec->keys.top->k.p = pos; ++ bch2_key_resize(&ec->keys.top->k, sectors); ++ bch2_keylist_push(&ec->keys); ++ ++ mutex_unlock(&ec->lock); ++} ++ ++static int unsigned_cmp(const void *_l, const void *_r) ++{ ++ unsigned l = *((const unsigned *) _l); ++ unsigned r = *((const unsigned *) _r); ++ ++ return cmp_int(l, r); ++} ++ ++/* pick most common bucket size: */ ++static unsigned pick_blocksize(struct bch_fs *c, ++ struct bch_devs_mask *devs) ++{ ++ struct bch_dev *ca; ++ unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; ++ struct { ++ unsigned nr, size; ++ } cur = { 0, 0 }, best = { 0, 0 }; ++ ++ for_each_member_device_rcu(ca, c, i, devs) ++ sizes[nr++] = ca->mi.bucket_size; ++ ++ sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); ++ ++ for (i = 0; i < nr; i++) { ++ if (sizes[i] != cur.size) { ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ cur.nr = 0; ++ cur.size = sizes[i]; ++ } ++ ++ cur.nr++; ++ } ++ ++ if (cur.nr > best.nr) ++ best = cur; ++ ++ return best.size; ++} ++ ++static bool may_create_new_stripe(struct bch_fs *c) ++{ ++ return false; ++} ++ ++static void ec_stripe_key_init(struct bch_fs *c, ++ struct bkey_i_stripe *s, ++ unsigned nr_data, ++ unsigned nr_parity, ++ unsigned stripe_size) ++{ ++ unsigned u64s; ++ ++ bkey_stripe_init(&s->k_i); ++ s->v.sectors = cpu_to_le16(stripe_size); ++ s->v.algorithm = 0; ++ s->v.nr_blocks = nr_data + nr_parity; ++ s->v.nr_redundant = nr_parity; ++ s->v.csum_granularity_bits = ilog2(c->sb.encoded_extent_max); ++ s->v.csum_type = BCH_CSUM_CRC32C; ++ s->v.pad = 0; ++ ++ while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { ++ BUG_ON(1 << s->v.csum_granularity_bits >= ++ le16_to_cpu(s->v.sectors) || ++ s->v.csum_granularity_bits == U8_MAX); ++ s->v.csum_granularity_bits++; ++ } ++ ++ set_bkey_val_u64s(&s->k, u64s); ++} ++ ++static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct ec_stripe_new *s; ++ unsigned i; ++ ++ lockdep_assert_held(&h->lock); ++ ++ s = kzalloc(sizeof(*s), GFP_KERNEL); ++ if (!s) ++ return -ENOMEM; ++ ++ mutex_init(&s->lock); ++ atomic_set(&s->pin, 1); ++ s->c = c; ++ s->h = h; ++ s->nr_data = min_t(unsigned, h->nr_active_devs, ++ EC_STRIPE_MAX) - h->redundancy; ++ s->nr_parity = h->redundancy; ++ ++ bch2_keylist_init(&s->keys, s->inline_keys); ++ ++ s->stripe.offset = 0; ++ s->stripe.size = h->blocksize; ++ memset(s->stripe.valid, 0xFF, sizeof(s->stripe.valid)); ++ ++ ec_stripe_key_init(c, &s->stripe.key, s->nr_data, ++ s->nr_parity, h->blocksize); ++ ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) { ++ s->stripe.data[i] = kvpmalloc(s->stripe.size << 9, GFP_KERNEL); ++ if (!s->stripe.data[i]) ++ goto err; ++ } ++ ++ h->s = s; ++ ++ return 0; ++err: ++ for (i = 0; i < s->stripe.key.v.nr_blocks; i++) ++ kvpfree(s->stripe.data[i], s->stripe.size << 9); ++ kfree(s); ++ return -ENOMEM; ++} ++ ++static struct ec_stripe_head * ++ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, ++ unsigned algo, unsigned redundancy) ++{ ++ struct ec_stripe_head *h; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ h = kzalloc(sizeof(*h), GFP_KERNEL); ++ if (!h) ++ return NULL; ++ ++ mutex_init(&h->lock); ++ mutex_lock(&h->lock); ++ ++ h->target = target; ++ h->algo = algo; ++ h->redundancy = redundancy; ++ ++ rcu_read_lock(); ++ h->devs = target_rw_devs(c, BCH_DATA_user, target); ++ ++ for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (!ca->mi.durability) ++ __clear_bit(i, h->devs.d); ++ ++ h->blocksize = pick_blocksize(c, &h->devs); ++ ++ 
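++	/* only devices whose bucket size matches the chosen blocksize count as active: */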
for_each_member_device_rcu(ca, c, i, &h->devs) ++ if (ca->mi.bucket_size == h->blocksize) ++ h->nr_active_devs++; ++ ++ rcu_read_unlock(); ++ list_add(&h->list, &c->ec_stripe_head_list); ++ return h; ++} ++ ++void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ if (h->s && ++ h->s->allocated && ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->blocks.nr) == h->s->blocks.nr) ++ ec_stripe_set_pending(c, h); ++ ++ mutex_unlock(&h->lock); ++} ++ ++struct ec_stripe_head *__bch2_ec_stripe_head_get(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy) ++{ ++ struct ec_stripe_head *h; ++ ++ if (!redundancy) ++ return NULL; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) ++ if (h->target == target && ++ h->algo == algo && ++ h->redundancy == redundancy) { ++ mutex_lock(&h->lock); ++ goto found; ++ } ++ ++ h = ec_new_stripe_head_alloc(c, target, algo, redundancy); ++found: ++ mutex_unlock(&c->ec_stripe_head_lock); ++ return h; ++} ++ ++/* ++ * XXX: use a higher watermark for allocating open buckets here: ++ */ ++static int new_stripe_alloc_buckets(struct bch_fs *c, struct ec_stripe_head *h) ++{ ++ struct bch_devs_mask devs; ++ struct open_bucket *ob; ++ unsigned i, nr_have, nr_data = ++ min_t(unsigned, h->nr_active_devs, ++ EC_STRIPE_MAX) - h->redundancy; ++ bool have_cache = true; ++ int ret = 0; ++ ++ devs = h->devs; ++ ++ for_each_set_bit(i, h->s->blocks_allocated, EC_STRIPE_MAX) { ++ __clear_bit(h->s->stripe.key.v.ptrs[i].dev, devs.d); ++ --nr_data; ++ } ++ ++ BUG_ON(h->s->blocks.nr > nr_data); ++ BUG_ON(h->s->parity.nr > h->redundancy); ++ ++ open_bucket_for_each(c, &h->s->parity, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ open_bucket_for_each(c, &h->s->blocks, ob, i) ++ __clear_bit(ob->ptr.dev, devs.d); ++ ++ percpu_down_read(&c->mark_lock); ++ rcu_read_lock(); ++ ++ if (h->s->parity.nr < h->redundancy) { ++ nr_have = h->s->parity.nr; ++ ++ ret = bch2_bucket_alloc_set(c, &h->s->parity, ++ &h->parity_stripe, ++ &devs, ++ h->redundancy, ++ &nr_have, ++ &have_cache, ++ RESERVE_NONE, ++ 0, ++ NULL); ++ if (ret) ++ goto err; ++ } ++ ++ if (h->s->blocks.nr < nr_data) { ++ nr_have = h->s->blocks.nr; ++ ++ ret = bch2_bucket_alloc_set(c, &h->s->blocks, ++ &h->block_stripe, ++ &devs, ++ nr_data, ++ &nr_have, ++ &have_cache, ++ RESERVE_NONE, ++ 0, ++ NULL); ++ if (ret) ++ goto err; ++ } ++err: ++ rcu_read_unlock(); ++ percpu_up_read(&c->mark_lock); ++ return ret; ++} ++ ++/* XXX: doesn't obey target: */ ++static s64 get_existing_stripe(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m; ++ size_t heap_idx; ++ u64 stripe_idx; ++ ++ if (may_create_new_stripe(c)) ++ return -1; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ for (heap_idx = 0; heap_idx < h->used; heap_idx++) { ++ if (!h->data[heap_idx].blocks_nonempty) ++ continue; ++ ++ stripe_idx = h->data[heap_idx].idx; ++ m = genradix_ptr(&c->stripes[0], stripe_idx); ++ ++ if (m->algorithm == algo && ++ m->nr_redundant == redundancy && ++ m->blocks_nonempty < m->nr_blocks - m->nr_redundant) { ++ bch2_stripes_heap_del(c, m, stripe_idx); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ return stripe_idx; ++ } ++ } ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ return -1; ++} ++ ++static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ 
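++	/* look up the stripe key at @idx in the EC btree: */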
++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, idx), BTREE_ITER_SLOTS); ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (!ret) ++ bkey_reassemble(&stripe->key.k_i, k); ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *c, ++ unsigned target, ++ unsigned algo, ++ unsigned redundancy) ++{ ++ struct closure cl; ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ unsigned i, data_idx = 0; ++ s64 idx; ++ ++ closure_init_stack(&cl); ++ ++ h = __bch2_ec_stripe_head_get(c, target, algo, redundancy); ++ if (!h) ++ return NULL; ++ ++ if (!h->s && ec_new_stripe_alloc(c, h)) { ++ bch2_ec_stripe_head_put(c, h); ++ return NULL; ++ } ++ ++ if (!h->s->allocated) { ++ if (!h->s->existing_stripe && ++ (idx = get_existing_stripe(c, target, algo, redundancy)) >= 0) { ++ //pr_info("got existing stripe %llu", idx); ++ ++ h->s->existing_stripe = true; ++ h->s->existing_stripe_idx = idx; ++ if (get_stripe_key(c, idx, &h->s->stripe)) { ++ /* btree error */ ++ BUG(); ++ } ++ ++ for (i = 0; i < h->s->stripe.key.v.nr_blocks; i++) ++ if (stripe_blockcount_get(&h->s->stripe.key.v, i)) { ++ __set_bit(i, h->s->blocks_allocated); ++ ec_block_io(c, &h->s->stripe, READ, i, &cl); ++ } ++ } ++ ++ if (new_stripe_alloc_buckets(c, h)) { ++ bch2_ec_stripe_head_put(c, h); ++ h = NULL; ++ goto out; ++ } ++ ++ open_bucket_for_each(c, &h->s->blocks, ob, i) { ++ data_idx = find_next_zero_bit(h->s->blocks_allocated, ++ h->s->nr_data, data_idx); ++ BUG_ON(data_idx >= h->s->nr_data); ++ ++ h->s->stripe.key.v.ptrs[data_idx] = ob->ptr; ++ h->s->data_block_idx[i] = data_idx; ++ data_idx++; ++ } ++ ++ open_bucket_for_each(c, &h->s->parity, ob, i) ++ h->s->stripe.key.v.ptrs[h->s->nr_data + i] = ob->ptr; ++ ++ //pr_info("new stripe, blocks_allocated %lx", h->s->blocks_allocated[0]); ++ h->s->allocated = true; ++ } ++out: ++ closure_sync(&cl); ++ return h; ++} ++ ++void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct ec_stripe_head *h; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { ++ ++ mutex_lock(&h->lock); ++ if (!h->s) ++ goto unlock; ++ ++ open_bucket_for_each(c, &h->s->blocks, ob, i) ++ if (ob->ptr.dev == ca->dev_idx) ++ goto found; ++ open_bucket_for_each(c, &h->s->parity, ob, i) ++ if (ob->ptr.dev == ca->dev_idx) ++ goto found; ++ goto unlock; ++found: ++ h->s->err = -EROFS; ++ ec_stripe_set_pending(c, h); ++unlock: ++ mutex_unlock(&h->lock); ++ } ++ mutex_unlock(&c->ec_stripe_head_lock); ++} ++ ++static int __bch2_stripe_write_key(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct stripe *m, ++ size_t idx, ++ struct bkey_i_stripe *new_key) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_s_c k; ++ unsigned i; ++ int ret; ++ ++ bch2_btree_iter_set_pos(iter, POS(0, idx)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ return ret; ++ ++ if (k.k->type != KEY_TYPE_stripe) ++ return -EIO; ++ ++ bkey_reassemble(&new_key->k_i, k); ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ ++ for (i = 0; i < new_key->v.nr_blocks; i++) ++ stripe_blockcount_set(&new_key->v, i, ++ m->block_sectors[i]); ++ m->dirty = false; ++ ++ spin_unlock(&c->ec_stripes_heap_lock); ++ ++ bch2_trans_update(trans, iter, &new_key->k_i, 0); ++ return 0; ++} ++ ++int bch2_stripes_write(struct bch_fs *c, unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter 
*iter; ++ struct genradix_iter giter; ++ struct bkey_i_stripe *new_key; ++ struct stripe *m; ++ int ret = 0; ++ ++ new_key = kmalloc(255 * sizeof(u64), GFP_KERNEL); ++ BUG_ON(!new_key); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ genradix_for_each(&c->stripes[0], giter, m) { ++ if (!m->dirty) ++ continue; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|flags, ++ __bch2_stripe_write_key(&trans, iter, m, ++ giter.pos, new_key)); ++ ++ if (ret) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ kfree(new_key); ++ ++ return ret; ++} ++ ++static int bch2_stripes_read_fn(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k) ++{ ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_stripe) { ++ struct stripe *m; ++ ++ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL) ?: ++ bch2_mark_key(c, k, 0, 0, NULL, 0, ++ BTREE_TRIGGER_NOATOMIC); ++ if (ret) ++ return ret; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ m = genradix_ptr(&c->stripes[0], k.k->p.offset); ++ bch2_stripes_heap_insert(c, m, k.k->p.offset); ++ spin_unlock(&c->ec_stripes_heap_lock); ++ } ++ ++ return ret; ++} ++ ++int bch2_stripes_read(struct bch_fs *c, struct journal_keys *journal_keys) ++{ ++ int ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_EC, ++ NULL, bch2_stripes_read_fn); ++ if (ret) ++ bch_err(c, "error reading stripes: %i", ret); ++ ++ return ret; ++} ++ ++int bch2_ec_mem_alloc(struct bch_fs *c, bool gc) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ size_t i, idx = 0; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EC, POS(0, U64_MAX), 0); ++ ++ k = bch2_btree_iter_prev(iter); ++ if (!IS_ERR_OR_NULL(k.k)) ++ idx = k.k->p.offset + 1; ++ ret = bch2_trans_exit(&trans); ++ if (ret) ++ return ret; ++ ++ if (!idx) ++ return 0; ++ ++ if (!gc && ++ !init_heap(&c->ec_stripes_heap, roundup_pow_of_two(idx), ++ GFP_KERNEL)) ++ return -ENOMEM; ++#if 0 ++ ret = genradix_prealloc(&c->stripes[gc], idx, GFP_KERNEL); ++#else ++ for (i = 0; i < idx; i++) ++ if (!genradix_ptr_alloc(&c->stripes[gc], i, GFP_KERNEL)) ++ return -ENOMEM; ++#endif ++ return 0; ++} ++ ++void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ ec_stripes_heap *h = &c->ec_stripes_heap; ++ struct stripe *m; ++ size_t i; ++ ++ spin_lock(&c->ec_stripes_heap_lock); ++ for (i = 0; i < min(h->used, 20UL); i++) { ++ m = genradix_ptr(&c->stripes[0], h->data[i].idx); ++ ++ pr_buf(out, "%zu %u/%u+%u\n", h->data[i].idx, ++ h->data[i].blocks_nonempty, ++ m->nr_blocks - m->nr_redundant, ++ m->nr_redundant); ++ } ++ spin_unlock(&c->ec_stripes_heap_lock); ++} ++ ++void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ struct ec_stripe_new *s; ++ ++ mutex_lock(&c->ec_stripe_head_lock); ++ list_for_each_entry(h, &c->ec_stripe_head_list, list) { ++ pr_buf(out, "target %u algo %u redundancy %u:\n", ++ h->target, h->algo, h->redundancy); ++ ++ if (h->s) ++ pr_buf(out, "\tpending: blocks %u allocated %u\n", ++ h->s->blocks.nr, ++ bitmap_weight(h->s->blocks_allocated, ++ h->s->blocks.nr)); ++ } ++ mutex_unlock(&c->ec_stripe_head_lock); ++ ++ mutex_lock(&c->ec_stripe_new_lock); ++ list_for_each_entry(s, &c->ec_stripe_new_list, list) { ++ pr_buf(out, "\tin flight: blocks %u allocated %u pin %u\n", ++ s->blocks.nr, ++ bitmap_weight(s->blocks_allocated, ++ 
s->blocks.nr), ++ atomic_read(&s->pin)); ++ } ++ mutex_unlock(&c->ec_stripe_new_lock); ++} ++ ++void bch2_fs_ec_exit(struct bch_fs *c) ++{ ++ struct ec_stripe_head *h; ++ ++ while (1) { ++ mutex_lock(&c->ec_stripe_head_lock); ++ h = list_first_entry_or_null(&c->ec_stripe_head_list, ++ struct ec_stripe_head, list); ++ if (h) ++ list_del(&h->list); ++ mutex_unlock(&c->ec_stripe_head_lock); ++ if (!h) ++ break; ++ ++ BUG_ON(h->s); ++ kfree(h); ++ } ++ ++ BUG_ON(!list_empty(&c->ec_stripe_new_list)); ++ ++ free_heap(&c->ec_stripes_heap); ++ genradix_free(&c->stripes[0]); ++ bioset_exit(&c->ec_bioset); ++} ++ ++int bch2_fs_ec_init(struct bch_fs *c) ++{ ++ INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); ++ INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); ++ ++ return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), ++ BIOSET_NEED_BVECS); ++} +diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h +new file mode 100644 +index 000000000000..6db16cf768da +--- /dev/null ++++ b/fs/bcachefs/ec.h +@@ -0,0 +1,169 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_H ++#define _BCACHEFS_EC_H ++ ++#include "ec_types.h" ++#include "keylist_types.h" ++ ++const char *bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++#define bch2_bkey_ops_stripe (struct bkey_ops) { \ ++ .key_invalid = bch2_stripe_invalid, \ ++ .val_to_text = bch2_stripe_to_text, \ ++ .swab = bch2_ptr_swab, \ ++} ++ ++static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(le16_to_cpu(s->sectors), ++ 1 << s->csum_granularity_bits); ++} ++ ++static inline unsigned stripe_csum_offset(const struct bch_stripe *s, ++ unsigned dev, unsigned csum_idx) ++{ ++ unsigned csum_bytes = bch_crc_bytes[s->csum_type]; ++ ++ return sizeof(struct bch_stripe) + ++ sizeof(struct bch_extent_ptr) * s->nr_blocks + ++ (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; ++} ++ ++static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return stripe_csum_offset(s, s->nr_blocks, 0) + ++ sizeof(u16) * idx; ++} ++ ++static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, ++ unsigned idx) ++{ ++ return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); ++} ++ ++static inline void stripe_blockcount_set(struct bch_stripe *s, ++ unsigned idx, unsigned v) ++{ ++ __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); ++ ++ *p = cpu_to_le16(v); ++} ++ ++static inline unsigned stripe_val_u64s(const struct bch_stripe *s) ++{ ++ return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), ++ sizeof(u64)); ++} ++ ++static inline void *stripe_csum(struct bch_stripe *s, ++ unsigned dev, unsigned csum_idx) ++{ ++ return (void *) s + stripe_csum_offset(s, dev, csum_idx); ++} ++ ++struct bch_read_bio; ++ ++struct ec_stripe_buf { ++ /* might not be buffering the entire stripe: */ ++ unsigned offset; ++ unsigned size; ++ unsigned long valid[BITS_TO_LONGS(EC_STRIPE_MAX)]; ++ ++ void *data[EC_STRIPE_MAX]; ++ ++ union { ++ struct bkey_i_stripe key; ++ u64 pad[255]; ++ }; ++}; ++ ++struct ec_stripe_head; ++ ++struct ec_stripe_new { ++ struct bch_fs *c; ++ struct ec_stripe_head *h; ++ struct mutex lock; ++ struct list_head list; ++ ++ /* counts in flight writes, stripe is created when pin == 0 */ ++ atomic_t pin; ++ ++ int err; ++ ++ u8 nr_data; ++ u8 nr_parity; ++ bool allocated; ++ bool pending; ++ bool existing_stripe; ++ 
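++	/* which existing stripe is being reused, valid when existing_stripe is set: */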
u64 existing_stripe_idx; ++ ++ unsigned long blocks_allocated[BITS_TO_LONGS(EC_STRIPE_MAX)]; ++ ++ struct open_buckets blocks; ++ u8 data_block_idx[EC_STRIPE_MAX]; ++ struct open_buckets parity; ++ ++ struct keylist keys; ++ u64 inline_keys[BKEY_U64s * 8]; ++ ++ struct ec_stripe_buf stripe; ++}; ++ ++struct ec_stripe_head { ++ struct list_head list; ++ struct mutex lock; ++ ++ unsigned target; ++ unsigned algo; ++ unsigned redundancy; ++ ++ struct bch_devs_mask devs; ++ unsigned nr_active_devs; ++ ++ unsigned blocksize; ++ ++ struct dev_stripe_state block_stripe; ++ struct dev_stripe_state parity_stripe; ++ ++ struct ec_stripe_new *s; ++}; ++ ++int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); ++ ++void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); ++void bch2_ec_add_backpointer(struct bch_fs *, struct write_point *, ++ struct bpos, unsigned); ++ ++void bch2_ec_bucket_written(struct bch_fs *, struct open_bucket *); ++void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); ++ ++int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); ++ ++void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); ++struct ec_stripe_head *bch2_ec_stripe_head_get(struct bch_fs *, unsigned, ++ unsigned, unsigned); ++ ++void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); ++void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); ++void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); ++ ++void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); ++ ++void bch2_ec_flush_new_stripes(struct bch_fs *); ++ ++struct journal_keys; ++int bch2_stripes_read(struct bch_fs *, struct journal_keys *); ++int bch2_stripes_write(struct bch_fs *, unsigned); ++ ++int bch2_ec_mem_alloc(struct bch_fs *, bool); ++ ++void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); ++void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_fs_ec_exit(struct bch_fs *); ++int bch2_fs_ec_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_EC_H */ +diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h +new file mode 100644 +index 000000000000..e4d633fca5bf +--- /dev/null ++++ b/fs/bcachefs/ec_types.h +@@ -0,0 +1,39 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EC_TYPES_H ++#define _BCACHEFS_EC_TYPES_H ++ ++#include ++ ++#define EC_STRIPE_MAX 16 ++ ++struct bch_replicas_padded { ++ struct bch_replicas_entry e; ++ u8 pad[EC_STRIPE_MAX]; ++}; ++ ++struct stripe { ++ size_t heap_idx; ++ ++ u16 sectors; ++ u8 algorithm; ++ ++ u8 nr_blocks; ++ u8 nr_redundant; ++ ++ unsigned alive:1; ++ unsigned dirty:1; ++ unsigned on_heap:1; ++ u8 blocks_nonempty; ++ u16 block_sectors[EC_STRIPE_MAX]; ++ ++ struct bch_replicas_padded r; ++}; ++ ++struct ec_stripe_heap_entry { ++ size_t idx; ++ unsigned blocks_nonempty; ++}; ++ ++typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; ++ ++#endif /* _BCACHEFS_EC_TYPES_H */ +diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c +new file mode 100644 +index 000000000000..cd46706fb6f5 +--- /dev/null ++++ b/fs/bcachefs/error.c +@@ -0,0 +1,172 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "error.h" ++#include "io.h" ++#include "super.h" ++ ++#define FSCK_ERR_RATELIMIT_NR 10 ++ ++bool bch2_inconsistent_error(struct bch_fs *c) ++{ ++ set_bit(BCH_FS_ERROR, &c->flags); ++ ++ switch (c->opts.errors) { ++ case BCH_ON_ERROR_CONTINUE: ++ return false; ++ case BCH_ON_ERROR_RO: ++ if (bch2_fs_emergency_read_only(c)) ++ 
bch_err(c, "emergency read only"); ++ return true; ++ case BCH_ON_ERROR_PANIC: ++ panic(bch2_fmt(c, "panic after error")); ++ return true; ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_fatal_error(struct bch_fs *c) ++{ ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only"); ++} ++ ++void bch2_io_error_work(struct work_struct *work) ++{ ++ struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); ++ struct bch_fs *c = ca->fs; ++ bool dev; ++ ++ down_write(&c->state_lock); ++ dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_RO, ++ BCH_FORCE_IF_DEGRADED); ++ if (dev ++ ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_RO, ++ BCH_FORCE_IF_DEGRADED) ++ : bch2_fs_emergency_read_only(c)) ++ bch_err(ca, ++ "too many IO errors, setting %s RO", ++ dev ? "device" : "filesystem"); ++ up_write(&c->state_lock); ++} ++ ++void bch2_io_error(struct bch_dev *ca) ++{ ++ //queue_work(system_long_wq, &ca->io_error_work); ++} ++ ++#ifdef __KERNEL__ ++#define ask_yn() false ++#else ++#include "tools-util.h" ++#endif ++ ++enum fsck_err_ret bch2_fsck_err(struct bch_fs *c, unsigned flags, ++ const char *fmt, ...) ++{ ++ struct fsck_err_state *s = NULL; ++ va_list args; ++ bool fix = false, print = true, suppressing = false; ++ char _buf[sizeof(s->buf)], *buf = _buf; ++ ++ if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { ++ va_start(args, fmt); ++ vprintk(fmt, args); ++ va_end(args); ++ ++ return bch2_inconsistent_error(c) ++ ? FSCK_ERR_EXIT ++ : FSCK_ERR_FIX; ++ } ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry(s, &c->fsck_errors, list) ++ if (s->fmt == fmt) ++ goto found; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS); ++ if (!s) { ++ if (!c->fsck_alloc_err) ++ bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); ++ c->fsck_alloc_err = true; ++ buf = _buf; ++ goto print; ++ } ++ ++ INIT_LIST_HEAD(&s->list); ++ s->fmt = fmt; ++found: ++ list_move(&s->list, &c->fsck_errors); ++ s->nr++; ++ if (c->opts.ratelimit_errors && ++ s->nr >= FSCK_ERR_RATELIMIT_NR) { ++ if (s->nr == FSCK_ERR_RATELIMIT_NR) ++ suppressing = true; ++ else ++ print = false; ++ } ++ buf = s->buf; ++print: ++ va_start(args, fmt); ++ vscnprintf(buf, sizeof(_buf), fmt, args); ++ va_end(args); ++ ++ if (c->opts.fix_errors == FSCK_OPT_EXIT) { ++ bch_err(c, "%s, exiting", buf); ++ } else if (flags & FSCK_CAN_FIX) { ++ if (c->opts.fix_errors == FSCK_OPT_ASK) { ++ printk(KERN_ERR "%s: fix?", buf); ++ fix = ask_yn(); ++ } else if (c->opts.fix_errors == FSCK_OPT_YES || ++ (c->opts.nochanges && ++ !(flags & FSCK_CAN_IGNORE))) { ++ if (print) ++ bch_err(c, "%s, fixing", buf); ++ fix = true; ++ } else { ++ if (print) ++ bch_err(c, "%s, not fixing", buf); ++ fix = false; ++ } ++ } else if (flags & FSCK_NEED_FSCK) { ++ if (print) ++ bch_err(c, "%s (run fsck to correct)", buf); ++ } else { ++ if (print) ++ bch_err(c, "%s (repair unimplemented)", buf); ++ } ++ ++ if (suppressing) ++ bch_err(c, "Ratelimiting new instances of previous error"); ++ ++ mutex_unlock(&c->fsck_error_lock); ++ ++ if (fix) { ++ set_bit(BCH_FS_ERRORS_FIXED, &c->flags); ++ return FSCK_ERR_FIX; ++ } else { ++ set_bit(BCH_FS_ERROR, &c->flags); ++ return c->opts.fix_errors == FSCK_OPT_EXIT || ++ !(flags & FSCK_CAN_IGNORE) ++ ? 
FSCK_ERR_EXIT ++ : FSCK_ERR_IGNORE; ++ } ++} ++ ++void bch2_flush_fsck_errs(struct bch_fs *c) ++{ ++ struct fsck_err_state *s, *n; ++ ++ mutex_lock(&c->fsck_error_lock); ++ ++ list_for_each_entry_safe(s, n, &c->fsck_errors, list) { ++ if (s->ratelimited) ++ bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->buf); ++ ++ list_del(&s->list); ++ kfree(s); ++ } ++ ++ mutex_unlock(&c->fsck_error_lock); ++} +diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h +new file mode 100644 +index 000000000000..94b53312fbbd +--- /dev/null ++++ b/fs/bcachefs/error.h +@@ -0,0 +1,211 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_ERROR_H ++#define _BCACHEFS_ERROR_H ++ ++#include ++#include ++ ++struct bch_dev; ++struct bch_fs; ++struct work_struct; ++ ++/* ++ * XXX: separate out errors that indicate on disk data is inconsistent, and flag ++ * superblock as such ++ */ ++ ++/* Error messages: */ ++ ++/* ++ * Inconsistency errors: The on disk data is inconsistent. If these occur during ++ * initial recovery, they don't indicate a bug in the running code - we walk all ++ * the metadata before modifying anything. If they occur at runtime, they ++ * indicate either a bug in the running code or (less likely) data is being ++ * silently corrupted under us. ++ * ++ * XXX: audit all inconsistent errors and make sure they're all recoverable, in ++ * BCH_ON_ERROR_CONTINUE mode ++ */ ++ ++bool bch2_inconsistent_error(struct bch_fs *); ++ ++#define bch2_fs_inconsistent(c, ...) \ ++({ \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_inconsistent_error(c); \ ++}) ++ ++#define bch2_fs_inconsistent_on(cond, c, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_fs_inconsistent(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Later we might want to mark only the particular device inconsistent, not the ++ * entire filesystem: ++ */ ++ ++#define bch2_dev_inconsistent(ca, ...) \ ++do { \ ++ bch_err(ca, __VA_ARGS__); \ ++ bch2_inconsistent_error((ca)->fs); \ ++} while (0) ++ ++#define bch2_dev_inconsistent_on(cond, ca, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_inconsistent(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * Fsck errors: inconsistency errors we detect at mount time, and should ideally ++ * be able to repair: ++ */ ++ ++enum { ++ BCH_FSCK_OK = 0, ++ BCH_FSCK_ERRORS_NOT_FIXED = 1, ++ BCH_FSCK_REPAIR_UNIMPLEMENTED = 2, ++ BCH_FSCK_REPAIR_IMPOSSIBLE = 3, ++ BCH_FSCK_UNKNOWN_VERSION = 4, ++}; ++ ++enum fsck_err_opts { ++ FSCK_OPT_EXIT, ++ FSCK_OPT_YES, ++ FSCK_OPT_NO, ++ FSCK_OPT_ASK, ++}; ++ ++enum fsck_err_ret { ++ FSCK_ERR_IGNORE = 0, ++ FSCK_ERR_FIX = 1, ++ FSCK_ERR_EXIT = 2, ++}; ++ ++struct fsck_err_state { ++ struct list_head list; ++ const char *fmt; ++ u64 nr; ++ bool ratelimited; ++ char buf[512]; ++}; ++ ++#define FSCK_CAN_FIX (1 << 0) ++#define FSCK_CAN_IGNORE (1 << 1) ++#define FSCK_NEED_FSCK (1 << 2) ++ ++__printf(3, 4) __cold ++enum fsck_err_ret bch2_fsck_err(struct bch_fs *, ++ unsigned, const char *, ...); ++void bch2_flush_fsck_errs(struct bch_fs *); ++ ++#define __fsck_err(c, _flags, msg, ...) \ ++({ \ ++ int _fix = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__);\ ++ \ ++ if (_fix == FSCK_ERR_EXIT) { \ ++ bch_err(c, "Unable to continue, halting"); \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ \ ++ _fix; \ ++}) ++ ++/* These macros return true if error should be fixed: */ ++ ++/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ ++ ++#define __fsck_err_on(cond, c, _flags, ...) \ ++ ((cond) ? 
__fsck_err(c, _flags, ##__VA_ARGS__) : false) ++ ++#define need_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define need_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define mustfix_fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) ++ ++#define fsck_err(c, ...) \ ++ __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++#define fsck_err_on(cond, c, ...) \ ++ __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) ++ ++/* ++ * Fatal errors: these don't indicate a bug, but we can't continue running in RW ++ * mode - pretty much just due to metadata IO errors: ++ */ ++ ++void bch2_fatal_error(struct bch_fs *); ++ ++#define bch2_fs_fatal_error(c, ...) \ ++do { \ ++ bch_err(c, __VA_ARGS__); \ ++ bch2_fatal_error(c); \ ++} while (0) ++ ++#define bch2_fs_fatal_err_on(cond, c, ...) \ ++({ \ ++ int _ret = !!(cond); \ ++ \ ++ if (_ret) \ ++ bch2_fs_fatal_error(c, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* ++ * IO errors: either recoverable metadata IO (because we have replicas), or data ++ * IO - we need to log it and print out a message, but we don't (necessarily) ++ * want to shut down the fs: ++ */ ++ ++void bch2_io_error_work(struct work_struct *); ++ ++/* Does the error handling without logging a message */ ++void bch2_io_error(struct bch_dev *); ++ ++/* Logs message and handles the error: */ ++#define bch2_dev_io_error(ca, fmt, ...) \ ++do { \ ++ printk_ratelimited(KERN_ERR bch2_fmt((ca)->fs, \ ++ "IO error on %s for " fmt), \ ++ (ca)->name, ##__VA_ARGS__); \ ++ bch2_io_error(ca); \ ++} while (0) ++ ++#define bch2_dev_io_err_on(cond, ca, ...) \ ++({ \ ++ bool _ret = (cond); \ ++ \ ++ if (_ret) \ ++ bch2_dev_io_error(ca, __VA_ARGS__); \ ++ _ret; \ ++}) ++ ++/* kill? */ ++ ++#define __bcache_io_error(c, fmt, ...) \ ++ printk_ratelimited(KERN_ERR bch2_fmt(c, \ ++ "IO error: " fmt), ##__VA_ARGS__) ++ ++#define bcache_io_error(c, bio, fmt, ...) 
\ ++do { \ ++ __bcache_io_error(c, fmt, ##__VA_ARGS__); \ ++ (bio)->bi_status = BLK_STS_IOERR; \ ++} while (0) ++ ++#endif /* _BCACHEFS_ERROR_H */ +diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c +new file mode 100644 +index 000000000000..fd011df3cb99 +--- /dev/null ++++ b/fs/bcachefs/extent_update.c +@@ -0,0 +1,229 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "debug.h" ++#include "extents.h" ++#include "extent_update.h" ++ ++/* ++ * This counts the number of iterators to the alloc & ec btrees we'll need ++ * inserting/removing this extent: ++ */ ++static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ unsigned ret = 0; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ret++; ++ } ++ } ++ ++ return ret; ++} ++ ++static int count_iters_for_insert(struct btree_trans *trans, ++ struct bkey_s_c k, ++ unsigned offset, ++ struct bpos *end, ++ unsigned *nr_iters, ++ unsigned max_iters) ++{ ++ int ret = 0, ret2 = 0; ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ *nr_iters += bch2_bkey_nr_alloc_ptrs(k); ++ ++ if (*nr_iters >= max_iters) { ++ *end = bpos_min(*end, k.k->p); ++ ret = 1; ++ } ++ ++ break; ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ u64 idx = le64_to_cpu(p.v->idx); ++ unsigned sectors = bpos_min(*end, p.k->p).offset - ++ bkey_start_offset(p.k); ++ struct btree_iter *iter; ++ struct bkey_s_c r_k; ++ ++ for_each_btree_key(trans, iter, ++ BTREE_ID_REFLINK, POS(0, idx + offset), ++ BTREE_ITER_SLOTS, r_k, ret2) { ++ if (bkey_cmp(bkey_start_pos(r_k.k), ++ POS(0, idx + sectors)) >= 0) ++ break; ++ ++ /* extent_update_to_keys(), for the reflink_v update */ ++ *nr_iters += 1; ++ ++ *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); ++ ++ if (*nr_iters >= max_iters) { ++ struct bpos pos = bkey_start_pos(k.k); ++ pos.offset += min_t(u64, k.k->size, ++ r_k.k->p.offset - idx); ++ ++ *end = bpos_min(*end, pos); ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ break; ++ } ++ } ++ ++ return ret2 ?: ret; ++} ++ ++#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) ++ ++int bch2_extent_atomic_end(struct btree_iter *iter, ++ struct bkey_i *insert, ++ struct bpos *end) ++{ ++ struct btree_trans *trans = iter->trans; ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey_packed *_k; ++ unsigned nr_iters = 0; ++ int ret; ++ ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ return ret; ++ ++ b = iter->l[0].b; ++ node_iter = iter->l[0].iter; ++ ++ BUG_ON(bkey_cmp(b->data->min_key, POS_MIN) && ++ bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_predecessor(b->data->min_key)) < 0); ++ ++ *end = bpos_min(insert->k.p, b->key.k.p); ++ ++ /* extent_update_to_keys(): */ ++ nr_iters += 1; ++ ++ ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, ++ &nr_iters, EXTENT_ITERS_MAX / 2); ++ if (ret < 0) ++ return ret; ++ ++ while ((_k = bch2_btree_node_iter_peek(&node_iter, b))) { ++ struct bkey unpacked; ++ struct bkey_s_c k = bkey_disassemble(b, _k, &unpacked); ++ unsigned offset = 0; ++ ++ if (bkey_cmp(bkey_start_pos(k.k), *end) 
>= 0) ++ break; ++ ++ if (bkey_cmp(bkey_start_pos(&insert->k), ++ bkey_start_pos(k.k)) > 0) ++ offset = bkey_start_offset(&insert->k) - ++ bkey_start_offset(k.k); ++ ++ /* extent_handle_overwrites(): */ ++ switch (bch2_extent_overlap(&insert->k, k.k)) { ++ case BCH_EXTENT_OVERLAP_ALL: ++ case BCH_EXTENT_OVERLAP_FRONT: ++ nr_iters += 1; ++ break; ++ case BCH_EXTENT_OVERLAP_BACK: ++ case BCH_EXTENT_OVERLAP_MIDDLE: ++ nr_iters += 2; ++ break; ++ } ++ ++ ret = count_iters_for_insert(trans, k, offset, end, ++ &nr_iters, EXTENT_ITERS_MAX); ++ if (ret) ++ break; ++ ++ bch2_btree_node_iter_advance(&node_iter, b); ++ } ++ ++ return ret < 0 ? ret : 0; ++} ++ ++int bch2_extent_trim_atomic(struct bkey_i *k, struct btree_iter *iter) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(iter, k, &end); ++ if (ret) ++ return ret; ++ ++ bch2_cut_back(end, k); ++ return 0; ++} ++ ++int bch2_extent_is_atomic(struct bkey_i *k, struct btree_iter *iter) ++{ ++ struct bpos end; ++ int ret; ++ ++ ret = bch2_extent_atomic_end(iter, k, &end); ++ if (ret) ++ return ret; ++ ++ return !bkey_cmp(end, k->k.p); ++} ++ ++enum btree_insert_ret ++bch2_extent_can_insert(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *insert) ++{ ++ struct btree_iter_level *l = &iter->l[0]; ++ struct btree_node_iter node_iter = l->iter; ++ struct bkey_packed *_k; ++ struct bkey_s_c k; ++ struct bkey unpacked; ++ int sectors; ++ ++ _k = bch2_btree_node_iter_peek(&node_iter, l->b); ++ if (!_k) ++ return BTREE_INSERT_OK; ++ ++ k = bkey_disassemble(l->b, _k, &unpacked); ++ ++ /* Check if we're splitting a compressed extent: */ ++ ++ if (bkey_cmp(bkey_start_pos(&insert->k), bkey_start_pos(k.k)) > 0 && ++ bkey_cmp(insert->k.p, k.k->p) < 0 && ++ (sectors = bch2_bkey_sectors_compressed(k))) { ++ int flags = trans->flags & BTREE_INSERT_NOFAIL ++ ? BCH_DISK_RESERVATION_NOFAIL : 0; ++ ++ switch (bch2_disk_reservation_add(trans->c, trans->disk_res, ++ sectors, flags)) { ++ case 0: ++ break; ++ case -ENOSPC: ++ return BTREE_INSERT_ENOSPC; ++ default: ++ BUG(); ++ } ++ } ++ ++ return BTREE_INSERT_OK; ++} +diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h +new file mode 100644 +index 000000000000..38dc084627d2 +--- /dev/null ++++ b/fs/bcachefs/extent_update.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENT_UPDATE_H ++#define _BCACHEFS_EXTENT_UPDATE_H ++ ++#include "bcachefs.h" ++ ++int bch2_extent_atomic_end(struct btree_iter *, struct bkey_i *, ++ struct bpos *); ++int bch2_extent_trim_atomic(struct bkey_i *, struct btree_iter *); ++int bch2_extent_is_atomic(struct bkey_i *, struct btree_iter *); ++ ++enum btree_insert_ret ++bch2_extent_can_insert(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *); ++ ++#endif /* _BCACHEFS_EXTENT_UPDATE_H */ +diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c +new file mode 100644 +index 000000000000..568f039edcff +--- /dev/null ++++ b/fs/bcachefs/extents.c +@@ -0,0 +1,1258 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Copyright (C) 2010 Kent Overstreet ++ * ++ * Code for managing the extent btree and dynamically updating the writeback ++ * dirty sector count. 
++ */ ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_gc.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++#include "util.h" ++ ++#include ++ ++static unsigned bch2_crc_field_size_max[] = { ++ [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, ++ [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, ++}; ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *, ++ struct bch_extent_crc_unpacked, ++ enum bch_extent_entry_type); ++ ++static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, ++ unsigned dev) ++{ ++ struct bch_dev_io_failures *i; ++ ++ for (i = f->devs; i < f->devs + f->nr; i++) ++ if (i->dev == dev) ++ return i; ++ ++ return NULL; ++} ++ ++void bch2_mark_io_failure(struct bch_io_failures *failed, ++ struct extent_ptr_decoded *p) ++{ ++ struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); ++ ++ if (!f) { ++ BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); ++ ++ f = &failed->devs[failed->nr++]; ++ f->dev = p->ptr.dev; ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else if (p->idx != f->idx) { ++ f->idx = p->idx; ++ f->nr_failed = 1; ++ f->nr_retries = 0; ++ } else { ++ f->nr_failed++; ++ } ++} ++ ++/* ++ * returns true if p1 is better than p2: ++ */ ++static inline bool ptr_better(struct bch_fs *c, ++ const struct extent_ptr_decoded p1, ++ const struct extent_ptr_decoded p2) ++{ ++ if (likely(!p1.idx && !p2.idx)) { ++ struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); ++ struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); ++ ++ u64 l1 = atomic64_read(&dev1->cur_latency[READ]); ++ u64 l2 = atomic64_read(&dev2->cur_latency[READ]); ++ ++ /* Pick at random, biased in favor of the faster device: */ ++ ++ return bch2_rand_range(l1 + l2) > l1; ++ } ++ ++ if (force_reconstruct_read(c)) ++ return p1.idx > p2.idx; ++ ++ return p1.idx < p2.idx; ++} ++ ++/* ++ * This picks a non-stale pointer, preferably from a device other than @avoid. ++ * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to ++ * other devices, it will still pick a pointer from avoid. ++ */ ++int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_io_failures *failed, ++ struct extent_ptr_decoded *pick) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ struct bch_dev_io_failures *f; ++ struct bch_dev *ca; ++ int ret = 0; ++ ++ if (k.k->type == KEY_TYPE_error) ++ return -EIO; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ /* ++ * If there are any dirty pointers it's an error if we can't ++ * read: ++ */ ++ if (!ret && !p.ptr.cached) ++ ret = -EIO; ++ ++ if (p.ptr.cached && ptr_stale(ca, &p.ptr)) ++ continue; ++ ++ f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; ++ if (f) ++ p.idx = f->nr_failed < f->nr_retries ++ ? 
f->idx ++ : f->idx + 1; ++ ++ if (!p.idx && ++ !bch2_dev_is_readable(ca)) ++ p.idx++; ++ ++ if (force_reconstruct_read(c) && ++ !p.idx && p.has_ec) ++ p.idx++; ++ ++ if (p.idx >= (unsigned) p.has_ec + 1) ++ continue; ++ ++ if (ret > 0 && !ptr_better(c, p, *pick)) ++ continue; ++ ++ *pick = p; ++ ret = 1; ++ } ++ ++ return ret; ++} ++ ++/* KEY_TYPE_btree_ptr: */ ++ ++const char *bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) ++ return "value too big"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_btree_ptr_debugcheck(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ const char *err; ++ char buf[160]; ++ struct bucket_mark mark; ++ struct bch_dev *ca; ++ ++ if (!test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; ++ ++ if (!percpu_down_read_trylock(&c->mark_lock)) ++ return; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ mark = ptr_bucket_mark(ca, ptr); ++ ++ err = "stale"; ++ if (gen_after(mark.gen, ptr->gen)) ++ goto err; ++ ++ err = "inconsistent"; ++ if (mark.data_type != BCH_DATA_btree || ++ mark.dirty_sectors < c->opts.btree_node_size) ++ goto err; ++ } ++out: ++ percpu_up_read(&c->mark_lock); ++ return; ++err: ++ bch2_fs_inconsistent(c, "%s btree pointer %s: bucket %zi gen %i mark %08x", ++ err, (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf), ++ PTR_BUCKET_NR(ca, ptr), ++ mark.gen, (unsigned) mark.v.counter); ++ goto out; ++} ++ ++void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); ++ ++ pr_buf(out, "seq %llx sectors %u written %u min_key ", ++ le64_to_cpu(bp.v->seq), ++ le16_to_cpu(bp.v->sectors), ++ le16_to_cpu(bp.v->sectors_written)); ++ ++ bch2_bpos_to_text(out, bp.v->min_key); ++ pr_buf(out, " "); ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, ++ unsigned big_endian, int write, ++ struct bkey_s k) ++{ ++ struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); ++ ++ compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); ++ ++ if (version < bcachefs_metadata_version_inode_btree_change && ++ btree_node_type_is_extents(btree_id) && ++ bkey_cmp(bp.v->min_key, POS_MIN)) ++ bp.v->min_key = write ++ ? bkey_predecessor(bp.v->min_key) ++ : bkey_successor(bp.v->min_key); ++} ++ ++/* KEY_TYPE_extent: */ ++ ++const char *bch2_extent_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_extent_debugcheck(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ char buf[160]; ++ ++ if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) || ++ !test_bit(BCH_FS_INITIAL_GC_DONE, &c->flags)) ++ return; ++ ++ if (!percpu_down_read_trylock(&c->mark_lock)) ++ return; ++ ++ extent_for_each_ptr_decode(e, p, entry) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ struct bucket_mark mark = ptr_bucket_mark(ca, &p.ptr); ++ unsigned stale = gen_after(mark.gen, p.ptr.gen); ++ unsigned disk_sectors = ptr_disk_sectors(p); ++ unsigned mark_sectors = p.ptr.cached ++ ? 
mark.cached_sectors ++ : mark.dirty_sectors; ++ ++ bch2_fs_inconsistent_on(stale && !p.ptr.cached, c, ++ "stale dirty pointer (ptr gen %u bucket %u", ++ p.ptr.gen, mark.gen); ++ ++ bch2_fs_inconsistent_on(stale > 96, c, ++ "key too stale: %i", stale); ++ ++ bch2_fs_inconsistent_on(!stale && ++ (mark.data_type != BCH_DATA_user || ++ mark_sectors < disk_sectors), c, ++ "extent pointer not marked: %s:\n" ++ "type %u sectors %u < %u", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, e.s_c), buf), ++ mark.data_type, ++ mark_sectors, disk_sectors); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++} ++ ++void bch2_extent_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++enum merge_result bch2_extent_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_extent l = bkey_s_to_extent(_l); ++ struct bkey_s_extent r = bkey_s_to_extent(_r); ++ union bch_extent_entry *en_l = l.v->start; ++ union bch_extent_entry *en_r = r.v->start; ++ struct bch_extent_crc_unpacked crc_l, crc_r; ++ ++ if (bkey_val_u64s(l.k) != bkey_val_u64s(r.k)) ++ return BCH_MERGE_NOMERGE; ++ ++ crc_l = bch2_extent_crc_unpack(l.k, NULL); ++ ++ extent_for_each_entry(l, en_l) { ++ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ ++ if (extent_entry_type(en_l) != extent_entry_type(en_r)) ++ return BCH_MERGE_NOMERGE; ++ ++ switch (extent_entry_type(en_l)) { ++ case BCH_EXTENT_ENTRY_ptr: { ++ const struct bch_extent_ptr *lp = &en_l->ptr; ++ const struct bch_extent_ptr *rp = &en_r->ptr; ++ struct bch_dev *ca; ++ ++ if (lp->offset + crc_l.compressed_size != rp->offset || ++ lp->dev != rp->dev || ++ lp->gen != rp->gen) ++ return BCH_MERGE_NOMERGE; ++ ++ /* We don't allow extents to straddle buckets: */ ++ ca = bch_dev_bkey_exists(c, lp->dev); ++ ++ if (PTR_BUCKET_NR(ca, lp) != PTR_BUCKET_NR(ca, rp)) ++ return BCH_MERGE_NOMERGE; ++ ++ break; ++ } ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ if (en_l->stripe_ptr.block != en_r->stripe_ptr.block || ++ en_l->stripe_ptr.idx != en_r->stripe_ptr.idx) ++ return BCH_MERGE_NOMERGE; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ if (crc_l.csum_type != crc_r.csum_type || ++ crc_l.compression_type != crc_r.compression_type || ++ crc_l.nonce != crc_r.nonce) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.offset + crc_l.live_size != crc_l.compressed_size || ++ crc_r.offset) ++ return BCH_MERGE_NOMERGE; ++ ++ if (!bch2_checksum_mergeable(crc_l.csum_type)) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_is_compressed(crc_l)) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.csum_type && ++ crc_l.uncompressed_size + ++ crc_r.uncompressed_size > c->sb.encoded_extent_max) ++ return BCH_MERGE_NOMERGE; ++ ++ if (crc_l.uncompressed_size + crc_r.uncompressed_size > ++ bch2_crc_field_size_max[extent_entry_type(en_l)]) ++ return BCH_MERGE_NOMERGE; ++ ++ break; ++ default: ++ return BCH_MERGE_NOMERGE; ++ } ++ } ++ ++ extent_for_each_entry(l, en_l) { ++ struct bch_extent_crc_unpacked crc_l, crc_r; ++ ++ en_r = vstruct_idx(r.v, (u64 *) en_l - l.v->_data); ++ ++ if (!extent_entry_is_crc(en_l)) ++ continue; ++ ++ crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); ++ crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); ++ ++ crc_l.csum = bch2_checksum_merge(crc_l.csum_type, ++ crc_l.csum, ++ crc_r.csum, ++ crc_r.uncompressed_size << 9); ++ ++ 
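++		/* checksums merged above; sum the sizes and repack the combined entry over en_l */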
crc_l.uncompressed_size += crc_r.uncompressed_size; ++ crc_l.compressed_size += crc_r.compressed_size; ++ ++ bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, ++ extent_entry_type(en_l)); ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_reservation)) ++ return "incorrect value size"; ++ ++ if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) ++ return "invalid nr_replicas"; ++ ++ return NULL; ++} ++ ++void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); ++ ++ pr_buf(out, "generation %u replicas %u", ++ le32_to_cpu(r.v->generation), ++ r.v->nr_replicas); ++} ++ ++enum merge_result bch2_reservation_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_reservation l = bkey_s_to_reservation(_l); ++ struct bkey_s_reservation r = bkey_s_to_reservation(_r); ++ ++ if (l.v->generation != r.v->generation || ++ l.v->nr_replicas != r.v->nr_replicas) ++ return BCH_MERGE_NOMERGE; ++ ++ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { ++ bch2_key_resize(l.k, KEY_SIZE_MAX); ++ bch2_cut_front_s(l.k->p, r.s); ++ return BCH_MERGE_PARTIAL; ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* Extent checksum entries: */ ++ ++/* returns true if not equal */ ++static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, ++ struct bch_extent_crc_unpacked r) ++{ ++ return (l.csum_type != r.csum_type || ++ l.compression_type != r.compression_type || ++ l.compressed_size != r.compressed_size || ++ l.uncompressed_size != r.uncompressed_size || ++ l.offset != r.offset || ++ l.live_size != r.live_size || ++ l.nonce != r.nonce || ++ bch2_crc_cmp(l.csum, r.csum)); ++} ++ ++static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, ++ struct bch_extent_crc_unpacked n) ++{ ++ return !crc_is_compressed(u) && ++ u.csum_type && ++ u.uncompressed_size > u.live_size && ++ bch2_csum_type_is_encryption(u.csum_type) == ++ bch2_csum_type_is_encryption(n.csum_type); ++} ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, ++ struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ if (!n.csum_type) ++ return false; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (can_narrow_crc(crc, n)) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * We're writing another replica for this extent, so while we've got the data in ++ * memory we'll be computing a new checksum for the currently live data. 
++ * ++ * If there are other replicas we aren't moving, and they are checksummed but ++ * not compressed, we can modify them to point to only the data that is ++ * currently live (so that readers won't have to bounce) while we've got the ++ * checksum we need: ++ */ ++bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked u; ++ struct extent_ptr_decoded p; ++ union bch_extent_entry *i; ++ bool ret = false; ++ ++ /* Find a checksum entry that covers only live data: */ ++ if (!n.csum_type) { ++ bkey_for_each_crc(&k->k, ptrs, u, i) ++ if (!crc_is_compressed(u) && ++ u.csum_type && ++ u.live_size == u.uncompressed_size) { ++ n = u; ++ goto found; ++ } ++ return false; ++ } ++found: ++ BUG_ON(crc_is_compressed(n)); ++ BUG_ON(n.offset); ++ BUG_ON(n.live_size != k->k.size); ++ ++restart_narrow_pointers: ++ ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ ++ bkey_for_each_ptr_decode(&k->k, ptrs, p, i) ++ if (can_narrow_crc(p.crc, n)) { ++ bch2_bkey_drop_ptr(bkey_i_to_s(k), &i->ptr); ++ p.ptr.offset += p.crc.offset; ++ p.crc = n; ++ bch2_extent_ptr_decoded_append(k, &p); ++ ret = true; ++ goto restart_narrow_pointers; ++ } ++ ++ return ret; ++} ++ ++static void bch2_extent_crc_pack(union bch_extent_crc *dst, ++ struct bch_extent_crc_unpacked src, ++ enum bch_extent_entry_type type) ++{ ++#define set_common_fields(_dst, _src) \ ++ _dst.type = 1 << type; \ ++ _dst.csum_type = _src.csum_type, \ ++ _dst.compression_type = _src.compression_type, \ ++ _dst._compressed_size = _src.compressed_size - 1, \ ++ _dst._uncompressed_size = _src.uncompressed_size - 1, \ ++ _dst.offset = _src.offset ++ ++ switch (type) { ++ case BCH_EXTENT_ENTRY_crc32: ++ set_common_fields(dst->crc32, src); ++ dst->crc32.csum = *((__le32 *) &src.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ set_common_fields(dst->crc64, src); ++ dst->crc64.nonce = src.nonce; ++ dst->crc64.csum_lo = src.csum.lo; ++ dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ set_common_fields(dst->crc128, src); ++ dst->crc128.nonce = src.nonce; ++ dst->crc128.csum = src.csum; ++ break; ++ default: ++ BUG(); ++ } ++#undef set_common_fields ++} ++ ++void bch2_extent_crc_append(struct bkey_i *k, ++ struct bch_extent_crc_unpacked new) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ union bch_extent_crc *crc = (void *) ptrs.end; ++ enum bch_extent_entry_type type; ++ ++ if (bch_crc_bytes[new.csum_type] <= 4 && ++ new.uncompressed_size <= CRC32_SIZE_MAX && ++ new.nonce <= CRC32_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc32; ++ else if (bch_crc_bytes[new.csum_type] <= 10 && ++ new.uncompressed_size <= CRC64_SIZE_MAX && ++ new.nonce <= CRC64_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc64; ++ else if (bch_crc_bytes[new.csum_type] <= 16 && ++ new.uncompressed_size <= CRC128_SIZE_MAX && ++ new.nonce <= CRC128_NONCE_MAX) ++ type = BCH_EXTENT_ENTRY_crc128; ++ else ++ BUG(); ++ ++ bch2_extent_crc_pack(crc, new, type); ++ ++ k->k.u64s += extent_entry_u64s(ptrs.end); ++ ++ EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); ++} ++ ++/* Generic code for keys with pointers: */ ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) ++{ ++ return bch2_bkey_devs(k).nr; ++} ++ ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) ++{ ++ return k.k->type == KEY_TYPE_reservation ++ ? 
bkey_s_c_to_reservation(k).v->nr_replicas ++ : bch2_bkey_dirty_devs(k).nr; ++} ++ ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) ++{ ++ unsigned ret = 0; ++ ++ if (k.k->type == KEY_TYPE_reservation) { ++ ret = bkey_s_c_to_reservation(k).v->nr_replicas; ++ } else { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ ret += !p.ptr.cached && !crc_is_compressed(p.crc); ++ } ++ ++ return ret; ++} ++ ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned ret = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && crc_is_compressed(p.crc)) ++ ret += p.crc.compressed_size; ++ ++ return ret; ++} ++ ++bool bch2_bkey_is_incompressible(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, entry) ++ if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) ++ return true; ++ return false; ++} ++ ++bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size, ++ unsigned nr_replicas) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bpos end = pos; ++ struct bkey_s_c k; ++ bool ret = true; ++ int err; ++ ++ end.offset += size; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, pos, ++ BTREE_ITER_SLOTS, k, err) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (nr_replicas > bch2_bkey_nr_ptrs_fully_allocated(k)) { ++ ret = false; ++ break; ++ } ++ } ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++static unsigned bch2_extent_ptr_durability(struct bch_fs *c, ++ struct extent_ptr_decoded p) ++{ ++ unsigned durability = 0; ++ struct bch_dev *ca; ++ ++ if (p.ptr.cached) ++ return 0; ++ ++ ca = bch_dev_bkey_exists(c, p.ptr.dev); ++ ++ if (ca->mi.state != BCH_MEMBER_STATE_FAILED) ++ durability = max_t(unsigned, durability, ca->mi.durability); ++ ++ if (p.has_ec) { ++ struct stripe *s = ++ genradix_ptr(&c->stripes[0], p.ec.idx); ++ ++ if (WARN_ON(!s)) ++ goto out; ++ ++ durability += s->nr_redundant; ++ } ++out: ++ return durability; ++} ++ ++unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned durability = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ durability += bch2_extent_ptr_durability(c, p); ++ ++ return durability; ++} ++ ++void bch2_bkey_mark_replicas_cached(struct bch_fs *c, struct bkey_s k, ++ unsigned target, ++ unsigned nr_desired_replicas) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int extra = bch2_bkey_durability(c, k.s_c) - nr_desired_replicas; ++ ++ if (target && extra > 0) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int n = bch2_extent_ptr_durability(c, p); ++ ++ if (n && n <= extra && ++ !bch2_dev_in_target(c, p.ptr.dev, target)) { ++ entry->ptr.cached = true; ++ extra -= n; ++ } ++ } ++ ++ if (extra > 0) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int n = bch2_extent_ptr_durability(c, p); ++ ++ if (n && n <= extra) { ++ entry->ptr.cached = true; ++ extra -= n; ++ } ++ } ++} ++ ++void 
bch2_bkey_append_ptr(struct bkey_i *k, ++ struct bch_extent_ptr ptr) ++{ ++ EBUG_ON(bch2_bkey_has_device(bkey_i_to_s_c(k), ptr.dev)); ++ ++ switch (k->k.type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ case KEY_TYPE_extent: ++ EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); ++ ++ ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ ++ memcpy((void *) &k->v + bkey_val_bytes(&k->k), ++ &ptr, ++ sizeof(ptr)); ++ k->u64s++; ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void __extent_entry_insert(struct bkey_i *k, ++ union bch_extent_entry *dst, ++ union bch_extent_entry *new) ++{ ++ union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); ++ ++ memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), ++ dst, (u64 *) end - (u64 *) dst); ++ k->k.u64s += extent_entry_u64s(new); ++ memcpy(dst, new, extent_entry_bytes(new)); ++} ++ ++void bch2_extent_ptr_decoded_append(struct bkey_i *k, ++ struct extent_ptr_decoded *p) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); ++ struct bch_extent_crc_unpacked crc = ++ bch2_extent_crc_unpack(&k->k, NULL); ++ union bch_extent_entry *pos; ++ ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = ptrs.start; ++ goto found; ++ } ++ ++ bkey_for_each_crc(&k->k, ptrs, crc, pos) ++ if (!bch2_crc_unpacked_cmp(crc, p->crc)) { ++ pos = extent_entry_next(pos); ++ goto found; ++ } ++ ++ bch2_extent_crc_append(k, p->crc); ++ pos = bkey_val_end(bkey_i_to_s(k)); ++found: ++ p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ptr)); ++ ++ if (p->has_ec) { ++ p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; ++ __extent_entry_insert(k, pos, to_entry(&p->ec)); ++ } ++} ++ ++static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, ++ union bch_extent_entry *entry) ++{ ++ union bch_extent_entry *i = ptrs.start; ++ ++ if (i == entry) ++ return NULL; ++ ++ while (extent_entry_next(i) != entry) ++ i = extent_entry_next(i); ++ return i; ++} ++ ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, ++ struct bch_extent_ptr *ptr) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *dst, *src, *prev; ++ bool drop_crc = true; ++ ++ EBUG_ON(ptr < &ptrs.start->ptr || ++ ptr >= &ptrs.end->ptr); ++ EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); ++ ++ src = extent_entry_next(to_entry(ptr)); ++ if (src != ptrs.end && ++ !extent_entry_is_crc(src)) ++ drop_crc = false; ++ ++ dst = to_entry(ptr); ++ while ((prev = extent_entry_prev(ptrs, dst))) { ++ if (extent_entry_is_ptr(prev)) ++ break; ++ ++ if (extent_entry_is_crc(prev)) { ++ if (drop_crc) ++ dst = prev; ++ break; ++ } ++ ++ dst = prev; ++ } ++ ++ memmove_u64s_down(dst, src, ++ (u64 *) ptrs.end - (u64 *) src); ++ k.k->u64s -= (u64 *) src - (u64 *) dst; ++ ++ return dst; ++} ++ ++void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); ++} ++ ++const struct bch_extent_ptr * ++bch2_bkey_has_device(struct bkey_s_c k, unsigned dev) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (ptr->dev == dev) ++ return ptr; ++ ++ return NULL; ++} ++ ++bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) ++ if (bch2_dev_in_target(c, ptr->dev, target) && ++ (!ptr->cached || ++ 
!ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) ++ return true; ++ ++ return false; ++} ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, ++ struct bch_extent_ptr m, u64 offset) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (p.ptr.dev == m.dev && ++ p.ptr.gen == m.gen && ++ (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == ++ (s64) m.offset - offset) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * bch_extent_normalize - clean up an extent, dropping stale pointers etc. ++ * ++ * Returns true if @k should be dropped entirely ++ * ++ * For existing keys, only called when btree nodes are being rewritten, not when ++ * they're merely being compacted/resorted in memory. ++ */ ++bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) ++{ ++ struct bch_extent_ptr *ptr; ++ ++ bch2_bkey_drop_ptrs(k, ptr, ++ ptr->cached && ++ ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); ++ ++ /* will only happen if all pointers were cached: */ ++ if (!bch2_bkey_nr_ptrs(k.s_c)) ++ k.k->type = KEY_TYPE_discard; ++ ++ return bkey_whiteout(k.k); ++} ++ ++void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ const struct bch_extent_ptr *ptr; ++ const struct bch_extent_stripe_ptr *ec; ++ struct bch_dev *ca; ++ bool first = true; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (!first) ++ pr_buf(out, " "); ++ ++ switch (__extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ ptr = entry_to_ptr(entry); ++ ca = ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] ++ ? bch_dev_bkey_exists(c, ptr->dev) ++ : NULL; ++ ++ pr_buf(out, "ptr: %u:%llu gen %u%s%s", ptr->dev, ++ (u64) ptr->offset, ptr->gen, ++ ptr->cached ? " cached" : "", ++ ca && ptr_stale(ca, ptr) ++ ? 
" stale" : ""); ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ pr_buf(out, "crc: c_size %u size %u offset %u nonce %u csum %u compress %u", ++ crc.compressed_size, ++ crc.uncompressed_size, ++ crc.offset, crc.nonce, ++ crc.csum_type, ++ crc.compression_type); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ ec = &entry->stripe_ptr; ++ ++ pr_buf(out, "ec: idx %llu block %u", ++ (u64) ec->idx, ec->block); ++ break; ++ default: ++ pr_buf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); ++ return; ++ } ++ ++ first = false; ++ } ++} ++ ++static const char *extent_ptr_invalid(const struct bch_fs *c, ++ struct bkey_s_c k, ++ const struct bch_extent_ptr *ptr, ++ unsigned size_ondisk, ++ bool metadata) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr2; ++ struct bch_dev *ca; ++ ++ if (!bch2_dev_exists2(c, ptr->dev)) ++ return "pointer to invalid device"; ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!ca) ++ return "pointer to invalid device"; ++ ++ bkey_for_each_ptr(ptrs, ptr2) ++ if (ptr != ptr2 && ptr->dev == ptr2->dev) ++ return "multiple pointers to same device"; ++ ++ if (ptr->offset + size_ondisk > bucket_to_sector(ca, ca->mi.nbuckets)) ++ return "offset past end of device"; ++ ++ if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) ++ return "offset before first bucket"; ++ ++ if (bucket_remainder(ca, ptr->offset) + ++ size_ondisk > ca->mi.bucket_size) ++ return "spans multiple buckets"; ++ ++ return NULL; ++} ++ ++const char *bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct bch_extent_crc_unpacked crc; ++ unsigned size_ondisk = k.k->size; ++ const char *reason; ++ unsigned nonce = UINT_MAX; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr) ++ size_ondisk = c->opts.btree_node_size; ++ if (k.k->type == KEY_TYPE_btree_ptr_v2) ++ size_ondisk = le16_to_cpu(bkey_s_c_to_btree_ptr_v2(k).v->sectors); ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) ++ return "invalid extent entry type"; ++ ++ if (k.k->type == KEY_TYPE_btree_ptr && ++ !extent_entry_is_ptr(entry)) ++ return "has non ptr field"; ++ ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ reason = extent_ptr_invalid(c, k, &entry->ptr, ++ size_ondisk, false); ++ if (reason) ++ return reason; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); ++ ++ if (crc.offset + crc.live_size > ++ crc.uncompressed_size) ++ return "checksum offset + key size > uncompressed size"; ++ ++ size_ondisk = crc.compressed_size; ++ ++ if (!bch2_checksum_type_valid(c, crc.csum_type)) ++ return "invalid checksum type"; ++ ++ if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) ++ return "invalid compression type"; ++ ++ if (bch2_csum_type_is_encryption(crc.csum_type)) { ++ if (nonce == UINT_MAX) ++ nonce = crc.offset + crc.nonce; ++ else if (nonce != crc.offset + crc.nonce) ++ return "incorrect nonce"; ++ } ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++ ++ return NULL; ++} ++ ++void bch2_ptr_swab(struct bkey_s k) ++{ ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ u64 *d; ++ ++ for (d = (u64 *) ptrs.start; ++ d != (u64 *) 
ptrs.end; ++ d++) ++ *d = swab64(*d); ++ ++ for (entry = ptrs.start; ++ entry < ptrs.end; ++ entry = extent_entry_next(entry)) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.csum = swab32(entry->crc32.csum); ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); ++ entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.csum.hi = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.hi); ++ entry->crc128.csum.lo = (__force __le64) ++ swab64((__force u64) entry->crc128.csum.lo); ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ } ++} ++ ++/* Generic extent code: */ ++ ++int bch2_cut_front_s(struct bpos where, struct bkey_s k) ++{ ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 sub; ++ ++ if (bkey_cmp(where, bkey_start_pos(k.k)) <= 0) ++ return 0; ++ ++ EBUG_ON(bkey_cmp(where, k.k->p) > 0); ++ ++ sub = where.offset - bkey_start_offset(k.k); ++ ++ k.k->size -= sub; ++ ++ if (!k.k->size) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: { ++ struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); ++ union bch_extent_entry *entry; ++ bool seen_crc = false; ++ ++ bkey_extent_entry_for_each(ptrs, entry) { ++ switch (extent_entry_type(entry)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ if (!seen_crc) ++ entry->ptr.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc32: ++ entry->crc32.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc64: ++ entry->crc64.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_crc128: ++ entry->crc128.offset += sub; ++ break; ++ case BCH_EXTENT_ENTRY_stripe_ptr: ++ break; ++ } ++ ++ if (extent_entry_is_crc(entry)) ++ seen_crc = true; ++ } ++ ++ break; ++ } ++ case KEY_TYPE_reflink_p: { ++ struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); ++ ++ le64_add_cpu(&p.v->idx, sub); ++ break; ++ } ++ case KEY_TYPE_inline_data: { ++ struct bkey_s_inline_data d = bkey_s_to_inline_data(k); ++ ++ sub = min_t(u64, sub << 9, bkey_val_bytes(d.k)); ++ ++ memmove(d.v->data, ++ d.v->data + sub, ++ bkey_val_bytes(d.k) - sub); ++ ++ new_val_u64s -= sub >> 3; ++ break; ++ } ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; ++} ++ ++int bch2_cut_back_s(struct bpos where, struct bkey_s k) ++{ ++ unsigned new_val_u64s = bkey_val_u64s(k.k); ++ int val_u64s_delta; ++ u64 len = 0; ++ ++ if (bkey_cmp(where, k.k->p) >= 0) ++ return 0; ++ ++ EBUG_ON(bkey_cmp(where, bkey_start_pos(k.k)) < 0); ++ ++ len = where.offset - bkey_start_offset(k.k); ++ ++ k.k->p = where; ++ k.k->size = len; ++ ++ if (!len) { ++ k.k->type = KEY_TYPE_deleted; ++ new_val_u64s = 0; ++ } ++ ++ switch (k.k->type) { ++ case KEY_TYPE_inline_data: ++ new_val_u64s = min(new_val_u64s, k.k->size << 6); ++ break; ++ } ++ ++ val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; ++ BUG_ON(val_u64s_delta < 0); ++ ++ set_bkey_val_u64s(k.k, new_val_u64s); ++ memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); ++ return -val_u64s_delta; ++} +diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h +new file mode 100644 +index 000000000000..29b15365d19c +--- /dev/null ++++ b/fs/bcachefs/extents.h +@@ -0,0 +1,603 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef 
_BCACHEFS_EXTENTS_H ++#define _BCACHEFS_EXTENTS_H ++ ++#include "bcachefs.h" ++#include "bkey.h" ++#include "extents_types.h" ++ ++struct bch_fs; ++struct btree_trans; ++ ++/* extent entries: */ ++ ++#define extent_entry_last(_e) \ ++ ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) ++ ++#define entry_to_ptr(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ ++ \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const struct bch_extent_ptr *) (_entry), \ ++ (struct bch_extent_ptr *) (_entry)); \ ++}) ++ ++/* downcast, preserves const */ ++#define to_entry(_entry) \ ++({ \ ++ BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ ++ !type_is(_entry, struct bch_extent_ptr *) && \ ++ !type_is(_entry, struct bch_extent_stripe_ptr *)); \ ++ \ ++ __builtin_choose_expr( \ ++ (type_is_exact(_entry, const union bch_extent_crc *) || \ ++ type_is_exact(_entry, const struct bch_extent_ptr *) ||\ ++ type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ ++ (const union bch_extent_entry *) (_entry), \ ++ (union bch_extent_entry *) (_entry)); \ ++}) ++ ++#define extent_entry_next(_entry) \ ++ ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) ++ ++static inline unsigned ++__extent_entry_type(const union bch_extent_entry *e) ++{ ++ return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; ++} ++ ++static inline enum bch_extent_entry_type ++extent_entry_type(const union bch_extent_entry *e) ++{ ++ int ret = __ffs(e->type); ++ ++ EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); ++ ++ return ret; ++} ++ ++static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) ++{ ++ switch (extent_entry_type(entry)) { ++#define x(f, n) \ ++ case BCH_EXTENT_ENTRY_##f: \ ++ return sizeof(struct bch_extent_##f); ++ BCH_EXTENT_ENTRY_TYPES() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) ++{ ++ return extent_entry_bytes(entry) / sizeof(u64); ++} ++ ++static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) ++{ ++ switch (extent_entry_type(e)) { ++ case BCH_EXTENT_ENTRY_ptr: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool extent_entry_is_crc(const union bch_extent_entry *e) ++{ ++ switch (extent_entry_type(e)) { ++ case BCH_EXTENT_ENTRY_crc32: ++ case BCH_EXTENT_ENTRY_crc64: ++ case BCH_EXTENT_ENTRY_crc128: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++union bch_extent_crc { ++ u8 type; ++ struct bch_extent_crc32 crc32; ++ struct bch_extent_crc64 crc64; ++ struct bch_extent_crc128 crc128; ++}; ++ ++#define __entry_to_crc(_entry) \ ++ __builtin_choose_expr( \ ++ type_is_exact(_entry, const union bch_extent_entry *), \ ++ (const union bch_extent_crc *) (_entry), \ ++ (union bch_extent_crc *) (_entry)) ++ ++#define entry_to_crc(_entry) \ ++({ \ ++ EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ ++ \ ++ __entry_to_crc(_entry); \ ++}) ++ ++static inline struct bch_extent_crc_unpacked ++bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) ++{ ++#define common_fields(_crc) \ ++ .csum_type = _crc.csum_type, \ ++ .compression_type = _crc.compression_type, \ ++ .compressed_size = _crc._compressed_size + 1, \ ++ .uncompressed_size = _crc._uncompressed_size + 1, \ ++ .offset = _crc.offset, \ ++ .live_size = k->size ++ ++ if (!crc) ++ return (struct bch_extent_crc_unpacked) { ++ .compressed_size = k->size, ++ .uncompressed_size = k->size, ++ .live_size = k->size, ++ }; ++ 
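++	/* unpack whichever on-disk crc variant this is (crc32/crc64/crc128) into the common fields */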
++ switch (extent_entry_type(to_entry(crc))) { ++ case BCH_EXTENT_ENTRY_crc32: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc32), ++ }; ++ ++ *((__le32 *) &ret.csum.lo) = crc->crc32.csum; ++ ++ memcpy(&ret.csum.lo, &crc->crc32.csum, ++ sizeof(crc->crc32.csum)); ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc64: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc64), ++ .nonce = crc->crc64.nonce, ++ .csum.lo = (__force __le64) crc->crc64.csum_lo, ++ }; ++ ++ *((__le16 *) &ret.csum.hi) = crc->crc64.csum_hi; ++ ++ return ret; ++ } ++ case BCH_EXTENT_ENTRY_crc128: { ++ struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { ++ common_fields(crc->crc128), ++ .nonce = crc->crc128.nonce, ++ .csum = crc->crc128.csum, ++ }; ++ ++ return ret; ++ } ++ default: ++ BUG(); ++ } ++#undef common_fields ++} ++ ++static inline bool crc_is_compressed(struct bch_extent_crc_unpacked crc) ++{ ++ return (crc.compression_type != BCH_COMPRESSION_TYPE_none && ++ crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); ++} ++ ++/* bkey_ptrs: generically over any key type that has ptrs */ ++ ++struct bkey_ptrs_c { ++ const union bch_extent_entry *start; ++ const union bch_extent_entry *end; ++}; ++ ++struct bkey_ptrs { ++ union bch_extent_entry *start; ++ union bch_extent_entry *end; ++}; ++ ++static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: { ++ struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ case KEY_TYPE_extent: { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ return (struct bkey_ptrs_c) { ++ e.v->start, ++ extent_entry_last(e) ++ }; ++ } ++ case KEY_TYPE_stripe: { ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&s.v->ptrs[0]), ++ to_entry(&s.v->ptrs[s.v->nr_blocks]), ++ }; ++ } ++ case KEY_TYPE_reflink_v: { ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ return (struct bkey_ptrs_c) { ++ r.v->start, ++ bkey_val_end(r), ++ }; ++ } ++ case KEY_TYPE_btree_ptr_v2: { ++ struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); ++ return (struct bkey_ptrs_c) { ++ to_entry(&e.v->start[0]), ++ to_entry(extent_entry_last(e)) ++ }; ++ } ++ default: ++ return (struct bkey_ptrs_c) { NULL, NULL }; ++ } ++} ++ ++static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ++{ ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); ++ ++ return (struct bkey_ptrs) { ++ (void *) p.start, ++ (void *) p.end ++ }; ++} ++ ++#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ ++ for ((_entry) = (_start); \ ++ (_entry) < (_end); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define __bkey_ptr_next(_ptr, _end) \ ++({ \ ++ typeof(_end) _entry; \ ++ \ ++ __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ ++ if (extent_entry_is_ptr(_entry)) \ ++ break; \ ++ \ ++ _entry < (_end) ? 
entry_to_ptr(_entry) : NULL; \ ++}) ++ ++#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) ++ ++#define bkey_extent_entry_for_each(_p, _entry) \ ++ bkey_extent_entry_for_each_from(_p, _entry, _p.start) ++ ++#define __bkey_for_each_ptr(_start, _end, _ptr) \ ++ for ((_ptr) = (_start); \ ++ ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ ++ (_ptr)++) ++ ++#define bkey_ptr_next(_p, _ptr) \ ++ __bkey_ptr_next(_ptr, (_p).end) ++ ++#define bkey_for_each_ptr(_p, _ptr) \ ++ __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) ++ ++#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ ++({ \ ++ __label__ out; \ ++ \ ++ (_ptr).idx = 0; \ ++ (_ptr).has_ec = false; \ ++ \ ++ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ ++ switch (extent_entry_type(_entry)) { \ ++ case BCH_EXTENT_ENTRY_ptr: \ ++ (_ptr).ptr = _entry->ptr; \ ++ goto out; \ ++ case BCH_EXTENT_ENTRY_crc32: \ ++ case BCH_EXTENT_ENTRY_crc64: \ ++ case BCH_EXTENT_ENTRY_crc128: \ ++ (_ptr).crc = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_entry)); \ ++ break; \ ++ case BCH_EXTENT_ENTRY_stripe_ptr: \ ++ (_ptr).ec = _entry->stripe_ptr; \ ++ (_ptr).has_ec = true; \ ++ break; \ ++ } \ ++out: \ ++ _entry < (_end); \ ++}) ++ ++#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ ++ for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ ++ (_entry) = _start; \ ++ __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ ++ (_entry) = extent_entry_next(_entry)) ++ ++#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ ++ _ptr, _entry) ++ ++#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ ++({ \ ++ __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ ++ if (extent_entry_is_crc(_iter)) { \ ++ (_crc) = bch2_extent_crc_unpack(_k, \ ++ entry_to_crc(_iter)); \ ++ break; \ ++ } \ ++ \ ++ (_iter) < (_end); \ ++}) ++ ++#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ ++ for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ ++ (_iter) = (_start); \ ++ bkey_crc_next(_k, _start, _end, _crc, _iter); \ ++ (_iter) = extent_entry_next(_iter)) ++ ++#define bkey_for_each_crc(_k, _p, _crc, _iter) \ ++ __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) ++ ++/* Iterate over pointers in KEY_TYPE_extent: */ ++ ++#define extent_for_each_entry_from(_e, _entry, _start) \ ++ __bkey_extent_entry_for_each_from(_start, \ ++ extent_entry_last(_e),_entry) ++ ++#define extent_for_each_entry(_e, _entry) \ ++ extent_for_each_entry_from(_e, _entry, (_e).v->start) ++ ++#define extent_ptr_next(_e, _ptr) \ ++ __bkey_ptr_next(_ptr, extent_entry_last(_e)) ++ ++#define extent_for_each_ptr(_e, _ptr) \ ++ __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) ++ ++#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ ++ __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ ++ extent_entry_last(_e), _ptr, _entry) ++ ++/* utility code common to all keys with pointers: */ ++ ++void bch2_mark_io_failure(struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_failures *, ++ struct extent_ptr_decoded *); ++ ++/* KEY_TYPE_btree_ptr: */ ++ ++const char *bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_btree_ptr_debugcheck(struct bch_fs *, struct bkey_s_c); ++void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++void bch2_btree_ptr_v2_to_text(struct printbuf *, 
struct bch_fs *, ++ struct bkey_s_c); ++void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, ++ int, struct bkey_s); ++ ++#define bch2_bkey_ops_btree_ptr (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_invalid, \ ++ .key_debugcheck = bch2_btree_ptr_debugcheck, \ ++ .val_to_text = bch2_btree_ptr_to_text, \ ++ .swab = bch2_ptr_swab, \ ++} ++ ++#define bch2_bkey_ops_btree_ptr_v2 (struct bkey_ops) { \ ++ .key_invalid = bch2_btree_ptr_invalid, \ ++ .key_debugcheck = bch2_btree_ptr_debugcheck, \ ++ .val_to_text = bch2_btree_ptr_v2_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .compat = bch2_btree_ptr_v2_compat, \ ++} ++ ++/* KEY_TYPE_extent: */ ++ ++const char *bch2_extent_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_extent_debugcheck(struct bch_fs *, struct bkey_s_c); ++void bch2_extent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++enum merge_result bch2_extent_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_extent (struct bkey_ops) { \ ++ .key_invalid = bch2_extent_invalid, \ ++ .key_debugcheck = bch2_extent_debugcheck, \ ++ .val_to_text = bch2_extent_to_text, \ ++ .swab = bch2_ptr_swab, \ ++ .key_normalize = bch2_extent_normalize, \ ++ .key_merge = bch2_extent_merge, \ ++} ++ ++/* KEY_TYPE_reservation: */ ++ ++const char *bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++enum merge_result bch2_reservation_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_reservation (struct bkey_ops) { \ ++ .key_invalid = bch2_reservation_invalid, \ ++ .val_to_text = bch2_reservation_to_text, \ ++ .key_merge = bch2_reservation_merge, \ ++} ++ ++/* Extent checksum entries: */ ++ ++bool bch2_can_narrow_extent_crcs(struct bkey_s_c, ++ struct bch_extent_crc_unpacked); ++bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); ++void bch2_extent_crc_append(struct bkey_i *, ++ struct bch_extent_crc_unpacked); ++ ++/* Generic code for keys with pointers: */ ++ ++static inline bool bkey_extent_is_direct_data(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline bool bkey_extent_is_data(const struct bkey *k) ++{ ++ return bkey_extent_is_direct_data(k) || ++ k->type == KEY_TYPE_inline_data || ++ k->type == KEY_TYPE_reflink_p; ++} ++ ++/* ++ * Should extent be counted under inode->i_sectors? 
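++ * (extents, reservations, reflink pointers and inline data all do)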
++ */ ++static inline bool bkey_extent_is_allocation(const struct bkey *k) ++{ ++ switch (k->type) { ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reservation: ++ case KEY_TYPE_reflink_p: ++ case KEY_TYPE_reflink_v: ++ case KEY_TYPE_inline_data: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (!ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) ++{ ++ struct bch_devs_list ret = (struct bch_devs_list) { 0 }; ++ struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(p, ptr) ++ if (ptr->cached) ++ ret.devs[ret.nr++] = ptr->dev; ++ ++ return ret; ++} ++ ++unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); ++unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); ++bool bch2_bkey_is_incompressible(struct bkey_s_c); ++unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); ++bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned); ++unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); ++ ++void bch2_bkey_mark_replicas_cached(struct bch_fs *, struct bkey_s, ++ unsigned, unsigned); ++ ++void bch2_bkey_append_ptr(struct bkey_i *, struct bch_extent_ptr); ++void bch2_extent_ptr_decoded_append(struct bkey_i *, ++ struct extent_ptr_decoded *); ++union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, ++ struct bch_extent_ptr *); ++ ++#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ ++do { \ ++ struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ ++ \ ++ _ptr = &_ptrs.start->ptr; \ ++ \ ++ while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ ++ if (_cond) { \ ++ _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ ++ _ptrs = bch2_bkey_ptrs(_k); \ ++ continue; \ ++ } \ ++ \ ++ (_ptr)++; \ ++ } \ ++} while (0) ++ ++void bch2_bkey_drop_device(struct bkey_s, unsigned); ++const struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s_c, unsigned); ++bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); ++ ++bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, ++ struct bch_extent_ptr, u64); ++ ++bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); ++void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++const char *bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c); ++ ++void bch2_ptr_swab(struct bkey_s); ++ ++/* Generic extent code: */ ++ ++int bch2_cut_front_s(struct bpos, struct bkey_s); ++int bch2_cut_back_s(struct bpos, struct bkey_s); ++ ++static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_front_s(where, bkey_i_to_s(k)); ++} ++ ++static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) ++{ ++ bch2_cut_back_s(where, bkey_i_to_s(k)); ++} ++ ++/** ++ * bch_key_resize - adjust size of @k ++ * ++ * bkey_start_offset(k) will be preserved, modifies where the extent ends ++ */ ++static inline void 
bch2_key_resize(struct bkey *k, unsigned new_size) ++{ ++ k->p.offset -= k->size; ++ k->p.offset += new_size; ++ k->size = new_size; ++} ++ ++/* ++ * In extent_sort_fix_overlapping(), insert_fixup_extent(), ++ * extent_merge_inline() - we're modifying keys in place that are packed. To do ++ * that we have to unpack the key, modify the unpacked key - then this ++ * copies/repacks the unpacked to the original as necessary. ++ */ ++static inline void extent_save(struct btree *b, struct bkey_packed *dst, ++ struct bkey *src) ++{ ++ struct bkey_format *f = &b->format; ++ struct bkey_i *dst_unpacked; ++ ++ if ((dst_unpacked = packed_to_bkey(dst))) ++ dst_unpacked->k = *src; ++ else ++ BUG_ON(!bch2_bkey_pack_key(dst, src, f)); ++} ++ ++#endif /* _BCACHEFS_EXTENTS_H */ +diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h +new file mode 100644 +index 000000000000..43d6c341ecca +--- /dev/null ++++ b/fs/bcachefs/extents_types.h +@@ -0,0 +1,40 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_EXTENTS_TYPES_H ++#define _BCACHEFS_EXTENTS_TYPES_H ++ ++#include "bcachefs_format.h" ++ ++struct bch_extent_crc_unpacked { ++ u32 compressed_size; ++ u32 uncompressed_size; ++ u32 live_size; ++ ++ u8 csum_type; ++ u8 compression_type; ++ ++ u16 offset; ++ ++ u16 nonce; ++ ++ struct bch_csum csum; ++}; ++ ++struct extent_ptr_decoded { ++ unsigned idx; ++ bool has_ec; ++ struct bch_extent_crc_unpacked crc; ++ struct bch_extent_ptr ptr; ++ struct bch_extent_stripe_ptr ec; ++}; ++ ++struct bch_io_failures { ++ u8 nr; ++ struct bch_dev_io_failures { ++ u8 dev; ++ u8 idx; ++ u8 nr_failed; ++ u8 nr_retries; ++ } devs[BCH_REPLICAS_MAX]; ++}; ++ ++#endif /* _BCACHEFS_EXTENTS_TYPES_H */ +diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h +new file mode 100644 +index 000000000000..26d5cad7e6a5 +--- /dev/null ++++ b/fs/bcachefs/eytzinger.h +@@ -0,0 +1,285 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _EYTZINGER_H ++#define _EYTZINGER_H ++ ++#include ++#include ++ ++#include "util.h" ++ ++/* ++ * Traversal for trees in eytzinger layout - a full binary tree layed out in an ++ * array ++ */ ++ ++/* ++ * One based indexing version: ++ * ++ * With one based indexing each level of the tree starts at a power of two - ++ * good for cacheline alignment: ++ * ++ * Size parameter is treated as if we were using 0 based indexing, however: ++ * valid nodes, and inorder indices, are in the range [1..size) - that is, there ++ * are actually size - 1 elements ++ */ ++ ++static inline unsigned eytzinger1_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + child; ++} ++ ++static inline unsigned eytzinger1_left_child(unsigned i) ++{ ++ return eytzinger1_child(i, 0); ++} ++ ++static inline unsigned eytzinger1_right_child(unsigned i) ++{ ++ return eytzinger1_child(i, 1); ++} ++ ++static inline unsigned eytzinger1_first(unsigned size) ++{ ++ return rounddown_pow_of_two(size - 1); ++} ++ ++static inline unsigned eytzinger1_last(unsigned size) ++{ ++ return rounddown_pow_of_two(size) - 1; ++} ++ ++/* ++ * eytzinger1_next() and eytzinger1_prev() have the nice properties that ++ * ++ * eytzinger1_next(0) == eytzinger1_first()) ++ * eytzinger1_prev(0) == eytzinger1_last()) ++ * ++ * eytzinger1_prev(eytzinger1_first()) == 0 ++ * eytzinger1_next(eytzinger1_last()) == 0 ++ */ ++ ++static inline unsigned eytzinger1_next(unsigned i, unsigned size) ++{ ++ EBUG_ON(i >= size); ++ ++ if (eytzinger1_right_child(i) < size) { ++ i = eytzinger1_right_child(i); ++ ++ i <<= 
__fls(size) - __fls(i); ++ i >>= i >= size; ++ } else { ++ i >>= ffz(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_prev(unsigned i, unsigned size) ++{ ++ EBUG_ON(i >= size); ++ ++ if (eytzinger1_left_child(i) < size) { ++ i = eytzinger1_left_child(i) + 1; ++ ++ i <<= __fls(size) - __fls(i); ++ i -= 1; ++ i >>= i >= size; ++ } else { ++ i >>= __ffs(i) + 1; ++ } ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_extra(unsigned size) ++{ ++ return (size - rounddown_pow_of_two(size - 1)) << 1; ++} ++ ++static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ unsigned b = __fls(i); ++ unsigned shift = __fls(size - 1) - b; ++ int s; ++ ++ EBUG_ON(!i || i >= size); ++ ++ i ^= 1U << b; ++ i <<= 1; ++ i |= 1; ++ i <<= shift; ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i -= (i - extra) >> 1; ++ */ ++ s = extra - i; ++ i += (s >> 1) & (s >> 31); ++ ++ return i; ++} ++ ++static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ unsigned shift; ++ int s; ++ ++ EBUG_ON(!i || i >= size); ++ ++ /* ++ * sign bit trick: ++ * ++ * if (i > extra) ++ * i += i - extra; ++ */ ++ s = extra - i; ++ i -= s & (s >> 31); ++ ++ shift = __ffs(i); ++ ++ i >>= shift + 1; ++ i |= 1U << (__fls(size - 1) - shift); ++ ++ return i; ++} ++ ++static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); ++} ++ ++#define eytzinger1_for_each(_i, _size) \ ++ for ((_i) = eytzinger1_first((_size)); \ ++ (_i) != 0; \ ++ (_i) = eytzinger1_next((_i), (_size))) ++ ++/* Zero based indexing version: */ ++ ++static inline unsigned eytzinger0_child(unsigned i, unsigned child) ++{ ++ EBUG_ON(child > 1); ++ ++ return (i << 1) + 1 + child; ++} ++ ++static inline unsigned eytzinger0_left_child(unsigned i) ++{ ++ return eytzinger0_child(i, 0); ++} ++ ++static inline unsigned eytzinger0_right_child(unsigned i) ++{ ++ return eytzinger0_child(i, 1); ++} ++ ++static inline unsigned eytzinger0_first(unsigned size) ++{ ++ return eytzinger1_first(size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_last(unsigned size) ++{ ++ return eytzinger1_last(size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_next(unsigned i, unsigned size) ++{ ++ return eytzinger1_next(i + 1, size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_prev(unsigned i, unsigned size) ++{ ++ return eytzinger1_prev(i + 1, size + 1) - 1; ++} ++ ++static inline unsigned eytzinger0_extra(unsigned size) ++{ ++ return eytzinger1_extra(size + 1); ++} ++ ++static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __eytzinger1_to_inorder(i + 1, size + 1, extra) - 1; ++} ++ ++static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, ++ unsigned extra) ++{ ++ return __inorder_to_eytzinger1(i + 1, size + 1, extra) - 1; ++} ++ ++static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) ++{ ++ return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); ++} ++ ++static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) ++{ ++ return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); ++} ++ ++#define eytzinger0_for_each(_i, _size) \ ++ for ((_i) = eytzinger0_first((_size)); \ ++ (_i) != -1; \ ++ (_i) = 
eytzinger0_next((_i), (_size))) ++ ++typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); ++ ++/* return greatest node <= @search, or -1 if not found */ ++static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, ++ eytzinger_cmp_fn cmp, const void *search) ++{ ++ unsigned i, n = 0; ++ ++ if (!nr) ++ return -1; ++ ++ do { ++ i = n; ++ n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); ++ } while (n < nr); ++ ++ if (n & 1) { ++ /* @i was greater than @search, return previous node: */ ++ ++ if (i == eytzinger0_first(nr)) ++ return -1; ++ ++ return eytzinger0_prev(i, nr); ++ } else { ++ return i; ++ } ++} ++ ++#define eytzinger0_find(base, nr, size, _cmp, search) \ ++({ \ ++ void *_base = (base); \ ++ void *_search = (search); \ ++ size_t _nr = (nr); \ ++ size_t _size = (size); \ ++ size_t _i = 0; \ ++ int _res; \ ++ \ ++ while (_i < _nr && \ ++ (_res = _cmp(_search, _base + _i * _size, _size))) \ ++ _i = eytzinger0_child(_i, _res > 0); \ ++ _i; \ ++}) ++ ++void eytzinger0_sort(void *, size_t, size_t, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++#endif /* _EYTZINGER_H */ +diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h +new file mode 100644 +index 000000000000..cdb272708a4b +--- /dev/null ++++ b/fs/bcachefs/fifo.h +@@ -0,0 +1,127 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FIFO_H ++#define _BCACHEFS_FIFO_H ++ ++#include "util.h" ++ ++#define FIFO(type) \ ++struct { \ ++ size_t front, back, size, mask; \ ++ type *data; \ ++} ++ ++#define DECLARE_FIFO(type, name) FIFO(type) name ++ ++#define fifo_buf_size(fifo) \ ++ ((fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ ++ : 0) ++ ++#define init_fifo(fifo, _size, _gfp) \ ++({ \ ++ (fifo)->front = (fifo)->back = 0; \ ++ (fifo)->size = (_size); \ ++ (fifo)->mask = (fifo)->size \ ++ ? roundup_pow_of_two((fifo)->size) - 1 \ ++ : 0; \ ++ (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ ++}) ++ ++#define free_fifo(fifo) \ ++do { \ ++ kvpfree((fifo)->data, fifo_buf_size(fifo)); \ ++ (fifo)->data = NULL; \ ++} while (0) ++ ++#define fifo_swap(l, r) \ ++do { \ ++ swap((l)->front, (r)->front); \ ++ swap((l)->back, (r)->back); \ ++ swap((l)->size, (r)->size); \ ++ swap((l)->mask, (r)->mask); \ ++ swap((l)->data, (r)->data); \ ++} while (0) ++ ++#define fifo_move(dest, src) \ ++do { \ ++ typeof(*((dest)->data)) _t; \ ++ while (!fifo_full(dest) && \ ++ fifo_pop(src, _t)) \ ++ fifo_push(dest, _t); \ ++} while (0) ++ ++#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) ++#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) ++ ++#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) ++#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) ++ ++#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) ++#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) ++ ++#define fifo_entry_idx_abs(fifo, p) \ ++ ((((p) >= &fifo_peek_front(fifo) \ ++ ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ ++ (((p) - (fifo)->data))) ++ ++#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) ++#define fifo_idx_entry(fifo, i) (fifo)->data[((fifo)->front + (i)) & (fifo)->mask] ++ ++#define fifo_push_back_ref(f) \ ++ (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) ++ ++#define fifo_push_front_ref(f) \ ++ (fifo_full((f)) ? 
NULL : &(f)->data[--(f)->front & (f)->mask]) ++ ++#define fifo_push_back(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_push_front(fifo, new) \ ++({ \ ++ typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ ++ if (_r) \ ++ *_r = (new); \ ++ _r != NULL; \ ++}) ++ ++#define fifo_pop_front(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_pop_back(fifo, i) \ ++({ \ ++ bool _r = !fifo_empty((fifo)); \ ++ if (_r) \ ++ (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ ++ _r; \ ++}) ++ ++#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) ++#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) ++#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) ++#define fifo_peek(fifo) fifo_peek_front(fifo) ++ ++#define fifo_for_each_entry(_entry, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ ++ for (typecheck(typeof((_fifo)->front), _iter), \ ++ (_iter) = (_fifo)->front; \ ++ ((_iter != (_fifo)->back) && \ ++ (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ ++ (_iter)++) ++ ++#endif /* _BCACHEFS_FIFO_H */ +diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c +new file mode 100644 +index 000000000000..878419d40992 +--- /dev/null ++++ b/fs/bcachefs/fs-common.c +@@ -0,0 +1,317 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "fs-common.h" ++#include "inode.h" ++#include "xattr.h" ++ ++#include ++ ++int bch2_create_trans(struct btree_trans *trans, u64 dir_inum, ++ struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *new_inode, ++ const struct qstr *name, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct posix_acl *default_acl, ++ struct posix_acl *acl) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *dir_iter = NULL; ++ struct bch_hash_info hash = bch2_hash_info_init(c, new_inode); ++ u64 now = bch2_current_time(trans->c); ++ int ret; ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); ++ ++ if (!name) ++ new_inode->bi_flags |= BCH_INODE_UNLINKED; ++ ++ ret = bch2_inode_create(trans, new_inode, ++ BLOCKDEV_INODE_MAX, 0, ++ &c->unused_inode_hint); ++ if (ret) ++ goto err; ++ ++ if (default_acl) { ++ ret = bch2_set_acl_trans(trans, new_inode, &hash, ++ default_acl, ACL_TYPE_DEFAULT); ++ if (ret) ++ goto err; ++ } ++ ++ if (acl) { ++ ret = bch2_set_acl_trans(trans, new_inode, &hash, ++ acl, ACL_TYPE_ACCESS); ++ if (ret) ++ goto err; ++ } ++ ++ if (name) { ++ struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ if (S_ISDIR(new_inode->bi_mode)) ++ dir_u->bi_nlink++; ++ ++ ret = bch2_inode_write(trans, dir_iter, dir_u); ++ if (ret) ++ goto err; ++ ++ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, ++ mode_to_type(new_inode->bi_mode), ++ name, new_inode->bi_inum, ++ BCH_HASH_SET_MUST_CREATE); ++ if (ret) ++ goto err; ++ } ++err: ++ bch2_trans_iter_put(trans, dir_iter); ++ return ret; ++} ++ ++int bch2_link_trans(struct 
btree_trans *trans, u64 dir_inum, ++ u64 inum, struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *inode_u, const struct qstr *name) ++{ ++ struct btree_iter *dir_iter = NULL, *inode_iter = NULL; ++ struct bch_hash_info dir_hash; ++ u64 now = bch2_current_time(trans->c); ++ int ret; ++ ++ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto err; ++ ++ inode_u->bi_ctime = now; ++ bch2_inode_nlink_inc(inode_u); ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, 0); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = now; ++ ++ dir_hash = bch2_hash_info_init(trans->c, dir_u); ++ ++ ret = bch2_dirent_create(trans, dir_inum, &dir_hash, ++ mode_to_type(inode_u->bi_mode), ++ name, inum, BCH_HASH_SET_MUST_CREATE) ?: ++ bch2_inode_write(trans, dir_iter, dir_u) ?: ++ bch2_inode_write(trans, inode_iter, inode_u); ++err: ++ bch2_trans_iter_put(trans, dir_iter); ++ bch2_trans_iter_put(trans, inode_iter); ++ return ret; ++} ++ ++int bch2_unlink_trans(struct btree_trans *trans, ++ u64 dir_inum, struct bch_inode_unpacked *dir_u, ++ struct bch_inode_unpacked *inode_u, ++ const struct qstr *name) ++{ ++ struct btree_iter *dir_iter = NULL, *dirent_iter = NULL, ++ *inode_iter = NULL; ++ struct bch_hash_info dir_hash; ++ u64 inum, now = bch2_current_time(trans->c); ++ struct bkey_s_c k; ++ int ret; ++ ++ dir_iter = bch2_inode_peek(trans, dir_u, dir_inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dir_iter); ++ if (ret) ++ goto err; ++ ++ dir_hash = bch2_hash_info_init(trans->c, dir_u); ++ ++ dirent_iter = __bch2_dirent_lookup_trans(trans, dir_inum, &dir_hash, ++ name, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dirent_iter); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek_slot(dirent_iter); ++ inum = le64_to_cpu(bkey_s_c_to_dirent(k).v->d_inum); ++ ++ inode_iter = bch2_inode_peek(trans, inode_u, inum, BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto err; ++ ++ dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; ++ dir_u->bi_nlink -= S_ISDIR(inode_u->bi_mode); ++ bch2_inode_nlink_dec(inode_u); ++ ++ ret = (S_ISDIR(inode_u->bi_mode) ++ ? 
bch2_empty_dir_trans(trans, inum) ++ : 0) ?: ++ bch2_dirent_delete_at(trans, &dir_hash, dirent_iter) ?: ++ bch2_inode_write(trans, dir_iter, dir_u) ?: ++ bch2_inode_write(trans, inode_iter, inode_u); ++err: ++ bch2_trans_iter_put(trans, inode_iter); ++ bch2_trans_iter_put(trans, dirent_iter); ++ bch2_trans_iter_put(trans, dir_iter); ++ return ret; ++} ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, ++ struct bch_inode_unpacked *src_u) ++{ ++ u64 src, dst; ++ unsigned id; ++ bool ret = false; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ if (dst_u->bi_fields_set & (1 << id)) ++ continue; ++ ++ src = bch2_inode_opt_get(src_u, id); ++ dst = bch2_inode_opt_get(dst_u, id); ++ ++ if (src == dst) ++ continue; ++ ++ bch2_inode_opt_set(dst_u, id, src); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++int bch2_rename_trans(struct btree_trans *trans, ++ u64 src_dir, struct bch_inode_unpacked *src_dir_u, ++ u64 dst_dir, struct bch_inode_unpacked *dst_dir_u, ++ struct bch_inode_unpacked *src_inode_u, ++ struct bch_inode_unpacked *dst_inode_u, ++ const struct qstr *src_name, ++ const struct qstr *dst_name, ++ enum bch_rename_mode mode) ++{ ++ struct btree_iter *src_dir_iter = NULL, *dst_dir_iter = NULL; ++ struct btree_iter *src_inode_iter = NULL, *dst_inode_iter = NULL; ++ struct bch_hash_info src_hash, dst_hash; ++ u64 src_inode, dst_inode, now = bch2_current_time(trans->c); ++ int ret; ++ ++ src_dir_iter = bch2_inode_peek(trans, src_dir_u, src_dir, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(src_dir_iter); ++ if (ret) ++ goto err; ++ ++ src_hash = bch2_hash_info_init(trans->c, src_dir_u); ++ ++ if (dst_dir != src_dir) { ++ dst_dir_iter = bch2_inode_peek(trans, dst_dir_u, dst_dir, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dst_dir_iter); ++ if (ret) ++ goto err; ++ ++ dst_hash = bch2_hash_info_init(trans->c, dst_dir_u); ++ } else { ++ dst_dir_u = src_dir_u; ++ dst_hash = src_hash; ++ } ++ ++ ret = bch2_dirent_rename(trans, ++ src_dir, &src_hash, ++ dst_dir, &dst_hash, ++ src_name, &src_inode, ++ dst_name, &dst_inode, ++ mode); ++ if (ret) ++ goto err; ++ ++ src_inode_iter = bch2_inode_peek(trans, src_inode_u, src_inode, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(src_inode_iter); ++ if (ret) ++ goto err; ++ ++ if (dst_inode) { ++ dst_inode_iter = bch2_inode_peek(trans, dst_inode_u, dst_inode, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(dst_inode_iter); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ if (S_ISDIR(src_inode_u->bi_mode) != ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -ENOTDIR; ++ goto err; ++ } ++ ++ if (S_ISDIR(dst_inode_u->bi_mode) && ++ bch2_empty_dir_trans(trans, dst_inode)) { ++ ret = -ENOTEMPTY; ++ goto err; ++ } ++ } ++ ++ if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && ++ S_ISDIR(src_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ bch2_reinherit_attrs(dst_inode_u, src_dir_u) && ++ S_ISDIR(dst_inode_u->bi_mode)) { ++ ret = -EXDEV; ++ goto err; ++ } ++ ++ if (S_ISDIR(src_inode_u->bi_mode)) { ++ src_dir_u->bi_nlink--; ++ dst_dir_u->bi_nlink++; ++ } ++ ++ if (dst_inode && S_ISDIR(dst_inode_u->bi_mode)) { ++ dst_dir_u->bi_nlink--; ++ src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; ++ } ++ ++ if (mode == BCH_RENAME_OVERWRITE) ++ bch2_inode_nlink_dec(dst_inode_u); ++ ++ src_dir_u->bi_mtime = now; ++ src_dir_u->bi_ctime = now; ++ ++ if (src_dir != dst_dir) { ++ dst_dir_u->bi_mtime = now; ++ dst_dir_u->bi_ctime = now; ++ } ++ ++ src_inode_u->bi_ctime = now; ++ ++ if 
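bch2_unlink_trans and bch2_rename_trans chain their updates with the GCC/Clang "a ?: b" extension: each step returns 0 on success, so evaluation proceeds to the next step, while the first non-zero error short-circuits the remaining steps and becomes the overall return value. A tiny illustration of the idiom; the step functions are made up for the example:

/* err_chain_demo.c: chaining int-returning steps with the GNU `?:` extension. */
#include <stdio.h>

static int step_a(void) { return 0; }	/* success */
static int step_b(void) { return -5; }	/* fails, e.g. an -EIO style error */
static int step_c(void) { printf("step_c ran\n"); return 0; }

int main(void)
{
	/* Evaluates left to right; the first non-zero value is the result and
	 * the remaining operands are not evaluated (step_c never runs here). */
	int ret = step_a() ?: step_b() ?: step_c();

	printf("ret = %d\n", ret);	/* prints: ret = -5 */
	return 0;
}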
(dst_inode) ++ dst_inode_u->bi_ctime = now; ++ ++ ret = bch2_inode_write(trans, src_dir_iter, src_dir_u) ?: ++ (src_dir != dst_dir ++ ? bch2_inode_write(trans, dst_dir_iter, dst_dir_u) ++ : 0 ) ?: ++ bch2_inode_write(trans, src_inode_iter, src_inode_u) ?: ++ (dst_inode ++ ? bch2_inode_write(trans, dst_inode_iter, dst_inode_u) ++ : 0 ); ++err: ++ bch2_trans_iter_put(trans, dst_inode_iter); ++ bch2_trans_iter_put(trans, src_inode_iter); ++ bch2_trans_iter_put(trans, dst_dir_iter); ++ bch2_trans_iter_put(trans, src_dir_iter); ++ return ret; ++} +diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h +new file mode 100644 +index 000000000000..2273b7961c9b +--- /dev/null ++++ b/fs/bcachefs/fs-common.h +@@ -0,0 +1,37 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_COMMON_H ++#define _BCACHEFS_FS_COMMON_H ++ ++struct posix_acl; ++ ++int bch2_create_trans(struct btree_trans *, u64, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct posix_acl *, ++ struct posix_acl *); ++ ++int bch2_link_trans(struct btree_trans *, u64, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *); ++ ++int bch2_unlink_trans(struct btree_trans *, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *); ++ ++int bch2_rename_trans(struct btree_trans *, ++ u64, struct bch_inode_unpacked *, ++ u64, struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *, ++ const struct qstr *, ++ const struct qstr *, ++ enum bch_rename_mode); ++ ++bool bch2_reinherit_attrs(struct bch_inode_unpacked *, ++ struct bch_inode_unpacked *); ++ ++#endif /* _BCACHEFS_FS_COMMON_H */ +diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c +new file mode 100644 +index 000000000000..4ceeafcfa33c +--- /dev/null ++++ b/fs/bcachefs/fs-io.c +@@ -0,0 +1,3140 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "error.h" ++#include "extents.h" ++#include "extent_update.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "journal.h" ++#include "io.h" ++#include "keylist.h" ++#include "quota.h" ++#include "reflink.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++struct quota_res { ++ u64 sectors; ++}; ++ ++struct bch_writepage_io { ++ struct closure cl; ++ struct bch_inode_info *inode; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_write { ++ struct completion done; ++ struct kiocb *req; ++ struct mm_struct *mm; ++ unsigned loop:1, ++ sync:1, ++ free_iov:1; ++ struct quota_res quota_res; ++ u64 written; ++ ++ struct iov_iter iter; ++ struct iovec inline_vecs[2]; ++ ++ /* must be last: */ ++ struct bch_write_op op; ++}; ++ ++struct dio_read { ++ struct closure cl; ++ struct kiocb *req; ++ long ret; ++ struct bch_read_bio rbio; ++}; ++ ++/* pagecache_block must be held */ ++static int write_invalidate_inode_pages_range(struct address_space *mapping, ++ loff_t start, loff_t end) ++{ ++ int ret; ++ ++ /* ++ * XXX: the way this is currently implemented, we can spin if a process ++ * is continually redirtying a specific page ++ */ ++ do { ++ if (!mapping->nrpages && ++ !mapping->nrexceptional) ++ 
return 0; ++ ++ ret = filemap_write_and_wait_range(mapping, start, end); ++ if (ret) ++ break; ++ ++ if (!mapping->nrpages) ++ return 0; ++ ++ ret = invalidate_inode_pages2_range(mapping, ++ start >> PAGE_SHIFT, ++ end >> PAGE_SHIFT); ++ } while (ret == -EBUSY); ++ ++ return ret; ++} ++ ++/* quotas */ ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++ if (!res->sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ BUG_ON(res->sectors > inode->ei_quota_reserved); ++ ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, ++ -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); ++ inode->ei_quota_reserved -= res->sectors; ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ res->sectors = 0; ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ int ret; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, ++ check_enospc ? KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); ++ if (likely(!ret)) { ++ inode->ei_quota_reserved += sectors; ++ res->sectors += sectors; ++ } ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++#else ++ ++static void bch2_quota_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res) ++{ ++} ++ ++static int bch2_quota_reservation_add(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct quota_res *res, ++ unsigned sectors, ++ bool check_enospc) ++{ ++ return 0; ++} ++ ++#endif ++ ++/* i_size updates: */ ++ ++struct inode_new_size { ++ loff_t new_size; ++ u64 now; ++ unsigned fields; ++}; ++ ++static int inode_set_size(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_new_size *s = p; ++ ++ bi->bi_size = s->new_size; ++ if (s->fields & ATTR_ATIME) ++ bi->bi_atime = s->now; ++ if (s->fields & ATTR_MTIME) ++ bi->bi_mtime = s->now; ++ if (s->fields & ATTR_CTIME) ++ bi->bi_ctime = s->now; ++ ++ return 0; ++} ++ ++int __must_check bch2_write_inode_size(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ loff_t new_size, unsigned fields) ++{ ++ struct inode_new_size s = { ++ .new_size = new_size, ++ .now = bch2_current_time(c), ++ .fields = fields, ++ }; ++ ++ return bch2_write_inode(c, inode, inode_set_size, &s, fields); ++} ++ ++static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, ++ struct quota_res *quota_res, s64 sectors) ++{ ++ if (!sectors) ++ return; ++ ++ mutex_lock(&inode->ei_quota_lock); ++#ifdef CONFIG_BCACHEFS_QUOTA ++ if (quota_res && sectors > 0) { ++ BUG_ON(sectors > quota_res->sectors); ++ BUG_ON(sectors > inode->ei_quota_reserved); ++ ++ quota_res->sectors -= sectors; ++ inode->ei_quota_reserved -= sectors; ++ } else { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); ++ } ++#endif ++ inode->v.i_blocks += sectors; ++ mutex_unlock(&inode->ei_quota_lock); ++} ++ ++/* page state: */ ++ ++/* stored in page->private: */ ++ ++struct bch_page_sector { ++ /* Uncompressed, fully allocated replicas: */ ++ unsigned nr_replicas:3; ++ ++ /* Owns PAGE_SECTORS * replicas_reserved sized reservation: */ ++ unsigned replicas_reserved:3; ++ ++ /* i_sectors: */ ++ enum { ++ SECTOR_UNALLOCATED, ++ SECTOR_RESERVED, ++ SECTOR_DIRTY, ++ SECTOR_ALLOCATED, ++ } state:2; ++}; ++ ++struct bch_page_state { ++ spinlock_t lock; ++ atomic_t write_count; ++ struct bch_page_sector 
s[PAGE_SECTORS]; ++}; ++ ++static inline struct bch_page_state *__bch2_page_state(struct page *page) ++{ ++ return page_has_private(page) ++ ? (struct bch_page_state *) page_private(page) ++ : NULL; ++} ++ ++static inline struct bch_page_state *bch2_page_state(struct page *page) ++{ ++ EBUG_ON(!PageLocked(page)); ++ ++ return __bch2_page_state(page); ++} ++ ++/* for newly allocated pages: */ ++static void __bch2_page_state_release(struct page *page) ++{ ++ struct bch_page_state *s = __bch2_page_state(page); ++ ++ if (!s) ++ return; ++ ++ ClearPagePrivate(page); ++ set_page_private(page, 0); ++ put_page(page); ++ kfree(s); ++} ++ ++static void bch2_page_state_release(struct page *page) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ ++ if (!s) ++ return; ++ ++ ClearPagePrivate(page); ++ set_page_private(page, 0); ++ put_page(page); ++ kfree(s); ++} ++ ++/* for newly allocated pages: */ ++static struct bch_page_state *__bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ struct bch_page_state *s; ++ ++ s = kzalloc(sizeof(*s), GFP_NOFS|gfp); ++ if (!s) ++ return NULL; ++ ++ spin_lock_init(&s->lock); ++ /* ++ * migrate_page_move_mapping() assumes that pages with private data ++ * have their count elevated by 1. ++ */ ++ get_page(page); ++ set_page_private(page, (unsigned long) s); ++ SetPagePrivate(page); ++ return s; ++} ++ ++static struct bch_page_state *bch2_page_state_create(struct page *page, ++ gfp_t gfp) ++{ ++ return bch2_page_state(page) ?: __bch2_page_state_create(page, gfp); ++} ++ ++static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) ++{ ++ /* XXX: this should not be open coded */ ++ return inode->ei_inode.bi_data_replicas ++ ? inode->ei_inode.bi_data_replicas - 1 ++ : c->opts.data_replicas; ++} ++ ++static inline unsigned sectors_to_reserve(struct bch_page_sector *s, ++ unsigned nr_replicas) ++{ ++ return max(0, (int) nr_replicas - ++ s->nr_replicas - ++ s->replicas_reserved); ++} ++ ++static int bch2_get_page_disk_reservation(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct page *page, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned nr_replicas = inode_nr_replicas(c, inode); ++ struct disk_reservation disk_res = { 0 }; ++ unsigned i, disk_res_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ if (!disk_res_sectors) ++ return 0; ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ disk_res_sectors, 1, ++ !check_enospc ++ ? 
BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) ++ s->s[i].replicas_reserved += ++ sectors_to_reserve(&s->s[i], nr_replicas); ++ ++ return 0; ++} ++ ++struct bch2_page_reservation { ++ struct disk_reservation disk; ++ struct quota_res quota; ++}; ++ ++static void bch2_page_reservation_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ memset(res, 0, sizeof(*res)); ++ ++ res->disk.nr_replicas = inode_nr_replicas(c, inode); ++} ++ ++static void bch2_page_reservation_put(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch2_page_reservation *res) ++{ ++ bch2_disk_reservation_put(c, &res->disk); ++ bch2_quota_reservation_put(c, inode, &res->quota); ++} ++ ++static int bch2_page_reservation_get(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len, bool check_enospc) ++{ ++ struct bch_page_state *s = bch2_page_state_create(page, 0); ++ unsigned i, disk_sectors = 0, quota_sectors = 0; ++ int ret; ++ ++ if (!s) ++ return -ENOMEM; ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ disk_sectors += sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ quota_sectors += s->s[i].state == SECTOR_UNALLOCATED; ++ } ++ ++ if (disk_sectors) { ++ ret = bch2_disk_reservation_add(c, &res->disk, ++ disk_sectors, ++ !check_enospc ++ ? BCH_DISK_RESERVATION_NOFAIL ++ : 0); ++ if (unlikely(ret)) ++ return ret; ++ } ++ ++ if (quota_sectors) { ++ ret = bch2_quota_reservation_add(c, inode, &res->quota, ++ quota_sectors, ++ check_enospc); ++ if (unlikely(ret)) { ++ struct disk_reservation tmp = { ++ .sectors = disk_sectors ++ }; ++ ++ bch2_disk_reservation_put(c, &tmp); ++ res->disk.sectors -= disk_sectors; ++ return ret; ++ } ++ } ++ ++ return 0; ++} ++ ++static void bch2_clear_page_bits(struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_page_state *s = bch2_page_state(page); ++ struct disk_reservation disk_res = { 0 }; ++ int i, dirty_sectors = 0; ++ ++ if (!s) ++ return; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(PageWriteback(page)); ++ ++ for (i = 0; i < ARRAY_SIZE(s->s); i++) { ++ disk_res.sectors += s->s[i].replicas_reserved; ++ s->s[i].replicas_reserved = 0; ++ ++ if (s->s[i].state == SECTOR_DIRTY) { ++ dirty_sectors++; ++ s->s[i].state = SECTOR_UNALLOCATED; ++ } ++ } ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (dirty_sectors) ++ i_sectors_acct(c, inode, NULL, -dirty_sectors); ++ ++ bch2_page_state_release(page); ++} ++ ++static void bch2_set_page_dirty(struct bch_fs *c, ++ struct bch_inode_info *inode, struct page *page, ++ struct bch2_page_reservation *res, ++ unsigned offset, unsigned len) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i, dirty_sectors = 0; ++ ++ WARN_ON((u64) page_offset(page) + offset + len > ++ round_up((u64) i_size_read(&inode->v), block_bytes(c))); ++ ++ spin_lock(&s->lock); ++ ++ for (i = round_down(offset, block_bytes(c)) >> 9; ++ i < round_up(offset + len, block_bytes(c)) >> 9; ++ i++) { ++ unsigned sectors = sectors_to_reserve(&s->s[i], ++ res->disk.nr_replicas); ++ ++ /* ++ * This can happen if we race with the error path in ++ * bch2_writepage_io_done(): ++ */ ++ sectors = min_t(unsigned, sectors, res->disk.sectors); ++ ++ s->s[i].replicas_reserved += 
sectors; ++ res->disk.sectors -= sectors; ++ ++ if (s->s[i].state == SECTOR_UNALLOCATED) ++ dirty_sectors++; ++ ++ s->s[i].state = max_t(unsigned, s->s[i].state, SECTOR_DIRTY); ++ } ++ ++ spin_unlock(&s->lock); ++ ++ if (dirty_sectors) ++ i_sectors_acct(c, inode, &res->quota, dirty_sectors); ++ ++ if (!PageDirty(page)) ++ __set_page_dirty_nobuffers(page); ++} ++ ++vm_fault_t bch2_page_fault(struct vm_fault *vmf) ++{ ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ int ret; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ret = filemap_fault(vmf); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return ret; ++} ++ ++vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) ++{ ++ struct page *page = vmf->page; ++ struct file *file = vmf->vma->vm_file; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation res; ++ unsigned len; ++ loff_t isize; ++ int ret = VM_FAULT_LOCKED; ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ sb_start_pagefault(inode->v.i_sb); ++ file_update_time(file); ++ ++ /* ++ * Not strictly necessary, but helps avoid dio writes livelocking in ++ * write_invalidate_inode_pages_range() - can drop this if/when we get ++ * a write_invalidate_inode_pages_range() that works without dropping ++ * page lock before invalidating page ++ */ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ lock_page(page); ++ isize = i_size_read(&inode->v); ++ ++ if (page->mapping != mapping || page_offset(page) >= isize) { ++ unlock_page(page); ++ ret = VM_FAULT_NOPAGE; ++ goto out; ++ } ++ ++ len = min_t(loff_t, PAGE_SIZE, isize - page_offset(page)); ++ ++ if (bch2_page_reservation_get(c, inode, page, &res, 0, len, true)) { ++ unlock_page(page); ++ ret = VM_FAULT_SIGBUS; ++ goto out; ++ } ++ ++ bch2_set_page_dirty(c, inode, page, &res, 0, len); ++ bch2_page_reservation_put(c, inode, &res); ++ ++ wait_for_stable_page(page); ++out: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ sb_end_pagefault(inode->v.i_sb); ++ ++ return ret; ++} ++ ++void bch2_invalidatepage(struct page *page, unsigned int offset, ++ unsigned int length) ++{ ++ if (offset || length < PAGE_SIZE) ++ return; ++ ++ bch2_clear_page_bits(page); ++} ++ ++int bch2_releasepage(struct page *page, gfp_t gfp_mask) ++{ ++ if (PageDirty(page)) ++ return 0; ++ ++ bch2_clear_page_bits(page); ++ return 1; ++} ++ ++#ifdef CONFIG_MIGRATION ++int bch2_migrate_page(struct address_space *mapping, struct page *newpage, ++ struct page *page, enum migrate_mode mode) ++{ ++ int ret; ++ ++ EBUG_ON(!PageLocked(page)); ++ EBUG_ON(!PageLocked(newpage)); ++ ++ ret = migrate_page_move_mapping(mapping, newpage, page, 0); ++ if (ret != MIGRATEPAGE_SUCCESS) ++ return ret; ++ ++ if (PagePrivate(page)) { ++ ClearPagePrivate(page); ++ get_page(newpage); ++ set_page_private(newpage, page_private(page)); ++ set_page_private(page, 0); ++ put_page(page); ++ SetPagePrivate(newpage); ++ } ++ ++ if (mode != MIGRATE_SYNC_NO_COPY) ++ migrate_page_copy(newpage, page); ++ else ++ migrate_page_states(newpage, page); ++ return MIGRATEPAGE_SUCCESS; ++} ++#endif ++ ++/* readpage(s): */ ++ ++static void bch2_readpages_end_io(struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) { ++ struct page *page = bv->bv_page; ++ ++ if (!bio->bi_status) { ++ SetPageUptodate(page); ++ } else { ++ 
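The bch_page_state kept in page->private above tracks each 512-byte sector of a page separately: its allocation state plus how many replicas are already allocated or covered by a reservation. A compressed user-space sketch of that bookkeeping and of the sectors_to_reserve() calculation, with illustrative names and a fixed 4 KiB page:

/* page_state_demo.c: per-sector state for one 4KiB page (8 x 512B sectors). */
#include <stdio.h>

#define PAGE_SECTORS 8

enum sector_state { SECTOR_UNALLOCATED, SECTOR_RESERVED, SECTOR_DIRTY, SECTOR_ALLOCATED };

struct page_sector {
	unsigned nr_replicas:3;		/* replicas already fully allocated on disk */
	unsigned replicas_reserved:3;	/* replicas covered by a disk reservation */
	unsigned state:2;		/* one of enum sector_state */
};

struct page_state {
	struct page_sector s[PAGE_SECTORS];
};

/* How many more replicas this sector still needs reserved to reach the target. */
static unsigned sectors_to_reserve(const struct page_sector *s, unsigned nr_replicas)
{
	int need = (int) nr_replicas - s->nr_replicas - s->replicas_reserved;

	return need > 0 ? (unsigned) need : 0;
}

int main(void)
{
	struct page_state ps = { 0 };
	unsigned target = 2, total = 0;

	ps.s[0].nr_replicas = 2;	/* already allocated: nothing to reserve */
	ps.s[1].replicas_reserved = 1;	/* one reserved: one more still needed */
	ps.s[1].state = SECTOR_DIRTY;	/* state is tracked per sector as well */

	for (unsigned i = 0; i < PAGE_SECTORS; i++)
		total += sectors_to_reserve(&ps.s[i], target);

	printf("replica-sectors still to reserve: %u\n", total);	/* 0 + 1 + 6*2 = 13 */
	return 0;
}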
ClearPageUptodate(page); ++ SetPageError(page); ++ } ++ unlock_page(page); ++ } ++ ++ bio_put(bio); ++} ++ ++static inline void page_state_init_for_read(struct page *page) ++{ ++ SetPagePrivate(page); ++ page->private = 0; ++} ++ ++struct readpages_iter { ++ struct address_space *mapping; ++ struct page **pages; ++ unsigned nr_pages; ++ unsigned nr_added; ++ unsigned idx; ++ pgoff_t offset; ++}; ++ ++static int readpages_iter_init(struct readpages_iter *iter, ++ struct address_space *mapping, ++ struct list_head *pages, unsigned nr_pages) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->mapping = mapping; ++ iter->offset = list_last_entry(pages, struct page, lru)->index; ++ ++ iter->pages = kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS); ++ if (!iter->pages) ++ return -ENOMEM; ++ ++ while (!list_empty(pages)) { ++ struct page *page = list_last_entry(pages, struct page, lru); ++ ++ __bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ iter->pages[iter->nr_pages++] = page; ++ list_del(&page->lru); ++ } ++ ++ return 0; ++} ++ ++static inline struct page *readpage_iter_next(struct readpages_iter *iter) ++{ ++ struct page *page; ++ unsigned i; ++ int ret; ++ ++ BUG_ON(iter->idx > iter->nr_added); ++ BUG_ON(iter->nr_added > iter->nr_pages); ++ ++ if (iter->idx < iter->nr_added) ++ goto out; ++ ++ while (1) { ++ if (iter->idx == iter->nr_pages) ++ return NULL; ++ ++ ret = add_to_page_cache_lru_vec(iter->mapping, ++ iter->pages + iter->nr_added, ++ iter->nr_pages - iter->nr_added, ++ iter->offset + iter->nr_added, ++ GFP_NOFS); ++ if (ret > 0) ++ break; ++ ++ page = iter->pages[iter->nr_added]; ++ iter->idx++; ++ iter->nr_added++; ++ ++ __bch2_page_state_release(page); ++ put_page(page); ++ } ++ ++ iter->nr_added += ret; ++ ++ for (i = iter->idx; i < iter->nr_added; i++) ++ put_page(iter->pages[i]); ++out: ++ EBUG_ON(iter->pages[iter->idx]->index != iter->offset + iter->idx); ++ ++ return iter->pages[iter->idx]; ++} ++ ++static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k) ++{ ++ struct bvec_iter iter; ++ struct bio_vec bv; ++ unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v ++ ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); ++ unsigned state = k.k->type == KEY_TYPE_reservation ++ ? 
SECTOR_RESERVED ++ : SECTOR_ALLOCATED; ++ ++ bio_for_each_segment(bv, bio, iter) { ++ struct bch_page_state *s = bch2_page_state(bv.bv_page); ++ unsigned i; ++ ++ for (i = bv.bv_offset >> 9; ++ i < (bv.bv_offset + bv.bv_len) >> 9; ++ i++) { ++ s->s[i].nr_replicas = nr_ptrs; ++ s->s[i].state = state; ++ } ++ } ++} ++ ++static bool extent_partial_reads_expensive(struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct bch_extent_crc_unpacked crc; ++ const union bch_extent_entry *i; ++ ++ bkey_for_each_crc(k.k, ptrs, crc, i) ++ if (crc.csum_type || crc.compression_type) ++ return true; ++ return false; ++} ++ ++static void readpage_bio_extend(struct readpages_iter *iter, ++ struct bio *bio, ++ unsigned sectors_this_extent, ++ bool get_more) ++{ ++ while (bio_sectors(bio) < sectors_this_extent && ++ bio->bi_vcnt < bio->bi_max_vecs) { ++ pgoff_t page_offset = bio_end_sector(bio) >> PAGE_SECTOR_SHIFT; ++ struct page *page = readpage_iter_next(iter); ++ int ret; ++ ++ if (page) { ++ if (iter->offset + iter->idx != page_offset) ++ break; ++ ++ iter->idx++; ++ } else { ++ if (!get_more) ++ break; ++ ++ page = xa_load(&iter->mapping->i_pages, page_offset); ++ if (page && !xa_is_value(page)) ++ break; ++ ++ page = __page_cache_alloc(readahead_gfp_mask(iter->mapping)); ++ if (!page) ++ break; ++ ++ if (!__bch2_page_state_create(page, 0)) { ++ put_page(page); ++ break; ++ } ++ ++ ret = add_to_page_cache_lru(page, iter->mapping, ++ page_offset, GFP_NOFS); ++ if (ret) { ++ __bch2_page_state_release(page); ++ put_page(page); ++ break; ++ } ++ ++ put_page(page); ++ } ++ ++ BUG_ON(!bio_add_page(bio, page, PAGE_SIZE, 0)); ++ } ++} ++ ++static void bchfs_read(struct btree_trans *trans, struct btree_iter *iter, ++ struct bch_read_bio *rbio, u64 inum, ++ struct readpages_iter *readpages_iter) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_on_stack sk; ++ int flags = BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE; ++ int ret = 0; ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ ++ bkey_on_stack_init(&sk); ++retry: ++ while (1) { ++ struct bkey_s_c k; ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bch2_btree_iter_set_pos(iter, ++ POS(inum, rbio->bio.bi_iter.bi_sector)); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(trans, ++ &offset_into_extent, &sk); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bch2_trans_unlock(trans); ++ ++ if (readpages_iter) ++ readpage_bio_extend(readpages_iter, &rbio->bio, sectors, ++ extent_partial_reads_expensive(k)); ++ ++ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ ++ if (rbio->bio.bi_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ if (bkey_extent_is_allocation(k.k)) ++ bch2_add_page_sectors(&rbio->bio, k); ++ ++ bch2_read_extent(trans, rbio, k, offset_into_extent, flags); ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ bio_advance(&rbio->bio, bytes); ++ } ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ if (ret) { ++ bcache_io_error(c, &rbio->bio, "btree IO error %i", ret); ++ bio_endio(&rbio->bio); ++ } ++ ++ bkey_on_stack_exit(&sk, c); ++} ++ ++int bch2_readpages(struct file *file, struct address_space 
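The read loop above repositions the iterator at the bio's current sector, then works out how far into the returned extent that sector falls and how many of the extent's sectors it can service before it needs another lookup. The same arithmetic in isolation, with a flat extent table standing in for the btree; this is an illustrative sketch, not the bcachefs API:

/* extent_offset_demo.c: clamp a read against the extent that covers it. */
#include <stdio.h>

struct extent {
	unsigned long long start;	/* first sector covered */
	unsigned long long size;	/* length in sectors */
};

/* Find the extent covering @sector and report the usable span. */
static int lookup(const struct extent *e, unsigned n,
		  unsigned long long sector, unsigned long long want,
		  unsigned long long *offset_into_extent,
		  unsigned long long *sectors)
{
	for (unsigned i = 0; i < n; i++) {
		if (sector >= e[i].start && sector < e[i].start + e[i].size) {
			*offset_into_extent = sector - e[i].start;
			*sectors = e[i].size - *offset_into_extent;
			if (*sectors > want)
				*sectors = want;	/* don't read past the request */
			return (int) i;
		}
	}
	return -1;
}

int main(void)
{
	const struct extent map[] = { { 0, 16 }, { 16, 64 }, { 80, 8 } };
	unsigned long long off, sectors;

	int idx = lookup(map, 3, 20, 128, &off, &sectors);
	printf("extent %d, offset_into_extent %llu, sectors this pass %llu\n",
	       idx, off, sectors);	/* extent 1, offset 4, 60 sectors */
	return 0;
}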
*mapping, ++ struct list_head *pages, unsigned nr_pages) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct page *page; ++ struct readpages_iter readpages_iter; ++ int ret; ++ ++ ret = readpages_iter_init(&readpages_iter, mapping, pages, nr_pages); ++ BUG_ON(ret); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS); ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ while ((page = readpage_iter_next(&readpages_iter))) { ++ pgoff_t index = readpages_iter.offset + readpages_iter.idx; ++ unsigned n = min_t(unsigned, ++ readpages_iter.nr_pages - ++ readpages_iter.idx, ++ BIO_MAX_PAGES); ++ struct bch_read_bio *rbio = ++ rbio_init(bio_alloc_bioset(GFP_NOFS, n, &c->bio_read), ++ opts); ++ ++ readpages_iter.idx++; ++ ++ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, 0); ++ rbio->bio.bi_iter.bi_sector = (sector_t) index << PAGE_SECTOR_SHIFT; ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bchfs_read(&trans, iter, rbio, inode->v.i_ino, ++ &readpages_iter); ++ } ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_trans_exit(&trans); ++ kfree(readpages_iter.pages); ++ ++ return 0; ++} ++ ++static void __bchfs_readpage(struct bch_fs *c, struct bch_read_bio *rbio, ++ u64 inum, struct page *page) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ ++ bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ bio_set_op_attrs(&rbio->bio, REQ_OP_READ, REQ_SYNC); ++ rbio->bio.bi_iter.bi_sector = ++ (sector_t) page->index << PAGE_SECTOR_SHIFT; ++ BUG_ON(!bio_add_page(&rbio->bio, page, PAGE_SIZE, 0)); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS); ++ ++ bchfs_read(&trans, iter, rbio, inum, NULL); ++ ++ bch2_trans_exit(&trans); ++} ++ ++int bch2_readpage(struct file *file, struct page *page) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct bch_read_bio *rbio; ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), opts); ++ rbio->bio.bi_end_io = bch2_readpages_end_io; ++ ++ __bchfs_readpage(c, rbio, inode->v.i_ino, page); ++ return 0; ++} ++ ++static void bch2_read_single_page_end_io(struct bio *bio) ++{ ++ complete(bio->bi_private); ++} ++ ++static int bch2_read_single_page(struct page *page, ++ struct address_space *mapping) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_read_bio *rbio; ++ int ret; ++ DECLARE_COMPLETION_ONSTACK(done); ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOFS, 1, &c->bio_read), ++ io_opts(c, &inode->ei_inode)); ++ rbio->bio.bi_private = &done; ++ rbio->bio.bi_end_io = bch2_read_single_page_end_io; ++ ++ __bchfs_readpage(c, rbio, inode->v.i_ino, page); ++ wait_for_completion(&done); ++ ++ ret = blk_status_to_errno(rbio->bio.bi_status); ++ bio_put(&rbio->bio); ++ ++ if (ret < 0) ++ return ret; ++ ++ SetPageUptodate(page); ++ return 0; ++} ++ ++/* writepages: */ ++ ++struct bch_writepage_state { ++ struct bch_writepage_io *io; ++ struct bch_io_opts opts; ++}; ++ ++static inline struct bch_writepage_state bch_writepage_state_init(struct 
bch_fs *c, ++ struct bch_inode_info *inode) ++{ ++ return (struct bch_writepage_state) { ++ .opts = io_opts(c, &inode->ei_inode) ++ }; ++} ++ ++static void bch2_writepage_io_free(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ ++ bio_put(&io->op.wbio.bio); ++} ++ ++static void bch2_writepage_io_done(struct closure *cl) ++{ ++ struct bch_writepage_io *io = container_of(cl, ++ struct bch_writepage_io, cl); ++ struct bch_fs *c = io->op.c; ++ struct bio *bio = &io->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bvec; ++ unsigned i; ++ ++ if (io->op.error) { ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ SetPageError(bvec->bv_page); ++ mapping_set_error(bvec->bv_page->mapping, -EIO); ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s; ++ ++ s = __bch2_page_state(bvec->bv_page); ++ spin_lock(&s->lock); ++ for (i = 0; i < PAGE_SECTORS; i++) ++ s->s[i].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ } ++ ++ /* ++ * racing with fallocate can cause us to add fewer sectors than ++ * expected - but we shouldn't add more sectors than expected: ++ */ ++ BUG_ON(io->op.i_sectors_delta > 0); ++ ++ /* ++ * (error (due to going RO) halfway through a page can screw that up ++ * slightly) ++ * XXX wtf? ++ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); ++ */ ++ ++ /* ++ * PageWriteback is effectively our ref on the inode - fixup i_blocks ++ * before calling end_page_writeback: ++ */ ++ i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); ++ ++ bio_for_each_segment_all(bvec, bio, iter) { ++ struct bch_page_state *s = __bch2_page_state(bvec->bv_page); ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(bvec->bv_page); ++ } ++ ++ closure_return_with_destructor(&io->cl, bch2_writepage_io_free); ++} ++ ++static void bch2_writepage_do_io(struct bch_writepage_state *w) ++{ ++ struct bch_writepage_io *io = w->io; ++ ++ w->io = NULL; ++ closure_call(&io->op.cl, bch2_write, NULL, &io->cl); ++ continue_at(&io->cl, bch2_writepage_io_done, NULL); ++} ++ ++/* ++ * Get a bch_writepage_io and add @page to it - appending to an existing one if ++ * possible, else allocating a new one: ++ */ ++static void bch2_writepage_io_alloc(struct bch_fs *c, ++ struct writeback_control *wbc, ++ struct bch_writepage_state *w, ++ struct bch_inode_info *inode, ++ u64 sector, ++ unsigned nr_replicas) ++{ ++ struct bch_write_op *op; ++ ++ w->io = container_of(bio_alloc_bioset(GFP_NOFS, ++ BIO_MAX_PAGES, ++ &c->writepage_bioset), ++ struct bch_writepage_io, op.wbio.bio); ++ ++ closure_init(&w->io->cl, NULL); ++ w->io->inode = inode; ++ ++ op = &w->io->op; ++ bch2_write_op_init(op, c, w->opts); ++ op->target = w->opts.foreground_target; ++ op_journal_seq_set(op, &inode->ei_journal_seq); ++ op->nr_replicas = nr_replicas; ++ op->res.nr_replicas = nr_replicas; ++ op->write_point = writepoint_hashed(inode->ei_last_dirtied); ++ op->pos = POS(inode->v.i_ino, sector); ++ op->wbio.bio.bi_iter.bi_sector = sector; ++ op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); ++} ++ ++static int __bch2_writepage(struct page *page, ++ struct writeback_control *wbc, ++ void *data) ++{ ++ struct bch_inode_info *inode = to_bch_ei(page->mapping->host); ++ struct bch_fs *c = 
inode->v.i_sb->s_fs_info; ++ struct bch_writepage_state *w = data; ++ struct bch_page_state *s, orig; ++ unsigned i, offset, nr_replicas_this_write = U32_MAX; ++ loff_t i_size = i_size_read(&inode->v); ++ pgoff_t end_index = i_size >> PAGE_SHIFT; ++ int ret; ++ ++ EBUG_ON(!PageUptodate(page)); ++ ++ /* Is the page fully inside i_size? */ ++ if (page->index < end_index) ++ goto do_io; ++ ++ /* Is the page fully outside i_size? (truncate in progress) */ ++ offset = i_size & (PAGE_SIZE - 1); ++ if (page->index > end_index || !offset) { ++ unlock_page(page); ++ return 0; ++ } ++ ++ /* ++ * The page straddles i_size. It must be zeroed out on each and every ++ * writepage invocation because it may be mmapped. "A file is mapped ++ * in multiples of the page size. For a file that is not a multiple of ++ * the page size, the remaining memory is zeroed when mapped, and ++ * writes to that region are not written out to the file." ++ */ ++ zero_user_segment(page, offset, PAGE_SIZE); ++do_io: ++ s = bch2_page_state_create(page, __GFP_NOFAIL); ++ ++ ret = bch2_get_page_disk_reservation(c, inode, page, true); ++ if (ret) { ++ SetPageError(page); ++ mapping_set_error(page->mapping, ret); ++ unlock_page(page); ++ return 0; ++ } ++ ++ /* Before unlocking the page, get copy of reservations: */ ++ orig = *s; ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ nr_replicas_this_write = ++ min_t(unsigned, nr_replicas_this_write, ++ s->s[i].nr_replicas + ++ s->s[i].replicas_reserved); ++ } ++ ++ for (i = 0; i < PAGE_SECTORS; i++) { ++ if (s->s[i].state < SECTOR_DIRTY) ++ continue; ++ ++ s->s[i].nr_replicas = w->opts.compression ++ ? 0 : nr_replicas_this_write; ++ ++ s->s[i].replicas_reserved = 0; ++ s->s[i].state = SECTOR_ALLOCATED; ++ } ++ ++ BUG_ON(atomic_read(&s->write_count)); ++ atomic_set(&s->write_count, 1); ++ ++ BUG_ON(PageWriteback(page)); ++ set_page_writeback(page); ++ ++ unlock_page(page); ++ ++ offset = 0; ++ while (1) { ++ unsigned sectors = 1, dirty_sectors = 0, reserved_sectors = 0; ++ u64 sector; ++ ++ while (offset < PAGE_SECTORS && ++ orig.s[offset].state < SECTOR_DIRTY) ++ offset++; ++ ++ if (offset == PAGE_SECTORS) ++ break; ++ ++ sector = ((u64) page->index << PAGE_SECTOR_SHIFT) + offset; ++ ++ while (offset + sectors < PAGE_SECTORS && ++ orig.s[offset + sectors].state >= SECTOR_DIRTY) ++ sectors++; ++ ++ for (i = offset; i < offset + sectors; i++) { ++ reserved_sectors += orig.s[i].replicas_reserved; ++ dirty_sectors += orig.s[i].state == SECTOR_DIRTY; ++ } ++ ++ if (w->io && ++ (w->io->op.res.nr_replicas != nr_replicas_this_write || ++ bio_full(&w->io->op.wbio.bio, PAGE_SIZE) || ++ w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= ++ (BIO_MAX_PAGES * PAGE_SIZE) || ++ bio_end_sector(&w->io->op.wbio.bio) != sector)) ++ bch2_writepage_do_io(w); ++ ++ if (!w->io) ++ bch2_writepage_io_alloc(c, wbc, w, inode, sector, ++ nr_replicas_this_write); ++ ++ atomic_inc(&s->write_count); ++ ++ BUG_ON(inode != w->io->inode); ++ BUG_ON(!bio_add_page(&w->io->op.wbio.bio, page, ++ sectors << 9, offset << 9)); ++ ++ /* Check for writing past i_size: */ ++ WARN_ON((bio_end_sector(&w->io->op.wbio.bio) << 9) > ++ round_up(i_size, block_bytes(c))); ++ ++ w->io->op.res.sectors += reserved_sectors; ++ w->io->op.i_sectors_delta -= dirty_sectors; ++ w->io->op.new_i_size = i_size; ++ ++ offset += sectors; ++ } ++ ++ if (atomic_dec_and_test(&s->write_count)) ++ end_page_writeback(page); ++ ++ return 0; ++} ++ ++int bch2_writepages(struct address_space *mapping, 
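__bch2_writepage above walks the per-sector states, skips clean sectors, and extends each run of dirty-or-better sectors as far as possible so one bio segment covers the whole run. The scanning loop on its own, over a plain state array with illustrative names:

/* dirty_run_demo.c: find contiguous runs of dirty sectors in a page. */
#include <stdio.h>

#define PAGE_SECTORS 8

enum { CLEAN, DIRTY };

int main(void)
{
	const int state[PAGE_SECTORS] = { CLEAN, DIRTY, DIRTY, CLEAN, DIRTY, DIRTY, DIRTY, CLEAN };
	unsigned offset = 0;

	while (1) {
		unsigned sectors = 1;

		/* skip clean sectors */
		while (offset < PAGE_SECTORS && state[offset] == CLEAN)
			offset++;
		if (offset == PAGE_SECTORS)
			break;

		/* extend the run while the following sectors are dirty too */
		while (offset + sectors < PAGE_SECTORS && state[offset + sectors] == DIRTY)
			sectors++;

		printf("write run: sectors %u..%u\n", offset, offset + sectors - 1);
		offset += sectors;
	}
	return 0;
}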
struct writeback_control *wbc) ++{ ++ struct bch_fs *c = mapping->host->i_sb->s_fs_info; ++ struct bch_writepage_state w = ++ bch_writepage_state_init(c, to_bch_ei(mapping->host)); ++ struct blk_plug plug; ++ int ret; ++ ++ blk_start_plug(&plug); ++ ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); ++ if (w.io) ++ bch2_writepage_do_io(&w); ++ blk_finish_plug(&plug); ++ return ret; ++} ++ ++int bch2_writepage(struct page *page, struct writeback_control *wbc) ++{ ++ struct bch_fs *c = page->mapping->host->i_sb->s_fs_info; ++ struct bch_writepage_state w = ++ bch_writepage_state_init(c, to_bch_ei(page->mapping->host)); ++ int ret; ++ ++ ret = __bch2_writepage(page, wbc, &w); ++ if (w.io) ++ bch2_writepage_do_io(&w); ++ ++ return ret; ++} ++ ++/* buffered writes: */ ++ ++int bch2_write_begin(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned flags, ++ struct page **pagep, void **fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res; ++ pgoff_t index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ struct page *page; ++ int ret = -ENOMEM; ++ ++ res = kmalloc(sizeof(*res), GFP_KERNEL); ++ if (!res) ++ return -ENOMEM; ++ ++ bch2_page_reservation_init(c, inode, res); ++ *fsdata = res; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ page = grab_cache_page_write_begin(mapping, index, flags); ++ if (!page) ++ goto err_unlock; ++ ++ if (PageUptodate(page)) ++ goto out; ++ ++ /* If we're writing entire page, don't need to read it in first: */ ++ if (len == PAGE_SIZE) ++ goto out; ++ ++ if (!offset && pos + len >= inode->v.i_size) { ++ zero_user_segment(page, len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++ ++ if (index > inode->v.i_size >> PAGE_SHIFT) { ++ zero_user_segments(page, 0, offset, offset + len, PAGE_SIZE); ++ flush_dcache_page(page); ++ goto out; ++ } ++readpage: ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto err; ++out: ++ ret = bch2_page_reservation_get(c, inode, page, res, ++ offset, len, true); ++ if (ret) { ++ if (!PageUptodate(page)) { ++ /* ++ * If the page hasn't been read in, we won't know if we ++ * actually need a reservation - we don't actually need ++ * to read here, we just need to check if the page is ++ * fully backed by uncompressed data: ++ */ ++ goto readpage; ++ } ++ ++ goto err; ++ } ++ ++ *pagep = page; ++ return 0; ++err: ++ unlock_page(page); ++ put_page(page); ++ *pagep = NULL; ++err_unlock: ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ kfree(res); ++ *fsdata = NULL; ++ return ret; ++} ++ ++int bch2_write_end(struct file *file, struct address_space *mapping, ++ loff_t pos, unsigned len, unsigned copied, ++ struct page *page, void *fsdata) ++{ ++ struct bch_inode_info *inode = to_bch_ei(mapping->host); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch2_page_reservation *res = fsdata; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ if (unlikely(copied < len && !PageUptodate(page))) { ++ /* ++ * The page needs to be read in, but that would destroy ++ * our partial write - simplest thing is to just force ++ * userspace to redo the write: ++ */ ++ zero_user(page, 0, PAGE_SIZE); ++ flush_dcache_page(page); ++ copied = 0; ++ } ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ if 
(copied) { ++ if (!PageUptodate(page)) ++ SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, res, offset, copied); ++ ++ inode->ei_last_dirtied = (unsigned long) current; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ bch2_page_reservation_put(c, inode, res); ++ kfree(res); ++ ++ return copied; ++} ++ ++#define WRITE_BATCH_PAGES 32 ++ ++static int __bch2_buffered_write(struct bch_inode_info *inode, ++ struct address_space *mapping, ++ struct iov_iter *iter, ++ loff_t pos, unsigned len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct page *pages[WRITE_BATCH_PAGES]; ++ struct bch2_page_reservation res; ++ unsigned long index = pos >> PAGE_SHIFT; ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); ++ unsigned i, reserved = 0, set_dirty = 0; ++ unsigned copied = 0, nr_pages_copied = 0; ++ int ret = 0; ++ ++ BUG_ON(!len); ++ BUG_ON(nr_pages > ARRAY_SIZE(pages)); ++ ++ bch2_page_reservation_init(c, inode, &res); ++ ++ for (i = 0; i < nr_pages; i++) { ++ pages[i] = grab_cache_page_write_begin(mapping, index + i, 0); ++ if (!pages[i]) { ++ nr_pages = i; ++ if (!i) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ len = min_t(unsigned, len, ++ nr_pages * PAGE_SIZE - offset); ++ break; ++ } ++ } ++ ++ if (offset && !PageUptodate(pages[0])) { ++ ret = bch2_read_single_page(pages[0], mapping); ++ if (ret) ++ goto out; ++ } ++ ++ if ((pos + len) & (PAGE_SIZE - 1) && ++ !PageUptodate(pages[nr_pages - 1])) { ++ if ((index + nr_pages - 1) << PAGE_SHIFT >= inode->v.i_size) { ++ zero_user(pages[nr_pages - 1], 0, PAGE_SIZE); ++ } else { ++ ret = bch2_read_single_page(pages[nr_pages - 1], mapping); ++ if (ret) ++ goto out; ++ } ++ } ++ ++ while (reserved < len) { ++ struct page *page = pages[(offset + reserved) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + reserved) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - reserved, ++ PAGE_SIZE - pg_offset); ++retry_reservation: ++ ret = bch2_page_reservation_get(c, inode, page, &res, ++ pg_offset, pg_len, true); ++ ++ if (ret && !PageUptodate(page)) { ++ ret = bch2_read_single_page(page, mapping); ++ if (!ret) ++ goto retry_reservation; ++ } ++ ++ if (ret) ++ goto out; ++ ++ reserved += pg_len; ++ } ++ ++ if (mapping_writably_mapped(mapping)) ++ for (i = 0; i < nr_pages; i++) ++ flush_dcache_page(pages[i]); ++ ++ while (copied < len) { ++ struct page *page = pages[(offset + copied) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + copied) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, len - copied, ++ PAGE_SIZE - pg_offset); ++ unsigned pg_copied = iov_iter_copy_from_user_atomic(page, ++ iter, pg_offset, pg_len); ++ ++ if (!pg_copied) ++ break; ++ ++ if (!PageUptodate(page) && ++ pg_copied != PAGE_SIZE && ++ pos + copied + pg_copied < inode->v.i_size) { ++ zero_user(page, 0, PAGE_SIZE); ++ break; ++ } ++ ++ flush_dcache_page(page); ++ iov_iter_advance(iter, pg_copied); ++ copied += pg_copied; ++ ++ if (pg_copied != pg_len) ++ break; ++ } ++ ++ if (!copied) ++ goto out; ++ ++ spin_lock(&inode->v.i_lock); ++ if (pos + copied > inode->v.i_size) ++ i_size_write(&inode->v, pos + copied); ++ spin_unlock(&inode->v.i_lock); ++ ++ while (set_dirty < copied) { ++ struct page *page = pages[(offset + set_dirty) >> PAGE_SHIFT]; ++ unsigned pg_offset = (offset + set_dirty) & (PAGE_SIZE - 1); ++ unsigned pg_len = min_t(unsigned, copied - set_dirty, ++ PAGE_SIZE - pg_offset); ++ ++ if (!PageUptodate(page)) ++ 
SetPageUptodate(page); ++ ++ bch2_set_page_dirty(c, inode, page, &res, pg_offset, pg_len); ++ unlock_page(page); ++ put_page(page); ++ ++ set_dirty += pg_len; ++ } ++ ++ nr_pages_copied = DIV_ROUND_UP(offset + copied, PAGE_SIZE); ++ inode->ei_last_dirtied = (unsigned long) current; ++out: ++ for (i = nr_pages_copied; i < nr_pages; i++) { ++ unlock_page(pages[i]); ++ put_page(pages[i]); ++ } ++ ++ bch2_page_reservation_put(c, inode, &res); ++ ++ return copied ?: ret; ++} ++ ++static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ loff_t pos = iocb->ki_pos; ++ ssize_t written = 0; ++ int ret = 0; ++ ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ++ do { ++ unsigned offset = pos & (PAGE_SIZE - 1); ++ unsigned bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE * WRITE_BATCH_PAGES - offset); ++again: ++ /* ++ * Bring in the user page that we will copy from _first_. ++ * Otherwise there's a nasty deadlock on copying from the ++ * same page as we're writing to, without it being marked ++ * up-to-date. ++ * ++ * Not only is this an optimisation, but it is also required ++ * to check that the address is actually valid, when atomic ++ * usercopies are used, below. ++ */ ++ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { ++ bytes = min_t(unsigned long, iov_iter_count(iter), ++ PAGE_SIZE - offset); ++ ++ if (unlikely(iov_iter_fault_in_readable(iter, bytes))) { ++ ret = -EFAULT; ++ break; ++ } ++ } ++ ++ if (unlikely(fatal_signal_pending(current))) { ++ ret = -EINTR; ++ break; ++ } ++ ++ ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); ++ if (unlikely(ret < 0)) ++ break; ++ ++ cond_resched(); ++ ++ if (unlikely(ret == 0)) { ++ /* ++ * If we were unable to copy any data at all, we must ++ * fall back to a single segment length write. ++ * ++ * If we didn't fallback here, we could livelock ++ * because not all segments in the iov can be copied at ++ * once without a pagefault. ++ */ ++ bytes = min_t(unsigned long, PAGE_SIZE - offset, ++ iov_iter_single_seg_count(iter)); ++ goto again; ++ } ++ pos += ret; ++ written += ret; ++ ret = 0; ++ ++ balance_dirty_pages_ratelimited(mapping); ++ } while (iov_iter_count(iter)); ++ ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ ++ return written ? 
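__bch2_buffered_write above copies at most WRITE_BATCH_PAGES pages per pass, so a write at an arbitrary file position has to be split into per-page (offset, length) pieces before copying and again when marking pages dirty. The splitting arithmetic by itself, assuming a 4096-byte page size for the sketch:

/* write_split_demo.c: split a (pos, len) write into per-page chunks. */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long pos = 4000, len = 9000;
	unsigned long index = pos / PAGE_SIZE;		/* first page index */
	unsigned long offset = pos & (PAGE_SIZE - 1);	/* offset into that page */
	unsigned long nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
	unsigned long copied = 0;

	printf("%lu pages starting at index %lu\n", nr_pages, index);

	while (copied < len) {
		unsigned long pg = (offset + copied) / PAGE_SIZE;
		unsigned long pg_offset = (offset + copied) & (PAGE_SIZE - 1);
		unsigned long pg_len = len - copied < PAGE_SIZE - pg_offset
			? len - copied : PAGE_SIZE - pg_offset;

		printf("page %lu: offset %lu, len %lu\n", index + pg, pg_offset, pg_len);
		copied += pg_len;
	}
	return 0;
}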
written : ret; ++} ++ ++/* O_DIRECT reads */ ++ ++static void bch2_dio_read_complete(struct closure *cl) ++{ ++ struct dio_read *dio = container_of(cl, struct dio_read, cl); ++ ++ dio->req->ki_complete(dio->req, dio->ret, 0); ++ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ ++} ++ ++static void bch2_direct_IO_read_endio(struct bio *bio) ++{ ++ struct dio_read *dio = bio->bi_private; ++ ++ if (bio->bi_status) ++ dio->ret = blk_status_to_errno(bio->bi_status); ++ ++ closure_put(&dio->cl); ++} ++ ++static void bch2_direct_IO_read_split_endio(struct bio *bio) ++{ ++ bch2_direct_IO_read_endio(bio); ++ bio_check_pages_dirty(bio); /* transfers ownership */ ++} ++ ++static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_io_opts opts = io_opts(c, &inode->ei_inode); ++ struct dio_read *dio; ++ struct bio *bio; ++ loff_t offset = req->ki_pos; ++ bool sync = is_sync_kiocb(req); ++ size_t shorten; ++ ssize_t ret; ++ ++ if ((offset|iter->count) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ ret = min_t(loff_t, iter->count, ++ max_t(loff_t, 0, i_size_read(&inode->v) - offset)); ++ ++ if (!ret) ++ return ret; ++ ++ shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); ++ iter->count -= shorten; ++ ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_PAGES), ++ &c->dio_read_bioset); ++ ++ bio->bi_end_io = bch2_direct_IO_read_endio; ++ ++ dio = container_of(bio, struct dio_read, rbio.bio); ++ closure_init(&dio->cl, NULL); ++ ++ /* ++ * this is a _really_ horrible hack just to avoid an atomic sub at the ++ * end: ++ */ ++ if (!sync) { ++ set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); ++ atomic_set(&dio->cl.remaining, ++ CLOSURE_REMAINING_INITIALIZER - ++ CLOSURE_RUNNING + ++ CLOSURE_DESTRUCTOR); ++ } else { ++ atomic_set(&dio->cl.remaining, ++ CLOSURE_REMAINING_INITIALIZER + 1); ++ } ++ ++ dio->req = req; ++ dio->ret = ret; ++ ++ goto start; ++ while (iter->count) { ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_PAGES), ++ &c->bio_read); ++ bio->bi_end_io = bch2_direct_IO_read_split_endio; ++start: ++ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC); ++ bio->bi_iter.bi_sector = offset >> 9; ++ bio->bi_private = dio; ++ ++ ret = bio_iov_iter_get_pages(bio, iter); ++ if (ret < 0) { ++ /* XXX: fault inject this path */ ++ bio->bi_status = BLK_STS_RESOURCE; ++ bio_endio(bio); ++ break; ++ } ++ ++ offset += bio->bi_iter.bi_size; ++ bio_set_pages_dirty(bio); ++ ++ if (iter->count) ++ closure_get(&dio->cl); ++ ++ bch2_read(c, rbio_init(bio, opts), inode->v.i_ino); ++ } ++ ++ iter->count += shorten; ++ ++ if (sync) { ++ closure_sync(&dio->cl); ++ closure_debug_destroy(&dio->cl); ++ ret = dio->ret; ++ bio_check_pages_dirty(&dio->rbio.bio); /* transfers ownership */ ++ return ret; ++ } else { ++ return -EIOCBQUEUED; ++ } ++} ++ ++ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct address_space *mapping = file->f_mapping; ++ size_t count = iov_iter_count(iter); ++ ssize_t ret; ++ ++ if (!count) ++ return 0; /* skip atime */ ++ ++ if (iocb->ki_flags & IOCB_DIRECT) { ++ struct blk_plug plug; ++ ++ ret = filemap_write_and_wait_range(mapping, ++ iocb->ki_pos, ++ iocb->ki_pos + count - 1); ++ if (ret < 0) ++ return ret; ++ ++ file_accessed(file); ++ ++ 
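bch2_direct_IO_read above first clamps the request against i_size, then shortens the iterator so the bios it issues stay block aligned, and puts the shortened amount back once the bios have been built. The clamping arithmetic on its own, as a user-space sketch assuming a 512-byte block size:

/* dio_clamp_demo.c: clamp an O_DIRECT read against i_size and block size. */
#include <stdio.h>

#define BLOCK_BYTES 512LL

static long long round_up_ll(long long v, long long m)
{
	return (v + m - 1) / m * m;
}

int main(void)
{
	long long i_size = 10000, offset = 9728, count = 4096;

	/* Both offset and count must already be block aligned for O_DIRECT. */
	if ((offset | count) & (BLOCK_BYTES - 1)) {
		printf("-EINVAL\n");
		return 1;
	}

	/* Bytes the file can actually supply from this offset. */
	long long ret = i_size - offset;
	if (ret < 0)
		ret = 0;
	if (ret > count)
		ret = count;

	/* Trim the iterator to a block-aligned length; it is restored afterwards. */
	long long shorten = count - round_up_ll(ret, BLOCK_BYTES);

	printf("will report %lld bytes read, iterator shortened by %lld\n", ret, shorten);
	return 0;
}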
blk_start_plug(&plug); ++ ret = bch2_direct_IO_read(iocb, iter); ++ blk_finish_plug(&plug); ++ ++ if (ret >= 0) ++ iocb->ki_pos += ret; ++ } else { ++ bch2_pagecache_add_get(&inode->ei_pagecache_lock); ++ ret = generic_file_read_iter(iocb, iter); ++ bch2_pagecache_add_put(&inode->ei_pagecache_lock); ++ } ++ ++ return ret; ++} ++ ++/* O_DIRECT writes */ ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *); ++ ++static long bch2_dio_write_loop(struct dio_write *dio) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct kiocb *req = dio->req; ++ struct address_space *mapping = req->ki_filp->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(req->ki_filp); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bio *bio = &dio->op.wbio.bio; ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ unsigned unaligned; ++ bool sync = dio->sync; ++ long ret; ++ ++ if (dio->loop) ++ goto loop; ++ ++ while (1) { ++ if (kthread) ++ kthread_use_mm(dio->mm); ++ BUG_ON(current->faults_disabled_mapping); ++ current->faults_disabled_mapping = mapping; ++ ++ ret = bio_iov_iter_get_pages(bio, &dio->iter); ++ ++ current->faults_disabled_mapping = NULL; ++ if (kthread) ++ kthread_unuse_mm(dio->mm); ++ ++ if (unlikely(ret < 0)) ++ goto err; ++ ++ unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); ++ bio->bi_iter.bi_size -= unaligned; ++ iov_iter_revert(&dio->iter, unaligned); ++ ++ if (!bio->bi_iter.bi_size) { ++ /* ++ * bio_iov_iter_get_pages was only able to get < ++ * blocksize worth of pages: ++ */ ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ ret = -EFAULT; ++ goto err; ++ } ++ ++ bch2_write_op_init(&dio->op, c, io_opts(c, &inode->ei_inode)); ++ dio->op.end_io = bch2_dio_write_loop_async; ++ dio->op.target = dio->op.opts.foreground_target; ++ op_journal_seq_set(&dio->op, &inode->ei_journal_seq); ++ dio->op.write_point = writepoint_hashed((unsigned long) current); ++ dio->op.nr_replicas = dio->op.opts.data_replicas; ++ dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); ++ ++ if ((req->ki_flags & IOCB_DSYNC) && ++ !c->opts.journal_flush_disabled) ++ dio->op.flags |= BCH_WRITE_FLUSH; ++ ++ ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), ++ dio->op.opts.data_replicas, 0); ++ if (unlikely(ret) && ++ !bch2_check_range_allocated(c, dio->op.pos, ++ bio_sectors(bio), dio->op.opts.data_replicas)) ++ goto err; ++ ++ task_io_account_write(bio->bi_iter.bi_size); ++ ++ if (!dio->sync && !dio->loop && dio->iter.count) { ++ struct iovec *iov = dio->inline_vecs; ++ ++ if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { ++ iov = kmalloc(dio->iter.nr_segs * sizeof(*iov), ++ GFP_KERNEL); ++ if (unlikely(!iov)) { ++ dio->sync = sync = true; ++ goto do_io; ++ } ++ ++ dio->free_iov = true; ++ } ++ ++ memcpy(iov, dio->iter.iov, dio->iter.nr_segs * sizeof(*iov)); ++ dio->iter.iov = iov; ++ } ++do_io: ++ dio->loop = true; ++ closure_call(&dio->op.cl, bch2_write, NULL, NULL); ++ ++ if (sync) ++ wait_for_completion(&dio->done); ++ else ++ return -EIOCBQUEUED; ++loop: ++ i_sectors_acct(c, inode, &dio->quota_res, ++ dio->op.i_sectors_delta); ++ req->ki_pos += (u64) dio->op.written << 9; ++ dio->written += dio->op.written; ++ ++ spin_lock(&inode->v.i_lock); ++ if (req->ki_pos > inode->v.i_size) ++ i_size_write(&inode->v, req->ki_pos); ++ spin_unlock(&inode->v.i_lock); ++ ++ bio_for_each_segment_all(bv, bio, iter) ++ put_page(bv->bv_page); ++ if (!dio->iter.count || dio->op.error) ++ break; ++ ++ bio_reset(bio); ++ 
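For asynchronous O_DIRECT writes the loop above must keep the caller's iovec alive after the submitting context returns, so it copies it either into the small inline_vecs array or, when there are too many segments, into an allocation that free_iov tells the completion path to free. The same keep-a-copy pattern in user space, with illustrative names:

/* iov_copy_demo.c: keep an iovec array alive past the caller: inline or heap. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

#define INLINE_VECS 2

struct pending_write {
	struct iovec inline_vecs[INLINE_VECS];
	struct iovec *iov;
	int nr_segs;
	int free_iov;		/* set when iov points at heap memory */
};

static int pending_write_init(struct pending_write *p,
			      const struct iovec *iov, int nr_segs)
{
	p->nr_segs = nr_segs;
	p->free_iov = nr_segs > INLINE_VECS;
	p->iov = p->free_iov ? malloc(nr_segs * sizeof(*iov)) : p->inline_vecs;
	if (!p->iov)
		return -1;
	memcpy(p->iov, iov, nr_segs * sizeof(*iov));	/* caller's array may now go away */
	return 0;
}

static void pending_write_done(struct pending_write *p)
{
	if (p->free_iov)
		free(p->iov);
}

int main(void)
{
	char a[16], b[16], c[16];
	struct iovec user_iov[3] = {
		{ a, sizeof(a) }, { b, sizeof(b) }, { c, sizeof(c) },
	};
	struct pending_write p;

	if (pending_write_init(&p, user_iov, 3))
		return 1;
	printf("copied %d segments, heap allocated: %s\n",
	       p.nr_segs, p.free_iov ? "yes" : "no");
	pending_write_done(&p);
	return 0;
}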
reinit_completion(&dio->done); ++ } ++ ++ ret = dio->op.error ?: ((long) dio->written << 9); ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ ++ if (dio->free_iov) ++ kfree(dio->iter.iov); ++ ++ bio_put(bio); ++ ++ /* inode->i_dio_count is our ref on inode and thus bch_fs */ ++ inode_dio_end(&inode->v); ++ ++ if (!sync) { ++ req->ki_complete(req, ret, 0); ++ ret = -EIOCBQUEUED; ++ } ++ return ret; ++} ++ ++static void bch2_dio_write_loop_async(struct bch_write_op *op) ++{ ++ struct dio_write *dio = container_of(op, struct dio_write, op); ++ ++ if (dio->sync) ++ complete(&dio->done); ++ else ++ bch2_dio_write_loop(dio); ++} ++ ++static noinline ++ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) ++{ ++ struct file *file = req->ki_filp; ++ struct address_space *mapping = file->f_mapping; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct dio_write *dio; ++ struct bio *bio; ++ bool locked = true, extending; ++ ssize_t ret; ++ ++ prefetch(&c->opts); ++ prefetch((void *) &c->opts + 64); ++ prefetch(&inode->ei_inode); ++ prefetch((void *) &inode->ei_inode + 64); ++ ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(req, iter); ++ if (unlikely(ret <= 0)) ++ goto err; ++ ++ ret = file_remove_privs(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = file_update_time(file); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) ++ goto err; ++ ++ inode_dio_begin(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ extending = req->ki_pos + iter->count > inode->v.i_size; ++ if (!extending) { ++ inode_unlock(&inode->v); ++ locked = false; ++ } ++ ++ bio = bio_alloc_bioset(GFP_KERNEL, ++ iov_iter_npages(iter, BIO_MAX_PAGES), ++ &c->dio_write_bioset); ++ dio = container_of(bio, struct dio_write, op.wbio.bio); ++ init_completion(&dio->done); ++ dio->req = req; ++ dio->mm = current->mm; ++ dio->loop = false; ++ dio->sync = is_sync_kiocb(req) || extending; ++ dio->free_iov = false; ++ dio->quota_res.sectors = 0; ++ dio->written = 0; ++ dio->iter = *iter; ++ ++ ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, ++ iter->count >> 9, true); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = write_invalidate_inode_pages_range(mapping, ++ req->ki_pos, ++ req->ki_pos + iter->count - 1); ++ if (unlikely(ret)) ++ goto err_put_bio; ++ ++ ret = bch2_dio_write_loop(dio); ++err: ++ if (locked) ++ inode_unlock(&inode->v); ++ return ret; ++err_put_bio: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ bch2_quota_reservation_put(c, inode, &dio->quota_res); ++ bio_put(bio); ++ inode_dio_end(&inode->v); ++ goto err; ++} ++ ++ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) ++{ ++ struct file *file = iocb->ki_filp; ++ struct bch_inode_info *inode = file_bch_inode(file); ++ ssize_t ret; ++ ++ if (iocb->ki_flags & IOCB_DIRECT) ++ return bch2_direct_write(iocb, from); ++ ++ /* We can write back this queue in page reclaim */ ++ current->backing_dev_info = inode_to_bdi(&inode->v); ++ inode_lock(&inode->v); ++ ++ ret = generic_write_checks(iocb, from); ++ if (ret <= 0) ++ goto unlock; ++ ++ ret = file_remove_privs(file); ++ if (ret) ++ goto unlock; ++ ++ ret = file_update_time(file); ++ if (ret) ++ goto unlock; ++ ++ ret = bch2_buffered_write(iocb, from); ++ if (likely(ret > 0)) ++ iocb->ki_pos += ret; ++unlock: ++ inode_unlock(&inode->v); ++ 
current->backing_dev_info = NULL; ++ ++ if (ret > 0) ++ ret = generic_write_sync(iocb, ret); ++ ++ return ret; ++} ++ ++/* fsync: */ ++ ++int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ int ret, ret2; ++ ++ ret = file_write_and_wait_range(file, start, end); ++ if (ret) ++ return ret; ++ ++ if (datasync && !(inode->v.i_state & I_DIRTY_DATASYNC)) ++ goto out; ++ ++ ret = sync_inode_metadata(&inode->v, 1); ++ if (ret) ++ return ret; ++out: ++ if (!c->opts.journal_flush_disabled) ++ ret = bch2_journal_flush_seq(&c->journal, ++ inode->ei_journal_seq); ++ ret2 = file_check_and_advance_wb_err(file); ++ ++ return ret ?: ret2; ++} ++ ++/* truncate: */ ++ ++static inline int range_has_data(struct bch_fs *c, ++ struct bpos start, ++ struct bpos end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, start, 0, k, ret) { ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (bkey_extent_is_data(k.k)) { ++ ret = 1; ++ break; ++ } ++ } ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++static int __bch2_truncate_page(struct bch_inode_info *inode, ++ pgoff_t index, loff_t start, loff_t end) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_page_state *s; ++ unsigned start_offset = start & (PAGE_SIZE - 1); ++ unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; ++ unsigned i; ++ struct page *page; ++ int ret = 0; ++ ++ /* Page boundary? Nothing to do */ ++ if (!((index == start >> PAGE_SHIFT && start_offset) || ++ (index == end >> PAGE_SHIFT && end_offset != PAGE_SIZE))) ++ return 0; ++ ++ /* Above i_size? */ ++ if (index << PAGE_SHIFT >= inode->v.i_size) ++ return 0; ++ ++ page = find_lock_page(mapping, index); ++ if (!page) { ++ /* ++ * XXX: we're doing two index lookups when we end up reading the ++ * page ++ */ ++ ret = range_has_data(c, ++ POS(inode->v.i_ino, index << PAGE_SECTOR_SHIFT), ++ POS(inode->v.i_ino, (index + 1) << PAGE_SECTOR_SHIFT)); ++ if (ret <= 0) ++ return ret; ++ ++ page = find_or_create_page(mapping, index, GFP_KERNEL); ++ if (unlikely(!page)) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ s = bch2_page_state_create(page, 0); ++ if (!s) { ++ ret = -ENOMEM; ++ goto unlock; ++ } ++ ++ if (!PageUptodate(page)) { ++ ret = bch2_read_single_page(page, mapping); ++ if (ret) ++ goto unlock; ++ } ++ ++ if (index != start >> PAGE_SHIFT) ++ start_offset = 0; ++ if (index != end >> PAGE_SHIFT) ++ end_offset = PAGE_SIZE; ++ ++ for (i = round_up(start_offset, block_bytes(c)) >> 9; ++ i < round_down(end_offset, block_bytes(c)) >> 9; ++ i++) { ++ s->s[i].nr_replicas = 0; ++ s->s[i].state = SECTOR_UNALLOCATED; ++ } ++ ++ zero_user_segment(page, start_offset, end_offset); ++ ++ /* ++ * Bit of a hack - we don't want truncate to fail due to -ENOSPC. ++ * ++ * XXX: because we aren't currently tracking whether the page has actual ++ * data in it (vs. just 0s, or only partially written) this wrong. ick. 
++ */ ++ ret = bch2_get_page_disk_reservation(c, inode, page, false); ++ BUG_ON(ret); ++ ++ /* ++ * This removes any writeable userspace mappings; we need to force ++ * .page_mkwrite to be called again before any mmapped writes, to ++ * redirty the full page: ++ */ ++ page_mkclean(page); ++ __set_page_dirty_nobuffers(page); ++unlock: ++ unlock_page(page); ++ put_page(page); ++out: ++ return ret; ++} ++ ++static int bch2_truncate_page(struct bch_inode_info *inode, loff_t from) ++{ ++ return __bch2_truncate_page(inode, from >> PAGE_SHIFT, ++ from, round_up(from, PAGE_SIZE)); ++} ++ ++static int bch2_extend(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *inode_u, ++ struct iattr *iattr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ int ret; ++ ++ /* ++ * sync appends: ++ * ++ * this has to be done _before_ extending i_size: ++ */ ++ ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); ++ if (ret) ++ return ret; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ setattr_copy(&inode->v, iattr); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, inode->v.i_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static int bch2_truncate_finish_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); ++ return 0; ++} ++ ++static int bch2_truncate_start_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, void *p) ++{ ++ u64 *new_i_size = p; ++ ++ bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; ++ bi->bi_size = *new_i_size; ++ return 0; ++} ++ ++int bch2_truncate(struct bch_inode_info *inode, struct iattr *iattr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_inode_unpacked inode_u; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ u64 new_i_size = iattr->ia_size; ++ s64 i_sectors_delta = 0; ++ int ret = 0; ++ ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ /* ++ * fetch current on disk i_size: inode is locked, i_size can only ++ * increase underneath us: ++ */ ++ bch2_trans_init(&trans, c, 0, 0); ++ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, 0); ++ ret = PTR_ERR_OR_ZERO(iter); ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ goto err; ++ ++ /* ++ * check this before next assertion; on filesystem error our normal ++ * invariants are a bit broken (truncate has to truncate the page cache ++ * before the inode). ++ */ ++ ret = bch2_journal_error(&c->journal); ++ if (ret) ++ goto err; ++ ++ BUG_ON(inode->v.i_size < inode_u.bi_size); ++ ++ if (iattr->ia_size > inode->v.i_size) { ++ ret = bch2_extend(inode, &inode_u, iattr); ++ goto err; ++ } ++ ++ ret = bch2_truncate_page(inode, iattr->ia_size); ++ if (unlikely(ret)) ++ goto err; ++ ++ /* ++ * When extending, we're going to write the new i_size to disk ++ * immediately so we need to flush anything above the current on disk ++ * i_size first: ++ * ++ * Also, when extending we need to flush the page that i_size currently ++ * straddles - if it's mapped to userspace, we need to ensure that ++ * userspace has to redirty it and call .mkwrite -> set_page_dirty ++ * again to allocate the part of the page that was extended. 
++ */ ++ if (iattr->ia_size > inode_u.bi_size) ++ ret = filemap_write_and_wait_range(mapping, ++ inode_u.bi_size, ++ iattr->ia_size - 1); ++ else if (iattr->ia_size & (PAGE_SIZE - 1)) ++ ret = filemap_write_and_wait_range(mapping, ++ round_down(iattr->ia_size, PAGE_SIZE), ++ iattr->ia_size - 1); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, ++ &new_i_size, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ truncate_setsize(&inode->v, iattr->ia_size); ++ ++ ret = bch2_fpunch(c, inode->v.i_ino, ++ round_up(iattr->ia_size, block_bytes(c)) >> 9, ++ U64_MAX, &inode->ei_journal_seq, &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ setattr_copy(&inode->v, iattr); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ return ret; ++} ++ ++/* fallocate: */ ++ ++static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ u64 discard_start = round_up(offset, block_bytes(c)) >> 9; ++ u64 discard_end = round_down(offset + len, block_bytes(c)) >> 9; ++ int ret = 0; ++ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ ret = __bch2_truncate_page(inode, ++ offset >> PAGE_SHIFT, ++ offset, offset + len); ++ if (unlikely(ret)) ++ goto err; ++ ++ if (offset >> PAGE_SHIFT != ++ (offset + len) >> PAGE_SHIFT) { ++ ret = __bch2_truncate_page(inode, ++ (offset + len) >> PAGE_SHIFT, ++ offset, offset + len); ++ if (unlikely(ret)) ++ goto err; ++ } ++ ++ truncate_pagecache_range(&inode->v, offset, offset + len - 1); ++ ++ if (discard_start < discard_end) { ++ s64 i_sectors_delta = 0; ++ ++ ret = bch2_fpunch(c, inode->v.i_ino, ++ discard_start, discard_end, ++ &inode->ei_journal_seq, ++ &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ } ++err: ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ ++ return ret; ++} ++ ++static long bchfs_fcollapse_finsert(struct bch_inode_info *inode, ++ loff_t offset, loff_t len, ++ bool insert) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bkey_on_stack copy; ++ struct btree_trans trans; ++ struct btree_iter *src, *dst; ++ loff_t shift, new_size; ++ u64 src_start; ++ int ret; ++ ++ if ((offset | len) & (block_bytes(c) - 1)) ++ return -EINVAL; ++ ++ bkey_on_stack_init(©); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 256); ++ ++ /* ++ * We need i_mutex to keep the page cache consistent with the extents ++ * btree, and the btree consistent with i_size - we don't need outside ++ * locking for the extents btree itself, because we're using linked ++ * iterators ++ */ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ if (insert) { ++ ret = -EFBIG; ++ if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) ++ goto err; ++ ++ ret = -EINVAL; ++ if (offset >= inode->v.i_size) ++ goto err; ++ ++ src_start = U64_MAX; ++ shift = len; ++ } else { ++ ret = -EINVAL; ++ if (offset + len >= inode->v.i_size) ++ goto err; ++ ++ src_start = offset + len; ++ shift = -len; ++ } ++ ++ new_size = 
inode->v.i_size + shift; ++ ++ ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); ++ if (ret) ++ goto err; ++ ++ if (insert) { ++ i_size_write(&inode->v, new_size); ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, new_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ } else { ++ s64 i_sectors_delta = 0; ++ ++ ret = bch2_fpunch(c, inode->v.i_ino, ++ offset >> 9, (offset + len) >> 9, ++ &inode->ei_journal_seq, ++ &i_sectors_delta); ++ i_sectors_acct(c, inode, NULL, i_sectors_delta); ++ ++ if (ret) ++ goto err; ++ } ++ ++ src = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, src_start >> 9), ++ BTREE_ITER_INTENT); ++ BUG_ON(IS_ERR_OR_NULL(src)); ++ ++ dst = bch2_trans_copy_iter(&trans, src); ++ BUG_ON(IS_ERR_OR_NULL(dst)); ++ ++ while (1) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i delete; ++ struct bkey_s_c k; ++ struct bpos next_pos; ++ struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); ++ struct bpos atomic_end; ++ unsigned trigger_flags = 0; ++ ++ k = insert ++ ? bch2_btree_iter_peek_prev(src) ++ : bch2_btree_iter_peek(src); ++ if ((ret = bkey_err(k))) ++ goto bkey_err; ++ ++ if (!k.k || k.k->p.inode != inode->v.i_ino) ++ break; ++ ++ BUG_ON(bkey_cmp(src->pos, bkey_start_pos(k.k))); ++ ++ if (insert && ++ bkey_cmp(k.k->p, POS(inode->v.i_ino, offset >> 9)) <= 0) ++ break; ++reassemble: ++ bkey_on_stack_reassemble(©, c, k); ++ ++ if (insert && ++ bkey_cmp(bkey_start_pos(k.k), move_pos) < 0) ++ bch2_cut_front(move_pos, copy.k); ++ ++ copy.k->k.p.offset += shift >> 9; ++ bch2_btree_iter_set_pos(dst, bkey_start_pos(©.k->k)); ++ ++ ret = bch2_extent_atomic_end(dst, copy.k, &atomic_end); ++ if (ret) ++ goto bkey_err; ++ ++ if (bkey_cmp(atomic_end, copy.k->k.p)) { ++ if (insert) { ++ move_pos = atomic_end; ++ move_pos.offset -= shift >> 9; ++ goto reassemble; ++ } else { ++ bch2_cut_back(atomic_end, copy.k); ++ } ++ } ++ ++ bkey_init(&delete.k); ++ delete.k.p = copy.k->k.p; ++ delete.k.size = copy.k->k.size; ++ delete.k.p.offset -= shift >> 9; ++ ++ next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; ++ ++ if (copy.k->k.size == k.k->size) { ++ /* ++ * If we're moving the entire extent, we can skip ++ * running triggers: ++ */ ++ trigger_flags |= BTREE_TRIGGER_NORUN; ++ } else { ++ /* We might end up splitting compressed extents: */ ++ unsigned nr_ptrs = ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); ++ ++ ret = bch2_disk_reservation_get(c, &disk_res, ++ copy.k->k.size, nr_ptrs, ++ BCH_DISK_RESERVATION_NOFAIL); ++ BUG_ON(ret); ++ } ++ ++ bch2_btree_iter_set_pos(src, bkey_start_pos(&delete.k)); ++ ++ ret = bch2_trans_update(&trans, src, &delete, trigger_flags) ?: ++ bch2_trans_update(&trans, dst, copy.k, trigger_flags) ?: ++ bch2_trans_commit(&trans, &disk_res, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOFAIL); ++ bch2_disk_reservation_put(c, &disk_res); ++bkey_err: ++ if (!ret) ++ bch2_btree_iter_set_pos(src, next_pos); ++ ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ goto err; ++ ++ bch2_trans_cond_resched(&trans); ++ } ++ bch2_trans_unlock(&trans); ++ ++ if (!insert) { ++ i_size_write(&inode->v, new_size); ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, new_size, ++ ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ } ++err: ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(©, c); ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ return ret; ++} ++ ++static long bchfs_fallocate(struct bch_inode_info *inode, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct address_space *mapping = inode->v.i_mapping; ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bpos end_pos; ++ loff_t end = offset + len; ++ loff_t block_start = round_down(offset, block_bytes(c)); ++ loff_t block_end = round_up(end, block_bytes(c)); ++ unsigned sectors; ++ unsigned replicas = io_opts(c, &inode->ei_inode).data_replicas; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ inode_lock(&inode->v); ++ inode_dio_wait(&inode->v); ++ bch2_pagecache_block_get(&inode->ei_pagecache_lock); ++ ++ if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { ++ ret = inode_newsize_ok(&inode->v, end); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode & FALLOC_FL_ZERO_RANGE) { ++ ret = __bch2_truncate_page(inode, ++ offset >> PAGE_SHIFT, ++ offset, end); ++ ++ if (!ret && ++ offset >> PAGE_SHIFT != end >> PAGE_SHIFT) ++ ret = __bch2_truncate_page(inode, ++ end >> PAGE_SHIFT, ++ offset, end); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ truncate_pagecache_range(&inode->v, offset, end - 1); ++ } ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, block_start >> 9), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ end_pos = POS(inode->v.i_ino, block_end >> 9); ++ ++ while (bkey_cmp(iter->pos, end_pos) < 0) { ++ s64 i_sectors_delta = 0; ++ struct disk_reservation disk_res = { 0 }; ++ struct quota_res quota_res = { 0 }; ++ struct bkey_i_reservation reservation; ++ struct bkey_s_c k; ++ ++ bch2_trans_begin(&trans); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ if ((ret = bkey_err(k))) ++ goto bkey_err; ++ ++ /* already reserved */ ++ if (k.k->type == KEY_TYPE_reservation && ++ bkey_s_c_to_reservation(k).v->nr_replicas >= replicas) { ++ bch2_btree_iter_next_slot(iter); ++ continue; ++ } ++ ++ if (bkey_extent_is_data(k.k) && ++ !(mode & FALLOC_FL_ZERO_RANGE)) { ++ bch2_btree_iter_next_slot(iter); ++ continue; ++ } ++ ++ bkey_reservation_init(&reservation.k_i); ++ reservation.k.type = 
KEY_TYPE_reservation; ++ reservation.k.p = k.k->p; ++ reservation.k.size = k.k->size; ++ ++ bch2_cut_front(iter->pos, &reservation.k_i); ++ bch2_cut_back(end_pos, &reservation.k_i); ++ ++ sectors = reservation.k.size; ++ reservation.v.nr_replicas = bch2_bkey_nr_ptrs_allocated(k); ++ ++ if (!bkey_extent_is_allocation(k.k)) { ++ ret = bch2_quota_reservation_add(c, inode, ++ "a_res, ++ sectors, true); ++ if (unlikely(ret)) ++ goto bkey_err; ++ } ++ ++ if (reservation.v.nr_replicas < replicas || ++ bch2_bkey_sectors_compressed(k)) { ++ ret = bch2_disk_reservation_get(c, &disk_res, sectors, ++ replicas, 0); ++ if (unlikely(ret)) ++ goto bkey_err; ++ ++ reservation.v.nr_replicas = disk_res.nr_replicas; ++ } ++ ++ ret = bch2_extent_update(&trans, iter, &reservation.k_i, ++ &disk_res, &inode->ei_journal_seq, ++ 0, &i_sectors_delta); ++ i_sectors_acct(c, inode, "a_res, i_sectors_delta); ++bkey_err: ++ bch2_quota_reservation_put(c, inode, "a_res); ++ bch2_disk_reservation_put(c, &disk_res); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ goto err; ++ } ++ ++ /* ++ * Do we need to extend the file? ++ * ++ * If we zeroed up to the end of the file, we dropped whatever writes ++ * were going to write out the current i_size, so we have to extend ++ * manually even if FL_KEEP_SIZE was set: ++ */ ++ if (end >= inode->v.i_size && ++ (!(mode & FALLOC_FL_KEEP_SIZE) || ++ (mode & FALLOC_FL_ZERO_RANGE))) { ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ ++ do { ++ bch2_trans_begin(&trans); ++ inode_iter = bch2_inode_peek(&trans, &inode_u, ++ inode->v.i_ino, 0); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ } while (ret == -EINTR); ++ ++ bch2_trans_unlock(&trans); ++ ++ if (ret) ++ goto err; ++ ++ /* ++ * Sync existing appends before extending i_size, ++ * as in bch2_extend(): ++ */ ++ ret = filemap_write_and_wait_range(mapping, ++ inode_u.bi_size, S64_MAX); ++ if (ret) ++ goto err; ++ ++ if (mode & FALLOC_FL_KEEP_SIZE) ++ end = inode->v.i_size; ++ else ++ i_size_write(&inode->v, end); ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode_size(c, inode, end, 0); ++ mutex_unlock(&inode->ei_update_lock); ++ } ++err: ++ bch2_trans_exit(&trans); ++ bch2_pagecache_block_put(&inode->ei_pagecache_lock); ++ inode_unlock(&inode->v); ++ return ret; ++} ++ ++long bch2_fallocate_dispatch(struct file *file, int mode, ++ loff_t offset, loff_t len) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ long ret; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return -EROFS; ++ ++ if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) ++ ret = bchfs_fallocate(inode, mode, offset, len); ++ else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) ++ ret = bchfs_fpunch(inode, offset, len); ++ else if (mode == FALLOC_FL_INSERT_RANGE) ++ ret = bchfs_fcollapse_finsert(inode, offset, len, true); ++ else if (mode == FALLOC_FL_COLLAPSE_RANGE) ++ ret = bchfs_fcollapse_finsert(inode, offset, len, false); ++ else ++ ret = -EOPNOTSUPP; ++ ++ percpu_ref_put(&c->writes); ++ ++ return ret; ++} ++ ++static void mark_range_unallocated(struct bch_inode_info *inode, ++ loff_t start, loff_t end) ++{ ++ pgoff_t index = start >> PAGE_SHIFT; ++ pgoff_t end_index = (end - 1) >> PAGE_SHIFT; ++ struct pagevec pvec; ++ ++ pagevec_init(&pvec); ++ ++ do { ++ unsigned nr_pages, i, j; ++ ++ nr_pages = pagevec_lookup_range(&pvec, inode->v.i_mapping, ++ &index, end_index); ++ if (nr_pages == 0) ++ break; ++ ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = 
pvec.pages[i]; ++ struct bch_page_state *s; ++ ++ lock_page(page); ++ s = bch2_page_state(page); ++ ++ if (s) { ++ spin_lock(&s->lock); ++ for (j = 0; j < PAGE_SECTORS; j++) ++ s->s[j].nr_replicas = 0; ++ spin_unlock(&s->lock); ++ } ++ ++ unlock_page(page); ++ } ++ pagevec_release(&pvec); ++ } while (index <= end_index); ++} ++ ++loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, ++ struct file *file_dst, loff_t pos_dst, ++ loff_t len, unsigned remap_flags) ++{ ++ struct bch_inode_info *src = file_bch_inode(file_src); ++ struct bch_inode_info *dst = file_bch_inode(file_dst); ++ struct bch_fs *c = src->v.i_sb->s_fs_info; ++ s64 i_sectors_delta = 0; ++ u64 aligned_len; ++ loff_t ret = 0; ++ ++ if (!c->opts.reflink) ++ return -EOPNOTSUPP; ++ ++ if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) ++ return -EINVAL; ++ ++ if (remap_flags & REMAP_FILE_DEDUP) ++ return -EOPNOTSUPP; ++ ++ if ((pos_src & (block_bytes(c) - 1)) || ++ (pos_dst & (block_bytes(c) - 1))) ++ return -EINVAL; ++ ++ if (src == dst && ++ abs(pos_src - pos_dst) < len) ++ return -EINVAL; ++ ++ bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ file_update_time(file_dst); ++ ++ inode_dio_wait(&src->v); ++ inode_dio_wait(&dst->v); ++ ++ ret = generic_remap_file_range_prep(file_src, pos_src, ++ file_dst, pos_dst, ++ &len, remap_flags); ++ if (ret < 0 || len == 0) ++ goto err; ++ ++ aligned_len = round_up((u64) len, block_bytes(c)); ++ ++ ret = write_invalidate_inode_pages_range(dst->v.i_mapping, ++ pos_dst, pos_dst + len - 1); ++ if (ret) ++ goto err; ++ ++ mark_range_unallocated(src, pos_src, pos_src + aligned_len); ++ ++ ret = bch2_remap_range(c, ++ POS(dst->v.i_ino, pos_dst >> 9), ++ POS(src->v.i_ino, pos_src >> 9), ++ aligned_len >> 9, ++ &dst->ei_journal_seq, ++ pos_dst + len, &i_sectors_delta); ++ if (ret < 0) ++ goto err; ++ ++ /* ++ * due to alignment, we might have remapped slightly more than requsted ++ */ ++ ret = min((u64) ret << 9, (u64) len); ++ ++ /* XXX get a quota reservation */ ++ i_sectors_acct(c, dst, NULL, i_sectors_delta); ++ ++ spin_lock(&dst->v.i_lock); ++ if (pos_dst + ret > dst->v.i_size) ++ i_size_write(&dst->v, pos_dst + ret); ++ spin_unlock(&dst->v.i_lock); ++err: ++ bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); ++ ++ return ret; ++} ++ ++/* fseek: */ ++ ++static int page_data_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (s) ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state >= SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t bch2_seek_pagecache_data(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ struct page *page; ++ pgoff_t start_index = start_offset >> PAGE_SHIFT; ++ pgoff_t end_index = end_offset >> PAGE_SHIFT; ++ pgoff_t index = start_index; ++ loff_t ret; ++ int offset; ++ ++ while (index <= end_index) { ++ if (find_get_pages_range(mapping, &index, end_index, 1, &page)) { ++ lock_page(page); ++ ++ offset = page_data_offset(page, ++ page->index == start_index ++ ? 
start_offset & (PAGE_SIZE - 1) ++ : 0); ++ if (offset >= 0) { ++ ret = clamp(((loff_t) page->index << PAGE_SHIFT) + ++ offset, ++ start_offset, end_offset); ++ unlock_page(page); ++ put_page(page); ++ return ret; ++ } ++ ++ unlock_page(page); ++ put_page(page); ++ } else { ++ break; ++ } ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_data(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 isize, next_data = MAX_LFS_FILESIZE; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, offset >> 9), 0, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ break; ++ } else if (bkey_extent_is_data(k.k)) { ++ next_data = max(offset, bkey_start_offset(k.k) << 9); ++ break; ++ } else if (k.k->p.offset >> 9 > isize) ++ break; ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ if (next_data > offset) ++ next_data = bch2_seek_pagecache_data(&inode->v, ++ offset, next_data); ++ ++ if (next_data >= isize) ++ return -ENXIO; ++ ++ return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); ++} ++ ++static int __page_hole_offset(struct page *page, unsigned offset) ++{ ++ struct bch_page_state *s = bch2_page_state(page); ++ unsigned i; ++ ++ if (!s) ++ return 0; ++ ++ for (i = offset >> 9; i < PAGE_SECTORS; i++) ++ if (s->s[i].state < SECTOR_DIRTY) ++ return i << 9; ++ ++ return -1; ++} ++ ++static loff_t page_hole_offset(struct address_space *mapping, loff_t offset) ++{ ++ pgoff_t index = offset >> PAGE_SHIFT; ++ struct page *page; ++ int pg_offset; ++ loff_t ret = -1; ++ ++ page = find_lock_entry(mapping, index); ++ if (!page || xa_is_value(page)) ++ return offset; ++ ++ pg_offset = __page_hole_offset(page, offset & (PAGE_SIZE - 1)); ++ if (pg_offset >= 0) ++ ret = ((loff_t) index << PAGE_SHIFT) + pg_offset; ++ ++ unlock_page(page); ++ ++ return ret; ++} ++ ++static loff_t bch2_seek_pagecache_hole(struct inode *vinode, ++ loff_t start_offset, ++ loff_t end_offset) ++{ ++ struct address_space *mapping = vinode->i_mapping; ++ loff_t offset = start_offset, hole; ++ ++ while (offset < end_offset) { ++ hole = page_hole_offset(mapping, offset); ++ if (hole >= 0 && hole <= end_offset) ++ return max(start_offset, hole); ++ ++ offset += PAGE_SIZE; ++ offset &= PAGE_MASK; ++ } ++ ++ return end_offset; ++} ++ ++static loff_t bch2_seek_hole(struct file *file, u64 offset) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 isize, next_hole = MAX_LFS_FILESIZE; ++ int ret; ++ ++ isize = i_size_read(&inode->v); ++ if (offset >= isize) ++ return -ENXIO; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode->v.i_ino, offset >> 9), ++ BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->p.inode != inode->v.i_ino) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ offset, MAX_LFS_FILESIZE); ++ break; ++ } else if (!bkey_extent_is_data(k.k)) { ++ next_hole = bch2_seek_pagecache_hole(&inode->v, ++ max(offset, bkey_start_offset(k.k) << 9), ++ k.k->p.offset << 9); ++ ++ if (next_hole < k.k->p.offset << 9) ++ break; ++ } else { ++ offset = max(offset, bkey_start_offset(k.k) << 9); 
++ } ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ if (next_hole > isize) ++ next_hole = isize; ++ ++ return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); ++} ++ ++loff_t bch2_llseek(struct file *file, loff_t offset, int whence) ++{ ++ switch (whence) { ++ case SEEK_SET: ++ case SEEK_CUR: ++ case SEEK_END: ++ return generic_file_llseek(file, offset, whence); ++ case SEEK_DATA: ++ return bch2_seek_data(file, offset); ++ case SEEK_HOLE: ++ return bch2_seek_hole(file, offset); ++ } ++ ++ return -EINVAL; ++} ++ ++void bch2_fs_fsio_exit(struct bch_fs *c) ++{ ++ bioset_exit(&c->dio_write_bioset); ++ bioset_exit(&c->dio_read_bioset); ++ bioset_exit(&c->writepage_bioset); ++} ++ ++int bch2_fs_fsio_init(struct bch_fs *c) ++{ ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bioset_init(&c->writepage_bioset, ++ 4, offsetof(struct bch_writepage_io, op.wbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_read_bioset, ++ 4, offsetof(struct dio_read, rbio.bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->dio_write_bioset, ++ 4, offsetof(struct dio_write, op.wbio.bio), ++ BIOSET_NEED_BVECS)) ++ ret = -ENOMEM; ++ ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h +new file mode 100644 +index 000000000000..7063556d289b +--- /dev/null ++++ b/fs/bcachefs/fs-io.h +@@ -0,0 +1,57 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IO_H ++#define _BCACHEFS_FS_IO_H ++ ++#ifndef NO_BCACHEFS_FS ++ ++#include "buckets.h" ++#include "io_types.h" ++ ++#include ++ ++struct quota_res; ++ ++int __must_check bch2_write_inode_size(struct bch_fs *, ++ struct bch_inode_info *, ++ loff_t, unsigned); ++ ++int bch2_writepage(struct page *, struct writeback_control *); ++int bch2_readpage(struct file *, struct page *); ++ ++int bch2_writepages(struct address_space *, struct writeback_control *); ++int bch2_readpages(struct file *, struct address_space *, ++ struct list_head *, unsigned); ++ ++int bch2_write_begin(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page **, void **); ++int bch2_write_end(struct file *, struct address_space *, loff_t, ++ unsigned, unsigned, struct page *, void *); ++ ++ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); ++ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); ++ ++int bch2_fsync(struct file *, loff_t, loff_t, int); ++ ++int bch2_truncate(struct bch_inode_info *, struct iattr *); ++long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); ++ ++loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, ++ loff_t, loff_t, unsigned); ++ ++loff_t bch2_llseek(struct file *, loff_t, int); ++ ++vm_fault_t bch2_page_fault(struct vm_fault *); ++vm_fault_t bch2_page_mkwrite(struct vm_fault *); ++void bch2_invalidatepage(struct page *, unsigned int, unsigned int); ++int bch2_releasepage(struct page *, gfp_t); ++int bch2_migrate_page(struct address_space *, struct page *, ++ struct page *, enum migrate_mode); ++ ++void bch2_fs_fsio_exit(struct bch_fs *); ++int bch2_fs_fsio_init(struct bch_fs *); ++#else ++static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} ++static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } ++#endif ++ ++#endif /* _BCACHEFS_FS_IO_H */ +diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c +new file mode 100644 +index 000000000000..0873d2f0928c +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.c +@@ -0,0 +1,312 @@ ++// 
SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-ioctl.h" ++#include "quota.h" ++ ++#include ++#include ++ ++#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) ++ ++struct flags_set { ++ unsigned mask; ++ unsigned flags; ++ ++ unsigned projid; ++}; ++ ++static int bch2_inode_flags_set(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ /* ++ * We're relying on btree locking here for exclusion with other ioctl ++ * calls - use the flags in the btree (@bi), not inode->i_flags: ++ */ ++ struct flags_set *s = p; ++ unsigned newflags = s->flags; ++ unsigned oldflags = bi->bi_flags & s->mask; ++ ++ if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && ++ !capable(CAP_LINUX_IMMUTABLE)) ++ return -EPERM; ++ ++ if (!S_ISREG(bi->bi_mode) && ++ !S_ISDIR(bi->bi_mode) && ++ (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) ++ return -EINVAL; ++ ++ bi->bi_flags &= ~s->mask; ++ bi->bi_flags |= newflags; ++ ++ bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); ++ return 0; ++} ++ ++static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) ++{ ++ unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); ++ ++ return put_user(flags, arg); ++} ++ ++static int bch2_ioc_setflags(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ void __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; ++ unsigned uflags; ++ int ret; ++ ++ if (get_user(uflags, (int __user *) arg)) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_uflags, uflags); ++ if (uflags) ++ return -EOPNOTSUPP; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if (!inode_owner_or_capable(&inode->v)) { ++ ret = -EACCES; ++ goto setflags_out; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, ++ ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++setflags_out: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct fsxattr fa = { 0 }; ++ ++ fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); ++ fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; ++ ++ return copy_to_user(arg, &fa, sizeof(fa)); ++} ++ ++static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct flags_set *s = p; ++ ++ if (s->projid != bi->bi_project) { ++ bi->bi_fields_set |= 1U << Inode_opt_project; ++ bi->bi_project = s->projid; ++ } ++ ++ return bch2_inode_flags_set(inode, bi, p); ++} ++ ++static int bch2_ioc_fssetxattr(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *inode, ++ struct fsxattr __user *arg) ++{ ++ struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; ++ struct fsxattr fa; ++ int ret; ++ ++ if (copy_from_user(&fa, arg, sizeof(fa))) ++ return -EFAULT; ++ ++ s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); ++ if (fa.fsx_xflags) ++ return -EOPNOTSUPP; ++ ++ if (fa.fsx_projid >= U32_MAX) ++ return -EINVAL; ++ ++ /* ++ * inode fields accessible via the xattr interface are stored with a +1 ++ * bias, so that 0 means unset: ++ */ ++ s.projid = fa.fsx_projid + 
1; ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ return ret; ++ ++ inode_lock(&inode->v); ++ if (!inode_owner_or_capable(&inode->v)) { ++ ret = -EACCES; ++ goto err; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_set_projid(c, inode, fa.fsx_projid); ++ if (ret) ++ goto err_unlock; ++ ++ ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, ++ ATTR_CTIME); ++err_unlock: ++ mutex_unlock(&inode->ei_update_lock); ++err: ++ inode_unlock(&inode->v); ++ mnt_drop_write_file(file); ++ return ret; ++} ++ ++static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_inode_info *dir = p; ++ ++ return !bch2_reinherit_attrs(bi, &dir->ei_inode); ++} ++ ++static int bch2_ioc_reinherit_attrs(struct bch_fs *c, ++ struct file *file, ++ struct bch_inode_info *src, ++ const char __user *name) ++{ ++ struct bch_inode_info *dst; ++ struct inode *vinode = NULL; ++ char *kname = NULL; ++ struct qstr qstr; ++ int ret = 0; ++ u64 inum; ++ ++ kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); ++ if (!kname) ++ return -ENOMEM; ++ ++ ret = strncpy_from_user(kname, name, BCH_NAME_MAX); ++ if (unlikely(ret < 0)) ++ goto err1; ++ ++ qstr.len = ret; ++ qstr.name = kname; ++ ++ ret = -ENOENT; ++ inum = bch2_dirent_lookup(c, src->v.i_ino, ++ &src->ei_str_hash, ++ &qstr); ++ if (!inum) ++ goto err1; ++ ++ vinode = bch2_vfs_inode_get(c, inum); ++ ret = PTR_ERR_OR_ZERO(vinode); ++ if (ret) ++ goto err1; ++ ++ dst = to_bch_ei(vinode); ++ ++ ret = mnt_want_write_file(file); ++ if (ret) ++ goto err2; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ if (inode_attr_changing(src, dst, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst, ++ src->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err3; ++ } ++ ++ ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); ++err3: ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); ++ ++ /* return true if we did work */ ++ if (ret >= 0) ++ ret = !ret; ++ ++ mnt_drop_write_file(file); ++err2: ++ iput(vinode); ++err1: ++ kfree(kname); ++ ++ return ret; ++} ++ ++long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct super_block *sb = inode->v.i_sb; ++ struct bch_fs *c = sb->s_fs_info; ++ ++ switch (cmd) { ++ case FS_IOC_GETFLAGS: ++ return bch2_ioc_getflags(inode, (int __user *) arg); ++ ++ case FS_IOC_SETFLAGS: ++ return bch2_ioc_setflags(c, file, inode, (int __user *) arg); ++ ++ case FS_IOC_FSGETXATTR: ++ return bch2_ioc_fsgetxattr(inode, (void __user *) arg); ++ case FS_IOC_FSSETXATTR: ++ return bch2_ioc_fssetxattr(c, file, inode, ++ (void __user *) arg); ++ ++ case BCHFS_IOC_REINHERIT_ATTRS: ++ return bch2_ioc_reinherit_attrs(c, file, inode, ++ (void __user *) arg); ++ ++ case FS_IOC_GETVERSION: ++ return -ENOTTY; ++ case FS_IOC_SETVERSION: ++ return -ENOTTY; ++ ++ case FS_IOC_GOINGDOWN: ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ down_write(&sb->s_umount); ++ sb->s_flags |= SB_RDONLY; ++ if (bch2_fs_emergency_read_only(c)) ++ bch_err(c, "emergency read only due to ioctl"); ++ up_write(&sb->s_umount); ++ return 0; ++ ++ default: ++ return bch2_fs_ioctl(c, cmd, (void __user *) arg); ++ } ++} ++ ++#ifdef CONFIG_COMPAT ++long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) ++{ ++ /* These are just misnamed, they actually get/put from/to user an int */ ++ switch (cmd) { ++ case FS_IOC_GETFLAGS: ++ cmd = FS_IOC_GETFLAGS; 
++ break; ++ case FS_IOC32_SETFLAGS: ++ cmd = FS_IOC_SETFLAGS; ++ break; ++ default: ++ return -ENOIOCTLCMD; ++ } ++ return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); ++} ++#endif ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h +new file mode 100644 +index 000000000000..f201980ef2c3 +--- /dev/null ++++ b/fs/bcachefs/fs-ioctl.h +@@ -0,0 +1,81 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_IOCTL_H ++#define _BCACHEFS_FS_IOCTL_H ++ ++/* Inode flags: */ ++ ++/* bcachefs inode flags -> vfs inode flags: */ ++static const unsigned bch_flags_to_vfs[] = { ++ [__BCH_INODE_SYNC] = S_SYNC, ++ [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, ++ [__BCH_INODE_APPEND] = S_APPEND, ++ [__BCH_INODE_NOATIME] = S_NOATIME, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ ++static const unsigned bch_flags_to_uflags[] = { ++ [__BCH_INODE_SYNC] = FS_SYNC_FL, ++ [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, ++ [__BCH_INODE_APPEND] = FS_APPEND_FL, ++ [__BCH_INODE_NODUMP] = FS_NODUMP_FL, ++ [__BCH_INODE_NOATIME] = FS_NOATIME_FL, ++}; ++ ++/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ ++static const unsigned bch_flags_to_xflags[] = { ++ [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, ++ [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, ++ [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, ++ [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, ++ [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, ++ //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; ++}; ++ ++#define set_flags(_map, _in, _out) \ ++do { \ ++ unsigned _i; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & (1 << _i)) \ ++ (_out) |= _map[_i]; \ ++ else \ ++ (_out) &= ~_map[_i]; \ ++} while (0) ++ ++#define map_flags(_map, _in) \ ++({ \ ++ unsigned _out = 0; \ ++ \ ++ set_flags(_map, _in, _out); \ ++ _out; \ ++}) ++ ++#define map_flags_rev(_map, _in) \ ++({ \ ++ unsigned _i, _out = 0; \ ++ \ ++ for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ ++ if ((_in) & _map[_i]) { \ ++ (_out) |= 1 << _i; \ ++ (_in) &= ~_map[_i]; \ ++ } \ ++ (_out); \ ++}) ++ ++#define map_defined(_map) \ ++({ \ ++ unsigned _in = ~0; \ ++ \ ++ map_flags_rev(_map, _in); \ ++}) ++ ++/* Set VFS inode flags from bcachefs inode: */ ++static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) ++{ ++ set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); ++} ++ ++long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); ++long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); ++ ++#endif /* _BCACHEFS_FS_IOCTL_H */ +diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c +new file mode 100644 +index 000000000000..6a9820e83db7 +--- /dev/null ++++ b/fs/bcachefs/fs.c +@@ -0,0 +1,1614 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifndef NO_BCACHEFS_FS ++ ++#include "bcachefs.h" ++#include "acl.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "chardev.h" ++#include "dirent.h" ++#include "extents.h" ++#include "fs.h" ++#include "fs-common.h" ++#include "fs-io.h" ++#include "fs-ioctl.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "quota.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct kmem_cache *bch2_inode_cache; ++ ++static void bch2_vfs_inode_init(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *); ++ ++static void journal_seq_copy(struct bch_fs *c, 
++ struct bch_inode_info *dst, ++ u64 journal_seq) ++{ ++ u64 old, v = READ_ONCE(dst->ei_journal_seq); ++ ++ do { ++ old = v; ++ ++ if (old >= journal_seq) ++ break; ++ } while ((v = cmpxchg(&dst->ei_journal_seq, old, journal_seq)) != old); ++ ++ bch2_journal_set_has_inum(&c->journal, dst->v.i_ino, journal_seq); ++} ++ ++static void __pagecache_lock_put(struct pagecache_lock *lock, long i) ++{ ++ BUG_ON(atomic_long_read(&lock->v) == 0); ++ ++ if (atomic_long_sub_return_release(i, &lock->v) == 0) ++ wake_up_all(&lock->wait); ++} ++ ++static bool __pagecache_lock_tryget(struct pagecache_lock *lock, long i) ++{ ++ long v = atomic_long_read(&lock->v), old; ++ ++ do { ++ old = v; ++ ++ if (i > 0 ? v < 0 : v > 0) ++ return false; ++ } while ((v = atomic_long_cmpxchg_acquire(&lock->v, ++ old, old + i)) != old); ++ return true; ++} ++ ++static void __pagecache_lock_get(struct pagecache_lock *lock, long i) ++{ ++ wait_event(lock->wait, __pagecache_lock_tryget(lock, i)); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, 1); ++} ++ ++void bch2_pagecache_add_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, 1); ++} ++ ++void bch2_pagecache_block_put(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_put(lock, -1); ++} ++ ++void bch2_pagecache_block_get(struct pagecache_lock *lock) ++{ ++ __pagecache_lock_get(lock, -1); ++} ++ ++void bch2_inode_update_after_write(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ unsigned fields) ++{ ++ set_nlink(&inode->v, bch2_inode_nlink_get(bi)); ++ i_uid_write(&inode->v, bi->bi_uid); ++ i_gid_write(&inode->v, bi->bi_gid); ++ inode->v.i_mode = bi->bi_mode; ++ ++ if (fields & ATTR_ATIME) ++ inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); ++ if (fields & ATTR_MTIME) ++ inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); ++ if (fields & ATTR_CTIME) ++ inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); ++ ++ inode->ei_inode = *bi; ++ ++ bch2_inode_flags_to_vfs(inode); ++} ++ ++int __must_check bch2_write_inode(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ inode_set_fn set, ++ void *p, unsigned fields) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_inode_unpacked inode_u; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter) ?: ++ (set ? set(inode, &inode_u, p) : 0) ?: ++ bch2_inode_write(&trans, iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++ ++ /* ++ * the btree node lock protects inode->ei_inode, not ei_update_lock; ++ * this is important for inode updates via bchfs_write_index_update ++ */ ++ if (!ret) ++ bch2_inode_update_after_write(c, inode, &inode_u, fields); ++ ++ bch2_trans_iter_put(&trans, iter); ++ ++ if (ret == -EINTR) ++ goto retry; ++ ++ bch2_trans_exit(&trans); ++ return ret < 0 ? 
ret : 0; ++} ++ ++int bch2_fs_quota_transfer(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_qid new_qid, ++ unsigned qtypes, ++ enum quota_acct_mode mode) ++{ ++ unsigned i; ++ int ret; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ for (i = 0; i < QTYP_NR; i++) ++ if (new_qid.q[i] == inode->ei_qid.q[i]) ++ qtypes &= ~(1U << i); ++ ++ if (!qtypes) ++ return 0; ++ ++ mutex_lock(&inode->ei_quota_lock); ++ ++ ret = bch2_quota_transfer(c, qtypes, new_qid, ++ inode->ei_qid, ++ inode->v.i_blocks + ++ inode->ei_quota_reserved, ++ mode); ++ if (!ret) ++ for (i = 0; i < QTYP_NR; i++) ++ if (qtypes & (1 << i)) ++ inode->ei_qid.q[i] = new_qid.q[i]; ++ ++ mutex_unlock(&inode->ei_quota_lock); ++ ++ return ret; ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *c, u64 inum) ++{ ++ struct bch_inode_unpacked inode_u; ++ struct bch_inode_info *inode; ++ int ret; ++ ++ inode = to_bch_ei(iget_locked(c->vfs_sb, inum)); ++ if (unlikely(!inode)) ++ return ERR_PTR(-ENOMEM); ++ if (!(inode->v.i_state & I_NEW)) ++ return &inode->v; ++ ++ ret = bch2_inode_find_by_inum(c, inum, &inode_u); ++ if (ret) { ++ iget_failed(&inode->v); ++ return ERR_PTR(ret); ++ } ++ ++ bch2_vfs_inode_init(c, inode, &inode_u); ++ ++ inode->ei_journal_seq = bch2_inode_journal_seq(&c->journal, inum); ++ ++ unlock_new_inode(&inode->v); ++ ++ return &inode->v; ++} ++ ++static struct bch_inode_info * ++__bch2_create(struct bch_inode_info *dir, struct dentry *dentry, ++ umode_t mode, dev_t rdev, bool tmpfile) ++{ ++ struct bch_fs *c = dir->v.i_sb->s_fs_info; ++ struct user_namespace *ns = dir->v.i_sb->s_user_ns; ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u; ++ struct bch_inode_info *inode, *old; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *default_acl = NULL, *acl = NULL; ++ u64 journal_seq = 0; ++ int ret; ++ ++ /* ++ * preallocate acls + vfs inode before btree transaction, so that ++ * nothing can fail after the transaction succeeds: ++ */ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); ++ if (ret) ++ return ERR_PTR(ret); ++#endif ++ inode = to_bch_ei(new_inode(c->vfs_sb)); ++ if (unlikely(!inode)) { ++ inode = ERR_PTR(-ENOMEM); ++ goto err; ++ } ++ ++ bch2_inode_init_early(c, &inode_u); ++ ++ if (!tmpfile) ++ mutex_lock(&dir->ei_update_lock); ++ ++ bch2_trans_init(&trans, c, 8, 1024); ++retry: ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_create_trans(&trans, dir->v.i_ino, &dir_u, &inode_u, ++ !tmpfile ? 
&dentry->d_name : NULL, ++ from_kuid(ns, current_fsuid()), ++ from_kgid(ns, current_fsgid()), ++ mode, rdev, ++ default_acl, acl) ?: ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (unlikely(ret)) ++ goto err_before_quota; ++ ++ ret = bch2_trans_commit(&trans, NULL, &journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++ if (unlikely(ret)) { ++ bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++err_before_quota: ++ if (ret == -EINTR) ++ goto retry; ++ goto err_trans; ++ } ++ ++ if (!tmpfile) { ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(c, dir, journal_seq); ++ mutex_unlock(&dir->ei_update_lock); ++ } ++ ++ bch2_vfs_inode_init(c, inode, &inode_u); ++ journal_seq_copy(c, inode, journal_seq); ++ ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++ set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); ++ ++ /* ++ * we must insert the new inode into the inode cache before calling ++ * bch2_trans_exit() and dropping locks, else we could race with another ++ * thread pulling the inode in and modifying it: ++ */ ++ ++ old = to_bch_ei(insert_inode_locked2(&inode->v)); ++ if (unlikely(old)) { ++ /* ++ * We raced, another process pulled the new inode into cache ++ * before us: ++ */ ++ journal_seq_copy(c, old, journal_seq); ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ ++ inode = old; ++ } else { ++ /* ++ * we really don't want insert_inode_locked2() to be setting ++ * I_NEW... ++ */ ++ unlock_new_inode(&inode->v); ++ } ++ ++ bch2_trans_exit(&trans); ++err: ++ posix_acl_release(default_acl); ++ posix_acl_release(acl); ++ return inode; ++err_trans: ++ if (!tmpfile) ++ mutex_unlock(&dir->ei_update_lock); ++ ++ bch2_trans_exit(&trans); ++ make_bad_inode(&inode->v); ++ iput(&inode->v); ++ inode = ERR_PTR(ret); ++ goto err; ++} ++ ++/* methods */ ++ ++static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, ++ unsigned int flags) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct inode *vinode = NULL; ++ u64 inum; ++ ++ inum = bch2_dirent_lookup(c, dir->v.i_ino, ++ &dir->ei_str_hash, ++ &dentry->d_name); ++ ++ if (inum) ++ vinode = bch2_vfs_inode_get(c, inum); ++ ++ return d_splice_alias(vinode, dentry); ++} ++ ++static int bch2_mknod(struct inode *vdir, struct dentry *dentry, ++ umode_t mode, dev_t rdev) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(to_bch_ei(vdir), dentry, mode, rdev, false); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_create(struct inode *vdir, struct dentry *dentry, ++ umode_t mode, bool excl) ++{ ++ return bch2_mknod(vdir, dentry, mode|S_IFREG, 0); ++} ++ ++static int __bch2_link(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_info *dir, ++ struct dentry *dentry) ++{ ++ struct btree_trans trans; ++ struct bch_inode_unpacked dir_u, inode_u; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ret = bch2_link_trans(&trans, ++ dir->v.i_ino, ++ inode->v.i_ino, &dir_u, &inode_u, ++ &dentry->d_name) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++ } while (ret == -EINTR); ++ ++ if (likely(!ret)) { ++ BUG_ON(inode_u.bi_inum != inode->v.i_ino); ++ ++ journal_seq_copy(c, inode, dir->ei_journal_seq); ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ 
ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(c, inode, &inode_u, ATTR_CTIME); ++ } ++ ++ bch2_trans_exit(&trans); ++ mutex_unlock(&inode->ei_update_lock); ++ return ret; ++} ++ ++static int bch2_link(struct dentry *old_dentry, struct inode *vdir, ++ struct dentry *dentry) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ return ret; ++ ++ ihold(&inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_unlink(struct inode *vdir, struct dentry *dentry) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir); ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct bch_inode_unpacked dir_u, inode_u; ++ struct btree_trans trans; ++ int ret; ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ bch2_trans_init(&trans, c, 4, 1024); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ++ ret = bch2_unlink_trans(&trans, ++ dir->v.i_ino, &dir_u, ++ &inode_u, &dentry->d_name) ?: ++ bch2_trans_commit(&trans, NULL, ++ &dir->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++ } while (ret == -EINTR); ++ ++ if (likely(!ret)) { ++ BUG_ON(inode_u.bi_inum != inode->v.i_ino); ++ ++ journal_seq_copy(c, inode, dir->ei_journal_seq); ++ bch2_inode_update_after_write(c, dir, &dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ bch2_inode_update_after_write(c, inode, &inode_u, ++ ATTR_MTIME); ++ } ++ ++ bch2_trans_exit(&trans); ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); ++ ++ return ret; ++} ++ ++static int bch2_symlink(struct inode *vdir, struct dentry *dentry, ++ const char *symname) ++{ ++ struct bch_fs *c = vdir->i_sb->s_fs_info; ++ struct bch_inode_info *dir = to_bch_ei(vdir), *inode; ++ int ret; ++ ++ inode = __bch2_create(dir, dentry, S_IFLNK|S_IRWXUGO, 0, true); ++ if (unlikely(IS_ERR(inode))) ++ return PTR_ERR(inode); ++ ++ inode_lock(&inode->v); ++ ret = page_symlink(&inode->v, symname, strlen(symname) + 1); ++ inode_unlock(&inode->v); ++ ++ if (unlikely(ret)) ++ goto err; ++ ++ ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); ++ if (unlikely(ret)) ++ goto err; ++ ++ journal_seq_copy(c, dir, inode->ei_journal_seq); ++ ++ ret = __bch2_link(c, inode, dir, dentry); ++ if (unlikely(ret)) ++ goto err; ++ ++ d_instantiate(dentry, &inode->v); ++ return 0; ++err: ++ iput(&inode->v); ++ return ret; ++} ++ ++static int bch2_mkdir(struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ return bch2_mknod(vdir, dentry, mode|S_IFDIR, 0); ++} ++ ++static int bch2_rename2(struct inode *src_vdir, struct dentry *src_dentry, ++ struct inode *dst_vdir, struct dentry *dst_dentry, ++ unsigned flags) ++{ ++ struct bch_fs *c = src_vdir->i_sb->s_fs_info; ++ struct bch_inode_info *src_dir = to_bch_ei(src_vdir); ++ struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); ++ struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); ++ struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); ++ struct bch_inode_unpacked dst_dir_u, src_dir_u; ++ struct bch_inode_unpacked src_inode_u, dst_inode_u; ++ struct btree_trans trans; ++ enum bch_rename_mode mode = flags & RENAME_EXCHANGE ++ ? BCH_RENAME_EXCHANGE ++ : dst_dentry->d_inode ++ ? 
BCH_RENAME_OVERWRITE : BCH_RENAME; ++ u64 journal_seq = 0; ++ int ret; ++ ++ if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) ++ return -EINVAL; ++ ++ if (mode == BCH_RENAME_OVERWRITE) { ++ ret = filemap_write_and_wait_range(src_inode->v.i_mapping, ++ 0, LLONG_MAX); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_trans_init(&trans, c, 8, 2048); ++ ++ bch2_lock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, src_inode, ++ dst_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++ if (mode == BCH_RENAME_EXCHANGE && ++ inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { ++ ret = bch2_fs_quota_transfer(c, dst_inode, ++ src_dir->ei_qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ } ++ ++retry: ++ bch2_trans_begin(&trans); ++ ret = bch2_rename_trans(&trans, ++ src_dir->v.i_ino, &src_dir_u, ++ dst_dir->v.i_ino, &dst_dir_u, ++ &src_inode_u, ++ &dst_inode_u, ++ &src_dentry->d_name, ++ &dst_dentry->d_name, ++ mode) ?: ++ bch2_trans_commit(&trans, NULL, ++ &journal_seq, ++ BTREE_INSERT_NOUNLOCK); ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err; ++ ++ BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); ++ BUG_ON(dst_inode && ++ dst_inode->v.i_ino != dst_inode_u.bi_inum); ++ ++ bch2_inode_update_after_write(c, src_dir, &src_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(c, src_dir, journal_seq); ++ ++ if (src_dir != dst_dir) { ++ bch2_inode_update_after_write(c, dst_dir, &dst_dir_u, ++ ATTR_MTIME|ATTR_CTIME); ++ journal_seq_copy(c, dst_dir, journal_seq); ++ } ++ ++ bch2_inode_update_after_write(c, src_inode, &src_inode_u, ++ ATTR_CTIME); ++ journal_seq_copy(c, src_inode, journal_seq); ++ ++ if (dst_inode) { ++ bch2_inode_update_after_write(c, dst_inode, &dst_inode_u, ++ ATTR_CTIME); ++ journal_seq_copy(c, dst_inode, journal_seq); ++ } ++err: ++ bch2_trans_exit(&trans); ++ ++ bch2_fs_quota_transfer(c, src_inode, ++ bch_qid(&src_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ if (dst_inode) ++ bch2_fs_quota_transfer(c, dst_inode, ++ bch_qid(&dst_inode->ei_inode), ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_NOCHECK); ++ ++ bch2_unlock_inodes(INODE_UPDATE_LOCK, ++ src_dir, ++ dst_dir, ++ src_inode, ++ dst_inode); ++ ++ return ret; ++} ++ ++void bch2_setattr_copy(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ unsigned int ia_valid = attr->ia_valid; ++ ++ if (ia_valid & ATTR_UID) ++ bi->bi_uid = from_kuid(c->vfs_sb->s_user_ns, attr->ia_uid); ++ if (ia_valid & ATTR_GID) ++ bi->bi_gid = from_kgid(c->vfs_sb->s_user_ns, attr->ia_gid); ++ ++ if (ia_valid & ATTR_ATIME) ++ bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); ++ if (ia_valid & ATTR_MTIME) ++ bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); ++ if (ia_valid & ATTR_CTIME) ++ bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); ++ ++ if (ia_valid & ATTR_MODE) { ++ umode_t mode = attr->ia_mode; ++ kgid_t gid = ia_valid & ATTR_GID ++ ? 
attr->ia_gid ++ : inode->v.i_gid; ++ ++ if (!in_group_p(gid) && ++ !capable_wrt_inode_uidgid(&inode->v, CAP_FSETID)) ++ mode &= ~S_ISGID; ++ bi->bi_mode = mode; ++ } ++} ++ ++static int bch2_setattr_nonsize(struct bch_inode_info *inode, ++ struct iattr *attr) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_qid qid; ++ struct btree_trans trans; ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ struct posix_acl *acl = NULL; ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ++ qid = inode->ei_qid; ++ ++ if (attr->ia_valid & ATTR_UID) ++ qid.q[QTYP_USR] = from_kuid(&init_user_ns, attr->ia_uid); ++ ++ if (attr->ia_valid & ATTR_GID) ++ qid.q[QTYP_GRP] = from_kgid(&init_user_ns, attr->ia_gid); ++ ++ ret = bch2_fs_quota_transfer(c, inode, qid, ~0, ++ KEY_TYPE_QUOTA_PREALLOC); ++ if (ret) ++ goto err; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ kfree(acl); ++ acl = NULL; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, inode->v.i_ino, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(inode_iter); ++ if (ret) ++ goto btree_err; ++ ++ bch2_setattr_copy(inode, &inode_u, attr); ++ ++ if (attr->ia_valid & ATTR_MODE) { ++ ret = bch2_acl_chmod(&trans, inode, inode_u.bi_mode, &acl); ++ if (ret) ++ goto btree_err; ++ } ++ ++ ret = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, ++ &inode->ei_journal_seq, ++ BTREE_INSERT_NOUNLOCK| ++ BTREE_INSERT_NOFAIL); ++btree_err: ++ if (ret == -EINTR) ++ goto retry; ++ if (unlikely(ret)) ++ goto err_trans; ++ ++ bch2_inode_update_after_write(c, inode, &inode_u, attr->ia_valid); ++ ++ if (acl) ++ set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); ++err_trans: ++ bch2_trans_exit(&trans); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static int bch2_getattr(const struct path *path, struct kstat *stat, ++ u32 request_mask, unsigned query_flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ stat->dev = inode->v.i_sb->s_dev; ++ stat->ino = inode->v.i_ino; ++ stat->mode = inode->v.i_mode; ++ stat->nlink = inode->v.i_nlink; ++ stat->uid = inode->v.i_uid; ++ stat->gid = inode->v.i_gid; ++ stat->rdev = inode->v.i_rdev; ++ stat->size = i_size_read(&inode->v); ++ stat->atime = inode->v.i_atime; ++ stat->mtime = inode->v.i_mtime; ++ stat->ctime = inode->v.i_ctime; ++ stat->blksize = block_bytes(c); ++ stat->blocks = inode->v.i_blocks; ++ ++ if (request_mask & STATX_BTIME) { ++ stat->result_mask |= STATX_BTIME; ++ stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); ++ } ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) ++ stat->attributes |= STATX_ATTR_IMMUTABLE; ++ stat->attributes_mask |= STATX_ATTR_IMMUTABLE; ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) ++ stat->attributes |= STATX_ATTR_APPEND; ++ stat->attributes_mask |= STATX_ATTR_APPEND; ++ ++ if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) ++ stat->attributes |= STATX_ATTR_NODUMP; ++ stat->attributes_mask |= STATX_ATTR_NODUMP; ++ ++ return 0; ++} ++ ++static int bch2_setattr(struct dentry *dentry, struct iattr *iattr) ++{ ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ int ret; ++ ++ lockdep_assert_held(&inode->v.i_rwsem); ++ ++ ret = setattr_prepare(dentry, iattr); ++ if (ret) ++ return ret; ++ ++ return iattr->ia_valid & ATTR_SIZE ++ ? 
bch2_truncate(inode, iattr) ++ : bch2_setattr_nonsize(inode, iattr); ++} ++ ++static int bch2_tmpfile(struct inode *vdir, struct dentry *dentry, umode_t mode) ++{ ++ struct bch_inode_info *inode = ++ __bch2_create(to_bch_ei(vdir), dentry, mode, 0, true); ++ ++ if (IS_ERR(inode)) ++ return PTR_ERR(inode); ++ ++ d_mark_tmpfile(dentry, &inode->v); ++ d_instantiate(dentry, &inode->v); ++ return 0; ++} ++ ++static int bch2_fill_extent(struct bch_fs *c, ++ struct fiemap_extent_info *info, ++ struct bkey_s_c k, unsigned flags) ++{ ++ if (bkey_extent_is_data(k.k)) { ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int ret; ++ ++ if (k.k->type == KEY_TYPE_reflink_v) ++ flags |= FIEMAP_EXTENT_SHARED; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ int flags2 = 0; ++ u64 offset = p.ptr.offset; ++ ++ if (p.crc.compression_type) ++ flags2 |= FIEMAP_EXTENT_ENCODED; ++ else ++ offset += p.crc.offset; ++ ++ if ((offset & (c->opts.block_size - 1)) || ++ (k.k->size & (c->opts.block_size - 1))) ++ flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; ++ ++ ret = fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ offset << 9, ++ k.k->size << 9, flags|flags2); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++ } else if (k.k->type == KEY_TYPE_reservation) { ++ return fiemap_fill_next_extent(info, ++ bkey_start_offset(k.k) << 9, ++ 0, k.k->size << 9, ++ flags| ++ FIEMAP_EXTENT_DELALLOC| ++ FIEMAP_EXTENT_UNWRITTEN); ++ } else { ++ BUG(); ++ } ++} ++ ++static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, ++ u64 start, u64 len) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *ei = to_bch_ei(vinode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_on_stack cur, prev; ++ struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); ++ unsigned offset_into_extent, sectors; ++ bool have_extent = false; ++ int ret = 0; ++ ++ ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); ++ if (ret) ++ return ret; ++ ++ if (start + len < start) ++ return -EINVAL; ++ ++ bkey_on_stack_init(&cur); ++ bkey_on_stack_init(&prev); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(ei->v.i_ino, start >> 9), 0); ++retry: ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k)) && ++ bkey_cmp(iter->pos, end) < 0) { ++ if (!bkey_extent_is_data(k.k) && ++ k.k->type != KEY_TYPE_reservation) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ bkey_on_stack_realloc(&cur, c, k.k->u64s); ++ bkey_on_stack_realloc(&prev, c, k.k->u64s); ++ bkey_reassemble(cur.k, k); ++ k = bkey_i_to_s_c(cur.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &cur); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ if (offset_into_extent) ++ bch2_cut_front(POS(k.k->p.inode, ++ bkey_start_offset(k.k) + ++ offset_into_extent), ++ cur.k); ++ bch2_key_resize(&cur.k->k, sectors); ++ cur.k->k.p = iter->pos; ++ cur.k->k.p.offset += cur.k->k.size; ++ ++ if (have_extent) { ++ ret = bch2_fill_extent(c, info, ++ bkey_i_to_s_c(prev.k), 0); ++ if (ret) ++ break; ++ } ++ ++ bkey_copy(prev.k, cur.k); ++ have_extent = true; ++ ++ if (k.k->type == KEY_TYPE_reflink_v) ++ bch2_btree_iter_set_pos(iter, k.k->p); ++ else ++ bch2_btree_iter_next(iter); ++ } ++ 
++ if (ret == -EINTR) ++ goto retry; ++ ++ if (!ret && have_extent) ++ ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), ++ FIEMAP_EXTENT_LAST); ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&cur, c); ++ bkey_on_stack_exit(&prev, c); ++ return ret < 0 ? ret : 0; ++} ++ ++static const struct vm_operations_struct bch_vm_ops = { ++ .fault = bch2_page_fault, ++ .map_pages = filemap_map_pages, ++ .page_mkwrite = bch2_page_mkwrite, ++}; ++ ++static int bch2_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ file_accessed(file); ++ ++ vma->vm_ops = &bch_vm_ops; ++ return 0; ++} ++ ++/* Directories: */ ++ ++static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) ++{ ++ return generic_file_llseek_size(file, offset, whence, ++ S64_MAX, S64_MAX); ++} ++ ++static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) ++{ ++ struct bch_inode_info *inode = file_bch_inode(file); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ if (!dir_emit_dots(file, ctx)) ++ return 0; ++ ++ return bch2_readdir(c, inode->v.i_ino, ctx); ++} ++ ++static const struct file_operations bch_file_operations = { ++ .llseek = bch2_llseek, ++ .read_iter = bch2_read_iter, ++ .write_iter = bch2_write_iter, ++ .mmap = bch2_mmap, ++ .open = generic_file_open, ++ .fsync = bch2_fsync, ++ .splice_read = generic_file_splice_read, ++ /* ++ * Broken, on v5.3: ++ .splice_write = iter_file_splice_write, ++ */ ++ .fallocate = bch2_fallocate_dispatch, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++ .remap_file_range = bch2_remap_file_range, ++}; ++ ++static const struct inode_operations bch_file_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .fiemap = bch2_fiemap, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_dir_inode_operations = { ++ .lookup = bch2_lookup, ++ .create = bch2_create, ++ .link = bch2_link, ++ .unlink = bch2_unlink, ++ .symlink = bch2_symlink, ++ .mkdir = bch2_mkdir, ++ .rmdir = bch2_unlink, ++ .mknod = bch2_mknod, ++ .rename = bch2_rename2, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .tmpfile = bch2_tmpfile, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct file_operations bch_dir_file_operations = { ++ .llseek = bch2_dir_llseek, ++ .read = generic_read_dir, ++ .iterate_shared = bch2_vfs_readdir, ++ .fsync = bch2_fsync, ++ .unlocked_ioctl = bch2_fs_file_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = bch2_compat_fs_ioctl, ++#endif ++}; ++ ++static const struct inode_operations bch_symlink_inode_operations = { ++ .get_link = page_get_link, ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct inode_operations bch_special_inode_operations = { ++ .getattr = bch2_getattr, ++ .setattr = bch2_setattr, ++ .listxattr = bch2_xattr_list, ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ .get_acl = bch2_get_acl, ++ .set_acl = bch2_set_acl, ++#endif ++}; ++ ++static const struct address_space_operations bch_address_space_operations = { ++ .writepage = bch2_writepage, ++ .readpage = bch2_readpage, ++ .writepages = bch2_writepages, ++ .readpages = bch2_readpages, 
++ .set_page_dirty = __set_page_dirty_nobuffers, ++ .write_begin = bch2_write_begin, ++ .write_end = bch2_write_end, ++ .invalidatepage = bch2_invalidatepage, ++ .releasepage = bch2_releasepage, ++ .direct_IO = noop_direct_IO, ++#ifdef CONFIG_MIGRATION ++ .migratepage = bch2_migrate_page, ++#endif ++ .error_remove_page = generic_error_remove_page, ++}; ++ ++static struct inode *bch2_nfs_get_inode(struct super_block *sb, ++ u64 ino, u32 generation) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct inode *vinode; ++ ++ if (ino < BCACHEFS_ROOT_INO) ++ return ERR_PTR(-ESTALE); ++ ++ vinode = bch2_vfs_inode_get(c, ino); ++ if (IS_ERR(vinode)) ++ return ERR_CAST(vinode); ++ if (generation && vinode->i_generation != generation) { ++ /* we didn't find the right inode.. */ ++ iput(vinode); ++ return ERR_PTR(-ESTALE); ++ } ++ return vinode; ++} ++ ++static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ return generic_fh_to_dentry(sb, fid, fh_len, fh_type, ++ bch2_nfs_get_inode); ++} ++ ++static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *fid, ++ int fh_len, int fh_type) ++{ ++ return generic_fh_to_parent(sb, fid, fh_len, fh_type, ++ bch2_nfs_get_inode); ++} ++ ++static const struct export_operations bch_export_ops = { ++ .fh_to_dentry = bch2_fh_to_dentry, ++ .fh_to_parent = bch2_fh_to_parent, ++ //.get_parent = bch2_get_parent, ++}; ++ ++static void bch2_vfs_inode_init(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi) ++{ ++ bch2_inode_update_after_write(c, inode, bi, ~0); ++ ++ inode->v.i_blocks = bi->bi_sectors; ++ inode->v.i_ino = bi->bi_inum; ++ inode->v.i_rdev = bi->bi_dev; ++ inode->v.i_generation = bi->bi_generation; ++ inode->v.i_size = bi->bi_size; ++ ++ inode->ei_journal_seq = 0; ++ inode->ei_quota_reserved = 0; ++ inode->ei_str_hash = bch2_hash_info_init(c, bi); ++ inode->ei_qid = bch_qid(bi); ++ ++ inode->v.i_mapping->a_ops = &bch_address_space_operations; ++ ++ switch (inode->v.i_mode & S_IFMT) { ++ case S_IFREG: ++ inode->v.i_op = &bch_file_inode_operations; ++ inode->v.i_fop = &bch_file_operations; ++ break; ++ case S_IFDIR: ++ inode->v.i_op = &bch_dir_inode_operations; ++ inode->v.i_fop = &bch_dir_file_operations; ++ break; ++ case S_IFLNK: ++ inode_nohighmem(&inode->v); ++ inode->v.i_op = &bch_symlink_inode_operations; ++ break; ++ default: ++ init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); ++ inode->v.i_op = &bch_special_inode_operations; ++ break; ++ } ++} ++ ++static struct inode *bch2_alloc_inode(struct super_block *sb) ++{ ++ struct bch_inode_info *inode; ++ ++ inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); ++ if (!inode) ++ return NULL; ++ ++ inode_init_once(&inode->v); ++ mutex_init(&inode->ei_update_lock); ++ pagecache_lock_init(&inode->ei_pagecache_lock); ++ mutex_init(&inode->ei_quota_lock); ++ inode->ei_journal_seq = 0; ++ ++ return &inode->v; ++} ++ ++static void bch2_i_callback(struct rcu_head *head) ++{ ++ struct inode *vinode = container_of(head, struct inode, i_rcu); ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ kmem_cache_free(bch2_inode_cache, inode); ++} ++ ++static void bch2_destroy_inode(struct inode *vinode) ++{ ++ call_rcu(&vinode->i_rcu, bch2_i_callback); ++} ++ ++static int inode_update_times_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); ++ bi->bi_mtime 
= timespec_to_bch2_time(c, inode->v.i_mtime); ++ bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); ++ ++ return 0; ++} ++ ++static int bch2_vfs_write_inode(struct inode *vinode, ++ struct writeback_control *wbc) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ int ret; ++ ++ mutex_lock(&inode->ei_update_lock); ++ ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, ++ ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); ++ mutex_unlock(&inode->ei_update_lock); ++ ++ return ret; ++} ++ ++static void bch2_evict_inode(struct inode *vinode) ++{ ++ struct bch_fs *c = vinode->i_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ ++ truncate_inode_pages_final(&inode->v.i_data); ++ ++ clear_inode(&inode->v); ++ ++ BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); ++ ++ if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { ++ bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), ++ KEY_TYPE_QUOTA_WARN); ++ bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, ++ KEY_TYPE_QUOTA_WARN); ++ bch2_inode_rm(c, inode->v.i_ino); ++ } ++} ++ ++static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) ++{ ++ struct super_block *sb = dentry->d_sb; ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); ++ unsigned shift = sb->s_blocksize_bits - 9; ++ u64 fsid; ++ ++ buf->f_type = BCACHEFS_STATFS_MAGIC; ++ buf->f_bsize = sb->s_blocksize; ++ buf->f_blocks = usage.capacity >> shift; ++ buf->f_bfree = (usage.capacity - usage.used) >> shift; ++ buf->f_bavail = buf->f_bfree; ++ buf->f_files = 0; ++ buf->f_ffree = 0; ++ ++ fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ ++ le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); ++ buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; ++ buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; ++ buf->f_namelen = BCH_NAME_MAX; ++ ++ return 0; ++} ++ ++static int bch2_sync_fs(struct super_block *sb, int wait) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (c->opts.journal_flush_disabled) ++ return 0; ++ ++ if (!wait) { ++ bch2_journal_flush_async(&c->journal, NULL); ++ return 0; ++ } ++ ++ return bch2_journal_flush(&c->journal); ++} ++ ++static struct bch_fs *bch2_path_to_fs(const char *dev) ++{ ++ struct bch_fs *c; ++ struct block_device *bdev = lookup_bdev(dev); ++ ++ if (IS_ERR(bdev)) ++ return ERR_CAST(bdev); ++ ++ c = bch2_bdev_to_fs(bdev); ++ bdput(bdev); ++ if (c) ++ closure_put(&c->cl); ++ return c ?: ERR_PTR(-ENOENT); ++} ++ ++static char **split_devs(const char *_dev_name, unsigned *nr) ++{ ++ char *dev_name = NULL, **devs = NULL, *s; ++ size_t i, nr_devs = 0; ++ ++ dev_name = kstrdup(_dev_name, GFP_KERNEL); ++ if (!dev_name) ++ return NULL; ++ ++ for (s = dev_name; s; s = strchr(s + 1, ':')) ++ nr_devs++; ++ ++ devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); ++ if (!devs) { ++ kfree(dev_name); ++ return NULL; ++ } ++ ++ for (i = 0, s = dev_name; ++ s; ++ (s = strchr(s, ':')) && (*s++ = '\0')) ++ devs[i++] = s; ++ ++ *nr = nr_devs; ++ return devs; ++} ++ ++static int bch2_remount(struct super_block *sb, int *flags, char *data) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_opts opts = bch2_opts_empty(); ++ int ret; ++ ++ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(&opts, data); ++ if (ret) ++ return ret; ++ ++ if (opts.read_only != c->opts.read_only) { ++ down_write(&c->state_lock); ++ ++ if (opts.read_only) { ++ bch2_fs_read_only(c); ++ ++ sb->s_flags |= 
SB_RDONLY; ++ } else { ++ ret = bch2_fs_read_write(c); ++ if (ret) { ++ bch_err(c, "error going rw: %i", ret); ++ up_write(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ sb->s_flags &= ~SB_RDONLY; ++ } ++ ++ c->opts.read_only = opts.read_only; ++ ++ up_write(&c->state_lock); ++ } ++ ++ if (opts.errors >= 0) ++ c->opts.errors = opts.errors; ++ ++ return ret; ++} ++ ++static int bch2_show_devname(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ struct bch_dev *ca; ++ unsigned i; ++ bool first = true; ++ ++ for_each_online_member(ca, c, i) { ++ if (!first) ++ seq_putc(seq, ':'); ++ first = false; ++ seq_puts(seq, "/dev/"); ++ seq_puts(seq, ca->name); ++ } ++ ++ return 0; ++} ++ ++static int bch2_show_options(struct seq_file *seq, struct dentry *root) ++{ ++ struct bch_fs *c = root->d_sb->s_fs_info; ++ enum bch_opt_id i; ++ char buf[512]; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->mode & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ bch2_opt_to_text(&PBUF(buf), c, opt, v, ++ OPT_SHOW_MOUNT_STYLE); ++ seq_putc(seq, ','); ++ seq_puts(seq, buf); ++ } ++ ++ return 0; ++} ++ ++static void bch2_put_super(struct super_block *sb) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ __bch2_fs_stop(c); ++} ++ ++static const struct super_operations bch_super_operations = { ++ .alloc_inode = bch2_alloc_inode, ++ .destroy_inode = bch2_destroy_inode, ++ .write_inode = bch2_vfs_write_inode, ++ .evict_inode = bch2_evict_inode, ++ .sync_fs = bch2_sync_fs, ++ .statfs = bch2_statfs, ++ .show_devname = bch2_show_devname, ++ .show_options = bch2_show_options, ++ .remount_fs = bch2_remount, ++ .put_super = bch2_put_super, ++#if 0 ++ .freeze_fs = bch2_freeze, ++ .unfreeze_fs = bch2_unfreeze, ++#endif ++}; ++ ++static int bch2_set_super(struct super_block *s, void *data) ++{ ++ s->s_fs_info = data; ++ return 0; ++} ++ ++static int bch2_noset_super(struct super_block *s, void *data) ++{ ++ return -EBUSY; ++} ++ ++static int bch2_test_super(struct super_block *s, void *data) ++{ ++ struct bch_fs *c = s->s_fs_info; ++ struct bch_fs **devs = data; ++ unsigned i; ++ ++ if (!c) ++ return false; ++ ++ for (i = 0; devs[i]; i++) ++ if (c != devs[i]) ++ return false; ++ return true; ++} ++ ++static struct dentry *bch2_mount(struct file_system_type *fs_type, ++ int flags, const char *dev_name, void *data) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ struct super_block *sb; ++ struct inode *vinode; ++ struct bch_opts opts = bch2_opts_empty(); ++ char **devs; ++ struct bch_fs **devs_to_fs = NULL; ++ unsigned i, nr_devs; ++ int ret; ++ ++ opt_set(opts, read_only, (flags & SB_RDONLY) != 0); ++ ++ ret = bch2_parse_mount_opts(&opts, data); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ devs = split_devs(dev_name, &nr_devs); ++ if (!devs) ++ return ERR_PTR(-ENOMEM); ++ ++ devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); ++ if (!devs_to_fs) { ++ sb = ERR_PTR(-ENOMEM); ++ goto got_sb; ++ } ++ ++ for (i = 0; i < nr_devs; i++) ++ devs_to_fs[i] = bch2_path_to_fs(devs[i]); ++ ++ sb = sget(fs_type, bch2_test_super, bch2_noset_super, ++ flags|SB_NOSEC, devs_to_fs); ++ if (!IS_ERR(sb)) ++ goto got_sb; ++ ++ c = bch2_fs_open(devs, nr_devs, opts); ++ ++ if (!IS_ERR(c)) ++ sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); ++ else ++ sb = ERR_CAST(c); ++got_sb: ++ kfree(devs_to_fs); ++ kfree(devs[0]); ++ 
kfree(devs); ++ ++ if (IS_ERR(sb)) ++ return ERR_CAST(sb); ++ ++ c = sb->s_fs_info; ++ ++ if (sb->s_root) { ++ if ((flags ^ sb->s_flags) & SB_RDONLY) { ++ ret = -EBUSY; ++ goto err_put_super; ++ } ++ goto out; ++ } ++ ++ sb->s_blocksize = block_bytes(c); ++ sb->s_blocksize_bits = ilog2(block_bytes(c)); ++ sb->s_maxbytes = MAX_LFS_FILESIZE; ++ sb->s_op = &bch_super_operations; ++ sb->s_export_op = &bch_export_ops; ++#ifdef CONFIG_BCACHEFS_QUOTA ++ sb->s_qcop = &bch2_quotactl_operations; ++ sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; ++#endif ++ sb->s_xattr = bch2_xattr_handlers; ++ sb->s_magic = BCACHEFS_STATFS_MAGIC; ++ sb->s_time_gran = c->sb.time_precision; ++ c->vfs_sb = sb; ++ strlcpy(sb->s_id, c->name, sizeof(sb->s_id)); ++ ++ ret = super_setup_bdi(sb); ++ if (ret) ++ goto err_put_super; ++ ++ sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; ++ ++ for_each_online_member(ca, c, i) { ++ struct block_device *bdev = ca->disk_sb.bdev; ++ ++ /* XXX: create an anonymous device for multi device filesystems */ ++ sb->s_bdev = bdev; ++ sb->s_dev = bdev->bd_dev; ++ percpu_ref_put(&ca->io_ref); ++ break; ++ } ++ ++#ifdef CONFIG_BCACHEFS_POSIX_ACL ++ if (c->opts.acl) ++ sb->s_flags |= SB_POSIXACL; ++#endif ++ ++ vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_INO); ++ if (IS_ERR(vinode)) { ++ bch_err(c, "error mounting: error getting root inode %i", ++ (int) PTR_ERR(vinode)); ++ ret = PTR_ERR(vinode); ++ goto err_put_super; ++ } ++ ++ sb->s_root = d_make_root(vinode); ++ if (!sb->s_root) { ++ bch_err(c, "error mounting: error allocating root dentry"); ++ ret = -ENOMEM; ++ goto err_put_super; ++ } ++ ++ sb->s_flags |= SB_ACTIVE; ++out: ++ return dget(sb->s_root); ++ ++err_put_super: ++ deactivate_locked_super(sb); ++ return ERR_PTR(ret); ++} ++ ++static void bch2_kill_sb(struct super_block *sb) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ generic_shutdown_super(sb); ++ bch2_fs_free(c); ++} ++ ++static struct file_system_type bcache_fs_type = { ++ .owner = THIS_MODULE, ++ .name = "bcachefs", ++ .mount = bch2_mount, ++ .kill_sb = bch2_kill_sb, ++ .fs_flags = FS_REQUIRES_DEV, ++}; ++ ++MODULE_ALIAS_FS("bcachefs"); ++ ++void bch2_vfs_exit(void) ++{ ++ unregister_filesystem(&bcache_fs_type); ++ if (bch2_inode_cache) ++ kmem_cache_destroy(bch2_inode_cache); ++} ++ ++int __init bch2_vfs_init(void) ++{ ++ int ret = -ENOMEM; ++ ++ bch2_inode_cache = KMEM_CACHE(bch_inode_info, 0); ++ if (!bch2_inode_cache) ++ goto err; ++ ++ ret = register_filesystem(&bcache_fs_type); ++ if (ret) ++ goto err; ++ ++ return 0; ++err: ++ bch2_vfs_exit(); ++ return ret; ++} ++ ++#endif /* NO_BCACHEFS_FS */ +diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h +new file mode 100644 +index 000000000000..eda903a45325 +--- /dev/null ++++ b/fs/bcachefs/fs.h +@@ -0,0 +1,174 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FS_H ++#define _BCACHEFS_FS_H ++ ++#include "inode.h" ++#include "opts.h" ++#include "str_hash.h" ++#include "quota_types.h" ++ ++#include ++#include ++ ++/* ++ * Two-state lock - can be taken for add or block - both states are shared, ++ * like read side of rwsem, but conflict with other state: ++ */ ++struct pagecache_lock { ++ atomic_long_t v; ++ wait_queue_head_t wait; ++}; ++ ++static inline void pagecache_lock_init(struct pagecache_lock *lock) ++{ ++ atomic_long_set(&lock->v, 0); ++ init_waitqueue_head(&lock->wait); ++} ++ ++void bch2_pagecache_add_put(struct pagecache_lock *); ++void bch2_pagecache_add_get(struct pagecache_lock *); ++void bch2_pagecache_block_put(struct pagecache_lock 
*); ++void bch2_pagecache_block_get(struct pagecache_lock *); ++ ++struct bch_inode_info { ++ struct inode v; ++ ++ struct mutex ei_update_lock; ++ u64 ei_journal_seq; ++ u64 ei_quota_reserved; ++ unsigned long ei_last_dirtied; ++ ++ struct pagecache_lock ei_pagecache_lock; ++ ++ struct mutex ei_quota_lock; ++ struct bch_qid ei_qid; ++ ++ struct bch_hash_info ei_str_hash; ++ ++ /* copy of inode in btree: */ ++ struct bch_inode_unpacked ei_inode; ++}; ++ ++#define to_bch_ei(_inode) \ ++ container_of_or_null(_inode, struct bch_inode_info, v) ++ ++static inline int ptrcmp(void *l, void *r) ++{ ++ return cmp_int(l, r); ++} ++ ++enum bch_inode_lock_op { ++ INODE_LOCK = (1U << 0), ++ INODE_PAGECACHE_BLOCK = (1U << 1), ++ INODE_UPDATE_LOCK = (1U << 2), ++}; ++ ++#define bch2_lock_inodes(_locks, ...) \ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ down_write_nested(&a[i]->v.i_rwsem, i); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_get(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_lock_nested(&a[i]->ei_update_lock, i);\ ++ } \ ++} while (0) ++ ++#define bch2_unlock_inodes(_locks, ...) \ ++do { \ ++ struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ ++ unsigned i; \ ++ \ ++ bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ ++ \ ++ for (i = 1; i < ARRAY_SIZE(a); i++) \ ++ if (a[i] != a[i - 1]) { \ ++ if ((_locks) & INODE_LOCK) \ ++ up_write(&a[i]->v.i_rwsem); \ ++ if ((_locks) & INODE_PAGECACHE_BLOCK) \ ++ bch2_pagecache_block_put(&a[i]->ei_pagecache_lock);\ ++ if ((_locks) & INODE_UPDATE_LOCK) \ ++ mutex_unlock(&a[i]->ei_update_lock); \ ++ } \ ++} while (0) ++ ++static inline struct bch_inode_info *file_bch_inode(struct file *file) ++{ ++ return to_bch_ei(file_inode(file)); ++} ++ ++static inline bool inode_attr_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode, ++ enum inode_opt_id id) ++{ ++ return !(inode->ei_inode.bi_fields_set & (1 << id)) && ++ bch2_inode_opt_get(&dir->ei_inode, id) != ++ bch2_inode_opt_get(&inode->ei_inode, id); ++} ++ ++static inline bool inode_attrs_changing(struct bch_inode_info *dir, ++ struct bch_inode_info *inode) ++{ ++ unsigned id; ++ ++ for (id = 0; id < Inode_opt_nr; id++) ++ if (inode_attr_changing(dir, inode, id)) ++ return true; ++ ++ return false; ++} ++ ++struct bch_inode_unpacked; ++ ++#ifndef NO_BCACHEFS_FS ++ ++int bch2_fs_quota_transfer(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_qid, ++ unsigned, ++ enum quota_acct_mode); ++ ++static inline int bch2_set_projid(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ u32 projid) ++{ ++ struct bch_qid qid = inode->ei_qid; ++ ++ qid.q[QTYP_PRJ] = projid; ++ ++ return bch2_fs_quota_transfer(c, inode, qid, ++ 1 << QTYP_PRJ, ++ KEY_TYPE_QUOTA_PREALLOC); ++} ++ ++struct inode *bch2_vfs_inode_get(struct bch_fs *, u64); ++ ++/* returns 0 if we want to do the update, or error is passed up */ ++typedef int (*inode_set_fn)(struct bch_inode_info *, ++ struct bch_inode_unpacked *, void *); ++ ++void bch2_inode_update_after_write(struct bch_fs *, ++ struct bch_inode_info *, ++ struct bch_inode_unpacked *, ++ unsigned); ++int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, ++ inode_set_fn, void *, unsigned); ++ ++void bch2_vfs_exit(void); ++int bch2_vfs_init(void); ++ ++#else ++ ++static inline void 
bch2_vfs_exit(void) {} ++static inline int bch2_vfs_init(void) { return 0; } ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++#endif /* _BCACHEFS_FS_H */ +diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c +new file mode 100644 +index 000000000000..5a6df3d1973a +--- /dev/null ++++ b/fs/bcachefs/fsck.c +@@ -0,0 +1,1502 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "dirent.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "inode.h" ++#include "keylist.h" ++#include "super.h" ++#include "xattr.h" ++ ++#include /* struct qstr */ ++#include ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 sectors = 0; ++ int ret; ++ ++ for_each_btree_key(trans, iter, BTREE_ID_EXTENTS, ++ POS(inum, 0), 0, k, ret) { ++ if (k.k->p.inode != inum) ++ break; ++ ++ if (bkey_extent_is_allocation(k.k)) ++ sectors += k.k->size; ++ } ++ ++ bch2_trans_iter_free(trans, iter); ++ ++ return ret ?: sectors; ++} ++ ++static int __remove_dirent(struct btree_trans *trans, ++ struct bkey_s_c_dirent dirent) ++{ ++ struct bch_fs *c = trans->c; ++ struct qstr name; ++ struct bch_inode_unpacked dir_inode; ++ struct bch_hash_info dir_hash_info; ++ u64 dir_inum = dirent.k->p.inode; ++ int ret; ++ char *buf; ++ ++ name.len = bch2_dirent_name_bytes(dirent); ++ buf = bch2_trans_kmalloc(trans, name.len + 1); ++ if (IS_ERR(buf)) ++ return PTR_ERR(buf); ++ ++ memcpy(buf, dirent.v->d_name, name.len); ++ buf[name.len] = '\0'; ++ name.name = buf; ++ ++ ret = bch2_inode_find_by_inum_trans(trans, dir_inum, &dir_inode); ++ if (ret && ret != -EINTR) ++ bch_err(c, "remove_dirent: err %i looking up directory inode", ret); ++ if (ret) ++ return ret; ++ ++ dir_hash_info = bch2_hash_info_init(c, &dir_inode); ++ ++ ret = bch2_hash_delete(trans, bch2_dirent_hash_desc, ++ &dir_hash_info, dir_inum, &name); ++ if (ret && ret != -EINTR) ++ bch_err(c, "remove_dirent: err %i deleting dirent", ret); ++ if (ret) ++ return ret; ++ ++ return 0; ++} ++ ++static int remove_dirent(struct btree_trans *trans, ++ struct bkey_s_c_dirent dirent) ++{ ++ return __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ __remove_dirent(trans, dirent)); ++} ++ ++static int reattach_inode(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ u64 inum) ++{ ++ struct bch_inode_unpacked dir_u, inode_u; ++ char name_buf[20]; ++ struct qstr name; ++ int ret; ++ ++ snprintf(name_buf, sizeof(name_buf), "%llu", inum); ++ name = (struct qstr) QSTR(name_buf); ++ ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_LAZY_RW, ++ bch2_link_trans(&trans, lostfound_inode->bi_inum, ++ inum, &dir_u, &inode_u, &name)); ++ if (ret) ++ bch_err(c, "error %i reattaching inode %llu", ret, inum); ++ ++ return ret; ++} ++ ++struct inode_walker { ++ bool first_this_inode; ++ bool have_inode; ++ u64 cur_inum; ++ struct bch_inode_unpacked inode; ++}; ++ ++static struct inode_walker inode_walker_init(void) ++{ ++ return (struct inode_walker) { ++ .cur_inum = -1, ++ .have_inode = false, ++ }; ++} ++ ++static int walk_inode(struct btree_trans *trans, ++ struct inode_walker *w, u64 inum) ++{ ++ if (inum != w->cur_inum) { ++ int ret = bch2_inode_find_by_inum_trans(trans, inum, ++ &w->inode); ++ ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ w->have_inode = !ret; ++ w->cur_inum = inum; ++ w->first_this_inode = 
true; ++ } else { ++ w->first_this_inode = false; ++ } ++ ++ return 0; ++} ++ ++struct hash_check { ++ struct bch_hash_info info; ++ ++ /* start of current chain of hash collisions: */ ++ struct btree_iter *chain; ++ ++ /* next offset in current chain of hash collisions: */ ++ u64 chain_end; ++}; ++ ++static void hash_check_init(struct hash_check *h) ++{ ++ h->chain = NULL; ++ h->chain_end = 0; ++} ++ ++static void hash_stop_chain(struct btree_trans *trans, ++ struct hash_check *h) ++{ ++ if (h->chain) ++ bch2_trans_iter_free(trans, h->chain); ++ h->chain = NULL; ++} ++ ++static void hash_check_set_inode(struct btree_trans *trans, ++ struct hash_check *h, ++ const struct bch_inode_unpacked *bi) ++{ ++ h->info = bch2_hash_info_init(trans->c, bi); ++ hash_stop_chain(trans, h); ++} ++ ++static int hash_redo_key(const struct bch_hash_desc desc, ++ struct btree_trans *trans, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k, ++ u64 hashed) ++{ ++ struct bkey_i delete; ++ struct bkey_i *tmp; ++ ++ tmp = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ if (IS_ERR(tmp)) ++ return PTR_ERR(tmp); ++ ++ bkey_reassemble(tmp, k); ++ ++ bkey_init(&delete.k); ++ delete.k.p = k_iter->pos; ++ bch2_trans_update(trans, k_iter, &delete, 0); ++ ++ return bch2_hash_set(trans, desc, &h->info, k_iter->pos.inode, ++ tmp, BCH_HASH_SET_MUST_CREATE); ++} ++ ++static int fsck_hash_delete_at(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct bch_hash_info *info, ++ struct btree_iter *iter) ++{ ++ int ret; ++retry: ++ ret = bch2_hash_delete_at(trans, desc, info, iter) ?: ++ bch2_trans_commit(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret == -EINTR) { ++ ret = bch2_btree_iter_traverse(iter); ++ if (!ret) ++ goto retry; ++ } ++ ++ return ret; ++} ++ ++static int hash_check_duplicates(struct btree_trans *trans, ++ const struct bch_hash_desc desc, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *iter; ++ struct bkey_s_c k2; ++ char buf[200]; ++ int ret = 0; ++ ++ if (!bkey_cmp(h->chain->pos, k_iter->pos)) ++ return 0; ++ ++ iter = bch2_trans_copy_iter(trans, h->chain); ++ BUG_ON(IS_ERR(iter)); ++ ++ for_each_btree_key_continue(iter, 0, k2, ret) { ++ if (bkey_cmp(k2.k->p, k.k->p) >= 0) ++ break; ++ ++ if (fsck_err_on(k2.k->type == desc.key_type && ++ !desc.cmp_bkey(k, k2), c, ++ "duplicate hash table keys:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = fsck_hash_delete_at(trans, desc, &h->info, k_iter); ++ if (ret) ++ return ret; ++ ret = 1; ++ break; ++ } ++ } ++fsck_err: ++ bch2_trans_iter_free(trans, iter); ++ return ret; ++} ++ ++static void hash_set_chain_start(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ bool hole = (k.k->type != KEY_TYPE_whiteout && ++ k.k->type != desc.key_type); ++ ++ if (hole || k.k->p.offset > h->chain_end + 1) ++ hash_stop_chain(trans, h); ++ ++ if (!hole) { ++ if (!h->chain) { ++ h->chain = bch2_trans_copy_iter(trans, k_iter); ++ BUG_ON(IS_ERR(h->chain)); ++ } ++ ++ h->chain_end = k.k->p.offset; ++ } ++} ++ ++static bool key_has_correct_hash(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ u64 hash; ++ ++ hash_set_chain_start(trans, desc, h, k_iter, k); ++ ++ if (k.k->type != desc.key_type) ++ return true; ++ ++ hash = 
desc.hash_bkey(&h->info, k); ++ ++ return hash >= h->chain->pos.offset && ++ hash <= k.k->p.offset; ++} ++ ++static int hash_check_key(struct btree_trans *trans, ++ const struct bch_hash_desc desc, struct hash_check *h, ++ struct btree_iter *k_iter, struct bkey_s_c k) ++{ ++ struct bch_fs *c = trans->c; ++ char buf[200]; ++ u64 hashed; ++ int ret = 0; ++ ++ hash_set_chain_start(trans, desc, h, k_iter, k); ++ ++ if (k.k->type != desc.key_type) ++ return 0; ++ ++ hashed = desc.hash_bkey(&h->info, k); ++ ++ if (fsck_err_on(hashed < h->chain->pos.offset || ++ hashed > k.k->p.offset, c, ++ "hash table key at wrong offset: btree %u, %llu, " ++ "hashed to %llu chain starts at %llu\n%s", ++ desc.btree_id, k.k->p.offset, ++ hashed, h->chain->pos.offset, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, k), buf))) { ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ hash_redo_key(desc, trans, h, k_iter, k, hashed)); ++ if (ret) { ++ bch_err(c, "hash_redo_key err %i", ret); ++ return ret; ++ } ++ return 1; ++ } ++ ++ ret = hash_check_duplicates(trans, desc, h, k_iter, k); ++fsck_err: ++ return ret; ++} ++ ++static int check_dirent_hash(struct btree_trans *trans, struct hash_check *h, ++ struct btree_iter *iter, struct bkey_s_c *k) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_i_dirent *d = NULL; ++ int ret = -EINVAL; ++ char buf[200]; ++ unsigned len; ++ u64 hash; ++ ++ if (key_has_correct_hash(trans, bch2_dirent_hash_desc, h, iter, *k)) ++ return 0; ++ ++ len = bch2_dirent_name_bytes(bkey_s_c_to_dirent(*k)); ++ BUG_ON(!len); ++ ++ memcpy(buf, bkey_s_c_to_dirent(*k).v->d_name, len); ++ buf[len] = '\0'; ++ ++ d = kmalloc(bkey_bytes(k->k), GFP_KERNEL); ++ if (!d) { ++ bch_err(c, "memory allocation failure"); ++ return -ENOMEM; ++ } ++ ++ bkey_reassemble(&d->k_i, *k); ++ ++ do { ++ --len; ++ if (!len) ++ goto err_redo; ++ ++ d->k.u64s = BKEY_U64s + dirent_val_u64s(len); ++ ++ BUG_ON(bkey_val_bytes(&d->k) < ++ offsetof(struct bch_dirent, d_name) + len); ++ ++ memset(d->v.d_name + len, 0, ++ bkey_val_bytes(&d->k) - ++ offsetof(struct bch_dirent, d_name) - len); ++ ++ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, ++ bkey_i_to_s_c(&d->k_i)); ++ } while (hash < h->chain->pos.offset || ++ hash > k->k->p.offset); ++ ++ if (fsck_err(c, "dirent with junk at end, was %s (%zu) now %s (%u)", ++ buf, strlen(buf), d->v.d_name, len)) { ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ (bch2_trans_update(trans, iter, &d->k_i, 0), 0)); ++ if (ret) ++ goto err; ++ ++ *k = bch2_btree_iter_peek(iter); ++ ++ BUG_ON(k->k->type != KEY_TYPE_dirent); ++ } ++err: ++fsck_err: ++ kfree(d); ++ return ret; ++err_redo: ++ hash = bch2_dirent_hash_desc.hash_bkey(&h->info, *k); ++ ++ if (fsck_err(c, "cannot fix dirent by removing trailing garbage %s (%zu)\n" ++ "hash table key at wrong offset: btree %u, offset %llu, " ++ "hashed to %llu chain starts at %llu\n%s", ++ buf, strlen(buf), BTREE_ID_DIRENTS, ++ k->k->p.offset, hash, h->chain->pos.offset, ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ *k), buf))) { ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, ++ hash_redo_key(bch2_dirent_hash_desc, trans, ++ h, iter, *k, hash)); ++ if (ret) ++ bch_err(c, "hash_redo_key err %i", ret); ++ else ++ ret = 1; ++ } ++ ++ goto err; ++} ++ ++static int bch2_inode_truncate(struct bch_fs *c, u64 inode_nr, u64 new_size) ++{ ++ return bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ POS(inode_nr, round_up(new_size, block_bytes(c)) >> 
9), ++ POS(inode_nr + 1, 0), NULL); ++} ++ ++static int bch2_fix_overlapping_extent(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_s_c k, struct bpos cut_at) ++{ ++ struct btree_iter *u_iter; ++ struct bkey_i *u; ++ int ret; ++ ++ u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); ++ ret = PTR_ERR_OR_ZERO(u); ++ if (ret) ++ return ret; ++ ++ bkey_reassemble(u, k); ++ bch2_cut_front(cut_at, u); ++ ++ u_iter = bch2_trans_copy_iter(trans, iter); ++ ret = PTR_ERR_OR_ZERO(u_iter); ++ if (ret) ++ return ret; ++ ++ /* ++ * We don't want to go through the ++ * extent_handle_overwrites path: ++ */ ++ __bch2_btree_iter_set_pos(u_iter, u->k.p, false); ++ ++ /* ++ * XXX: this is going to leave disk space ++ * accounting slightly wrong ++ */ ++ ret = bch2_trans_update(trans, u_iter, u, 0); ++ bch2_trans_iter_put(trans, u_iter); ++ return ret; ++} ++ ++/* ++ * Walk extents: verify that extents have a corresponding S_ISREG inode, and ++ * that i_size an i_sectors are consistent ++ */ ++noinline_for_stack ++static int check_extents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_on_stack prev; ++ u64 i_sectors; ++ int ret = 0; ++ ++ bkey_on_stack_init(&prev); ++ prev.k->k = KEY(0, 0, 0); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ bch_verbose(c, "checking extents"); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(BCACHEFS_ROOT_INO, 0), ++ BTREE_ITER_INTENT); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (bkey_cmp(prev.k->k.p, bkey_start_pos(k.k)) > 0) { ++ char buf1[200]; ++ char buf2[200]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(prev.k)); ++ bch2_bkey_val_to_text(&PBUF(buf2), c, k); ++ ++ if (fsck_err(c, "overlapping extents:\n%s\n%s", buf1, buf2)) { ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_fix_overlapping_extent(&trans, ++ iter, k, prev.k->k.p)); ++ if (ret) ++ goto err; ++ } ++ } ++ bkey_on_stack_reassemble(&prev, c, k); ++ ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "extent type %u for missing inode %llu", ++ k.k->type, k.k->p.inode) || ++ fsck_err_on(w.have_inode && ++ !S_ISREG(w.inode.bi_mode) && !S_ISLNK(w.inode.bi_mode), c, ++ "extent type %u for non regular file, inode %llu mode %o", ++ k.k->type, k.k->p.inode, w.inode.bi_mode)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = bch2_inode_truncate(c, k.k->p.inode, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(w.first_this_inode && ++ w.have_inode && ++ !(w.inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY) && ++ w.inode.bi_sectors != ++ (i_sectors = bch2_count_inode_sectors(&trans, w.cur_inum)), ++ c, "inode %llu has incorrect i_sectors: got %llu, should be %llu", ++ w.inode.bi_inum, ++ w.inode.bi_sectors, i_sectors)) { ++ struct bkey_inode_buf p; ++ ++ w.inode.bi_sectors = i_sectors; ++ ++ bch2_trans_unlock(&trans); ++ ++ bch2_inode_pack(&p, &w.inode); ++ ++ ret = bch2_btree_insert(c, BTREE_ID_INODES, ++ &p.inode.k_i, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++ if (ret) { ++ bch_err(c, "error in fsck: error %i updating inode", ret); ++ goto err; ++ } ++ ++ /* revalidate iterator: */ ++ k = bch2_btree_iter_peek(iter); ++ } ++ ++ if (fsck_err_on(w.have_inode && ++ !(w.inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ k.k->type != KEY_TYPE_reservation && ++ k.k->p.offset > 
round_up(w.inode.bi_size, block_bytes(c)) >> 9, c, ++ "extent type %u offset %llu past end of inode %llu, i_size %llu", ++ k.k->type, k.k->p.offset, k.k->p.inode, w.inode.bi_size)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = bch2_inode_truncate(c, k.k->p.inode, ++ w.inode.bi_size); ++ if (ret) ++ goto err; ++ continue; ++ } ++ } ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ bkey_on_stack_exit(&prev, c); ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* ++ * Walk dirents: verify that they all have a corresponding S_ISDIR inode, ++ * validate d_type ++ */ ++noinline_for_stack ++static int check_dirents(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct hash_check h; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ unsigned name_len; ++ char buf[200]; ++ int ret = 0; ++ ++ bch_verbose(c, "checking dirents"); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ hash_check_init(&h); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_DIRENTS, ++ POS(BCACHEFS_ROOT_INO, 0), 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ struct bkey_s_c_dirent d; ++ struct bch_inode_unpacked target; ++ bool have_target; ++ u64 d_inum; ++ ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "dirent in nonexisting directory:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf)) || ++ fsck_err_on(!S_ISDIR(w.inode.bi_mode), c, ++ "dirent in non directory inode type %u:\n%s", ++ mode_to_type(w.inode.bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (w.first_this_inode && w.have_inode) ++ hash_check_set_inode(&trans, &h, &w.inode); ++ ++ ret = check_dirent_hash(&trans, &h, iter, &k); ++ if (ret > 0) { ++ ret = 0; ++ continue; ++ } ++ if (ret) ++ goto fsck_err; ++ ++ if (ret) ++ goto fsck_err; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ d = bkey_s_c_to_dirent(k); ++ d_inum = le64_to_cpu(d.v->d_inum); ++ ++ name_len = bch2_dirent_name_bytes(d); ++ ++ if (fsck_err_on(!name_len, c, "empty dirent") || ++ fsck_err_on(name_len == 1 && ++ !memcmp(d.v->d_name, ".", 1), c, ++ ". dirent") || ++ fsck_err_on(name_len == 2 && ++ !memcmp(d.v->d_name, "..", 2), c, ++ ".. dirent") || ++ fsck_err_on(name_len == 2 && ++ !memcmp(d.v->d_name, "..", 2), c, ++ ".. 
dirent") || ++ fsck_err_on(memchr(d.v->d_name, '/', name_len), c, ++ "dirent name has invalid chars")) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(d_inum == d.k->p.inode, c, ++ "dirent points to own directory:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ ret = bch2_inode_find_by_inum_trans(&trans, d_inum, &target); ++ if (ret && ret != -ENOENT) ++ break; ++ ++ have_target = !ret; ++ ret = 0; ++ ++ if (fsck_err_on(!have_target, c, ++ "dirent points to missing inode:\n%s", ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ ret = remove_dirent(&trans, d); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (fsck_err_on(have_target && ++ d.v->d_type != ++ mode_to_type(target.bi_mode), c, ++ "incorrect d_type: should be %u:\n%s", ++ mode_to_type(target.bi_mode), ++ (bch2_bkey_val_to_text(&PBUF(buf), c, ++ k), buf))) { ++ struct bkey_i_dirent *n; ++ ++ n = kmalloc(bkey_bytes(d.k), GFP_KERNEL); ++ if (!n) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ bkey_reassemble(&n->k_i, d.s_c); ++ n->v.d_type = mode_to_type(target.bi_mode); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ (bch2_trans_update(&trans, iter, &n->k_i, 0), 0)); ++ kfree(n); ++ if (ret) ++ goto err; ++ ++ } ++ } ++ ++ hash_stop_chain(&trans, &h); ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* ++ * Walk xattrs: verify that they all have a corresponding inode ++ */ ++noinline_for_stack ++static int check_xattrs(struct bch_fs *c) ++{ ++ struct inode_walker w = inode_walker_init(); ++ struct hash_check h; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch_verbose(c, "checking xattrs"); ++ ++ hash_check_init(&h); ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ POS(BCACHEFS_ROOT_INO, 0), 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ ret = walk_inode(&trans, &w, k.k->p.inode); ++ if (ret) ++ break; ++ ++ if (fsck_err_on(!w.have_inode, c, ++ "xattr for missing inode %llu", ++ k.k->p.inode)) { ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ if (w.first_this_inode && w.have_inode) ++ hash_check_set_inode(&trans, &h, &w.inode); ++ ++ ret = hash_check_key(&trans, bch2_xattr_hash_desc, ++ &h, iter, k); ++ if (ret) ++ goto fsck_err; ++ } ++err: ++fsck_err: ++ if (ret == -EINTR) ++ goto retry; ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* Get root directory, create if it doesn't exist: */ ++static int check_root(struct bch_fs *c, struct bch_inode_unpacked *root_inode) ++{ ++ struct bkey_inode_buf packed; ++ int ret; ++ ++ bch_verbose(c, "checking root directory"); ++ ++ ret = bch2_inode_find_by_inum(c, BCACHEFS_ROOT_INO, root_inode); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (fsck_err_on(ret, c, "root directory missing")) ++ goto create_root; ++ ++ if (fsck_err_on(!S_ISDIR(root_inode->bi_mode), c, ++ "root inode not a directory")) ++ goto create_root; ++ ++ return 0; ++fsck_err: ++ return ret; ++create_root: ++ bch2_inode_init(c, root_inode, 0, 0, S_IFDIR|0755, ++ 0, NULL); ++ root_inode->bi_inum = BCACHEFS_ROOT_INO; ++ ++ bch2_inode_pack(&packed, root_inode); ++ ++ return bch2_btree_insert(c, BTREE_ID_INODES, &packed.inode.k_i, ++ NULL, NULL, ++ 
BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW); ++} ++ ++/* Get lost+found, create if it doesn't exist: */ ++static int check_lostfound(struct bch_fs *c, ++ struct bch_inode_unpacked *root_inode, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ struct qstr lostfound = QSTR("lost+found"); ++ struct bch_hash_info root_hash_info = ++ bch2_hash_info_init(c, root_inode); ++ u64 inum; ++ int ret; ++ ++ bch_verbose(c, "checking lost+found"); ++ ++ inum = bch2_dirent_lookup(c, BCACHEFS_ROOT_INO, &root_hash_info, ++ &lostfound); ++ if (!inum) { ++ bch_notice(c, "creating lost+found"); ++ goto create_lostfound; ++ } ++ ++ ret = bch2_inode_find_by_inum(c, inum, lostfound_inode); ++ if (ret && ret != -ENOENT) ++ return ret; ++ ++ if (fsck_err_on(ret, c, "lost+found missing")) ++ goto create_lostfound; ++ ++ if (fsck_err_on(!S_ISDIR(lostfound_inode->bi_mode), c, ++ "lost+found inode not a directory")) ++ goto create_lostfound; ++ ++ return 0; ++fsck_err: ++ return ret; ++create_lostfound: ++ bch2_inode_init_early(c, lostfound_inode); ++ ++ ret = bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ bch2_create_trans(&trans, ++ BCACHEFS_ROOT_INO, root_inode, ++ lostfound_inode, &lostfound, ++ 0, 0, S_IFDIR|0700, 0, NULL, NULL)); ++ if (ret) ++ bch_err(c, "error creating lost+found: %i", ret); ++ ++ return ret; ++} ++ ++struct inode_bitmap { ++ unsigned long *bits; ++ size_t size; ++}; ++ ++static inline bool inode_bitmap_test(struct inode_bitmap *b, size_t nr) ++{ ++ return nr < b->size ? test_bit(nr, b->bits) : false; ++} ++ ++static inline int inode_bitmap_set(struct inode_bitmap *b, size_t nr) ++{ ++ if (nr >= b->size) { ++ size_t new_size = max_t(size_t, max_t(size_t, ++ PAGE_SIZE * 8, ++ b->size * 2), ++ nr + 1); ++ void *n; ++ ++ new_size = roundup_pow_of_two(new_size); ++ n = krealloc(b->bits, new_size / 8, GFP_KERNEL|__GFP_ZERO); ++ if (!n) { ++ return -ENOMEM; ++ } ++ ++ b->bits = n; ++ b->size = new_size; ++ } ++ ++ __set_bit(nr, b->bits); ++ return 0; ++} ++ ++struct pathbuf { ++ size_t nr; ++ size_t size; ++ ++ struct pathbuf_entry { ++ u64 inum; ++ u64 offset; ++ } *entries; ++}; ++ ++static int path_down(struct pathbuf *p, u64 inum) ++{ ++ if (p->nr == p->size) { ++ size_t new_size = max_t(size_t, 256UL, p->size * 2); ++ void *n = krealloc(p->entries, ++ new_size * sizeof(p->entries[0]), ++ GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ p->entries = n; ++ p->size = new_size; ++ }; ++ ++ p->entries[p->nr++] = (struct pathbuf_entry) { ++ .inum = inum, ++ .offset = 0, ++ }; ++ return 0; ++} ++ ++noinline_for_stack ++static int check_directory_structure(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ struct inode_bitmap dirs_done = { NULL, 0 }; ++ struct pathbuf path = { 0, 0, NULL }; ++ struct pathbuf_entry *e; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent dirent; ++ bool had_unreachable; ++ u64 d_inum; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ bch_verbose(c, "checking directory structure"); ++ ++ /* DFS: */ ++restart_dfs: ++ had_unreachable = false; ++ ++ ret = inode_bitmap_set(&dirs_done, BCACHEFS_ROOT_INO); ++ if (ret) { ++ bch_err(c, "memory allocation failure in inode_bitmap_set()"); ++ goto err; ++ } ++ ++ ret = path_down(&path, BCACHEFS_ROOT_INO); ++ if (ret) ++ goto err; ++ ++ while (path.nr) { ++next: ++ e = &path.entries[path.nr - 1]; ++ ++ if (e->offset == U64_MAX) ++ goto up; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, ++ 
POS(e->inum, e->offset + 1), 0, k, ret) { ++ if (k.k->p.inode != e->inum) ++ break; ++ ++ e->offset = k.k->p.offset; ++ ++ if (k.k->type != KEY_TYPE_dirent) ++ continue; ++ ++ dirent = bkey_s_c_to_dirent(k); ++ ++ if (dirent.v->d_type != DT_DIR) ++ continue; ++ ++ d_inum = le64_to_cpu(dirent.v->d_inum); ++ ++ if (fsck_err_on(inode_bitmap_test(&dirs_done, d_inum), c, ++ "directory %llu has multiple hardlinks", ++ d_inum)) { ++ ret = remove_dirent(&trans, dirent); ++ if (ret) ++ goto err; ++ continue; ++ } ++ ++ ret = inode_bitmap_set(&dirs_done, d_inum); ++ if (ret) { ++ bch_err(c, "memory allocation failure in inode_bitmap_set()"); ++ goto err; ++ } ++ ++ ret = path_down(&path, d_inum); ++ if (ret) { ++ goto err; ++ } ++ ++ ret = bch2_trans_iter_free(&trans, iter); ++ if (ret) { ++ bch_err(c, "btree error %i in fsck", ret); ++ goto err; ++ } ++ goto next; ++ } ++ ret = bch2_trans_iter_free(&trans, iter) ?: ret; ++ if (ret) { ++ bch_err(c, "btree error %i in fsck", ret); ++ goto err; ++ } ++up: ++ path.nr--; ++ } ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS_MIN, 0); ++retry: ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ if (!S_ISDIR(le16_to_cpu(bkey_s_c_to_inode(k).v->bi_mode))) ++ continue; ++ ++ ret = bch2_empty_dir_trans(&trans, k.k->p.inode); ++ if (ret == -EINTR) ++ goto retry; ++ if (!ret) ++ continue; ++ ++ if (fsck_err_on(!inode_bitmap_test(&dirs_done, k.k->p.offset), c, ++ "unreachable directory found (inum %llu)", ++ k.k->p.offset)) { ++ bch2_trans_unlock(&trans); ++ ++ ret = reattach_inode(c, lostfound_inode, k.k->p.offset); ++ if (ret) { ++ goto err; ++ } ++ ++ had_unreachable = true; ++ } ++ } ++ bch2_trans_iter_free(&trans, iter); ++ if (ret) ++ goto err; ++ ++ if (had_unreachable) { ++ bch_info(c, "reattached unreachable directories, restarting pass to check for loops"); ++ kfree(dirs_done.bits); ++ kfree(path.entries); ++ memset(&dirs_done, 0, sizeof(dirs_done)); ++ memset(&path, 0, sizeof(path)); ++ goto restart_dfs; ++ } ++err: ++fsck_err: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ kfree(dirs_done.bits); ++ kfree(path.entries); ++ return ret; ++} ++ ++struct nlink { ++ u32 count; ++ u32 dir_count; ++}; ++ ++typedef GENRADIX(struct nlink) nlink_table; ++ ++static void inc_link(struct bch_fs *c, nlink_table *links, ++ u64 range_start, u64 *range_end, ++ u64 inum, bool dir) ++{ ++ struct nlink *link; ++ ++ if (inum < range_start || inum >= *range_end) ++ return; ++ ++ link = genradix_ptr_alloc(links, inum - range_start, GFP_KERNEL); ++ if (!link) { ++ bch_verbose(c, "allocation failed during fsck - will need another pass"); ++ *range_end = inum; ++ return; ++ } ++ ++ if (dir) ++ link->dir_count++; ++ else ++ link->count++; ++} ++ ++noinline_for_stack ++static int bch2_gc_walk_dirents(struct bch_fs *c, nlink_table *links, ++ u64 range_start, u64 *range_end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_dirent d; ++ u64 d_inum; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ inc_link(c, links, range_start, range_end, BCACHEFS_ROOT_INO, false); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_DIRENTS, POS_MIN, 0, k, ret) { ++ switch (k.k->type) { ++ case KEY_TYPE_dirent: ++ d = bkey_s_c_to_dirent(k); ++ d_inum = le64_to_cpu(d.v->d_inum); ++ ++ if (d.v->d_type == DT_DIR) ++ inc_link(c, links, range_start, range_end, ++ d.k->p.inode, true); ++ ++ inc_link(c, links, range_start, range_end, ++ d_inum, false); ++ ++ break; ++ } ++ 
++ bch2_trans_cond_resched(&trans); ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ bch_err(c, "error in fsck: btree error %i while walking dirents", ret); ++ ++ return ret; ++} ++ ++static int check_inode_nlink(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ struct bch_inode_unpacked *u, ++ struct nlink *link, ++ bool *do_update) ++{ ++ u32 i_nlink = bch2_inode_nlink_get(u); ++ u32 real_i_nlink = ++ link->count * nlink_bias(u->bi_mode) + ++ link->dir_count; ++ int ret = 0; ++ ++ /* ++ * These should have been caught/fixed by earlier passes, we don't ++ * repair them here: ++ */ ++ if (S_ISDIR(u->bi_mode) && link->count > 1) { ++ need_fsck_err(c, "directory %llu with multiple hardlinks: %u", ++ u->bi_inum, link->count); ++ return 0; ++ } ++ ++ if (S_ISDIR(u->bi_mode) && !link->count) { ++ need_fsck_err(c, "unreachable directory found (inum %llu)", ++ u->bi_inum); ++ return 0; ++ } ++ ++ if (!S_ISDIR(u->bi_mode) && link->dir_count) { ++ need_fsck_err(c, "non directory with subdirectories (inum %llu)", ++ u->bi_inum); ++ return 0; ++ } ++ ++ if (!link->count && ++ !(u->bi_flags & BCH_INODE_UNLINKED) && ++ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { ++ if (fsck_err(c, "unreachable inode %llu not marked as unlinked (type %u)", ++ u->bi_inum, mode_to_type(u->bi_mode)) == ++ FSCK_ERR_IGNORE) ++ return 0; ++ ++ ret = reattach_inode(c, lostfound_inode, u->bi_inum); ++ if (ret) ++ return ret; ++ ++ link->count = 1; ++ real_i_nlink = nlink_bias(u->bi_mode) + link->dir_count; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink < link->count) { ++ if (fsck_err(c, "inode %llu i_link too small (%u < %u, type %i)", ++ u->bi_inum, i_nlink, link->count, ++ mode_to_type(u->bi_mode)) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink != real_i_nlink && ++ c->sb.clean) { ++ if (fsck_err(c, "filesystem marked clean, " ++ "but inode %llu has wrong i_nlink " ++ "(type %u i_nlink %u, should be %u)", ++ u->bi_inum, mode_to_type(u->bi_mode), ++ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (i_nlink != real_i_nlink && ++ (c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { ++ if (fsck_err(c, "inode %llu has wrong i_nlink " ++ "(type %u i_nlink %u, should be %u)", ++ u->bi_inum, mode_to_type(u->bi_mode), ++ i_nlink, real_i_nlink) == FSCK_ERR_IGNORE) ++ return 0; ++ goto set_i_nlink; ++ } ++ ++ if (real_i_nlink && i_nlink != real_i_nlink) ++ bch_verbose(c, "setting inode %llu nlink from %u to %u", ++ u->bi_inum, i_nlink, real_i_nlink); ++set_i_nlink: ++ if (i_nlink != real_i_nlink) { ++ bch2_inode_nlink_set(u, real_i_nlink); ++ *do_update = true; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int check_inode(struct btree_trans *trans, ++ struct bch_inode_unpacked *lostfound_inode, ++ struct btree_iter *iter, ++ struct bkey_s_c_inode inode, ++ struct nlink *link) ++{ ++ struct bch_fs *c = trans->c; ++ struct bch_inode_unpacked u; ++ bool do_update = false; ++ int ret = 0; ++ ++ ret = bch2_inode_unpack(inode, &u); ++ ++ bch2_trans_unlock(trans); ++ ++ if (bch2_fs_inconsistent_on(ret, c, ++ "error unpacking inode %llu in fsck", ++ inode.k->p.inode)) ++ return ret; ++ ++ if (link) { ++ ret = check_inode_nlink(c, lostfound_inode, &u, link, ++ &do_update); ++ if (ret) ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_UNLINKED && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu unlinked", ++ u.bi_inum))) { ++ bch_verbose(c, "deleting inode %llu", u.bi_inum); ++ ++ bch2_fs_lazy_rw(c); ++ ++ 
ret = bch2_inode_rm(c, u.bi_inum); ++ if (ret) ++ bch_err(c, "error in fsck: error %i while deleting inode", ret); ++ return ret; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", ++ u.bi_inum))) { ++ bch_verbose(c, "truncating inode %llu", u.bi_inum); ++ ++ bch2_fs_lazy_rw(c); ++ ++ /* ++ * XXX: need to truncate partial blocks too here - or ideally ++ * just switch units to bytes and that issue goes away ++ */ ++ ++ ret = bch2_inode_truncate(c, u.bi_inum, u.bi_size); ++ if (ret) { ++ bch_err(c, "error in fsck: error %i truncating inode", ret); ++ return ret; ++ } ++ ++ /* ++ * We truncated without our normal sector accounting hook, just ++ * make sure we recalculate it: ++ */ ++ u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; ++ ++ u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; ++ do_update = true; ++ } ++ ++ if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && ++ (!c->sb.clean || ++ fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", ++ u.bi_inum))) { ++ s64 sectors; ++ ++ bch_verbose(c, "recounting sectors for inode %llu", ++ u.bi_inum); ++ ++ sectors = bch2_count_inode_sectors(trans, u.bi_inum); ++ if (sectors < 0) { ++ bch_err(c, "error in fsck: error %i recounting inode sectors", ++ (int) sectors); ++ return sectors; ++ } ++ ++ u.bi_sectors = sectors; ++ u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; ++ do_update = true; ++ } ++ ++ if (do_update) { ++ struct bkey_inode_buf p; ++ ++ bch2_inode_pack(&p, &u); ++ ++ ret = __bch2_trans_do(trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW, ++ (bch2_trans_update(trans, iter, &p.inode.k_i, 0), 0)); ++ if (ret) ++ bch_err(c, "error in fsck: error %i " ++ "updating inode", ret); ++ } ++fsck_err: ++ return ret; ++} ++ ++noinline_for_stack ++static int bch2_gc_walk_inodes(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode, ++ nlink_table *links, ++ u64 range_start, u64 range_end) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct nlink *link, zero_links = { 0, 0 }; ++ struct genradix_iter nlinks_iter; ++ int ret = 0, ret2 = 0; ++ u64 nlinks_pos; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, ++ POS(0, range_start), 0); ++ nlinks_iter = genradix_iter_init(links, 0); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret2 = bkey_err(k))) { ++peek_nlinks: link = genradix_iter_peek(&nlinks_iter, links); ++ ++ if (!link && (!k.k || iter->pos.offset >= range_end)) ++ break; ++ ++ nlinks_pos = range_start + nlinks_iter.pos; ++ if (iter->pos.offset > nlinks_pos) { ++ /* Should have been caught by dirents pass: */ ++ need_fsck_err_on(link && link->count, c, ++ "missing inode %llu (nlink %u)", ++ nlinks_pos, link->count); ++ genradix_iter_advance(&nlinks_iter, links); ++ goto peek_nlinks; ++ } ++ ++ if (iter->pos.offset < nlinks_pos || !link) ++ link = &zero_links; ++ ++ if (k.k && k.k->type == KEY_TYPE_inode) { ++ ret = check_inode(&trans, lostfound_inode, iter, ++ bkey_s_c_to_inode(k), link); ++ BUG_ON(ret == -EINTR); ++ if (ret) ++ break; ++ } else { ++ /* Should have been caught by dirents pass: */ ++ need_fsck_err_on(link->count, c, ++ "missing inode %llu (nlink %u)", ++ nlinks_pos, link->count); ++ } ++ ++ if (nlinks_pos == iter->pos.offset) ++ genradix_iter_advance(&nlinks_iter, links); ++ ++ bch2_btree_iter_next(iter); ++ bch2_trans_cond_resched(&trans); ++ } ++fsck_err: ++ bch2_trans_exit(&trans); ++ ++ if (ret2) 
++ bch_err(c, "error in fsck: btree error %i while walking inodes", ret2); ++ ++ return ret ?: ret2; ++} ++ ++noinline_for_stack ++static int check_inode_nlinks(struct bch_fs *c, ++ struct bch_inode_unpacked *lostfound_inode) ++{ ++ nlink_table links; ++ u64 this_iter_range_start, next_iter_range_start = 0; ++ int ret = 0; ++ ++ bch_verbose(c, "checking inode nlinks"); ++ ++ genradix_init(&links); ++ ++ do { ++ this_iter_range_start = next_iter_range_start; ++ next_iter_range_start = U64_MAX; ++ ++ ret = bch2_gc_walk_dirents(c, &links, ++ this_iter_range_start, ++ &next_iter_range_start); ++ if (ret) ++ break; ++ ++ ret = bch2_gc_walk_inodes(c, lostfound_inode, &links, ++ this_iter_range_start, ++ next_iter_range_start); ++ if (ret) ++ break; ++ ++ genradix_free(&links); ++ } while (next_iter_range_start != U64_MAX); ++ ++ genradix_free(&links); ++ ++ return ret; ++} ++ ++/* ++ * Checks for inconsistencies that shouldn't happen, unless we have a bug. ++ * Doesn't fix them yet, mainly because they haven't yet been observed: ++ */ ++int bch2_fsck_full(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ ++ return check_extents(c) ?: ++ check_dirents(c) ?: ++ check_xattrs(c) ?: ++ check_root(c, &root_inode) ?: ++ check_lostfound(c, &root_inode, &lostfound_inode) ?: ++ check_directory_structure(c, &lostfound_inode) ?: ++ check_inode_nlinks(c, &lostfound_inode); ++} ++ ++int bch2_fsck_inode_nlink(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ ++ return check_root(c, &root_inode) ?: ++ check_lostfound(c, &root_inode, &lostfound_inode) ?: ++ check_inode_nlinks(c, &lostfound_inode); ++} ++ ++int bch2_fsck_walk_inodes_only(struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_s_c_inode inode; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, 0, k, ret) { ++ if (k.k->type != KEY_TYPE_inode) ++ continue; ++ ++ inode = bkey_s_c_to_inode(k); ++ ++ if (inode.v->bi_flags & ++ (BCH_INODE_I_SIZE_DIRTY| ++ BCH_INODE_I_SECTORS_DIRTY| ++ BCH_INODE_UNLINKED)) { ++ ret = check_inode(&trans, NULL, iter, inode, NULL); ++ BUG_ON(ret == -EINTR); ++ if (ret) ++ break; ++ } ++ } ++ BUG_ON(ret == -EINTR); ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} +diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h +new file mode 100644 +index 000000000000..9e4af02bde1e +--- /dev/null ++++ b/fs/bcachefs/fsck.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_FSCK_H ++#define _BCACHEFS_FSCK_H ++ ++int bch2_fsck_full(struct bch_fs *); ++int bch2_fsck_inode_nlink(struct bch_fs *); ++int bch2_fsck_walk_inodes_only(struct bch_fs *); ++ ++#endif /* _BCACHEFS_FSCK_H */ +diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c +new file mode 100644 +index 000000000000..7d20f082ad45 +--- /dev/null ++++ b/fs/bcachefs/inode.c +@@ -0,0 +1,554 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "error.h" ++#include "extents.h" ++#include "inode.h" ++#include "str_hash.h" ++ ++#include ++ ++#include ++ ++const char * const bch2_inode_opts[] = { ++#define x(name, ...) 
#name, ++ BCH_INODE_OPTS() ++#undef x ++ NULL, ++}; ++ ++static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; ++static const u8 bits_table[8] = { ++ 1 * 8 - 1, ++ 2 * 8 - 2, ++ 3 * 8 - 3, ++ 4 * 8 - 4, ++ 6 * 8 - 5, ++ 8 * 8 - 6, ++ 10 * 8 - 7, ++ 13 * 8 - 8, ++}; ++ ++static int inode_encode_field(u8 *out, u8 *end, u64 hi, u64 lo) ++{ ++ __be64 in[2] = { cpu_to_be64(hi), cpu_to_be64(lo), }; ++ unsigned shift, bytes, bits = likely(!hi) ++ ? fls64(lo) ++ : fls64(hi) + 64; ++ ++ for (shift = 1; shift <= 8; shift++) ++ if (bits < bits_table[shift - 1]) ++ goto got_shift; ++ ++ BUG(); ++got_shift: ++ bytes = byte_table[shift - 1]; ++ ++ BUG_ON(out + bytes > end); ++ ++ memcpy(out, (u8 *) in + 16 - bytes, bytes); ++ *out |= (1 << 8) >> shift; ++ ++ return bytes; ++} ++ ++static int inode_decode_field(const u8 *in, const u8 *end, ++ u64 out[2], unsigned *out_bits) ++{ ++ __be64 be[2] = { 0, 0 }; ++ unsigned bytes, shift; ++ u8 *p; ++ ++ if (in >= end) ++ return -1; ++ ++ if (!*in) ++ return -1; ++ ++ /* ++ * position of highest set bit indicates number of bytes: ++ * shift = number of bits to remove in high byte: ++ */ ++ shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ ++ bytes = byte_table[shift - 1]; ++ ++ if (in + bytes > end) ++ return -1; ++ ++ p = (u8 *) be + 16 - bytes; ++ memcpy(p, in, bytes); ++ *p ^= (1 << 8) >> shift; ++ ++ out[0] = be64_to_cpu(be[0]); ++ out[1] = be64_to_cpu(be[1]); ++ *out_bits = out[0] ? 64 + fls64(out[0]) : fls64(out[1]); ++ ++ return bytes; ++} ++ ++void bch2_inode_pack(struct bkey_inode_buf *packed, ++ const struct bch_inode_unpacked *inode) ++{ ++ u8 *out = packed->inode.v.fields; ++ u8 *end = (void *) &packed[1]; ++ u8 *last_nonzero_field = out; ++ unsigned nr_fields = 0, last_nonzero_fieldnr = 0; ++ unsigned bytes; ++ ++ bkey_inode_init(&packed->inode.k_i); ++ packed->inode.k.p.offset = inode->bi_inum; ++ packed->inode.v.bi_hash_seed = inode->bi_hash_seed; ++ packed->inode.v.bi_flags = cpu_to_le32(inode->bi_flags); ++ packed->inode.v.bi_mode = cpu_to_le16(inode->bi_mode); ++ ++#define x(_name, _bits) \ ++ out += inode_encode_field(out, end, 0, inode->_name); \ ++ nr_fields++; \ ++ \ ++ if (inode->_name) { \ ++ last_nonzero_field = out; \ ++ last_nonzero_fieldnr = nr_fields; \ ++ } ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ out = last_nonzero_field; ++ nr_fields = last_nonzero_fieldnr; ++ ++ bytes = out - (u8 *) &packed->inode.v; ++ set_bkey_val_bytes(&packed->inode.k, bytes); ++ memset_u64s_tail(&packed->inode.v, 0, bytes); ++ ++ SET_INODE_NR_FIELDS(&packed->inode.v, nr_fields); ++ ++ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { ++ struct bch_inode_unpacked unpacked; ++ ++ int ret = bch2_inode_unpack(inode_i_to_s_c(&packed->inode), ++ &unpacked); ++ BUG_ON(ret); ++ BUG_ON(unpacked.bi_inum != inode->bi_inum); ++ BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); ++ BUG_ON(unpacked.bi_mode != inode->bi_mode); ++ ++#define x(_name, _bits) BUG_ON(unpacked._name != inode->_name); ++ BCH_INODE_FIELDS() ++#undef x ++ } ++} ++ ++int bch2_inode_unpack(struct bkey_s_c_inode inode, ++ struct bch_inode_unpacked *unpacked) ++{ ++ const u8 *in = inode.v->fields; ++ const u8 *end = (void *) inode.v + bkey_val_bytes(inode.k); ++ u64 field[2]; ++ unsigned fieldnr = 0, field_bits; ++ int ret; ++ ++ unpacked->bi_inum = inode.k->p.offset; ++ unpacked->bi_hash_seed = inode.v->bi_hash_seed; ++ unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); ++ unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); ++ ++#define x(_name, _bits) \ ++ if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { 
\ ++ memset(&unpacked->_name, 0, \ ++ sizeof(*unpacked) - \ ++ offsetof(struct bch_inode_unpacked, _name)); \ ++ return 0; \ ++ } \ ++ \ ++ ret = inode_decode_field(in, end, field, &field_bits); \ ++ if (ret < 0) \ ++ return ret; \ ++ \ ++ if (field_bits > sizeof(unpacked->_name) * 8) \ ++ return -1; \ ++ \ ++ unpacked->_name = field[1]; \ ++ in += ret; ++ ++ BCH_INODE_FIELDS() ++#undef x ++ ++ /* XXX: signal if there were more fields than expected? */ ++ ++ return 0; ++} ++ ++struct btree_iter *bch2_inode_peek(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode, ++ u64 inum, unsigned flags) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, POS(0, inum), ++ BTREE_ITER_SLOTS|flags); ++ if (IS_ERR(iter)) ++ return iter; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = k.k->type == KEY_TYPE_inode ? 0 : -EIO; ++ if (ret) ++ goto err; ++ ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), inode); ++ if (ret) ++ goto err; ++ ++ return iter; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ERR_PTR(ret); ++} ++ ++int bch2_inode_write(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bch_inode_unpacked *inode) ++{ ++ struct bkey_inode_buf *inode_p; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return PTR_ERR(inode_p); ++ ++ bch2_inode_pack(inode_p, inode); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++ return 0; ++} ++ ++const char *bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ struct bch_inode_unpacked unpacked; ++ ++ if (k.k->p.inode) ++ return "nonzero k.p.inode"; ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_inode)) ++ return "incorrect value size"; ++ ++ if (k.k->p.offset < BLOCKDEV_INODE_MAX) ++ return "fs inode in blockdev range"; ++ ++ if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) ++ return "invalid str hash type"; ++ ++ if (bch2_inode_unpack(inode, &unpacked)) ++ return "invalid variable length fields"; ++ ++ if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) ++ return "invalid data checksum type"; ++ ++ if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) ++ return "invalid data checksum type"; ++ ++ if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && ++ unpacked.bi_nlink != 0) ++ return "flagged as unlinked but bi_nlink != 0"; ++ ++ return NULL; ++} ++ ++void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); ++ struct bch_inode_unpacked unpacked; ++ ++ if (bch2_inode_unpack(inode, &unpacked)) { ++ pr_buf(out, "(unpack error)"); ++ return; ++ } ++ ++#define x(_name, _bits) \ ++ pr_buf(out, #_name ": %llu ", (u64) unpacked._name); ++ BCH_INODE_FIELDS() ++#undef x ++} ++ ++const char *bch2_inode_generation_invalid(const struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ if (k.k->p.inode) ++ return "nonzero k.p.inode"; ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_inode_generation)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); ++ ++ pr_buf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); ++} ++ ++void bch2_inode_init_early(struct bch_fs *c, ++ struct bch_inode_unpacked *inode_u) ++{ ++ enum bch_str_hash_type 
str_hash = ++ bch2_str_hash_opt_to_type(c, c->opts.str_hash); ++ ++ memset(inode_u, 0, sizeof(*inode_u)); ++ ++ /* ick */ ++ inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; ++ get_random_bytes(&inode_u->bi_hash_seed, ++ sizeof(inode_u->bi_hash_seed)); ++} ++ ++void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ inode_u->bi_mode = mode; ++ inode_u->bi_uid = uid; ++ inode_u->bi_gid = gid; ++ inode_u->bi_dev = rdev; ++ inode_u->bi_atime = now; ++ inode_u->bi_mtime = now; ++ inode_u->bi_ctime = now; ++ inode_u->bi_otime = now; ++ ++ if (parent && parent->bi_mode & S_ISGID) { ++ inode_u->bi_gid = parent->bi_gid; ++ if (S_ISDIR(mode)) ++ inode_u->bi_mode |= S_ISGID; ++ } ++ ++ if (parent) { ++#define x(_name, ...) inode_u->bi_##_name = parent->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ } ++} ++ ++void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, ++ uid_t uid, gid_t gid, umode_t mode, dev_t rdev, ++ struct bch_inode_unpacked *parent) ++{ ++ bch2_inode_init_early(c, inode_u); ++ bch2_inode_init_late(inode_u, bch2_current_time(c), ++ uid, gid, mode, rdev, parent); ++} ++ ++static inline u32 bkey_generation(struct bkey_s_c k) ++{ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ BUG(); ++ case KEY_TYPE_inode_generation: ++ return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); ++ default: ++ return 0; ++ } ++} ++ ++int bch2_inode_create(struct btree_trans *trans, ++ struct bch_inode_unpacked *inode_u, ++ u64 min, u64 max, u64 *hint) ++{ ++ struct bkey_inode_buf *inode_p; ++ struct btree_iter *iter = NULL; ++ struct bkey_s_c k; ++ u64 start; ++ int ret; ++ ++ if (!max) ++ max = ULLONG_MAX; ++ ++ if (trans->c->opts.inodes_32bit) ++ max = min_t(u64, max, U32_MAX); ++ ++ start = READ_ONCE(*hint); ++ ++ if (start >= max || start < min) ++ start = min; ++ ++ inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); ++ if (IS_ERR(inode_p)) ++ return PTR_ERR(inode_p); ++again: ++ for_each_btree_key(trans, iter, BTREE_ID_INODES, POS(0, start), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (bkey_cmp(iter->pos, POS(0, max)) > 0) ++ break; ++ ++ if (k.k->type != KEY_TYPE_inode) ++ goto found_slot; ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ ++ if (ret) ++ return ret; ++ ++ if (start != min) { ++ /* Retry from start */ ++ start = min; ++ goto again; ++ } ++ ++ return -ENOSPC; ++found_slot: ++ *hint = k.k->p.offset; ++ inode_u->bi_inum = k.k->p.offset; ++ inode_u->bi_generation = bkey_generation(k); ++ ++ bch2_inode_pack(inode_p, inode_u); ++ bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); ++ bch2_trans_iter_put(trans, iter); ++ return 0; ++} ++ ++int bch2_inode_rm(struct bch_fs *c, u64 inode_nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_inode_generation delete; ++ struct bpos start = POS(inode_nr, 0); ++ struct bpos end = POS(inode_nr + 1, 0); ++ int ret; ++ ++ /* ++ * If this was a directory, there shouldn't be any real dirents left - ++ * but there could be whiteouts (from hash collisions) that we should ++ * delete: ++ * ++ * XXX: the dirent could ideally would delete whiteouts when they're no ++ * longer needed ++ */ ++ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ start, end, NULL) ?: ++ bch2_btree_delete_range(c, BTREE_ID_DIRENTS, ++ start, end, NULL); ++ if (ret) ++ return ret; ++ ++ bch2_trans_init(&trans, c, 
0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_INODES, POS(0, inode_nr), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ do { ++ struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); ++ u32 bi_generation = 0; ++ ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ ++ bch2_fs_inconsistent_on(k.k->type != KEY_TYPE_inode, c, ++ "inode %llu not found when deleting", ++ inode_nr); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_inode: { ++ struct bch_inode_unpacked inode_u; ++ ++ if (!bch2_inode_unpack(bkey_s_c_to_inode(k), &inode_u)) ++ bi_generation = inode_u.bi_generation + 1; ++ break; ++ } ++ case KEY_TYPE_inode_generation: { ++ struct bkey_s_c_inode_generation g = ++ bkey_s_c_to_inode_generation(k); ++ bi_generation = le32_to_cpu(g.v->bi_generation); ++ break; ++ } ++ } ++ ++ if (!bi_generation) { ++ bkey_init(&delete.k); ++ delete.k.p.offset = inode_nr; ++ } else { ++ bkey_inode_generation_init(&delete.k_i); ++ delete.k.p.offset = inode_nr; ++ delete.v.bi_generation = cpu_to_le32(bi_generation); ++ } ++ ++ bch2_trans_update(&trans, iter, &delete.k_i, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++ } while (ret == -EINTR); ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *trans, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_INODES, ++ POS(0, inode_nr), BTREE_ITER_SLOTS); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ ret = k.k->type == KEY_TYPE_inode ++ ? bch2_inode_unpack(bkey_s_c_to_inode(k), inode) ++ : -ENOENT; ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int bch2_inode_find_by_inum(struct bch_fs *c, u64 inode_nr, ++ struct bch_inode_unpacked *inode) ++{ ++ return bch2_trans_do(c, NULL, NULL, 0, ++ bch2_inode_find_by_inum_trans(&trans, inode_nr, inode)); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_inode_pack_test(void) ++{ ++ struct bch_inode_unpacked *u, test_inodes[] = { ++ { ++ .bi_atime = U64_MAX, ++ .bi_ctime = U64_MAX, ++ .bi_mtime = U64_MAX, ++ .bi_otime = U64_MAX, ++ .bi_size = U64_MAX, ++ .bi_sectors = U64_MAX, ++ .bi_uid = U32_MAX, ++ .bi_gid = U32_MAX, ++ .bi_nlink = U32_MAX, ++ .bi_generation = U32_MAX, ++ .bi_dev = U32_MAX, ++ }, ++ }; ++ ++ for (u = test_inodes; ++ u < test_inodes + ARRAY_SIZE(test_inodes); ++ u++) { ++ struct bkey_inode_buf p; ++ ++ bch2_inode_pack(&p, u); ++ } ++} ++#endif +diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h +new file mode 100644 +index 000000000000..bb759a46dc41 +--- /dev/null ++++ b/fs/bcachefs/inode.h +@@ -0,0 +1,177 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_INODE_H ++#define _BCACHEFS_INODE_H ++ ++#include "opts.h" ++ ++extern const char * const bch2_inode_opts[]; ++ ++const char *bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_invalid, \ ++ .val_to_text = bch2_inode_to_text, \ ++} ++ ++const char *bch2_inode_generation_invalid(const struct bch_fs *, ++ struct bkey_s_c); ++void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++#define bch2_bkey_ops_inode_generation (struct bkey_ops) { \ ++ .key_invalid = bch2_inode_generation_invalid, \ ++ .val_to_text = 
bch2_inode_generation_to_text, \ ++} ++ ++struct bch_inode_unpacked { ++ u64 bi_inum; ++ __le64 bi_hash_seed; ++ u32 bi_flags; ++ u16 bi_mode; ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_FIELDS() ++#undef x ++}; ++ ++struct bkey_inode_buf { ++ struct bkey_i_inode inode; ++ ++#define x(_name, _bits) + 8 + _bits / 8 ++ u8 _pad[0 + BCH_INODE_FIELDS()]; ++#undef x ++} __attribute__((packed, aligned(8))); ++ ++void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); ++int bch2_inode_unpack(struct bkey_s_c_inode, struct bch_inode_unpacked *); ++ ++struct btree_iter *bch2_inode_peek(struct btree_trans *, ++ struct bch_inode_unpacked *, u64, unsigned); ++int bch2_inode_write(struct btree_trans *, struct btree_iter *, ++ struct bch_inode_unpacked *); ++ ++void bch2_inode_init_early(struct bch_fs *, ++ struct bch_inode_unpacked *); ++void bch2_inode_init_late(struct bch_inode_unpacked *, u64, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, ++ uid_t, gid_t, umode_t, dev_t, ++ struct bch_inode_unpacked *); ++ ++int bch2_inode_create(struct btree_trans *, ++ struct bch_inode_unpacked *, ++ u64, u64, u64 *); ++ ++int bch2_inode_rm(struct bch_fs *, u64); ++ ++int bch2_inode_find_by_inum_trans(struct btree_trans *, u64, ++ struct bch_inode_unpacked *); ++int bch2_inode_find_by_inum(struct bch_fs *, u64, struct bch_inode_unpacked *); ++ ++static inline struct bch_io_opts bch2_inode_opts_get(struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts ret = { 0 }; ++ ++#define x(_name, _bits) \ ++ if (inode->bi_##_name) \ ++ opt_set(ret, _name, inode->bi_##_name - 1); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Inode_opt_##_name: \ ++ inode->bi_##_name = v; \ ++ break; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, ++ enum inode_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Inode_opt_##_name: \ ++ return inode->bi_##_name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++static inline struct bch_io_opts ++io_opts(struct bch_fs *c, struct bch_inode_unpacked *inode) ++{ ++ struct bch_io_opts opts = bch2_opts_to_inode_opts(c->opts); ++ ++ bch2_io_opts_apply(&opts, bch2_inode_opts_get(inode)); ++ return opts; ++} ++ ++static inline u8 mode_to_type(umode_t mode) ++{ ++ return (mode >> 12) & 15; ++} ++ ++/* i_nlink: */ ++ ++static inline unsigned nlink_bias(umode_t mode) ++{ ++ return S_ISDIR(mode) ? 2 : 1; ++} ++ ++static inline void bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) ++{ ++ if (bi->bi_flags & BCH_INODE_UNLINKED) ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ else ++ bi->bi_nlink++; ++} ++ ++static inline void bch2_inode_nlink_dec(struct bch_inode_unpacked *bi) ++{ ++ BUG_ON(bi->bi_flags & BCH_INODE_UNLINKED); ++ if (bi->bi_nlink) ++ bi->bi_nlink--; ++ else ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++} ++ ++static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) ++{ ++ return bi->bi_flags & BCH_INODE_UNLINKED ++ ? 
0 ++ : bi->bi_nlink + nlink_bias(bi->bi_mode); ++} ++ ++static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, ++ unsigned nlink) ++{ ++ if (nlink) { ++ bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); ++ bi->bi_flags &= ~BCH_INODE_UNLINKED; ++ } else { ++ bi->bi_nlink = 0; ++ bi->bi_flags |= BCH_INODE_UNLINKED; ++ } ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_inode_pack_test(void); ++#else ++static inline void bch2_inode_pack_test(void) {} ++#endif ++ ++#endif /* _BCACHEFS_INODE_H */ +diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c +new file mode 100644 +index 000000000000..0a4b4eed465c +--- /dev/null ++++ b/fs/bcachefs/io.c +@@ -0,0 +1,2389 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Some low level IO code, and hacks for various block layer limitations ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "bset.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "compress.h" ++#include "clock.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "extent_update.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "rebalance.h" ++#include "super.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++ ++#include ++ ++const char *bch2_blk_status_to_str(blk_status_t status) ++{ ++ if (status == BLK_STS_REMOVED) ++ return "device removed"; ++ return blk_status_to_str(status); ++} ++ ++static bool bch2_target_congested(struct bch_fs *c, u16 target) ++{ ++ const struct bch_devs_mask *devs; ++ unsigned d, nr = 0, total = 0; ++ u64 now = local_clock(), last; ++ s64 congested; ++ struct bch_dev *ca; ++ ++ if (!target) ++ return false; ++ ++ rcu_read_lock(); ++ devs = bch2_target_to_mask(c, target) ?: ++ &c->rw_devs[BCH_DATA_user]; ++ ++ for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { ++ ca = rcu_dereference(c->devs[d]); ++ if (!ca) ++ continue; ++ ++ congested = atomic_read(&ca->congested); ++ last = READ_ONCE(ca->congested_last); ++ if (time_after64(now, last)) ++ congested -= (now - last) >> 12; ++ ++ total += max(congested, 0LL); ++ nr++; ++ } ++ rcu_read_unlock(); ++ ++ return bch2_rand_range(nr * CONGESTED_MAX) < total; ++} ++ ++static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, ++ u64 now, int rw) ++{ ++ u64 latency_capable = ++ ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; ++ /* ideally we'd be taking into account the device's variance here: */ ++ u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); ++ s64 latency_over = io_latency - latency_threshold; ++ ++ if (latency_threshold && latency_over > 0) { ++ /* ++ * bump up congested by approximately latency_over * 4 / ++ * latency_threshold - we don't need much accuracy here so don't ++ * bother with the divide: ++ */ ++ if (atomic_read(&ca->congested) < CONGESTED_MAX) ++ atomic_add(latency_over >> ++ max_t(int, ilog2(latency_threshold) - 2, 0), ++ &ca->congested); ++ ++ ca->congested_last = now; ++ } else if (atomic_read(&ca->congested) > 0) { ++ atomic_dec(&ca->congested); ++ } ++} ++ ++void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) ++{ ++ atomic64_t *latency = &ca->cur_latency[rw]; ++ u64 now = local_clock(); ++ u64 io_latency = time_after64(now, submit_time) ++ ? 
now - submit_time ++ : 0; ++ u64 old, new, v = atomic64_read(latency); ++ ++ do { ++ old = v; ++ ++ /* ++ * If the io latency was reasonably close to the current ++ * latency, skip doing the update and atomic operation - most of ++ * the time: ++ */ ++ if (abs((int) (old - io_latency)) < (old >> 1) && ++ now & ~(~0 << 5)) ++ break; ++ ++ new = ewma_add(old, io_latency, 5); ++ } while ((v = atomic64_cmpxchg(latency, old, new)) != old); ++ ++ bch2_congested_acct(ca, io_latency, now, rw); ++ ++ __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); ++} ++ ++/* Allocate, free from mempool: */ ++ ++void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) ++{ ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bio_for_each_segment_all(bv, bio, iter) ++ if (bv->bv_page != ZERO_PAGE(0)) ++ mempool_free(bv->bv_page, &c->bio_bounce_pages); ++ bio->bi_vcnt = 0; ++} ++ ++static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) ++{ ++ struct page *page; ++ ++ if (likely(!*using_mempool)) { ++ page = alloc_page(GFP_NOIO); ++ if (unlikely(!page)) { ++ mutex_lock(&c->bio_bounce_pages_lock); ++ *using_mempool = true; ++ goto pool_alloc; ++ ++ } ++ } else { ++pool_alloc: ++ page = mempool_alloc(&c->bio_bounce_pages, GFP_NOIO); ++ } ++ ++ return page; ++} ++ ++void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, ++ size_t size) ++{ ++ bool using_mempool = false; ++ ++ while (size) { ++ struct page *page = __bio_alloc_page_pool(c, &using_mempool); ++ unsigned len = min(PAGE_SIZE, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, 0)); ++ size -= len; ++ } ++ ++ if (using_mempool) ++ mutex_unlock(&c->bio_bounce_pages_lock); ++} ++ ++/* Extent update path: */ ++ ++static int sum_sector_overwrites(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct bkey_i *new, ++ bool may_allocate, ++ bool *maybe_extending, ++ s64 *delta) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c old; ++ int ret = 0; ++ ++ *maybe_extending = true; ++ *delta = 0; ++ ++ iter = bch2_trans_copy_iter(trans, extent_iter); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, old, ret) { ++ if (!may_allocate && ++ bch2_bkey_nr_ptrs_fully_allocated(old) < ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new))) { ++ ret = -ENOSPC; ++ break; ++ } ++ ++ *delta += (min(new->k.p.offset, ++ old.k->p.offset) - ++ max(bkey_start_offset(&new->k), ++ bkey_start_offset(old.k))) * ++ (bkey_extent_is_allocation(&new->k) - ++ bkey_extent_is_allocation(old.k)); ++ ++ if (bkey_cmp(old.k->p, new->k.p) >= 0) { ++ /* ++ * Check if there's already data above where we're ++ * going to be writing to - this means we're definitely ++ * not extending the file: ++ * ++ * Note that it's not sufficient to check if there's ++ * data up to the sector offset we're going to be ++ * writing to, because i_size could be up to one block ++ * less: ++ */ ++ if (!bkey_cmp(old.k->p, new->k.p)) ++ old = bch2_btree_iter_next(iter); ++ ++ if (old.k && !bkey_err(old) && ++ old.k->p.inode == extent_iter->pos.inode && ++ bkey_extent_is_data(old.k)) ++ *maybe_extending = false; ++ ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int bch2_extent_update(struct btree_trans *trans, ++ struct btree_iter *iter, ++ struct bkey_i *k, ++ struct disk_reservation *disk_res, ++ u64 *journal_seq, ++ u64 new_i_size, ++ s64 *i_sectors_delta) ++{ ++ /* this must live until after bch2_trans_commit(): */ ++ struct bkey_inode_buf inode_p; ++ bool 
extending = false; ++ s64 delta = 0; ++ int ret; ++ ++ ret = bch2_extent_trim_atomic(k, iter); ++ if (ret) ++ return ret; ++ ++ ret = sum_sector_overwrites(trans, iter, k, ++ disk_res && disk_res->sectors != 0, ++ &extending, &delta); ++ if (ret) ++ return ret; ++ ++ new_i_size = extending ++ ? min(k->k.p.offset << 9, new_i_size) ++ : 0; ++ ++ if (delta || new_i_size) { ++ struct btree_iter *inode_iter; ++ struct bch_inode_unpacked inode_u; ++ ++ inode_iter = bch2_inode_peek(trans, &inode_u, ++ k->k.p.inode, BTREE_ITER_INTENT); ++ if (IS_ERR(inode_iter)) ++ return PTR_ERR(inode_iter); ++ ++ /* ++ * XXX: ++ * writeback can race a bit with truncate, because truncate ++ * first updates the inode then truncates the pagecache. This is ++ * ugly, but lets us preserve the invariant that the in memory ++ * i_size is always >= the on disk i_size. ++ * ++ BUG_ON(new_i_size > inode_u.bi_size && ++ (inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY)); ++ */ ++ BUG_ON(new_i_size > inode_u.bi_size && !extending); ++ ++ if (!(inode_u.bi_flags & BCH_INODE_I_SIZE_DIRTY) && ++ new_i_size > inode_u.bi_size) ++ inode_u.bi_size = new_i_size; ++ else ++ new_i_size = 0; ++ ++ inode_u.bi_sectors += delta; ++ ++ if (delta || new_i_size) { ++ bch2_inode_pack(&inode_p, &inode_u); ++ bch2_trans_update(trans, inode_iter, ++ &inode_p.inode.k_i, 0); ++ } ++ ++ bch2_trans_iter_put(trans, inode_iter); ++ } ++ ++ bch2_trans_update(trans, iter, k, 0); ++ ++ ret = bch2_trans_commit(trans, disk_res, journal_seq, ++ BTREE_INSERT_NOCHECK_RW| ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE); ++ if (!ret && i_sectors_delta) ++ *i_sectors_delta += delta; ++ ++ return ret; ++} ++ ++int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, ++ struct bpos end, u64 *journal_seq, ++ s64 *i_sectors_delta) ++{ ++ struct bch_fs *c = trans->c; ++ unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); ++ struct bkey_s_c k; ++ int ret = 0, ret2 = 0; ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ bkey_cmp(iter->pos, end) < 0) { ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i delete; ++ ++ bch2_trans_begin(trans); ++ ++ ret = bkey_err(k); ++ if (ret) ++ goto btree_err; ++ ++ bkey_init(&delete.k); ++ delete.k.p = iter->pos; ++ ++ /* create the biggest key we can */ ++ bch2_key_resize(&delete.k, max_sectors); ++ bch2_cut_back(end, &delete); ++ ++ ret = bch2_extent_update(trans, iter, &delete, ++ &disk_res, journal_seq, ++ 0, i_sectors_delta); ++ bch2_disk_reservation_put(c, &disk_res); ++btree_err: ++ if (ret == -EINTR) { ++ ret2 = ret; ++ ret = 0; ++ } ++ if (ret) ++ break; ++ } ++ ++ if (bkey_cmp(iter->pos, end) > 0) { ++ bch2_btree_iter_set_pos(iter, end); ++ ret = bch2_btree_iter_traverse(iter); ++ } ++ ++ return ret ?: ret2; ++} ++ ++int bch2_fpunch(struct bch_fs *c, u64 inum, u64 start, u64 end, ++ u64 *journal_seq, s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inum, start), ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_fpunch_at(&trans, iter, POS(inum, end), ++ journal_seq, i_sectors_delta); ++ bch2_trans_exit(&trans); ++ ++ if (ret == -EINTR) ++ ret = 0; ++ ++ return ret; ++} ++ ++int bch2_write_index_default(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_on_stack sk; ++ struct keylist *keys = &op->insert_keys; ++ struct bkey_i *k = bch2_keylist_front(keys); ++ struct btree_trans 
trans; ++ struct btree_iter *iter; ++ int ret; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ do { ++ bch2_trans_begin(&trans); ++ ++ k = bch2_keylist_front(keys); ++ ++ bkey_on_stack_realloc(&sk, c, k->k.u64s); ++ bkey_copy(sk.k, k); ++ bch2_cut_front(iter->pos, sk.k); ++ ++ ret = bch2_extent_update(&trans, iter, sk.k, ++ &op->res, op_journal_seq(op), ++ op->new_i_size, &op->i_sectors_delta); ++ if (ret == -EINTR) ++ continue; ++ if (ret) ++ break; ++ ++ if (bkey_cmp(iter->pos, k->k.p) >= 0) ++ bch2_keylist_pop_front(keys); ++ } while (!bch2_keylist_empty(keys)); ++ ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ ++ return ret; ++} ++ ++/* Writes */ ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, ++ enum bch_data_type type, ++ const struct bkey_i *k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); ++ const struct bch_extent_ptr *ptr; ++ struct bch_write_bio *n; ++ struct bch_dev *ca; ++ ++ BUG_ON(c->opts.nochanges); ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || ++ !c->devs[ptr->dev]); ++ ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ ++ if (to_entry(ptr + 1) < ptrs.end) { ++ n = to_wbio(bio_clone_fast(&wbio->bio, GFP_NOIO, ++ &ca->replica_set)); ++ ++ n->bio.bi_end_io = wbio->bio.bi_end_io; ++ n->bio.bi_private = wbio->bio.bi_private; ++ n->parent = wbio; ++ n->split = true; ++ n->bounce = false; ++ n->put_bio = true; ++ n->bio.bi_opf = wbio->bio.bi_opf; ++ bio_inc_remaining(&wbio->bio); ++ } else { ++ n = wbio; ++ n->split = false; ++ } ++ ++ n->c = c; ++ n->dev = ptr->dev; ++ n->have_ioref = bch2_dev_get_ioref(ca, ++ type == BCH_DATA_btree ? 
READ : WRITE); ++ n->submit_time = local_clock(); ++ n->bio.bi_iter.bi_sector = ptr->offset; ++ ++ if (!journal_flushes_device(ca)) ++ n->bio.bi_opf |= REQ_FUA; ++ ++ if (likely(n->have_ioref)) { ++ this_cpu_add(ca->io_done->sectors[WRITE][type], ++ bio_sectors(&n->bio)); ++ ++ bio_set_dev(&n->bio, ca->disk_sb.bdev); ++ submit_bio(&n->bio); ++ } else { ++ n->bio.bi_status = BLK_STS_REMOVED; ++ bio_endio(&n->bio); ++ } ++ } ++} ++ ++static void __bch2_write(struct closure *); ++ ++static void bch2_write_done(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ if (!op->error && (op->flags & BCH_WRITE_FLUSH)) ++ op->error = bch2_journal_error(&c->journal); ++ ++ bch2_disk_reservation_put(c, &op->res); ++ percpu_ref_put(&c->writes); ++ bch2_keylist_free(&op->insert_keys, op->inline_keys); ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); ++ ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ up(&c->io_in_flight); ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); ++ op->end_io(op); ++ } else { ++ closure_return(cl); ++ } ++} ++ ++/** ++ * bch_write_index - after a write, update index to point to new data ++ */ ++static void __bch2_write_index(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct keylist *keys = &op->insert_keys; ++ struct bch_extent_ptr *ptr; ++ struct bkey_i *src, *dst = keys->keys, *n, *k; ++ unsigned dev; ++ int ret; ++ ++ for (src = keys->keys; src != keys->top; src = n) { ++ n = bkey_next(src); ++ ++ if (bkey_extent_is_direct_data(&src->k)) { ++ bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, ++ test_bit(ptr->dev, op->failed.d)); ++ ++ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) { ++ ret = -EIO; ++ goto err; ++ } ++ } ++ ++ if (dst != src) ++ memmove_u64s_down(dst, src, src->u64s); ++ dst = bkey_next(dst); ++ } ++ ++ keys->top = dst; ++ ++ /* ++ * probably not the ideal place to hook this in, but I don't ++ * particularly want to plumb io_opts all the way through the btree ++ * update stack right now ++ */ ++ for_each_keylist_key(keys, k) { ++ bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); ++ ++ if (bch2_bkey_is_incompressible(bkey_i_to_s_c(k))) ++ bch2_check_set_feature(op->c, BCH_FEATURE_incompressible); ++ ++ } ++ ++ if (!bch2_keylist_empty(keys)) { ++ u64 sectors_start = keylist_sectors(keys); ++ int ret = op->index_update_fn(op); ++ ++ BUG_ON(ret == -EINTR); ++ BUG_ON(keylist_sectors(keys) && !ret); ++ ++ op->written += sectors_start - keylist_sectors(keys); ++ ++ if (ret) { ++ __bcache_io_error(c, "btree IO error %i", ret); ++ op->error = ret; ++ } ++ } ++out: ++ /* If some a bucket wasn't written, we can't erasure code it: */ ++ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) ++ bch2_open_bucket_write_error(c, &op->open_buckets, dev); ++ ++ bch2_open_buckets_put(c, &op->open_buckets); ++ return; ++err: ++ keys->top = keys->keys; ++ op->error = ret; ++ goto out; ++} ++ ++static void bch2_write_index(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ ++ __bch2_write_index(op); ++ ++ if (!(op->flags & BCH_WRITE_DONE)) { ++ continue_at(cl, __bch2_write, index_update_wq(op)); ++ } else if (!op->error && (op->flags & BCH_WRITE_FLUSH)) { ++ bch2_journal_flush_seq_async(&c->journal, ++ *op_journal_seq(op), ++ cl); ++ continue_at(cl, bch2_write_done, index_update_wq(op)); ++ } else { ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ } 
++} ++ ++static void bch2_write_endio(struct bio *bio) ++{ ++ struct closure *cl = bio->bi_private; ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_write_bio *wbio = to_wbio(bio); ++ struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; ++ struct bch_fs *c = wbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "data write: %s", ++ bch2_blk_status_to_str(bio->bi_status))) ++ set_bit(wbio->dev, op->failed.d); ++ ++ if (wbio->have_ioref) { ++ bch2_latency_acct(ca, wbio->submit_time, WRITE); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (wbio->bounce) ++ bch2_bio_free_pages_pool(c, bio); ++ ++ if (wbio->put_bio) ++ bio_put(bio); ++ ++ if (parent) ++ bio_endio(&parent->bio); ++ else if (!(op->flags & BCH_WRITE_SKIP_CLOSURE_PUT)) ++ closure_put(cl); ++ else ++ continue_at_nobarrier(cl, bch2_write_index, index_update_wq(op)); ++} ++ ++static void init_append_extent(struct bch_write_op *op, ++ struct write_point *wp, ++ struct bversion version, ++ struct bch_extent_crc_unpacked crc) ++{ ++ struct bch_fs *c = op->c; ++ struct bkey_i_extent *e; ++ struct open_bucket *ob; ++ unsigned i; ++ ++ BUG_ON(crc.compressed_size > wp->sectors_free); ++ wp->sectors_free -= crc.compressed_size; ++ op->pos.offset += crc.uncompressed_size; ++ ++ e = bkey_extent_init(op->insert_keys.top); ++ e->k.p = op->pos; ++ e->k.size = crc.uncompressed_size; ++ e->k.version = version; ++ ++ if (crc.csum_type || ++ crc.compression_type || ++ crc.nonce) ++ bch2_extent_crc_append(&e->k_i, crc); ++ ++ open_bucket_for_each(c, &wp->ptrs, ob, i) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ob->ptr.dev); ++ union bch_extent_entry *end = ++ bkey_val_end(bkey_i_to_s(&e->k_i)); ++ ++ end->ptr = ob->ptr; ++ end->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; ++ end->ptr.cached = !ca->mi.durability || ++ (op->flags & BCH_WRITE_CACHED) != 0; ++ end->ptr.offset += ca->mi.bucket_size - ob->sectors_free; ++ ++ e->k.u64s++; ++ ++ BUG_ON(crc.compressed_size > ob->sectors_free); ++ ob->sectors_free -= crc.compressed_size; ++ } ++ ++ bch2_keylist_push(&op->insert_keys); ++} ++ ++static struct bio *bch2_write_bio_alloc(struct bch_fs *c, ++ struct write_point *wp, ++ struct bio *src, ++ bool *page_alloc_failed, ++ void *buf) ++{ ++ struct bch_write_bio *wbio; ++ struct bio *bio; ++ unsigned output_available = ++ min(wp->sectors_free << 9, src->bi_iter.bi_size); ++ unsigned pages = DIV_ROUND_UP(output_available + ++ (buf ++ ? 
((unsigned long) buf & (PAGE_SIZE - 1)) ++ : 0), PAGE_SIZE); ++ ++ bio = bio_alloc_bioset(GFP_NOIO, pages, &c->bio_write); ++ wbio = wbio_init(bio); ++ wbio->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ wbio->bio.bi_opf = src->bi_opf; ++ ++ if (buf) { ++ bch2_bio_map(bio, buf, output_available); ++ return bio; ++ } ++ ++ wbio->bounce = true; ++ ++ /* ++ * We can't use mempool for more than c->sb.encoded_extent_max ++ * worth of pages, but we'd like to allocate more if we can: ++ */ ++ bch2_bio_alloc_pages_pool(c, bio, ++ min_t(unsigned, output_available, ++ c->sb.encoded_extent_max << 9)); ++ ++ if (bio->bi_iter.bi_size < output_available) ++ *page_alloc_failed = ++ bch2_bio_alloc_pages(bio, ++ output_available - ++ bio->bi_iter.bi_size, ++ GFP_NOFS) != 0; ++ ++ return bio; ++} ++ ++static int bch2_write_rechecksum(struct bch_fs *c, ++ struct bch_write_op *op, ++ unsigned new_csum_type) ++{ ++ struct bio *bio = &op->wbio.bio; ++ struct bch_extent_crc_unpacked new_crc; ++ int ret; ++ ++ /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ ++ ++ if (bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(new_csum_type)) ++ new_csum_type = op->crc.csum_type; ++ ++ ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, ++ NULL, &new_crc, ++ op->crc.offset, op->crc.live_size, ++ new_csum_type); ++ if (ret) ++ return ret; ++ ++ bio_advance(bio, op->crc.offset << 9); ++ bio->bi_iter.bi_size = op->crc.live_size << 9; ++ op->crc = new_crc; ++ return 0; ++} ++ ++static int bch2_write_decrypt(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct nonce nonce = extent_nonce(op->version, op->crc); ++ struct bch_csum csum; ++ ++ if (!bch2_csum_type_is_encryption(op->crc.csum_type)) ++ return 0; ++ ++ /* ++ * If we need to decrypt data in the write path, we'll no longer be able ++ * to verify the existing checksum (poly1305 mac, in this case) after ++ * it's decrypted - this is the last point we'll be able to reverify the ++ * checksum: ++ */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return -EIO; ++ ++ bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); ++ op->crc.csum_type = 0; ++ op->crc.csum = (struct bch_csum) { 0, 0 }; ++ return 0; ++} ++ ++static enum prep_encoded_ret { ++ PREP_ENCODED_OK, ++ PREP_ENCODED_ERR, ++ PREP_ENCODED_CHECKSUM_ERR, ++ PREP_ENCODED_DO_WRITE, ++} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *bio = &op->wbio.bio; ++ ++ if (!(op->flags & BCH_WRITE_DATA_ENCODED)) ++ return PREP_ENCODED_OK; ++ ++ BUG_ON(bio_sectors(bio) != op->crc.compressed_size); ++ ++ /* Can we just write the entire extent as is? 
*/ ++ if (op->crc.uncompressed_size == op->crc.live_size && ++ op->crc.compressed_size <= wp->sectors_free && ++ (op->crc.compression_type == op->compression_type || ++ op->incompressible)) { ++ if (!crc_is_compressed(op->crc) && ++ op->csum_type != op->crc.csum_type && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_DO_WRITE; ++ } ++ ++ /* ++ * If the data is compressed and we couldn't write the entire extent as ++ * is, we have to decompress it: ++ */ ++ if (crc_is_compressed(op->crc)) { ++ struct bch_csum csum; ++ ++ if (bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* Last point we can still verify checksum: */ ++ csum = bch2_checksum_bio(c, op->crc.csum_type, ++ extent_nonce(op->version, op->crc), ++ bio); ++ if (bch2_crc_cmp(op->crc.csum, csum)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) ++ return PREP_ENCODED_ERR; ++ } ++ ++ /* ++ * No longer have compressed data after this point - data might be ++ * encrypted: ++ */ ++ ++ /* ++ * If the data is checksummed and we're only writing a subset, ++ * rechecksum and adjust bio to point to currently live data: ++ */ ++ if ((op->crc.live_size != op->crc.uncompressed_size || ++ op->crc.csum_type != op->csum_type) && ++ bch2_write_rechecksum(c, op, op->csum_type)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ /* ++ * If we want to compress the data, it has to be decrypted: ++ */ ++ if ((op->compression_type || ++ bch2_csum_type_is_encryption(op->crc.csum_type) != ++ bch2_csum_type_is_encryption(op->csum_type)) && ++ bch2_write_decrypt(op)) ++ return PREP_ENCODED_CHECKSUM_ERR; ++ ++ return PREP_ENCODED_OK; ++} ++ ++static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, ++ struct bio **_dst) ++{ ++ struct bch_fs *c = op->c; ++ struct bio *src = &op->wbio.bio, *dst = src; ++ struct bvec_iter saved_iter; ++ void *ec_buf; ++ struct bpos ec_pos = op->pos; ++ unsigned total_output = 0, total_input = 0; ++ bool bounce = false; ++ bool page_alloc_failed = false; ++ int ret, more = 0; ++ ++ BUG_ON(!bio_sectors(src)); ++ ++ ec_buf = bch2_writepoint_ec_buf(c, wp); ++ ++ switch (bch2_write_prep_encoded_data(op, wp)) { ++ case PREP_ENCODED_OK: ++ break; ++ case PREP_ENCODED_ERR: ++ ret = -EIO; ++ goto err; ++ case PREP_ENCODED_CHECKSUM_ERR: ++ BUG(); ++ goto csum_err; ++ case PREP_ENCODED_DO_WRITE: ++ /* XXX look for bug here */ ++ if (ec_buf) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bio_copy_data(dst, src); ++ bounce = true; ++ } ++ init_append_extent(op, wp, op->version, op->crc); ++ goto do_write; ++ } ++ ++ if (ec_buf || ++ op->compression_type || ++ (op->csum_type && ++ !(op->flags & BCH_WRITE_PAGES_STABLE)) || ++ (bch2_csum_type_is_encryption(op->csum_type) && ++ !(op->flags & BCH_WRITE_PAGES_OWNED))) { ++ dst = bch2_write_bio_alloc(c, wp, src, ++ &page_alloc_failed, ++ ec_buf); ++ bounce = true; ++ } ++ ++ saved_iter = dst->bi_iter; ++ ++ do { ++ struct bch_extent_crc_unpacked crc = ++ (struct bch_extent_crc_unpacked) { 0 }; ++ struct bversion version = op->version; ++ size_t dst_len, src_len; ++ ++ if (page_alloc_failed && ++ bio_sectors(dst) < wp->sectors_free && ++ bio_sectors(dst) < c->sb.encoded_extent_max) ++ break; ++ ++ BUG_ON(op->compression_type && ++ (op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_csum_type_is_encryption(op->crc.csum_type)); ++ BUG_ON(op->compression_type && !bounce); ++ ++ crc.compression_type = op->incompressible ++ ? 
BCH_COMPRESSION_TYPE_incompressible ++ : op->compression_type ++ ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, ++ op->compression_type) ++ : 0; ++ if (!crc_is_compressed(crc)) { ++ dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); ++ dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); ++ ++ if (op->csum_type) ++ dst_len = min_t(unsigned, dst_len, ++ c->sb.encoded_extent_max << 9); ++ ++ if (bounce) { ++ swap(dst->bi_iter.bi_size, dst_len); ++ bio_copy_data(dst, src); ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ src_len = dst_len; ++ } ++ ++ BUG_ON(!src_len || !dst_len); ++ ++ if (bch2_csum_type_is_encryption(op->csum_type)) { ++ if (bversion_zero(version)) { ++ version.lo = atomic64_inc_return(&c->key_version); ++ } else { ++ crc.nonce = op->nonce; ++ op->nonce += src_len >> 9; ++ } ++ } ++ ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ !crc_is_compressed(crc) && ++ bch2_csum_type_is_encryption(op->crc.csum_type) == ++ bch2_csum_type_is_encryption(op->csum_type)) { ++ /* ++ * Note: when we're using rechecksum(), we need to be ++ * checksumming @src because it has all the data our ++ * existing checksum covers - if we bounced (because we ++ * were trying to compress), @dst will only have the ++ * part of the data the new checksum will cover. ++ * ++ * But normally we want to be checksumming post bounce, ++ * because part of the reason for bouncing is so the ++ * data can't be modified (by userspace) while it's in ++ * flight. ++ */ ++ if (bch2_rechecksum_bio(c, src, version, op->crc, ++ &crc, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->csum_type)) ++ goto csum_err; ++ } else { ++ if ((op->flags & BCH_WRITE_DATA_ENCODED) && ++ bch2_rechecksum_bio(c, src, version, op->crc, ++ NULL, &op->crc, ++ src_len >> 9, ++ bio_sectors(src) - (src_len >> 9), ++ op->crc.csum_type)) ++ goto csum_err; ++ ++ crc.compressed_size = dst_len >> 9; ++ crc.uncompressed_size = src_len >> 9; ++ crc.live_size = src_len >> 9; ++ ++ swap(dst->bi_iter.bi_size, dst_len); ++ bch2_encrypt_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ crc.csum = bch2_checksum_bio(c, op->csum_type, ++ extent_nonce(version, crc), dst); ++ crc.csum_type = op->csum_type; ++ swap(dst->bi_iter.bi_size, dst_len); ++ } ++ ++ init_append_extent(op, wp, version, crc); ++ ++ if (dst != src) ++ bio_advance(dst, dst_len); ++ bio_advance(src, src_len); ++ total_output += dst_len; ++ total_input += src_len; ++ } while (dst->bi_iter.bi_size && ++ src->bi_iter.bi_size && ++ wp->sectors_free && ++ !bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)); ++ ++ more = src->bi_iter.bi_size != 0; ++ ++ dst->bi_iter = saved_iter; ++ ++ if (dst == src && more) { ++ BUG_ON(total_output != total_input); ++ ++ dst = bio_split(src, total_input >> 9, ++ GFP_NOIO, &c->bio_write); ++ wbio_init(dst)->put_bio = true; ++ /* copy WRITE_SYNC flag */ ++ dst->bi_opf = src->bi_opf; ++ } ++ ++ dst->bi_iter.bi_size = total_output; ++do_write: ++ /* might have done a realloc... 
*/ ++ bch2_ec_add_backpointer(c, wp, ec_pos, total_input >> 9); ++ ++ *_dst = dst; ++ return more; ++csum_err: ++ bch_err(c, "error verifying existing checksum while " ++ "rewriting existing data (memory corruption?)"); ++ ret = -EIO; ++err: ++ if (to_wbio(dst)->bounce) ++ bch2_bio_free_pages_pool(c, dst); ++ if (to_wbio(dst)->put_bio) ++ bio_put(dst); ++ ++ return ret; ++} ++ ++static void __bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bch_fs *c = op->c; ++ struct write_point *wp; ++ struct bio *bio; ++ bool skip_put = true; ++ unsigned nofs_flags; ++ int ret; ++ ++ nofs_flags = memalloc_nofs_save(); ++again: ++ memset(&op->failed, 0, sizeof(op->failed)); ++ ++ do { ++ struct bkey_i *key_to_write; ++ unsigned key_to_write_offset = op->insert_keys.top_p - ++ op->insert_keys.keys_p; ++ ++ /* +1 for possible cache device: */ ++ if (op->open_buckets.nr + op->nr_replicas + 1 > ++ ARRAY_SIZE(op->open_buckets.v)) ++ goto flush_io; ++ ++ if (bch2_keylist_realloc(&op->insert_keys, ++ op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_EXTENT_U64s_MAX)) ++ goto flush_io; ++ ++ if ((op->flags & BCH_WRITE_FROM_INTERNAL) && ++ percpu_ref_is_dying(&c->writes)) { ++ ret = -EROFS; ++ goto err; ++ } ++ ++ /* ++ * The copygc thread is now global, which means it's no longer ++ * freeing up space on specific disks, which means that ++ * allocations for specific disks may hang arbitrarily long: ++ */ ++ wp = bch2_alloc_sectors_start(c, ++ op->target, ++ op->opts.erasure_code, ++ op->write_point, ++ &op->devs_have, ++ op->nr_replicas, ++ op->nr_replicas_required, ++ op->alloc_reserve, ++ op->flags, ++ (op->flags & (BCH_WRITE_ALLOC_NOWAIT| ++ BCH_WRITE_ONLY_SPECIFIED_DEVS)) ? NULL : cl); ++ EBUG_ON(!wp); ++ ++ if (unlikely(IS_ERR(wp))) { ++ if (unlikely(PTR_ERR(wp) != -EAGAIN)) { ++ ret = PTR_ERR(wp); ++ goto err; ++ } ++ ++ goto flush_io; ++ } ++ ++ /* ++ * It's possible for the allocator to fail, put us on the ++ * freelist waitlist, and then succeed in one of various retry ++ * paths: if that happens, we need to disable the skip_put ++ * optimization because otherwise there won't necessarily be a ++ * barrier before we free the bch_write_op: ++ */ ++ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) ++ skip_put = false; ++ ++ bch2_open_bucket_get(c, wp, &op->open_buckets); ++ ret = bch2_write_extent(op, wp, &bio); ++ bch2_alloc_sectors_done(c, wp); ++ ++ if (ret < 0) ++ goto err; ++ ++ if (ret) { ++ skip_put = false; ++ } else { ++ /* ++ * for the skip_put optimization this has to be set ++ * before we submit the bio: ++ */ ++ op->flags |= BCH_WRITE_DONE; ++ } ++ ++ bio->bi_end_io = bch2_write_endio; ++ bio->bi_private = &op->cl; ++ bio->bi_opf |= REQ_OP_WRITE; ++ ++ if (!skip_put) ++ closure_get(bio->bi_private); ++ else ++ op->flags |= BCH_WRITE_SKIP_CLOSURE_PUT; ++ ++ key_to_write = (void *) (op->insert_keys.keys_p + ++ key_to_write_offset); ++ ++ bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, ++ key_to_write); ++ } while (ret); ++ ++ if (!skip_put) ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++out: ++ memalloc_nofs_restore(nofs_flags); ++ return; ++err: ++ op->error = ret; ++ op->flags |= BCH_WRITE_DONE; ++ ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ goto out; ++flush_io: ++ /* ++ * If the write can't all be submitted at once, we generally want to ++ * block synchronously as that signals backpressure to the caller. 
++ * ++ * However, if we're running out of a workqueue, we can't block here ++ * because we'll be blocking other work items from completing: ++ */ ++ if (current->flags & PF_WQ_WORKER) { ++ continue_at(cl, bch2_write_index, index_update_wq(op)); ++ goto out; ++ } ++ ++ closure_sync(cl); ++ ++ if (!bch2_keylist_empty(&op->insert_keys)) { ++ __bch2_write_index(op); ++ ++ if (op->error) { ++ op->flags |= BCH_WRITE_DONE; ++ continue_at_nobarrier(cl, bch2_write_done, NULL); ++ goto out; ++ } ++ } ++ ++ goto again; ++} ++ ++static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) ++{ ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->wbio.bio; ++ struct bvec_iter iter; ++ struct bkey_i_inline_data *id; ++ unsigned sectors; ++ int ret; ++ ++ bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); ++ ++ ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, ++ ARRAY_SIZE(op->inline_keys), ++ BKEY_U64s + DIV_ROUND_UP(data_len, 8)); ++ if (ret) { ++ op->error = ret; ++ goto err; ++ } ++ ++ sectors = bio_sectors(bio); ++ op->pos.offset += sectors; ++ ++ id = bkey_inline_data_init(op->insert_keys.top); ++ id->k.p = op->pos; ++ id->k.version = op->version; ++ id->k.size = sectors; ++ ++ iter = bio->bi_iter; ++ iter.bi_size = data_len; ++ memcpy_from_bio(id->v.data, bio, iter); ++ ++ while (data_len & 7) ++ id->v.data[data_len++] = '\0'; ++ set_bkey_val_bytes(&id->k, data_len); ++ bch2_keylist_push(&op->insert_keys); ++ ++ op->flags |= BCH_WRITE_WROTE_DATA_INLINE; ++ op->flags |= BCH_WRITE_DONE; ++ ++ continue_at_nobarrier(cl, bch2_write_index, NULL); ++ return; ++err: ++ bch2_write_done(&op->cl); ++} ++ ++/** ++ * bch_write - handle a write to a cache device or flash only volume ++ * ++ * This is the starting point for any data to end up in a cache device; it could ++ * be from a normal write, or a writeback write, or a write to a flash only ++ * volume - it's also used by the moving garbage collector to compact data in ++ * mostly empty buckets. ++ * ++ * It first writes the data to the cache, creating a list of keys to be inserted ++ * (if the data won't fit in a single open bucket, there will be multiple keys); ++ * after the data is written it calls bch_journal, and after the keys have been ++ * added to the next journal write they're inserted into the btree. ++ * ++ * If op->discard is true, instead of inserting the data it invalidates the ++ * region of the cache represented by op->bio and op->inode. 
++ */ ++void bch2_write(struct closure *cl) ++{ ++ struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); ++ struct bio *bio = &op->wbio.bio; ++ struct bch_fs *c = op->c; ++ unsigned data_len; ++ ++ BUG_ON(!op->nr_replicas); ++ BUG_ON(!op->write_point.v); ++ BUG_ON(!bkey_cmp(op->pos, POS_MAX)); ++ ++ op->start_time = local_clock(); ++ bch2_keylist_init(&op->insert_keys, op->inline_keys); ++ wbio_init(bio)->put_bio = false; ++ ++ if (bio_sectors(bio) & (c->opts.block_size - 1)) { ++ __bcache_io_error(c, "misaligned write"); ++ op->error = -EIO; ++ goto err; ++ } ++ ++ if (c->opts.nochanges || ++ !percpu_ref_tryget(&c->writes)) { ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ __bcache_io_error(c, "read only"); ++ op->error = -EROFS; ++ goto err; ++ } ++ ++ /* ++ * Can't ratelimit copygc - we'd deadlock: ++ */ ++ if (!(op->flags & BCH_WRITE_FROM_INTERNAL)) ++ down(&c->io_in_flight); ++ ++ bch2_increment_clock(c, bio_sectors(bio), WRITE); ++ ++ data_len = min_t(u64, bio->bi_iter.bi_size, ++ op->new_i_size - (op->pos.offset << 9)); ++ ++ if (c->opts.inline_data && ++ data_len <= min(block_bytes(c) / 2, 1024U)) { ++ bch2_write_data_inline(op, data_len); ++ return; ++ } ++ ++ continue_at_nobarrier(cl, __bch2_write, NULL); ++ return; ++err: ++ bch2_disk_reservation_put(c, &op->res); ++ ++ if (op->end_io) { ++ EBUG_ON(cl->parent); ++ closure_debug_destroy(cl); ++ op->end_io(op); ++ } else { ++ closure_return(cl); ++ } ++} ++ ++/* Cache promotion on read */ ++ ++struct promote_op { ++ struct closure cl; ++ struct rcu_head rcu; ++ u64 start_time; ++ ++ struct rhash_head hash; ++ struct bpos pos; ++ ++ struct migrate_write write; ++ struct bio_vec bi_inline_vecs[0]; /* must be last */ ++}; ++ ++static const struct rhashtable_params bch_promote_params = { ++ .head_offset = offsetof(struct promote_op, hash), ++ .key_offset = offsetof(struct promote_op, pos), ++ .key_len = sizeof(struct bpos), ++}; ++ ++static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, ++ struct bpos pos, ++ struct bch_io_opts opts, ++ unsigned flags) ++{ ++ if (!(flags & BCH_READ_MAY_PROMOTE)) ++ return false; ++ ++ if (!opts.promote_target) ++ return false; ++ ++ if (bch2_bkey_has_target(c, k, opts.promote_target)) ++ return false; ++ ++ if (bch2_target_congested(c, opts.promote_target)) { ++ /* XXX trace this */ ++ return false; ++ } ++ ++ if (rhashtable_lookup_fast(&c->promote_table, &pos, ++ bch_promote_params)) ++ return false; ++ ++ return true; ++} ++ ++static void promote_free(struct bch_fs *c, struct promote_op *op) ++{ ++ int ret; ++ ++ ret = rhashtable_remove_fast(&c->promote_table, &op->hash, ++ bch_promote_params); ++ BUG_ON(ret); ++ percpu_ref_put(&c->writes); ++ kfree_rcu(op, rcu); ++} ++ ++static void promote_done(struct closure *cl) ++{ ++ struct promote_op *op = ++ container_of(cl, struct promote_op, cl); ++ struct bch_fs *c = op->write.op.c; ++ ++ bch2_time_stats_update(&c->times[BCH_TIME_data_promote], ++ op->start_time); ++ ++ bch2_bio_free_pages_pool(c, &op->write.op.wbio.bio); ++ promote_free(c, op); ++} ++ ++static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) ++{ ++ struct bch_fs *c = rbio->c; ++ struct closure *cl = &op->cl; ++ struct bio *bio = &op->write.op.wbio.bio; ++ ++ trace_promote(&rbio->bio); ++ ++ /* we now own pages: */ ++ BUG_ON(!rbio->bounce); ++ BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); ++ ++ memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, ++ sizeof(struct bio_vec) * rbio->bio.bi_vcnt); ++ swap(bio->bi_vcnt, rbio->bio.bi_vcnt); ++ ++ 
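++	/*
++	 * The promote write now owns the read bio's pages: hand the completed
++	 * read over to the migrate write and kick off bch2_write()
++	 * asynchronously; promote_done() releases the pages and the
++	 * promote_op once the write finishes:
++	 */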
bch2_migrate_read_done(&op->write, rbio); ++ ++ closure_init(cl, NULL); ++ closure_call(&op->write.op.cl, bch2_write, c->wq, cl); ++ closure_return_with_destructor(cl, promote_done); ++} ++ ++static struct promote_op *__promote_alloc(struct bch_fs *c, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ struct bpos pos, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned sectors, ++ struct bch_read_bio **rbio) ++{ ++ struct promote_op *op = NULL; ++ struct bio *bio; ++ unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ int ret; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return NULL; ++ ++ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOIO); ++ if (!op) ++ goto err; ++ ++ op->start_time = local_clock(); ++ op->pos = pos; ++ ++ /* ++ * We don't use the mempool here because extents that aren't ++ * checksummed or compressed can be too big for the mempool: ++ */ ++ *rbio = kzalloc(sizeof(struct bch_read_bio) + ++ sizeof(struct bio_vec) * pages, ++ GFP_NOIO); ++ if (!*rbio) ++ goto err; ++ ++ rbio_init(&(*rbio)->bio, opts); ++ bio_init(&(*rbio)->bio, (*rbio)->bio.bi_inline_vecs, pages); ++ ++ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, ++ GFP_NOIO)) ++ goto err; ++ ++ (*rbio)->bounce = true; ++ (*rbio)->split = true; ++ (*rbio)->kmalloc = true; ++ ++ if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, ++ bch_promote_params)) ++ goto err; ++ ++ bio = &op->write.op.wbio.bio; ++ bio_init(bio, bio->bi_inline_vecs, pages); ++ ++ ret = bch2_migrate_write_init(c, &op->write, ++ writepoint_hashed((unsigned long) current), ++ opts, ++ DATA_PROMOTE, ++ (struct data_opts) { ++ .target = opts.promote_target ++ }, ++ btree_id, k); ++ BUG_ON(ret); ++ ++ return op; ++err: ++ if (*rbio) ++ bio_free_pages(&(*rbio)->bio); ++ kfree(*rbio); ++ *rbio = NULL; ++ kfree(op); ++ percpu_ref_put(&c->writes); ++ return NULL; ++} ++ ++noinline ++static struct promote_op *promote_alloc(struct bch_fs *c, ++ struct bvec_iter iter, ++ struct bkey_s_c k, ++ struct extent_ptr_decoded *pick, ++ struct bch_io_opts opts, ++ unsigned flags, ++ struct bch_read_bio **rbio, ++ bool *bounce, ++ bool *read_full) ++{ ++ bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); ++ /* data might have to be decompressed in the write path: */ ++ unsigned sectors = promote_full ++ ? max(pick->crc.compressed_size, pick->crc.live_size) ++ : bvec_iter_sectors(iter); ++ struct bpos pos = promote_full ++ ? bkey_start_pos(k.k) ++ : POS(k.k->p.inode, iter.bi_sector); ++ struct promote_op *promote; ++ ++ if (!should_promote(c, k, pos, opts, flags)) ++ return NULL; ++ ++ promote = __promote_alloc(c, ++ k.k->type == KEY_TYPE_reflink_v ++ ? BTREE_ID_REFLINK ++ : BTREE_ID_EXTENTS, ++ k, pos, pick, opts, sectors, rbio); ++ if (!promote) ++ return NULL; ++ ++ *bounce = true; ++ *read_full = promote_full; ++ return promote; ++} ++ ++/* Read */ ++ ++#define READ_RETRY_AVOID 1 ++#define READ_RETRY 2 ++#define READ_ERR 3 ++ ++enum rbio_context { ++ RBIO_CONTEXT_NULL, ++ RBIO_CONTEXT_HIGHPRI, ++ RBIO_CONTEXT_UNBOUND, ++}; ++ ++static inline struct bch_read_bio * ++bch2_rbio_parent(struct bch_read_bio *rbio) ++{ ++ return rbio->split ? 
rbio->parent : rbio; ++} ++ ++__always_inline ++static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, ++ enum rbio_context context, ++ struct workqueue_struct *wq) ++{ ++ if (context <= rbio->context) { ++ fn(&rbio->work); ++ } else { ++ rbio->work.func = fn; ++ rbio->context = context; ++ queue_work(wq, &rbio->work); ++ } ++} ++ ++static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) ++{ ++ BUG_ON(rbio->bounce && !rbio->split); ++ ++ if (rbio->promote) ++ promote_free(rbio->c, rbio->promote); ++ rbio->promote = NULL; ++ ++ if (rbio->bounce) ++ bch2_bio_free_pages_pool(rbio->c, &rbio->bio); ++ ++ if (rbio->split) { ++ struct bch_read_bio *parent = rbio->parent; ++ ++ if (rbio->kmalloc) ++ kfree(rbio); ++ else ++ bio_put(&rbio->bio); ++ ++ rbio = parent; ++ } ++ ++ return rbio; ++} ++ ++/* ++ * Only called on a top level bch_read_bio to complete an entire read request, ++ * not a split: ++ */ ++static void bch2_rbio_done(struct bch_read_bio *rbio) ++{ ++ if (rbio->start_time) ++ bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], ++ rbio->start_time); ++ bio_endio(&rbio->bio); ++} ++ ++static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, u64 inode, ++ struct bch_io_failures *failed, ++ unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_on_stack sk; ++ struct bkey_s_c k; ++ int ret; ++ ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ rbio->pos, BTREE_ITER_SLOTS); ++retry: ++ rbio->bio.bi_status = 0; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ if (bkey_err(k)) ++ goto err; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ bch2_trans_unlock(&trans); ++ ++ if (!bch2_bkey_matches_ptr(c, k, ++ rbio->pick.ptr, ++ rbio->pos.offset - ++ rbio->pick.crc.offset)) { ++ /* extent we wanted to read no longer exists: */ ++ rbio->hole = true; ++ goto out; ++ } ++ ++ ret = __bch2_read_extent(&trans, rbio, bvec_iter, k, 0, failed, flags); ++ if (ret == READ_RETRY) ++ goto retry; ++ if (ret) ++ goto err; ++out: ++ bch2_rbio_done(rbio); ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ return; ++err: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++ goto out; ++} ++ ++static void bch2_read_retry(struct bch_fs *c, struct bch_read_bio *rbio, ++ struct bvec_iter bvec_iter, u64 inode, ++ struct bch_io_failures *failed, unsigned flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_on_stack sk; ++ struct bkey_s_c k; ++ int ret; ++ ++ flags &= ~BCH_READ_LAST_FRAGMENT; ++ flags |= BCH_READ_MUST_CLONE; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS(inode, bvec_iter.bi_sector), ++ BTREE_ITER_SLOTS, k, ret) { ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &sk); ++ if (ret) ++ break; ++ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ bch2_trans_unlock(&trans); ++ ++ bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; ++ swap(bvec_iter.bi_size, bytes); ++ ++ ret = 
__bch2_read_extent(&trans, rbio, bvec_iter, k, ++ offset_into_extent, failed, flags); ++ switch (ret) { ++ case READ_RETRY: ++ goto retry; ++ case READ_ERR: ++ goto err; ++ }; ++ ++ if (bytes == bvec_iter.bi_size) ++ goto out; ++ ++ swap(bvec_iter.bi_size, bytes); ++ bio_advance_iter(&rbio->bio, &bvec_iter, bytes); ++ } ++ ++ if (ret == -EINTR) ++ goto retry; ++ /* ++ * If we get here, it better have been because there was an error ++ * reading a btree node ++ */ ++ BUG_ON(!ret); ++ __bcache_io_error(c, "btree IO error: %i", ret); ++err: ++ rbio->bio.bi_status = BLK_STS_IOERR; ++out: ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ bch2_rbio_done(rbio); ++} ++ ++static void bch2_rbio_retry(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bvec_iter iter = rbio->bvec_iter; ++ unsigned flags = rbio->flags; ++ u64 inode = rbio->pos.inode; ++ struct bch_io_failures failed = { .nr = 0 }; ++ ++ trace_read_retry(&rbio->bio); ++ ++ if (rbio->retry == READ_RETRY_AVOID) ++ bch2_mark_io_failure(&failed, &rbio->pick); ++ ++ rbio->bio.bi_status = 0; ++ ++ rbio = bch2_rbio_free(rbio); ++ ++ flags |= BCH_READ_IN_RETRY; ++ flags &= ~BCH_READ_MAY_PROMOTE; ++ ++ if (flags & BCH_READ_NODECODE) ++ bch2_read_retry_nodecode(c, rbio, iter, inode, &failed, flags); ++ else ++ bch2_read_retry(c, rbio, iter, inode, &failed, flags); ++} ++ ++static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, ++ blk_status_t error) ++{ ++ rbio->retry = retry; ++ ++ if (rbio->flags & BCH_READ_IN_RETRY) ++ return; ++ ++ if (retry == READ_ERR) { ++ rbio = bch2_rbio_free(rbio); ++ ++ rbio->bio.bi_status = error; ++ bch2_rbio_done(rbio); ++ } else { ++ bch2_rbio_punt(rbio, bch2_rbio_retry, ++ RBIO_CONTEXT_UNBOUND, system_unbound_wq); ++ } ++} ++ ++static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, ++ struct bch_read_bio *rbio) ++{ ++ struct bch_fs *c = rbio->c; ++ u64 data_offset = rbio->pos.offset - rbio->pick.crc.offset; ++ struct bch_extent_crc_unpacked new_crc; ++ struct btree_iter *iter = NULL; ++ struct bkey_i *new; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ if (crc_is_compressed(rbio->pick.crc)) ++ return 0; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, rbio->pos, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ if ((ret = PTR_ERR_OR_ZERO(iter))) ++ goto out; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ if ((ret = bkey_err(k))) ++ goto out; ++ ++ /* ++ * going to be temporarily appending another checksum entry: ++ */ ++ new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + ++ BKEY_EXTENT_U64s_MAX * 8); ++ if ((ret = PTR_ERR_OR_ZERO(new))) ++ goto out; ++ ++ bkey_reassemble(new, k); ++ k = bkey_i_to_s_c(new); ++ ++ if (bversion_cmp(k.k->version, rbio->version) || ++ !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) ++ goto out; ++ ++ /* Extent was merged? 
*/ ++ if (bkey_start_offset(k.k) < data_offset || ++ k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) ++ goto out; ++ ++ if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, ++ rbio->pick.crc, NULL, &new_crc, ++ bkey_start_offset(k.k) - data_offset, k.k->size, ++ rbio->pick.crc.csum_type)) { ++ bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); ++ ret = 0; ++ goto out; ++ } ++ ++ if (!bch2_bkey_narrow_crcs(new, new_crc)) ++ goto out; ++ ++ bch2_trans_update(trans, iter, new, 0); ++out: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) ++{ ++ bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, ++ __bch2_rbio_narrow_crcs(&trans, rbio)); ++} ++ ++/* Inner part that may run in process context */ ++static void __bch2_read_endio(struct work_struct *work) ++{ ++ struct bch_read_bio *rbio = ++ container_of(work, struct bch_read_bio, work); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct bio *src = &rbio->bio; ++ struct bio *dst = &bch2_rbio_parent(rbio)->bio; ++ struct bvec_iter dst_iter = rbio->bvec_iter; ++ struct bch_extent_crc_unpacked crc = rbio->pick.crc; ++ struct nonce nonce = extent_nonce(rbio->version, crc); ++ struct bch_csum csum; ++ ++ /* Reset iterator for checksumming and copying bounced data: */ ++ if (rbio->bounce) { ++ src->bi_iter.bi_size = crc.compressed_size << 9; ++ src->bi_iter.bi_idx = 0; ++ src->bi_iter.bi_bvec_done = 0; ++ } else { ++ src->bi_iter = rbio->bvec_iter; ++ } ++ ++ csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); ++ if (bch2_crc_cmp(csum, rbio->pick.crc.csum)) ++ goto csum_err; ++ ++ if (unlikely(rbio->narrow_crcs)) ++ bch2_rbio_narrow_crcs(rbio); ++ ++ if (rbio->flags & BCH_READ_NODECODE) ++ goto nodecode; ++ ++ /* Adjust crc to point to subset of data we want: */ ++ crc.offset += rbio->offset_into_extent; ++ crc.live_size = bvec_iter_sectors(rbio->bvec_iter); ++ ++ if (crc_is_compressed(crc)) { ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) ++ goto decompression_err; ++ } else { ++ /* don't need to decrypt the entire bio: */ ++ nonce = nonce_add(nonce, crc.offset << 9); ++ bio_advance(src, crc.offset << 9); ++ ++ BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); ++ src->bi_iter.bi_size = dst_iter.bi_size; ++ ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ ++ if (rbio->bounce) { ++ struct bvec_iter src_iter = src->bi_iter; ++ bio_copy_data_iter(dst, &dst_iter, src, &src_iter); ++ } ++ } ++ ++ if (rbio->promote) { ++ /* ++ * Re encrypt data we decrypted, so it's consistent with ++ * rbio->crc: ++ */ ++ bch2_encrypt_bio(c, crc.csum_type, nonce, src); ++ promote_start(rbio->promote, rbio); ++ rbio->promote = NULL; ++ } ++nodecode: ++ if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { ++ rbio = bch2_rbio_free(rbio); ++ bch2_rbio_done(rbio); ++ } ++ return; ++csum_err: ++ /* ++ * Checksum error: if the bio wasn't bounced, we may have been ++ * reading into buffers owned by userspace (that userspace can ++ * scribble over) - retry the read, bouncing it this time: ++ */ ++ if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { ++ rbio->flags |= BCH_READ_MUST_BOUNCE; ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); ++ return; ++ } ++ ++ bch2_dev_io_error(ca, ++ "data checksum error, inode %llu offset %llu: expected %0llx:%0llx got %0llx:%0llx (type %u)", ++ rbio->pos.inode, (u64) 
rbio->bvec_iter.bi_sector, ++ rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, ++ csum.hi, csum.lo, crc.csum_type); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ return; ++decompression_err: ++ __bcache_io_error(c, "decompression error, inode %llu offset %llu", ++ rbio->pos.inode, ++ (u64) rbio->bvec_iter.bi_sector); ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); ++ return; ++} ++ ++static void bch2_read_endio(struct bio *bio) ++{ ++ struct bch_read_bio *rbio = ++ container_of(bio, struct bch_read_bio, bio); ++ struct bch_fs *c = rbio->c; ++ struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); ++ struct workqueue_struct *wq = NULL; ++ enum rbio_context context = RBIO_CONTEXT_NULL; ++ ++ if (rbio->have_ioref) { ++ bch2_latency_acct(ca, rbio->submit_time, READ); ++ percpu_ref_put(&ca->io_ref); ++ } ++ ++ if (!rbio->split) ++ rbio->bio.bi_end_io = rbio->end_io; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "data read; %s", ++ bch2_blk_status_to_str(bio->bi_status))) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); ++ return; ++ } ++ ++ if (rbio->pick.ptr.cached && ++ (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || ++ ptr_stale(ca, &rbio->pick.ptr))) { ++ atomic_long_inc(&c->read_realloc_races); ++ ++ if (rbio->flags & BCH_READ_RETRY_IF_STALE) ++ bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); ++ else ++ bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); ++ return; ++ } ++ ++ if (rbio->narrow_crcs || ++ crc_is_compressed(rbio->pick.crc) || ++ bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) ++ context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; ++ else if (rbio->pick.crc.csum_type) ++ context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; ++ ++ bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); ++} ++ ++int __bch2_read_indirect_extent(struct btree_trans *trans, ++ unsigned *offset_into_extent, ++ struct bkey_on_stack *orig_k) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 reflink_offset; ++ int ret; ++ ++ reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + ++ *offset_into_extent; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_REFLINK, ++ POS(0, reflink_offset), ++ BTREE_ITER_SLOTS); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ return ret; ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ if (k.k->type != KEY_TYPE_reflink_v) { ++ __bcache_io_error(trans->c, ++ "pointer to nonexistent indirect extent"); ++ ret = -EIO; ++ goto err; ++ } ++ ++ *offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); ++ bkey_on_stack_reassemble(orig_k, trans->c, k); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, ++ struct bvec_iter iter, struct bkey_s_c k, ++ unsigned offset_into_extent, ++ struct bch_io_failures *failed, unsigned flags) ++{ ++ struct bch_fs *c = trans->c; ++ struct extent_ptr_decoded pick; ++ struct bch_read_bio *rbio = NULL; ++ struct bch_dev *ca; ++ struct promote_op *promote = NULL; ++ bool bounce = false, read_full = false, narrow_crcs = false; ++ struct bpos pos = bkey_start_pos(k.k); ++ int pick_ret; ++ ++ if (k.k->type == KEY_TYPE_inline_data) { ++ struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); ++ unsigned bytes = min_t(unsigned, iter.bi_size, ++ bkey_val_bytes(d.k)); ++ ++ swap(iter.bi_size, bytes); ++ memcpy_to_bio(&orig->bio, iter, d.v->data); ++ swap(iter.bi_size, bytes); ++ bio_advance_iter(&orig->bio, &iter, 
bytes); ++ zero_fill_bio_iter(&orig->bio, iter); ++ goto out_read_done; ++ } ++ ++ pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); ++ ++ /* hole or reservation - just zero fill: */ ++ if (!pick_ret) ++ goto hole; ++ ++ if (pick_ret < 0) { ++ __bcache_io_error(c, "no device to read from"); ++ goto err; ++ } ++ ++ if (pick_ret > 0) ++ ca = bch_dev_bkey_exists(c, pick.ptr.dev); ++ ++ if (flags & BCH_READ_NODECODE) { ++ /* ++ * can happen if we retry, and the extent we were going to read ++ * has been merged in the meantime: ++ */ ++ if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) ++ goto hole; ++ ++ iter.bi_size = pick.crc.compressed_size << 9; ++ goto get_bio; ++ } ++ ++ if (!(flags & BCH_READ_LAST_FRAGMENT) || ++ bio_flagged(&orig->bio, BIO_CHAIN)) ++ flags |= BCH_READ_MUST_CLONE; ++ ++ narrow_crcs = !(flags & BCH_READ_IN_RETRY) && ++ bch2_can_narrow_extent_crcs(k, pick.crc); ++ ++ if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) ++ flags |= BCH_READ_MUST_BOUNCE; ++ ++ EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); ++ ++ if (crc_is_compressed(pick.crc) || ++ (pick.crc.csum_type != BCH_CSUM_NONE && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ (bch2_csum_type_is_encryption(pick.crc.csum_type) && ++ (flags & BCH_READ_USER_MAPPED)) || ++ (flags & BCH_READ_MUST_BOUNCE)))) { ++ read_full = true; ++ bounce = true; ++ } ++ ++ if (orig->opts.promote_target) ++ promote = promote_alloc(c, iter, k, &pick, orig->opts, flags, ++ &rbio, &bounce, &read_full); ++ ++ if (!read_full) { ++ EBUG_ON(crc_is_compressed(pick.crc)); ++ EBUG_ON(pick.crc.csum_type && ++ (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || ++ bvec_iter_sectors(iter) != pick.crc.live_size || ++ pick.crc.offset || ++ offset_into_extent)); ++ ++ pos.offset += offset_into_extent; ++ pick.ptr.offset += pick.crc.offset + ++ offset_into_extent; ++ offset_into_extent = 0; ++ pick.crc.compressed_size = bvec_iter_sectors(iter); ++ pick.crc.uncompressed_size = bvec_iter_sectors(iter); ++ pick.crc.offset = 0; ++ pick.crc.live_size = bvec_iter_sectors(iter); ++ offset_into_extent = 0; ++ } ++get_bio: ++ if (rbio) { ++ /* ++ * promote already allocated bounce rbio: ++ * promote needs to allocate a bio big enough for uncompressing ++ * data in the write path, but we're not going to use it all ++ * here: ++ */ ++ EBUG_ON(rbio->bio.bi_iter.bi_size < ++ pick.crc.compressed_size << 9); ++ rbio->bio.bi_iter.bi_size = ++ pick.crc.compressed_size << 9; ++ } else if (bounce) { ++ unsigned sectors = pick.crc.compressed_size; ++ ++ rbio = rbio_init(bio_alloc_bioset(GFP_NOIO, ++ DIV_ROUND_UP(sectors, PAGE_SECTORS), ++ &c->bio_read_split), ++ orig->opts); ++ ++ bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); ++ rbio->bounce = true; ++ rbio->split = true; ++ } else if (flags & BCH_READ_MUST_CLONE) { ++ /* ++ * Have to clone if there were any splits, due to error ++ * reporting issues (if a split errored, and retrying didn't ++ * work, when it reports the error to its parent (us) we don't ++ * know if the error was from our bio, and we should retry, or ++ * from the whole bio, in which case we don't want to retry and ++ * lose the error) ++ */ ++ rbio = rbio_init(bio_clone_fast(&orig->bio, GFP_NOIO, ++ &c->bio_read_split), ++ orig->opts); ++ rbio->bio.bi_iter = iter; ++ rbio->split = true; ++ } else { ++ rbio = orig; ++ rbio->bio.bi_iter = iter; ++ EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); ++ } ++ ++ EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); ++ ++ rbio->c = c; 
++ rbio->submit_time = local_clock(); ++ if (rbio->split) ++ rbio->parent = orig; ++ else ++ rbio->end_io = orig->bio.bi_end_io; ++ rbio->bvec_iter = iter; ++ rbio->offset_into_extent= offset_into_extent; ++ rbio->flags = flags; ++ rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); ++ rbio->narrow_crcs = narrow_crcs; ++ rbio->hole = 0; ++ rbio->retry = 0; ++ rbio->context = 0; ++ /* XXX: only initialize this if needed */ ++ rbio->devs_have = bch2_bkey_devs(k); ++ rbio->pick = pick; ++ rbio->pos = pos; ++ rbio->version = k.k->version; ++ rbio->promote = promote; ++ INIT_WORK(&rbio->work, NULL); ++ ++ rbio->bio.bi_opf = orig->bio.bi_opf; ++ rbio->bio.bi_iter.bi_sector = pick.ptr.offset; ++ rbio->bio.bi_end_io = bch2_read_endio; ++ ++ if (rbio->bounce) ++ trace_read_bounce(&rbio->bio); ++ ++ bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); ++ ++ if (pick.ptr.cached) ++ bch2_bucket_io_time_reset(trans, pick.ptr.dev, ++ PTR_BUCKET_NR(ca, &pick.ptr), READ); ++ ++ if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { ++ bio_inc_remaining(&orig->bio); ++ trace_read_split(&orig->bio); ++ } ++ ++ if (!rbio->pick.idx) { ++ if (!rbio->have_ioref) { ++ __bcache_io_error(c, "no device to read from"); ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], ++ bio_sectors(&rbio->bio)); ++ bio_set_dev(&rbio->bio, ca->disk_sb.bdev); ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ submit_bio(&rbio->bio); ++ else ++ submit_bio_wait(&rbio->bio); ++ } else { ++ /* Attempting reconstruct read: */ ++ if (bch2_ec_read_extent(c, rbio)) { ++ bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); ++ goto out; ++ } ++ ++ if (likely(!(flags & BCH_READ_IN_RETRY))) ++ bio_endio(&rbio->bio); ++ } ++out: ++ if (likely(!(flags & BCH_READ_IN_RETRY))) { ++ return 0; ++ } else { ++ int ret; ++ ++ rbio->context = RBIO_CONTEXT_UNBOUND; ++ bch2_read_endio(&rbio->bio); ++ ++ ret = rbio->retry; ++ rbio = bch2_rbio_free(rbio); ++ ++ if (ret == READ_RETRY_AVOID) { ++ bch2_mark_io_failure(failed, &pick); ++ ret = READ_RETRY; ++ } ++ ++ return ret; ++ } ++ ++err: ++ if (flags & BCH_READ_IN_RETRY) ++ return READ_ERR; ++ ++ orig->bio.bi_status = BLK_STS_IOERR; ++ goto out_read_done; ++ ++hole: ++ /* ++ * won't normally happen in the BCH_READ_NODECODE ++ * (bch2_move_extent()) path, but if we retry and the extent we wanted ++ * to read no longer exists we have to signal that: ++ */ ++ if (flags & BCH_READ_NODECODE) ++ orig->hole = true; ++ ++ zero_fill_bio_iter(&orig->bio, iter); ++out_read_done: ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ bch2_rbio_done(orig); ++ return 0; ++} ++ ++void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, u64 inode) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_on_stack sk; ++ struct bkey_s_c k; ++ unsigned flags = BCH_READ_RETRY_IF_STALE| ++ BCH_READ_MAY_PROMOTE| ++ BCH_READ_USER_MAPPED; ++ int ret; ++ ++ BUG_ON(rbio->_state); ++ BUG_ON(flags & BCH_READ_NODECODE); ++ BUG_ON(flags & BCH_READ_IN_RETRY); ++ ++ rbio->c = c; ++ rbio->start_time = local_clock(); ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, ++ POS(inode, rbio->bio.bi_iter.bi_sector), ++ BTREE_ITER_SLOTS); ++ while (1) { ++ unsigned bytes, sectors, offset_into_extent; ++ ++ bch2_btree_iter_set_pos(iter, ++ POS(inode, rbio->bio.bi_iter.bi_sector)); ++ ++ k = 
bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) ++ goto err; ++ ++ offset_into_extent = iter->pos.offset - ++ bkey_start_offset(k.k); ++ sectors = k.k->size - offset_into_extent; ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ ++ ret = bch2_read_indirect_extent(&trans, ++ &offset_into_extent, &sk); ++ if (ret) ++ goto err; ++ ++ /* ++ * With indirect extents, the amount of data to read is the min ++ * of the original extent and the indirect extent: ++ */ ++ sectors = min(sectors, k.k->size - offset_into_extent); ++ ++ /* ++ * Unlock the iterator while the btree node's lock is still in ++ * cache, before doing the IO: ++ */ ++ bch2_trans_unlock(&trans); ++ ++ bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ ++ if (rbio->bio.bi_iter.bi_size == bytes) ++ flags |= BCH_READ_LAST_FRAGMENT; ++ ++ bch2_read_extent(&trans, rbio, k, offset_into_extent, flags); ++ ++ if (flags & BCH_READ_LAST_FRAGMENT) ++ break; ++ ++ swap(rbio->bio.bi_iter.bi_size, bytes); ++ bio_advance(&rbio->bio, bytes); ++ } ++out: ++ bch2_trans_exit(&trans); ++ bkey_on_stack_exit(&sk, c); ++ return; ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ bcache_io_error(c, &rbio->bio, "btree IO error: %i", ret); ++ bch2_rbio_done(rbio); ++ goto out; ++} ++ ++void bch2_fs_io_exit(struct bch_fs *c) ++{ ++ if (c->promote_table.tbl) ++ rhashtable_destroy(&c->promote_table); ++ mempool_exit(&c->bio_bounce_pages); ++ bioset_exit(&c->bio_write); ++ bioset_exit(&c->bio_read_split); ++ bioset_exit(&c->bio_read); ++} ++ ++int bch2_fs_io_init(struct bch_fs *c) ++{ ++ if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), ++ BIOSET_NEED_BVECS) || ++ bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), ++ BIOSET_NEED_BVECS) || ++ mempool_init_page_pool(&c->bio_bounce_pages, ++ max_t(unsigned, ++ c->opts.btree_node_size, ++ c->sb.encoded_extent_max) / ++ PAGE_SECTORS, 0) || ++ rhashtable_init(&c->promote_table, &bch_promote_params)) ++ return -ENOMEM; ++ ++ return 0; ++} +diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h +new file mode 100644 +index 000000000000..e6aac594f3e6 +--- /dev/null ++++ b/fs/bcachefs/io.h +@@ -0,0 +1,169 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IO_H ++#define _BCACHEFS_IO_H ++ ++#include "checksum.h" ++#include "bkey_on_stack.h" ++#include "io_types.h" ++ ++#define to_wbio(_bio) \ ++ container_of((_bio), struct bch_write_bio, bio) ++ ++#define to_rbio(_bio) \ ++ container_of((_bio), struct bch_read_bio, bio) ++ ++void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); ++void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); ++ ++void bch2_latency_acct(struct bch_dev *, u64, int); ++ ++void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, ++ enum bch_data_type, const struct bkey_i *); ++ ++#define BLK_STS_REMOVED ((__force blk_status_t)128) ++ ++const char *bch2_blk_status_to_str(blk_status_t); ++ ++enum bch_write_flags { ++ BCH_WRITE_ALLOC_NOWAIT = (1 << 0), ++ BCH_WRITE_CACHED = (1 << 1), ++ BCH_WRITE_FLUSH = (1 << 2), ++ BCH_WRITE_DATA_ENCODED = (1 << 3), ++ BCH_WRITE_PAGES_STABLE = (1 << 4), ++ BCH_WRITE_PAGES_OWNED = (1 << 5), ++ BCH_WRITE_ONLY_SPECIFIED_DEVS = (1 << 6), ++ BCH_WRITE_WROTE_DATA_INLINE = (1 << 7), ++ BCH_WRITE_FROM_INTERNAL = (1 << 8), ++ ++ /* Internal: */ ++ BCH_WRITE_JOURNAL_SEQ_PTR = (1 << 9), ++ 
BCH_WRITE_SKIP_CLOSURE_PUT = (1 << 10), ++ BCH_WRITE_DONE = (1 << 11), ++}; ++ ++static inline u64 *op_journal_seq(struct bch_write_op *op) ++{ ++ return (op->flags & BCH_WRITE_JOURNAL_SEQ_PTR) ++ ? op->journal_seq_p : &op->journal_seq; ++} ++ ++static inline void op_journal_seq_set(struct bch_write_op *op, u64 *journal_seq) ++{ ++ op->journal_seq_p = journal_seq; ++ op->flags |= BCH_WRITE_JOURNAL_SEQ_PTR; ++} ++ ++static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) ++{ ++ return op->alloc_reserve == RESERVE_MOVINGGC ++ ? op->c->copygc_wq ++ : op->c->wq; ++} ++ ++int bch2_extent_update(struct btree_trans *, struct btree_iter *, ++ struct bkey_i *, struct disk_reservation *, ++ u64 *, u64, s64 *); ++int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, ++ struct bpos, u64 *, s64 *); ++int bch2_fpunch(struct bch_fs *c, u64, u64, u64, u64 *, s64 *); ++ ++int bch2_write_index_default(struct bch_write_op *); ++ ++static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, ++ struct bch_io_opts opts) ++{ ++ op->c = c; ++ op->end_io = NULL; ++ op->flags = 0; ++ op->written = 0; ++ op->error = 0; ++ op->csum_type = bch2_data_checksum_type(c, opts.data_checksum); ++ op->compression_type = bch2_compression_opt_to_type[opts.compression]; ++ op->nr_replicas = 0; ++ op->nr_replicas_required = c->opts.data_replicas_required; ++ op->alloc_reserve = RESERVE_NONE; ++ op->incompressible = 0; ++ op->open_buckets.nr = 0; ++ op->devs_have.nr = 0; ++ op->target = 0; ++ op->opts = opts; ++ op->pos = POS_MAX; ++ op->version = ZERO_VERSION; ++ op->write_point = (struct write_point_specifier) { 0 }; ++ op->res = (struct disk_reservation) { 0 }; ++ op->journal_seq = 0; ++ op->new_i_size = U64_MAX; ++ op->i_sectors_delta = 0; ++ op->index_update_fn = bch2_write_index_default; ++} ++ ++void bch2_write(struct closure *); ++ ++static inline struct bch_write_bio *wbio_init(struct bio *bio) ++{ ++ struct bch_write_bio *wbio = to_wbio(bio); ++ ++ memset(wbio, 0, offsetof(struct bch_write_bio, bio)); ++ return wbio; ++} ++ ++struct bch_devs_mask; ++struct cache_promote_op; ++struct extent_ptr_decoded; ++ ++int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, ++ struct bkey_on_stack *); ++ ++static inline int bch2_read_indirect_extent(struct btree_trans *trans, ++ unsigned *offset_into_extent, ++ struct bkey_on_stack *k) ++{ ++ return k->k->k.type == KEY_TYPE_reflink_p ++ ? 
__bch2_read_indirect_extent(trans, offset_into_extent, k) ++ : 0; ++} ++ ++enum bch_read_flags { ++ BCH_READ_RETRY_IF_STALE = 1 << 0, ++ BCH_READ_MAY_PROMOTE = 1 << 1, ++ BCH_READ_USER_MAPPED = 1 << 2, ++ BCH_READ_NODECODE = 1 << 3, ++ BCH_READ_LAST_FRAGMENT = 1 << 4, ++ ++ /* internal: */ ++ BCH_READ_MUST_BOUNCE = 1 << 5, ++ BCH_READ_MUST_CLONE = 1 << 6, ++ BCH_READ_IN_RETRY = 1 << 7, ++}; ++ ++int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, ++ struct bvec_iter, struct bkey_s_c, unsigned, ++ struct bch_io_failures *, unsigned); ++ ++static inline void bch2_read_extent(struct btree_trans *trans, ++ struct bch_read_bio *rbio, ++ struct bkey_s_c k, ++ unsigned offset_into_extent, ++ unsigned flags) ++{ ++ __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, k, ++ offset_into_extent, NULL, flags); ++} ++ ++void bch2_read(struct bch_fs *, struct bch_read_bio *, u64); ++ ++static inline struct bch_read_bio *rbio_init(struct bio *bio, ++ struct bch_io_opts opts) ++{ ++ struct bch_read_bio *rbio = to_rbio(bio); ++ ++ rbio->_state = 0; ++ rbio->promote = NULL; ++ rbio->opts = opts; ++ return rbio; ++} ++ ++void bch2_fs_io_exit(struct bch_fs *); ++int bch2_fs_io_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_IO_H */ +diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h +new file mode 100644 +index 000000000000..b23727d212b9 +--- /dev/null ++++ b/fs/bcachefs/io_types.h +@@ -0,0 +1,148 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_IO_TYPES_H ++#define _BCACHEFS_IO_TYPES_H ++ ++#include "alloc_types.h" ++#include "btree_types.h" ++#include "buckets_types.h" ++#include "extents_types.h" ++#include "keylist_types.h" ++#include "opts.h" ++#include "super_types.h" ++ ++#include ++#include ++ ++struct bch_read_bio { ++ struct bch_fs *c; ++ u64 start_time; ++ u64 submit_time; ++ ++ /* ++ * Reads will often have to be split, and if the extent being read from ++ * was checksummed or compressed we'll also have to allocate bounce ++ * buffers and copy the data back into the original bio. ++ * ++ * If we didn't have to split, we have to save and restore the original ++ * bi_end_io - @split below indicates which: ++ */ ++ union { ++ struct bch_read_bio *parent; ++ bio_end_io_t *end_io; ++ }; ++ ++ /* ++ * Saved copy of bio->bi_iter, from submission time - allows us to ++ * resubmit on IO error, and also to copy data back to the original bio ++ * when we're bouncing: ++ */ ++ struct bvec_iter bvec_iter; ++ ++ unsigned offset_into_extent; ++ ++ u16 flags; ++ union { ++ struct { ++ u16 bounce:1, ++ split:1, ++ kmalloc:1, ++ have_ioref:1, ++ narrow_crcs:1, ++ hole:1, ++ retry:2, ++ context:2; ++ }; ++ u16 _state; ++ }; ++ ++ struct bch_devs_list devs_have; ++ ++ struct extent_ptr_decoded pick; ++ /* start pos of data we read (may not be pos of data we want) */ ++ struct bpos pos; ++ struct bversion version; ++ ++ struct promote_op *promote; ++ ++ struct bch_io_opts opts; ++ ++ struct work_struct work; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_bio { ++ struct bch_fs *c; ++ struct bch_write_bio *parent; ++ ++ u64 submit_time; ++ ++ struct bch_devs_list failed; ++ u8 dev; ++ ++ unsigned split:1, ++ bounce:1, ++ put_bio:1, ++ have_ioref:1, ++ used_mempool:1; ++ ++ struct bio bio; ++}; ++ ++struct bch_write_op { ++ struct closure cl; ++ struct bch_fs *c; ++ void (*end_io)(struct bch_write_op *); ++ u64 start_time; ++ ++ unsigned written; /* sectors */ ++ u16 flags; ++ s16 error; /* dio write path expects it to hold -ERESTARTSYS... 
*/ ++ ++ unsigned csum_type:4; ++ unsigned compression_type:4; ++ unsigned nr_replicas:4; ++ unsigned nr_replicas_required:4; ++ unsigned alloc_reserve:3; ++ unsigned incompressible:1; ++ ++ struct bch_devs_list devs_have; ++ u16 target; ++ u16 nonce; ++ struct bch_io_opts opts; ++ ++ struct bpos pos; ++ struct bversion version; ++ ++ /* For BCH_WRITE_DATA_ENCODED: */ ++ struct bch_extent_crc_unpacked crc; ++ ++ struct write_point_specifier write_point; ++ ++ struct disk_reservation res; ++ ++ struct open_buckets open_buckets; ++ ++ /* ++ * If caller wants to flush but hasn't passed us a journal_seq ptr, we ++ * still need to stash the journal_seq somewhere: ++ */ ++ union { ++ u64 *journal_seq_p; ++ u64 journal_seq; ++ }; ++ u64 new_i_size; ++ s64 i_sectors_delta; ++ ++ int (*index_update_fn)(struct bch_write_op *); ++ ++ struct bch_devs_mask failed; ++ ++ struct keylist insert_keys; ++ u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; ++ ++ /* Must be last: */ ++ struct bch_write_bio wbio; ++}; ++ ++#endif /* _BCACHEFS_IO_TYPES_H */ +diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c +new file mode 100644 +index 000000000000..b8b719902c63 +--- /dev/null ++++ b/fs/bcachefs/journal.c +@@ -0,0 +1,1263 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs journalling code, for btree insertions ++ * ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_methods.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++#include ++ ++static inline struct journal_buf *journal_seq_to_buf(struct journal *, u64); ++ ++static bool __journal_entry_is_open(union journal_res_state state) ++{ ++ return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; ++} ++ ++static bool journal_entry_is_open(struct journal *j) ++{ ++ return __journal_entry_is_open(j->reservations); ++} ++ ++static void journal_pin_new_entry(struct journal *j, int count) ++{ ++ struct journal_entry_pin_list *p; ++ ++ /* ++ * The fifo_push() needs to happen at the same time as j->seq is ++ * incremented for journal_last_seq() to be calculated correctly ++ */ ++ atomic64_inc(&j->seq); ++ p = fifo_push_ref(&j->pin); ++ ++ INIT_LIST_HEAD(&p->list); ++ INIT_LIST_HEAD(&p->flushed); ++ atomic_set(&p->count, count); ++ p->devs.nr = 0; ++} ++ ++static void bch2_journal_buf_init(struct journal *j) ++{ ++ struct journal_buf *buf = journal_cur_buf(j); ++ ++ memset(buf->has_inode, 0, sizeof(buf->has_inode)); ++ ++ memset(buf->data, 0, sizeof(*buf->data)); ++ buf->data->seq = cpu_to_le64(journal_cur_seq(j)); ++ buf->data->u64s = 0; ++} ++ ++void bch2_journal_halt(struct journal *j) ++{ ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ ++ do { ++ old.v = new.v = v; ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) ++ return; ++ ++ new.cur_entry_offset = JOURNAL_ENTRY_ERROR_VAL; ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ journal_wake(j); ++ closure_wake_up(&journal_cur_buf(j)->wait); ++} ++ ++/* journal entry close/open: */ ++ ++void __bch2_journal_buf_put(struct journal *j, bool need_write_just_set) ++{ ++ if (!need_write_just_set && ++ test_bit(JOURNAL_NEED_WRITE, &j->flags)) ++ bch2_time_stats_update(j->delay_time, ++ j->need_write_time); ++ ++ clear_bit(JOURNAL_NEED_WRITE, &j->flags); ++ ++ closure_call(&j->io, bch2_journal_write, 
system_highpri_wq, NULL); ++} ++ ++/* ++ * Returns true if journal entry is now closed: ++ */ ++static bool __journal_entry_close(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf = journal_cur_buf(j); ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ bool set_need_write = false; ++ unsigned sectors; ++ ++ lockdep_assert_held(&j->lock); ++ ++ do { ++ old.v = new.v = v; ++ if (old.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL) ++ return true; ++ ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) { ++ /* this entry will never be written: */ ++ closure_wake_up(&buf->wait); ++ return true; ++ } ++ ++ if (!test_bit(JOURNAL_NEED_WRITE, &j->flags)) { ++ set_bit(JOURNAL_NEED_WRITE, &j->flags); ++ j->need_write_time = local_clock(); ++ set_need_write = true; ++ } ++ ++ if (new.prev_buf_unwritten) ++ return false; ++ ++ new.cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL; ++ new.idx++; ++ new.prev_buf_unwritten = 1; ++ ++ BUG_ON(journal_state_count(new, new.idx)); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ buf->data->u64s = cpu_to_le32(old.cur_entry_offset); ++ ++ sectors = vstruct_blocks_plus(buf->data, c->block_bits, ++ buf->u64s_reserved) << c->block_bits; ++ BUG_ON(sectors > buf->sectors); ++ buf->sectors = sectors; ++ ++ bkey_extent_init(&buf->key); ++ ++ /* ++ * We have to set last_seq here, _before_ opening a new journal entry: ++ * ++ * A thread may replace an old pin with a new pin on its current ++ * journal reservation - the expectation being that the journal will ++ * contain either what the old pin protected or what the new pin ++ * protects. ++ * ++ * After the old pin is dropped journal_last_seq() won't include the old ++ * pin, so we can only write the updated last_seq on the entry that ++ * contains whatever the new pin protects. ++ * ++ * Restated, we can _not_ update last_seq for a given entry if there ++ * could be a newer entry open with reservations/pins that have been ++ * taken against it.
++ * ++ * Hence, we want to update/set last_seq on the current journal entry right ++ * before we open a new one: ++ */ ++ buf->data->last_seq = cpu_to_le64(journal_last_seq(j)); ++ ++ if (journal_entry_empty(buf->data)) ++ clear_bit(JOURNAL_NOT_EMPTY, &j->flags); ++ else ++ set_bit(JOURNAL_NOT_EMPTY, &j->flags); ++ ++ journal_pin_new_entry(j, 1); ++ ++ bch2_journal_buf_init(j); ++ ++ cancel_delayed_work(&j->write_work); ++ ++ bch2_journal_space_available(j); ++ ++ bch2_journal_buf_put(j, old.idx, set_need_write); ++ return true; ++} ++ ++static bool journal_entry_close(struct journal *j) ++{ ++ bool ret; ++ ++ spin_lock(&j->lock); ++ ret = __journal_entry_close(j); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* ++ * should _only_ be called from journal_res_get() - when we actually want a ++ * journal reservation - journal entry is open means journal is dirty: ++ * ++ * returns: ++ * 0: success ++ * -ENOSPC: journal currently full, must invoke reclaim ++ * -EAGAIN: journal blocked, must wait ++ * -EROFS: insufficient rw devices or journal error ++ */ ++static int journal_entry_open(struct journal *j) ++{ ++ struct journal_buf *buf = journal_cur_buf(j); ++ union journal_res_state old, new; ++ int u64s; ++ u64 v; ++ ++ lockdep_assert_held(&j->lock); ++ BUG_ON(journal_entry_is_open(j)); ++ ++ if (j->blocked) ++ return -EAGAIN; ++ ++ if (j->cur_entry_error) ++ return j->cur_entry_error; ++ ++ BUG_ON(!j->cur_entry_sectors); ++ ++ buf->u64s_reserved = j->entry_u64s_reserved; ++ buf->disk_sectors = j->cur_entry_sectors; ++ buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); ++ ++ u64s = (int) (buf->sectors << 9) / sizeof(u64) - ++ journal_entry_overhead(j); ++ u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); ++ ++ if (u64s <= le32_to_cpu(buf->data->u64s)) ++ return -ENOSPC; ++ ++ /* ++ * Must be set before marking the journal entry as open: ++ */ ++ j->cur_entry_u64s = u64s; ++ ++ v = atomic64_read(&j->reservations.counter); ++ do { ++ old.v = new.v = v; ++ ++ if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL) ++ return -EROFS; ++ ++ /* Handle any already added entries */ ++ new.cur_entry_offset = le32_to_cpu(buf->data->u64s); ++ ++ EBUG_ON(journal_state_count(new, new.idx)); ++ journal_state_inc(&new); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ if (j->res_get_blocked_start) ++ bch2_time_stats_update(j->blocked_time, ++ j->res_get_blocked_start); ++ j->res_get_blocked_start = 0; ++ ++ mod_delayed_work(system_freezable_wq, ++ &j->write_work, ++ msecs_to_jiffies(j->write_delay_ms)); ++ journal_wake(j); ++ return 0; ++} ++ ++static bool journal_quiesced(struct journal *j) ++{ ++ union journal_res_state state = READ_ONCE(j->reservations); ++ bool ret = !state.prev_buf_unwritten && !__journal_entry_is_open(state); ++ ++ if (!ret) ++ journal_entry_close(j); ++ return ret; ++} ++ ++static void journal_quiesce(struct journal *j) ++{ ++ wait_event(j->wait, journal_quiesced(j)); ++} ++ ++static void journal_write_work(struct work_struct *work) ++{ ++ struct journal *j = container_of(work, struct journal, write_work.work); ++ ++ journal_entry_close(j); ++} ++ ++/* ++ * Given an inode number, if that inode number has data in the journal that ++ * hasn't yet been flushed, return the journal sequence number that needs to be ++ * flushed: ++ */ ++u64 bch2_inode_journal_seq(struct journal *j, u64 inode) ++{ ++ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); ++ u64 seq = 0; ++ ++ if (!test_bit(h, j->buf[0].has_inode) &&
++ !test_bit(h, j->buf[1].has_inode)) ++ return 0; ++ ++ spin_lock(&j->lock); ++ if (test_bit(h, journal_cur_buf(j)->has_inode)) ++ seq = journal_cur_seq(j); ++ else if (test_bit(h, journal_prev_buf(j)->has_inode)) ++ seq = journal_cur_seq(j) - 1; ++ spin_unlock(&j->lock); ++ ++ return seq; ++} ++ ++void bch2_journal_set_has_inum(struct journal *j, u64 inode, u64 seq) ++{ ++ size_t h = hash_64(inode, ilog2(sizeof(j->buf[0].has_inode) * 8)); ++ struct journal_buf *buf; ++ ++ spin_lock(&j->lock); ++ ++ if ((buf = journal_seq_to_buf(j, seq))) ++ set_bit(h, buf->has_inode); ++ ++ spin_unlock(&j->lock); ++} ++ ++static int __journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *buf; ++ bool can_discard; ++ int ret; ++retry: ++ if (journal_res_get_fast(j, res, flags)) ++ return 0; ++ ++ if (bch2_journal_error(j)) ++ return -EROFS; ++ ++ spin_lock(&j->lock); ++ ++ /* ++ * Recheck after taking the lock, so we don't race with another thread ++ * that just did journal_entry_open() and call journal_entry_close() ++ * unnecessarily ++ */ ++ if (journal_res_get_fast(j, res, flags)) { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ ++ if (!(flags & JOURNAL_RES_GET_RESERVED) && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ /* ++ * Don't want to close current journal entry, just need to ++ * invoke reclaim: ++ */ ++ ret = -ENOSPC; ++ goto unlock; ++ } ++ ++ /* ++ * If we couldn't get a reservation because the current buf filled up, ++ * and we had room for a bigger entry on disk, signal that we want to ++ * realloc the journal bufs: ++ */ ++ buf = journal_cur_buf(j); ++ if (journal_entry_is_open(j) && ++ buf->buf_size >> 9 < buf->disk_sectors && ++ buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) ++ j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); ++ ++ if (journal_entry_is_open(j) && ++ !__journal_entry_close(j)) { ++ /* ++ * We failed to get a reservation on the current open journal ++ * entry because it's full, and we can't close it because ++ * there's still a previous one in flight: ++ */ ++ trace_journal_entry_full(c); ++ ret = -EAGAIN; ++ } else { ++ ret = journal_entry_open(j); ++ } ++unlock: ++ if ((ret == -EAGAIN || ret == -ENOSPC) && ++ !j->res_get_blocked_start) ++ j->res_get_blocked_start = local_clock() ?: 1; ++ ++ can_discard = j->can_discard; ++ spin_unlock(&j->lock); ++ ++ if (!ret) ++ goto retry; ++ ++ if (ret == -ENOSPC) { ++ WARN_ONCE(!can_discard && (flags & JOURNAL_RES_GET_RESERVED), ++ "JOURNAL_RES_GET_RESERVED set but journal full"); ++ ++ /* ++ * Journal is full - can't rely on reclaim from work item due to ++ * freezing: ++ */ ++ trace_journal_full(c); ++ ++ if (!(flags & JOURNAL_RES_GET_NONBLOCK)) { ++ if (can_discard) { ++ bch2_journal_do_discards(j); ++ goto retry; ++ } ++ ++ if (mutex_trylock(&j->reclaim_lock)) { ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++ } ++ } ++ ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++ ++/* ++ * Essentially the entry function to the journaling code. When bcachefs is doing ++ * a btree insert, it calls this function to get the current journal write. ++ * Journal write is the structure used to set up journal writes. The calling ++ * function will then add its keys to the structure, queuing them for the next ++ * write. ++ * ++ * To ensure forward progress, the current task must not be holding any ++ * btree node write locks.
++ */ ++int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->async_wait, ++ (ret = __journal_res_get(j, res, flags)) != -EAGAIN || ++ (flags & JOURNAL_RES_GET_NONBLOCK)); ++ return ret; ++} ++ ++/* journal_preres: */ ++ ++static bool journal_preres_available(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags); ++ ++ if (!ret) ++ bch2_journal_reclaim_work(&j->reclaim_work.work); ++ ++ return ret; ++} ++ ++int __bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ int ret; ++ ++ closure_wait_event(&j->preres_wait, ++ (ret = bch2_journal_error(j)) || ++ journal_preres_available(j, res, new_u64s, flags)); ++ return ret; ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *j, ++ struct journal_entry_res *res, ++ unsigned new_u64s) ++{ ++ union journal_res_state state; ++ int d = new_u64s - res->u64s; ++ ++ spin_lock(&j->lock); ++ ++ j->entry_u64s_reserved += d; ++ if (d <= 0) ++ goto out; ++ ++ j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); ++ smp_mb(); ++ state = READ_ONCE(j->reservations); ++ ++ if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && ++ state.cur_entry_offset > j->cur_entry_u64s) { ++ j->cur_entry_u64s += d; ++ /* ++ * Not enough room in current journal entry, have to flush it: ++ */ ++ __journal_entry_close(j); ++ } else { ++ journal_cur_buf(j)->u64s_reserved += d; ++ } ++out: ++ spin_unlock(&j->lock); ++ res->u64s += d; ++} ++ ++/* journal flushing: */ ++ ++u64 bch2_journal_last_unwritten_seq(struct journal *j) ++{ ++ u64 seq; ++ ++ spin_lock(&j->lock); ++ seq = journal_cur_seq(j); ++ if (j->reservations.prev_buf_unwritten) ++ seq--; ++ spin_unlock(&j->lock); ++ ++ return seq; ++} ++ ++/** ++ * bch2_journal_open_seq_async - try to open a new journal entry if @seq isn't ++ * open yet, or wait if we cannot ++ * ++ * used by the btree interior update machinery, when it needs to write a new ++ * btree root - every journal entry contains the roots of all the btrees, so it ++ * doesn't need to bother with getting a journal reservation ++ */ ++int bch2_journal_open_seq_async(struct journal *j, u64 seq, struct closure *cl) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ int ret; ++ ++ spin_lock(&j->lock); ++ ++ /* ++ * Can't try to open more than one sequence number ahead: ++ */ ++ BUG_ON(journal_cur_seq(j) < seq && !journal_entry_is_open(j)); ++ ++ if (journal_cur_seq(j) > seq || ++ journal_entry_is_open(j)) { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ ++ if (journal_cur_seq(j) < seq && ++ !__journal_entry_close(j)) { ++ /* haven't finished writing out the previous one: */ ++ trace_journal_entry_full(c); ++ ret = -EAGAIN; ++ } else { ++ BUG_ON(journal_cur_seq(j) != seq); ++ ++ ret = journal_entry_open(j); ++ } ++ ++ if ((ret == -EAGAIN || ret == -ENOSPC) && ++ !j->res_get_blocked_start) ++ j->res_get_blocked_start = local_clock() ?: 1; ++ ++ if (ret == -EAGAIN || ret == -ENOSPC) ++ closure_wait(&j->async_wait, cl); ++ ++ spin_unlock(&j->lock); ++ ++ if (ret == -ENOSPC) { ++ trace_journal_full(c); ++ bch2_journal_reclaim_work(&j->reclaim_work.work); ++ ret = -EAGAIN; ++ } ++ ++ return ret; ++} ++ ++static int journal_seq_error(struct journal *j, u64 seq) ++{ ++ union journal_res_state state = READ_ONCE(j->reservations); ++ ++ if (seq == 
journal_cur_seq(j)) ++ return bch2_journal_error(j); ++ ++ if (seq + 1 == journal_cur_seq(j) && ++ !state.prev_buf_unwritten && ++ seq > j->seq_ondisk) ++ return -EIO; ++ ++ return 0; ++} ++ ++static inline struct journal_buf * ++journal_seq_to_buf(struct journal *j, u64 seq) ++{ ++ /* seq should be for a journal entry that has been opened: */ ++ BUG_ON(seq > journal_cur_seq(j)); ++ BUG_ON(seq == journal_cur_seq(j) && ++ j->reservations.cur_entry_offset == JOURNAL_ENTRY_CLOSED_VAL); ++ ++ if (seq == journal_cur_seq(j)) ++ return journal_cur_buf(j); ++ if (seq + 1 == journal_cur_seq(j) && ++ j->reservations.prev_buf_unwritten) ++ return journal_prev_buf(j); ++ return NULL; ++} ++ ++/** ++ * bch2_journal_wait_on_seq - wait for a journal entry to be written ++ * ++ * does _not_ cause @seq to be written immediately - if there is no other ++ * activity to cause the relevant journal entry to be filled up or flushed it ++ * can wait for an arbitrary amount of time (up to @j->write_delay_ms, which is ++ * configurable). ++ */ ++void bch2_journal_wait_on_seq(struct journal *j, u64 seq, ++ struct closure *parent) ++{ ++ struct journal_buf *buf; ++ ++ spin_lock(&j->lock); ++ ++ if ((buf = journal_seq_to_buf(j, seq))) { ++ if (!closure_wait(&buf->wait, parent)) ++ BUG(); ++ ++ if (seq == journal_cur_seq(j)) { ++ smp_mb(); ++ if (bch2_journal_error(j)) ++ closure_wake_up(&buf->wait); ++ } ++ } ++ ++ spin_unlock(&j->lock); ++} ++ ++/** ++ * bch2_journal_flush_seq_async - wait for a journal entry to be written ++ * ++ * like bch2_journal_wait_on_seq, except that it triggers a write immediately if ++ * necessary ++ */ ++void bch2_journal_flush_seq_async(struct journal *j, u64 seq, ++ struct closure *parent) ++{ ++ struct journal_buf *buf; ++ ++ spin_lock(&j->lock); ++ ++ if (parent && ++ (buf = journal_seq_to_buf(j, seq))) ++ if (!closure_wait(&buf->wait, parent)) ++ BUG(); ++ ++ if (seq == journal_cur_seq(j)) ++ __journal_entry_close(j); ++ spin_unlock(&j->lock); ++} ++ ++static int journal_seq_flushed(struct journal *j, u64 seq) ++{ ++ int ret; ++ ++ spin_lock(&j->lock); ++ ret = seq <= j->seq_ondisk ? 1 : journal_seq_error(j, seq); ++ ++ if (seq == journal_cur_seq(j)) ++ __journal_entry_close(j); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++int bch2_journal_flush_seq(struct journal *j, u64 seq) ++{ ++ u64 start_time = local_clock(); ++ int ret, ret2; ++ ++ ret = wait_event_killable(j->wait, (ret2 = journal_seq_flushed(j, seq))); ++ ++ bch2_time_stats_update(j->flush_seq_time, start_time); ++ ++ return ret ?: ret2 < 0 ? 
ret2 : 0; ++} ++ ++/** ++ * bch2_journal_meta_async - force a journal entry to be written ++ */ ++void bch2_journal_meta_async(struct journal *j, struct closure *parent) ++{ ++ struct journal_res res; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ bch2_journal_res_put(j, &res); ++ ++ bch2_journal_flush_seq_async(j, res.seq, parent); ++} ++ ++int bch2_journal_meta(struct journal *j) ++{ ++ struct journal_res res; ++ int ret; ++ ++ memset(&res, 0, sizeof(res)); ++ ++ ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); ++ if (ret) ++ return ret; ++ ++ bch2_journal_res_put(j, &res); ++ ++ return bch2_journal_flush_seq(j, res.seq); ++} ++ ++/* ++ * bch2_journal_flush_async - if there is an open journal entry, or a journal ++ * still being written, write it and wait for the write to complete ++ */ ++void bch2_journal_flush_async(struct journal *j, struct closure *parent) ++{ ++ u64 seq, journal_seq; ++ ++ spin_lock(&j->lock); ++ journal_seq = journal_cur_seq(j); ++ ++ if (journal_entry_is_open(j)) { ++ seq = journal_seq; ++ } else if (journal_seq) { ++ seq = journal_seq - 1; ++ } else { ++ spin_unlock(&j->lock); ++ return; ++ } ++ spin_unlock(&j->lock); ++ ++ bch2_journal_flush_seq_async(j, seq, parent); ++} ++ ++int bch2_journal_flush(struct journal *j) ++{ ++ u64 seq, journal_seq; ++ ++ spin_lock(&j->lock); ++ journal_seq = journal_cur_seq(j); ++ ++ if (journal_entry_is_open(j)) { ++ seq = journal_seq; ++ } else if (journal_seq) { ++ seq = journal_seq - 1; ++ } else { ++ spin_unlock(&j->lock); ++ return 0; ++ } ++ spin_unlock(&j->lock); ++ ++ return bch2_journal_flush_seq(j, seq); ++} ++ ++/* block/unlock the journal: */ ++ ++void bch2_journal_unblock(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked--; ++ spin_unlock(&j->lock); ++ ++ journal_wake(j); ++} ++ ++void bch2_journal_block(struct journal *j) ++{ ++ spin_lock(&j->lock); ++ j->blocked++; ++ spin_unlock(&j->lock); ++ ++ journal_quiesce(j); ++} ++ ++/* allocate journal on a device: */ ++ ++static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, ++ bool new_fs, struct closure *cl) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal *journal_buckets; ++ u64 *new_bucket_seq = NULL, *new_buckets = NULL; ++ int ret = 0; ++ ++ /* don't handle reducing nr of buckets yet: */ ++ if (nr <= ja->nr) ++ return 0; ++ ++ ret = -ENOMEM; ++ new_buckets = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ new_bucket_seq = kzalloc(nr * sizeof(u64), GFP_KERNEL); ++ if (!new_buckets || !new_bucket_seq) ++ goto err; ++ ++ journal_buckets = bch2_sb_resize_journal(&ca->disk_sb, ++ nr + sizeof(*journal_buckets) / sizeof(u64)); ++ if (!journal_buckets) ++ goto err; ++ ++ /* ++ * We may be called from the device add path, before the new device has ++ * actually been added to the running filesystem: ++ */ ++ if (c) ++ spin_lock(&c->journal.lock); ++ ++ memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); ++ memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); ++ swap(new_buckets, ja->buckets); ++ swap(new_bucket_seq, ja->bucket_seq); ++ ++ if (c) ++ spin_unlock(&c->journal.lock); ++ ++ while (ja->nr < nr) { ++ struct open_bucket *ob = NULL; ++ unsigned pos; ++ long bucket; ++ ++ if (new_fs) { ++ bucket = bch2_bucket_alloc_new_fs(ca); ++ if (bucket < 0) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ } else { ++ ob = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, ++ false, cl); ++ if (IS_ERR(ob)) { ++ ret = cl ? 
-EAGAIN : -ENOSPC; ++ goto err; ++ } ++ ++ bucket = sector_to_bucket(ca, ob->ptr.offset); ++ } ++ ++ if (c) { ++ percpu_down_read(&c->mark_lock); ++ spin_lock(&c->journal.lock); ++ } ++ ++ pos = ja->nr ? (ja->cur_idx + 1) % ja->nr : 0; ++ __array_insert_item(ja->buckets, ja->nr, pos); ++ __array_insert_item(ja->bucket_seq, ja->nr, pos); ++ __array_insert_item(journal_buckets->buckets, ja->nr, pos); ++ ja->nr++; ++ ++ ja->buckets[pos] = bucket; ++ ja->bucket_seq[pos] = 0; ++ journal_buckets->buckets[pos] = cpu_to_le64(bucket); ++ ++ if (pos <= ja->discard_idx) ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ if (pos <= ja->dirty_idx_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ if (pos <= ja->dirty_idx) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ if (pos <= ja->cur_idx) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ++ bch2_mark_metadata_bucket(c, ca, bucket, BCH_DATA_journal, ++ ca->mi.bucket_size, ++ gc_phase(GC_PHASE_SB), ++ 0); ++ ++ if (c) { ++ spin_unlock(&c->journal.lock); ++ percpu_up_read(&c->mark_lock); ++ } ++ ++ if (!new_fs) ++ bch2_open_bucket_put(c, ob); ++ } ++ ++ ret = 0; ++err: ++ kfree(new_bucket_seq); ++ kfree(new_buckets); ++ ++ return ret; ++} ++ ++/* ++ * Allocate more journal space at runtime - not currently making use of it, but ++ * the code works: ++ */ ++int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, ++ unsigned nr) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct closure cl; ++ unsigned current_nr; ++ int ret; ++ ++ closure_init_stack(&cl); ++ ++ do { ++ struct disk_reservation disk_res = { 0, 0 }; ++ ++ closure_sync(&cl); ++ ++ mutex_lock(&c->sb_lock); ++ current_nr = ja->nr; ++ ++ /* ++ * note: journal buckets aren't really counted as _sectors_ used yet, so ++ * we don't need the disk reservation to avoid the BUG_ON() in buckets.c ++ * when space used goes up without a reservation - but we do need the ++ * reservation to ensure we'll actually be able to allocate: ++ */ ++ ++ if (bch2_disk_reservation_get(c, &disk_res, ++ bucket_to_sector(ca, nr - ja->nr), 1, 0)) { ++ mutex_unlock(&c->sb_lock); ++ return -ENOSPC; ++ } ++ ++ ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ if (ja->nr != current_nr) ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } while (ret == -EAGAIN); ++ ++ return ret; ++} ++ ++int bch2_dev_journal_alloc(struct bch_dev *ca) ++{ ++ unsigned nr; ++ ++ if (dynamic_fault("bcachefs:add:journal_alloc")) ++ return -ENOMEM; ++ ++ /* ++ * clamp journal size to 1024 buckets or 512MB (in sectors), whichever ++ * is smaller: ++ */ ++ nr = clamp_t(unsigned, ca->mi.nbuckets >> 8, ++ BCH_JOURNAL_BUCKETS_MIN, ++ min(1 << 10, ++ (1 << 20) / ca->mi.bucket_size)); ++ ++ return __bch2_set_nr_journal_buckets(ca, nr, true, NULL); ++} ++ ++/* startup/shutdown: */ ++ ++static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) ++{ ++ union journal_res_state state; ++ struct journal_buf *w; ++ bool ret; ++ ++ spin_lock(&j->lock); ++ state = READ_ONCE(j->reservations); ++ w = j->buf + !state.idx; ++ ++ ret = state.prev_buf_unwritten && ++ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), dev_idx); ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) ++{ ++ wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); ++} ++ ++void bch2_fs_journal_stop(struct journal *j) ++{ ++ bch2_journal_flush_all_pins(j); ++ ++ wait_event(j->wait,
journal_entry_close(j)); ++ ++ /* do we need to write another journal entry? */ ++ if (test_bit(JOURNAL_NOT_EMPTY, &j->flags)) ++ bch2_journal_meta(j); ++ ++ journal_quiesce(j); ++ ++ BUG_ON(!bch2_journal_error(j) && ++ test_bit(JOURNAL_NOT_EMPTY, &j->flags)); ++ ++ cancel_delayed_work_sync(&j->write_work); ++ cancel_delayed_work_sync(&j->reclaim_work); ++} ++ ++int bch2_fs_journal_start(struct journal *j, u64 cur_seq, ++ struct list_head *journal_entries) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ struct journal_replay *i; ++ u64 last_seq = cur_seq, nr, seq; ++ ++ if (!list_empty(journal_entries)) ++ last_seq = le64_to_cpu(list_last_entry(journal_entries, ++ struct journal_replay, list)->j.last_seq); ++ ++ nr = cur_seq - last_seq; ++ ++ if (nr + 1 > j->pin.size) { ++ free_fifo(&j->pin); ++ init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); ++ if (!j->pin.data) { ++ bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); ++ return -ENOMEM; ++ } ++ } ++ ++ j->replay_journal_seq = last_seq; ++ j->replay_journal_seq_end = cur_seq; ++ j->last_seq_ondisk = last_seq; ++ j->pin.front = last_seq; ++ j->pin.back = cur_seq; ++ atomic64_set(&j->seq, cur_seq - 1); ++ ++ fifo_for_each_entry_ptr(p, &j->pin, seq) { ++ INIT_LIST_HEAD(&p->list); ++ INIT_LIST_HEAD(&p->flushed); ++ atomic_set(&p->count, 1); ++ p->devs.nr = 0; ++ } ++ ++ list_for_each_entry(i, journal_entries, list) { ++ seq = le64_to_cpu(i->j.seq); ++ BUG_ON(seq >= cur_seq); ++ ++ if (seq < last_seq) ++ continue; ++ ++ journal_seq_pin(j, seq)->devs = i->devs; ++ } ++ ++ spin_lock(&j->lock); ++ ++ set_bit(JOURNAL_STARTED, &j->flags); ++ ++ journal_pin_new_entry(j, 1); ++ bch2_journal_buf_init(j); ++ ++ c->last_bucket_seq_cleanup = journal_cur_seq(j); ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ return 0; ++} ++ ++/* init/exit: */ ++ ++void bch2_dev_journal_exit(struct bch_dev *ca) ++{ ++ kfree(ca->journal.bio); ++ kfree(ca->journal.buckets); ++ kfree(ca->journal.bucket_seq); ++ ++ ca->journal.bio = NULL; ++ ca->journal.buckets = NULL; ++ ca->journal.bucket_seq = NULL; ++} ++ ++int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ++{ ++ struct journal_device *ja = &ca->journal; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(sb); ++ unsigned i; ++ ++ ja->nr = bch2_nr_journal_buckets(journal_buckets); ++ ++ ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->bucket_seq) ++ return -ENOMEM; ++ ++ ca->journal.bio = bio_kmalloc(GFP_KERNEL, ++ DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE)); ++ if (!ca->journal.bio) ++ return -ENOMEM; ++ ++ ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); ++ if (!ja->buckets) ++ return -ENOMEM; ++ ++ for (i = 0; i < ja->nr; i++) ++ ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); ++ ++ return 0; ++} ++ ++void bch2_fs_journal_exit(struct journal *j) ++{ ++ kvpfree(j->buf[1].data, j->buf[1].buf_size); ++ kvpfree(j->buf[0].data, j->buf[0].buf_size); ++ free_fifo(&j->pin); ++} ++ ++int bch2_fs_journal_init(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ static struct lock_class_key res_key; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ spin_lock_init(&j->lock); ++ spin_lock_init(&j->err_lock); ++ init_waitqueue_head(&j->wait); ++ INIT_DELAYED_WORK(&j->write_work, journal_write_work); ++ INIT_DELAYED_WORK(&j->reclaim_work, bch2_journal_reclaim_work); ++ 
init_waitqueue_head(&j->pin_flush_wait); ++ mutex_init(&j->reclaim_lock); ++ mutex_init(&j->discard_lock); ++ ++ lockdep_init_map(&j->res_map, "journal res", &res_key, 0); ++ ++ j->buf[0].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->buf[1].buf_size = JOURNAL_ENTRY_SIZE_MIN; ++ j->write_delay_ms = 1000; ++ j->reclaim_delay_ms = 100; ++ ++ /* Btree roots: */ ++ j->entry_u64s_reserved += ++ BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_EXTENT_U64s_MAX); ++ ++ atomic64_set(&j->reservations.counter, ++ ((union journal_res_state) ++ { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); ++ ++ if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL)) || ++ !(j->buf[0].data = kvpmalloc(j->buf[0].buf_size, GFP_KERNEL)) || ++ !(j->buf[1].data = kvpmalloc(j->buf[1].buf_size, GFP_KERNEL))) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ j->pin.front = j->pin.back = 1; ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++} ++ ++/* debug: */ ++ ++void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ union journal_res_state s; ++ struct bch_dev *ca; ++ unsigned iter; ++ ++ rcu_read_lock(); ++ spin_lock(&j->lock); ++ s = READ_ONCE(j->reservations); ++ ++ pr_buf(out, ++ "active journal entries:\t%llu\n" ++ "seq:\t\t\t%llu\n" ++ "last_seq:\t\t%llu\n" ++ "last_seq_ondisk:\t%llu\n" ++ "prereserved:\t\t%u/%u\n" ++ "current entry sectors:\t%u\n" ++ "current entry:\t\t", ++ fifo_used(&j->pin), ++ journal_cur_seq(j), ++ journal_last_seq(j), ++ j->last_seq_ondisk, ++ j->prereserved.reserved, ++ j->prereserved.remaining, ++ j->cur_entry_sectors); ++ ++ switch (s.cur_entry_offset) { ++ case JOURNAL_ENTRY_ERROR_VAL: ++ pr_buf(out, "error\n"); ++ break; ++ case JOURNAL_ENTRY_CLOSED_VAL: ++ pr_buf(out, "closed\n"); ++ break; ++ default: ++ pr_buf(out, "%u/%u\n", ++ s.cur_entry_offset, ++ j->cur_entry_u64s); ++ break; ++ } ++ ++ pr_buf(out, ++ "current entry refs:\t%u\n" ++ "prev entry unwritten:\t", ++ journal_state_count(s, s.idx)); ++ ++ if (s.prev_buf_unwritten) ++ pr_buf(out, "yes, ref %u sectors %u\n", ++ journal_state_count(s, !s.idx), ++ journal_prev_buf(j)->sectors); ++ else ++ pr_buf(out, "no\n"); ++ ++ pr_buf(out, ++ "need write:\t\t%i\n" ++ "replay done:\t\t%i\n", ++ test_bit(JOURNAL_NEED_WRITE, &j->flags), ++ test_bit(JOURNAL_REPLAY_DONE, &j->flags)); ++ ++ for_each_member_device_rcu(ca, c, iter, ++ &c->rw_devs[BCH_DATA_journal]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!ja->nr) ++ continue; ++ ++ pr_buf(out, ++ "dev %u:\n" ++ "\tnr\t\t%u\n" ++ "\tavailable\t%u:%u\n" ++ "\tdiscard_idx\t\t%u\n" ++ "\tdirty_idx_ondisk\t%u (seq %llu)\n" ++ "\tdirty_idx\t\t%u (seq %llu)\n" ++ "\tcur_idx\t\t%u (seq %llu)\n", ++ iter, ja->nr, ++ bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ++ ja->sectors_free, ++ ja->discard_idx, ++ ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk], ++ ja->dirty_idx, ja->bucket_seq[ja->dirty_idx], ++ ja->cur_idx, ja->bucket_seq[ja->cur_idx]); ++ } ++ ++ spin_unlock(&j->lock); ++ rcu_read_unlock(); ++} ++ ++void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *pin; ++ u64 i; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(pin_list, &j->pin, i) { ++ pr_buf(out, "%llu: count %u\n", ++ i, atomic_read(&pin_list->count)); ++ ++ list_for_each_entry(pin, &pin_list->list, list) ++ pr_buf(out, "\t%px %ps\n", ++ pin, pin->flush); ++ ++ if (!list_empty(&pin_list->flushed)) ++ 
pr_buf(out, "flushed:\n"); ++ ++ list_for_each_entry(pin, &pin_list->flushed, list) ++ pr_buf(out, "\t%px %ps\n", ++ pin, pin->flush); ++ } ++ spin_unlock(&j->lock); ++} +diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h +new file mode 100644 +index 000000000000..f60bc964ee1f +--- /dev/null ++++ b/fs/bcachefs/journal.h +@@ -0,0 +1,520 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_H ++#define _BCACHEFS_JOURNAL_H ++ ++/* ++ * THE JOURNAL: ++ * ++ * The primary purpose of the journal is to log updates (insertions) to the ++ * b-tree, to avoid having to do synchronous updates to the b-tree on disk. ++ * ++ * Without the journal, the b-tree is always internally consistent on ++ * disk - and in fact, in the earliest incarnations bcache didn't have a journal ++ * but did handle unclean shutdowns by doing all index updates synchronously ++ * (with coalescing). ++ * ++ * Updates to interior nodes still happen synchronously and without the journal ++ * (for simplicity) - this may change eventually but updates to interior nodes ++ * are rare enough it's not a huge priority. ++ * ++ * This means the journal is relatively separate from the b-tree; it consists of ++ * just a list of keys and journal replay consists of just redoing those ++ * insertions in same order that they appear in the journal. ++ * ++ * PERSISTENCE: ++ * ++ * For synchronous updates (where we're waiting on the index update to hit ++ * disk), the journal entry will be written out immediately (or as soon as ++ * possible, if the write for the previous journal entry was still in flight). ++ * ++ * Synchronous updates are specified by passing a closure (@flush_cl) to ++ * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter ++ * down to the journalling code. That closure will will wait on the journal ++ * write to complete (via closure_wait()). ++ * ++ * If the index update wasn't synchronous, the journal entry will be ++ * written out after 10 ms have elapsed, by default (the delay_ms field ++ * in struct journal). ++ * ++ * JOURNAL ENTRIES: ++ * ++ * A journal entry is variable size (struct jset), it's got a fixed length ++ * header and then a variable number of struct jset_entry entries. ++ * ++ * Journal entries are identified by monotonically increasing 64 bit sequence ++ * numbers - jset->seq; other places in the code refer to this sequence number. ++ * ++ * A jset_entry entry contains one or more bkeys (which is what gets inserted ++ * into the b-tree). We need a container to indicate which b-tree the key is ++ * for; also, the roots of the various b-trees are stored in jset_entry entries ++ * (one for each b-tree) - this lets us add new b-tree types without changing ++ * the on disk format. ++ * ++ * We also keep some things in the journal header that are logically part of the ++ * superblock - all the things that are frequently updated. This is for future ++ * bcache on raw flash support; the superblock (which will become another ++ * journal) can't be moved or wear leveled, so it contains just enough ++ * information to find the main journal, and the superblock only has to be ++ * rewritten when we want to move/wear level the main journal. 
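++ *
++ * For a rough picture of how the pieces above fit together: a jset is
++ * walked as a sequence of jset_entry structs, each tagged with a type,
++ * a btree id and a level, and replay amounts to something like
++ *
++ *	vstruct_for_each(jset, entry)
++ *		if (entry->type == BCH_JSET_ENTRY_btree_keys)
++ *			vstruct_for_each(entry, k)
++ *				(redo the insertion of k into b-tree entry->btree_id)
++ *
++ * with ordering, locking and error handling omitted - this is only a
++ * sketch, not the actual replay code.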
++ * ++ * JOURNAL LAYOUT ON DISK: ++ * ++ * The journal is written to a ringbuffer of buckets (which is kept in the ++ * superblock); the individual buckets are not necessarily contiguous on disk ++ * which means that journal entries are not allowed to span buckets, but also ++ * that we can resize the journal at runtime if desired (unimplemented). ++ * ++ * The journal buckets exist in the same pool as all the other buckets that are ++ * managed by the allocator and garbage collection - garbage collection marks ++ * the journal buckets as metadata buckets. ++ * ++ * OPEN/DIRTY JOURNAL ENTRIES: ++ * ++ * Open/dirty journal entries are journal entries that contain b-tree updates ++ * that have not yet been written out to the b-tree on disk. We have to track ++ * which journal entries are dirty, and we also have to avoid wrapping around ++ * the journal and overwriting old but still dirty journal entries with new ++ * journal entries. ++ * ++ * On disk, this is represented with the "last_seq" field of struct jset; ++ * last_seq is the first sequence number that journal replay has to replay. ++ * ++ * To avoid overwriting dirty journal entries on disk, we keep a mapping (in ++ * journal_device->seq) of for each journal bucket, the highest sequence number ++ * any journal entry it contains. Then, by comparing that against last_seq we ++ * can determine whether that journal bucket contains dirty journal entries or ++ * not. ++ * ++ * To track which journal entries are dirty, we maintain a fifo of refcounts ++ * (where each entry corresponds to a specific sequence number) - when a ref ++ * goes to 0, that journal entry is no longer dirty. ++ * ++ * Journalling of index updates is done at the same time as the b-tree itself is ++ * being modified (see btree_insert_key()); when we add the key to the journal ++ * the pending b-tree write takes a ref on the journal entry the key was added ++ * to. If a pending b-tree write would need to take refs on multiple dirty ++ * journal entries, it only keeps the ref on the oldest one (since a newer ++ * journal entry will still be replayed if an older entry was dirty). ++ * ++ * JOURNAL FILLING UP: ++ * ++ * There are two ways the journal could fill up; either we could run out of ++ * space to write to, or we could have too many open journal entries and run out ++ * of room in the fifo of refcounts. Since those refcounts are decremented ++ * without any locking we can't safely resize that fifo, so we handle it the ++ * same way. ++ * ++ * If the journal fills up, we start flushing dirty btree nodes until we can ++ * allocate space for a journal write again - preferentially flushing btree ++ * nodes that are pinning the oldest journal entries first. 
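++ *
++ * As a concrete (hypothetical) example of the dirty tracking above: if
++ * last_seq is 100, a journal bucket whose recorded highest sequence
++ * number (bucket_seq[] in struct journal_device) is 95 contains only
++ * clean entries and may be reused, while one whose highest sequence
++ * number is 105 is still dirty and must not be overwritten. Likewise the
++ * fifo of refcounts covers the sequence numbers [last_seq, seq]; once
++ * the refcount for last_seq drops to zero, last_seq can advance and more
++ * journal buckets become reusable.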
++ */ ++ ++#include ++ ++#include "journal_types.h" ++ ++struct bch_fs; ++ ++static inline void journal_wake(struct journal *j) ++{ ++ wake_up(&j->wait); ++ closure_wake_up(&j->async_wait); ++ closure_wake_up(&j->preres_wait); ++} ++ ++static inline struct journal_buf *journal_cur_buf(struct journal *j) ++{ ++ return j->buf + j->reservations.idx; ++} ++ ++static inline struct journal_buf *journal_prev_buf(struct journal *j) ++{ ++ return j->buf + !j->reservations.idx; ++} ++ ++/* Sequence number of oldest dirty journal entry */ ++ ++static inline u64 journal_last_seq(struct journal *j) ++{ ++ return j->pin.front; ++} ++ ++static inline u64 journal_cur_seq(struct journal *j) ++{ ++ BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); ++ ++ return j->pin.back - 1; ++} ++ ++u64 bch2_inode_journal_seq(struct journal *, u64); ++void bch2_journal_set_has_inum(struct journal *, u64, u64); ++ ++static inline int journal_state_count(union journal_res_state s, int idx) ++{ ++ return idx == 0 ? s.buf0_count : s.buf1_count; ++} ++ ++static inline void journal_state_inc(union journal_res_state *s) ++{ ++ s->buf0_count += s->idx == 0; ++ s->buf1_count += s->idx == 1; ++} ++ ++static inline void bch2_journal_set_has_inode(struct journal *j, ++ struct journal_res *res, ++ u64 inum) ++{ ++ struct journal_buf *buf = &j->buf[res->idx]; ++ unsigned long bit = hash_64(inum, ilog2(sizeof(buf->has_inode) * 8)); ++ ++ /* avoid atomic op if possible */ ++ if (unlikely(!test_bit(bit, buf->has_inode))) ++ set_bit(bit, buf->has_inode); ++} ++ ++/* ++ * Amount of space that will be taken up by some keys in the journal (i.e. ++ * including the jset header) ++ */ ++static inline unsigned jset_u64s(unsigned u64s) ++{ ++ return u64s + sizeof(struct jset_entry) / sizeof(u64); ++} ++ ++static inline int journal_entry_overhead(struct journal *j) ++{ ++ return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; ++} ++ ++static inline struct jset_entry * ++bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) ++{ ++ struct jset *jset = buf->data; ++ struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); ++ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ ++ le32_add_cpu(&jset->u64s, jset_u64s(u64s)); ++ ++ return entry; ++} ++ ++static inline struct jset_entry * ++journal_res_entry(struct journal *j, struct journal_res *res) ++{ ++ return vstruct_idx(j->buf[res->idx].data, res->offset); ++} ++ ++static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, ++ enum btree_id id, unsigned level, ++ const void *data, unsigned u64s) ++{ ++ memset(entry, 0, sizeof(*entry)); ++ entry->u64s = cpu_to_le16(u64s); ++ entry->type = type; ++ entry->btree_id = id; ++ entry->level = level; ++ memcpy_u64s_small(entry->_data, data, u64s); ++ ++ return jset_u64s(u64s); ++} ++ ++static inline void bch2_journal_add_entry(struct journal *j, struct journal_res *res, ++ unsigned type, enum btree_id id, ++ unsigned level, ++ const void *data, unsigned u64s) ++{ ++ unsigned actual = journal_entry_set(journal_res_entry(j, res), ++ type, id, level, data, u64s); ++ ++ EBUG_ON(!res->ref); ++ EBUG_ON(actual > res->u64s); ++ ++ res->offset += actual; ++ res->u64s -= actual; ++} ++ ++static inline void bch2_journal_add_keys(struct journal *j, struct journal_res *res, ++ enum btree_id id, const struct bkey_i *k) ++{ ++ bch2_journal_add_entry(j, res, BCH_JSET_ENTRY_btree_keys, ++ id, 0, k, k->k.u64s); ++} ++ ++static inline bool journal_entry_empty(struct jset *j) ++{ ++ 
struct jset_entry *i; ++ ++ if (j->seq != j->last_seq) ++ return false; ++ ++ vstruct_for_each(j, i) ++ if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) ++ return false; ++ return true; ++} ++ ++void __bch2_journal_buf_put(struct journal *, bool); ++ ++static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, ++ bool need_write_just_set) ++{ ++ union journal_res_state s; ++ ++ s.v = atomic64_sub_return(((union journal_res_state) { ++ .buf0_count = idx == 0, ++ .buf1_count = idx == 1, ++ }).v, &j->reservations.counter); ++ if (!journal_state_count(s, idx)) { ++ EBUG_ON(s.idx == idx || !s.prev_buf_unwritten); ++ __bch2_journal_buf_put(j, need_write_just_set); ++ } ++} ++ ++/* ++ * This function releases the journal write structure so other threads can ++ * then proceed to add their keys as well. ++ */ ++static inline void bch2_journal_res_put(struct journal *j, ++ struct journal_res *res) ++{ ++ if (!res->ref) ++ return; ++ ++ lock_release(&j->res_map, _THIS_IP_); ++ ++ while (res->u64s) ++ bch2_journal_add_entry(j, res, ++ BCH_JSET_ENTRY_btree_keys, ++ 0, 0, NULL, 0); ++ ++ bch2_journal_buf_put(j, res->idx, false); ++ ++ res->ref = 0; ++} ++ ++int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, ++ unsigned); ++ ++#define JOURNAL_RES_GET_NONBLOCK (1 << 0) ++#define JOURNAL_RES_GET_CHECK (1 << 1) ++#define JOURNAL_RES_GET_RESERVED (1 << 2) ++#define JOURNAL_RES_GET_RECLAIM (1 << 3) ++ ++static inline int journal_res_get_fast(struct journal *j, ++ struct journal_res *res, ++ unsigned flags) ++{ ++ union journal_res_state old, new; ++ u64 v = atomic64_read(&j->reservations.counter); ++ ++ do { ++ old.v = new.v = v; ++ ++ /* ++ * Check if there is still room in the current journal ++ * entry: ++ */ ++ if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) ++ return 0; ++ ++ EBUG_ON(!journal_state_count(new, new.idx)); ++ ++ if (!(flags & JOURNAL_RES_GET_RESERVED) && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_CHECK) ++ return 1; ++ ++ new.cur_entry_offset += res->u64s; ++ journal_state_inc(&new); ++ } while ((v = atomic64_cmpxchg(&j->reservations.counter, ++ old.v, new.v)) != old.v); ++ ++ res->ref = true; ++ res->idx = old.idx; ++ res->offset = old.cur_entry_offset; ++ res->seq = le64_to_cpu(j->buf[old.idx].data->seq); ++ return 1; ++} ++ ++static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, ++ unsigned u64s, unsigned flags) ++{ ++ int ret; ++ ++ EBUG_ON(res->ref); ++ EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ ++ res->u64s = u64s; ++ ++ if (journal_res_get_fast(j, res, flags)) ++ goto out; ++ ++ ret = bch2_journal_res_get_slowpath(j, res, flags); ++ if (ret) ++ return ret; ++out: ++ if (!(flags & JOURNAL_RES_GET_CHECK)) { ++ lock_acquire_shared(&j->res_map, 0, ++ (flags & JOURNAL_RES_GET_NONBLOCK) != 0, ++ NULL, _THIS_IP_); ++ EBUG_ON(!res->ref); ++ } ++ return 0; ++} ++ ++/* journal_preres: */ ++ ++static inline bool journal_check_may_get_unreserved(struct journal *j) ++{ ++ union journal_preres_state s = READ_ONCE(j->prereserved); ++ bool ret = s.reserved <= s.remaining && ++ fifo_free(&j->pin) > 8; ++ ++ lockdep_assert_held(&j->lock); ++ ++ if (ret != test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ if (ret) { ++ set_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); ++ journal_wake(j); ++ } else { ++ clear_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags); ++ } ++ } ++ return ret; ++} ++ ++static inline void bch2_journal_preres_put(struct journal *j, ++ struct 
journal_preres *res) ++{ ++ union journal_preres_state s = { .reserved = res->u64s }; ++ ++ if (!res->u64s) ++ return; ++ ++ s.v = atomic64_sub_return(s.v, &j->prereserved.counter); ++ res->u64s = 0; ++ closure_wake_up(&j->preres_wait); ++ ++ if (s.reserved <= s.remaining && ++ !test_bit(JOURNAL_MAY_GET_UNRESERVED, &j->flags)) { ++ spin_lock(&j->lock); ++ journal_check_may_get_unreserved(j); ++ spin_unlock(&j->lock); ++ } ++} ++ ++int __bch2_journal_preres_get(struct journal *, ++ struct journal_preres *, unsigned, unsigned); ++ ++static inline int bch2_journal_preres_get_fast(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ int d = new_u64s - res->u64s; ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ ++ do { ++ old.v = new.v = v; ++ ++ new.reserved += d; ++ ++ /* ++ * If we're being called from the journal reclaim path, we have ++ * to unconditionally give out the pre-reservation, there's ++ * nothing else sensible we can do - otherwise we'd recurse back ++ * into the reclaim path and deadlock: ++ */ ++ ++ if (!(flags & JOURNAL_RES_GET_RECLAIM) && ++ new.reserved > new.remaining) ++ return 0; ++ } while ((v = atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++ ++ res->u64s += d; ++ return 1; ++} ++ ++static inline int bch2_journal_preres_get(struct journal *j, ++ struct journal_preres *res, ++ unsigned new_u64s, ++ unsigned flags) ++{ ++ if (new_u64s <= res->u64s) ++ return 0; ++ ++ if (bch2_journal_preres_get_fast(j, res, new_u64s, flags)) ++ return 0; ++ ++ if (flags & JOURNAL_RES_GET_NONBLOCK) ++ return -EAGAIN; ++ ++ return __bch2_journal_preres_get(j, res, new_u64s, flags); ++} ++ ++/* journal_entry_res: */ ++ ++void bch2_journal_entry_res_resize(struct journal *, ++ struct journal_entry_res *, ++ unsigned); ++ ++u64 bch2_journal_last_unwritten_seq(struct journal *); ++int bch2_journal_open_seq_async(struct journal *, u64, struct closure *); ++ ++void bch2_journal_wait_on_seq(struct journal *, u64, struct closure *); ++void bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); ++void bch2_journal_flush_async(struct journal *, struct closure *); ++void bch2_journal_meta_async(struct journal *, struct closure *); ++ ++int bch2_journal_flush_seq(struct journal *, u64); ++int bch2_journal_flush(struct journal *); ++int bch2_journal_meta(struct journal *); ++ ++void bch2_journal_halt(struct journal *); ++ ++static inline int bch2_journal_error(struct journal *j) ++{ ++ return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL ++ ? 
-EIO : 0; ++} ++ ++struct bch_dev; ++ ++static inline bool journal_flushes_device(struct bch_dev *ca) ++{ ++ return true; ++} ++ ++static inline void bch2_journal_set_replay_done(struct journal *j) ++{ ++ BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); ++ set_bit(JOURNAL_REPLAY_DONE, &j->flags); ++} ++ ++void bch2_journal_unblock(struct journal *); ++void bch2_journal_block(struct journal *); ++ ++void bch2_journal_debug_to_text(struct printbuf *, struct journal *); ++void bch2_journal_pins_to_text(struct printbuf *, struct journal *); ++ ++int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, ++ unsigned nr); ++int bch2_dev_journal_alloc(struct bch_dev *); ++ ++void bch2_dev_journal_stop(struct journal *, struct bch_dev *); ++ ++void bch2_fs_journal_stop(struct journal *); ++int bch2_fs_journal_start(struct journal *, u64, struct list_head *); ++ ++void bch2_dev_journal_exit(struct bch_dev *); ++int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); ++void bch2_fs_journal_exit(struct journal *); ++int bch2_fs_journal_init(struct journal *); ++ ++#endif /* _BCACHEFS_JOURNAL_H */ +diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c +new file mode 100644 +index 000000000000..bd0e6b371701 +--- /dev/null ++++ b/fs/bcachefs/journal_io.c +@@ -0,0 +1,1183 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_io.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++ ++#include ++ ++struct journal_list { ++ struct closure cl; ++ struct mutex lock; ++ struct list_head *head; ++ int ret; ++}; ++ ++#define JOURNAL_ENTRY_ADD_OK 0 ++#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 ++ ++/* ++ * Given a journal entry we just read, add it to the list of journal entries to ++ * be replayed: ++ */ ++static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, ++ struct journal_list *jlist, struct jset *j, ++ bool bad) ++{ ++ struct journal_replay *i, *pos; ++ struct bch_devs_list devs = { .nr = 0 }; ++ struct list_head *where; ++ size_t bytes = vstruct_bytes(j); ++ __le64 last_seq; ++ int ret; ++ ++ last_seq = !list_empty(jlist->head) ++ ? list_last_entry(jlist->head, struct journal_replay, ++ list)->j.last_seq ++ : 0; ++ ++ if (!c->opts.read_entire_journal) { ++ /* Is this entry older than the range we need? */ ++ if (le64_to_cpu(j->seq) < le64_to_cpu(last_seq)) { ++ ret = JOURNAL_ENTRY_ADD_OUT_OF_RANGE; ++ goto out; ++ } ++ ++ /* Drop entries we don't need anymore */ ++ list_for_each_entry_safe(i, pos, jlist->head, list) { ++ if (le64_to_cpu(i->j.seq) >= le64_to_cpu(j->last_seq)) ++ break; ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } ++ } ++ ++ list_for_each_entry_reverse(i, jlist->head, list) { ++ if (le64_to_cpu(j->seq) > le64_to_cpu(i->j.seq)) { ++ where = &i->list; ++ goto add; ++ } ++ } ++ ++ where = jlist->head; ++add: ++ i = where->next != jlist->head ++ ? container_of(where->next, struct journal_replay, list) ++ : NULL; ++ ++ /* ++ * Duplicate journal entries? 
If so we want the one that didn't have a ++ * checksum error: ++ */ ++ if (i && le64_to_cpu(j->seq) == le64_to_cpu(i->j.seq)) { ++ if (i->bad) { ++ devs = i->devs; ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } else if (bad) { ++ goto found; ++ } else { ++ fsck_err_on(bytes != vstruct_bytes(&i->j) || ++ memcmp(j, &i->j, bytes), c, ++ "found duplicate but non identical journal entries (seq %llu)", ++ le64_to_cpu(j->seq)); ++ goto found; ++ } ++ ++ } ++ ++ i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); ++ if (!i) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ list_add(&i->list, where); ++ i->devs = devs; ++ i->bad = bad; ++ memcpy(&i->j, j, bytes); ++found: ++ if (!bch2_dev_list_has_dev(i->devs, ca->dev_idx)) ++ bch2_dev_list_add_dev(&i->devs, ca->dev_idx); ++ else ++ fsck_err_on(1, c, "duplicate journal entries on same device"); ++ ret = JOURNAL_ENTRY_ADD_OK; ++out: ++fsck_err: ++ return ret; ++} ++ ++static struct nonce journal_nonce(const struct jset *jset) ++{ ++ return (struct nonce) {{ ++ [0] = 0, ++ [1] = ((__le32 *) &jset->seq)[0], ++ [2] = ((__le32 *) &jset->seq)[1], ++ [3] = BCH_NONCE_JOURNAL, ++ }}; ++} ++ ++/* this fills in a range with empty jset_entries: */ ++static void journal_entry_null_range(void *start, void *end) ++{ ++ struct jset_entry *entry; ++ ++ for (entry = start; entry != end; entry = vstruct_next(entry)) ++ memset(entry, 0, sizeof(*entry)); ++} ++ ++#define JOURNAL_ENTRY_REREAD 5 ++#define JOURNAL_ENTRY_NONE 6 ++#define JOURNAL_ENTRY_BAD 7 ++ ++#define journal_entry_err(c, msg, ...) \ ++({ \ ++ switch (write) { \ ++ case READ: \ ++ mustfix_fsck_err(c, msg, ##__VA_ARGS__); \ ++ break; \ ++ case WRITE: \ ++ bch_err(c, "corrupt metadata before write:\n" \ ++ msg, ##__VA_ARGS__); \ ++ if (bch2_fs_inconsistent(c)) { \ ++ ret = BCH_FSCK_ERRORS_NOT_FIXED; \ ++ goto fsck_err; \ ++ } \ ++ break; \ ++ } \ ++ true; \ ++}) ++ ++#define journal_entry_err_on(cond, c, msg, ...) \ ++ ((cond) ? 
journal_entry_err(c, msg, ##__VA_ARGS__) : false) ++ ++static int journal_validate_key(struct bch_fs *c, struct jset *jset, ++ struct jset_entry *entry, ++ unsigned level, enum btree_id btree_id, ++ struct bkey_i *k, ++ const char *type, int write) ++{ ++ void *next = vstruct_next(entry); ++ const char *invalid; ++ unsigned version = le32_to_cpu(jset->version); ++ int ret = 0; ++ ++ if (journal_entry_err_on(!k->k.u64s, c, ++ "invalid %s in journal: k->u64s 0", type)) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (journal_entry_err_on((void *) bkey_next(k) > ++ (void *) vstruct_next(entry), c, ++ "invalid %s in journal: extends past end of journal entry", ++ type)) { ++ entry->u64s = cpu_to_le16((u64 *) k - entry->_data); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, c, ++ "invalid %s in journal: bad format %u", ++ type, k->k.format)) { ++ le16_add_cpu(&entry->u64s, -k->k.u64s); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (!write) ++ bch2_bkey_compat(level, btree_id, version, ++ JSET_BIG_ENDIAN(jset), write, ++ NULL, bkey_to_packed(k)); ++ ++ invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(k), ++ __btree_node_type(level, btree_id)); ++ if (invalid) { ++ char buf[160]; ++ ++ bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(k)); ++ mustfix_fsck_err(c, "invalid %s in journal: %s\n%s", ++ type, invalid, buf); ++ ++ le16_add_cpu(&entry->u64s, -k->k.u64s); ++ memmove(k, bkey_next(k), next - (void *) bkey_next(k)); ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ if (write) ++ bch2_bkey_compat(level, btree_id, version, ++ JSET_BIG_ENDIAN(jset), write, ++ NULL, bkey_to_packed(k)); ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_btree_keys(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct bkey_i *k; ++ ++ vstruct_for_each(entry, k) { ++ int ret = journal_validate_key(c, jset, entry, ++ entry->level, ++ entry->btree_id, ++ k, "key", write); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static int journal_entry_validate_btree_root(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct bkey_i *k = entry->start; ++ int ret = 0; ++ ++ if (journal_entry_err_on(!entry->u64s || ++ le16_to_cpu(entry->u64s) != k->k.u64s, c, ++ "invalid btree root journal entry: wrong number of keys")) { ++ void *next = vstruct_next(entry); ++ /* ++ * we don't want to null out this jset_entry, ++ * just the contents, so that later we can tell ++ * we were _supposed_ to have a btree root ++ */ ++ entry->u64s = 0; ++ journal_entry_null_range(vstruct_next(entry), next); ++ return 0; ++ } ++ ++ return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, ++ "btree root", write); ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_prio_ptrs(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ /* obsolete, don't care: */ ++ return 0; ++} ++ ++static int journal_entry_validate_blacklist(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, 
vstruct_next(entry)); ++ } ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_blacklist_v2(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_blacklist_v2 *bl_entry; ++ int ret = 0; ++ ++ if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, c, ++ "invalid journal seq blacklist entry: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ goto out; ++ } ++ ++ bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > ++ le64_to_cpu(bl_entry->end), c, ++ "invalid journal seq blacklist entry: start > end")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ } ++out: ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_usage(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < sizeof(*u), ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++static int journal_entry_validate_data_usage(struct bch_fs *c, ++ struct jset *jset, ++ struct jset_entry *entry, ++ int write) ++{ ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); ++ int ret = 0; ++ ++ if (journal_entry_err_on(bytes < sizeof(*u) || ++ bytes < sizeof(*u) + u->r.nr_devs, ++ c, ++ "invalid journal entry usage: bad size")) { ++ journal_entry_null_range(entry, vstruct_next(entry)); ++ return ret; ++ } ++ ++fsck_err: ++ return ret; ++} ++ ++struct jset_entry_ops { ++ int (*validate)(struct bch_fs *, struct jset *, ++ struct jset_entry *, int); ++}; ++ ++static const struct jset_entry_ops bch2_jset_entry_ops[] = { ++#define x(f, nr) \ ++ [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ ++ .validate = journal_entry_validate_##f, \ ++ }, ++ BCH_JSET_ENTRY_TYPES() ++#undef x ++}; ++ ++static int journal_entry_validate(struct bch_fs *c, struct jset *jset, ++ struct jset_entry *entry, int write) ++{ ++ return entry->type < BCH_JSET_ENTRY_NR ++ ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, ++ entry, write) ++ : 0; ++} ++ ++static int jset_validate_entries(struct bch_fs *c, struct jset *jset, ++ int write) ++{ ++ struct jset_entry *entry; ++ int ret = 0; ++ ++ vstruct_for_each(jset, entry) { ++ if (journal_entry_err_on(vstruct_next(entry) > ++ vstruct_last(jset), c, ++ "journal entry extends past end of jset")) { ++ jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); ++ break; ++ } ++ ++ ret = journal_entry_validate(c, jset, entry, write); ++ if (ret) ++ break; ++ } ++fsck_err: ++ return ret; ++} ++ ++static int jset_validate(struct bch_fs *c, ++ struct bch_dev *ca, ++ struct jset *jset, u64 sector, ++ unsigned bucket_sectors_left, ++ unsigned sectors_read, ++ int write) ++{ ++ size_t bytes = vstruct_bytes(jset); ++ struct bch_csum csum; ++ unsigned version; ++ int ret = 0; ++ ++ if (le64_to_cpu(jset->magic) != jset_magic(c)) ++ return JOURNAL_ENTRY_NONE; ++ ++ version = le32_to_cpu(jset->version); ++ if (journal_entry_err_on((version != BCH_JSET_VERSION_OLD && ++ version < bcachefs_metadata_version_min) || ++ version >= bcachefs_metadata_version_max, c, ++ "%s sector %llu seq %llu: unknown journal entry version %u", ++ ca->name, sector, le64_to_cpu(jset->seq), ++ version)) { ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ if (journal_entry_err_on(bytes > bucket_sectors_left << 9, c, ++ "%s sector %llu seq %llu: journal entry too big (%zu bytes)", ++ ca->name, sector, le64_to_cpu(jset->seq), bytes)) { ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ if (bytes > sectors_read << 9) ++ return JOURNAL_ENTRY_REREAD; ++ ++ if (fsck_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), c, ++ "%s sector %llu seq %llu: journal entry with unknown csum type %llu", ++ ca->name, sector, le64_to_cpu(jset->seq), ++ JSET_CSUM_TYPE(jset))) ++ return JOURNAL_ENTRY_BAD; ++ ++ csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset); ++ if (journal_entry_err_on(bch2_crc_cmp(csum, jset->csum), c, ++ "%s sector %llu seq %llu: journal checksum bad", ++ ca->name, sector, le64_to_cpu(jset->seq))) { ++ /* XXX: retry IO, when we start retrying checksum errors */ ++ /* XXX: note we might have missing journal entries */ ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ ++ if (journal_entry_err_on(le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), c, ++ "invalid journal entry: last_seq > seq")) { ++ jset->last_seq = jset->seq; ++ return JOURNAL_ENTRY_BAD; ++ } ++ ++ return 0; ++fsck_err: ++ return ret; ++} ++ ++struct journal_read_buf { ++ void *data; ++ size_t size; ++}; ++ ++static int journal_read_buf_realloc(struct journal_read_buf *b, ++ size_t new_size) ++{ ++ void *n; ++ ++ /* the bios are sized for this many pages, max: */ ++ if (new_size > JOURNAL_ENTRY_SIZE_MAX) ++ return -ENOMEM; ++ ++ new_size = roundup_pow_of_two(new_size); ++ n = kvpmalloc(new_size, GFP_KERNEL); ++ if (!n) ++ return -ENOMEM; ++ ++ kvpfree(b->data, b->size); ++ b->data = n; ++ b->size = new_size; ++ return 0; ++} ++ ++static int journal_read_bucket(struct bch_dev *ca, ++ struct journal_read_buf *buf, ++ struct journal_list *jlist, ++ unsigned bucket) ++{ ++ struct bch_fs *c = ca->fs; ++ struct journal_device *ja = &ca->journal; ++ struct jset *j = NULL; ++ unsigned sectors, sectors_read = 0; ++ u64 offset = 
bucket_to_sector(ca, ja->buckets[bucket]), ++ end = offset + ca->mi.bucket_size; ++ bool saw_bad = false; ++ int ret = 0; ++ ++ pr_debug("reading %u", bucket); ++ ++ while (offset < end) { ++ if (!sectors_read) { ++ struct bio *bio; ++reread: ++ sectors_read = min_t(unsigned, ++ end - offset, buf->size >> 9); ++ ++ bio = bio_kmalloc(GFP_KERNEL, ++ buf_pages(buf->data, ++ sectors_read << 9)); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = offset; ++ bio_set_op_attrs(bio, REQ_OP_READ, 0); ++ bch2_bio_map(bio, buf->data, sectors_read << 9); ++ ++ ret = submit_bio_wait(bio); ++ bio_put(bio); ++ ++ if (bch2_dev_io_err_on(ret, ca, ++ "journal read from sector %llu", ++ offset) || ++ bch2_meta_read_fault("journal")) ++ return -EIO; ++ ++ j = buf->data; ++ } ++ ++ ret = jset_validate(c, ca, j, offset, ++ end - offset, sectors_read, ++ READ); ++ switch (ret) { ++ case BCH_FSCK_OK: ++ sectors = vstruct_sectors(j, c->block_bits); ++ break; ++ case JOURNAL_ENTRY_REREAD: ++ if (vstruct_bytes(j) > buf->size) { ++ ret = journal_read_buf_realloc(buf, ++ vstruct_bytes(j)); ++ if (ret) ++ return ret; ++ } ++ goto reread; ++ case JOURNAL_ENTRY_NONE: ++ if (!saw_bad) ++ return 0; ++ sectors = c->opts.block_size; ++ goto next_block; ++ case JOURNAL_ENTRY_BAD: ++ saw_bad = true; ++ /* ++ * On checksum error we don't really trust the size ++ * field of the journal entry we read, so try reading ++ * again at next block boundary: ++ */ ++ sectors = c->opts.block_size; ++ break; ++ default: ++ return ret; ++ } ++ ++ /* ++ * This happens sometimes if we don't have discards on - ++ * when we've partially overwritten a bucket with new ++ * journal entries. We don't need the rest of the ++ * bucket: ++ */ ++ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) ++ return 0; ++ ++ ja->bucket_seq[bucket] = le64_to_cpu(j->seq); ++ ++ mutex_lock(&jlist->lock); ++ ret = journal_entry_add(c, ca, jlist, j, ret != 0); ++ mutex_unlock(&jlist->lock); ++ ++ switch (ret) { ++ case JOURNAL_ENTRY_ADD_OK: ++ break; ++ case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: ++ break; ++ default: ++ return ret; ++ } ++next_block: ++ pr_debug("next"); ++ offset += sectors; ++ sectors_read -= sectors; ++ j = ((void *) j) + (sectors << 9); ++ } ++ ++ return 0; ++} ++ ++static void bch2_journal_read_device(struct closure *cl) ++{ ++ struct journal_device *ja = ++ container_of(cl, struct journal_device, read); ++ struct bch_dev *ca = container_of(ja, struct bch_dev, journal); ++ struct journal_list *jlist = ++ container_of(cl->parent, struct journal_list, cl); ++ struct journal_read_buf buf = { NULL, 0 }; ++ u64 min_seq = U64_MAX; ++ unsigned i; ++ int ret; ++ ++ if (!ja->nr) ++ goto out; ++ ++ ret = journal_read_buf_realloc(&buf, PAGE_SIZE); ++ if (ret) ++ goto err; ++ ++ pr_debug("%u journal buckets", ja->nr); ++ ++ for (i = 0; i < ja->nr; i++) { ++ ret = journal_read_bucket(ca, &buf, jlist, i); ++ if (ret) ++ goto err; ++ } ++ ++ /* Find the journal bucket with the highest sequence number: */ ++ for (i = 0; i < ja->nr; i++) { ++ if (ja->bucket_seq[i] > ja->bucket_seq[ja->cur_idx]) ++ ja->cur_idx = i; ++ ++ min_seq = min(ja->bucket_seq[i], min_seq); ++ } ++ ++ /* ++ * If there's duplicate journal entries in multiple buckets (which ++ * definitely isn't supposed to happen, but...) 
- make sure to start ++ * cur_idx at the last of those buckets, so we don't deadlock trying to ++ * allocate ++ */ ++ while (ja->bucket_seq[ja->cur_idx] > min_seq && ++ ja->bucket_seq[ja->cur_idx] > ++ ja->bucket_seq[(ja->cur_idx + 1) % ja->nr]) ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ++ ja->sectors_free = 0; ++ ++ /* ++ * Set dirty_idx to indicate the entire journal is full and needs to be ++ * reclaimed - journal reclaim will immediately reclaim whatever isn't ++ * pinned when it first runs: ++ */ ++ ja->discard_idx = ja->dirty_idx_ondisk = ++ ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; ++out: ++ kvpfree(buf.data, buf.size); ++ percpu_ref_put(&ca->io_ref); ++ closure_return(cl); ++ return; ++err: ++ mutex_lock(&jlist->lock); ++ jlist->ret = ret; ++ mutex_unlock(&jlist->lock); ++ goto out; ++} ++ ++int bch2_journal_read(struct bch_fs *c, struct list_head *list) ++{ ++ struct journal_list jlist; ++ struct journal_replay *i; ++ struct bch_dev *ca; ++ unsigned iter; ++ size_t keys = 0, entries = 0; ++ bool degraded = false; ++ int ret = 0; ++ ++ closure_init_stack(&jlist.cl); ++ mutex_init(&jlist.lock); ++ jlist.head = list; ++ jlist.ret = 0; ++ ++ for_each_member_device(ca, c, iter) { ++ if (!test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) && ++ !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) ++ continue; ++ ++ if ((ca->mi.state == BCH_MEMBER_STATE_RW || ++ ca->mi.state == BCH_MEMBER_STATE_RO) && ++ percpu_ref_tryget(&ca->io_ref)) ++ closure_call(&ca->journal.read, ++ bch2_journal_read_device, ++ system_unbound_wq, ++ &jlist.cl); ++ else ++ degraded = true; ++ } ++ ++ closure_sync(&jlist.cl); ++ ++ if (jlist.ret) ++ return jlist.ret; ++ ++ list_for_each_entry(i, list, list) { ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct bch_replicas_padded replicas; ++ char buf[80]; ++ ++ ret = jset_validate_entries(c, &i->j, READ); ++ if (ret) ++ goto fsck_err; ++ ++ /* ++ * If we're mounting in degraded mode - if we didn't read all ++ * the devices - this is wrong: ++ */ ++ ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, i->devs); ++ ++ if (!degraded && ++ (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) || ++ fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, ++ "superblock not marked as containing replicas %s", ++ (bch2_replicas_entry_to_text(&PBUF(buf), ++ &replicas.e), buf)))) { ++ ret = bch2_mark_replicas(c, &replicas.e); ++ if (ret) ++ return ret; ++ } ++ ++ for_each_jset_key(k, _n, entry, &i->j) ++ keys++; ++ entries++; ++ } ++ ++ if (!list_empty(list)) { ++ i = list_last_entry(list, struct journal_replay, list); ++ ++ bch_info(c, "journal read done, %zu keys in %zu entries, seq %llu", ++ keys, entries, le64_to_cpu(i->j.seq)); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* journal write: */ ++ ++static void __journal_write_alloc(struct journal *j, ++ struct journal_buf *w, ++ struct dev_alloc_list *devs_sorted, ++ unsigned sectors, ++ unsigned *replicas, ++ unsigned replicas_want) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (*replicas >= replicas_want) ++ return; ++ ++ for (i = 0; i < devs_sorted->nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ /* ++ * Check that we can use this device, and aren't already using ++ * it: ++ */ ++ if (!ca->mi.durability || ++ ca->mi.state != BCH_MEMBER_STATE_RW || ++ !ja->nr || ++ bch2_bkey_has_device(bkey_i_to_s_c(&w->key), ++ ca->dev_idx) || ++ sectors > 
ja->sectors_free) ++ continue; ++ ++ bch2_dev_stripe_increment(ca, &j->wp.stripe); ++ ++ bch2_bkey_append_ptr(&w->key, ++ (struct bch_extent_ptr) { ++ .offset = bucket_to_sector(ca, ++ ja->buckets[ja->cur_idx]) + ++ ca->mi.bucket_size - ++ ja->sectors_free, ++ .dev = ca->dev_idx, ++ }); ++ ++ ja->sectors_free -= sectors; ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ ++ *replicas += ca->mi.durability; ++ ++ if (*replicas >= replicas_want) ++ break; ++ } ++} ++ ++/** ++ * journal_next_bucket - move on to the next journal bucket if possible ++ */ ++static int journal_write_alloc(struct journal *j, struct journal_buf *w, ++ unsigned sectors) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_device *ja; ++ struct bch_dev *ca; ++ struct dev_alloc_list devs_sorted; ++ unsigned i, replicas = 0, replicas_want = ++ READ_ONCE(c->opts.metadata_replicas); ++ ++ rcu_read_lock(); ++ ++ devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, ++ &c->rw_devs[BCH_DATA_journal]); ++ ++ __journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++ ++ if (replicas >= replicas_want) ++ goto done; ++ ++ for (i = 0; i < devs_sorted.nr; i++) { ++ ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); ++ if (!ca) ++ continue; ++ ++ ja = &ca->journal; ++ ++ if (sectors > ja->sectors_free && ++ sectors <= ca->mi.bucket_size && ++ bch2_journal_dev_buckets_available(j, ja, ++ journal_space_discarded)) { ++ ja->cur_idx = (ja->cur_idx + 1) % ja->nr; ++ ja->sectors_free = ca->mi.bucket_size; ++ ++ /* ++ * ja->bucket_seq[ja->cur_idx] must always have ++ * something sensible: ++ */ ++ ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); ++ } ++ } ++ ++ __journal_write_alloc(j, w, &devs_sorted, ++ sectors, &replicas, replicas_want); ++done: ++ rcu_read_unlock(); ++ ++ return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; ++} ++ ++static void journal_write_compact(struct jset *jset) ++{ ++ struct jset_entry *i, *next, *prev = NULL; ++ ++ /* ++ * Simple compaction, dropping empty jset_entries (from journal ++ * reservations that weren't fully used) and merging jset_entries that ++ * can be. ++ * ++ * If we wanted to be really fancy here, we could sort all the keys in ++ * the jset and drop keys that were overwritten - probably not worth it: ++ */ ++ vstruct_for_each_safe(jset, i, next) { ++ unsigned u64s = le16_to_cpu(i->u64s); ++ ++ /* Empty entry: */ ++ if (!u64s) ++ continue; ++ ++ /* Can we merge with previous entry? */ ++ if (prev && ++ i->btree_id == prev->btree_id && ++ i->level == prev->level && ++ i->type == prev->type && ++ i->type == BCH_JSET_ENTRY_btree_keys && ++ le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { ++ memmove_u64s_down(vstruct_next(prev), ++ i->_data, ++ u64s); ++ le16_add_cpu(&prev->u64s, u64s); ++ continue; ++ } ++ ++ /* Couldn't merge, move i into new position (after prev): */ ++ prev = prev ? vstruct_next(prev) : jset->start; ++ if (i != prev) ++ memmove_u64s_down(prev, i, jset_u64s(u64s)); ++ } ++ ++ prev = prev ? 
vstruct_next(prev) : jset->start; ++ jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); ++} ++ ++static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) ++{ ++ /* we aren't holding j->lock: */ ++ unsigned new_size = READ_ONCE(j->buf_size_want); ++ void *new_buf; ++ ++ if (buf->buf_size >= new_size) ++ return; ++ ++ new_buf = kvpmalloc(new_size, GFP_NOIO|__GFP_NOWARN); ++ if (!new_buf) ++ return; ++ ++ memcpy(new_buf, buf->data, buf->buf_size); ++ kvpfree(buf->data, buf->buf_size); ++ buf->data = new_buf; ++ buf->buf_size = new_size; ++} ++ ++static void journal_write_done(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_buf *w = journal_prev_buf(j); ++ struct bch_devs_list devs = ++ bch2_bkey_devs(bkey_i_to_s_c(&w->key)); ++ struct bch_replicas_padded replicas; ++ u64 seq = le64_to_cpu(w->data->seq); ++ u64 last_seq = le64_to_cpu(w->data->last_seq); ++ ++ bch2_time_stats_update(j->write_time, j->write_start_time); ++ ++ if (!devs.nr) { ++ bch_err(c, "unable to write journal to sufficient devices"); ++ goto err; ++ } ++ ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, devs); ++ ++ if (bch2_mark_replicas(c, &replicas.e)) ++ goto err; ++ ++ spin_lock(&j->lock); ++ if (seq >= j->pin.front) ++ journal_seq_pin(j, seq)->devs = devs; ++ ++ j->seq_ondisk = seq; ++ j->last_seq_ondisk = last_seq; ++ bch2_journal_space_available(j); ++ ++ /* ++ * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard ++ * more buckets: ++ * ++ * Must come before signaling write completion, for ++ * bch2_fs_journal_stop(): ++ */ ++ mod_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, 0); ++out: ++ /* also must come before signalling write completion: */ ++ closure_debug_destroy(cl); ++ ++ BUG_ON(!j->reservations.prev_buf_unwritten); ++ atomic64_sub(((union journal_res_state) { .prev_buf_unwritten = 1 }).v, ++ &j->reservations.counter); ++ ++ closure_wake_up(&w->wait); ++ journal_wake(j); ++ ++ if (test_bit(JOURNAL_NEED_WRITE, &j->flags)) ++ mod_delayed_work(system_freezable_wq, &j->write_work, 0); ++ spin_unlock(&j->lock); ++ return; ++err: ++ bch2_fatal_error(c); ++ spin_lock(&j->lock); ++ goto out; ++} ++ ++static void journal_write_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ struct journal *j = &ca->fs->journal; ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "journal write: %s", ++ bch2_blk_status_to_str(bio->bi_status)) || ++ bch2_meta_write_fault("journal")) { ++ struct journal_buf *w = journal_prev_buf(j); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&j->err_lock, flags); ++ bch2_bkey_drop_device(bkey_i_to_s(&w->key), ca->dev_idx); ++ spin_unlock_irqrestore(&j->err_lock, flags); ++ } ++ ++ closure_put(&j->io); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++void bch2_journal_write(struct closure *cl) ++{ ++ struct journal *j = container_of(cl, struct journal, io); ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_buf *w = journal_prev_buf(j); ++ struct jset_entry *start, *end; ++ struct jset *jset; ++ struct bio *bio; ++ struct bch_extent_ptr *ptr; ++ bool validate_before_checksum = false; ++ unsigned i, sectors, bytes, u64s; ++ int ret; ++ ++ bch2_journal_pin_put(j, le64_to_cpu(w->data->seq)); ++ ++ journal_buf_realloc(j, w); ++ jset = w->data; ++ ++ j->write_start_time = local_clock(); ++ ++ /* ++ * New btree roots are set by journalling them; when the journal entry 
++ * gets written we have to propagate them to c->btree_roots ++ * ++ * But, every journal entry we write has to contain all the btree roots ++ * (at least for now); so after we copy btree roots to c->btree_roots we ++ * have to get any missing btree roots and add them to this journal ++ * entry: ++ */ ++ ++ bch2_journal_entries_to_btree_roots(c, jset); ++ ++ start = end = vstruct_last(jset); ++ ++ end = bch2_btree_roots_to_journal_entries(c, jset->start, end); ++ ++ end = bch2_journal_super_entries_add_common(c, end, ++ le64_to_cpu(jset->seq)); ++ u64s = (u64 *) end - (u64 *) start; ++ BUG_ON(u64s > j->entry_u64s_reserved); ++ ++ le32_add_cpu(&jset->u64s, u64s); ++ BUG_ON(vstruct_sectors(jset, c->block_bits) > w->sectors); ++ ++ journal_write_compact(jset); ++ ++ jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); ++ jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); ++ jset->magic = cpu_to_le64(jset_magic(c)); ++ ++ jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ++ ? cpu_to_le32(BCH_JSET_VERSION_OLD) ++ : cpu_to_le32(c->sb.version); ++ ++ SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); ++ SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); ++ ++ if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) ++ validate_before_checksum = true; ++ ++ if (le32_to_cpu(jset->version) < bcachefs_metadata_version_max) ++ validate_before_checksum = true; ++ ++ if (validate_before_checksum && ++ jset_validate_entries(c, jset, WRITE)) ++ goto err; ++ ++ bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), ++ jset->encrypted_start, ++ vstruct_end(jset) - (void *) jset->encrypted_start); ++ ++ jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), ++ journal_nonce(jset), jset); ++ ++ if (!validate_before_checksum && ++ jset_validate_entries(c, jset, WRITE)) ++ goto err; ++ ++ sectors = vstruct_sectors(jset, c->block_bits); ++ BUG_ON(sectors > w->sectors); ++ ++ bytes = vstruct_bytes(jset); ++ memset((void *) jset + bytes, 0, (sectors << 9) - bytes); ++ ++retry_alloc: ++ spin_lock(&j->lock); ++ ret = journal_write_alloc(j, w, sectors); ++ ++ if (ret && j->can_discard) { ++ spin_unlock(&j->lock); ++ bch2_journal_do_discards(j); ++ goto retry_alloc; ++ } ++ ++ /* ++ * write is allocated, no longer need to account for it in ++ * bch2_journal_space_available(): ++ */ ++ w->sectors = 0; ++ ++ /* ++ * journal entry has been compacted and allocated, recalculate space ++ * available: ++ */ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ ++ if (ret) { ++ bch_err(c, "Unable to allocate journal write"); ++ bch2_fatal_error(c); ++ continue_at(cl, journal_write_done, system_highpri_wq); ++ return; ++ } ++ ++ /* ++ * XXX: we really should just disable the entire journal in nochanges ++ * mode ++ */ ++ if (c->opts.nochanges) ++ goto no_io; ++ ++ extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { ++ ca = bch_dev_bkey_exists(c, ptr->dev); ++ if (!percpu_ref_tryget(&ca->io_ref)) { ++ /* XXX: fix this */ ++ bch_err(c, "missing device for journal write\n"); ++ continue; ++ } ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], ++ sectors); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = ptr->offset; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_WRITE, ++ REQ_SYNC|REQ_META|REQ_PREFLUSH|REQ_FUA); ++ bch2_bio_map(bio, jset, sectors << 9); ++ ++ trace_journal_write(bio); ++ closure_bio_submit(bio, cl); ++ ++ 
ca->journal.bucket_seq[ca->journal.cur_idx] = le64_to_cpu(jset->seq); ++ } ++ ++ for_each_rw_member(ca, c, i) ++ if (journal_flushes_device(ca) && ++ !bch2_bkey_has_device(bkey_i_to_s_c(&w->key), i)) { ++ percpu_ref_get(&ca->io_ref); ++ ++ bio = ca->journal.bio; ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_opf = REQ_OP_FLUSH; ++ bio->bi_end_io = journal_write_endio; ++ bio->bi_private = ca; ++ closure_bio_submit(bio, cl); ++ } ++ ++no_io: ++ bch2_bucket_seq_cleanup(c); ++ ++ continue_at(cl, journal_write_done, system_highpri_wq); ++ return; ++err: ++ bch2_inconsistent_error(c); ++ continue_at(cl, journal_write_done, system_highpri_wq); ++} +diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h +new file mode 100644 +index 000000000000..6958ee0f8cf2 +--- /dev/null ++++ b/fs/bcachefs/journal_io.h +@@ -0,0 +1,44 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_IO_H ++#define _BCACHEFS_JOURNAL_IO_H ++ ++/* ++ * Only used for holding the journal entries we read in btree_journal_read() ++ * during cache_registration ++ */ ++struct journal_replay { ++ struct list_head list; ++ struct bch_devs_list devs; ++ /* checksum error, but we may want to try using it anyways: */ ++ bool bad; ++ /* must be last: */ ++ struct jset j; ++}; ++ ++static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, ++ struct jset_entry *entry, unsigned type) ++{ ++ while (entry < vstruct_last(jset)) { ++ if (entry->type == type) ++ return entry; ++ ++ entry = vstruct_next(entry); ++ } ++ ++ return NULL; ++} ++ ++#define for_each_jset_entry_type(entry, jset, type) \ ++ for (entry = (jset)->start; \ ++ (entry = __jset_entry_type_next(jset, entry, type)); \ ++ entry = vstruct_next(entry)) ++ ++#define for_each_jset_key(k, _n, entry, jset) \ ++ for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys) \ ++ vstruct_for_each_safe(entry, k, _n) ++ ++int bch2_journal_read(struct bch_fs *, struct list_head *); ++ ++void bch2_journal_write(struct closure *); ++ ++#endif /* _BCACHEFS_JOURNAL_IO_H */ +diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c +new file mode 100644 +index 000000000000..57591983eebd +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.c +@@ -0,0 +1,644 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "journal.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "replicas.h" ++#include "super.h" ++ ++/* Free space calculations: */ ++ ++static unsigned journal_space_from(struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ switch (from) { ++ case journal_space_discarded: ++ return ja->discard_idx; ++ case journal_space_clean_ondisk: ++ return ja->dirty_idx_ondisk; ++ case journal_space_clean: ++ return ja->dirty_idx; ++ default: ++ BUG(); ++ } ++} ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *j, ++ struct journal_device *ja, ++ enum journal_space_from from) ++{ ++ unsigned available = (journal_space_from(ja, from) - ++ ja->cur_idx - 1 + ja->nr) % ja->nr; ++ ++ /* ++ * Don't use the last bucket unless writing the new last_seq ++ * will make another bucket available: ++ */ ++ if (available && ja->dirty_idx_ondisk == ja->dirty_idx) ++ --available; ++ ++ return available; ++} ++ ++static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) ++{ ++ union journal_preres_state old, new; ++ u64 v = atomic64_read(&j->prereserved.counter); ++ ++ do { ++ old.v = new.v = v; ++ new.remaining = u64s_remaining; ++ } while ((v = 
atomic64_cmpxchg(&j->prereserved.counter, ++ old.v, new.v)) != old.v); ++} ++ ++static struct journal_space { ++ unsigned next_entry; ++ unsigned remaining; ++} __journal_space_available(struct journal *j, unsigned nr_devs_want, ++ enum journal_space_from from) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned sectors_next_entry = UINT_MAX; ++ unsigned sectors_total = UINT_MAX; ++ unsigned i, nr_devs = 0; ++ unsigned unwritten_sectors = j->reservations.prev_buf_unwritten ++ ? journal_prev_buf(j)->sectors ++ : 0; ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ struct journal_device *ja = &ca->journal; ++ unsigned buckets_this_device, sectors_this_device; ++ ++ if (!ja->nr) ++ continue; ++ ++ buckets_this_device = bch2_journal_dev_buckets_available(j, ja, from); ++ sectors_this_device = ja->sectors_free; ++ ++ /* ++ * We that we don't allocate the space for a journal entry ++ * until we write it out - thus, account for it here: ++ */ ++ if (unwritten_sectors >= sectors_this_device) { ++ if (!buckets_this_device) ++ continue; ++ ++ buckets_this_device--; ++ sectors_this_device = ca->mi.bucket_size; ++ } ++ ++ sectors_this_device -= unwritten_sectors; ++ ++ if (sectors_this_device < ca->mi.bucket_size && ++ buckets_this_device) { ++ buckets_this_device--; ++ sectors_this_device = ca->mi.bucket_size; ++ } ++ ++ if (!sectors_this_device) ++ continue; ++ ++ sectors_next_entry = min(sectors_next_entry, ++ sectors_this_device); ++ ++ sectors_total = min(sectors_total, ++ buckets_this_device * ca->mi.bucket_size + ++ sectors_this_device); ++ ++ nr_devs++; ++ } ++ rcu_read_unlock(); ++ ++ if (nr_devs < nr_devs_want) ++ return (struct journal_space) { 0, 0 }; ++ ++ return (struct journal_space) { ++ .next_entry = sectors_next_entry, ++ .remaining = max_t(int, 0, sectors_total - sectors_next_entry), ++ }; ++} ++ ++void bch2_journal_space_available(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ struct journal_space discarded, clean_ondisk, clean; ++ unsigned overhead, u64s_remaining = 0; ++ unsigned max_entry_size = min(j->buf[0].buf_size >> 9, ++ j->buf[1].buf_size >> 9); ++ unsigned i, nr_online = 0, nr_devs_want; ++ bool can_discard = false; ++ int ret = 0; ++ ++ lockdep_assert_held(&j->lock); ++ ++ rcu_read_lock(); ++ for_each_member_device_rcu(ca, c, i, ++ &c->rw_devs[BCH_DATA_journal]) { ++ struct journal_device *ja = &ca->journal; ++ ++ if (!ja->nr) ++ continue; ++ ++ while (ja->dirty_idx != ja->cur_idx && ++ ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) ++ ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; ++ ++ while (ja->dirty_idx_ondisk != ja->dirty_idx && ++ ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ++ ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; ++ ++ if (ja->discard_idx != ja->dirty_idx_ondisk) ++ can_discard = true; ++ ++ max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); ++ nr_online++; ++ } ++ rcu_read_unlock(); ++ ++ j->can_discard = can_discard; ++ ++ if (nr_online < c->opts.metadata_replicas_required) { ++ ret = -EROFS; ++ goto out; ++ } ++ ++ if (!fifo_free(&j->pin)) { ++ ret = -ENOSPC; ++ goto out; ++ } ++ ++ nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); ++ ++ discarded = __journal_space_available(j, nr_devs_want, journal_space_discarded); ++ clean_ondisk = __journal_space_available(j, nr_devs_want, journal_space_clean_ondisk); ++ clean = 
__journal_space_available(j, nr_devs_want, journal_space_clean); ++ ++ if (!discarded.next_entry) ++ ret = -ENOSPC; ++ ++ overhead = DIV_ROUND_UP(clean.remaining, max_entry_size) * ++ journal_entry_overhead(j); ++ u64s_remaining = clean.remaining << 6; ++ u64s_remaining = max_t(int, 0, u64s_remaining - overhead); ++ u64s_remaining /= 4; ++out: ++ j->cur_entry_sectors = !ret ? discarded.next_entry : 0; ++ j->cur_entry_error = ret; ++ journal_set_remaining(j, u64s_remaining); ++ journal_check_may_get_unreserved(j); ++ ++ if (!ret) ++ journal_wake(j); ++} ++ ++/* Discards - last part of journal reclaim: */ ++ ++static bool should_discard_bucket(struct journal *j, struct journal_device *ja) ++{ ++ bool ret; ++ ++ spin_lock(&j->lock); ++ ret = ja->discard_idx != ja->dirty_idx_ondisk; ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* ++ * Advance ja->discard_idx as long as it points to buckets that are no longer ++ * dirty, issuing discards if necessary: ++ */ ++void bch2_journal_do_discards(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned iter; ++ ++ mutex_lock(&j->discard_lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ ++ while (should_discard_bucket(j, ja)) { ++ if (ca->mi.discard && ++ blk_queue_discard(bdev_get_queue(ca->disk_sb.bdev))) ++ blkdev_issue_discard(ca->disk_sb.bdev, ++ bucket_to_sector(ca, ++ ja->buckets[ja->discard_idx]), ++ ca->mi.bucket_size, GFP_NOIO, 0); ++ ++ spin_lock(&j->lock); ++ ja->discard_idx = (ja->discard_idx + 1) % ja->nr; ++ ++ bch2_journal_space_available(j); ++ spin_unlock(&j->lock); ++ } ++ } ++ ++ mutex_unlock(&j->discard_lock); ++} ++ ++/* ++ * Journal entry pinning - machinery for holding a reference on a given journal ++ * entry, holding it open to ensure it gets replayed during recovery: ++ */ ++ ++static void bch2_journal_reclaim_fast(struct journal *j) ++{ ++ struct journal_entry_pin_list temp; ++ bool popped = false; ++ ++ lockdep_assert_held(&j->lock); ++ ++ /* ++ * Unpin journal entries whose reference counts reached zero, meaning ++ * all btree nodes got written out ++ */ ++ while (!fifo_empty(&j->pin) && ++ !atomic_read(&fifo_peek_front(&j->pin).count)) { ++ BUG_ON(!list_empty(&fifo_peek_front(&j->pin).list)); ++ BUG_ON(!fifo_pop(&j->pin, temp)); ++ popped = true; ++ } ++ ++ if (popped) ++ bch2_journal_space_available(j); ++} ++ ++void bch2_journal_pin_put(struct journal *j, u64 seq) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ if (atomic_dec_and_test(&pin_list->count)) { ++ spin_lock(&j->lock); ++ bch2_journal_reclaim_fast(j); ++ spin_unlock(&j->lock); ++ } ++} ++ ++static inline void __journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ struct journal_entry_pin_list *pin_list; ++ ++ if (!journal_pin_active(pin)) ++ return; ++ ++ pin_list = journal_seq_pin(j, pin->seq); ++ pin->seq = 0; ++ list_del_init(&pin->list); ++ ++ /* ++ * Unpinning a journal entry make make journal_next_bucket() succeed, if ++ * writing a new last_seq will now make another bucket available: ++ */ ++ if (atomic_dec_and_test(&pin_list->count) && ++ pin_list == &fifo_peek_front(&j->pin)) ++ bch2_journal_reclaim_fast(j); ++ else if (fifo_used(&j->pin) == 1 && ++ atomic_read(&pin_list->count) == 1) ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_drop(struct journal *j, ++ struct journal_entry_pin *pin) ++{ ++ spin_lock(&j->lock); ++ __journal_pin_drop(j, pin); ++ spin_unlock(&j->lock); ++} ++ 
++static void bch2_journal_pin_add_locked(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); ++ ++ __journal_pin_drop(j, pin); ++ ++ BUG_ON(!atomic_read(&pin_list->count) && seq == journal_last_seq(j)); ++ ++ atomic_inc(&pin_list->count); ++ pin->seq = seq; ++ pin->flush = flush_fn; ++ ++ list_add(&pin->list, flush_fn ? &pin_list->list : &pin_list->flushed); ++} ++ ++void __bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ spin_lock(&j->lock); ++ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); ++ spin_unlock(&j->lock); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_update(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (journal_pin_active(pin) && pin->seq < seq) ++ return; ++ ++ spin_lock(&j->lock); ++ ++ if (pin->seq != seq) { ++ bch2_journal_pin_add_locked(j, seq, pin, flush_fn); ++ } else { ++ struct journal_entry_pin_list *pin_list = ++ journal_seq_pin(j, seq); ++ ++ /* ++ * If the pin is already pinning the right sequence number, it ++ * still might've already been flushed: ++ */ ++ list_move(&pin->list, &pin_list->list); ++ } ++ ++ spin_unlock(&j->lock); ++ ++ /* ++ * If the journal is currently full, we might want to call flush_fn ++ * immediately: ++ */ ++ journal_wake(j); ++} ++ ++void bch2_journal_pin_copy(struct journal *j, ++ struct journal_entry_pin *dst, ++ struct journal_entry_pin *src, ++ journal_pin_flush_fn flush_fn) ++{ ++ spin_lock(&j->lock); ++ ++ if (journal_pin_active(src) && ++ (!journal_pin_active(dst) || src->seq < dst->seq)) ++ bch2_journal_pin_add_locked(j, src->seq, dst, flush_fn); ++ ++ spin_unlock(&j->lock); ++} ++ ++/** ++ * bch2_journal_pin_flush: ensure journal pin callback is no longer running ++ */ ++void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) ++{ ++ BUG_ON(journal_pin_active(pin)); ++ ++ wait_event(j->pin_flush_wait, j->flush_in_progress != pin); ++} ++ ++/* ++ * Journal reclaim: flush references to open journal entries to reclaim space in ++ * the journal ++ * ++ * May be done by the journal code in the background as needed to free up space ++ * for more journal entries, or as part of doing a clean shutdown, or to migrate ++ * data off of a specific device: ++ */ ++ ++static struct journal_entry_pin * ++journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq) ++{ ++ struct journal_entry_pin_list *pin_list; ++ struct journal_entry_pin *ret = NULL; ++ ++ if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags)) ++ return NULL; ++ ++ spin_lock(&j->lock); ++ ++ fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) ++ if (*seq > max_seq || ++ (ret = list_first_entry_or_null(&pin_list->list, ++ struct journal_entry_pin, list))) ++ break; ++ ++ if (ret) { ++ list_move(&ret->list, &pin_list->flushed); ++ BUG_ON(j->flush_in_progress); ++ j->flush_in_progress = ret; ++ j->last_flushed = jiffies; ++ } ++ ++ spin_unlock(&j->lock); ++ ++ return ret; ++} ++ ++/* returns true if we did work */ ++static bool journal_flush_pins(struct journal *j, u64 seq_to_flush, ++ unsigned min_nr) ++{ ++ struct journal_entry_pin *pin; ++ bool ret = false; ++ u64 seq; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ while ((pin = journal_get_next_pin(j, min_nr ++ ? 
U64_MAX : seq_to_flush, &seq))) { ++ if (min_nr) ++ min_nr--; ++ ++ pin->flush(j, pin, seq); ++ ++ BUG_ON(j->flush_in_progress != pin); ++ j->flush_in_progress = NULL; ++ wake_up(&j->pin_flush_wait); ++ ret = true; ++ } ++ ++ return ret; ++} ++ ++/** ++ * bch2_journal_reclaim - free up journal buckets ++ * ++ * Background journal reclaim writes out btree nodes. It should be run ++ * early enough so that we never completely run out of journal buckets. ++ * ++ * High watermarks for triggering background reclaim: ++ * - FIFO has fewer than 512 entries left ++ * - fewer than 25% journal buckets free ++ * ++ * Background reclaim runs until low watermarks are reached: ++ * - FIFO has more than 1024 entries left ++ * - more than 50% journal buckets free ++ * ++ * As long as a reclaim can complete in the time it takes to fill up ++ * 512 journal entries or 25% of all journal buckets, then ++ * journal_next_bucket() should not stall. ++ */ ++void bch2_journal_reclaim(struct journal *j) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct bch_dev *ca; ++ unsigned iter, min_nr = 0; ++ u64 seq_to_flush = 0; ++ ++ lockdep_assert_held(&j->reclaim_lock); ++ ++ bch2_journal_do_discards(j); ++ ++ spin_lock(&j->lock); ++ ++ for_each_rw_member(ca, c, iter) { ++ struct journal_device *ja = &ca->journal; ++ unsigned nr_buckets, bucket_to_flush; ++ ++ if (!ja->nr) ++ continue; ++ ++ /* Try to keep the journal at most half full: */ ++ nr_buckets = ja->nr / 2; ++ ++ /* And include pre-reservations: */ ++ nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, ++ (ca->mi.bucket_size << 6) - ++ journal_entry_overhead(j)); ++ ++ nr_buckets = min(nr_buckets, ja->nr); ++ ++ bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; ++ seq_to_flush = max(seq_to_flush, ++ ja->bucket_seq[bucket_to_flush]); ++ } ++ ++ /* Also flush if the pin fifo is more than half full */ ++ seq_to_flush = max_t(s64, seq_to_flush, ++ (s64) journal_cur_seq(j) - ++ (j->pin.size >> 1)); ++ spin_unlock(&j->lock); ++ ++ /* ++ * If it's been longer than j->reclaim_delay_ms since we last flushed, ++ * make sure to flush at least one journal pin: ++ */ ++ if (time_after(jiffies, j->last_flushed + ++ msecs_to_jiffies(j->reclaim_delay_ms))) ++ min_nr = 1; ++ ++ if (j->prereserved.reserved * 2 > j->prereserved.remaining) { ++ seq_to_flush = max(seq_to_flush, journal_last_seq(j)); ++ min_nr = 1; ++ } ++ ++ journal_flush_pins(j, seq_to_flush, min_nr); ++ ++ if (!bch2_journal_error(j)) ++ queue_delayed_work(c->journal_reclaim_wq, &j->reclaim_work, ++ msecs_to_jiffies(j->reclaim_delay_ms)); ++} ++ ++void bch2_journal_reclaim_work(struct work_struct *work) ++{ ++ struct journal *j = container_of(to_delayed_work(work), ++ struct journal, reclaim_work); ++ ++ mutex_lock(&j->reclaim_lock); ++ bch2_journal_reclaim(j); ++ mutex_unlock(&j->reclaim_lock); ++} ++ ++static int journal_flush_done(struct journal *j, u64 seq_to_flush, ++ bool *did_work) ++{ ++ int ret; ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&j->reclaim_lock); ++ ++ *did_work = journal_flush_pins(j, seq_to_flush, 0); ++ ++ spin_lock(&j->lock); ++ /* ++ * If journal replay hasn't completed, the unreplayed journal entries ++ * hold refs on their corresponding sequence numbers ++ */ ++ ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || ++ journal_last_seq(j) > seq_to_flush || ++ (fifo_used(&j->pin) == 1 && ++ atomic_read(&fifo_peek_front(&j->pin).count) == 1); ++ ++ spin_unlock(&j->lock); ++ mutex_unlock(&j->reclaim_lock); ++ ++ return ret; ++} ++ 
++bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) ++{ ++ bool did_work = false; ++ ++ if (!test_bit(JOURNAL_STARTED, &j->flags)) ++ return false; ++ ++ closure_wait_event(&j->async_wait, ++ journal_flush_done(j, seq_to_flush, &did_work)); ++ ++ return did_work; ++} ++ ++int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) ++{ ++ struct bch_fs *c = container_of(j, struct bch_fs, journal); ++ struct journal_entry_pin_list *p; ++ u64 iter, seq = 0; ++ int ret = 0; ++ ++ spin_lock(&j->lock); ++ fifo_for_each_entry_ptr(p, &j->pin, iter) ++ if (dev_idx >= 0 ++ ? bch2_dev_list_has_dev(p->devs, dev_idx) ++ : p->devs.nr < c->opts.metadata_replicas) ++ seq = iter; ++ spin_unlock(&j->lock); ++ ++ bch2_journal_flush_pins(j, seq); ++ ++ ret = bch2_journal_error(j); ++ if (ret) ++ return ret; ++ ++ mutex_lock(&c->replicas_gc_lock); ++ bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); ++ ++ seq = 0; ++ ++ spin_lock(&j->lock); ++ while (!ret && seq < j->pin.back) { ++ struct bch_replicas_padded replicas; ++ ++ seq = max(seq, journal_last_seq(j)); ++ bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, ++ journal_seq_pin(j, seq)->devs); ++ seq++; ++ ++ spin_unlock(&j->lock); ++ ret = bch2_mark_replicas(c, &replicas.e); ++ spin_lock(&j->lock); ++ } ++ spin_unlock(&j->lock); ++ ++ ret = bch2_replicas_gc_end(c, ret); ++ mutex_unlock(&c->replicas_gc_lock); ++ ++ return ret; ++} +diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h +new file mode 100644 +index 000000000000..8128907a7623 +--- /dev/null ++++ b/fs/bcachefs/journal_reclaim.h +@@ -0,0 +1,69 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_RECLAIM_H ++#define _BCACHEFS_JOURNAL_RECLAIM_H ++ ++#define JOURNAL_PIN (32 * 1024) ++ ++enum journal_space_from { ++ journal_space_discarded, ++ journal_space_clean_ondisk, ++ journal_space_clean, ++}; ++ ++unsigned bch2_journal_dev_buckets_available(struct journal *, ++ struct journal_device *, ++ enum journal_space_from); ++void bch2_journal_space_available(struct journal *); ++ ++static inline bool journal_pin_active(struct journal_entry_pin *pin) ++{ ++ return pin->seq != 0; ++} ++ ++static inline struct journal_entry_pin_list * ++journal_seq_pin(struct journal *j, u64 seq) ++{ ++ EBUG_ON(seq < j->pin.front || seq >= j->pin.back); ++ ++ return &j->pin.data[seq & j->pin.mask]; ++} ++ ++void bch2_journal_pin_put(struct journal *, u64); ++void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); ++ ++void __bch2_journal_pin_add(struct journal *, u64, struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++static inline void bch2_journal_pin_add(struct journal *j, u64 seq, ++ struct journal_entry_pin *pin, ++ journal_pin_flush_fn flush_fn) ++{ ++ if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) ++ __bch2_journal_pin_add(j, seq, pin, flush_fn); ++} ++ ++void bch2_journal_pin_update(struct journal *, u64, ++ struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++void bch2_journal_pin_copy(struct journal *, ++ struct journal_entry_pin *, ++ struct journal_entry_pin *, ++ journal_pin_flush_fn); ++ ++void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); ++ ++void bch2_journal_do_discards(struct journal *); ++void bch2_journal_reclaim(struct journal *); ++void bch2_journal_reclaim_work(struct work_struct *); ++ ++bool bch2_journal_flush_pins(struct journal *, u64); ++ ++static inline bool bch2_journal_flush_all_pins(struct journal *j) ++{ ++ return bch2_journal_flush_pins(j, U64_MAX); ++} 
++ ++int bch2_journal_flush_device_pins(struct journal *, int); ++ ++#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ +diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c +new file mode 100644 +index 000000000000..d0f1bbf8f6a7 +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.c +@@ -0,0 +1,309 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_iter.h" ++#include "eytzinger.h" ++#include "journal_seq_blacklist.h" ++#include "super-io.h" ++ ++/* ++ * journal_seq_blacklist machinery: ++ * ++ * To guarantee order of btree updates after a crash, we need to detect when a ++ * btree node entry (bset) is newer than the newest journal entry that was ++ * successfully written, and ignore it - effectively ignoring any btree updates ++ * that didn't make it into the journal. ++ * ++ * If we didn't do this, we might have two btree nodes, a and b, both with ++ * updates that weren't written to the journal yet: if b was updated after a, ++ * but b was flushed and not a - oops; on recovery we'll find that the updates ++ * to b happened, but not the updates to a that happened before it. ++ * ++ * Ignoring bsets that are newer than the newest journal entry is always safe, ++ * because everything they contain will also have been journalled - and must ++ * still be present in the journal on disk until a journal entry has been ++ * written _after_ that bset was written. ++ * ++ * To accomplish this, bsets record the newest journal sequence number they ++ * contain updates for; then, on startup, the btree code queries the journal ++ * code to ask "Is this sequence number newer than the newest journal entry? If ++ * so, ignore it." ++ * ++ * When this happens, we must blacklist that journal sequence number: the ++ * journal must not write any entries with that sequence number, and it must ++ * record that it was blacklisted so that a) on recovery we don't think we have ++ * missing journal entries and b) so that the btree code continues to ignore ++ * that bset, until that btree node is rewritten. 
++ */ ++ ++static unsigned sb_blacklist_u64s(unsigned nr) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ ++ return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); ++} ++ ++static struct bch_sb_field_journal_seq_blacklist * ++blacklist_entry_try_merge(struct bch_fs *c, ++ struct bch_sb_field_journal_seq_blacklist *bl, ++ unsigned i) ++{ ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ if (le64_to_cpu(bl->start[i].end) >= ++ le64_to_cpu(bl->start[i + 1].start)) { ++ bl->start[i].end = bl->start[i + 1].end; ++ --nr; ++ memmove(&bl->start[i], ++ &bl->start[i + 1], ++ sizeof(bl->start[0]) * (nr - i)); ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr)); ++ BUG_ON(!bl); ++ } ++ ++ return bl; ++} ++ ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ unsigned i, nr; ++ int ret = 0; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ nr = blacklist_nr_entries(bl); ++ ++ if (bl) { ++ for (i = 0; i < nr; i++) { ++ struct journal_seq_blacklist_entry *e = ++ bl->start + i; ++ ++ if (start == le64_to_cpu(e->start) && ++ end == le64_to_cpu(e->end)) ++ goto out; ++ ++ if (start <= le64_to_cpu(e->start) && ++ end >= le64_to_cpu(e->end)) { ++ e->start = cpu_to_le64(start); ++ e->end = cpu_to_le64(end); ++ ++ if (i + 1 < nr) ++ bl = blacklist_entry_try_merge(c, ++ bl, i); ++ if (i) ++ bl = blacklist_entry_try_merge(c, ++ bl, i - 1); ++ goto out_write_sb; ++ } ++ } ++ } ++ ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ sb_blacklist_u64s(nr + 1)); ++ if (!bl) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ bl->start[nr].start = cpu_to_le64(start); ++ bl->start[nr].end = cpu_to_le64(end); ++out_write_sb: ++ c->disk_sb.sb->features[0] |= ++ 1ULL << BCH_FEATURE_journal_seq_blacklist_v3; ++ ++ ret = bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static int journal_seq_blacklist_table_cmp(const void *_l, ++ const void *_r, size_t size) ++{ ++ const struct journal_seq_blacklist_table_entry *l = _l; ++ const struct journal_seq_blacklist_table_entry *r = _r; ++ ++ return cmp_int(l->start, r->start); ++} ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, ++ bool dirty) ++{ ++ struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; ++ struct journal_seq_blacklist_table_entry search = { .start = seq }; ++ int idx; ++ ++ if (!t) ++ return false; ++ ++ idx = eytzinger0_find_le(t->entries, t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ &search); ++ if (idx < 0) ++ return false; ++ ++ BUG_ON(t->entries[idx].start > seq); ++ ++ if (seq >= t->entries[idx].end) ++ return false; ++ ++ if (dirty) ++ t->entries[idx].dirty = true; ++ return true; ++} ++ ++int bch2_blacklist_table_initialize(struct bch_fs *c) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ struct journal_seq_blacklist_table *t; ++ unsigned i, nr = blacklist_nr_entries(bl); ++ ++ BUG_ON(c->journal_seq_blacklist_table); ++ ++ if (!bl) ++ return 0; ++ ++ t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, ++ GFP_KERNEL); ++ if (!t) ++ return -ENOMEM; ++ ++ t->nr = nr; ++ ++ for (i = 0; i < nr; i++) { ++ t->entries[i].start = le64_to_cpu(bl->start[i].start); ++ t->entries[i].end = le64_to_cpu(bl->start[i].end); ++ } ++ ++ eytzinger0_sort(t->entries, ++ t->nr, ++ sizeof(t->entries[0]), ++ journal_seq_blacklist_table_cmp, ++ NULL); 
++ ++ c->journal_seq_blacklist_table = t; ++ return 0; ++} ++ ++static const char * ++bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ struct journal_seq_blacklist_entry *i; ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ for (i = bl->start; i < bl->start + nr; i++) { ++ if (le64_to_cpu(i->start) >= ++ le64_to_cpu(i->end)) ++ return "entry start >= end"; ++ ++ if (i + 1 < bl->start + nr && ++ le64_to_cpu(i[0].end) > ++ le64_to_cpu(i[1].start)) ++ return "entries out of order"; ++ } ++ ++ return NULL; ++} ++ ++static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal_seq_blacklist *bl = ++ field_to_type(f, journal_seq_blacklist); ++ struct journal_seq_blacklist_entry *i; ++ unsigned nr = blacklist_nr_entries(bl); ++ ++ for (i = bl->start; i < bl->start + nr; i++) { ++ if (i != bl->start) ++ pr_buf(out, " "); ++ ++ pr_buf(out, "%llu-%llu", ++ le64_to_cpu(i->start), ++ le64_to_cpu(i->end)); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { ++ .validate = bch2_sb_journal_seq_blacklist_validate, ++ .to_text = bch2_sb_journal_seq_blacklist_to_text ++}; ++ ++void bch2_blacklist_entries_gc(struct work_struct *work) ++{ ++ struct bch_fs *c = container_of(work, struct bch_fs, ++ journal_seq_blacklist_gc_work); ++ struct journal_seq_blacklist_table *t; ++ struct bch_sb_field_journal_seq_blacklist *bl; ++ struct journal_seq_blacklist_entry *src, *dst; ++ struct btree_trans trans; ++ unsigned i, nr, new_nr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_iter *iter; ++ struct btree *b; ++ ++ for_each_btree_node(&trans, iter, i, POS_MIN, ++ BTREE_ITER_PREFETCH, b) ++ if (test_bit(BCH_FS_STOPPING, &c->flags)) { ++ bch2_trans_exit(&trans); ++ return; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ ret = bch2_trans_exit(&trans); ++ if (ret) ++ return; ++ ++ mutex_lock(&c->sb_lock); ++ bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); ++ if (!bl) ++ goto out; ++ ++ nr = blacklist_nr_entries(bl); ++ dst = bl->start; ++ ++ t = c->journal_seq_blacklist_table; ++ BUG_ON(nr != t->nr); ++ ++ for (src = bl->start, i = eytzinger0_first(t->nr); ++ src < bl->start + nr; ++ src++, i = eytzinger0_next(i, nr)) { ++ BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); ++ BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); ++ ++ if (t->entries[i].dirty) ++ *dst++ = *src; ++ } ++ ++ new_nr = dst - bl->start; ++ ++ bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); ++ ++ if (new_nr != nr) { ++ bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, ++ new_nr ? sb_blacklist_u64s(new_nr) : 0); ++ BUG_ON(new_nr && !bl); ++ ++ if (!new_nr) ++ c->disk_sb.sb->features[0] &= ++ ~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); ++ ++ bch2_write_super(c); ++ } ++out: ++ mutex_unlock(&c->sb_lock); ++} +diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h +new file mode 100644 +index 000000000000..afb886ec8e25 +--- /dev/null ++++ b/fs/bcachefs/journal_seq_blacklist.h +@@ -0,0 +1,22 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H ++ ++static inline unsigned ++blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) ++{ ++ return bl ++ ? 
((vstruct_end(&bl->field) - (void *) &bl->start[0]) / ++ sizeof(struct journal_seq_blacklist_entry)) ++ : 0; ++} ++ ++bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); ++int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); ++int bch2_blacklist_table_initialize(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; ++ ++void bch2_blacklist_entries_gc(struct work_struct *); ++ ++#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ +diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h +new file mode 100644 +index 000000000000..154b51b891d3 +--- /dev/null ++++ b/fs/bcachefs/journal_types.h +@@ -0,0 +1,277 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_JOURNAL_TYPES_H ++#define _BCACHEFS_JOURNAL_TYPES_H ++ ++#include ++#include ++ ++#include "alloc_types.h" ++#include "super_types.h" ++#include "fifo.h" ++ ++struct journal_res; ++ ++/* ++ * We put two of these in struct journal; we used them for writes to the ++ * journal that are being staged or in flight. ++ */ ++struct journal_buf { ++ struct jset *data; ++ ++ BKEY_PADDED(key); ++ ++ struct closure_waitlist wait; ++ ++ unsigned buf_size; /* size in bytes of @data */ ++ unsigned sectors; /* maximum size for current entry */ ++ unsigned disk_sectors; /* maximum size entry could have been, if ++ buf_size was bigger */ ++ unsigned u64s_reserved; ++ /* bloom filter: */ ++ unsigned long has_inode[1024 / sizeof(unsigned long)]; ++}; ++ ++/* ++ * Something that makes a journal entry dirty - i.e. a btree node that has to be ++ * flushed: ++ */ ++ ++struct journal_entry_pin_list { ++ struct list_head list; ++ struct list_head flushed; ++ atomic_t count; ++ struct bch_devs_list devs; ++}; ++ ++struct journal; ++struct journal_entry_pin; ++typedef void (*journal_pin_flush_fn)(struct journal *j, ++ struct journal_entry_pin *, u64); ++ ++struct journal_entry_pin { ++ struct list_head list; ++ journal_pin_flush_fn flush; ++ u64 seq; ++}; ++ ++struct journal_res { ++ bool ref; ++ u8 idx; ++ u16 u64s; ++ u32 offset; ++ u64 seq; ++}; ++ ++/* ++ * For reserving space in the journal prior to getting a reservation on a ++ * particular journal entry: ++ */ ++struct journal_preres { ++ unsigned u64s; ++}; ++ ++union journal_res_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u64 cur_entry_offset:20, ++ idx:1, ++ prev_buf_unwritten:1, ++ buf0_count:21, ++ buf1_count:21; ++ }; ++}; ++ ++union journal_preres_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ u32 reserved; ++ u32 remaining; ++ }; ++}; ++ ++/* bytes: */ ++#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ ++#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ ++ ++/* ++ * We stash some journal state as sentinal values in cur_entry_offset: ++ * note - cur_entry_offset is in units of u64s ++ */ ++#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) ++ ++#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) ++#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) ++ ++/* ++ * JOURNAL_NEED_WRITE - current (pending) journal entry should be written ASAP, ++ * either because something's waiting on the write to complete or because it's ++ * been dirty too long and the timer's expired. 
++ */ ++ ++enum { ++ JOURNAL_REPLAY_DONE, ++ JOURNAL_STARTED, ++ JOURNAL_RECLAIM_STARTED, ++ JOURNAL_NEED_WRITE, ++ JOURNAL_NOT_EMPTY, ++ JOURNAL_MAY_GET_UNRESERVED, ++}; ++ ++/* Embedded in struct bch_fs */ ++struct journal { ++ /* Fastpath stuff up front: */ ++ ++ unsigned long flags; ++ ++ union journal_res_state reservations; ++ ++ /* Max size of current journal entry */ ++ unsigned cur_entry_u64s; ++ unsigned cur_entry_sectors; ++ ++ /* ++ * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if ++ * insufficient devices: ++ */ ++ int cur_entry_error; ++ ++ union journal_preres_state prereserved; ++ ++ /* Reserved space in journal entry to be used just prior to write */ ++ unsigned entry_u64s_reserved; ++ ++ unsigned buf_size_want; ++ ++ /* ++ * Two journal entries -- one is currently open for new entries, the ++ * other is possibly being written out. ++ */ ++ struct journal_buf buf[2]; ++ ++ spinlock_t lock; ++ ++ /* if nonzero, we may not open a new journal entry: */ ++ unsigned blocked; ++ ++ /* Used when waiting because the journal was full */ ++ wait_queue_head_t wait; ++ struct closure_waitlist async_wait; ++ struct closure_waitlist preres_wait; ++ ++ struct closure io; ++ struct delayed_work write_work; ++ ++ /* Sequence number of most recent journal entry (last entry in @pin) */ ++ atomic64_t seq; ++ ++ /* seq, last_seq from the most recent journal entry successfully written */ ++ u64 seq_ondisk; ++ u64 last_seq_ondisk; ++ ++ /* ++ * FIFO of journal entries whose btree updates have not yet been ++ * written out. ++ * ++ * Each entry is a reference count. The position in the FIFO is the ++ * entry's sequence number relative to @seq. ++ * ++ * The journal entry itself holds a reference count, put when the ++ * journal entry is written out. Each btree node modified by the journal ++ * entry also holds a reference count, put when the btree node is ++ * written. ++ * ++ * When a reference count reaches zero, the journal entry is no longer ++ * needed. When all journal entries in the oldest journal bucket are no ++ * longer needed, the bucket can be discarded and reused. ++ */ ++ struct { ++ u64 front, back, size, mask; ++ struct journal_entry_pin_list *data; ++ } pin; ++ ++ u64 replay_journal_seq; ++ u64 replay_journal_seq_end; ++ ++ struct write_point wp; ++ spinlock_t err_lock; ++ ++ struct delayed_work reclaim_work; ++ struct mutex reclaim_lock; ++ unsigned long last_flushed; ++ struct journal_entry_pin *flush_in_progress; ++ wait_queue_head_t pin_flush_wait; ++ ++ /* protects advancing ja->discard_idx: */ ++ struct mutex discard_lock; ++ bool can_discard; ++ ++ unsigned write_delay_ms; ++ unsigned reclaim_delay_ms; ++ ++ u64 res_get_blocked_start; ++ u64 need_write_time; ++ u64 write_start_time; ++ ++ struct time_stats *write_time; ++ struct time_stats *delay_time; ++ struct time_stats *blocked_time; ++ struct time_stats *flush_seq_time; ++ ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map res_map; ++#endif ++}; ++ ++/* ++ * Embedded in struct bch_dev. First three fields refer to the array of journal ++ * buckets, in bch_sb. ++ */ ++struct journal_device { ++ /* ++ * For each journal bucket, contains the max sequence number of the ++ * journal writes it contains - so we know when a bucket can be reused. 
++ */ ++ u64 *bucket_seq; ++ ++ unsigned sectors_free; ++ ++ /* ++ * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: ++ */ ++ unsigned discard_idx; /* Next bucket to discard */ ++ unsigned dirty_idx_ondisk; ++ unsigned dirty_idx; ++ unsigned cur_idx; /* Journal bucket we're currently writing to */ ++ unsigned nr; ++ ++ u64 *buckets; ++ ++ /* Bio for journal reads/writes to this device */ ++ struct bio *bio; ++ ++ /* for bch_journal_read_device */ ++ struct closure read; ++}; ++ ++/* ++ * journal_entry_res - reserve space in every journal entry: ++ */ ++struct journal_entry_res { ++ unsigned u64s; ++}; ++ ++#endif /* _BCACHEFS_JOURNAL_TYPES_H */ +diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c +new file mode 100644 +index 000000000000..864dfaa67b7a +--- /dev/null ++++ b/fs/bcachefs/keylist.c +@@ -0,0 +1,67 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "keylist.h" ++ ++int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, ++ size_t nr_inline_u64s, size_t new_u64s) ++{ ++ size_t oldsize = bch2_keylist_u64s(l); ++ size_t newsize = oldsize + new_u64s; ++ u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; ++ u64 *new_keys; ++ ++ newsize = roundup_pow_of_two(newsize); ++ ++ if (newsize <= nr_inline_u64s || ++ (old_buf && roundup_pow_of_two(oldsize) == newsize)) ++ return 0; ++ ++ new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOIO); ++ if (!new_keys) ++ return -ENOMEM; ++ ++ if (!old_buf) ++ memcpy_u64s(new_keys, inline_u64s, oldsize); ++ ++ l->keys_p = new_keys; ++ l->top_p = new_keys + oldsize; ++ ++ return 0; ++} ++ ++void bch2_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) ++{ ++ struct bkey_i *where; ++ ++ for_each_keylist_key(l, where) ++ if (bkey_cmp(insert->k.p, where->k.p) < 0) ++ break; ++ ++ memmove_u64s_up((u64 *) where + insert->k.u64s, ++ where, ++ ((u64 *) l->top) - ((u64 *) where)); ++ ++ l->top_p += insert->k.u64s; ++ bkey_copy(where, insert); ++} ++ ++void bch2_keylist_pop_front(struct keylist *l) ++{ ++ l->top_p -= bch2_keylist_front(l)->k.u64s; ++ ++ memmove_u64s_down(l->keys, ++ bkey_next(l->keys), ++ bch2_keylist_u64s(l)); ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *l) ++{ ++ struct bkey_i *k; ++ ++ for_each_keylist_key(l, k) ++ BUG_ON(bkey_next(k) != l->top && ++ bkey_cmp(k->k.p, bkey_next(k)->k.p) >= 0); ++} ++#endif +diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h +new file mode 100644 +index 000000000000..195799bb20bc +--- /dev/null ++++ b/fs/bcachefs/keylist.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_H ++#define _BCACHEFS_KEYLIST_H ++ ++#include "keylist_types.h" ++ ++int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); ++void bch2_keylist_add_in_order(struct keylist *, struct bkey_i *); ++void bch2_keylist_pop_front(struct keylist *); ++ ++static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) ++{ ++ l->top_p = l->keys_p = inline_keys; ++} ++ ++static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) ++{ ++ if (l->keys_p != inline_keys) ++ kfree(l->keys_p); ++ bch2_keylist_init(l, inline_keys); ++} ++ ++static inline void bch2_keylist_push(struct keylist *l) ++{ ++ l->top = bkey_next(l->top); ++} ++ ++static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) ++{ ++ bkey_copy(l->top, k); ++ bch2_keylist_push(l); ++} ++ ++static inline bool bch2_keylist_empty(struct keylist *l) ++{ ++ return l->top == 
l->keys; ++} ++ ++static inline size_t bch2_keylist_u64s(struct keylist *l) ++{ ++ return l->top_p - l->keys_p; ++} ++ ++static inline size_t bch2_keylist_bytes(struct keylist *l) ++{ ++ return bch2_keylist_u64s(l) * sizeof(u64); ++} ++ ++static inline struct bkey_i *bch2_keylist_front(struct keylist *l) ++{ ++ return l->keys; ++} ++ ++#define for_each_keylist_key(_keylist, _k) \ ++ for (_k = (_keylist)->keys; \ ++ _k != (_keylist)->top; \ ++ _k = bkey_next(_k)) ++ ++static inline u64 keylist_sectors(struct keylist *keys) ++{ ++ struct bkey_i *k; ++ u64 ret = 0; ++ ++ for_each_keylist_key(keys, k) ++ ret += k->k.size; ++ ++ return ret; ++} ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++void bch2_verify_keylist_sorted(struct keylist *); ++#else ++static inline void bch2_verify_keylist_sorted(struct keylist *l) {} ++#endif ++ ++#endif /* _BCACHEFS_KEYLIST_H */ +diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h +new file mode 100644 +index 000000000000..4b3ff7d8a875 +--- /dev/null ++++ b/fs/bcachefs/keylist_types.h +@@ -0,0 +1,16 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_KEYLIST_TYPES_H ++#define _BCACHEFS_KEYLIST_TYPES_H ++ ++struct keylist { ++ union { ++ struct bkey_i *keys; ++ u64 *keys_p; ++ }; ++ union { ++ struct bkey_i *top; ++ u64 *top_p; ++ }; ++}; ++ ++#endif /* _BCACHEFS_KEYLIST_TYPES_H */ +diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c +new file mode 100644 +index 000000000000..96c8690adc5b +--- /dev/null ++++ b/fs/bcachefs/migrate.c +@@ -0,0 +1,170 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Code for moving data off a device. ++ */ ++ ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "extents.h" ++#include "io.h" ++#include "journal.h" ++#include "keylist.h" ++#include "migrate.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, ++ unsigned dev_idx, int flags, bool metadata) ++{ ++ unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; ++ unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; ++ unsigned degraded = metadata ? 
BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; ++ unsigned nr_good; ++ ++ bch2_bkey_drop_device(k, dev_idx); ++ ++ nr_good = bch2_bkey_durability(c, k.s_c); ++ if ((!nr_good && !(flags & lost)) || ++ (nr_good < replicas && !(flags & degraded))) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int __bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags, ++ enum btree_id btree_id) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_on_stack sk; ++ int ret = 0; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, POS_MIN, ++ BTREE_ITER_PREFETCH); ++ ++ while ((k = bch2_btree_iter_peek(iter)).k && ++ !(ret = bkey_err(k))) { ++ if (!bch2_bkey_has_device(k, dev_idx)) { ++ bch2_btree_iter_next(iter); ++ continue; ++ } ++ ++ bkey_on_stack_reassemble(&sk, c, k); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(sk.k), ++ dev_idx, flags, false); ++ if (ret) ++ break; ++ ++ /* ++ * If the new extent no longer has any pointers, bch2_extent_normalize() ++ * will do the appropriate thing with it (turning it into a ++ * KEY_TYPE_error key, or just a discard if it was a cached extent) ++ */ ++ bch2_extent_normalize(c, bkey_i_to_s(sk.k)); ++ ++ bch2_btree_iter_set_pos(iter, bkey_start_pos(&sk.k->k)); ++ ++ bch2_trans_update(&trans, iter, sk.k, 0); ++ ++ ret = bch2_trans_commit(&trans, NULL, NULL, ++ BTREE_INSERT_NOFAIL); ++ ++ /* ++ * don't want to leave ret == -EINTR, since if we raced and ++ * something else overwrote the key we could spuriously return ++ * -EINTR below: ++ */ ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&sk, c); ++ ++ BUG_ON(ret == -EINTR); ++ ++ return ret; ++} ++ ++static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ return __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_EXTENTS) ?: ++ __bch2_dev_usrdata_drop(c, dev_idx, flags, BTREE_ID_REFLINK); ++} ++ ++static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct closure cl; ++ struct btree *b; ++ unsigned id; ++ int ret; ++ ++ /* don't handle this yet: */ ++ if (flags & BCH_FORCE_IF_METADATA_LOST) ++ return -EINVAL; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ closure_init_stack(&cl); ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ for_each_btree_node(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH, b) { ++ __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; ++retry: ++ if (!bch2_bkey_has_device(bkey_i_to_s_c(&b->key), ++ dev_idx)) ++ continue; ++ ++ bkey_copy(&tmp.k, &b->key); ++ ++ ret = drop_dev_ptrs(c, bkey_i_to_s(&tmp.k), ++ dev_idx, flags, true); ++ if (ret) { ++ bch_err(c, "Cannot drop device without losing data"); ++ goto err; ++ } ++ ++ ret = bch2_btree_node_update_key(c, iter, b, &tmp.k); ++ if (ret == -EINTR) { ++ b = bch2_btree_iter_peek_node(iter); ++ goto retry; ++ } ++ if (ret) { ++ bch_err(c, "Error updating btree node key: %i", ret); ++ goto err; ++ } ++ } ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ /* flush relevant btree updates */ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ ++ ret = 0; ++err: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ BUG_ON(ret == -EINTR); ++ ++ return ret; ++} ++ ++int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) ++{ ++ return bch2_dev_usrdata_drop(c, 
dev_idx, flags) ?: ++ bch2_dev_metadata_drop(c, dev_idx, flags); ++} +diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h +new file mode 100644 +index 000000000000..027efaa0d575 +--- /dev/null ++++ b/fs/bcachefs/migrate.h +@@ -0,0 +1,7 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MIGRATE_H ++#define _BCACHEFS_MIGRATE_H ++ ++int bch2_dev_data_drop(struct bch_fs *, unsigned, int); ++ ++#endif /* _BCACHEFS_MIGRATE_H */ +diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c +new file mode 100644 +index 000000000000..62dcac79ed06 +--- /dev/null ++++ b/fs/bcachefs/move.c +@@ -0,0 +1,826 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "bkey_on_stack.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "disk_groups.h" ++#include "inode.h" ++#include "io.h" ++#include "journal_reclaim.h" ++#include "move.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "keylist.h" ++ ++#include ++#include ++ ++#include ++ ++#define SECTORS_IN_FLIGHT_PER_DEVICE 2048 ++ ++struct moving_io { ++ struct list_head list; ++ struct closure cl; ++ bool read_completed; ++ ++ unsigned read_sectors; ++ unsigned write_sectors; ++ ++ struct bch_read_bio rbio; ++ ++ struct migrate_write write; ++ /* Must be last since it is variable size */ ++ struct bio_vec bi_inline_vecs[0]; ++}; ++ ++struct moving_context { ++ /* Closure for waiting on all reads and writes to complete */ ++ struct closure cl; ++ ++ struct bch_move_stats *stats; ++ ++ struct list_head reads; ++ ++ /* in flight sectors: */ ++ atomic_t read_sectors; ++ atomic_t write_sectors; ++ ++ wait_queue_head_t wait; ++}; ++ ++static int bch2_migrate_index_update(struct bch_write_op *op) ++{ ++ struct bch_fs *c = op->c; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct migrate_write *m = ++ container_of(op, struct migrate_write, op); ++ struct keylist *keys = &op->insert_keys; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++ ++ iter = bch2_trans_get_iter(&trans, m->btree_id, ++ bkey_start_pos(&bch2_keylist_front(keys)->k), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ ++ while (1) { ++ struct bkey_s_c k; ++ struct bkey_i *insert; ++ struct bkey_i_extent *new; ++ BKEY_PADDED(k) _new, _insert; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ bool did_work = false; ++ int nr; ++ ++ bch2_trans_reset(&trans, 0); ++ ++ k = bch2_btree_iter_peek_slot(iter); ++ ret = bkey_err(k); ++ if (ret) { ++ if (ret == -EINTR) ++ continue; ++ break; ++ } ++ ++ new = bkey_i_to_extent(bch2_keylist_front(keys)); ++ ++ if (bversion_cmp(k.k->version, new->k.version) || ++ !bch2_bkey_matches_ptr(c, k, m->ptr, m->offset)) ++ goto nomatch; ++ ++ bkey_reassemble(&_insert.k, k); ++ insert = &_insert.k; ++ ++ bkey_copy(&_new.k, bch2_keylist_front(keys)); ++ new = bkey_i_to_extent(&_new.k); ++ bch2_cut_front(iter->pos, &new->k_i); ++ ++ bch2_cut_front(iter->pos, insert); ++ bch2_cut_back(new->k.p, insert); ++ bch2_cut_back(insert->k.p, &new->k_i); ++ ++ if (m->data_cmd == DATA_REWRITE) { ++ struct bch_extent_ptr *new_ptr, *old_ptr = (void *) ++ bch2_bkey_has_device(bkey_i_to_s_c(insert), ++ m->data_opts.rewrite_dev); ++ if (!old_ptr) ++ goto nomatch; ++ ++ if (old_ptr->cached) ++ extent_for_each_ptr(extent_i_to_s(new), new_ptr) ++ new_ptr->cached = true; ++ ++ bch2_bkey_drop_ptr(bkey_i_to_s(insert), old_ptr); ++ } ++ ++ extent_for_each_ptr_decode(extent_i_to_s(new), p, 
entry) { ++ if (bch2_bkey_has_device(bkey_i_to_s_c(insert), p.ptr.dev)) { ++ /* ++ * raced with another move op? extent already ++ * has a pointer to the device we just wrote ++ * data to ++ */ ++ continue; ++ } ++ ++ bch2_extent_ptr_decoded_append(insert, &p); ++ did_work = true; ++ } ++ ++ if (!did_work) ++ goto nomatch; ++ ++ bch2_bkey_narrow_crcs(insert, ++ (struct bch_extent_crc_unpacked) { 0 }); ++ bch2_extent_normalize(c, bkey_i_to_s(insert)); ++ bch2_bkey_mark_replicas_cached(c, bkey_i_to_s(insert), ++ op->opts.background_target, ++ op->opts.data_replicas); ++ ++ /* ++ * If we're not fully overwriting @k, and it's compressed, we ++ * need a reservation for all the pointers in @insert ++ */ ++ nr = bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(insert)) - ++ m->nr_ptrs_reserved; ++ ++ if (insert->k.size < k.k->size && ++ bch2_bkey_sectors_compressed(k) && ++ nr > 0) { ++ ret = bch2_disk_reservation_add(c, &op->res, ++ keylist_sectors(keys) * nr, 0); ++ if (ret) ++ goto out; ++ ++ m->nr_ptrs_reserved += nr; ++ goto next; ++ } ++ ++ bch2_trans_update(&trans, iter, insert, 0); ++ ++ ret = bch2_trans_commit(&trans, &op->res, ++ op_journal_seq(op), ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ m->data_opts.btree_insert_flags); ++ if (!ret) ++ atomic_long_inc(&c->extent_migrate_done); ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ break; ++next: ++ while (bkey_cmp(iter->pos, bch2_keylist_front(keys)->k.p) >= 0) { ++ bch2_keylist_pop_front(keys); ++ if (bch2_keylist_empty(keys)) ++ goto out; ++ } ++ continue; ++nomatch: ++ if (m->ctxt) { ++ BUG_ON(k.k->p.offset <= iter->pos.offset); ++ atomic64_inc(&m->ctxt->stats->keys_raced); ++ atomic64_add(k.k->p.offset - iter->pos.offset, ++ &m->ctxt->stats->sectors_raced); ++ } ++ atomic_long_inc(&c->extent_migrate_raced); ++ trace_move_race(&new->k); ++ bch2_btree_iter_next_slot(iter); ++ goto next; ++ } ++out: ++ bch2_trans_exit(&trans); ++ BUG_ON(ret == -EINTR); ++ return ret; ++} ++ ++void bch2_migrate_read_done(struct migrate_write *m, struct bch_read_bio *rbio) ++{ ++ /* write bio must own pages: */ ++ BUG_ON(!m->op.wbio.bio.bi_vcnt); ++ ++ m->ptr = rbio->pick.ptr; ++ m->offset = rbio->pos.offset - rbio->pick.crc.offset; ++ m->op.devs_have = rbio->devs_have; ++ m->op.pos = rbio->pos; ++ m->op.version = rbio->version; ++ m->op.crc = rbio->pick.crc; ++ m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; ++ ++ if (bch2_csum_type_is_encryption(m->op.crc.csum_type)) { ++ m->op.nonce = m->op.crc.nonce + m->op.crc.offset; ++ m->op.csum_type = m->op.crc.csum_type; ++ } ++ ++ if (m->data_cmd == DATA_REWRITE) ++ bch2_dev_list_drop_dev(&m->op.devs_have, m->data_opts.rewrite_dev); ++} ++ ++int bch2_migrate_write_init(struct bch_fs *c, struct migrate_write *m, ++ struct write_point_specifier wp, ++ struct bch_io_opts io_opts, ++ enum data_cmd data_cmd, ++ struct data_opts data_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ int ret; ++ ++ m->btree_id = btree_id; ++ m->data_cmd = data_cmd; ++ m->data_opts = data_opts; ++ m->nr_ptrs_reserved = 0; ++ ++ bch2_write_op_init(&m->op, c, io_opts); ++ ++ if (!bch2_bkey_is_incompressible(k)) ++ m->op.compression_type = ++ bch2_compression_opt_to_type[io_opts.background_compression ?: ++ io_opts.compression]; ++ else ++ m->op.incompressible = true; ++ ++ m->op.target = data_opts.target, ++ m->op.write_point = wp; ++ ++ if (m->data_opts.btree_insert_flags & 
BTREE_INSERT_USE_RESERVE) { ++ m->op.alloc_reserve = RESERVE_MOVINGGC; ++ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; ++ } else { ++ /* XXX: this should probably be passed in */ ++ m->op.flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; ++ } ++ ++ m->op.flags |= BCH_WRITE_PAGES_STABLE| ++ BCH_WRITE_PAGES_OWNED| ++ BCH_WRITE_DATA_ENCODED| ++ BCH_WRITE_FROM_INTERNAL; ++ ++ m->op.nr_replicas = 1; ++ m->op.nr_replicas_required = 1; ++ m->op.index_update_fn = bch2_migrate_index_update; ++ ++ switch (data_cmd) { ++ case DATA_ADD_REPLICAS: { ++ /* ++ * DATA_ADD_REPLICAS is used for moving data to a different ++ * device in the background, and due to compression the new copy ++ * might take up more space than the old copy: ++ */ ++#if 0 ++ int nr = (int) io_opts.data_replicas - ++ bch2_bkey_nr_ptrs_allocated(k); ++#endif ++ int nr = (int) io_opts.data_replicas; ++ ++ if (nr > 0) { ++ m->op.nr_replicas = m->nr_ptrs_reserved = nr; ++ ++ ret = bch2_disk_reservation_get(c, &m->op.res, ++ k.k->size, m->op.nr_replicas, 0); ++ if (ret) ++ return ret; ++ } ++ break; ++ } ++ case DATA_REWRITE: { ++ unsigned compressed_sectors = 0; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (p.ptr.dev == data_opts.rewrite_dev && ++ !p.ptr.cached && ++ crc_is_compressed(p.crc)) ++ compressed_sectors += p.crc.compressed_size; ++ ++ if (compressed_sectors) { ++ ret = bch2_disk_reservation_add(c, &m->op.res, ++ k.k->size * m->op.nr_replicas, ++ BCH_DISK_RESERVATION_NOFAIL); ++ if (ret) ++ return ret; ++ } ++ break; ++ } ++ case DATA_PROMOTE: ++ m->op.flags |= BCH_WRITE_ALLOC_NOWAIT; ++ m->op.flags |= BCH_WRITE_CACHED; ++ break; ++ default: ++ BUG(); ++ } ++ ++ return 0; ++} ++ ++static void move_free(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ struct moving_context *ctxt = io->write.ctxt; ++ struct bvec_iter_all iter; ++ struct bio_vec *bv; ++ ++ bch2_disk_reservation_put(io->write.op.c, &io->write.op.res); ++ ++ bio_for_each_segment_all(bv, &io->write.op.wbio.bio, iter) ++ if (bv->bv_page) ++ __free_page(bv->bv_page); ++ ++ wake_up(&ctxt->wait); ++ ++ kfree(io); ++} ++ ++static void move_write_done(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ ++ atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); ++ closure_return_with_destructor(cl, move_free); ++} ++ ++static void move_write(struct closure *cl) ++{ ++ struct moving_io *io = container_of(cl, struct moving_io, cl); ++ ++ if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { ++ closure_return_with_destructor(cl, move_free); ++ return; ++ } ++ ++ bch2_migrate_read_done(&io->write, &io->rbio); ++ ++ atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); ++ closure_call(&io->write.op.cl, bch2_write, NULL, cl); ++ continue_at(cl, move_write_done, NULL); ++} ++ ++static inline struct moving_io *next_pending_write(struct moving_context *ctxt) ++{ ++ struct moving_io *io = ++ list_first_entry_or_null(&ctxt->reads, struct moving_io, list); ++ ++ return io && io->read_completed ? 
io : NULL; ++} ++ ++static void move_read_endio(struct bio *bio) ++{ ++ struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); ++ struct moving_context *ctxt = io->write.ctxt; ++ ++ atomic_sub(io->read_sectors, &ctxt->read_sectors); ++ io->read_completed = true; ++ ++ if (next_pending_write(ctxt)) ++ wake_up(&ctxt->wait); ++ ++ closure_put(&ctxt->cl); ++} ++ ++static void do_pending_writes(struct moving_context *ctxt) ++{ ++ struct moving_io *io; ++ ++ while ((io = next_pending_write(ctxt))) { ++ list_del(&io->list); ++ closure_call(&io->cl, move_write, NULL, &ctxt->cl); ++ } ++} ++ ++#define move_ctxt_wait_event(_ctxt, _cond) \ ++do { \ ++ do_pending_writes(_ctxt); \ ++ \ ++ if (_cond) \ ++ break; \ ++ __wait_event((_ctxt)->wait, \ ++ next_pending_write(_ctxt) || (_cond)); \ ++} while (1) ++ ++static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt) ++{ ++ unsigned sectors_pending = atomic_read(&ctxt->write_sectors); ++ ++ move_ctxt_wait_event(ctxt, ++ !atomic_read(&ctxt->write_sectors) || ++ atomic_read(&ctxt->write_sectors) != sectors_pending); ++} ++ ++static int bch2_move_extent(struct btree_trans *trans, ++ struct moving_context *ctxt, ++ struct write_point_specifier wp, ++ struct bch_io_opts io_opts, ++ enum btree_id btree_id, ++ struct bkey_s_c k, ++ enum data_cmd data_cmd, ++ struct data_opts data_opts) ++{ ++ struct bch_fs *c = trans->c; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ struct moving_io *io; ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ unsigned sectors = k.k->size, pages; ++ int ret = -ENOMEM; ++ ++ move_ctxt_wait_event(ctxt, ++ atomic_read(&ctxt->write_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ move_ctxt_wait_event(ctxt, ++ atomic_read(&ctxt->read_sectors) < ++ SECTORS_IN_FLIGHT_PER_DEVICE); ++ ++ /* write path might have to decompress data: */ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); ++ ++ pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); ++ io = kzalloc(sizeof(struct moving_io) + ++ sizeof(struct bio_vec) * pages, GFP_KERNEL); ++ if (!io) ++ goto err; ++ ++ io->write.ctxt = ctxt; ++ io->read_sectors = k.k->size; ++ io->write_sectors = k.k->size; ++ ++ bio_init(&io->write.op.wbio.bio, io->bi_inline_vecs, pages); ++ bio_set_prio(&io->write.op.wbio.bio, ++ IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ ++ if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, ++ GFP_KERNEL)) ++ goto err_free; ++ ++ io->rbio.c = c; ++ io->rbio.opts = io_opts; ++ bio_init(&io->rbio.bio, io->bi_inline_vecs, pages); ++ io->rbio.bio.bi_vcnt = pages; ++ bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); ++ io->rbio.bio.bi_iter.bi_size = sectors << 9; ++ ++ bio_set_op_attrs(&io->rbio.bio, REQ_OP_READ, 0); ++ io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); ++ io->rbio.bio.bi_end_io = move_read_endio; ++ ++ ret = bch2_migrate_write_init(c, &io->write, wp, io_opts, ++ data_cmd, data_opts, btree_id, k); ++ if (ret) ++ goto err_free_pages; ++ ++ atomic64_inc(&ctxt->stats->keys_moved); ++ atomic64_add(k.k->size, &ctxt->stats->sectors_moved); ++ ++ trace_move_extent(k.k); ++ ++ atomic_add(io->read_sectors, &ctxt->read_sectors); ++ list_add_tail(&io->list, &ctxt->reads); ++ ++ /* ++ * dropped by move_read_endio() - guards against use after free of ++ * ctxt when doing wakeup ++ */ ++ closure_get(&ctxt->cl); ++ bch2_read_extent(trans, &io->rbio, k, 0, ++ BCH_READ_NODECODE| ++ BCH_READ_LAST_FRAGMENT); ++ return 0; ++err_free_pages: 
++ bio_free_pages(&io->write.op.wbio.bio); ++err_free: ++ kfree(io); ++err: ++ trace_move_alloc_fail(k.k); ++ return ret; ++} ++ ++static int __bch2_move_data(struct bch_fs *c, ++ struct moving_context *ctxt, ++ struct bch_ratelimit *rate, ++ struct write_point_specifier wp, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ struct bch_move_stats *stats, ++ enum btree_id btree_id) ++{ ++ bool kthread = (current->flags & PF_KTHREAD) != 0; ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct bkey_on_stack sk; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct data_opts data_opts; ++ enum data_cmd data_cmd; ++ u64 delay, cur_inum = U64_MAX; ++ int ret = 0, ret2; ++ ++ bkey_on_stack_init(&sk); ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ stats->data_type = BCH_DATA_user; ++ stats->btree_id = btree_id; ++ stats->pos = POS_MIN; ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, start, ++ BTREE_ITER_PREFETCH); ++ ++ if (rate) ++ bch2_ratelimit_reset(rate); ++ ++ while (1) { ++ do { ++ delay = rate ? bch2_ratelimit_delay(rate) : 0; ++ ++ if (delay) { ++ bch2_trans_unlock(&trans); ++ set_current_state(TASK_INTERRUPTIBLE); ++ } ++ ++ if (kthread && (ret = kthread_should_stop())) { ++ __set_current_state(TASK_RUNNING); ++ goto out; ++ } ++ ++ if (delay) ++ schedule_timeout(delay); ++ ++ if (unlikely(freezing(current))) { ++ bch2_trans_unlock(&trans); ++ move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads)); ++ try_to_freeze(); ++ } ++ } while (delay); ++peek: ++ k = bch2_btree_iter_peek(iter); ++ ++ stats->pos = iter->pos; ++ ++ if (!k.k) ++ break; ++ ret = bkey_err(k); ++ if (ret) ++ break; ++ if (bkey_cmp(bkey_start_pos(k.k), end) >= 0) ++ break; ++ ++ if (!bkey_extent_is_direct_data(k.k)) ++ goto next_nondata; ++ ++ if (btree_id == BTREE_ID_EXTENTS && ++ cur_inum != k.k->p.inode) { ++ struct bch_inode_unpacked inode; ++ ++ /* don't hold btree locks while looking up inode: */ ++ bch2_trans_unlock(&trans); ++ ++ io_opts = bch2_opts_to_inode_opts(c->opts); ++ if (!bch2_inode_find_by_inum(c, k.k->p.inode, &inode)) ++ bch2_io_opts_apply(&io_opts, bch2_inode_opts_get(&inode)); ++ cur_inum = k.k->p.inode; ++ goto peek; ++ } ++ ++ switch ((data_cmd = pred(c, arg, k, &io_opts, &data_opts))) { ++ case DATA_SKIP: ++ goto next; ++ case DATA_SCRUB: ++ BUG(); ++ case DATA_ADD_REPLICAS: ++ case DATA_REWRITE: ++ case DATA_PROMOTE: ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* unlock before doing IO: */ ++ bkey_on_stack_reassemble(&sk, c, k); ++ k = bkey_i_to_s_c(sk.k); ++ bch2_trans_unlock(&trans); ++ ++ ret2 = bch2_move_extent(&trans, ctxt, wp, io_opts, btree_id, k, ++ data_cmd, data_opts); ++ if (ret2) { ++ if (ret2 == -ENOMEM) { ++ /* memory allocation failure, wait for some IO to finish */ ++ bch2_move_ctxt_wait_for_io(ctxt); ++ continue; ++ } ++ ++ /* XXX signal failure */ ++ goto next; ++ } ++ ++ if (rate) ++ bch2_ratelimit_increment(rate, k.k->size); ++next: ++ atomic64_add(k.k->size * bch2_bkey_nr_ptrs_allocated(k), ++ &stats->sectors_seen); ++next_nondata: ++ bch2_btree_iter_next(iter); ++ bch2_trans_cond_resched(&trans); ++ } ++out: ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&sk, c); ++ ++ return ret; ++} ++ ++int bch2_move_data(struct bch_fs *c, ++ struct bch_ratelimit *rate, ++ struct write_point_specifier wp, ++ struct bpos start, ++ struct bpos end, ++ move_pred_fn pred, void *arg, ++ struct bch_move_stats *stats) ++{ ++ struct moving_context ctxt = { .stats = stats }; ++ int ret; ++ ++ 
closure_init_stack(&ctxt.cl); ++ INIT_LIST_HEAD(&ctxt.reads); ++ init_waitqueue_head(&ctxt.wait); ++ ++ stats->data_type = BCH_DATA_user; ++ ++ ret = __bch2_move_data(c, &ctxt, rate, wp, start, end, ++ pred, arg, stats, BTREE_ID_EXTENTS) ?: ++ __bch2_move_data(c, &ctxt, rate, wp, start, end, ++ pred, arg, stats, BTREE_ID_REFLINK); ++ ++ move_ctxt_wait_event(&ctxt, list_empty(&ctxt.reads)); ++ closure_sync(&ctxt.cl); ++ ++ EBUG_ON(atomic_read(&ctxt.write_sectors)); ++ ++ trace_move_data(c, ++ atomic64_read(&stats->sectors_moved), ++ atomic64_read(&stats->keys_moved)); ++ ++ return ret; ++} ++ ++static int bch2_move_btree(struct bch_fs *c, ++ move_pred_fn pred, ++ void *arg, ++ struct bch_move_stats *stats) ++{ ++ struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct btree *b; ++ unsigned id; ++ struct data_opts data_opts; ++ enum data_cmd cmd; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ stats->data_type = BCH_DATA_btree; ++ ++ for (id = 0; id < BTREE_ID_NR; id++) { ++ stats->btree_id = id; ++ ++ for_each_btree_node(&trans, iter, id, POS_MIN, ++ BTREE_ITER_PREFETCH, b) { ++ stats->pos = iter->pos; ++ ++ switch ((cmd = pred(c, arg, ++ bkey_i_to_s_c(&b->key), ++ &io_opts, &data_opts))) { ++ case DATA_SKIP: ++ goto next; ++ case DATA_SCRUB: ++ BUG(); ++ case DATA_ADD_REPLICAS: ++ case DATA_REWRITE: ++ break; ++ default: ++ BUG(); ++ } ++ ++ ret = bch2_btree_node_rewrite(c, iter, ++ b->data->keys.seq, 0) ?: ret; ++next: ++ bch2_trans_cond_resched(&trans); ++ } ++ ++ ret = bch2_trans_iter_free(&trans, iter) ?: ret; ++ } ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++#if 0 ++static enum data_cmd scrub_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ return DATA_SCRUB; ++} ++#endif ++ ++static enum data_cmd rereplicate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ unsigned nr_good = bch2_bkey_durability(c, k); ++ unsigned replicas = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ replicas = c->opts.metadata_replicas; ++ break; ++ case KEY_TYPE_extent: ++ replicas = io_opts->data_replicas; ++ break; ++ } ++ ++ if (!nr_good || nr_good >= replicas) ++ return DATA_SKIP; ++ ++ data_opts->target = 0; ++ data_opts->btree_insert_flags = 0; ++ return DATA_ADD_REPLICAS; ++} ++ ++static enum data_cmd migrate_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ struct bch_ioctl_data *op = arg; ++ ++ if (!bch2_bkey_has_device(k, op->migrate.dev)) ++ return DATA_SKIP; ++ ++ data_opts->target = 0; ++ data_opts->btree_insert_flags = 0; ++ data_opts->rewrite_dev = op->migrate.dev; ++ return DATA_REWRITE; ++} ++ ++int bch2_data_job(struct bch_fs *c, ++ struct bch_move_stats *stats, ++ struct bch_ioctl_data op) ++{ ++ int ret = 0; ++ ++ switch (op.op) { ++ case BCH_DATA_OP_REREPLICATE: ++ stats->data_type = BCH_DATA_journal; ++ ret = bch2_journal_flush_device_pins(&c->journal, -1); ++ ++ ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret; ++ ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, NULL, ++ writepoint_hashed((unsigned long) current), ++ op.start, ++ op.end, ++ rereplicate_pred, c, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; 
++ break; ++ case BCH_DATA_OP_MIGRATE: ++ if (op.migrate.dev >= c->sb.nr_devices) ++ return -EINVAL; ++ ++ stats->data_type = BCH_DATA_journal; ++ ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); ++ ++ ret = bch2_move_btree(c, migrate_pred, &op, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ ++ ret = bch2_move_data(c, NULL, ++ writepoint_hashed((unsigned long) current), ++ op.start, ++ op.end, ++ migrate_pred, &op, stats) ?: ret; ++ ret = bch2_replicas_gc2(c) ?: ret; ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h +new file mode 100644 +index 000000000000..0acd1720d4f8 +--- /dev/null ++++ b/fs/bcachefs/move.h +@@ -0,0 +1,64 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_H ++#define _BCACHEFS_MOVE_H ++ ++#include "btree_iter.h" ++#include "buckets.h" ++#include "io_types.h" ++#include "move_types.h" ++ ++struct bch_read_bio; ++struct moving_context; ++ ++enum data_cmd { ++ DATA_SKIP, ++ DATA_SCRUB, ++ DATA_ADD_REPLICAS, ++ DATA_REWRITE, ++ DATA_PROMOTE, ++}; ++ ++struct data_opts { ++ u16 target; ++ unsigned rewrite_dev; ++ int btree_insert_flags; ++}; ++ ++struct migrate_write { ++ enum btree_id btree_id; ++ enum data_cmd data_cmd; ++ struct data_opts data_opts; ++ ++ unsigned nr_ptrs_reserved; ++ ++ struct moving_context *ctxt; ++ ++ /* what we read: */ ++ struct bch_extent_ptr ptr; ++ u64 offset; ++ ++ struct bch_write_op op; ++}; ++ ++void bch2_migrate_read_done(struct migrate_write *, struct bch_read_bio *); ++int bch2_migrate_write_init(struct bch_fs *, struct migrate_write *, ++ struct write_point_specifier, ++ struct bch_io_opts, ++ enum data_cmd, struct data_opts, ++ enum btree_id, struct bkey_s_c); ++ ++typedef enum data_cmd (*move_pred_fn)(struct bch_fs *, void *, ++ struct bkey_s_c, ++ struct bch_io_opts *, struct data_opts *); ++ ++int bch2_move_data(struct bch_fs *, struct bch_ratelimit *, ++ struct write_point_specifier, ++ struct bpos, struct bpos, ++ move_pred_fn, void *, ++ struct bch_move_stats *); ++ ++int bch2_data_job(struct bch_fs *, ++ struct bch_move_stats *, ++ struct bch_ioctl_data); ++ ++#endif /* _BCACHEFS_MOVE_H */ +diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h +new file mode 100644 +index 000000000000..fc0de165af9f +--- /dev/null ++++ b/fs/bcachefs/move_types.h +@@ -0,0 +1,17 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVE_TYPES_H ++#define _BCACHEFS_MOVE_TYPES_H ++ ++struct bch_move_stats { ++ enum bch_data_type data_type; ++ enum btree_id btree_id; ++ struct bpos pos; ++ ++ atomic64_t keys_moved; ++ atomic64_t keys_raced; ++ atomic64_t sectors_moved; ++ atomic64_t sectors_seen; ++ atomic64_t sectors_raced; ++}; ++ ++#endif /* _BCACHEFS_MOVE_TYPES_H */ +diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c +new file mode 100644 +index 000000000000..de0a7974ec9f +--- /dev/null ++++ b/fs/bcachefs/movinggc.c +@@ -0,0 +1,359 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Moving/copying garbage collector ++ * ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "error.h" ++#include "extents.h" ++#include "eytzinger.h" ++#include "io.h" ++#include "keylist.h" ++#include "move.h" ++#include "movinggc.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * We can't use the entire copygc reserve in one iteration of copygc: we may ++ * need the buckets we're freeing up to go back into the copygc reserve to make ++ * forward progress, but if the copygc reserve is full they'll be available for ++ * any allocation - and it's possible that in a given iteration, we free up most ++ * of the buckets we're going to free before we allocate most of the buckets ++ * we're going to allocate. ++ * ++ * If we only use half of the reserve per iteration, then in steady state we'll ++ * always have room in the reserve for the buckets we're going to need in the ++ * next iteration: ++ */ ++#define COPYGC_BUCKETS_PER_ITER(ca) \ ++ ((ca)->free[RESERVE_MOVINGGC].size / 2) ++ ++static int bucket_offset_cmp(const void *_l, const void *_r, size_t size) ++{ ++ const struct copygc_heap_entry *l = _l; ++ const struct copygc_heap_entry *r = _r; ++ ++ return cmp_int(l->dev, r->dev) ?: ++ cmp_int(l->offset, r->offset); ++} ++ ++static int __copygc_pred(struct bch_fs *c, struct bkey_s_c k) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const struct bch_extent_ptr *ptr; ++ ++ bkey_for_each_ptr(ptrs, ptr) { ++ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); ++ struct copygc_heap_entry search = { ++ .dev = ptr->dev, ++ .offset = ptr->offset ++ }; ++ ++ ssize_t i = eytzinger0_find_le(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, &search); ++#if 0 ++ /* eytzinger search verify code: */ ++ ssize_t j = -1, k; ++ ++ for (k = 0; k < h->used; k++) ++ if (h->data[k].offset <= ptr->offset && ++ (j < 0 || h->data[k].offset > h->data[j].offset)) ++ j = k; ++ ++ BUG_ON(i != j); ++#endif ++ if (i >= 0 && ++ ptr->offset < h->data[i].offset + ca->mi.bucket_size && ++ ptr->gen == h->data[i].gen) ++ return ptr->dev; ++ } ++ ++ return -1; ++} ++ ++static enum data_cmd copygc_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ int dev_idx = __copygc_pred(c, k); ++ if (dev_idx < 0) ++ return DATA_SKIP; ++ ++ data_opts->target = io_opts->background_target; ++ data_opts->btree_insert_flags = BTREE_INSERT_USE_RESERVE; ++ data_opts->rewrite_dev = dev_idx; ++ return DATA_REWRITE; ++} ++ ++static bool have_copygc_reserve(struct bch_dev *ca) ++{ ++ bool ret; ++ ++ spin_lock(&ca->fs->freelist_lock); ++ ret = fifo_full(&ca->free[RESERVE_MOVINGGC]) || ++ ca->allocator_state != ALLOCATOR_RUNNING; ++ spin_unlock(&ca->fs->freelist_lock); ++ ++ return ret; ++} ++ ++static inline int fragmentation_cmp(copygc_heap *heap, ++ struct copygc_heap_entry l, ++ struct copygc_heap_entry r) ++{ ++ return cmp_int(l.fragmentation, r.fragmentation); ++} ++ ++static int bch2_copygc(struct bch_fs *c) ++{ ++ copygc_heap *h = &c->copygc_heap; ++ struct copygc_heap_entry e, *i; ++ struct bucket_array *buckets; ++ struct bch_move_stats move_stats; ++ u64 sectors_to_move = 0, sectors_not_moved = 0; ++ u64 sectors_reserved = 0; ++ u64 buckets_to_move, buckets_not_moved = 0; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ size_t b, 
heap_size = 0; ++ int ret; ++ ++ memset(&move_stats, 0, sizeof(move_stats)); ++ /* ++ * Find buckets with lowest sector counts, skipping completely ++ * empty buckets, by building a maxheap sorted by sector count, ++ * and repeatedly replacing the maximum element until all ++ * buckets have been visited. ++ */ ++ h->used = 0; ++ ++ for_each_rw_member(ca, c, dev_idx) ++ heap_size += ca->mi.nbuckets >> 7; ++ ++ if (h->size < heap_size) { ++ free_heap(&c->copygc_heap); ++ if (!init_heap(&c->copygc_heap, heap_size, GFP_KERNEL)) { ++ bch_err(c, "error allocating copygc heap"); ++ return 0; ++ } ++ } ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ closure_wait_event(&c->freelist_wait, have_copygc_reserve(ca)); ++ ++ spin_lock(&ca->fs->freelist_lock); ++ sectors_reserved += fifo_used(&ca->free[RESERVE_MOVINGGC]) * ca->mi.bucket_size; ++ spin_unlock(&ca->fs->freelist_lock); ++ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ for (b = buckets->first_bucket; b < buckets->nbuckets; b++) { ++ struct bucket_mark m = READ_ONCE(buckets->b[b].mark); ++ struct copygc_heap_entry e; ++ ++ if (m.owned_by_allocator || ++ m.data_type != BCH_DATA_user || ++ !bucket_sectors_used(m) || ++ bucket_sectors_used(m) >= ca->mi.bucket_size) ++ continue; ++ ++ e = (struct copygc_heap_entry) { ++ .dev = dev_idx, ++ .gen = m.gen, ++ .fragmentation = bucket_sectors_used(m) * (1U << 15) ++ / ca->mi.bucket_size, ++ .sectors = bucket_sectors_used(m), ++ .offset = bucket_to_sector(ca, b), ++ }; ++ heap_add_or_replace(h, e, -fragmentation_cmp, NULL); ++ } ++ up_read(&ca->bucket_lock); ++ } ++ ++ if (!sectors_reserved) { ++ bch2_fs_fatal_error(c, "stuck, ran out of copygc reserve!"); ++ return -1; ++ } ++ ++ for (i = h->data; i < h->data + h->used; i++) ++ sectors_to_move += i->sectors; ++ ++ while (sectors_to_move > sectors_reserved) { ++ BUG_ON(!heap_pop(h, e, -fragmentation_cmp, NULL)); ++ sectors_to_move -= e.sectors; ++ } ++ ++ buckets_to_move = h->used; ++ ++ if (!buckets_to_move) ++ return 0; ++ ++ eytzinger0_sort(h->data, h->used, ++ sizeof(h->data[0]), ++ bucket_offset_cmp, NULL); ++ ++ ret = bch2_move_data(c, &c->copygc_pd.rate, ++ writepoint_ptr(&c->copygc_write_point), ++ POS_MIN, POS_MAX, ++ copygc_pred, NULL, ++ &move_stats); ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ for (i = h->data; i < h->data + h->used; i++) { ++ struct bucket_mark m; ++ size_t b; ++ ++ if (i->dev != dev_idx) ++ continue; ++ ++ b = sector_to_bucket(ca, i->offset); ++ m = READ_ONCE(buckets->b[b].mark); ++ ++ if (i->gen == m.gen && ++ bucket_sectors_used(m)) { ++ sectors_not_moved += bucket_sectors_used(m); ++ buckets_not_moved++; ++ } ++ } ++ up_read(&ca->bucket_lock); ++ } ++ ++ if (sectors_not_moved && !ret) ++ bch_warn_ratelimited(c, ++ "copygc finished but %llu/%llu sectors, %llu/%llu buckets not moved (move stats: moved %llu sectors, raced %llu keys, %llu sectors)", ++ sectors_not_moved, sectors_to_move, ++ buckets_not_moved, buckets_to_move, ++ atomic64_read(&move_stats.sectors_moved), ++ atomic64_read(&move_stats.keys_raced), ++ atomic64_read(&move_stats.sectors_raced)); ++ ++ trace_copygc(c, ++ atomic64_read(&move_stats.sectors_moved), sectors_not_moved, ++ buckets_to_move, buckets_not_moved); ++ return 0; ++} ++ ++/* ++ * Copygc runs when the amount of fragmented data is above some arbitrary ++ * threshold: ++ * ++ * The threshold at the limit - when the device is full - is the amount of space ++ * we reserved in bch2_recalc_capacity; we can't have more 
than that amount of ++ * disk space stranded due to fragmentation and store everything we have ++ * promised to store. ++ * ++ * But we don't want to be running copygc unnecessarily when the device still ++ * has plenty of free space - rather, we want copygc to smoothly run every so ++ * often and continually reduce the amount of fragmented space as the device ++ * fills up. So, we increase the threshold by half the current free space. ++ */ ++unsigned long bch2_copygc_wait_amount(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ u64 fragmented_allowed = c->copygc_threshold; ++ u64 fragmented = 0; ++ ++ for_each_rw_member(ca, c, dev_idx) { ++ struct bch_dev_usage usage = bch2_dev_usage_read(ca); ++ ++ fragmented_allowed += ((__dev_buckets_available(ca, usage) * ++ ca->mi.bucket_size) >> 1); ++ fragmented += usage.sectors_fragmented; ++ } ++ ++ return max_t(s64, 0, fragmented_allowed - fragmented); ++} ++ ++static int bch2_copygc_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ unsigned long last, wait; ++ ++ set_freezable(); ++ ++ while (!kthread_should_stop()) { ++ if (kthread_wait_freezable(c->copy_gc_enabled)) ++ break; ++ ++ last = atomic_long_read(&clock->now); ++ wait = bch2_copygc_wait_amount(c); ++ ++ if (wait > clock->max_slop) { ++ bch2_kthread_io_clock_wait(clock, last + wait, ++ MAX_SCHEDULE_TIMEOUT); ++ continue; ++ } ++ ++ if (bch2_copygc(c)) ++ break; ++ } ++ ++ return 0; ++} ++ ++void bch2_copygc_stop(struct bch_fs *c) ++{ ++ c->copygc_pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&c->copygc_pd.rate); ++ ++ if (c->copygc_thread) { ++ kthread_stop(c->copygc_thread); ++ put_task_struct(c->copygc_thread); ++ } ++ c->copygc_thread = NULL; ++} ++ ++int bch2_copygc_start(struct bch_fs *c) ++{ ++ struct task_struct *t; ++ ++ if (c->copygc_thread) ++ return 0; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ if (bch2_fs_init_fault("copygc_start")) ++ return -ENOMEM; ++ ++ t = kthread_create(bch2_copygc_thread, c, "bch_copygc"); ++ if (IS_ERR(t)) ++ return PTR_ERR(t); ++ ++ get_task_struct(t); ++ ++ c->copygc_thread = t; ++ wake_up_process(c->copygc_thread); ++ ++ return 0; ++} ++ ++void bch2_fs_copygc_init(struct bch_fs *c) ++{ ++ bch2_pd_controller_init(&c->copygc_pd); ++ c->copygc_pd.d_term = 0; ++} +diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h +new file mode 100644 +index 000000000000..922738247d03 +--- /dev/null ++++ b/fs/bcachefs/movinggc.h +@@ -0,0 +1,9 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_MOVINGGC_H ++#define _BCACHEFS_MOVINGGC_H ++ ++void bch2_copygc_stop(struct bch_fs *); ++int bch2_copygc_start(struct bch_fs *); ++void bch2_fs_copygc_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_MOVINGGC_H */ +diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c +new file mode 100644 +index 000000000000..afe25cd26c06 +--- /dev/null ++++ b/fs/bcachefs/opts.c +@@ -0,0 +1,437 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++ ++#include "bcachefs.h" ++#include "compress.h" ++#include "disk_groups.h" ++#include "opts.h" ++#include "super-io.h" ++#include "util.h" ++ ++const char * const bch2_error_actions[] = { ++ "continue", ++ "remount-ro", ++ "panic", ++ NULL ++}; ++ ++const char * const bch2_sb_features[] = { ++#define x(f, n) #f, ++ BCH_SB_FEATURES() ++#undef x ++ NULL ++}; ++ ++const char * const bch2_csum_opts[] = { ++ "none", ++ "crc32c", ++ "crc64", ++ NULL ++}; ++ ++const char * const bch2_compression_opts[] = { ++#define x(t, n) #t, ++ BCH_COMPRESSION_OPTS() 
++#undef x ++ NULL ++}; ++ ++const char * const bch2_str_hash_types[] = { ++ "crc32c", ++ "crc64", ++ "siphash", ++ NULL ++}; ++ ++const char * const bch2_data_types[] = { ++#define x(t, n) #t, ++ BCH_DATA_TYPES() ++#undef x ++ NULL ++}; ++ ++const char * const bch2_cache_replacement_policies[] = { ++ "lru", ++ "fifo", ++ "random", ++ NULL ++}; ++ ++/* Default is -1; we skip past it for struct cached_dev's cache mode */ ++const char * const bch2_cache_modes[] = { ++ "default", ++ "writethrough", ++ "writeback", ++ "writearound", ++ "none", ++ NULL ++}; ++ ++const char * const bch2_dev_state[] = { ++ "readwrite", ++ "readonly", ++ "failed", ++ "spare", ++ NULL ++}; ++ ++void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) ++{ ++#define x(_name, ...) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ ++ BCH_OPTS() ++#undef x ++} ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opt_defined(*opts, _name); ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ return opts->_name; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) ++{ ++ switch (id) { ++#define x(_name, ...) \ ++ case Opt_##_name: \ ++ opt_set(*opts, _name, v); \ ++ break; ++ BCH_OPTS() ++#undef x ++ default: ++ BUG(); ++ } ++} ++ ++/* ++ * Initial options from superblock - here we don't want any options undefined, ++ * any options the superblock doesn't specify are set to 0: ++ */ ++struct bch_opts bch2_opts_from_sb(struct bch_sb *sb) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ ++#define x(_name, _bits, _mode, _type, _sb_opt, ...) \ ++ if (_sb_opt != NO_SB_OPT) \ ++ opt_set(opts, _name, _sb_opt(sb)); ++ BCH_OPTS() ++#undef x ++ ++ return opts; ++} ++ ++const struct bch_option bch2_opt_table[] = { ++#define OPT_BOOL() .type = BCH_OPT_BOOL ++#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, .min = _min, .max = _max ++#define OPT_SECTORS(_min, _max) .type = BCH_OPT_SECTORS, .min = _min, .max = _max ++#define OPT_STR(_choices) .type = BCH_OPT_STR, .choices = _choices ++#define OPT_FN(_fn) .type = BCH_OPT_FN, \ ++ .parse = _fn##_parse, \ ++ .to_text = _fn##_to_text ++ ++#define x(_name, _bits, _mode, _type, _sb_opt, _default, _hint, _help) \ ++ [Opt_##_name] = { \ ++ .attr = { \ ++ .name = #_name, \ ++ .mode = (_mode) & OPT_RUNTIME ? 
0644 : 0444, \ ++ }, \ ++ .mode = _mode, \ ++ .hint = _hint, \ ++ .help = _help, \ ++ .set_sb = SET_##_sb_opt, \ ++ _type \ ++ }, ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++int bch2_opt_lookup(const char *name) ++{ ++ const struct bch_option *i; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); ++ i++) ++ if (!strcmp(name, i->attr.name)) ++ return i - bch2_opt_table; ++ ++ return -1; ++} ++ ++struct synonym { ++ const char *s1, *s2; ++}; ++ ++static const struct synonym bch_opt_synonyms[] = { ++ { "quota", "usrquota" }, ++}; ++ ++static int bch2_mount_opt_lookup(const char *name) ++{ ++ const struct synonym *i; ++ ++ for (i = bch_opt_synonyms; ++ i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); ++ i++) ++ if (!strcmp(name, i->s1)) ++ name = i->s2; ++ ++ return bch2_opt_lookup(name); ++} ++ ++int bch2_opt_parse(struct bch_fs *c, const struct bch_option *opt, ++ const char *val, u64 *res) ++{ ++ ssize_t ret; ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ ret = kstrtou64(val, 10, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res > 1) ++ return -ERANGE; ++ break; ++ case BCH_OPT_UINT: ++ ret = kstrtou64(val, 10, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res < opt->min || *res >= opt->max) ++ return -ERANGE; ++ break; ++ case BCH_OPT_SECTORS: ++ ret = bch2_strtou64_h(val, res); ++ if (ret < 0) ++ return ret; ++ ++ if (*res & 511) ++ return -EINVAL; ++ ++ *res >>= 9; ++ ++ if (*res < opt->min || *res >= opt->max) ++ return -ERANGE; ++ break; ++ case BCH_OPT_STR: ++ ret = match_string(opt->choices, -1, val); ++ if (ret < 0) ++ return ret; ++ ++ *res = ret; ++ break; ++ case BCH_OPT_FN: ++ if (!c) ++ return -EINVAL; ++ ++ return opt->parse(c, val, res); ++ } ++ ++ return 0; ++} ++ ++void bch2_opt_to_text(struct printbuf *out, struct bch_fs *c, ++ const struct bch_option *opt, u64 v, ++ unsigned flags) ++{ ++ if (flags & OPT_SHOW_MOUNT_STYLE) { ++ if (opt->type == BCH_OPT_BOOL) { ++ pr_buf(out, "%s%s", ++ v ? 
"" : "no", ++ opt->attr.name); ++ return; ++ } ++ ++ pr_buf(out, "%s=", opt->attr.name); ++ } ++ ++ switch (opt->type) { ++ case BCH_OPT_BOOL: ++ case BCH_OPT_UINT: ++ pr_buf(out, "%lli", v); ++ break; ++ case BCH_OPT_SECTORS: ++ bch2_hprint(out, v); ++ break; ++ case BCH_OPT_STR: ++ if (flags & OPT_SHOW_FULL_LIST) ++ bch2_string_opt_to_text(out, opt->choices, v); ++ else ++ pr_buf(out, opt->choices[v]); ++ break; ++ case BCH_OPT_FN: ++ opt->to_text(out, c, v); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) ++{ ++ int ret = 0; ++ ++ switch (id) { ++ case Opt_compression: ++ case Opt_background_compression: ++ ret = bch2_check_set_has_compressed_data(c, v); ++ break; ++ case Opt_erasure_code: ++ if (v) ++ bch2_check_set_feature(c, BCH_FEATURE_ec); ++ break; ++ } ++ ++ return ret; ++} ++ ++int bch2_opts_check_may_set(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ ret = bch2_opt_check_may_set(c, i, ++ bch2_opt_get_by_id(&c->opts, i)); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++int bch2_parse_mount_opts(struct bch_opts *opts, char *options) ++{ ++ char *opt, *name, *val; ++ int ret, id; ++ u64 v; ++ ++ while ((opt = strsep(&options, ",")) != NULL) { ++ name = strsep(&opt, "="); ++ val = opt; ++ ++ if (val) { ++ id = bch2_mount_opt_lookup(name); ++ if (id < 0) ++ goto bad_opt; ++ ++ ret = bch2_opt_parse(NULL, &bch2_opt_table[id], val, &v); ++ if (ret < 0) ++ goto bad_val; ++ } else { ++ id = bch2_mount_opt_lookup(name); ++ v = 1; ++ ++ if (id < 0 && ++ !strncmp("no", name, 2)) { ++ id = bch2_mount_opt_lookup(name + 2); ++ v = 0; ++ } ++ ++ if (id < 0) ++ goto bad_opt; ++ ++ if (bch2_opt_table[id].type != BCH_OPT_BOOL) ++ goto no_val; ++ } ++ ++ if (!(bch2_opt_table[id].mode & OPT_MOUNT)) ++ goto bad_opt; ++ ++ if (id == Opt_acl && ++ !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) ++ goto bad_opt; ++ ++ if ((id == Opt_usrquota || ++ id == Opt_grpquota) && ++ !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) ++ goto bad_opt; ++ ++ bch2_opt_set_by_id(opts, id, v); ++ } ++ ++ return 0; ++bad_opt: ++ pr_err("Bad mount option %s", name); ++ return -1; ++bad_val: ++ pr_err("Invalid value %s for mount option %s", val, name); ++ return -1; ++no_val: ++ pr_err("Mount option %s requires a value", name); ++ return -1; ++} ++ ++/* io opts: */ ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) ++{ ++ struct bch_io_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts src) ++{ ++ struct bch_opts ret = { 0 }; ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(ret, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++ return ret; ++} ++ ++void bch2_io_opts_apply(struct bch_io_opts *dst, struct bch_io_opts src) ++{ ++#define x(_name, _bits) \ ++ if (opt_defined(src, _name)) \ ++ opt_set(*dst, _name, src._name); ++ BCH_INODE_OPTS() ++#undef x ++} ++ ++bool bch2_opt_is_inode_opt(enum bch_opt_id id) ++{ ++ static const enum bch_opt_id inode_opt_list[] = { ++#define x(_name, _bits) Opt_##_name, ++ BCH_INODE_OPTS() ++#undef x ++ }; ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) ++ if (inode_opt_list[i] == id) ++ return true; ++ ++ return false; ++} +diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h +new file mode 100644 +index 000000000000..014c608ca0c6 +--- /dev/null ++++ 
b/fs/bcachefs/opts.h +@@ -0,0 +1,440 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_OPTS_H ++#define _BCACHEFS_OPTS_H ++ ++#include ++#include ++#include ++#include ++#include "bcachefs_format.h" ++ ++extern const char * const bch2_error_actions[]; ++extern const char * const bch2_sb_features[]; ++extern const char * const bch2_csum_opts[]; ++extern const char * const bch2_compression_opts[]; ++extern const char * const bch2_str_hash_types[]; ++extern const char * const bch2_data_types[]; ++extern const char * const bch2_cache_replacement_policies[]; ++extern const char * const bch2_cache_modes[]; ++extern const char * const bch2_dev_state[]; ++ ++/* ++ * Mount options; we also store defaults in the superblock. ++ * ++ * Also exposed via sysfs: if an option is writeable, and it's also stored in ++ * the superblock, changing it via sysfs (currently? might change this) also ++ * updates the superblock. ++ * ++ * We store options as signed integers, where -1 means undefined. This means we ++ * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only ++ * apply the options from that struct that are defined. ++ */ ++ ++/* dummy option, for options that aren't stored in the superblock */ ++LE64_BITMASK(NO_SB_OPT, struct bch_sb, flags[0], 0, 0); ++ ++/* When can be set: */ ++enum opt_mode { ++ OPT_FORMAT = (1 << 0), ++ OPT_MOUNT = (1 << 1), ++ OPT_RUNTIME = (1 << 2), ++ OPT_INODE = (1 << 3), ++ OPT_DEVICE = (1 << 4), ++}; ++ ++enum opt_type { ++ BCH_OPT_BOOL, ++ BCH_OPT_UINT, ++ BCH_OPT_SECTORS, ++ BCH_OPT_STR, ++ BCH_OPT_FN, ++}; ++ ++/** ++ * x(name, shortopt, type, in mem type, mode, sb_opt) ++ * ++ * @name - name of mount option, sysfs attribute, and struct bch_opts ++ * member ++ * ++ * @mode - when opt may be set ++ * ++ * @sb_option - name of corresponding superblock option ++ * ++ * @type - one of OPT_BOOL, OPT_UINT, OPT_STR ++ */ ++ ++/* ++ * XXX: add fields for ++ * - default value ++ * - helptext ++ */ ++ ++#ifdef __KERNEL__ ++#define RATELIMIT_ERRORS true ++#else ++#define RATELIMIT_ERRORS false ++#endif ++ ++#define BCH_OPTS() \ ++ x(block_size, u16, \ ++ OPT_FORMAT, \ ++ OPT_SECTORS(1, 128), \ ++ BCH_SB_BLOCK_SIZE, 8, \ ++ "size", NULL) \ ++ x(btree_node_size, u16, \ ++ OPT_FORMAT, \ ++ OPT_SECTORS(1, 512), \ ++ BCH_SB_BTREE_NODE_SIZE, 512, \ ++ "size", "Btree node size, default 256k") \ ++ x(errors, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_error_actions), \ ++ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_RO, \ ++ NULL, "Action to take on filesystem error") \ ++ x(metadata_replicas, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_WANT, 1, \ ++ "#", "Number of metadata replicas") \ ++ x(data_replicas, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_WANT, 1, \ ++ "#", "Number of data replicas") \ ++ x(metadata_replicas_required, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_META_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(data_replicas_required, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(1, BCH_REPLICAS_MAX), \ ++ BCH_SB_DATA_REPLICAS_REQ, 1, \ ++ "#", NULL) \ ++ x(metadata_checksum, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_csum_opts), \ ++ BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ ++ NULL, NULL) \ ++ x(data_checksum, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_csum_opts), \ ++ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_CRC32C, \ ++ NULL, 
NULL) \ ++ x(compression, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ ++ NULL, NULL) \ ++ x(background_compression, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_STR(bch2_compression_opts), \ ++ BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ ++ NULL, NULL) \ ++ x(str_hash, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_STR(bch2_str_hash_types), \ ++ BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_SIPHASH, \ ++ NULL, "Hash function for directory entries and xattrs")\ ++ x(foreground_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_FOREGROUND_TARGET, 0, \ ++ "(target)", "Device or disk group for foreground writes") \ ++ x(background_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_BACKGROUND_TARGET, 0, \ ++ "(target)", "Device or disk group to move data to in the background")\ ++ x(promote_target, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_FN(bch2_opt_target), \ ++ BCH_SB_PROMOTE_TARGET, 0, \ ++ "(target)", "Device or disk group to promote data to on read")\ ++ x(erasure_code, u16, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ ++ OPT_BOOL(), \ ++ BCH_SB_ERASURE_CODE, false, \ ++ NULL, "Enable erasure coding (DO NOT USE YET)") \ ++ x(inodes_32bit, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_INODE_32BIT, false, \ ++ NULL, "Constrain inode numbers to 32 bits") \ ++ x(gc_reserve_percent, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_UINT(5, 21), \ ++ BCH_SB_GC_RESERVE, 8, \ ++ "%", "Percentage of disk space to reserve for copygc")\ ++ x(gc_reserve_bytes, u64, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_SECTORS(0, U64_MAX), \ ++ BCH_SB_GC_RESERVE_BYTES, 0, \ ++ "%", "Amount of disk space to reserve for copygc\n" \ ++ "Takes precedence over gc_reserve_percent if set")\ ++ x(root_reserve_percent, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_UINT(0, 100), \ ++ BCH_SB_ROOT_RESERVE, 0, \ ++ "%", "Percentage of disk space to reserve for superuser")\ ++ x(wide_macs, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_128_BIT_MACS, false, \ ++ NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ ++ x(inline_data, u8, \ ++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Enable inline data extents") \ ++ x(acl, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_POSIX_ACL, true, \ ++ NULL, "Enable POSIX acls") \ ++ x(usrquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_USRQUOTA, false, \ ++ NULL, "Enable user quotas") \ ++ x(grpquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_GRPQUOTA, false, \ ++ NULL, "Enable group quotas") \ ++ x(prjquota, u8, \ ++ OPT_FORMAT|OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ BCH_SB_PRJQUOTA, false, \ ++ NULL, "Enable project quotas") \ ++ x(reflink, u8, \ ++ OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ BCH_SB_REFLINK, true, \ ++ NULL, "Enable reflink support") \ ++ x(degraded, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Allow mounting in degraded mode") \ ++ x(discard, u8, \ ++ OPT_MOUNT|OPT_DEVICE, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Enable discard/TRIM support") \ ++ x(verbose, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Extra debugging information during mount/recovery")\ ++ x(journal_flush_disabled, u8, \ 
++ OPT_MOUNT|OPT_RUNTIME, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Disable journal flush on sync/fsync\n" \ ++ "If enabled, writes can be lost, but only since the\n"\ ++ "last journal write (default 1 second)") \ ++ x(fsck, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Run fsck on mount") \ ++ x(fix_errors, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Fix errors during fsck without asking") \ ++ x(ratelimit_errors, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, RATELIMIT_ERRORS, \ ++ NULL, "Ratelimit error messages during fsck") \ ++ x(nochanges, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Super read only mode - no writes at all will be issued,\n"\ ++ "even if we have to replay the journal") \ ++ x(norecovery, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't replay the journal") \ ++ x(rebuild_replicas, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Rebuild the superblock replicas section") \ ++ x(keep_journal, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't free journal entries/keys after startup")\ ++ x(read_entire_journal, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Read all journal entries, not just dirty ones")\ ++ x(noexcl, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don't open device in exclusive mode") \ ++ x(sb, u64, \ ++ OPT_MOUNT, \ ++ OPT_UINT(0, S64_MAX), \ ++ NO_SB_OPT, BCH_SB_SECTOR, \ ++ "offset", "Sector offset of superblock") \ ++ x(read_only, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(nostart, u8, \ ++ 0, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Don\'t start filesystem, only open devices") \ ++ x(reconstruct_alloc, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Reconstruct alloc btree") \ ++ x(version_upgrade, u8, \ ++ OPT_MOUNT, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, "Set superblock to latest version,\n" \ ++ "allowing any new features to be used") \ ++ x(project, u8, \ ++ OPT_INODE, \ ++ OPT_BOOL(), \ ++ NO_SB_OPT, false, \ ++ NULL, NULL) \ ++ x(fs_size, u64, \ ++ OPT_DEVICE, \ ++ OPT_SECTORS(0, S64_MAX), \ ++ NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(bucket, u32, \ ++ OPT_DEVICE, \ ++ OPT_SECTORS(0, S64_MAX), \ ++ NO_SB_OPT, 0, \ ++ "size", "Size of filesystem on device") \ ++ x(durability, u8, \ ++ OPT_DEVICE, \ ++ OPT_UINT(0, BCH_REPLICAS_MAX), \ ++ NO_SB_OPT, 1, \ ++ "n", "Data written to this device will be considered\n"\ ++ "to have already been replicated n times") ++ ++struct bch_opts { ++#define x(_name, _bits, ...) unsigned _name##_defined:1; ++ BCH_OPTS() ++#undef x ++ ++#define x(_name, _bits, ...) _bits _name; ++ BCH_OPTS() ++#undef x ++}; ++ ++static const struct bch_opts bch2_opts_default = { ++#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ ++ ._name##_defined = true, \ ++ ._name = _default, \ ++ ++ BCH_OPTS() ++#undef x ++}; ++ ++#define opt_defined(_opts, _name) ((_opts)._name##_defined) ++ ++#define opt_get(_opts, _name) \ ++ (opt_defined(_opts, _name) ? 
(_opts)._name : bch2_opts_default._name) ++ ++#define opt_set(_opts, _name, _v) \ ++do { \ ++ (_opts)._name##_defined = true; \ ++ (_opts)._name = _v; \ ++} while (0) ++ ++static inline struct bch_opts bch2_opts_empty(void) ++{ ++ return (struct bch_opts) { 0 }; ++} ++ ++void bch2_opts_apply(struct bch_opts *, struct bch_opts); ++ ++enum bch_opt_id { ++#define x(_name, ...) Opt_##_name, ++ BCH_OPTS() ++#undef x ++ bch2_opts_nr ++}; ++ ++struct bch_fs; ++struct printbuf; ++ ++struct bch_option { ++ struct attribute attr; ++ void (*set_sb)(struct bch_sb *, u64); ++ enum opt_mode mode; ++ enum opt_type type; ++ ++ union { ++ struct { ++ u64 min, max; ++ }; ++ struct { ++ const char * const *choices; ++ }; ++ struct { ++ int (*parse)(struct bch_fs *, const char *, u64 *); ++ void (*to_text)(struct printbuf *, struct bch_fs *, u64); ++ }; ++ }; ++ ++ const char *hint; ++ const char *help; ++ ++}; ++ ++extern const struct bch_option bch2_opt_table[]; ++ ++bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); ++u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); ++void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); ++ ++struct bch_opts bch2_opts_from_sb(struct bch_sb *); ++ ++int bch2_opt_lookup(const char *); ++int bch2_opt_parse(struct bch_fs *, const struct bch_option *, const char *, u64 *); ++ ++#define OPT_SHOW_FULL_LIST (1 << 0) ++#define OPT_SHOW_MOUNT_STYLE (1 << 1) ++ ++void bch2_opt_to_text(struct printbuf *, struct bch_fs *, ++ const struct bch_option *, u64, unsigned); ++ ++int bch2_opt_check_may_set(struct bch_fs *, int, u64); ++int bch2_opts_check_may_set(struct bch_fs *); ++int bch2_parse_mount_opts(struct bch_opts *, char *); ++ ++/* inode opts: */ ++ ++struct bch_io_opts { ++#define x(_name, _bits) unsigned _name##_defined:1; ++ BCH_INODE_OPTS() ++#undef x ++ ++#define x(_name, _bits) u##_bits _name; ++ BCH_INODE_OPTS() ++#undef x ++}; ++ ++struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); ++struct bch_opts bch2_inode_opts_to_opts(struct bch_io_opts); ++void bch2_io_opts_apply(struct bch_io_opts *, struct bch_io_opts); ++bool bch2_opt_is_inode_opt(enum bch_opt_id); ++ ++#endif /* _BCACHEFS_OPTS_H */ +diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c +new file mode 100644 +index 000000000000..d3032a46e7f3 +--- /dev/null ++++ b/fs/bcachefs/quota.c +@@ -0,0 +1,783 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "inode.h" ++#include "quota.h" ++#include "super-io.h" ++ ++static const char *bch2_sb_validate_quota(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_quota *q = field_to_type(f, quota); ++ ++ if (vstruct_bytes(&q->field) != sizeof(*q)) ++ return "invalid field quota: wrong size"; ++ ++ return NULL; ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_quota = { ++ .validate = bch2_sb_validate_quota, ++}; ++ ++const char *bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ if (k.k->p.inode >= QTYP_NR) ++ return "invalid quota type"; ++ ++ if (bkey_val_bytes(k.k) != sizeof(struct bch_quota)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++static const char * const bch2_quota_counters[] = { ++ "space", ++ "inodes", ++}; ++ ++void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); ++ unsigned i; ++ ++ for (i = 0; i < Q_COUNTERS; i++) ++ pr_buf(out, "%s hardlimit %llu softlimit %llu", ++ bch2_quota_counters[i], ++ 
le64_to_cpu(dq.v->c[i].hardlimit), ++ le64_to_cpu(dq.v->c[i].softlimit)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++#include ++#include ++#include ++ ++static inline unsigned __next_qtype(unsigned i, unsigned qtypes) ++{ ++ qtypes >>= i; ++ return qtypes ? i + __ffs(qtypes) : QTYP_NR; ++} ++ ++#define for_each_set_qtype(_c, _i, _q, _qtypes) \ ++ for (_i = 0; \ ++ (_i = __next_qtype(_i, _qtypes), \ ++ _q = &(_c)->quotas[_i], \ ++ _i < QTYP_NR); \ ++ _i++) ++ ++static bool ignore_hardlimit(struct bch_memquota_type *q) ++{ ++ if (capable(CAP_SYS_RESOURCE)) ++ return true; ++#if 0 ++ struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; ++ ++ return capable(CAP_SYS_RESOURCE) && ++ (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || ++ !(info->dqi_flags & DQF_ROOT_SQUASH)); ++#endif ++ return false; ++} ++ ++enum quota_msg { ++ SOFTWARN, /* Softlimit reached */ ++ SOFTLONGWARN, /* Grace time expired */ ++ HARDWARN, /* Hardlimit reached */ ++ ++ HARDBELOW, /* Usage got below inode hardlimit */ ++ SOFTBELOW, /* Usage got below inode softlimit */ ++}; ++ ++static int quota_nl[][Q_COUNTERS] = { ++ [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, ++ [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, ++ [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, ++ [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, ++ [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, ++ ++ [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, ++ [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, ++ [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, ++ [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, ++ [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, ++}; ++ ++struct quota_msgs { ++ u8 nr; ++ struct { ++ u8 qtype; ++ u8 msg; ++ } m[QTYP_NR * Q_COUNTERS]; ++}; ++ ++static void prepare_msg(unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); ++ ++ msgs->m[msgs->nr].qtype = qtype; ++ msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; ++ msgs->nr++; ++} ++ ++static void prepare_warning(struct memquota_counter *qc, ++ unsigned qtype, ++ enum quota_counters counter, ++ struct quota_msgs *msgs, ++ enum quota_msg msg_type) ++{ ++ if (qc->warning_issued & (1 << msg_type)) ++ return; ++ ++ prepare_msg(qtype, counter, msgs, msg_type); ++} ++ ++static void flush_warnings(struct bch_qid qid, ++ struct super_block *sb, ++ struct quota_msgs *msgs) ++{ ++ unsigned i; ++ ++ for (i = 0; i < msgs->nr; i++) ++ quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), ++ sb->s_dev, msgs->m[i].msg); ++} ++ ++static int bch2_quota_check_limit(struct bch_fs *c, ++ unsigned qtype, ++ struct bch_memquota *mq, ++ struct quota_msgs *msgs, ++ enum quota_counters counter, ++ s64 v, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q = &c->quotas[qtype]; ++ struct memquota_counter *qc = &mq->c[counter]; ++ u64 n = qc->v + v; ++ ++ BUG_ON((s64) n < 0); ++ ++ if (mode == KEY_TYPE_QUOTA_NOCHECK) ++ return 0; ++ ++ if (v <= 0) { ++ if (n < qc->hardlimit && ++ (qc->warning_issued & (1 << HARDWARN))) { ++ qc->warning_issued &= ~(1 << HARDWARN); ++ prepare_msg(qtype, counter, msgs, HARDBELOW); ++ } ++ ++ if (n < qc->softlimit && ++ (qc->warning_issued & (1 << SOFTWARN))) { ++ qc->warning_issued &= ~(1 << SOFTWARN); ++ prepare_msg(qtype, counter, msgs, SOFTBELOW); ++ } ++ ++ qc->warning_issued = 0; ++ return 0; ++ } ++ ++ if (qc->hardlimit && ++ qc->hardlimit < n && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, 
HARDWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer && ++ ktime_get_real_seconds() >= qc->timer && ++ !ignore_hardlimit(q)) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); ++ } ++ ++ if (qc->softlimit && ++ qc->softlimit < n && ++ qc->timer == 0) { ++ if (mode == KEY_TYPE_QUOTA_PREALLOC) ++ return -EDQUOT; ++ ++ prepare_warning(qc, qtype, counter, msgs, SOFTWARN); ++ ++ /* XXX is this the right one? */ ++ qc->timer = ktime_get_real_seconds() + ++ q->limits[counter].warnlimit; ++ } ++ ++ return 0; ++} ++ ++int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ unsigned qtypes = enabled_qtypes(c); ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq[QTYP_NR]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_NOFS); ++ if (!mq[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mq[i]->c[counter].v += v; ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(qid, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static void __bch2_quota_transfer(struct bch_memquota *src_q, ++ struct bch_memquota *dst_q, ++ enum quota_counters counter, s64 v) ++{ ++ BUG_ON(v > src_q->c[counter].v); ++ BUG_ON(v + dst_q->c[counter].v < v); ++ ++ src_q->c[counter].v -= v; ++ dst_q->c[counter].v += v; ++} ++ ++int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ struct bch_memquota_type *q; ++ struct bch_memquota *src_q[3], *dst_q[3]; ++ struct quota_msgs msgs; ++ unsigned i; ++ int ret = 0; ++ ++ qtypes &= enabled_qtypes(c); ++ ++ memset(&msgs, 0, sizeof(msgs)); ++ ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_lock_nested(&q->lock, i); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_NOFS); ++ dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_NOFS); ++ ++ if (!src_q[i] || !dst_q[i]) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, ++ dst_q[i]->c[Q_SPC].v + space, ++ mode); ++ if (ret) ++ goto err; ++ ++ ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, ++ dst_q[i]->c[Q_INO].v + 1, ++ mode); ++ if (ret) ++ goto err; ++ } ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); ++ __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); ++ } ++ ++err: ++ for_each_set_qtype(c, i, q, qtypes) ++ mutex_unlock(&q->lock); ++ ++ flush_warnings(dst, c->vfs_sb, &msgs); ++ ++ return ret; ++} ++ ++static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_quota dq; ++ struct bch_memquota_type *q; ++ struct bch_memquota *mq; ++ unsigned i; ++ ++ BUG_ON(k.k->p.inode >= QTYP_NR); ++ ++ switch (k.k->type) { ++ case KEY_TYPE_quota: ++ dq = bkey_s_c_to_quota(k); ++ q = &c->quotas[k.k->p.inode]; ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); ++ if (!mq) { ++ mutex_unlock(&q->lock); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i 
< Q_COUNTERS; i++) { ++ mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); ++ mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); ++ } ++ ++ mutex_unlock(&q->lock); ++ } ++ ++ return 0; ++} ++ ++static int bch2_quota_init_type(struct bch_fs *c, enum quota_types type) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_QUOTAS, POS(type, 0), ++ BTREE_ITER_PREFETCH, k, ret) { ++ if (k.k->p.inode != type) ++ break; ++ ++ ret = __bch2_quota_set(c, k); ++ if (ret) ++ break; ++ } ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++void bch2_fs_quota_exit(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ genradix_free(&c->quotas[i].table); ++} ++ ++void bch2_fs_quota_init(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < ARRAY_SIZE(c->quotas); i++) ++ mutex_init(&c->quotas[i].lock); ++} ++ ++static void bch2_sb_quota_read(struct bch_fs *c) ++{ ++ struct bch_sb_field_quota *sb_quota; ++ unsigned i, j; ++ ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) ++ return; ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ struct bch_memquota_type *q = &c->quotas[i]; ++ ++ for (j = 0; j < Q_COUNTERS; j++) { ++ q->limits[j].timelimit = ++ le32_to_cpu(sb_quota->q[i].c[j].timelimit); ++ q->limits[j].warnlimit = ++ le32_to_cpu(sb_quota->q[i].c[j].warnlimit); ++ } ++ } ++} ++ ++int bch2_fs_quota_read(struct bch_fs *c) ++{ ++ unsigned i, qtypes = enabled_qtypes(c); ++ struct bch_memquota_type *q; ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bch_inode_unpacked u; ++ struct bkey_s_c k; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ bch2_sb_quota_read(c); ++ mutex_unlock(&c->sb_lock); ++ ++ for_each_set_qtype(c, i, q, qtypes) { ++ ret = bch2_quota_init_type(c, i); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_INODES, POS_MIN, ++ BTREE_ITER_PREFETCH, k, ret) { ++ switch (k.k->type) { ++ case KEY_TYPE_inode: ++ ret = bch2_inode_unpack(bkey_s_c_to_inode(k), &u); ++ if (ret) ++ return ret; ++ ++ bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, ++ KEY_TYPE_QUOTA_NOCHECK); ++ bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, ++ KEY_TYPE_QUOTA_NOCHECK); ++ } ++ } ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++/* Enable/disable/delete quotas for an entire filesystem: */ ++ ++static int bch2_quota_enable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ /* Accounting must be enabled at mount time: */ ++ if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) ++ return -EINVAL; ++ ++ /* Can't enable enforcement without accounting: */ ++ if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) ++ return -EINVAL; ++ ++ if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) ++ return -EINVAL; ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) ++ return -EINVAL; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_disable(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ ++ 
if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ mutex_lock(&c->sb_lock); ++ if (uflags & FS_QUOTA_UDQ_ENFD) ++ SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); ++ ++ if (uflags & FS_QUOTA_GDQ_ENFD) ++ SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); ++ ++ if (uflags & FS_QUOTA_PDQ_ENFD) ++ SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++static int bch2_quota_remove(struct super_block *sb, unsigned uflags) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (uflags & FS_USER_QUOTA) { ++ if (c->opts.usrquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_USR, 0), ++ POS(QTYP_USR + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_GROUP_QUOTA) { ++ if (c->opts.grpquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_GRP, 0), ++ POS(QTYP_GRP + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ if (uflags & FS_PROJ_QUOTA) { ++ if (c->opts.prjquota) ++ return -EINVAL; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_QUOTAS, ++ POS(QTYP_PRJ, 0), ++ POS(QTYP_PRJ + 1, 0), ++ NULL); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Return quota status information, such as enforcements, quota file inode ++ * numbers etc. ++ */ ++static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ unsigned qtypes = enabled_qtypes(c); ++ unsigned i; ++ ++ memset(state, 0, sizeof(*state)); ++ ++ for (i = 0; i < QTYP_NR; i++) { ++ state->s_state[i].flags |= QCI_SYSFILE; ++ ++ if (!(qtypes & (1 << i))) ++ continue; ++ ++ state->s_state[i].flags |= QCI_ACCT_ENABLED; ++ ++ state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; ++ state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; ++ ++ state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; ++ state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Adjust quota timers & warnings ++ */ ++static int bch2_quota_set_info(struct super_block *sb, int type, ++ struct qc_info *info) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_sb_field_quota *sb_quota; ++ struct bch_memquota_type *q; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ if (type >= QTYP_NR) ++ return -EINVAL; ++ ++ if (!((1 << type) & enabled_qtypes(c))) ++ return -ESRCH; ++ ++ if (info->i_fieldmask & ++ ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) ++ return -EINVAL; ++ ++ q = &c->quotas[type]; ++ ++ mutex_lock(&c->sb_lock); ++ sb_quota = bch2_sb_get_quota(c->disk_sb.sb); ++ if (!sb_quota) { ++ sb_quota = bch2_sb_resize_quota(&c->disk_sb, ++ sizeof(*sb_quota) / sizeof(u64)); ++ if (!sb_quota) ++ return -ENOSPC; ++ } ++ ++ if (info->i_fieldmask & QC_SPC_TIMER) ++ sb_quota->q[type].c[Q_SPC].timelimit = ++ cpu_to_le32(info->i_spc_timelimit); ++ ++ if (info->i_fieldmask & QC_SPC_WARNS) ++ sb_quota->q[type].c[Q_SPC].warnlimit = ++ cpu_to_le32(info->i_spc_warnlimit); ++ ++ if (info->i_fieldmask & QC_INO_TIMER) ++ sb_quota->q[type].c[Q_INO].timelimit = ++ cpu_to_le32(info->i_ino_timelimit); ++ ++ if (info->i_fieldmask & QC_INO_WARNS) ++ sb_quota->q[type].c[Q_INO].warnlimit = ++ cpu_to_le32(info->i_ino_warnlimit); ++ ++ bch2_sb_quota_read(c); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++/* Get/set individual quotas: */ ++ ++static void 
__bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) ++{ ++ dst->d_space = src->c[Q_SPC].v << 9; ++ dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; ++ dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; ++ dst->d_spc_timer = src->c[Q_SPC].timer; ++ dst->d_spc_warns = src->c[Q_SPC].warns; ++ ++ dst->d_ino_count = src->c[Q_INO].v; ++ dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; ++ dst->d_ino_softlimit = src->c[Q_INO].softlimit; ++ dst->d_ino_timer = src->c[Q_INO].timer; ++ dst->d_ino_warns = src->c[Q_INO].warns; ++} ++ ++static int bch2_get_quota(struct super_block *sb, struct kqid kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid.type]; ++ qid_t qid = from_kqid(&init_user_ns, kqid); ++ struct bch_memquota *mq; ++ ++ memset(qdq, 0, sizeof(*qdq)); ++ ++ mutex_lock(&q->lock); ++ mq = genradix_ptr(&q->table, qid); ++ if (mq) ++ __bch2_quota_get(qdq, mq); ++ mutex_unlock(&q->lock); ++ ++ return 0; ++} ++ ++static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct bch_memquota_type *q = &c->quotas[kqid->type]; ++ qid_t qid = from_kqid(&init_user_ns, *kqid); ++ struct genradix_iter iter; ++ struct bch_memquota *mq; ++ int ret = 0; ++ ++ mutex_lock(&q->lock); ++ ++ genradix_for_each_from(&q->table, iter, mq, qid) ++ if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { ++ __bch2_quota_get(qdq, mq); ++ *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); ++ goto found; ++ } ++ ++ ret = -ENOENT; ++found: ++ mutex_unlock(&q->lock); ++ return ret; ++} ++ ++static int bch2_set_quota_trans(struct btree_trans *trans, ++ struct bkey_i_quota *new_quota, ++ struct qc_dqblk *qdq) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_QUOTAS, new_quota->k.p, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT); ++ k = bch2_btree_iter_peek_slot(iter); ++ ++ ret = bkey_err(k); ++ if (unlikely(ret)) ++ return ret; ++ ++ if (k.k->type == KEY_TYPE_quota) ++ new_quota->v = *bkey_s_c_to_quota(k).v; ++ ++ if (qdq->d_fieldmask & QC_SPC_SOFT) ++ new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); ++ if (qdq->d_fieldmask & QC_SPC_HARD) ++ new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); ++ ++ if (qdq->d_fieldmask & QC_INO_SOFT) ++ new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); ++ if (qdq->d_fieldmask & QC_INO_HARD) ++ new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); ++ ++ return bch2_trans_update(trans, iter, &new_quota->k_i, 0); ++} ++ ++static int bch2_set_quota(struct super_block *sb, struct kqid qid, ++ struct qc_dqblk *qdq) ++{ ++ struct bch_fs *c = sb->s_fs_info; ++ struct btree_trans trans; ++ struct bkey_i_quota new_quota; ++ int ret; ++ ++ if (sb->s_flags & SB_RDONLY) ++ return -EROFS; ++ ++ bkey_quota_init(&new_quota.k_i); ++ new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOUNLOCK, ++ bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: ++ __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i)); ++ ++ bch2_trans_exit(&trans); ++ ++ return ret; ++} ++ ++const struct quotactl_ops bch2_quotactl_operations = { ++ .quota_enable = bch2_quota_enable, ++ .quota_disable = bch2_quota_disable, ++ .rm_xquota = bch2_quota_remove, ++ ++ .get_state = bch2_quota_get_state, ++ .set_info = 
bch2_quota_set_info, ++ ++ .get_dqblk = bch2_get_quota, ++ .get_nextdqblk = bch2_get_next_quota, ++ .set_dqblk = bch2_set_quota, ++}; ++ ++#endif /* CONFIG_BCACHEFS_QUOTA */ +diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h +new file mode 100644 +index 000000000000..51e4f9713ef0 +--- /dev/null ++++ b/fs/bcachefs/quota.h +@@ -0,0 +1,71 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_H ++#define _BCACHEFS_QUOTA_H ++ ++#include "inode.h" ++#include "quota_types.h" ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_quota; ++ ++const char *bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_quota (struct bkey_ops) { \ ++ .key_invalid = bch2_quota_invalid, \ ++ .val_to_text = bch2_quota_to_text, \ ++} ++ ++static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) ++{ ++ return (struct bch_qid) { ++ .q[QTYP_USR] = u->bi_uid, ++ .q[QTYP_GRP] = u->bi_gid, ++ .q[QTYP_PRJ] = u->bi_project ? u->bi_project - 1 : 0, ++ }; ++} ++ ++static inline unsigned enabled_qtypes(struct bch_fs *c) ++{ ++ return ((c->opts.usrquota << QTYP_USR)| ++ (c->opts.grpquota << QTYP_GRP)| ++ (c->opts.prjquota << QTYP_PRJ)); ++} ++ ++#ifdef CONFIG_BCACHEFS_QUOTA ++ ++int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, ++ s64, enum quota_acct_mode); ++ ++int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, ++ struct bch_qid, u64, enum quota_acct_mode); ++ ++void bch2_fs_quota_exit(struct bch_fs *); ++void bch2_fs_quota_init(struct bch_fs *); ++int bch2_fs_quota_read(struct bch_fs *); ++ ++extern const struct quotactl_ops bch2_quotactl_operations; ++ ++#else ++ ++static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, ++ enum quota_counters counter, s64 v, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, ++ struct bch_qid dst, ++ struct bch_qid src, u64 space, ++ enum quota_acct_mode mode) ++{ ++ return 0; ++} ++ ++static inline void bch2_fs_quota_exit(struct bch_fs *c) {} ++static inline void bch2_fs_quota_init(struct bch_fs *c) {} ++static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } ++ ++#endif ++ ++#endif /* _BCACHEFS_QUOTA_H */ +diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h +new file mode 100644 +index 000000000000..6a136083d389 +--- /dev/null ++++ b/fs/bcachefs/quota_types.h +@@ -0,0 +1,43 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_QUOTA_TYPES_H ++#define _BCACHEFS_QUOTA_TYPES_H ++ ++#include ++ ++struct bch_qid { ++ u32 q[QTYP_NR]; ++}; ++ ++enum quota_acct_mode { ++ KEY_TYPE_QUOTA_PREALLOC, ++ KEY_TYPE_QUOTA_WARN, ++ KEY_TYPE_QUOTA_NOCHECK, ++}; ++ ++struct memquota_counter { ++ u64 v; ++ u64 hardlimit; ++ u64 softlimit; ++ s64 timer; ++ int warns; ++ int warning_issued; ++}; ++ ++struct bch_memquota { ++ struct memquota_counter c[Q_COUNTERS]; ++}; ++ ++typedef GENRADIX(struct bch_memquota) bch_memquota_table; ++ ++struct quota_limit { ++ u32 timelimit; ++ u32 warnlimit; ++}; ++ ++struct bch_memquota_type { ++ struct quota_limit limits[Q_COUNTERS]; ++ bch_memquota_table table; ++ struct mutex lock; ++}; ++ ++#endif /* _BCACHEFS_QUOTA_TYPES_H */ +diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c +new file mode 100644 +index 000000000000..56a1f761271f +--- /dev/null ++++ b/fs/bcachefs/rebalance.c +@@ -0,0 +1,331 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include 
"bcachefs.h" ++#include "alloc_foreground.h" ++#include "btree_iter.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "extents.h" ++#include "io.h" ++#include "move.h" ++#include "rebalance.h" ++#include "super-io.h" ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Check if an extent should be moved: ++ * returns -1 if it should not be moved, or ++ * device of pointer that should be moved, if known, or INT_MAX if unknown ++ */ ++static int __bch2_rebalance_pred(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ if (io_opts->background_compression && ++ !bch2_bkey_is_incompressible(k)) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ p.crc.compression_type != ++ bch2_compression_opt_to_type[io_opts->background_compression]) ++ return p.ptr.dev; ++ ++ if (io_opts->background_target) ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) ++ if (!p.ptr.cached && ++ !bch2_dev_in_target(c, p.ptr.dev, io_opts->background_target)) ++ return p.ptr.dev; ++ ++ return -1; ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *c, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts) ++{ ++ atomic64_t *counter; ++ int dev; ++ ++ dev = __bch2_rebalance_pred(c, k, io_opts); ++ if (dev < 0) ++ return; ++ ++ counter = dev < INT_MAX ++ ? &bch_dev_bkey_exists(c, dev)->rebalance_work ++ : &c->rebalance.work_unknown_dev; ++ ++ if (atomic64_add_return(k.k->size, counter) == k.k->size) ++ rebalance_wakeup(c); ++} ++ ++static enum data_cmd rebalance_pred(struct bch_fs *c, void *arg, ++ struct bkey_s_c k, ++ struct bch_io_opts *io_opts, ++ struct data_opts *data_opts) ++{ ++ if (__bch2_rebalance_pred(c, k, io_opts) >= 0) { ++ data_opts->target = io_opts->background_target; ++ data_opts->btree_insert_flags = 0; ++ return DATA_ADD_REPLICAS; ++ } else { ++ return DATA_SKIP; ++ } ++} ++ ++void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) ++{ ++ if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == ++ sectors) ++ rebalance_wakeup(c); ++} ++ ++struct rebalance_work { ++ int dev_most_full_idx; ++ unsigned dev_most_full_percent; ++ u64 dev_most_full_work; ++ u64 dev_most_full_capacity; ++ u64 total_work; ++}; ++ ++static void rebalance_work_accumulate(struct rebalance_work *w, ++ u64 dev_work, u64 unknown_dev, u64 capacity, int idx) ++{ ++ unsigned percent_full; ++ u64 work = dev_work + unknown_dev; ++ ++ if (work < dev_work || work < unknown_dev) ++ work = U64_MAX; ++ work = min(work, capacity); ++ ++ percent_full = div64_u64(work * 100, capacity); ++ ++ if (percent_full >= w->dev_most_full_percent) { ++ w->dev_most_full_idx = idx; ++ w->dev_most_full_percent = percent_full; ++ w->dev_most_full_work = work; ++ w->dev_most_full_capacity = capacity; ++ } ++ ++ if (w->total_work + dev_work >= w->total_work && ++ w->total_work + dev_work >= dev_work) ++ w->total_work += dev_work; ++} ++ ++static struct rebalance_work rebalance_work(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ struct rebalance_work ret = { .dev_most_full_idx = -1 }; ++ u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ rebalance_work_accumulate(&ret, ++ atomic64_read(&ca->rebalance_work), ++ unknown_dev, ++ bucket_to_sector(ca, ca->mi.nbuckets - ++ ca->mi.first_bucket), ++ i); ++ ++ rebalance_work_accumulate(&ret, ++ unknown_dev, 0, c->capacity, -1); 
++ ++ return ret; ++} ++ ++static void rebalance_work_reset(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ for_each_online_member(ca, c, i) ++ atomic64_set(&ca->rebalance_work, 0); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, 0); ++} ++ ++static unsigned long curr_cputime(void) ++{ ++ u64 utime, stime; ++ ++ task_cputime_adjusted(current, &utime, &stime); ++ return nsecs_to_jiffies(utime + stime); ++} ++ ++static int bch2_rebalance_thread(void *arg) ++{ ++ struct bch_fs *c = arg; ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct io_clock *clock = &c->io_clock[WRITE]; ++ struct rebalance_work w, p; ++ unsigned long start, prev_start; ++ unsigned long prev_run_time, prev_run_cputime; ++ unsigned long cputime, prev_cputime; ++ unsigned long io_start; ++ long throttle; ++ ++ set_freezable(); ++ ++ io_start = atomic_long_read(&clock->now); ++ p = rebalance_work(c); ++ prev_start = jiffies; ++ prev_cputime = curr_cputime(); ++ ++ while (!kthread_wait_freezable(r->enabled)) { ++ cond_resched(); ++ ++ start = jiffies; ++ cputime = curr_cputime(); ++ ++ prev_run_time = start - prev_start; ++ prev_run_cputime = cputime - prev_cputime; ++ ++ w = rebalance_work(c); ++ BUG_ON(!w.dev_most_full_capacity); ++ ++ if (!w.total_work) { ++ r->state = REBALANCE_WAITING; ++ kthread_wait_freezable(rebalance_work(c).total_work); ++ continue; ++ } ++ ++ /* ++ * If there isn't much work to do, throttle cpu usage: ++ */ ++ throttle = prev_run_cputime * 100 / ++ max(1U, w.dev_most_full_percent) - ++ prev_run_time; ++ ++ if (w.dev_most_full_percent < 20 && throttle > 0) { ++ r->throttled_until_iotime = io_start + ++ div_u64(w.dev_most_full_capacity * ++ (20 - w.dev_most_full_percent), ++ 50); ++ ++ if (atomic_long_read(&clock->now) + clock->max_slop < ++ r->throttled_until_iotime) { ++ r->throttled_until_cputime = start + throttle; ++ r->state = REBALANCE_THROTTLED; ++ ++ bch2_kthread_io_clock_wait(clock, ++ r->throttled_until_iotime, ++ throttle); ++ continue; ++ } ++ } ++ ++ /* minimum 1 mb/sec: */ ++ r->pd.rate.rate = ++ max_t(u64, 1 << 11, ++ r->pd.rate.rate * ++ max(p.dev_most_full_percent, 1U) / ++ max(w.dev_most_full_percent, 1U)); ++ ++ io_start = atomic_long_read(&clock->now); ++ p = w; ++ prev_start = start; ++ prev_cputime = cputime; ++ ++ r->state = REBALANCE_RUNNING; ++ memset(&r->move_stats, 0, sizeof(r->move_stats)); ++ rebalance_work_reset(c); ++ ++ bch2_move_data(c, ++ /* ratelimiting disabled for now */ ++ NULL, /* &r->pd.rate, */ ++ writepoint_ptr(&c->rebalance_write_point), ++ POS_MIN, POS_MAX, ++ rebalance_pred, NULL, ++ &r->move_stats); ++ } ++ ++ return 0; ++} ++ ++void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct bch_fs_rebalance *r = &c->rebalance; ++ struct rebalance_work w = rebalance_work(c); ++ char h1[21], h2[21]; ++ ++ bch2_hprint(&PBUF(h1), w.dev_most_full_work << 9); ++ bch2_hprint(&PBUF(h2), w.dev_most_full_capacity << 9); ++ pr_buf(out, "fullest_dev (%i):\t%s/%s\n", ++ w.dev_most_full_idx, h1, h2); ++ ++ bch2_hprint(&PBUF(h1), w.total_work << 9); ++ bch2_hprint(&PBUF(h2), c->capacity << 9); ++ pr_buf(out, "total work:\t\t%s/%s\n", h1, h2); ++ ++ pr_buf(out, "rate:\t\t\t%u\n", r->pd.rate.rate); ++ ++ switch (r->state) { ++ case REBALANCE_WAITING: ++ pr_buf(out, "waiting\n"); ++ break; ++ case REBALANCE_THROTTLED: ++ bch2_hprint(&PBUF(h1), ++ (r->throttled_until_iotime - ++ atomic_long_read(&c->io_clock[WRITE].now)) << 9); ++ pr_buf(out, "throttled for %lu sec or %s io\n", ++ (r->throttled_until_cputime - jiffies) / 
HZ, ++ h1); ++ break; ++ case REBALANCE_RUNNING: ++ pr_buf(out, "running\n"); ++ pr_buf(out, "pos %llu:%llu\n", ++ r->move_stats.pos.inode, ++ r->move_stats.pos.offset); ++ break; ++ } ++} ++ ++void bch2_rebalance_stop(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ c->rebalance.pd.rate.rate = UINT_MAX; ++ bch2_ratelimit_reset(&c->rebalance.pd.rate); ++ ++ p = rcu_dereference_protected(c->rebalance.thread, 1); ++ c->rebalance.thread = NULL; ++ ++ if (p) { ++ /* for sychronizing with rebalance_wakeup() */ ++ synchronize_rcu(); ++ ++ kthread_stop(p); ++ put_task_struct(p); ++ } ++} ++ ++int bch2_rebalance_start(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ if (c->opts.nochanges) ++ return 0; ++ ++ p = kthread_create(bch2_rebalance_thread, c, "bch_rebalance"); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ ++ get_task_struct(p); ++ rcu_assign_pointer(c->rebalance.thread, p); ++ wake_up_process(p); ++ return 0; ++} ++ ++void bch2_fs_rebalance_init(struct bch_fs *c) ++{ ++ bch2_pd_controller_init(&c->rebalance.pd); ++ ++ atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); ++} +diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h +new file mode 100644 +index 000000000000..7ade0bb81cce +--- /dev/null ++++ b/fs/bcachefs/rebalance.h +@@ -0,0 +1,28 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_H ++#define _BCACHEFS_REBALANCE_H ++ ++#include "rebalance_types.h" ++ ++static inline void rebalance_wakeup(struct bch_fs *c) ++{ ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = rcu_dereference(c->rebalance.thread); ++ if (p) ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, ++ struct bch_io_opts *); ++void bch2_rebalance_add_work(struct bch_fs *, u64); ++ ++void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); ++ ++void bch2_rebalance_stop(struct bch_fs *); ++int bch2_rebalance_start(struct bch_fs *); ++void bch2_fs_rebalance_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REBALANCE_H */ +diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h +new file mode 100644 +index 000000000000..192c6be20ced +--- /dev/null ++++ b/fs/bcachefs/rebalance_types.h +@@ -0,0 +1,27 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REBALANCE_TYPES_H ++#define _BCACHEFS_REBALANCE_TYPES_H ++ ++#include "move_types.h" ++ ++enum rebalance_state { ++ REBALANCE_WAITING, ++ REBALANCE_THROTTLED, ++ REBALANCE_RUNNING, ++}; ++ ++struct bch_fs_rebalance { ++ struct task_struct __rcu *thread; ++ struct bch_pd_controller pd; ++ ++ atomic64_t work_unknown_dev; ++ ++ enum rebalance_state state; ++ unsigned long throttled_until_iotime; ++ unsigned long throttled_until_cputime; ++ struct bch_move_stats move_stats; ++ ++ unsigned enabled:1; ++}; ++ ++#endif /* _BCACHEFS_REBALANCE_TYPES_H */ +diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c +new file mode 100644 +index 000000000000..d70fa968db50 +--- /dev/null ++++ b/fs/bcachefs/recovery.c +@@ -0,0 +1,1350 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "btree_gc.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "buckets.h" ++#include "dirent.h" ++#include "ec.h" ++#include "error.h" ++#include "fs-common.h" ++#include "fsck.h" ++#include "journal_io.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "quota.h" ++#include "recovery.h" ++#include "replicas.h" ++#include 
"super-io.h" ++ ++#include ++#include ++ ++#define QSTR(n) { { { .len = strlen(n) } }, .name = n } ++ ++/* iterate over keys read from the journal: */ ++ ++static struct journal_key *journal_key_search(struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ size_t l = 0, r = journal_keys->nr, m; ++ ++ while (l < r) { ++ m = l + ((r - l) >> 1); ++ if ((cmp_int(id, journal_keys->d[m].btree_id) ?: ++ cmp_int(level, journal_keys->d[m].level) ?: ++ bkey_cmp(pos, journal_keys->d[m].k->k.p)) > 0) ++ l = m + 1; ++ else ++ r = m; ++ } ++ ++ BUG_ON(l < journal_keys->nr && ++ (cmp_int(id, journal_keys->d[l].btree_id) ?: ++ cmp_int(level, journal_keys->d[l].level) ?: ++ bkey_cmp(pos, journal_keys->d[l].k->k.p)) > 0); ++ ++ BUG_ON(l && ++ (cmp_int(id, journal_keys->d[l - 1].btree_id) ?: ++ cmp_int(level, journal_keys->d[l - 1].level) ?: ++ bkey_cmp(pos, journal_keys->d[l - 1].k->k.p)) <= 0); ++ ++ return l < journal_keys->nr ? journal_keys->d + l : NULL; ++} ++ ++static struct bkey_i *bch2_journal_iter_peek(struct journal_iter *iter) ++{ ++ if (iter->k && ++ iter->k < iter->keys->d + iter->keys->nr && ++ iter->k->btree_id == iter->btree_id && ++ iter->k->level == iter->level) ++ return iter->k->k; ++ ++ iter->k = NULL; ++ return NULL; ++} ++ ++static void bch2_journal_iter_advance(struct journal_iter *iter) ++{ ++ if (iter->k) ++ iter->k++; ++} ++ ++static void bch2_journal_iter_init(struct journal_iter *iter, ++ struct journal_keys *journal_keys, ++ enum btree_id id, unsigned level, ++ struct bpos pos) ++{ ++ iter->btree_id = id; ++ iter->level = level; ++ iter->keys = journal_keys; ++ iter->k = journal_key_search(journal_keys, id, level, pos); ++} ++ ++static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) ++{ ++ return iter->btree ++ ? bch2_btree_iter_peek(iter->btree) ++ : bch2_btree_node_iter_peek_unpack(&iter->node_iter, ++ iter->b, &iter->unpacked); ++} ++ ++static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) ++{ ++ if (iter->btree) ++ bch2_btree_iter_next(iter->btree); ++ else ++ bch2_btree_node_iter_advance(&iter->node_iter, iter->b); ++} ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) ++{ ++ switch (iter->last) { ++ case none: ++ break; ++ case btree: ++ bch2_journal_iter_advance_btree(iter); ++ break; ++ case journal: ++ bch2_journal_iter_advance(&iter->journal); ++ break; ++ } ++ ++ iter->last = none; ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) ++{ ++ struct bkey_s_c ret; ++ ++ while (1) { ++ struct bkey_s_c btree_k = ++ bch2_journal_iter_peek_btree(iter); ++ struct bkey_s_c journal_k = ++ bkey_i_to_s_c(bch2_journal_iter_peek(&iter->journal)); ++ ++ if (btree_k.k && journal_k.k) { ++ int cmp = bkey_cmp(btree_k.k->p, journal_k.k->p); ++ ++ if (!cmp) ++ bch2_journal_iter_advance_btree(iter); ++ ++ iter->last = cmp < 0 ? btree : journal; ++ } else if (btree_k.k) { ++ iter->last = btree; ++ } else if (journal_k.k) { ++ iter->last = journal; ++ } else { ++ iter->last = none; ++ return bkey_s_c_null; ++ } ++ ++ ret = iter->last == journal ? 
journal_k : btree_k; ++ ++ if (iter->b && ++ bkey_cmp(ret.k->p, iter->b->data->max_key) > 0) { ++ iter->journal.k = NULL; ++ iter->last = none; ++ return bkey_s_c_null; ++ } ++ ++ if (!bkey_deleted(ret.k)) ++ break; ++ ++ bch2_btree_and_journal_iter_advance(iter); ++ } ++ ++ return ret; ++} ++ ++struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *iter) ++{ ++ bch2_btree_and_journal_iter_advance(iter); ++ ++ return bch2_btree_and_journal_iter_peek(iter); ++} ++ ++void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *iter, ++ struct btree_trans *trans, ++ struct journal_keys *journal_keys, ++ enum btree_id id, struct bpos pos) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->btree = bch2_trans_get_iter(trans, id, pos, 0); ++ bch2_journal_iter_init(&iter->journal, journal_keys, id, 0, pos); ++} ++ ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, ++ struct journal_keys *journal_keys, ++ struct btree *b) ++{ ++ memset(iter, 0, sizeof(*iter)); ++ ++ iter->b = b; ++ bch2_btree_node_iter_init_from_start(&iter->node_iter, iter->b); ++ bch2_journal_iter_init(&iter->journal, journal_keys, ++ b->c.btree_id, b->c.level, b->data->min_key); ++} ++ ++/* Walk btree, overlaying keys from the journal: */ ++ ++static int bch2_btree_and_journal_walk_recurse(struct bch_fs *c, struct btree *b, ++ struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ btree_walk_node_fn node_fn, ++ btree_walk_key_fn key_fn) ++{ ++ struct btree_and_journal_iter iter; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ bch2_btree_and_journal_iter_init_node_iter(&iter, journal_keys, b); ++ ++ while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { ++ ret = key_fn(c, btree_id, b->c.level, k); ++ if (ret) ++ break; ++ ++ if (b->c.level) { ++ struct btree *child; ++ BKEY_PADDED(k) tmp; ++ ++ bkey_reassemble(&tmp.k, k); ++ k = bkey_i_to_s_c(&tmp.k); ++ ++ bch2_btree_and_journal_iter_advance(&iter); ++ ++ if (b->c.level > 0) { ++ child = bch2_btree_node_get_noiter(c, &tmp.k, ++ b->c.btree_id, b->c.level - 1); ++ ret = PTR_ERR_OR_ZERO(child); ++ if (ret) ++ break; ++ ++ ret = (node_fn ? node_fn(c, b) : 0) ?: ++ bch2_btree_and_journal_walk_recurse(c, child, ++ journal_keys, btree_id, node_fn, key_fn); ++ six_unlock_read(&child->c.lock); ++ ++ if (ret) ++ break; ++ } ++ } else { ++ bch2_btree_and_journal_iter_advance(&iter); ++ } ++ } ++ ++ return ret; ++} ++ ++int bch2_btree_and_journal_walk(struct bch_fs *c, struct journal_keys *journal_keys, ++ enum btree_id btree_id, ++ btree_walk_node_fn node_fn, ++ btree_walk_key_fn key_fn) ++{ ++ struct btree *b = c->btree_roots[btree_id].b; ++ int ret = 0; ++ ++ if (btree_node_fake(b)) ++ return 0; ++ ++ six_lock_read(&b->c.lock, NULL, NULL); ++ ret = (node_fn ? 
node_fn(c, b) : 0) ?: ++ bch2_btree_and_journal_walk_recurse(c, b, journal_keys, btree_id, ++ node_fn, key_fn) ?: ++ key_fn(c, btree_id, b->c.level + 1, bkey_i_to_s_c(&b->key)); ++ six_unlock_read(&b->c.lock); ++ ++ return ret; ++} ++ ++/* sort and dedup all keys in the journal: */ ++ ++void bch2_journal_entries_free(struct list_head *list) ++{ ++ ++ while (!list_empty(list)) { ++ struct journal_replay *i = ++ list_first_entry(list, struct journal_replay, list); ++ list_del(&i->list); ++ kvpfree(i, offsetof(struct journal_replay, j) + ++ vstruct_bytes(&i->j)); ++ } ++} ++ ++/* ++ * When keys compare equal, oldest compares first: ++ */ ++static int journal_sort_key_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return cmp_int(l->btree_id, r->btree_id) ?: ++ cmp_int(l->level, r->level) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->journal_offset, r->journal_offset); ++} ++ ++void bch2_journal_keys_free(struct journal_keys *keys) ++{ ++ kvfree(keys->d); ++ keys->d = NULL; ++ keys->nr = 0; ++} ++ ++static struct journal_keys journal_keys_sort(struct list_head *journal_entries) ++{ ++ struct journal_replay *p; ++ struct jset_entry *entry; ++ struct bkey_i *k, *_n; ++ struct journal_keys keys = { NULL }; ++ struct journal_key *src, *dst; ++ size_t nr_keys = 0; ++ ++ if (list_empty(journal_entries)) ++ return keys; ++ ++ keys.journal_seq_base = ++ le64_to_cpu(list_last_entry(journal_entries, ++ struct journal_replay, list)->j.last_seq); ++ ++ list_for_each_entry(p, journal_entries, list) { ++ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) ++ continue; ++ ++ for_each_jset_key(k, _n, entry, &p->j) ++ nr_keys++; ++ } ++ ++ ++ keys.d = kvmalloc(sizeof(keys.d[0]) * nr_keys, GFP_KERNEL); ++ if (!keys.d) ++ goto err; ++ ++ list_for_each_entry(p, journal_entries, list) { ++ if (le64_to_cpu(p->j.seq) < keys.journal_seq_base) ++ continue; ++ ++ for_each_jset_key(k, _n, entry, &p->j) ++ keys.d[keys.nr++] = (struct journal_key) { ++ .btree_id = entry->btree_id, ++ .level = entry->level, ++ .k = k, ++ .journal_seq = le64_to_cpu(p->j.seq) - ++ keys.journal_seq_base, ++ .journal_offset = k->_data - p->j._data, ++ }; ++ } ++ ++ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_key_cmp, NULL); ++ ++ src = dst = keys.d; ++ while (src < keys.d + keys.nr) { ++ while (src + 1 < keys.d + keys.nr && ++ src[0].btree_id == src[1].btree_id && ++ src[0].level == src[1].level && ++ !bkey_cmp(src[0].k->k.p, src[1].k->k.p)) ++ src++; ++ ++ *dst++ = *src++; ++ } ++ ++ keys.nr = dst - keys.d; ++err: ++ return keys; ++} ++ ++/* journal replay: */ ++ ++static void replay_now_at(struct journal *j, u64 seq) ++{ ++ BUG_ON(seq < j->replay_journal_seq); ++ BUG_ON(seq > j->replay_journal_seq_end); ++ ++ while (j->replay_journal_seq < seq) ++ bch2_journal_pin_put(j, j->replay_journal_seq++); ++} ++ ++static int bch2_extent_replay_key(struct bch_fs *c, enum btree_id btree_id, ++ struct bkey_i *k) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter, *split_iter; ++ /* ++ * We might cause compressed extents to be split, so we need to pass in ++ * a disk_reservation: ++ */ ++ struct disk_reservation disk_res = ++ bch2_disk_reservation_init(c, 0); ++ struct bkey_i *split; ++ struct bpos atomic_end; ++ /* ++ * Some extents aren't equivalent - w.r.t. 
what the triggers do ++ * - if they're split: ++ */ ++ bool remark_if_split = bch2_bkey_sectors_compressed(bkey_i_to_s_c(k)) || ++ k->k.type == KEY_TYPE_reflink_p; ++ bool remark = false; ++ int ret; ++ ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); ++retry: ++ bch2_trans_begin(&trans); ++ ++ iter = bch2_trans_get_iter(&trans, btree_id, ++ bkey_start_pos(&k->k), ++ BTREE_ITER_INTENT); ++ ++ do { ++ ret = bch2_btree_iter_traverse(iter); ++ if (ret) ++ goto err; ++ ++ atomic_end = bpos_min(k->k.p, iter->l[0].b->key.k.p); ++ ++ split = bch2_trans_kmalloc(&trans, bkey_bytes(&k->k)); ++ ret = PTR_ERR_OR_ZERO(split); ++ if (ret) ++ goto err; ++ ++ if (!remark && ++ remark_if_split && ++ bkey_cmp(atomic_end, k->k.p) < 0) { ++ ret = bch2_disk_reservation_add(c, &disk_res, ++ k->k.size * ++ bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(k)), ++ BCH_DISK_RESERVATION_NOFAIL); ++ BUG_ON(ret); ++ ++ remark = true; ++ } ++ ++ bkey_copy(split, k); ++ bch2_cut_front(iter->pos, split); ++ bch2_cut_back(atomic_end, split); ++ ++ split_iter = bch2_trans_copy_iter(&trans, iter); ++ ret = PTR_ERR_OR_ZERO(split_iter); ++ if (ret) ++ goto err; ++ ++ /* ++ * It's important that we don't go through the ++ * extent_handle_overwrites() and extent_update_to_keys() path ++ * here: journal replay is supposed to treat extents like ++ * regular keys ++ */ ++ __bch2_btree_iter_set_pos(split_iter, split->k.p, false); ++ bch2_trans_update(&trans, split_iter, split, ++ BTREE_TRIGGER_NORUN); ++ ++ bch2_btree_iter_set_pos(iter, split->k.p); ++ ++ if (remark) { ++ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(split), ++ 0, split->k.size, ++ BTREE_TRIGGER_INSERT); ++ if (ret) ++ goto err; ++ } ++ } while (bkey_cmp(iter->pos, k->k.p) < 0); ++ ++ if (remark) { ++ ret = bch2_trans_mark_key(&trans, bkey_i_to_s_c(k), ++ 0, -((s64) k->k.size), ++ BTREE_TRIGGER_OVERWRITE); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_trans_commit(&trans, &disk_res, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY); ++err: ++ if (ret == -EINTR) ++ goto retry; ++ ++ bch2_disk_reservation_put(c, &disk_res); ++ ++ return bch2_trans_exit(&trans) ?: ret; ++} ++ ++static int __bch2_journal_replay_key(struct btree_trans *trans, ++ enum btree_id id, unsigned level, ++ struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_node_iter(trans, id, k->k.p, ++ BTREE_MAX_DEPTH, level, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ /* ++ * iter->flags & BTREE_ITER_IS_EXTENTS triggers the update path to run ++ * extent_handle_overwrites() and extent_update_to_keys() - but we don't ++ * want that here, journal replay is supposed to treat extents like ++ * regular keys: ++ */ ++ __bch2_btree_iter_set_pos(iter, k->k.p, false); ++ ++ ret = bch2_btree_iter_traverse(iter) ?: ++ bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_journal_replay_key(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_i *k) ++{ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY, ++ __bch2_journal_replay_key(&trans, id, level, k)); ++} ++ ++static int __bch2_alloc_replay_key(struct btree_trans *trans, struct bkey_i *k) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, k->k.p, ++ BTREE_ITER_CACHED| ++ BTREE_ITER_CACHED_NOFILL| ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter) ?: ++ 
bch2_trans_update(trans, iter, k, BTREE_TRIGGER_NORUN); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static int bch2_alloc_replay_key(struct bch_fs *c, struct bkey_i *k) ++{ ++ return bch2_trans_do(c, NULL, NULL, ++ BTREE_INSERT_NOFAIL| ++ BTREE_INSERT_USE_RESERVE| ++ BTREE_INSERT_LAZY_RW| ++ BTREE_INSERT_JOURNAL_REPLAY, ++ __bch2_alloc_replay_key(&trans, k)); ++} ++ ++static int journal_sort_seq_cmp(const void *_l, const void *_r) ++{ ++ const struct journal_key *l = _l; ++ const struct journal_key *r = _r; ++ ++ return cmp_int(r->level, l->level) ?: ++ cmp_int(l->journal_seq, r->journal_seq) ?: ++ cmp_int(l->btree_id, r->btree_id) ?: ++ bkey_cmp(l->k->k.p, r->k->k.p); ++} ++ ++static int bch2_journal_replay(struct bch_fs *c, ++ struct journal_keys keys) ++{ ++ struct journal *j = &c->journal; ++ struct journal_key *i; ++ u64 seq; ++ int ret; ++ ++ sort(keys.d, keys.nr, sizeof(keys.d[0]), journal_sort_seq_cmp, NULL); ++ ++ if (keys.nr) ++ replay_now_at(j, keys.journal_seq_base); ++ ++ seq = j->replay_journal_seq; ++ ++ /* ++ * First replay updates to the alloc btree - these will only update the ++ * btree key cache: ++ */ ++ for_each_journal_key(keys, i) { ++ cond_resched(); ++ ++ if (!i->level && i->btree_id == BTREE_ID_ALLOC) { ++ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; ++ ret = bch2_alloc_replay_key(c, i->k); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ /* ++ * Next replay updates to interior btree nodes: ++ */ ++ for_each_journal_key(keys, i) { ++ cond_resched(); ++ ++ if (i->level) { ++ j->replay_journal_seq = keys.journal_seq_base + i->journal_seq; ++ ret = bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ if (ret) ++ goto err; ++ } ++ } ++ ++ /* ++ * Now that the btree is in a consistent state, we can start journal ++ * reclaim (which will be flushing entries from the btree key cache back ++ * to the btree: ++ */ ++ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); ++ set_bit(JOURNAL_RECLAIM_STARTED, &j->flags); ++ ++ j->replay_journal_seq = seq; ++ ++ /* ++ * Now replay leaf node updates: ++ */ ++ for_each_journal_key(keys, i) { ++ cond_resched(); ++ ++ if (i->level || i->btree_id == BTREE_ID_ALLOC) ++ continue; ++ ++ replay_now_at(j, keys.journal_seq_base + i->journal_seq); ++ ++ ret = i->k->k.size ++ ? bch2_extent_replay_key(c, i->btree_id, i->k) ++ : bch2_journal_replay_key(c, i->btree_id, i->level, i->k); ++ if (ret) ++ goto err; ++ } ++ ++ replay_now_at(j, j->replay_journal_seq_end); ++ j->replay_journal_seq = 0; ++ ++ bch2_journal_set_replay_done(j); ++ bch2_journal_flush_all_pins(j); ++ return bch2_journal_error(j); ++err: ++ bch_err(c, "journal replay: error %d while replaying key", ret); ++ return ret; ++} ++ ++static bool journal_empty(struct list_head *journal) ++{ ++ return list_empty(journal) || ++ journal_entry_empty(&list_last_entry(journal, ++ struct journal_replay, list)->j); ++} ++ ++static int ++verify_journal_entries_not_blacklisted_or_missing(struct bch_fs *c, ++ struct list_head *journal) ++{ ++ struct journal_replay *i = ++ list_last_entry(journal, struct journal_replay, list); ++ u64 start_seq = le64_to_cpu(i->j.last_seq); ++ u64 end_seq = le64_to_cpu(i->j.seq); ++ u64 seq = start_seq; ++ int ret = 0; ++ ++ list_for_each_entry(i, journal, list) { ++ if (le64_to_cpu(i->j.seq) < start_seq) ++ continue; ++ ++ fsck_err_on(seq != le64_to_cpu(i->j.seq), c, ++ "journal entries %llu-%llu missing! 
(replaying %llu-%llu)", ++ seq, le64_to_cpu(i->j.seq) - 1, ++ start_seq, end_seq); ++ ++ seq = le64_to_cpu(i->j.seq); ++ ++ fsck_err_on(bch2_journal_seq_is_blacklisted(c, seq, false), c, ++ "found blacklisted journal entry %llu", seq); ++ ++ do { ++ seq++; ++ } while (bch2_journal_seq_is_blacklisted(c, seq, false)); ++ } ++fsck_err: ++ return ret; ++} ++ ++/* journal replay early: */ ++ ++static int journal_replay_entry_early(struct bch_fs *c, ++ struct jset_entry *entry) ++{ ++ int ret = 0; ++ ++ switch (entry->type) { ++ case BCH_JSET_ENTRY_btree_root: { ++ struct btree_root *r; ++ ++ if (entry->btree_id >= BTREE_ID_NR) { ++ bch_err(c, "filesystem has unknown btree type %u", ++ entry->btree_id); ++ return -EINVAL; ++ } ++ ++ r = &c->btree_roots[entry->btree_id]; ++ ++ if (entry->u64s) { ++ r->level = entry->level; ++ bkey_copy(&r->key, &entry->start[0]); ++ r->error = 0; ++ } else { ++ r->error = -EIO; ++ } ++ r->alive = true; ++ break; ++ } ++ case BCH_JSET_ENTRY_usage: { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ switch (entry->btree_id) { ++ case FS_USAGE_RESERVED: ++ if (entry->level < BCH_REPLICAS_MAX) ++ c->usage_base->persistent_reserved[entry->level] = ++ le64_to_cpu(u->v); ++ break; ++ case FS_USAGE_INODES: ++ c->usage_base->nr_inodes = le64_to_cpu(u->v); ++ break; ++ case FS_USAGE_KEY_VERSION: ++ atomic64_set(&c->key_version, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ ++ break; ++ } ++ case BCH_JSET_ENTRY_data_usage: { ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ret = bch2_replicas_set_usage(c, &u->r, ++ le64_to_cpu(u->v)); ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist: { ++ struct jset_entry_blacklist *bl_entry = ++ container_of(entry, struct jset_entry_blacklist, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->seq), ++ le64_to_cpu(bl_entry->seq) + 1); ++ break; ++ } ++ case BCH_JSET_ENTRY_blacklist_v2: { ++ struct jset_entry_blacklist_v2 *bl_entry = ++ container_of(entry, struct jset_entry_blacklist_v2, entry); ++ ++ ret = bch2_journal_seq_blacklist_add(c, ++ le64_to_cpu(bl_entry->start), ++ le64_to_cpu(bl_entry->end) + 1); ++ break; ++ } ++ } ++ ++ return ret; ++} ++ ++static int journal_replay_early(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct list_head *journal) ++{ ++ struct jset_entry *entry; ++ int ret; ++ ++ if (clean) { ++ c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); ++ c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); ++ ++ for (entry = clean->start; ++ entry != vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } else { ++ struct journal_replay *i = ++ list_last_entry(journal, struct journal_replay, list); ++ ++ c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); ++ c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); ++ ++ list_for_each_entry(i, journal, list) ++ vstruct_for_each(&i->j, entry) { ++ ret = journal_replay_entry_early(c, entry); ++ if (ret) ++ return ret; ++ } ++ } ++ ++ bch2_fs_usage_initialize(c); ++ ++ return 0; ++} ++ ++/* sb clean section: */ ++ ++static struct bkey_i *btree_root_find(struct bch_fs *c, ++ struct bch_sb_field_clean *clean, ++ struct jset *j, ++ enum btree_id id, unsigned *level) ++{ ++ struct bkey_i *k; ++ struct jset_entry *entry, *start, *end; ++ ++ if (clean) { ++ start = clean->start; ++ end = vstruct_end(&clean->field); ++ } else { ++ 
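++		/*
++		 * No clean section: look for the btree root in the journal
++		 * entry instead.
++		 */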
start = j->start; ++ end = vstruct_last(j); ++ } ++ ++ for (entry = start; entry < end; entry = vstruct_next(entry)) ++ if (entry->type == BCH_JSET_ENTRY_btree_root && ++ entry->btree_id == id) ++ goto found; ++ ++ return NULL; ++found: ++ if (!entry->u64s) ++ return ERR_PTR(-EINVAL); ++ ++ k = entry->start; ++ *level = entry->level; ++ return k; ++} ++ ++static int verify_superblock_clean(struct bch_fs *c, ++ struct bch_sb_field_clean **cleanp, ++ struct jset *j) ++{ ++ unsigned i; ++ struct bch_sb_field_clean *clean = *cleanp; ++ int ret = 0; ++ ++ if (!c->sb.clean || !j) ++ return 0; ++ ++ if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, ++ "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", ++ le64_to_cpu(clean->journal_seq), ++ le64_to_cpu(j->seq))) { ++ kfree(clean); ++ *cleanp = NULL; ++ return 0; ++ } ++ ++ mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, ++ "superblock read clock %u doesn't match journal %u after clean shutdown", ++ clean->read_clock, j->read_clock); ++ mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, ++ "superblock write clock %u doesn't match journal %u after clean shutdown", ++ clean->write_clock, j->write_clock); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ char buf1[200], buf2[200]; ++ struct bkey_i *k1, *k2; ++ unsigned l1 = 0, l2 = 0; ++ ++ k1 = btree_root_find(c, clean, NULL, i, &l1); ++ k2 = btree_root_find(c, NULL, j, i, &l2); ++ ++ if (!k1 && !k2) ++ continue; ++ ++ mustfix_fsck_err_on(!k1 || !k2 || ++ IS_ERR(k1) || ++ IS_ERR(k2) || ++ k1->k.u64s != k2->k.u64s || ++ memcmp(k1, k2, bkey_bytes(k1)) || ++ l1 != l2, c, ++ "superblock btree root %u doesn't match journal after clean shutdown\n" ++ "sb: l=%u %s\n" ++ "journal: l=%u %s\n", i, ++ l1, (bch2_bkey_val_to_text(&PBUF(buf1), c, bkey_i_to_s_c(k1)), buf1), ++ l2, (bch2_bkey_val_to_text(&PBUF(buf2), c, bkey_i_to_s_c(k2)), buf2)); ++ } ++fsck_err: ++ return ret; ++} ++ ++static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *clean, *sb_clean; ++ int ret; ++ ++ mutex_lock(&c->sb_lock); ++ sb_clean = bch2_sb_get_clean(c->disk_sb.sb); ++ ++ if (fsck_err_on(!sb_clean, c, ++ "superblock marked clean but clean section not present")) { ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ mutex_unlock(&c->sb_lock); ++ return NULL; ++ } ++ ++ clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), ++ GFP_KERNEL); ++ if (!clean) { ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ if (le16_to_cpu(c->disk_sb.sb->version) < ++ bcachefs_metadata_version_bkey_renumber) ++ bch2_sb_clean_renumber(clean, READ); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return clean; ++fsck_err: ++ mutex_unlock(&c->sb_lock); ++ return ERR_PTR(ret); ++} ++ ++static int read_btree_roots(struct bch_fs *c) ++{ ++ unsigned i; ++ int ret = 0; ++ ++ for (i = 0; i < BTREE_ID_NR; i++) { ++ struct btree_root *r = &c->btree_roots[i]; ++ ++ if (!r->alive) ++ continue; ++ ++ if (i == BTREE_ID_ALLOC && ++ c->opts.reconstruct_alloc) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ continue; ++ } ++ ++ ++ if (r->error) { ++ __fsck_err(c, i == BTREE_ID_ALLOC ++ ? FSCK_CAN_IGNORE : 0, ++ "invalid btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_ALLOC) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ } ++ ++ ret = bch2_btree_root_read(c, i, &r->key, r->level); ++ if (ret) { ++ __fsck_err(c, i == BTREE_ID_ALLOC ++ ? 
FSCK_CAN_IGNORE : 0, ++ "error reading btree root %s", ++ bch2_btree_ids[i]); ++ if (i == BTREE_ID_ALLOC) ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ } ++ } ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ if (!c->btree_roots[i].b) ++ bch2_btree_root_alloc(c, i); ++fsck_err: ++ return ret; ++} ++ ++int bch2_fs_recovery(struct bch_fs *c) ++{ ++ const char *err = "cannot allocate memory"; ++ struct bch_sb_field_clean *clean = NULL; ++ u64 journal_seq; ++ bool write_sb = false, need_write_alloc = false; ++ int ret; ++ ++ if (c->sb.clean) ++ clean = read_superblock_clean(c); ++ ret = PTR_ERR_OR_ZERO(clean); ++ if (ret) ++ goto err; ++ ++ if (c->sb.clean) ++ bch_info(c, "recovering from clean shutdown, journal seq %llu", ++ le64_to_cpu(clean->journal_seq)); ++ ++ if (!c->replicas.entries || ++ c->opts.rebuild_replicas) { ++ bch_info(c, "building replicas info"); ++ set_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); ++ } ++ ++ if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { ++ struct jset *j; ++ ++ ret = bch2_journal_read(c, &c->journal_entries); ++ if (ret) ++ goto err; ++ ++ if (mustfix_fsck_err_on(c->sb.clean && !journal_empty(&c->journal_entries), c, ++ "filesystem marked clean but journal not empty")) { ++ c->sb.compat &= ~(1ULL << BCH_COMPAT_FEAT_ALLOC_INFO); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->sb.clean = false; ++ } ++ ++ if (!c->sb.clean && list_empty(&c->journal_entries)) { ++ bch_err(c, "no journal entries found"); ++ ret = BCH_FSCK_REPAIR_IMPOSSIBLE; ++ goto err; ++ } ++ ++ c->journal_keys = journal_keys_sort(&c->journal_entries); ++ if (!c->journal_keys.d) { ++ ret = -ENOMEM; ++ goto err; ++ } ++ ++ j = &list_last_entry(&c->journal_entries, ++ struct journal_replay, list)->j; ++ ++ ret = verify_superblock_clean(c, &clean, j); ++ if (ret) ++ goto err; ++ ++ journal_seq = le64_to_cpu(j->seq) + 1; ++ } else { ++ journal_seq = le64_to_cpu(clean->journal_seq) + 1; ++ } ++ ++ if (!c->sb.clean && ++ !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { ++ bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = journal_replay_early(c, clean, &c->journal_entries); ++ if (ret) ++ goto err; ++ ++ if (!c->sb.clean) { ++ ret = bch2_journal_seq_blacklist_add(c, ++ journal_seq, ++ journal_seq + 4); ++ if (ret) { ++ bch_err(c, "error creating new journal seq blacklist entry"); ++ goto err; ++ } ++ ++ journal_seq += 4; ++ ++ /* ++ * The superblock needs to be written before we do any btree ++ * node writes: it will be in the read_write() path ++ */ ++ } ++ ++ ret = bch2_blacklist_table_initialize(c); ++ ++ if (!list_empty(&c->journal_entries)) { ++ ret = verify_journal_entries_not_blacklisted_or_missing(c, ++ &c->journal_entries); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_fs_journal_start(&c->journal, journal_seq, ++ &c->journal_entries); ++ if (ret) ++ goto err; ++ ++ ret = read_btree_roots(c); ++ if (ret) ++ goto err; ++ ++ bch_verbose(c, "starting alloc read"); ++ err = "error reading allocation information"; ++ ret = bch2_alloc_read(c, &c->journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "alloc read done"); ++ ++ bch_verbose(c, "starting stripes_read"); ++ err = "error reading stripes"; ++ ret = bch2_stripes_read(c, &c->journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "stripes_read done"); ++ ++ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ++ ++ if ((c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) && ++ !(c->sb.compat & (1ULL << 
BCH_COMPAT_FEAT_ALLOC_METADATA))) { ++ /* ++ * interior btree node updates aren't consistent with the ++ * journal; after an unclean shutdown we have to walk all ++ * pointers to metadata: ++ */ ++ bch_info(c, "starting metadata mark and sweep"); ++ err = "error in mark and sweep"; ++ ret = bch2_gc(c, &c->journal_keys, true, true); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ need_write_alloc = true; ++ bch_verbose(c, "mark and sweep done"); ++ } ++ ++ if (c->opts.fsck || ++ !(c->sb.compat & (1ULL << BCH_COMPAT_FEAT_ALLOC_INFO)) || ++ test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags)) { ++ bch_info(c, "starting mark and sweep"); ++ err = "error in mark and sweep"; ++ ret = bch2_gc(c, &c->journal_keys, true, false); ++ if (ret < 0) ++ goto err; ++ if (ret) ++ need_write_alloc = true; ++ bch_verbose(c, "mark and sweep done"); ++ } ++ ++ clear_bit(BCH_FS_REBUILD_REPLICAS, &c->flags); ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ ++ /* ++ * Skip past versions that might have possibly been used (as nonces), ++ * but hadn't had their pointers written: ++ */ ++ if (c->sb.encryption_type && !c->sb.clean) ++ atomic64_add(1 << 16, &c->key_version); ++ ++ if (c->opts.norecovery) ++ goto out; ++ ++ bch_verbose(c, "starting journal replay"); ++ err = "journal replay failed"; ++ ret = bch2_journal_replay(c, c->journal_keys); ++ if (ret) ++ goto err; ++ bch_verbose(c, "journal replay done"); ++ ++ if (need_write_alloc && !c->opts.nochanges) { ++ /* ++ * note that even when filesystem was clean there might be work ++ * to do here, if we ran gc (because of fsck) which recalculated ++ * oldest_gen: ++ */ ++ bch_verbose(c, "writing allocation info"); ++ err = "error writing out alloc info"; ++ ret = bch2_stripes_write(c, BTREE_INSERT_LAZY_RW) ?: ++ bch2_alloc_write(c, BTREE_INSERT_LAZY_RW); ++ if (ret) { ++ bch_err(c, "error writing alloc info"); ++ goto err; ++ } ++ bch_verbose(c, "alloc write done"); ++ ++ set_bit(BCH_FS_ALLOC_WRITTEN, &c->flags); ++ } ++ ++ if (!c->sb.clean) { ++ if (!(c->sb.features & (1 << BCH_FEATURE_atomic_nlink))) { ++ bch_info(c, "checking inode link counts"); ++ err = "error in recovery"; ++ ret = bch2_fsck_inode_nlink(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); ++ ++ } else { ++ bch_verbose(c, "checking for deleted inodes"); ++ err = "error in recovery"; ++ ret = bch2_fsck_walk_inodes_only(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "check inodes done"); ++ } ++ } ++ ++ if (c->opts.fsck) { ++ bch_info(c, "starting fsck"); ++ err = "error in fsck"; ++ ret = bch2_fsck_full(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "fsck done"); ++ } ++ ++ if (enabled_qtypes(c)) { ++ bch_verbose(c, "reading quotas"); ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ bch_verbose(c, "quotas done"); ++ } ++ ++ mutex_lock(&c->sb_lock); ++ if (c->opts.version_upgrade) { ++ if (c->sb.version < bcachefs_metadata_version_new_versioning) ++ c->disk_sb.sb->version_min = ++ le16_to_cpu(bcachefs_metadata_version_min); ++ c->disk_sb.sb->version = le16_to_cpu(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ write_sb = true; ++ } ++ ++ if (!test_bit(BCH_FS_ERROR, &c->flags)) { ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; ++ write_sb = true; ++ } ++ ++ if (c->opts.fsck && ++ !test_bit(BCH_FS_ERROR, &c->flags)) { ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0); ++ write_sb = true; ++ } ++ ++ if (write_sb) ++ bch2_write_super(c); ++ 
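++	/*
++	 * The superblock changes above (optional version upgrade, alloc info
++	 * compat bit, atomic_nlink feature) were flushed in a single
++	 * bch2_write_super() call while sb_lock is still held.
++	 */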
mutex_unlock(&c->sb_lock); ++ ++ if (c->journal_seq_blacklist_table && ++ c->journal_seq_blacklist_table->nr > 128) ++ queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work); ++out: ++ ret = 0; ++err: ++fsck_err: ++ set_bit(BCH_FS_FSCK_DONE, &c->flags); ++ bch2_flush_fsck_errs(c); ++ ++ if (!c->opts.keep_journal) { ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(&c->journal_entries); ++ } ++ kfree(clean); ++ if (ret) ++ bch_err(c, "Error in recovery: %s (%i)", err, ret); ++ else ++ bch_verbose(c, "ret %i", ret); ++ return ret; ++} ++ ++int bch2_fs_initialize(struct bch_fs *c) ++{ ++ struct bch_inode_unpacked root_inode, lostfound_inode; ++ struct bkey_inode_buf packed_inode; ++ struct qstr lostfound = QSTR("lost+found"); ++ const char *err = "cannot allocate memory"; ++ struct bch_dev *ca; ++ LIST_HEAD(journal); ++ unsigned i; ++ int ret; ++ ++ bch_notice(c, "initializing new filesystem"); ++ ++ mutex_lock(&c->sb_lock); ++ for_each_online_member(ca, c, i) ++ bch2_mark_dev_superblock(c, ca, 0); ++ mutex_unlock(&c->sb_lock); ++ ++ mutex_lock(&c->sb_lock); ++ c->disk_sb.sb->version = c->disk_sb.sb->version_min = ++ le16_to_cpu(bcachefs_metadata_version_current); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_atomic_nlink; ++ c->disk_sb.sb->features[0] |= BCH_SB_FEATURES_ALL; ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ set_bit(BCH_FS_ALLOC_READ_DONE, &c->flags); ++ set_bit(BCH_FS_INITIAL_GC_DONE, &c->flags); ++ ++ for (i = 0; i < BTREE_ID_NR; i++) ++ bch2_btree_root_alloc(c, i); ++ ++ set_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags); ++ set_bit(JOURNAL_RECLAIM_STARTED, &c->journal.flags); ++ ++ err = "unable to allocate journal buckets"; ++ for_each_online_member(ca, c, i) { ++ ret = bch2_dev_journal_alloc(ca); ++ if (ret) { ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ } ++ ++ /* ++ * journal_res_get() will crash if called before this has ++ * set up the journal.pin FIFO and journal.cur pointer: ++ */ ++ bch2_fs_journal_start(&c->journal, 1, &journal); ++ bch2_journal_set_replay_done(&c->journal); ++ ++ err = "error going read-write"; ++ ret = bch2_fs_read_write_early(c); ++ if (ret) ++ goto err; ++ ++ /* ++ * Write out the superblock and journal buckets, now that we can do ++ * btree updates ++ */ ++ err = "error writing alloc info"; ++ ret = bch2_alloc_write(c, 0); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init(c, &root_inode, 0, 0, ++ S_IFDIR|S_IRWXU|S_IRUGO|S_IXUGO, 0, NULL); ++ root_inode.bi_inum = BCACHEFS_ROOT_INO; ++ bch2_inode_pack(&packed_inode, &root_inode); ++ ++ err = "error creating root directory"; ++ ret = bch2_btree_insert(c, BTREE_ID_INODES, ++ &packed_inode.inode.k_i, ++ NULL, NULL, 0); ++ if (ret) ++ goto err; ++ ++ bch2_inode_init_early(c, &lostfound_inode); ++ ++ err = "error creating lost+found"; ++ ret = bch2_trans_do(c, NULL, NULL, 0, ++ bch2_create_trans(&trans, BCACHEFS_ROOT_INO, ++ &root_inode, &lostfound_inode, ++ &lostfound, ++ 0, 0, S_IFDIR|0700, 0, ++ NULL, NULL)); ++ if (ret) ++ goto err; ++ ++ if (enabled_qtypes(c)) { ++ ret = bch2_fs_quota_read(c); ++ if (ret) ++ goto err; ++ } ++ ++ err = "error writing first journal entry"; ++ ret = bch2_journal_meta(&c->journal); ++ if (ret) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++err: ++ pr_err("Error initializing new filesystem: %s (%i)", err, ret); ++ return ret; ++} +diff --git 
a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h +new file mode 100644 +index 000000000000..a66827c9addf +--- /dev/null ++++ b/fs/bcachefs/recovery.h +@@ -0,0 +1,60 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_RECOVERY_H ++#define _BCACHEFS_RECOVERY_H ++ ++#define for_each_journal_key(keys, i) \ ++ for (i = (keys).d; i < (keys).d + (keys).nr; (i)++) ++ ++struct journal_iter { ++ enum btree_id btree_id; ++ unsigned level; ++ struct journal_keys *keys; ++ struct journal_key *k; ++}; ++ ++/* ++ * Iterate over keys in the btree, with keys from the journal overlaid on top: ++ */ ++ ++struct btree_and_journal_iter { ++ struct btree_iter *btree; ++ ++ struct btree *b; ++ struct btree_node_iter node_iter; ++ struct bkey unpacked; ++ ++ struct journal_iter journal; ++ ++ enum last_key_returned { ++ none, ++ btree, ++ journal, ++ } last; ++}; ++ ++void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); ++struct bkey_s_c bch2_btree_and_journal_iter_next(struct btree_and_journal_iter *); ++ ++void bch2_btree_and_journal_iter_init(struct btree_and_journal_iter *, ++ struct btree_trans *, ++ struct journal_keys *, ++ enum btree_id, struct bpos); ++void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, ++ struct journal_keys *, ++ struct btree *); ++ ++typedef int (*btree_walk_node_fn)(struct bch_fs *c, struct btree *b); ++typedef int (*btree_walk_key_fn)(struct bch_fs *c, enum btree_id id, ++ unsigned level, struct bkey_s_c k); ++ ++int bch2_btree_and_journal_walk(struct bch_fs *, struct journal_keys *, enum btree_id, ++ btree_walk_node_fn, btree_walk_key_fn); ++ ++void bch2_journal_keys_free(struct journal_keys *); ++void bch2_journal_entries_free(struct list_head *); ++ ++int bch2_fs_recovery(struct bch_fs *); ++int bch2_fs_initialize(struct bch_fs *); ++ ++#endif /* _BCACHEFS_RECOVERY_H */ +diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c +new file mode 100644 +index 000000000000..3c473f1380a6 +--- /dev/null ++++ b/fs/bcachefs/reflink.c +@@ -0,0 +1,303 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "bkey_on_stack.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "inode.h" ++#include "io.h" ++#include "reflink.h" ++ ++#include ++ ++/* reflink pointers */ ++ ++const char *bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ if (bkey_val_bytes(p.k) != sizeof(*p.v)) ++ return "incorrect value size"; ++ ++ return NULL; ++} ++ ++void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); ++ ++ pr_buf(out, "idx %llu", le64_to_cpu(p.v->idx)); ++} ++ ++enum merge_result bch2_reflink_p_merge(struct bch_fs *c, ++ struct bkey_s _l, struct bkey_s _r) ++{ ++ struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); ++ struct bkey_s_reflink_p r = bkey_s_to_reflink_p(_r); ++ ++ if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) ++ return BCH_MERGE_NOMERGE; ++ ++ if ((u64) l.k->size + r.k->size > KEY_SIZE_MAX) { ++ bch2_key_resize(l.k, KEY_SIZE_MAX); ++ bch2_cut_front_s(l.k->p, _r); ++ return BCH_MERGE_PARTIAL; ++ } ++ ++ bch2_key_resize(l.k, l.k->size + r.k->size); ++ ++ return BCH_MERGE_MERGE; ++} ++ ++/* indirect extents */ ++ ++const char *bch2_reflink_v_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ struct 
bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ if (bkey_val_bytes(r.k) < sizeof(*r.v)) ++ return "incorrect value size"; ++ ++ return bch2_bkey_ptrs_invalid(c, k); ++} ++ ++void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); ++ ++ pr_buf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); ++ ++ bch2_bkey_ptrs_to_text(out, c, k); ++} ++ ++static int bch2_make_extent_indirect(struct btree_trans *trans, ++ struct btree_iter *extent_iter, ++ struct bkey_i_extent *e) ++{ ++ struct bch_fs *c = trans->c; ++ struct btree_iter *reflink_iter; ++ struct bkey_s_c k; ++ struct bkey_i_reflink_v *r_v; ++ struct bkey_i_reflink_p *r_p; ++ int ret; ++ ++ for_each_btree_key(trans, reflink_iter, BTREE_ID_REFLINK, ++ POS(0, c->reflink_hint), ++ BTREE_ITER_INTENT|BTREE_ITER_SLOTS, k, ret) { ++ if (reflink_iter->pos.inode) { ++ bch2_btree_iter_set_pos(reflink_iter, POS_MIN); ++ continue; ++ } ++ ++ if (bkey_deleted(k.k) && e->k.size <= k.k->size) ++ break; ++ } ++ ++ if (ret) ++ goto err; ++ ++ /* rewind iter to start of hole, if necessary: */ ++ bch2_btree_iter_set_pos(reflink_iter, bkey_start_pos(k.k)); ++ ++ r_v = bch2_trans_kmalloc(trans, sizeof(*r_v) + bkey_val_bytes(&e->k)); ++ ret = PTR_ERR_OR_ZERO(r_v); ++ if (ret) ++ goto err; ++ ++ bkey_reflink_v_init(&r_v->k_i); ++ r_v->k.p = reflink_iter->pos; ++ bch2_key_resize(&r_v->k, e->k.size); ++ r_v->k.version = e->k.version; ++ ++ set_bkey_val_u64s(&r_v->k, bkey_val_u64s(&r_v->k) + ++ bkey_val_u64s(&e->k)); ++ r_v->v.refcount = 0; ++ memcpy(r_v->v.start, e->v.start, bkey_val_bytes(&e->k)); ++ ++ bch2_trans_update(trans, reflink_iter, &r_v->k_i, 0); ++ ++ r_p = bch2_trans_kmalloc(trans, sizeof(*r_p)); ++ if (IS_ERR(r_p)) ++ return PTR_ERR(r_p); ++ ++ e->k.type = KEY_TYPE_reflink_p; ++ r_p = bkey_i_to_reflink_p(&e->k_i); ++ set_bkey_val_bytes(&r_p->k, sizeof(r_p->v)); ++ r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); ++ ++ bch2_trans_update(trans, extent_iter, &r_p->k_i, 0); ++err: ++ if (!IS_ERR(reflink_iter)) ++ c->reflink_hint = reflink_iter->pos.offset; ++ bch2_trans_iter_put(trans, reflink_iter); ++ ++ return ret; ++} ++ ++static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) ++{ ++ struct bkey_s_c k = bch2_btree_iter_peek(iter); ++ int ret; ++ ++ for_each_btree_key_continue(iter, 0, k, ret) { ++ if (bkey_cmp(iter->pos, end) >= 0) ++ return bkey_s_c_null; ++ ++ if (k.k->type == KEY_TYPE_extent || ++ k.k->type == KEY_TYPE_reflink_p) ++ break; ++ } ++ ++ return k; ++} ++ ++s64 bch2_remap_range(struct bch_fs *c, ++ struct bpos dst_start, struct bpos src_start, ++ u64 remap_sectors, u64 *journal_seq, ++ u64 new_i_size, s64 *i_sectors_delta) ++{ ++ struct btree_trans trans; ++ struct btree_iter *dst_iter, *src_iter; ++ struct bkey_s_c src_k; ++ BKEY_PADDED(k) new_dst; ++ struct bkey_on_stack new_src; ++ struct bpos dst_end = dst_start, src_end = src_start; ++ struct bpos dst_want, src_want; ++ u64 src_done, dst_done; ++ int ret = 0, ret2 = 0; ++ ++ if (!c->opts.reflink) ++ return -EOPNOTSUPP; ++ ++ if (!percpu_ref_tryget(&c->writes)) ++ return -EROFS; ++ ++ bch2_check_set_feature(c, BCH_FEATURE_reflink); ++ ++ dst_end.offset += remap_sectors; ++ src_end.offset += remap_sectors; ++ ++ bkey_on_stack_init(&new_src); ++ bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); ++ ++ src_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, src_start, ++ BTREE_ITER_INTENT); ++ dst_iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, 
dst_start, ++ BTREE_ITER_INTENT); ++ ++ while (1) { ++ bch2_trans_begin(&trans); ++ ++ trans.mem_top = 0; ++ ++ if (fatal_signal_pending(current)) { ++ ret = -EINTR; ++ goto err; ++ } ++ ++ src_k = get_next_src(src_iter, src_end); ++ ret = bkey_err(src_k); ++ if (ret) ++ goto btree_err; ++ ++ src_done = bpos_min(src_iter->pos, src_end).offset - ++ src_start.offset; ++ dst_want = POS(dst_start.inode, dst_start.offset + src_done); ++ ++ if (bkey_cmp(dst_iter->pos, dst_want) < 0) { ++ ret = bch2_fpunch_at(&trans, dst_iter, dst_want, ++ journal_seq, i_sectors_delta); ++ if (ret) ++ goto btree_err; ++ continue; ++ } ++ ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_want)); ++ ++ if (!bkey_cmp(dst_iter->pos, dst_end)) ++ break; ++ ++ if (src_k.k->type == KEY_TYPE_extent) { ++ bkey_on_stack_reassemble(&new_src, c, src_k); ++ src_k = bkey_i_to_s_c(new_src.k); ++ ++ bch2_cut_front(src_iter->pos, new_src.k); ++ bch2_cut_back(src_end, new_src.k); ++ ++ ret = bch2_make_extent_indirect(&trans, src_iter, ++ bkey_i_to_extent(new_src.k)); ++ if (ret) ++ goto btree_err; ++ ++ BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); ++ } ++ ++ if (src_k.k->type == KEY_TYPE_reflink_p) { ++ struct bkey_s_c_reflink_p src_p = ++ bkey_s_c_to_reflink_p(src_k); ++ struct bkey_i_reflink_p *dst_p = ++ bkey_reflink_p_init(&new_dst.k); ++ ++ u64 offset = le64_to_cpu(src_p.v->idx) + ++ (src_iter->pos.offset - ++ bkey_start_offset(src_k.k)); ++ ++ dst_p->v.idx = cpu_to_le64(offset); ++ } else { ++ BUG(); ++ } ++ ++ new_dst.k.k.p = dst_iter->pos; ++ bch2_key_resize(&new_dst.k.k, ++ min(src_k.k->p.offset - src_iter->pos.offset, ++ dst_end.offset - dst_iter->pos.offset)); ++ ++ ret = bch2_extent_update(&trans, dst_iter, &new_dst.k, ++ NULL, journal_seq, ++ new_i_size, i_sectors_delta); ++ if (ret) ++ goto btree_err; ++ ++ dst_done = dst_iter->pos.offset - dst_start.offset; ++ src_want = POS(src_start.inode, src_start.offset + dst_done); ++ bch2_btree_iter_set_pos(src_iter, src_want); ++btree_err: ++ if (ret == -EINTR) ++ ret = 0; ++ if (ret) ++ goto err; ++ } ++ ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_end)); ++err: ++ BUG_ON(bkey_cmp(dst_iter->pos, dst_end) > 0); ++ ++ dst_done = dst_iter->pos.offset - dst_start.offset; ++ new_i_size = min(dst_iter->pos.offset << 9, new_i_size); ++ ++ bch2_trans_begin(&trans); ++ ++ do { ++ struct bch_inode_unpacked inode_u; ++ struct btree_iter *inode_iter; ++ ++ inode_iter = bch2_inode_peek(&trans, &inode_u, ++ dst_start.inode, BTREE_ITER_INTENT); ++ ret2 = PTR_ERR_OR_ZERO(inode_iter); ++ ++ if (!ret2 && ++ inode_u.bi_size < new_i_size) { ++ inode_u.bi_size = new_i_size; ++ ret2 = bch2_inode_write(&trans, inode_iter, &inode_u) ?: ++ bch2_trans_commit(&trans, NULL, journal_seq, 0); ++ } ++ } while (ret2 == -EINTR); ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ bkey_on_stack_exit(&new_src, c); ++ ++ percpu_ref_put(&c->writes); ++ ++ return dst_done ?: ret ?: ret2; ++} +diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h +new file mode 100644 +index 000000000000..5445c1cf0797 +--- /dev/null ++++ b/fs/bcachefs/reflink.h +@@ -0,0 +1,31 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REFLINK_H ++#define _BCACHEFS_REFLINK_H ++ ++const char *bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++enum merge_result bch2_reflink_p_merge(struct bch_fs *, ++ struct bkey_s, struct bkey_s); ++ ++#define bch2_bkey_ops_reflink_p (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_p_invalid, \ ++ 
.val_to_text = bch2_reflink_p_to_text, \ ++ .key_merge = bch2_reflink_p_merge, \ ++} ++ ++const char *bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, ++ struct bkey_s_c); ++ ++ ++#define bch2_bkey_ops_reflink_v (struct bkey_ops) { \ ++ .key_invalid = bch2_reflink_v_invalid, \ ++ .val_to_text = bch2_reflink_v_to_text, \ ++ .swab = bch2_ptr_swab, \ ++} ++ ++s64 bch2_remap_range(struct bch_fs *, struct bpos, struct bpos, ++ u64, u64 *, u64, s64 *); ++ ++#endif /* _BCACHEFS_REFLINK_H */ +diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c +new file mode 100644 +index 000000000000..6b6506c68609 +--- /dev/null ++++ b/fs/bcachefs/replicas.c +@@ -0,0 +1,1059 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "buckets.h" ++#include "journal.h" ++#include "replicas.h" ++#include "super-io.h" ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, ++ struct bch_replicas_cpu *); ++ ++/* Replicas tracking - in memory: */ ++ ++static inline int u8_cmp(u8 l, u8 r) ++{ ++ return cmp_int(l, r); ++} ++ ++static void verify_replicas_entry(struct bch_replicas_entry *e) ++{ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ unsigned i; ++ ++ BUG_ON(e->data_type >= BCH_DATA_NR); ++ BUG_ON(!e->nr_devs); ++ BUG_ON(e->nr_required > 1 && ++ e->nr_required >= e->nr_devs); ++ ++ for (i = 0; i + 1 < e->nr_devs; i++) ++ BUG_ON(e->devs[i] >= e->devs[i + 1]); ++#endif ++} ++ ++static void replicas_entry_sort(struct bch_replicas_entry *e) ++{ ++ bubble_sort(e->devs, e->nr_devs, u8_cmp); ++} ++ ++static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) ++{ ++ eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); ++} ++ ++void bch2_replicas_entry_to_text(struct printbuf *out, ++ struct bch_replicas_entry *e) ++{ ++ unsigned i; ++ ++ pr_buf(out, "%s: %u/%u [", ++ bch2_data_types[e->data_type], ++ e->nr_required, ++ e->nr_devs); ++ ++ for (i = 0; i < e->nr_devs; i++) ++ pr_buf(out, i ? 
" %u" : "%u", e->devs[i]); ++ pr_buf(out, "]"); ++} ++ ++void bch2_cpu_replicas_to_text(struct printbuf *out, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_cpu_replicas_entry(r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++} ++ ++static void extent_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ r->nr_required = 1; ++ ++ bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { ++ if (p.ptr.cached) ++ continue; ++ ++ if (!p.has_ec) ++ r->devs[r->nr_devs++] = p.ptr.dev; ++ else ++ r->nr_required = 0; ++ } ++} ++ ++static void stripe_to_replicas(struct bkey_s_c k, ++ struct bch_replicas_entry *r) ++{ ++ struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); ++ const struct bch_extent_ptr *ptr; ++ ++ r->nr_required = s.v->nr_blocks - s.v->nr_redundant; ++ ++ for (ptr = s.v->ptrs; ++ ptr < s.v->ptrs + s.v->nr_blocks; ++ ptr++) ++ r->devs[r->nr_devs++] = ptr->dev; ++} ++ ++void bch2_bkey_to_replicas(struct bch_replicas_entry *e, ++ struct bkey_s_c k) ++{ ++ e->nr_devs = 0; ++ ++ switch (k.k->type) { ++ case KEY_TYPE_btree_ptr: ++ case KEY_TYPE_btree_ptr_v2: ++ e->data_type = BCH_DATA_btree; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_extent: ++ case KEY_TYPE_reflink_v: ++ e->data_type = BCH_DATA_user; ++ extent_to_replicas(k, e); ++ break; ++ case KEY_TYPE_stripe: ++ e->data_type = BCH_DATA_user; ++ stripe_to_replicas(k, e); ++ break; ++ } ++ ++ replicas_entry_sort(e); ++} ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *e, ++ enum bch_data_type data_type, ++ struct bch_devs_list devs) ++{ ++ unsigned i; ++ ++ BUG_ON(!data_type || ++ data_type == BCH_DATA_sb || ++ data_type >= BCH_DATA_NR); ++ ++ e->data_type = data_type; ++ e->nr_devs = 0; ++ e->nr_required = 1; ++ ++ for (i = 0; i < devs.nr; i++) ++ e->devs[e->nr_devs++] = devs.devs[i]; ++ ++ replicas_entry_sort(e); ++} ++ ++static struct bch_replicas_cpu ++cpu_replicas_add_entry(struct bch_replicas_cpu *old, ++ struct bch_replicas_entry *new_entry) ++{ ++ unsigned i; ++ struct bch_replicas_cpu new = { ++ .nr = old->nr + 1, ++ .entry_size = max_t(unsigned, old->entry_size, ++ replicas_entry_bytes(new_entry)), ++ }; ++ ++ BUG_ON(!new_entry->data_type); ++ verify_replicas_entry(new_entry); ++ ++ new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO); ++ if (!new.entries) ++ return new; ++ ++ for (i = 0; i < old->nr; i++) ++ memcpy(cpu_replicas_entry(&new, i), ++ cpu_replicas_entry(old, i), ++ old->entry_size); ++ ++ memcpy(cpu_replicas_entry(&new, old->nr), ++ new_entry, ++ replicas_entry_bytes(new_entry)); ++ ++ bch2_cpu_replicas_sort(&new); ++ return new; ++} ++ ++static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ int idx, entry_size = replicas_entry_bytes(search); ++ ++ if (unlikely(entry_size > r->entry_size)) ++ return -1; ++ ++ verify_replicas_entry(search); ++ ++#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) ++ idx = eytzinger0_find(r->entries, r->nr, r->entry_size, ++ entry_cmp, search); ++#undef entry_cmp ++ ++ return idx < r->nr ? 
idx : -1; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *c, ++ struct bch_replicas_entry *search) ++{ ++ replicas_entry_sort(search); ++ ++ return __replicas_entry_idx(&c->replicas, search); ++} ++ ++static bool __replicas_has_entry(struct bch_replicas_cpu *r, ++ struct bch_replicas_entry *search) ++{ ++ return __replicas_entry_idx(r, search) >= 0; ++} ++ ++bool bch2_replicas_marked(struct bch_fs *c, ++ struct bch_replicas_entry *search) ++{ ++ bool marked; ++ ++ if (!search->nr_devs) ++ return true; ++ ++ verify_replicas_entry(search); ++ ++ percpu_down_read(&c->mark_lock); ++ marked = __replicas_has_entry(&c->replicas, search) && ++ (likely((!c->replicas_gc.entries)) || ++ __replicas_has_entry(&c->replicas_gc, search)); ++ percpu_up_read(&c->mark_lock); ++ ++ return marked; ++} ++ ++static void __replicas_table_update(struct bch_fs_usage *dst, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage *src, ++ struct bch_replicas_cpu *src_r) ++{ ++ int src_idx, dst_idx; ++ ++ *dst = *src; ++ ++ for (src_idx = 0; src_idx < src_r->nr; src_idx++) { ++ if (!src->replicas[src_idx]) ++ continue; ++ ++ dst_idx = __replicas_entry_idx(dst_r, ++ cpu_replicas_entry(src_r, src_idx)); ++ BUG_ON(dst_idx < 0); ++ ++ dst->replicas[dst_idx] = src->replicas[src_idx]; ++ } ++} ++ ++static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, ++ struct bch_replicas_cpu *dst_r, ++ struct bch_fs_usage __percpu *src_p, ++ struct bch_replicas_cpu *src_r) ++{ ++ unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; ++ struct bch_fs_usage *dst, *src = (void *) ++ bch2_acc_percpu_u64s((void *) src_p, src_nr); ++ ++ preempt_disable(); ++ dst = this_cpu_ptr(dst_p); ++ preempt_enable(); ++ ++ __replicas_table_update(dst, dst_r, src, src_r); ++} ++ ++/* ++ * Resize filesystem accounting: ++ */ ++static int replicas_table_update(struct bch_fs *c, ++ struct bch_replicas_cpu *new_r) ++{ ++ struct bch_fs_usage __percpu *new_usage[2] = { NULL, NULL }; ++ struct bch_fs_usage *new_scratch = NULL; ++ struct bch_fs_usage __percpu *new_gc = NULL; ++ struct bch_fs_usage *new_base = NULL; ++ unsigned bytes = sizeof(struct bch_fs_usage) + ++ sizeof(u64) * new_r->nr; ++ int ret = -ENOMEM; ++ ++ if (!(new_base = kzalloc(bytes, GFP_NOIO)) || ++ !(new_usage[0] = __alloc_percpu_gfp(bytes, sizeof(u64), ++ GFP_NOIO)) || ++ !(new_usage[1] = __alloc_percpu_gfp(bytes, sizeof(u64), ++ GFP_NOIO)) || ++ !(new_scratch = kmalloc(bytes, GFP_NOIO)) || ++ (c->usage_gc && ++ !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_NOIO)))) { ++ bch_err(c, "error updating replicas table: memory allocation failure"); ++ goto err; ++ } ++ ++ if (c->usage_base) ++ __replicas_table_update(new_base, new_r, ++ c->usage_base, &c->replicas); ++ if (c->usage[0]) ++ __replicas_table_update_pcpu(new_usage[0], new_r, ++ c->usage[0], &c->replicas); ++ if (c->usage[1]) ++ __replicas_table_update_pcpu(new_usage[1], new_r, ++ c->usage[1], &c->replicas); ++ if (c->usage_gc) ++ __replicas_table_update_pcpu(new_gc, new_r, ++ c->usage_gc, &c->replicas); ++ ++ swap(c->usage_base, new_base); ++ swap(c->usage[0], new_usage[0]); ++ swap(c->usage[1], new_usage[1]); ++ swap(c->usage_scratch, new_scratch); ++ swap(c->usage_gc, new_gc); ++ swap(c->replicas, *new_r); ++ ret = 0; ++err: ++ free_percpu(new_gc); ++ kfree(new_scratch); ++ free_percpu(new_usage[1]); ++ free_percpu(new_usage[0]); ++ kfree(new_base); ++ return ret; ++} ++ ++static unsigned reserve_journal_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct 
bch_replicas_entry *e; ++ unsigned journal_res_u64s = 0; ++ ++ /* nr_inodes: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* key_version: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); ++ ++ /* persistent_reserved: */ ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * ++ BCH_REPLICAS_MAX; ++ ++ for_each_cpu_replicas_entry(r, e) ++ journal_res_u64s += ++ DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + ++ e->nr_devs, sizeof(u64)); ++ return journal_res_u64s; ++} ++ ++noinline ++static int bch2_mark_replicas_slowpath(struct bch_fs *c, ++ struct bch_replicas_entry *new_entry) ++{ ++ struct bch_replicas_cpu new_r, new_gc; ++ int ret = 0; ++ ++ verify_replicas_entry(new_entry); ++ ++ memset(&new_r, 0, sizeof(new_r)); ++ memset(&new_gc, 0, sizeof(new_gc)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (c->replicas_gc.entries && ++ !__replicas_has_entry(&c->replicas_gc, new_entry)) { ++ new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); ++ if (!new_gc.entries) ++ goto err; ++ } ++ ++ if (!__replicas_has_entry(&c->replicas, new_entry)) { ++ new_r = cpu_replicas_add_entry(&c->replicas, new_entry); ++ if (!new_r.entries) ++ goto err; ++ ++ ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); ++ if (ret) ++ goto err; ++ ++ bch2_journal_entry_res_resize(&c->journal, ++ &c->replicas_journal_res, ++ reserve_journal_replicas(c, &new_r)); ++ } ++ ++ if (!new_r.entries && ++ !new_gc.entries) ++ goto out; ++ ++ /* allocations done, now commit: */ ++ ++ if (new_r.entries) ++ bch2_write_super(c); ++ ++ /* don't update in memory replicas until changes are persistent */ ++ percpu_down_write(&c->mark_lock); ++ if (new_r.entries) ++ ret = replicas_table_update(c, &new_r); ++ if (new_gc.entries) ++ swap(new_gc, c->replicas_gc); ++ percpu_up_write(&c->mark_lock); ++out: ++ mutex_unlock(&c->sb_lock); ++ ++ kfree(new_r.entries); ++ kfree(new_gc.entries); ++ ++ return ret; ++err: ++ bch_err(c, "error adding replicas entry: memory allocation failure"); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int __bch2_mark_replicas(struct bch_fs *c, ++ struct bch_replicas_entry *r, ++ bool check) ++{ ++ return likely(bch2_replicas_marked(c, r)) ? 0 ++ : check ? 
-1 ++ : bch2_mark_replicas_slowpath(c, r); ++} ++ ++int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) ++{ ++ return __bch2_mark_replicas(c, r, false); ++} ++ ++static int __bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k, ++ bool check) ++{ ++ struct bch_replicas_padded search; ++ struct bch_devs_list cached = bch2_bkey_cached_devs(k); ++ unsigned i; ++ int ret; ++ ++ for (i = 0; i < cached.nr; i++) { ++ bch2_replicas_entry_cached(&search.e, cached.devs[i]); ++ ++ ret = __bch2_mark_replicas(c, &search.e, check); ++ if (ret) ++ return ret; ++ } ++ ++ bch2_bkey_to_replicas(&search.e, k); ++ ++ return __bch2_mark_replicas(c, &search.e, check); ++} ++ ++bool bch2_bkey_replicas_marked(struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ return __bch2_mark_bkey_replicas(c, k, true) == 0; ++} ++ ++int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) ++{ ++ return __bch2_mark_bkey_replicas(c, k, false); ++} ++ ++int bch2_replicas_gc_end(struct bch_fs *c, int ret) ++{ ++ unsigned i; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ /* ++ * this is kind of crappy; the replicas gc mechanism needs to be ripped ++ * out ++ */ ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct bch_replicas_cpu n; ++ ++ if (!__replicas_has_entry(&c->replicas_gc, e) && ++ (c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i]))) { ++ n = cpu_replicas_add_entry(&c->replicas_gc, e); ++ if (!n.entries) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ swap(n, c->replicas_gc); ++ kfree(n.entries); ++ } ++ } ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &c->replicas_gc); ++err: ++ kfree(c->replicas_gc.entries); ++ c->replicas_gc.entries = NULL; ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) ++{ ++ struct bch_replicas_entry *e; ++ unsigned i = 0; ++ ++ lockdep_assert_held(&c->replicas_gc_lock); ++ ++ mutex_lock(&c->sb_lock); ++ BUG_ON(c->replicas_gc.entries); ++ ++ c->replicas_gc.nr = 0; ++ c->replicas_gc.entry_size = 0; ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) { ++ c->replicas_gc.nr++; ++ c->replicas_gc.entry_size = ++ max_t(unsigned, c->replicas_gc.entry_size, ++ replicas_entry_bytes(e)); ++ } ++ ++ c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, ++ c->replicas_gc.entry_size, ++ GFP_NOIO); ++ if (!c->replicas_gc.entries) { ++ mutex_unlock(&c->sb_lock); ++ bch_err(c, "error allocating c->replicas_gc"); ++ return -ENOMEM; ++ } ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ if (!((1 << e->data_type) & typemask)) ++ memcpy(cpu_replicas_entry(&c->replicas_gc, i++), ++ e, c->replicas_gc.entry_size); ++ ++ bch2_cpu_replicas_sort(&c->replicas_gc); ++ mutex_unlock(&c->sb_lock); ++ ++ return 0; ++} ++ ++int bch2_replicas_gc2(struct bch_fs *c) ++{ ++ struct bch_replicas_cpu new = { 0 }; ++ unsigned i, nr; ++ int ret = 0; ++ ++ bch2_journal_meta(&c->journal); ++retry: ++ nr = READ_ONCE(c->replicas.nr); ++ new.entry_size = READ_ONCE(c->replicas.entry_size); ++ new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); ++ if (!new.entries) { ++ bch_err(c, "error allocating c->replicas_gc"); 
++ return -ENOMEM; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ percpu_down_write(&c->mark_lock); ++ ++ if (nr != c->replicas.nr || ++ new.entry_size != c->replicas.entry_size) { ++ percpu_up_write(&c->mark_lock); ++ mutex_unlock(&c->sb_lock); ++ kfree(new.entries); ++ goto retry; ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ ++ if (e->data_type == BCH_DATA_journal || ++ c->usage_base->replicas[i] || ++ percpu_u64_get(&c->usage[0]->replicas[i]) || ++ percpu_u64_get(&c->usage[1]->replicas[i])) ++ memcpy(cpu_replicas_entry(&new, new.nr++), ++ e, new.entry_size); ++ } ++ ++ bch2_cpu_replicas_sort(&new); ++ ++ if (bch2_cpu_replicas_to_sb_replicas(c, &new)) { ++ ret = -ENOSPC; ++ goto err; ++ } ++ ++ ret = replicas_table_update(c, &new); ++err: ++ kfree(new.entries); ++ ++ percpu_up_write(&c->mark_lock); ++ ++ if (!ret) ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++int bch2_replicas_set_usage(struct bch_fs *c, ++ struct bch_replicas_entry *r, ++ u64 sectors) ++{ ++ int ret, idx = bch2_replicas_entry_idx(c, r); ++ ++ if (idx < 0) { ++ struct bch_replicas_cpu n; ++ ++ n = cpu_replicas_add_entry(&c->replicas, r); ++ if (!n.entries) ++ return -ENOMEM; ++ ++ ret = replicas_table_update(c, &n); ++ if (ret) ++ return ret; ++ ++ kfree(n.entries); ++ ++ idx = bch2_replicas_entry_idx(c, r); ++ BUG_ON(ret < 0); ++ } ++ ++ c->usage_base->replicas[idx] = sectors; ++ ++ return 0; ++} ++ ++/* Replicas tracking - superblock: */ ++ ++static int ++__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry *e, *dst; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ dst = cpu_replicas_entry(cpu_r, idx++); ++ memcpy(dst, e, replicas_entry_bytes(e)); ++ replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++static int ++__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, ++ struct bch_replicas_cpu *cpu_r) ++{ ++ struct bch_replicas_entry_v0 *e; ++ unsigned nr = 0, entry_size = 0, idx = 0; ++ ++ for_each_replicas_entry(sb_r, e) { ++ entry_size = max_t(unsigned, entry_size, ++ replicas_entry_bytes(e)); ++ nr++; ++ } ++ ++ entry_size += sizeof(struct bch_replicas_entry) - ++ sizeof(struct bch_replicas_entry_v0); ++ ++ cpu_r->entries = kcalloc(nr, entry_size, GFP_NOIO); ++ if (!cpu_r->entries) ++ return -ENOMEM; ++ ++ cpu_r->nr = nr; ++ cpu_r->entry_size = entry_size; ++ ++ for_each_replicas_entry(sb_r, e) { ++ struct bch_replicas_entry *dst = ++ cpu_replicas_entry(cpu_r, idx++); ++ ++ dst->data_type = e->data_type; ++ dst->nr_devs = e->nr_devs; ++ dst->nr_required = 1; ++ memcpy(dst->devs, e->devs, e->nr_devs); ++ replicas_entry_sort(dst); ++ } ++ ++ return 0; ++} ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) ++{ ++ struct bch_sb_field_replicas *sb_v1; ++ struct bch_sb_field_replicas_v0 *sb_v0; ++ struct bch_replicas_cpu new_r = { 0, 0, NULL }; ++ int ret = 0; ++ ++ if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) ++ ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); ++ else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) ++ ret = 
__bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); ++ ++ if (ret) ++ return -ENOMEM; ++ ++ bch2_cpu_replicas_sort(&new_r); ++ ++ percpu_down_write(&c->mark_lock); ++ ++ ret = replicas_table_update(c, &new_r); ++ percpu_up_write(&c->mark_lock); ++ ++ kfree(new_r.entries); ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r; ++ struct bch_replicas_entry_v0 *dst; ++ struct bch_replicas_entry *src; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) ++ bytes += replicas_entry_bytes(src) - 1; ++ ++ sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); ++ sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ dst->data_type = src->data_type; ++ dst->nr_devs = src->nr_devs; ++ memcpy(dst->devs, src->devs, src->nr_devs); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, ++ struct bch_replicas_cpu *r) ++{ ++ struct bch_sb_field_replicas *sb_r; ++ struct bch_replicas_entry *dst, *src; ++ bool need_v1 = false; ++ size_t bytes; ++ ++ bytes = sizeof(struct bch_sb_field_replicas); ++ ++ for_each_cpu_replicas_entry(r, src) { ++ bytes += replicas_entry_bytes(src); ++ if (src->nr_required != 1) ++ need_v1 = true; ++ } ++ ++ if (!need_v1) ++ return bch2_cpu_replicas_to_sb_replicas_v0(c, r); ++ ++ sb_r = bch2_sb_resize_replicas(&c->disk_sb, ++ DIV_ROUND_UP(bytes, sizeof(u64))); ++ if (!sb_r) ++ return -ENOSPC; ++ ++ bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); ++ sb_r = bch2_sb_get_replicas(c->disk_sb.sb); ++ ++ memset(&sb_r->entries, 0, ++ vstruct_end(&sb_r->field) - ++ (void *) &sb_r->entries); ++ ++ dst = sb_r->entries; ++ for_each_cpu_replicas_entry(r, src) { ++ memcpy(dst, src, replicas_entry_bytes(src)); ++ ++ dst = replicas_entry_next(dst); ++ ++ BUG_ON((void *) dst > vstruct_end(&sb_r->field)); ++ } ++ ++ return 0; ++} ++ ++static const char *check_dup_replicas_entries(struct bch_replicas_cpu *cpu_r) ++{ ++ unsigned i; ++ ++ sort_cmp_size(cpu_r->entries, ++ cpu_r->nr, ++ cpu_r->entry_size, ++ memcmp, NULL); ++ ++ for (i = 0; i + 1 < cpu_r->nr; i++) { ++ struct bch_replicas_entry *l = ++ cpu_replicas_entry(cpu_r, i); ++ struct bch_replicas_entry *r = ++ cpu_replicas_entry(cpu_r, i + 1); ++ ++ BUG_ON(memcmp(l, r, cpu_r->entry_size) > 0); ++ ++ if (!memcmp(l, r, cpu_r->entry_size)) ++ return "duplicate replicas entry"; ++ } ++ ++ return NULL; ++} ++ ++static const char *bch2_sb_validate_replicas(struct bch_sb *sb, struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_replicas_cpu cpu_r = { .entries = NULL }; ++ struct bch_replicas_entry *e; ++ const char *err; ++ unsigned i; ++ ++ for_each_replicas_entry(sb_r, e) { ++ err = "invalid replicas entry: invalid data type"; ++ if (e->data_type >= BCH_DATA_NR) ++ goto err; ++ ++ err = "invalid replicas entry: no devices"; ++ if (!e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: bad nr_required"; ++ if (e->nr_required > 1 && ++ 
e->nr_required >= e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: invalid device"; ++ for (i = 0; i < e->nr_devs; i++) ++ if (!bch2_dev_exists(sb, mi, e->devs[i])) ++ goto err; ++ } ++ ++ err = "cannot allocate memory"; ++ if (__bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r)) ++ goto err; ++ ++ err = check_dup_replicas_entries(&cpu_r); ++err: ++ kfree(cpu_r.entries); ++ return err; ++} ++ ++static void bch2_sb_replicas_to_text(struct printbuf *out, ++ struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas *r = field_to_type(f, replicas); ++ struct bch_replicas_entry *e; ++ bool first = true; ++ ++ for_each_replicas_entry(r, e) { ++ if (!first) ++ pr_buf(out, " "); ++ first = false; ++ ++ bch2_replicas_entry_to_text(out, e); ++ } ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas = { ++ .validate = bch2_sb_validate_replicas, ++ .to_text = bch2_sb_replicas_to_text, ++}; ++ ++static const char *bch2_sb_validate_replicas_v0(struct bch_sb *sb, struct bch_sb_field *f) ++{ ++ struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); ++ struct bch_sb_field_members *mi = bch2_sb_get_members(sb); ++ struct bch_replicas_cpu cpu_r = { .entries = NULL }; ++ struct bch_replicas_entry_v0 *e; ++ const char *err; ++ unsigned i; ++ ++ for_each_replicas_entry_v0(sb_r, e) { ++ err = "invalid replicas entry: invalid data type"; ++ if (e->data_type >= BCH_DATA_NR) ++ goto err; ++ ++ err = "invalid replicas entry: no devices"; ++ if (!e->nr_devs) ++ goto err; ++ ++ err = "invalid replicas entry: invalid device"; ++ for (i = 0; i < e->nr_devs; i++) ++ if (!bch2_dev_exists(sb, mi, e->devs[i])) ++ goto err; ++ } ++ ++ err = "cannot allocate memory"; ++ if (__bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r)) ++ goto err; ++ ++ err = check_dup_replicas_entries(&cpu_r); ++err: ++ kfree(cpu_r.entries); ++ return err; ++} ++ ++const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { ++ .validate = bch2_sb_validate_replicas_v0, ++}; ++ ++/* Query replicas: */ ++ ++struct replicas_status __bch2_replicas_status(struct bch_fs *c, ++ struct bch_devs_mask online_devs) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_replicas_entry *e; ++ unsigned i, nr_online, nr_offline; ++ struct replicas_status ret; ++ ++ memset(&ret, 0, sizeof(ret)); ++ ++ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) ++ ret.replicas[i].redundancy = INT_MAX; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ percpu_down_read(&c->mark_lock); ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) { ++ if (e->data_type >= ARRAY_SIZE(ret.replicas)) ++ panic("e %p data_type %u\n", e, e->data_type); ++ ++ nr_online = nr_offline = 0; ++ ++ for (i = 0; i < e->nr_devs; i++) { ++ BUG_ON(!bch2_dev_exists(c->disk_sb.sb, mi, ++ e->devs[i])); ++ ++ if (test_bit(e->devs[i], online_devs.d)) ++ nr_online++; ++ else ++ nr_offline++; ++ } ++ ++ ret.replicas[e->data_type].redundancy = ++ min(ret.replicas[e->data_type].redundancy, ++ (int) nr_online - (int) e->nr_required); ++ ++ ret.replicas[e->data_type].nr_offline = ++ max(ret.replicas[e->data_type].nr_offline, ++ nr_offline); ++ } ++ ++ percpu_up_read(&c->mark_lock); ++ ++ for (i = 0; i < ARRAY_SIZE(ret.replicas); i++) ++ if (ret.replicas[i].redundancy == INT_MAX) ++ ret.replicas[i].redundancy = 0; ++ ++ return ret; ++} ++ ++struct replicas_status bch2_replicas_status(struct bch_fs *c) ++{ ++ return __bch2_replicas_status(c, bch2_online_devs(c)); ++} ++ ++static bool have_enough_devs(struct replicas_status s, ++ enum bch_data_type type, ++ bool 
force_if_degraded, ++ bool force_if_lost) ++{ ++ return (!s.replicas[type].nr_offline || force_if_degraded) && ++ (s.replicas[type].redundancy >= 0 || force_if_lost); ++} ++ ++bool bch2_have_enough_devs(struct replicas_status s, unsigned flags) ++{ ++ return (have_enough_devs(s, BCH_DATA_journal, ++ flags & BCH_FORCE_IF_METADATA_DEGRADED, ++ flags & BCH_FORCE_IF_METADATA_LOST) && ++ have_enough_devs(s, BCH_DATA_btree, ++ flags & BCH_FORCE_IF_METADATA_DEGRADED, ++ flags & BCH_FORCE_IF_METADATA_LOST) && ++ have_enough_devs(s, BCH_DATA_user, ++ flags & BCH_FORCE_IF_DATA_DEGRADED, ++ flags & BCH_FORCE_IF_DATA_LOST)); ++} ++ ++int bch2_replicas_online(struct bch_fs *c, bool meta) ++{ ++ struct replicas_status s = bch2_replicas_status(c); ++ ++ return (meta ++ ? min(s.replicas[BCH_DATA_journal].redundancy, ++ s.replicas[BCH_DATA_btree].redundancy) ++ : s.replicas[BCH_DATA_user].redundancy) + 1; ++} ++ ++unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_replicas_entry *e; ++ unsigned i, ret = 0; ++ ++ percpu_down_read(&c->mark_lock); ++ ++ for_each_cpu_replicas_entry(&c->replicas, e) ++ for (i = 0; i < e->nr_devs; i++) ++ if (e->devs[i] == ca->dev_idx) ++ ret |= 1 << e->data_type; ++ ++ percpu_up_read(&c->mark_lock); ++ ++ return ret; ++} ++ ++int bch2_fs_replicas_init(struct bch_fs *c) ++{ ++ c->journal.entry_u64s_reserved += ++ reserve_journal_replicas(c, &c->replicas); ++ ++ return replicas_table_update(c, &c->replicas); ++} +diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h +new file mode 100644 +index 000000000000..8b95164fbb56 +--- /dev/null ++++ b/fs/bcachefs/replicas.h +@@ -0,0 +1,91 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_REPLICAS_H ++#define _BCACHEFS_REPLICAS_H ++ ++#include "eytzinger.h" ++#include "replicas_types.h" ++ ++void bch2_replicas_entry_to_text(struct printbuf *, ++ struct bch_replicas_entry *); ++void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); ++ ++static inline struct bch_replicas_entry * ++cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) ++{ ++ return (void *) r->entries + r->entry_size * i; ++} ++ ++int bch2_replicas_entry_idx(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++void bch2_devlist_to_replicas(struct bch_replicas_entry *, ++ enum bch_data_type, ++ struct bch_devs_list); ++bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); ++int bch2_mark_replicas(struct bch_fs *, ++ struct bch_replicas_entry *); ++ ++void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); ++bool bch2_bkey_replicas_marked(struct bch_fs *, struct bkey_s_c); ++int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c); ++ ++static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, ++ unsigned dev) ++{ ++ e->data_type = BCH_DATA_cached; ++ e->nr_devs = 1; ++ e->nr_required = 1; ++ e->devs[0] = dev; ++} ++ ++struct replicas_status { ++ struct { ++ int redundancy; ++ unsigned nr_offline; ++ } replicas[BCH_DATA_NR]; ++}; ++ ++struct replicas_status __bch2_replicas_status(struct bch_fs *, ++ struct bch_devs_mask); ++struct replicas_status bch2_replicas_status(struct bch_fs *); ++bool bch2_have_enough_devs(struct replicas_status, unsigned); ++ ++int bch2_replicas_online(struct bch_fs *, bool); ++unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); ++ ++int bch2_replicas_gc_end(struct bch_fs *, int); ++int bch2_replicas_gc_start(struct bch_fs *, unsigned); ++int bch2_replicas_gc2(struct bch_fs *); ++ ++int 
bch2_replicas_set_usage(struct bch_fs *, ++ struct bch_replicas_entry *, ++ u64); ++ ++#define for_each_cpu_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ ++ _i = (void *) (_i) + (_r)->entry_size) ++ ++/* iterate over superblock replicas - used by userspace tools: */ ++ ++#define replicas_entry_next(_i) \ ++ ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) ++ ++#define for_each_replicas_entry(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++#define for_each_replicas_entry_v0(_r, _i) \ ++ for (_i = (_r)->entries; \ ++ (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ ++ (_i) = replicas_entry_next(_i)) ++ ++int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); ++ ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; ++extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; ++ ++int bch2_fs_replicas_init(struct bch_fs *); ++ ++#endif /* _BCACHEFS_REPLICAS_H */ +diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h +new file mode 100644 +index 000000000000..0535b1d3760e +--- /dev/null ++++ b/fs/bcachefs/replicas_types.h +@@ -0,0 +1,10 @@ ++#ifndef _BCACHEFS_REPLICAS_TYPES_H ++#define _BCACHEFS_REPLICAS_TYPES_H ++ ++struct bch_replicas_cpu { ++ unsigned nr; ++ unsigned entry_size; ++ struct bch_replicas_entry *entries; ++}; ++ ++#endif /* _BCACHEFS_REPLICAS_TYPES_H */ +diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c +new file mode 100644 +index 000000000000..c062edb3fbc2 +--- /dev/null ++++ b/fs/bcachefs/siphash.c +@@ -0,0 +1,173 @@ ++// SPDX-License-Identifier: BSD-3-Clause ++/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ ++ ++/*- ++ * Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ */ ++ ++/* ++ * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d ++ * are the number of compression rounds and the number of finalization rounds. 
++ * A compression round is identical to a finalization round and this round ++ * function is called SipRound. Given a 128-bit key k and a (possibly empty) ++ * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). ++ * ++ * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, ++ * by Jean-Philippe Aumasson and Daniel J. Bernstein, ++ * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa ++ * https://131002.net/siphash/siphash.pdf ++ * https://131002.net/siphash/ ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "siphash.h" ++ ++static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) ++{ ++ while (rounds--) { ++ ctx->v[0] += ctx->v[1]; ++ ctx->v[2] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 13); ++ ctx->v[3] = rol64(ctx->v[3], 16); ++ ++ ctx->v[1] ^= ctx->v[0]; ++ ctx->v[3] ^= ctx->v[2]; ++ ctx->v[0] = rol64(ctx->v[0], 32); ++ ++ ctx->v[2] += ctx->v[1]; ++ ctx->v[0] += ctx->v[3]; ++ ctx->v[1] = rol64(ctx->v[1], 17); ++ ctx->v[3] = rol64(ctx->v[3], 21); ++ ++ ctx->v[1] ^= ctx->v[2]; ++ ctx->v[3] ^= ctx->v[0]; ++ ctx->v[2] = rol64(ctx->v[2], 32); ++ } ++} ++ ++static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) ++{ ++ u64 m = get_unaligned_le64(ptr); ++ ++ ctx->v[3] ^= m; ++ SipHash_Rounds(ctx, rounds); ++ ctx->v[0] ^= m; ++} ++ ++void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) ++{ ++ u64 k0, k1; ++ ++ k0 = le64_to_cpu(key->k0); ++ k1 = le64_to_cpu(key->k1); ++ ++ ctx->v[0] = 0x736f6d6570736575ULL ^ k0; ++ ctx->v[1] = 0x646f72616e646f6dULL ^ k1; ++ ctx->v[2] = 0x6c7967656e657261ULL ^ k0; ++ ctx->v[3] = 0x7465646279746573ULL ^ k1; ++ ++ memset(ctx->buf, 0, sizeof(ctx->buf)); ++ ctx->bytes = 0; ++} ++ ++void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, ++ const void *src, size_t len) ++{ ++ const u8 *ptr = src; ++ size_t left, used; ++ ++ if (len == 0) ++ return; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ ctx->bytes += len; ++ ++ if (used > 0) { ++ left = sizeof(ctx->buf) - used; ++ ++ if (len >= left) { ++ memcpy(&ctx->buf[used], ptr, left); ++ SipHash_CRounds(ctx, ctx->buf, rc); ++ len -= left; ++ ptr += left; ++ } else { ++ memcpy(&ctx->buf[used], ptr, len); ++ return; ++ } ++ } ++ ++ while (len >= sizeof(ctx->buf)) { ++ SipHash_CRounds(ctx, ptr, rc); ++ len -= sizeof(ctx->buf); ++ ptr += sizeof(ctx->buf); ++ } ++ ++ if (len > 0) ++ memcpy(&ctx->buf[used], ptr, len); ++} ++ ++void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ ++ r = SipHash_End(ctx, rc, rf); ++ ++ *((__le64 *) dst) = cpu_to_le64(r); ++} ++ ++u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) ++{ ++ u64 r; ++ size_t left, used; ++ ++ used = ctx->bytes % sizeof(ctx->buf); ++ left = sizeof(ctx->buf) - used; ++ memset(&ctx->buf[used], 0, left - 1); ++ ctx->buf[7] = ctx->bytes; ++ ++ SipHash_CRounds(ctx, ctx->buf, rc); ++ ctx->v[2] ^= 0xff; ++ SipHash_Rounds(ctx, rf); ++ ++ r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); ++ memset(ctx, 0, sizeof(*ctx)); ++ return (r); ++} ++ ++u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) ++{ ++ SIPHASH_CTX ctx; ++ ++ SipHash_Init(&ctx, key); ++ SipHash_Update(&ctx, rc, rf, src, len); ++ return SipHash_End(&ctx, rc, rf); ++} +diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h +new file mode 100644 +index 000000000000..3dfaf34a43b2 +--- /dev/null ++++ b/fs/bcachefs/siphash.h +@@ -0,0 +1,87 @@ ++/* SPDX-License-Identifier: BSD-3-Clause */ ++/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ ++/*- ++ * 
Copyright (c) 2013 Andre Oppermann ++ * All rights reserved. ++ * ++ * Redistribution and use in source and binary forms, with or without ++ * modification, are permitted provided that the following conditions ++ * are met: ++ * 1. Redistributions of source code must retain the above copyright ++ * notice, this list of conditions and the following disclaimer. ++ * 2. Redistributions in binary form must reproduce the above copyright ++ * notice, this list of conditions and the following disclaimer in the ++ * documentation and/or other materials provided with the distribution. ++ * 3. The name of the author may not be used to endorse or promote ++ * products derived from this software without specific prior written ++ * permission. ++ * ++ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND ++ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE ++ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS ++ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) ++ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT ++ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY ++ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF ++ * SUCH DAMAGE. ++ * ++ * $FreeBSD$ ++ */ ++ ++/* ++ * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) ++ * optimized for speed on short messages returning a 64bit hash/digest value. ++ * ++ * The number of rounds is defined during the initialization: ++ * SipHash24_Init() for the fast and resonable strong version ++ * SipHash48_Init() for the strong version (half as fast) ++ * ++ * struct SIPHASH_CTX ctx; ++ * SipHash24_Init(&ctx); ++ * SipHash_SetKey(&ctx, "16bytes long key"); ++ * SipHash_Update(&ctx, pointer_to_string, length_of_string); ++ * SipHash_Final(output, &ctx); ++ */ ++ ++#ifndef _SIPHASH_H_ ++#define _SIPHASH_H_ ++ ++#include ++ ++#define SIPHASH_BLOCK_LENGTH 8 ++#define SIPHASH_KEY_LENGTH 16 ++#define SIPHASH_DIGEST_LENGTH 8 ++ ++typedef struct _SIPHASH_CTX { ++ u64 v[4]; ++ u8 buf[SIPHASH_BLOCK_LENGTH]; ++ u32 bytes; ++} SIPHASH_CTX; ++ ++typedef struct { ++ __le64 k0; ++ __le64 k1; ++} SIPHASH_KEY; ++ ++void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); ++void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); ++u64 SipHash_End(SIPHASH_CTX *, int, int); ++void SipHash_Final(void *, SIPHASH_CTX *, int, int); ++u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); ++ ++#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) ++#define SipHash24_End(_d) SipHash_End((_d), 2, 4) ++#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) ++#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) ++ ++#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) ++#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) ++#define SipHash48_End(_d) SipHash_End((_d), 4, 8) ++#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) ++#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) ++ ++#endif /* _SIPHASH_H_ */ +diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h +new file mode 100644 +index 
000000000000..dea9b7252b88 +--- /dev/null ++++ b/fs/bcachefs/str_hash.h +@@ -0,0 +1,336 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_STR_HASH_H ++#define _BCACHEFS_STR_HASH_H ++ ++#include "btree_iter.h" ++#include "btree_update.h" ++#include "checksum.h" ++#include "error.h" ++#include "inode.h" ++#include "siphash.h" ++#include "super.h" ++ ++#include ++#include ++#include ++ ++static inline enum bch_str_hash_type ++bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) ++{ ++ switch (opt) { ++ case BCH_STR_HASH_OPT_CRC32C: ++ return BCH_STR_HASH_CRC32C; ++ case BCH_STR_HASH_OPT_CRC64: ++ return BCH_STR_HASH_CRC64; ++ case BCH_STR_HASH_OPT_SIPHASH: ++ return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) ++ ? BCH_STR_HASH_SIPHASH ++ : BCH_STR_HASH_SIPHASH_OLD; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_info { ++ u8 type; ++ union { ++ __le64 crc_key; ++ SIPHASH_KEY siphash_key; ++ }; ++}; ++ ++static inline struct bch_hash_info ++bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) ++{ ++ /* XXX ick */ ++ struct bch_hash_info info = { ++ .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & ++ ~(~0U << INODE_STR_HASH_BITS), ++ .crc_key = bi->bi_hash_seed, ++ }; ++ ++ if (unlikely(info.type == BCH_STR_HASH_SIPHASH_OLD)) { ++ SHASH_DESC_ON_STACK(desc, c->sha256); ++ u8 digest[SHA256_DIGEST_SIZE]; ++ ++ desc->tfm = c->sha256; ++ ++ crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, ++ sizeof(bi->bi_hash_seed), digest); ++ memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); ++ } ++ ++ return info; ++} ++ ++struct bch_str_hash_ctx { ++ union { ++ u32 crc32c; ++ u64 crc64; ++ SIPHASH_CTX siphash; ++ }; ++}; ++ ++static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ ctx->crc32c = crc32c(~0, &info->crc_key, sizeof(info->crc_key)); ++ break; ++ case BCH_STR_HASH_CRC64: ++ ctx->crc64 = crc64_be(~0, &info->crc_key, sizeof(info->crc_key)); ++ break; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ SipHash24_Init(&ctx->siphash, &info->siphash_key); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info, ++ const void *data, size_t len) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ ctx->crc32c = crc32c(ctx->crc32c, data, len); ++ break; ++ case BCH_STR_HASH_CRC64: ++ ctx->crc64 = crc64_be(ctx->crc64, data, len); ++ break; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ SipHash24_Update(&ctx->siphash, data, len); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, ++ const struct bch_hash_info *info) ++{ ++ switch (info->type) { ++ case BCH_STR_HASH_CRC32C: ++ return ctx->crc32c; ++ case BCH_STR_HASH_CRC64: ++ return ctx->crc64 >> 1; ++ case BCH_STR_HASH_SIPHASH_OLD: ++ case BCH_STR_HASH_SIPHASH: ++ return SipHash24_End(&ctx->siphash) >> 1; ++ default: ++ BUG(); ++ } ++} ++ ++struct bch_hash_desc { ++ enum btree_id btree_id; ++ u8 key_type; ++ ++ u64 (*hash_key)(const struct bch_hash_info *, const void *); ++ u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); ++ bool (*cmp_key)(struct bkey_s_c, const void *); ++ bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); ++}; ++ ++static __always_inline struct btree_iter * ++bch2_hash_lookup(struct btree_trans *trans, ++ const struct bch_hash_desc desc, 
++ const struct bch_hash_info *info, ++ u64 inode, const void *key, ++ unsigned flags) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_key(info, key)), ++ BTREE_ITER_SLOTS|flags, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type == desc.key_type) { ++ if (!desc.cmp_key(k, key)) ++ return iter; ++ } else if (k.k->type == KEY_TYPE_whiteout) { ++ ; ++ } else { ++ /* hole, not found */ ++ break; ++ } ++ } ++ bch2_trans_iter_put(trans, iter); ++ ++ return ERR_PTR(ret ?: -ENOENT); ++} ++ ++static __always_inline struct btree_iter * ++bch2_hash_hole(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, const void *key) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_key(info, key)), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type != desc.key_type) ++ return iter; ++ } ++ ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ bch2_trans_iter_put(trans, iter); ++ ++ return ERR_PTR(ret ?: -ENOSPC); ++} ++ ++static __always_inline ++int bch2_hash_needs_whiteout(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *start) ++{ ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ iter = bch2_trans_copy_iter(trans, start); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ bch2_btree_iter_next_slot(iter); ++ ++ for_each_btree_key_continue(iter, BTREE_ITER_SLOTS, k, ret) { ++ if (k.k->type != desc.key_type && ++ k.k->type != KEY_TYPE_whiteout) ++ break; ++ ++ if (k.k->type == desc.key_type && ++ desc.hash_bkey(info, k) <= start->pos.offset) { ++ iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT; ++ ret = 1; ++ break; ++ } ++ } ++ ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static __always_inline ++int bch2_hash_set(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, struct bkey_i *insert, int flags) ++{ ++ struct btree_iter *iter, *slot = NULL; ++ struct bkey_s_c k; ++ bool found = false; ++ int ret; ++ ++ for_each_btree_key(trans, iter, desc.btree_id, ++ POS(inode, desc.hash_bkey(info, bkey_i_to_s_c(insert))), ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ if (iter->pos.inode != inode) ++ break; ++ ++ if (k.k->type == desc.key_type) { ++ if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) ++ goto found; ++ ++ /* hash collision: */ ++ continue; ++ } ++ ++ if (!slot && ++ !(flags & BCH_HASH_SET_MUST_REPLACE)) { ++ slot = bch2_trans_copy_iter(trans, iter); ++ if (IS_ERR(slot)) ++ return PTR_ERR(slot); ++ } ++ ++ if (k.k->type != KEY_TYPE_whiteout) ++ goto not_found; ++ } ++ ++ if (!ret) ++ ret = -ENOSPC; ++out: ++ bch2_trans_iter_put(trans, slot); ++ bch2_trans_iter_put(trans, iter); ++ ++ return ret; ++found: ++ found = true; ++not_found: ++ ++ if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { ++ ret = -ENOENT; ++ } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { ++ ret = -EEXIST; ++ } else { ++ if (!found && slot) ++ swap(iter, slot); ++ ++ insert->k.p = iter->pos; ++ bch2_trans_update(trans, iter, insert, 0); ++ } ++ ++ goto out; ++} ++ ++static __always_inline ++int bch2_hash_delete_at(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ struct btree_iter *iter) 
++{ ++ struct bkey_i *delete; ++ int ret; ++ ++ ret = bch2_hash_needs_whiteout(trans, desc, info, iter); ++ if (ret < 0) ++ return ret; ++ ++ delete = bch2_trans_kmalloc(trans, sizeof(*delete)); ++ if (IS_ERR(delete)) ++ return PTR_ERR(delete); ++ ++ bkey_init(&delete->k); ++ delete->k.p = iter->pos; ++ delete->k.type = ret ? KEY_TYPE_whiteout : KEY_TYPE_deleted; ++ ++ bch2_trans_update(trans, iter, delete, 0); ++ return 0; ++} ++ ++static __always_inline ++int bch2_hash_delete(struct btree_trans *trans, ++ const struct bch_hash_desc desc, ++ const struct bch_hash_info *info, ++ u64 inode, const void *key) ++{ ++ struct btree_iter *iter; ++ int ret; ++ ++ iter = bch2_hash_lookup(trans, desc, info, inode, key, ++ BTREE_ITER_INTENT); ++ if (IS_ERR(iter)) ++ return PTR_ERR(iter); ++ ++ ret = bch2_hash_delete_at(trans, desc, info, iter); ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++#endif /* _BCACHEFS_STR_HASH_H */ +diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c +new file mode 100644 +index 000000000000..cee6cc938734 +--- /dev/null ++++ b/fs/bcachefs/super-io.c +@@ -0,0 +1,1158 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "btree_update_interior.h" ++#include "buckets.h" ++#include "checksum.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_seq_blacklist.h" ++#include "replicas.h" ++#include "quota.h" ++#include "super-io.h" ++#include "super.h" ++#include "vstructs.h" ++ ++#include ++#include ++ ++const char * const bch2_sb_fields[] = { ++#define x(name, nr) #name, ++ BCH_SB_FIELDS() ++#undef x ++ NULL ++}; ++ ++static const char *bch2_sb_field_validate(struct bch_sb *, ++ struct bch_sb_field *); ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f; ++ ++ /* XXX: need locking around superblock to access optional fields */ ++ ++ vstruct_for_each(sb, f) ++ if (le32_to_cpu(f->type) == type) ++ return f; ++ return NULL; ++} ++ ++static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, ++ struct bch_sb_field *f, ++ unsigned u64s) ++{ ++ unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; ++ unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; ++ ++ BUG_ON(get_order(__vstruct_bytes(struct bch_sb, sb_u64s)) > ++ sb->page_order); ++ ++ if (!f && !u64s) { ++ /* nothing to do: */ ++ } else if (!f) { ++ f = vstruct_last(sb->sb); ++ memset(f, 0, sizeof(u64) * u64s); ++ f->u64s = cpu_to_le32(u64s); ++ f->type = 0; ++ } else { ++ void *src, *dst; ++ ++ src = vstruct_end(f); ++ ++ if (u64s) { ++ f->u64s = cpu_to_le32(u64s); ++ dst = vstruct_end(f); ++ } else { ++ dst = f; ++ } ++ ++ memmove(dst, src, vstruct_end(sb->sb) - src); ++ ++ if (dst > src) ++ memset(src, 0, dst - src); ++ } ++ ++ sb->sb->u64s = cpu_to_le32(sb_u64s); ++ ++ return u64s ? 
f : NULL; ++} ++ ++void bch2_sb_field_delete(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ++ if (f) ++ __bch2_sb_field_resize(sb, f, 0); ++} ++ ++/* Superblock realloc/free: */ ++ ++void bch2_free_super(struct bch_sb_handle *sb) ++{ ++ if (sb->bio) ++ bio_put(sb->bio); ++ if (!IS_ERR_OR_NULL(sb->bdev)) ++ blkdev_put(sb->bdev, sb->mode); ++ ++ free_pages((unsigned long) sb->sb, sb->page_order); ++ memset(sb, 0, sizeof(*sb)); ++} ++ ++int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) ++{ ++ size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); ++ unsigned order = get_order(new_bytes); ++ struct bch_sb *new_sb; ++ struct bio *bio; ++ ++ if (sb->sb && sb->page_order >= order) ++ return 0; ++ ++ if (sb->have_layout) { ++ u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; ++ ++ if (new_bytes > max_bytes) { ++ char buf[BDEVNAME_SIZE]; ++ ++ pr_err("%s: superblock too big: want %zu but have %llu", ++ bdevname(sb->bdev, buf), new_bytes, max_bytes); ++ return -ENOSPC; ++ } ++ } ++ ++ if (sb->page_order >= order && sb->sb) ++ return 0; ++ ++ if (dynamic_fault("bcachefs:add:super_realloc")) ++ return -ENOMEM; ++ ++ if (sb->have_bio) { ++ bio = bio_kmalloc(GFP_KERNEL, 1 << order); ++ if (!bio) ++ return -ENOMEM; ++ ++ if (sb->bio) ++ bio_put(sb->bio); ++ sb->bio = bio; ++ } ++ ++ new_sb = (void *) __get_free_pages(GFP_NOFS|__GFP_ZERO, order); ++ if (!new_sb) ++ return -ENOMEM; ++ ++ if (sb->sb) ++ memcpy(new_sb, sb->sb, PAGE_SIZE << sb->page_order); ++ ++ free_pages((unsigned long) sb->sb, sb->page_order); ++ sb->sb = new_sb; ++ ++ sb->page_order = order; ++ ++ return 0; ++} ++ ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, ++ enum bch_sb_field_type type, ++ unsigned u64s) ++{ ++ struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); ++ ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; ++ ssize_t d = -old_u64s + u64s; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) ++ return NULL; ++ ++ if (sb->fs_sb) { ++ struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ /* XXX: we're not checking that offline device have enough space */ ++ ++ for_each_online_member(ca, c, i) { ++ struct bch_sb_handle *sb = &ca->disk_sb; ++ ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { ++ percpu_ref_put(&ca->ref); ++ return NULL; ++ } ++ } ++ } ++ ++ f = bch2_sb_field_get(sb->sb, type); ++ f = __bch2_sb_field_resize(sb, f, u64s); ++ if (f) ++ f->type = cpu_to_le32(type); ++ return f; ++} ++ ++/* Superblock validate: */ ++ ++static inline void __bch2_sb_layout_size_assert(void) ++{ ++ BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); ++} ++ ++static const char *validate_sb_layout(struct bch_sb_layout *layout) ++{ ++ u64 offset, prev_offset, max_sectors; ++ unsigned i; ++ ++ if (uuid_le_cmp(layout->magic, BCACHE_MAGIC)) ++ return "Not a bcachefs superblock layout"; ++ ++ if (layout->layout_type != 0) ++ return "Invalid superblock layout type"; ++ ++ if (!layout->nr_superblocks) ++ return "Invalid superblock layout: no superblocks"; ++ ++ if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) ++ return "Invalid superblock layout: too many superblocks"; ++ ++ max_sectors = 1 << layout->sb_max_size_bits; ++ ++ prev_offset = le64_to_cpu(layout->sb_offset[0]); ++ ++ for (i = 1; i < layout->nr_superblocks; i++) { ++ offset = le64_to_cpu(layout->sb_offset[i]); ++ ++ if (offset < prev_offset + max_sectors) ++ return "Invalid superblock layout: superblocks overlap"; ++ prev_offset = offset; ++ } ++ ++ return NULL; ++} ++ ++const char *bch2_sb_validate(struct bch_sb_handle *disk_sb) ++{ ++ struct bch_sb *sb = disk_sb->sb; ++ struct bch_sb_field *f; ++ struct bch_sb_field_members *mi; ++ const char *err; ++ u32 version, version_min; ++ u16 block_size; ++ ++ version = le16_to_cpu(sb->version); ++ version_min = version >= bcachefs_metadata_version_new_versioning ++ ? 
le16_to_cpu(sb->version_min) ++ : version; ++ ++ if (version >= bcachefs_metadata_version_max || ++ version_min < bcachefs_metadata_version_min) ++ return "Unsupported superblock version"; ++ ++ if (version_min > version) ++ return "Bad minimum version"; ++ ++ if (sb->features[1] || ++ (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) ++ return "Filesystem has incompatible features"; ++ ++ block_size = le16_to_cpu(sb->block_size); ++ ++ if (!is_power_of_2(block_size) || ++ block_size > PAGE_SECTORS) ++ return "Bad block size"; ++ ++ if (bch2_is_zero(sb->user_uuid.b, sizeof(uuid_le))) ++ return "Bad user UUID"; ++ ++ if (bch2_is_zero(sb->uuid.b, sizeof(uuid_le))) ++ return "Bad internal UUID"; ++ ++ if (!sb->nr_devices || ++ sb->nr_devices <= sb->dev_idx || ++ sb->nr_devices > BCH_SB_MEMBERS_MAX) ++ return "Bad number of member devices"; ++ ++ if (!BCH_SB_META_REPLICAS_WANT(sb) || ++ BCH_SB_META_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of metadata replicas"; ++ ++ if (!BCH_SB_META_REPLICAS_REQ(sb) || ++ BCH_SB_META_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of metadata replicas"; ++ ++ if (!BCH_SB_DATA_REPLICAS_WANT(sb) || ++ BCH_SB_DATA_REPLICAS_WANT(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of data replicas"; ++ ++ if (!BCH_SB_DATA_REPLICAS_REQ(sb) || ++ BCH_SB_DATA_REPLICAS_REQ(sb) >= BCH_REPLICAS_MAX) ++ return "Invalid number of data replicas"; ++ ++ if (BCH_SB_META_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) ++ return "Invalid metadata checksum type"; ++ ++ if (BCH_SB_DATA_CSUM_TYPE(sb) >= BCH_CSUM_OPT_NR) ++ return "Invalid metadata checksum type"; ++ ++ if (BCH_SB_COMPRESSION_TYPE(sb) >= BCH_COMPRESSION_OPT_NR) ++ return "Invalid compression type"; ++ ++ if (!BCH_SB_BTREE_NODE_SIZE(sb)) ++ return "Btree node size not set"; ++ ++ if (!is_power_of_2(BCH_SB_BTREE_NODE_SIZE(sb))) ++ return "Btree node size not a power of two"; ++ ++ if (BCH_SB_GC_RESERVE(sb) < 5) ++ return "gc reserve percentage too small"; ++ ++ if (!sb->time_precision || ++ le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) ++ return "invalid time precision"; ++ ++ /* validate layout */ ++ err = validate_sb_layout(&sb->layout); ++ if (err) ++ return err; ++ ++ vstruct_for_each(sb, f) { ++ if (!f->u64s) ++ return "Invalid superblock: invalid optional field"; ++ ++ if (vstruct_next(f) > vstruct_last(sb)) ++ return "Invalid superblock: invalid optional field"; ++ } ++ ++ /* members must be validated first: */ ++ mi = bch2_sb_get_members(sb); ++ if (!mi) ++ return "Invalid superblock: member info area missing"; ++ ++ err = bch2_sb_field_validate(sb, &mi->field); ++ if (err) ++ return err; ++ ++ vstruct_for_each(sb, f) { ++ if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) ++ continue; ++ ++ err = bch2_sb_field_validate(sb, f); ++ if (err) ++ return err; ++ } ++ ++ return NULL; ++} ++ ++/* device open: */ ++ ++static void bch2_sb_update(struct bch_fs *c) ++{ ++ struct bch_sb *src = c->disk_sb.sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(src); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ c->sb.uuid = src->uuid; ++ c->sb.user_uuid = src->user_uuid; ++ c->sb.version = le16_to_cpu(src->version); ++ c->sb.nr_devices = src->nr_devices; ++ c->sb.clean = BCH_SB_CLEAN(src); ++ c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); ++ c->sb.encoded_extent_max= 1 << BCH_SB_ENCODED_EXTENT_MAX_BITS(src); ++ c->sb.time_base_lo = le64_to_cpu(src->time_base_lo); ++ c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); ++ 
c->sb.time_precision = le32_to_cpu(src->time_precision); ++ c->sb.features = le64_to_cpu(src->features[0]); ++ c->sb.compat = le64_to_cpu(src->compat[0]); ++ ++ for_each_member_device(ca, c, i) ++ ca->mi = bch2_mi_to_cpu(mi->members + i); ++} ++ ++/* doesn't copy member info */ ++static void __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) ++{ ++ struct bch_sb_field *src_f, *dst_f; ++ struct bch_sb *dst = dst_handle->sb; ++ unsigned i; ++ ++ dst->version = src->version; ++ dst->version_min = src->version_min; ++ dst->seq = src->seq; ++ dst->uuid = src->uuid; ++ dst->user_uuid = src->user_uuid; ++ memcpy(dst->label, src->label, sizeof(dst->label)); ++ ++ dst->block_size = src->block_size; ++ dst->nr_devices = src->nr_devices; ++ ++ dst->time_base_lo = src->time_base_lo; ++ dst->time_base_hi = src->time_base_hi; ++ dst->time_precision = src->time_precision; ++ ++ memcpy(dst->flags, src->flags, sizeof(dst->flags)); ++ memcpy(dst->features, src->features, sizeof(dst->features)); ++ memcpy(dst->compat, src->compat, sizeof(dst->compat)); ++ ++ for (i = 0; i < BCH_SB_FIELD_NR; i++) { ++ if (i == BCH_SB_FIELD_journal) ++ continue; ++ ++ src_f = bch2_sb_field_get(src, i); ++ dst_f = bch2_sb_field_get(dst, i); ++ dst_f = __bch2_sb_field_resize(dst_handle, dst_f, ++ src_f ? le32_to_cpu(src_f->u64s) : 0); ++ ++ if (src_f) ++ memcpy(dst_f, src_f, vstruct_bytes(src_f)); ++ } ++} ++ ++int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) ++{ ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(src); ++ unsigned journal_u64s = journal_buckets ++ ? le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ int ret; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ ret = bch2_sb_realloc(&c->disk_sb, ++ le32_to_cpu(src->u64s) - journal_u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&c->disk_sb, src); ++ ++ ret = bch2_sb_replicas_to_cpu_replicas(c); ++ if (ret) ++ return ret; ++ ++ ret = bch2_sb_disk_groups_to_cpu(c); ++ if (ret) ++ return ret; ++ ++ bch2_sb_update(c); ++ return 0; ++} ++ ++int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *src = c->disk_sb.sb, *dst = ca->disk_sb.sb; ++ struct bch_sb_field_journal *journal_buckets = ++ bch2_sb_get_journal(dst); ++ unsigned journal_u64s = journal_buckets ++ ? 
le32_to_cpu(journal_buckets->field.u64s) ++ : 0; ++ unsigned u64s = le32_to_cpu(src->u64s) + journal_u64s; ++ int ret; ++ ++ ret = bch2_sb_realloc(&ca->disk_sb, u64s); ++ if (ret) ++ return ret; ++ ++ __copy_super(&ca->disk_sb, src); ++ return 0; ++} ++ ++/* read superblock: */ ++ ++static const char *read_one_super(struct bch_sb_handle *sb, u64 offset) ++{ ++ struct bch_csum csum; ++ size_t bytes; ++reread: ++ bio_reset(sb->bio); ++ bio_set_dev(sb->bio, sb->bdev); ++ sb->bio->bi_iter.bi_sector = offset; ++ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ bch2_bio_map(sb->bio, sb->sb, PAGE_SIZE << sb->page_order); ++ ++ if (submit_bio_wait(sb->bio)) ++ return "IO error"; ++ ++ if (uuid_le_cmp(sb->sb->magic, BCACHE_MAGIC)) ++ return "Not a bcachefs superblock"; ++ ++ if (le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_min || ++ le16_to_cpu(sb->sb->version) >= bcachefs_metadata_version_max) ++ return "Unsupported superblock version"; ++ ++ bytes = vstruct_bytes(sb->sb); ++ ++ if (bytes > 512 << sb->sb->layout.sb_max_size_bits) ++ return "Bad superblock: too big"; ++ ++ if (get_order(bytes) > sb->page_order) { ++ if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s))) ++ return "cannot allocate memory"; ++ goto reread; ++ } ++ ++ if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) ++ return "unknown csum type"; ++ ++ /* XXX: verify MACs */ ++ csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), ++ null_nonce(), sb->sb); ++ ++ if (bch2_crc_cmp(csum, sb->sb->csum)) ++ return "bad checksum reading superblock"; ++ ++ sb->seq = le64_to_cpu(sb->sb->seq); ++ ++ return NULL; ++} ++ ++int bch2_read_super(const char *path, struct bch_opts *opts, ++ struct bch_sb_handle *sb) ++{ ++ u64 offset = opt_get(*opts, sb); ++ struct bch_sb_layout layout; ++ const char *err; ++ __le64 *i; ++ int ret; ++ ++ pr_verbose_init(*opts, ""); ++ ++ memset(sb, 0, sizeof(*sb)); ++ sb->mode = FMODE_READ; ++ sb->have_bio = true; ++ ++ if (!opt_get(*opts, noexcl)) ++ sb->mode |= FMODE_EXCL; ++ ++ if (!opt_get(*opts, nochanges)) ++ sb->mode |= FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (IS_ERR(sb->bdev) && ++ PTR_ERR(sb->bdev) == -EACCES && ++ opt_get(*opts, read_only)) { ++ sb->mode &= ~FMODE_WRITE; ++ ++ sb->bdev = blkdev_get_by_path(path, sb->mode, sb); ++ if (!IS_ERR(sb->bdev)) ++ opt_set(*opts, nochanges, true); ++ } ++ ++ if (IS_ERR(sb->bdev)) { ++ ret = PTR_ERR(sb->bdev); ++ goto out; ++ } ++ ++ err = "cannot allocate memory"; ++ ret = bch2_sb_realloc(sb, 0); ++ if (ret) ++ goto err; ++ ++ ret = -EFAULT; ++ err = "dynamic fault"; ++ if (bch2_fs_init_fault("read_super")) ++ goto err; ++ ++ ret = -EINVAL; ++ err = read_one_super(sb, offset); ++ if (!err) ++ goto got_super; ++ ++ if (opt_defined(*opts, sb)) ++ goto err; ++ ++ pr_err("error reading default superblock: %s", err); ++ ++ /* ++ * Error reading primary superblock - read location of backup ++ * superblocks: ++ */ ++ bio_reset(sb->bio); ++ bio_set_dev(sb->bio, sb->bdev); ++ sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; ++ bio_set_op_attrs(sb->bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ /* ++ * use sb buffer to read layout, since sb buffer is page aligned but ++ * layout won't be: ++ */ ++ bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); ++ ++ err = "IO error"; ++ if (submit_bio_wait(sb->bio)) ++ goto err; ++ ++ memcpy(&layout, sb->sb, sizeof(layout)); ++ err = validate_sb_layout(&layout); ++ if (err) ++ goto err; ++ ++ for (i = layout.sb_offset; ++ i < layout.sb_offset + layout.nr_superblocks; i++) { ++ 
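		/*
		 * Walk the backup locations recorded in the just-validated
		 * layout and retry read_one_super() against each of them,
		 * skipping the offset that already failed above; the first
		 * copy that comes back with a valid magic, version and
		 * checksum is accepted via got_super.
		 */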
offset = le64_to_cpu(*i); ++ ++ if (offset == opt_get(*opts, sb)) ++ continue; ++ ++ err = read_one_super(sb, offset); ++ if (!err) ++ goto got_super; ++ } ++ ++ ret = -EINVAL; ++ goto err; ++ ++got_super: ++ err = "Superblock block size smaller than device block size"; ++ ret = -EINVAL; ++ if (le16_to_cpu(sb->sb->block_size) << 9 < ++ bdev_logical_block_size(sb->bdev)) ++ goto err; ++ ++ if (sb->mode & FMODE_WRITE) ++ bdev_get_queue(sb->bdev)->backing_dev_info->capabilities ++ |= BDI_CAP_STABLE_WRITES; ++ ret = 0; ++ sb->have_layout = true; ++out: ++ pr_verbose_init(*opts, "ret %i", ret); ++ return ret; ++err: ++ bch2_free_super(sb); ++ pr_err("error reading superblock: %s", err); ++ goto out; ++} ++ ++/* write superblock: */ ++ ++static void write_super_endio(struct bio *bio) ++{ ++ struct bch_dev *ca = bio->bi_private; ++ ++ /* XXX: return errors directly */ ++ ++ if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write: %s", ++ bch2_blk_status_to_str(bio->bi_status))) ++ ca->sb_write_error = 1; ++ ++ closure_put(&ca->fs->sb_write); ++ percpu_ref_put(&ca->io_ref); ++} ++ ++static void read_back_super(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_READ, REQ_SYNC|REQ_META); ++ bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); ++ ++ this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) ++{ ++ struct bch_sb *sb = ca->disk_sb.sb; ++ struct bio *bio = ca->disk_sb.bio; ++ ++ sb->offset = sb->layout.sb_offset[idx]; ++ ++ SET_BCH_SB_CSUM_TYPE(sb, c->opts.metadata_checksum); ++ sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), ++ null_nonce(), sb); ++ ++ bio_reset(bio); ++ bio_set_dev(bio, ca->disk_sb.bdev); ++ bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); ++ bio->bi_end_io = write_super_endio; ++ bio->bi_private = ca; ++ bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META); ++ bch2_bio_map(bio, sb, ++ roundup((size_t) vstruct_bytes(sb), ++ bdev_logical_block_size(ca->disk_sb.bdev))); ++ ++ this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], ++ bio_sectors(bio)); ++ ++ percpu_ref_get(&ca->io_ref); ++ closure_bio_submit(bio, &c->sb_write); ++} ++ ++int bch2_write_super(struct bch_fs *c) ++{ ++ struct closure *cl = &c->sb_write; ++ struct bch_dev *ca; ++ unsigned i, sb = 0, nr_wrote; ++ const char *err; ++ struct bch_devs_mask sb_written; ++ bool wrote, can_mount_without_written, can_mount_with_written; ++ int ret = 0; ++ ++ lockdep_assert_held(&c->sb_lock); ++ ++ closure_init_stack(cl); ++ memset(&sb_written, 0, sizeof(sb_written)); ++ ++ le64_add_cpu(&c->disk_sb.sb->seq, 1); ++ ++ if (test_bit(BCH_FS_ERROR, &c->flags)) ++ SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ for_each_online_member(ca, c, i) { ++ err = bch2_sb_validate(&ca->disk_sb); ++ if (err) { ++ bch2_fs_inconsistent(c, "sb invalid before write: %s", err); ++ ret = -1; ++ goto out; ++ } ++ } ++ ++ if (c->opts.nochanges) ++ goto out; ++ ++ for_each_online_member(ca, c, i) { ++ __set_bit(ca->dev_idx, sb_written.d); ++ ca->sb_write_error = 0; ++ } ++ ++ for_each_online_member(ca, c, i) 
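	/*
	 * Before anything is rewritten, the superblock currently on each
	 * online member is read back into ca->sb_read_scratch (see
	 * read_back_super() above) and its seq compared against
	 * ca->disk_sb.seq, the sequence number this filesystem last read or
	 * wrote for that device.  A mismatch means some other process has
	 * modified the superblock underneath us, and the write is aborted
	 * with -EROFS instead of clobbering the newer copy.
	 */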
++ read_back_super(c, ca); ++ closure_sync(cl); ++ ++ for_each_online_member(ca, c, i) { ++ if (!ca->sb_write_error && ++ ca->disk_sb.seq != ++ le64_to_cpu(ca->sb_read_scratch->seq)) { ++ bch2_fs_fatal_error(c, ++ "Superblock modified by another process"); ++ percpu_ref_put(&ca->io_ref); ++ ret = -EROFS; ++ goto out; ++ } ++ } ++ ++ do { ++ wrote = false; ++ for_each_online_member(ca, c, i) ++ if (!ca->sb_write_error && ++ sb < ca->disk_sb.sb->layout.nr_superblocks) { ++ write_one_super(c, ca, sb); ++ wrote = true; ++ } ++ closure_sync(cl); ++ sb++; ++ } while (wrote); ++ ++ for_each_online_member(ca, c, i) { ++ if (ca->sb_write_error) ++ __clear_bit(ca->dev_idx, sb_written.d); ++ else ++ ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); ++ } ++ ++ nr_wrote = dev_mask_nr(&sb_written); ++ ++ can_mount_with_written = ++ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), ++ BCH_FORCE_IF_DEGRADED); ++ ++ for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) ++ sb_written.d[i] = ~sb_written.d[i]; ++ ++ can_mount_without_written = ++ bch2_have_enough_devs(__bch2_replicas_status(c, sb_written), ++ BCH_FORCE_IF_DEGRADED); ++ ++ /* ++ * If we would be able to mount _without_ the devices we successfully ++ * wrote superblocks to, we weren't able to write to enough devices: ++ * ++ * Exception: if we can mount without the successes because we haven't ++ * written anything (new filesystem), we continue if we'd be able to ++ * mount with the devices we did successfully write to: ++ */ ++ if (bch2_fs_fatal_err_on(!nr_wrote || ++ (can_mount_without_written && ++ !can_mount_with_written), c, ++ "Unable to write superblock to sufficient devices")) ++ ret = -1; ++out: ++ /* Make new options visible after they're persistent: */ ++ bch2_sb_update(c); ++ return ret; ++} ++ ++void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ mutex_lock(&c->sb_lock); ++ if (!(c->sb.features & (1ULL << feat))) { ++ c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); ++ ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++} ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static int u64_cmp(const void *_l, const void *_r) ++{ ++ u64 l = *((const u64 *) _l), r = *((const u64 *) _r); ++ ++ return l < r ? -1 : l > r ? 
1 : 0; ++} ++ ++static const char *bch2_sb_validate_journal(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_journal *journal = field_to_type(f, journal); ++ struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; ++ const char *err; ++ unsigned nr; ++ unsigned i; ++ u64 *b; ++ ++ journal = bch2_sb_get_journal(sb); ++ if (!journal) ++ return NULL; ++ ++ nr = bch2_nr_journal_buckets(journal); ++ if (!nr) ++ return NULL; ++ ++ b = kmalloc_array(sizeof(u64), nr, GFP_KERNEL); ++ if (!b) ++ return "cannot allocate memory"; ++ ++ for (i = 0; i < nr; i++) ++ b[i] = le64_to_cpu(journal->buckets[i]); ++ ++ sort(b, nr, sizeof(u64), u64_cmp, NULL); ++ ++ err = "journal bucket at sector 0"; ++ if (!b[0]) ++ goto err; ++ ++ err = "journal bucket before first bucket"; ++ if (m && b[0] < le16_to_cpu(m->first_bucket)) ++ goto err; ++ ++ err = "journal bucket past end of device"; ++ if (m && b[nr - 1] >= le64_to_cpu(m->nbuckets)) ++ goto err; ++ ++ err = "duplicate journal buckets"; ++ for (i = 0; i + 1 < nr; i++) ++ if (b[i] == b[i + 1]) ++ goto err; ++ ++ err = NULL; ++err: ++ kfree(b); ++ return err; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_journal = { ++ .validate = bch2_sb_validate_journal, ++}; ++ ++/* BCH_SB_FIELD_members: */ ++ ++static const char *bch2_sb_validate_members(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_members *mi = field_to_type(f, members); ++ struct bch_member *m; ++ ++ if ((void *) (mi->members + sb->nr_devices) > ++ vstruct_end(&mi->field)) ++ return "Invalid superblock: bad member info"; ++ ++ for (m = mi->members; ++ m < mi->members + sb->nr_devices; ++ m++) { ++ if (!bch2_member_exists(m)) ++ continue; ++ ++ if (le64_to_cpu(m->nbuckets) > LONG_MAX) ++ return "Too many buckets"; ++ ++ if (le64_to_cpu(m->nbuckets) - ++ le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) ++ return "Not enough buckets"; ++ ++ if (le16_to_cpu(m->bucket_size) < ++ le16_to_cpu(sb->block_size)) ++ return "bucket size smaller than block size"; ++ ++ if (le16_to_cpu(m->bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(sb)) ++ return "bucket size smaller than btree node size"; ++ } ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_members = { ++ .validate = bch2_sb_validate_members, ++}; ++ ++/* BCH_SB_FIELD_crypt: */ ++ ++static const char *bch2_sb_validate_crypt(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); ++ ++ if (vstruct_bytes(&crypt->field) != sizeof(*crypt)) ++ return "invalid field crypt: wrong size"; ++ ++ if (BCH_CRYPT_KDF_TYPE(crypt)) ++ return "invalid field crypt: bad kdf type"; ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { ++ .validate = bch2_sb_validate_crypt, ++}; ++ ++/* BCH_SB_FIELD_clean: */ ++ ++void bch2_sb_clean_renumber(struct bch_sb_field_clean *clean, int write) ++{ ++ struct jset_entry *entry; ++ ++ for (entry = clean->start; ++ entry < (struct jset_entry *) vstruct_end(&clean->field); ++ entry = vstruct_next(entry)) ++ bch2_bkey_renumber(BKEY_TYPE_BTREE, bkey_to_packed(entry->start), write); ++} ++ ++int bch2_fs_mark_dirty(struct bch_fs *c) ++{ ++ int ret; ++ ++ /* ++ * Unconditionally write superblock, to verify it hasn't changed before ++ * we go rw: ++ */ ++ ++ mutex_lock(&c->sb_lock); ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, false); ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_new_extent_overwrite; ++ c->disk_sb.sb->features[0] |= 1ULL << 
BCH_FEATURE_extents_above_btree_updates; ++ c->disk_sb.sb->features[0] |= 1ULL << BCH_FEATURE_btree_updates_journalled; ++ ret = bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ return ret; ++} ++ ++static void ++entry_init_u64s(struct jset_entry *entry, unsigned u64s) ++{ ++ memset(entry, 0, u64s * sizeof(u64)); ++ ++ /* ++ * The u64s field counts from the start of data, ignoring the shared ++ * fields. ++ */ ++ entry->u64s = u64s - 1; ++} ++ ++static void ++entry_init_size(struct jset_entry *entry, size_t size) ++{ ++ unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); ++ entry_init_u64s(entry, u64s); ++} ++ ++struct jset_entry * ++bch2_journal_super_entries_add_common(struct bch_fs *c, ++ struct jset_entry *entry, ++ u64 journal_seq) ++{ ++ unsigned i; ++ ++ percpu_down_write(&c->mark_lock); ++ ++ if (!journal_seq) { ++ bch2_fs_usage_acc_to_base(c, 0); ++ bch2_fs_usage_acc_to_base(c, 1); ++ } else { ++ bch2_fs_usage_acc_to_base(c, journal_seq & 1); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_INODES; ++ u->v = cpu_to_le64(c->usage_base->nr_inodes); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_KEY_VERSION; ++ u->v = cpu_to_le64(atomic64_read(&c->key_version)); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ for (i = 0; i < BCH_REPLICAS_MAX; i++) { ++ struct jset_entry_usage *u = ++ container_of(entry, struct jset_entry_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u)); ++ u->entry.type = BCH_JSET_ENTRY_usage; ++ u->entry.btree_id = FS_USAGE_RESERVED; ++ u->entry.level = i; ++ u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ for (i = 0; i < c->replicas.nr; i++) { ++ struct bch_replicas_entry *e = ++ cpu_replicas_entry(&c->replicas, i); ++ struct jset_entry_data_usage *u = ++ container_of(entry, struct jset_entry_data_usage, entry); ++ ++ entry_init_size(entry, sizeof(*u) + e->nr_devs); ++ u->entry.type = BCH_JSET_ENTRY_data_usage; ++ u->v = cpu_to_le64(c->usage_base->replicas[i]); ++ memcpy(&u->r, e, replicas_entry_bytes(e)); ++ ++ entry = vstruct_next(entry); ++ } ++ ++ percpu_up_write(&c->mark_lock); ++ ++ return entry; ++} ++ ++void bch2_fs_mark_clean(struct bch_fs *c) ++{ ++ struct bch_sb_field_clean *sb_clean; ++ struct jset_entry *entry; ++ unsigned u64s; ++ ++ mutex_lock(&c->sb_lock); ++ if (BCH_SB_CLEAN(c->disk_sb.sb)) ++ goto out; ++ ++ SET_BCH_SB_CLEAN(c->disk_sb.sb, true); ++ ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_INFO; ++ c->disk_sb.sb->compat[0] |= 1ULL << BCH_COMPAT_FEAT_ALLOC_METADATA; ++ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_extents_above_btree_updates); ++ c->disk_sb.sb->features[0] &= ~(1ULL << BCH_FEATURE_btree_updates_journalled); ++ ++ u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; ++ ++ sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); ++ if (!sb_clean) { ++ bch_err(c, "error resizing superblock while setting filesystem clean"); ++ goto out; ++ } ++ ++ sb_clean->flags = 0; ++ sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); ++ sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); ++ sb_clean->journal_seq = 
cpu_to_le64(journal_cur_seq(&c->journal) - 1); ++ ++ /* Trying to catch outstanding bug: */ ++ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); ++ ++ entry = sb_clean->start; ++ entry = bch2_journal_super_entries_add_common(c, entry, 0); ++ entry = bch2_btree_roots_to_journal_entries(c, entry, entry); ++ BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); ++ ++ memset(entry, 0, ++ vstruct_end(&sb_clean->field) - (void *) entry); ++ ++ if (le16_to_cpu(c->disk_sb.sb->version) < ++ bcachefs_metadata_version_bkey_renumber) ++ bch2_sb_clean_renumber(sb_clean, WRITE); ++ ++ bch2_write_super(c); ++out: ++ mutex_unlock(&c->sb_lock); ++} ++ ++static const char *bch2_sb_validate_clean(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ struct bch_sb_field_clean *clean = field_to_type(f, clean); ++ ++ if (vstruct_bytes(&clean->field) < sizeof(*clean)) ++ return "invalid field crypt: wrong size"; ++ ++ return NULL; ++} ++ ++static const struct bch_sb_field_ops bch_sb_field_ops_clean = { ++ .validate = bch2_sb_validate_clean, ++}; ++ ++static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { ++#define x(f, nr) \ ++ [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, ++ BCH_SB_FIELDS() ++#undef x ++}; ++ ++static const char *bch2_sb_field_validate(struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ ++ return type < BCH_SB_FIELD_NR ++ ? bch2_sb_field_ops[type]->validate(sb, f) ++ : NULL; ++} ++ ++void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, ++ struct bch_sb_field *f) ++{ ++ unsigned type = le32_to_cpu(f->type); ++ const struct bch_sb_field_ops *ops = type < BCH_SB_FIELD_NR ++ ? bch2_sb_field_ops[type] : NULL; ++ ++ if (ops) ++ pr_buf(out, "%s", bch2_sb_fields[type]); ++ else ++ pr_buf(out, "(unknown field %u)", type); ++ ++ pr_buf(out, " (size %llu):", vstruct_bytes(f)); ++ ++ if (ops && ops->to_text) ++ bch2_sb_field_ops[type]->to_text(out, sb, f); ++} +diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h +new file mode 100644 +index 000000000000..7a068158efca +--- /dev/null ++++ b/fs/bcachefs/super-io.h +@@ -0,0 +1,137 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_IO_H ++#define _BCACHEFS_SUPER_IO_H ++ ++#include "extents.h" ++#include "eytzinger.h" ++#include "super_types.h" ++#include "super.h" ++ ++#include ++ ++struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); ++struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, ++ enum bch_sb_field_type, unsigned); ++void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); ++ ++#define field_to_type(_f, _name) \ ++ container_of_or_null(_f, struct bch_sb_field_##_name, field) ++ ++#define x(_name, _nr) \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_get_##_name(struct bch_sb *sb) \ ++{ \ ++ return field_to_type(bch2_sb_field_get(sb, \ ++ BCH_SB_FIELD_##_name), _name); \ ++} \ ++ \ ++static inline struct bch_sb_field_##_name * \ ++bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ ++{ \ ++ return field_to_type(bch2_sb_field_resize(sb, \ ++ BCH_SB_FIELD_##_name, u64s), _name); \ ++} ++ ++BCH_SB_FIELDS() ++#undef x ++ ++extern const char * const bch2_sb_fields[]; ++ ++struct bch_sb_field_ops { ++ const char * (*validate)(struct bch_sb *, struct bch_sb_field *); ++ void (*to_text)(struct printbuf *, struct bch_sb *, ++ struct bch_sb_field *); ++}; ++ ++static inline __le64 bch2_sb_magic(struct bch_fs *c) ++{ ++ __le64 ret; ++ memcpy(&ret, &c->sb.uuid, sizeof(ret)); ++ return 
ret; ++} ++ ++static inline __u64 jset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); ++} ++ ++static inline __u64 bset_magic(struct bch_fs *c) ++{ ++ return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); ++} ++ ++int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); ++int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); ++ ++void bch2_free_super(struct bch_sb_handle *); ++int bch2_sb_realloc(struct bch_sb_handle *, unsigned); ++ ++const char *bch2_sb_validate(struct bch_sb_handle *); ++ ++int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); ++int bch2_write_super(struct bch_fs *); ++void __bch2_check_set_feature(struct bch_fs *, unsigned); ++ ++static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) ++{ ++ if (!(c->sb.features & (1ULL << feat))) ++ __bch2_check_set_feature(c, feat); ++} ++ ++/* BCH_SB_FIELD_journal: */ ++ ++static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) ++{ ++ return j ++ ? (__le64 *) vstruct_end(&j->field) - j->buckets ++ : 0; ++} ++ ++/* BCH_SB_FIELD_members: */ ++ ++static inline bool bch2_member_exists(struct bch_member *m) ++{ ++ return !bch2_is_zero(m->uuid.b, sizeof(uuid_le)); ++} ++ ++static inline bool bch2_dev_exists(struct bch_sb *sb, ++ struct bch_sb_field_members *mi, ++ unsigned dev) ++{ ++ return dev < sb->nr_devices && ++ bch2_member_exists(&mi->members[dev]); ++} ++ ++static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ++{ ++ return (struct bch_member_cpu) { ++ .nbuckets = le64_to_cpu(mi->nbuckets), ++ .first_bucket = le16_to_cpu(mi->first_bucket), ++ .bucket_size = le16_to_cpu(mi->bucket_size), ++ .group = BCH_MEMBER_GROUP(mi), ++ .state = BCH_MEMBER_STATE(mi), ++ .replacement = BCH_MEMBER_REPLACEMENT(mi), ++ .discard = BCH_MEMBER_DISCARD(mi), ++ .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), ++ .durability = BCH_MEMBER_DURABILITY(mi) ++ ? BCH_MEMBER_DURABILITY(mi) - 1 ++ : 1, ++ .valid = !bch2_is_zero(mi->uuid.b, sizeof(uuid_le)), ++ }; ++} ++ ++/* BCH_SB_FIELD_clean: */ ++ ++struct jset_entry * ++bch2_journal_super_entries_add_common(struct bch_fs *, ++ struct jset_entry *, u64); ++ ++void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); ++ ++int bch2_fs_mark_dirty(struct bch_fs *); ++void bch2_fs_mark_clean(struct bch_fs *); ++ ++void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, ++ struct bch_sb_field *); ++ ++#endif /* _BCACHEFS_SUPER_IO_H */ +diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c +new file mode 100644 +index 000000000000..015bbd9f21fd +--- /dev/null ++++ b/fs/bcachefs/super.c +@@ -0,0 +1,2037 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcachefs setup/teardown code, and some metadata io - read a superblock and ++ * figure out what to do with it. ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "alloc_foreground.h" ++#include "bkey_sort.h" ++#include "btree_cache.h" ++#include "btree_gc.h" ++#include "btree_key_cache.h" ++#include "btree_update_interior.h" ++#include "btree_io.h" ++#include "chardev.h" ++#include "checksum.h" ++#include "clock.h" ++#include "compress.h" ++#include "debug.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "error.h" ++#include "fs.h" ++#include "fs-io.h" ++#include "fsck.h" ++#include "inode.h" ++#include "io.h" ++#include "journal.h" ++#include "journal_reclaim.h" ++#include "journal_seq_blacklist.h" ++#include "move.h" ++#include "migrate.h" ++#include "movinggc.h" ++#include "quota.h" ++#include "rebalance.h" ++#include "recovery.h" ++#include "replicas.h" ++#include "super.h" ++#include "super-io.h" ++#include "sysfs.h" ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Kent Overstreet "); ++ ++#define KTYPE(type) \ ++struct kobj_type type ## _ktype = { \ ++ .release = type ## _release, \ ++ .sysfs_ops = &type ## _sysfs_ops, \ ++ .default_attrs = type ## _files \ ++} ++ ++static void bch2_fs_release(struct kobject *); ++static void bch2_dev_release(struct kobject *); ++ ++static void bch2_fs_internal_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_opts_dir_release(struct kobject *k) ++{ ++} ++ ++static void bch2_fs_time_stats_release(struct kobject *k) ++{ ++} ++ ++static KTYPE(bch2_fs); ++static KTYPE(bch2_fs_internal); ++static KTYPE(bch2_fs_opts_dir); ++static KTYPE(bch2_fs_time_stats); ++static KTYPE(bch2_dev); ++ ++static struct kset *bcachefs_kset; ++static LIST_HEAD(bch_fs_list); ++static DEFINE_MUTEX(bch_fs_list_lock); ++ ++static DECLARE_WAIT_QUEUE_HEAD(bch_read_only_wait); ++ ++static void bch2_dev_free(struct bch_dev *); ++static int bch2_dev_alloc(struct bch_fs *, unsigned); ++static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); ++static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); ++ ++struct bch_fs *bch2_bdev_to_fs(struct block_device *bdev) ++{ ++ struct bch_fs *c; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ mutex_lock(&bch_fs_list_lock); ++ rcu_read_lock(); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ for_each_member_device_rcu(ca, c, i, NULL) ++ if (ca->disk_sb.bdev == bdev) { ++ closure_get(&c->cl); ++ goto found; ++ } ++ c = NULL; ++found: ++ rcu_read_unlock(); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++static struct bch_fs *__bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ list_for_each_entry(c, &bch_fs_list, list) ++ if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid_le))) ++ return c; ++ ++ return NULL; ++} ++ ++struct bch_fs *bch2_uuid_to_fs(uuid_le uuid) ++{ ++ struct bch_fs *c; ++ ++ mutex_lock(&bch_fs_list_lock); ++ c = __bch2_uuid_to_fs(uuid); ++ if (c) ++ closure_get(&c->cl); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return c; ++} ++ ++/* Filesystem RO/RW: */ ++ ++/* ++ * For startup/shutdown of RW stuff, the dependencies are: ++ * ++ * - foreground writes depend on copygc and rebalance (to free up space) ++ * ++ * - copygc and rebalance depend on mark and sweep gc (they actually probably ++ * don't because they either reserve ahead of time or don't block if ++ * allocations fail, but allocations can require mark and sweep gc to run ++ * because of generation number 
wraparound) ++ * ++ * - all of the above depends on the allocator threads ++ * ++ * - allocator depends on the journal (when it rewrites prios and gens) ++ */ ++ ++static void __bch2_fs_read_only(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i, clean_passes = 0; ++ ++ bch2_rebalance_stop(c); ++ bch2_copygc_stop(c); ++ bch2_gc_thread_stop(c); ++ ++ bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); ++ bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); ++ ++ /* ++ * Flush journal before stopping allocators, because flushing journal ++ * blacklist entries involves allocating new btree nodes: ++ */ ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ /* ++ * If the allocator threads didn't all start up, the btree updates to ++ * write out alloc info aren't going to work: ++ */ ++ if (!test_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags)) ++ goto nowrote_alloc; ++ ++ bch_verbose(c, "flushing journal and stopping allocators"); ++ ++ bch2_journal_flush_all_pins(&c->journal); ++ set_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); ++ ++ do { ++ clean_passes++; ++ ++ if (bch2_journal_flush_all_pins(&c->journal)) ++ clean_passes = 0; ++ ++ /* ++ * In flight interior btree updates will generate more journal ++ * updates and btree updates (alloc btree): ++ */ ++ if (bch2_btree_interior_updates_nr_pending(c)) { ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ clean_passes = 0; ++ } ++ flush_work(&c->btree_interior_update_work); ++ ++ if (bch2_journal_flush_all_pins(&c->journal)) ++ clean_passes = 0; ++ } while (clean_passes < 2); ++ bch_verbose(c, "flushing journal and stopping allocators complete"); ++ ++ set_bit(BCH_FS_ALLOC_CLEAN, &c->flags); ++nowrote_alloc: ++ closure_wait_event(&c->btree_interior_update_wait, ++ !bch2_btree_interior_updates_nr_pending(c)); ++ flush_work(&c->btree_interior_update_work); ++ ++ for_each_member_device(ca, c, i) ++ bch2_dev_allocator_stop(ca); ++ ++ clear_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); ++ clear_bit(BCH_FS_ALLOCATOR_STOPPING, &c->flags); ++ ++ bch2_fs_journal_stop(&c->journal); ++ ++ /* ++ * the journal kicks off btree writes via reclaim - wait for in flight ++ * writes after stopping journal: ++ */ ++ if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) ++ bch2_btree_flush_all_writes(c); ++ else ++ bch2_btree_verify_flushed(c); ++ ++ /* ++ * After stopping journal: ++ */ ++ for_each_member_device(ca, c, i) ++ bch2_dev_allocator_remove(c, ca); ++} ++ ++static void bch2_writes_disabled(struct percpu_ref *writes) ++{ ++ struct bch_fs *c = container_of(writes, struct bch_fs, writes); ++ ++ set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ wake_up(&bch_read_only_wait); ++} ++ ++void bch2_fs_read_only(struct bch_fs *c) ++{ ++ if (!test_bit(BCH_FS_RW, &c->flags)) { ++ cancel_delayed_work_sync(&c->journal.reclaim_work); ++ return; ++ } ++ ++ BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ /* ++ * Block new foreground-end write operations from starting - any new ++ * writes will return -EROFS: ++ * ++ * (This is really blocking new _allocations_, writes to previously ++ * allocated space can still happen until stopping the allocator in ++ * bch2_dev_allocator_stop()). 
++ */ ++ percpu_ref_kill(&c->writes); ++ ++ cancel_work_sync(&c->ec_stripe_delete_work); ++ cancel_delayed_work(&c->pd_controllers_update); ++ ++ /* ++ * If we're not doing an emergency shutdown, we want to wait on ++ * outstanding writes to complete so they don't see spurious errors due ++ * to shutting down the allocator: ++ * ++ * If we are doing an emergency shutdown outstanding writes may ++ * hang until we shutdown the allocator so we don't want to wait ++ * on outstanding writes before shutting everything down - but ++ * we do need to wait on them before returning and signalling ++ * that going RO is complete: ++ */ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || ++ test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); ++ ++ __bch2_fs_read_only(c); ++ ++ wait_event(bch_read_only_wait, ++ test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); ++ ++ clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); ++ ++ if (!bch2_journal_error(&c->journal) && ++ !test_bit(BCH_FS_ERROR, &c->flags) && ++ !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && ++ test_bit(BCH_FS_STARTED, &c->flags) && ++ test_bit(BCH_FS_ALLOC_CLEAN, &c->flags) && ++ !c->opts.norecovery) { ++ bch_verbose(c, "marking filesystem clean"); ++ bch2_fs_mark_clean(c); ++ } ++ ++ clear_bit(BCH_FS_RW, &c->flags); ++} ++ ++static void bch2_fs_read_only_work(struct work_struct *work) ++{ ++ struct bch_fs *c = ++ container_of(work, struct bch_fs, read_only_work); ++ ++ down_write(&c->state_lock); ++ bch2_fs_read_only(c); ++ up_write(&c->state_lock); ++} ++ ++static void bch2_fs_read_only_async(struct bch_fs *c) ++{ ++ queue_work(system_long_wq, &c->read_only_work); ++} ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *c) ++{ ++ bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); ++ ++ bch2_journal_halt(&c->journal); ++ bch2_fs_read_only_async(c); ++ ++ wake_up(&bch_read_only_wait); ++ return ret; ++} ++ ++static int bch2_fs_read_write_late(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_gc_thread_start(c); ++ if (ret) { ++ bch_err(c, "error starting gc thread"); ++ return ret; ++ } ++ ++ ret = bch2_copygc_start(c); ++ if (ret) { ++ bch_err(c, "error starting copygc thread"); ++ return ret; ++ } ++ ++ ret = bch2_rebalance_start(c); ++ if (ret) { ++ bch_err(c, "error starting rebalance thread"); ++ return ret; ++ } ++ ++ schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); ++ ++ schedule_work(&c->ec_stripe_delete_work); ++ ++ return 0; ++} ++ ++static int __bch2_fs_read_write(struct bch_fs *c, bool early) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ int ret; ++ ++ if (test_bit(BCH_FS_RW, &c->flags)) ++ return 0; ++ ++ /* ++ * nochanges is used for fsck -n mode - we have to allow going rw ++ * during recovery for that to work: ++ */ ++ if (c->opts.norecovery || ++ (c->opts.nochanges && ++ (!early || c->opts.read_only))) ++ return -EROFS; ++ ++ ret = bch2_fs_mark_dirty(c); ++ if (ret) ++ goto err; ++ ++ /* ++ * We need to write out a journal entry before we start doing btree ++ * updates, to ensure that on unclean shutdown new journal blacklist ++ * entries are created: ++ */ ++ bch2_journal_meta(&c->journal); ++ ++ clear_bit(BCH_FS_ALLOC_CLEAN, &c->flags); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); ++ bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); ++ ++ for_each_rw_member(ca, c, i) { ++ ret = bch2_dev_allocator_start(ca); ++ if (ret) { ++ 
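			/*
			 * for_each_rw_member() appears to hold a reference on
			 * ca->io_ref for the device being visited, which is
			 * why the early exit below drops it by hand with
			 * percpu_ref_put() before jumping to the common error
			 * path, where __bch2_fs_read_only() rolls the rest of
			 * the transition back.
			 */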
bch_err(c, "error starting allocator threads"); ++ percpu_ref_put(&ca->io_ref); ++ goto err; ++ } ++ } ++ ++ set_bit(BCH_FS_ALLOCATOR_RUNNING, &c->flags); ++ ++ if (!early) { ++ ret = bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ percpu_ref_reinit(&c->writes); ++ set_bit(BCH_FS_RW, &c->flags); ++ ++ queue_delayed_work(c->journal_reclaim_wq, ++ &c->journal.reclaim_work, 0); ++ return 0; ++err: ++ __bch2_fs_read_only(c); ++ return ret; ++} ++ ++int bch2_fs_read_write(struct bch_fs *c) ++{ ++ return __bch2_fs_read_write(c, false); ++} ++ ++int bch2_fs_read_write_early(struct bch_fs *c) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ return __bch2_fs_read_write(c, true); ++} ++ ++/* Filesystem startup/shutdown: */ ++ ++static void __bch2_fs_free(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_exit(&c->times[i]); ++ ++ bch2_fs_quota_exit(c); ++ bch2_fs_fsio_exit(c); ++ bch2_fs_ec_exit(c); ++ bch2_fs_encryption_exit(c); ++ bch2_fs_io_exit(c); ++ bch2_fs_btree_interior_update_exit(c); ++ bch2_fs_btree_iter_exit(c); ++ bch2_fs_btree_key_cache_exit(&c->btree_key_cache); ++ bch2_fs_btree_cache_exit(c); ++ bch2_fs_journal_exit(&c->journal); ++ bch2_io_clock_exit(&c->io_clock[WRITE]); ++ bch2_io_clock_exit(&c->io_clock[READ]); ++ bch2_fs_compress_exit(c); ++ bch2_journal_keys_free(&c->journal_keys); ++ bch2_journal_entries_free(&c->journal_entries); ++ percpu_free_rwsem(&c->mark_lock); ++ kfree(c->usage_scratch); ++ free_percpu(c->usage[1]); ++ free_percpu(c->usage[0]); ++ kfree(c->usage_base); ++ free_percpu(c->pcpu); ++ mempool_exit(&c->large_bkey_pool); ++ mempool_exit(&c->btree_bounce_pool); ++ bioset_exit(&c->btree_bio); ++ mempool_exit(&c->fill_iter); ++ percpu_ref_exit(&c->writes); ++ kfree(c->replicas.entries); ++ kfree(c->replicas_gc.entries); ++ kfree(rcu_dereference_protected(c->disk_groups, 1)); ++ kfree(c->journal_seq_blacklist_table); ++ free_heap(&c->copygc_heap); ++ ++ if (c->journal_reclaim_wq) ++ destroy_workqueue(c->journal_reclaim_wq); ++ if (c->copygc_wq) ++ destroy_workqueue(c->copygc_wq); ++ if (c->wq) ++ destroy_workqueue(c->wq); ++ ++ free_pages((unsigned long) c->disk_sb.sb, ++ c->disk_sb.page_order); ++ kvpfree(c, sizeof(*c)); ++ module_put(THIS_MODULE); ++} ++ ++static void bch2_fs_release(struct kobject *kobj) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ __bch2_fs_free(c); ++} ++ ++void __bch2_fs_stop(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ unsigned i; ++ ++ bch_verbose(c, "shutting down"); ++ ++ set_bit(BCH_FS_STOPPING, &c->flags); ++ ++ cancel_work_sync(&c->journal_seq_blacklist_gc_work); ++ ++ down_write(&c->state_lock); ++ bch2_fs_read_only(c); ++ up_write(&c->state_lock); ++ ++ for_each_member_device(ca, c, i) ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, ++ "bcachefs"); ++ ++ if (c->kobj.state_in_sysfs) ++ kobject_del(&c->kobj); ++ ++ bch2_fs_debug_exit(c); ++ bch2_fs_chardev_exit(c); ++ ++ kobject_put(&c->time_stats); ++ kobject_put(&c->opts_dir); ++ kobject_put(&c->internal); ++ ++ /* btree prefetch might have kicked off reads in the background: */ ++ bch2_btree_flush_all_reads(c); ++ ++ for_each_member_device(ca, c, i) ++ cancel_work_sync(&ca->io_error_work); ++ ++ cancel_work_sync(&c->btree_write_error_work); ++ cancel_delayed_work_sync(&c->pd_controllers_update); ++ cancel_work_sync(&c->read_only_work); ++ ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (c->devs[i]) ++ 
bch2_free_super(&c->devs[i]->disk_sb); ++} ++ ++void bch2_fs_free(struct bch_fs *c) ++{ ++ unsigned i; ++ ++ mutex_lock(&bch_fs_list_lock); ++ list_del(&c->list); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ closure_sync(&c->cl); ++ closure_debug_destroy(&c->cl); ++ ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (c->devs[i]) ++ bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); ++ ++ bch_verbose(c, "shutdown complete"); ++ ++ kobject_put(&c->kobj); ++} ++ ++void bch2_fs_stop(struct bch_fs *c) ++{ ++ __bch2_fs_stop(c); ++ bch2_fs_free(c); ++} ++ ++static const char *bch2_fs_online(struct bch_fs *c) ++{ ++ struct bch_dev *ca; ++ const char *err = NULL; ++ unsigned i; ++ int ret; ++ ++ lockdep_assert_held(&bch_fs_list_lock); ++ ++ if (!list_empty(&c->list)) ++ return NULL; ++ ++ if (__bch2_uuid_to_fs(c->sb.uuid)) ++ return "filesystem UUID already open"; ++ ++ ret = bch2_fs_chardev_init(c); ++ if (ret) ++ return "error creating character device"; ++ ++ bch2_fs_debug_init(c); ++ ++ if (kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) || ++ kobject_add(&c->internal, &c->kobj, "internal") || ++ kobject_add(&c->opts_dir, &c->kobj, "options") || ++ kobject_add(&c->time_stats, &c->kobj, "time_stats") || ++ bch2_opts_create_sysfs_files(&c->opts_dir)) ++ return "error creating sysfs objects"; ++ ++ down_write(&c->state_lock); ++ ++ err = "error creating sysfs objects"; ++ __for_each_member_device(ca, c, i, NULL) ++ if (bch2_dev_sysfs_online(c, ca)) ++ goto err; ++ ++ list_add(&c->list, &bch_fs_list); ++ err = NULL; ++err: ++ up_write(&c->state_lock); ++ return err; ++} ++ ++static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) ++{ ++ struct bch_sb_field_members *mi; ++ struct bch_fs *c; ++ unsigned i, iter_size; ++ const char *err; ++ ++ pr_verbose_init(opts, ""); ++ ++ c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); ++ if (!c) ++ goto out; ++ ++ __module_get(THIS_MODULE); ++ ++ closure_init(&c->cl, NULL); ++ ++ c->kobj.kset = bcachefs_kset; ++ kobject_init(&c->kobj, &bch2_fs_ktype); ++ kobject_init(&c->internal, &bch2_fs_internal_ktype); ++ kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); ++ kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); ++ ++ c->minor = -1; ++ c->disk_sb.fs_sb = true; ++ ++ init_rwsem(&c->state_lock); ++ mutex_init(&c->sb_lock); ++ mutex_init(&c->replicas_gc_lock); ++ mutex_init(&c->btree_root_lock); ++ INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); ++ ++ init_rwsem(&c->gc_lock); ++ ++ for (i = 0; i < BCH_TIME_STAT_NR; i++) ++ bch2_time_stats_init(&c->times[i]); ++ ++ bch2_fs_copygc_init(c); ++ bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); ++ bch2_fs_allocator_background_init(c); ++ bch2_fs_allocator_foreground_init(c); ++ bch2_fs_rebalance_init(c); ++ bch2_fs_quota_init(c); ++ ++ INIT_LIST_HEAD(&c->list); ++ ++ mutex_init(&c->usage_scratch_lock); ++ ++ mutex_init(&c->bio_bounce_pages_lock); ++ ++ bio_list_init(&c->btree_write_error_list); ++ spin_lock_init(&c->btree_write_error_lock); ++ INIT_WORK(&c->btree_write_error_work, bch2_btree_write_error_work); ++ ++ INIT_WORK(&c->journal_seq_blacklist_gc_work, ++ bch2_blacklist_entries_gc); ++ ++ INIT_LIST_HEAD(&c->journal_entries); ++ ++ INIT_LIST_HEAD(&c->fsck_errors); ++ mutex_init(&c->fsck_error_lock); ++ ++ INIT_LIST_HEAD(&c->ec_stripe_head_list); ++ mutex_init(&c->ec_stripe_head_lock); ++ ++ INIT_LIST_HEAD(&c->ec_stripe_new_list); ++ mutex_init(&c->ec_stripe_new_lock); ++ ++ spin_lock_init(&c->ec_stripes_heap_lock); ++ ++ seqcount_init(&c->gc_pos_lock); 
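	/*
	 * Up to this point bch2_fs_alloc() is doing lock/list/counter setup
	 * that does not return errors.  The fallible steps --
	 * percpu_init_rwsem(), pulling the superblock in via bch2_sb_to_fs(),
	 * and the long chain of workqueue, mempool and per-subsystem init
	 * calls below -- all bail out to the common err: label, which tears
	 * the partially constructed bch_fs back down through bch2_fs_free().
	 */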
++ ++ seqcount_init(&c->usage_lock); ++ ++ sema_init(&c->io_in_flight, 64); ++ ++ c->copy_gc_enabled = 1; ++ c->rebalance.enabled = 1; ++ c->promote_whole_extents = true; ++ ++ c->journal.write_time = &c->times[BCH_TIME_journal_write]; ++ c->journal.delay_time = &c->times[BCH_TIME_journal_delay]; ++ c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; ++ c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; ++ ++ bch2_fs_btree_cache_init_early(&c->btree_cache); ++ ++ if (percpu_init_rwsem(&c->mark_lock)) ++ goto err; ++ ++ mutex_lock(&c->sb_lock); ++ ++ if (bch2_sb_to_fs(c, sb)) { ++ mutex_unlock(&c->sb_lock); ++ goto err; ++ } ++ ++ mutex_unlock(&c->sb_lock); ++ ++ scnprintf(c->name, sizeof(c->name), "%pU", &c->sb.user_uuid); ++ ++ c->opts = bch2_opts_default; ++ bch2_opts_apply(&c->opts, bch2_opts_from_sb(sb)); ++ bch2_opts_apply(&c->opts, opts); ++ ++ c->block_bits = ilog2(c->opts.block_size); ++ c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); ++ ++ if (bch2_fs_init_fault("fs_alloc")) ++ goto err; ++ ++ iter_size = sizeof(struct sort_iter) + ++ (btree_blocks(c) + 1) * 2 * ++ sizeof(struct sort_iter_set); ++ ++ if (!(c->wq = alloc_workqueue("bcachefs", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->copygc_wq = alloc_workqueue("bcache_copygc", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || ++ !(c->journal_reclaim_wq = alloc_workqueue("bcache_journal", ++ WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) || ++ percpu_ref_init(&c->writes, bch2_writes_disabled, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || ++ bioset_init(&c->btree_bio, 1, ++ max(offsetof(struct btree_read_bio, bio), ++ offsetof(struct btree_write_bio, wbio.bio)), ++ BIOSET_NEED_BVECS) || ++ !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || ++ mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, ++ btree_bytes(c)) || ++ mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || ++ bch2_io_clock_init(&c->io_clock[READ]) || ++ bch2_io_clock_init(&c->io_clock[WRITE]) || ++ bch2_fs_journal_init(&c->journal) || ++ bch2_fs_replicas_init(c) || ++ bch2_fs_btree_cache_init(c) || ++ bch2_fs_btree_key_cache_init(&c->btree_key_cache) || ++ bch2_fs_btree_iter_init(c) || ++ bch2_fs_btree_interior_update_init(c) || ++ bch2_fs_io_init(c) || ++ bch2_fs_encryption_init(c) || ++ bch2_fs_compress_init(c) || ++ bch2_fs_ec_init(c) || ++ bch2_fs_fsio_init(c)) ++ goto err; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (i = 0; i < c->sb.nr_devices; i++) ++ if (bch2_dev_exists(c->disk_sb.sb, mi, i) && ++ bch2_dev_alloc(c, i)) ++ goto err; ++ ++ mutex_lock(&bch_fs_list_lock); ++ err = bch2_fs_online(c); ++ mutex_unlock(&bch_fs_list_lock); ++ if (err) { ++ bch_err(c, "bch2_fs_online() error: %s", err); ++ goto err; ++ } ++out: ++ pr_verbose_init(opts, "ret %i", c ? 
0 : -ENOMEM); ++ return c; ++err: ++ bch2_fs_free(c); ++ c = NULL; ++ goto out; ++} ++ ++noinline_for_stack ++static void print_mount_opts(struct bch_fs *c) ++{ ++ enum bch_opt_id i; ++ char buf[512]; ++ struct printbuf p = PBUF(buf); ++ bool first = true; ++ ++ strcpy(buf, "(null)"); ++ ++ if (c->opts.read_only) { ++ pr_buf(&p, "ro"); ++ first = false; ++ } ++ ++ for (i = 0; i < bch2_opts_nr; i++) { ++ const struct bch_option *opt = &bch2_opt_table[i]; ++ u64 v = bch2_opt_get_by_id(&c->opts, i); ++ ++ if (!(opt->mode & OPT_MOUNT)) ++ continue; ++ ++ if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) ++ continue; ++ ++ if (!first) ++ pr_buf(&p, ","); ++ first = false; ++ bch2_opt_to_text(&p, c, opt, v, OPT_SHOW_MOUNT_STYLE); ++ } ++ ++ bch_info(c, "mounted with opts: %s", buf); ++} ++ ++int bch2_fs_start(struct bch_fs *c) ++{ ++ const char *err = "cannot allocate memory"; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ time64_t now = ktime_get_real_seconds(); ++ unsigned i; ++ int ret = -EINVAL; ++ ++ down_write(&c->state_lock); ++ ++ BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); ++ ++ mutex_lock(&c->sb_lock); ++ ++ for_each_online_member(ca, c, i) ++ bch2_sb_from_fs(c, ca); ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for_each_online_member(ca, c, i) ++ mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); ++ ++ mutex_unlock(&c->sb_lock); ++ ++ for_each_rw_member(ca, c, i) ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ ret = BCH_SB_INITIALIZED(c->disk_sb.sb) ++ ? bch2_fs_recovery(c) ++ : bch2_fs_initialize(c); ++ if (ret) ++ goto err; ++ ++ ret = bch2_opts_check_may_set(c); ++ if (ret) ++ goto err; ++ ++ err = "dynamic fault"; ++ ret = -EINVAL; ++ if (bch2_fs_init_fault("fs_start")) ++ goto err; ++ ++ set_bit(BCH_FS_STARTED, &c->flags); ++ ++ /* ++ * Allocator threads don't start filling copygc reserve until after we ++ * set BCH_FS_STARTED - wake them now: ++ */ ++ for_each_online_member(ca, c, i) ++ bch2_wake_allocator(ca); ++ ++ if (c->opts.read_only || c->opts.nochanges) { ++ bch2_fs_read_only(c); ++ } else { ++ err = "error going read write"; ++ ret = !test_bit(BCH_FS_RW, &c->flags) ++ ? 
bch2_fs_read_write(c) ++ : bch2_fs_read_write_late(c); ++ if (ret) ++ goto err; ++ } ++ ++ print_mount_opts(c); ++ ret = 0; ++out: ++ up_write(&c->state_lock); ++ return ret; ++err: ++ switch (ret) { ++ case BCH_FSCK_ERRORS_NOT_FIXED: ++ bch_err(c, "filesystem contains errors: please report this to the developers"); ++ pr_cont("mount with -o fix_errors to repair\n"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_REPAIR_UNIMPLEMENTED: ++ bch_err(c, "filesystem contains errors: please report this to the developers"); ++ pr_cont("repair unimplemented: inform the developers so that it can be added\n"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_REPAIR_IMPOSSIBLE: ++ bch_err(c, "filesystem contains errors, but repair impossible"); ++ err = "fsck error"; ++ break; ++ case BCH_FSCK_UNKNOWN_VERSION: ++ err = "unknown metadata version";; ++ break; ++ case -ENOMEM: ++ err = "cannot allocate memory"; ++ break; ++ case -EIO: ++ err = "IO error"; ++ break; ++ } ++ ++ if (ret >= 0) ++ ret = -EIO; ++ goto out; ++} ++ ++static const char *bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) ++{ ++ struct bch_sb_field_members *sb_mi; ++ ++ sb_mi = bch2_sb_get_members(sb); ++ if (!sb_mi) ++ return "Invalid superblock: member info area missing"; ++ ++ if (le16_to_cpu(sb->block_size) != c->opts.block_size) ++ return "mismatched block size"; ++ ++ if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < ++ BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) ++ return "new cache bucket size is too small"; ++ ++ return NULL; ++} ++ ++static const char *bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) ++{ ++ struct bch_sb *newest = ++ le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; ++ struct bch_sb_field_members *mi = bch2_sb_get_members(newest); ++ ++ if (uuid_le_cmp(fs->uuid, sb->uuid)) ++ return "device not a member of filesystem"; ++ ++ if (!bch2_dev_exists(newest, mi, sb->dev_idx)) ++ return "device has been removed"; ++ ++ if (fs->block_size != sb->block_size) ++ return "mismatched block size"; ++ ++ return NULL; ++} ++ ++/* Device startup/shutdown: */ ++ ++static void bch2_dev_release(struct kobject *kobj) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ ++ kfree(ca); ++} ++ ++static void bch2_dev_free(struct bch_dev *ca) ++{ ++ cancel_work_sync(&ca->io_error_work); ++ ++ if (ca->kobj.state_in_sysfs && ++ ca->disk_sb.bdev) ++ sysfs_remove_link(&part_to_dev(ca->disk_sb.bdev->bd_part)->kobj, ++ "bcachefs"); ++ ++ if (ca->kobj.state_in_sysfs) ++ kobject_del(&ca->kobj); ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); ++ ++ free_percpu(ca->io_done); ++ bioset_exit(&ca->replica_set); ++ bch2_dev_buckets_free(ca); ++ free_page((unsigned long) ca->sb_read_scratch); ++ ++ bch2_time_stats_exit(&ca->io_latency[WRITE]); ++ bch2_time_stats_exit(&ca->io_latency[READ]); ++ ++ percpu_ref_exit(&ca->io_ref); ++ percpu_ref_exit(&ca->ref); ++ kobject_put(&ca->kobj); ++} ++ ++static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) ++{ ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (percpu_ref_is_zero(&ca->io_ref)) ++ return; ++ ++ __bch2_dev_read_only(c, ca); ++ ++ reinit_completion(&ca->io_ref_completion); ++ percpu_ref_kill(&ca->io_ref); ++ wait_for_completion(&ca->io_ref_completion); ++ ++ if (ca->kobj.state_in_sysfs) { ++ struct kobject *block = ++ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; ++ ++ sysfs_remove_link(block, "bcachefs"); ++ sysfs_remove_link(&ca->kobj, "block"); ++ } ++ ++ bch2_free_super(&ca->disk_sb); ++ bch2_dev_journal_exit(ca); 
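	/*
	 * At this point the device is fully offline: it was flipped
	 * read-only first (stopping its allocator thread and detaching it
	 * from the journal), its io_ref has been killed and waited on so no
	 * I/O is still in flight, the sysfs links are gone, and the
	 * superblock handle and journal state have been released.  The
	 * bch_dev itself is only freed later, by bch2_dev_free().
	 */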
++} ++ ++static void bch2_dev_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, ref); ++ ++ complete(&ca->ref_completion); ++} ++ ++static void bch2_dev_io_ref_complete(struct percpu_ref *ref) ++{ ++ struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); ++ ++ complete(&ca->io_ref_completion); ++} ++ ++static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) ++{ ++ int ret; ++ ++ if (!c->kobj.state_in_sysfs) ++ return 0; ++ ++ if (!ca->kobj.state_in_sysfs) { ++ ret = kobject_add(&ca->kobj, &c->kobj, ++ "dev-%u", ca->dev_idx); ++ if (ret) ++ return ret; ++ } ++ ++ if (ca->disk_sb.bdev) { ++ struct kobject *block = ++ &part_to_dev(ca->disk_sb.bdev->bd_part)->kobj; ++ ++ ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); ++ if (ret) ++ return ret; ++ ret = sysfs_create_link(&ca->kobj, block, "block"); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, ++ struct bch_member *member) ++{ ++ struct bch_dev *ca; ++ ++ ca = kzalloc(sizeof(*ca), GFP_KERNEL); ++ if (!ca) ++ return NULL; ++ ++ kobject_init(&ca->kobj, &bch2_dev_ktype); ++ init_completion(&ca->ref_completion); ++ init_completion(&ca->io_ref_completion); ++ ++ init_rwsem(&ca->bucket_lock); ++ ++ INIT_WORK(&ca->io_error_work, bch2_io_error_work); ++ ++ bch2_time_stats_init(&ca->io_latency[READ]); ++ bch2_time_stats_init(&ca->io_latency[WRITE]); ++ ++ ca->mi = bch2_mi_to_cpu(member); ++ ca->uuid = member->uuid; ++ ++ if (opt_defined(c->opts, discard)) ++ ca->mi.discard = opt_get(c->opts, discard); ++ ++ if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, ++ 0, GFP_KERNEL) || ++ percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, ++ PERCPU_REF_INIT_DEAD, GFP_KERNEL) || ++ !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || ++ bch2_dev_buckets_alloc(c, ca) || ++ bioset_init(&ca->replica_set, 4, ++ offsetof(struct bch_write_bio, bio), 0) || ++ !(ca->io_done = alloc_percpu(*ca->io_done))) ++ goto err; ++ ++ return ca; ++err: ++ bch2_dev_free(ca); ++ return NULL; ++} ++ ++static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, ++ unsigned dev_idx) ++{ ++ ca->dev_idx = dev_idx; ++ __set_bit(ca->dev_idx, ca->self.d); ++ scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ++ ++ ca->fs = c; ++ rcu_assign_pointer(c->devs[ca->dev_idx], ca); ++ ++ if (bch2_dev_sysfs_online(c, ca)) ++ pr_warn("error creating sysfs objects"); ++} ++ ++static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) ++{ ++ struct bch_member *member = ++ bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; ++ struct bch_dev *ca = NULL; ++ int ret = 0; ++ ++ pr_verbose_init(c->opts, ""); ++ ++ if (bch2_fs_init_fault("dev_alloc")) ++ goto err; ++ ++ ca = __bch2_dev_alloc(c, member); ++ if (!ca) ++ goto err; ++ ++ bch2_dev_attach(c, ca, dev_idx); ++out: ++ pr_verbose_init(c->opts, "ret %i", ret); ++ return ret; ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ ret = -ENOMEM; ++ goto out; ++} ++ ++static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) ++{ ++ unsigned ret; ++ ++ if (bch2_dev_is_online(ca)) { ++ bch_err(ca, "already have device online in slot %u", ++ sb->sb->dev_idx); ++ return -EINVAL; ++ } ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * ca->mi.nbuckets) { ++ bch_err(ca, "cannot online: device too small"); ++ return -EINVAL; ++ } ++ ++ BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); ++ ++ if (get_capacity(sb->bdev->bd_disk) < ++ ca->mi.bucket_size * 
ca->mi.nbuckets) { ++ bch_err(ca, "device too small"); ++ return -EINVAL; ++ } ++ ++ ret = bch2_dev_journal_init(ca, sb->sb); ++ if (ret) ++ return ret; ++ ++ /* Commit: */ ++ ca->disk_sb = *sb; ++ if (sb->mode & FMODE_EXCL) ++ ca->disk_sb.bdev->bd_holder = ca; ++ memset(sb, 0, sizeof(*sb)); ++ ++ percpu_ref_reinit(&ca->io_ref); ++ ++ return 0; ++} ++ ++static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) ++{ ++ struct bch_dev *ca; ++ int ret; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ if (le64_to_cpu(sb->sb->seq) > ++ le64_to_cpu(c->disk_sb.sb->seq)) ++ bch2_sb_to_fs(c, sb->sb); ++ ++ BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || ++ !c->devs[sb->sb->dev_idx]); ++ ++ ca = bch_dev_locked(c, sb->sb->dev_idx); ++ ++ ret = __bch2_dev_attach_bdev(ca, sb); ++ if (ret) ++ return ret; ++ ++ if (test_bit(BCH_FS_ALLOC_READ_DONE, &c->flags) && ++ !percpu_u64_get(&ca->usage[0]->buckets[BCH_DATA_sb])) { ++ mutex_lock(&c->sb_lock); ++ bch2_mark_dev_superblock(ca->fs, ca, 0); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ bch2_dev_sysfs_online(c, ca); ++ ++ if (c->sb.nr_devices == 1) ++ bdevname(ca->disk_sb.bdev, c->name); ++ bdevname(ca->disk_sb.bdev, ca->name); ++ ++ rebalance_wakeup(c); ++ return 0; ++} ++ ++/* Device management: */ ++ ++/* ++ * Note: this function is also used by the error paths - when a particular ++ * device sees an error, we call it to determine whether we can just set the ++ * device RO, or - if this function returns false - we'll set the whole ++ * filesystem RO: ++ * ++ * XXX: maybe we should be more explicit about whether we're changing state ++ * because we got an error or what have you? ++ */ ++bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_devs_mask new_online_devs; ++ struct replicas_status s; ++ struct bch_dev *ca2; ++ int i, nr_rw = 0, required; ++ ++ lockdep_assert_held(&c->state_lock); ++ ++ switch (new_state) { ++ case BCH_MEMBER_STATE_RW: ++ return true; ++ case BCH_MEMBER_STATE_RO: ++ if (ca->mi.state != BCH_MEMBER_STATE_RW) ++ return true; ++ ++ /* do we have enough devices to write to? */ ++ for_each_member_device(ca2, c, i) ++ if (ca2 != ca) ++ nr_rw += ca2->mi.state == BCH_MEMBER_STATE_RW; ++ ++ required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) ++ ? c->opts.metadata_replicas ++ : c->opts.metadata_replicas_required, ++ !(flags & BCH_FORCE_IF_DATA_DEGRADED) ++ ? c->opts.data_replicas ++ : c->opts.data_replicas_required); ++ ++ return nr_rw >= required; ++ case BCH_MEMBER_STATE_FAILED: ++ case BCH_MEMBER_STATE_SPARE: ++ if (ca->mi.state != BCH_MEMBER_STATE_RW && ++ ca->mi.state != BCH_MEMBER_STATE_RO) ++ return true; ++ ++ /* do we have enough devices to read from? */ ++ new_online_devs = bch2_online_devs(c); ++ __clear_bit(ca->dev_idx, new_online_devs.d); ++ ++ s = __bch2_replicas_status(c, new_online_devs); ++ ++ return bch2_have_enough_devs(s, flags); ++ default: ++ BUG(); ++ } ++} ++ ++static bool bch2_fs_may_start(struct bch_fs *c) ++{ ++ struct replicas_status s; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned i, flags = c->opts.degraded ++ ? 
BCH_FORCE_IF_DEGRADED ++ : 0; ++ ++ if (!c->opts.degraded) { ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) ++ continue; ++ ++ ca = bch_dev_locked(c, i); ++ ++ if (!bch2_dev_is_online(ca) && ++ (ca->mi.state == BCH_MEMBER_STATE_RW || ++ ca->mi.state == BCH_MEMBER_STATE_RO)) { ++ mutex_unlock(&c->sb_lock); ++ return false; ++ } ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ s = bch2_replicas_status(c); ++ ++ return bch2_have_enough_devs(s, flags); ++} ++ ++static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) ++{ ++ /* ++ * Device going read only means the copygc reserve get smaller, so we ++ * don't want that happening while copygc is in progress: ++ */ ++ bch2_copygc_stop(c); ++ ++ /* ++ * The allocator thread itself allocates btree nodes, so stop it first: ++ */ ++ bch2_dev_allocator_stop(ca); ++ bch2_dev_allocator_remove(c, ca); ++ bch2_dev_journal_stop(&c->journal, ca); ++ ++ bch2_copygc_start(c); ++} ++ ++static const char *__bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) ++{ ++ lockdep_assert_held(&c->state_lock); ++ ++ BUG_ON(ca->mi.state != BCH_MEMBER_STATE_RW); ++ ++ bch2_dev_allocator_add(c, ca); ++ bch2_recalc_capacity(c); ++ ++ if (bch2_dev_allocator_start(ca)) ++ return "error starting allocator thread"; ++ ++ return NULL; ++} ++ ++int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ int ret = 0; ++ ++ if (ca->mi.state == new_state) ++ return 0; ++ ++ if (!bch2_dev_state_allowed(c, ca, new_state, flags)) ++ return -EINVAL; ++ ++ if (new_state != BCH_MEMBER_STATE_RW) ++ __bch2_dev_read_only(c, ca); ++ ++ bch_notice(ca, "%s", bch2_dev_state[new_state]); ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ if (new_state == BCH_MEMBER_STATE_RW && ++ __bch2_dev_read_write(c, ca)) ++ ret = -ENOMEM; ++ ++ rebalance_wakeup(c); ++ ++ return ret; ++} ++ ++int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, ++ enum bch_member_state new_state, int flags) ++{ ++ int ret; ++ ++ down_write(&c->state_lock); ++ ret = __bch2_dev_set_state(c, ca, new_state, flags); ++ up_write(&c->state_lock); ++ ++ return ret; ++} ++ ++/* Device add/removal: */ ++ ++int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) ++{ ++ struct btree_trans trans; ++ size_t i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < ca->mi.nbuckets; i++) { ++ ret = bch2_btree_key_cache_flush(&trans, ++ BTREE_ID_ALLOC, POS(ca->dev_idx, i)); ++ if (ret) ++ break; ++ } ++ bch2_trans_exit(&trans); ++ ++ if (ret) ++ return ret; ++ ++ return bch2_btree_delete_range(c, BTREE_ID_ALLOC, ++ POS(ca->dev_idx, 0), ++ POS(ca->dev_idx + 1, 0), ++ NULL); ++} ++ ++int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ struct bch_sb_field_members *mi; ++ unsigned dev_idx = ca->dev_idx, data; ++ int ret = -EINVAL; ++ ++ down_write(&c->state_lock); ++ ++ /* ++ * We consume a reference to ca->ref, regardless of whether we succeed ++ * or fail: ++ */ ++ percpu_ref_put(&ca->ref); ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { ++ bch_err(ca, "Cannot remove without losing data"); ++ goto err; ++ } ++ ++ __bch2_dev_read_only(c, ca); ++ ++ ret = bch2_dev_data_drop(c, 
ca->dev_idx, flags); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i dropping data", ret); ++ goto err; ++ } ++ ++ ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i flushing journal", ret); ++ goto err; ++ } ++ ++ ret = bch2_dev_remove_alloc(c, ca); ++ if (ret) { ++ bch_err(ca, "Remove failed, error deleting alloc info"); ++ goto err; ++ } ++ ++ /* ++ * must flush all existing journal entries, they might have ++ * (overwritten) keys that point to the device we're removing: ++ */ ++ bch2_journal_flush_all_pins(&c->journal); ++ /* ++ * hack to ensure bch2_replicas_gc2() clears out entries to this device ++ */ ++ bch2_journal_meta(&c->journal); ++ ret = bch2_journal_error(&c->journal); ++ if (ret) { ++ bch_err(ca, "Remove failed, journal error"); ++ goto err; ++ } ++ ++ ret = bch2_replicas_gc2(c); ++ if (ret) { ++ bch_err(ca, "Remove failed: error %i from replicas gc", ret); ++ goto err; ++ } ++ ++ data = bch2_dev_has_data(c, ca); ++ if (data) { ++ char data_has_str[100]; ++ ++ bch2_flags_to_text(&PBUF(data_has_str), ++ bch2_data_types, data); ++ bch_err(ca, "Remove failed, still has data (%s)", data_has_str); ++ ret = -EBUSY; ++ goto err; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ mutex_lock(&c->sb_lock); ++ rcu_assign_pointer(c->devs[ca->dev_idx], NULL); ++ mutex_unlock(&c->sb_lock); ++ ++ percpu_ref_kill(&ca->ref); ++ wait_for_completion(&ca->ref_completion); ++ ++ bch2_dev_free(ca); ++ ++ /* ++ * Free this device's slot in the bch_member array - all pointers to ++ * this device must be gone: ++ */ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); ++ ++ bch2_write_super(c); ++ ++ mutex_unlock(&c->sb_lock); ++ up_write(&c->state_lock); ++ return 0; ++err: ++ if (ca->mi.state == BCH_MEMBER_STATE_RW && ++ !percpu_ref_is_zero(&ca->io_ref)) ++ __bch2_dev_read_write(c, ca); ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++static void dev_usage_clear(struct bch_dev *ca) ++{ ++ struct bucket_array *buckets; ++ ++ percpu_memset(ca->usage[0], 0, sizeof(*ca->usage[0])); ++ ++ down_read(&ca->bucket_lock); ++ buckets = bucket_array(ca); ++ ++ memset(buckets->b, 0, sizeof(buckets->b[0]) * buckets->nbuckets); ++ up_read(&ca->bucket_lock); ++} ++ ++/* Add new device to running filesystem: */ ++int bch2_dev_add(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb; ++ const char *err; ++ struct bch_dev *ca = NULL; ++ struct bch_sb_field_members *mi; ++ struct bch_member dev_mi; ++ unsigned dev_idx, nr_devices, u64s; ++ int ret; ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) ++ return ret; ++ ++ err = bch2_sb_validate(&sb); ++ if (err) ++ return -EINVAL; ++ ++ dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; ++ ++ err = bch2_dev_may_add(sb.sb, c); ++ if (err) ++ return -EINVAL; ++ ++ ca = __bch2_dev_alloc(c, &dev_mi); ++ if (!ca) { ++ bch2_free_super(&sb); ++ return -ENOMEM; ++ } ++ ++ ret = __bch2_dev_attach_bdev(ca, &sb); ++ if (ret) { ++ bch2_dev_free(ca); ++ return ret; ++ } ++ ++ /* ++ * We want to allocate journal on the new device before adding the new ++ * device to the filesystem because allocating after we attach requires ++ * spinning up the allocator thread, and the allocator thread requires ++ * doing btree writes, which if the existing devices are RO isn't going ++ * to work ++ * ++ * So we have to mark where the superblocks are, but 
marking allocated ++ * data normally updates the filesystem usage too, so we have to mark, ++ * allocate the journal, reset all the marks, then remark after we ++ * attach... ++ */ ++ bch2_mark_dev_superblock(ca->fs, ca, 0); ++ ++ err = "journal alloc failed"; ++ ret = bch2_dev_journal_alloc(ca); ++ if (ret) ++ goto err; ++ ++ dev_usage_clear(ca); ++ ++ down_write(&c->state_lock); ++ mutex_lock(&c->sb_lock); ++ ++ err = "insufficient space in new superblock"; ++ ret = bch2_sb_from_fs(c, ca); ++ if (ret) ++ goto err_unlock; ++ ++ mi = bch2_sb_get_members(ca->disk_sb.sb); ++ ++ if (!bch2_sb_resize_members(&ca->disk_sb, ++ le32_to_cpu(mi->field.u64s) + ++ sizeof(dev_mi) / sizeof(u64))) { ++ ret = -ENOSPC; ++ goto err_unlock; ++ } ++ ++ if (dynamic_fault("bcachefs:add:no_slot")) ++ goto no_slot; ++ ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) ++ if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) ++ goto have_slot; ++no_slot: ++ err = "no slots available in superblock"; ++ ret = -ENOSPC; ++ goto err_unlock; ++ ++have_slot: ++ nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); ++ u64s = (sizeof(struct bch_sb_field_members) + ++ sizeof(struct bch_member) * nr_devices) / sizeof(u64); ++ ++ err = "no space in superblock for member info"; ++ ret = -ENOSPC; ++ ++ mi = bch2_sb_resize_members(&c->disk_sb, u64s); ++ if (!mi) ++ goto err_unlock; ++ ++ /* success: */ ++ ++ mi->members[dev_idx] = dev_mi; ++ mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); ++ c->disk_sb.sb->nr_devices = nr_devices; ++ ++ ca->disk_sb.sb->dev_idx = dev_idx; ++ bch2_dev_attach(c, ca, dev_idx); ++ ++ bch2_mark_dev_superblock(c, ca, 0); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ err = "alloc write failed"; ++ ret = bch2_dev_alloc_write(c, ca, 0); ++ if (ret) ++ goto err; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_RW) { ++ err = __bch2_dev_read_write(c, ca); ++ if (err) ++ goto err_late; ++ } ++ ++ up_write(&c->state_lock); ++ return 0; ++ ++err_unlock: ++ mutex_unlock(&c->sb_lock); ++ up_write(&c->state_lock); ++err: ++ if (ca) ++ bch2_dev_free(ca); ++ bch2_free_super(&sb); ++ bch_err(c, "Unable to add device: %s", err); ++ return ret; ++err_late: ++ bch_err(c, "Error going rw after adding device: %s", err); ++ return -EINVAL; ++} ++ ++/* Hot add existing device to running filesystem: */ ++int bch2_dev_online(struct bch_fs *c, const char *path) ++{ ++ struct bch_opts opts = bch2_opts_empty(); ++ struct bch_sb_handle sb = { NULL }; ++ struct bch_sb_field_members *mi; ++ struct bch_dev *ca; ++ unsigned dev_idx; ++ const char *err; ++ int ret; ++ ++ down_write(&c->state_lock); ++ ++ ret = bch2_read_super(path, &opts, &sb); ++ if (ret) { ++ up_write(&c->state_lock); ++ return ret; ++ } ++ ++ dev_idx = sb.sb->dev_idx; ++ ++ err = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); ++ if (err) ++ goto err; ++ ++ if (bch2_dev_attach_bdev(c, &sb)) { ++ err = "bch2_dev_attach_bdev() error"; ++ goto err; ++ } ++ ++ ca = bch_dev_locked(c, dev_idx); ++ if (ca->mi.state == BCH_MEMBER_STATE_RW) { ++ err = __bch2_dev_read_write(c, ca); ++ if (err) ++ goto err; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ mi = bch2_sb_get_members(c->disk_sb.sb); ++ ++ mi->members[ca->dev_idx].last_mount = ++ cpu_to_le64(ktime_get_real_seconds()); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ up_write(&c->state_lock); ++ return 0; ++err: ++ up_write(&c->state_lock); ++ bch2_free_super(&sb); ++ bch_err(c, "error bringing %s online: %s", path, err); ++ 
return -EINVAL; ++} ++ ++int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) ++{ ++ down_write(&c->state_lock); ++ ++ if (!bch2_dev_is_online(ca)) { ++ bch_err(ca, "Already offline"); ++ up_write(&c->state_lock); ++ return 0; ++ } ++ ++ if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_FAILED, flags)) { ++ bch_err(ca, "Cannot offline required disk"); ++ up_write(&c->state_lock); ++ return -EINVAL; ++ } ++ ++ __bch2_dev_offline(c, ca); ++ ++ up_write(&c->state_lock); ++ return 0; ++} ++ ++int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) ++{ ++ struct bch_member *mi; ++ int ret = 0; ++ ++ down_write(&c->state_lock); ++ ++ if (nbuckets < ca->mi.nbuckets) { ++ bch_err(ca, "Cannot shrink yet"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ if (bch2_dev_is_online(ca) && ++ get_capacity(ca->disk_sb.bdev->bd_disk) < ++ ca->mi.bucket_size * nbuckets) { ++ bch_err(ca, "New size larger than device"); ++ ret = -EINVAL; ++ goto err; ++ } ++ ++ ret = bch2_dev_buckets_resize(c, ca, nbuckets); ++ if (ret) { ++ bch_err(ca, "Resize error: %i", ret); ++ goto err; ++ } ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ mi->nbuckets = cpu_to_le64(nbuckets); ++ ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ ++ bch2_recalc_capacity(c); ++err: ++ up_write(&c->state_lock); ++ return ret; ++} ++ ++/* return with ref on ca->ref: */ ++struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *path) ++{ ++ struct block_device *bdev = lookup_bdev(path); ++ struct bch_dev *ca; ++ unsigned i; ++ ++ if (IS_ERR(bdev)) ++ return ERR_CAST(bdev); ++ ++ for_each_member_device(ca, c, i) ++ if (ca->disk_sb.bdev == bdev) ++ goto found; ++ ++ ca = ERR_PTR(-ENOENT); ++found: ++ bdput(bdev); ++ return ca; ++} ++ ++/* Filesystem open: */ ++ ++struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, ++ struct bch_opts opts) ++{ ++ struct bch_sb_handle *sb = NULL; ++ struct bch_fs *c = NULL; ++ struct bch_sb_field_members *mi; ++ unsigned i, best_sb = 0; ++ const char *err; ++ int ret = -ENOMEM; ++ ++ pr_verbose_init(opts, ""); ++ ++ if (!nr_devices) { ++ c = ERR_PTR(-EINVAL); ++ goto out2; ++ } ++ ++ if (!try_module_get(THIS_MODULE)) { ++ c = ERR_PTR(-ENODEV); ++ goto out2; ++ } ++ ++ sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); ++ if (!sb) ++ goto err; ++ ++ for (i = 0; i < nr_devices; i++) { ++ ret = bch2_read_super(devices[i], &opts, &sb[i]); ++ if (ret) ++ goto err; ++ ++ err = bch2_sb_validate(&sb[i]); ++ if (err) ++ goto err_print; ++ } ++ ++ for (i = 1; i < nr_devices; i++) ++ if (le64_to_cpu(sb[i].sb->seq) > ++ le64_to_cpu(sb[best_sb].sb->seq)) ++ best_sb = i; ++ ++ mi = bch2_sb_get_members(sb[best_sb].sb); ++ ++ i = 0; ++ while (i < nr_devices) { ++ if (i != best_sb && ++ !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { ++ char buf[BDEVNAME_SIZE]; ++ pr_info("%s has been removed, skipping", ++ bdevname(sb[i].bdev, buf)); ++ bch2_free_super(&sb[i]); ++ array_remove_item(sb, nr_devices, i); ++ continue; ++ } ++ ++ err = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); ++ if (err) ++ goto err_print; ++ i++; ++ } ++ ++ ret = -ENOMEM; ++ c = bch2_fs_alloc(sb[best_sb].sb, opts); ++ if (!c) ++ goto err; ++ ++ err = "bch2_dev_online() error"; ++ down_write(&c->state_lock); ++ for (i = 0; i < nr_devices; i++) ++ if (bch2_dev_attach_bdev(c, &sb[i])) { ++ up_write(&c->state_lock); ++ goto err_print; ++ } ++ up_write(&c->state_lock); ++ ++ err = "insufficient devices"; ++ if (!bch2_fs_may_start(c)) ++ goto 
err_print; ++ ++ if (!c->opts.nostart) { ++ ret = bch2_fs_start(c); ++ if (ret) ++ goto err; ++ } ++out: ++ kfree(sb); ++ module_put(THIS_MODULE); ++out2: ++ pr_verbose_init(opts, "ret %i", PTR_ERR_OR_ZERO(c)); ++ return c; ++err_print: ++ pr_err("bch_fs_open err opening %s: %s", ++ devices[0], err); ++ ret = -EINVAL; ++err: ++ if (c) ++ bch2_fs_stop(c); ++ for (i = 0; i < nr_devices; i++) ++ bch2_free_super(&sb[i]); ++ c = ERR_PTR(ret); ++ goto out; ++} ++ ++static const char *__bch2_fs_open_incremental(struct bch_sb_handle *sb, ++ struct bch_opts opts) ++{ ++ const char *err; ++ struct bch_fs *c; ++ bool allocated_fs = false; ++ int ret; ++ ++ err = bch2_sb_validate(sb); ++ if (err) ++ return err; ++ ++ mutex_lock(&bch_fs_list_lock); ++ c = __bch2_uuid_to_fs(sb->sb->uuid); ++ if (c) { ++ closure_get(&c->cl); ++ ++ err = bch2_dev_in_fs(c->disk_sb.sb, sb->sb); ++ if (err) ++ goto err; ++ } else { ++ c = bch2_fs_alloc(sb->sb, opts); ++ err = "cannot allocate memory"; ++ if (!c) ++ goto err; ++ ++ allocated_fs = true; ++ } ++ ++ err = "bch2_dev_online() error"; ++ ++ mutex_lock(&c->sb_lock); ++ if (bch2_dev_attach_bdev(c, sb)) { ++ mutex_unlock(&c->sb_lock); ++ goto err; ++ } ++ mutex_unlock(&c->sb_lock); ++ ++ if (!c->opts.nostart && bch2_fs_may_start(c)) { ++ err = "error starting filesystem"; ++ ret = bch2_fs_start(c); ++ if (ret) ++ goto err; ++ } ++ ++ closure_put(&c->cl); ++ mutex_unlock(&bch_fs_list_lock); ++ ++ return NULL; ++err: ++ mutex_unlock(&bch_fs_list_lock); ++ ++ if (allocated_fs) ++ bch2_fs_stop(c); ++ else if (c) ++ closure_put(&c->cl); ++ ++ return err; ++} ++ ++const char *bch2_fs_open_incremental(const char *path) ++{ ++ struct bch_sb_handle sb; ++ struct bch_opts opts = bch2_opts_empty(); ++ const char *err; ++ ++ if (bch2_read_super(path, &opts, &sb)) ++ return "error reading superblock"; ++ ++ err = __bch2_fs_open_incremental(&sb, opts); ++ bch2_free_super(&sb); ++ ++ return err; ++} ++ ++/* Global interfaces/init */ ++ ++static void bcachefs_exit(void) ++{ ++ bch2_debug_exit(); ++ bch2_vfs_exit(); ++ bch2_chardev_exit(); ++ if (bcachefs_kset) ++ kset_unregister(bcachefs_kset); ++} ++ ++static int __init bcachefs_init(void) ++{ ++ bch2_bkey_pack_test(); ++ bch2_inode_pack_test(); ++ ++ if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || ++ bch2_chardev_init() || ++ bch2_vfs_init() || ++ bch2_debug_init()) ++ goto err; ++ ++ return 0; ++err: ++ bcachefs_exit(); ++ return -ENOMEM; ++} ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ bool bch2_##name; \ ++ module_param_named(name, bch2_##name, bool, 0644); \ ++ MODULE_PARM_DESC(name, description); ++BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++module_exit(bcachefs_exit); ++module_init(bcachefs_init); +diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h +new file mode 100644 +index 000000000000..02c81f3555c3 +--- /dev/null ++++ b/fs/bcachefs/super.h +@@ -0,0 +1,241 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_H ++#define _BCACHEFS_SUPER_H ++ ++#include "extents.h" ++ ++#include "bcachefs_ioctl.h" ++ ++#include ++ ++static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) ++{ ++ return div_u64(s, ca->mi.bucket_size); ++} ++ ++static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) ++{ ++ return ((sector_t) b) * ca->mi.bucket_size; ++} ++ ++static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) ++{ ++ u32 remainder; ++ ++ div_u64_rem(s, ca->mi.bucket_size, &remainder); ++ return remainder; ++} ++ ++static 
inline bool bch2_dev_is_online(struct bch_dev *ca) ++{ ++ return !percpu_ref_is_zero(&ca->io_ref); ++} ++ ++static inline bool bch2_dev_is_readable(struct bch_dev *ca) ++{ ++ return bch2_dev_is_online(ca) && ++ ca->mi.state != BCH_MEMBER_STATE_FAILED; ++} ++ ++static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) ++{ ++ if (!percpu_ref_tryget(&ca->io_ref)) ++ return false; ++ ++ if (ca->mi.state == BCH_MEMBER_STATE_RW || ++ (ca->mi.state == BCH_MEMBER_STATE_RO && rw == READ)) ++ return true; ++ ++ percpu_ref_put(&ca->io_ref); ++ return false; ++} ++ ++static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) ++{ ++ return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); ++} ++ ++static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs.nr; i++) ++ if (devs.devs[i] == dev) ++ return true; ++ ++ return false; ++} ++ ++static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ unsigned i; ++ ++ for (i = 0; i < devs->nr; i++) ++ if (devs->devs[i] == dev) { ++ array_remove_item(devs->devs, devs->nr, i); ++ return; ++ } ++} ++ ++static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, ++ unsigned dev) ++{ ++ BUG_ON(bch2_dev_list_has_dev(*devs, dev)); ++ BUG_ON(devs->nr >= BCH_REPLICAS_MAX); ++ devs->devs[devs->nr++] = dev; ++} ++ ++static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) ++{ ++ return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; ++} ++ ++static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, ++ const struct bch_devs_mask *mask) ++{ ++ struct bch_dev *ca = NULL; ++ ++ while ((*iter = mask ++ ? find_next_bit(mask->d, c->sb.nr_devices, *iter) ++ : *iter) < c->sb.nr_devices && ++ !(ca = rcu_dereference_check(c->devs[*iter], ++ lockdep_is_held(&c->state_lock)))) ++ (*iter)++; ++ ++ return ca; ++} ++ ++#define __for_each_member_device(ca, c, iter, mask) \ ++ for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) ++ ++#define for_each_member_device_rcu(ca, c, iter, mask) \ ++ __for_each_member_device(ca, c, iter, mask) ++ ++static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ if ((ca = __bch2_next_dev(c, iter, NULL))) ++ percpu_ref_get(&ca->ref); ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++/* ++ * If you break early, you must drop your ref on the current device ++ */ ++#define for_each_member_device(ca, c, iter) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_dev(c, &(iter))); \ ++ percpu_ref_put(&ca->ref), (iter)++) ++ ++static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, ++ unsigned *iter, ++ int state_mask) ++{ ++ struct bch_dev *ca; ++ ++ rcu_read_lock(); ++ while ((ca = __bch2_next_dev(c, iter, NULL)) && ++ (!((1 << ca->mi.state) & state_mask) || ++ !percpu_ref_tryget(&ca->io_ref))) ++ (*iter)++; ++ rcu_read_unlock(); ++ ++ return ca; ++} ++ ++#define __for_each_online_member(ca, c, iter, state_mask) \ ++ for ((iter) = 0; \ ++ (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ ++ percpu_ref_put(&ca->io_ref), (iter)++) ++ ++#define for_each_online_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, ~0) ++ ++#define for_each_rw_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_RW) ++ ++#define for_each_readable_member(ca, c, iter) \ ++ __for_each_online_member(ca, c, iter, \ ++ (1 << BCH_MEMBER_STATE_RW)|(1 << 
BCH_MEMBER_STATE_RO)) ++ ++/* ++ * If a key exists that references a device, the device won't be going away and ++ * we can omit rcu_read_lock(): ++ */ ++static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_check(c->devs[idx], 1); ++} ++ ++static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) ++{ ++ EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); ++ ++ return rcu_dereference_protected(c->devs[idx], ++ lockdep_is_held(&c->sb_lock) || ++ lockdep_is_held(&c->state_lock)); ++} ++ ++/* XXX kill, move to struct bch_fs */ ++static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) ++{ ++ struct bch_devs_mask devs; ++ struct bch_dev *ca; ++ unsigned i; ++ ++ memset(&devs, 0, sizeof(devs)); ++ for_each_online_member(ca, c, i) ++ __set_bit(ca->dev_idx, devs.d); ++ return devs; ++} ++ ++struct bch_fs *bch2_bdev_to_fs(struct block_device *); ++struct bch_fs *bch2_uuid_to_fs(uuid_le); ++ ++bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, ++ enum bch_member_state, int); ++ ++int bch2_dev_fail(struct bch_dev *, int); ++int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_add(struct bch_fs *, const char *); ++int bch2_dev_online(struct bch_fs *, const char *); ++int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); ++int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); ++struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); ++ ++bool bch2_fs_emergency_read_only(struct bch_fs *); ++void bch2_fs_read_only(struct bch_fs *); ++ ++int bch2_fs_read_write(struct bch_fs *); ++int bch2_fs_read_write_early(struct bch_fs *); ++ ++/* ++ * Only for use in the recovery/fsck path: ++ */ ++static inline void bch2_fs_lazy_rw(struct bch_fs *c) ++{ ++ if (percpu_ref_is_zero(&c->writes)) ++ bch2_fs_read_write_early(c); ++} ++ ++void __bch2_fs_stop(struct bch_fs *); ++void bch2_fs_free(struct bch_fs *); ++void bch2_fs_stop(struct bch_fs *); ++ ++int bch2_fs_start(struct bch_fs *); ++struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); ++const char *bch2_fs_open_incremental(const char *path); ++ ++#endif /* _BCACHEFS_SUPER_H */ +diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h +new file mode 100644 +index 000000000000..20406ebd6f5b +--- /dev/null ++++ b/fs/bcachefs/super_types.h +@@ -0,0 +1,51 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SUPER_TYPES_H ++#define _BCACHEFS_SUPER_TYPES_H ++ ++struct bch_sb_handle { ++ struct bch_sb *sb; ++ struct block_device *bdev; ++ struct bio *bio; ++ unsigned page_order; ++ fmode_t mode; ++ unsigned have_layout:1; ++ unsigned have_bio:1; ++ unsigned fs_sb:1; ++ u64 seq; ++}; ++ ++struct bch_devs_mask { ++ unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; ++}; ++ ++struct bch_devs_list { ++ u8 nr; ++ u8 devs[BCH_REPLICAS_MAX + 1]; ++}; ++ ++struct bch_member_cpu { ++ u64 nbuckets; /* device size */ ++ u16 first_bucket; /* index of first bucket used */ ++ u16 bucket_size; /* sectors */ ++ u16 group; ++ u8 state; ++ u8 replacement; ++ u8 discard; ++ u8 data_allowed; ++ u8 durability; ++ u8 valid; ++}; ++ ++struct bch_disk_group_cpu { ++ bool deleted; ++ u16 parent; ++ struct bch_devs_mask devs; ++}; ++ ++struct bch_disk_groups_cpu { ++ 
struct rcu_head rcu; ++ unsigned nr; ++ struct bch_disk_group_cpu entries[]; ++}; ++ ++#endif /* _BCACHEFS_SUPER_TYPES_H */ +diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c +new file mode 100644 +index 000000000000..0cb29f43d99d +--- /dev/null ++++ b/fs/bcachefs/sysfs.c +@@ -0,0 +1,1074 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * bcache sysfs interfaces ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++#include "bcachefs.h" ++#include "alloc_background.h" ++#include "sysfs.h" ++#include "btree_cache.h" ++#include "btree_io.h" ++#include "btree_iter.h" ++#include "btree_key_cache.h" ++#include "btree_update.h" ++#include "btree_update_interior.h" ++#include "btree_gc.h" ++#include "buckets.h" ++#include "clock.h" ++#include "disk_groups.h" ++#include "ec.h" ++#include "inode.h" ++#include "journal.h" ++#include "keylist.h" ++#include "move.h" ++#include "opts.h" ++#include "rebalance.h" ++#include "replicas.h" ++#include "super-io.h" ++#include "tests.h" ++ ++#include ++#include ++#include ++ ++#include "util.h" ++ ++#define SYSFS_OPS(type) \ ++struct sysfs_ops type ## _sysfs_ops = { \ ++ .show = type ## _show, \ ++ .store = type ## _store \ ++} ++ ++#define SHOW(fn) \ ++static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ ++ char *buf) \ ++ ++#define STORE(fn) \ ++static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ ++ const char *buf, size_t size) \ ++ ++#define __sysfs_attribute(_name, _mode) \ ++ static struct attribute sysfs_##_name = \ ++ { .name = #_name, .mode = _mode } ++ ++#define write_attribute(n) __sysfs_attribute(n, S_IWUSR) ++#define read_attribute(n) __sysfs_attribute(n, S_IRUGO) ++#define rw_attribute(n) __sysfs_attribute(n, S_IRUGO|S_IWUSR) ++ ++#define sysfs_printf(file, fmt, ...) 
\ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return scnprintf(buf, PAGE_SIZE, fmt "\n", __VA_ARGS__);\ ++} while (0) ++ ++#define sysfs_print(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return snprint(buf, PAGE_SIZE, var); \ ++} while (0) ++ ++#define sysfs_hprint(file, val) \ ++do { \ ++ if (attr == &sysfs_ ## file) { \ ++ bch2_hprint(&out, val); \ ++ pr_buf(&out, "\n"); \ ++ return out.pos - buf; \ ++ } \ ++} while (0) ++ ++#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) ++#define var_print(_var) sysfs_print(_var, var(_var)) ++#define var_hprint(_var) sysfs_hprint(_var, var(_var)) ++ ++#define sysfs_strtoul(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe(buf, var) ?: (ssize_t) size; \ ++} while (0) ++ ++#define sysfs_strtoul_clamp(file, var, min, max) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoul_safe_clamp(buf, var, min, max) \ ++ ?: (ssize_t) size; \ ++} while (0) ++ ++#define strtoul_or_return(cp) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define strtoul_restrict_or_return(cp, min, max) \ ++({ \ ++ unsigned long __v = 0; \ ++ int _r = strtoul_safe_restrict(cp, __v, min, max); \ ++ if (_r) \ ++ return _r; \ ++ __v; \ ++}) ++ ++#define strtoi_h_or_return(cp) \ ++({ \ ++ u64 _v; \ ++ int _r = strtoi_h(cp, &_v); \ ++ if (_r) \ ++ return _r; \ ++ _v; \ ++}) ++ ++#define sysfs_hatoi(file, var) \ ++do { \ ++ if (attr == &sysfs_ ## file) \ ++ return strtoi_h(buf, &var) ?: (ssize_t) size; \ ++} while (0) ++ ++write_attribute(trigger_journal_flush); ++write_attribute(trigger_btree_coalesce); ++write_attribute(trigger_gc); ++write_attribute(prune_cache); ++rw_attribute(btree_gc_periodic); ++ ++read_attribute(uuid); ++read_attribute(minor); ++read_attribute(bucket_size); ++read_attribute(block_size); ++read_attribute(btree_node_size); ++read_attribute(first_bucket); ++read_attribute(nbuckets); ++read_attribute(durability); ++read_attribute(iodone); ++ ++read_attribute(io_latency_read); ++read_attribute(io_latency_write); ++read_attribute(io_latency_stats_read); ++read_attribute(io_latency_stats_write); ++read_attribute(congested); ++ ++read_attribute(bucket_quantiles_last_read); ++read_attribute(bucket_quantiles_last_write); ++read_attribute(bucket_quantiles_fragmentation); ++read_attribute(bucket_quantiles_oldest_gen); ++ ++read_attribute(reserve_stats); ++read_attribute(btree_cache_size); ++read_attribute(compression_stats); ++read_attribute(journal_debug); ++read_attribute(journal_pins); ++read_attribute(btree_updates); ++read_attribute(dirty_btree_nodes); ++read_attribute(btree_key_cache); ++read_attribute(btree_transactions); ++read_attribute(stripes_heap); ++ ++read_attribute(internal_uuid); ++ ++read_attribute(has_data); ++read_attribute(alloc_debug); ++write_attribute(wake_allocator); ++ ++read_attribute(read_realloc_races); ++read_attribute(extent_migrate_done); ++read_attribute(extent_migrate_raced); ++ ++rw_attribute(journal_write_delay_ms); ++rw_attribute(journal_reclaim_delay_ms); ++ ++rw_attribute(discard); ++rw_attribute(cache_replacement_policy); ++rw_attribute(label); ++ ++rw_attribute(copy_gc_enabled); ++sysfs_pd_controller_attribute(copy_gc); ++ ++rw_attribute(rebalance_enabled); ++sysfs_pd_controller_attribute(rebalance); ++read_attribute(rebalance_work); ++rw_attribute(promote_whole_extents); ++ ++read_attribute(new_stripes); ++ ++rw_attribute(pd_controllers_update_seconds); ++ ++read_attribute(meta_replicas_have); 
++read_attribute(data_replicas_have); ++ ++read_attribute(io_timers_read); ++read_attribute(io_timers_write); ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++write_attribute(perf_test); ++#endif /* CONFIG_BCACHEFS_TESTS */ ++ ++#define BCH_DEBUG_PARAM(name, description) \ ++ rw_attribute(name); ++ ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++#define x(_name) \ ++ static struct attribute sysfs_time_stat_##_name = \ ++ { .name = #_name, .mode = S_IRUGO }; ++ BCH_TIME_STATS() ++#undef x ++ ++static struct attribute sysfs_state_rw = { ++ .name = "state", ++ .mode = S_IRUGO ++}; ++ ++static size_t bch2_btree_cache_size(struct bch_fs *c) ++{ ++ size_t ret = 0; ++ struct btree *b; ++ ++ mutex_lock(&c->btree_cache.lock); ++ list_for_each_entry(b, &c->btree_cache.live, list) ++ ret += btree_bytes(c); ++ ++ mutex_unlock(&c->btree_cache.lock); ++ return ret; ++} ++ ++static int fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c); ++ ++ if (!fs_usage) ++ return -ENOMEM; ++ ++ bch2_fs_usage_to_text(out, c, fs_usage); ++ ++ percpu_up_read(&c->mark_lock); ++ ++ kfree(fs_usage); ++ return 0; ++} ++ ++static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 nr_uncompressed_extents = 0, uncompressed_sectors = 0, ++ nr_compressed_extents = 0, ++ compressed_sectors_compressed = 0, ++ compressed_sectors_uncompressed = 0; ++ int ret; ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, 0, k, ret) ++ if (k.k->type == KEY_TYPE_extent) { ++ struct bkey_s_c_extent e = bkey_s_c_to_extent(k); ++ const union bch_extent_entry *entry; ++ struct extent_ptr_decoded p; ++ ++ extent_for_each_ptr_decode(e, p, entry) { ++ if (!crc_is_compressed(p.crc)) { ++ nr_uncompressed_extents++; ++ uncompressed_sectors += e.k->size; ++ } else { ++ nr_compressed_extents++; ++ compressed_sectors_compressed += ++ p.crc.compressed_size; ++ compressed_sectors_uncompressed += ++ p.crc.uncompressed_size; ++ } ++ ++ /* only looking at the first ptr */ ++ break; ++ } ++ } ++ ++ ret = bch2_trans_exit(&trans) ?: ret; ++ if (ret) ++ return ret; ++ ++ pr_buf(out, ++ "uncompressed data:\n" ++ " nr extents: %llu\n" ++ " size (bytes): %llu\n" ++ "compressed data:\n" ++ " nr extents: %llu\n" ++ " compressed size (bytes): %llu\n" ++ " uncompressed size (bytes): %llu\n", ++ nr_uncompressed_extents, ++ uncompressed_sectors << 9, ++ nr_compressed_extents, ++ compressed_sectors_compressed << 9, ++ compressed_sectors_uncompressed << 9); ++ return 0; ++} ++ ++SHOW(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ sysfs_print(minor, c->minor); ++ sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); ++ ++ sysfs_print(journal_write_delay_ms, c->journal.write_delay_ms); ++ sysfs_print(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); ++ ++ sysfs_print(block_size, block_bytes(c)); ++ sysfs_print(btree_node_size, btree_bytes(c)); ++ sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); ++ ++ sysfs_print(read_realloc_races, ++ atomic_long_read(&c->read_realloc_races)); ++ sysfs_print(extent_migrate_done, ++ atomic_long_read(&c->extent_migrate_done)); ++ sysfs_print(extent_migrate_raced, ++ atomic_long_read(&c->extent_migrate_raced)); ++ ++ sysfs_printf(btree_gc_periodic, "%u", (int) 
c->btree_gc_periodic); ++ ++ sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); ++ ++ sysfs_print(pd_controllers_update_seconds, ++ c->pd_controllers_update_seconds); ++ ++ sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); ++ sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ ++ sysfs_pd_controller_show(copy_gc, &c->copygc_pd); ++ ++ if (attr == &sysfs_rebalance_work) { ++ bch2_rebalance_work_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ sysfs_print(promote_whole_extents, c->promote_whole_extents); ++ ++ sysfs_printf(meta_replicas_have, "%i", bch2_replicas_online(c, true)); ++ sysfs_printf(data_replicas_have, "%i", bch2_replicas_online(c, false)); ++ ++ /* Debugging: */ ++ ++ if (attr == &sysfs_alloc_debug) ++ return fs_alloc_debug_to_text(&out, c) ?: out.pos - buf; ++ ++ if (attr == &sysfs_journal_debug) { ++ bch2_journal_debug_to_text(&out, &c->journal); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_journal_pins) { ++ bch2_journal_pins_to_text(&out, &c->journal); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_btree_updates) { ++ bch2_btree_updates_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_dirty_btree_nodes) { ++ bch2_dirty_btree_nodes_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_btree_key_cache) { ++ bch2_btree_key_cache_to_text(&out, &c->btree_key_cache); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_btree_transactions) { ++ bch2_btree_trans_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_stripes_heap) { ++ bch2_stripes_heap_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_compression_stats) { ++ bch2_compression_stats_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_new_stripes) { ++ bch2_new_stripes_to_text(&out, c); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_io_timers_read) { ++ bch2_io_timers_to_text(&out, &c->io_clock[READ]); ++ return out.pos - buf; ++ } ++ if (attr == &sysfs_io_timers_write) { ++ bch2_io_timers_to_text(&out, &c->io_clock[WRITE]); ++ return out.pos - buf; ++ } ++ ++#define BCH_DEBUG_PARAM(name, description) sysfs_print(name, c->name); ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ return 0; ++} ++ ++STORE(bch2_fs) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); ++ ++ sysfs_strtoul(journal_write_delay_ms, c->journal.write_delay_ms); ++ sysfs_strtoul(journal_reclaim_delay_ms, c->journal.reclaim_delay_ms); ++ ++ if (attr == &sysfs_btree_gc_periodic) { ++ ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) ++ ?: (ssize_t) size; ++ ++ wake_up_process(c->gc_thread); ++ return ret; ++ } ++ ++ if (attr == &sysfs_copy_gc_enabled) { ++ ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ++ ?: (ssize_t) size; ++ ++ if (c->copygc_thread) ++ wake_up_process(c->copygc_thread); ++ return ret; ++ } ++ ++ if (attr == &sysfs_rebalance_enabled) { ++ ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) ++ ?: (ssize_t) size; ++ ++ rebalance_wakeup(c); ++ return ret; ++ } ++ ++ sysfs_strtoul(pd_controllers_update_seconds, ++ c->pd_controllers_update_seconds); ++ sysfs_pd_controller_store(rebalance, &c->rebalance.pd); ++ sysfs_pd_controller_store(copy_gc, &c->copygc_pd); ++ ++ sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); ++ ++ /* Debugging: */ ++ ++#define BCH_DEBUG_PARAM(name, description) sysfs_strtoul(name, c->name); ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ if (!test_bit(BCH_FS_STARTED, &c->flags)) ++ return -EPERM; ++ 
++ /* Debugging: */ ++ ++ if (attr == &sysfs_trigger_journal_flush) ++ bch2_journal_meta_async(&c->journal, NULL); ++ ++ if (attr == &sysfs_trigger_btree_coalesce) ++ bch2_coalesce(c); ++ ++ if (attr == &sysfs_trigger_gc) { ++ /* ++ * Full gc is currently incompatible with btree key cache: ++ */ ++#if 0 ++ down_read(&c->state_lock); ++ bch2_gc(c, NULL, false, false); ++ up_read(&c->state_lock); ++#else ++ bch2_gc_gens(c); ++#endif ++ } ++ ++ if (attr == &sysfs_prune_cache) { ++ struct shrink_control sc; ++ ++ sc.gfp_mask = GFP_KERNEL; ++ sc.nr_to_scan = strtoul_or_return(buf); ++ c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); ++ } ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ if (attr == &sysfs_perf_test) { ++ char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; ++ char *test = strsep(&p, " \t\n"); ++ char *nr_str = strsep(&p, " \t\n"); ++ char *threads_str = strsep(&p, " \t\n"); ++ unsigned threads; ++ u64 nr; ++ int ret = -EINVAL; ++ ++ if (threads_str && ++ !(ret = kstrtouint(threads_str, 10, &threads)) && ++ !(ret = bch2_strtoull_h(nr_str, &nr))) ++ bch2_btree_perf_test(c, test, nr, threads); ++ else ++ size = ret; ++ kfree(tmp); ++ } ++#endif ++ return size; ++} ++SYSFS_OPS(bch2_fs); ++ ++struct attribute *bch2_fs_files[] = { ++ &sysfs_minor, ++ &sysfs_block_size, ++ &sysfs_btree_node_size, ++ &sysfs_btree_cache_size, ++ ++ &sysfs_meta_replicas_have, ++ &sysfs_data_replicas_have, ++ ++ &sysfs_journal_write_delay_ms, ++ &sysfs_journal_reclaim_delay_ms, ++ ++ &sysfs_promote_whole_extents, ++ ++ &sysfs_compression_stats, ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ &sysfs_perf_test, ++#endif ++ NULL ++}; ++ ++/* internal dir - just a wrapper */ ++ ++SHOW(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_show(&c->kobj, attr, buf); ++} ++ ++STORE(bch2_fs_internal) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, internal); ++ return bch2_fs_store(&c->kobj, attr, buf, size); ++} ++SYSFS_OPS(bch2_fs_internal); ++ ++struct attribute *bch2_fs_internal_files[] = { ++ &sysfs_alloc_debug, ++ &sysfs_journal_debug, ++ &sysfs_journal_pins, ++ &sysfs_btree_updates, ++ &sysfs_dirty_btree_nodes, ++ &sysfs_btree_key_cache, ++ &sysfs_btree_transactions, ++ &sysfs_stripes_heap, ++ ++ &sysfs_read_realloc_races, ++ &sysfs_extent_migrate_done, ++ &sysfs_extent_migrate_raced, ++ ++ &sysfs_trigger_journal_flush, ++ &sysfs_trigger_btree_coalesce, ++ &sysfs_trigger_gc, ++ &sysfs_prune_cache, ++ ++ &sysfs_copy_gc_enabled, ++ ++ &sysfs_rebalance_enabled, ++ &sysfs_rebalance_work, ++ sysfs_pd_controller_files(rebalance), ++ sysfs_pd_controller_files(copy_gc), ++ ++ &sysfs_new_stripes, ++ ++ &sysfs_io_timers_read, ++ &sysfs_io_timers_write, ++ ++ &sysfs_internal_uuid, ++ ++#define BCH_DEBUG_PARAM(name, description) &sysfs_##name, ++ BCH_DEBUG_PARAMS() ++#undef BCH_DEBUG_PARAM ++ ++ NULL ++}; ++ ++/* options */ ++ ++SHOW(bch2_fs_opts_dir) ++{ ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int id = opt - bch2_opt_table; ++ u64 v = bch2_opt_get_by_id(&c->opts, id); ++ ++ bch2_opt_to_text(&out, c, opt, v, OPT_SHOW_FULL_LIST); ++ pr_buf(&out, "\n"); ++ ++ return out.pos - buf; ++} ++ ++STORE(bch2_fs_opts_dir) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); ++ const struct bch_option *opt = container_of(attr, struct bch_option, attr); ++ int ret, id = opt - bch2_opt_table; ++ char *tmp; ++ 
u64 v; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ ret = bch2_opt_parse(c, opt, strim(tmp), &v); ++ kfree(tmp); ++ ++ if (ret < 0) ++ return ret; ++ ++ ret = bch2_opt_check_may_set(c, id, v); ++ if (ret < 0) ++ return ret; ++ ++ if (opt->set_sb != SET_NO_SB_OPT) { ++ mutex_lock(&c->sb_lock); ++ opt->set_sb(c->disk_sb.sb, v); ++ bch2_write_super(c); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ bch2_opt_set_by_id(&c->opts, id, v); ++ ++ if ((id == Opt_background_target || ++ id == Opt_background_compression) && v) { ++ bch2_rebalance_add_work(c, S64_MAX); ++ rebalance_wakeup(c); ++ } ++ ++ return size; ++} ++SYSFS_OPS(bch2_fs_opts_dir); ++ ++struct attribute *bch2_fs_opts_dir_files[] = { NULL }; ++ ++int bch2_opts_create_sysfs_files(struct kobject *kobj) ++{ ++ const struct bch_option *i; ++ int ret; ++ ++ for (i = bch2_opt_table; ++ i < bch2_opt_table + bch2_opts_nr; ++ i++) { ++ if (!(i->mode & (OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME))) ++ continue; ++ ++ ret = sysfs_create_file(kobj, &i->attr); ++ if (ret) ++ return ret; ++ } ++ ++ return 0; ++} ++ ++/* time stats */ ++ ++SHOW(bch2_fs_time_stats) ++{ ++ struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++#define x(name) \ ++ if (attr == &sysfs_time_stat_##name) { \ ++ bch2_time_stats_to_text(&out, &c->times[BCH_TIME_##name]);\ ++ return out.pos - buf; \ ++ } ++ BCH_TIME_STATS() ++#undef x ++ ++ return 0; ++} ++ ++STORE(bch2_fs_time_stats) ++{ ++ return size; ++} ++SYSFS_OPS(bch2_fs_time_stats); ++ ++struct attribute *bch2_fs_time_stats_files[] = { ++#define x(name) \ ++ &sysfs_time_stat_##name, ++ BCH_TIME_STATS() ++#undef x ++ NULL ++}; ++ ++typedef unsigned (bucket_map_fn)(struct bch_fs *, struct bch_dev *, ++ size_t, void *); ++ ++static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ int rw = (private ? 
1 : 0); ++ ++ return bucket_last_io(c, bucket(ca, b), rw); ++} ++ ++static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ struct bucket *g = bucket(ca, b); ++ return bucket_sectors_used(g->mark); ++} ++ ++static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, ++ size_t b, void *private) ++{ ++ return bucket_gc_gen(ca, b); ++} ++ ++static int unsigned_cmp(const void *_l, const void *_r) ++{ ++ const unsigned *l = _l; ++ const unsigned *r = _r; ++ ++ return cmp_int(*l, *r); ++} ++ ++static int quantiles_to_text(struct printbuf *out, ++ struct bch_fs *c, struct bch_dev *ca, ++ bucket_map_fn *fn, void *private) ++{ ++ size_t i, n; ++ /* Compute 31 quantiles */ ++ unsigned q[31], *p; ++ ++ down_read(&ca->bucket_lock); ++ n = ca->mi.nbuckets; ++ ++ p = vzalloc(n * sizeof(unsigned)); ++ if (!p) { ++ up_read(&ca->bucket_lock); ++ return -ENOMEM; ++ } ++ ++ for (i = ca->mi.first_bucket; i < n; i++) ++ p[i] = fn(c, ca, i, private); ++ ++ sort(p, n, sizeof(unsigned), unsigned_cmp, NULL); ++ up_read(&ca->bucket_lock); ++ ++ while (n && ++ !p[n - 1]) ++ --n; ++ ++ for (i = 0; i < ARRAY_SIZE(q); i++) ++ q[i] = p[n * (i + 1) / (ARRAY_SIZE(q) + 1)]; ++ ++ vfree(p); ++ ++ for (i = 0; i < ARRAY_SIZE(q); i++) ++ pr_buf(out, "%u ", q[i]); ++ pr_buf(out, "\n"); ++ return 0; ++} ++ ++static void reserve_stats_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ enum alloc_reserve i; ++ ++ spin_lock(&ca->fs->freelist_lock); ++ ++ pr_buf(out, "free_inc:\t%zu\t%zu\n", ++ fifo_used(&ca->free_inc), ++ ca->free_inc.size); ++ ++ for (i = 0; i < RESERVE_NR; i++) ++ pr_buf(out, "free[%u]:\t%zu\t%zu\n", i, ++ fifo_used(&ca->free[i]), ++ ca->free[i].size); ++ ++ spin_unlock(&ca->fs->freelist_lock); ++} ++ ++static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ struct bch_fs *c = ca->fs; ++ struct bch_dev_usage stats = bch2_dev_usage_read(ca); ++ unsigned i, nr[BCH_DATA_NR]; ++ ++ memset(nr, 0, sizeof(nr)); ++ ++ for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) ++ nr[c->open_buckets[i].type]++; ++ ++ pr_buf(out, ++ "free_inc: %zu/%zu\n" ++ "free[RESERVE_BTREE]: %zu/%zu\n" ++ "free[RESERVE_MOVINGGC]: %zu/%zu\n" ++ "free[RESERVE_NONE]: %zu/%zu\n" ++ "buckets:\n" ++ " capacity: %llu\n" ++ " alloc: %llu\n" ++ " sb: %llu\n" ++ " journal: %llu\n" ++ " meta: %llu\n" ++ " user: %llu\n" ++ " cached: %llu\n" ++ " erasure coded: %llu\n" ++ " available: %lli\n" ++ "sectors:\n" ++ " sb: %llu\n" ++ " journal: %llu\n" ++ " meta: %llu\n" ++ " user: %llu\n" ++ " cached: %llu\n" ++ " erasure coded: %llu\n" ++ " fragmented: %llu\n" ++ " copygc threshold: %llu\n" ++ "freelist_wait: %s\n" ++ "open buckets: %u/%u (reserved %u)\n" ++ "open_buckets_wait: %s\n" ++ "open_buckets_btree: %u\n" ++ "open_buckets_user: %u\n" ++ "btree reserve cache: %u\n", ++ fifo_used(&ca->free_inc), ca->free_inc.size, ++ fifo_used(&ca->free[RESERVE_BTREE]), ca->free[RESERVE_BTREE].size, ++ fifo_used(&ca->free[RESERVE_MOVINGGC]), ca->free[RESERVE_MOVINGGC].size, ++ fifo_used(&ca->free[RESERVE_NONE]), ca->free[RESERVE_NONE].size, ++ ca->mi.nbuckets - ca->mi.first_bucket, ++ stats.buckets_alloc, ++ stats.buckets[BCH_DATA_sb], ++ stats.buckets[BCH_DATA_journal], ++ stats.buckets[BCH_DATA_btree], ++ stats.buckets[BCH_DATA_user], ++ stats.buckets[BCH_DATA_cached], ++ stats.buckets_ec, ++ __dev_buckets_available(ca, stats), ++ stats.sectors[BCH_DATA_sb], ++ stats.sectors[BCH_DATA_journal], ++ stats.sectors[BCH_DATA_btree], ++ stats.sectors[BCH_DATA_user], ++ 
stats.sectors[BCH_DATA_cached], ++ stats.sectors_ec, ++ stats.sectors_fragmented, ++ c->copygc_threshold, ++ c->freelist_wait.list.first ? "waiting" : "empty", ++ c->open_buckets_nr_free, OPEN_BUCKETS_COUNT, ++ BTREE_NODE_OPEN_BUCKET_RESERVE, ++ c->open_buckets_wait.list.first ? "waiting" : "empty", ++ nr[BCH_DATA_btree], ++ nr[BCH_DATA_user], ++ c->btree_reserve_cache_nr); ++} ++ ++static const char * const bch2_rw[] = { ++ "read", ++ "write", ++ NULL ++}; ++ ++static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) ++{ ++ int rw, i; ++ ++ for (rw = 0; rw < 2; rw++) { ++ pr_buf(out, "%s:\n", bch2_rw[rw]); ++ ++ for (i = 1; i < BCH_DATA_NR; i++) ++ pr_buf(out, "%-12s:%12llu\n", ++ bch2_data_types[i], ++ percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); ++ } ++} ++ ++SHOW(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ struct printbuf out = _PBUF(buf, PAGE_SIZE); ++ ++ sysfs_printf(uuid, "%pU\n", ca->uuid.b); ++ ++ sysfs_print(bucket_size, bucket_bytes(ca)); ++ sysfs_print(block_size, block_bytes(c)); ++ sysfs_print(first_bucket, ca->mi.first_bucket); ++ sysfs_print(nbuckets, ca->mi.nbuckets); ++ sysfs_print(durability, ca->mi.durability); ++ sysfs_print(discard, ca->mi.discard); ++ ++ if (attr == &sysfs_label) { ++ if (ca->mi.group) { ++ mutex_lock(&c->sb_lock); ++ bch2_disk_path_to_text(&out, &c->disk_sb, ++ ca->mi.group - 1); ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_has_data) { ++ bch2_flags_to_text(&out, bch2_data_types, ++ bch2_dev_has_data(c, ca)); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_cache_replacement_policy) { ++ bch2_string_opt_to_text(&out, ++ bch2_cache_replacement_policies, ++ ca->mi.replacement); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_state_rw) { ++ bch2_string_opt_to_text(&out, bch2_dev_state, ++ ca->mi.state); ++ pr_buf(&out, "\n"); ++ return out.pos - buf; ++ } ++ ++ if (attr == &sysfs_iodone) { ++ dev_iodone_to_text(&out, ca); ++ return out.pos - buf; ++ } ++ ++ sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); ++ sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); ++ ++ if (attr == &sysfs_io_latency_stats_read) { ++ bch2_time_stats_to_text(&out, &ca->io_latency[READ]); ++ return out.pos - buf; ++ } ++ if (attr == &sysfs_io_latency_stats_write) { ++ bch2_time_stats_to_text(&out, &ca->io_latency[WRITE]); ++ return out.pos - buf; ++ } ++ ++ sysfs_printf(congested, "%u%%", ++ clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) ++ * 100 / CONGESTED_MAX); ++ ++ if (attr == &sysfs_bucket_quantiles_last_read) ++ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 0) ?: out.pos - buf; ++ if (attr == &sysfs_bucket_quantiles_last_write) ++ return quantiles_to_text(&out, c, ca, bucket_last_io_fn, (void *) 1) ?: out.pos - buf; ++ if (attr == &sysfs_bucket_quantiles_fragmentation) ++ return quantiles_to_text(&out, c, ca, bucket_sectors_used_fn, NULL) ?: out.pos - buf; ++ if (attr == &sysfs_bucket_quantiles_oldest_gen) ++ return quantiles_to_text(&out, c, ca, bucket_oldest_gen_fn, NULL) ?: out.pos - buf; ++ ++ if (attr == &sysfs_reserve_stats) { ++ reserve_stats_to_text(&out, ca); ++ return out.pos - buf; ++ } ++ if (attr == &sysfs_alloc_debug) { ++ dev_alloc_debug_to_text(&out, ca); ++ return out.pos - buf; ++ } ++ ++ return 0; ++} ++ ++STORE(bch2_dev) ++{ ++ struct bch_dev *ca = container_of(kobj, 
struct bch_dev, kobj); ++ struct bch_fs *c = ca->fs; ++ struct bch_member *mi; ++ ++ if (attr == &sysfs_discard) { ++ bool v = strtoul_or_return(buf); ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if (v != BCH_MEMBER_DISCARD(mi)) { ++ SET_BCH_MEMBER_DISCARD(mi, v); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (attr == &sysfs_cache_replacement_policy) { ++ ssize_t v = __sysfs_match_string(bch2_cache_replacement_policies, -1, buf); ++ ++ if (v < 0) ++ return v; ++ ++ mutex_lock(&c->sb_lock); ++ mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; ++ ++ if ((unsigned) v != BCH_MEMBER_REPLACEMENT(mi)) { ++ SET_BCH_MEMBER_REPLACEMENT(mi, v); ++ bch2_write_super(c); ++ } ++ mutex_unlock(&c->sb_lock); ++ } ++ ++ if (attr == &sysfs_label) { ++ char *tmp; ++ int ret; ++ ++ tmp = kstrdup(buf, GFP_KERNEL); ++ if (!tmp) ++ return -ENOMEM; ++ ++ ret = bch2_dev_group_set(c, ca, strim(tmp)); ++ kfree(tmp); ++ if (ret) ++ return ret; ++ } ++ ++ if (attr == &sysfs_wake_allocator) ++ bch2_wake_allocator(ca); ++ ++ return size; ++} ++SYSFS_OPS(bch2_dev); ++ ++struct attribute *bch2_dev_files[] = { ++ &sysfs_uuid, ++ &sysfs_bucket_size, ++ &sysfs_block_size, ++ &sysfs_first_bucket, ++ &sysfs_nbuckets, ++ &sysfs_durability, ++ ++ /* settings: */ ++ &sysfs_discard, ++ &sysfs_cache_replacement_policy, ++ &sysfs_state_rw, ++ &sysfs_label, ++ ++ &sysfs_has_data, ++ &sysfs_iodone, ++ ++ &sysfs_io_latency_read, ++ &sysfs_io_latency_write, ++ &sysfs_io_latency_stats_read, ++ &sysfs_io_latency_stats_write, ++ &sysfs_congested, ++ ++ /* alloc info - other stats: */ ++ &sysfs_bucket_quantiles_last_read, ++ &sysfs_bucket_quantiles_last_write, ++ &sysfs_bucket_quantiles_fragmentation, ++ &sysfs_bucket_quantiles_oldest_gen, ++ ++ &sysfs_reserve_stats, ++ ++ /* debug: */ ++ &sysfs_alloc_debug, ++ &sysfs_wake_allocator, ++ NULL ++}; ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h +new file mode 100644 +index 000000000000..525fd05d91f7 +--- /dev/null ++++ b/fs/bcachefs/sysfs.h +@@ -0,0 +1,44 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_SYSFS_H_ ++#define _BCACHEFS_SYSFS_H_ ++ ++#include ++ ++#ifndef NO_BCACHEFS_SYSFS ++ ++struct attribute; ++struct sysfs_ops; ++ ++extern struct attribute *bch2_fs_files[]; ++extern struct attribute *bch2_fs_internal_files[]; ++extern struct attribute *bch2_fs_opts_dir_files[]; ++extern struct attribute *bch2_fs_time_stats_files[]; ++extern struct attribute *bch2_dev_files[]; ++ ++extern struct sysfs_ops bch2_fs_sysfs_ops; ++extern struct sysfs_ops bch2_fs_internal_sysfs_ops; ++extern struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++extern struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++extern struct sysfs_ops bch2_dev_sysfs_ops; ++ ++int bch2_opts_create_sysfs_files(struct kobject *); ++ ++#else ++ ++static struct attribute *bch2_fs_files[] = {}; ++static struct attribute *bch2_fs_internal_files[] = {}; ++static struct attribute *bch2_fs_opts_dir_files[] = {}; ++static struct attribute *bch2_fs_time_stats_files[] = {}; ++static struct attribute *bch2_dev_files[] = {}; ++ ++static const struct sysfs_ops bch2_fs_sysfs_ops; ++static const struct sysfs_ops bch2_fs_internal_sysfs_ops; ++static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; ++static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; ++static const struct sysfs_ops bch2_dev_sysfs_ops; ++ ++static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } 
++ ++#endif /* NO_BCACHEFS_SYSFS */ ++ ++#endif /* _BCACHEFS_SYSFS_H_ */ +diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c +new file mode 100644 +index 000000000000..4dcace650416 +--- /dev/null ++++ b/fs/bcachefs/tests.c +@@ -0,0 +1,725 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#ifdef CONFIG_BCACHEFS_TESTS ++ ++#include "bcachefs.h" ++#include "btree_update.h" ++#include "journal_reclaim.h" ++#include "tests.h" ++ ++#include "linux/kthread.h" ++#include "linux/random.h" ++ ++static void delete_test_keys(struct bch_fs *c) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_EXTENTS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++} ++ ++/* unit tests */ ++ ++static void test_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ BUG_ON(ret); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ BUG_ON(ret); ++ ++ pr_info("deleting once"); ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ pr_info("deleting twice"); ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_delete_written(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_i_cookie k; ++ int ret; ++ ++ bkey_cookie_init(&k.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, k.k.p, ++ BTREE_ITER_INTENT); ++ ++ ret = bch2_btree_iter_traverse(iter); ++ BUG_ON(ret); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ BUG_ON(ret); ++ ++ bch2_journal_flush_all_pins(&c->journal); ++ ++ ret = bch2_btree_delete_at(&trans, iter, 0); ++ BUG_ON(ret); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, ++ POS_MIN, 0, k, ret) { ++ if (k.k->p.inode) ++ break; ++ ++ BUG_ON(k.k->p.offset != i++); ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) ++ BUG_ON(k.k->p.offset != --i); ++ ++ BUG_ON(i); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test extents"); ++ ++ for (i = 0; i < nr; i += 8) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 8; ++ k.k.size = 8; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); 
++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, ++ POS_MIN, 0, k, ret) { ++ BUG_ON(bkey_start_offset(k.k) != i); ++ i = k.k->p.offset; ++ } ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating backwards"); ++ ++ while (!IS_ERR_OR_NULL((k = bch2_btree_iter_prev(iter)).k)) { ++ BUG_ON(k.k->p.offset != i); ++ i = bkey_start_offset(k.k); ++ } ++ ++ BUG_ON(i); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_slots(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i++) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i * 2; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_XATTRS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ 0, k, ret) { ++ if (k.k->p.inode) ++ break; ++ ++ BUG_ON(k.k->p.offset != i); ++ i += 2; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ ++ BUG_ON(i != nr * 2); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ BTREE_ITER_SLOTS, k, ret) { ++ BUG_ON(k.k->p.offset != i); ++ BUG_ON(bkey_deleted(k.k) != (i & 1)); ++ ++ i++; ++ if (i == nr * 2) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_iterate_slots_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ delete_test_keys(c); ++ ++ pr_info("inserting test keys"); ++ ++ for (i = 0; i < nr; i += 16) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = i + 16; ++ k.k.size = 8; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++ } ++ ++ pr_info("iterating forwards"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, ++ 0, k, ret) { ++ BUG_ON(bkey_start_offset(k.k) != i + 8); ++ BUG_ON(k.k->size != 8); ++ i += 16; ++ } ++ bch2_trans_iter_free(&trans, iter); ++ ++ BUG_ON(i != nr); ++ ++ pr_info("iterating forwards by slots"); ++ ++ i = 0; ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_EXTENTS, POS_MIN, ++ BTREE_ITER_SLOTS, k, ret) { ++ BUG_ON(bkey_deleted(k.k) != !(i % 16)); ++ ++ BUG_ON(bkey_start_offset(k.k) != i); ++ BUG_ON(k.k->size != 8); ++ i = k.k->p.offset; ++ ++ if (i == nr) ++ break; ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++/* ++ * XXX: we really want to make sure we've got a btree with depth > 0 for these ++ * tests ++ */ ++static void test_peek_end(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, POS_MIN, 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void test_peek_end_extents(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_EXTENTS, POS_MIN, 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ BUG_ON(k.k); ++ ++ k = bch2_btree_iter_peek(iter); ++ 
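++ /* a second peek on the still-empty tree must also return no key */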
BUG_ON(k.k); ++ ++ bch2_trans_exit(&trans); ++} ++ ++/* extent unit tests */ ++ ++u64 test_version; ++ ++static void insert_test_extent(struct bch_fs *c, ++ u64 start, u64 end) ++{ ++ struct bkey_i_cookie k; ++ int ret; ++ ++ //pr_info("inserting %llu-%llu v %llu", start, end, test_version); ++ ++ bkey_cookie_init(&k.k_i); ++ k.k_i.k.p.offset = end; ++ k.k_i.k.size = end - start; ++ k.k_i.k.version.lo = test_version++; ++ ++ ret = bch2_btree_insert(c, BTREE_ID_EXTENTS, &k.k_i, ++ NULL, NULL, 0); ++ BUG_ON(ret); ++} ++ ++static void __test_extent_overwrite(struct bch_fs *c, ++ u64 e1_start, u64 e1_end, ++ u64 e2_start, u64 e2_end) ++{ ++ insert_test_extent(c, e1_start, e1_end); ++ insert_test_extent(c, e2_start, e2_end); ++ ++ delete_test_keys(c); ++} ++ ++static void test_extent_overwrite_front(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 0, 32); ++ __test_extent_overwrite(c, 8, 64, 0, 32); ++} ++ ++static void test_extent_overwrite_back(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 32, 64); ++ __test_extent_overwrite(c, 0, 64, 32, 72); ++} ++ ++static void test_extent_overwrite_middle(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 0, 64, 32, 40); ++} ++ ++static void test_extent_overwrite_all(struct bch_fs *c, u64 nr) ++{ ++ __test_extent_overwrite(c, 32, 64, 0, 64); ++ __test_extent_overwrite(c, 32, 64, 0, 128); ++ __test_extent_overwrite(c, 32, 64, 32, 64); ++ __test_extent_overwrite(c, 32, 64, 32, 128); ++} ++ ++/* perf tests */ ++ ++static u64 test_rand(void) ++{ ++ u64 v; ++#if 0 ++ v = prandom_u32(); ++#else ++ prandom_bytes(&v, sizeof(v)); ++#endif ++ return v; ++} ++ ++static void rand_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct bkey_i_cookie k; ++ int ret; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ bkey_cookie_init(&k.k_i); ++ k.k.p.offset = test_rand(); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __bch2_btree_insert(&trans, BTREE_ID_XATTRS, &k.k_i)); ++ ++ BUG_ON(ret); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void rand_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ POS(0, test_rand()), 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void rand_mixed(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ iter = bch2_trans_get_iter(&trans, BTREE_ID_XATTRS, ++ POS(0, test_rand()), 0); ++ ++ k = bch2_btree_iter_peek(iter); ++ ++ if (!(i & 3) && k.k) { ++ struct bkey_i_cookie k; ++ ++ bkey_cookie_init(&k.k_i); ++ k.k.p = iter->pos; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &k.k_i, 0)); ++ ++ BUG_ON(ret); ++ } ++ ++ bch2_trans_iter_free(&trans, iter); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static int __do_delete(struct btree_trans *trans, struct bpos pos) ++{ ++ struct btree_iter *iter; ++ struct bkey_i delete; ++ struct bkey_s_c k; ++ int ret = 0; ++ ++ iter = bch2_trans_get_iter(trans, BTREE_ID_XATTRS, pos, ++ BTREE_ITER_INTENT); ++ ret = PTR_ERR_OR_ZERO(iter); ++ if (ret) ++ goto err; ++ ++ k = bch2_btree_iter_peek(iter); ++ ret = bkey_err(k); ++ 
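++ /* bail out before building the deletion key if the lookup failed */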
if (ret) ++ goto err; ++ ++ bkey_init(&delete.k); ++ delete.k.p = k.k->p; ++ ++ bch2_trans_update(trans, iter, &delete, 0); ++err: ++ bch2_trans_iter_put(trans, iter); ++ return ret; ++} ++ ++static void rand_delete(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ int ret; ++ u64 i; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for (i = 0; i < nr; i++) { ++ struct bpos pos = POS(0, test_rand()); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ __do_delete(&trans, pos)); ++ BUG_ON(ret); ++ } ++ ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_insert(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct bkey_i_cookie insert; ++ int ret; ++ u64 i = 0; ++ ++ bkey_cookie_init(&insert.k_i); ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { ++ insert.k.p = iter->pos; ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &insert.k_i, 0)); ++ ++ BUG_ON(ret); ++ ++ if (++i == nr) ++ break; ++ } ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_lookup(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, 0, k, ret) ++ ; ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_overwrite(struct bch_fs *c, u64 nr) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, POS_MIN, ++ BTREE_ITER_INTENT, k, ret) { ++ struct bkey_i_cookie u; ++ ++ bkey_reassemble(&u.k_i, k); ++ ++ ret = __bch2_trans_do(&trans, NULL, NULL, 0, ++ bch2_trans_update(&trans, iter, &u.k_i, 0)); ++ ++ BUG_ON(ret); ++ } ++ bch2_trans_exit(&trans); ++} ++ ++static void seq_delete(struct bch_fs *c, u64 nr) ++{ ++ int ret; ++ ++ ret = bch2_btree_delete_range(c, BTREE_ID_XATTRS, ++ POS(0, 0), POS(0, U64_MAX), ++ NULL); ++ BUG_ON(ret); ++} ++ ++typedef void (*perf_test_fn)(struct bch_fs *, u64); ++ ++struct test_job { ++ struct bch_fs *c; ++ u64 nr; ++ unsigned nr_threads; ++ perf_test_fn fn; ++ ++ atomic_t ready; ++ wait_queue_head_t ready_wait; ++ ++ atomic_t done; ++ struct completion done_completion; ++ ++ u64 start; ++ u64 finish; ++}; ++ ++static int btree_perf_test_thread(void *data) ++{ ++ struct test_job *j = data; ++ ++ if (atomic_dec_and_test(&j->ready)) { ++ wake_up(&j->ready_wait); ++ j->start = sched_clock(); ++ } else { ++ wait_event(j->ready_wait, !atomic_read(&j->ready)); ++ } ++ ++ j->fn(j->c, j->nr / j->nr_threads); ++ ++ if (atomic_dec_and_test(&j->done)) { ++ j->finish = sched_clock(); ++ complete(&j->done_completion); ++ } ++ ++ return 0; ++} ++ ++void bch2_btree_perf_test(struct bch_fs *c, const char *testname, ++ u64 nr, unsigned nr_threads) ++{ ++ struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; ++ char name_buf[20], nr_buf[20], per_sec_buf[20]; ++ unsigned i; ++ u64 time; ++ ++ atomic_set(&j.ready, nr_threads); ++ init_waitqueue_head(&j.ready_wait); ++ ++ atomic_set(&j.done, nr_threads); ++ init_completion(&j.done_completion); ++ ++#define perf_test(_test) \ ++ if (!strcmp(testname, #_test)) j.fn = _test ++ ++ perf_test(rand_insert); ++ perf_test(rand_lookup); ++ perf_test(rand_mixed); ++ perf_test(rand_delete); ++ ++ perf_test(seq_insert); ++ perf_test(seq_lookup); ++ 
perf_test(seq_overwrite); ++ perf_test(seq_delete); ++ ++ /* a unit test, not a perf test: */ ++ perf_test(test_delete); ++ perf_test(test_delete_written); ++ perf_test(test_iterate); ++ perf_test(test_iterate_extents); ++ perf_test(test_iterate_slots); ++ perf_test(test_iterate_slots_extents); ++ perf_test(test_peek_end); ++ perf_test(test_peek_end_extents); ++ ++ perf_test(test_extent_overwrite_front); ++ perf_test(test_extent_overwrite_back); ++ perf_test(test_extent_overwrite_middle); ++ perf_test(test_extent_overwrite_all); ++ ++ if (!j.fn) { ++ pr_err("unknown test %s", testname); ++ return; ++ } ++ ++ //pr_info("running test %s:", testname); ++ ++ if (nr_threads == 1) ++ btree_perf_test_thread(&j); ++ else ++ for (i = 0; i < nr_threads; i++) ++ kthread_run(btree_perf_test_thread, &j, ++ "bcachefs perf test[%u]", i); ++ ++ while (wait_for_completion_interruptible(&j.done_completion)) ++ ; ++ ++ time = j.finish - j.start; ++ ++ scnprintf(name_buf, sizeof(name_buf), "%s:", testname); ++ bch2_hprint(&PBUF(nr_buf), nr); ++ bch2_hprint(&PBUF(per_sec_buf), nr * NSEC_PER_SEC / time); ++ printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", ++ name_buf, nr_buf, nr_threads, ++ time / NSEC_PER_SEC, ++ time * nr_threads / nr, ++ per_sec_buf); ++} ++ ++#endif /* CONFIG_BCACHEFS_TESTS */ +diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h +new file mode 100644 +index 000000000000..551d0764225e +--- /dev/null ++++ b/fs/bcachefs/tests.h +@@ -0,0 +1,15 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_TEST_H ++#define _BCACHEFS_TEST_H ++ ++struct bch_fs; ++ ++#ifdef CONFIG_BCACHEFS_TESTS ++ ++void bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); ++ ++#else ++ ++#endif /* CONFIG_BCACHEFS_TESTS */ ++ ++#endif /* _BCACHEFS_TEST_H */ +diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c +new file mode 100644 +index 000000000000..59e8dfa3d245 +--- /dev/null ++++ b/fs/bcachefs/trace.c +@@ -0,0 +1,12 @@ ++// SPDX-License-Identifier: GPL-2.0 ++#include "bcachefs.h" ++#include "alloc_types.h" ++#include "buckets.h" ++#include "btree_types.h" ++#include "keylist.h" ++ ++#include ++#include "keylist.h" ++ ++#define CREATE_TRACE_POINTS ++#include +diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c +new file mode 100644 +index 000000000000..fd4044a6a08f +--- /dev/null ++++ b/fs/bcachefs/util.c +@@ -0,0 +1,907 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * random utiility code, for bcache but in theory not specific to bcache ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "eytzinger.h" ++#include "util.h" ++ ++static const char si_units[] = "?kMGTPEZY"; ++ ++static int __bch2_strtoh(const char *cp, u64 *res, ++ u64 t_max, bool t_signed) ++{ ++ bool positive = *cp != '-'; ++ unsigned u; ++ u64 v = 0; ++ ++ if (*cp == '+' || *cp == '-') ++ cp++; ++ ++ if (!isdigit(*cp)) ++ return -EINVAL; ++ ++ do { ++ if (v > U64_MAX / 10) ++ return -ERANGE; ++ v *= 10; ++ if (v > U64_MAX - (*cp - '0')) ++ return -ERANGE; ++ v += *cp - '0'; ++ cp++; ++ } while (isdigit(*cp)); ++ ++ for (u = 1; u < strlen(si_units); u++) ++ if (*cp == si_units[u]) { ++ cp++; ++ goto got_unit; ++ } ++ u = 0; ++got_unit: ++ if (*cp == '\n') ++ cp++; ++ if (*cp) ++ return -EINVAL; ++ ++ if (fls64(v) + u * 10 > 64) ++ return -ERANGE; ++ ++ v <<= u * 10; ++ ++ if (positive) { ++ if (v > t_max) ++ return -ERANGE; ++ } else { ++ if (v && !t_signed) ++ return -ERANGE; ++ ++ if (v > t_max + 1) ++ return -ERANGE; ++ v = -v; ++ } ++ ++ *res = v; ++ return 0; ++} ++ ++#define STRTO_H(name, type) \ ++int bch2_ ## name ## _h(const char *cp, type *res) \ ++{ \ ++ u64 v; \ ++ int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ ++ ANYSINT_MAX(type) != ((type) ~0ULL)); \ ++ *res = v; \ ++ return ret; \ ++} ++ ++STRTO_H(strtoint, int) ++STRTO_H(strtouint, unsigned int) ++STRTO_H(strtoll, long long) ++STRTO_H(strtoull, unsigned long long) ++STRTO_H(strtou64, u64) ++ ++void bch2_hprint(struct printbuf *buf, s64 v) ++{ ++ int u, t = 0; ++ ++ for (u = 0; v >= 1024 || v <= -1024; u++) { ++ t = v & ~(~0U << 10); ++ v >>= 10; ++ } ++ ++ pr_buf(buf, "%lli", v); ++ ++ /* ++ * 103 is magic: t is in the range [-1023, 1023] and we want ++ * to turn it into [-9, 9] ++ */ ++ if (u && v < 100 && v > -100) ++ pr_buf(buf, ".%i", t / 103); ++ if (u) ++ pr_buf(buf, "%c", si_units[u]); ++} ++ ++void bch2_string_opt_to_text(struct printbuf *out, ++ const char * const list[], ++ size_t selected) ++{ ++ size_t i; ++ ++ for (i = 0; list[i]; i++) ++ pr_buf(out, i == selected ? "[%s] " : "%s ", list[i]); ++} ++ ++void bch2_flags_to_text(struct printbuf *out, ++ const char * const list[], u64 flags) ++{ ++ unsigned bit, nr = 0; ++ bool first = true; ++ ++ if (out->pos != out->end) ++ *out->pos = '\0'; ++ ++ while (list[nr]) ++ nr++; ++ ++ while (flags && (bit = __ffs(flags)) < nr) { ++ if (!first) ++ pr_buf(out, ","); ++ first = false; ++ pr_buf(out, "%s", list[bit]); ++ flags ^= 1 << bit; ++ } ++} ++ ++u64 bch2_read_flag_list(char *opt, const char * const list[]) ++{ ++ u64 ret = 0; ++ char *p, *s, *d = kstrndup(opt, PAGE_SIZE - 1, GFP_KERNEL); ++ ++ if (!d) ++ return -ENOMEM; ++ ++ s = strim(d); ++ ++ while ((p = strsep(&s, ","))) { ++ int flag = match_string(list, -1, p); ++ if (flag < 0) { ++ ret = -1; ++ break; ++ } ++ ++ ret |= 1 << flag; ++ } ++ ++ kfree(d); ++ ++ return ret; ++} ++ ++bool bch2_is_zero(const void *_p, size_t n) ++{ ++ const char *p = _p; ++ size_t i; ++ ++ for (i = 0; i < n; i++) ++ if (p[i]) ++ return false; ++ return true; ++} ++ ++static void bch2_quantiles_update(struct quantiles *q, u64 v) ++{ ++ unsigned i = 0; ++ ++ while (i < ARRAY_SIZE(q->entries)) { ++ struct quantile_entry *e = q->entries + i; ++ ++ if (unlikely(!e->step)) { ++ e->m = v; ++ e->step = max_t(unsigned, v / 2, 1024); ++ } else if (e->m > v) { ++ e->m = e->m >= e->step ++ ? 
e->m - e->step ++ : 0; ++ } else if (e->m < v) { ++ e->m = e->m + e->step > e->m ++ ? e->m + e->step ++ : U32_MAX; ++ } ++ ++ if ((e->m > v ? e->m - v : v - e->m) < e->step) ++ e->step = max_t(unsigned, e->step / 2, 1); ++ ++ if (v >= e->m) ++ break; ++ ++ i = eytzinger0_child(i, v > e->m); ++ } ++} ++ ++/* time stats: */ ++ ++static void bch2_time_stats_update_one(struct time_stats *stats, ++ u64 start, u64 end) ++{ ++ u64 duration, freq; ++ ++ duration = time_after64(end, start) ++ ? end - start : 0; ++ freq = time_after64(end, stats->last_event) ++ ? end - stats->last_event : 0; ++ ++ stats->count++; ++ ++ stats->average_duration = stats->average_duration ++ ? ewma_add(stats->average_duration, duration, 6) ++ : duration; ++ ++ stats->average_frequency = stats->average_frequency ++ ? ewma_add(stats->average_frequency, freq, 6) ++ : freq; ++ ++ stats->max_duration = max(stats->max_duration, duration); ++ ++ stats->last_event = end; ++ ++ bch2_quantiles_update(&stats->quantiles, duration); ++} ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64 start, u64 end) ++{ ++ unsigned long flags; ++ ++ if (!stats->buffer) { ++ spin_lock_irqsave(&stats->lock, flags); ++ bch2_time_stats_update_one(stats, start, end); ++ ++ if (stats->average_frequency < 32 && ++ stats->count > 1024) ++ stats->buffer = ++ alloc_percpu_gfp(struct time_stat_buffer, ++ GFP_ATOMIC); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ } else { ++ struct time_stat_buffer_entry *i; ++ struct time_stat_buffer *b; ++ ++ preempt_disable(); ++ b = this_cpu_ptr(stats->buffer); ++ ++ BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); ++ b->entries[b->nr++] = (struct time_stat_buffer_entry) { ++ .start = start, ++ .end = end ++ }; ++ ++ if (b->nr == ARRAY_SIZE(b->entries)) { ++ spin_lock_irqsave(&stats->lock, flags); ++ for (i = b->entries; ++ i < b->entries + ARRAY_SIZE(b->entries); ++ i++) ++ bch2_time_stats_update_one(stats, i->start, i->end); ++ spin_unlock_irqrestore(&stats->lock, flags); ++ ++ b->nr = 0; ++ } ++ ++ preempt_enable(); ++ } ++} ++ ++static const struct time_unit { ++ const char *name; ++ u32 nsecs; ++} time_units[] = { ++ { "ns", 1 }, ++ { "us", NSEC_PER_USEC }, ++ { "ms", NSEC_PER_MSEC }, ++ { "sec", NSEC_PER_SEC }, ++}; ++ ++static const struct time_unit *pick_time_units(u64 ns) ++{ ++ const struct time_unit *u; ++ ++ for (u = time_units; ++ u + 1 < time_units + ARRAY_SIZE(time_units) && ++ ns >= u[1].nsecs << 1; ++ u++) ++ ; ++ ++ return u; ++} ++ ++static void pr_time_units(struct printbuf *out, u64 ns) ++{ ++ const struct time_unit *u = pick_time_units(ns); ++ ++ pr_buf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); ++} ++ ++void bch2_time_stats_to_text(struct printbuf *out, struct time_stats *stats) ++{ ++ const struct time_unit *u; ++ u64 freq = READ_ONCE(stats->average_frequency); ++ u64 q, last_q = 0; ++ int i; ++ ++ pr_buf(out, "count:\t\t%llu\n", ++ stats->count); ++ pr_buf(out, "rate:\t\t%llu/sec\n", ++ freq ? 
div64_u64(NSEC_PER_SEC, freq) : 0); ++ ++ pr_buf(out, "frequency:\t"); ++ pr_time_units(out, freq); ++ ++ pr_buf(out, "\navg duration:\t"); ++ pr_time_units(out, stats->average_duration); ++ ++ pr_buf(out, "\nmax duration:\t"); ++ pr_time_units(out, stats->max_duration); ++ ++ i = eytzinger0_first(NR_QUANTILES); ++ u = pick_time_units(stats->quantiles.entries[i].m); ++ ++ pr_buf(out, "\nquantiles (%s):\t", u->name); ++ eytzinger0_for_each(i, NR_QUANTILES) { ++ bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; ++ ++ q = max(stats->quantiles.entries[i].m, last_q); ++ pr_buf(out, "%llu%s", ++ div_u64(q, u->nsecs), ++ is_last ? "\n" : " "); ++ last_q = q; ++ } ++} ++ ++void bch2_time_stats_exit(struct time_stats *stats) ++{ ++ free_percpu(stats->buffer); ++} ++ ++void bch2_time_stats_init(struct time_stats *stats) ++{ ++ memset(stats, 0, sizeof(*stats)); ++ spin_lock_init(&stats->lock); ++} ++ ++/* ratelimit: */ ++ ++/** ++ * bch2_ratelimit_delay() - return how long to delay until the next time to do ++ * some work ++ * ++ * @d - the struct bch_ratelimit to update ++ * ++ * Returns the amount of time to delay by, in jiffies ++ */ ++u64 bch2_ratelimit_delay(struct bch_ratelimit *d) ++{ ++ u64 now = local_clock(); ++ ++ return time_after64(d->next, now) ++ ? nsecs_to_jiffies(d->next - now) ++ : 0; ++} ++ ++/** ++ * bch2_ratelimit_increment() - increment @d by the amount of work done ++ * ++ * @d - the struct bch_ratelimit to update ++ * @done - the amount of work done, in arbitrary units ++ */ ++void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) ++{ ++ u64 now = local_clock(); ++ ++ d->next += div_u64(done * NSEC_PER_SEC, d->rate); ++ ++ if (time_before64(now + NSEC_PER_SEC, d->next)) ++ d->next = now + NSEC_PER_SEC; ++ ++ if (time_after64(now - NSEC_PER_SEC * 2, d->next)) ++ d->next = now - NSEC_PER_SEC * 2; ++} ++ ++/* pd controller: */ ++ ++/* ++ * Updates pd_controller. Attempts to scale inputed values to units per second. ++ * @target: desired value ++ * @actual: current value ++ * ++ * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing ++ * it makes actual go down. 
++ */ ++void bch2_pd_controller_update(struct bch_pd_controller *pd, ++ s64 target, s64 actual, int sign) ++{ ++ s64 proportional, derivative, change; ++ ++ unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; ++ ++ if (seconds_since_update == 0) ++ return; ++ ++ pd->last_update = jiffies; ++ ++ proportional = actual - target; ++ proportional *= seconds_since_update; ++ proportional = div_s64(proportional, pd->p_term_inverse); ++ ++ derivative = actual - pd->last_actual; ++ derivative = div_s64(derivative, seconds_since_update); ++ derivative = ewma_add(pd->smoothed_derivative, derivative, ++ (pd->d_term / seconds_since_update) ?: 1); ++ derivative = derivative * pd->d_term; ++ derivative = div_s64(derivative, pd->p_term_inverse); ++ ++ change = proportional + derivative; ++ ++ /* Don't increase rate if not keeping up */ ++ if (change > 0 && ++ pd->backpressure && ++ time_after64(local_clock(), ++ pd->rate.next + NSEC_PER_MSEC)) ++ change = 0; ++ ++ change *= (sign * -1); ++ ++ pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, ++ 1, UINT_MAX); ++ ++ pd->last_actual = actual; ++ pd->last_derivative = derivative; ++ pd->last_proportional = proportional; ++ pd->last_change = change; ++ pd->last_target = target; ++} ++ ++void bch2_pd_controller_init(struct bch_pd_controller *pd) ++{ ++ pd->rate.rate = 1024; ++ pd->last_update = jiffies; ++ pd->p_term_inverse = 6000; ++ pd->d_term = 30; ++ pd->d_smooth = pd->d_term; ++ pd->backpressure = 1; ++} ++ ++size_t bch2_pd_controller_print_debug(struct bch_pd_controller *pd, char *buf) ++{ ++ /* 2^64 - 1 is 20 digits, plus null byte */ ++ char rate[21]; ++ char actual[21]; ++ char target[21]; ++ char proportional[21]; ++ char derivative[21]; ++ char change[21]; ++ s64 next_io; ++ ++ bch2_hprint(&PBUF(rate), pd->rate.rate); ++ bch2_hprint(&PBUF(actual), pd->last_actual); ++ bch2_hprint(&PBUF(target), pd->last_target); ++ bch2_hprint(&PBUF(proportional), pd->last_proportional); ++ bch2_hprint(&PBUF(derivative), pd->last_derivative); ++ bch2_hprint(&PBUF(change), pd->last_change); ++ ++ next_io = div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC); ++ ++ return sprintf(buf, ++ "rate:\t\t%s/sec\n" ++ "target:\t\t%s\n" ++ "actual:\t\t%s\n" ++ "proportional:\t%s\n" ++ "derivative:\t%s\n" ++ "change:\t\t%s/sec\n" ++ "next io:\t%llims\n", ++ rate, target, actual, proportional, ++ derivative, change, next_io); ++} ++ ++/* misc: */ ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t size) ++{ ++ while (size) { ++ struct page *page = is_vmalloc_addr(base) ++ ? 
vmalloc_to_page(base) ++ : virt_to_page(base); ++ unsigned offset = offset_in_page(base); ++ unsigned len = min_t(size_t, PAGE_SIZE - offset, size); ++ ++ BUG_ON(!bio_add_page(bio, page, len, offset)); ++ size -= len; ++ base += len; ++ } ++} ++ ++int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) ++{ ++ while (size) { ++ struct page *page = alloc_page(gfp_mask); ++ unsigned len = min(PAGE_SIZE, size); ++ ++ if (!page) ++ return -ENOMEM; ++ ++ BUG_ON(!bio_add_page(bio, page, len, 0)); ++ size -= len; ++ } ++ ++ return 0; ++} ++ ++size_t bch2_rand_range(size_t max) ++{ ++ size_t rand; ++ ++ if (!max) ++ return 0; ++ ++ do { ++ rand = get_random_long(); ++ rand &= roundup_pow_of_two(max) - 1; ++ } while (rand >= max); ++ ++ return rand; ++} ++ ++void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, dst, iter, dst_iter) { ++ void *dstp = kmap_atomic(bv.bv_page); ++ memcpy(dstp + bv.bv_offset, src, bv.bv_len); ++ kunmap_atomic(dstp); ++ ++ src += bv.bv_len; ++ } ++} ++ ++void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) ++{ ++ struct bio_vec bv; ++ struct bvec_iter iter; ++ ++ __bio_for_each_segment(bv, src, iter, src_iter) { ++ void *srcp = kmap_atomic(bv.bv_page); ++ memcpy(dst, srcp + bv.bv_offset, bv.bv_len); ++ kunmap_atomic(srcp); ++ ++ dst += bv.bv_len; ++ } ++} ++ ++void bch_scnmemcpy(struct printbuf *out, ++ const char *src, size_t len) ++{ ++ size_t n = printbuf_remaining(out); ++ ++ if (n) { ++ n = min(n - 1, len); ++ memcpy(out->pos, src, n); ++ out->pos += n; ++ *out->pos = '\0'; ++ } ++} ++ ++#include "eytzinger.h" ++ ++static int alignment_ok(const void *base, size_t align) ++{ ++ return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || ++ ((unsigned long)base & (align - 1)) == 0; ++} ++ ++static void u32_swap(void *a, void *b, size_t size) ++{ ++ u32 t = *(u32 *)a; ++ *(u32 *)a = *(u32 *)b; ++ *(u32 *)b = t; ++} ++ ++static void u64_swap(void *a, void *b, size_t size) ++{ ++ u64 t = *(u64 *)a; ++ *(u64 *)a = *(u64 *)b; ++ *(u64 *)b = t; ++} ++ ++static void generic_swap(void *a, void *b, size_t size) ++{ ++ char t; ++ ++ do { ++ t = *(char *)a; ++ *(char *)a++ = *(char *)b; ++ *(char *)b++ = t; ++ } while (--size > 0); ++} ++ ++static inline int do_cmp(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ size_t l, size_t r) ++{ ++ return cmp_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++static inline void do_swap(void *base, size_t n, size_t size, ++ void (*swap_func)(void *, void *, size_t), ++ size_t l, size_t r) ++{ ++ swap_func(base + inorder_to_eytzinger0(l, n) * size, ++ base + inorder_to_eytzinger0(r, n) * size, ++ size); ++} ++ ++void eytzinger0_sort(void *base, size_t n, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)) ++{ ++ int i, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for (i = n / 2 - 1; i >= 0; --i) { ++ for (r = i; r * 2 + 1 < n; r = c) { ++ c = r * 2 + 1; ++ ++ if (c + 1 < n && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, 
c); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - 1; i > 0; --i) { ++ do_swap(base, n, size, swap_func, 0, i); ++ ++ for (r = 0; r * 2 + 1 < i; r = c) { ++ c = r * 2 + 1; ++ ++ if (c + 1 < i && ++ do_cmp(base, n, size, cmp_func, c, c + 1) < 0) ++ c++; ++ ++ if (do_cmp(base, n, size, cmp_func, r, c) >= 0) ++ break; ++ ++ do_swap(base, n, size, swap_func, r, c); ++ } ++ } ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t size)) ++{ ++ /* pre-scale counters for performance */ ++ int i = (num/2 - 1) * size, n = num * size, c, r; ++ ++ if (!swap_func) { ++ if (size == 4 && alignment_ok(base, 4)) ++ swap_func = u32_swap; ++ else if (size == 8 && alignment_ok(base, 8)) ++ swap_func = u64_swap; ++ else ++ swap_func = generic_swap; ++ } ++ ++ /* heapify */ ++ for ( ; i >= 0; i -= size) { ++ for (r = i; r * 2 + size < n; r = c) { ++ c = r * 2 + size; ++ if (c < n - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++ ++ /* sort */ ++ for (i = n - size; i > 0; i -= size) { ++ swap_func(base, base + i, size); ++ for (r = 0; r * 2 + size < i; r = c) { ++ c = r * 2 + size; ++ if (c < i - size && ++ cmp_func(base + c, base + c + size, size) < 0) ++ c += size; ++ if (cmp_func(base + r, base + c, size) >= 0) ++ break; ++ swap_func(base + r, base + c, size); ++ } ++ } ++} ++ ++static void mempool_free_vp(void *element, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ vpfree(element, size); ++} ++ ++static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) ++{ ++ size_t size = (size_t) pool_data; ++ ++ return vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) ++{ ++ return size < PAGE_SIZE ++ ? 
mempool_init_kmalloc_pool(pool, min_nr, size) ++ : mempool_init(pool, min_nr, mempool_alloc_vp, ++ mempool_free_vp, (void *) size); ++} ++ ++#if 0 ++void eytzinger1_test(void) ++{ ++ unsigned inorder, eytz, size; ++ ++ pr_info("1 based eytzinger test:"); ++ ++ for (size = 2; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger1_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); ++ BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); ++ ++ BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); ++ BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); ++ ++ inorder = 1; ++ eytzinger1_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger1_last(size) && ++ eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++void eytzinger0_test(void) ++{ ++ ++ unsigned inorder, eytz, size; ++ ++ pr_info("0 based eytzinger test:"); ++ ++ for (size = 1; ++ size < 65536; ++ size++) { ++ unsigned extra = eytzinger0_extra(size); ++ ++ if (!(size % 4096)) ++ pr_info("tree size %u", size); ++ ++ BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); ++ BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); ++ ++ BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); ++ BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); ++ ++ inorder = 0; ++ eytzinger0_for_each(eytz, size) { ++ BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); ++ BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); ++ BUG_ON(eytz != eytzinger0_last(size) && ++ eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); ++ ++ inorder++; ++ } ++ } ++} ++ ++static inline int cmp_u16(const void *_l, const void *_r, size_t size) ++{ ++ const u16 *l = _l, *r = _r; ++ ++ return (*l > *r) - (*r - *l); ++} ++ ++static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) ++{ ++ int i, c1 = -1, c2 = -1; ++ ssize_t r; ++ ++ r = eytzinger0_find_le(test_array, nr, ++ sizeof(test_array[0]), ++ cmp_u16, &search); ++ if (r >= 0) ++ c1 = test_array[r]; ++ ++ for (i = 0; i < nr; i++) ++ if (test_array[i] <= search && test_array[i] > c2) ++ c2 = test_array[i]; ++ ++ if (c1 != c2) { ++ eytzinger0_for_each(i, nr) ++ pr_info("[%3u] = %12u", i, test_array[i]); ++ pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", ++ i, r, c1, c2); ++ } ++} ++ ++void eytzinger0_find_test(void) ++{ ++ unsigned i, nr, allocated = 1 << 12; ++ u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); ++ ++ for (nr = 1; nr < allocated; nr++) { ++ pr_info("testing %u elems", nr); ++ ++ get_random_bytes(test_array, nr * sizeof(test_array[0])); ++ eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); ++ ++ /* verify array is sorted correctly: */ ++ eytzinger0_for_each(i, nr) ++ BUG_ON(i != eytzinger0_last(nr) && ++ test_array[i] > test_array[eytzinger0_next(i, nr)]); ++ ++ for (i = 0; i < U16_MAX; i += 1 << 12) ++ eytzinger0_find_test_val(test_array, nr, i); ++ ++ for (i = 0; i < nr; i++) { ++ eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); ++ eytzinger0_find_test_val(test_array, nr, test_array[i]); ++ eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); ++ } ++ } ++ ++ kfree(test_array); ++} ++#endif ++ ++/* ++ * Accumulate percpu counters onto one cpu's copy - only valid when access ++ * 
against any percpu counter is guarded against ++ */ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) ++{ ++ u64 *ret; ++ int cpu; ++ ++ preempt_disable(); ++ ret = this_cpu_ptr(p); ++ preempt_enable(); ++ ++ for_each_possible_cpu(cpu) { ++ u64 *i = per_cpu_ptr(p, cpu); ++ ++ if (i != ret) { ++ acc_u64s(ret, i, nr); ++ memset(i, 0, nr * sizeof(u64)); ++ } ++ } ++ ++ return ret; ++} +diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h +new file mode 100644 +index 000000000000..f48c6380684f +--- /dev/null ++++ b/fs/bcachefs/util.h +@@ -0,0 +1,761 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_UTIL_H ++#define _BCACHEFS_UTIL_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define PAGE_SECTOR_SHIFT (PAGE_SHIFT - 9) ++#define PAGE_SECTORS (1UL << PAGE_SECTOR_SHIFT) ++ ++struct closure; ++ ++#ifdef CONFIG_BCACHEFS_DEBUG ++ ++#define EBUG_ON(cond) BUG_ON(cond) ++#define atomic_dec_bug(v) BUG_ON(atomic_dec_return(v) < 0) ++#define atomic_inc_bug(v, i) BUG_ON(atomic_inc_return(v) <= i) ++#define atomic_sub_bug(i, v) BUG_ON(atomic_sub_return(i, v) < 0) ++#define atomic_add_bug(i, v) BUG_ON(atomic_add_return(i, v) < 0) ++#define atomic_long_dec_bug(v) BUG_ON(atomic_long_dec_return(v) < 0) ++#define atomic_long_sub_bug(i, v) BUG_ON(atomic_long_sub_return(i, v) < 0) ++#define atomic64_dec_bug(v) BUG_ON(atomic64_dec_return(v) < 0) ++#define atomic64_inc_bug(v, i) BUG_ON(atomic64_inc_return(v) <= i) ++#define atomic64_sub_bug(i, v) BUG_ON(atomic64_sub_return(i, v) < 0) ++#define atomic64_add_bug(i, v) BUG_ON(atomic64_add_return(i, v) < 0) ++ ++#define memcpy(dst, src, len) \ ++({ \ ++ void *_dst = (dst); \ ++ const void *_src = (src); \ ++ size_t _len = (len); \ ++ \ ++ BUG_ON(!((void *) (_dst) >= (void *) (_src) + (_len) || \ ++ (void *) (_dst) + (_len) <= (void *) (_src))); \ ++ memcpy(_dst, _src, _len); \ ++}) ++ ++#else /* DEBUG */ ++ ++#define EBUG_ON(cond) ++#define atomic_dec_bug(v) atomic_dec(v) ++#define atomic_inc_bug(v, i) atomic_inc(v) ++#define atomic_sub_bug(i, v) atomic_sub(i, v) ++#define atomic_add_bug(i, v) atomic_add(i, v) ++#define atomic_long_dec_bug(v) atomic_long_dec(v) ++#define atomic_long_sub_bug(i, v) atomic_long_sub(i, v) ++#define atomic64_dec_bug(v) atomic64_dec(v) ++#define atomic64_inc_bug(v, i) atomic64_inc(v) ++#define atomic64_sub_bug(i, v) atomic64_sub(i, v) ++#define atomic64_add_bug(i, v) atomic64_add(i, v) ++ ++#endif ++ ++#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ ++#define CPU_BIG_ENDIAN 0 ++#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ ++#define CPU_BIG_ENDIAN 1 ++#endif ++ ++/* type hackery */ ++ ++#define type_is_exact(_val, _type) \ ++ __builtin_types_compatible_p(typeof(_val), _type) ++ ++#define type_is(_val, _type) \ ++ (__builtin_types_compatible_p(typeof(_val), _type) || \ ++ __builtin_types_compatible_p(typeof(_val), const _type)) ++ ++/* Userspace doesn't align allocations as nicely as the kernel allocators: */ ++static inline size_t buf_pages(void *p, size_t len) ++{ ++ return DIV_ROUND_UP(len + ++ ((unsigned long) p & (PAGE_SIZE - 1)), ++ PAGE_SIZE); ++} ++ ++static inline void vpfree(void *p, size_t size) ++{ ++ if (is_vmalloc_addr(p)) ++ vfree(p); ++ else ++ free_pages((unsigned long) p, get_order(size)); ++} ++ ++static inline void *vpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return (void *) __get_free_pages(gfp_mask|__GFP_NOWARN, ++ get_order(size)) ?: ++ __vmalloc(size, gfp_mask); ++} ++ 
++static inline void kvpfree(void *p, size_t size) ++{ ++ if (size < PAGE_SIZE) ++ kfree(p); ++ else ++ vpfree(p, size); ++} ++ ++static inline void *kvpmalloc(size_t size, gfp_t gfp_mask) ++{ ++ return size < PAGE_SIZE ++ ? kmalloc(size, gfp_mask) ++ : vpmalloc(size, gfp_mask); ++} ++ ++int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); ++ ++#define HEAP(type) \ ++struct { \ ++ size_t size, used; \ ++ type *data; \ ++} ++ ++#define DECLARE_HEAP(type, name) HEAP(type) name ++ ++#define init_heap(heap, _size, gfp) \ ++({ \ ++ (heap)->used = 0; \ ++ (heap)->size = (_size); \ ++ (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ ++ (gfp)); \ ++}) ++ ++#define free_heap(heap) \ ++do { \ ++ kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ ++ (heap)->data = NULL; \ ++} while (0) ++ ++#define heap_set_backpointer(h, i, _fn) \ ++do { \ ++ void (*fn)(typeof(h), size_t) = _fn; \ ++ if (fn) \ ++ fn(h, i); \ ++} while (0) ++ ++#define heap_swap(h, i, j, set_backpointer) \ ++do { \ ++ swap((h)->data[i], (h)->data[j]); \ ++ heap_set_backpointer(h, i, set_backpointer); \ ++ heap_set_backpointer(h, j, set_backpointer); \ ++} while (0) ++ ++#define heap_peek(h) \ ++({ \ ++ EBUG_ON(!(h)->used); \ ++ (h)->data[0]; \ ++}) ++ ++#define heap_full(h) ((h)->used == (h)->size) ++ ++#define heap_sift_down(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _c, _j = i; \ ++ \ ++ for (; _j * 2 + 1 < (h)->used; _j = _c) { \ ++ _c = _j * 2 + 1; \ ++ if (_c + 1 < (h)->used && \ ++ cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ ++ _c++; \ ++ \ ++ if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ ++ break; \ ++ heap_swap(h, _c, _j, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_sift_up(h, i, cmp, set_backpointer) \ ++do { \ ++ while (i) { \ ++ size_t p = (i - 1) / 2; \ ++ if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ ++ break; \ ++ heap_swap(h, i, p, set_backpointer); \ ++ i = p; \ ++ } \ ++} while (0) ++ ++#define __heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ size_t _i = (h)->used++; \ ++ (h)->data[_i] = d; \ ++ heap_set_backpointer(h, _i, set_backpointer); \ ++ \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ _i; \ ++}) ++ ++#define heap_add(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = !heap_full(h); \ ++ if (_r) \ ++ __heap_add(h, d, cmp, set_backpointer); \ ++ _r; \ ++}) ++ ++#define heap_add_or_replace(h, new, cmp, set_backpointer) \ ++do { \ ++ if (!heap_add(h, new, cmp, set_backpointer) && \ ++ cmp(h, new, heap_peek(h)) >= 0) { \ ++ (h)->data[0] = new; \ ++ heap_set_backpointer(h, 0, set_backpointer); \ ++ heap_sift_down(h, 0, cmp, set_backpointer); \ ++ } \ ++} while (0) ++ ++#define heap_del(h, i, cmp, set_backpointer) \ ++do { \ ++ size_t _i = (i); \ ++ \ ++ BUG_ON(_i >= (h)->used); \ ++ (h)->used--; \ ++ heap_swap(h, _i, (h)->used, set_backpointer); \ ++ heap_sift_up(h, _i, cmp, set_backpointer); \ ++ heap_sift_down(h, _i, cmp, set_backpointer); \ ++} while (0) ++ ++#define heap_pop(h, d, cmp, set_backpointer) \ ++({ \ ++ bool _r = (h)->used; \ ++ if (_r) { \ ++ (d) = (h)->data[0]; \ ++ heap_del(h, 0, cmp, set_backpointer); \ ++ } \ ++ _r; \ ++}) ++ ++#define heap_resort(heap, cmp, set_backpointer) \ ++do { \ ++ ssize_t _i; \ ++ for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ ++ heap_sift_down(heap, _i, cmp, set_backpointer); \ ++} while (0) ++ ++#define ANYSINT_MAX(t) \ ++ ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) ++ ++struct printbuf { ++ char *pos; ++ char *end; ++}; ++ ++static inline size_t printbuf_remaining(struct 
printbuf *buf) ++{ ++ return buf->end - buf->pos; ++} ++ ++#define _PBUF(_buf, _len) \ ++ ((struct printbuf) { \ ++ .pos = _buf, \ ++ .end = _buf + _len, \ ++ }) ++ ++#define PBUF(_buf) _PBUF(_buf, sizeof(_buf)) ++ ++#define pr_buf(_out, ...) \ ++do { \ ++ (_out)->pos += scnprintf((_out)->pos, printbuf_remaining(_out), \ ++ __VA_ARGS__); \ ++} while (0) ++ ++void bch_scnmemcpy(struct printbuf *, const char *, size_t); ++ ++int bch2_strtoint_h(const char *, int *); ++int bch2_strtouint_h(const char *, unsigned int *); ++int bch2_strtoll_h(const char *, long long *); ++int bch2_strtoull_h(const char *, unsigned long long *); ++int bch2_strtou64_h(const char *, u64 *); ++ ++static inline int bch2_strtol_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtoint_h(cp, (int *) res); ++#else ++ return bch2_strtoll_h(cp, (long long *) res); ++#endif ++} ++ ++static inline int bch2_strtoul_h(const char *cp, long *res) ++{ ++#if BITS_PER_LONG == 32 ++ return bch2_strtouint_h(cp, (unsigned int *) res); ++#else ++ return bch2_strtoull_h(cp, (unsigned long long *) res); ++#endif ++} ++ ++#define strtoi_h(cp, res) \ ++ ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ ++ : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ ++ : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ ++ : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ ++ : type_is(*res, unsigned long long) ? bch2_strtoull_h(cp, (void *) res)\ ++ : -EINVAL) ++ ++#define strtoul_safe(cp, var) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = _v; \ ++ _r; \ ++}) ++ ++#define strtoul_safe_clamp(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r) \ ++ var = clamp_t(typeof(var), _v, min, max); \ ++ _r; \ ++}) ++ ++#define strtoul_safe_restrict(cp, var, min, max) \ ++({ \ ++ unsigned long _v; \ ++ int _r = kstrtoul(cp, 10, &_v); \ ++ if (!_r && _v >= min && _v <= max) \ ++ var = _v; \ ++ else \ ++ _r = -EINVAL; \ ++ _r; \ ++}) ++ ++#define snprint(buf, size, var) \ ++ snprintf(buf, size, \ ++ type_is(var, int) ? "%i\n" \ ++ : type_is(var, unsigned) ? "%u\n" \ ++ : type_is(var, long) ? "%li\n" \ ++ : type_is(var, unsigned long) ? "%lu\n" \ ++ : type_is(var, s64) ? "%lli\n" \ ++ : type_is(var, u64) ? "%llu\n" \ ++ : type_is(var, char *) ? 
"%s\n" \ ++ : "%i\n", var) ++ ++void bch2_hprint(struct printbuf *, s64); ++ ++bool bch2_is_zero(const void *, size_t); ++ ++void bch2_string_opt_to_text(struct printbuf *, ++ const char * const [], size_t); ++ ++void bch2_flags_to_text(struct printbuf *, const char * const[], u64); ++u64 bch2_read_flag_list(char *, const char * const[]); ++ ++#define NR_QUANTILES 15 ++#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) ++#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) ++#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) ++ ++struct quantiles { ++ struct quantile_entry { ++ u64 m; ++ u64 step; ++ } entries[NR_QUANTILES]; ++}; ++ ++struct time_stat_buffer { ++ unsigned nr; ++ struct time_stat_buffer_entry { ++ u64 start; ++ u64 end; ++ } entries[32]; ++}; ++ ++struct time_stats { ++ spinlock_t lock; ++ u64 count; ++ /* all fields are in nanoseconds */ ++ u64 average_duration; ++ u64 average_frequency; ++ u64 max_duration; ++ u64 last_event; ++ struct quantiles quantiles; ++ ++ struct time_stat_buffer __percpu *buffer; ++}; ++ ++void __bch2_time_stats_update(struct time_stats *stats, u64, u64); ++ ++static inline void bch2_time_stats_update(struct time_stats *stats, u64 start) ++{ ++ __bch2_time_stats_update(stats, start, local_clock()); ++} ++ ++void bch2_time_stats_to_text(struct printbuf *, struct time_stats *); ++ ++void bch2_time_stats_exit(struct time_stats *); ++void bch2_time_stats_init(struct time_stats *); ++ ++#define ewma_add(ewma, val, weight) \ ++({ \ ++ typeof(ewma) _ewma = (ewma); \ ++ typeof(weight) _weight = (weight); \ ++ \ ++ (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ ++}) ++ ++struct bch_ratelimit { ++ /* Next time we want to do some work, in nanoseconds */ ++ u64 next; ++ ++ /* ++ * Rate at which we want to do work, in units per nanosecond ++ * The units here correspond to the units passed to ++ * bch2_ratelimit_increment() ++ */ ++ unsigned rate; ++}; ++ ++static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) ++{ ++ d->next = local_clock(); ++} ++ ++u64 bch2_ratelimit_delay(struct bch_ratelimit *); ++void bch2_ratelimit_increment(struct bch_ratelimit *, u64); ++ ++struct bch_pd_controller { ++ struct bch_ratelimit rate; ++ unsigned long last_update; ++ ++ s64 last_actual; ++ s64 smoothed_derivative; ++ ++ unsigned p_term_inverse; ++ unsigned d_smooth; ++ unsigned d_term; ++ ++ /* for exporting to sysfs (no effect on behavior) */ ++ s64 last_derivative; ++ s64 last_proportional; ++ s64 last_change; ++ s64 last_target; ++ ++ /* If true, the rate will not increase if bch2_ratelimit_delay() ++ * is not being called often enough. 
*/ ++ bool backpressure; ++}; ++ ++void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); ++void bch2_pd_controller_init(struct bch_pd_controller *); ++size_t bch2_pd_controller_print_debug(struct bch_pd_controller *, char *); ++ ++#define sysfs_pd_controller_attribute(name) \ ++ rw_attribute(name##_rate); \ ++ rw_attribute(name##_rate_bytes); \ ++ rw_attribute(name##_rate_d_term); \ ++ rw_attribute(name##_rate_p_term_inverse); \ ++ read_attribute(name##_rate_debug) ++ ++#define sysfs_pd_controller_files(name) \ ++ &sysfs_##name##_rate, \ ++ &sysfs_##name##_rate_bytes, \ ++ &sysfs_##name##_rate_d_term, \ ++ &sysfs_##name##_rate_p_term_inverse, \ ++ &sysfs_##name##_rate_debug ++ ++#define sysfs_pd_controller_show(name, var) \ ++do { \ ++ sysfs_hprint(name##_rate, (var)->rate.rate); \ ++ sysfs_print(name##_rate_bytes, (var)->rate.rate); \ ++ sysfs_print(name##_rate_d_term, (var)->d_term); \ ++ sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ ++ \ ++ if (attr == &sysfs_##name##_rate_debug) \ ++ return bch2_pd_controller_print_debug(var, buf); \ ++} while (0) ++ ++#define sysfs_pd_controller_store(name, var) \ ++do { \ ++ sysfs_strtoul_clamp(name##_rate, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul_clamp(name##_rate_bytes, \ ++ (var)->rate.rate, 1, UINT_MAX); \ ++ sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ ++ sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ ++ (var)->p_term_inverse, 1, INT_MAX); \ ++} while (0) ++ ++#define container_of_or_null(ptr, type, member) \ ++({ \ ++ typeof(ptr) _ptr = ptr; \ ++ _ptr ? container_of(_ptr, type, member) : NULL; \ ++}) ++ ++/* Does linear interpolation between powers of two */ ++static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) ++{ ++ unsigned fract = x & ~(~0 << fract_bits); ++ ++ x >>= fract_bits; ++ x = 1 << x; ++ x += (x * fract) >> fract_bits; ++ ++ return x; ++} ++ ++void bch2_bio_map(struct bio *bio, void *base, size_t); ++int bch2_bio_alloc_pages(struct bio *, size_t, gfp_t); ++ ++static inline sector_t bdev_sectors(struct block_device *bdev) ++{ ++ return bdev->bd_inode->i_size >> 9; ++} ++ ++#define closure_bio_submit(bio, cl) \ ++do { \ ++ closure_get(cl); \ ++ submit_bio(bio); \ ++} while (0) ++ ++#define kthread_wait_freezable(cond) \ ++({ \ ++ int _ret = 0; \ ++ while (1) { \ ++ set_current_state(TASK_INTERRUPTIBLE); \ ++ if (kthread_should_stop()) { \ ++ _ret = -1; \ ++ break; \ ++ } \ ++ \ ++ if (cond) \ ++ break; \ ++ \ ++ schedule(); \ ++ try_to_freeze(); \ ++ } \ ++ set_current_state(TASK_RUNNING); \ ++ _ret; \ ++}) ++ ++size_t bch2_rand_range(size_t); ++ ++void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); ++void memcpy_from_bio(void *, struct bio *, struct bvec_iter); ++ ++static inline void memcpy_u64s_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++} ++ ++static inline void __memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("rep ; movsq" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ u64 *d = dst; ++ const u64 *s = src; ++ ++ while (u64s--) ++ *d++ = *s++; ++#endif ++} ++ ++static inline void memcpy_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(!(dst >= src + u64s * sizeof(u64) || ++ dst + u64s * sizeof(u64) <= src)); ++ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_down(void 
*dst, const void *src, ++ unsigned u64s) ++{ ++ __memcpy_u64s(dst, src, u64s); ++} ++ ++static inline void memmove_u64s_down(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst > src); ++ ++ __memmove_u64s_down(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up_small(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s; ++ u64 *src = (u64 *) _src + u64s; ++ ++ while (u64s--) ++ *--dst = *--src; ++} ++ ++static inline void memmove_u64s_up_small(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up_small(dst, src, u64s); ++} ++ ++static inline void __memmove_u64s_up(void *_dst, const void *_src, ++ unsigned u64s) ++{ ++ u64 *dst = (u64 *) _dst + u64s - 1; ++ u64 *src = (u64 *) _src + u64s - 1; ++ ++#ifdef CONFIG_X86_64 ++ long d0, d1, d2; ++ asm volatile("std ;\n" ++ "rep ; movsq\n" ++ "cld ;\n" ++ : "=&c" (d0), "=&D" (d1), "=&S" (d2) ++ : "0" (u64s), "1" (dst), "2" (src) ++ : "memory"); ++#else ++ while (u64s--) ++ *dst-- = *src--; ++#endif ++} ++ ++static inline void memmove_u64s_up(void *dst, const void *src, ++ unsigned u64s) ++{ ++ EBUG_ON(dst < src); ++ ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++static inline void memmove_u64s(void *dst, const void *src, ++ unsigned u64s) ++{ ++ if (dst < src) ++ __memmove_u64s_down(dst, src, u64s); ++ else ++ __memmove_u64s_up(dst, src, u64s); ++} ++ ++/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */ ++static inline void memset_u64s_tail(void *s, int c, unsigned bytes) ++{ ++ unsigned rem = round_up(bytes, sizeof(u64)) - bytes; ++ ++ memset(s + bytes, c, rem); ++} ++ ++void sort_cmp_size(void *base, size_t num, size_t size, ++ int (*cmp_func)(const void *, const void *, size_t), ++ void (*swap_func)(void *, void *, size_t)); ++ ++/* just the memmove, doesn't update @_nr */ ++#define __array_insert_item(_array, _nr, _pos) \ ++ memmove(&(_array)[(_pos) + 1], \ ++ &(_array)[(_pos)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))) ++ ++#define array_insert_item(_array, _nr, _pos, _new_item) \ ++do { \ ++ __array_insert_item(_array, _nr, _pos); \ ++ (_nr)++; \ ++ (_array)[(_pos)] = (_new_item); \ ++} while (0) ++ ++#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ ++do { \ ++ (_nr) -= (_nr_to_remove); \ ++ memmove(&(_array)[(_pos)], \ ++ &(_array)[(_pos) + (_nr_to_remove)], \ ++ sizeof((_array)[0]) * ((_nr) - (_pos))); \ ++} while (0) ++ ++#define array_remove_item(_array, _nr, _pos) \ ++ array_remove_items(_array, _nr, _pos, 1) ++ ++#define bubble_sort(_base, _nr, _cmp) \ ++do { \ ++ ssize_t _i, _end; \ ++ bool _swapped = true; \ ++ \ ++ for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ ++ _swapped = false; \ ++ for (_i = 0; _i < _end; _i++) \ ++ if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ ++ swap((_base)[_i], (_base)[_i + 1]); \ ++ _swapped = true; \ ++ } \ ++ } \ ++} while (0) ++ ++static inline u64 percpu_u64_get(u64 __percpu *src) ++{ ++ u64 ret = 0; ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ ret += *per_cpu_ptr(src, cpu); ++ return ret; ++} ++ ++static inline void percpu_u64_set(u64 __percpu *dst, u64 src) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ *per_cpu_ptr(dst, cpu) = 0; ++ ++ preempt_disable(); ++ *this_cpu_ptr(dst) = src; ++ preempt_enable(); ++} ++ ++static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) ++{ ++ unsigned i; ++ ++ for (i = 0; i < nr; i++) ++ acc[i] += src[i]; ++} ++ ++static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, 
++ unsigned nr) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ acc_u64s(acc, per_cpu_ptr(src, cpu), nr); ++} ++ ++static inline void percpu_memset(void __percpu *p, int c, size_t bytes) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ memset(per_cpu_ptr(p, cpu), c, bytes); ++} ++ ++u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); ++ ++#define cmp_int(l, r) ((l > r) - (l < r)) ++ ++#endif /* _BCACHEFS_UTIL_H */ +diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h +new file mode 100644 +index 000000000000..c099cdc0605f +--- /dev/null ++++ b/fs/bcachefs/vstructs.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _VSTRUCTS_H ++#define _VSTRUCTS_H ++ ++#include "util.h" ++ ++/* ++ * NOTE: we can't differentiate between __le64 and u64 with type_is - this ++ * assumes u64 is little endian: ++ */ ++#define __vstruct_u64s(_s) \ ++({ \ ++ ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ ++ : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ ++ : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \ ++ : ((__force u8) ((_s)->u64s))); \ ++}) ++ ++#define __vstruct_bytes(_type, _u64s) \ ++({ \ ++ BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ ++ \ ++ (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ ++}) ++ ++#define vstruct_bytes(_s) \ ++ __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) ++ ++#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ ++ (round_up(__vstruct_bytes(_type, _u64s), \ ++ 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) ++ ++#define vstruct_blocks(_s, _sector_block_bits) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) ++ ++#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ ++ __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ ++ __vstruct_u64s(_s) + (_u64s)) ++ ++#define vstruct_sectors(_s, _sector_block_bits) \ ++ (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) ++ ++#define vstruct_next(_s) \ ++ ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_last(_s) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) ++#define vstruct_end(_s) \ ++ ((void *) ((_s)->_data + __vstruct_u64s(_s))) ++ ++#define vstruct_for_each(_s, _i) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s); \ ++ _i = vstruct_next(_i)) ++ ++#define vstruct_for_each_safe(_s, _i, _t) \ ++ for (_i = (_s)->start; \ ++ _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ ++ _i = _t) ++ ++#define vstruct_idx(_s, _idx) \ ++ ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) ++ ++#endif /* _VSTRUCTS_H */ +diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c +new file mode 100644 +index 000000000000..21f64cb7e402 +--- /dev/null ++++ b/fs/bcachefs/xattr.c +@@ -0,0 +1,586 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include "bcachefs.h" ++#include "bkey_methods.h" ++#include "btree_update.h" ++#include "extents.h" ++#include "fs.h" ++#include "rebalance.h" ++#include "str_hash.h" ++#include "xattr.h" ++ ++#include ++#include ++#include ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); ++ ++static u64 bch2_xattr_hash(const struct bch_hash_info *info, ++ const struct xattr_search_key *key) ++{ ++ struct bch_str_hash_ctx ctx; ++ ++ bch2_str_hash_init(&ctx, info); ++ bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); ++ bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); ++ ++ return bch2_str_hash_end(&ctx, info); ++} ++ ++static u64 
xattr_hash_key(const struct bch_hash_info *info, const void *key) ++{ ++ return bch2_xattr_hash(info, key); ++} ++ ++static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) ++{ ++ struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); ++ ++ return bch2_xattr_hash(info, ++ &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); ++} ++ ++static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ const struct xattr_search_key *r = _r; ++ ++ return l.v->x_type != r->type || ++ l.v->x_name_len != r->name.len || ++ memcmp(l.v->x_name, r->name.name, r->name.len); ++} ++ ++static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) ++{ ++ struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); ++ struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); ++ ++ return l.v->x_type != r.v->x_type || ++ l.v->x_name_len != r.v->x_name_len || ++ memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); ++} ++ ++const struct bch_hash_desc bch2_xattr_hash_desc = { ++ .btree_id = BTREE_ID_XATTRS, ++ .key_type = KEY_TYPE_xattr, ++ .hash_key = xattr_hash_key, ++ .hash_bkey = xattr_hash_bkey, ++ .cmp_key = xattr_cmp_key, ++ .cmp_bkey = xattr_cmp_bkey, ++}; ++ ++const char *bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ if (bkey_val_bytes(k.k) < sizeof(struct bch_xattr)) ++ return "value too small"; ++ ++ if (bkey_val_u64s(k.k) < ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len))) ++ return "value too small"; ++ ++ if (bkey_val_u64s(k.k) > ++ xattr_val_u64s(xattr.v->x_name_len, ++ le16_to_cpu(xattr.v->x_val_len) + 4)) ++ return "value too big"; ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (!handler) ++ return "invalid type"; ++ ++ if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) ++ return "xattr name has invalid characters"; ++ ++ return NULL; ++} ++ ++void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, ++ struct bkey_s_c k) ++{ ++ const struct xattr_handler *handler; ++ struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); ++ ++ handler = bch2_xattr_type_to_handler(xattr.v->x_type); ++ if (handler && handler->prefix) ++ pr_buf(out, "%s", handler->prefix); ++ else if (handler) ++ pr_buf(out, "(type %u)", xattr.v->x_type); ++ else ++ pr_buf(out, "(unknown type %u)", xattr.v->x_type); ++ ++ bch_scnmemcpy(out, xattr.v->x_name, ++ xattr.v->x_name_len); ++ pr_buf(out, ":"); ++ bch_scnmemcpy(out, xattr_val(xattr.v), ++ le16_to_cpu(xattr.v->x_val_len)); ++} ++ ++int bch2_xattr_get(struct bch_fs *c, struct bch_inode_info *inode, ++ const char *name, void *buffer, size_t size, int type) ++{ ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c_xattr xattr; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc, ++ &inode->ei_str_hash, inode->v.i_ino, ++ &X_SEARCH(type, name, strlen(name)), ++ 0); ++ if (IS_ERR(iter)) { ++ bch2_trans_exit(&trans); ++ BUG_ON(PTR_ERR(iter) == -EINTR); ++ ++ return PTR_ERR(iter) == -ENOENT ? 
-ENODATA : PTR_ERR(iter); ++ } ++ ++ xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter)); ++ ret = le16_to_cpu(xattr.v->x_val_len); ++ if (buffer) { ++ if (ret > size) ++ ret = -ERANGE; ++ else ++ memcpy(buffer, xattr_val(xattr.v), ret); ++ } ++ ++ bch2_trans_exit(&trans); ++ return ret; ++} ++ ++int bch2_xattr_set(struct btree_trans *trans, u64 inum, ++ const struct bch_hash_info *hash_info, ++ const char *name, const void *value, size_t size, ++ int type, int flags) ++{ ++ int ret; ++ ++ if (value) { ++ struct bkey_i_xattr *xattr; ++ unsigned namelen = strlen(name); ++ unsigned u64s = BKEY_U64s + ++ xattr_val_u64s(namelen, size); ++ ++ if (u64s > U8_MAX) ++ return -ERANGE; ++ ++ xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); ++ if (IS_ERR(xattr)) ++ return PTR_ERR(xattr); ++ ++ bkey_xattr_init(&xattr->k_i); ++ xattr->k.u64s = u64s; ++ xattr->v.x_type = type; ++ xattr->v.x_name_len = namelen; ++ xattr->v.x_val_len = cpu_to_le16(size); ++ memcpy(xattr->v.x_name, name, namelen); ++ memcpy(xattr_val(&xattr->v), value, size); ++ ++ ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, ++ inum, &xattr->k_i, ++ (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| ++ (flags & XATTR_REPLACE ? BCH_HASH_SET_MUST_REPLACE : 0)); ++ } else { ++ struct xattr_search_key search = ++ X_SEARCH(type, name, strlen(name)); ++ ++ ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, ++ hash_info, inum, &search); ++ } ++ ++ if (ret == -ENOENT) ++ ret = flags & XATTR_REPLACE ? -ENODATA : 0; ++ ++ return ret; ++} ++ ++struct xattr_buf { ++ char *buf; ++ size_t len; ++ size_t used; ++}; ++ ++static int __bch2_xattr_emit(const char *prefix, ++ const char *name, size_t name_len, ++ struct xattr_buf *buf) ++{ ++ const size_t prefix_len = strlen(prefix); ++ const size_t total_len = prefix_len + name_len + 1; ++ ++ if (buf->buf) { ++ if (buf->used + total_len > buf->len) ++ return -ERANGE; ++ ++ memcpy(buf->buf + buf->used, prefix, prefix_len); ++ memcpy(buf->buf + buf->used + prefix_len, ++ name, name_len); ++ buf->buf[buf->used + prefix_len + name_len] = '\0'; ++ } ++ ++ buf->used += total_len; ++ return 0; ++} ++ ++static int bch2_xattr_emit(struct dentry *dentry, ++ const struct bch_xattr *xattr, ++ struct xattr_buf *buf) ++{ ++ const struct xattr_handler *handler = ++ bch2_xattr_type_to_handler(xattr->x_type); ++ ++ return handler && (!handler->list || handler->list(dentry)) ++ ? __bch2_xattr_emit(handler->prefix ?: handler->name, ++ xattr->x_name, xattr->x_name_len, buf) ++ : 0; ++} ++ ++static int bch2_xattr_list_bcachefs(struct bch_fs *c, ++ struct bch_inode_info *inode, ++ struct xattr_buf *buf, ++ bool all) ++{ ++ const char *prefix = all ? "bcachefs_effective." 
: "bcachefs."; ++ unsigned id; ++ int ret = 0; ++ u64 v; ++ ++ for (id = 0; id < Inode_opt_nr; id++) { ++ v = bch2_inode_opt_get(&inode->ei_inode, id); ++ if (!v) ++ continue; ++ ++ if (!all && ++ !(inode->ei_inode.bi_fields_set & (1 << id))) ++ continue; ++ ++ ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], ++ strlen(bch2_inode_opts[id]), buf); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) ++{ ++ struct bch_fs *c = dentry->d_sb->s_fs_info; ++ struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); ++ struct btree_trans trans; ++ struct btree_iter *iter; ++ struct bkey_s_c k; ++ struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; ++ u64 inum = dentry->d_inode->i_ino; ++ int ret; ++ ++ bch2_trans_init(&trans, c, 0, 0); ++ ++ for_each_btree_key(&trans, iter, BTREE_ID_XATTRS, ++ POS(inum, 0), 0, k, ret) { ++ BUG_ON(k.k->p.inode < inum); ++ ++ if (k.k->p.inode > inum) ++ break; ++ ++ if (k.k->type != KEY_TYPE_xattr) ++ continue; ++ ++ ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); ++ if (ret) ++ break; ++ } ++ ret = bch2_trans_exit(&trans) ?: ret; ++ ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, inode, &buf, false); ++ if (ret) ++ return ret; ++ ++ ret = bch2_xattr_list_bcachefs(c, inode, &buf, true); ++ if (ret) ++ return ret; ++ ++ return buf.used; ++} ++ ++static int bch2_xattr_get_handler(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ return bch2_xattr_get(c, inode, name, buffer, size, handler->flags); ++} ++ ++static int bch2_xattr_set_handler(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ ++ return bch2_trans_do(c, NULL, &inode->ei_journal_seq, 0, ++ bch2_xattr_set(&trans, inode->v.i_ino, ++ &inode->ei_str_hash, ++ name, value, size, ++ handler->flags, flags)); ++} ++ ++static const struct xattr_handler bch_xattr_user_handler = { ++ .prefix = XATTR_USER_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_USER, ++}; ++ ++static bool bch2_xattr_trusted_list(struct dentry *dentry) ++{ ++ return capable(CAP_SYS_ADMIN); ++} ++ ++static const struct xattr_handler bch_xattr_trusted_handler = { ++ .prefix = XATTR_TRUSTED_PREFIX, ++ .list = bch2_xattr_trusted_list, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, ++}; ++ ++static const struct xattr_handler bch_xattr_security_handler = { ++ .prefix = XATTR_SECURITY_PREFIX, ++ .get = bch2_xattr_get_handler, ++ .set = bch2_xattr_set_handler, ++ .flags = KEY_TYPE_XATTR_INDEX_SECURITY, ++}; ++ ++#ifndef NO_BCACHEFS_FS ++ ++static int opt_to_inode_opt(int id) ++{ ++ switch (id) { ++#define x(name, ...) 
\ ++ case Opt_##name: return Inode_opt_##name; ++ BCH_INODE_OPTS() ++#undef x ++ default: ++ return -1; ++ } ++} ++ ++static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size, ++ bool all) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ struct bch_opts opts = ++ bch2_inode_opts_to_opts(bch2_inode_opts_get(&inode->ei_inode)); ++ const struct bch_option *opt; ++ int id, inode_opt_id; ++ char buf[512]; ++ struct printbuf out = PBUF(buf); ++ unsigned val_len; ++ u64 v; ++ ++ id = bch2_opt_lookup(name); ++ if (id < 0 || !bch2_opt_is_inode_opt(id)) ++ return -EINVAL; ++ ++ inode_opt_id = opt_to_inode_opt(id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + id; ++ ++ if (!bch2_opt_defined_by_id(&opts, id)) ++ return -ENODATA; ++ ++ if (!all && ++ !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) ++ return -ENODATA; ++ ++ v = bch2_opt_get_by_id(&opts, id); ++ bch2_opt_to_text(&out, c, opt, v, 0); ++ ++ val_len = out.pos - buf; ++ ++ if (buffer && val_len > size) ++ return -ERANGE; ++ ++ if (buffer) ++ memcpy(buffer, buf, val_len); ++ return val_len; ++} ++ ++static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, false); ++} ++ ++struct inode_opt_set { ++ int id; ++ u64 v; ++ bool defined; ++}; ++ ++static int inode_opt_set_fn(struct bch_inode_info *inode, ++ struct bch_inode_unpacked *bi, ++ void *p) ++{ ++ struct inode_opt_set *s = p; ++ ++ if (s->defined) ++ bi->bi_fields_set |= 1U << s->id; ++ else ++ bi->bi_fields_set &= ~(1U << s->id); ++ ++ bch2_inode_opt_set(bi, s->id, s->v); ++ ++ return 0; ++} ++ ++static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, const void *value, ++ size_t size, int flags) ++{ ++ struct bch_inode_info *inode = to_bch_ei(vinode); ++ struct bch_fs *c = inode->v.i_sb->s_fs_info; ++ const struct bch_option *opt; ++ char *buf; ++ struct inode_opt_set s; ++ int opt_id, inode_opt_id, ret; ++ ++ opt_id = bch2_opt_lookup(name); ++ if (opt_id < 0) ++ return -EINVAL; ++ ++ opt = bch2_opt_table + opt_id; ++ ++ inode_opt_id = opt_to_inode_opt(opt_id); ++ if (inode_opt_id < 0) ++ return -EINVAL; ++ ++ s.id = inode_opt_id; ++ ++ if (value) { ++ u64 v = 0; ++ ++ buf = kmalloc(size + 1, GFP_KERNEL); ++ if (!buf) ++ return -ENOMEM; ++ memcpy(buf, value, size); ++ buf[size] = '\0'; ++ ++ ret = bch2_opt_parse(c, opt, buf, &v); ++ kfree(buf); ++ ++ if (ret < 0) ++ return ret; ++ ++ ret = bch2_opt_check_may_set(c, opt_id, v); ++ if (ret < 0) ++ return ret; ++ ++ s.v = v + 1; ++ s.defined = true; ++ } else { ++ if (!IS_ROOT(dentry)) { ++ struct bch_inode_info *dir = ++ to_bch_ei(d_inode(dentry->d_parent)); ++ ++ s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); ++ } else { ++ s.v = 0; ++ } ++ ++ s.defined = false; ++ } ++ ++ mutex_lock(&inode->ei_update_lock); ++ if (inode_opt_id == Inode_opt_project) { ++ /* ++ * inode fields accessible via the xattr interface are stored ++ * with a +1 bias, so that 0 means unset: ++ */ ++ ret = bch2_set_projid(c, inode, s.v ? 
s.v - 1 : 0); ++ if (ret) ++ goto err; ++ } ++ ++ ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); ++err: ++ mutex_unlock(&inode->ei_update_lock); ++ ++ if (value && ++ (opt_id == Opt_background_compression || ++ opt_id == Opt_background_target)) ++ bch2_rebalance_add_work(c, inode->v.i_blocks); ++ ++ return ret; ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_handler = { ++ .prefix = "bcachefs.", ++ .get = bch2_xattr_bcachefs_get, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++static int bch2_xattr_bcachefs_get_effective( ++ const struct xattr_handler *handler, ++ struct dentry *dentry, struct inode *vinode, ++ const char *name, void *buffer, size_t size) ++{ ++ return __bch2_xattr_bcachefs_get(handler, dentry, vinode, ++ name, buffer, size, true); ++} ++ ++static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { ++ .prefix = "bcachefs_effective.", ++ .get = bch2_xattr_bcachefs_get_effective, ++ .set = bch2_xattr_bcachefs_set, ++}; ++ ++#endif /* NO_BCACHEFS_FS */ ++ ++const struct xattr_handler *bch2_xattr_handlers[] = { ++ &bch_xattr_user_handler, ++ &posix_acl_access_xattr_handler, ++ &posix_acl_default_xattr_handler, ++ &bch_xattr_trusted_handler, ++ &bch_xattr_security_handler, ++#ifndef NO_BCACHEFS_FS ++ &bch_xattr_bcachefs_handler, ++ &bch_xattr_bcachefs_effective_handler, ++#endif ++ NULL ++}; ++ ++static const struct xattr_handler *bch_xattr_handler_map[] = { ++ [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = ++ &posix_acl_access_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = ++ &posix_acl_default_xattr_handler, ++ [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, ++ [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, ++}; ++ ++static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) ++{ ++ return type < ARRAY_SIZE(bch_xattr_handler_map) ++ ? 
bch_xattr_handler_map[type] ++ : NULL; ++} +diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h +new file mode 100644 +index 000000000000..4151065ab853 +--- /dev/null ++++ b/fs/bcachefs/xattr.h +@@ -0,0 +1,49 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _BCACHEFS_XATTR_H ++#define _BCACHEFS_XATTR_H ++ ++#include "str_hash.h" ++ ++extern const struct bch_hash_desc bch2_xattr_hash_desc; ++ ++const char *bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c); ++void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); ++ ++#define bch2_bkey_ops_xattr (struct bkey_ops) { \ ++ .key_invalid = bch2_xattr_invalid, \ ++ .val_to_text = bch2_xattr_to_text, \ ++} ++ ++static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) ++{ ++ return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + ++ name_len + val_len, sizeof(u64)); ++} ++ ++#define xattr_val(_xattr) \ ++ ((void *) (_xattr)->x_name + (_xattr)->x_name_len) ++ ++struct xattr_search_key { ++ u8 type; ++ struct qstr name; ++}; ++ ++#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ ++ { .type = _type, .name = QSTR_INIT(_name, _len) }) ++ ++struct dentry; ++struct xattr_handler; ++struct bch_hash_info; ++struct bch_inode_info; ++ ++int bch2_xattr_get(struct bch_fs *, struct bch_inode_info *, ++ const char *, void *, size_t, int); ++ ++int bch2_xattr_set(struct btree_trans *, u64, const struct bch_hash_info *, ++ const char *, const void *, size_t, int, int); ++ ++ssize_t bch2_xattr_list(struct dentry *, char *, size_t); ++ ++extern const struct xattr_handler *bch2_xattr_handlers[]; ++ ++#endif /* _BCACHEFS_XATTR_H */ +diff --git a/fs/cifs/file.c b/fs/cifs/file.c +index be46fab4c96d..a17a21181e18 100644 +--- a/fs/cifs/file.c ++++ b/fs/cifs/file.c +@@ -4296,20 +4296,12 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + + page = lru_to_page(page_list); + +- /* +- * Lock the page and put it in the cache. Since no one else +- * should have access to this page, we're safe to simply set +- * PG_locked without checking it first. 
+- */ +- __SetPageLocked(page); +- rc = add_to_page_cache_locked(page, mapping, +- page->index, gfp); ++ rc = add_to_page_cache(page, mapping, ++ page->index, gfp); + + /* give up if we can't stick it in the cache */ +- if (rc) { +- __ClearPageLocked(page); ++ if (rc) + return rc; +- } + + /* move first page to the tmplist */ + *offset = (loff_t)page->index << PAGE_SHIFT; +@@ -4328,12 +4320,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, + if (*bytes + PAGE_SIZE > rsize) + break; + +- __SetPageLocked(page); +- rc = add_to_page_cache_locked(page, mapping, page->index, gfp); +- if (rc) { +- __ClearPageLocked(page); ++ rc = add_to_page_cache(page, mapping, page->index, gfp); ++ if (rc) + break; +- } + list_move_tail(&page->lru, tmplist); + (*bytes) += PAGE_SIZE; + expected_index++; +diff --git a/fs/dcache.c b/fs/dcache.c +index ea0485861d93..b4d6e3e86285 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -3132,9 +3132,8 @@ void d_genocide(struct dentry *parent) + + EXPORT_SYMBOL(d_genocide); + +-void d_tmpfile(struct dentry *dentry, struct inode *inode) ++void d_mark_tmpfile(struct dentry *dentry, struct inode *inode) + { +- inode_dec_link_count(inode); + BUG_ON(dentry->d_name.name != dentry->d_iname || + !hlist_unhashed(&dentry->d_u.d_alias) || + !d_unlinked(dentry)); +@@ -3144,6 +3143,13 @@ void d_tmpfile(struct dentry *dentry, struct inode *inode) + (unsigned long long)inode->i_ino); + spin_unlock(&dentry->d_lock); + spin_unlock(&dentry->d_parent->d_lock); ++} ++EXPORT_SYMBOL(d_mark_tmpfile); ++ ++void d_tmpfile(struct dentry *dentry, struct inode *inode) ++{ ++ inode_dec_link_count(inode); ++ d_mark_tmpfile(dentry, inode); + d_instantiate(dentry, inode); + } + EXPORT_SYMBOL(d_tmpfile); +diff --git a/fs/inode.c b/fs/inode.c +index 72c4c347afb7..e70ad3d2d01c 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -1578,6 +1578,46 @@ int insert_inode_locked(struct inode *inode) + } + EXPORT_SYMBOL(insert_inode_locked); + ++struct inode *insert_inode_locked2(struct inode *inode) ++{ ++ struct super_block *sb = inode->i_sb; ++ ino_t ino = inode->i_ino; ++ struct hlist_head *head = inode_hashtable + hash(sb, ino); ++ ++ while (1) { ++ struct inode *old = NULL; ++ spin_lock(&inode_hash_lock); ++ hlist_for_each_entry(old, head, i_hash) { ++ if (old->i_ino != ino) ++ continue; ++ if (old->i_sb != sb) ++ continue; ++ spin_lock(&old->i_lock); ++ if (old->i_state & (I_FREEING|I_WILL_FREE)) { ++ spin_unlock(&old->i_lock); ++ continue; ++ } ++ break; ++ } ++ if (likely(!old)) { ++ spin_lock(&inode->i_lock); ++ inode->i_state |= I_NEW | I_CREATING; ++ hlist_add_head(&inode->i_hash, head); ++ spin_unlock(&inode->i_lock); ++ spin_unlock(&inode_hash_lock); ++ return NULL; ++ } ++ __iget(old); ++ spin_unlock(&old->i_lock); ++ spin_unlock(&inode_hash_lock); ++ wait_on_inode(old); ++ if (unlikely(!inode_unhashed(old))) ++ return old; ++ iput(old); ++ } ++} ++EXPORT_SYMBOL(insert_inode_locked2); ++ + int insert_inode_locked4(struct inode *inode, unsigned long hashval, + int (*test)(struct inode *, void *), void *data) + { +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 868e11face00..d9e3b7b0175e 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -936,6 +936,7 @@ extern const char *blk_op_str(unsigned int op); + + int blk_status_to_errno(blk_status_t status); + blk_status_t errno_to_blk_status(int errno); ++const char *blk_status_to_str(blk_status_t status); + + int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); + +diff --git 
a/include/linux/closure.h b/include/linux/closure.h +new file mode 100644 +index 000000000000..36b4a83f9b77 +--- /dev/null ++++ b/include/linux/closure.h +@@ -0,0 +1,399 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef _LINUX_CLOSURE_H ++#define _LINUX_CLOSURE_H ++ ++#include ++#include ++#include ++#include ++ ++/* ++ * Closure is perhaps the most overused and abused term in computer science, but ++ * since I've been unable to come up with anything better you're stuck with it ++ * again. ++ * ++ * What are closures? ++ * ++ * They embed a refcount. The basic idea is they count "things that are in ++ * progress" - in flight bios, some other thread that's doing something else - ++ * anything you might want to wait on. ++ * ++ * The refcount may be manipulated with closure_get() and closure_put(). ++ * closure_put() is where many of the interesting things happen, when it causes ++ * the refcount to go to 0. ++ * ++ * Closures can be used to wait on things both synchronously and asynchronously, ++ * and synchronous and asynchronous use can be mixed without restriction. To ++ * wait synchronously, use closure_sync() - you will sleep until your closure's ++ * refcount hits 1. ++ * ++ * To wait asynchronously, use ++ * continue_at(cl, next_function, workqueue); ++ * ++ * passing it, as you might expect, the function to run when nothing is pending ++ * and the workqueue to run that function out of. ++ * ++ * continue_at() also, critically, requires a 'return' immediately following the ++ * location where this macro is referenced, to return to the calling function. ++ * There's good reason for this. ++ * ++ * To use safely closures asynchronously, they must always have a refcount while ++ * they are running owned by the thread that is running them. Otherwise, suppose ++ * you submit some bios and wish to have a function run when they all complete: ++ * ++ * foo_endio(struct bio *bio) ++ * { ++ * closure_put(cl); ++ * } ++ * ++ * closure_init(cl); ++ * ++ * do_stuff(); ++ * closure_get(cl); ++ * bio1->bi_endio = foo_endio; ++ * bio_submit(bio1); ++ * ++ * do_more_stuff(); ++ * closure_get(cl); ++ * bio2->bi_endio = foo_endio; ++ * bio_submit(bio2); ++ * ++ * continue_at(cl, complete_some_read, system_wq); ++ * ++ * If closure's refcount started at 0, complete_some_read() could run before the ++ * second bio was submitted - which is almost always not what you want! More ++ * importantly, it wouldn't be possible to say whether the original thread or ++ * complete_some_read()'s thread owned the closure - and whatever state it was ++ * associated with! ++ * ++ * So, closure_init() initializes a closure's refcount to 1 - and when a ++ * closure_fn is run, the refcount will be reset to 1 first. ++ * ++ * Then, the rule is - if you got the refcount with closure_get(), release it ++ * with closure_put() (i.e, in a bio->bi_endio function). If you have a refcount ++ * on a closure because you called closure_init() or you were run out of a ++ * closure - _always_ use continue_at(). Doing so consistently will help ++ * eliminate an entire class of particularly pernicious races. ++ * ++ * Lastly, you might have a wait list dedicated to a specific event, and have no ++ * need for specifying the condition - you just want to wait until someone runs ++ * closure_wake_up() on the appropriate wait list. In that case, just use ++ * closure_wait(). It will return either true or false, depending on whether the ++ * closure was already on a wait list or not - a closure can only be on one wait ++ * list at a time. 
++ * ++ * Parents: ++ * ++ * closure_init() takes two arguments - it takes the closure to initialize, and ++ * a (possibly null) parent. ++ * ++ * If parent is non null, the new closure will have a refcount for its lifetime; ++ * a closure is considered to be "finished" when its refcount hits 0 and the ++ * function to run is null. Hence ++ * ++ * continue_at(cl, NULL, NULL); ++ * ++ * returns up the (spaghetti) stack of closures, precisely like normal return ++ * returns up the C stack. continue_at() with non null fn is better thought of ++ * as doing a tail call. ++ * ++ * All this implies that a closure should typically be embedded in a particular ++ * struct (which its refcount will normally control the lifetime of), and that ++ * struct can very much be thought of as a stack frame. ++ */ ++ ++struct closure; ++struct closure_syncer; ++typedef void (closure_fn) (struct closure *); ++extern struct dentry *bcache_debug; ++ ++struct closure_waitlist { ++ struct llist_head list; ++}; ++ ++enum closure_state { ++ /* ++ * CLOSURE_WAITING: Set iff the closure is on a waitlist. Must be set by ++ * the thread that owns the closure, and cleared by the thread that's ++ * waking up the closure. ++ * ++ * The rest are for debugging and don't affect behaviour: ++ * ++ * CLOSURE_RUNNING: Set when a closure is running (i.e. by ++ * closure_init() and when closure_put() runs then next function), and ++ * must be cleared before remaining hits 0. Primarily to help guard ++ * against incorrect usage and accidentally transferring references. ++ * continue_at() and closure_return() clear it for you, if you're doing ++ * something unusual you can use closure_set_dead() which also helps ++ * annotate where references are being transferred. ++ */ ++ ++ CLOSURE_BITS_START = (1U << 26), ++ CLOSURE_DESTRUCTOR = (1U << 26), ++ CLOSURE_WAITING = (1U << 28), ++ CLOSURE_RUNNING = (1U << 30), ++}; ++ ++#define CLOSURE_GUARD_MASK \ ++ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1) ++ ++#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1) ++#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING) ++ ++struct closure { ++ union { ++ struct { ++ struct workqueue_struct *wq; ++ struct closure_syncer *s; ++ struct llist_node list; ++ closure_fn *fn; ++ }; ++ struct work_struct work; ++ }; ++ ++ struct closure *parent; ++ ++ atomic_t remaining; ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++#define CLOSURE_MAGIC_DEAD 0xc054dead ++#define CLOSURE_MAGIC_ALIVE 0xc054a11e ++ ++ unsigned int magic; ++ struct list_head all; ++ unsigned long ip; ++ unsigned long waiting_on; ++#endif ++}; ++ ++void closure_sub(struct closure *cl, int v); ++void closure_put(struct closure *cl); ++void __closure_wake_up(struct closure_waitlist *list); ++bool closure_wait(struct closure_waitlist *list, struct closure *cl); ++void __closure_sync(struct closure *cl); ++ ++/** ++ * closure_sync - sleep until a closure a closure has nothing left to wait on ++ * ++ * Sleeps until the refcount hits 1 - the thread that's running the closure owns ++ * the last refcount. 
++ */ ++static inline void closure_sync(struct closure *cl) ++{ ++ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) ++ __closure_sync(cl); ++} ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++ ++void closure_debug_create(struct closure *cl); ++void closure_debug_destroy(struct closure *cl); ++ ++#else ++ ++static inline void closure_debug_create(struct closure *cl) {} ++static inline void closure_debug_destroy(struct closure *cl) {} ++ ++#endif ++ ++static inline void closure_set_ip(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->ip = _THIS_IP_; ++#endif ++} ++ ++static inline void closure_set_ret_ip(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->ip = _RET_IP_; ++#endif ++} ++ ++static inline void closure_set_waiting(struct closure *cl, unsigned long f) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ cl->waiting_on = f; ++#endif ++} ++ ++static inline void closure_set_stopped(struct closure *cl) ++{ ++ atomic_sub(CLOSURE_RUNNING, &cl->remaining); ++} ++ ++static inline void set_closure_fn(struct closure *cl, closure_fn *fn, ++ struct workqueue_struct *wq) ++{ ++ closure_set_ip(cl); ++ cl->fn = fn; ++ cl->wq = wq; ++ /* between atomic_dec() in closure_put() */ ++ smp_mb__before_atomic(); ++} ++ ++static inline void closure_queue(struct closure *cl) ++{ ++ struct workqueue_struct *wq = cl->wq; ++ /** ++ * Changes made to closure, work_struct, or a couple of other structs ++ * may cause work.func not pointing to the right location. ++ */ ++ BUILD_BUG_ON(offsetof(struct closure, fn) ++ != offsetof(struct work_struct, func)); ++ ++ if (wq) { ++ INIT_WORK(&cl->work, cl->work.func); ++ BUG_ON(!queue_work(wq, &cl->work)); ++ } else ++ cl->fn(cl); ++} ++ ++/** ++ * closure_get - increment a closure's refcount ++ */ ++static inline void closure_get(struct closure *cl) ++{ ++#ifdef CONFIG_DEBUG_CLOSURES ++ BUG_ON((atomic_inc_return(&cl->remaining) & ++ CLOSURE_REMAINING_MASK) <= 1); ++#else ++ atomic_inc(&cl->remaining); ++#endif ++} ++ ++/** ++ * closure_init - Initialize a closure, setting the refcount to 1 ++ * @cl: closure to initialize ++ * @parent: parent of the new closure. cl will take a refcount on it for its ++ * lifetime; may be NULL. ++ */ ++static inline void closure_init(struct closure *cl, struct closure *parent) ++{ ++ cl->fn = NULL; ++ cl->parent = parent; ++ if (parent) ++ closure_get(parent); ++ ++ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); ++ ++ closure_debug_create(cl); ++ closure_set_ip(cl); ++} ++ ++static inline void closure_init_stack(struct closure *cl) ++{ ++ memset(cl, 0, sizeof(struct closure)); ++ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER); ++} ++ ++/** ++ * closure_wake_up - wake up all closures on a wait list, ++ * with memory barrier ++ */ ++static inline void closure_wake_up(struct closure_waitlist *list) ++{ ++ /* Memory barrier for the wait list */ ++ smp_mb(); ++ __closure_wake_up(list); ++} ++ ++/** ++ * continue_at - jump to another function with barrier ++ * ++ * After @cl is no longer waiting on anything (i.e. all outstanding refs have ++ * been dropped with closure_put()), it will resume execution at @fn running out ++ * of @wq (or, if @wq is NULL, @fn will be called by closure_put() directly). ++ * ++ * This is because after calling continue_at() you no longer have a ref on @cl, ++ * and whatever @cl owns may be freed out from under you - a running closure fn ++ * has a ref on its own closure which continue_at() drops. ++ * ++ * Note you are expected to immediately return after using this macro. 
++ */ ++#define continue_at(_cl, _fn, _wq) \ ++do { \ ++ set_closure_fn(_cl, _fn, _wq); \ ++ closure_sub(_cl, CLOSURE_RUNNING + 1); \ ++} while (0) ++ ++/** ++ * closure_return - finish execution of a closure ++ * ++ * This is used to indicate that @cl is finished: when all outstanding refs on ++ * @cl have been dropped @cl's ref on its parent closure (as passed to ++ * closure_init()) will be dropped, if one was specified - thus this can be ++ * thought of as returning to the parent closure. ++ */ ++#define closure_return(_cl) continue_at((_cl), NULL, NULL) ++ ++/** ++ * continue_at_nobarrier - jump to another function without barrier ++ * ++ * Causes @fn to be executed out of @cl, in @wq context (or called directly if ++ * @wq is NULL). ++ * ++ * The ref the caller of continue_at_nobarrier() had on @cl is now owned by @fn, ++ * thus it's not safe to touch anything protected by @cl after a ++ * continue_at_nobarrier(). ++ */ ++#define continue_at_nobarrier(_cl, _fn, _wq) \ ++do { \ ++ set_closure_fn(_cl, _fn, _wq); \ ++ closure_queue(_cl); \ ++} while (0) ++ ++/** ++ * closure_return_with_destructor - finish execution of a closure, ++ * with destructor ++ * ++ * Works like closure_return(), except @destructor will be called when all ++ * outstanding refs on @cl have been dropped; @destructor may be used to safely ++ * free the memory occupied by @cl, and it is called with the ref on the parent ++ * closure still held - so @destructor could safely return an item to a ++ * freelist protected by @cl's parent. ++ */ ++#define closure_return_with_destructor(_cl, _destructor) \ ++do { \ ++ set_closure_fn(_cl, _destructor, NULL); \ ++ closure_sub(_cl, CLOSURE_RUNNING - CLOSURE_DESTRUCTOR + 1); \ ++} while (0) ++ ++/** ++ * closure_call - execute @fn out of a new, uninitialized closure ++ * ++ * Typically used when running out of one closure, and we want to run @fn ++ * asynchronously out of a new closure - @parent will then wait for @cl to ++ * finish. 
++ */ ++static inline void closure_call(struct closure *cl, closure_fn fn, ++ struct workqueue_struct *wq, ++ struct closure *parent) ++{ ++ closure_init(cl, parent); ++ continue_at_nobarrier(cl, fn, wq); ++} ++ ++#define __closure_wait_event(waitlist, _cond) \ ++do { \ ++ struct closure cl; \ ++ \ ++ closure_init_stack(&cl); \ ++ \ ++ while (1) { \ ++ closure_wait(waitlist, &cl); \ ++ if (_cond) \ ++ break; \ ++ closure_sync(&cl); \ ++ } \ ++ closure_wake_up(waitlist); \ ++ closure_sync(&cl); \ ++} while (0) ++ ++#define closure_wait_event(waitlist, _cond) \ ++do { \ ++ if (!(_cond)) \ ++ __closure_wait_event(waitlist, _cond); \ ++} while (0) ++ ++#endif /* _LINUX_CLOSURE_H */ +diff --git a/include/linux/compiler_attributes.h b/include/linux/compiler_attributes.h +index ea7b756b1c8f..51658b72de72 100644 +--- a/include/linux/compiler_attributes.h ++++ b/include/linux/compiler_attributes.h +@@ -278,4 +278,9 @@ + */ + #define __weak __attribute__((__weak__)) + ++/* ++ * gcc: https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html#index-flatten-function-attribute ++ */ ++#define __flatten __attribute__((flatten)) ++ + #endif /* __LINUX_COMPILER_ATTRIBUTES_H */ +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index 65d975bf9390..008573618071 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -256,6 +256,7 @@ extern struct dentry * d_make_root(struct inode *); + /* - the ramfs-type tree */ + extern void d_genocide(struct dentry *); + ++extern void d_mark_tmpfile(struct dentry *, struct inode *); + extern void d_tmpfile(struct dentry *, struct inode *); + + extern struct dentry *d_find_alias(struct inode *); +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 7519ae003a08..305d316f01f3 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -2953,6 +2953,7 @@ extern struct inode *find_inode_rcu(struct super_block *, unsigned long, + extern struct inode *find_inode_by_ino_rcu(struct super_block *, unsigned long); + extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *); + extern int insert_inode_locked(struct inode *); ++extern struct inode *insert_inode_locked2(struct inode *); + #ifdef CONFIG_DEBUG_LOCK_ALLOC + extern void lockdep_annotate_inode_mutex_key(struct inode *inode); + #else +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 434c9c34aeb6..620535006624 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -689,10 +689,15 @@ static inline int fault_in_pages_readable(const char __user *uaddr, int size) + return 0; + } + +-int add_to_page_cache_locked(struct page *page, struct address_space *mapping, +- pgoff_t index, gfp_t gfp_mask); ++int add_to_page_cache(struct page *page, struct address_space *mapping, ++ pgoff_t index, gfp_t gfp_mask); + int add_to_page_cache_lru(struct page *page, struct address_space *mapping, + pgoff_t index, gfp_t gfp_mask); ++int add_to_page_cache_lru_vec(struct address_space *mapping, ++ struct page **pages, ++ unsigned nr_pages, ++ pgoff_t offset, gfp_t gfp_mask); ++ + extern void delete_from_page_cache(struct page *page); + extern void __delete_from_page_cache(struct page *page, void *shadow); + int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); +@@ -710,22 +715,6 @@ void page_cache_readahead_unbounded(struct address_space *, struct file *, + pgoff_t index, unsigned long nr_to_read, + unsigned long lookahead_count); + +-/* +- * Like add_to_page_cache_locked, but used to add 
newly allocated pages: +- * the page is new, so we can just run __SetPageLocked() against it. +- */ +-static inline int add_to_page_cache(struct page *page, +- struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) +-{ +- int error; +- +- __SetPageLocked(page); +- error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); +- if (unlikely(error)) +- __ClearPageLocked(page); +- return error; +-} +- + /** + * struct readahead_control - Describes a readahead request. + * +diff --git a/include/linux/sched.h b/include/linux/sched.h +index afe01e232935..793b07788062 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -747,6 +747,7 @@ struct task_struct { + + struct mm_struct *mm; + struct mm_struct *active_mm; ++ struct address_space *faults_disabled_mapping; + + /* Per-thread vma caching: */ + struct vmacache vmacache; +diff --git a/include/linux/six.h b/include/linux/six.h +new file mode 100644 +index 000000000000..a16e94f482e9 +--- /dev/null ++++ b/include/linux/six.h +@@ -0,0 +1,197 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++ ++#ifndef _LINUX_SIX_H ++#define _LINUX_SIX_H ++ ++/* ++ * Shared/intent/exclusive locks: sleepable read/write locks, much like rw ++ * semaphores, except with a third intermediate state, intent. Basic operations ++ * are: ++ * ++ * six_lock_read(&foo->lock); ++ * six_unlock_read(&foo->lock); ++ * ++ * six_lock_intent(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * ++ * Intent locks block other intent locks, but do not block read locks, and you ++ * must have an intent lock held before taking a write lock, like so: ++ * ++ * six_lock_intent(&foo->lock); ++ * six_lock_write(&foo->lock); ++ * six_unlock_write(&foo->lock); ++ * six_unlock_intent(&foo->lock); ++ * ++ * Other operations: ++ * ++ * six_trylock_read() ++ * six_trylock_intent() ++ * six_trylock_write() ++ * ++ * six_lock_downgrade(): convert from intent to read ++ * six_lock_tryupgrade(): attempt to convert from read to intent ++ * ++ * Locks also embed a sequence number, which is incremented when the lock is ++ * locked or unlocked for write. The current sequence number can be grabbed ++ * while a lock is held from lock->state.seq; then, if you drop the lock you can ++ * use six_relock_(read|intent_write)(lock, seq) to attempt to retake the lock ++ * iff it hasn't been locked for write in the meantime. ++ * ++ * There are also operations that take the lock type as a parameter, where the ++ * type is one of SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write: ++ * ++ * six_lock_type(lock, type) ++ * six_unlock_type(lock, type) ++ * six_relock(lock, type, seq) ++ * six_trylock_type(lock, type) ++ * six_trylock_convert(lock, from, to) ++ * ++ * A lock may be held multiple types by the same thread (for read or intent, ++ * not write). However, the six locks code does _not_ implement the actual ++ * recursive checks itself though - rather, if your code (e.g. btree iterator ++ * code) knows that the current thread already has a lock held, and for the ++ * correct type, six_lock_increment() may be used to bump up the counter for ++ * that type - the only effect is that one more call to unlock will be required ++ * before the lock is unlocked. 
++ */ ++ ++#include ++#include ++#include ++#include ++ ++#define SIX_LOCK_SEPARATE_LOCKFNS ++ ++union six_lock_state { ++ struct { ++ atomic64_t counter; ++ }; ++ ++ struct { ++ u64 v; ++ }; ++ ++ struct { ++ /* for waitlist_bitnr() */ ++ unsigned long l; ++ }; ++ ++ struct { ++ unsigned read_lock:28; ++ unsigned intent_lock:1; ++ unsigned waiters:3; ++ /* ++ * seq works much like in seqlocks: it's incremented every time ++ * we lock and unlock for write. ++ * ++ * If it's odd write lock is held, even unlocked. ++ * ++ * Thus readers can unlock, and then lock again later iff it ++ * hasn't been modified in the meantime. ++ */ ++ u32 seq; ++ }; ++}; ++ ++enum six_lock_type { ++ SIX_LOCK_read, ++ SIX_LOCK_intent, ++ SIX_LOCK_write, ++}; ++ ++struct six_lock { ++ union six_lock_state state; ++ unsigned intent_lock_recurse; ++ struct task_struct *owner; ++ struct optimistic_spin_queue osq; ++ ++ raw_spinlock_t wait_lock; ++ struct list_head wait_list[2]; ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ struct lockdep_map dep_map; ++#endif ++}; ++ ++typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); ++ ++static __always_inline void __six_lock_init(struct six_lock *lock, ++ const char *name, ++ struct lock_class_key *key) ++{ ++ atomic64_set(&lock->state.counter, 0); ++ raw_spin_lock_init(&lock->wait_lock); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_read]); ++ INIT_LIST_HEAD(&lock->wait_list[SIX_LOCK_intent]); ++#ifdef CONFIG_DEBUG_LOCK_ALLOC ++ debug_check_no_locks_freed((void *) lock, sizeof(*lock)); ++ lockdep_init_map(&lock->dep_map, name, key, 0); ++#endif ++} ++ ++#define six_lock_init(lock) \ ++do { \ ++ static struct lock_class_key __key; \ ++ \ ++ __six_lock_init((lock), #lock, &__key); \ ++} while (0) ++ ++#define __SIX_VAL(field, _v) (((union six_lock_state) { .field = _v }).v) ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *); \ ++bool six_relock_##type(struct six_lock *, u32); \ ++int six_lock_##type(struct six_lock *, six_lock_should_sleep_fn, void *);\ ++void six_unlock_##type(struct six_lock *); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++#undef __SIX_LOCK ++ ++#define SIX_LOCK_DISPATCH(type, fn, ...) 
\ ++ switch (type) { \ ++ case SIX_LOCK_read: \ ++ return fn##_read(__VA_ARGS__); \ ++ case SIX_LOCK_intent: \ ++ return fn##_intent(__VA_ARGS__); \ ++ case SIX_LOCK_write: \ ++ return fn##_write(__VA_ARGS__); \ ++ default: \ ++ BUG(); \ ++ } ++ ++static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_trylock, lock); ++} ++ ++static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ SIX_LOCK_DISPATCH(type, six_relock, lock, seq); ++} ++ ++static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ SIX_LOCK_DISPATCH(type, six_lock, lock, should_sleep_fn, p); ++} ++ ++static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ SIX_LOCK_DISPATCH(type, six_unlock, lock); ++} ++ ++void six_lock_downgrade(struct six_lock *); ++bool six_lock_tryupgrade(struct six_lock *); ++bool six_trylock_convert(struct six_lock *, enum six_lock_type, ++ enum six_lock_type); ++ ++void six_lock_increment(struct six_lock *, enum six_lock_type); ++ ++void six_lock_wakeup_all(struct six_lock *); ++ ++#endif /* _LINUX_SIX_H */ +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 0221f852a7e1..f81f60d891ac 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -106,6 +106,7 @@ extern void *vzalloc(unsigned long size); + extern void *vmalloc_user(unsigned long size); + extern void *vmalloc_node(unsigned long size, int node); + extern void *vzalloc_node(unsigned long size, int node); ++extern void *vmalloc_exec(unsigned long size, gfp_t gfp_mask); + extern void *vmalloc_32(unsigned long size); + extern void *vmalloc_32_user(unsigned long size); + extern void *__vmalloc(unsigned long size, gfp_t gfp_mask); +diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h +new file mode 100644 +index 000000000000..9b4e8295ed75 +--- /dev/null ++++ b/include/trace/events/bcachefs.h +@@ -0,0 +1,664 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#undef TRACE_SYSTEM ++#define TRACE_SYSTEM bcachefs ++ ++#if !defined(_TRACE_BCACHE_H) || defined(TRACE_HEADER_MULTI_READ) ++#define _TRACE_BCACHE_H ++ ++#include ++ ++DECLARE_EVENT_CLASS(bpos, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = p->inode; ++ __entry->offset = p->offset; ++ ), ++ ++ TP_printk("%llu:%llu", __entry->inode, __entry->offset) ++); ++ ++DECLARE_EVENT_CLASS(bkey, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k), ++ ++ TP_STRUCT__entry( ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, size ) ++ ), ++ ++ TP_fast_assign( ++ __entry->inode = k->p.inode; ++ __entry->offset = k->p.offset; ++ __entry->size = k->size; ++ ), ++ ++ TP_printk("%llu:%llu len %u", __entry->inode, ++ __entry->offset, __entry->size) ++); ++ ++DECLARE_EVENT_CLASS(bch_fs, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU", __entry->uuid) ++); ++ ++DECLARE_EVENT_CLASS(bio, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio), ++ ++ TP_STRUCT__entry( ++ __field(dev_t, dev ) ++ __field(sector_t, sector ) ++ __field(unsigned int, nr_sector ) ++ __array(char, rwbs, 6 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = bio->bi_disk ? 
bio_dev(bio) : 0; ++ __entry->sector = bio->bi_iter.bi_sector; ++ __entry->nr_sector = bio->bi_iter.bi_size >> 9; ++ blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); ++ ), ++ ++ TP_printk("%d,%d %s %llu + %u", ++ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, ++ (unsigned long long)__entry->sector, __entry->nr_sector) ++); ++ ++/* io.c: */ ++ ++DEFINE_EVENT(bio, read_split, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_bounce, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, read_retry, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++DEFINE_EVENT(bio, promote, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++/* Journal */ ++ ++DEFINE_EVENT(bch_fs, journal_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, journal_entry_full, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bio, journal_write, ++ TP_PROTO(struct bio *bio), ++ TP_ARGS(bio) ++); ++ ++/* bset.c: */ ++ ++DEFINE_EVENT(bpos, bkey_pack_pos_fail, ++ TP_PROTO(struct bpos *p), ++ TP_ARGS(p) ++); ++ ++/* Btree */ ++ ++DECLARE_EVENT_CLASS(btree_node, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u8, level ) ++ __field(u8, id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->level = b->c.level; ++ __entry->id = b->c.btree_id; ++ __entry->inode = b->key.k.p.inode; ++ __entry->offset = b->key.k.p.offset; ++ ), ++ ++ TP_printk("%pU %u id %u %llu:%llu", ++ __entry->uuid, __entry->level, __entry->id, ++ __entry->inode, __entry->offset) ++); ++ ++DEFINE_EVENT(btree_node, btree_read, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_write, ++ TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), ++ TP_ARGS(b, bytes, sectors), ++ ++ TP_STRUCT__entry( ++ __field(enum btree_node_type, type) ++ __field(unsigned, bytes ) ++ __field(unsigned, sectors ) ++ ), ++ ++ TP_fast_assign( ++ __entry->type = btree_node_type(b); ++ __entry->bytes = bytes; ++ __entry->sectors = sectors; ++ ), ++ ++ TP_printk("bkey type %u bytes %u sectors %u", ++ __entry->type , __entry->bytes, __entry->sectors) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_alloc, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_free, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_node_reap, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DECLARE_EVENT_CLASS(btree_node_cannibalize_lock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU", __entry->uuid) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock_fail, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize_lock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(btree_node_cannibalize_lock, btree_node_cannibalize, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, btree_node_cannibalize_unlock, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++TRACE_EVENT(btree_reserve_get_fail, ++ TP_PROTO(struct bch_fs *c, size_t required, struct closure 
*cl), ++ TP_ARGS(c, required, cl), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(size_t, required ) ++ __field(struct closure *, cl ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->required = required; ++ __entry->cl = cl; ++ ), ++ ++ TP_printk("%pU required %zu by %p", __entry->uuid, ++ __entry->required, __entry->cl) ++); ++ ++TRACE_EVENT(btree_insert_key, ++ TP_PROTO(struct bch_fs *c, struct btree *b, struct bkey_i *k), ++ TP_ARGS(c, b, k), ++ ++ TP_STRUCT__entry( ++ __field(u8, id ) ++ __field(u64, inode ) ++ __field(u64, offset ) ++ __field(u32, size ) ++ ), ++ ++ TP_fast_assign( ++ __entry->id = b->c.btree_id; ++ __entry->inode = k->k.p.inode; ++ __entry->offset = k->k.p.offset; ++ __entry->size = k->k.size; ++ ), ++ ++ TP_printk("btree %u: %llu:%llu len %u", __entry->id, ++ __entry->inode, __entry->offset, __entry->size) ++); ++ ++DEFINE_EVENT(btree_node, btree_split, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_compact, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_merge, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_set_root, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++/* Garbage collection */ ++ ++DEFINE_EVENT(btree_node, btree_gc_coalesce, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++TRACE_EVENT(btree_gc_coalesce_fail, ++ TP_PROTO(struct bch_fs *c, int reason), ++ TP_ARGS(c, reason), ++ ++ TP_STRUCT__entry( ++ __field(u8, reason ) ++ __array(char, uuid, 16 ) ++ ), ++ ++ TP_fast_assign( ++ __entry->reason = reason; ++ memcpy(__entry->uuid, c->disk_sb.sb->user_uuid.b, 16); ++ ), ++ ++ TP_printk("%pU: %u", __entry->uuid, __entry->reason) ++); ++ ++DEFINE_EVENT(btree_node, btree_gc_rewrite_node, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(btree_node, btree_gc_rewrite_node_fail, ++ TP_PROTO(struct bch_fs *c, struct btree *b), ++ TP_ARGS(c, b) ++); ++ ++DEFINE_EVENT(bch_fs, gc_start, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_end, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_coalesce_start, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_coalesce_end, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DEFINE_EVENT(bch_fs, gc_cannot_inc_gens, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++/* Allocator */ ++ ++TRACE_EVENT(alloc_batch, ++ TP_PROTO(struct bch_dev *ca, size_t free, size_t total), ++ TP_ARGS(ca, free, total), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(size_t, free ) ++ __field(size_t, total ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, ca->uuid.b, 16); ++ __entry->free = free; ++ __entry->total = total; ++ ), ++ ++ TP_printk("%pU free %zu total %zu", ++ __entry->uuid, __entry->free, __entry->total) ++); ++ ++TRACE_EVENT(invalidate, ++ TP_PROTO(struct bch_dev *ca, u64 offset, unsigned sectors), ++ TP_ARGS(ca, offset, sectors), ++ ++ TP_STRUCT__entry( ++ __field(unsigned, sectors ) ++ __field(dev_t, dev ) ++ __field(__u64, offset ) ++ ), ++ ++ TP_fast_assign( ++ __entry->dev = ca->disk_sb.bdev->bd_dev; ++ __entry->offset = offset, ++ __entry->sectors = sectors; ++ ), ++ ++ TP_printk("invalidated %u sectors at %d,%d sector=%llu", ++ __entry->sectors, MAJOR(__entry->dev), ++ MINOR(__entry->dev), 
__entry->offset) ++); ++ ++DEFINE_EVENT(bch_fs, rescale_prios, ++ TP_PROTO(struct bch_fs *c), ++ TP_ARGS(c) ++); ++ ++DECLARE_EVENT_CLASS(bucket_alloc, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16) ++ __field(enum alloc_reserve, reserve ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, ca->uuid.b, 16); ++ __entry->reserve = reserve; ++ ), ++ ++ TP_printk("%pU reserve %d", __entry->uuid, __entry->reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, bucket_alloc, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, ++ TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), ++ TP_ARGS(ca, reserve) ++); ++ ++/* Moving IO */ ++ ++DEFINE_EVENT(bkey, move_extent, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_alloc_fail, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++DEFINE_EVENT(bkey, move_race, ++ TP_PROTO(const struct bkey *k), ++ TP_ARGS(k) ++); ++ ++TRACE_EVENT(move_data, ++ TP_PROTO(struct bch_fs *c, u64 sectors_moved, ++ u64 keys_moved), ++ TP_ARGS(c, sectors_moved, keys_moved), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, sectors_moved ) ++ __field(u64, keys_moved ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->sectors_moved = sectors_moved; ++ __entry->keys_moved = keys_moved; ++ ), ++ ++ TP_printk("%pU sectors_moved %llu keys_moved %llu", ++ __entry->uuid, __entry->sectors_moved, __entry->keys_moved) ++); ++ ++TRACE_EVENT(copygc, ++ TP_PROTO(struct bch_fs *c, ++ u64 sectors_moved, u64 sectors_not_moved, ++ u64 buckets_moved, u64 buckets_not_moved), ++ TP_ARGS(c, ++ sectors_moved, sectors_not_moved, ++ buckets_moved, buckets_not_moved), ++ ++ TP_STRUCT__entry( ++ __array(char, uuid, 16 ) ++ __field(u64, sectors_moved ) ++ __field(u64, sectors_not_moved ) ++ __field(u64, buckets_moved ) ++ __field(u64, buckets_not_moved ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->uuid, c->sb.user_uuid.b, 16); ++ __entry->sectors_moved = sectors_moved; ++ __entry->sectors_not_moved = sectors_not_moved; ++ __entry->buckets_moved = buckets_moved; ++ __entry->buckets_not_moved = buckets_moved; ++ ), ++ ++ TP_printk("%pU sectors moved %llu remain %llu buckets moved %llu remain %llu", ++ __entry->uuid, ++ __entry->sectors_moved, __entry->sectors_not_moved, ++ __entry->buckets_moved, __entry->buckets_not_moved) ++); ++ ++TRACE_EVENT(transaction_restart_ip, ++ TP_PROTO(unsigned long caller, unsigned long ip), ++ TP_ARGS(caller, ip), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, caller ) ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->caller = caller; ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%pF %pF", (void *) __entry->caller, (void *) __entry->ip) ++); ++ ++DECLARE_EVENT_CLASS(transaction_restart, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ ), ++ ++ TP_printk("%pf", (void *) __entry->ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_btree_node_reused, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_would_deadlock, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ 
++TRACE_EVENT(trans_restart_iters_realloced, ++ TP_PROTO(unsigned long ip, unsigned nr), ++ TP_ARGS(ip, nr), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ __field(unsigned, nr ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ __entry->nr = nr; ++ ), ++ ++ TP_printk("%pf nr %u", (void *) __entry->ip, __entry->nr) ++); ++ ++TRACE_EVENT(trans_restart_mem_realloced, ++ TP_PROTO(unsigned long ip, unsigned long bytes), ++ TP_ARGS(ip, bytes), ++ ++ TP_STRUCT__entry( ++ __field(unsigned long, ip ) ++ __field(unsigned long, bytes ) ++ ), ++ ++ TP_fast_assign( ++ __entry->ip = ip; ++ __entry->bytes = bytes; ++ ), ++ ++ TP_printk("%pf bytes %lu", (void *) __entry->ip, __entry->bytes) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_journal_res_get, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_journal_preres_get, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_mark_replicas, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_fault_inject, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_btree_node_split, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_mark, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_upgrade, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_iter_upgrade, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_traverse, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DEFINE_EVENT(transaction_restart, trans_restart_atomic, ++ TP_PROTO(unsigned long ip), ++ TP_ARGS(ip) ++); ++ ++DECLARE_EVENT_CLASS(node_lock_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq), ++ ++ TP_STRUCT__entry( ++ __field(u32, level) ++ __field(u32, iter_seq) ++ __field(u32, node) ++ __field(u32, node_seq) ++ ), ++ ++ TP_fast_assign( ++ __entry->level = level; ++ __entry->iter_seq = iter_seq; ++ __entry->node = node; ++ __entry->node_seq = node_seq; ++ ), ++ ++ TP_printk("level %u iter seq %u node %u node seq %u", ++ __entry->level, __entry->iter_seq, ++ __entry->node, __entry->node_seq) ++); ++ ++DEFINE_EVENT(node_lock_fail, node_upgrade_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq) ++); ++ ++DEFINE_EVENT(node_lock_fail, node_relock_fail, ++ TP_PROTO(unsigned level, u32 iter_seq, unsigned node, u32 node_seq), ++ TP_ARGS(level, iter_seq, node, node_seq) ++); ++ ++#endif /* _TRACE_BCACHE_H */ ++ ++/* This part must be outside protection */ ++#include +diff --git a/init/init_task.c b/init/init_task.c +index f6889fce64af..94706c45bb6a 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -84,6 +84,7 @@ struct task_struct init_task + .nr_cpus_allowed= NR_CPUS, + .mm = NULL, + .active_mm = &init_mm, ++ .faults_disabled_mapping = NULL, + .restart_block = { + .fn = do_no_restart_syscall, + }, +diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks +index 3de8fd11873b..ab8aa082ce56 100644 +--- a/kernel/Kconfig.locks ++++ b/kernel/Kconfig.locks +@@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB + config MMIOWB + def_bool y if ARCH_HAS_MMIOWB + depends on SMP ++ ++config SIXLOCKS ++ bool +diff --git 
a/kernel/locking/Makefile b/kernel/locking/Makefile +index 6d11cfb9b41f..4c13937e8f37 100644 +--- a/kernel/locking/Makefile ++++ b/kernel/locking/Makefile +@@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o + obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o + obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o + obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o ++obj-$(CONFIG_SIXLOCKS) += six.o +diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h +index b0be1560ed17..6388e42cfd68 100644 +--- a/kernel/locking/lockdep_internals.h ++++ b/kernel/locking/lockdep_internals.h +@@ -98,7 +98,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = + #else + #define MAX_LOCKDEP_ENTRIES 32768UL + +-#define MAX_LOCKDEP_CHAINS_BITS 16 ++#define MAX_LOCKDEP_CHAINS_BITS 18 + + /* + * Stack-trace: tightly packed array of stack backtrace +@@ -116,7 +116,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ = + + #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +-#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) ++#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*10) + + extern struct list_head all_lock_classes; + extern struct lock_chain lock_chains[]; +diff --git a/kernel/locking/six.c b/kernel/locking/six.c +new file mode 100644 +index 000000000000..49d46ed2e18e +--- /dev/null ++++ b/kernel/locking/six.c +@@ -0,0 +1,553 @@ ++// SPDX-License-Identifier: GPL-2.0 ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#ifdef DEBUG ++#define EBUG_ON(cond) BUG_ON(cond) ++#else ++#define EBUG_ON(cond) do {} while (0) ++#endif ++ ++#define six_acquire(l, t) lock_acquire(l, 0, t, 0, 0, NULL, _RET_IP_) ++#define six_release(l) lock_release(l, _RET_IP_) ++ ++struct six_lock_vals { ++ /* Value we add to the lock in order to take the lock: */ ++ u64 lock_val; ++ ++ /* If the lock has this value (used as a mask), taking the lock fails: */ ++ u64 lock_fail; ++ ++ /* Value we add to the lock in order to release the lock: */ ++ u64 unlock_val; ++ ++ /* Mask that indicates lock is held for this type: */ ++ u64 held_mask; ++ ++ /* Waitlist we wakeup when releasing the lock: */ ++ enum six_lock_type unlock_wakeup; ++}; ++ ++#define __SIX_LOCK_HELD_read __SIX_VAL(read_lock, ~0) ++#define __SIX_LOCK_HELD_intent __SIX_VAL(intent_lock, ~0) ++#define __SIX_LOCK_HELD_write __SIX_VAL(seq, 1) ++ ++#define LOCK_VALS { \ ++ [SIX_LOCK_read] = { \ ++ .lock_val = __SIX_VAL(read_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_write, \ ++ .unlock_val = -__SIX_VAL(read_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_read, \ ++ .unlock_wakeup = SIX_LOCK_write, \ ++ }, \ ++ [SIX_LOCK_intent] = { \ ++ .lock_val = __SIX_VAL(intent_lock, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_intent, \ ++ .unlock_val = -__SIX_VAL(intent_lock, 1), \ ++ .held_mask = __SIX_LOCK_HELD_intent, \ ++ .unlock_wakeup = SIX_LOCK_intent, \ ++ }, \ ++ [SIX_LOCK_write] = { \ ++ .lock_val = __SIX_VAL(seq, 1), \ ++ .lock_fail = __SIX_LOCK_HELD_read, \ ++ .unlock_val = __SIX_VAL(seq, 1), \ ++ .held_mask = __SIX_LOCK_HELD_write, \ ++ .unlock_wakeup = SIX_LOCK_read, \ ++ }, \ ++} ++ ++static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, ++ union six_lock_state old) ++{ ++ if (type != SIX_LOCK_intent) ++ return; ++ ++ if (!old.intent_lock) { ++ EBUG_ON(lock->owner); ++ lock->owner = current; ++ } else { ++ EBUG_ON(lock->owner != current); ++ } ++} ++ ++static __always_inline bool do_six_trylock_type(struct six_lock *lock, ++ enum six_lock_type type) ++{ ++ const struct 
six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ EBUG_ON(type == SIX_LOCK_write && lock->owner != current); ++ ++ do { ++ old.v = v; ++ ++ EBUG_ON(type == SIX_LOCK_write && ++ ((old.v & __SIX_LOCK_HELD_write) || ++ !(old.v & __SIX_LOCK_HELD_intent))); ++ ++ if (old.v & l[type].lock_fail) ++ return false; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, ++ old.v + l[type].lock_val)) != old.v); ++ ++ six_set_owner(lock, type, old); ++ return true; ++} ++ ++__always_inline __flatten ++static bool __six_trylock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ if (!do_six_trylock_type(lock, type)) ++ return false; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++__always_inline __flatten ++static bool __six_relock_type(struct six_lock *lock, enum six_lock_type type, ++ unsigned seq) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ do { ++ old.v = v; ++ ++ if (old.seq != seq || old.v & l[type].lock_fail) ++ return false; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, ++ old.v + l[type].lock_val)) != old.v); ++ ++ six_set_owner(lock, type, old); ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 1); ++ return true; ++} ++ ++struct six_lock_waiter { ++ struct list_head list; ++ struct task_struct *task; ++}; ++ ++/* This is probably up there with the more evil things I've done */ ++#define waitlist_bitnr(id) ilog2((((union six_lock_state) { .waiters = 1 << (id) }).l)) ++ ++#ifdef CONFIG_LOCK_SPIN_ON_OWNER ++ ++static inline int six_can_spin_on_owner(struct six_lock *lock) ++{ ++ struct task_struct *owner; ++ int retval = 1; ++ ++ if (need_resched()) ++ return 0; ++ ++ rcu_read_lock(); ++ owner = READ_ONCE(lock->owner); ++ if (owner) ++ retval = owner->on_cpu; ++ rcu_read_unlock(); ++ /* ++ * if lock->owner is not set, the mutex owner may have just acquired ++ * it and not set the owner yet or the mutex has been released. ++ */ ++ return retval; ++} ++ ++static inline bool six_spin_on_owner(struct six_lock *lock, ++ struct task_struct *owner) ++{ ++ bool ret = true; ++ ++ rcu_read_lock(); ++ while (lock->owner == owner) { ++ /* ++ * Ensure we emit the owner->on_cpu, dereference _after_ ++ * checking lock->owner still matches owner. If that fails, ++ * owner might point to freed memory. If it still matches, ++ * the rcu_read_lock() ensures the memory stays valid. ++ */ ++ barrier(); ++ ++ if (!owner->on_cpu || need_resched()) { ++ ret = false; ++ break; ++ } ++ ++ cpu_relax(); ++ } ++ rcu_read_unlock(); ++ ++ return ret; ++} ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ struct task_struct *task = current; ++ ++ if (type == SIX_LOCK_write) ++ return false; ++ ++ preempt_disable(); ++ if (!six_can_spin_on_owner(lock)) ++ goto fail; ++ ++ if (!osq_lock(&lock->osq)) ++ goto fail; ++ ++ while (1) { ++ struct task_struct *owner; ++ ++ /* ++ * If there's an owner, wait for it to either ++ * release the lock or go to sleep. ++ */ ++ owner = READ_ONCE(lock->owner); ++ if (owner && !six_spin_on_owner(lock, owner)) ++ break; ++ ++ if (do_six_trylock_type(lock, type)) { ++ osq_unlock(&lock->osq); ++ preempt_enable(); ++ return true; ++ } ++ ++ /* ++ * When there's no owner, we might have preempted between the ++ * owner acquiring the lock and setting the owner field. 
If ++ * we're an RT task that will live-lock because we won't let ++ * the owner complete. ++ */ ++ if (!owner && (need_resched() || rt_task(task))) ++ break; ++ ++ /* ++ * The cpu_relax() call is a compiler barrier which forces ++ * everything in this loop to be re-loaded. We don't need ++ * memory barriers as we'll eventually observe the right ++ * values at the cost of a few extra spins. ++ */ ++ cpu_relax(); ++ } ++ ++ osq_unlock(&lock->osq); ++fail: ++ preempt_enable(); ++ ++ /* ++ * If we fell out of the spin path because of need_resched(), ++ * reschedule now, before we try-lock again. This avoids getting ++ * scheduled out right after we obtained the lock. ++ */ ++ if (need_resched()) ++ schedule(); ++ ++ return false; ++} ++ ++#else /* CONFIG_LOCK_SPIN_ON_OWNER */ ++ ++static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) ++{ ++ return false; ++} ++ ++#endif ++ ++noinline ++static int __six_lock_type_slowpath(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old, new; ++ struct six_lock_waiter wait; ++ int ret = 0; ++ u64 v; ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ return ret; ++ ++ if (six_optimistic_spin(lock, type)) ++ return 0; ++ ++ lock_contended(&lock->dep_map, _RET_IP_); ++ ++ INIT_LIST_HEAD(&wait.list); ++ wait.task = current; ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (type == SIX_LOCK_write) ++ EBUG_ON(lock->owner != current); ++ else if (list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_add_tail(&wait.list, &lock->wait_list[type]); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++ ++ ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; ++ if (ret) ++ break; ++ ++ v = READ_ONCE(lock->state.v); ++ do { ++ new.v = old.v = v; ++ ++ if (!(old.v & l[type].lock_fail)) ++ new.v += l[type].lock_val; ++ else if (!(new.waiters & (1 << type))) ++ new.waiters |= 1 << type; ++ else ++ break; /* waiting bit already set */ ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ if (!(old.v & l[type].lock_fail)) ++ break; ++ ++ schedule(); ++ } ++ ++ if (!ret) ++ six_set_owner(lock, type, old); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ if (!list_empty_careful(&wait.list)) { ++ raw_spin_lock(&lock->wait_lock); ++ list_del_init(&wait.list); ++ raw_spin_unlock(&lock->wait_lock); ++ } ++ ++ return ret; ++} ++ ++__always_inline ++static int __six_lock_type(struct six_lock *lock, enum six_lock_type type, ++ six_lock_should_sleep_fn should_sleep_fn, void *p) ++{ ++ int ret; ++ ++ if (type != SIX_LOCK_write) ++ six_acquire(&lock->dep_map, 0); ++ ++ ret = do_six_trylock_type(lock, type) ? 
0 ++ : __six_lock_type_slowpath(lock, type, should_sleep_fn, p); ++ ++ if (ret && type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ if (!ret) ++ lock_acquired(&lock->dep_map, _RET_IP_); ++ ++ return ret; ++} ++ ++static inline void six_lock_wakeup(struct six_lock *lock, ++ union six_lock_state state, ++ unsigned waitlist_id) ++{ ++ struct list_head *wait_list = &lock->wait_list[waitlist_id]; ++ struct six_lock_waiter *w, *next; ++ ++ if (waitlist_id == SIX_LOCK_write && state.read_lock) ++ return; ++ ++ if (!(state.waiters & (1 << waitlist_id))) ++ return; ++ ++ clear_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ ++ if (waitlist_id == SIX_LOCK_write) { ++ struct task_struct *p = READ_ONCE(lock->owner); ++ ++ if (p) ++ wake_up_process(p); ++ return; ++ } ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry_safe(w, next, wait_list, list) { ++ list_del_init(&w->list); ++ ++ if (wake_up_process(w->task) && ++ waitlist_id != SIX_LOCK_read) { ++ if (!list_empty(wait_list)) ++ set_bit(waitlist_bitnr(waitlist_id), ++ (unsigned long *) &lock->state.v); ++ break; ++ } ++ } ++ ++ raw_spin_unlock(&lock->wait_lock); ++} ++ ++__always_inline __flatten ++static void __six_unlock_type(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state state; ++ ++ EBUG_ON(!(lock->state.v & l[type].held_mask)); ++ EBUG_ON(type == SIX_LOCK_write && ++ !(lock->state.v & __SIX_LOCK_HELD_intent)); ++ ++ if (type != SIX_LOCK_write) ++ six_release(&lock->dep_map); ++ ++ if (type == SIX_LOCK_intent) { ++ EBUG_ON(lock->owner != current); ++ ++ if (lock->intent_lock_recurse) { ++ --lock->intent_lock_recurse; ++ return; ++ } ++ ++ lock->owner = NULL; ++ } ++ ++ state.v = atomic64_add_return_release(l[type].unlock_val, ++ &lock->state.counter); ++ six_lock_wakeup(lock, state, l[type].unlock_wakeup); ++} ++ ++#define __SIX_LOCK(type) \ ++bool six_trylock_##type(struct six_lock *lock) \ ++{ \ ++ return __six_trylock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_trylock_##type); \ ++ \ ++bool six_relock_##type(struct six_lock *lock, u32 seq) \ ++{ \ ++ return __six_relock_type(lock, SIX_LOCK_##type, seq); \ ++} \ ++EXPORT_SYMBOL_GPL(six_relock_##type); \ ++ \ ++int six_lock_##type(struct six_lock *lock, \ ++ six_lock_should_sleep_fn should_sleep_fn, void *p) \ ++{ \ ++ return __six_lock_type(lock, SIX_LOCK_##type, should_sleep_fn, p);\ ++} \ ++EXPORT_SYMBOL_GPL(six_lock_##type); \ ++ \ ++void six_unlock_##type(struct six_lock *lock) \ ++{ \ ++ __six_unlock_type(lock, SIX_LOCK_##type); \ ++} \ ++EXPORT_SYMBOL_GPL(six_unlock_##type); ++ ++__SIX_LOCK(read) ++__SIX_LOCK(intent) ++__SIX_LOCK(write) ++ ++#undef __SIX_LOCK ++ ++/* Convert from intent to read: */ ++void six_lock_downgrade(struct six_lock *lock) ++{ ++ six_lock_increment(lock, SIX_LOCK_read); ++ six_unlock_intent(lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_downgrade); ++ ++bool six_lock_tryupgrade(struct six_lock *lock) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ union six_lock_state old, new; ++ u64 v = READ_ONCE(lock->state.v); ++ ++ do { ++ new.v = old.v = v; ++ ++ EBUG_ON(!(old.v & l[SIX_LOCK_read].held_mask)); ++ ++ new.v += l[SIX_LOCK_read].unlock_val; ++ ++ if (new.v & l[SIX_LOCK_intent].lock_fail) ++ return false; ++ ++ new.v += l[SIX_LOCK_intent].lock_val; ++ } while ((v = atomic64_cmpxchg_acquire(&lock->state.counter, ++ old.v, new.v)) != old.v); ++ ++ six_set_owner(lock, SIX_LOCK_intent, old); ++ six_lock_wakeup(lock, new, 
l[SIX_LOCK_read].unlock_wakeup); ++ ++ return true; ++} ++EXPORT_SYMBOL_GPL(six_lock_tryupgrade); ++ ++bool six_trylock_convert(struct six_lock *lock, ++ enum six_lock_type from, ++ enum six_lock_type to) ++{ ++ EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); ++ ++ if (to == from) ++ return true; ++ ++ if (to == SIX_LOCK_read) { ++ six_lock_downgrade(lock); ++ return true; ++ } else { ++ return six_lock_tryupgrade(lock); ++ } ++} ++EXPORT_SYMBOL_GPL(six_trylock_convert); ++ ++/* ++ * Increment read/intent lock count, assuming we already have it read or intent ++ * locked: ++ */ ++void six_lock_increment(struct six_lock *lock, enum six_lock_type type) ++{ ++ const struct six_lock_vals l[] = LOCK_VALS; ++ ++ EBUG_ON(type == SIX_LOCK_write); ++ six_acquire(&lock->dep_map, 0); ++ ++ /* XXX: assert already locked, and that we don't overflow: */ ++ ++ switch (type) { ++ case SIX_LOCK_read: ++ atomic64_add(l[type].lock_val, &lock->state.counter); ++ break; ++ case SIX_LOCK_intent: ++ lock->intent_lock_recurse++; ++ break; ++ case SIX_LOCK_write: ++ BUG(); ++ break; ++ } ++} ++EXPORT_SYMBOL_GPL(six_lock_increment); ++ ++void six_lock_wakeup_all(struct six_lock *lock) ++{ ++ struct six_lock_waiter *w; ++ ++ raw_spin_lock(&lock->wait_lock); ++ ++ list_for_each_entry(w, &lock->wait_list[0], list) ++ wake_up_process(w->task); ++ list_for_each_entry(w, &lock->wait_list[1], list) ++ wake_up_process(w->task); ++ ++ raw_spin_unlock(&lock->wait_lock); ++} ++EXPORT_SYMBOL_GPL(six_lock_wakeup_all); +diff --git a/kernel/module.c b/kernel/module.c +index 1c5cff34d9f2..8f9f37b0bfaa 100644 +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -2830,9 +2830,7 @@ static void dynamic_debug_remove(struct module *mod, struct _ddebug *debug) + + void * __weak module_alloc(unsigned long size) + { +- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, +- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, +- NUMA_NO_NODE, __builtin_return_address(0)); ++ return vmalloc_exec(size, GFP_KERNEL); + } + + bool __weak module_init_section(const char *name) +diff --git a/lib/Kconfig b/lib/Kconfig +index b4b98a03ff98..7ec0b400c545 100644 +--- a/lib/Kconfig ++++ b/lib/Kconfig +@@ -461,6 +461,9 @@ config ASSOCIATIVE_ARRAY + + for more information. + ++config CLOSURES ++ bool ++ + config HAS_IOMEM + bool + depends on !NO_IOMEM +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 0c781f912f9f..efe645766784 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -1517,6 +1517,15 @@ config DEBUG_CREDENTIALS + + source "kernel/rcu/Kconfig.debug" + ++config DEBUG_CLOSURES ++ bool "Debug closures (bcache async widgits)" ++ depends on CLOSURES ++ select DEBUG_FS ++ help ++ Keeps all active closures in a linked list and provides a debugfs ++ interface to list them, which makes it possible to see asynchronous ++ operations that get stuck. 
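
For readers unfamiliar with the shared/intent/exclusive ("SIX") locks that kernel/locking/six.c introduces above, here is a minimal usage sketch. It only calls entry points visible in this patch (six_lock_read/intent/write(), six_unlock_*()); struct six_lock and its initializer live in the include/linux/six.h header added elsewhere in this patch, so the declaration below is an assumption, and all demo_* names are illustrative.

    /*
     * Sketch only: an exclusive update takes an intent lock first (readers
     * may still enter), then upgrades to a write lock for the short
     * critical section.  Passing a NULL should_sleep_fn makes the lock
     * calls block until acquired and return 0.
     */
    static struct six_lock demo_lock;       /* assumed: initialized via six.h */

    static void demo_update(void)
    {
            six_lock_intent(&demo_lock, NULL, NULL);
            six_lock_write(&demo_lock, NULL, NULL); /* waits for readers to drain */

            /* ... modify the structure protected by demo_lock ... */

            six_unlock_write(&demo_lock);
            six_unlock_intent(&demo_lock);
    }

    static void demo_read(void)
    {
            six_lock_read(&demo_lock, NULL, NULL);
            /* ... read the structure; six_lock_downgrade() could turn an
             * intent hold into a read hold instead ... */
            six_unlock_read(&demo_lock);
    }

The point of the intermediate intent state, as the LOCK_VALS table above encodes it, is that an intent holder excludes other intent holders but not readers; only the final upgrade to the write lock has to wait for readers to drain.
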
++ + config DEBUG_WQ_FORCE_RR_CPU + bool "Force round-robin CPU selection for unbound work items" + depends on DEBUG_KERNEL +diff --git a/lib/Makefile b/lib/Makefile +index a4a4c6864f51..dfefe98c29ec 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -234,6 +234,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o + + obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o + ++obj-$(CONFIG_CLOSURES) += closure.o ++ + obj-$(CONFIG_DQL) += dynamic_queue_limits.o + + obj-$(CONFIG_GLOB) += glob.o +diff --git a/lib/closure.c b/lib/closure.c +new file mode 100644 +index 000000000000..3e6366c26209 +--- /dev/null ++++ b/lib/closure.c +@@ -0,0 +1,214 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Asynchronous refcounty things ++ * ++ * Copyright 2010, 2011 Kent Overstreet ++ * Copyright 2012 Google, Inc. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++static inline void closure_put_after_sub(struct closure *cl, int flags) ++{ ++ int r = flags & CLOSURE_REMAINING_MASK; ++ ++ BUG_ON(flags & CLOSURE_GUARD_MASK); ++ BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR)); ++ ++ if (!r) { ++ if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) { ++ atomic_set(&cl->remaining, ++ CLOSURE_REMAINING_INITIALIZER); ++ closure_queue(cl); ++ } else { ++ struct closure *parent = cl->parent; ++ closure_fn *destructor = cl->fn; ++ ++ closure_debug_destroy(cl); ++ ++ if (destructor) ++ destructor(cl); ++ ++ if (parent) ++ closure_put(parent); ++ } ++ } ++} ++ ++/* For clearing flags with the same atomic op as a put */ ++void closure_sub(struct closure *cl, int v) ++{ ++ closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); ++} ++EXPORT_SYMBOL(closure_sub); ++ ++/* ++ * closure_put - decrement a closure's refcount ++ */ ++void closure_put(struct closure *cl) ++{ ++ closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); ++} ++EXPORT_SYMBOL(closure_put); ++ ++/* ++ * closure_wake_up - wake up all closures on a wait list, without memory barrier ++ */ ++void __closure_wake_up(struct closure_waitlist *wait_list) ++{ ++ struct llist_node *list; ++ struct closure *cl, *t; ++ struct llist_node *reverse = NULL; ++ ++ list = llist_del_all(&wait_list->list); ++ ++ /* We first reverse the list to preserve FIFO ordering and fairness */ ++ reverse = llist_reverse_order(list); ++ ++ /* Then do the wakeups */ ++ llist_for_each_entry_safe(cl, t, reverse, list) { ++ closure_set_waiting(cl, 0); ++ closure_sub(cl, CLOSURE_WAITING + 1); ++ } ++} ++EXPORT_SYMBOL(__closure_wake_up); ++ ++/** ++ * closure_wait - add a closure to a waitlist ++ * @waitlist: will own a ref on @cl, which will be released when ++ * closure_wake_up() is called on @waitlist. ++ * @cl: closure pointer. 
++ * ++ */ ++bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) ++{ ++ if (atomic_read(&cl->remaining) & CLOSURE_WAITING) ++ return false; ++ ++ closure_set_waiting(cl, _RET_IP_); ++ atomic_add(CLOSURE_WAITING + 1, &cl->remaining); ++ llist_add(&cl->list, &waitlist->list); ++ ++ return true; ++} ++EXPORT_SYMBOL(closure_wait); ++ ++struct closure_syncer { ++ struct task_struct *task; ++ int done; ++}; ++ ++static void closure_sync_fn(struct closure *cl) ++{ ++ struct closure_syncer *s = cl->s; ++ struct task_struct *p; ++ ++ rcu_read_lock(); ++ p = READ_ONCE(s->task); ++ s->done = 1; ++ wake_up_process(p); ++ rcu_read_unlock(); ++} ++ ++void __sched __closure_sync(struct closure *cl) ++{ ++ struct closure_syncer s = { .task = current }; ++ ++ cl->s = &s; ++ continue_at(cl, closure_sync_fn, NULL); ++ ++ while (1) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ if (s.done) ++ break; ++ schedule(); ++ } ++ ++ __set_current_state(TASK_RUNNING); ++} ++EXPORT_SYMBOL(__closure_sync); ++ ++#ifdef CONFIG_DEBUG_CLOSURES ++ ++static LIST_HEAD(closure_list); ++static DEFINE_SPINLOCK(closure_list_lock); ++ ++void closure_debug_create(struct closure *cl) ++{ ++ unsigned long flags; ++ ++ BUG_ON(cl->magic == CLOSURE_MAGIC_ALIVE); ++ cl->magic = CLOSURE_MAGIC_ALIVE; ++ ++ spin_lock_irqsave(&closure_list_lock, flags); ++ list_add(&cl->all, &closure_list); ++ spin_unlock_irqrestore(&closure_list_lock, flags); ++} ++EXPORT_SYMBOL(closure_debug_create); ++ ++void closure_debug_destroy(struct closure *cl) ++{ ++ unsigned long flags; ++ ++ BUG_ON(cl->magic != CLOSURE_MAGIC_ALIVE); ++ cl->magic = CLOSURE_MAGIC_DEAD; ++ ++ spin_lock_irqsave(&closure_list_lock, flags); ++ list_del(&cl->all); ++ spin_unlock_irqrestore(&closure_list_lock, flags); ++} ++EXPORT_SYMBOL(closure_debug_destroy); ++ ++static int debug_seq_show(struct seq_file *f, void *data) ++{ ++ struct closure *cl; ++ ++ spin_lock_irq(&closure_list_lock); ++ ++ list_for_each_entry(cl, &closure_list, all) { ++ int r = atomic_read(&cl->remaining); ++ ++ seq_printf(f, "%p: %pS -> %pS p %p r %i ", ++ cl, (void *) cl->ip, cl->fn, cl->parent, ++ r & CLOSURE_REMAINING_MASK); ++ ++ seq_printf(f, "%s%s\n", ++ test_bit(WORK_STRUCT_PENDING_BIT, ++ work_data_bits(&cl->work)) ? "Q" : "", ++ r & CLOSURE_RUNNING ? 
"R" : ""); ++ ++ if (r & CLOSURE_WAITING) ++ seq_printf(f, " W %pS\n", ++ (void *) cl->waiting_on); ++ ++ seq_puts(f, "\n"); ++ } ++ ++ spin_unlock_irq(&closure_list_lock); ++ return 0; ++} ++ ++static int debug_seq_open(struct inode *inode, struct file *file) ++{ ++ return single_open(file, debug_seq_show, NULL); ++} ++ ++static const struct file_operations debug_ops = { ++ .owner = THIS_MODULE, ++ .open = debug_seq_open, ++ .read = seq_read, ++ .release = single_release ++}; ++ ++static int __init closure_debug_init(void) ++{ ++ debugfs_create_file("closures", 0400, NULL, NULL, &debug_ops); ++ return 0; ++} ++late_initcall(closure_debug_init) ++ ++#endif +diff --git a/mm/filemap.c b/mm/filemap.c +index 99c49eeae71b..a5a07767a2eb 100644 +--- a/mm/filemap.c ++++ b/mm/filemap.c +@@ -117,6 +117,69 @@ + * ->tasklist_lock (memory_failure, collect_procs_ao) + */ + ++static int page_cache_tree_insert_vec(struct page *pages[], ++ unsigned nr_pages, ++ struct address_space *mapping, ++ pgoff_t index, ++ gfp_t gfp_mask, ++ void *shadow[]) ++{ ++ XA_STATE(xas, &mapping->i_pages, index); ++ void *old; ++ int i = 0, error = 0; ++ ++ mapping_set_update(&xas, mapping); ++ ++ if (!nr_pages) ++ return 0; ++ ++ xa_lock_irq(&mapping->i_pages); ++ ++ while (1) { ++ old = xas_load(&xas); ++ if (old && !xa_is_value(old)) { ++ error = -EEXIST; ++ break; ++ } ++ ++ xas_store(&xas, pages[i]); ++ error = xas_error(&xas); ++ ++ if (error == -ENOMEM) { ++ xa_unlock_irq(&mapping->i_pages); ++ if (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)) ++ error = 0; ++ xa_lock_irq(&mapping->i_pages); ++ ++ if (!error) ++ continue; ++ break; ++ } ++ ++ if (error) ++ break; ++ ++ if (shadow) ++ shadow[i] = old; ++ if (xa_is_value(old)) ++ mapping->nrexceptional--; ++ mapping->nrpages++; ++ ++ /* hugetlb pages do not participate in page cache accounting. 
*/ ++ if (!PageHuge(pages[i])) ++ __inc_lruvec_page_state(pages[i], NR_FILE_PAGES); ++ ++ if (++i == nr_pages) ++ break; ++ ++ xas_next(&xas); ++ } ++ ++ xa_unlock_irq(&mapping->i_pages); ++ ++ return i ?: error; ++} ++ + static void page_cache_delete(struct address_space *mapping, + struct page *page, void *shadow) + { +@@ -827,114 +890,148 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask) + } + EXPORT_SYMBOL_GPL(replace_page_cache_page); + +-static int __add_to_page_cache_locked(struct page *page, +- struct address_space *mapping, +- pgoff_t offset, gfp_t gfp_mask, +- void **shadowp) ++static int add_to_page_cache_vec(struct page **pages, unsigned nr_pages, ++ struct address_space *mapping, ++ pgoff_t index, gfp_t gfp_mask, ++ void *shadow[]) + { +- XA_STATE(xas, &mapping->i_pages, offset); +- int huge = PageHuge(page); +- int error; +- void *old; ++ int i, nr_added = 0, error = 0; + +- VM_BUG_ON_PAGE(!PageLocked(page), page); +- VM_BUG_ON_PAGE(PageSwapBacked(page), page); +- mapping_set_update(&xas, mapping); ++ for (i = 0; i < nr_pages; i++) { ++ struct page *page = pages[i]; + +- get_page(page); +- page->mapping = mapping; +- page->index = offset; ++ VM_BUG_ON_PAGE(PageSwapBacked(page), page); ++ VM_BUG_ON_PAGE(PageSwapCache(page), page); + +- if (!huge) { +- error = mem_cgroup_charge(page, current->mm, gfp_mask); +- if (error) +- goto error; ++ __SetPageLocked(page); ++ get_page(page); ++ page->mapping = mapping; ++ page->index = index + i; ++ ++ if (!PageHuge(page)) { ++ error = mem_cgroup_charge(page, current->mm, gfp_mask); ++ if (error) { ++ page->mapping = NULL; ++ /* Leave page->index set: truncation relies upon it */ ++ put_page(page); ++ __ClearPageLocked(page); ++ if (!i) ++ return error; ++ nr_pages = i; ++ break; ++ } ++ } + } + +- do { +- xas_lock_irq(&xas); +- old = xas_load(&xas); +- if (old && !xa_is_value(old)) +- xas_set_err(&xas, -EEXIST); +- xas_store(&xas, page); +- if (xas_error(&xas)) +- goto unlock; ++ error = page_cache_tree_insert_vec(pages, nr_pages, mapping, ++ index, gfp_mask, shadow); ++ if (error > 0) { ++ nr_added = error; ++ error = 0; ++ } + +- if (xa_is_value(old)) { +- mapping->nrexceptional--; +- if (shadowp) +- *shadowp = old; +- } +- mapping->nrpages++; ++ for (i = 0; i < nr_added; i++) ++ trace_mm_filemap_add_to_page_cache(pages[i]); + +- /* hugetlb pages do not participate in page cache accounting */ +- if (!huge) +- __inc_lruvec_page_state(page, NR_FILE_PAGES); +-unlock: +- xas_unlock_irq(&xas); +- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK)); ++ for (i = nr_added; i < nr_pages; i++) { ++ struct page *page = pages[i]; + +- if (xas_error(&xas)) { +- error = xas_error(&xas); +- goto error; ++ /* Leave page->index set: truncation relies upon it */ ++ page->mapping = NULL; ++ put_page(page); ++ __ClearPageLocked(page); + } + +- trace_mm_filemap_add_to_page_cache(page); +- return 0; +-error: +- page->mapping = NULL; +- /* Leave page->index set: truncation relies upon it */ +- put_page(page); +- return error; ++ return nr_added ?: error; + } +-ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO); ++ALLOW_ERROR_INJECTION(__add_to_page_cache, ERRNO); + + /** +- * add_to_page_cache_locked - add a locked page to the pagecache ++ * add_to_page_cache - add a newly allocated page to the pagecache + * @page: page to add + * @mapping: the page's address_space + * @offset: page index + * @gfp_mask: page allocation mode + * +- * This function is used to add a page to the pagecache. It must be locked. 
+- * This function does not add the page to the LRU. The caller must do that. ++ * This function is used to add a page to the pagecache. It must be newly ++ * allocated. This function does not add the page to the LRU. The caller must ++ * do that. + * + * Return: %0 on success, negative error code otherwise. + */ +-int add_to_page_cache_locked(struct page *page, struct address_space *mapping, +- pgoff_t offset, gfp_t gfp_mask) ++int add_to_page_cache(struct page *page, struct address_space *mapping, ++ pgoff_t offset, gfp_t gfp_mask) + { +- return __add_to_page_cache_locked(page, mapping, offset, +- gfp_mask, NULL); ++ int ret = add_to_page_cache_vec(&page, 1, mapping, offset, ++ gfp_mask, NULL); ++ if (ret < 0) ++ return ret; ++ return 0; + } +-EXPORT_SYMBOL(add_to_page_cache_locked); ++EXPORT_SYMBOL(add_to_page_cache); ++ALLOW_ERROR_INJECTION(add_to_page_cache, ERRNO); + +-int add_to_page_cache_lru(struct page *page, struct address_space *mapping, +- pgoff_t offset, gfp_t gfp_mask) ++int add_to_page_cache_lru_vec(struct address_space *mapping, ++ struct page **pages, ++ unsigned nr_pages, ++ pgoff_t offset, gfp_t gfp_mask) + { +- void *shadow = NULL; +- int ret; ++ void *shadow_stack[8], **shadow = shadow_stack; ++ int i, ret = 0, err = 0, nr_added; ++ ++ if (nr_pages > ARRAY_SIZE(shadow_stack)) { ++ shadow = kmalloc_array(nr_pages, sizeof(void *), gfp_mask); ++ if (!shadow) ++ goto slowpath; ++ } ++ ++ for (i = 0; i < nr_pages; i++) ++ VM_BUG_ON_PAGE(PageActive(pages[i]), pages[i]); ++ ++ ret = add_to_page_cache_vec(pages, nr_pages, mapping, ++ offset, gfp_mask, shadow); ++ nr_added = ret > 0 ? ret : 0; ++ ++ /* ++ * The page might have been evicted from cache only recently, in which ++ * case it should be activated like any other repeatedly accessed page. ++ * The exception is pages getting rewritten; evicting other data from ++ * the working set, only to cache data that will get overwritten with ++ * something else, is a waste of memory. ++ */ ++ for (i = 0; i < nr_added; i++) { ++ struct page *page = pages[i]; ++ void *s = shadow[i]; + +- __SetPageLocked(page); +- ret = __add_to_page_cache_locked(page, mapping, offset, +- gfp_mask, &shadow); +- if (unlikely(ret)) +- __ClearPageLocked(page); +- else { +- /* +- * The page might have been evicted from cache only +- * recently, in which case it should be activated like +- * any other repeatedly accessed page. +- * The exception is pages getting rewritten; evicting other +- * data from the working set, only to cache data that will +- * get overwritten with something else, is a waste of memory. 
+- */ + WARN_ON_ONCE(PageActive(page)); +- if (!(gfp_mask & __GFP_WRITE) && shadow) +- workingset_refault(page, shadow); ++ if (!(gfp_mask & __GFP_WRITE) && s) ++ workingset_refault(page, s); + lru_cache_add(page); + } ++ ++ if (shadow != shadow_stack) ++ kfree(shadow); ++ + return ret; ++slowpath: ++ for (i = 0; i < nr_pages; i++) { ++ err = add_to_page_cache_lru(pages[i], mapping, ++ offset + i, gfp_mask); ++ if (err) ++ break; ++ } ++ ++ return i ?: err; ++} ++EXPORT_SYMBOL_GPL(add_to_page_cache_lru_vec); ++ ++int add_to_page_cache_lru(struct page *page, struct address_space *mapping, ++ pgoff_t offset, gfp_t gfp_mask) ++{ ++ int ret = add_to_page_cache_lru_vec(mapping, &page, 1, offset, gfp_mask); ++ if (ret < 0) ++ return ret; ++ return 0; + } + EXPORT_SYMBOL_GPL(add_to_page_cache_lru); + +@@ -1990,6 +2087,7 @@ unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, + + return ret; + } ++EXPORT_SYMBOL(find_get_pages_range); + + /** + * find_get_pages_contig - gang contiguous pagecache lookup +@@ -2138,6 +2236,259 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra) + ra->ra_pages /= 4; + } + ++static int lock_page_for_iocb(struct kiocb *iocb, struct page *page) ++{ ++ if (iocb->ki_flags & IOCB_WAITQ) ++ return lock_page_async(page, iocb->ki_waitq); ++ else if (iocb->ki_flags & IOCB_NOWAIT) ++ return trylock_page(page) ? 0 : -EAGAIN; ++ else ++ return lock_page_killable(page); ++} ++ ++static struct page * ++generic_file_buffered_read_readpage(struct kiocb *iocb, ++ struct file *filp, ++ struct address_space *mapping, ++ struct page *page) ++{ ++ struct file_ra_state *ra = &filp->f_ra; ++ int error; ++ ++ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { ++ unlock_page(page); ++ put_page(page); ++ return ERR_PTR(-EAGAIN); ++ } ++ ++ /* ++ * A previous I/O error may have been due to temporary ++ * failures, eg. multipath errors. ++ * PG_error will be set again if readpage fails. ++ */ ++ ClearPageError(page); ++ /* Start the actual read. The read will unlock the page. */ ++ error = mapping->a_ops->readpage(filp, page); ++ ++ if (unlikely(error)) { ++ put_page(page); ++ return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL; ++ } ++ ++ if (!PageUptodate(page)) { ++ error = lock_page_for_iocb(iocb, page); ++ if (unlikely(error)) { ++ put_page(page); ++ return ERR_PTR(error); ++ } ++ if (!PageUptodate(page)) { ++ if (page->mapping == NULL) { ++ /* ++ * invalidate_mapping_pages got it ++ */ ++ unlock_page(page); ++ put_page(page); ++ return NULL; ++ } ++ unlock_page(page); ++ shrink_readahead_size_eio(ra); ++ put_page(page); ++ return ERR_PTR(-EIO); ++ } ++ unlock_page(page); ++ } ++ ++ return page; ++} ++ ++static struct page * ++generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb, ++ struct file *filp, ++ struct iov_iter *iter, ++ struct page *page, ++ loff_t pos, loff_t count) ++{ ++ struct address_space *mapping = filp->f_mapping; ++ struct inode *inode = mapping->host; ++ int error; ++ ++ /* ++ * See comment in do_read_cache_page on why ++ * wait_on_page_locked is used to avoid unnecessarily ++ * serialisations and why it's safe. 
++ */ ++ if (iocb->ki_flags & IOCB_WAITQ) { ++ error = wait_on_page_locked_async(page, ++ iocb->ki_waitq); ++ } else { ++ error = wait_on_page_locked_killable(page); ++ } ++ if (unlikely(error)) { ++ put_page(page); ++ return ERR_PTR(error); ++ } ++ if (PageUptodate(page)) ++ return page; ++ ++ if (inode->i_blkbits == PAGE_SHIFT || ++ !mapping->a_ops->is_partially_uptodate) ++ goto page_not_up_to_date; ++ /* pipes can't handle partially uptodate pages */ ++ if (unlikely(iov_iter_is_pipe(iter))) ++ goto page_not_up_to_date; ++ if (!trylock_page(page)) ++ goto page_not_up_to_date; ++ /* Did it get truncated before we got the lock? */ ++ if (!page->mapping) ++ goto page_not_up_to_date_locked; ++ if (!mapping->a_ops->is_partially_uptodate(page, ++ pos & ~PAGE_MASK, count)) ++ goto page_not_up_to_date_locked; ++ unlock_page(page); ++ return page; ++ ++page_not_up_to_date: ++ /* Get exclusive access to the page ... */ ++ error = lock_page_for_iocb(iocb, page); ++ if (unlikely(error)) { ++ put_page(page); ++ return ERR_PTR(error); ++ } ++ ++page_not_up_to_date_locked: ++ /* Did it get truncated before we got the lock? */ ++ if (!page->mapping) { ++ unlock_page(page); ++ put_page(page); ++ return NULL; ++ } ++ ++ /* Did somebody else fill it already? */ ++ if (PageUptodate(page)) { ++ unlock_page(page); ++ return page; ++ } ++ ++ return generic_file_buffered_read_readpage(iocb, filp, mapping, page); ++} ++ ++static struct page * ++generic_file_buffered_read_no_cached_page(struct kiocb *iocb, ++ struct iov_iter *iter) ++{ ++ struct file *filp = iocb->ki_filp; ++ struct address_space *mapping = filp->f_mapping; ++ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; ++ struct page *page; ++ int error; ++ ++ if (iocb->ki_flags & IOCB_NOIO) ++ return ERR_PTR(-EAGAIN); ++ ++ /* ++ * Ok, it wasn't cached, so we need to create a new ++ * page.. ++ */ ++ page = page_cache_alloc(mapping); ++ if (!page) ++ return ERR_PTR(-ENOMEM); ++ ++ error = add_to_page_cache_lru(page, mapping, index, ++ mapping_gfp_constraint(mapping, GFP_KERNEL)); ++ if (error) { ++ put_page(page); ++ return error != -EEXIST ? 
ERR_PTR(error) : NULL; ++ } ++ ++ return generic_file_buffered_read_readpage(iocb, filp, mapping, page); ++} ++ ++static int generic_file_buffered_read_get_pages(struct kiocb *iocb, ++ struct iov_iter *iter, ++ struct page **pages, ++ unsigned int nr) ++{ ++ struct file *filp = iocb->ki_filp; ++ struct address_space *mapping = filp->f_mapping; ++ struct file_ra_state *ra = &filp->f_ra; ++ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT; ++ pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; ++ int i, j, nr_got, err = 0; ++ ++ nr = min_t(unsigned long, last_index - index, nr); ++find_page: ++ if (fatal_signal_pending(current)) ++ return -EINTR; ++ ++ nr_got = find_get_pages_contig(mapping, index, nr, pages); ++ if (nr_got) ++ goto got_pages; ++ ++ if (iocb->ki_flags & IOCB_NOIO) ++ return -EAGAIN; ++ ++ page_cache_sync_readahead(mapping, ra, filp, index, last_index - index); ++ ++ nr_got = find_get_pages_contig(mapping, index, nr, pages); ++ if (nr_got) ++ goto got_pages; ++ ++ pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter); ++ err = PTR_ERR_OR_ZERO(pages[0]); ++ if (!IS_ERR_OR_NULL(pages[0])) ++ nr_got = 1; ++got_pages: ++ for (i = 0; i < nr_got; i++) { ++ struct page *page = pages[i]; ++ pgoff_t pg_index = index + i; ++ loff_t pg_pos = max(iocb->ki_pos, ++ (loff_t) pg_index << PAGE_SHIFT); ++ loff_t pg_count = iocb->ki_pos + iter->count - pg_pos; ++ ++ if (PageReadahead(page)) { ++ if (iocb->ki_flags & IOCB_NOIO) { ++ for (j = i; j < nr_got; j++) ++ put_page(pages[j]); ++ nr_got = i; ++ err = -EAGAIN; ++ break; ++ } ++ page_cache_async_readahead(mapping, ra, filp, page, ++ pg_index, last_index - pg_index); ++ } ++ ++ if (!PageUptodate(page)) { ++ if ((iocb->ki_flags & IOCB_NOWAIT) || ++ ((iocb->ki_flags & IOCB_WAITQ) && i)) { ++ for (j = i; j < nr_got; j++) ++ put_page(pages[j]); ++ nr_got = i; ++ err = -EAGAIN; ++ break; ++ } ++ ++ page = generic_file_buffered_read_pagenotuptodate(iocb, ++ filp, iter, page, pg_pos, pg_count); ++ if (IS_ERR_OR_NULL(page)) { ++ for (j = i + 1; j < nr_got; j++) ++ put_page(pages[j]); ++ nr_got = i; ++ err = PTR_ERR_OR_ZERO(page); ++ break; ++ } ++ } ++ } ++ ++ if (likely(nr_got)) ++ return nr_got; ++ if (err) ++ return err; ++ /* ++ * No pages and no error means we raced and should retry: ++ */ ++ goto find_page; ++} ++ + /** + * generic_file_buffered_read - generic file read routine + * @iocb: the iocb to read +@@ -2158,276 +2509,116 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb, + struct iov_iter *iter, ssize_t written) + { + struct file *filp = iocb->ki_filp; ++ struct file_ra_state *ra = &filp->f_ra; + struct address_space *mapping = filp->f_mapping; + struct inode *inode = mapping->host; +- struct file_ra_state *ra = &filp->f_ra; +- loff_t *ppos = &iocb->ki_pos; +- pgoff_t index; +- pgoff_t last_index; +- pgoff_t prev_index; +- unsigned long offset; /* offset into pagecache page */ +- unsigned int prev_offset; +- int error = 0; +- +- if (unlikely(*ppos >= inode->i_sb->s_maxbytes)) ++ struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL; ++ unsigned int nr_pages = min_t(unsigned int, 512, ++ ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) - ++ (iocb->ki_pos >> PAGE_SHIFT)); ++ int i, pg_nr, error = 0; ++ bool writably_mapped; ++ loff_t isize, end_offset; ++ ++ if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) + return 0; + iov_iter_truncate(iter, inode->i_sb->s_maxbytes); + +- index = *ppos >> PAGE_SHIFT; +- prev_index = ra->prev_pos >> PAGE_SHIFT; +- prev_offset = ra->prev_pos & 
(PAGE_SIZE-1); +- last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT; +- offset = *ppos & ~PAGE_MASK; ++ if (nr_pages > ARRAY_SIZE(pages_onstack)) ++ pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); + +- for (;;) { +- struct page *page; +- pgoff_t end_index; +- loff_t isize; +- unsigned long nr, ret; ++ if (!pages) { ++ pages = pages_onstack; ++ nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack)); ++ } + ++ do { + cond_resched(); +-find_page: +- if (fatal_signal_pending(current)) { +- error = -EINTR; +- goto out; +- } + +- page = find_get_page(mapping, index); +- if (!page) { +- if (iocb->ki_flags & IOCB_NOIO) +- goto would_block; +- page_cache_sync_readahead(mapping, +- ra, filp, +- index, last_index - index); +- page = find_get_page(mapping, index); +- if (unlikely(page == NULL)) +- goto no_cached_page; +- } +- if (PageReadahead(page)) { +- if (iocb->ki_flags & IOCB_NOIO) { +- put_page(page); +- goto out; +- } +- page_cache_async_readahead(mapping, +- ra, filp, page, +- index, last_index - index); +- } +- if (!PageUptodate(page)) { +- /* +- * See comment in do_read_cache_page on why +- * wait_on_page_locked is used to avoid unnecessarily +- * serialisations and why it's safe. +- */ +- if (iocb->ki_flags & IOCB_WAITQ) { +- if (written) { +- put_page(page); +- goto out; +- } +- error = wait_on_page_locked_async(page, +- iocb->ki_waitq); +- } else { +- if (iocb->ki_flags & IOCB_NOWAIT) { +- put_page(page); +- goto would_block; +- } +- error = wait_on_page_locked_killable(page); +- } +- if (unlikely(error)) +- goto readpage_error; +- if (PageUptodate(page)) +- goto page_ok; +- +- if (inode->i_blkbits == PAGE_SHIFT || +- !mapping->a_ops->is_partially_uptodate) +- goto page_not_up_to_date; +- /* pipes can't handle partially uptodate pages */ +- if (unlikely(iov_iter_is_pipe(iter))) +- goto page_not_up_to_date; +- if (!trylock_page(page)) +- goto page_not_up_to_date; +- /* Did it get truncated before we got the lock? */ +- if (!page->mapping) +- goto page_not_up_to_date_locked; +- if (!mapping->a_ops->is_partially_uptodate(page, +- offset, iter->count)) +- goto page_not_up_to_date_locked; +- unlock_page(page); ++ /* ++ * We can't return -EIOCBQUEUED once we've done some work, so ++ * ensure we don't block: ++ */ ++ if ((iocb->ki_flags & IOCB_WAITQ) && written) ++ iocb->ki_flags |= IOCB_NOWAIT; ++ ++ i = 0; ++ pg_nr = generic_file_buffered_read_get_pages(iocb, iter, ++ pages, nr_pages); ++ if (pg_nr < 0) { ++ error = pg_nr; ++ break; + } +-page_ok: ++ + /* +- * i_size must be checked after we know the page is Uptodate. ++ * i_size must be checked after we know the pages are Uptodate. + * + * Checking i_size after the check allows us to calculate + * the correct value for "nr", which means the zero-filled + * part of the page is not copied back to userspace (unless + * another truncate extends the file - this is desired though). 
+ */ +- + isize = i_size_read(inode); +- end_index = (isize - 1) >> PAGE_SHIFT; +- if (unlikely(!isize || index > end_index)) { +- put_page(page); +- goto out; +- } ++ if (unlikely(iocb->ki_pos >= isize)) ++ goto put_pages; + +- /* nr is the maximum number of bytes to copy from this page */ +- nr = PAGE_SIZE; +- if (index == end_index) { +- nr = ((isize - 1) & ~PAGE_MASK) + 1; +- if (nr <= offset) { +- put_page(page); +- goto out; +- } +- } +- nr = nr - offset; ++ end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count); + +- /* If users can be writing to this page using arbitrary +- * virtual addresses, take care about potential aliasing +- * before reading the page on the kernel side. +- */ +- if (mapping_writably_mapped(mapping)) +- flush_dcache_page(page); ++ while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr > ++ (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT) ++ put_page(pages[--pg_nr]); + + /* +- * When a sequential read accesses a page several times, +- * only mark it as accessed the first time. ++ * Once we start copying data, we don't want to be touching any ++ * cachelines that might be contended: + */ +- if (prev_index != index || offset != prev_offset) +- mark_page_accessed(page); +- prev_index = index; ++ writably_mapped = mapping_writably_mapped(mapping); + + /* +- * Ok, we have the page, and it's up-to-date, so +- * now we can copy it to user space... ++ * When a sequential read accesses a page several times, only ++ * mark it as accessed the first time. + */ ++ if (iocb->ki_pos >> PAGE_SHIFT != ++ ra->prev_pos >> PAGE_SHIFT) ++ mark_page_accessed(pages[0]); ++ for (i = 1; i < pg_nr; i++) ++ mark_page_accessed(pages[i]); ++ ++ for (i = 0; i < pg_nr; i++) { ++ unsigned int offset = iocb->ki_pos & ~PAGE_MASK; ++ unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos, ++ PAGE_SIZE - offset); ++ unsigned int copied; + +- ret = copy_page_to_iter(page, offset, nr, iter); +- offset += ret; +- index += offset >> PAGE_SHIFT; +- offset &= ~PAGE_MASK; +- prev_offset = offset; +- +- put_page(page); +- written += ret; +- if (!iov_iter_count(iter)) +- goto out; +- if (ret < nr) { +- error = -EFAULT; +- goto out; +- } +- continue; +- +-page_not_up_to_date: +- /* Get exclusive access to the page ... */ +- if (iocb->ki_flags & IOCB_WAITQ) +- error = lock_page_async(page, iocb->ki_waitq); +- else +- error = lock_page_killable(page); +- if (unlikely(error)) +- goto readpage_error; +- +-page_not_up_to_date_locked: +- /* Did it get truncated before we got the lock? */ +- if (!page->mapping) { +- unlock_page(page); +- put_page(page); +- continue; +- } +- +- /* Did somebody else fill it already? */ +- if (PageUptodate(page)) { +- unlock_page(page); +- goto page_ok; +- } ++ /* ++ * If users can be writing to this page using arbitrary ++ * virtual addresses, take care about potential aliasing ++ * before reading the page on the kernel side. ++ */ ++ if (writably_mapped) ++ flush_dcache_page(pages[i]); + +-readpage: +- if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) { +- unlock_page(page); +- put_page(page); +- goto would_block; +- } +- /* +- * A previous I/O error may have been due to temporary +- * failures, eg. multipath errors. +- * PG_error will be set again if readpage fails. +- */ +- ClearPageError(page); +- /* Start the actual read. The read will unlock the page. 
*/ +- error = mapping->a_ops->readpage(filp, page); ++ copied = copy_page_to_iter(pages[i], offset, bytes, iter); + +- if (unlikely(error)) { +- if (error == AOP_TRUNCATED_PAGE) { +- put_page(page); +- error = 0; +- goto find_page; +- } +- goto readpage_error; +- } ++ written += copied; ++ iocb->ki_pos += copied; ++ ra->prev_pos = iocb->ki_pos; + +- if (!PageUptodate(page)) { +- if (iocb->ki_flags & IOCB_WAITQ) +- error = lock_page_async(page, iocb->ki_waitq); +- else +- error = lock_page_killable(page); +- +- if (unlikely(error)) +- goto readpage_error; +- if (!PageUptodate(page)) { +- if (page->mapping == NULL) { +- /* +- * invalidate_mapping_pages got it +- */ +- unlock_page(page); +- put_page(page); +- goto find_page; +- } +- unlock_page(page); +- shrink_readahead_size_eio(ra); +- error = -EIO; +- goto readpage_error; ++ if (copied < bytes) { ++ error = -EFAULT; ++ break; + } +- unlock_page(page); + } ++put_pages: ++ for (i = 0; i < pg_nr; i++) ++ put_page(pages[i]); ++ } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); + +- goto page_ok; +- +-readpage_error: +- /* UHHUH! A synchronous read error occurred. Report it */ +- put_page(page); +- goto out; +- +-no_cached_page: +- /* +- * Ok, it wasn't cached, so we need to create a new +- * page.. +- */ +- page = page_cache_alloc(mapping); +- if (!page) { +- error = -ENOMEM; +- goto out; +- } +- error = add_to_page_cache_lru(page, mapping, index, +- mapping_gfp_constraint(mapping, GFP_KERNEL)); +- if (error) { +- put_page(page); +- if (error == -EEXIST) { +- error = 0; +- goto find_page; +- } +- goto out; +- } +- goto readpage; +- } ++ file_accessed(filp); + +-would_block: +- error = -EAGAIN; +-out: +- ra->prev_pos = prev_index; +- ra->prev_pos <<= PAGE_SHIFT; +- ra->prev_pos |= prev_offset; ++ if (pages != pages_onstack) ++ kfree(pages); + +- *ppos = ((loff_t)index << PAGE_SHIFT) + offset; +- file_accessed(filp); + return written ? written : error; + } + EXPORT_SYMBOL_GPL(generic_file_buffered_read); +diff --git a/mm/gup.c b/mm/gup.c +index e869c634cc9a..9bfb3e933deb 100644 +--- a/mm/gup.c ++++ b/mm/gup.c +@@ -1085,6 +1085,13 @@ static long __get_user_pages(struct mm_struct *mm, + } + cond_resched(); + ++ if (current->faults_disabled_mapping && ++ vma->vm_file && ++ vma->vm_file->f_mapping == current->faults_disabled_mapping) { ++ ret = -EFAULT; ++ goto out; ++ } ++ + page = follow_page_mask(vma, start, foll_flags, &ctx); + if (!page) { + ret = faultin_page(vma, start, &foll_flags, locked); +diff --git a/mm/nommu.c b/mm/nommu.c +index 75a327149af1..fe0a77d01656 100644 +--- a/mm/nommu.c ++++ b/mm/nommu.c +@@ -290,6 +290,24 @@ void *vzalloc_node(unsigned long size, int node) + } + EXPORT_SYMBOL(vzalloc_node); + ++/** ++ * vmalloc_exec - allocate virtually contiguous, executable memory ++ * @size: allocation size ++ * ++ * Kernel-internal function to allocate enough pages to cover @size ++ * the page level allocator and map them into contiguous and ++ * executable kernel virtual space. ++ * ++ * For tight control over page level allocator and protection flags ++ * use __vmalloc() instead. 
++ */ ++ ++void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) ++{ ++ return __vmalloc(size, gfp_mask); ++} ++EXPORT_SYMBOL_GPL(vmalloc_exec); ++ + /** + * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * @size: allocation size +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 4e4ddd67b71e..563cc766f511 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -2475,20 +2475,19 @@ int __set_page_dirty_nobuffers(struct page *page) + lock_page_memcg(page); + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); +- unsigned long flags; + + if (!mapping) { + unlock_page_memcg(page); + return 1; + } + +- xa_lock_irqsave(&mapping->i_pages, flags); ++ xa_lock_irq(&mapping->i_pages); + BUG_ON(page_mapping(page) != mapping); + WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); + account_page_dirtied(page, mapping); + __xa_set_mark(&mapping->i_pages, page_index(page), + PAGECACHE_TAG_DIRTY); +- xa_unlock_irqrestore(&mapping->i_pages, flags); ++ xa_unlock_irq(&mapping->i_pages); + unlock_page_memcg(page); + + if (mapping->host) { +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index be4724b916b3..efd7f9dd1eb8 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -2665,6 +2665,27 @@ void *vzalloc_node(unsigned long size, int node) + } + EXPORT_SYMBOL(vzalloc_node); + ++/** ++ * vmalloc_exec - allocate virtually contiguous, executable memory ++ * @size: allocation size ++ * ++ * Kernel-internal function to allocate enough pages to cover @size ++ * the page level allocator and map them into contiguous and ++ * executable kernel virtual space. ++ * ++ * For tight control over page level allocator and protection flags ++ * use __vmalloc() instead. ++ * ++ * Return: pointer to the allocated memory or %NULL on error ++ */ ++void *vmalloc_exec(unsigned long size, gfp_t gfp_mask) ++{ ++ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, ++ gfp_mask, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS, ++ NUMA_NO_NODE, __builtin_return_address(0)); ++} ++EXPORT_SYMBOL_GPL(vmalloc_exec); ++ + #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) + #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) + #elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA) diff --git a/linux-tkg/linux-tkg-patches/5.9/0009-glitched-bmq.patch b/linux-tkg/linux-tkg-patches/5.9/0009-glitched-bmq.patch new file mode 100644 index 0000000..e42e522 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0009-glitched-bmq.patch @@ -0,0 +1,90 @@ +From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001 +From: Tk-Glitch +Date: Wed, 4 Jul 2018 04:30:08 +0200 +Subject: glitched - BMQ + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_500 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -39,6 +39,13 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -52,6 +59,7 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1d9c7ed79b11 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,7 @@ + + choice + prompt "Timer frequency" +- default HZ_500 ++ default HZ_750 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -46,6 +46,13 @@ choice + on desktops with great smoothness without increasing CPU power + consumption and sacrificing the battery life on laptops. + ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness without sacrificing too ++ much throughput. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -60,6 +67,7 @@ config HZ + default 250 if HZ_250 + default 300 if HZ_300 + default 500 if HZ_500 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 9270a4370d54..30d01e647417 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -169,7 +169,7 @@ + /* + * From 0 .. 200. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 20; + + static void set_task_reclaim_state(struct task_struct *task, + struct reclaim_state *rs) diff --git a/linux-tkg/linux-tkg-patches/5.9/0009-glitched-ondemand-bmq.patch b/linux-tkg/linux-tkg-patches/5.9/0009-glitched-ondemand-bmq.patch new file mode 100644 index 0000000..a926040 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0009-glitched-ondemand-bmq.patch @@ -0,0 +1,18 @@ +diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c +index 6b423eebfd5d..61e3271675d6 100644 +--- a/drivers/cpufreq/cpufreq_ondemand.c ++++ b/drivers/cpufreq/cpufreq_ondemand.c +@@ -21,10 +21,10 @@ + #include "cpufreq_ondemand.h" + + /* On-demand governor macros */ +-#define DEF_FREQUENCY_UP_THRESHOLD (80) +-#define DEF_SAMPLING_DOWN_FACTOR (1) ++#define DEF_FREQUENCY_UP_THRESHOLD (55) ++#define DEF_SAMPLING_DOWN_FACTOR (5) + #define MAX_SAMPLING_DOWN_FACTOR (100000) +-#define MICRO_FREQUENCY_UP_THRESHOLD (95) ++#define MICRO_FREQUENCY_UP_THRESHOLD (63) + #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) + #define MIN_FREQUENCY_UP_THRESHOLD (1) + #define MAX_FREQUENCY_UP_THRESHOLD (100) diff --git a/linux-tkg/linux-tkg-patches/5.9/0009-prjc_v5.9-r0.patch b/linux-tkg/linux-tkg-patches/5.9/0009-prjc_v5.9-r0.patch new file mode 100644 index 0000000..550d29c --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0009-prjc_v5.9-r0.patch @@ -0,0 +1,8809 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index a1068742a6df..b97a9697fde4 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4611,6 +4611,12 @@ + + sbni= [NET] Granch SBNI12 leased line adapter + ++ sched_timeslice= ++ [KNL] Time slice in us for BMQ/PDS scheduler. ++ Format: (must be >= 1000) ++ Default: 4000 ++ See Documentation/scheduler/sched-BMQ.txt ++ + sched_debug [KNL] Enables verbose scheduler debug messages. + + schedstats= [KNL,X86] Enable or disable scheduled statistics. 
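
As a concrete illustration of the sched_timeslice= parameter documented above (BMQ/PDS only): a kernel built with this patch could be booted with a 2 ms slice by appending the token below to the kernel command line. The value is illustrative; per the text above it is given in microseconds, must be >= 1000, and defaults to 4000.

    sched_timeslice=2000
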
+diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst +index d4b32cc32bb7..14118e5168ef 100644 +--- a/Documentation/admin-guide/sysctl/kernel.rst ++++ b/Documentation/admin-guide/sysctl/kernel.rst +@@ -1515,3 +1515,13 @@ is 10 seconds. + + The softlockup threshold is (``2 * watchdog_thresh``). Setting this + tunable to zero will disable lockup detection altogether. ++ ++yield_type: ++=========== ++ ++BMQ/PDS CPU scheduler only. This determines what type of yield calls ++to sched_yield will perform. ++ ++ 0 - No yield. ++ 1 - Deboost and requeue task. (default) ++ 2 - Set run queue skip task. +diff --git a/Documentation/scheduler/sched-BMQ.txt b/Documentation/scheduler/sched-BMQ.txt +new file mode 100644 +index 000000000000..05c84eec0f31 +--- /dev/null ++++ b/Documentation/scheduler/sched-BMQ.txt +@@ -0,0 +1,110 @@ ++ BitMap queue CPU Scheduler ++ -------------------------- ++ ++CONTENT ++======== ++ ++ Background ++ Design ++ Overview ++ Task policy ++ Priority management ++ BitMap Queue ++ CPU Assignment and Migration ++ ++ ++Background ++========== ++ ++BitMap Queue CPU scheduler, referred to as BMQ from here on, is an evolution ++of previous Priority and Deadline based Skiplist multiple queue scheduler(PDS), ++and inspired by Zircon scheduler. The goal of it is to keep the scheduler code ++simple, while efficiency and scalable for interactive tasks, such as desktop, ++movie playback and gaming etc. ++ ++Design ++====== ++ ++Overview ++-------- ++ ++BMQ use per CPU run queue design, each CPU(logical) has it's own run queue, ++each CPU is responsible for scheduling the tasks that are putting into it's ++run queue. ++ ++The run queue is a set of priority queues. Note that these queues are fifo ++queue for non-rt tasks or priority queue for rt tasks in data structure. See ++BitMap Queue below for details. BMQ is optimized for non-rt tasks in the fact ++that most applications are non-rt tasks. No matter the queue is fifo or ++priority, In each queue is an ordered list of runnable tasks awaiting execution ++and the data structures are the same. When it is time for a new task to run, ++the scheduler simply looks the lowest numbered queueue that contains a task, ++and runs the first task from the head of that queue. And per CPU idle task is ++also in the run queue, so the scheduler can always find a task to run on from ++its run queue. ++ ++Each task will assigned the same timeslice(default 4ms) when it is picked to ++start running. Task will be reinserted at the end of the appropriate priority ++queue when it uses its whole timeslice. When the scheduler selects a new task ++from the priority queue it sets the CPU's preemption timer for the remainder of ++the previous timeslice. When that timer fires the scheduler will stop execution ++on that task, select another task and start over again. ++ ++If a task blocks waiting for a shared resource then it's taken out of its ++priority queue and is placed in a wait queue for the shared resource. When it ++is unblocked it will be reinserted in the appropriate priority queue of an ++eligible CPU. ++ ++Task policy ++----------- ++ ++BMQ supports DEADLINE, FIFO, RR, NORMAL, BATCH and IDLE task policy like the ++mainline CFS scheduler. But BMQ is heavy optimized for non-rt task, that's ++NORMAL/BATCH/IDLE policy tasks. Below is the implementation detail of each ++policy. ++ ++DEADLINE ++ It is squashed as priority 0 FIFO task. 
++ ++FIFO/RR ++ All RT tasks share one single priority queue in BMQ run queue designed. The ++complexity of insert operation is O(n). BMQ is not designed for system runs ++with major rt policy tasks. ++ ++NORMAL/BATCH/IDLE ++ BATCH and IDLE tasks are treated as the same policy. They compete CPU with ++NORMAL policy tasks, but they just don't boost. To control the priority of ++NORMAL/BATCH/IDLE tasks, simply use nice level. ++ ++ISO ++ ISO policy is not supported in BMQ. Please use nice level -20 NORMAL policy ++task instead. ++ ++Priority management ++------------------- ++ ++RT tasks have priority from 0-99. For non-rt tasks, there are three different ++factors used to determine the effective priority of a task. The effective ++priority being what is used to determine which queue it will be in. ++ ++The first factor is simply the task’s static priority. Which is assigned from ++task's nice level, within [-20, 19] in userland's point of view and [0, 39] ++internally. ++ ++The second factor is the priority boost. This is a value bounded between ++[-MAX_PRIORITY_ADJ, MAX_PRIORITY_ADJ] used to offset the base priority, it is ++modified by the following cases: ++ ++*When a thread has used up its entire timeslice, always deboost its boost by ++increasing by one. ++*When a thread gives up cpu control(voluntary or non-voluntary) to reschedule, ++and its switch-in time(time after last switch and run) below the thredhold ++based on its priority boost, will boost its boost by decreasing by one buti is ++capped at 0 (won’t go negative). ++ ++The intent in this system is to ensure that interactive threads are serviced ++quickly. These are usually the threads that interact directly with the user ++and cause user-perceivable latency. These threads usually do little work and ++spend most of their time blocked awaiting another user event. So they get the ++priority boost from unblocking while background threads that do most of the ++processing receive the priority penalty for using their entire timeslice. 
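
To make the priority-management rules above concrete, here is a small sketch that paraphrases them; it is not the code from this patch, the demo_* names are invented, and only MAX_PRIORITY_ADJ (7 for BMQ, taken from the include/linux/sched/prio.h hunk further down) comes from the patch itself.

    /* Paraphrase of the boost rules described above (illustrative only). */
    #define MAX_PRIORITY_ADJ        7       /* BMQ value from sched/prio.h below */

    struct demo_task {
            int static_prio;        /* 0..39 internally, from nice -20..19 */
            int boost_prio;         /* bounded by +/- MAX_PRIORITY_ADJ */
    };

    /* Whole timeslice used: deboost (a larger value means a lower queue). */
    static void demo_timeslice_expired(struct demo_task *t)
    {
            if (t->boost_prio < MAX_PRIORITY_ADJ)
                    t->boost_prio++;
    }

    /* Gave up the CPU early (below the boost threshold): boost, but per the
     * text above the boost is capped at 0 and never goes negative here. */
    static void demo_voluntary_switch(struct demo_task *t)
    {
            if (t->boost_prio > 0)
                    t->boost_prio--;
    }

    /* Effective priority, i.e. which bitmap queue the task is queued on. */
    static int demo_queue_index(const struct demo_task *t)
    {
            return t->static_prio + t->boost_prio;
    }

The scheduler itself keeps this boost per task (the boost_prio and bmq_idx fields added to task_struct below) and folds it into the bitmap index used by the queue lookup described earlier.
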
+diff --git a/fs/proc/base.c b/fs/proc/base.c +index 617db4e0faa0..f85926764f9a 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -479,7 +479,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + seq_puts(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff --git a/include/asm-generic/resource.h b/include/asm-generic/resource.h +index 8874f681b056..59eb72bf7d5f 100644 +--- a/include/asm-generic/resource.h ++++ b/include/asm-generic/resource.h +@@ -23,7 +23,7 @@ + [RLIMIT_LOCKS] = { RLIM_INFINITY, RLIM_INFINITY }, \ + [RLIMIT_SIGPENDING] = { 0, 0 }, \ + [RLIMIT_MSGQUEUE] = { MQ_BYTES_MAX, MQ_BYTES_MAX }, \ +- [RLIMIT_NICE] = { 0, 0 }, \ ++ [RLIMIT_NICE] = { 30, 30 }, \ + [RLIMIT_RTPRIO] = { 0, 0 }, \ + [RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \ + } +diff --git a/include/linux/sched.h b/include/linux/sched.h +index afe01e232935..8918609cb9f0 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -652,12 +653,18 @@ struct task_struct { + unsigned int ptrace; + + #ifdef CONFIG_SMP +- int on_cpu; + struct __call_single_node wake_entry; ++#endif ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT) ++ int on_cpu; ++#endif ++ ++#ifdef CONFIG_SMP + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; + #endif ++#ifndef CONFIG_SCHED_ALT + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; +@@ -671,6 +678,7 @@ struct task_struct { + */ + int recent_used_cpu; + int wake_cpu; ++#endif /* !CONFIG_SCHED_ALT */ + #endif + int on_rq; + +@@ -679,13 +687,33 @@ struct task_struct { + int normal_prio; + unsigned int rt_priority; + ++#ifdef CONFIG_SCHED_ALT ++ u64 last_ran; ++ s64 time_slice; ++#ifdef CONFIG_SCHED_BMQ ++ int boost_prio; ++ int bmq_idx; ++ struct list_head bmq_node; ++#endif /* CONFIG_SCHED_BMQ */ ++#ifdef CONFIG_SCHED_PDS ++ u64 deadline; ++ u64 priodl; ++ /* skip list level */ ++ int sl_level; ++ /* skip list node */ ++ struct skiplist_node sl_node; ++#endif /* CONFIG_SCHED_PDS */ ++ /* sched_clock time spent running */ ++ u64 sched_time; ++#else /* !CONFIG_SCHED_ALT */ + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++ struct sched_dl_entity dl; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +- struct sched_dl_entity dl; + + #ifdef CONFIG_UCLAMP_TASK + /* +@@ -1332,6 +1360,15 @@ struct task_struct { + */ + }; + ++#ifdef CONFIG_SCHED_ALT ++#define tsk_seruntime(t) ((t)->sched_time) ++/* replace the uncertian rt_timeout with 0UL */ ++#define tsk_rttimeout(t) (0UL) ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++#endif /* !CONFIG_SCHED_ALT */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index 1aff00b65f3c..179d77c8360e 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -1,5 +1,24 @@ + /* SPDX-License-Identifier: GPL-2.0 */ + ++#ifdef CONFIG_SCHED_ALT ++ ++static inline int dl_task(struct task_struct *p) ++{ ++ 
return 0; ++} ++ ++#ifdef CONFIG_SCHED_BMQ ++#define __tsk_deadline(p) (0UL) ++#endif ++ ++#ifdef CONFIG_SCHED_PDS ++#define __tsk_deadline(p) ((p)->priodl) ++#endif ++ ++#else ++ ++#define __tsk_deadline(p) ((p)->dl.deadline) ++ + /* + * SCHED_DEADLINE tasks has negative priorities, reflecting + * the fact that any of them has higher prio than RT and +@@ -19,6 +38,7 @@ static inline int dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } ++#endif /* CONFIG_SCHED_ALT */ + + static inline bool dl_time_before(u64 a, u64 b) + { +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index 7d64feafc408..42730d27ceb5 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -20,11 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + ++/* +/- priority levels from the base priority */ ++#ifdef CONFIG_SCHED_BMQ ++#define MAX_PRIORITY_ADJ 7 ++#endif ++#ifdef CONFIG_SCHED_PDS ++#define MAX_PRIORITY_ADJ 0 ++#endif ++ + /* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index e5af028c08b4..0a7565d0d3cf 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -24,8 +24,10 @@ static inline bool task_is_realtime(struct task_struct *tsk) + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_ALT + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff --git a/include/linux/skip_list.h b/include/linux/skip_list.h +new file mode 100644 +index 000000000000..47ca955a451d +--- /dev/null ++++ b/include/linux/skip_list.h +@@ -0,0 +1,177 @@ ++/* ++ * Copyright (C) 2016 Alfred Chen. ++ * ++ * Code based on Con Kolivas's skip list implementation for BFS, and ++ * which is based on example originally by William Pugh. ++ * ++ * Skip Lists are a probabilistic alternative to balanced trees, as ++ * described in the June 1990 issue of CACM and were invented by ++ * William Pugh in 1987. ++ * ++ * A couple of comments about this implementation: ++ * ++ * This file only provides a infrastructure of skip list. ++ * ++ * skiplist_node is embedded into container data structure, to get rid ++ * the dependency of kmalloc/kfree operation in scheduler code. ++ * ++ * A customized search function should be defined using DEFINE_SKIPLIST_INSERT ++ * macro and be used for skip list insert operation. ++ * ++ * Random Level is also not defined in this file, instead, it should be ++ * customized implemented and set to node->level then pass to the customized ++ * skiplist_insert function. ++ * ++ * Levels start at zero and go up to (NUM_SKIPLIST_LEVEL -1) ++ * ++ * NUM_SKIPLIST_LEVEL in this implementation is 8 instead of origin 16, ++ * considering that there will be 256 entries to enable the top level when using ++ * random level p=0.5, and that number is more than enough for a run queue usage ++ * in a scheduler usage. And it also help to reduce the memory usage of the ++ * embedded skip list node in task_struct to about 50%. ++ * ++ * The insertion routine has been implemented so as to use the ++ * dirty hack described in the CACM paper: if a random level is ++ * generated that is more than the current maximum level, the ++ * current maximum level plus one is used instead. 
++ * ++ * BFS Notes: In this implementation of skiplists, there are bidirectional ++ * next/prev pointers and the insert function returns a pointer to the actual ++ * node the value is stored. The key here is chosen by the scheduler so as to ++ * sort tasks according to the priority list requirements and is no longer used ++ * by the scheduler after insertion. The scheduler lookup, however, occurs in ++ * O(1) time because it is always the first item in the level 0 linked list. ++ * Since the task struct stores a copy of the node pointer upon skiplist_insert, ++ * it can also remove it much faster than the original implementation with the ++ * aid of prev<->next pointer manipulation and no searching. ++ */ ++#ifndef _LINUX_SKIP_LIST_H ++#define _LINUX_SKIP_LIST_H ++ ++#include ++ ++#define NUM_SKIPLIST_LEVEL (8) ++ ++struct skiplist_node { ++ int level; /* Levels in this node */ ++ struct skiplist_node *next[NUM_SKIPLIST_LEVEL]; ++ struct skiplist_node *prev[NUM_SKIPLIST_LEVEL]; ++}; ++ ++#define SKIPLIST_NODE_INIT(name) { 0,\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ {&name, &name, &name, &name,\ ++ &name, &name, &name, &name},\ ++ } ++ ++static inline void INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ /* only level 0 ->next matters in skiplist_empty() */ ++ WRITE_ONCE(node->next[0], node); ++} ++ ++/** ++ * FULL_INIT_SKIPLIST_NODE -- fully init a skiplist_node, expecially for header ++ * @node: the skip list node to be inited. ++ */ ++static inline void FULL_INIT_SKIPLIST_NODE(struct skiplist_node *node) ++{ ++ int i; ++ ++ node->level = 0; ++ for (i = 0; i < NUM_SKIPLIST_LEVEL; i++) { ++ WRITE_ONCE(node->next[i], node); ++ node->prev[i] = node; ++ } ++} ++ ++/** ++ * skiplist_empty - test whether a skip list is empty ++ * @head: the skip list to test. ++ */ ++static inline int skiplist_empty(const struct skiplist_node *head) ++{ ++ return READ_ONCE(head->next[0]) == head; ++} ++ ++/** ++ * skiplist_entry - get the struct for this entry ++ * @ptr: the &struct skiplist_node pointer. ++ * @type: the type of the struct this is embedded in. ++ * @member: the name of the skiplist_node within the struct. ++ */ ++#define skiplist_entry(ptr, type, member) \ ++ container_of(ptr, type, member) ++ ++/** ++ * DEFINE_SKIPLIST_INSERT_FUNC -- macro to define a customized skip list insert ++ * function, which takes two parameters, first one is the header node of the ++ * skip list, second one is the skip list node to be inserted ++ * @func_name: the customized skip list insert function name ++ * @search_func: the search function to be used, which takes two parameters, ++ * 1st one is the itrator of skiplist_node in the list, the 2nd is the skip list ++ * node to be inserted, the function should return true if search should be ++ * continued, otherwise return false. 
++ * Returns 1 if @node is inserted as the first item of skip list at level zero, ++ * otherwise 0 ++ */ ++#define DEFINE_SKIPLIST_INSERT_FUNC(func_name, search_func)\ ++static inline int func_name(struct skiplist_node *head, struct skiplist_node *node)\ ++{\ ++ struct skiplist_node *update[NUM_SKIPLIST_LEVEL];\ ++ struct skiplist_node *p, *q;\ ++ int k = head->level;\ ++\ ++ p = head;\ ++ do {\ ++ while (q = p->next[k], q != head && search_func(q, node))\ ++ p = q;\ ++ update[k] = p;\ ++ } while (--k >= 0);\ ++\ ++ k = node->level;\ ++ if (unlikely(k > head->level)) {\ ++ node->level = k = ++head->level;\ ++ update[k] = head;\ ++ }\ ++\ ++ do {\ ++ p = update[k];\ ++ q = p->next[k];\ ++ node->next[k] = q;\ ++ p->next[k] = node;\ ++ node->prev[k] = p;\ ++ q->prev[k] = node;\ ++ } while (--k >= 0);\ ++\ ++ return (p == head);\ ++} ++ ++/** ++ * skiplist_del_init -- delete skip list node from a skip list and reset it's ++ * init state ++ * @head: the header node of the skip list to be deleted from. ++ * @node: the skip list node to be deleted, the caller need to ensure @node is ++ * in skip list which @head represent. ++ * Returns 1 if @node is the first item of skip level at level zero, otherwise 0 ++ */ ++static inline int ++skiplist_del_init(struct skiplist_node *head, struct skiplist_node *node) ++{ ++ int l, m = node->level; ++ ++ for (l = 0; l <= m; l++) { ++ node->prev[l]->next[l] = node->next[l]; ++ node->next[l]->prev[l] = node->prev[l]; ++ } ++ if (m == head->level && m > 0) { ++ while (head->next[m] == head && m > 0) ++ m--; ++ head->level = m; ++ } ++ INIT_SKIPLIST_NODE(node); ++ ++ return (node->prev[0] == head); ++} ++#endif /* _LINUX_SKIP_LIST_H */ +diff --git a/init/Kconfig b/init/Kconfig +index d6a0b31b13dc..2122dba5596f 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -770,9 +770,39 @@ config GENERIC_SCHED_CLOCK + + menu "Scheduler features" + ++menuconfig SCHED_ALT ++ bool "Alternative CPU Schedulers" ++ default y ++ help ++ This feature enable alternative CPU scheduler" ++ ++if SCHED_ALT ++ ++choice ++ prompt "Alternative CPU Scheduler" ++ default SCHED_BMQ ++ ++config SCHED_BMQ ++ bool "BMQ CPU scheduler" ++ help ++ The BitMap Queue CPU scheduler for excellent interactivity and ++ responsiveness on the desktop and solid scalability on normal ++ hardware and commodity servers. ++ ++config SCHED_PDS ++ bool "PDS CPU scheduler" ++ help ++ The Priority and Deadline based Skip list multiple queue CPU ++ Scheduler. ++ ++endchoice ++ ++endif ++ + config UCLAMP_TASK + bool "Enable utilization clamping for RT/FAIR tasks" + depends on CPU_FREQ_GOV_SCHEDUTIL ++ depends on !SCHED_ALT + help + This feature enables the scheduler to track the clamped utilization + of each CPU based on RUNNABLE tasks scheduled on that CPU. +@@ -858,6 +888,7 @@ config NUMA_BALANCING + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_ALT + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -944,7 +975,7 @@ menuconfig CGROUP_SCHED + bandwidth allocation to such task groups. It uses cgroups to group + tasks. 
+ +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_ALT + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -1200,6 +1231,7 @@ config CHECKPOINT_RESTORE + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_ALT + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff --git a/init/init_task.c b/init/init_task.c +index f6889fce64af..5a23122f3d2c 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -75,9 +75,15 @@ struct task_struct init_task + .stack = init_stack, + .usage = REFCOUNT_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_ALT ++ .prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++ .static_prio = DEFAULT_PRIO, ++ .normal_prio = DEFAULT_PRIO + MAX_PRIORITY_ADJ, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .cpus_mask = CPU_MASK_ALL, +@@ -87,6 +93,19 @@ struct task_struct init_task + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifdef CONFIG_SCHED_ALT ++#ifdef CONFIG_SCHED_BMQ ++ .boost_prio = 0, ++ .bmq_idx = 15, ++ .bmq_node = LIST_HEAD_INIT(init_task.bmq_node), ++#endif ++#ifdef CONFIG_SCHED_PDS ++ .deadline = 0, ++ .sl_level = 0, ++ .sl_node = SKIPLIST_NODE_INIT(init_task.sl_node), ++#endif ++ .time_slice = HZ, ++#else + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -94,6 +113,7 @@ struct task_struct init_task + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c +index 642415b8c3c9..7e0e1fe18035 100644 +--- a/kernel/cgroup/cpuset.c ++++ b/kernel/cgroup/cpuset.c +@@ -636,7 +636,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) + return ret; + } + +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT) + /* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping effective cpus_allowed masks? 
+@@ -1009,7 +1009,7 @@ static void rebuild_sched_domains_locked(void) + /* Have scheduler rebuild the domains */ + partition_and_rebuild_sched_domains(ndoms, doms, attr); + } +-#else /* !CONFIG_SMP */ ++#else /* !CONFIG_SMP || CONFIG_SCHED_ALT */ + static void rebuild_sched_domains_locked(void) + { + } +diff --git a/kernel/delayacct.c b/kernel/delayacct.c +index 27725754ac99..769d773c7182 100644 +--- a/kernel/delayacct.c ++++ b/kernel/delayacct.c +@@ -106,7 +106,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff --git a/kernel/exit.c b/kernel/exit.c +index 733e80f334e7..3f3506c851fd 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -121,7 +121,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -142,7 +142,7 @@ static void __exit_signal(struct task_struct *tsk) + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +index f6310f848f34..4176ad070bc9 100644 +--- a/kernel/livepatch/transition.c ++++ b/kernel/livepatch/transition.c +@@ -306,7 +306,11 @@ static bool klp_try_switch_task(struct task_struct *task) + */ + rq = task_rq_lock(task, &flags); + ++#ifdef CONFIG_SCHED_ALT ++ if (task_running(task) && task != current) { ++#else + if (task_running(rq, task) && task != current) { ++#endif + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index cfdd5b93264d..84c284eb544a 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -227,15 +227,19 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + * Only use with rt_mutex_waiter_{less,equal}() + */ + #define task_to_waiter(p) \ +- &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } ++ &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = __tsk_deadline(p) } + + static inline int + rt_mutex_waiter_less(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) + { ++#ifdef CONFIG_SCHED_PDS ++ return (left->deadline < right->deadline); ++#else + if (left->prio < right->prio) + return 1; + ++#ifndef CONFIG_SCHED_BMQ + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. +@@ -244,17 +248,23 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, + */ + if (dl_prio(left->prio)) + return dl_time_before(left->deadline, right->deadline); ++#endif + + return 0; ++#endif + } + + static inline int + rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) + { ++#ifdef CONFIG_SCHED_PDS ++ return (left->deadline == right->deadline); ++#else + if (left->prio != right->prio) + return 0; + ++#ifndef CONFIG_SCHED_BMQ + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. 
+@@ -263,8 +273,10 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + */ + if (dl_prio(left->prio)) + return left->deadline == right->deadline; ++#endif + + return 1; ++#endif + } + + static void +@@ -678,7 +690,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, + * the values of the node being removed. + */ + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + rt_mutex_enqueue(lock, waiter); + +@@ -951,7 +963,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + waiter->task = task; + waiter->lock = lock; + waiter->prio = task->prio; +- waiter->deadline = task->dl.deadline; ++ waiter->deadline = __tsk_deadline(task); + + /* Get the top priority waiter on the lock */ + if (rt_mutex_has_waiters(lock)) +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 5fc9c9b70862..eb6d7d87779f 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -22,14 +22,20 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + +-obj-y += core.o loadavg.o clock.o cputime.o +-obj-y += idle.o fair.o rt.o deadline.o +-obj-y += wait.o wait_bit.o swait.o completion.o +- +-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o ++ifdef CONFIG_SCHED_ALT ++obj-y += alt_core.o alt_debug.o ++else ++obj-y += core.o ++obj-y += fair.o rt.o deadline.o ++obj-$(CONFIG_SMP) += cpudeadline.o stop_task.o + obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o +-obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_SCHED_DEBUG) += debug.o ++endif ++obj-y += loadavg.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++obj-$(CONFIG_SMP) += cpupri.o pelt.o topology.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o + obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o + obj-$(CONFIG_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o +diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c +new file mode 100644 +index 000000000000..f36264fea75c +--- /dev/null ++++ b/kernel/sched/alt_core.c +@@ -0,0 +1,6360 @@ ++/* ++ * kernel/sched/alt_core.c ++ * ++ * Core alternative kernel scheduler code and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2017-09-06 Priority and Deadline based Skip list multiple queue kernel ++ * scheduler by Alfred Chen. ++ * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. 
++ */ ++#include "sched.h" ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++ ++#include "../workqueue_internal.h" ++#include "../../fs/io-wq.h" ++#include "../smpboot.h" ++ ++#include "pelt.h" ++#include "smp.h" ++ ++#define CREATE_TRACE_POINTS ++#include ++ ++#define ALT_SCHED_VERSION "v5.9-r0" ++ ++/* rt_prio(prio) defined in include/linux/sched/rt.h */ ++#define rt_task(p) rt_prio((p)->prio) ++#define rt_policy(policy) ((policy) == SCHED_FIFO || (policy) == SCHED_RR) ++#define task_has_rt_policy(p) (rt_policy((p)->policy)) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* Default time slice is 4 in ms, can be set via kernel parameter "sched_timeslice" */ ++u64 sched_timeslice_ns __read_mostly = (4 * 1000 * 1000); ++ ++static int __init sched_timeslice(char *str) ++{ ++ int timeslice_us; ++ ++ get_option(&str, ×lice_us); ++ if (timeslice_us >= 1000) ++ sched_timeslice_ns = timeslice_us * 1000; ++ ++ return 0; ++} ++early_param("sched_timeslice", sched_timeslice); ++ ++/* Reschedule if less than this many μs left */ ++#define RESCHED_NS (100 * 1000) ++ ++/** ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Deboost and requeue task. (default) ++ * 2: Set rq skip task. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++#ifdef CONFIG_SMP ++static cpumask_t sched_rq_pending_mask ____cacheline_aligned_in_smp; ++ ++DEFINE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_affinity_end_mask); ++DEFINE_PER_CPU(cpumask_t *, sched_cpu_llc_mask); ++ ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++EXPORT_SYMBOL_GPL(sched_smt_present); ++#endif ++ ++/* ++ * Keep a unique ID per domain (we use the first CPUs number in the cpumask of ++ * the domain), this allows us to quickly tell if two cpus are in the same cache ++ * domain, see cpus_share_cache(). 
++ */ ++DEFINE_PER_CPU(int, sd_llc_id); ++#endif /* CONFIG_SMP */ ++ ++static DEFINE_MUTEX(sched_hotcpu_mutex); ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++#define IDLE_WM (IDLE_TASK_SCHED_PRIO) ++ ++#ifdef CONFIG_SCHED_SMT ++static cpumask_t sched_sg_idle_mask ____cacheline_aligned_in_smp; ++#endif ++static cpumask_t sched_rq_watermark[SCHED_BITS] ____cacheline_aligned_in_smp; ++ ++#ifdef CONFIG_SCHED_BMQ ++#include "bmq_imp.h" ++#endif ++#ifdef CONFIG_SCHED_PDS ++#include "pds_imp.h" ++#endif ++ ++static inline void update_sched_rq_watermark(struct rq *rq) ++{ ++ unsigned long watermark = sched_queue_watermark(rq); ++ unsigned long last_wm = rq->watermark; ++ unsigned long i; ++ int cpu; ++ ++ /*printk(KERN_INFO "sched: watermark(%d) %d, last %d\n", ++ cpu_of(rq), watermark, last_wm);*/ ++ if (watermark == last_wm) ++ return; ++ ++ rq->watermark = watermark; ++ cpu = cpu_of(rq); ++ if (watermark < last_wm) { ++ for (i = watermark + 1; i <= last_wm; i++) ++ cpumask_andnot(&sched_rq_watermark[i], ++ &sched_rq_watermark[i], cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == last_wm) ++ cpumask_andnot(&sched_sg_idle_mask, ++ &sched_sg_idle_mask, cpu_smt_mask(cpu)); ++#endif ++ return; ++ } ++ /* last_wm < watermark */ ++ for (i = last_wm + 1; i <= watermark; i++) ++ cpumask_set_cpu(cpu, &sched_rq_watermark[i]); ++#ifdef CONFIG_SCHED_SMT ++ if (!static_branch_likely(&sched_smt_present)) ++ return; ++ if (IDLE_WM == watermark) { ++ cpumask_t tmp; ++ cpumask_and(&tmp, cpu_smt_mask(cpu), &sched_rq_watermark[IDLE_WM]); ++ if (cpumask_equal(&tmp, cpu_smt_mask(cpu))) ++ cpumask_or(&sched_sg_idle_mask, cpu_smt_mask(cpu), ++ &sched_sg_idle_mask); ++ } ++#endif ++} ++ ++static inline struct task_struct *rq_runnable_task(struct rq *rq) ++{ ++ struct task_struct *next = sched_rq_first_task(rq); ++ ++ if (unlikely(next == rq->skip)) ++ next = sched_rq_next_task(next, rq); ++ ++ return next; ++} ++ ++/* ++ * Serialization rules: ++ * ++ * Lock order: ++ * ++ * p->pi_lock ++ * rq->lock ++ * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls) ++ * ++ * rq1->lock ++ * rq2->lock where: rq1 < rq2 ++ * ++ * Regular state: ++ * ++ * Normal scheduling state is serialized by rq->lock. __schedule() takes the ++ * local CPU's rq->lock, it optionally removes the task from the runqueue and ++ * always looks at the local rq data structures to find the most elegible task ++ * to run next. ++ * ++ * Task enqueue is also under rq->lock, possibly taken from another CPU. ++ * Wakeups from another LLC domain might use an IPI to transfer the enqueue to ++ * the local CPU to avoid bouncing the runqueue state around [ see ++ * ttwu_queue_wakelist() ] ++ * ++ * Task wakeup, specifically wakeups that involve migration, are horribly ++ * complicated to avoid having to take two rq->locks. ++ * ++ * Special state: ++ * ++ * System-calls and anything external will use task_rq_lock() which acquires ++ * both p->pi_lock and rq->lock. 
As a consequence the state they change is ++ * stable while holding either lock: ++ * ++ * - sched_setaffinity()/ ++ * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed ++ * - set_user_nice(): p->se.load, p->*prio ++ * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio, ++ * p->se.load, p->rt_priority, ++ * p->dl.dl_{runtime, deadline, period, flags, bw, density} ++ * - sched_setnuma(): p->numa_preferred_nid ++ * - sched_move_task()/ ++ * cpu_cgroup_fork(): p->sched_task_group ++ * - uclamp_update_active() p->uclamp* ++ * ++ * p->state <- TASK_*: ++ * ++ * is changed locklessly using set_current_state(), __set_current_state() or ++ * set_special_state(), see their respective comments, or by ++ * try_to_wake_up(). This latter uses p->pi_lock to serialize against ++ * concurrent self. ++ * ++ * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }: ++ * ++ * is set by activate_task() and cleared by deactivate_task(), under ++ * rq->lock. Non-zero indicates the task is runnable, the special ++ * ON_RQ_MIGRATING state is used for migration without holding both ++ * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock(). ++ * ++ * p->on_cpu <- { 0, 1 }: ++ * ++ * is set by prepare_task() and cleared by finish_task() such that it will be ++ * set before p is scheduled-in and cleared after p is scheduled-out, both ++ * under rq->lock. Non-zero indicates the task is running on its CPU. ++ * ++ * [ The astute reader will observe that it is possible for two tasks on one ++ * CPU to have ->on_cpu = 1 at the same time. ] ++ * ++ * task_cpu(p): is changed by set_task_cpu(), the rules are: ++ * ++ * - Don't call set_task_cpu() on a blocked task: ++ * ++ * We don't care what CPU we're not running on, this simplifies hotplug, ++ * the CPU assignment of blocked tasks isn't required to be valid. ++ * ++ * - for try_to_wake_up(), called under p->pi_lock: ++ * ++ * This allows try_to_wake_up() to only take one rq->lock, see its comment. 
++ * ++ * - for migration called under rq->lock: ++ * [ see task_on_rq_migrating() in task_rq_lock() ] ++ * ++ * o move_queued_task() ++ * o detach_task() ++ * ++ * - for migration called under double_rq_lock(): ++ * ++ * o __migrate_swap_task() ++ * o push_rt_task() / pull_rt_task() ++ * o push_dl_task() / pull_dl_task() ++ * o dl_task_offline_migration() ++ * ++ */ ++ ++/* ++ * Context: p->pi_lock ++ */ ++static inline struct rq ++*__task_access_lock(struct task_struct *p, raw_spinlock_t **plock) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock(&rq->lock); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ *plock = NULL; ++ return rq; ++ } ++ } ++} ++ ++static inline void ++__task_access_unlock(struct task_struct *p, raw_spinlock_t *lock) ++{ ++ if (NULL != lock) ++ raw_spin_unlock(lock); ++} ++ ++static inline struct rq ++*task_access_lock_irqsave(struct task_struct *p, raw_spinlock_t **plock, ++ unsigned long *flags) ++{ ++ struct rq *rq; ++ for (;;) { ++ rq = task_rq(p); ++ if (p->on_cpu || task_on_rq_queued(p)) { ++ raw_spin_lock_irqsave(&rq->lock, *flags); ++ if (likely((p->on_cpu || task_on_rq_queued(p)) ++ && rq == task_rq(p))) { ++ *plock = &rq->lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&rq->lock, *flags); ++ } else if (task_on_rq_migrating(p)) { ++ do { ++ cpu_relax(); ++ } while (unlikely(task_on_rq_migrating(p))); ++ } else { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ if (likely(!p->on_cpu && !p->on_rq && ++ rq == task_rq(p))) { ++ *plock = &p->pi_lock; ++ return rq; ++ } ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ } ++} ++ ++static inline void ++task_access_unlock_irqrestore(struct task_struct *p, raw_spinlock_t *lock, ++ unsigned long *flags) ++{ ++ raw_spin_unlock_irqrestore(lock, *flags); ++} ++ ++/* ++ * __task_rq_lock - lock the rq @p resides on. ++ */ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ for (;;) { ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) ++ return rq; ++ raw_spin_unlock(&rq->lock); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++/* ++ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. ++ */ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ for (;;) { ++ raw_spin_lock_irqsave(&p->pi_lock, rf->flags); ++ rq = task_rq(p); ++ raw_spin_lock(&rq->lock); ++ /* ++ * move_queued_task() task_rq_lock() ++ * ++ * ACQUIRE (rq->lock) ++ * [S] ->on_rq = MIGRATING [L] rq = task_rq() ++ * WMB (__set_task_cpu()) ACQUIRE (rq->lock); ++ * [S] ->cpu = new_cpu [L] task_rq() ++ * [L] ->on_rq ++ * RELEASE (rq->lock) ++ * ++ * If we observe the old CPU in task_rq_lock(), the acquire of ++ * the old rq->lock will fully serialize against the stores. ++ * ++ * If we observe the new CPU in task_rq_lock(), the address ++ * dependency headed by '[L] rq = task_rq()' and the acquire ++ * will pair with the WMB to ensure we then also see migrating. 
++ */ ++ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { ++ return rq; ++ } ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++ ++ while (unlikely(task_on_rq_migrating(p))) ++ cpu_relax(); ++ } ++} ++ ++static inline void ++rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(&rq->lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(&rq->lock, rf->flags); ++} ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++ s64 __maybe_unused steal = 0, irq_delta = 0; ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. ++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ ++ rq->clock_task += delta; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ if ((irq_delta + steal)) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta <= 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out ++ * of nohz mode if necessary. 
++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (rq->nr_running < 2) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++/* ++ * Add/Remove/Requeue task to/from the runqueue routines ++ * Context: rq->lock ++ */ ++static inline void dequeue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ /*printk(KERN_INFO "sched: dequeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ ++ WARN_ONCE(task_rq(p) != rq, "sched: dequeue task reside on cpu%d from cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ __SCHED_DEQUEUE_TASK(p, rq, flags, update_sched_rq_watermark(rq)); ++ --rq->nr_running; ++#ifdef CONFIG_SMP ++ if (1 == rq->nr_running) ++ cpumask_clear_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++} ++ ++static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ /*printk(KERN_INFO "sched: enqueue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ ++ WARN_ONCE(task_rq(p) != rq, "sched: enqueue task reside on cpu%d to cpu%d\n", ++ task_cpu(p), cpu_of(rq)); ++ ++ __SCHED_ENQUEUE_TASK(p, rq, flags); ++ update_sched_rq_watermark(rq); ++ ++rq->nr_running; ++#ifdef CONFIG_SMP ++ if (2 == rq->nr_running) ++ cpumask_set_cpu(cpu_of(rq), &sched_rq_pending_mask); ++#endif ++ ++ sched_update_tick_dependency(rq); ++ ++ /* ++ * If in_iowait is set, the code below may not trigger any cpufreq ++ * utilization updates, so do it here explicitly with the IOWAIT flag ++ * passed. ++ */ ++ if (p->in_iowait) ++ cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq) ++{ ++ lockdep_assert_held(&rq->lock); ++ /*printk(KERN_INFO "sched: requeue(%d) %px %016llx\n", cpu_of(rq), p, p->priodl);*/ ++ WARN_ONCE(task_rq(p) != rq, "sched: cpu[%d] requeue task reside on cpu%d\n", ++ cpu_of(rq), task_cpu(p)); ++ ++ __SCHED_REQUEUE_TASK(p, rq, update_sched_rq_watermark(rq)); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. 
++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * In order to ensure that a pending wakeup will observe our pending ++ * state, even in the failed case, an explicit smp_mb() must be used. ++ */ ++ smp_mb__before_atomic(); ++ if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) ++ return false; ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++ return true; ++} ++ ++/** ++ * wake_q_add() - queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ */ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (__wake_q_add(head, task)) ++ get_task_struct(task); ++} ++ ++/** ++ * wake_q_add_safe() - safely queue a wakeup for 'later' waking. ++ * @head: the wake_q_head to add @task to ++ * @task: the task to queue for 'later' wakeup ++ * ++ * Queue a task for later wakeup, most likely by the wake_up_q() call in the ++ * same context, _HOWEVER_ this is not guaranteed, the wakeup can come ++ * instantly. ++ * ++ * This function must be used as-if it were wake_up_process(); IOW the task ++ * must be ready to be woken at this location. ++ * ++ * This function is essentially a task-safe equivalent to wake_q_add(). Callers ++ * that already hold reference to @task can call the 'safe' version and trust ++ * wake_q to do the right thing depending whether or not the @task is already ++ * queued for wakeup. ++ */ ++void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) ++{ ++ if (!__wake_q_add(head, task)) ++ put_task_struct(task); ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* task can safely be re-inserted now: */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++/* ++ * resched_curr - mark rq's current task 'to be rescheduled now'. 
++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_curr(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ lockdep_assert_held(&rq->lock); ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(curr)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(cpu_rq(cpu)); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) {} ++ ++void select_nohz_load_balancer(int stop_tick) {} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(), default_cpu = -1; ++ struct cpumask *mask; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { ++ if (!idle_cpu(cpu)) ++ return cpu; ++ default_cpu = cpu; ++ } ++ ++ for (mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ mask < per_cpu(sched_cpu_affinity_end_mask, cpu); mask++) ++ for_each_cpu_and(i, mask, housekeeping_cpumask(HK_FLAG_TIMER)) ++ if (!idle_cpu(i)) ++ return i; ++ ++ if (default_cpu == -1) ++ default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++ cpu = default_cpu; ++ ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++static inline void wake_up_idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(rq->idle)) ++ smp_send_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static inline bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. 
*/ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++ ++static void nohz_csd_func(void *info) ++{ ++ struct rq *rq = info; ++ int cpu = cpu_of(rq); ++ unsigned int flags; ++ ++ /* ++ * Release the rq::nohz_csd. ++ */ ++ flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu)); ++ WARN_ON(!(flags & NOHZ_KICK_MASK)); ++ ++ rq->idle_balance = idle_cpu(cpu); ++ if (rq->idle_balance && !need_resched()) { ++ rq->nohz_idle_balance = flags; ++ raise_softirq_irqoff(SCHED_SOFTIRQ); ++ } ++} ++ ++#endif /* CONFIG_NO_HZ_COMMON */ ++#endif /* CONFIG_SMP */ ++ ++static inline void check_preempt_curr(struct rq *rq) ++{ ++ if (sched_rq_first_task(rq) != rq->curr) ++ resched_curr(rq); ++} ++ ++static inline void ++rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) ++{ ++ csd->flags = 0; ++ csd->func = func; ++ csd->info = rq; ++} ++ ++#ifdef CONFIG_SCHED_HRTICK ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++ ++static void hrtick_clear(struct rq *rq) ++{ ++ if (hrtimer_active(&rq->hrtick_timer)) ++ hrtimer_cancel(&rq->hrtick_timer); ++} ++ ++/* ++ * High-resolution timer tick. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrtick(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrtick_timer); ++ struct task_struct *p; ++ ++ WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); ++ ++ raw_spin_lock(&rq->lock); ++ p = rq->curr; ++ p->time_slice = 0; ++ resched_curr(rq); ++ raw_spin_unlock(&rq->lock); ++ ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Use hrtick when: ++ * - enabled by features ++ * - hrtimer is actually high res ++ */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ /** ++ * Alt schedule FW doesn't support sched_feat yet ++ if (!sched_feat(HRTICK)) ++ return 0; ++ */ ++ if (!cpu_active(cpu_of(rq))) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrtick_timer); ++} ++ ++#ifdef CONFIG_SMP ++ ++static void __hrtick_restart(struct rq *rq) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ++ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); ++} ++ ++/* ++ * called from hardirq (IPI) context ++ */ ++static void __hrtick_start(void *arg) ++{ ++ struct rq *rq = arg; ++ ++ raw_spin_lock(&rq->lock); ++ __hrtick_restart(rq); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ struct hrtimer *timer = &rq->hrtick_timer; ++ ktime_t time; ++ s64 delta; ++ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense and can cause timer DoS. ++ */ ++ delta = max_t(s64, delay, 10000LL); ++ time = ktime_add_ns(timer->base->get_time(), delta); ++ ++ hrtimer_set_expires(timer, time); ++ ++ if (rq == this_rq()) ++ __hrtick_restart(rq); ++ else ++ smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd); ++} ++ ++#else ++/* ++ * Called to set the hrtick timer state. ++ * ++ * called with rq->lock held and irqs disabled ++ */ ++void hrtick_start(struct rq *rq, u64 delay) ++{ ++ /* ++ * Don't schedule slices shorter than 10000ns, that just ++ * doesn't make sense. Rely on vruntime for fairness. 
++ */ ++ delay = max_t(u64, delay, 10000LL); ++ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED_HARD); ++} ++#endif /* CONFIG_SMP */ ++ ++static void hrtick_rq_init(struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); ++#endif ++ ++ hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); ++ rq->hrtick_timer.function = hrtick; ++} ++#else /* CONFIG_SCHED_HRTICK */ ++static inline int hrtick_enabled(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline void hrtick_clear(struct rq *rq) ++{ ++} ++ ++static inline void hrtick_rq_init(struct rq *rq) ++{ ++} ++#endif /* CONFIG_SCHED_HRTICK */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (task_has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ ++ return p->static_prio + MAX_PRIORITY_ADJ; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ enqueue_task(p, rq, ENQUEUE_WAKEUP); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ cpufreq_update_util(rq, 0); ++} ++ ++/* ++ * deactivate_task - remove a task from the runqueue. ++ * ++ * Context: rq->lock ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ dequeue_task(p, rq, DEQUEUE_SLEEP); ++ p->on_rq = 0; ++ cpufreq_update_util(rq, 0); ++} ++ ++static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->cpu is set up to a new value, task_access_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ WRITE_ONCE(p->cpu, cpu); ++#else ++ WRITE_ONCE(task_thread_info(p)->cpu, cpu); ++#endif ++#endif ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++#ifdef CONFIG_SCHED_DEBUG ++ /* ++ * We should never call set_task_cpu() on a blocked task, ++ * ttwu() will sort out the placement. ++ */ ++ WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && ++ !p->on_rq); ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * sched_move_task() holds both and thus holding either pins the cgroup, ++ * see task_group(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(&task_rq(p)->lock))); ++#endif ++ /* ++ * Clearly, migrating tasks to offline CPUs is a fairly daft thing. 
++ */ ++ WARN_ON_ONCE(!cpu_online(new_cpu)); ++#endif ++ if (task_cpu(p) == new_cpu) ++ return; ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ __set_task_cpu(p, new_cpu); ++} ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ return ((p->flags & PF_KTHREAD) && (1 == p->nr_cpus_allowed)); ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr() and select_fallback_rq(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * This is how migration works: ++ * ++ * 1) we invoke migration_cpu_stop() on the target CPU using ++ * stop_one_cpu(). ++ * 2) stopper starts to run (implicitly forcing the migrated thread ++ * off the CPU) ++ * 3) it checks whether the migrated task is still in the wrong runqueue. ++ * 4) if it's in the wrong runqueue then the migration thread removes ++ * it and puts it into the right queue. ++ * 5) stopper completes and stop_one_cpu() returns and the migration ++ * is done. ++ */ ++ ++/* ++ * move_queued_task - move a queued task to new rq. ++ * ++ * Returns (locked) new rq. Old rq's lock is released. ++ */ ++static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int ++ new_cpu) ++{ ++ lockdep_assert_held(&rq->lock); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); ++ dequeue_task(p, rq, 0); ++ set_task_cpu(p, new_cpu); ++ raw_spin_unlock(&rq->lock); ++ ++ rq = cpu_rq(new_cpu); ++ ++ raw_spin_lock(&rq->lock); ++ BUG_ON(task_cpu(p) != new_cpu); ++ enqueue_task(p, rq, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++ check_preempt_curr(rq); ++ ++ return rq; ++} ++ ++struct migration_arg { ++ struct task_struct *task; ++ int dest_cpu; ++}; ++ ++/* ++ * Move (not current) task off this CPU, onto the destination CPU. We're doing ++ * this because either it can't run here any more (set_cpus_allowed() ++ * away from this CPU, or CPU going down), or because we're ++ * attempting to rebalance this task on exec (sched_exec). ++ * ++ * So we race with normal scheduler movements, but that's OK, as long ++ * as the task is no longer on this CPU. ++ */ ++static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int ++ dest_cpu) ++{ ++ /* Affinity changed (again). */ ++ if (!is_cpu_allowed(p, dest_cpu)) ++ return rq; ++ ++ update_rq_clock(rq); ++ return move_queued_task(rq, p, dest_cpu); ++} ++ ++/* ++ * migration_cpu_stop - this will be executed by a highprio stopper thread ++ * and performs thread migration by bumping thread off CPU then ++ * 'pushing' onto another runqueue. ++ */ ++static int migration_cpu_stop(void *data) ++{ ++ struct migration_arg *arg = data; ++ struct task_struct *p = arg->task; ++ struct rq *rq = this_rq(); ++ ++ /* ++ * The original target CPU might have gone down and we might ++ * be on another CPU but it doesn't matter. ++ */ ++ local_irq_disable(); ++ /* ++ * We need to explicitly wake pending tasks before running ++ * __migrate_task() such that we will not miss enforcing cpus_ptr ++ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test. 
++ */ ++ flush_smp_call_function_from_idle(); ++ ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ /* ++ * If task_rq(p) != rq, it cannot be migrated here, because we're ++ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because ++ * we're holding p->pi_lock. ++ */ ++ if (task_rq(p) == rq && task_on_rq_queued(p)) ++ rq = __migrate_task(rq, p, arg->dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ local_irq_enable(); ++ return 0; ++} ++ ++static inline void ++set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_mask, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ set_cpus_allowed_common(p, new_mask); ++} ++#endif ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ unsigned long flags; ++ bool running, on_rq; ++ unsigned long ncsw; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(p) && p == rq->curr) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ task_access_lock_irqsave(p, &lock, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(p); ++ on_rq = p->on_rq; ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. 
++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(on_rq)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_send_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++ ++/* ++ * ->cpus_ptr is protected by both rq->lock and p->pi_lock ++ * ++ * A few notes on cpu_active vs cpu_online: ++ * ++ * - cpu_active must be a subset of cpu_online ++ * ++ * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, ++ * see __set_cpus_allowed_ptr(). At this point the newly online ++ * CPU isn't yet part of the sched domains, and balancing will not ++ * see it. ++ * ++ * - on cpu-down we clear cpu_active() to mask the sched domains and ++ * avoid the load balancer to place new tasks on the to be removed ++ * CPU. Existing tasks will remain running there and will be taken ++ * off. ++ * ++ * This means that fallback selection must not select !active CPUs. ++ * And can assume that any active CPU must be online. Conversely ++ * select_task_rq() below may allow selection of !active CPUs in order ++ * to satisfy the above rules. ++ */ ++static int select_fallback_rq(int cpu, struct task_struct *p) ++{ ++ int nid = cpu_to_node(cpu); ++ const struct cpumask *nodemask = NULL; ++ enum { cpuset, possible, fail } state = cpuset; ++ int dest_cpu; ++ ++ /* ++ * If the node that the CPU is on has been offlined, cpu_to_node() ++ * will return -1. There is no CPU on the node, and we should ++ * select the CPU on the other node. ++ */ ++ if (nid != -1) { ++ nodemask = cpumask_of_node(nid); ++ ++ /* Look for allowed, online CPU in same node. */ ++ for_each_cpu(dest_cpu, nodemask) { ++ if (!cpu_active(dest_cpu)) ++ continue; ++ if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) ++ return dest_cpu; ++ } ++ } ++ ++ for (;;) { ++ /* Any allowed, online CPU? */ ++ for_each_cpu(dest_cpu, p->cpus_ptr) { ++ if (!is_cpu_allowed(p, dest_cpu)) ++ continue; ++ goto out; ++ } ++ ++ /* No more Mr. Nice Guy. 
*/ ++ switch (state) { ++ case cpuset: ++ if (IS_ENABLED(CONFIG_CPUSETS)) { ++ cpuset_cpus_allowed_fallback(p); ++ state = possible; ++ break; ++ } ++ fallthrough; ++ case possible: ++ do_set_cpus_allowed(p, cpu_possible_mask); ++ state = fail; ++ break; ++ ++ case fail: ++ BUG(); ++ break; ++ } ++ } ++ ++out: ++ if (state != cpuset) { ++ /* ++ * Don't tell them about moving exiting tasks or ++ * kernel threads (both mm NULL), since they never ++ * leave kernel. ++ */ ++ if (p->mm && printk_ratelimit()) { ++ printk_deferred("process %d (%s) no longer affine to cpu%d\n", ++ task_pid_nr(p), p->comm, cpu); ++ } ++ } ++ ++ return dest_cpu; ++} ++ ++static inline int select_task_rq(struct task_struct *p, struct rq *rq) ++{ ++ cpumask_t chk_mask, tmp; ++ ++ if (unlikely(!cpumask_and(&chk_mask, p->cpus_ptr, cpu_online_mask))) ++ return select_fallback_rq(task_cpu(p), p); ++ ++ if ( ++#ifdef CONFIG_SCHED_SMT ++ cpumask_and(&tmp, &chk_mask, &sched_sg_idle_mask) || ++#endif ++ cpumask_and(&tmp, &chk_mask, &sched_rq_watermark[IDLE_WM]) || ++ cpumask_and(&tmp, &chk_mask, ++ &sched_rq_watermark[task_sched_prio(p, rq) + 1])) ++ return best_mask_cpu(task_cpu(p), &tmp); ++ ++ return best_mask_cpu(task_cpu(p), &chk_mask); ++} ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ int dest_cpu; ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. 
++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ if (cpumask_equal(&p->cpus_mask, new_mask)) ++ goto out; ++ ++ dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (dest_cpu >= nr_cpu_ids) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ do_set_cpus_allowed(p, new_mask); ++ ++ if (p->flags & PF_KTHREAD) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(p) || p->state == TASK_WAKING) { ++ struct migration_arg arg = { p, dest_cpu }; ++ ++ /* Need help from migration thread: drop lock and wait. */ ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); ++ return 0; ++ } ++ if (task_on_rq_queued(p)) { ++ /* ++ * OK, since we're going to drop the lock immediately ++ * afterwards anyway. ++ */ ++ update_rq_clock(rq); ++ rq = move_queued_task(rq, p, dest_cpu); ++ lock = &rq->lock; ++ } ++ ++out: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#else /* CONFIG_SMP */ ++ ++static inline int select_task_rq(struct task_struct *p, struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline int ++__set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++ ++#endif /* CONFIG_SMP */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq= this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) ++ __schedstat_inc(rq->ttwu_local); ++ else { ++ /** Alt schedule FW ToDo: ++ * How to do ttwu_wake_remote ++ */ ++ } ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static inline void ++ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ check_preempt_curr(rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static inline void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++ ++ activate_task(p, rq); ++ ttwu_do_wakeup(rq, p, 0); ++} ++ ++/* ++ * Consider @p being inside a wait loop: ++ * ++ * for (;;) { ++ * set_current_state(TASK_UNINTERRUPTIBLE); ++ * ++ * if (CONDITION) ++ * break; ++ * ++ * schedule(); ++ * } ++ * __set_current_state(TASK_RUNNING); ++ * ++ * between set_current_state() and schedule(). In this case @p is still ++ * runnable, so all that needs doing is change p->state back to TASK_RUNNING in ++ * an atomic manner. ++ * ++ * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq ++ * then schedule() must still happen and p->state can be changed to ++ * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we ++ * need to do a full wakeup with enqueue. 
++ * ++ * Returns: %true when the wakeup is done, ++ * %false otherwise. ++ */ ++static int ttwu_runnable(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ int ret = 0; ++ ++ rq = __task_access_lock(p, &lock); ++ if (task_on_rq_queued(p)) { ++ /* check_preempt_curr() may use rq clock */ ++ update_rq_clock(rq); ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_access_unlock(p, lock); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void *arg) ++{ ++ struct llist_node *llist = arg; ++ struct rq *rq = this_rq(); ++ struct task_struct *p, *t; ++ struct rq_flags rf; ++ ++ if (!llist) ++ return; ++ ++ /* ++ * rq::ttwu_pending racy indication of out-standing wakeups. ++ * Races such that false-negatives are possible, since they ++ * are shorter lived that false-positives would be. ++ */ ++ WRITE_ONCE(rq->ttwu_pending, 0); ++ ++ rq_lock_irqsave(rq, &rf); ++ update_rq_clock(rq); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry.llist) { ++ if (WARN_ON_ONCE(p->on_cpu)) ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) ++ set_task_cpu(p, cpu_of(rq)); ++ ++ ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0); ++ } ++ ++ rq_unlock_irqrestore(rq, &rf); ++} ++ ++void send_call_function_single_ipi(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (!set_nr_if_polling(rq->idle)) ++ arch_send_call_function_single_ipi(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * Queue a task on the target CPUs wake_list and wake the CPU via IPI if ++ * necessary. The wakee CPU on receipt of the IPI will queue the task ++ * via sched_ttwu_wakeup() for activation so the wakee incurs the cost ++ * of the wakeup instead of the waker. ++ */ ++static void __ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED); ++ ++ WRITE_ONCE(rq->ttwu_pending, 1); ++ __smp_call_single_queue(cpu, &p->wake_entry.llist); ++} ++ ++static inline bool ttwu_queue_cond(int cpu, int wake_flags) ++{ ++ /* ++ * If the CPU does not share cache, then queue the task on the ++ * remote rqs wakelist to avoid accessing remote data. ++ */ ++ if (!cpus_share_cache(smp_processor_id(), cpu)) ++ return true; ++ ++ /* ++ * If the task is descheduling and the only running task on the ++ * CPU then use the wakelist to offload the task activation to ++ * the soon-to-be-idle CPU as the current CPU is likely busy. ++ * nr_running is checked to avoid unnecessary task stacking. 
++ */ ++ if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) ++ return true; ++ ++ return false; ++} ++ ++static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ if (__is_defined(ALT_SCHED_TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { ++ if (WARN_ON_ONCE(cpu == smp_processor_id())) ++ return false; ++ ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ __ttwu_queue_wakelist(p, cpu, wake_flags); ++ return true; ++ } ++ ++ return false; ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ if (is_idle_task(rq->curr)) ++ smp_send_reschedule(cpu); ++ /* Else CPU is not idle, do nothing here */ ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); ++} ++#else /* !CONFIG_SMP */ ++ ++static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) ++{ ++ return false; ++} ++ ++#endif /* CONFIG_SMP */ ++ ++static inline void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (ttwu_queue_wakelist(p, cpu, wake_flags)) ++ return; ++ ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ raw_spin_unlock(&rq->lock); ++} ++ ++/* ++ * Notes on Program-Order guarantees on SMP systems. ++ * ++ * MIGRATION ++ * ++ * The basic program-order guarantee on SMP systems is that when a task [t] ++ * migrates, all its activity on its old CPU [c0] happens-before any subsequent ++ * execution on its new CPU [c1]. ++ * ++ * For migration (of runnable tasks) this is provided by the following means: ++ * ++ * A) UNLOCK of the rq(c0)->lock scheduling out task t ++ * B) migration for t is required to synchronize *both* rq(c0)->lock and ++ * rq(c1)->lock (if not at the same time, then in that order). ++ * C) LOCK of the rq(c1)->lock scheduling in task ++ * ++ * Transitivity guarantees that B happens after A and C after B. ++ * Note: we only require RCpc transitivity. ++ * Note: the CPU doing B need not be c0 or c1 ++ * ++ * Example: ++ * ++ * CPU0 CPU1 CPU2 ++ * ++ * LOCK rq(0)->lock ++ * sched-out X ++ * sched-in Y ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(0)->lock // orders against CPU0 ++ * dequeue X ++ * UNLOCK rq(0)->lock ++ * ++ * LOCK rq(1)->lock ++ * enqueue X ++ * UNLOCK rq(1)->lock ++ * ++ * LOCK rq(1)->lock // orders against CPU2 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(1)->lock ++ * ++ * ++ * BLOCKING -- aka. SLEEP + WAKEUP ++ * ++ * For blocking we (obviously) need to provide the same guarantee as for ++ * migration. However the means are completely different as there is no lock ++ * chain to provide order. 
Instead we do: ++ * ++ * 1) smp_store_release(X->on_cpu, 0) -- finish_task() ++ * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up() ++ * ++ * Example: ++ * ++ * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) ++ * ++ * LOCK rq(0)->lock LOCK X->pi_lock ++ * dequeue X ++ * sched-out X ++ * smp_store_release(X->on_cpu, 0); ++ * ++ * smp_cond_load_acquire(&X->on_cpu, !VAL); ++ * X->state = WAKING ++ * set_task_cpu(X,2) ++ * ++ * LOCK rq(2)->lock ++ * enqueue X ++ * X->state = RUNNING ++ * UNLOCK rq(2)->lock ++ * ++ * LOCK rq(2)->lock // orders against CPU1 ++ * sched-out Z ++ * sched-in X ++ * UNLOCK rq(2)->lock ++ * ++ * UNLOCK X->pi_lock ++ * UNLOCK rq(0)->lock ++ * ++ * ++ * However; for wakeups there is a second guarantee we must provide, namely we ++ * must observe the state that lead to our wakeup. That is, not only must our ++ * task observe its own prior state, it must also observe the stores prior to ++ * its wakeup. ++ * ++ * This means that any means of doing remote wakeups must order the CPU doing ++ * the wakeup against the CPU the task is going to end up running on. This, ++ * however, is already required for the regular Program-Order guarantee above, ++ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). ++ * ++ */ ++ ++/** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Conceptually does: ++ * ++ * If (@state & @p->state) @p->state = TASK_RUNNING. ++ * ++ * If the task was not queued/runnable, also place it back on a runqueue. ++ * ++ * This function is atomic against schedule() which would dequeue the task. ++ * ++ * It issues a full memory barrier before accessing @p->state, see the comment ++ * with set_current_state(). ++ * ++ * Uses p->pi_lock to serialize against concurrent wake-ups. ++ * ++ * Relies on p->pi_lock stabilizing: ++ * - p->sched_class ++ * - p->cpus_ptr ++ * - p->sched_task_group ++ * in order to do migration, see its use of select_task_rq()/set_task_cpu(). ++ * ++ * Tries really hard to only take one task_rq(p)->lock for performance. ++ * Takes rq->lock in: ++ * - ttwu_runnable() -- old rq, unavoidable, see comment there; ++ * - ttwu_queue() -- new rq, for enqueue of the task; ++ * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us. ++ * ++ * As a consequence we race really badly with just about everything. See the ++ * many memory barriers and their comments for details. ++ * ++ * Return: %true if @p->state changes (an actual wakeup was done), ++ * %false otherwise. ++ */ ++static int try_to_wake_up(struct task_struct *p, unsigned int state, ++ int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ preempt_disable(); ++ if (p == current) { ++ /* ++ * We're waking current, this means 'p->on_rq' and 'task_cpu(p) ++ * == smp_processor_id()'. Together this means we can special ++ * case the whole 'p->on_rq && ttwu_runnable()' case below ++ * without taking any locks. ++ * ++ * In particular: ++ * - we rely on Program-Order guarantees for all the ordering, ++ * - we're serialized against set_special_state() by virtue of ++ * it disabling IRQs (this allows not taking ->pi_lock). 
++ */ ++ if (!(p->state & state)) ++ goto out; ++ ++ success = 1; ++ trace_sched_waking(p); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++ goto out; ++ } ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with smp_store_mb() ++ * in set_current_state() that the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ if (!(p->state & state)) ++ goto unlock; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ * ++ * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). ++ */ ++ smp_rmb(); ++ if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) ++ goto unlock; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ * ++ * Form a control-dep-acquire with p->on_rq == 0 above, to ensure ++ * schedule()'s deactivate_task() has 'happened' and p will no longer ++ * care about it's own p->state. See the comment in __schedule(). ++ */ ++ smp_acquire__after_ctrl_dep(); ++ ++ /* ++ * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq ++ * == 0), which means we need to do an enqueue, change p->state to ++ * TASK_WAKING such that we can unlock p->pi_lock before doing the ++ * enqueue, such as ttwu_queue_wakelist(). ++ */ ++ p->state = TASK_WAKING; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, considering queueing p on the remote CPUs wake_list ++ * which potentially sends an IPI instead of spinning on p->on_cpu to ++ * let the waker make forward progress. This is safe because IRQs are ++ * disabled and the IPI will deliver after on_cpu is cleared. ++ * ++ * Ensure we load task_cpu(p) after p->on_cpu: ++ * ++ * set_task_cpu(p, cpu); ++ * STORE p->cpu = @cpu ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock ++ * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu) ++ * STORE p->on_cpu = 1 LOAD p->cpu ++ * ++ * to ensure we observe the correct CPU on which the task is currently ++ * scheduling. 
++ */ ++ if (smp_load_acquire(&p->on_cpu) && ++ ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) ++ goto unlock; ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ sched_task_ttwu(p); ++ ++ cpu = select_task_rq(p, this_rq()); ++ ++ if (cpu != task_cpu(p)) { ++ wake_flags |= WF_MIGRATED; ++ psi_ttwu_dequeue(p); ++ set_task_cpu(p, cpu); ++ } ++#else ++ cpu = task_cpu(p); ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++unlock: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++out: ++ if (success) ++ ttwu_stat(p, task_cpu(p), wake_flags); ++ preempt_enable(); ++ ++ return success; ++} ++ ++/** ++ * try_invoke_on_locked_down_task - Invoke a function on task in fixed state ++ * @p: Process for which the function is to be invoked. ++ * @func: Function to invoke. ++ * @arg: Argument to function. ++ * ++ * If the specified task can be quickly locked into a definite state ++ * (either sleeping or on a given runqueue), arrange to keep it in that ++ * state while invoking @func(@arg). This function can use ->on_rq and ++ * task_curr() to work out what the state is, if required. Given that ++ * @func can be invoked with a runqueue lock held, it had better be quite ++ * lightweight. ++ * ++ * Returns: ++ * @false if the task slipped out from under the locks. ++ * @true if the task was locked onto a runqueue or is sleeping. ++ * However, @func can override this by returning @false. ++ */ ++bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) ++{ ++ bool ret = false; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ lockdep_assert_irqs_enabled(); ++ raw_spin_lock_irq(&p->pi_lock); ++ if (p->on_rq) { ++ rq = __task_rq_lock(p, &rf); ++ if (task_rq(p) == rq) ++ ret = func(p, arg); ++ __task_rq_unlock(rq, &rf); ++ } else { ++ switch (p->state) { ++ case TASK_RUNNING: ++ case TASK_WAKING: ++ break; ++ default: ++ smp_rmb(); // See smp_rmb() comment in try_to_wake_up(). ++ if (!p->on_rq) ++ ret = func(p, arg); ++ } ++ } ++ raw_spin_unlock_irq(&p->pi_lock); ++ return ret; ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. 
++ * ++ * __sched_fork() is basic setup used by init_idle() too: ++ */ ++static inline void __sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ p->on_rq = 0; ++ p->on_cpu = 0; ++ p->utime = 0; ++ p->stime = 0; ++ p->sched_time = 0; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ ++#ifdef CONFIG_COMPACTION ++ p->capture_control = NULL; ++#endif ++#ifdef CONFIG_SMP ++ p->wake_entry.u_flags = CSD_TYPE_TTWU; ++#endif ++} ++ ++/* ++ * fork()/clone()-time setup: ++ */ ++int sched_fork(unsigned long clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ __sched_fork(clone_flags, p); ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = current->normal_prio; ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (task_has_rt_policy(p)) { ++ p->policy = SCHED_NORMAL; ++ p->static_prio = NICE_TO_PRIO(0); ++ p->rt_priority = 0; ++ } else if (PRIO_TO_NICE(p->static_prio) < 0) ++ p->static_prio = NICE_TO_PRIO(0); ++ ++ p->prio = p->normal_prio = normal_prio(p); ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * The child is not yet in the pid-hash so no cgroup attach races, ++ * and the cgroup is pinned to this child due to cgroup_fork() ++ * is ran before sched_fork(). ++ * ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. ++ */ ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ rq->curr->time_slice /= 2; ++ p->time_slice = rq->curr->time_slice; ++#ifdef CONFIG_SCHED_HRTICK ++ hrtick_start(rq, rq->curr->time_slice); ++#endif ++ ++ if (p->time_slice < RESCHED_NS) { ++ p->time_slice = sched_timeslice_ns; ++ resched_curr(rq); ++ } ++ sched_task_fork(p, rq); ++ raw_spin_unlock(&rq->lock); ++ ++ rseq_migrate(p); ++ /* ++ * We're setting the CPU for the first time, we don't migrate, ++ * so use __set_task_cpu(). ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++void sched_post_fork(struct task_struct *p) {} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. 
Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ p->state = TASK_RUNNING; ++ ++ rq = cpu_rq(select_task_rq(p, this_rq())); ++#ifdef CONFIG_SMP ++ rseq_migrate(p); ++ /* ++ * Fork balancing, do it here and not earlier because: ++ * - cpus_ptr can change in the fork path ++ * - any previously selected CPU might disappear through hotplug ++ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, ++ * as we're not fully set-up yet. ++ */ ++ __set_task_cpu(p, cpu_of(rq)); ++#endif ++ ++ raw_spin_lock(&rq->lock); ++ ++ update_rq_clock(rq); ++ activate_task(p, rq); ++ trace_sched_wakeup_new(p); ++ check_preempt_curr(rq); ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. 
++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ * ++ * See the ttwu() WF_ON_CPU case and its ordering comment. ++ */ ++ WRITE_ONCE(next->on_cpu, 1); ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * This must be the very last reference to @prev from this CPU. After ++ * p->on_cpu is cleared, the task can be moved to a different CPU. We ++ * must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#else ++ prev->on_cpu = 0; ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. 
++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static struct rq *finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(&rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. 
++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct_rcu_user(prev); ++ } ++ ++ tick_nohz_task_switch(); ++ return rq; ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq; ++ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ rq = finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline struct rq * ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prepare_task_switch(rq, prev, next); ++ ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * kernel -> kernel lazy + transfer active ++ * user -> kernel lazy + mmgrab() active ++ * ++ * kernel -> user switch + mmdrop() active ++ * user -> user switch ++ */ ++ if (!next->mm) { // to kernel ++ enter_lazy_tlb(prev->active_mm, next); ++ ++ next->active_mm = prev->active_mm; ++ if (prev->mm) // from user ++ mmgrab(prev->active_mm); ++ else ++ prev->active_mm = NULL; ++ } else { // to user ++ membarrier_switch_mm(rq, prev->active_mm, next->mm); ++ /* ++ * sys_membarrier() requires an smp_mb() between setting ++ * rq->curr / membarrier_switch_mm() and returning to userspace. ++ * ++ * The below provides this either through switch_mm(), or in ++ * case 'prev->active_mm == next->mm' through ++ * finish_task_switch()'s mmdrop(). ++ */ ++ switch_mm_irqs_off(prev->active_mm, next->mm, next); ++ ++ if (!prev->mm) { // from kernel ++ /* will mmdrop() in finish_task_switch(). */ ++ rq->prev_mm = prev->active_mm; ++ prev->active_mm = NULL; ++ } ++ } ++ ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ return finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. 
The caller is responsible for using it correctly, for example:
++ *
++ * - from a non-preemptible section (of course)
++ *
++ * - from a thread that is bound to a single CPU
++ *
++ * - in a loop with very short iterations (e.g. a polling loop)
++ */
++bool single_task_running(void)
++{
++	return raw_rq()->nr_running == 1;
++}
++EXPORT_SYMBOL(single_task_running);
++
++unsigned long long nr_context_switches(void)
++{
++	int i;
++	unsigned long long sum = 0;
++
++	for_each_possible_cpu(i)
++		sum += cpu_rq(i)->nr_switches;
++
++	return sum;
++}
++
++/*
++ * Consumers of these two interfaces, like for example the cpuidle menu
++ * governor, are using nonsensical data. They prefer shallow idle state
++ * selection for a CPU that has IO-wait, even though that CPU might not even
++ * end up running the task when it does become runnable.
++ */
++
++unsigned long nr_iowait_cpu(int cpu)
++{
++	return atomic_read(&cpu_rq(cpu)->nr_iowait);
++}
++
++/*
++ * IO-wait accounting, and how it's mostly bollocks (on SMP).
++ *
++ * The idea behind IO-wait accounting is to account the idle time that we could
++ * have spent running if it were not for IO. That is, if we were to improve the
++ * storage performance, we'd have a proportional reduction in IO-wait time.
++ *
++ * This all works nicely on UP, where, when a task blocks on IO, we account
++ * idle time as IO-wait, because if the storage were faster, it could've been
++ * running and we'd not be idle.
++ *
++ * This has been extended to SMP, by doing the same for each CPU. This however
++ * is broken.
++ *
++ * Imagine for instance the case where two tasks block on one CPU: only that
++ * CPU will have IO-wait accounted, while the other has regular idle. Even
++ * though, if the storage were faster, both could've run at the same time,
++ * utilising both CPUs.
++ *
++ * This means that, when looking globally, the current IO-wait accounting on
++ * SMP is a lower bound, by reason of under accounting.
++ *
++ * Worse, since the numbers are provided per CPU, they are sometimes
++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
++ * associated with any one particular CPU, it can wake to another CPU than it
++ * blocked on. This means the per CPU IO-wait number is meaningless.
++ *
++ * Task CPU affinities can make all that even more 'interesting'.
++ */
++
++unsigned long nr_iowait(void)
++{
++	unsigned long i, sum = 0;
++
++	for_each_possible_cpu(i)
++		sum += nr_iowait_cpu(i);
++
++	return sum;
++}
++
++#ifdef CONFIG_SMP
++
++/*
++ * sched_exec - execve() is a valuable balancing opportunity, because at
++ * this point the task has the smallest effective memory and cache
++ * footprint.
++ */
++void sched_exec(void)
++{
++	struct task_struct *p = current;
++	unsigned long flags;
++	int dest_cpu;
++	struct rq *rq;
++
++	raw_spin_lock_irqsave(&p->pi_lock, flags);
++	rq = this_rq();
++
++	if (rq != task_rq(p) || rq->nr_running < 2)
++		goto unlock;
++
++	dest_cpu = select_task_rq(p, task_rq(p));
++	if (dest_cpu == smp_processor_id())
++		goto unlock;
++
++	if (likely(cpu_active(dest_cpu))) {
++		struct migration_arg arg = { p, dest_cpu };
++
++		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
++		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
++		return;
++	}
++unlock:
++	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
++}
++
++#endif
++
++DEFINE_PER_CPU(struct kernel_stat, kstat);
++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
++
++EXPORT_PER_CPU_SYMBOL(kstat);
++EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
++
++static inline void update_curr(struct rq *rq, struct task_struct *p)
++{
++	s64 ns = rq->clock_task - p->last_ran;
++
++	p->sched_time += ns;
++	account_group_exec_runtime(p, ns);
++
++	p->time_slice -= ns;
++	p->last_ran = rq->clock_task;
++}
++
++/*
++ * Return accounted runtime for the task.
++ * Return separately the current's pending runtime that has not been
++ * accounted yet.
++ */
++unsigned long long task_sched_runtime(struct task_struct *p)
++{
++	unsigned long flags;
++	struct rq *rq;
++	raw_spinlock_t *lock;
++	u64 ns;
++
++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
++	/*
++	 * 64-bit doesn't need locks to atomically read a 64-bit value.
++	 * So we have an optimization chance when the task's delta_exec is 0.
++	 * Reading ->on_cpu is racy, but this is ok.
++	 *
++	 * If we race with it leaving CPU, we'll take a lock. So we're correct.
++	 * If we race with it entering CPU, unaccounted time is 0. This is
++	 * indistinguishable from the read occurring a few cycles earlier.
++	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
++	 * been accounted, so we're correct here as well.
++	 */
++	if (!p->on_cpu || !task_on_rq_queued(p))
++		return tsk_seruntime(p);
++#endif
++
++	rq = task_access_lock_irqsave(p, &lock, &flags);
++	/*
++	 * Must be ->curr _and_ ->on_rq. If dequeued, we would
++	 * project cycles that may never be accounted to this
++	 * thread, breaking clock_gettime().
++	 */
++	if (p == rq->curr && task_on_rq_queued(p)) {
++		update_rq_clock(rq);
++		update_curr(rq, p);
++	}
++	ns = tsk_seruntime(p);
++	task_access_unlock_irqrestore(p, lock, &flags);
++
++	return ns;
++}
++
++/* This manages tasks that have run out of timeslice during a scheduler_tick */
++static inline void scheduler_task_tick(struct rq *rq)
++{
++	struct task_struct *p = rq->curr;
++
++	if (is_idle_task(p))
++		return;
++
++	update_curr(rq, p);
++	cpufreq_update_util(rq, 0);
++
++	/*
++	 * Tasks that have less than RESCHED_NS of time slice left will be
++	 * rescheduled.
++	 */
++	if (p->time_slice >= RESCHED_NS)
++		return;
++	set_tsk_need_resched(p);
++	set_preempt_need_resched();
++}
++
++/*
++ * This function gets called by the timer code, with HZ frequency.
++ * We call it with interrupts disabled.
++ */
++void scheduler_tick(void)
++{
++	int cpu __maybe_unused = smp_processor_id();
++	struct rq *rq = cpu_rq(cpu);
++
++	arch_scale_freq_tick();
++	sched_clock_tick();
++
++	raw_spin_lock(&rq->lock);
++	update_rq_clock(rq);
++
++	scheduler_task_tick(rq);
++	calc_global_load_tick(rq);
++	psi_task_tick(rq);
++
++	rq->last_tick = rq->clock;
++	raw_spin_unlock(&rq->lock);
++
++	perf_event_task_tick();
++}
++
++#ifdef CONFIG_SCHED_SMT
++static inline int active_load_balance_cpu_stop(void *data)
++{
++	struct rq *rq = this_rq();
++	struct task_struct *p = data;
++	cpumask_t tmp;
++	unsigned long flags;
++
++	local_irq_save(flags);
++
++	raw_spin_lock(&p->pi_lock);
++	raw_spin_lock(&rq->lock);
++
++	rq->active_balance = 0;
++	/* _something_ may have changed the task, double check again */
++	if (task_on_rq_queued(p) && task_rq(p) == rq &&
++	    cpumask_and(&tmp, p->cpus_ptr, &sched_sg_idle_mask)) {
++		int cpu = cpu_of(rq);
++		int dcpu = __best_mask_cpu(cpu, &tmp,
++					   per_cpu(sched_cpu_llc_mask, cpu));
++		rq = move_queued_task(rq, p, dcpu);
++	}
++
++	raw_spin_unlock(&rq->lock);
++	raw_spin_unlock(&p->pi_lock);
++
++	local_irq_restore(flags);
++
++	return 0;
++}
++
++/* sg_balance_trigger - trigger sibling group balance for @cpu */
++static inline int sg_balance_trigger(const int cpu)
++{
++	struct rq *rq = cpu_rq(cpu);
++	unsigned long flags;
++	struct task_struct *curr;
++	int res;
++
++	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
++		return 0;
++	curr = rq->curr;
++	res = (!is_idle_task(curr)) && (1 == rq->nr_running) &&\
++	      cpumask_intersects(curr->cpus_ptr, &sched_sg_idle_mask) &&\
++	      (!rq->active_balance);
++
++	if (res)
++		rq->active_balance = 1;
++
++	raw_spin_unlock_irqrestore(&rq->lock, flags);
++
++	if (res)
++		stop_one_cpu_nowait(cpu, active_load_balance_cpu_stop,
++				    curr, &rq->active_balance_work);
++	return res;
++}
++
++/*
++ * sg_balance_check - sibling group balance check for run queue @rq
++ */
++static inline void sg_balance_check(struct rq *rq)
++{
++	cpumask_t chk;
++	int cpu;
++
++	/* exit when no sg in idle */
++	if (cpumask_empty(&sched_sg_idle_mask))
++		return;
++
++	cpu = cpu_of(rq);
++	/*
++	 * Only a cpu in the sibling idle group will do the checking and then
++	 * find potential cpus which can migrate the currently running task
++	 */
++	if (cpumask_test_cpu(cpu, &sched_sg_idle_mask) &&
++	    cpumask_andnot(&chk, cpu_online_mask, &sched_rq_pending_mask) &&
++	    cpumask_andnot(&chk, &chk, &sched_rq_watermark[IDLE_WM])) {
++		int i, tried = 0;
++
++		for_each_cpu_wrap(i, &chk, cpu) {
++			if (cpumask_subset(cpu_smt_mask(i), &chk)) {
++				if (sg_balance_trigger(i))
++					return;
++				if (tried)
++					return;
++				tried++;
++			}
++		}
++	}
++}
++#endif /* CONFIG_SCHED_SMT */
++
++#ifdef CONFIG_NO_HZ_FULL
++
++struct tick_work {
++	int			cpu;
++	atomic_t		state;
++	struct delayed_work	work;
++};
++/* Values for ->state, see diagram below. */
++#define TICK_SCHED_REMOTE_OFFLINE	0
++#define TICK_SCHED_REMOTE_OFFLINING	1
++#define TICK_SCHED_REMOTE_RUNNING	2
++
++/*
++ * State diagram for ->state:
++ *
++ *
++ *          TICK_SCHED_REMOTE_OFFLINE
++ *                    |   ^
++ *                    |   |
++ *                    |   | sched_tick_remote()
++ *                    |   |
++ *                    |   |
++ *                    +--TICK_SCHED_REMOTE_OFFLINING
++ *                    |   ^
++ *                    |   |
++ * sched_tick_start() |   | sched_tick_stop()
++ *                    |   |
++ *                    V   |
++ *          TICK_SCHED_REMOTE_RUNNING
++ *
++ *
++ * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
++ * and sched_tick_start() are happy to leave the state in RUNNING.
++ */ ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ unsigned long flags; ++ u64 delta; ++ int os; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (!tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ curr = rq->curr; ++ if (cpu_is_offline(cpu)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ if (!is_idle_task(curr)) { ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ delta = rq_clock_task(rq) - curr->last_ran; ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ } ++ scheduler_task_tick(rq); ++ ++ calc_load_nohz_remote(rq); ++out_unlock: ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. But ++ * first update state to reflect hotplug activity if required. ++ */ ++ os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE); ++ if (os == TICK_SCHED_REMOTE_RUNNING) ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ int os; ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING); ++ WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING); ++ if (os == TICK_SCHED_REMOTE_OFFLINE) { ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++ } ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++#if defined(CONFIG_PREEMPTION) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_PREEMPT_TRACER)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? 
++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++ if (panic_on_warn) ++ panic("scheduling while atomic\n"); ++ ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev, bool preempt) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++ ++ if (task_scs_end_corrupted(prev)) ++ panic("corrupted shadow stack detected inside scheduler\n"); ++#endif ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++ if (!preempt && prev->state && prev->non_block_count) { ++ printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n", ++ prev->comm, prev->pid, prev->non_block_count); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++ } ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * Compile time debug macro ++ * #define ALT_SCHED_DEBUG ++ */ ++ ++#ifdef ALT_SCHED_DEBUG ++void alt_sched_debug(void) ++{ ++ printk(KERN_INFO "sched: pending: 0x%04lx, idle: 0x%04lx, sg_idle: 0x%04lx\n", ++ sched_rq_pending_mask.bits[0], ++ sched_rq_watermark[IDLE_WM].bits[0], ++ sched_sg_idle_mask.bits[0]); ++} 
++#else
++inline void alt_sched_debug(void) {}
++#endif
++
++#ifdef CONFIG_SMP
++
++#define SCHED_RQ_NR_MIGRATION (32UL)
++/*
++ * Migrate pending tasks in @rq to @dest_cpu
++ * Will try to migrate the lesser of half of @rq's nr_running tasks and
++ * SCHED_RQ_NR_MIGRATION to @dest_cpu
++ */
++static inline int
++migrate_pending_tasks(struct rq *rq, struct rq *dest_rq, const int dest_cpu)
++{
++	struct task_struct *p, *skip = rq->curr;
++	int nr_migrated = 0;
++	int nr_tries = min(rq->nr_running / 2, SCHED_RQ_NR_MIGRATION);
++
++	while (skip != rq->idle && nr_tries &&
++	       (p = sched_rq_next_task(skip, rq)) != rq->idle) {
++		skip = sched_rq_next_task(p, rq);
++		if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) {
++			__SCHED_DEQUEUE_TASK(p, rq, 0, );
++			set_task_cpu(p, dest_cpu);
++			__SCHED_ENQUEUE_TASK(p, dest_rq, 0);
++			nr_migrated++;
++		}
++		nr_tries--;
++	}
++
++	return nr_migrated;
++}
++
++static inline int take_other_rq_tasks(struct rq *rq, int cpu)
++{
++	struct cpumask *affinity_mask, *end_mask;
++
++	if (unlikely(!rq->online))
++		return 0;
++
++	if (cpumask_empty(&sched_rq_pending_mask))
++		return 0;
++
++	affinity_mask = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]);
++	end_mask = per_cpu(sched_cpu_affinity_end_mask, cpu);
++	do {
++		int i;
++		for_each_cpu_and(i, &sched_rq_pending_mask, affinity_mask) {
++			int nr_migrated;
++			struct rq *src_rq;
++
++			src_rq = cpu_rq(i);
++			if (!do_raw_spin_trylock(&src_rq->lock))
++				continue;
++			spin_acquire(&src_rq->lock.dep_map,
++				     SINGLE_DEPTH_NESTING, 1, _RET_IP_);
++
++			if ((nr_migrated = migrate_pending_tasks(src_rq, rq, cpu))) {
++				src_rq->nr_running -= nr_migrated;
++#ifdef CONFIG_SMP
++				if (src_rq->nr_running < 2)
++					cpumask_clear_cpu(i, &sched_rq_pending_mask);
++#endif
++				rq->nr_running += nr_migrated;
++#ifdef CONFIG_SMP
++				if (rq->nr_running > 1)
++					cpumask_set_cpu(cpu, &sched_rq_pending_mask);
++#endif
++				update_sched_rq_watermark(rq);
++				cpufreq_update_util(rq, 0);
++
++				spin_release(&src_rq->lock.dep_map, _RET_IP_);
++				do_raw_spin_unlock(&src_rq->lock);
++
++				return 1;
++			}
++
++			spin_release(&src_rq->lock.dep_map, _RET_IP_);
++			do_raw_spin_unlock(&src_rq->lock);
++		}
++	} while (++affinity_mask < end_mask);
++
++	return 0;
++}
++#endif
++
++/*
++ * Timeslices below RESCHED_NS are considered as good as expired as there's no
++ * point rescheduling when there's so little time left.
++ */ ++static inline void check_curr(struct task_struct *p, struct rq *rq) ++{ ++ if (unlikely(rq->idle == p)) ++ return; ++ ++ update_curr(rq, p); ++ ++ if (p->time_slice < RESCHED_NS) ++ time_slice_expired(p, rq); ++} ++ ++static inline struct task_struct * ++choose_next_task(struct rq *rq, int cpu, struct task_struct *prev) ++{ ++ struct task_struct *next; ++ ++ if (unlikely(rq->skip)) { ++ next = rq_runnable_task(rq); ++ if (next == rq->idle) { ++#ifdef CONFIG_SMP ++ if (!take_other_rq_tasks(rq, cpu)) { ++#endif ++ rq->skip = NULL; ++ schedstat_inc(rq->sched_goidle); ++ return next; ++#ifdef CONFIG_SMP ++ } ++ next = rq_runnable_task(rq); ++#endif ++ } ++ rq->skip = NULL; ++#ifdef CONFIG_HIGH_RES_TIMERS ++ hrtick_start(rq, next->time_slice); ++#endif ++ return next; ++ } ++ ++ next = sched_rq_first_task(rq); ++ if (next == rq->idle) { ++#ifdef CONFIG_SMP ++ if (!take_other_rq_tasks(rq, cpu)) { ++#endif ++ schedstat_inc(rq->sched_goidle); ++ /*printk(KERN_INFO "sched: choose_next_task(%d) idle %px\n", cpu, next);*/ ++ return next; ++#ifdef CONFIG_SMP ++ } ++ next = sched_rq_first_task(rq); ++#endif ++ } ++#ifdef CONFIG_HIGH_RES_TIMERS ++ hrtick_start(rq, next->time_slice); ++#endif ++ /*printk(KERN_INFO "sched: choose_next_task(%d) next %px\n", cpu, ++ * next);*/ ++ return next; ++} ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPTION=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! 
++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next; ++ unsigned long *switch_count; ++ unsigned long prev_state; ++ struct rq *rq; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ ++ schedule_debug(prev, preempt); ++ ++ /* by passing sched_feat(HRTICK) checking which Alt schedule FW doesn't support */ ++ hrtick_clear(rq); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(): ++ * ++ * __set_current_state(@state) signal_wake_up() ++ * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING) ++ * wake_up_state(p, state) ++ * LOCK rq->lock LOCK p->pi_state ++ * smp_mb__after_spinlock() smp_mb__after_spinlock() ++ * if (signal_pending_state()) if (p->state & @state) ++ * ++ * Also, the membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ raw_spin_lock(&rq->lock); ++ smp_mb__after_spinlock(); ++ ++ update_rq_clock(rq); ++ ++ switch_count = &prev->nivcsw; ++ /* ++ * We must load prev->state once (task_struct::state is volatile), such ++ * that: ++ * ++ * - we form a control dependency vs deactivate_task() below. ++ * - ptrace_{,un}freeze_traced() can change ->state underneath us. ++ */ ++ prev_state = prev->state; ++ if (!preempt && prev_state && prev_state == prev->state) { ++ if (signal_pending_state(prev_state, prev)) { ++ prev->state = TASK_RUNNING; ++ } else { ++ prev->sched_contributes_to_load = ++ (prev_state & TASK_UNINTERRUPTIBLE) && ++ !(prev_state & TASK_NOLOAD) && ++ !(prev->flags & PF_FROZEN); ++ ++ if (prev->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ /* ++ * __schedule() ttwu() ++ * prev_state = prev->state; if (p->on_rq && ...) ++ * if (prev_state) goto out; ++ * p->on_rq = 0; smp_acquire__after_ctrl_dep(); ++ * p->state = TASK_WAKING ++ * ++ * Where __schedule() and ttwu() have matching control dependencies. ++ * ++ * After this, schedule() must not care about p->state any more. ++ */ ++ sched_task_deactivate(prev, rq); ++ deactivate_task(prev, rq); ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ check_curr(prev, rq); ++ ++ next = choose_next_task(rq, cpu, prev); ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ ++ if (likely(prev != next)) { ++ next->last_ran = rq->clock_task; ++ rq->last_ts_switch = rq->clock; ++ ++ rq->nr_switches++; ++ /* ++ * RCU users of rcu_dereference(rq->curr) may not see ++ * changes to task_struct made by pick_next_task(). ++ */ ++ RCU_INIT_POINTER(rq->curr, next); ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. 
++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ ++ trace_sched_switch(preempt, prev, next); ++ ++ /* Also unlocks the rq: */ ++ rq = context_switch(rq, prev, next); ++ } else ++ raw_spin_unlock_irq(&rq->lock); ++ ++#ifdef CONFIG_SCHED_SMT ++ sg_balance_check(rq); ++#endif ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(): */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state) ++ return; ++ ++ /* ++ * If a worker went to sleep, notify and ask workqueue whether ++ * it wants to wake up a task to maintain concurrency. ++ * As this function is called inside the schedule() context, ++ * we disable preemption to avoid it calling schedule() again ++ * in the possible wakeup of a kworker and because wq_worker_sleeping() ++ * requires it. ++ */ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ preempt_disable(); ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_sleeping(tsk); ++ else ++ io_wq_worker_sleeping(tsk); ++ preempt_enable_no_resched(); ++ } ++ ++ if (tsk_is_pi_blocked(tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++static void sched_update_worker(struct task_struct *tsk) ++{ ++ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { ++ if (tsk->flags & PF_WQ_WORKER) ++ wq_worker_running(tsk); ++ else ++ io_wq_worker_running(tsk); ++ } ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ sched_update_worker(tsk); ++} ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. 
++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != CONTEXT_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPTION ++/* ++ * This is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. 
++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPTION */ ++ ++/* ++ * This is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && wake_flags & ~WF_SYNC); ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++static inline void check_task_changed(struct rq *rq, struct task_struct *p) ++{ ++ /* Trigger resched if task sched_prio has been modified. */ ++ if (task_on_rq_queued(p) && sched_task_need_requeue(p, rq)) { ++ requeue_task(p, rq); ++ check_preempt_curr(rq); ++ } ++} ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. 
++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_access_lock(p, &lock); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ p->prio = prio; ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ raw_spinlock_t *lock; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ rq = __task_access_lock(p, &lock); ++ ++ p->static_prio = NICE_TO_PRIO(nice); ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (task_has_rt_policy(p)) ++ goto out_unlock; ++ ++ p->prio = effective_prio(p); ++ update_task_priodl(p); ++ ++ check_task_changed(rq, p); ++out_unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. 
++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (rq->curr != rq->idle) ++ return 0; ++ ++ if (rq->nr_running) ++ return 0; ++ ++#ifdef CONFIG_SMP ++ if (rq->ttwu_pending) ++ return 0; ++#endif ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the cpu @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++static void __setscheduler_params(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ int policy = attr->sched_policy; ++ ++ if (policy == SETPARAM_POLICY) ++ policy = p->policy; ++ ++ p->policy = policy; ++ ++ /* ++ * allow normal nice value to be set, but will not have any ++ * effect on scheduling until the task not SCHED_NORMAL/ ++ * SCHED_BATCH ++ */ ++ p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ ++ /* ++ * __sched_setscheduler() ensures attr->sched_priority == 0 when ++ * !rt_policy. Always setting this ensures that things like ++ * getparam()/getattr() don't report silly values for !rt tasks. ++ */ ++ p->rt_priority = attr->sched_priority; ++ p->normal_prio = normal_prio(p); ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct rq *rq, struct task_struct *p, ++ const struct sched_attr *attr, bool keep_boost) ++{ ++ __setscheduler_params(p, attr); ++ ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). 
++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ update_task_priodl(p); ++} ++ ++/* ++ * check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ const struct sched_attr dl_squash_attr = { ++ .size = sizeof(struct sched_attr), ++ .sched_policy = SCHED_FIFO, ++ .sched_nice = 0, ++ .sched_priority = 99, ++ }; ++ int newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ int retval, oldpolicy = -1; ++ int policy = attr->sched_policy; ++ unsigned long flags; ++ struct rq *rq; ++ int reset_on_fork; ++ raw_spinlock_t *lock; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ /* ++ * Alt schedule FW supports SCHED_DEADLINE by squash it as prio 0 SCHED_FIFO ++ */ ++ if (unlikely(SCHED_DEADLINE == policy)) { ++ attr = &dl_squash_attr; ++ policy = attr->sched_policy; ++ newprio = MAX_RT_PRIO - 1 - attr->sched_priority; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(attr->sched_flags & SCHED_RESET_ON_FORK); ++ ++ if (policy > SCHED_IDLE) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH and SCHED_IDLE is 0. ++ */ ++ if (attr->sched_priority < 0 || ++ (p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if ((SCHED_RR == policy || SCHED_FIFO == policy) != ++ (attr->sched_priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (SCHED_FIFO == policy || SCHED_RR == policy) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (attr->sched_priority > p->rt_priority && ++ attr->sched_priority > rlim_rtprio) ++ return -EPERM; ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ if (pi) ++ cpuset_read_lock(); ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ ++ /* ++ * To be able to change p->policy safely, task_access_lock() ++ * must be called. ++ * IF use task_access_lock() here: ++ * For the task p which is not running, reading rq->stop is ++ * racy but acceptable as ->stop doesn't change much. ++ * An enhancemnet can be made to read rq->stop saftly. 
++ */ ++ rq = __task_access_lock(p, &lock); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea ++ */ ++ if (p == rq->stop) { ++ retval = -EINVAL; ++ goto unlock; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy)) { ++ if (rt_policy(policy) && attr->sched_priority != p->rt_priority) ++ goto change; ++ if (!rt_policy(policy) && ++ NICE_TO_PRIO(attr->sched_nice) != p->static_prio) ++ goto change; ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ retval = 0; ++ goto unlock; ++ } ++change: ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ goto recheck; ++ } ++ ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ if (pi) { ++ /* ++ * Take priority boosted tasks into account. If the new ++ * effective priority is unchanged, we just store the new ++ * normal parameters and do not touch the scheduler class and ++ * the runqueue. This will be done when the task deboost ++ * itself. ++ */ ++ if (rt_effective_prio(p, newprio) == p->prio) { ++ __setscheduler_params(p, attr); ++ retval = 0; ++ goto unlock; ++ } ++ } ++ ++ __setscheduler(rq, p, attr, pi); ++ ++ check_task_changed(rq, p); ++ ++ /* Avoid rq from going away on us: */ ++ preempt_disable(); ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ if (pi) { ++ cpuset_read_unlock(); ++ rt_mutex_adjust_pi(p); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++ ++unlock: ++ __task_access_unlock(p, lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ if (pi) ++ cpuset_read_unlock(); ++ return retval; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ ++ if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { ++ attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ policy &= ~SCHED_RESET_ON_FORK; ++ attr.sched_policy = policy; ++ } ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++ ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Use sched_set_fifo(), read its comment. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. 
++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++ ++/* ++ * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally ++ * incapable of resource management, which is the one thing an OS really should ++ * be doing. ++ * ++ * This is of course the reason it is limited to privileged users only. ++ * ++ * Worse still; it is fundamentally impossible to compose static priority ++ * workloads. You cannot take two correctly working static prio workloads ++ * and smash them together and still expect them to work. ++ * ++ * For this reason 'all' FIFO tasks the kernel creates are basically at: ++ * ++ * MAX_RT_PRIO / 2 ++ * ++ * The administrator _MUST_ configure the system, the kernel simply doesn't ++ * know enough information to make a sensible choice. ++ */ ++void sched_set_fifo(struct task_struct *p) ++{ ++ struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 }; ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_fifo); ++ ++/* ++ * For when you don't much care about FIFO, but want to be above SCHED_NORMAL. ++ */ ++void sched_set_fifo_low(struct task_struct *p) ++{ ++ struct sched_param sp = { .sched_priority = 1 }; ++ WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_fifo_low); ++ ++void sched_set_normal(struct task_struct *p, int nice) ++{ ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ .sched_nice = nice, ++ }; ++ WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0); ++} ++EXPORT_SYMBOL_GPL(sched_set_normal); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (likely(p)) ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (likely(p)) { ++ retval = sched_setscheduler(p, policy, &lparam); ++ put_task_struct(p); ++ } ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) ++ goto err_size; ++ ++ ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size); ++ if (ret) { ++ if (ret == -E2BIG) ++ goto err_size; ++ return ret; ++ } ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? 
++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * @param: structure containing the new RT priority. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (task_has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/* ++ * Copy the kernel size attribute structure (which might be larger ++ * than what user-space knows about) to user-space. 
++ * ++ * Note that all cases are valid: user-space buffer can be larger or ++ * smaller than the kernel-space buffer. The usual case is that both ++ * have the same size. ++ */ ++static int ++sched_attr_copy_to_user(struct sched_attr __user *uattr, ++ struct sched_attr *kattr, ++ unsigned int usize) ++{ ++ unsigned int ksize = sizeof(*kattr); ++ ++ if (!access_ok(uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * sched_getattr() ABI forwards and backwards compatibility: ++ * ++ * If usize == ksize then we just copy everything to user-space and all is good. ++ * ++ * If usize < ksize then we only copy as much as user-space has space for, ++ * this keeps ABI compatibility as well. We skip the rest. ++ * ++ * If usize > ksize then user-space is using a newer version of the ABI, ++ * which part the kernel doesn't know about. Just ignore it - tooling can ++ * detect the kernel's knowledge of attributes from the attr->size value ++ * which is set to ksize in this case. ++ */ ++ kattr->size = min(usize, ksize); ++ ++ if (copy_to_user(uattr, kattr, kattr->size)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @usize: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, usize, unsigned int, flags) ++{ ++ struct sched_attr kattr = { }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || usize > PAGE_SIZE || ++ usize < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ kattr.sched_policy = p->policy; ++ if (p->sched_reset_on_fork) ++ kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; ++ if (task_has_rt_policy(p)) ++ kattr.sched_priority = p->rt_priority; ++ else ++ kattr.sched_nice = task_nice(p); ++ ++#ifdef CONFIG_UCLAMP_TASK ++ kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; ++ kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; ++#endif ++ ++ rcu_read_unlock(); ++ ++ return sched_attr_copy_to_user(uattr, &kattr, usize); ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ put_online_cpus(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ 
++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ put_online_cpus(); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ struct cpumask *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ raw_spinlock_t *lock; ++ unsigned long flags; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ task_access_lock_irqsave(p, &lock, &flags); ++ cpumask_and(mask, &p->cpus_mask, cpu_active_mask); ++ task_access_unlock_irqrestore(p, lock, &flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: size of CPU mask copied to user_mask_ptr on success. An ++ * error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min_t(size_t, len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. 
++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ if (!sched_yield_type) ++ return; ++ ++ rq = this_rq_lock_irq(&rf); ++ ++ schedstat_inc(rq->yld_count); ++ ++ if (1 == sched_yield_type) { ++ if (!rt_task(current)) ++ do_sched_yield_type_1(current, rq); ++ } else if (2 == sched_yield_type) { ++ if (rq->nr_running > 1) ++ rq->skip = current; ++ } ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ raw_spin_unlock(&rq->lock); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPTION ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPTION. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * In Alt schedule FW, yield_to is not supported. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. 
++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void __sched io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ int retval; ++ ++ alt_sched_debug(); ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ rcu_read_unlock(); ++ ++ *t = ns_to_timespec64(sched_timeslice_ns); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. 
++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct __kernel_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT_32BIT_TIME ++SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid, ++ struct old_timespec32 __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_old_timespec32(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ pr_cont(" running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", ++ free, task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL, KERN_INFO); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++#ifdef CONFIG_SCHED_DEBUG ++ /* TODO: Alt schedule FW should support this ++ if (!state_filter) ++ sysrq_sched_debug_show(); ++ */ ++#endif ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: CPU the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. 
++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ __sched_fork(0, idle); ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(&rq->lock); ++ update_rq_clock(rq); ++ ++ idle->last_ran = rq->clock_task; ++ idle->state = TASK_RUNNING; ++ idle->flags |= PF_IDLE; ++ sched_queue_init_idle(rq, idle); ++ ++ scs_task_reset(idle); ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#endif ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ __set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->idle = idle; ++ rcu_assign_pointer(rq->curr, idle); ++ idle->on_cpu = 1; ++ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++#ifdef CONFIG_SMP ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_mask may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Ensures that the idle task is using init_mm right before its CPU goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(current != this_rq()->idle); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ finish_arch_post_lock_switch(); ++ } ++ ++ /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ ++} ++ ++/* ++ * Migrate all tasks from the rq, sleeping tasks will be migrated by ++ * try_to_wake_up()->select_task_rq(). ++ * ++ * Called with rq->lock held even though we'er in stop_machine() and ++ * there's no concurrency possible, we hold the required locks anyway ++ * because of lock validation efforts. ++ */ ++static void migrate_tasks(struct rq *dead_rq) ++{ ++ struct rq *rq = dead_rq; ++ struct task_struct *p, *stop = rq->stop; ++ int count = 0; ++ ++ /* ++ * Fudge the rq selection such that the below task selection loop ++ * doesn't get stuck on the currently eligible stop task. ++ * ++ * We're currently inside stop_machine() and the rq is either stuck ++ * in the stop_machine_cpu_stop() loop, or we're executing this code, ++ * either way we should never end up calling schedule() until we're ++ * done here. 
++ */ ++ rq->stop = NULL; ++ ++ p = sched_rq_first_task(rq); ++ while (p != rq->idle) { ++ int dest_cpu; ++ ++ /* skip the running task */ ++ if (task_running(p) || 1 == p->nr_cpus_allowed) { ++ p = sched_rq_next_task(p, rq); ++ continue; ++ } ++ ++ /* ++ * Rules for changing task_struct::cpus_allowed are holding ++ * both pi_lock and rq->lock, such that holding either ++ * stabilizes the mask. ++ * ++ * Drop rq->lock is not quite as disastrous as it usually is ++ * because !cpu_active at this point, which means load-balance ++ * will not interfere. Also, stop-machine. ++ */ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_lock(&p->pi_lock); ++ raw_spin_lock(&rq->lock); ++ ++ /* ++ * Since we're inside stop-machine, _nothing_ should have ++ * changed the task, WARN if weird stuff happened, because in ++ * that case the above rq->lock drop is a fail too. ++ */ ++ if (WARN_ON(task_rq(p) != rq || !task_on_rq_queued(p))) { ++ raw_spin_unlock(&p->pi_lock); ++ p = sched_rq_next_task(p, rq); ++ continue; ++ } ++ ++ count++; ++ /* Find suitable destination for @next, with force if needed. */ ++ dest_cpu = select_fallback_rq(dead_rq->cpu, p); ++ rq = __migrate_task(rq, p, dest_cpu); ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock(&p->pi_lock); ++ ++ rq = dead_rq; ++ raw_spin_lock(&rq->lock); ++ /* Check queued task all over from the header again */ ++ p = sched_rq_first_task(rq); ++ } ++ ++ rq->stop = stop; ++} ++ ++static void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) ++ rq->online = false; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++static void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) ++ rq->online = true; ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going up, increment the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) ++ static_branch_inc_cpuslocked(&sched_smt_present); ++#endif ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) ++ cpuset_cpu_active(); ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all cpus have been brought up. 
++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_online(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu(); ++ ++#ifdef CONFIG_SCHED_SMT ++ /* ++ * When going down, decrement the number of cores with SMT present. ++ */ ++ if (cpumask_weight(cpu_smt_mask(cpu)) == 2) { ++ static_branch_dec_cpuslocked(&sched_smt_present); ++ if (!static_branch_likely(&sched_smt_present)) ++ cpumask_clear(&sched_sg_idle_mask); ++ } ++#endif ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ return 0; ++} ++ ++static void sched_rq_cpu_starting(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ rq->calc_load_update = calc_load_update; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_rq_cpu_starting(cpu); ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_tick_stop(cpu); ++ ++ raw_spin_lock_irqsave(&rq->lock, flags); ++ set_rq_offline(rq); ++ migrate_tasks(rq); ++ raw_spin_unlock_irqrestore(&rq->lock, flags); ++ ++ hrtick_clear(rq); ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_SMP ++static void sched_init_topology_cpumask_early(void) ++{ ++ int cpu, level; ++ cpumask_t *tmp; ++ ++ for_each_possible_cpu(cpu) { ++ for (level = 0; level < NR_CPU_AFFINITY_CHK_LEVEL; level++) { ++ tmp = &(per_cpu(sched_cpu_affinity_masks, cpu)[level]); ++ cpumask_copy(tmp, cpu_possible_mask); ++ cpumask_clear_cpu(cpu, tmp); ++ } ++ per_cpu(sched_cpu_llc_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = ++ &(per_cpu(sched_cpu_affinity_masks, cpu)[1]); ++ /*per_cpu(sd_llc_id, cpu) = cpu;*/ ++ } ++} ++ ++#define TOPOLOGY_CPUMASK(name, mask, last) \ ++ if (cpumask_and(chk, chk, mask)) \ ++ printk(KERN_INFO "sched: cpu#%02d affinity mask: 0x%08lx - "#name,\ ++ cpu, (chk++)->bits[0]); \ ++ if (!last) \ ++ cpumask_complement(chk, mask) ++ ++static void sched_init_topology_cpumask(void) ++{ ++ int cpu; ++ cpumask_t *chk; ++ ++ for_each_online_cpu(cpu) { ++ /* take chance to reset time slice for idle tasks */ ++ cpu_rq(cpu)->idle->time_slice = sched_timeslice_ns; ++ ++ chk = &(per_cpu(sched_cpu_affinity_masks, cpu)[0]); ++ ++ cpumask_complement(chk, cpumask_of(cpu)); ++#ifdef CONFIG_SCHED_SMT ++ TOPOLOGY_CPUMASK(smt, topology_sibling_cpumask(cpu), false); ++#endif ++ per_cpu(sd_llc_id, cpu) = cpumask_first(cpu_coregroup_mask(cpu)); ++ per_cpu(sched_cpu_llc_mask, cpu) = chk; ++ TOPOLOGY_CPUMASK(coregroup, cpu_coregroup_mask(cpu), false); ++ ++ TOPOLOGY_CPUMASK(core, topology_core_cpumask(cpu), false); ++ ++ TOPOLOGY_CPUMASK(others, cpu_online_mask, true); ++ ++ per_cpu(sched_cpu_affinity_end_mask, cpu) = chk; ++ printk(KERN_INFO "sched: cpu#%02d llc_id = %d, llc_mask idx = %d\n", ++ cpu, per_cpu(sd_llc_id, cpu), ++ (int) (per_cpu(sched_cpu_llc_mask, cpu) - ++ 
&(per_cpu(sched_cpu_affinity_masks, cpu)[0]))); ++ } ++} ++#endif ++ ++void __init sched_init_smp(void) ++{ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ sched_init_topology_cpumask(); ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ cpu_rq(0)->idle->time_slice = sched_timeslice_ns; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++ int i; ++ struct rq *rq; ++ ++ printk(KERN_INFO ALT_SCHED_VERSION_MSG); ++ ++ wait_bit_init(); ++ ++#ifdef CONFIG_SMP ++ for (i = 0; i < SCHED_BITS; i++) ++ cpumask_copy(&sched_rq_watermark[i], cpu_present_mask); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ ++ sched_queue_init(rq); ++ rq->watermark = IDLE_WM; ++ rq->skip = NULL; ++ ++ raw_spin_lock_init(&rq->lock); ++ rq->nr_running = rq->nr_uninterruptible = 0; ++ rq->calc_load_active = 0; ++ rq->calc_load_update = jiffies + LOAD_FREQ; ++#ifdef CONFIG_SMP ++ rq->online = false; ++ rq->cpu = i; ++ ++#ifdef CONFIG_SCHED_SMT ++ rq->active_balance = 0; ++#endif ++ ++#ifdef CONFIG_NO_HZ_COMMON ++ rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); ++#endif ++#endif /* CONFIG_SMP */ ++ rq->nr_switches = 0; ++ ++ hrtick_rq_init(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++#ifdef CONFIG_SMP ++ /* Set rq->online for cpu 0 */ ++ cpu_rq(0)->online = true; ++#endif ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". 
++ */ ++ init_idle(current, smp_processor_id()); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++ ++ sched_init_topology_cpumask_early(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++ ++ psi_init(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current) && !current->non_block_count) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++#ifdef CONFIG_DEBUG_PREEMPT ++ if (!preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, preempt_disable_ip); ++ } ++#endif ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++ ++void __cant_sleep(const char *file, int line, int preempt_offset) ++{ ++ static unsigned long prev_jiffy; ++ ++ if (irqs_disabled()) ++ return; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) ++ return; ++ ++ if (preempt_count() > preempt_offset) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line); ++ printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ debug_show_held_locks(current); ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL_GPL(__cant_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++void normalize_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ struct sched_attr attr = { ++ .sched_policy = SCHED_NORMAL, ++ }; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { 
++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p)) { ++ /* ++ * Renice negative nice level userspace ++ * tasks back to 0: ++ */ ++ if (task_nice(p) < 0) ++ set_user_nice(p, 0); ++ continue; ++ } ++ ++ __sched_setscheduler(p, &attr, false, false); ++ } ++ read_unlock(&tasklist_lock); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * ia64_set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? 
container_of(css, struct task_group, css) : NULL;
++}
++
++static struct cgroup_subsys_state *
++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
++{
++ struct task_group *parent = css_tg(parent_css);
++ struct task_group *tg;
++
++ if (!parent) {
++ /* This is early initialization for the top cgroup */
++ return &root_task_group.css;
++ }
++
++ tg = sched_create_group(parent);
++ if (IS_ERR(tg))
++ return ERR_PTR(-ENOMEM);
++ return &tg->css;
++}
++
++/* Expose task group only after completing cgroup initialization */
++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
++{
++ struct task_group *tg = css_tg(css);
++ struct task_group *parent = css_tg(css->parent);
++
++ if (parent)
++ sched_online_group(tg, parent);
++ return 0;
++}
++
++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
++{
++ struct task_group *tg = css_tg(css);
++
++ sched_offline_group(tg);
++}
++
++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
++{
++ struct task_group *tg = css_tg(css);
++
++ /*
++ * Relies on the RCU grace period between css_released() and this.
++ */
++ sched_free_group(tg);
++}
++
++static void cpu_cgroup_fork(struct task_struct *task)
++{
++}
++
++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
++{
++ return 0;
++}
++
++static void cpu_cgroup_attach(struct cgroup_taskset *tset)
++{
++}
++
++static struct cftype cpu_legacy_files[] = {
++ { } /* Terminate */
++};
++
++static struct cftype cpu_files[] = {
++ { } /* Terminate */
++};
++
++static int cpu_extra_stat_show(struct seq_file *sf,
++ struct cgroup_subsys_state *css)
++{
++ return 0;
++}
++
++struct cgroup_subsys cpu_cgrp_subsys = {
++ .css_alloc = cpu_cgroup_css_alloc,
++ .css_online = cpu_cgroup_css_online,
++ .css_released = cpu_cgroup_css_released,
++ .css_free = cpu_cgroup_css_free,
++ .css_extra_stat_show = cpu_extra_stat_show,
++ .fork = cpu_cgroup_fork,
++ .can_attach = cpu_cgroup_can_attach,
++ .attach = cpu_cgroup_attach,
++ .legacy_cftypes = cpu_legacy_files,
++ .dfl_cftypes = cpu_files,
++ .early_init = true,
++ .threaded = true,
++};
++#endif /* CONFIG_CGROUP_SCHED */
++
++#undef CREATE_TRACE_POINTS
+diff --git a/kernel/sched/alt_debug.c b/kernel/sched/alt_debug.c
+new file mode 100644
+index 000000000000..1212a031700e
+--- /dev/null
++++ b/kernel/sched/alt_debug.c
+@@ -0,0 +1,31 @@
++/*
++ * kernel/sched/alt_debug.c
++ *
++ * Print the alt scheduler debugging details
++ *
++ * Author: Alfred Chen
++ * Date : 2020
++ */
++#include "sched.h"
++
++/*
++ * This allows printing both to /proc/sched_debug and
++ * to the console
++ */
++#define SEQ_printf(m, x...)
\ ++ do { \ ++ if (m) \ ++ seq_printf(m, x); \ ++ else \ ++ pr_cont(x); \ ++ } while (0) ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{ ++ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns), ++ get_nr_threads(p)); ++} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} +diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h +new file mode 100644 +index 000000000000..99be2c51c88d +--- /dev/null ++++ b/kernel/sched/alt_sched.h +@@ -0,0 +1,555 @@ ++#ifndef ALT_SCHED_H ++#define ALT_SCHED_H ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#ifdef CONFIG_PARAVIRT ++# include ++#endif ++ ++#include "cpupri.h" ++ ++#ifdef CONFIG_SCHED_BMQ ++#include "bmq.h" ++#endif ++#ifdef CONFIG_SCHED_PDS ++#include "pds.h" ++#endif ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++#define WF_ON_CPU 0x08 /* Wakee is on_rq */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. 
++ */ ++struct rq { ++ /* runqueue lock: */ ++ raw_spinlock_t lock; ++ ++ struct task_struct __rcu *curr; ++ struct task_struct *idle, *stop, *skip; ++ struct mm_struct *prev_mm; ++ ++#ifdef CONFIG_SCHED_BMQ ++ struct bmq queue; ++#endif ++#ifdef CONFIG_SCHED_PDS ++ struct skiplist_node sl_header; ++#endif ++ unsigned long watermark; ++ ++ /* switch count */ ++ u64 nr_switches; ++ ++ atomic_t nr_iowait; ++ ++#ifdef CONFIG_MEMBARRIER ++ int membarrier_state; ++#endif ++ ++#ifdef CONFIG_SMP ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ unsigned int ttwu_pending; ++ unsigned char nohz_idle_balance; ++ unsigned char idle_balance; ++ ++#ifdef CONFIG_HAVE_SCHED_AVG_IRQ ++ struct sched_avg avg_irq; ++#endif ++ ++#ifdef CONFIG_SCHED_SMT ++ int active_balance; ++ struct cpu_stop_work active_balance_work; ++#endif ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ /* calc_load related fields */ ++ unsigned long calc_load_update; ++ long calc_load_active; ++ ++ u64 clock, last_tick; ++ u64 last_ts_switch; ++ u64 clock_task; ++ ++ unsigned long nr_running; ++ unsigned long nr_uninterruptible; ++ ++#ifdef CONFIG_SCHED_HRTICK ++#ifdef CONFIG_SMP ++ call_single_data_t hrtick_csd; ++#endif ++ struct hrtimer hrtick_timer; ++#endif ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++ ++#ifdef CONFIG_NO_HZ_COMMON ++#ifdef CONFIG_SMP ++ call_single_data_t nohz_csd; ++#endif ++ atomic_t nohz_flags; ++#endif /* CONFIG_NO_HZ_COMMON */ ++}; ++ ++extern unsigned long calc_load_update; ++extern atomic_long_t calc_load_tasks; ++ ++extern void calc_global_load_tick(struct rq *this_rq); ++extern long calc_load_fold_active(struct rq *this_rq, long adjust); ++ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) ++#define this_rq() this_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++ ++#ifdef CONFIG_SMP ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern bool sched_smp_initialized; ++ ++enum { ++ BASE_CPU_AFFINITY_CHK_LEVEL = 1, ++#ifdef CONFIG_SCHED_SMT ++ SMT_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++#ifdef CONFIG_SCHED_MC ++ MC_CPU_AFFINITY_CHK_LEVEL_SPACE_HOLDER, ++#endif ++ NR_CPU_AFFINITY_CHK_LEVEL ++}; ++ ++DECLARE_PER_CPU(cpumask_t [NR_CPU_AFFINITY_CHK_LEVEL], sched_cpu_affinity_masks); ++ ++static inline int __best_mask_cpu(int cpu, const 
cpumask_t *cpumask, ++ const cpumask_t *mask) ++{ ++ while ((cpu = cpumask_any_and(cpumask, mask)) >= nr_cpu_ids) ++ mask++; ++ return cpu; ++} ++ ++static inline int best_mask_cpu(int cpu, const cpumask_t *cpumask) ++{ ++ return cpumask_test_cpu(cpu, cpumask)? cpu : ++ __best_mask_cpu(cpu, cpumask, &(per_cpu(sched_cpu_affinity_masks, cpu)[0])); ++} ++ ++extern void flush_smp_call_function_from_idle(void); ++ ++#else /* !CONFIG_SMP */ ++static inline void flush_smp_call_function_from_idle(void) { } ++#endif ++ ++#ifndef arch_scale_freq_tick ++static __always_inline ++void arch_scale_freq_tick(void) ++{ ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ /* ++ * Relax lockdep_assert_held() checking as in VRQ, call to ++ * sched_info_xxxx() may not held rq->lock ++ * lockdep_assert_held(&rq->lock); ++ */ ++ return rq->clock_task; ++} ++ ++/* ++ * {de,en}queue flags: ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ */ ++ ++#define DEQUEUE_SLEEP 0x01 ++ ++#define ENQUEUE_WAKEUP 0x01 ++ ++ ++/* ++ * Below are scheduler API which using in other kernel code ++ * It use the dummy rq_flags ++ * ToDo : BMQ need to support these APIs for compatibility with mainline ++ * scheduler code. ++ */ ++struct rq_flags { ++ unsigned long flags; ++}; ++ ++struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(rq->lock); ++ ++struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock); ++ ++static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(&rq->lock); ++} ++ ++static inline void ++task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ raw_spin_unlock(&rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); ++} ++ ++static inline void ++rq_unlock_irq(struct rq *rq, struct rq_flags *rf) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(&rq->lock); ++} ++ ++static inline struct rq * ++this_rq_lock_irq(struct rq_flags *rf) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ raw_spin_lock(&rq->lock); ++ ++ return rq; ++} ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline bool task_running(struct task_struct *p) ++{ ++ return p->on_cpu; ++} ++ ++extern struct static_key_false sched_schedstats; ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++static inline 
int cpu_of(const struct rq *rq) ++{ ++#ifdef CONFIG_SMP ++ return rq->cpu; ++#else ++ return 0; ++#endif ++} ++ ++#include "stats.h" ++ ++#ifdef CONFIG_NO_HZ_COMMON ++#define NOHZ_BALANCE_KICK_BIT 0 ++#define NOHZ_STATS_KICK_BIT 1 ++ ++#define NOHZ_BALANCE_KICK BIT(NOHZ_BALANCE_KICK_BIT) ++#define NOHZ_STATS_KICK BIT(NOHZ_STATS_KICK_BIT) ++ ++#define NOHZ_KICK_MASK (NOHZ_BALANCE_KICK | NOHZ_STATS_KICK) ++ ++#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) ++ ++/* TODO: needed? ++extern void nohz_balance_exit_idle(struct rq *rq); ++#else ++static inline void nohz_balance_exit_idle(struct rq *rq) { } ++*/ ++#endif ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data __rcu *, cpufreq_update_util_data); ++ ++/** ++ * cpufreq_update_util - Take a note about CPU utilization changes. ++ * @rq: Runqueue to carry out the update for. ++ * @flags: Update reason flags. ++ * ++ * This function is called by the scheduler on the CPU whose utilization is ++ * being updated. ++ * ++ * It can only be called from RCU-sched read-side critical sections. ++ * ++ * The way cpufreq is currently arranged requires it to evaluate the CPU ++ * performance state (frequency/voltage) on a regular basis to prevent it from ++ * being stuck in a completely inadequate performance level for too long. ++ * That is not guaranteed to happen if the updates are only triggered from CFS ++ * and DL, though, because they may not be coming in if only RT tasks are ++ * active all the time (or there are RT tasks only). ++ * ++ * As a workaround for that issue, this function is called periodically by the ++ * RT sched class to trigger extra cpufreq updates to prevent it from stalling, ++ * but that really is a band-aid. Going forward it should be replaced with ++ * solutions targeted more specifically at RT tasks. ++ */ ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); ++ if (data) ++ data->func(data, rq_clock(rq), flags); ++} ++#else ++static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern int __init sched_tick_offload_init(void); ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++#endif ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++extern void schedule_idle(void); ++ ++/* ++ * !! For sched_setattr_nocheck() (kernel) only !! ++ * ++ * This is actually gross. 
:( ++ * ++ * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE ++ * tasks, but still be able to sleep. We need this on platforms that cannot ++ * atomically change clock frequency. Remove once fast switching will be ++ * available on such platforms. ++ * ++ * SUGOV stands for SchedUtil GOVernor. ++ */ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++#ifdef CONFIG_MEMBARRIER ++/* ++ * The scheduler provides memory barriers required by membarrier between: ++ * - prior user-space memory accesses and store to rq->membarrier_state, ++ * - store to rq->membarrier_state and following user-space memory accesses. ++ * In the same way it provides those guarantees around store to rq->curr. ++ */ ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++ int membarrier_state; ++ ++ if (prev_mm == next_mm) ++ return; ++ ++ membarrier_state = atomic_read(&next_mm->membarrier_state); ++ if (READ_ONCE(rq->membarrier_state) == membarrier_state) ++ return; ++ ++ WRITE_ONCE(rq->membarrier_state, membarrier_state); ++} ++#else ++static inline void membarrier_switch_mm(struct rq *rq, ++ struct mm_struct *prev_mm, ++ struct mm_struct *next_mm) ++{ ++} ++#endif ++ ++#ifdef CONFIG_NUMA ++extern int sched_numa_find_closest(const struct cpumask *cpus, int cpu); ++#else ++static inline int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return nr_cpu_ids; ++} ++#endif ++ ++void swake_up_all_locked(struct swait_queue_head *q); ++void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++#endif /* ALT_SCHED_H */ +diff --git a/kernel/sched/bmq.h b/kernel/sched/bmq.h +new file mode 100644 +index 000000000000..aff0bb30a884 +--- /dev/null ++++ b/kernel/sched/bmq.h +@@ -0,0 +1,20 @@ ++#ifndef BMQ_H ++#define BMQ_H ++ ++/* bits: ++ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ ++#define SCHED_BITS (MAX_RT_PRIO + NICE_WIDTH / 2 + MAX_PRIORITY_ADJ + 1) ++#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) ++ ++struct bmq { ++ DECLARE_BITMAP(bitmap, SCHED_BITS); ++ struct list_head heads[SCHED_BITS]; ++}; ++ ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio + p->boost_prio > DEFAULT_PRIO + MAX_PRIORITY_ADJ); ++} ++ ++#endif +diff --git a/kernel/sched/bmq_imp.h b/kernel/sched/bmq_imp.h +new file mode 100644 +index 000000000000..ad9a7c448da7 +--- /dev/null ++++ b/kernel/sched/bmq_imp.h +@@ -0,0 +1,185 @@ ++#define ALT_SCHED_VERSION_MSG "sched/bmq: BMQ CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" ++ ++/* ++ * BMQ only routines ++ */ ++#define rq_switch_time(rq) ((rq)->clock - (rq)->last_ts_switch) ++#define boost_threshold(p) (sched_timeslice_ns >>\ ++ (15 - MAX_PRIORITY_ADJ - (p)->boost_prio)) ++ ++static inline void boost_task(struct task_struct *p) ++{ ++ int limit; ++ ++ switch (p->policy) { ++ case SCHED_NORMAL: ++ limit = -MAX_PRIORITY_ADJ; ++ break; ++ case SCHED_BATCH: ++ case SCHED_IDLE: ++ limit = 0; ++ break; ++ default: ++ return; ++ } ++ ++ if (p->boost_prio > limit) ++ p->boost_prio--; ++} ++ ++static inline void deboost_task(struct task_struct *p) ++{ ++ if (p->boost_prio < MAX_PRIORITY_ADJ) ++ p->boost_prio++; ++} ++ ++/* ++ * Common interfaces ++ */ ++static inline int task_sched_prio(struct task_struct *p, struct rq *rq) ++{ ++ return (p->prio < MAX_RT_PRIO)? 
p->prio : MAX_RT_PRIO / 2 + (p->prio + p->boost_prio) / 2; ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq); ++ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = sched_timeslice_ns; ++ ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) { ++ if (SCHED_RR != p->policy) ++ deboost_task(p); ++ requeue_task(p, rq); ++ } ++} ++ ++static inline void update_task_priodl(struct task_struct *p) {} ++ ++static inline unsigned long sched_queue_watermark(struct rq *rq) ++{ ++ return find_first_bit(rq->queue.bitmap, SCHED_BITS); ++} ++ ++static inline void sched_queue_init(struct rq *rq) ++{ ++ struct bmq *q = &rq->queue; ++ int i; ++ ++ bitmap_zero(q->bitmap, SCHED_BITS); ++ for(i = 0; i < SCHED_BITS; i++) ++ INIT_LIST_HEAD(&q->heads[i]); ++} ++ ++static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) ++{ ++ struct bmq *q = &rq->queue; ++ ++ idle->bmq_idx = IDLE_TASK_SCHED_PRIO; ++ INIT_LIST_HEAD(&q->heads[idle->bmq_idx]); ++ list_add(&idle->bmq_node, &q->heads[idle->bmq_idx]); ++ set_bit(idle->bmq_idx, q->bitmap); ++} ++ ++/* ++ * This routine used in bmq scheduler only which assume the idle task in the bmq ++ */ ++static inline struct task_struct *sched_rq_first_task(struct rq *rq) ++{ ++ unsigned long idx = find_first_bit(rq->queue.bitmap, SCHED_BITS); ++ const struct list_head *head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++} ++ ++static inline struct task_struct * ++sched_rq_next_task(struct task_struct *p, struct rq *rq) ++{ ++ unsigned long idx = p->bmq_idx; ++ struct list_head *head = &rq->queue.heads[idx]; ++ ++ if (list_is_last(&p->bmq_node, head)) { ++ idx = find_next_bit(rq->queue.bitmap, SCHED_BITS, idx + 1); ++ head = &rq->queue.heads[idx]; ++ ++ return list_first_entry(head, struct task_struct, bmq_node); ++ } ++ ++ return list_next_entry(p, bmq_node); ++} ++ ++#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ ++ sched_info_dequeued(rq, p); \ ++ \ ++ list_del(&p->bmq_node); \ ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) { \ ++ clear_bit(p->bmq_idx, rq->queue.bitmap);\ ++ func; \ ++ } ++ ++#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ ++ sched_info_queued(rq, p); \ ++ psi_enqueue(p, flags); \ ++ \ ++ p->bmq_idx = task_sched_prio(p, rq); \ ++ list_add_tail(&p->bmq_node, &rq->queue.heads[p->bmq_idx]); \ ++ set_bit(p->bmq_idx, rq->queue.bitmap) ++ ++#define __SCHED_REQUEUE_TASK(p, rq, func) \ ++{ \ ++ int idx = task_sched_prio(p, rq); \ ++\ ++ list_del(&p->bmq_node); \ ++ list_add_tail(&p->bmq_node, &rq->queue.heads[idx]); \ ++ if (idx != p->bmq_idx) { \ ++ if (list_empty(&rq->queue.heads[p->bmq_idx])) \ ++ clear_bit(p->bmq_idx, rq->queue.bitmap); \ ++ p->bmq_idx = idx; \ ++ set_bit(p->bmq_idx, rq->queue.bitmap); \ ++ func; \ ++ } \ ++} ++ ++static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq) ++{ ++ return (task_sched_prio(p, rq) != p->bmq_idx); ++} ++ ++static void sched_task_fork(struct task_struct *p, struct rq *rq) ++{ ++ p->boost_prio = (p->boost_prio < 0) ? ++ p->boost_prio + MAX_PRIORITY_ADJ : MAX_PRIORITY_ADJ; ++} ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE). 
++ */ ++int task_prio(const struct task_struct *p) ++{ ++ if (p->prio < MAX_RT_PRIO) ++ return (p->prio - MAX_RT_PRIO); ++ return (p->prio - MAX_RT_PRIO + p->boost_prio); ++} ++ ++static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) ++{ ++ p->boost_prio = MAX_PRIORITY_ADJ; ++} ++ ++static void sched_task_ttwu(struct task_struct *p) ++{ ++ if(this_rq()->clock_task - p->last_ran > sched_timeslice_ns) ++ boost_task(p); ++} ++ ++static void sched_task_deactivate(struct task_struct *p, struct rq *rq) ++{ ++ if (rq_switch_time(rq) < boost_threshold(p)) ++ boost_task(p); ++} +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index e39008242cf4..5963716fe391 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -183,6 +183,7 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -300,6 +301,13 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) + + return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + } ++#else /* CONFIG_SCHED_ALT */ ++static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) ++{ ++ sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu); ++ return sg_cpu->max; ++} ++#endif + + /** + * sugov_iowait_reset() - Reset the IO boost status of a CPU. +@@ -443,7 +451,9 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } + */ + static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) + { ++#ifndef CONFIG_SCHED_ALT + if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl) ++#endif + sg_policy->limits_changed = true; + } + +@@ -686,6 +696,7 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + } + + ret = sched_setattr_nocheck(thread, &attr); ++ + if (ret) { + kthread_stop(thread); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); +@@ -912,6 +923,7 @@ struct cpufreq_governor *cpufreq_default_governor(void) + cpufreq_governor_init(schedutil_gov); + + #ifdef CONFIG_ENERGY_MODEL ++#ifndef CONFIG_SCHED_ALT + extern bool sched_energy_update; + extern struct mutex sched_energy_mutex; + +@@ -942,4 +954,10 @@ void sched_cpufreq_governor_change(struct cpufreq_policy *policy, + } + + } ++#else /* CONFIG_SCHED_ALT */ ++void sched_cpufreq_governor_change(struct cpufreq_policy *policy, ++ struct cpufreq_governor *old_gov) ++{ ++} ++#endif + #endif +diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c +index 5a55d2300452..66a0ab7165f0 100644 +--- a/kernel/sched/cputime.c ++++ b/kernel/sched/cputime.c +@@ -122,7 +122,7 @@ void account_user_time(struct task_struct *p, u64 cputime) + p->utime += cputime; + account_group_user_time(p, cputime); + +- index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER; ++ index = task_running_nice(p) ? CPUTIME_NICE : CPUTIME_USER; + + /* Add user time to cpustat. */ + task_group_account_field(p, index, cputime); +@@ -146,7 +146,7 @@ void account_guest_time(struct task_struct *p, u64 cputime) + p->gtime += cputime; + + /* Add guest time to cpustat. 
*/ +- if (task_nice(p) > 0) { ++ if (task_running_nice(p)) { + cpustat[CPUTIME_NICE] += cputime; + cpustat[CPUTIME_GUEST_NICE] += cputime; + } else { +@@ -269,7 +269,7 @@ static inline u64 account_other_time(u64 max) + #ifdef CONFIG_64BIT + static inline u64 read_sum_exec_runtime(struct task_struct *t) + { +- return t->se.sum_exec_runtime; ++ return tsk_seruntime(t); + } + #else + static u64 read_sum_exec_runtime(struct task_struct *t) +@@ -279,7 +279,7 @@ static u64 read_sum_exec_runtime(struct task_struct *t) + struct rq *rq; + + rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; ++ ns = tsk_seruntime(t); + task_rq_unlock(rq, t, &rf); + + return ns; +@@ -614,7 +614,7 @@ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index f324dc36fc43..a6b566bda65b 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -369,6 +369,7 @@ void cpu_startup_entry(enum cpuhp_state state) + do_idle(); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * idle-task scheduling class. + */ +@@ -482,3 +483,4 @@ const struct sched_class idle_sched_class + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif +diff --git a/kernel/sched/pds.h b/kernel/sched/pds.h +new file mode 100644 +index 000000000000..7fdeace7e8a5 +--- /dev/null ++++ b/kernel/sched/pds.h +@@ -0,0 +1,14 @@ ++#ifndef PDS_H ++#define PDS_H ++ ++/* bits: ++ * RT(0-99), (Low prio adj range, nice width, high prio adj range) / 2, cpu idle task */ ++#define SCHED_BITS (MAX_RT_PRIO + 20 + 1) ++#define IDLE_TASK_SCHED_PRIO (SCHED_BITS - 1) ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (p->prio > DEFAULT_PRIO); ++} ++ ++#endif +diff --git a/kernel/sched/pds_imp.h b/kernel/sched/pds_imp.h +new file mode 100644 +index 000000000000..6baee5e961b9 +--- /dev/null ++++ b/kernel/sched/pds_imp.h +@@ -0,0 +1,257 @@ ++#define ALT_SCHED_VERSION_MSG "sched/pds: PDS CPU Scheduler "ALT_SCHED_VERSION" by Alfred Chen.\n" ++ ++static const u64 user_prio2deadline[NICE_WIDTH] = { ++/* -20 */ 4194304, 4613734, 5075107, 5582617, 6140878, ++/* -15 */ 6754965, 7430461, 8173507, 8990857, 9889942, ++/* -10 */ 10878936, 11966829, 13163511, 14479862, 15927848, ++/* -5 */ 17520632, 19272695, 21199964, 23319960, 25651956, ++/* 0 */ 28217151, 31038866, 34142752, 37557027, 41312729, ++/* 5 */ 45444001, 49988401, 54987241, 60485965, 66534561, ++/* 10 */ 73188017, 80506818, 88557499, 97413248, 107154572, ++/* 15 */ 117870029, 129657031, 142622734, 156885007, 172573507 ++}; ++ ++static const unsigned char dl_level_map[] = { ++/* 0 4 8 12 */ ++ 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, ++/* 16 20 24 28 */ ++ 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 17, 17, 17, 17, 17, ++/* 32 36 40 44 */ ++ 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 15, 15, 15, 15, ++/* 48 52 56 60 */ ++ 15, 15, 15, 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, 12, 12, 12, ++/* 64 68 72 76 */ ++ 12, 11, 11, 11, 10, 10, 10, 9, 9, 8, 7, 6, 5, 4, 3, 2, ++/* 80 84 88 92 */ ++ 1, 0 ++}; ++ ++static inline int ++task_sched_prio(const struct task_struct *p, const struct rq *rq) ++{ ++ size_t delta; ++ ++ if (p == rq->idle) ++ return IDLE_TASK_SCHED_PRIO; ++ ++ if (p->prio < MAX_RT_PRIO) ++ return p->prio; ++ ++ 
delta = (rq->clock + user_prio2deadline[39] - p->deadline) >> 21; ++ delta = min((size_t)delta, ARRAY_SIZE(dl_level_map) - 1); ++ ++ return MAX_RT_PRIO + dl_level_map[delta]; ++} ++ ++static inline void update_task_priodl(struct task_struct *p) ++{ ++ p->priodl = (((u64) (p->prio))<<56) | ((p->deadline)>>8); ++} ++ ++static inline void requeue_task(struct task_struct *p, struct rq *rq); ++ ++static inline void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ /*printk(KERN_INFO "sched: time_slice_expired(%d) - %px\n", cpu_of(rq), p);*/ ++ p->time_slice = sched_timeslice_ns; ++ ++ if (p->prio >= MAX_RT_PRIO) ++ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)]; ++ update_task_priodl(p); ++ ++ if (SCHED_FIFO != p->policy && task_on_rq_queued(p)) ++ requeue_task(p, rq); ++} ++ ++/* ++ * pds_skiplist_task_search -- search function used in PDS run queue skip list ++ * node insert operation. ++ * @it: iterator pointer to the node in the skip list ++ * @node: pointer to the skiplist_node to be inserted ++ * ++ * Returns true if key of @it is less or equal to key value of @node, otherwise ++ * false. ++ */ ++static inline bool ++pds_skiplist_task_search(struct skiplist_node *it, struct skiplist_node *node) ++{ ++ return (skiplist_entry(it, struct task_struct, sl_node)->priodl <= ++ skiplist_entry(node, struct task_struct, sl_node)->priodl); ++} ++ ++/* ++ * Define the skip list insert function for PDS ++ */ ++DEFINE_SKIPLIST_INSERT_FUNC(pds_skiplist_insert, pds_skiplist_task_search); ++ ++/* ++ * Init the queue structure in rq ++ */ ++static inline void sched_queue_init(struct rq *rq) ++{ ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); ++} ++ ++/* ++ * Init idle task and put into queue structure of rq ++ * IMPORTANT: may be called multiple times for a single cpu ++ */ ++static inline void sched_queue_init_idle(struct rq *rq, struct task_struct *idle) ++{ ++ /*printk(KERN_INFO "sched: init(%d) - %px\n", cpu_of(rq), idle);*/ ++ int default_prio = idle->prio; ++ ++ idle->prio = MAX_PRIO; ++ idle->deadline = 0ULL; ++ update_task_priodl(idle); ++ ++ FULL_INIT_SKIPLIST_NODE(&rq->sl_header); ++ ++ idle->sl_node.level = idle->sl_level; ++ pds_skiplist_insert(&rq->sl_header, &idle->sl_node); ++ ++ idle->prio = default_prio; ++} ++ ++/* ++ * This routine assume that the idle task always in queue ++ */ ++static inline struct task_struct *sched_rq_first_task(struct rq *rq) ++{ ++ struct skiplist_node *node = rq->sl_header.next[0]; ++ ++ BUG_ON(node == &rq->sl_header); ++ return skiplist_entry(node, struct task_struct, sl_node); ++} ++ ++static inline struct task_struct * ++sched_rq_next_task(struct task_struct *p, struct rq *rq) ++{ ++ struct skiplist_node *next = p->sl_node.next[0]; ++ ++ BUG_ON(next == &rq->sl_header); ++ return skiplist_entry(next, struct task_struct, sl_node); ++} ++ ++static inline unsigned long sched_queue_watermark(struct rq *rq) ++{ ++ return task_sched_prio(sched_rq_first_task(rq), rq); ++} ++ ++#define __SCHED_DEQUEUE_TASK(p, rq, flags, func) \ ++ psi_dequeue(p, flags & DEQUEUE_SLEEP); \ ++ sched_info_dequeued(rq, p); \ ++ \ ++ if (skiplist_del_init(&rq->sl_header, &p->sl_node)) { \ ++ func; \ ++ } ++ ++#define __SCHED_ENQUEUE_TASK(p, rq, flags) \ ++ sched_info_queued(rq, p); \ ++ psi_enqueue(p, flags); \ ++ \ ++ p->sl_node.level = p->sl_level; \ ++ pds_skiplist_insert(&rq->sl_header, &p->sl_node) ++ ++/* ++ * Requeue a task @p to @rq ++ */ ++#define __SCHED_REQUEUE_TASK(p, rq, func) \ ++{\ ++ bool b_first = skiplist_del_init(&rq->sl_header, &p->sl_node); \ 
++\
++ p->sl_node.level = p->sl_level; \
++ if (pds_skiplist_insert(&rq->sl_header, &p->sl_node) || b_first) { \
++ func; \
++ } \
++}
++
++static inline bool sched_task_need_requeue(struct task_struct *p, struct rq *rq)
++{
++ struct skiplist_node *node = p->sl_node.prev[0];
++
++ if (node != &rq->sl_header) {
++ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node);
++
++ if (t->priodl > p->priodl)
++ return true;
++ }
++
++ node = p->sl_node.next[0];
++ if (node != &rq->sl_header) {
++ struct task_struct *t = skiplist_entry(node, struct task_struct, sl_node);
++
++ if (t->priodl < p->priodl)
++ return true;
++ }
++
++ return false;
++}
++
++/*
++ * pds_skiplist_random_level -- Returns a pseudo-random level number for skip
++ * list node which is used in PDS run queue.
++ *
++ * In the current implementation, based on testing, the first 8 bits in
++ * microseconds of niffies are suitable for random level population.
++ * find_first_bit() is used to satisfy p = 0.5 between each level, and there
++ * should be a platform hardware supported instruction (known as ctz/clz) to
++ * speed up this function.
++ * The skiplist level for a task is populated when the task is created and
++ * doesn't change in the task's lifetime. When the task is being inserted into
++ * the run queue, this skiplist level is set to the task's sl_node->level, and
++ * the skiplist insert function may change it based on the current level of
++ * the skip list.
++ */
++static inline int pds_skiplist_random_level(const struct task_struct *p)
++{
++ long unsigned int randseed;
++
++ /*
++ * 1. Some architectures don't have better than microsecond resolution
++ * so mask out ~microseconds as a factor of the random seed for skiplist
++ * insertion.
++ * 2. Use address of task structure pointer as another factor of the
++ * random seed for task burst forking scenario.
++ */
++ randseed = (task_rq(p)->clock ^ (long unsigned int)p) >> 10;
++
++ return find_first_bit(&randseed, NUM_SKIPLIST_LEVEL - 1);
++}
++
++static void sched_task_fork(struct task_struct *p, struct rq *rq)
++{
++ p->sl_level = pds_skiplist_random_level(p);
++ if (p->prio >= MAX_RT_PRIO)
++ p->deadline = rq->clock + user_prio2deadline[TASK_USER_PRIO(p)];
++ update_task_priodl(p);
++}
++
++/**
++ * task_prio - return the priority value of a given task.
++ * @p: the task in question.
++ *
++ * Return: The priority value as seen by users in /proc.
++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
++ * from 0(SCHED_ISO) up to 82 (nice +19 SCHED_IDLE).
++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int ret; ++ ++ if (p->prio < MAX_RT_PRIO) ++ return (p->prio - MAX_RT_PRIO); ++ ++ preempt_disable(); ++ ret = task_sched_prio(p, this_rq()) - MAX_RT_PRIO; ++ preempt_enable(); ++ ++ return ret; ++} ++ ++static void do_sched_yield_type_1(struct task_struct *p, struct rq *rq) ++{ ++ time_slice_expired(p, rq); ++} ++ ++static void sched_task_ttwu(struct task_struct *p) {} ++static void sched_task_deactivate(struct task_struct *p, struct rq *rq) {} +diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c +index 2c613e1cff3a..0103b2a7201d 100644 +--- a/kernel/sched/pelt.c ++++ b/kernel/sched/pelt.c +@@ -270,6 +270,7 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load) + WRITE_ONCE(sa->util_avg, sa->util_sum / divider); + } + ++#ifndef CONFIG_SCHED_ALT + /* + * sched_entity: + * +@@ -387,8 +388,9 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + + return 0; + } ++#endif + +-#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) + /* + * thermal: + * +diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h +index 795e43e02afc..856163dac896 100644 +--- a/kernel/sched/pelt.h ++++ b/kernel/sched/pelt.h +@@ -1,13 +1,15 @@ + #ifdef CONFIG_SMP + #include "sched-pelt.h" + ++#ifndef CONFIG_SCHED_ALT + int __update_load_avg_blocked_se(u64 now, struct sched_entity *se); + int __update_load_avg_se(u64 now, struct cfs_rq *cfs_rq, struct sched_entity *se); + int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq); + int update_rt_rq_load_avg(u64 now, struct rq *rq, int running); + int update_dl_rq_load_avg(u64 now, struct rq *rq, int running); ++#endif + +-#ifdef CONFIG_SCHED_THERMAL_PRESSURE ++#if defined(CONFIG_SCHED_THERMAL_PRESSURE) && !defined(CONFIG_SCHED_ALT) + int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity); + + static inline u64 thermal_load_avg(struct rq *rq) +@@ -42,6 +44,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg) + return LOAD_AVG_MAX - 1024 + avg->period_contrib; + } + ++#ifndef CONFIG_SCHED_ALT + /* + * When a task is dequeued, its estimated utilization should not be update if + * its util_avg has not been updated at least once. 
+@@ -162,9 +165,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq) + return rq_clock_pelt(rq_of(cfs_rq)); + } + #endif ++#endif /* CONFIG_SCHED_ALT */ + + #else + ++#ifndef CONFIG_SCHED_ALT + static inline int + update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) + { +@@ -182,6 +187,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running) + { + return 0; + } ++#endif + + static inline int + update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 28709f6b0975..6bc68bacbac8 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2,6 +2,10 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_ALT ++#include "alt_sched.h" ++#else ++ + #include + + #include +@@ -2626,3 +2630,9 @@ static inline bool is_per_cpu_kthread(struct task_struct *p) + + void swake_up_all_locked(struct swait_queue_head *q); + void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); ++ ++static inline int task_running_nice(struct task_struct *p) ++{ ++ return (task_nice(p) > 0); ++} ++#endif /* !CONFIG_SCHED_ALT */ +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index 750fb3c67eed..108422ebc7bf 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -22,8 +22,10 @@ static int show_schedstat(struct seq_file *seq, void *v) + } else { + struct rq *rq; + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_ALT + struct sched_domain *sd; + int dcount = 0; ++#endif + #endif + cpu = (unsigned long)(v - 2); + rq = cpu_rq(cpu); +@@ -40,6 +42,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "\n"); + + #ifdef CONFIG_SMP ++#ifndef CONFIG_SCHED_ALT + /* domain-specific stats */ + rcu_read_lock(); + for_each_domain(cpu, sd) { +@@ -68,6 +71,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->ttwu_move_balance); + } + rcu_read_unlock(); ++#endif + #endif + } + return 0; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 1bd7e3af904f..cc946a9bd550 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -4,6 +4,7 @@ + */ + #include "sched.h" + ++#ifndef CONFIG_SCHED_ALT + DEFINE_MUTEX(sched_domains_mutex); + + /* Protected by sched_domains_mutex: */ +@@ -1180,8 +1181,10 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + */ + + static int default_relax_domain_level = -1; ++#endif /* CONFIG_SCHED_ALT */ + int sched_domain_level_max; + ++#ifndef CONFIG_SCHED_ALT + static int __init setup_relax_domain_level(char *str) + { + if (kstrtoint(str, 0, &default_relax_domain_level)) +@@ -1413,6 +1416,7 @@ sd_init(struct sched_domain_topology_level *tl, + + return sd; + } ++#endif /* CONFIG_SCHED_ALT */ + + /* + * Topology list, bottom-up. 
+@@ -1442,6 +1446,7 @@ void set_sched_topology(struct sched_domain_topology_level *tl) + sched_domain_topology = tl; + } + ++#ifndef CONFIG_SCHED_ALT + #ifdef CONFIG_NUMA + + static const struct cpumask *sd_numa_mask(int cpu) +@@ -2316,3 +2321,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); + mutex_unlock(&sched_domains_mutex); + } ++#else /* CONFIG_SCHED_ALT */ ++void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], ++ struct sched_domain_attr *dattr_new) ++{} ++ ++#ifdef CONFIG_NUMA ++int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; ++ ++int sched_numa_find_closest(const struct cpumask *cpus, int cpu) ++{ ++ return best_mask_cpu(cpu, cpus); ++} ++#endif /* CONFIG_NUMA */ ++#endif +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index afad085960b8..e91b4cb3042b 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -120,6 +120,10 @@ static unsigned long long_max = LONG_MAX; + static int one_hundred = 100; + static int two_hundred = 200; + static int one_thousand = 1000; ++#ifdef CONFIG_SCHED_ALT ++static int __maybe_unused zero = 0; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand = 10000; + #endif +@@ -184,7 +188,7 @@ static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT; + int sysctl_legacy_va_layout; + #endif + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_ALT) + static int min_sched_granularity_ns = 100000; /* 100 usecs */ + static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns; /* 0 usecs */ +@@ -1652,6 +1656,7 @@ int proc_do_static_key(struct ctl_table *table, int write, + } + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_ALT + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -1854,6 +1859,7 @@ static struct ctl_table kern_table[] = { + .extra2 = SYSCTL_ONE, + }, + #endif ++#endif /* !CONFIG_SCHED_ALT */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -2430,6 +2436,17 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_ALT ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 95b6a708b040..81f2ee62c807 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1927,8 +1927,10 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + int ret = 0; + u64 slack; + ++#ifndef CONFIG_SCHED_ALT + slack = current->timer_slack_ns; + if (dl_task(current) || rt_task(current)) ++#endif + slack = 0; + + hrtimer_init_sleeper_on_stack(&t, clockid, mode); +diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +index a71758e34e45..d20c347df861 100644 +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -216,7 +216,7 @@ static void task_sample_cputime(struct task_struct *p, u64 *samples) + u64 stime, utime; + + task_cputime(p, &utime, &stime); +- store_samples(samples, stime, utime, p->se.sum_exec_runtime); ++ store_samples(samples, stime, utime, tsk_seruntime(p)); + } + + static void proc_sample_cputime_atomic(struct 
task_cputime_atomic *at, +@@ -801,6 +801,7 @@ static void collect_posix_cputimers(struct posix_cputimers *pct, u64 *samples, + } + } + ++#ifndef CONFIG_SCHED_ALT + static inline void check_dl_overrun(struct task_struct *tsk) + { + if (tsk->dl.dl_overrun) { +@@ -808,6 +809,7 @@ static inline void check_dl_overrun(struct task_struct *tsk) + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } ++#endif + + static bool check_rlimit(u64 time, u64 limit, int signo, bool rt, bool hard) + { +@@ -835,8 +837,10 @@ static void check_thread_timers(struct task_struct *tsk, + u64 samples[CPUCLOCK_MAX]; + unsigned long soft; + ++#ifndef CONFIG_SCHED_ALT + if (dl_task(tsk)) + check_dl_overrun(tsk); ++#endif + + if (expiry_cache_is_inactive(pct)) + return; +@@ -850,7 +854,7 @@ static void check_thread_timers(struct task_struct *tsk, + soft = task_rlimit(tsk, RLIMIT_RTTIME); + if (soft != RLIM_INFINITY) { + /* Task RT timeout is accounted in jiffies. RTTIME is usec */ +- unsigned long rttime = tsk->rt.timeout * (USEC_PER_SEC / HZ); ++ unsigned long rttime = tsk_rttimeout(tsk) * (USEC_PER_SEC / HZ); + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + /* At the hard limit, send SIGKILL. No further action. */ +@@ -1086,8 +1090,10 @@ static inline bool fastpath_timer_check(struct task_struct *tsk) + return true; + } + ++#ifndef CONFIG_SCHED_ALT + if (dl_task(tsk) && tsk->dl.dl_overrun) + return true; ++#endif + + return false; + } +diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +index b5e3496cf803..65f60c77bc50 100644 +--- a/kernel/trace/trace_selftest.c ++++ b/kernel/trace/trace_selftest.c +@@ -1048,10 +1048,15 @@ static int trace_wakeup_test_thread(void *data) + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_ALT ++ /* No deadline on BMQ/PDS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + +diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c +index f36264fea75c6ca7c34eaa259c0bff829cbf6ac0..d43ca62fd00fe442bda9b4ad548fae432a7436de 100644 +--- a/kernel/sched/alt_core.c ++++ b/kernel/sched/alt_core.c +@@ -11,6 +11,10 @@ + * scheduler by Alfred Chen. + * 2019-02-20 BMQ(BitMap Queue) kernel scheduler by Alfred Chen. + */ ++#define CREATE_TRACE_POINTS ++#include ++#undef CREATE_TRACE_POINTS ++ + #include "sched.h" + + #include +@@ -42,8 +46,11 @@ + #include "pelt.h" + #include "smp.h" + +-#define CREATE_TRACE_POINTS +-#include ++/* ++ * Export tracepoints that act as a bare tracehook (ie: have no trace event ++ * associated with them) to allow external modules to probe them. ++ */ ++EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp); + + #define ALT_SCHED_VERSION "v5.9-r0" + +diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h +index 99be2c51c88d0406cced20b36d7230da12930a5c..03f8b8b1aa27eeb15989af25b4050c767da12aad 100644 +--- a/kernel/sched/alt_sched.h ++++ b/kernel/sched/alt_sched.h +@@ -46,6 +46,8 @@ + + #include "cpupri.h" + ++#include ++ + #ifdef CONFIG_SCHED_BMQ + #include "bmq.h" + #endif +@@ -496,6 +498,8 @@ static inline int sched_tick_offload_init(void) { return 0; } + + extern void schedule_idle(void); + ++#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) ++ + /* + * !! For sched_setattr_nocheck() (kernel) only !! 
+ * diff --git a/linux-tkg/linux-tkg-patches/5.9/0011-ZFS-fix.patch b/linux-tkg/linux-tkg-patches/5.9/0011-ZFS-fix.patch new file mode 100644 index 0000000..af71d04 --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0011-ZFS-fix.patch @@ -0,0 +1,43 @@ +From 1e010beda2896bdf3082fb37a3e49f8ce20e04d8 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= +Date: Thu, 2 May 2019 05:28:08 +0100 +Subject: [PATCH] x86/fpu: Export kernel_fpu_{begin,end}() with + EXPORT_SYMBOL_GPL +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +We need these symbols in zfs as the fpu implementation breaks userspace: + +https://github.com/zfsonlinux/zfs/issues/9346 +Signed-off-by: Jörg Thalheim +--- + arch/x86/kernel/fpu/core.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c +index 12c70840980e..352538b3bb5d 100644 +--- a/arch/x86/kernel/fpu/core.c ++++ b/arch/x86/kernel/fpu/core.c +@@ -102,7 +102,7 @@ void kernel_fpu_begin(void) + } + __cpu_invalidate_fpregs_state(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_begin); ++EXPORT_SYMBOL(kernel_fpu_begin); + + void kernel_fpu_end(void) + { +@@ -111,7 +111,7 @@ void kernel_fpu_end(void) + this_cpu_write(in_kernel_fpu, false); + preempt_enable(); + } +-EXPORT_SYMBOL_GPL(kernel_fpu_end); ++EXPORT_SYMBOL(kernel_fpu_end); + + /* + * Save the FPU state (mark it for reload if necessary): +-- +2.23.0 + + diff --git a/linux-tkg/linux-tkg-patches/5.9/0012-misc-additions.patch b/linux-tkg/linux-tkg-patches/5.9/0012-misc-additions.patch new file mode 100644 index 0000000..a4efaef --- /dev/null +++ b/linux-tkg/linux-tkg-patches/5.9/0012-misc-additions.patch @@ -0,0 +1,54 @@ +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index 0840d27381ea..73aba9a31064 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP + def_bool y + depends on VT_CONSOLE && PM_SLEEP + ++config NR_TTY_DEVICES ++ int "Maximum tty device number" ++ depends on VT ++ range 12 63 ++ default 63 ++ help ++ This option is used to change the number of tty devices in /dev. ++ The default value is 63. The lowest number you can set is 12, ++ 63 is also the upper limit so we don't overrun the serial ++ consoles. ++ ++ If unsure, say 63. ++ + config HW_CONSOLE + bool + depends on VT && !UML +diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h +index e9d39c48520a..3bceead8da40 100644 +--- a/include/uapi/linux/vt.h ++++ b/include/uapi/linux/vt.h +@@ -3,12 +3,25 @@ + #define _UAPI_LINUX_VT_H + + ++/* ++ * We will make this definition solely for the purpose of making packages ++ * such as splashutils build, because they can not understand that ++ * NR_TTY_DEVICES is defined in the kernel configuration. ++ */ ++#ifndef CONFIG_NR_TTY_DEVICES ++#define CONFIG_NR_TTY_DEVICES 63 ++#endif ++ + /* + * These constants are also useful for user-level apps (e.g., VC + * resizing). 
+ */
+ #define MIN_NR_CONSOLES 1 /* must be at least 1 */
+-#define MAX_NR_CONSOLES 63 /* serial lines start at 64 */
++/*
++ * NR_TTY_DEVICES:
++ * Value MUST be at least 12 and must never be higher than 63
++ */
++#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES /* serial lines start above this */
+ /* Note: the ioctl VT_GETSTATE does not work for
+ consoles 16 and higher (since it returns a short) */
\ No newline at end of file
diff --git a/linux-tkg/linux-tkg-patches/5.9/0013-remove-debian-deps-cross.patch b/linux-tkg/linux-tkg-patches/5.9/0013-remove-debian-deps-cross.patch
new file mode 100644
index 0000000..32a9e45
--- /dev/null
+++ b/linux-tkg/linux-tkg-patches/5.9/0013-remove-debian-deps-cross.patch
@@ -0,0 +1,25 @@
+From 6368e2994fb1d82dd159f6b5ef021adaf92912a4 Mon Sep 17 00:00:00 2001
+From: Hyper-KVM
+Date: Mon, 12 Oct 2020 21:05:18 -0400
+Subject: [PATCH] scripts/mkdebian: remove build depends
+
+---
+ scripts/package/mkdebian | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/scripts/package/mkdebian b/scripts/package/mkdebian
+index 48fbd3d0284a..c0f93dd33d74 100755
+--- a/scripts/package/mkdebian
++++ b/scripts/package/mkdebian
+@@ -174,7 +174,7 @@ Source: $sourcename
+ Section: kernel
+ Priority: optional
+ Maintainer: $maintainer
+-Build-Depends: bc, rsync, kmod, cpio, bison, flex | flex:native $extra_build_depends
++Build-Depends:
+ Homepage: https://www.kernel.org/
+
+ Package: $packagename
+--
+2.28.0
+
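
For context on 0011-ZFS-fix.patch above: kernel_fpu_begin()/kernel_fpu_end() must bracket any in-kernel SSE/AVX work, and ZFS is CDDL-licensed (not GPL-compatible), so it cannot link against GPL-only exports; that is why the patch relaxes EXPORT_SYMBOL_GPL to EXPORT_SYMBOL. The sketch below is illustrative only and not part of the patch set: the module name and the placeholder SIMD work are hypothetical, while the two FPU calls and the headers are the real x86 kernel API.

/*
 * Illustrative sketch only, not part of the patch set above: roughly how a
 * non-GPL out-of-tree module (e.g. ZFS) wraps SIMD work. With the original
 * EXPORT_SYMBOL_GPL, these two symbols would not resolve for a CDDL-licensed
 * module at load time.
 */
#include <linux/module.h>
#include <asm/fpu/api.h>

static int __init fpu_demo_init(void)
{
	kernel_fpu_begin();	/* disable preemption, claim the FPU/SIMD registers */
	/* ... SSE/AVX-accelerated checksum or parity math would run here ... */
	kernel_fpu_end();	/* hand the registers back, re-enable preemption */
	return 0;
}

static void __exit fpu_demo_exit(void)
{
}

module_init(fpu_demo_init);
module_exit(fpu_demo_exit);

MODULE_LICENSE("CDDL");	/* non-GPL license string: only plain EXPORT_SYMBOL symbols resolve */

In practice ZFS performs the equivalent calls inside its SIMD compatibility wrappers rather than at module init; the export change only affects whether those wrappers can see the symbols on a stock 5.x kernel.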